Syntax coloring for xaml
I wrote some code that generates syntax coloring for XAML (and, for that matter, most other XML), and figured I would share it. Some assembly required:
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
namespace BuildQuickStart
{
/*
* This file implements a mostly correct XML tokenizer. The token boundaries
* have been chosen to match Visual Studio syntax highlighting, so a few of
* the boundaries are a little weird (especially around comments). Known issues:
*
* Doesn't handle DTDs
* Mediocre handling of processing instructions <? ?> -- it won't crash,
* but the token boundaries are wrong
* Doesn't enforce correct XML
* There are probably a few cases where it will die if given invalid XML
*
*
* This tokenizer has been designed to be restartable, so you can tokenize
* one line of XML at a time.
*/
//enum TokenColors
//{
// Punctuation, StringLiteral, ElementName, AttributeName, Comment, Normal
//}
// Kinds of token the tokenizer emits. Every token is a (kind, length) pair;
// the text itself is recovered by slicing the input with the running lengths.
// The underlying type is short, the same width as XmlToken.Length.
enum XmlTokenKind : short
{
Open, // <
Close,// >
SelfClose,// />
OpenClose,// </
ElementName, // name following < or </
ElementWhitespace,//whitespace between attributes
AttributeName,
Equals, // the = between an attribute name and its value
AttributeValue, // attribute value, including its opening and closing delimiter
CommentBegin, // <!--
CommentText,
CommentEnd, // -->
Entity, // entity reference starting with &, e.g. &gt;
OpenProcessingInstruction, // <?
CloseProcessingInstruction, // ?>
CDataBegin, // <![CDATA[
CDataEnd,// ]]>
TextContent, // text between tags; also used for the body of CDATA sections and processing instructions
//WhitespaceContent, // text content that's whitespace. Space is embedded inside
EOF, // end of the input string; always zero-length and always the last token returned
}
// Used so you can restart the tokenizer for the next line of XML
// Tokenizer state carried from one Tokenize call to the next. It selects which
// Tokenize* routine handles the next token.
// NOTE: InsideComment is the first member, so default(XmlTokenizerMode) is
// InsideComment. Callers starting a fresh document must pass OutsideElement explicitly.
enum XmlTokenizerMode
{
InsideComment, // after <!--, until --> is consumed
InsideProcessingInstruction, // after <?, until ?> is consumed
AfterOpen, // after < or </, an element name is expected next
AfterAttributeName, // after an attribute name, = is expected next
AfterAttributeEquals, // after =, the attribute value is expected next
InsideElement, // after element name, before attribute or />
OutsideElement, // between tags: text, entities, or the start of a tag/comment/CDATA/PI
InsideCData, // after <![CDATA[, until ]]> is consumed
}
// One token: its kind plus the number of input characters it covers.
// Tokens carry no offset; consumers recover the text by walking the token
// list and accumulating Length.
struct XmlToken
{
    // What the token is.
    public XmlTokenKind Kind;

    // Number of characters of input covered by this token.
    public short Length;

    // kind:   the token kind.
    // length: number of characters covered; must fit in a short.
    // Throws OverflowException when length does not fit in a short.
    public XmlToken(XmlTokenKind kind, int length)
    {
        Kind = kind;
        // checked: an unchecked cast would silently wrap for tokens longer than
        // short.MaxValue and every offset computed from the token list afterwards
        // would be wrong. Failing here points at the real problem.
        Length = checked((short)length);
    }
}
// XML tokenizer, tokens are designed to match Visual Studio syntax highlighting
class XmlTokenizer
{
string input;
int position = 0;
XmlTokenizerMode mode = XmlTokenizerMode.OutsideElement;
// Convenience entry point: tokenizes input with a fresh tokenizer that starts
// in OutsideElement mode. The mode reached at the end of input is discarded.
public static List<XmlToken> Tokenize(string input)
{
    XmlTokenizerMode startMode = XmlTokenizerMode.OutsideElement;
    return new XmlTokenizer().Tokenize(input, ref startMode);
}
// Tokenizes input starting in the mode given by _mode, and writes the mode
// reached at the end of input back to _mode so the caller can resume with the
// next piece of text.
// Returns the tokens in input order; the last one is always EOF.
public List<XmlToken> Tokenize(string input, ref XmlTokenizerMode _mode)
{
    // Reset the scanning state for this call.
    position = 0;
    mode = _mode;
    this.input = input;

    List<XmlToken> tokens = Tokenize();

    // Hand the final mode back for the next call.
    _mode = mode;
    return tokens;
}
// Pulls tokens from NextToken until it reports EOF.
// Returns every token produced, in order, including the terminating EOF token.
private List<XmlToken> Tokenize()
{
    List<XmlToken> list = new List<XmlToken>();
    XmlToken token;
    do {
        // No per-token text is extracted here: callers slice the input
        // themselves from the accumulated token lengths.
        token = NextToken();
        list.Add(token);
    } while (token.Kind != XmlTokenKind.EOF);
    return list;
}
// Debugging aid: slices input into one string per token by walking the list
// and accumulating token lengths from offset 0.
private List<string> TokensToStrings(List<XmlToken> list, string input)
{
    List<string> pieces = new List<string>();
    int offset = 0;
    for (int index = 0; index < list.Count; index++) {
        int length = list[index].Length;
        pieces.Add(input.Substring(offset, length));
        offset += length;
    }
    return pieces;
}
// debugging function
// The part of the current input that has not been tokenized yet.
public string RemainingText
{
    get
    {
        int remaining = input.Length - position;
        return input.Substring(position, remaining);
    }
}
// Produces the next token by dispatching on the current mode.
// Returns a zero-length EOF token once position has reached the end of input.
private XmlToken NextToken()
{
    bool exhausted = position >= input.Length;
    if (exhausted) {
        return new XmlToken(XmlTokenKind.EOF, 0);
    }

    switch (mode) {
        case XmlTokenizerMode.OutsideElement:
            return TokenizeOutsideElement();
        case XmlTokenizerMode.AfterOpen:
            return TokenizeName(XmlTokenKind.ElementName, XmlTokenizerMode.InsideElement);
        case XmlTokenizerMode.InsideElement:
            return TokenizeInsideElement();
        case XmlTokenizerMode.AfterAttributeName:
            return TokenizeSimple("=", XmlTokenKind.Equals, XmlTokenizerMode.AfterAttributeEquals);
        case XmlTokenizerMode.AfterAttributeEquals:
            return TokenizeAttributeValue();
        case XmlTokenizerMode.InsideComment:
            return TokenizeInsideComment();
        case XmlTokenizerMode.InsideCData:
            return TokenizeInsideCData();
        case XmlTokenizerMode.InsideProcessingInstruction:
            return TokenizeInsideProcessingInstruction();
        default:
            // A mode without a handler: report it in debug builds and stop tokenizing.
            Debug.Fail("missing case");
            return new XmlToken(XmlTokenKind.EOF, 0);
    }
}
// Returns true when character may appear in an element or attribute name.
// XML rule: Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
// Letters and digits are recognized through char.IsLetterOrDigit; CombiningChar
// and Extender are not checked separately.
private bool IsNameCharacter(char character)
{
    return char.IsLetterOrDigit(character)
        || character == '.'
        || character == '-'
        || character == '_'
        || character == ':';
}
// Consumes an attribute value. The character at the current position is taken
// as the delimiter (whatever it is) and the token runs through the next
// occurrence of that same character, inclusive.
// If the delimiter does not occur again in this input (value continues on a
// later line, or the markup is malformed), the token covers the rest of the
// input instead, so the position never moves backwards.
// Always leaves the tokenizer in InsideElement mode.
private XmlToken TokenizeAttributeValue()
{
    Debug.Assert(mode == XmlTokenizerMode.AfterAttributeEquals);
    char delimiter = input[position];
    int closePosition = input.IndexOf(delimiter, position + 1);

    // IndexOf returns -1 when there is no closing delimiter; using that value
    // directly would yield a negative token length and reset position to 0.
    int end = closePosition < 0 ? input.Length : closePosition + 1;

    XmlToken token = new XmlToken(XmlTokenKind.AttributeValue, end - position);
    position = end;
    mode = XmlTokenizerMode.InsideElement;
    return token;
}
// Consumes the run of name characters starting at the current position and
// returns it as a token of the given kind, then switches to nextMode.
// The token is zero-length when the current character is not a name character.
private XmlToken TokenizeName(XmlTokenKind kind, XmlTokenizerMode nextMode)
{
    Debug.Assert(mode == XmlTokenizerMode.AfterOpen || mode == XmlTokenizerMode.InsideElement);

    int end = position;
    while (end < input.Length && IsNameCharacter(input[end])) {
        end++;
    }

    int length = end - position;
    position = end;
    mode = nextMode;
    return new XmlToken(kind, length);
}
// Consumes the run of whitespace starting at the current position and returns
// it as an ElementWhitespace token. The mode is left unchanged.
private XmlToken TokenizeElementWhitespace()
{
    int end = position;
    while (end < input.Length && char.IsWhiteSpace(input[end])) {
        end++;
    }

    int length = end - position;
    position = end;
    return new XmlToken(XmlTokenKind.ElementWhitespace, length);
}
// Returns true when the input, from the current position, begins with text
// (ordinal, case-sensitive comparison). Returns false when fewer than
// text.Length characters remain.
private bool StartsWith(string text)
{
    int needed = text.Length;
    if (position + needed > input.Length) {
        return false;
    }
    return string.CompareOrdinal(input, position, text, 0, needed) == 0;
}
// Handles the next token inside a tag, after the element name: whitespace,
// the closing "/>" or ">", or otherwise an attribute name.
private XmlToken TokenizeInsideElement()
{
    if (char.IsWhiteSpace(input[position])) {
        return TokenizeElementWhitespace();
    }

    // "/>" must be tested before ">" is considered for the same position.
    if (StartsWith("/>")) {
        return TokenizeSimple("/>", XmlTokenKind.SelfClose, XmlTokenizerMode.OutsideElement);
    }

    if (StartsWith(">")) {
        return TokenizeSimple(">", XmlTokenKind.Close, XmlTokenizerMode.OutsideElement);
    }

    // Anything else is treated as the start of an attribute name.
    return TokenizeName(XmlTokenKind.AttributeName, XmlTokenizerMode.AfterAttributeName);
}
//// We break on newlines because that makes it easier for us
//// to ignore the space after comments
//private Token TokenizeWhitespaceContent()
//{
// Debug.Assert(char.IsWhiteSpace(input[position]));
// bool sawNewline = false;
// int i;
// for (i = position; i < input.Length; i++) {
// if (!char.IsWhiteSpace(input[i])) {
// break;
// } else if (input[i] == '\n' || input[i] == '\r') {
// sawNewline = true;
// } else if (sawNewline) {
// break;
// }
// }
// Token token = new Token(TokenKind.WhitespaceContent, i - position);
// position = i;
// return token;
//}
// Consumes text content up to (not including) the next '<' or '&', or to the
// end of input, and returns it as a TextContent token. The mode is unchanged.
private XmlToken TokenizeText()
{
    Debug.Assert(input[position] != '<');
    Debug.Assert(input[position] != '&');
    Debug.Assert(mode == XmlTokenizerMode.OutsideElement);

    int end = position;
    while (end < input.Length && input[end] != '<' && input[end] != '&') {
        end++;
    }

    int length = end - position;
    position = end;
    return new XmlToken(XmlTokenKind.TextContent, length);
}
// Handles the next token between tags: '<' starts some kind of markup,
// '&' starts an entity, anything else is plain text.
private XmlToken TokenizeOutsideElement()
{
    Debug.Assert(mode == XmlTokenizerMode.OutsideElement);

    if (position >= input.Length) {
        return new XmlToken(XmlTokenKind.EOF, 0);
    }

    char current = input[position];
    if (current == '<') {
        return TokenizeOpen();
    }
    if (current == '&') {
        return TokenizeEntity();
    }
    return TokenizeText();
}
// Emits a fixed-text token: advances the position by text.Length, switches to
// nextMode and returns a token of the given kind with that length.
// The input is not checked against text; callers decide when this applies.
private XmlToken TokenizeSimple(string text, XmlTokenKind kind, XmlTokenizerMode nextMode)
{
    int length = text.Length;
    mode = nextMode;
    position += length;
    return new XmlToken(kind, length);
}
// Classifies markup that starts with '<' and emits its opening token.
// The longer prefixes are tried first; a bare '<' is the fallback.
private XmlToken TokenizeOpen()
{
    Debug.Assert(input[position] == '<');

    if (StartsWith("<!--")) {
        return TokenizeSimple("<!--", XmlTokenKind.CommentBegin, XmlTokenizerMode.InsideComment);
    }
    if (StartsWith("<![CDATA[")) {
        return TokenizeSimple("<![CDATA[", XmlTokenKind.CDataBegin, XmlTokenizerMode.InsideCData);
    }
    if (StartsWith("<?")) {
        return TokenizeSimple("<?", XmlTokenKind.OpenProcessingInstruction, XmlTokenizerMode.InsideProcessingInstruction);
    }
    if (StartsWith("</")) {
        return TokenizeSimple("</", XmlTokenKind.OpenClose, XmlTokenizerMode.AfterOpen);
    }
    return TokenizeSimple("<", XmlTokenKind.Open, XmlTokenizerMode.AfterOpen);
}
// Consumes an entity reference starting at '&'. The Entity token runs through
// the next ';' in the input, inclusive (e.g. all four characters of "&gt;").
// When no ';' follows, the '&' alone is returned as a one-character
// TextContent token so that the position always advances.
// The mode is left unchanged (OutsideElement).
private XmlToken TokenizeEntity()
{
    Debug.Assert(mode == XmlTokenizerMode.OutsideElement);
    Debug.Assert(input[position] == '&');

    int semicolon = input.IndexOf(';', position);

    XmlToken token;
    if (semicolon < 0) {
        // IndexOf returned -1: subtracting position would give a negative
        // length and move the position backwards.
        token = new XmlToken(XmlTokenKind.TextContent, 1);
    } else {
        // +1 so the terminating ';' belongs to the entity instead of being
        // emitted as a separate one-character text token.
        token = new XmlToken(XmlTokenKind.Entity, semicolon + 1 - position);
    }

    position += token.Length;
    return token;
}
// Handles the next token inside a processing instruction.
// At "?>": emits CloseProcessingInstruction and returns to OutsideElement.
// Otherwise: emits everything up to the next "?>" as a single TextContent
// token. If "?>" does not occur in the remaining input, the token covers the
// rest of the input and the mode stays InsideProcessingInstruction, so the
// caller can continue with the next line.
private XmlToken TokenizeInsideProcessingInstruction()
{
    Debug.Assert(mode == XmlTokenizerMode.InsideProcessingInstruction);

    // Ordinal: the terminator is matched character-for-character, never by culture rules.
    int tokenend = input.IndexOf("?>", position, StringComparison.Ordinal);
    if (position == tokenend) {
        position += "?>".Length;
        mode = XmlTokenizerMode.OutsideElement;
        return new XmlToken(XmlTokenKind.CloseProcessingInstruction, "?>".Length);
    } else {
        if (tokenend < 0) {
            // Terminator not in this input: -1 would produce a negative
            // length and a negative position. Take the rest of the input.
            tokenend = input.Length;
        }
        XmlToken token = new XmlToken(XmlTokenKind.TextContent, tokenend - position);
        position = tokenend;
        return token;
    }
}
// Handles the next token inside a CDATA section.
// At "]]>": emits CDataEnd and returns to OutsideElement.
// Otherwise: emits everything up to the next "]]>" as a single TextContent
// token. If "]]>" does not occur in the remaining input, the token covers the
// rest of the input and the mode stays InsideCData, so the caller can continue
// with the next line.
private XmlToken TokenizeInsideCData()
{
    Debug.Assert(mode == XmlTokenizerMode.InsideCData);

    // Ordinal: the terminator is matched character-for-character, never by culture rules.
    int tokenend = input.IndexOf("]]>", position, StringComparison.Ordinal);
    if (position == tokenend) {
        position += "]]>".Length;
        mode = XmlTokenizerMode.OutsideElement;
        return new XmlToken(XmlTokenKind.CDataEnd, "]]>".Length);
    } else {
        if (tokenend < 0) {
            // Terminator not in this input: -1 would produce a negative
            // length and a negative position. Take the rest of the input.
            tokenend = input.Length;
        }
        XmlToken token = new XmlToken(XmlTokenKind.TextContent, tokenend - position);
        position = tokenend;
        return token;
    }
}
// Handles the next token inside a comment.
// At "-->": emits CommentEnd and returns to OutsideElement.
// Otherwise: emits everything up to the next "-->" as a single CommentText
// token. If "-->" does not occur in the remaining input (a comment spanning
// several lines), the token covers the rest of the input and the mode stays
// InsideComment, so the caller can continue with the next line.
private XmlToken TokenizeInsideComment()
{
    Debug.Assert(mode == XmlTokenizerMode.InsideComment);

    // Ordinal: the terminator is matched character-for-character, never by culture rules.
    int tokenend = input.IndexOf("-->", position, StringComparison.Ordinal);
    if (position == tokenend) {
        position += "-->".Length;
        mode = XmlTokenizerMode.OutsideElement;
        return new XmlToken(XmlTokenKind.CommentEnd, "-->".Length);
    } else {
        if (tokenend < 0) {
            // Terminator not in this input: -1 would produce a negative
            // length and a negative position. Take the rest of the input.
            tokenend = input.Length;
        }
        XmlToken token = new XmlToken(XmlTokenKind.CommentText, tokenend - position);
        position = tokenend;
        return token;
    }
}
}
}
// Maps a token to the name of the color it should be drawn in.
// Returns null for tokens that need no coloring: whitespace inside a tag, and
// text content that is empty or consists only of whitespace.
// Kinds without an explicit entry (including non-blank text) map to "black".
static string ColorForToken(XmlToken token, string tokenText)
{
    XmlTokenKind kind = token.Kind;

    // Whitespace is left uncolored.
    if (kind == XmlTokenKind.ElementWhitespace) {
        return null;
    }
    if (kind == XmlTokenKind.TextContent && tokenText.Trim().Length == 0) {
        return null;
    }

    switch (kind) {
        case XmlTokenKind.ElementName:
            return "brown";

        case XmlTokenKind.AttributeName:
        case XmlTokenKind.Entity:
            return "red";

        case XmlTokenKind.CommentText:
            return "green";

        // Punctuation and attribute values.
        case XmlTokenKind.Open:
        case XmlTokenKind.OpenClose:
        case XmlTokenKind.Close:
        case XmlTokenKind.SelfClose:
        case XmlTokenKind.CommentBegin:
        case XmlTokenKind.CommentEnd:
        case XmlTokenKind.CDataBegin:
        case XmlTokenKind.CDataEnd:
        case XmlTokenKind.Equals:
        case XmlTokenKind.OpenProcessingInstruction:
        case XmlTokenKind.CloseProcessingInstruction:
        case XmlTokenKind.AttributeValue:
            return "blue";

        default:
            return "black";
    }
}
// Sample usage: tokenize one line, then walk the tokens to recover each
// token's text and color. Keep `mode` around and pass it in again for the
// next line so the tokenizer resumes where it left off.
XmlTokenizerMode mode = XmlTokenizerMode.OutsideElement;
XmlTokenizer tokenizer = new XmlTokenizer();
List<XmlToken> tokens = tokenizer.Tokenize(line, ref mode);

List<string> tokenTexts = new List<string>(tokens.Count);
List<string> colors = new List<string>(tokens.Count);

// Tokens carry only a length, so the offset is accumulated while walking.
int position = 0;
for (int i = 0; i < tokens.Count; i++) {
    XmlToken token = tokens[i];
    string tokenText = line.Substring(position, token.Length);
    tokenTexts.Add(tokenText);
    colors.Add(ColorForToken(token, tokenText));
    position += token.Length;
}