xml syntax highlighting in Silverlight
Several years back I posted an XML tokenizer for syntax highlighting. At the time I didn't post a complete app, since it was part of a larger project. However, someone recently asked me for a little more context for the tokenizer, so I put together a quick Silverlight project showing how to use the tokenizer for syntax highlighting:
<UserControl x:Class="XmlNotepad.MainPage"
    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
    xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
    xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
    mc:Ignorable="d" d:DesignWidth="640" d:DesignHeight="480">
    <!-- NOTE: the xmlns values above are fixed identifiers, not web addresses;
         they must use the "http://" scheme exactly or the XAML parser will not
         recognise the namespaces. -->
    <Grid x:Name="LayoutRoot" Background="AliceBlue">
        <!-- Two equally sized rows: editor on top, highlighted preview below. -->
        <Grid.RowDefinitions>
            <RowDefinition Height="*"/>
            <RowDefinition Height="*"/>
        </Grid.RowDefinitions>
        <!-- Multi-line input; the code-behind subscribes to its TextChanged event. -->
        <TextBox x:Name="box" AcceptsReturn="True" Grid.Row="0"/>
        <!-- Output; the code-behind fills its Inlines with one colored Run per token. -->
        <TextBlock x:Name="block" Grid.Row="1"/>
    </Grid>
</UserControl>
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Windows;
using System.Windows.Controls;
using System.Windows.Documents;
using System.Windows.Input;
using System.Windows.Media;
using System.Windows.Media.Animation;
using System.Windows.Shapes;
namespace XmlNotepad {
// Code-behind for the demo page: every time the text in "box" changes, the
// text is tokenized as XML and re-rendered into "block" as a sequence of
// colored Runs, one per token.
public partial class MainPage : UserControl {
    public MainPage() {
        InitializeComponent();
        box.TextChanged += box_TextChanged;
    }

    // Maps a token kind to its display color. Punctuation and attribute
    // values are blue, element names brown, attribute names and entities red,
    // comment text green; every other kind (text content, whitespace inside
    // an element, EOF) is black.
    // tokenText is not used for the decision; it is kept so the signature
    // stays the same.
    private static Color ColorForToken(XmlToken token, string tokenText) {
        switch (token.Kind) {
            case XmlTokenKind.Open:
            case XmlTokenKind.OpenClose:
            case XmlTokenKind.Close:
            case XmlTokenKind.SelfClose:
            case XmlTokenKind.CommentBegin:
            case XmlTokenKind.CommentEnd:
            case XmlTokenKind.CDataBegin:
            case XmlTokenKind.CDataEnd:
            case XmlTokenKind.Equals:
            case XmlTokenKind.OpenProcessingInstruction:
            case XmlTokenKind.CloseProcessingInstruction:
            case XmlTokenKind.AttributeValue:
                return Colors.Blue;
            case XmlTokenKind.ElementName:
                return Colors.Brown;
            case XmlTokenKind.AttributeName:
            case XmlTokenKind.Entity:
                return Colors.Red;
            case XmlTokenKind.CommentText:
                return Colors.Green;
            default:
                return Colors.Black;
        }
    }

    // Re-highlights the whole text on every change.
    void box_TextChanged(object sender, TextChangedEventArgs e) {
        string xml = box.Text;

        // The whole text is tokenized in one go, so always start outside any element.
        XmlTokenizerMode mode = XmlTokenizerMode.OutsideElement;
        XmlTokenizer tokenizer = new XmlTokenizer();
        List<XmlToken> tokens = tokenizer.Tokenize(xml, ref mode);

        // Clear only after tokenizing, so a tokenizer failure does not leave
        // the display blank.
        block.Inlines.Clear();

        // Tokens carry only a length; their text is recovered by walking the
        // input from the start and slicing token.Length characters at a time.
        int position = 0;
        foreach (XmlToken token in tokens) {
            if (token.Length == 0) {
                continue; // e.g. the trailing EOF token: nothing to display
            }
            string tokenText = xml.Substring(position, token.Length);
            position += token.Length;

            Run run = new Run();
            run.Foreground = new SolidColorBrush(ColorForToken(token, tokenText));
            run.Text = tokenText;
            block.Inlines.Add(run);
        }
    }
}
}
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
namespace XmlNotepad
{
/*
* this file implements a mostly correct XML tokenizer. The token boundaries
* have been chosen to match Visual Studio syntax highlighting, so a few of
* the boundaries are a little weird (especially comments). Known issues:
*
* Doesn't handle DTD's
* mediocre handling of processing instructions <? ?> -- it won't crash,
* but the token boundaries are wrong
* Doesn't enforce correct XML
* there are a few cases where it will die if given invalid XML
*
*
* This tokenizer has been designed to be restartable, so you can tokenize
* one line of XML at a time.
*/
//enum TokenColors
//{
// Punctuation, StringLiteral, ElementName, AttributeName, Comment, Normal
//}
// The kind of a single token produced by the tokenizer.
// Stored as a short; the order of the members defines their numeric values.
enum XmlTokenKind : short
{
    // "<"
    Open,
    // ">"
    Close,
    // "/>"
    SelfClose,
    // "</"
    OpenClose,
    // the name that follows "<" or "</"
    ElementName,
    // whitespace inside a tag, i.e. between attributes
    ElementWhitespace,
    // the name of an attribute
    AttributeName,
    // the "=" between an attribute name and its value
    Equals,
    // an attribute value
    AttributeValue,
    // "<!--"
    CommentBegin,
    // the text between "<!--" and "-->"
    CommentText,
    // "-->"
    CommentEnd,
    // an entity reference starting with "&", e.g. "&gt;"
    Entity,
    // "<?"
    OpenProcessingInstruction,
    // "?>"
    CloseProcessingInstruction,
    // "<![CDATA["
    CDataBegin,
    // "]]>"
    CDataEnd,
    // character data outside of tags
    TextContent,
    // (There is no separate WhitespaceContent kind: whitespace outside of
    // tags is part of TextContent.)
    // end of the input
    EOF,
}
// The tokenizer's state between tokens. It is handed back to the caller at
// the end of the input so that tokenizing can be resumed on the next line of
// XML from the same state.
enum XmlTokenizerMode
{
    // after "<!--", before the matching "-->"
    InsideComment,
    // after "<?", before the matching "?>"
    InsideProcessingInstruction,
    // directly after "<" or "</": an element name comes next
    AfterOpen,
    // directly after an attribute name: "=" comes next
    AfterAttributeName,
    // directly after "=": the attribute value comes next
    AfterAttributeEquals,
    // inside a tag, after the element name: before an attribute, ">" or "/>"
    InsideElement,
    // not inside any tag, comment, processing instruction or CDATA section
    OutsideElement,
    // after "<![CDATA[", before the matching "]]>"
    InsideCData,
}
// One token: its kind plus the number of characters it covers. A token does
// not store its own text or offset; callers recover the text by summing the
// lengths of the preceding tokens.
struct XmlToken
{
    public XmlTokenKind Kind;

    // Number of characters covered by this token.
    // Stored as int: the previous short storage silently wrapped to a
    // negative value for any token longer than 32767 characters (for example
    // a long run of text or a large CDATA section).
    public int Length;

    public XmlToken(XmlTokenKind kind, int length)
    {
        Kind = kind;
        Length = length;
    }
}
// XML tokenizer, tokens are designed to match Visual Studio syntax highlighting.
//
// The tokenizer is restartable: Tokenize(input, ref mode) starts in the given
// mode and hands back the mode it ended in, so a document can be tokenized
// one line at a time.
//
// Every token covers a contiguous run of the input and the token lengths add
// up to the input length. Malformed or incomplete XML never throws and never
// loops: each step either consumes at least one character or moves to a mode
// whose next step does.
class XmlTokenizer
{
    // Characters that end a run of text content outside of tags.
    static readonly char[] TextTerminators = { '<', '&' };

    string input;
    int position = 0;
    XmlTokenizerMode mode = XmlTokenizerMode.OutsideElement;

    // Tokenizes input starting outside of any element.
    // The returned list always ends with a zero-length EOF token.
    public static List<XmlToken> Tokenize(string input)
    {
        XmlTokenizerMode mode = XmlTokenizerMode.OutsideElement;
        XmlTokenizer tokenizer = new XmlTokenizer();
        return tokenizer.Tokenize(input, ref mode);
    }

    // Tokenizes input starting in _mode; on return _mode holds the mode the
    // tokenizer was in at the end of the input, ready to be passed in again
    // for the next chunk. The returned list always ends with a zero-length
    // EOF token. Throws ArgumentNullException if input is null.
    public List<XmlToken> Tokenize(string input, ref XmlTokenizerMode _mode)
    {
        if (input == null)
            throw new ArgumentNullException("input");

        this.input = input;
        this.mode = _mode;
        this.position = 0;
        List<XmlToken> result = Tokenize();
        _mode = this.mode;
        return result;
    }

    // Pulls tokens until (and including) EOF.
    private List<XmlToken> Tokenize()
    {
        List<XmlToken> list = new List<XmlToken>();
        XmlToken token;
        do {
            token = NextToken();
            list.Add(token);
        } while (token.Kind != XmlTokenKind.EOF);
        return list;
    }

    // debugging function: the text of each token, in order
    private static List<string> TokensToStrings(List<XmlToken> list, string input)
    {
        List<string> output = new List<string>(list.Count);
        int offset = 0;
        foreach (XmlToken token in list) {
            output.Add(input.Substring(offset, token.Length));
            offset += token.Length;
        }
        return output;
    }

    // debugging function: the input that has not been tokenized yet
    public string RemainingText
    {
        get { return input == null ? string.Empty : input.Substring(position); }
    }

    // Produces the next token according to the current mode.
    private XmlToken NextToken()
    {
        if (position >= input.Length)
            return new XmlToken(XmlTokenKind.EOF, 0);

        switch (mode) {
            case XmlTokenizerMode.AfterAttributeEquals:
                return TokenizeAttributeValue();
            case XmlTokenizerMode.AfterAttributeName:
                return TokenizeAfterAttributeName();
            case XmlTokenizerMode.AfterOpen:
                return TokenizeName(XmlTokenKind.ElementName, XmlTokenizerMode.InsideElement);
            case XmlTokenizerMode.InsideCData:
                return TokenizeInsideCData();
            case XmlTokenizerMode.InsideComment:
                return TokenizeInsideComment();
            case XmlTokenizerMode.InsideElement:
                return TokenizeInsideElement();
            case XmlTokenizerMode.InsideProcessingInstruction:
                return TokenizeInsideProcessingInstruction();
            case XmlTokenizerMode.OutsideElement:
                return TokenizeOutsideElement();
            default:
                throw new InvalidOperationException("missing case");
        }
    }

    private static bool IsNameCharacter(char character)
    {
        // XML rule: Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
        // (CombiningChar and Extender are not checked here.)
        return char.IsLetterOrDigit(character)
            || character == '.' || character == '-' || character == '_' || character == ':';
    }

    // After an attribute name: expects "=", optionally preceded by whitespace.
    private XmlToken TokenizeAfterAttributeName()
    {
        Debug.Assert(mode == XmlTokenizerMode.AfterAttributeName);
        char current = input[position];
        if (char.IsWhiteSpace(current))
            return TokenizeElementWhitespace(); // XML allows whitespace before "="
        if (current == '=')
            return TokenizeSimple("=", XmlTokenKind.Equals, XmlTokenizerMode.AfterAttributeEquals);

        // Attribute without a value (invalid XML): carry on as if inside the tag.
        mode = XmlTokenizerMode.InsideElement;
        return TokenizeInsideElement();
    }

    // After "=": expects a quoted value, optionally preceded by whitespace.
    private XmlToken TokenizeAttributeValue()
    {
        Debug.Assert(mode == XmlTokenizerMode.AfterAttributeEquals);
        char current = input[position];
        if (char.IsWhiteSpace(current))
            return TokenizeElementWhitespace(); // XML allows whitespace after "="
        if (current == '"' || current == '\'')
            return TokenizeQuotedValue();

        // No opening quote (invalid XML): carry on as if inside the tag.
        mode = XmlTokenizerMode.InsideElement;
        return TokenizeInsideElement();
    }

    // Tokenizes a quoted string starting at the current quote character, up
    // to and including the matching closing quote. If the closing quote is
    // missing, the rest of the input becomes the value. (There is no mode for
    // "inside an attribute value", so a value continued on the next chunk is
    // not recognised as such.)
    private XmlToken TokenizeQuotedValue()
    {
        char quote = input[position];
        Debug.Assert(quote == '"' || quote == '\'');
        int closePosition = input.IndexOf(quote, position + 1);
        int end = closePosition < 0 ? input.Length : closePosition + 1;
        XmlToken token = new XmlToken(XmlTokenKind.AttributeValue, end - position);
        position = end;
        mode = XmlTokenizerMode.InsideElement;
        return token;
    }

    // Tokenizes a (possibly empty) run of name characters and switches to nextMode.
    private XmlToken TokenizeName(XmlTokenKind kind, XmlTokenizerMode nextMode)
    {
        Debug.Assert(mode == XmlTokenizerMode.AfterOpen || mode == XmlTokenizerMode.InsideElement);
        int i = position;
        while (i < input.Length && IsNameCharacter(input[i])) {
            i++;
        }
        XmlToken token = new XmlToken(kind, i - position);
        mode = nextMode;
        position = i;
        return token;
    }

    // Tokenizes a run of whitespace inside a tag; the mode is left unchanged.
    private XmlToken TokenizeElementWhitespace()
    {
        int i = position;
        while (i < input.Length && char.IsWhiteSpace(input[i])) {
            i++;
        }
        XmlToken token = new XmlToken(XmlTokenKind.ElementWhitespace, i - position);
        position = i;
        return token;
    }

    // True if the input at the current position starts with text (ordinal comparison).
    private bool StartsWith(string text)
    {
        if (position + text.Length > input.Length)
            return false;
        return string.CompareOrdinal(input, position, text, 0, text.Length) == 0;
    }

    // Inside a tag, after the element name. Always consumes at least one character.
    private XmlToken TokenizeInsideElement()
    {
        char current = input[position];
        if (char.IsWhiteSpace(current))
            return TokenizeElementWhitespace();
        if (StartsWith("/>"))
            return TokenizeSimple("/>", XmlTokenKind.SelfClose, XmlTokenizerMode.OutsideElement);
        if (current == '>')
            return TokenizeSimple(">", XmlTokenKind.Close, XmlTokenizerMode.OutsideElement);
        if (IsNameCharacter(current))
            return TokenizeName(XmlTokenKind.AttributeName, XmlTokenizerMode.AfterAttributeName);

        // Everything below is recovery from invalid XML.
        if (current == '<')
            return TokenizeOpen(); // previous tag was never closed; start the new one
        if (current == '=')
            return TokenizeSimple("=", XmlTokenKind.Equals, XmlTokenizerMode.AfterAttributeEquals);
        if (current == '"' || current == '\'')
            return TokenizeQuotedValue();

        // Any other stray character: emit it as a one-character text token.
        XmlToken token = new XmlToken(XmlTokenKind.TextContent, 1);
        position++;
        return token;
    }

    // Tokenizes character data up to the next '<' or '&' (or the end of the input).
    private XmlToken TokenizeText()
    {
        Debug.Assert(input[position] != '<');
        Debug.Assert(input[position] != '&');
        Debug.Assert(mode == XmlTokenizerMode.OutsideElement);
        int end = input.IndexOfAny(TextTerminators, position);
        if (end < 0)
            end = input.Length;
        XmlToken token = new XmlToken(XmlTokenKind.TextContent, end - position);
        position = end;
        return token;
    }

    private XmlToken TokenizeOutsideElement()
    {
        Debug.Assert(mode == XmlTokenizerMode.OutsideElement);
        if (position >= input.Length)
            return new XmlToken(XmlTokenKind.EOF, 0);
        switch (input[position]) {
            case '<':
                return TokenizeOpen();
            case '&':
                return TokenizeEntity();
            default:
                return TokenizeText();
        }
    }

    // Emits a fixed-text token of the given kind and switches to nextMode.
    // The caller is responsible for having checked that text is at the
    // current position.
    private XmlToken TokenizeSimple(string text, XmlTokenKind kind, XmlTokenizerMode nextMode)
    {
        XmlToken token = new XmlToken(kind, text.Length);
        position += text.Length;
        mode = nextMode;
        return token;
    }

    // Tokenizes whatever construct starts with '<' at the current position.
    private XmlToken TokenizeOpen()
    {
        Debug.Assert(input[position] == '<');
        if (StartsWith("<!--")) {
            return TokenizeSimple("<!--", XmlTokenKind.CommentBegin, XmlTokenizerMode.InsideComment);
        } else if (StartsWith("<![CDATA[")) {
            return TokenizeSimple("<![CDATA[", XmlTokenKind.CDataBegin, XmlTokenizerMode.InsideCData);
        } else if (StartsWith("<?")) {
            return TokenizeSimple("<?", XmlTokenKind.OpenProcessingInstruction, XmlTokenizerMode.InsideProcessingInstruction);
        } else if (StartsWith("</")) {
            return TokenizeSimple("</", XmlTokenKind.OpenClose, XmlTokenizerMode.AfterOpen);
        } else {
            return TokenizeSimple("<", XmlTokenKind.Open, XmlTokenizerMode.AfterOpen);
        }
    }

    // Tokenizes an entity or character reference such as "&gt;" or "&#x20;".
    // The token covers the '&', the following name characters (and '#'), and
    // the terminating ';' when present. A '&' that is not followed by a
    // well-formed reference still yields a token of at least one character.
    private XmlToken TokenizeEntity()
    {
        Debug.Assert(mode == XmlTokenizerMode.OutsideElement);
        Debug.Assert(input[position] == '&');
        int i = position + 1;
        while (i < input.Length && (IsNameCharacter(input[i]) || input[i] == '#')) {
            i++;
        }
        if (i < input.Length && input[i] == ';') {
            i++; // the ';' belongs to the entity
        }
        XmlToken token = new XmlToken(XmlTokenKind.Entity, i - position);
        position = i;
        return token;
    }

    // Shared logic for comments, CDATA sections and processing instructions.
    // If the terminator is at the current position it is emitted as endKind
    // and the mode returns to OutsideElement. Otherwise everything up to the
    // terminator is emitted as contentKind. When the terminator is missing
    // (e.g. it is on a later line), the rest of the input is emitted as
    // contentKind and the mode is kept, so the next chunk continues inside
    // the section.
    private XmlToken TokenizeUntilTerminator(string terminator, XmlTokenKind contentKind, XmlTokenKind endKind)
    {
        int tokenend = input.IndexOf(terminator, position, StringComparison.Ordinal);
        if (tokenend == position) {
            return TokenizeSimple(terminator, endKind, XmlTokenizerMode.OutsideElement);
        }
        if (tokenend < 0) {
            tokenend = input.Length;
        }
        XmlToken token = new XmlToken(contentKind, tokenend - position);
        position = tokenend;
        return token;
    }

    private XmlToken TokenizeInsideProcessingInstruction()
    {
        Debug.Assert(mode == XmlTokenizerMode.InsideProcessingInstruction);
        return TokenizeUntilTerminator("?>", XmlTokenKind.TextContent, XmlTokenKind.CloseProcessingInstruction);
    }

    private XmlToken TokenizeInsideCData()
    {
        Debug.Assert(mode == XmlTokenizerMode.InsideCData);
        return TokenizeUntilTerminator("]]>", XmlTokenKind.TextContent, XmlTokenKind.CDataEnd);
    }

    private XmlToken TokenizeInsideComment()
    {
        Debug.Assert(mode == XmlTokenizerMode.InsideComment);
        return TokenizeUntilTerminator("-->", XmlTokenKind.CommentText, XmlTokenKind.CommentEnd);
    }
}
}
If you're doing an interactive editor, you'll want to fix a few straightforward bugs in the tokenizer for handling invalid XML. Enjoy!