Partager via


Ease of Maintenance of LINQ Code (2)

In two previous posts, I developed a somewhat involved query to search through a word processing document for style names and/or paragraph content.  This is a query that I’m developing for the PowerTools for Open XML project.  In those posts, as I evolved the query, I showed each iteration of it, highlighting the changes I made.  This post continues modifying that query:

  • This blog is inactive.
    New blog: EricWhite.com/blog

    Blog TOC

  • First, I added a few tests that provide complete code coverage given the attached source document.

  • Second, I changed the way that I handled XNames in the example, using pre-atomized XName objects in a static class.  I highlighted these changes in green.  I detailed this approach in this post.  I should have developed this query using the pre-atomization approach from the start.  My conscience started bothering me, so I had to correct the code.

  • Third, I took Keith’s suggestion from the previous post, and re-wrote ContainsAnyStyles and ContainsAnyContent.  The new versions are much smaller and cleaner.  In the process, I did a minor refactoring to pre-convert search strings to lower case when the search is case insensitive and not using regular expressions.

Hey, if anyone else has any additional suggestions for this query, I’m interested!  :-)

Here is the evolved query:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Xml;
using System.Xml.Linq;
using DocumentFormat.OpenXml.Packaging;

/// <summary>
/// Helper extension methods used by the search query: string aggregation over
/// sequences and cached loading of an Open XML part as an XDocument.
/// </summary>
public static class LocalExtensions
{
    /// <summary>
    /// Projects every item of <paramref name="source"/> to a string with
    /// <paramref name="func"/> and concatenates the results with no separator.
    /// </summary>
    /// <returns>The concatenated string; empty when the sequence is empty.</returns>
    public static string StringConcatenate<T>(this IEnumerable<T> source,
        Func<T, string> func)
    {
        StringBuilder sb = new StringBuilder();
        foreach (T item in source)
        {
            sb.Append(func(item));
        }
        return sb.ToString();
    }

    /// <summary>
    /// Projects every item of <paramref name="source"/> to a string with
    /// <paramref name="func"/> and joins the results, placing
    /// <paramref name="separator"/> between consecutive items.
    /// </summary>
    /// <returns>The joined string; empty when the sequence is empty.</returns>
    public static string StringConcatenate<T>(this IEnumerable<T> source,
        Func<T, string> func, string separator)
    {
        StringBuilder sb = new StringBuilder();
        bool isFirst = true;
        foreach (T item in source)
        {
            // Write the separator *between* items instead of appending it after
            // every item and trimming afterwards: the trim-by-length approach
            // returned the separator itself for a single item projecting to "".
            if (!isFirst)
            {
                sb.Append(separator);
            }
            sb.Append(func(item));
            isFirst = false;
        }
        return sb.ToString();
    }

    /// <summary>
    /// Returns the content of <paramref name="part"/> as an XDocument. The
    /// document is loaded from the part's stream on first use and attached to
    /// the part as an annotation, so later calls return the same instance.
    /// </summary>
    public static XDocument GetXDocument(this OpenXmlPart part)
    {
        XDocument xdoc = part.Annotation<XDocument>();
        if (xdoc != null)
        {
            return xdoc;
        }

        using (StreamReader sr = new StreamReader(part.GetStream()))
        using (XmlReader xr = XmlReader.Create(sr))
        {
            xdoc = XDocument.Load(xr);
        }

        // Cache the parsed document on the part for subsequent callers.
        part.AddAnnotation(xdoc);
        return xdoc;
    }
}

/// <summary>
/// Pre-atomized XName objects for the WordprocessingML elements and attributes
/// used by the search query, so names are built once instead of per comparison.
/// </summary>
public static class W
{
    // The WordprocessingML main namespace uses the "http" scheme. Namespace
    // names are compared as literal strings, so an "https" variant would not
    // match any element or attribute in a document.
    public static readonly XNamespace w =
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    public static readonly XName style = w + "style";
    public static readonly XName type = w + "type";
    public static readonly XName styleId = w + "styleId";
    public static readonly XName name = w + "name";
    public static readonly XName val = w + "val";
    public static readonly XName basedOn = w + "basedOn";
    public static readonly XName r = w + "r";
    public static readonly XName ins = w + "ins";
    // "default" is not a valid identifier, so must use _default
    public static readonly XName _default = w + "default";
    public static readonly XName body = w + "body";
    public static readonly XName pPr = w + "pPr";
    public static readonly XName pStyle = w + "pStyle";
    public static readonly XName p = w + "p";
    public static readonly XName t = w + "t";
}

/// <summary>
/// Searches the block-level content of a word processing document by paragraph
/// style (including inherited styles) and/or by text content, and runs a small
/// set of self-checking tests against Test.docx.
/// </summary>
class Program
{
    /// <summary>
    /// Returns true when at least one entry of <paramref name="stylesToSearch"/>
    /// equals one of <paramref name="searchStrings"/>.
    /// </summary>
    static bool ContainsAnyStyles(IEnumerable<string> stylesToSearch,
        IEnumerable<string> searchStrings)
    {
        return stylesToSearch.Intersect(searchStrings).Any();
    }

    /// <summary>
    /// Returns true when <paramref name="stringToSearch"/> matches any of the
    /// search criteria.
    /// </summary>
    /// <param name="stringToSearch">The text to examine.</param>
    /// <param name="searchStrings">Plain search strings. For a case-insensitive,
    /// non-regex search these must already be upper-cased with
    /// ToUpperInvariant, as SearchInDocument does.</param>
    /// <param name="regularExpressions">Pre-built expressions; used only when
    /// <paramref name="isRegularExpression"/> is true.</param>
    /// <param name="isRegularExpression">Selects regex matching over substring matching.</param>
    /// <param name="caseInsensitive">Selects case-insensitive substring matching.</param>
    static bool ContainsAnyContent(string stringToSearch, IEnumerable<string> searchStrings,
        IEnumerable<Regex> regularExpressions, bool isRegularExpression, bool caseInsensitive)
    {
        if (isRegularExpression)
        {
            return regularExpressions.Any(r => r.IsMatch(stringToSearch));
        }

        if (caseInsensitive)
        {
            // Normalize once, outside the lambda, and use the invariant culture
            // so the result does not depend on the current thread culture.
            string normalized = stringToSearch.ToUpperInvariant();
            return searchStrings.Any(s => normalized.Contains(s));
        }

        return searchStrings.Any(s => stringToSearch.Contains(s));
    }

    /// <summary>
    /// Returns the root element of the style definitions part, or null when the
    /// document has no style definitions part.
    /// </summary>
    static XElement GetStylesRoot(WordprocessingDocument doc)
    {
        var stylesPart = doc.MainDocumentPart.StyleDefinitionsPart;
        return stylesPart != null ? stylesPart.GetXDocument().Root : null;
    }

    /// <summary>
    /// Finds the first paragraph style element whose styleId attribute equals
    /// <paramref name="styleId"/>; returns null when there is none.
    /// </summary>
    static XElement FindParagraphStyle(XElement stylesRoot, string styleId)
    {
        if (stylesRoot == null)
        {
            return null;
        }

        return stylesRoot
            .Elements(W.style)
            .FirstOrDefault(e => (string)e.Attribute(W.type) == "paragraph" &&
                (string)e.Attribute(W.styleId) == styleId);
    }

    /// <summary>
    /// Returns the value of the name child element of a style element, or null
    /// when the style has no name.
    /// </summary>
    static string GetStyleName(XElement style)
    {
        return (string)style
            .Elements(W.name)
            .Attributes(W.val)
            .FirstOrDefault();
    }

    /// <summary>
    /// Yields <paramref name="styleId"/>, the name of that style, and then the
    /// id and name of every style reached by following basedOn references.
    /// </summary>
    /// <remarks>
    /// The walk stops when a style has no basedOn reference, when the referenced
    /// style is not defined (its id is still yielded), or when a style id is
    /// seen a second time, so a cyclic basedOn chain cannot loop forever.
    /// </remarks>
    static IEnumerable<string> GetAllStyleIdsAndNames(WordprocessingDocument doc, string styleId)
    {
        yield return styleId;

        XElement stylesRoot = GetStylesRoot(doc);
        var visitedIds = new HashSet<string>();
        visitedIds.Add(styleId);

        XElement style = FindParagraphStyle(stylesRoot, styleId);
        if (style != null)
        {
            string styleName = GetStyleName(style);
            if (styleName != null)
            {
                yield return styleName;
            }
        }

        while (style != null)
        {
            var basedOn = (string)style
                .Elements(W.basedOn)
                .Attributes(W.val)
                .FirstOrDefault();

            if (basedOn == null)
            {
                yield break;
            }

            // Add returns false when the id was already visited: a cycle.
            if (!visitedIds.Add(basedOn))
            {
                yield break;
            }

            yield return basedOn;

            // The referenced style may be missing from the styles part; the
            // loop condition ends the walk in that case instead of dereferencing null.
            style = FindParagraphStyle(stylesRoot, basedOn);
            if (style != null)
            {
                string basedOnStyleName = GetStyleName(style);
                if (basedOnStyleName != null)
                {
                    yield return basedOnStyleName;
                }
            }
        }
    }

    /// <summary>
    /// Concatenates the text of the t elements found under the r and ins
    /// children of a single paragraph element.
    /// </summary>
    static string GetParagraphText(XElement paragraph)
    {
        return paragraph
            .Elements()
            .Where(z => z.Name == W.r || z.Name == W.ins)
            .Descendants(W.t)
            .StringConcatenate(element => (string)element);
    }

    /// <summary>
    /// Returns the text of a child element of the body: the paragraph text for
    /// a p element, otherwise the text of all descendant paragraphs joined with
    /// Environment.NewLine.
    /// </summary>
    static string GetBlockText(XElement block)
    {
        if (block.Name == W.p)
        {
            return GetParagraphText(block);
        }

        return block
            .Descendants(W.p)
            .StringConcatenate(p => GetParagraphText(p), Environment.NewLine);
    }

    /// <summary>
    /// Searches the child elements of the document body and returns the
    /// zero-based indexes of those that match.
    /// </summary>
    /// <param name="doc">The open document to search.</param>
    /// <param name="styleSearchString">Style ids or names to match against each
    /// element's style and inherited styles; null to skip the style criterion.</param>
    /// <param name="contentSearchString">Strings or regular expression patterns
    /// to match against each element's text; null to skip the content criterion.</param>
    /// <param name="isRegularExpression">Treat content search strings as regular expressions.</param>
    /// <param name="caseInsensitive">Ignore case when matching content.</param>
    /// <returns>
    /// Indexes matching every supplied criterion, in document order; null when
    /// both criteria are null.
    /// </returns>
    static int[] SearchInDocument(WordprocessingDocument doc,
        IEnumerable<string> styleSearchString, IEnumerable<string> contentSearchString,
        bool isRegularExpression, bool caseInsensitive)
    {
        Regex[] regularExpressions = null;
        if (isRegularExpression && contentSearchString != null)
        {
            RegexOptions options = RegexOptions.Compiled;
            if (caseInsensitive)
            {
                options |= RegexOptions.IgnoreCase;
            }
            regularExpressions = contentSearchString
                .Select(s => new Regex(s, options))
                .ToArray();
        }

        string[] contentSearchStringToUse = null;
        if (contentSearchString != null)
        {
            // Pre-normalize once for plain case-insensitive searches; upper case
            // with the invariant culture matches what ContainsAnyContent expects.
            if (!isRegularExpression && caseInsensitive)
            {
                contentSearchStringToUse = contentSearchString
                    .Select(s => s.ToUpperInvariant())
                    .ToArray();
            }
            else
            {
                contentSearchStringToUse = contentSearchString.ToArray();
            }
        }

        // Materialize once so the caller's sequence is not re-enumerated per element.
        string[] stylesToFind = styleSearchString != null ? styleSearchString.ToArray() : null;

        // Style id of the default paragraph style; null when the document does
        // not define one (previously this case threw from First()).
        XElement stylesRoot = GetStylesRoot(doc);
        string defaultStyleName = stylesRoot == null
            ? null
            : stylesRoot
                .Elements(W.style)
                .Where(style =>
                    (string)style.Attribute(W.type) == "paragraph" &&
                    (string)style.Attribute(W._default) == "1")
                .Select(style => (string)style.Attribute(W.styleId))
                .FirstOrDefault();

        // One entry per child of the body, materialized so that the two
        // searches below do not each rebuild the list.
        var blocks = doc
            .MainDocumentPart
            .GetXDocument()
            .Root
            .Element(W.body)
            .Elements()
            .Select((element, index) =>
            {
                var styleNode = element
                    .Elements(W.pPr)
                    .Elements(W.pStyle)
                    .FirstOrDefault();
                return new
                {
                    Element = element,
                    Index = index,
                    StyleName = styleNode != null
                        ? (string)styleNode.Attribute(W.val)
                        : defaultStyleName
                };
            })
            .ToList();

        int[] styleMatches = null;
        if (stylesToFind != null)
        {
            styleMatches = blocks
                .Where(b => ContainsAnyStyles(
                    GetAllStyleIdsAndNames(doc, b.StyleName), stylesToFind))
                .Select(b => b.Index)
                .ToArray();
        }

        int[] contentMatches = null;
        if (contentSearchStringToUse != null)
        {
            contentMatches = blocks
                .Where(b => ContainsAnyContent(GetBlockText(b.Element),
                    contentSearchStringToUse, regularExpressions,
                    isRegularExpression, caseInsensitive))
                .Select(b => b.Index)
                .ToArray();
        }

        // Both criteria supplied: an element must satisfy both.
        if (styleMatches != null && contentMatches != null)
        {
            return styleMatches.Intersect(contentMatches).ToArray();
        }

        return contentMatches ?? styleMatches;
    }

    /// <summary>
    /// Opens <paramref name="filename"/> read-only and searches it; see the
    /// overload taking a WordprocessingDocument for the meaning of the arguments.
    /// </summary>
    static int[] SearchInDocument(string filename,
        IEnumerable<string> styleSearchString, IEnumerable<string> contentSearchString,
        bool isRegularExpression, bool caseInsensitive)
    {
        using (WordprocessingDocument doc =
            WordprocessingDocument.Open(filename, false))
        {
            return SearchInDocument(doc, styleSearchString, contentSearchString,
                isRegularExpression, caseInsensitive);
        }
    }

    /// <summary>
    /// Convenience overload for a single style string and a single content
    /// string; a null argument skips that criterion.
    /// </summary>
    static int[] SearchInDocument(string filename, string styleSearchString,
        string contentSearchString, bool isRegularExpression, bool caseInsensitive)
    {
        return SearchInDocument(filename,
            styleSearchString != null ? new List<string>() { styleSearchString } : null,
            contentSearchString != null ? new List<string>() { contentSearchString } : null,
            isRegularExpression, caseInsensitive);
    }

    /// <summary>
    /// Prints the test title, runs the search, prints each returned index, then
    /// prints "Passed" or "Failed" depending on whether the result equals
    /// <paramref name="expected"/>, followed by a blank line.
    /// </summary>
    static void RunTest(string title, Func<int[]> search, int[] expected)
    {
        // The title is written before the search runs so it is visible even
        // when the search throws.
        Console.WriteLine(title);
        int[] results = search();
        foreach (var i in results)
        {
            Console.WriteLine(i);
        }
        Console.WriteLine(results.SequenceEqual(expected) ? "Passed" : "Failed");
        Console.WriteLine();
    }

    static void Main(string[] args)
    {
        RunTest("Test 1",
            () => SearchInDocument(
                "Test.docx", new[] { "Normal" }, new[] { "h.*o", "aaa" }, true, false),
            new[] { 7, 10 });

        RunTest("Test 2",
            () => SearchInDocument(
                "Test.docx", new[] { "NotAStyle" }, new[] { "h.*o", "aaa" }, true, false),
            new int[] { });

        RunTest("Test 3",
            () => SearchInDocument(
                "Test.docx", new[] { "Heading1" }, null, true, false),
            new int[] { 0 });

        RunTest("Test 4",
            () => SearchInDocument(
                "Test.docx", new[] { "Normal" }, new[] { "h.*o", "aaa" }, true, true),
            new int[] { 0, 6, 7, 8, 10 });

        RunTest("Test 5",
            () => SearchInDocument(
                "Test.docx", null, new[] { "hello", "aaa" }, false, false),
            new int[] { 7, 10 });

        RunTest("Test 6",
            () => SearchInDocument(
                "Test.docx", null, new[] { "hello", "aaa" }, false, true),
            new int[] { 0, 6, 7, 8, 10 });

        RunTest("Test 7",
            () => SearchInDocument("Test.docx", "Heading1", "Aaa", false, false),
            new int[] { 0 });
    }
}

Code is attached.

Search-Paragraphs.zip

Comments

  • Anonymous
    August 11, 2009
    "...did a minor refactoring to pre-convert search strings to lower case when the search is case insensitive..." For best practice, shouldn't that be "upper case"? http://msdn.microsoft.com/en-us/library/bb386042.aspx "Strings should be normalized to uppercase. There is a small group of characters that when converted to lowercase cannot make a round trip."

  • Anonymous
    August 12, 2009
    Great point, John.  You're right, upper case is better. -Eric