SharePoint 2013 Content Enrichment: Regular Expression Data Extraction
There are a number of posts on getting started with writing a Content Enrichment Web Service (CEWS). I found https://msdn.microsoft.com/en-us/library/office/jj163982.aspx to be a great starting point. I recently needed to write a CEWS application to extract regex patterns from a managed property, and I thought I would share it as a template.
Define InProperty to be the name of the managed property you would like to extract from, and OutProperty to be the name of the managed property that will hold the extracted data.
using System;
using System.Collections.Generic;
using System.IO;
using Microsoft.Office.Server.Search.ContentProcessingEnrichment;
using Microsoft.Office.Server.Search.ContentProcessingEnrichment.PropertyTypes;
using System.Text.RegularExpressions;
namespace RegExContentProcessingEnrichmentService
{
    /// <summary>
    /// Content enrichment service that runs a regular expression over the text of one
    /// managed property (<c>InProperty</c>) and writes every captured value to another,
    /// multi-valued managed property (<c>OutProperty</c>).
    /// </summary>
    public class RegExContentProcessingEnrichmentService : IContentProcessingEnrichmentService
    {
        // Defines the error code for managed properties with an unexpected type.
        private const int UnexpectedType = 1;

        // Defines the error code for encountering unexpected exceptions.
        private const int UnexpectedError = 2;

        // Name of the managed property the extracted data is written to.
        private const string OutProperty = "out";

        // Name of the managed property whose text the pattern is matched against.
        private const string InProperty = "in";

        // Captures the text between the double quotes of every Value="..." occurrence.
        // Built once and reused for every item instead of being re-specified per call.
        private static readonly Regex ValuePattern =
            new Regex("Value=\"([^\"]*)\"", RegexOptions.Singleline);

        // Result holder reused across calls; it is reset at the top of ProcessItem.
        private readonly ProcessedItem processedItemHolder = new ProcessedItem
        {
            ItemProperties = new List<AbstractProperty>()
        };

        /// <summary>
        /// Extracts all pattern matches from the input managed property of
        /// <paramref name="item"/> and returns them in the output managed property.
        /// </summary>
        /// <param name="item">The item whose properties are inspected.</param>
        /// <returns>
        /// The processed item. <c>ErrorCode</c> is 0 on success, <c>UnexpectedType</c> when
        /// the input property holds a non-string value, and <c>UnexpectedError</c> when any
        /// exception is thrown. The output property is added only when the input property
        /// was found with a non-null string value; its list is empty if nothing matched.
        /// </returns>
        public ProcessedItem ProcessItem(Item item)
        {
            // Reset the shared holder so nothing leaks over from a previous call.
            processedItemHolder.ErrorCode = 0;
            processedItemHolder.ItemProperties.Clear();

            try
            {
                string input = null;

                // Look for the input property among everything that was sent with the item.
                foreach (var property in item.ItemProperties)
                {
                    if (!property.Name.Equals(InProperty, StringComparison.Ordinal))
                    {
                        continue;
                    }

                    object rawValue = property.ObjectValue;
                    string text = rawValue as string;

                    // A non-null value that is not a string is a type mismatch; report it
                    // with the dedicated error code rather than letting a cast throw.
                    if (rawValue != null && text == null)
                    {
                        processedItemHolder.ErrorCode = UnexpectedType;
                        return processedItemHolder;
                    }

                    input = text;
                }

                if (input != null)
                {
                    // The output managed property is defined as multi-valued, so a list is
                    // returned with one entry per match (the content of capture group 1).
                    var output = new Property<List<String>>
                    {
                        Name = OutProperty,
                        Value = new List<String>()
                    };

                    foreach (Match match in ValuePattern.Matches(input))
                    {
                        output.Value.Add(match.Groups[1].Value);
                    }

                    processedItemHolder.ItemProperties.Add(output);
                }
            }
            catch (Exception)
            {
                // Signal the failure through the error code instead of propagating it.
                processedItemHolder.ErrorCode = UnexpectedError;
            }

            return processedItemHolder;
        }
    }
}