Algorithm of finding the duplicated sub-strings in two strings
One tough issue, especially require the better performance and memory usage.
However, I done it :)
1. It builds a table strlen(s1)*strlen(s2)
2. Compare the s1 and s2 and fill the equaled bit in the table
3. Find out the longest connected lines in that table
4. Re-build the string with the line's offset
Not sure if it is the best algorithm to do that :)
However it only spend 47ms to compare and find out the duplicate string ("pro") of "programming" and "professional" in 10000 times.
"abbazyabcyzabcegz" and ""xabcyzabcdegabbazyabh" for 10,000 times
Milliseconds: 109
Cool?
//////
StrNode* GetDupStr(__notnull LPCWSTR lpszP1, __notnull LPCWSTR lpszP2)
{
// 0. valid parameters, if SAL does not work
if (lpszP1==NULL || lpszP2==NULL || wcslen(lpszP1)==0 || wcslen(lpszP2)==0)
return NULL;
//
// 1. build the table
//
// calculate the size
int nLen1 = (int)wcslen(lpszP1);
int nLen2 = (int)wcslen(lpszP2);
int nMapLen = sizeof(bool)*nLen1*nLen2;
// allocate the memory @allocate abMap
bool* abMap;
abMap = (bool*)malloc(nMapLen);
//ZeroMemory
memset(abMap, 0x0, nMapLen);
//
// 2. compare and fill the table
//
for(int i=0; i<nLen1; i++)
//rows
{
WCHAR cRow = lpszP1[i];
//check if there is duplicated char in previous chars, if so, memcpy directly
for(int k=0; k<i; k++)
{
if (lpszP1[k]==cRow)
{
memcpy(&abMap[i*nLen2], &abMap[k*nLen2], nLen2);
DEBUG("Dup: %c\n", cRow);
}
}
//loop cols
for(int j=0; j<nLen2; j++)
//cols
{
//check equality
if (cRow==lpszP2[j])
{
//MarkEqual(abMap, CalPos(nLen2, i, j));
MarkBit(abMap, CalcBitPos(nLen2, i, j));
}
}
}
//
// 3. print out the table
//
#ifdef _DUPSTR_DEBUG_
PrintTable(abMap, lpszP1, lpszP2);
DEBUG("\n", NULL);
#endif
//
// 4. search table, find out the connected points
//
// get minimun strlen of two strings
int nMinLen = nLen1 < nLen2 ? nLen1 : nLen2;
// build maxNode var @allocate pMaxNode
BNode *pMaxNode = NULL;
//
// loop the cols and rows from longest to shortest, and compare the maxlength with current nLen
// jump out if maxLen > nLen
//
//loop for cols
for(int i=0; i<nLen2; i++)
{
//current len
int nLen = nMinLen < nLen2-i ? nMinLen : nLen2-i;
if ( pMaxNode==NULL || pMaxNode->len<nLen)
{
//get pNodes
BNode *pNode = CheckPoints(&abMap[CalcBitPos(nLen2, 0, i)], nLen, nLen2+1);
//rebuild
RebuildMaxNodes(&pMaxNode, &pNode);
}
}
//then loop for rows
for(int i=1; i<nLen1; i++)
{
int nLen = nMinLen < nLen1-i ? nMinLen : nLen1-i;
if ( pMaxNode==NULL || pMaxNode->len<nLen)
{
BNode *pNode = CheckPoints(&abMap[CalcBitPos(nLen2, i, 0)], nLen, nLen2+1);
/* @refactor RebuildMaxNodes
if (pNode!=NULL)
{
if (pMaxNode==NULL)
{
DEBUG("leave null\n", NULL);
pMaxNode = pNode;
}
else if (pMaxNode->len < pNode->len)
{
DEBUG("got new length: %d\n", pNode->len);
//free previous pMaxNode
ReleaseNode(pMaxNode);
pMaxNode = pNode;
}
else if (pMaxNode->len == pNode->len)
{
DEBUG("get duplicated\n", NULL);
pMaxNode->next = pNode;
}
else
{
ReleaseNode(pNode);
}
}
*/
RebuildMaxNodes(&pMaxNode, &pNode);
}
}
//
// 5. print out pMaxNodes
//
StrNode* pstrNode = NULL;
//
BNode* pn = pMaxNode;
//foreach pMaxNode
while(pn!=NULL)
{
//calculate the string offset
int nStrOffset = (int)((pn->pbStart - abMap) % nLen2);
//allocate new string @allocate lpszOutput
LPWSTR lpszOutput = new WCHAR[pn->len+1];
memset(lpszOutput, 0x0, (pn->len+1)*sizeof(WCHAR));
memcpy(lpszOutput, lpszP2+nStrOffset, pn->len * sizeof(WCHAR));
DEBUG("%x => ", pn->pbStart);
DEBUG("%d => ", pn->len);
DEBUG("%s\n", lpszOutput);
//DEBUG("%x => %d => %s\n", pn->pbStart, pn->len, lpszOutput);
//attach string to return structure
if (pstrNode==NULL)
{
//build the node
pstrNode = BuildStrNode(lpszP2+nStrOffset, pn->len, lpszOutput);
HEAPLEAK(pstrNode->magic, 0xCACF0169);
}
else
{
//build the node
/* @refactor BuildStrNode
StrNode* psn = new StrNode;
memset(psn, 0x0, sizeof(StrNode));
psn->lpSrc = lpszP2+nStrOffset;
psn->len = pn->len;
psn->str = lpszOutput;
psn->next = NULL;
pstrNode->next = psn;
*/
StrNode* psn = pstrNode;
while(psn->next!=NULL)
{
psn = psn->next;
}
psn->next = BuildStrNode(lpszP2+nStrOffset, pn->len, lpszOutput);
HEAPLEAK(psn->next->magic, 0xCACF0189);
}
pn = pn->next;
}
//@release pMaxNode
ReleaseBNode(pMaxNode);
//@release abMap
delete abMap;
return pstrNode;
}
Attached full code files.