Share via


diff in PowerShell

I have previously shown a version of sed implemented in PowerShell. Here is another tool from the same series: diff in PowerShell. It's not fancy and not fast, but it does the basic work and is written entirely in PowerShell, so it runs anywhere PowerShell does. It took me a couple of hours to write. It gets used like this:

 $diff = Find-DiffSimple -Left (Get-Content -Encoding $Encoding $ExpectFile) -Right (Get-Content -Encoding $Encoding $ResultFile)

The format of the data returned is like the classic diff, before the Context and Unified versions.

And here is the implementation that includes a couple of helper functions:

 function Find-DiffSimple
{
<#
.SYNOPSIS
Find the difference between two arrays of strings, in a simple
quick-and-dirty way.

.DESCRIPTION
The algorithm is kind of dumb, using a limited window: it reads the
two sides alternately, one line at a time, and keeps the lines that
have not been matched yet in a per-side buffer. When a newly read line
is found in the buffer of the opposite side, everything buffered before
the matching point on both sides is reported as a difference.

Lines are compared exactly, including the letter case.

.OUTPUTS
The strings marked with direction:
  "< text"  - a line present only on the left side,
  "> text"  - a line present only on the right side,
  "= text"  - a line present on both sides (only with -PrintEqual),
  "@ < first last" or "@ > first last" - the base-1 range of the lines
              that follow (only with -PrintPos).

.EXAMPLE
$diff = Find-DiffSimple -Left (Get-Content $ExpectFile) -Right (Get-Content $ResultFile)
#>
    param(
        ## The strings on the left side.
        [string[]] $Left,
        ## The strings on the right side.
        [string[]] $Right,
        ## The maximum number of unmatched strings that can be under
        ## consideration on one side at the moment. When more of them
        ## accumulate, the oldest one gets reported as a difference to
        ## make room. Must be at least 2; smaller values are raised to 2.
        [int32] $MaxWindow = 10000,
        ## Print the equal lines as well.
        [bool] $PrintEqual = $false,
        ## Print the position
        [bool] $PrintPos = $true
    )

    if ($MaxWindow -lt 2) {
        $MaxWindow = 2 # otherwise the logic doesn't make sense
    }

    # all the data is symmetric, stored in arrays with index 0 (left) or 1 (right)
    $data = @( $Left, $Right )
    [int32[]]$sz = @( $Left.Length, $Right.Length )
    [int32[]]$pos = @( 0, 0 ) # position for reading the next line
    [int32[]]$bp = @( 0, 0 ) # position of the first buffered line

    # Buffers for the fast finding: the key is the line, the value is the
    # list of positions where it occurs. They are created as plain .NET
    # hashtables on purpose: the @{} literal produces a case-insensitive
    # table, which would make the lines that differ only in the letter case
    # look equal and hide a real difference.
    $leftBuf = New-Object System.Collections.Hashtable
    $rightBuf = New-Object System.Collections.Hashtable
    $buf = @( $leftBuf, $rightBuf )

    $prefix = @( '<', '>' ) # prefix that shows the origin of the line

    # $i is the side being read on this iteration, $j is the opposite side;
    # the sides alternate until both are exhausted
    for ([int32]$i = 0; $true; $i = 1 - $i) {
        $j = 1 - $i
        if ($pos[$i] -ge $sz[$i]) {
            # this side has run out, keep reading the other one
            $i = $j
            if ($pos[$i] -ge $sz[$i]) {
                break
            }
            $j = 1 - $i
        }

        $p = $pos[$i]
        $pos[$i]++

        # The cast turns a $null element into an empty string, because a
        # hashtable does not accept a null key.
        [string]$line = $data[$i][$p]

        if ($buf[$j].ContainsKey($line)) { # there is a match
            $jentry = $buf[$j][$line] # the positions of this line on the other side

            # the earliest buffered occurrence is the one that gets matched
            $jp = $jentry[0]

            # report everything that precedes the match, always the left side first
            if ($i -eq 0) {
                Show-DiffBuffer $PrintPos $data[$i] $buf[$i] $bp[$i] $p $prefix[$i]
                Show-DiffBuffer $PrintPos $data[$j] $buf[$j] $bp[$j] $jp $prefix[$j]
            } else {
                Show-DiffBuffer $PrintPos $data[$j] $buf[$j] $bp[$j] $jp $prefix[$j]
                Show-DiffBuffer $PrintPos $data[$i] $buf[$i] $bp[$i] $p $prefix[$i]
            }
            # the matched line itself is consumed from the other side's buffer
            Remove-DiffBufferLine $line $buf[$j]

            # The current side has nothing buffered any more; the other side
            # keeps whatever it has read past the matched line.
            $bp[$i] = $pos[$i]
            $bp[$j] = $jp + 1

            if ($PrintEqual) {
                "= $line"
            }
        } else {
            # add the line to the buffer
            if ($buf[$i].ContainsKey($line)) {
                [void] $buf[$i][$line].Add($p)
            } else {
                $list = New-Object System.Collections.ArrayList
                [void] $list.Add($p)
                $buf[$i][$line] = $list
            }

            if ($p - $bp[$i] -ge $MaxWindow) { # $p is behind by one, so this means the buffer overflow
                # give up on the oldest buffered line and report it as a difference
                $newbp = $bp[$i] + 1
                Show-DiffBuffer $PrintPos $data[$i] $buf[$i] $bp[$i] $newbp $prefix[$i]
                $bp[$i] = $newbp
            }
        }
    }
    # dump the remaining buffers
    Show-DiffBuffer $PrintPos $data[0] $buf[0] $bp[0] $pos[0] $prefix[0]
    Show-DiffBuffer $PrintPos $data[1] $buf[1] $bp[1] $pos[1] $prefix[1]
}
# Publish the diff function from the module together with its short alias.
Set-Alias -Name xdiff -Value Find-DiffSimple
Export-ModuleMember -Function Find-DiffSimple -Alias xdiff

function Show-DiffBuffer
{
<#
.SYNOPSIS
Internal: Print a range of lines from one side of the comparison and
drop them from that side's buffer.

.DESCRIPTION
Does nothing when the range is empty. Otherwise optionally prints a
position header "@ <prefix> <first> <last>" with base-1 line numbers,
then prints each line of the range as "<prefix> <line>" and removes one
occurrence of that line from the buffer.

.OUTPUTS
The lines removed from the buffer, each marked with the prefix.
#>
    param(
        ## Enables the printing of the line position.
        [bool]$printPos,
        ## The original data lines.
        $data,
        ## The buffer, indexed by data contents.
        $buf,
        ## The first index of data to dump.
        [int32]$start,
        ## The index just past the data to dump (i.e. exclusive).
        [int32]$end,
        ## The line prefix showing the origin of the lines.
        [string]$prefix
    )

    # an empty range produces no output at all, not even the header
    if ($start -ge $end) {
        return
    }

    if ($printPos) {
        # The indexes are base-0 with an exclusive end, so in base-1 the
        # first line is $start + 1 and the last line is exactly $end.
        $firstLine = $start + 1
        "@ $prefix $firstLine $end"
    }

    $idx = $start
    while ($idx -lt $end) {
        $text = $data[$idx]
        "$prefix $text" # the return value
        Remove-DiffBufferLine $text $buf
        $idx++
    }
}

function Remove-DiffBufferLine
{
<#
.SYNOPSIS
Internal: Remove one occurrence of a line from the buffer.

.DESCRIPTION
The buffer maps the text of a line to the list of positions where it
occurs. The first position in that list gets dropped; when it is the
only one, the whole entry is removed from the buffer.
#>
    param(
        ## The text of the line to remove.
        [string]$line,
        ## The buffer, indexed by data contents.
        $buf
    )

    $positions = $buf[$line]
    if ($positions.Count -ne 1) {
        # other occurrences remain, forget only the earliest one
        $positions.RemoveAt(0)
    } else {
        # the last occurrence: the key itself goes away
        $buf.Remove($line)
    }

}

See Also: all the text tools