Gewusst wie: Abfragen von Dateiduplikaten in einer Verzeichnisstruktur (LINQ)
Es ist möglich, dass sich Dateien mit demselben Namen in mehr als einem Ordner befinden. Im Visual Studio-Installationsordner weisen zum Beispiel mehrere Ordner eine readme.htm-Datei auf. In diesem Beispiel wird gezeigt, wie solche mehrfach vorkommenden Dateinamen in einem angegebenen Stammordner abgefragt werden. Das zweite Beispiel zeigt, wie Dateien abgefragt werden, deren Größe und Erstellungszeit ebenfalls übereinstimmen.
Beispiel
Module QueryDuplicateFileNames
Public Sub Main()
    ' Root folder whose entire subtree is searched for duplicate files.
    Const rootFolder As String = "C:\Program Files\Microsoft Visual Studio 9.0\Common7"

    ' Report files that share a name with at least one other file.
    QueryDuplicates1(rootFolder)

    ' To also require a matching size and creation time, run this query instead:
    ' QueryDuplicates2(rootFolder)
End Sub
Sub QueryDuplicates1(ByVal root As String)
    ' Collect every file below the root folder, including all subfolders.
    Dim rootInfo As New System.IO.DirectoryInfo(root)
    Dim allFiles As System.IO.FileInfo() = rootInfo.GetFiles("*.*", System.IO.SearchOption.AllDirectories)

    ' Sort by file name, bucket the files by name, and keep only the
    ' buckets that contain two or more files. Each step is deferred and
    ' runs when the result is enumerated by PageOutput.
    Dim sortedFiles = allFiles.OrderBy(Function(f) f.Name)
    Dim groupsByName = sortedFiles.GroupBy(Function(f) f.Name)
    Dim duplicates = groupsByName.Where(Function(g) g.Count() >= 2)

    ' Page the display so that the results can be read. The length of the
    ' root path is passed so it can be trimmed from each printed path.
    PageOutput(duplicates, root.Length)
End Sub
Sub QueryDuplicates2(ByVal root As String)
    ' Collect every file below the root folder, including all subfolders.
    Dim rootInfo As New System.IO.DirectoryInfo(root)
    Dim allFiles As System.IO.FileInfo() = rootInfo.GetFiles("*.*", System.IO.SearchOption.AllDirectories)

    ' A composite key is used this time: files land in the same group only
    ' when name, creation time and length all match. This finds files that
    ' have been copied into multiple subfolders.
    Dim sortedFiles = allFiles.OrderBy(Function(f) f.Name)
    Dim groupsByIdentity = sortedFiles.GroupBy( _
        Function(f) New With {Key f.Name, Key f.CreationTime, Key f.Length})
    Dim duplicates = groupsByIdentity.Where(Function(g) g.Count() >= 2)

    ' Page the display so that the results can be read. The length of the
    ' root path is passed so it can be trimmed from each printed path.
    PageOutput(duplicates, root.Length)
End Sub
' Pages console display for large query results. No more than one group
' is shown per page.
' This sub specifically works with group queries of FileInfo objects
' but can be modified for any type.
'   groupQuery:  a sequence of groups; each group is enumerated as
'                IEnumerable(Of System.IO.FileInfo).
'   charsToSkip: number of leading characters removed from each file's
'                full path before it is printed.
Sub PageOutput(ByVal groupQuery, ByVal charsToSkip)
    ' Three rows are held back from the page size so the prompt and the
    ' input cursor stay visible. A window of three rows or fewer would make
    ' the page size zero or negative, and the paging loop below would never
    ' advance, so at least one line per page is always shown.
    Dim numLines As Integer = Math.Max(1, Console.WindowHeight - 3)

    ' Convert once instead of late-binding the argument on every line.
    Dim skipCount As Integer = CInt(charsToSkip)

    For Each fg As IEnumerable(Of System.IO.FileInfo) In groupQuery
        ' Count the group once; Count() re-enumerates the group each call.
        Dim groupSize As Integer = fg.Count()

        ' Every group starts at the top of a fresh page.
        Dim currentLine As Integer = 0
        Do While currentLine < groupSize
            Console.Clear()

            ' Print the next page of this group with the root path trimmed.
            For Each line In fg.Skip(currentLine).Take(numLines)
                Console.WriteLine(vbTab & line.FullName.Substring(skipCount))
            Next

            ' Advance the current position.
            currentLine += numLines

            ' Give the user a chance to stop paging altogether.
            Console.WriteLine("Press any key for next page or the 'End' key to exit.")
            If Console.ReadKey().Key = ConsoleKey.End Then
                Return
            End If
        Loop
    Next
End Sub
End Module
class QueryDuplicateFileNames
{
static void Main(string[] args)
{
    // Runs the name-only duplicate query. To match on name, size and
    // creation time instead, uncomment QueryDuplicates2 below.
    QueryDuplicates();
    // QueryDuplicates2();

    // Hold the console window open until a key is pressed
    // (useful when running in debug mode).
    Console.WriteLine("Press any key to exit.");
    Console.ReadKey(false);
}
static void QueryDuplicates()
{
    // Root of the search; change the drive or folder if necessary.
    string startFolder = @"c:\program files\Microsoft Visual Studio 9.0\";

    // Length of the root prefix, stripped from every path so that the
    // console lines stay short.
    int prefixLength = startFolder.Length;

    // Take a snapshot of the file system. This assumes that the application
    // has discovery permissions for all folders under the specified path.
    System.IO.DirectoryInfo rootInfo = new System.IO.DirectoryInfo(startFolder);
    IEnumerable<System.IO.FileInfo> allFiles =
        rootInfo.GetFiles("*.*", System.IO.SearchOption.AllDirectories);

    // Group the trimmed paths by file name and keep only the names that
    // occur more than once. The query is deferred until it is enumerated.
    IEnumerable<System.Linq.IGrouping<string, string>> duplicateNames = allFiles
        .GroupBy(f => f.Name, f => f.FullName.Substring(prefixLength))
        .Where(g => g.Count() > 1);

    // Hand the query to the pager, which outputs one page at a time.
    PageOutput<string, string>(duplicateNames);
}
// A group key that can be passed to a separate method.
// Equals and GetHashCode are overridden so that two keys are equal when
// their Name, CreationTime and Length all match.
// ToString is overridden to provide a friendly name for Key.ToString().
class PortableKey
{
    public string Name { get; set; }
    public DateTime CreationTime { get; set; }
    public long Length { get; set; }

    // Returns true only when obj is a PortableKey with the same
    // CreationTime, Length and Name. A null reference or an object of
    // another type compares unequal instead of throwing.
    public override bool Equals(object obj)
    {
        PortableKey other = obj as PortableKey;
        if (ReferenceEquals(other, null))
        {
            return false;
        }

        return other.CreationTime == this.CreationTime &&
               other.Length == this.Length &&
               other.Name == this.Name;
    }

    // Combines the hash codes of the three fields that Equals compares,
    // so equal keys always produce equal hash codes. No intermediate
    // string is built, and a null Name is tolerated.
    public override int GetHashCode()
    {
        unchecked
        {
            int hash = 17;
            hash = hash * 31 + this.CreationTime.GetHashCode();
            hash = hash * 31 + this.Length.GetHashCode();
            hash = hash * 31 + (this.Name == null ? 0 : this.Name.GetHashCode());
            return hash;
        }
    }

    // Friendly display text: "<Name> <Length> <CreationTime>".
    public override string ToString()
    {
        return String.Format("{0} {1} {2}", this.Name, this.Length, this.CreationTime);
    }
}
static void QueryDuplicates2()
{
    // Change the root drive or folder if necessary.
    string startFolder = @"c:\program files\Microsoft Visual Studio 9.0\Common7";

    // Make the lines shorter for the console display by trimming the
    // root path from every file name.
    int charsToSkip = startFolder.Length;

    // Take a snapshot of the file system.
    System.IO.DirectoryInfo dir = new System.IO.DirectoryInfo(startFolder);
    IEnumerable<System.IO.FileInfo> fileList = dir.GetFiles("*.*", System.IO.SearchOption.AllDirectories);

    // Note the use of a compound key. Files that match
    // all three properties belong to the same group.
    // A named type is used to enable the query to be
    // passed to another method. Anonymous types can also be used
    // for composite keys but cannot be passed across method boundaries.
    var queryDupFiles =
        from file in fileList
        group file.FullName.Substring(charsToSkip) by
            new PortableKey { Name = file.Name, CreationTime = file.CreationTime, Length = file.Length } into fileGroup
        where fileGroup.Count() > 1
        select fileGroup;

    // The query is deferred; it is evaluated when PageOutput enumerates it.
    // It is not materialized or counted here, because each of those would
    // run the whole grouping again.
    PageOutput<PortableKey, string>(queryDupFiles);
}
// A generic method to page the output of the QueryDuplicates methods.
// Here the type of the group must be specified explicitly; "var" cannot
// be used in method signatures. This method does not display more than one
// group per page.
//   K: type of the group key; its ToString() text is shown as the page header.
//   V: type of the group elements; each one is written on its own line.
//   groupByExtList: the groups to display.
private static void PageOutput<K, V>(IEnumerable<System.Linq.IGrouping<K, V>> groupByExtList)
{
    // "3" = 1 line for the group key + 1 for "Press any key" + 1 for the
    // input cursor. A window of three rows or fewer would make the page size
    // zero or negative, and the paging loop below would never advance, so
    // at least one line per page is always shown.
    int numLines = Math.Max(1, Console.WindowHeight - 3);

    // Iterate through the outer collection of groups.
    foreach (var filegroup in groupByExtList)
    {
        // Build the header once per group. A null or empty key is shown as "[none]".
        string keyText = filegroup.Key == null ? String.Empty : filegroup.Key.ToString();
        string header = keyText == String.Empty ? "[none]" : keyText;

        // Count the group once instead of on every page.
        int groupSize = filegroup.Count();

        // Start each group at the top of a page.
        int currentLine = 0;

        // Output only as many lines of the current group as will fit in the window.
        do
        {
            Console.Clear();
            Console.WriteLine("Filename = {0}", header);

            // Write 'numLines' items starting at item number 'currentLine'.
            foreach (var fileName in filegroup.Skip(currentLine).Take(numLines))
            {
                Console.WriteLine("\t{0}", fileName);
            }

            // Increment the line counter.
            currentLine += numLines;

            // Give the user a chance to escape; 'End' stops all paging.
            Console.WriteLine("Press any key to continue or the 'End' key to break...");
            if (Console.ReadKey().Key == ConsoleKey.End)
            {
                return;
            }
        } while (currentLine < groupSize);
    }
}
}
Die erste Abfrage verwendet einen einfachen Schlüssel zur Ermittlung von Übereinstimmungen. Auf diese Weise werden Dateien gefunden, die den gleichen Namen haben, deren Inhalt aber unterschiedlich sein könnte. Bei der zweiten Abfrage wird ein zusammengesetzter Schlüssel verwendet, der Übereinstimmungen in drei Eigenschaften des FileInfo-Objekts bestimmt. Mit dieser Abfrage ist es wahrscheinlicher, Dateien zu finden, die den gleichen Namen sowie einen ähnlichen oder identischen Inhalt haben.
Kompilieren des Codes
Erstellen Sie ein Visual Studio-Projekt für .NET Framework, Version 3.5. Standardmäßig weist das Projekt einen Verweis auf System.Core.dll und eine using-Direktive (C#) oder einen importierten Namespace (Visual Basic) für den System.Linq-Namespace auf. Fügen Sie in C#-Projekten eine using-Direktive für den System.IO-Namespace hinzu.
Kopieren Sie diesen Code ins Projekt.
Drücken Sie F5, um das Programm zu kompilieren und auszuführen.
Drücken Sie eine beliebige Taste, um das Konsolenfenster zu schließen.
Robuste Programmierung
Für umfassende Abfrageoperationen über die Inhalte mehrerer Arten von Dokumenten und Dateien empfiehlt es sich, das Modul Windows Desktop Search zu verwenden.