Cómo: Buscar archivos duplicados en un árbol de directorios (LINQ)
Actualización: noviembre 2007
A veces, varias carpetas pueden contener archivos con el mismo nombre. Por ejemplo, bajo la carpeta de instalación de Visual Studio, hay varias carpetas con un archivo readme.htm. En el primer ejemplo se muestra cómo buscar nombres de archivo duplicados bajo una carpeta raíz especificada. En el segundo ejemplo se muestra cómo buscar archivos cuyo tamaño y fecha de creación también coinciden.
Module QueryDuplicateFileNames

    ''' <summary>
    ''' Entry point: runs the duplicate-name query under a sample root folder.
    ''' </summary>
    Public Sub Main()
        Dim path As String = "C:\Program Files\Microsoft Visual Studio 9.0\Common7"
        QueryDuplicates1(path)
        ' Uncomment to run this query instead
        ' QueryDuplicates2(path)
    End Sub

    ''' <summary>
    ''' Groups the files under <paramref name="root"/> by file name, keeps the
    ''' groups that contain two or more files, and pages them to the console.
    ''' </summary>
    ''' <param name="root">Folder whose tree is searched.</param>
    Sub QueryDuplicates1(ByVal root As String)
        Dim duplicates = From aFile In GetFiles(root) _
                         Order By aFile.Name _
                         Group aFile By aFile.Name Into newGroup = Group _
                         Where newGroup.Count() >= 2 _
                         Select newGroup

        ' Page the display so that the results can be read.
        ' The root prefix is trimmed from every path that is printed.
        Dim trimLength = root.Length
        PageOutput(duplicates, trimLength)
    End Sub

    ''' <summary>
    ''' Same as <see cref="QueryDuplicates1"/>, but groups on a composite key:
    ''' file name, creation time and length must all match.
    ''' </summary>
    ''' <param name="root">Folder whose tree is searched.</param>
    Sub QueryDuplicates2(ByVal root As String)
        ' This time a composite key is used. This sub finds all files
        ' that have been copied into multiple subfolders.
        Dim duplicates = From aFile In GetFiles(root) _
                         Order By aFile.Name _
                         Group aFile By aFile.Name, aFile.CreationTime, aFile.Length Into newGroup = Group _
                         Where newGroup.Count() >= 2 _
                         Select newGroup

        ' Page the display so that the results can be read.
        Dim trimLength = root.Length
        PageOutput(duplicates, trimLength)
    End Sub

    ''' <summary>
    ''' Pages console display for large query results. No more than one group
    ''' per page. This sub specifically works with group queries of FileInfo
    ''' objects but can be modified for any type.
    ''' </summary>
    ''' <param name="groupQuery">Sequence of groups; each group is enumerated as FileInfo objects.</param>
    ''' <param name="charsToSkip">Number of leading characters removed from each full path before printing.</param>
    Sub PageOutput(ByVal groupQuery, ByVal charsToSkip)
        ' "3" = 1 line for the file-name header + 1 for "Press any key" + 1 for input cursor.
        ' Clamp to 1: with a page size of zero or less the position below
        ' would never advance and the paging loop would not terminate.
        Dim numLines As Integer = Math.Max(1, Console.WindowHeight - 3)

        For Each fg As IEnumerable(Of System.IO.FileInfo) In groupQuery
            ' Enumerate the group once; every page is a slice of this list.
            Dim files As List(Of System.IO.FileInfo) = fg.ToList()

            ' Start a new group at the top of a page.
            Dim currentLine As Integer = 0

            Do While currentLine < files.Count
                ' Header line so the reader can tell the groups apart.
                Console.WriteLine("Filename = {0}", files(0).Name)

                ' Get the next page of results.
                Dim resultPage = files.Skip(currentLine).Take(numLines)

                ' Print the page. Trim the paths in the output.
                For Each line In resultPage
                    Console.WriteLine(vbTab & line.FullName.Substring(charsToSkip))
                Next

                ' Advance the current position.
                currentLine += numLines

                ' Give the user a chance to break out of the loop.
                Console.WriteLine("Press any key for next page or the 'End' key to exit.")
                Dim key As ConsoleKey = Console.ReadKey().Key
                If key = ConsoleKey.End Then
                    ' Stop paging altogether, not just the current group.
                    Return
                End If
            Loop
        Next
    End Sub

    ''' <summary>
    ''' Returns a query that yields a FileInfo for every file found under
    ''' <paramref name="root"/>, including all subdirectories.
    ''' </summary>
    ''' <param name="root">Folder whose tree is searched.</param>
    Function GetFiles(ByVal root As String) As System.Collections.Generic.IEnumerable(Of System.IO.FileInfo)
        Return From file In My.Computer.FileSystem.GetFiles _
                   (root, FileIO.SearchOption.SearchAllSubDirectories, "*.*") _
               Select New System.IO.FileInfo(file)
    End Function

End Module
class QueryDuplicateFileNames
{
    /// <summary>
    /// Entry point: runs the duplicate-name query, then waits for a key press.
    /// </summary>
    static void Main(string[] args)
    {
        // Uncomment QueryDuplicates2 to run that query.
        QueryDuplicates();
        // QueryDuplicates2();

        // Keep the console window open in debug mode.
        Console.WriteLine("Press any key to exit.");
        Console.ReadKey();
    }

    /// <summary>
    /// Groups the files under the start folder by file name and pages every
    /// group that contains more than one file to the console.
    /// </summary>
    static void QueryDuplicates()
    {
        // Change the root drive or folder if necessary
        string startFolder = @"c:\program files\Microsoft Visual Studio 9.0\";

        // Take a snapshot of the file system.
        IEnumerable<System.IO.FileInfo> fileList = GetFiles(startFolder);

        // used in WriteLine to keep the lines shorter
        int charsToSkip = startFolder.Length;

        // var can be used for convenience with groups.
        var queryDupNames =
            from file in fileList
            group file.FullName.Substring(charsToSkip) by file.Name into fileGroup
            where fileGroup.Count() > 1
            select fileGroup;

        // Pass the query to a method that will
        // output one page at a time.
        PageOutput<string, string>(queryDupNames);
    }

    /// <summary>
    /// A group key that can be passed to a separate method.
    /// Equals and GetHashCode are overridden to define equality for the key;
    /// ToString is overridden to provide a friendly name for Key.ToString().
    /// </summary>
    class PortableKey
    {
        public string Name { get; set; }
        public DateTime CreationTime { get; set; }
        public long Length { get; set; }

        /// <summary>
        /// Two keys are equal when name, creation time and length all match.
        /// Returns false for null or for an object of another type.
        /// </summary>
        public override bool Equals(object obj)
        {
            // "as" instead of a hard cast: Equals must not throw for foreign types or null.
            PortableKey other = obj as PortableKey;
            if (other == null)
            {
                return false;
            }

            return other.CreationTime == this.CreationTime &&
                   other.Length == this.Length &&
                   other.Name == this.Name;
        }

        /// <summary>
        /// Combines the hash codes of the three fields that Equals compares.
        /// </summary>
        public override int GetHashCode()
        {
            // Wrap-around on overflow is intended when mixing hash codes.
            unchecked
            {
                int hash = 17;
                hash = hash * 31 + this.CreationTime.GetHashCode();
                hash = hash * 31 + this.Length.GetHashCode();
                hash = hash * 31 + (this.Name == null ? 0 : this.Name.GetHashCode());
                return hash;
            }
        }

        /// <summary>Returns "Name Length CreationTime" for display.</summary>
        public override string ToString()
        {
            return String.Format("{0} {1} {2}", this.Name, this.Length, this.CreationTime);
        }
    }

    /// <summary>
    /// Groups the files under the start folder by name, creation time and
    /// length, and pages every group that contains more than one file.
    /// </summary>
    static void QueryDuplicates2()
    {
        // Change the root drive or folder if necessary.
        string startFolder = @"c:\program files\Microsoft Visual Studio 9.0\Common7";

        // Make the lines shorter for the console display
        int charsToSkip = startFolder.Length;

        // Take a snapshot of the file system.
        IEnumerable<System.IO.FileInfo> fileList = GetFiles(startFolder);

        // Note the use of a compound key. Files that match
        // all three properties belong to the same group.
        // A named type is used to enable the query to be
        // passed to another method. Anonymous types can also be used
        // for composite keys but cannot be passed across method boundaries
        var queryDupFiles =
            from file in fileList
            group file.FullName.Substring(charsToSkip) by
                new PortableKey { Name = file.Name, CreationTime = file.CreationTime, Length = file.Length } into fileGroup
            where fileGroup.Count() > 1
            select fileGroup;

        // The query is deferred; hand it over as-is so it runs exactly once.
        PageOutput<PortableKey, string>(queryDupFiles);
    }

    /// <summary>
    /// A generic method to page the output of the QueryDuplicates methods.
    /// The type of the group must be specified explicitly because "var" cannot
    /// be used in method signatures. No more than one group is shown per page.
    /// </summary>
    /// <param name="groupByExtList">Groups to display; the key is printed as a header, the elements as indented lines.</param>
    private static void PageOutput<K, V>(IEnumerable<System.Linq.IGrouping<K, V>> groupByExtList)
    {
        // "3" = 1 line for the header + 1 for "Press any key" + 1 for input cursor.
        // Clamp to 1: with a page size of zero or less the line counter
        // would never advance and the paging loop would not terminate.
        int numLines = Math.Max(1, Console.WindowHeight - 3);

        // Iterate through the outer collection of groups.
        foreach (var filegroup in groupByExtList)
        {
            // Enumerate the group once; every page is a slice of this list.
            List<V> items = filegroup.ToList();

            string keyText = filegroup.Key == null ? String.Empty : filegroup.Key.ToString();

            // Start a new group at the top of a page.
            int currentLine = 0;

            // Output only as many lines of the current group as will fit in the window.
            do
            {
                Console.WriteLine("Filename = {0}", keyText == String.Empty ? "[none]" : keyText);

                // Get 'numLines' number of items starting at number 'currentLine'.
                var resultPage = items.Skip(currentLine).Take(numLines);

                // Execute the resultPage query
                foreach (var fileName in resultPage)
                {
                    Console.WriteLine("\t{0}", fileName);
                }

                // Increment the line counter.
                currentLine += numLines;

                // Give the user a chance to escape.
                Console.WriteLine("Press any key to continue or the 'End' key to break...");
                ConsoleKey key = Console.ReadKey().Key;
                if (key == ConsoleKey.End)
                {
                    // Stop paging altogether, not just the current group.
                    return;
                }
            } while (currentLine < items.Count);
        }
    }

    /// <summary>
    /// Returns a FileInfo for every file under <paramref name="path"/>,
    /// including all subdirectories.
    /// This method assumes that the application has discovery
    /// permissions for all folders under the specified path.
    /// </summary>
    /// <param name="path">Folder whose tree is searched.</param>
    /// <exception cref="System.IO.DirectoryNotFoundException">The folder does not exist.</exception>
    static IEnumerable<System.IO.FileInfo> GetFiles(string path)
    {
        if (!System.IO.Directory.Exists(path))
        {
            throw new System.IO.DirectoryNotFoundException(
                String.Format("Directory not found: {0}", path));
        }

        string[] fileNames = System.IO.Directory.GetFiles(path, "*.*", System.IO.SearchOption.AllDirectories);

        List<System.IO.FileInfo> files = new List<System.IO.FileInfo>(fileNames.Length);
        foreach (string name in fileNames)
        {
            files.Add(new System.IO.FileInfo(name));
        }
        return files;
    }
}
La primera consulta utiliza una clave simple para determinar una coincidencia; se buscan archivos que tienen el mismo nombre pero cuyo contenido podría ser diferente. La segunda consulta usa una clave compuesta para buscar las coincidencias de tres propiedades del objeto FileInfo. Lo más probable es que esta consulta encuentre archivos que tienen el mismo nombre y contenido similar o idéntico.
Compilar el código
Cree un proyecto de Visual Studio orientado a .NET Framework versión 3.5. De manera predeterminada, el proyecto contiene una referencia a System.Core.dll y una directiva using (C#) o un espacio de nombres importado (Visual Basic) para el espacio de nombres System.Linq. En los proyectos de C#, agregue una directiva using para el espacio de nombres System.IO.
Copie este código en el proyecto.
Presione F5 para compilar y ejecutar el programa.
Presione cualquier tecla para salir de la ventana de consola.
Programación eficaz
Cuando realice operaciones de consulta intensivas sobre el contenido de múltiples tipos de documentos y archivos, considere el uso del motor de Windows Desktop Search.