This is an old revision of the document!

Find duplicate files in SFTP/FTP server

The following example uses WinSCP .NET assembly from a PowerShell script. If you have another preferred language, you can easily translate it.

You can use the script to efficiently find duplicate files on a remote SFTP/FTP server. The script first iterates the remote directory tree and looks for files with the same size. When it finds any, it by default downloads the files and compares them locally.

If you know that the server supports a protocol extension for calculating checksums, you can improve the script efficiency by adding the -remoteChecksumAlg switch, to make the script ask the server for the checksum, sparing the file download.

Advertisement

In the latest beta version, you can install this script as a WinSCP extension by using this page URL in the Add Extension command.

To run the script manually use:

powershell.exe -File C:\path\FindDuplicates.ps1 -remotePath "/path" -remoteChecksumAlg sha-1
# @name         Find &Duplicates
# @command      powershell.exe -ExecutionPolicy Bypass -File "%EXTENSION_PATH%" -sessionUrl "!S" -remotePath "!/" -pause
# @description  Searches for duplicate files on the server, starting from the current directory
# @flag         RemoteFiles
# @version      1
 
param (
    # Use Generate URL function to obtain a value for -sessionUrl parameter.
    $sessionUrl = "sftp://user:mypassword;fingerprint=ssh-rsa-xx-xx-xx@example.com/",
    # Root remote directory to search for duplicates (recursively).
    [Parameter(Mandatory)]
    $remotePath,
    # Optional checksum algorithm (e.g. "sha-1") supported by the server's
    # checksum protocol extension; when omitted, files are downloaded and
    # hashed locally instead.
    $remoteChecksumAlg = $Null,
    # When set, waits for a key press before exiting (useful when run
    # from the WinSCP extension menu, so the console window stays open).
    [Switch]
    $pause = $False
)
 
# Returns a checksum of the remote file, caching results in the script-scope
# $checksums hashtable so each file is hashed at most once.
# Returns $False when the checksum could not be obtained (failed download).
function FileChecksum ($remotePath)
{
    if (!($checksums.ContainsKey($remotePath)))
    {
        # $Null on the left avoids the PowerShell pitfall of comparing
        # a collection against $Null (PSPossibleIncorrectComparisonWithNull)
        if ($Null -eq $remoteChecksumAlg)
        {
            Write-Host ("Downloading file {0}..." -f $remotePath)
            # Download the file to a temporary location and hash it locally
            $localPath = [System.IO.Path]::GetTempFileName()
            try
            {
                $transferResult = $session.GetFiles($remotePath, $localPath)
 
                if ($transferResult.IsSuccess)
                {
                    $stream = [System.IO.File]::OpenRead($localPath)
                    try
                    {
                        $checksum = [System.BitConverter]::ToString($sha1.ComputeHash($stream))
                    }
                    finally
                    {
                        # Dispose even if ComputeHash throws
                        $stream.Dispose()
                    }
 
                    Write-Host ("Downloaded file {0} checksum is {1}" -f $remotePath, $checksum)
                }
                else
                {
                    Write-Host ("Error downloading file {0}: {1}" -f $remotePath, $transferResult.Failures[0])
                    $checksum = $False
                }
            }
            finally
            {
                # GetTempFileName always creates the file on disk, so remove it
                # on every path (the original leaked it when the download failed)
                Remove-Item $localPath -ErrorAction SilentlyContinue
            }
        }
        else
        {
            # Let the server calculate the checksum, sparing the download
            Write-Host ("Request checksum for file {0}..." -f $remotePath)
            $checksum = [System.BitConverter]::ToString($session.CalculateFileChecksum($remoteChecksumAlg, $remotePath))
            Write-Host ("File {0} checksum is {1}" -f $remotePath, $checksum)
        }
 
        $checksums[$remotePath] = $checksum
    }
 
    return $checksums[$remotePath]
}
 
# Recursively walks the remote directory tree, grouping files by size and
# comparing checksums of same-size files. Records confirmed duplicates in
# the script-scope $duplicates hashtable (duplicate path => original path).
function FindDuplicatesInDirectory ($remotePath)
{
    Write-Host ("Finding duplicates in directory {0} ..." -f $remotePath)
 
    try
    {
        $directoryInfo = $session.ListDirectory($remotePath)
 
        foreach ($fileInfo in $directoryInfo.Files)
        {
            $remoteFilePath = ($remotePath + "/" + $fileInfo.Name) 
            
            if ($fileInfo.IsDirectory)
            {
                # Skip references to current and parent directories
                if (($fileInfo.Name -ne ".") -and
                    ($fileInfo.Name -ne ".."))
                {
                    # Recurse into subdirectories
                    FindDuplicatesInDirectory $remoteFilePath
                }
            }
            else
            {
                Write-Host ("Found file {0} with size {1}" -f $remoteFilePath, $fileInfo.Length)
 
                # Only files sharing a size can be duplicates, so checksums
                # are calculated lazily, for same-size files only
                if ($sizes.ContainsKey($fileInfo.Length))
                {
                    $checksum = FileChecksum($remoteFilePath)
 
                    # $False means the checksum could not be obtained; skip the
                    # comparison entirely, otherwise two failed downloads would
                    # compare as equal and be reported as false duplicates
                    if ($checksum -ne $False)
                    {
                        foreach ($otherFilePath in $sizes[$fileInfo.Length])
                        {
                            $otherChecksum = FileChecksum($otherFilePath)
 
                            if (($otherChecksum -ne $False) -and
                                ($checksum -eq $otherChecksum))
                            {
                                Write-Host ("Checksums of files {0} and {1} are identical" -f $remoteFilePath, $otherFilePath)
                                $duplicates[$remoteFilePath] = $otherFilePath
                            }
                        }
                    }
                }
                else
                {
                    $sizes[$fileInfo.Length] = @()
                }
 
                $sizes[$fileInfo.Length] += $remoteFilePath
            }
        }
    }
    catch [Exception]
    {
        # Report, but keep scanning sibling directories
        Write-Host ("Error processing directory {0}: {1}" -f $remotePath, $_.Exception.Message)
    }
}
 
try
{
    # Load WinSCP .NET assembly
    Add-Type -Path "WinSCPnet.dll"
 
    # Setup session options from URL
    $sessionOptions = New-Object WinSCP.SessionOptions
    $sessionOptions.ParseUrl($sessionUrl)
 
    $session = New-Object WinSCP.Session
 
    # SHA-1 provider used to hash locally downloaded files
    $sha1 = [System.Security.Cryptography.SHA1]::Create()
 
    try
    {
        # Connect
        $session.Open($sessionOptions)
 
        # file size => list of remote paths with that size
        $sizes = @{}
        # remote path => checksum (cache, see FileChecksum)
        $checksums = @{}
        # duplicate remote path => matching remote path
        $duplicates = @{}
 
        # Start recursion
        FindDuplicatesInDirectory $remotePath
    }
    finally
    {
        # Disconnect, clean up
        # (fix: the SHA1 provider is IDisposable and was never disposed)
        $session.Dispose()
        $sha1.Dispose()
    }
 
    # Print results
    Write-Host
 
    if ($duplicates.Count -gt 0)
    {
        Write-Host "Duplicates found:"
 
        foreach ($path1 in $duplicates.Keys)
        {
            Write-Host ("{0} <=> {1}" -f $path1, $duplicates[$path1])
        }
    }
    else
    {
        Write-Host "No duplicates found."
    }
 
    $result = 0
}
catch [Exception]
{
    Write-Host $_.Exception.Message
    $result = 1
}
 
# Pause if -pause switch was used
if ($pause)
{
    Write-Host "Press any key to exit..."
    [System.Console]::ReadKey() | Out-Null
}
 
exit $result

Advertisement

Last modified: by martin