Duplicate Finder Script

From Federal Burro of Information
Revision as of 23:58, 9 June 2023 by David (talk | contribs) (Created page with " finder.py <pre> #!/usr/bin/env python3 import os import hashlib def get_file_checksum(filename): """ Returns the SHA-256 hash of the contents of a file """ sha256 = hashlib.sha256() with open(filename, 'rb') as f: while True: data = f.read(1024) if not data: print("x", end = '') break sha256.update(data) print("!", end = '') return sha256.hexdigest() def fi...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation | Jump to search

finder.py

#!/usr/bin/env python3
import os
import hashlib

def get_file_checksum(filename, chunk_size=1024, *, show_progress=True):
    """
    Return the SHA-256 hash of the contents of a file.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so
    its contents are never held in memory all at once.

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Must be
            positive. Defaults to 1024.
        show_progress: When true (the default), print a "!" to stdout for
            every chunk hashed and an "x" once the end of the file is
            reached, without trailing newlines. Pass False to hash
            silently.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # A size of 0 would make read() return b'' immediately, producing the
    # hash of empty input for any file; reject it rather than lie.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    sha256 = hashlib.sha256()
    with open(filename, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns
        # b'', which is how a binary file signals end-of-file.
        for data in iter(lambda: f.read(chunk_size), b''):
            sha256.update(data)
            if show_progress:
                print("!", end='')
        if show_progress:
            print("x", end='')
    return sha256.hexdigest()

def find_duplicate_files(rootdir):
    """
    Find duplicate files in the directory tree rooted at rootdir.

    Every regular file under ``rootdir`` is hashed with
    ``get_file_checksum``; two files count as duplicates when their
    checksums are equal.

    Entries that are not regular files (broken symlinks, FIFOs, sockets,
    device nodes) are skipped, and a file that cannot be read is reported
    on stderr and skipped, so one bad entry does not abort the whole scan.

    Args:
        rootdir: Directory at which the walk starts.

    Returns:
        A list of ``(filepath, first_seen_path)`` tuples, one for each
        file whose checksum matches a file seen earlier in the walk.
        ``first_seen_path`` is the first file encountered with that
        checksum.
    """
    import sys  # local import: only needed to report skipped files

    checksums = {}  # checksum -> path of the first file seen with it
    duplicates = []
    for dirpath, _dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            # os.walk lists every non-directory entry; opening a FIFO can
            # block and a dangling symlink cannot be opened at all.
            if not os.path.isfile(filepath):
                continue
            try:
                checksum = get_file_checksum(filepath)
            except OSError as err:
                # e.g. permission denied, or the file vanished mid-scan.
                print(f'skipping {filepath}: {err}', file=sys.stderr)
                continue
            if checksum in checksums:
                duplicates.append((filepath, checksums[checksum]))
            else:
                checksums[checksum] = filepath
    return duplicates

if __name__ == '__main__':
    import sys

    # Scan the directory named on the command line; with no argument,
    # fall back to the location this script was originally written for.
    rootdir = sys.argv[1] if len(sys.argv) > 1 else '/home/david/david/'
    duplicates = find_duplicate_files(rootdir)

    # Hashing prints progress marks without a trailing newline; end that
    # line so the first report line is not glued onto it.
    print()
    for duplicate, original in duplicates:
        print(f'{duplicate} is a duplicate of {original}')