Duplicate Finder Script
From Federal Burro of Information
finder.py
#!/usr/bin/env python3
"""Report files under a directory tree whose contents are byte-identical."""
import hashlib
import os
import sys


def get_file_checksum(filename, chunk_size=1024):
    """Return the SHA-256 hex digest of the contents of a file.

    The file is opened in binary mode and read ``chunk_size`` bytes at a
    time, so it is never loaded into memory whole.  As a crude progress
    indicator a ``!`` is printed for every chunk hashed and an ``x`` once
    end-of-file is reached, both without a trailing newline.

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes requested per read (default 1024).

    Returns:
        The digest as a hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    sha256 = hashlib.sha256()
    with open(filename, 'rb') as f:
        # iter() with a sentinel keeps calling f.read() until it returns b''.
        for data in iter(lambda: f.read(chunk_size), b''):
            sha256.update(data)
            print("!", end='')
        print("x", end='')
    return sha256.hexdigest()


def find_duplicate_files(rootdir):
    """Find files with identical contents in the tree rooted at rootdir.

    Every file reported by ``os.walk`` is hashed with ``get_file_checksum``.
    The first path seen for a given digest is remembered; every later path
    with the same digest is reported as a duplicate of that first path.
    Directories and file names are visited in sorted order so the result
    does not depend on the order the filesystem lists entries in.

    Files that cannot be opened or read (dangling symlink, permission
    denied, removed during the scan, ...) are skipped with a message on
    stderr instead of aborting the whole scan.

    Args:
        rootdir: Directory at which the walk starts.

    Returns:
        A list of ``(duplicate_path, first_seen_path)`` tuples, in the order
        the duplicates were encountered.
    """
    checksums = {}  # digest -> first path seen with that digest
    duplicates = []
    for dirpath, dirnames, filenames in os.walk(rootdir):
        # Sorting in place also fixes the order in which os.walk descends
        # into the subdirectories (top-down walks honour this list).
        dirnames.sort()
        for filename in sorted(filenames):
            filepath = os.path.join(dirpath, filename)
            try:
                checksum = get_file_checksum(filepath)
            except OSError as err:
                # One unreadable entry must not lose the rest of the scan.
                print(f'skipping {filepath}: {err}', file=sys.stderr)
                continue
            if checksum in checksums:
                duplicates.append((filepath, checksums[checksum]))
            else:
                checksums[checksum] = filepath
    return duplicates


if __name__ == '__main__':
    # An optional first argument overrides the original hard-coded root.
    root = sys.argv[1] if len(sys.argv) > 1 else '/home/david/david/'
    duplicates = find_duplicate_files(root)
    # Terminate the line of progress markers so the report starts cleanly.
    print()
    for dup in duplicates:
        print(f'{dup[0]} is a duplicate of {dup[1]}')