#!/usr/bin/env python2.7

# Identify duplicate files that could be replaced with hard links to save space:

#     find /Users/boris -type f | ./dedup.py --scan > boris.txt
#     cat boris.txt | ./dedup.py --analyze

# The candidate list generated by those steps should be carefully reviewed
# for safety. For example, any files under version control or subject to
# modification must be left alone, even if they appear to be duplicates.

# When satisfied that only safe files are to be deduped, run:

#     cat boris.txt | ./dedup.py --implement

# Note that in the "implement" stage, deduplication messages will be echoed
# in arbitrary order due to parallel execution.
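
# Each line written by --scan to stdout, and read back on stdin by the
# --analyze and --implement stages, has the form:
#
#     <md5_hash> <n_link> <size_bytes> <file_name>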

import os
import sys
import threading
import subprocess
from collections import defaultdict


# scan params
XARGS_COUNT = 50     # filenames hashed per md5 invocation, xargs-style
MAX_SUBPROCS = 8     # max concurrent worker subprocesses

# analysis params
MIN_FILE_SIZE = 256*1024     # ignore files smaller than this (bytes)
MIN_DEDUP = 2*1024*1024      # minimum savings (bytes) worth deduping
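

# A single re-entrant lock serializes writes to stdout and stderr so that
# output lines from concurrent worker threads do not interleave.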
ts_lock = threading.RLock()


def tsout(msg):
    with ts_lock:
        sys.stdout.write(str(msg))
        sys.stdout.write("\n")


def tserr(msg):
    with ts_lock:
        sys.stderr.write(str(msg))
        sys.stderr.write("\n")
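

# Semaphore bounding the number of concurrently running worker threads
# (each worker drives one subprocess); a slot is acquired before a thread
# starts and released by the worker when it finishes.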
subprocs = threading.Semaphore(MAX_SUBPROCS)

SCAN_RESULTS = []
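

# Hash one chunk of files with a single md5 subprocess, stat each file for
# its link count and size, and append (hash, n_link, size, name) records to
# SCAN_RESULTS while echoing them to stdout. Uses the macOS md5(1) utility;
# the -r flag prints "<hash> <filename>".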
def process(chunk):
    try:
        md5_command = "/sbin/md5 -r".split() + chunk
        md5_raw_results = subprocess.check_output(md5_command)
        md5_results = []
        for i, line in enumerate(md5_raw_results.split("\n")):
            if not line:
                continue
            md5_hash, file_name = line.lstrip().split(None, 1)
            orig_file_name = chunk[i]
            if file_name != orig_file_name:
                tserr("WARNING: Filename not preserved by md5: before '{}', after '{}'. Skipping.".format(orig_file_name, file_name))
                continue
            sr = os.stat(orig_file_name)
            r = (md5_hash, sr.st_nlink, sr.st_size, orig_file_name)
            md5_results.append(r)
            tsout("{} {} {} {}".format(*r))
        with ts_lock:
            SCAN_RESULTS.extend(md5_results)
    finally:
        subprocs.release()
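

# Throttled fan-out: block until a semaphore slot is free, then hand the
# chunk to a new worker thread; process() releases the slot when done.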
def enqueue_scan(chunk):
    subprocs.acquire()
    threading.Thread(target=process, args=[chunk]).start()
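

# Read one filename per line from stdin, batch the names into chunks of
# XARGS_COUNT, and hash each chunk on a worker thread.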
def scan():
    tserr("Reading filenames from stdin.")
    line_count = 0
    chunk = []
    for line in sys.stdin:
        assert line
        line_count += 1
        line = line.rstrip("\n")
        if len(chunk) == XARGS_COUNT:
            enqueue_scan(chunk)
            chunk = []
        chunk.append(line)
    if chunk:
        enqueue_scan(chunk)
    # Wait for all workers by draining every semaphore slot.
    for i in xrange(MAX_SUBPROCS):
        subprocs.acquire()
    tserr("Scanned {} files.".format(len(SCAN_RESULTS)))
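

# Group of identical-content files keyed by MD5. Each path is charged
# size/n_link bytes, so an inode's disk usage is split evenly among its
# hard links; current_size is the group's total disk footprint, and
# ideal_size (one copy) is what it would cost fully deduped. For example,
# three separate 6 MB copies (n_link=1 each) give current_size 18 MB vs
# ideal_size 6 MB, a 12 MB saving; if two of the paths already share an
# inode (n_link=2 each), current_size is 3 + 3 + 6 = 12 MB and the saving
# drops to 6 MB.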
class MD5Hash(object):

    def __init__(self):
        self.md5_hash = None
        self.files = []
        self.current_size = 0.0
        self.ideal_size = None

    def add_file(self, md5_hash, n_link, size, file_name):
        assert not self.md5_hash or self.md5_hash == md5_hash
        self.md5_hash = md5_hash
        assert not self.ideal_size or self.ideal_size == size
        self.ideal_size = size
        self.files.append((md5_hash, n_link, size, file_name))
        self.current_size += (float(size) / n_link)
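
    # Re-verify byte-for-byte equality with cmp(1) before replacing each
    # duplicate with a hard link to the first file; "ln -f" overwrites the
    # duplicate in place.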
    def dedup(self):
        try:
            src = self.files[0]
            for dst in self.files[1:]:
                src_filename = src[3]
                dst_filename = dst[3]
                try:
                    subprocess.check_output(["/usr/bin/cmp", src_filename, dst_filename])
                except subprocess.CalledProcessError:
                    tserr("Skipping due to mismatch: '{}' '{}'".format(src_filename, dst_filename))
                else:
                    subprocess.check_output(["/bin/ln", "-f", src_filename, dst_filename])
                    tsout("Deduplicated '{}' => '{}'".format(dst_filename, src_filename))
        finally:
            subprocs.release()


def enqueue_dedup(md5):
    subprocs.acquire()
    threading.Thread(target=md5.dedup).start()
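

# Parse scan records from stdin into MD5Hash groups. all_files tracks every
# record (for the totals reported at the end); files keeps only records of
# at least MIN_FILE_SIZE. Groups whose potential saving reaches MIN_DEDUP
# are reported and, unless dry_run, handed to dedup worker threads.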
def optimize(dry_run):
    all_files = defaultdict(MD5Hash)
    files = defaultdict(MD5Hash)
    line_count = 0
    for line in sys.stdin:
        assert line
        line_count += 1
        line = line.rstrip("\n")
        md5_hash, n_link, size, file_name = line.split(None, 3)
        n_link = int(n_link)
        size = int(size)
        all_files[md5_hash].add_file(md5_hash, n_link, size, file_name)
        if size < MIN_FILE_SIZE:
            continue
        files[md5_hash].add_file(md5_hash, n_link, size, file_name)
    mb = 1024*1024.0
    operated_upon = []
    for md5 in sorted(files.itervalues(), key=lambda f: f.current_size - f.ideal_size):
        if md5.current_size - md5.ideal_size < MIN_DEDUP:
            continue
        operated_upon.append(md5)
        tsout("")
        tsout("Save {:.1f} MB by deduping\n{}".format((md5.current_size - md5.ideal_size) / mb, "".join(["\n " + f[3] for f in md5.files])))
        tsout("")
        if not dry_run:
            enqueue_dedup(md5)
    if not dry_run:
        # Wait for all dedup workers by draining every semaphore slot.
        for i in xrange(MAX_SUBPROCS):
            subprocs.acquire()
    ideal = sum(f.ideal_size for f in operated_upon) / mb
    current = sum(f.current_size for f in operated_upon) / mb
    total = sum(f.current_size for f in all_files.itervalues()) / mb
    eligible_count = sum(len(f.files) for f in operated_upon)
    tsout("Parameters: MIN_FILE_SIZE={:3.1f} MB, MIN_DEDUP={:3.1f} MB.".format(MIN_FILE_SIZE/mb, MIN_DEDUP/mb))
    tsout("{} files eligible for deduplication = {:3.1f}% of {} total files.".format(eligible_count, 100.0*eligible_count/line_count, line_count))
    tsout("Space savings {:.0f} MB = {:3.1f}% of {:.0f} MB total space.".format(current - ideal, 100.0*(current - ideal)/max(total, 0.00001), total))
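

# Dispatch on the first command-line argument; strip('-') makes "scan",
# "-scan", and "--scan" all equivalent, and British/American spellings are
# accepted for the analyze and optimize stages.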
if __name__ == "__main__":
    if len(sys.argv) >= 2 and sys.argv[1].strip('-').lower() == "scan":
        scan()
    elif len(sys.argv) >= 2 and sys.argv[1].strip('-').lower() in ("analyze", "analyse", "analysis"):
        optimize(dry_run=True)
    elif len(sys.argv) >= 2 and sys.argv[1].strip('-').lower() in ("optimize", "optimise", "implement", "implementation"):
        optimize(dry_run=False)
    else:
        tserr("Unsupported command line.")
        sys.exit(-1)