Show progress during "0store optimise"
[zeroinstall.git] / zeroinstall / zerostore / optimise.py
blob 627e9d96d03e5fb8481c2b6016801def86c124af
1 """Optimise the cache."""
3 # Copyright (C) 2009, Thomas Leonard
4 # See the README file for details, or visit http://0install.net.
6 from __future__ import print_function
8 from zeroinstall import _
9 import os, sys
10 from logging import warn
12 def _already_linked(a, b):
13 ai = os.stat(a)
14 bi = os.stat(b)
15 return (ai.st_dev, ai.st_ino) == (bi.st_dev, bi.st_ino)
17 def _byte_identical(a, b):
18 af = file(a, 'rb')
19 bf = file(b, 'rb')
20 while True:
21 adata = af.read(100)
22 bdata = bf.read(100)
23 if adata != bdata:
24 return False
25 if not adata:
26 return True
def _link(a, b, tmpfile):
    """Replace 'b' with a hard link to 'a', leaving 'a' untouched.

    The link is first created as 'tmpfile' and then renamed over 'b'.
    The directory containing 'b' is given owner-write permission for the
    duration of the operation and has its original mode restored after.
    @param a: the file to keep
    @type a: str
    @param b: the file to replace with a link to 'a'
    @type b: str
    @param tmpfile: scratch path used for the new link before the rename
    @type tmpfile: str"""
    identical = _byte_identical(a, b)
    if not identical:
        # NOTE(review): a mismatch is only reported; the link below still
        # goes ahead and replaces 'b'.
        details = {'file_a': a, 'file_b': b}
        warn(_("Files should be identical, but they're not!\n%(file_a)s\n%(file_b)s"), details)

    parent = os.path.dirname(b)
    saved_mode = os.lstat(parent).st_mode
    writable_mode = saved_mode | 0o200
    os.chmod(parent, writable_mode)  # Need write access briefly
    try:
        os.link(a, tmpfile)
        try:
            os.rename(tmpfile, b)
        except:
            # Don't leave the scratch link behind; the error still propagates.
            os.unlink(tmpfile)
            raise
    finally:
        os.chmod(parent, saved_mode)
def optimise(impl_dir):
    """Scan an implementation cache directory for duplicate files, and
    hard-link any duplicates together to save space.

    Each sub-directory's '.manifest' file is read; regular ('F') and
    executable ('X') entries sharing the same (type, digest, mtime, size)
    are linked to the first copy seen. Directories whose name starts with
    '.' or contains no '=' are skipped with a warning, as are directories
    whose manifest cannot be opened. 'sha1' implementations are skipped
    silently. Progress is printed to stdout.
    @param impl_dir: a $cache/0install.net/implementations directory
    @type impl_dir: str
    @return: (unique bytes, duplicated bytes, already linked, manifest size)
    @rtype: (int, int, int, int)
    @raise Exception: if no unused temporary file name could be found"""

    first_copy = {}  # (type, digest, mtime, size) -> (impl, subdir, path)
    dup_size = uniq_size = already_linked = man_size = 0

    import random

    # Pick a scratch name inside impl_dir, used by _link() for the new link.
    for attempt in range(10):
        tmpfile = os.path.join(impl_dir, 'optimise-%d' % random.randint(0, 1000000))
        if not os.path.exists(tmpfile):
            break
    else:
        raise Exception(_("Can't generate unused tempfile name!"))

    dirs = os.listdir(impl_dir)
    total = len(dirs)
    msg = ""

    def clear():
        # Blank out the previous progress message and return to column 0.
        print("\r" + (" " * len(msg)) + "\r", end='')

    for i, impl in enumerate(dirs):
        clear()
        msg = _("[%(done)d / %(total)d] Reading manifests...") % {'done': i, 'total': total}
        print(msg, end='')
        sys.stdout.flush()

        if impl.startswith('.') or '=' not in impl:
            warn(_("Skipping non-implementation '%s'"), impl)
            continue
        manifest_path = os.path.join(impl_dir, impl, '.manifest')
        try:
            # open() rather than the Python-2-only file() builtin.
            ms = open(manifest_path, 'rb')
        except (IOError, OSError) as ex:
            # Python 2 raises IOError here, which is not an OSError there.
            # The dict key must match the %(manifest_path)s placeholder.
            warn(_("Failed to read manifest file '%(manifest_path)s': %(exception)s"), {'manifest_path': manifest_path, 'exception': str(ex)})
            continue

        # 'with' closes the manifest on every path, including 'continue'.
        with ms:
            alg = impl.split('=', 1)[0]
            if alg == 'sha1':
                continue

            man_size += os.path.getsize(manifest_path)

            subdir = ""
            for line in ms:
                if not isinstance(line, str):
                    # Python 3: binary reads give bytes; turn them into text
                    # so the comparisons below work. surrogateescape keeps
                    # undecodable bytes round-trippable into file names.
                    line = line.decode('utf-8', 'surrogateescape')

                if line[0] == 'D':
                    itype, path = line.split(' ', 1)
                    assert path.startswith('/')
                    subdir = path[1:-1]  # Strip slash and newline
                    continue

                if line[0] == "S":
                    # Symlink entries only contribute to the unique size.
                    itype, digest, size, rest = line.split(' ', 3)
                    uniq_size += int(size)
                    continue

                assert line[0] in "FX"

                itype, digest, mtime, size, path = line.split(' ', 4)
                path = path[:-1]  # Strip newline
                size = int(size)

                key = (itype, digest, mtime, size)
                loc_path = (impl, subdir, path)

                first_loc = first_copy.get(key, None)
                if first_loc:
                    first_full = os.path.join(impl_dir, *first_loc)
                    new_full = os.path.join(impl_dir, *loc_path)
                    if _already_linked(first_full, new_full):
                        already_linked += size
                    else:
                        _link(first_full, new_full, tmpfile)
                        dup_size += size
                else:
                    first_copy[key] = loc_path
                    uniq_size += size
    clear()
    return (uniq_size, dup_size, already_linked, manifest_size_total(man_size))


def manifest_size_total(man_size):
    """Return the accumulated manifest size unchanged.
    @type man_size: int
    @rtype: int"""
    return man_size