Log using the "0install" logger rather than "root"
[zeroinstall/solver.git] / zeroinstall / zerostore / optimise.py
blob8ba46a39368b061b89899bb1a5511c423e3d6c35
1 """Optimise the cache."""
3 # Copyright (C) 2009, Thomas Leonard
4 # See the README file for details, or visit http://0install.net.
6 from __future__ import print_function
8 from zeroinstall import _, logger
9 import os, sys
11 def _already_linked(a, b):
12 ai = os.stat(a)
13 bi = os.stat(b)
14 return (ai.st_dev, ai.st_ino) == (bi.st_dev, bi.st_ino)
16 def _byte_identical(a, b):
17 with open(a, 'rb') as af:
18 with open(b, 'rb') as bf:
19 while True:
20 adata = af.read(100)
21 bdata = bf.read(100)
22 if adata != bdata:
23 return False
24 if not adata:
25 return True
def _link(a, b, tmpfile):
	"""Keep 'a'; delete 'b' and replace it with a hard-link to 'a'.

	The replacement is done atomically: link 'a' to 'tmpfile' (which must
	be a free pathname on the same filesystem), then rename over 'b'.
	@param a: path of the copy to keep
	@param b: path of the duplicate to replace
	@param tmpfile: unused pathname on the same filesystem as 'b'"""
	if not _byte_identical(a, b):
		# Manifest said these were duplicates, but the bytes differ: the
		# store is corrupt.  Warn, but link anyway (the caller trusts the
		# digests).  logger.warning, not the deprecated logger.warn alias.
		logger.warning(_("Files should be identical, but they're not!\n%(file_a)s\n%(file_b)s"), {'file_a': a, 'file_b': b})

	b_dir = os.path.dirname(b)
	old_mode = os.lstat(b_dir).st_mode
	os.chmod(b_dir, old_mode | 0o200)	# Need write access briefly
	try:
		os.link(a, tmpfile)
		try:
			os.rename(tmpfile, b)
		except:
			os.unlink(tmpfile)	# Don't leave the temporary link behind
			raise
	finally:
		os.chmod(b_dir, old_mode)	# Always restore the directory's mode
def optimise(impl_dir):
	"""Scan an implementation cache directory for duplicate files, and
	hard-link any duplicates together to save space.
	@param impl_dir: a $cache/0install.net/implementations directory
	@type impl_dir: str
	@return: (unique bytes, duplicated bytes, already linked, manifest size)
	@rtype: (int, int, int, int)"""
	first_copy = {}		# TypeDigest -> Path
	dup_size = uniq_size = already_linked = man_size = 0

	import random
	from zeroinstall.zerostore import BadDigest, parse_algorithm_digest_pair

	# Pick an unused temporary name inside impl_dir itself, so that
	# _link's link+rename stays on one filesystem and is atomic.
	for _attempt in range(10):
		tmpfile = os.path.join(impl_dir, 'optimise-%d' % random.randint(0, 1000000))
		if not os.path.exists(tmpfile):
			break
	else:
		raise Exception(_("Can't generate unused tempfile name!"))

	dirs = os.listdir(impl_dir)
	total = len(dirs)
	msg = ""
	def clear():
		# Erase the current progress line from the terminal
		print("\r" + (" " * len(msg)) + "\r", end='')
	for i, impl in enumerate(dirs):
		clear()
		msg = _("[%(done)d / %(total)d] Reading manifests...") % {'done': i, 'total': total}
		print(msg, end='')
		sys.stdout.flush()

		try:
			alg, manifest_digest = parse_algorithm_digest_pair(impl)
		except BadDigest:
			logger.warning(_("Skipping non-implementation '%s'"), impl)
			continue
		manifest_path = os.path.join(impl_dir, impl, '.manifest')
		try:
			# IOError too: under Python 2 open() raises IOError, which is
			# not a subclass of OSError (they're the same type on Python 3).
			ms = open(manifest_path, 'rt')
		except (IOError, OSError) as ex:
			# Fixed: the dict key must match the %(manifest_path)s format field
			logger.warning(_("Failed to read manifest file '%(manifest_path)s': %(exception)s"), {'manifest_path': manifest_path, 'exception': str(ex)})
			continue

		# 'with' ensures the manifest is closed even on the sha1 skip below
		# (previously the file object was leaked on every iteration).
		with ms:
			if alg == 'sha1': continue	# old sha1 manifests can't be processed here

			man_size += os.path.getsize(manifest_path)

			dir = ""
			for line in ms:
				if line[0] == 'D':
					itype, path = line.split(' ', 1)
					assert path.startswith('/')
					dir = path[1:-1] # Strip slash and newline
					continue

				if line[0] == "S":
					# Symlinks can't be hard-linked; just count their size
					itype, digest, size, rest = line.split(' ', 3)
					uniq_size += int(size)
					continue

				assert line[0] in "FX"

				itype, digest, mtime, size, path = line.split(' ', 4)
				path = path[:-1] # Strip newline
				size = int(size)

				# Files are interchangeable only if type, digest, mtime
				# and size all match (mtime matters: linking merges them).
				key = (itype, digest, mtime, size)
				loc_path = (impl, dir, path)

				first_loc = first_copy.get(key, None)
				if first_loc:
					first_full = os.path.join(impl_dir, *first_loc)
					new_full = os.path.join(impl_dir, *loc_path)
					if _already_linked(first_full, new_full):
						already_linked += size
					else:
						_link(first_full, new_full, tmpfile)
						dup_size += size
				else:
					first_copy[key] = loc_path
					uniq_size += size
	clear()
	return (uniq_size, dup_size, already_linked, man_size)