Handle errors hashing files that vanish.
[asure.git] / assurance / main.py
blob6494dcf1fbc9b66a14476f76ac96752ed23548a3
1 #! /usr/bin/env python
3 # Directory integrity scanner.
5 from stat import *
6 import os
7 import sys
8 from os.path import join
10 from cPickle import dump, load
11 import gzip
13 import hashing
def walk(top):
    """Generate the scan events for the tree rooted at 'top'.

    The root directory is stat'ed once and then handed to walker(),
    which reports it under the name '.'.  Every event walker()
    produces is passed through unchanged.
    """
    root_stat = os.lstat(top)
    for event in walker(top, '.', root_stat):
        yield event
def walker(path, name, dirstat):
    """Directory tree generator.

    At one point, this started as a copy of os.walk from Python's
    library.  Even the arguments are different now.

    path    -- filesystem path of the directory to read.
    name    -- name reported for this directory in its 'd' event.
    dirstat -- lstat result of the directory itself.

    Yields, in order:
      ('d', name, info)   on entering the directory,
      the events of each subdirectory (recursively, sorted by name),
      ('-', name, info)   for each entry that is not a directory,
      ('u',)              on leaving the directory.
    'info' is whatever convert_stat() returns for the entry.

    A directory that cannot be listed produces a warning on stderr and
    no events at all.  An entry that cannot be lstat'ed (for instance
    because it vanished after the directory was listed) is skipped
    with a warning instead of aborting the whole scan.  Subdirectories
    that live on a different device than 'dirstat' are not entered and
    therefore not reported.
    """
    try:
        names = os.listdir(path)
    except OSError:
        sys.stderr.write("Warning, can't read dir: %s\n" % path)
        return

    # The verification algorithm requires the names to be sorted.
    names.sort()

    # The scan files themselves live in the top directory and must not
    # show up in the scan.
    at_top = (path == '.')
    scan_files = ("0sure.dat.gz", "0sure.bak.gz", "0sure.0.gz")

    # Stat each name found, and put the result in one of two lists.
    dirs, nondirs = [], []
    for onename in names:
        if at_top and onename in scan_files:
            continue
        fullname = join(path, onename)
        try:
            st = os.lstat(fullname)
        except OSError:
            # The entry disappeared (or became unreachable) between
            # the listdir and the lstat.
            sys.stderr.write("Warning, can't stat: %s\n" % fullname)
            continue
        if S_ISDIR(st.st_mode):
            dirs.append((onename, st))
        else:
            nondirs.append((onename, st))

    # Indicate "entering" the directory.
    yield 'd', name, convert_stat(dirstat)

    # Then recursively walk into all of the subdirectories, staying on
    # the device of the directory being walked.
    for (onename, st) in dirs:
        if st.st_dev == dirstat.st_dev:
            for x in walker(join(path, onename), onename, st):
                yield x

    # Then yield each entry that is not a subdirectory.
    for (onename, st) in nondirs:
        yield '-', onename, convert_stat(st)

    # Last, yield the leaving.
    yield ('u',)
def convert_stat(st):
    """Convert the passed stat info into an association (a dict).

    Does not do anything that requires reading the file (such as
    readlink or md5); those fields are added by other code.

    st -- an object with the usual stat fields (st_mode, st_uid, ...).

    Returns a dict whose 'kind' is one of 'dir', 'file', 'lnk',
    'sock', 'fifo', 'blk' or 'chr'.  Every kind except 'lnk' carries
    'uid', 'gid' and 'perm'; 'file' adds 'mtime', 'ctime' and 'ino';
    'blk' and 'chr' add 'devmaj' and 'devmin'.

    Raises ValueError if the mode is none of the known file types.
    """
    mode = st.st_mode

    if S_ISLNK(mode):
        # Only the kind is recorded for a symlink at this stage.
        return {'kind': 'lnk'}

    info = {'uid': st.st_uid,
            'gid': st.st_gid,
            'perm': S_IMODE(mode)}

    # Each S_IS* predicate has to be *called* with the mode; testing
    # the bare function object is always true.
    if S_ISDIR(mode):
        info['kind'] = 'dir'
    elif S_ISREG(mode):
        info['kind'] = 'file'
        info['mtime'] = st.st_mtime
        info['ctime'] = st.st_ctime
        info['ino'] = st.st_ino
    elif S_ISSOCK(mode):
        info['kind'] = 'sock'
    elif S_ISFIFO(mode):
        info['kind'] = 'fifo'
    elif S_ISBLK(mode) or S_ISCHR(mode):
        if S_ISBLK(mode):
            info['kind'] = 'blk'
        else:
            info['kind'] = 'chr'
        info['devmaj'] = os.major(st.st_rdev)
        info['devmin'] = os.minor(st.st_rdev)
    else:
        raise ValueError("Unknown file kind")

    return info
def empty_tree():
    """Generate the events of a tree holding only an empty root.

    The root directory is named '.' and carries no attributes, so
    nothing meaningful can be compared against it.
    """
    for event in (('d', '.', {}), ('u',)):
        yield event
def empty_generator():
    """Return a generator that finishes without yielding anything."""
    # The never-executed yield is what makes this function a generator.
    if False:
        yield ()
# Bit flags passed to comparer.handle_leave(): the directory being left
# exists only on the right (add), only on the left (delete), or on
# both sides (both bits set).
mode_add = 1
mode_delete = 2
mode_both = 3
class comparer:
    """Class for comparing two directory iterations.  Keeps track of
    state, and allows child classes to define handlers for the various
    types of differences found.

    'left' and 'right' are iterators over tree events as produced by
    walker(): ('d', name, info) on entering a directory, ('-', name,
    info) for an entry that is not a directory, and ('u',) on leaving
    a directory.  The comparison relies on both streams listing the
    subdirectories of a directory before its other entries, each group
    sorted by name.

    Every handler returns an iterable; run() yields whatever the
    handlers produce, in order.
    """

    def __init__(self, left, right):
        self.__left = left
        self.__right = right

    # Default handlers for the 6 possible changes (or not changes)
    # that can happen in a directory.  The adds and deletes take an
    # additional argument that will be set to true if this added or
    # removed entity is contained in an entirely new (or entirely
    # removed) directory.  Some handlers may want to avoid printing
    # verbose messages for the contents of added or deleted
    # directories, and can use this value.
    def handle_same_dir(self, path, a, b):
        """Directory present on both sides ('a' left, 'b' right)."""
        return empty_generator()

    def handle_delete_dir(self, path, a, recursing):
        """Directory 'a' is present on the left only."""
        return empty_generator()

    def handle_add_dir(self, path, a, recursing):
        """Directory 'a' is present on the right only."""
        return empty_generator()

    def handle_same_nondir(self, path, a, b):
        """Non-directory present on both sides ('a' left, 'b' right)."""
        return empty_generator()

    def handle_delete_nondir(self, path, a, recursing):
        """Non-directory 'a' is present on the left only."""
        return empty_generator()

    def handle_add_nondir(self, path, a, recursing):
        """Non-directory 'a' is present on the right only."""
        return empty_generator()

    def handle_leave(self, path, mode):
        """Handle the leaving of a directory.  Instead of 'recursing',
        the mode is defined as 'mode_add' (1) for add, 'mode_delete'
        (2) for delete, or these two or'd together 'mode_both' (3) for
        both"""
        return empty_generator()

    def run(self):
        """Compare the two trees, yielding what the handlers produce.

        Raises ValueError if either stream does not begin with a
        directory event.
        """
        a = self.__left.next()
        if a[0] != 'd':
            raise ValueError("Scan doesn't start with a directory")
        b = self.__right.next()
        if b[0] != 'd':
            raise ValueError("Tree walk doesn't start with a directory")
        for x in self.handle_same_dir(".", a, b):
            yield x
        for x in self.__run(b[1], 1):
            yield x

    def __run(self, path, depth):
        """Iterate both pairs of directories equally

        Processes the contents of a single directory, recursively
        calling itself to handle child directories.  Returns with both
        iterators advanced past the 'u' node that ends the dir.

        'depth' is the nesting level; it is only passed along to the
        recursive calls.
        """
        a = self.__left.next()
        b = self.__right.next()

        while True:
            akind, bkind = a[0], b[0]

            if akind == 'u' and bkind == 'u':
                # Both are leaving the directory.
                for x in self.handle_leave(path, mode_both):
                    yield x
                return

            # Classify the entries under the two cursors: present on
            # both sides, only on the left, or only on the right.
            if akind == bkind and akind in ('d', '-'):
                # Same kind of entry: the names decide.
                if a[1] == b[1]:
                    action = 'same'
                elif a[1] < b[1]:
                    action = 'delete'
                else:
                    action = 'add'
            elif (akind, bkind) in (('-', 'u'), ('d', '-'), ('d', 'u')):
                # The left side still has an entry of a group the
                # right side has already finished.
                action = 'delete'
            elif (akind, bkind) in (('u', '-'), ('-', 'd'), ('u', 'd')):
                # Mirror image: the right side has the extra entry.
                action = 'add'
            else:
                print("Unhandled case: '%s' and '%s'" % (a[0], b[0]))
                sys.exit(2)

            if action == 'same':
                if akind == 'd':
                    for x in self.handle_same_dir(path, a, b):
                        yield x
                    # Walk the common subdirectory; this consumes both
                    # streams up to and including its 'u' events.
                    for x in self.__run(os.path.join(path, a[1]),
                                        depth + 1):
                        yield x
                else:
                    for x in self.handle_same_nondir(path, a, b):
                        yield x
                a = self.__left.next()
                b = self.__right.next()

            elif action == 'delete':
                if akind == 'd':
                    for x in self.handle_delete_dir(path, a, False):
                        yield x
                    for x in self.delete_whole_dir(
                            self.__left, os.path.join(path, a[1])):
                        yield x
                else:
                    for x in self.handle_delete_nondir(path, a, False):
                        yield x
                a = self.__left.next()

            else:
                if bkind == 'd':
                    for x in self.handle_add_dir(path, b, False):
                        yield x
                    for x in self.add_whole_dir(
                            self.__right, os.path.join(path, b[1])):
                        yield x
                else:
                    for x in self.handle_add_nondir(path, b, False):
                        yield x
                b = self.__right.next()

    def add_whole_dir(self, iter, path):
        "Consume entries until this directory has been added"
        # Everything below an added directory is reported with
        # recursing=True.
        while True:
            a = iter.next()
            if a[0] == 'u':
                for x in self.handle_leave(path, mode_add):
                    yield x
                return
            elif a[0] == 'd':
                for x in self.handle_add_dir(path, a, True):
                    yield x
                for x in self.add_whole_dir(iter, os.path.join(path, a[1])):
                    yield x
            else:
                for x in self.handle_add_nondir(path, a, True):
                    yield x

    def delete_whole_dir(self, iter, path):
        "Consume entries until this directory has been deleted"
        # Everything below a deleted directory is reported with
        # recursing=True.
        while True:
            a = iter.next()
            if a[0] == 'u':
                for x in self.handle_leave(path, mode_delete):
                    yield x
                return
            elif a[0] == 'd':
                for x in self.handle_delete_dir(path, a, True):
                    yield x
                for x in self.delete_whole_dir(iter,
                                               os.path.join(path, a[1])):
                    yield x
            else:
                for x in self.handle_delete_nondir(path, a, True):
                    yield x
# For each kind of entry, the attributes that have to be present and
# equal on both sides for the entry to count as unchanged.
__must_match = {
    'dir': ['uid', 'gid', 'perm'],
    'file': ['uid', 'gid', 'mtime', 'perm', 'md5'],
    'lnk': ['targ'],
    'sock': ['uid', 'gid', 'perm'],
    'fifo': ['uid', 'gid', 'perm'],
    'blk': ['uid', 'gid', 'perm', 'devmaj', 'devmin'],
    'chr': ['uid', 'gid', 'perm', 'devmaj', 'devmin'],
}


def compare_entries(path, a, b):
    """Yield report lines describing how entry 'b' differs from 'a'.

    path -- name printed in the report lines.
    a, b -- attribute dicts of the old and the new entry; both must
            have a 'kind' key.

    If the kinds differ, a '-' line for the old kind and a '+' line
    for the new kind are produced.  Otherwise a single line lists the
    attributes required for this kind that are missing on either side
    or differ; when the link target is among them, the old and new
    targets (where known) follow.  Nothing is yielded for entries that
    match.

    Raises KeyError for a kind that has no list of required attributes.
    """
    if a['kind'] != b['kind']:
        yield "- %-20s %s" % (a['kind'], path)
        yield "+ %-20s %s" % (b['kind'], path)
        return

    # An attribute missing on either side counts as a mismatch.
    misses = [item for item in __must_match[a['kind']]
              if item not in a or item not in b or a[item] != b[item]]
    if not misses:
        return

    yield " [%-18s] %s" % (",".join(misses), path)
    if 'targ' in misses:
        if 'targ' in a:
            yield " old targ: %s" % a['targ']
        if 'targ' in b:
            yield " new targ: %s" % b['targ']
class check_comparer(comparer):
    """Comparer for comparing either two trees, or a tree and a
    filesystem.  'right' should be the newer tree.
    Yields strings giving the tree differences.

    Entries present on both sides are reported through
    compare_entries().  Added and deleted entries get one '+' or '-'
    line each, except for the contents of a directory that was itself
    added or deleted (recursing is true), which are not listed.
    """

    def handle_same_dir(self, path, a, b):
        return compare_entries(os.path.join(path, a[1]), a[2], b[2])

    def handle_delete_dir(self, path, a, recursing):
        if not recursing:
            yield "- %-20s %s" % ('dir', os.path.join(path, a[1]))

    def handle_add_dir(self, path, a, recursing):
        if not recursing:
            yield "+ %-20s %s" % ('dir', os.path.join(path, a[1]))

    def handle_same_nondir(self, path, a, b):
        return compare_entries(os.path.join(path, a[1]), a[2], b[2])

    def handle_delete_nondir(self, path, a, recursing):
        if not recursing:
            yield "- %-20s %s" % (a[2]['kind'], os.path.join(path, a[1]))

    def handle_add_nondir(self, path, a, recursing):
        if not recursing:
            yield "+ %-20s %s" % (a[2]['kind'], os.path.join(path, a[1]))
def update_link(assoc, path, name):
    """Record the target of a symlink in its attribute dict.

    If 'assoc' describes a symlink (kind 'lnk'), the link 'name' in
    directory 'path' is read and its target stored under 'targ'.
    Entries of any other kind are left untouched.
    """
    if assoc['kind'] != 'lnk':
        return
    link_path = os.path.join(path, name)
    assoc['targ'] = os.readlink(link_path)
def same_inode(a, b):
    """Tell whether two attribute dicts appear to describe the same,
    unmodified inode: equal kind, inode number and ctime.

    The fields are checked in that order and the check stops at the
    first difference, so 'ino' and 'ctime' are only looked up when the
    kinds agree.
    """
    for field in ('kind', 'ino', 'ctime'):
        if a[field] != b[field]:
            return False
    return True
class update_comparer(comparer):
    """Yields a tree equivalent to the right tree, which should be
    coming from a live filesystem.  Fills in symlink destinations and
    file md5sums (if possible).

    The left tree acts as a cache: a file whose kind, inode number and
    ctime are unchanged (see same_inode) keeps the md5 recorded on the
    left instead of being hashed again.  A file that cannot be hashed
    gets the md5 '[error]'.
    """

    def _file_md5(self, path, name):
        """Return the hash of file 'name' in 'path', or '[error]'."""
        try:
            return hashing.hashof(os.path.join(path, name))
        except EnvironmentError:
            # Covers both OSError and IOError: the file may have
            # vanished or become unreadable since it was stat'ed.
            return '[error]'

    def handle_same_dir(self, path, a, b):
        yield b

    def handle_add_dir(self, path, a, recursing):
        yield a

    def handle_same_nondir(self, path, a, b):
        update_link(b[2], path, b[1])
        if b[2]['kind'] == 'file':
            if same_inode(a[2], b[2]):
                # Unchanged inode: reuse the previously computed hash.
                b[2]['md5'] = a[2]['md5']
            else:
                b[2]['md5'] = self._file_md5(path, b[1])
        yield b

    def handle_add_nondir(self, path, a, recursing):
        update_link(a[2], path, a[1])
        if a[2]['kind'] == 'file':
            a[2]['md5'] = self._file_md5(path, a[1])
        yield a

    def handle_leave(self, path, mode):
        # Only directories that exist on the right are part of the
        # output, so only those get their closing event.
        if (mode & mode_add) != 0:
            yield ('u',)
# Header string identifying the format of newly written scan files.
version = "Asure scan version 1.1"
def read1_0(fd):
    """Decode the body of a version 1.0 scan file.

    In this format every pickled object in 'fd' is one item; items are
    yielded until the end of the file is reached.
    """
    try:
        record = load(fd)
        while True:
            yield record
            record = load(fd)
    except EOFError:
        # End of file: the stream is complete.
        pass
def read1_1(fd):
    """Decode the body of a version 1.1 scan file.

    In this format every pickled object in 'fd' is a sequence of
    items; the items of each sequence are yielded in turn until the
    end of the file is reached.
    """
    try:
        batch = load(fd)
        while True:
            for entry in batch:
                yield entry
            batch = load(fd)
    except EOFError:
        # End of file: the stream is complete.
        pass
# Maps the header string found at the start of a scan file to the
# generator function that decodes the rest of that file.
readers = {}
readers['Asure scan version 1.0'] = read1_0
readers['Asure scan version 1.1'] = read1_1
def reader(path):
    """Iterate over a previously written dump

    Opens the gzip file 'path', reads the version header and yields
    the items decoded by the reader registered for that version.  The
    file is closed when the iteration ends, fails, or the generator is
    discarded.

    Raises ValueError if the header names a version with no reader.
    """
    # NOTE: the file content is unpickled, which can execute arbitrary
    # code; only read scan files from a trusted source.
    fd = gzip.open(path, 'rb')
    try:
        vers = load(fd)
        if vers not in readers:
            raise ValueError("Unsupported version of asure file")
        for item in readers[vers](fd):
            yield item
    finally:
        fd.close()
# Pickle protocol handed to dump() by the writers; per the pickle
# documentation a negative value selects the highest one available.
use_protocol = -1
def writer_new(path, iter):
    """Write the given item (probably assembled iterator)

    Creates the gzip file 'path' in the current format: the version
    header first, then the items of 'iter' pickled in lists of up to
    100 items each.  The file is closed even if producing or writing
    an item fails.
    """
    fd = gzip.open(path, 'wb')
    try:
        dump(version, fd, use_protocol)
        items = []
        for item in iter:
            items.append(item)
            if len(items) >= 100:
                dump(items, fd, use_protocol)
                items = []
        # Flush the final, partially filled batch.
        if items:
            dump(items, fd, use_protocol)
    finally:
        fd.close()
def writer_old(path, iter):
    """Write the given item (probably assembled iterator)

    Creates the gzip file 'path' in the 1.0 format: the 1.0 version
    header first, then every item of 'iter' pickled on its own.  The
    file is closed even if producing or writing an item fails.
    """
    fd = gzip.open(path, 'wb')
    try:
        dump('Asure scan version 1.0', fd, use_protocol)
        for item in iter:
            dump(item, fd, use_protocol)
    finally:
        fd.close()
def writer(path, iter):
    """Write the items of 'iter' to the scan file 'path'.

    Delegates to writer_new().
    """
    writer_new(path, iter)
def rename_cycle():
    """Rotate the scan files in the current directory.

    The current scan becomes the backup (if there is a current scan),
    then the freshly written scan becomes the current one.
    """
    current, backup, fresh = '0sure.dat.gz', '0sure.bak.gz', '0sure.0.gz'
    try:
        os.rename(current, backup)
    except OSError:
        # Failing to make the backup is tolerated.
        pass
    os.rename(fresh, current)
def fresh_scan():
    """Perform a fresh scan of the filesystem

    The current directory is compared against an empty tree, so every
    entry is treated as new; the result is written out and rotated
    into place.
    """
    events = update_comparer(empty_tree(), walk('.')).run()
    writer('0sure.0.gz', events)
    rename_cycle()
def check_scan():
    """Scan the filesystem and compare the result with the scan file,
    printing every difference found."""
    recorded = reader('0sure.dat.gz')
    live = update_comparer(empty_tree(), walk('.')).run()
    for line in check_comparer(recorded, live).run():
        print(line)
def update():
    """Scan the filesystem with the previous scan as the left tree,
    so the md5 hashes of files that haven't had any inode changes are
    taken from it; then write the result and rotate it into place."""
    previous = reader('0sure.dat.gz')
    events = update_comparer(previous, walk('.')).run()
    writer('0sure.0.gz', events)
    rename_cycle()
def signoff():
    """Compare the backup scan with the current scan, printing every
    difference found."""
    older = reader('0sure.bak.gz')
    newer = reader('0sure.dat.gz')
    for line in check_comparer(older, newer).run():
        print(line)
def show():
    """Print every item of the scan file, indented by its depth in
    the tree."""
    depth = 0
    for event in reader('0sure.dat.gz'):
        tag = event[0]
        # A leave event is printed at the level of its directory.
        if tag == 'u':
            depth -= 1
        print("%s%s" % (" " * depth, event))
        if tag == 'd':
            depth += 1
def nothing():
    """Read the whole scan file and discard every item."""
    for _item in reader('0sure.dat.gz'):
        continue
def copy():
    """Rewrite the latest scan through the current writer; can be used
    to update it to a newer storage format.  The previous backup is
    not touched."""
    source, scratch = '0sure.dat.gz', '0sure.0.gz'
    writer(scratch, reader(source))
    os.rename(scratch, source)
# Command-line sub-commands and the functions implementing them.
commands = dict(
    scan=fresh_scan,
    update=update,
    check=check_scan,
    signoff=signoff,
    show=show,
    copy=copy,
    nothing=nothing,
)
def main(argv):
    """Run the command named by the single argument in 'argv'.

    Anything other than exactly one known command name ends in
    usage().
    """
    if len(argv) != 1:
        usage()
    name = argv[0]
    if name not in commands:
        usage()
    else:
        commands[name]()
def usage():
    """Print the list of available commands and exit with status 1."""
    choices = '|'.join(commands)
    print("Usage: asure {%s}" % choices)
    sys.exit(1)
# Script entry point: hand the command-line arguments (without the
# program name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])