python/gethashs/gethashes.py

   1 #!/usr/bin/env python3
   2
   3 import os, sys, hashlib
   4 import sqlite3, getopt, mmap
   5 import re
   6 import gzip
   7 #from os.path import join, getsize
   8 from datetime import datetime
   9 import pprint
  10
  11 epoch = datetime.utcfromtimestamp(0)
  12 def epoch_seconds(dt):
  13     return (dt - epoch).total_seconds()
  14
  15 BUF_SIZE = 1048576  # lets read stuff in 1Mb chunks!
  16 def sha1file(fname=None, blocksize=BUF_SIZE):
  17     if fname is None:
  18         return None
  19     sha1 = hashlib.sha1()
  20     with open(fname, 'rb') as f:
  21         with mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ) as mm:
  22             for block in iter(lambda: mm.read(blocksize), b""):
  23                 sha1.update(block)
  24     return sha1.hexdigest()
  25
  26 class Config: # https://stackoverflow.com/a/47016739/159695
  27     def __init__(self, **kwds):
  28         self.verbose=0 # -1=quiet  0=norm  1=noisy
  29         self.mode=0 # 0:Test, 1:Create
  30         self.skipNewer=0
  31         self.sha1dump=0
  32         self.ssize=1048576
  33         self.startpoint = ''.join(['.',os.sep])
  34         self.__dict__.update(kwds) # Must be last to accept assigned member variable.
  35     def __repr__(self):
  36         args = ['%s=%s' % (k, repr(v)) for (k,v) in vars(self).items()]
  37         return '%s(%s)' % ( self.__class__.__qualname__, ', '.join(args) )
  38
  39 def pverbose(s,nl='\n'):
  40     if config.verbose>0:
  41         sys.stdout.write(s+nl)
  42 def pinfo(s,nl='\n'):
  43     if config.verbose>=0 or config.verbose==-3:
  44         sys.stdout.write(s+nl)
  45 def perror(s,nl='\n'):
  46     if config.verbose>=-1:
  47         sys.stdout.flush() # avoid inconsistent screen state if stdout has unflushed data
  48         sys.stderr.write(s+nl)
  49 def printusage(err=0):
  50     phelp = err and perror or pinfo # False->pinfo, True->perror
  51     phelp('Usage: gethashes [opts] [-p dir] [-T|-C] [-f file] [files...]')
  52     #phelp('  -T       Test mode (default)')
  53     #phelp('  -C       Create mode')
  54     #phelp('  -t <t>   set type to <t> (%s, or auto(default))'%', '.join(sorted(hashlib.algorithms_available)))
  55     phelp('  -p <d>   change to directory <d> before doing anything')
  56     phelp('  -f <f>   use <f> as gzipped hash file (<d>.hash.gz)')
  57     #phelp('Options in Create mode:')
  58     phelp('  -s [s][k,m]   load .sha1 files in subdirectories and skip older recorded files larger than [s] [*1024, *1048576] (default=1m)')
  59     phelp('  -a            Always skip recorded files even if loaded .sha1 file is older')
  60     #phelp('  -1            Also create <f>.sha1 file')
  61     #phelp('Options in Test mode:')
  62     #phelp('  -b <l>        Output list of bad files to file <l>')
  63     #phelp('Other Options:')
  64     phelp('  -v/-q    verbose/quiet, change verbosity [-1,2]')
  65     phelp(' --help/-h show help')
  66     phelp(' --version show gethashes and module versions')
  67     sys.exit(err)
  68
# Global settings object: read by the output helpers (verbosity) and filled in
# by main() from the command-line options.
config=Config()
  70
  71 # https://stackoverflow.com/questions/635483/what-is-the-best-way-to-implement-nested-dictionaries/19829714#19829714
  72 class Vividict(dict):
  73     def __missing__(self, key):
  74         value = self[key] = type(self)() # retain local pointer to value
  75         return value                     # faster to return than dict lookup
  76
# Hashes loaded from existing .sha1 files, stored as
# OldHashes[st_dev][st_ino] -> hex digest (filled by loadsha1()).
OldHashes = Vividict()
# Number of files whose hash was taken from OldHashes instead of being recomputed.
HitHashes = 0
# Pattern taken from `cfv` (L1116, `_foosum_rem`). It has no leading r'^'
# because `re.match()` only ever matches at the beginning of the string.
# Groups: 1 = 40 hex digits, 2 = mode flag (' ' or '*'), 3 = file name.
sha1rem=re.compile(r'([0-9a-fA-F]{40}) ([ *])([^\r\n]+)[\r\n]*$')
  81
  82 def loadsha1(root,afile):
  83     global OldHashes
  84     rname = os.path.join(root,afile)
  85     mtime = os.path.getmtime(rname)
  86     itextmode = 0
  87     imissing = 0
  88     for line in open(rname, encoding='utf-8', errors='surrogateescape'):
  89         x = sha1rem.match(line)
  90         if not x: return -1
  91         if x.group(2)==' ':
  92             if not itextmode:
  93                 pinfo('[!] Textmode in "%s".'%(rname))
  94             itextmode += 1
  95             continue
  96         iname = os.path.join(root,x.group(3))
  97         try:
  98             istat = os.stat(iname)
  99         except FileNotFoundError:
 100             if not imissing:
 101                 pinfo('[!] Missing file from "%s".'%(rname))
 102             imissing += 1
 103             continue
 104         if istat.st_size < config.ssize:
 105             continue
 106         if not config.skipNewer:
 107             itime = os.path.getmtime(iname)
 108             #isize = os.path.getsize(iname)
 109             #pprint.pprint(['t:',iname,mtime,itime])
 110             if mtime < itime:
 111                 continue
 112         OldHashes[istat.st_dev][istat.st_ino] = x.group(1)
 113     if itextmode>1 :
 114         pinfo('[!] Textmode %d times in "%s" !'%(itextmode,rname))
 115     if imissing>1 :
 116         pinfo('[!] Missing %d times in "%s" !'%(imissing,rname))
 117     return
 118
 119 def main(argv=None):
 120     if argv is None:
 121         argv = sys.argv[1:]
 122     if not argv:
 123         argv.append('.')
 124         argv[0] = ''.join([argv[0].rstrip(os.sep),os.sep])
 125     #pprint.pprint(argv) # <-- DEBUG
 126
 127     try:
 128         opts, args = getopt.gnu_getopt(argv, "CTf:p:s:a1b:vqh?", ['help','version'])
 129     except getopt.GetoptError as err:
 130         # print help information and exit:
 131         print(err)  # will print something like "option -a not recognized"
 132         printusage(1)
 133
 134     try:
 135         prevopt=''
 136         for o, a in opts:
 137             if o=='-p':
 138                 config.startpoint = ''.join([a.rstrip(os.sep),os.sep])
 139                 os.chdir(a) # also checks PermissionError and FileNotFoundError for me
 140             elif o=='-f':
 141                 config.hashfile = a
 142             elif o=='-s':
 143                 config.ssize = human2bytes(a)
 144             elif o=='-a':
 145                 config.skipNewer = 1
 146             elif o=='-1':
 147                 config.sha1dump = 1
 148             elif o=='-v':
 149                 if config.verbose >=0: config.verbose +=1
 150             elif o=='-q':
 151                 if config.verbose >=0:
 152                     config.verbose =-1
 153                 else:
 154                     config.verbose -=1
 155             elif o in ("-h", "--help", '-?'):
 156                 printusage()
 157             elif o=='-V' or o=='--version':
 158                 print('gethashes %s'%version)
 159                 print('python %08x-%s'%(sys.hexversion,sys.platform))
 160                 sys.exit(0)
 161             else:
 162                 assert False, "unhandled option"
 163             prevopt=o
 164     except RuntimeError as e:
 165         perror('cfv: %s'%e)
 166         sys.exit(1)
 167     if not hasattr(config, 'hashfile'):
 168         dirName = os.path.basename(os.path.abspath(config.startpoint))
 169         #dirName = os.path.basename(os.getcwd())
 170         config.hashfile = ''.join([config.startpoint, dirName, '.hash.gz'])
 171     pprint.pprint(config) # <-- DEBUG
 172     doCreation()
 173
 174 def doCreation():
 175     f_out = gzip.open(config.hashfile, 'wt', encoding='utf-8', errors='surrogateescape')
 176
 177     for root, dirs, files in os.walk(config.startpoint): # os.walk(top, topdown=True, onerror=None, followlinks=False)
 178         if '@eaDir' in dirs:
 179             dirs.remove('@eaDir')  # don't visit "@eaDir" directories
 180         dirs.sort(reverse=True)
 181         files.sort(reverse=True)
 182         relroot = root.rpartition(config.startpoint)[2]
 183         #if not relroot: relroot = '.'
 184         global OldHashes
 185         global HitHashes
 186         for afile in files:
 187             if afile.endswith(".sha1"):
 188                 rname = os.path.join(root,afile)
 189                 pinfo('[!]Loading SHA1 hash from "%s"'%rname)
 190                 loadsha1(root,afile)
 191         for afile in files:
 192             rname = os.path.join(root,afile)
 193             try:
 194                 if os.path.samefile(rname,config.hashfile):
 195                     continue
 196             except FileNotFoundError:
 197                 pinfo('[!] FileNotFound:"%s".'%rname)
 198                 continue
 199             #fname = os.sep.join(filter(None,[relroot,afile]))
 200             fname = os.path.join(relroot,afile)
 201             istat = os.stat(rname)
 202             if istat.st_size == 0:
 203                 continue
 204             elif (istat.st_dev in OldHashes) and (istat.st_ino in OldHashes[istat.st_dev]):
 205                 ihash = OldHashes[istat.st_dev][istat.st_ino]
 206                 HitHashes += 1
 207             else:
 208                 ihash = sha1file(rname)
 209             #pprint.pprint(('O:',fname,rname,ihash,HitHashes))
 210             f_out.write('%s *%s\n'%(ihash,fname))
 211             #mtime = os.path.getmtime(rname)
 212             #stime = datetime.utcfromtimestamp(mtime).strftime('%Y%m%du%H%M%S')
 213             #rtime = datetime.strptime(''.join([stime,'UTC']),'%Y%m%du%H%M%S%Z')
 214             #print(rname,fname,stime,mtime,epoch_seconds(rtime))
 215             #fsize = os.path.getsize(rname)
 216             #hsize = bytes2human(fsize)
 217             #print(fname,hsize,stime,sha1file(rname),sep='\t')
 218     f_out.close()
 219     if HitHashes:
 220         pinfo('[!]Skipped hashing of %d recorded file(s).'%HitHashes)
 221     pinfo('\n[!]Done. Test with `cfv -p %s -f %s`.'%(config.startpoint,config.hashfile))
 222
 223 # https://github.com/giampaolo/pyftpdlib/blob/0430c92e9d852a6d175b489c0ebf17fbc0190914/scripts/ftpbench#L139
 224 def bytes2human(n, format="%(value).1f%(symbol)s", intfmt="%(value).0f %(symbol)s"):
 225     """
 226     >>> bytes2human(10000)
 227     '9K'
 228     >>> bytes2human(100001221)
 229     '95M'
 230     """
 231     symbols = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
 232     prefix = {}
 233     for i, s in enumerate(symbols[1:]):
 234         prefix[s] = 1 << (i + 1) * 10
 235     for symbol in reversed(symbols[1:]):
 236         if n >= prefix[symbol]:
 237             value = float(n) / prefix[symbol]
 238             return format % locals()
 239     #import re
 240     #intfmt=re.sub(r'\(value\)\.(\d+)',r'(value).0',format)
 241     #print(intfmt)
 242     return intfmt % dict(symbol=symbols[0], value=n)
 243
 244 # http://goo.gl/zeJZl
 245 def human2bytes(s):
 246     """
 247     >>> human2bytes('1M')
 248     1048576
 249     >>> human2bytes('1G')
 250     1073741824
 251     """
 252     symbols = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
 253     letter = s[-1:].strip().upper()
 254     num = s[:-1]
 255     assert num.isdigit() and letter in symbols, s
 256     num = float(num)
 257     prefix = {symbols[0]: 1}
 258     for i, s in enumerate(symbols[1:]):
 259         prefix[s] = 1 << (i + 1) * 10
 260     return int(num * prefix[letter])
 261
# Version string printed by the -V/--version option handling in main().
version='0.1'
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
 265
 266 def isBlank (myString):
 267     return not (myString and myString.strip())
 268
 269 def isNotBlank (myString):
 270     return bool(myString and myString.strip())