new file: extract_bam_tag.sh
[GalaxyCodeBases.git] / python / gethashs / gethashes.py
blob4a1a164a7cff7e0348b3e039323f28dd23ea849e
1 #!/usr/bin/env python3
3 import os, sys, hashlib
4 import sqlite3, getopt, mmap
5 import re
6 import gzip
7 #from os.path import join, getsize
8 from datetime import datetime
9 import pprint
# Naive datetime for the Unix epoch (1970-01-01 00:00:00).  Built directly
# instead of via datetime.utcfromtimestamp(0), which is deprecated since
# Python 3.12 and yields exactly this value.
epoch = datetime(1970, 1, 1)


def epoch_seconds(dt):
    """Return the number of seconds between the Unix epoch and *dt*.

    *dt* must be a naive ``datetime`` (it is subtracted from the naive
    module-level ``epoch``).  The result is a float.
    """
    return (dt - epoch).total_seconds()
BUF_SIZE = 1048576  # read files in 1 MiB chunks


def sha1file(fname=None, blocksize=BUF_SIZE):
    """Return the hexadecimal SHA-1 digest of the file *fname*.

    The file is read in chunks of *blocksize* bytes so that arbitrarily
    large files can be hashed with bounded memory.  Returns ``None`` when
    *fname* is ``None``.

    Plain buffered reads are used rather than ``mmap``: mapping a
    zero-length file raises ``ValueError`` and the ``prot=`` argument is
    only available on Unix, whereas ``read()`` works everywhere and for
    empty files too.
    """
    if fname is None:
        return None
    digest = hashlib.sha1()
    with open(fname, 'rb') as f:
        # iter(callable, sentinel) stops as soon as read() returns b'' (EOF).
        for block in iter(lambda: f.read(blocksize), b''):
            digest.update(block)
    return digest.hexdigest()
class Config:  # https://stackoverflow.com/a/47016739/159695
    """Simple attribute bag holding the run-time settings of the script.

    Defaults are installed first; any keyword arguments are applied last so
    they can override a default or add a brand-new attribute.
    """

    def __init__(self, **kwds):
        settings = vars(self)
        settings.update(
            verbose=0,      # -1=quiet 0=norm 1=noisy
            mode=0,         # 0:Test, 1:Create
            skipNewer=0,
            sha1dump=0,
            ssize=1048576,
            startpoint='.' + os.sep,
        )
        # Must come last so caller-supplied values win over the defaults.
        settings.update(kwds)

    def __repr__(self):
        """Render as ``Config(name=value, ...)`` in attribute insertion order."""
        rendered = ', '.join('%s=%r' % pair for pair in vars(self).items())
        return '%s(%s)' % (type(self).__qualname__, rendered)
def pverbose(s, nl='\n'):
    """Write *s* followed by *nl* to stdout, only when ``config.verbose`` is above 0."""
    if config.verbose <= 0:
        return
    sys.stdout.write(s + nl)
def pinfo(s, nl='\n'):
    """Write *s* followed by *nl* to stdout unless output is quieted.

    The message is printed when ``config.verbose`` is 0 or higher, and also
    for the special value -3.
    NOTE(review): nothing visible here ever sets verbosity to exactly -3 on
    purpose; the meaning of that value should be confirmed.
    """
    level = config.verbose
    if level < 0 and level != -3:
        return
    sys.stdout.write(s + nl)
def perror(s, nl='\n'):
    """Write *s* followed by *nl* to stderr unless ``config.verbose`` is below -1."""
    if config.verbose < -1:
        return
    # Flush stdout first so pending normal output is not interleaved with
    # the error text on a shared terminal.
    sys.stdout.flush()
    sys.stderr.write(s + nl)
def printusage(err=0):
    """Print the command-line help and exit with status *err*.

    The text goes through ``perror`` when *err* is truthy and through
    ``pinfo`` otherwise.  This function never returns: it always ends with
    ``sys.exit(err)``.
    """
    emit = perror if err else pinfo
    usage_lines = (
        'Usage: gethashes [opts] [-p dir] [-T|-C] [-f file] [files...]',
        ' -p <d> change to directory <d> before doing anything',
        ' -f <f> use <f> as gzipped hash file (<d>.hash.gz)',
        ' -s [s][k,m] load .sha1 files in subdirectories and skip older recorded files larger than [s] [*1024, *1048576] (default=1m)',
        ' -a Always skip recorded files even if loaded .sha1 file is older',
        ' -v/-q verbose/quiet, change verbosity [-1,2]',
        ' --help/-h show help',
        ' --version show gethashes and module versions',
    )
    # Help lines that are currently disabled (kept for reference):
    #   ' -T Test mode (default)'
    #   ' -C Create mode'
    #   ' -t <t> set type to <t> (%s, or auto(default))'%', '.join(sorted(hashlib.algorithms_available))
    #   'Options in Create mode:'
    #   ' -1 Also create <f>.sha1 file'
    #   'Options in Test mode:'
    #   ' -b <l> Output list of bad files to file <l>'
    #   'Other Options:'
    for text in usage_lines:
        emit(text)
    sys.exit(err)
# Module-wide settings object, created with the default values; the print
# helpers and the option parser read and update it.
config = Config()
# https://stackoverflow.com/questions/635483/what-is-the-best-way-to-implement-nested-dictionaries/19829714#19829714
class Vividict(dict):
    """Dictionary that creates nested dictionaries of its own type on demand.

    Looking up a missing key stores and returns a fresh, empty instance of
    the same class, so ``d[a][b] = value`` works without preparing ``d[a]``.
    """

    def __missing__(self, key):
        child = type(self)()
        self[key] = child
        # Return the local reference rather than looking the key up again.
        return child
# Previously recorded digests, filled by loadsha1() and indexed as
# OldHashes[st_dev][st_ino] -> hex digest string.
OldHashes = Vividict()

# Number of files whose digest was taken from OldHashes instead of being
# recomputed; incremented in doCreation().
HitHashes = 0

# rem from `cfv` L1116:`_foosum_rem`. `re.match()` checks for a match only at the beginning of the string, thus not r'^'.
# Groups: (1) 40 hex digits, (2) mode flag ' ' or '*', (3) file name.
sha1rem = re.compile(r'([0-9a-fA-F]{40}) ([ *])([^\r\n]+)[\r\n]*$')
def loadsha1(root,afile):
    """Load recorded digests from the ``.sha1`` file *afile* in directory *root*.

    Every line must match ``sha1rem``.  For each binary-mode entry ('*'
    flag) whose file exists, the digest is stored in the module-level
    ``OldHashes`` under ``[st_dev][st_ino]`` of the referenced file, unless
    the entry is filtered out:

    * files smaller than ``config.ssize`` are ignored;
    * unless ``config.skipNewer`` is set, files modified after the
      ``.sha1`` file itself are ignored.

    Text-mode entries (' ' flag) and entries whose file is missing are
    skipped; the first occurrence of each is reported through ``pinfo`` and
    a total is reported when there was more than one.

    Returns ``-1`` as soon as a line does not match (entries stored before
    that line are kept), otherwise ``None``.
    """
    rname = os.path.join(root,afile)
    mtime = os.path.getmtime(rname)
    itextmode = 0
    imissing = 0
    # Use a context manager so the file is closed on every exit path,
    # including the early ``return -1``.
    with open(rname, encoding='utf-8', errors='surrogateescape') as listing:
        for line in listing:
            x = sha1rem.match(line)
            if not x:
                return -1
            if x.group(2)==' ':
                if not itextmode:
                    pinfo('[!] Textmode in "%s".'%(rname))
                itextmode += 1
                continue
            iname = os.path.join(root,x.group(3))
            try:
                istat = os.stat(iname)
            except FileNotFoundError:
                if not imissing:
                    pinfo('[!] Missing file from "%s".'%(rname))
                imissing += 1
                continue
            if istat.st_size < config.ssize:
                continue
            # Reuse the stat result instead of stat'ing the file a second
            # time just to obtain its modification time.
            if not config.skipNewer and mtime < istat.st_mtime:
                continue
            OldHashes[istat.st_dev][istat.st_ino] = x.group(1)
    if itextmode>1 :
        pinfo('[!] Textmode %d times in "%s" !'%(itextmode,rname))
    if imissing>1 :
        pinfo('[!] Missing %d times in "%s" !'%(imissing,rname))
    return
def main(argv=None):
    """Parse the command line, fill in ``config`` and run ``doCreation()``.

    *argv* is the argument list without the program name; when ``None``,
    ``sys.argv[1:]`` is used.

    Option errors are reported through ``perror`` and terminate the process
    with a non-zero status.  Errors from ``os.chdir`` for ``-p`` are left to
    propagate.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        # Default to the current directory.  Only this defaulted argument
        # gets a trailing separator: rewriting a user-supplied first
        # argument would turn an option such as '-v' into '-v/'.
        argv = ['.' + os.sep]

    try:
        opts, args = getopt.gnu_getopt(argv, "CTf:p:s:a1b:vqh?", ['help','version'])
    except getopt.GetoptError as err:
        # Message looks like "option -x not recognized".
        perror(str(err))
        printusage(1)

    try:
        for o, a in opts:
            if o=='-p':
                # Resolve to an absolute path BEFORE changing directory;
                # a relative start point would otherwise be looked up
                # inside the directory we have just entered.
                target = os.path.abspath(a)
                os.chdir(a) # also checks PermissionError and FileNotFoundError for me
                config.startpoint = ''.join([target.rstrip(os.sep),os.sep])
            elif o=='-f':
                config.hashfile = a
            elif o=='-s':
                try:
                    config.ssize = human2bytes(a)
                except (AssertionError, ValueError):
                    raise RuntimeError('invalid size for -s: %r' % (a,))
            elif o=='-a':
                config.skipNewer = 1
            elif o=='-1':
                config.sha1dump = 1
            elif o=='-v':
                if config.verbose >=0: config.verbose +=1
            elif o=='-q':
                if config.verbose >=0:
                    config.verbose =-1
                else:
                    config.verbose -=1
            elif o in ("-h", "--help", '-?'):
                printusage()
            elif o=='-V' or o=='--version':
                print('gethashes %s'%version)
                print('python %08x-%s'%(sys.hexversion,sys.platform))
                sys.exit(0)
            else:
                # Accepted by the getopt spec but not implemented here.
                raise RuntimeError('unhandled option %s' % o)
    except RuntimeError as e:
        perror('gethashes: %s'%e)
        sys.exit(1)
    if not hasattr(config, 'hashfile'):
        # Default hash file: <startpoint>/<name of start directory>.hash.gz
        dirName = os.path.basename(os.path.abspath(config.startpoint))
        config.hashfile = ''.join([config.startpoint, dirName, '.hash.gz'])
    pprint.pprint(config) # <-- DEBUG
    doCreation()
def doCreation():
    """Walk ``config.startpoint`` and write one digest line per file.

    Output goes to the gzip-compressed text file ``config.hashfile`` in the
    form ``<sha1> *<path relative to the start point>``.

    For every directory, all ``*.sha1`` files are first fed to
    ``loadsha1()``; a file whose (device, inode) pair is then present in
    ``OldHashes`` reuses the recorded digest instead of being hashed again
    (counted in ``HitHashes``).  Zero-length files, the hash file itself,
    files that disappear during the walk and ``@eaDir`` directories are
    skipped.
    """
    global HitHashes
    top = config.startpoint
    # The context manager guarantees the gzip stream is finalised and closed
    # even when hashing a file raises.
    with gzip.open(config.hashfile, 'wt', encoding='utf-8', errors='surrogateescape') as f_out:
        for root, dirs, files in os.walk(top):
            if '@eaDir' in dirs:
                dirs.remove('@eaDir')  # don't visit "@eaDir" directories
            dirs.sort(reverse=True)
            files.sort(reverse=True)
            # os.walk() builds every root by extending `top`, so strip that
            # prefix.  (Searching for the last occurrence of `top` inside
            # root gives a wrong result when the same text appears again
            # deeper in the path.)
            relroot = root[len(top):] if root.startswith(top) else root
            for afile in files:
                if afile.endswith(".sha1"):
                    rname = os.path.join(root,afile)
                    pinfo('[!]Loading SHA1 hash from "%s"'%rname)
                    loadsha1(root,afile)
            for afile in files:
                rname = os.path.join(root,afile)
                try:
                    if os.path.samefile(rname,config.hashfile):
                        continue
                    istat = os.stat(rname)
                except FileNotFoundError:
                    pinfo('[!] FileNotFound:"%s".'%rname)
                    continue
                fname = os.path.join(relroot,afile)
                if istat.st_size == 0:
                    continue
                elif (istat.st_dev in OldHashes) and (istat.st_ino in OldHashes[istat.st_dev]):
                    # Membership is tested explicitly first: indexing a
                    # Vividict with a missing key would create the entry.
                    ihash = OldHashes[istat.st_dev][istat.st_ino]
                    HitHashes += 1
                else:
                    ihash = sha1file(rname)
                f_out.write('%s *%s\n'%(ihash,fname))
    if HitHashes:
        pinfo('[!]Skipped hashing of %d recorded file(s).'%HitHashes)
    pinfo('\n[!]Done. Test with `cfv -p %s -f %s`.'%(config.startpoint,config.hashfile))
# https://github.com/giampaolo/pyftpdlib/blob/0430c92e9d852a6d175b489c0ebf17fbc0190914/scripts/ftpbench#L139
def bytes2human(n, format="%(value).1f%(symbol)s", intfmt="%(value).0f %(symbol)s"):
    """Format the byte count *n* as a short human-readable string.

    The largest binary unit (K=2**10, M=2**20, ... Y=2**80) not exceeding
    *n* is chosen and the scaled value is rendered with *format*.  Values
    below 1024 are rendered unscaled with *intfmt* and the symbol 'B'.
    Both templates are %-style and may use the keys ``value`` and
    ``symbol``.

    >>> bytes2human(10000)
    '9.8K'
    >>> bytes2human(100001221)
    '95.4M'
    >>> bytes2human(100)
    '100 B'
    """
    symbols = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
    # Try the units from largest to smallest; symbols[k] stands for 2**(10*k).
    for exponent in range(len(symbols) - 1, 0, -1):
        unit = 1 << (exponent * 10)
        if n >= unit:
            return format % dict(symbol=symbols[exponent], value=float(n) / unit)
    return intfmt % dict(symbol=symbols[0], value=n)
# http://goo.gl/zeJZl
def human2bytes(s):
    """Convert a size string such as ``'1M'`` to a number of bytes.

    *s* is a non-negative decimal integer optionally followed by one unit
    letter out of B, K, M, G, T, P, E, Z, Y (case-insensitive, powers of
    1024).  A number without a letter is taken as bytes.  Surrounding
    whitespace is ignored.

    Raises ``ValueError`` for anything else.  An explicit exception is used
    instead of ``assert`` because assertions are removed under ``python -O``.

    >>> human2bytes('1M')
    1048576
    >>> human2bytes('1G')
    1073741824
    >>> human2bytes('512')
    512
    """
    symbols = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
    text = s.strip()
    if text.isdigit():
        return int(text)
    letter = text[-1:].upper()
    num = text[:-1]
    if not (num.isdigit() and letter in symbols):
        raise ValueError('invalid size: %r' % (s,))
    # Integer shift keeps large values exact (no float round-trip);
    # symbols[k] stands for 2**(10*k).
    return int(num) << (symbols.index(letter) * 10)
# Version string shown by the --version option.
version = "0.1"

# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
def isBlank (myString):
    """Return True when *myString* is falsy (None, '') or only whitespace."""
    if not myString:
        return True
    return not myString.strip()
def isNotBlank (myString):
    """Return True when *myString* holds at least one non-whitespace character."""
    if not myString:
        return False
    return bool(myString.strip())