modified: tasks/common.wdl
[GalaxyCodeBases.git] / python / etc / getlmers.py
blob3242de15a2385eed4ffa3efd4686c8ef08eba542
1 #!/usr/bin/env python3
3 #InFile = 'out.kmers.1k'
5 #import csv
6 import primer3
7 import itertools
8 import re
10 from pprint import pprint
12 import math
13 from collections import Counter
15 myKmerLen = 18
16 myEntropyKmer = myKmerLen - 3
17 myMinEntropy = myKmerLen * 3
19 def entropy(seq, max_k=5):
20 #p, lns = Counter(s), float(len(s))
21 #return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
22 SeqLength = len(seq)
23 Kmers = [ [] for x in range(max_k) ]
24 for i in range(SeqLength):
25 for thisK in range(1,max_k+1):
26 if SeqLength-i+1 > thisK:
27 this_kmer = seq[i:(i+thisK)]
28 Kmers[thisK-1].append(this_kmer)
29 sumSNa = 0
30 sumSNe = 0
31 for thisKsub1 in range(max_k): # https://rosettacode.org/wiki/Entropy
32 p = Counter(Kmers[thisKsub1])
33 lns = float(len(Kmers[thisKsub1]))
34 n = 4**(thisKsub1+1)
35 thisSNa = -sum( count * math.log(count/lns, 2) for count in p.values())
36 thisSNe = -sum( count * math.log(count/lns, n) for count in p.values())
37 sumSNa = sumSNa + thisSNa
38 sumSNe = sumSNe + thisSNe
39 return [sumSNa,sumSNe]
40 #ttt = entropy('GATTACATATC',11)
41 #print(str(ttt))
44 reTri = re.compile(r"(\w)\1{2,}")
46 for i in itertools.product('ACTG', repeat=myKmerLen):
47 if len(set(i)) < 4:
48 continue
49 oneKmer = ''.join(i)
50 reTriFound = reTri.search(oneKmer)
51 if reTriFound:
52 continue
53 #print(oneKmer)
54 #print(reTriFound)
55 thisE = entropy(oneKmer,myEntropyKmer)
56 if thisE[1] < myMinEntropy:
57 continue
58 hp=primer3.calcHairpin(oneKmer)
59 hd=primer3.calcHomodimer(oneKmer)
60 primer3Flag = ['pF','dF']
61 if hp.structure_found:
62 primer3Flag[0] = 'pT'
63 if hd.structure_found:
64 primer3Flag[1] = 'dT'
65 print(' '.join([oneKmer,str(thisE[0]),str(thisE[1]),primer3Flag[0],primer3Flag[1],str(round(hp.tm,2)),str(round(hd.tm,2))]))
67 exit()
69 hp=primer3.calcHairpin('CCCCCATCCGATCAGGGGG')
70 hd=primer3.calcHomodimer('CCCCCATCCGATCAGGGGG')
72 """
73 grep OK out.kmers.f1 | awk -v size=2.5 '{ b=int($5/size); a[b]++; bmax=b>bmax?b:bmax; bmin=b<bmin?b:bmin } END { for(i=bmin;i<=bmax;++i) print i*size,(i+1)*size,a[i] }'
75 grep OK out.kmers.f1 | awk -v size=0.5 '{ b=int($6/size); a[b]++; bmax=b>bmax?b:bmax; bmin=b<bmin?b:bmin } END { for(i=bmin;i<=bmax;++i) print i*size,(i+1)*size,a[i] }'
76 """