modified: nfig1.py
[GalaxyCodeBases.git] / python / etc / cntMpileup.py
blobabcb83f8bada7d987e682f59c64e8951e0432b71
1 #!/usr/bin/env pypy3
3 import lz4.frame
4 import os
5 import csv
6 import itertools
7 import sys
8 import collections
10 CoreKeys = ["{}{}".format(x,y) for x,y in itertools.product(['fwd','rev'], ['A','C','G','T','N'])]
11 CounterKeys = CoreKeys + ['cntIns','cntDel'];
12 theSamples = ['B7','D3','Normal'];
14 def main():
15 if len(sys.argv) < 3 :
16 print('Usage:',sys.argv[0],'<samtools.mpileup.lz4> <BED file>',file=sys.stderr,flush=True);
17 exit(0);
18 inDepthFile = sys.argv[1]
19 inBEDFile = sys.argv[2]
21 theZone = collections.defaultdict(dict)
22 with open(inBEDFile, newline='') as csvfile:
23 tsvin = csv.DictReader(csvfile, delimiter='\t', fieldnames=['ChrID','Start','End'])
24 for row in tsvin:
25 theZone[row['ChrID']][row['Start']] = 1
26 #print(theZone)
27 #print("# Samples: [{}]".format(','.join(theSamples)))
28 print('# Format: {}'.format(','.join(CoreKeys)))
29 print("\t".join(['ChrID','Pos']+theSamples))
30 with lz4.frame.open(inDepthFile, newline='', mode='rt') as fp:
31 tsvin = csv.DictReader(fp, delimiter='\t', fieldnames=['ChrID','Pos','Ref'] +
32 ["{}_{}".format(x,y) for x,y in itertools.product(theSamples, ['Depth','Bases','Qs'])]
34 sbc = {key:collections.Counter() for key in theSamples}
35 bc = {key:{} for key in theSamples}
36 try:
37 for row in tsvin:
38 #print(row)
39 if row['ChrID'] in theZone:
40 if row['Pos'] in theZone[row['ChrID']]:
41 #print(row)
42 print("\t".join((row['ChrID'],row['Pos'])),end='\t')
43 for k in theSamples:
44 bc[k] = Base_Counter(row['{}_Depth'.format(k)],row['{}_Bases'.format(k)])
45 sbc[k] += {x: bc[k][x] for x in CounterKeys}
46 #print(bc[k])
47 print(",".join( [str(bc[k][x]) for x in CoreKeys] ),end='\t')
48 print(flush=True);
49 except KeyboardInterrupt:
50 print('\n#[!]Ctrl+C pressed.',file=sys.stderr,flush=True)
51 sys.stdout.flush()
52 pass
53 #print(sbc);
55 # Base Counting Subroutine *[Completed]
56 # https://github.com/photonchang/allelecount/blob/master/allelecount.py
57 def Base_Counter(inDepth,inBases):
58 # Cleaning up Base String + Indel Counting
59 CleanString = ''
60 countIn = 0
61 countDel = 0
62 IndelHolder = []
63 IndelDeterminant = 0
64 CleanBool = False
66 for currentIndex, Strholder in enumerate(inBases):
67 # Skipping of '^' Signage
68 if CleanBool == True:
69 CleanBool = False
70 continue
72 if Strholder == '^':
73 CleanBool = True
74 continue
76 # Skipping Indel
77 if IndelDeterminant > 0:
78 IndelDeterminant -= 1
79 continue
81 if Strholder == '+':
82 countIn += 1
84 # Determining Length of Indel
85 # Since Illumina NGS has an upper limit of less than 999bp read length
86 IndelDeterminant = 0
88 if (currentIndex + 4) <= len(inBases):
89 if inBases[currentIndex + 1: currentIndex + 4].isnumeric() == True:
90 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 4]) + 3
91 elif inBases[currentIndex + 1: currentIndex + 3]. isnumeric() == True:
92 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 3]) + 2
93 elif inBases[currentIndex + 1: currentIndex + 2].isnumeric() == True:
94 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 2]) + 1
95 elif (currentIndex + 3) <= len(inBases):
96 if inBases[currentIndex + 1: currentIndex + 3]. isnumeric() == True:
97 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 3]) + 2
98 elif inBases[currentIndex + 1: currentIndex + 2].isnumeric() == True:
99 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 2]) + 1
101 IndelHolder.append(inBases[currentIndex:currentIndex + IndelDeterminant + 1])
102 continue
104 if Strholder == '-':
105 countDel += 1
107 # Determining Length of Indel
108 # Since Illumina NGS has an upper limit of less than 999bp read length
109 IndelDeterminant = 0
111 if (currentIndex + 4) <= len(inBases):
112 if inBases[currentIndex + 1: currentIndex + 4].isnumeric() == True:
113 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 4]) + 3
114 elif inBases[currentIndex + 1: currentIndex + 3]. isnumeric() == True:
115 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 3]) + 2
116 elif inBases[currentIndex + 1: currentIndex + 2].isnumeric() == True:
117 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 2]) + 1
118 elif (currentIndex + 3) <= len(inBases):
119 if inBases[currentIndex + 1: currentIndex + 3]. isnumeric() == True:
120 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 3]) + 2
121 elif inBases[currentIndex + 1: currentIndex + 2].isnumeric() == True:
122 IndelDeterminant = int(inBases[currentIndex + 1: currentIndex + 2]) + 1
124 IndelHolder.append(inBases[currentIndex: currentIndex + len(str(IndelDeterminant)) + 1])
125 continue
127 CleanString += Strholder
129 else:
130 # Transferring Back Cleaned String
131 inBases = CleanString
133 # '$' Signage Stripping
134 inBases = inBases.replace('$', '')
136 # Base Count Var Initialization
137 bigA = 0
138 bigC = 0
139 bigG = 0
140 bigT = 0
141 smallA = 0
142 smallC = 0
143 smallG = 0
144 smallT = 0
145 delBase = 0
147 # Base Counting
148 bigA = inBases.count('A')
149 bigC = inBases.count('C')
150 bigG = inBases.count('G')
151 bigT = inBases.count('T')
152 bigN = inBases.count('N')
154 smallA = inBases.count('a')
155 smallC = inBases.count('c')
156 smallG = inBases.count('g')
157 smallT = inBases.count('t')
158 smallN = inBases.count('n')
160 delBase = inBases.count('*')
162 # Internal Check - Throws Out Error (NOT STD-IN/OUT Compatible: Should break pipeline)
163 InternalCounter = 0
164 InternalCounter = bigA + bigC + bigG + bigT + bigN + smallA + smallC + smallG + smallT + smallN + delBase
165 if InternalCounter != int(inDepth):
166 print('Error:')
167 print('Reported count: ' + inDepth)
168 print('Internal counter: ' + str(InternalCounter))
169 print('Internal sum: ' + str(len(inBases)))
170 print('Number of insertions: ' + str(countIn))
171 print('Number of deleions: ' + str(countDel))
172 print('Post-processed bases: ' + inBases)
173 #sys.exit()
175 # Indel Compilation
176 IndelSetDict = set(IndelHolder)
177 tmpIndelString = ''
178 FinalIndelHolder = []
180 for EveryIndel in IndelSetDict:
181 tmpIndelString = ''
182 tmpIndelString = str(IndelHolder.count(EveryIndel)) + ":" + EveryIndel
183 FinalIndelHolder.append(tmpIndelString)
185 # Return Output
186 FinalOutput = collections.OrderedDict.fromkeys( CounterKeys + ['Indels'] )
187 FinalOutput['fwdA'] = bigA; FinalOutput['fwdC'] = bigC; FinalOutput['fwdG'] = bigG; FinalOutput['fwdT'] = bigT; FinalOutput['fwdN'] = bigN;
188 FinalOutput['revA'] = smallA; FinalOutput['revC'] = smallC; FinalOutput['revG'] = smallG; FinalOutput['revT'] = smallT; FinalOutput['revN'] = smallN;
189 FinalOutput['cntIns'] = countIn; FinalOutput['cntDel'] = countDel; FinalOutput['Indels'] = ';'.join(FinalIndelHolder);
190 return FinalOutput
192 if __name__ == "__main__":
193 main() # time python3 ./samdepthplot.py t.tsv.gz 1
196 ./cntMpileup.py z.mp9k.lz4 z.bed