modified: fig1.py
[GalaxyCodeBases.git] / python / mixture / parse_pileup.py
blob8ee4286206d2af3ed468f290e8ffb5c64bee8a40
1 #!/usr/bin/env python3
2 # Modified by Daniel Gomez-Sanchez: adding opening bgzipped pileups
3 # <https://github.com/magicDGS/PoolHMM/blob/master/Modules/parse_pileup.py>
4 # <https://github.com/bihealth/vcfpy/blob/master/vcfpy/parser.py>
5 # <https://github.com/genomeannotation/pileup_analyzer>
6 import sys
7 import getopt
8 import re
9 import os.path
10 import gzip
12 import zstandard
13 import io
15 import pileup_parser_classes
17 PileObj = pileup_parser_classes.Pile()
18 parseBase = pileup_parser_classes.PileSanitizer()
20 def openPileup(pileup_file, log=False):
21 """
22 Open pileup in text format or gzipped
23 pileup_prefix is the prefix of .pileup.gz or .pileup
24 mode is the way of opening the file
25 log=True prints if compressed or not compressed are used
26 return a filehandler
27 exit if IOError found
28 """
29 #pileup file loading
30 try:
31 if pileup_file.endswith('.zstd'):
32 fh = open(pileup_file, 'rb')
33 dctx = zstandard.ZstdDecompressor()
34 stream_reader = dctx.stream_reader(fh)
35 pileup = io.TextIOWrapper(stream_reader, encoding='utf-8')
36 elif pileup_file.endswith('.gz'):
37 pileup = gzip.open(pileup_file, 'rt')
38 else:
39 pileup = open(pileup_file, 'rt')
40 if log:
41 print ("Input file: %s" % pileup_file)
42 ## return the fh
43 return pileup
44 except IOError:
45 print ("Could not open input file %s" % pileup_file)
46 sys.exit()
48 def mpileup_parser(line):
49 if line == "\n":
50 print ("pileup parser empty line provided")
51 sys.exit(8)
52 #...
54 #line is splited to get ch, pos, rc, cov, nucs & qual infos
55 try:
56 line_items=line.split()
57 except ValueError:
58 print ("could not parse pileup entry %s" % (line))
59 sys.exit(9)
60 #...
61 RowIDpre = ['ChrID','Pos','Ref']
62 LineSplited = len(line_items)
63 if (LineSplited % 3) == 0 and LineSplited >= 6:
64 SampleCnt = int(LineSplited / 3) -1
65 SampleRows=['Depth','Bases','Quals']
66 SampleNO = range(1,SampleCnt+1)
67 RowIDsmp = [[str(j)+str(i) for j in SampleRows] for i in SampleNO]
68 RowIDall = RowIDpre
69 for i in RowIDsmp:
70 RowIDall.extend(i)
71 #(ch, pos, rc, cov, nucs, qual) = line_items
72 for rid, raw_data in zip(RowIDall, line_items):
73 print(rid,'=',raw_data)
74 LineData = dict(zip(RowIDall, line_items))
75 for i in SampleNO:
76 (d,b,q) = mpileup_item_parser(LineData['Ref'], LineData['Depth'+str(i)],LineData['Bases'+str(i)],LineData['Quals'+str(i)])
77 #print('ret:',d,b,q)
78 else:
79 print("wrong number of columns in pileup line (SampleCnt=%d): %s" % (SampleCnt,line))
80 sys.exit()
82 def mpileup_item_parser(Ref,Depth,Bases,Quals):
83 print(Ref,Depth,Bases,Quals)
84 result1 = parseBase.sanitize(Bases)
85 print(result1)
86 return [Depth,Bases,Quals]
88 def xxxx(line):
89 #nucs is filtered
90 #step 1
91 s1 = re.findall(r"[-+](\d+)", nucs)
93 for n in s1:
94 l1 = ['[-+]',n,'[ACGTNacgtn]{',n, '}']
95 pattern = r''.join(l1)
96 nucs = re.sub(pattern, '', nucs)
97 #...
99 #step2
100 nucs = re.sub('\^.', '', nucs)
102 #step3
103 nucs = re.sub('\$', '', nucs)
105 #step 4
106 nucs = re.sub('\.', rc.upper(), nucs)
108 #step 5
109 nucs = re.sub(',', rc.lower(), nucs)
110 #...
112 #exception size of sequence != size of quality
113 if len(nucs) != len(qual):
114 print ("Size of sequence does not equal size of quality: %s, %s" % (nucs,line))
115 sys.exit(10)
116 #...
118 #filter the pileup file by quality
119 i, ac, tc, cc, gc, nco, dell, co = 0, 0, 0, 0, 0, 0, 0, 0
121 nucs_filtered = []
122 qual_filtered = []
123 for qc in qual:
124 nc = nucs[i]
125 quality = self.encode(qc)
126 if quality >= self.minQual:
127 co += 1
128 if nc == "A" or nc == "a":
129 ac += 1
130 elif nc == 'T' or nc == 't':
131 tc += 1
132 elif nc == "C" or nc == "c":
133 cc += 1
134 elif nc == "G" or nc == "g":
135 gc += 1
136 elif nc == "N" or nc == "n":
137 nco += 1
138 elif nc == "*":
139 dell += 1
140 else:
141 print ("Could not parse pileup; Unknown allele : %s in %s" % (a,line))
142 sys.exit(1)
143 #...
144 if nc != "*" and nc != "n" and nc != "N":
145 nucs_filtered.append(nc)
146 qual_filtered.append(quality)
148 #...
149 i += 1
150 #...
152 alar = [{'a':'A', 'c':ac}, {'a':'T', 'c':tc}, {'a':'C', 'c':cc}, {'a':'G', 'c':gc}]
153 eucov = ac + tc + cc + gc
155 if len(nucs_filtered) != len(qual_filtered):
156 print ("Error: Length of nucleotides does not agree with lenght of quality string!")
157 sys.exit(11)
158 #...
160 if len(nucs_filtered) != eucov:
161 print ("Error : Coverage does not agree with length of nucleotides : %d n: %d " % (eucov, len(nucs_filtered)))
162 sys.exit(12)
163 #...
165 # pos chr refc nucs qual totcov eucov alleles A T C G N del valid valid_alleles derived_allele allele_code
166 entry={
167 'pos':pos,
168 'ch':ch,
169 'refc':rc,
170 'nucs':nucs_filtered,
171 'qual':qual_filtered,
172 'totcov':co,
173 'eucov':eucov,
174 'alleles':alar,
175 'del':dell,
176 'N':nco
179 alleles = 0
180 if ac >= self.minCount:
181 alleles += 1
182 #...
184 if tc >= self.minCount:
185 alleles += 1
186 #...
188 if cc >= self.minCount:
189 alleles += 1
190 #...
192 if gc >= self.minCount:
193 alleles += 1
194 #...
196 allele_code = "na"
197 if alleles == 1:
198 allele_code = "M"
199 elif alleles == 2:
200 allele_code = "S"
201 elif alleles >2:
202 allele_code = "T"
203 entry['allele_code'] = allele_code
206 valid = 1
207 if entry['del'] >0 or entry['eucov'] < self.minCoverage or entry['eucov'] > self.maxCoverage:
208 valid = 0
209 entry["valid"] = valid
211 entry['ancestral_allele']="N"
212 entry['derived_allele']="N"
213 entry['removed_alleles']= 0
214 entry['unfolded']=1
215 entry['refc']=entry['refc'].upper()
217 return entry
218 #...
221 if __name__ == "__main__":
222 #import doctest
223 #doctest.testmod()
224 import sys
225 with openPileup(sys.argv[1]) as TextIn:
226 for line in TextIn:
227 record = mpileup_parser(line)
228 print(record)
229 exit()