2 # Modified by Daniel Gomez-Sanchez: adding opening bgzipped pileups
3 # <https://github.com/magicDGS/PoolHMM/blob/master/Modules/parse_pileup.py>
4 # <https://github.com/bihealth/vcfpy/blob/master/vcfpy/parser.py>
5 # <https://github.com/genomeannotation/pileup_analyzer>
15 import pileup_parser_classes
17 PileObj
= pileup_parser_classes
.Pile()
18 parseBase
= pileup_parser_classes
.PileSanitizer()
20 def openPileup(pileup_file
, log
=False):
22 Open pileup in text format or gzipped
23 pileup_prefix is the prefix of .pileup.gz or .pileup
24 mode is the way of opening the file
25 log=True prints if compressed or not compressed are used
31 if pileup_file
.endswith('.zstd'):
32 fh
= open(pileup_file
, 'rb')
33 dctx
= zstandard
.ZstdDecompressor()
34 stream_reader
= dctx
.stream_reader(fh
)
35 pileup
= io
.TextIOWrapper(stream_reader
, encoding
='utf-8')
36 elif pileup_file
.endswith('.gz'):
37 pileup
= gzip
.open(pileup_file
, 'rt')
39 pileup
= open(pileup_file
, 'rt')
41 print ("Input file: %s" % pileup_file
)
45 print ("Could not open input file %s" % pileup_file
)
48 def mpileup_parser(line
):
50 print ("pileup parser empty line provided")
54 #line is splited to get ch, pos, rc, cov, nucs & qual infos
56 line_items
=line
.split()
58 print ("could not parse pileup entry %s" % (line
))
61 RowIDpre
= ['ChrID','Pos','Ref']
62 LineSplited
= len(line_items
)
63 if (LineSplited
% 3) == 0 and LineSplited
>= 6:
64 SampleCnt
= int(LineSplited
/ 3) -1
65 SampleRows
=['Depth','Bases','Quals']
66 SampleNO
= range(1,SampleCnt
+1)
67 RowIDsmp
= [[str(j
)+str(i
) for j
in SampleRows
] for i
in SampleNO
]
71 #(ch, pos, rc, cov, nucs, qual) = line_items
72 for rid
, raw_data
in zip(RowIDall
, line_items
):
73 print(rid
,'=',raw_data
)
74 LineData
= dict(zip(RowIDall
, line_items
))
76 (d
,b
,q
) = mpileup_item_parser(LineData
['Ref'], LineData
['Depth'+str(i
)],LineData
['Bases'+str(i
)],LineData
['Quals'+str(i
)])
79 print("wrong number of columns in pileup line (SampleCnt=%d): %s" % (SampleCnt
,line
))
82 def mpileup_item_parser(Ref
,Depth
,Bases
,Quals
):
83 print(Ref
,Depth
,Bases
,Quals
)
84 result1
= parseBase
.sanitize(Bases
)
86 return [Depth
,Bases
,Quals
]
91 s1
= re
.findall(r
"[-+](\d+)", nucs
)
94 l1
= ['[-+]',n
,'[ACGTNacgtn]{',n
, '}']
95 pattern
= r
''.join(l1
)
96 nucs
= re
.sub(pattern
, '', nucs
)
100 nucs
= re
.sub('\^.', '', nucs
)
103 nucs
= re
.sub('\$', '', nucs
)
106 nucs
= re
.sub('\.', rc
.upper(), nucs
)
109 nucs
= re
.sub(',', rc
.lower(), nucs
)
112 #exception size of sequence != size of quality
113 if len(nucs
) != len(qual
):
114 print ("Size of sequence does not equal size of quality: %s, %s" % (nucs
,line
))
118 #filter the pileup file by quality
119 i
, ac
, tc
, cc
, gc
, nco
, dell
, co
= 0, 0, 0, 0, 0, 0, 0, 0
125 quality
= self
.encode(qc
)
126 if quality
>= self
.minQual
:
128 if nc
== "A" or nc
== "a":
130 elif nc
== 'T' or nc
== 't':
132 elif nc
== "C" or nc
== "c":
134 elif nc
== "G" or nc
== "g":
136 elif nc
== "N" or nc
== "n":
141 print ("Could not parse pileup; Unknown allele : %s in %s" % (a
,line
))
144 if nc
!= "*" and nc
!= "n" and nc
!= "N":
145 nucs_filtered
.append(nc
)
146 qual_filtered
.append(quality
)
152 alar
= [{'a':'A', 'c':ac
}, {'a':'T', 'c':tc
}, {'a':'C', 'c':cc
}, {'a':'G', 'c':gc
}]
153 eucov
= ac
+ tc
+ cc
+ gc
155 if len(nucs_filtered
) != len(qual_filtered
):
156 print ("Error: Length of nucleotides does not agree with lenght of quality string!")
160 if len(nucs_filtered
) != eucov
:
161 print ("Error : Coverage does not agree with length of nucleotides : %d n: %d " % (eucov
, len(nucs_filtered
)))
165 # pos chr refc nucs qual totcov eucov alleles A T C G N del valid valid_alleles derived_allele allele_code
170 'nucs':nucs_filtered
,
171 'qual':qual_filtered
,
180 if ac
>= self
.minCount
:
184 if tc
>= self
.minCount
:
188 if cc
>= self
.minCount
:
192 if gc
>= self
.minCount
:
203 entry
['allele_code'] = allele_code
207 if entry
['del'] >0 or entry
['eucov'] < self
.minCoverage
or entry
['eucov'] > self
.maxCoverage
:
209 entry
["valid"] = valid
211 entry
['ancestral_allele']="N"
212 entry
['derived_allele']="N"
213 entry
['removed_alleles']= 0
215 entry
['refc']=entry
['refc'].upper()
221 if __name__
== "__main__":
225 with
openPileup(sys
.argv
[1]) as TextIn
:
227 record
= mpileup_parser(line
)