modified: spffq.py
[GalaxyCodeBases.git] / python / salus / spffq.py
blob20774a0459857a1843374f0f38352a2076d1935a
1 #!/usr/bin/env python3
3 import sys
4 import io
5 import argparse
6 import pathlib
7 import gzip
8 import pyfastx
9 import pafpy
11 import pprint
12 pp = pprint.PrettyPrinter(indent=4)
14 def eprint(*args, **kwargs) -> None:
15 print(*args, **kwargs, file=sys.stderr, flush=True)
17 def init_argparse() -> argparse.ArgumentParser:
18 parser = argparse.ArgumentParser(
19 description='intersection of FastQ Read 2 with spatial barcodes, and dump matching Read 1 with spatial coordinates',
20 epilog='Contact: <huxs@salus-bio.com>')
21 #parser.add_argument('-s', '--spatial', type=pathlib.Path, default='spatial.txt', metavar='file', help='For spatial.txt[.gz]')
22 parser.add_argument('-1', '--read1', type=pathlib.Path, default='Unmapped.out.mate1', metavar='file', help='For Unmapped.out.mate1[.gz]')
23 #parser.add_argument('-2', '--read2', type=pathlib.Path, default='Unmapped.out.mate2', metavar='file', help='For Unmapped.out.mate2[.gz]')
24 parser.add_argument('-p', '--read2-paf', type=pathlib.Path, default='Unmapped.mate2.paf', metavar='file', help='For Unmapped.mate2.paf')
25 #parser.add_argument('-m', '--max-mismatch', dest='mismatch', type=int, default=1, help='max allowed mismatch, default=1')
26 parser.add_argument('-o', '--output', type=pathlib.Path, default='Unmapped.fq.gz', dest='outfile')
27 #parser.add_argument('-z', '--gzip', action=argparse.BooleanOptionalAction, default=True, help='Output gzipped files, default on', dest='gzip')
28 #parser.add_argument('-n', '--dryrun', '--dry-run', action='store_true', dest='dryrun')
29 parser.add_argument(
30 "-v", "--version", action="version",
31 version=f"{parser.prog} version 1.0.0"
33 return parser
35 def fileOpener(filename):
36 f = open(filename,'rb')
37 fh = f
38 if (f.read(2) == b'\x1f\x8b'):
39 f.seek(0)
40 fh = gzip.GzipFile(fileobj=f, mode='rb')
41 else:
42 f.seek(0)
43 #fht = io.TextIOWrapper(fh, encoding='utf-8', line_buffering=True)
44 return fh
46 def main() -> None:
47 parser = init_argparse()
48 if len(sys.argv) == 1:
49 parser.print_help()
50 eprint('''
51 Requirements:
52 (01) perl -lane 'print ">",join("_",@F),"\\n$F[0]"' spatial.txt | minimap2 -k 15 -d spatial.miniref - 2>spatial.miniref.log
53 (2a) seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 -o Unmapped.mate2.paf 2>Unmapped.mate2.paf.log
54 (2b) seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 2>Unmapped.mate2.paf.log | gzip -1c >Unmapped.mate2.paf.gz
55 ''')
56 exit(0);
57 args = parser.parse_args()
58 pp.pprint(args)
59 eprint('[!]Read1:[',args.read1,'], Read2.PAF:[',args.read2_paf,']. OutFile:[',args.outfile,']',sep='');
60 skipped = 0
61 found = 0
62 with gzip.open(args.outfile, mode='wt', compresslevel=1) as fh:
63 with fileOpener(args.read2_paf) as fp2:
64 with pafpy.PafFile(fp2) as paf:
65 for record in paf:
66 if record.is_primary():
67 #pp.pprint(record)
68 for name,seq,qual in pyfastx.Fastq(args.read1.as_posix(), build_index=False):
69 #print('|'.join((name, seq, qual)))
70 if record.qname == name:
71 print( '|'.join((
72 str(found), str(skipped),
73 str(record.tags['cg']),str(record.tags['cs']),
74 name
75 )) )
76 found +=1
77 break
78 else:
79 skipped +=1
81 if __name__ == "__main__":
82 main() # time ./spffq.py -1 n4457360.Unmapped.out.mate1.gz -p n175410.Unmapped.mate2.paf.gz
84 '''
85 [1]+ Running perl -lane 'print ">",join("_",@F),"\n$F[0]"' spatial.txt | minimap2 -k 15 -d spatial.miniref - 2> spatial.miniref.log &
86 [2]+ Running seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 -o Unmapped2.paf 2> Unmapped2.err &
87 '''