new file: alignfq.py
[GalaxyCodeBases.git] / python / etc / alignfq.py
blobf7dacf381ef5ef678c05d31a1f105f173d717b37
1 #!/usr/bin/env python3
3 import sys
4 from skbio import DNA
5 from skbio.alignment import global_pairwise_align_nucleotide
7 def eprint(*args, **kwargs):
8 print(*args, file=sys.stderr, **kwargs)
10 substitution_matrix = {'A': {'A': 1, 'C': -2, 'G': -2, 'T': -2, 'U': -2, 'N': -1},
11 'C': {'A': -2, 'C': 1, 'G': -2, 'T': -2, 'U': -2, 'N': -1},
12 'G': {'A': -2, 'C': -2, 'G': 1, 'T': -2, 'U': -2, 'N': -1},
13 'T': {'A': -2, 'C': -2, 'G': -2, 'T': 1, 'U': -2, 'N': -1},
14 'U': {'A': -2, 'C': -2, 'G': -2, 'T': -2, 'U': 1, 'N': -1},
15 'N': {'A': -1, 'C': -1, 'G': -1, 'T': -1, 'U': -1, 'N': 0}}
17 def getAdapters():
18 IlluminaP7adapter = 'CAAGCAGAAGACGGCATACGAGAT'
19 TruseqRead1primer5E = 'TCTTTCCCTACACGACGCTCTTCCGATCT'
20 SpacialBarcode = 'N' * 30
21 SeqScopeUMIpHR1B = 'CCCGTTCGCAACATGTCTGGCGTCATA' # https://doi.org/10.1016/j.cell.2021.05.010
22 SeqScopeHDMIRead1 = 'TCTTGTGACTACAGCACCCTCGACTCTCGC'
23 pUMI = 'N' * 10
24 polyT = 'T' * 30
25 #mRNAdocker = 'VN'
26 # mRNA
27 #qUMI = 'N' * 9
28 TruSeqRead2primerRC = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
29 IlluminaP5adapterRC = 'GTGTAGATCTCGGTGGTCGCCGTATCATT'
31 AdapterL = IlluminaP7adapter + TruseqRead1primer5E + SpacialBarcode + SeqScopeUMIpHR1B + SeqScopeHDMIRead1 + pUMI + polyT
32 AdapterR = TruSeqRead2primerRC + IlluminaP5adapterRC
33 Adapters = {'AdapterL':DNA(AdapterL), 'AdapterR':DNA(AdapterR)}
34 return Adapters
36 def aliOneSeq(Adapters,targetSeq):
37 Alignments = {}
38 Scores = {}
39 for aid in Adapters.keys():
40 Adapter = Adapters[aid]
41 TargetSeq = DNA(targetSeq)
42 rcTargetSeq = TargetSeq.reverse_complement()
43 alignment, score, start_end_positions = global_pairwise_align_nucleotide(
44 TargetSeq, Adapter, substitution_matrix=substitution_matrix
46 RCalignment, RCscore, RCstart_end_positions = global_pairwise_align_nucleotide(
47 rcTargetSeq, Adapter, substitution_matrix=substitution_matrix
49 strand = '*'
50 thisScore = 0
51 if RCalignment == None or score > RCscore:
52 resAli = alignment
53 strand = '+'
54 thisScore = score
55 else:
56 resAli = RCalignment
57 strand = '-'
58 thisScore = RCscore
59 if thisScore > 5:
60 Alignments[aid] = (resAli,strand,thisScore)
61 return Alignments
63 def printAli(Alignments,seq,seqid):
64 prnStr = ':'.join((seqid,seq))
65 print(prnStr)
66 for tid in Alignments.keys():
67 (resAli,strand,thisScore) = Alignments[tid]
68 tmp = []
69 resAli.write(tmp,format='stockholm')
70 prnStr = ' '.join((
71 tid, strand,
72 str(thisScore)
74 print(prnStr)
75 prnStr = ''.join(tmp[1:3])
76 print(prnStr)
77 return
79 def process(lines=None):
80 ks = ['name', 'sequence', 'optional', 'quality']
81 return {k: v for k, v in zip(ks, lines)}
83 def main():
84 if len(sys.argv) < 3 :
85 print('Usage:',sys.argv[0],'<fq file> <umi length> >alignments.out',file=sys.stderr,flush=True);
86 exit(0);
87 fqFile = sys.argv[1]
88 umiLength = int(sys.argv[2])
90 Adapters = getAdapters()
92 n = 4
93 with open(fqFile) as fqfile:
94 lines = []
95 for line in fqfile:
96 lines.append(line.rstrip())
97 if len(lines) == n:
98 record = process(lines)
99 record['name'] = ' '.join((record['name'][1:],record['sequence'][:umiLength]))
100 record['sequence'] = record['sequence'][umiLength:]
101 lines = []
102 maxAlignments = aliOneSeq(Adapters,record['sequence'])
103 printAli(maxAlignments,record['sequence'],record['name'])
105 if __name__ == "__main__":
106 main() # time ./alignkmer.py deumi31-9.out.rnk2 1000000 2>/dev/null |head
108 # GGAGTTGCAAAAGGTCTGCGAGAGTCGAGAGTGCTGTAGTCACAAGATATGACGCCAGACATGTTGCGAACGGGTAAAACTACCCTACACT