5 from skbio
.alignment
import global_pairwise_align_nucleotide
7 def eprint(*args
, **kwargs
):
8 print(*args
, file=sys
.stderr
, **kwargs
)
10 substitution_matrix
= {'A': {'A': 1, 'C': -2, 'G': -2, 'T': -2, 'U': -2, 'N': -1},
11 'C': {'A': -2, 'C': 1, 'G': -2, 'T': -2, 'U': -2, 'N': -1},
12 'G': {'A': -2, 'C': -2, 'G': 1, 'T': -2, 'U': -2, 'N': -1},
13 'T': {'A': -2, 'C': -2, 'G': -2, 'T': 1, 'U': -2, 'N': -1},
14 'U': {'A': -2, 'C': -2, 'G': -2, 'T': -2, 'U': 1, 'N': -1},
15 'N': {'A': -1, 'C': -1, 'G': -1, 'T': -1, 'U': -1, 'N': 0}}
18 IlluminaP7adapter
= 'CAAGCAGAAGACGGCATACGAGAT'
19 TruseqRead1primer5E
= 'TCTTTCCCTACACGACGCTCTTCCGATCT'
20 SpacialBarcode
= 'N' * 30
21 SeqScopeUMIpHR1B
= 'CCCGTTCGCAACATGTCTGGCGTCATA' # https://doi.org/10.1016/j.cell.2021.05.010
22 SeqScopeHDMIRead1
= 'TCTTGTGACTACAGCACCCTCGACTCTCGC'
28 TruSeqRead2primerRC
= 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
29 IlluminaP5adapterRC
= 'GTGTAGATCTCGGTGGTCGCCGTATCATT'
31 AdapterL
= IlluminaP7adapter
+ TruseqRead1primer5E
+ SpacialBarcode
+ SeqScopeUMIpHR1B
+ SeqScopeHDMIRead1
+ pUMI
+ polyT
32 AdapterR
= TruSeqRead2primerRC
+ IlluminaP5adapterRC
33 Adapters
= {'AdapterL':DNA(AdapterL
), 'AdapterR':DNA(AdapterR
)}
36 def aliOneSeq(Adapters
,targetSeq
):
39 for aid
in Adapters
.keys():
40 Adapter
= Adapters
[aid
]
41 TargetSeq
= DNA(targetSeq
)
42 rcTargetSeq
= TargetSeq
.reverse_complement()
43 alignment
, score
, start_end_positions
= global_pairwise_align_nucleotide(
44 TargetSeq
, Adapter
, substitution_matrix
=substitution_matrix
46 RCalignment
, RCscore
, RCstart_end_positions
= global_pairwise_align_nucleotide(
47 rcTargetSeq
, Adapter
, substitution_matrix
=substitution_matrix
51 if RCalignment
== None or score
> RCscore
:
60 Alignments
[aid
] = (resAli
,strand
,thisScore
)
63 def printAli(Alignments
,seq
,seqid
):
64 prnStr
= ':'.join((seqid
,seq
))
66 for tid
in Alignments
.keys():
67 (resAli
,strand
,thisScore
) = Alignments
[tid
]
69 resAli
.write(tmp
,format
='stockholm')
75 prnStr
= ''.join(tmp
[1:3])
79 def process(lines
=None):
80 ks
= ['name', 'sequence', 'optional', 'quality']
81 return {k
: v
for k
, v
in zip(ks
, lines
)}
84 if len(sys
.argv
) < 3 :
85 print('Usage:',sys
.argv
[0],'<fq file> <umi length> >alignments.out',file=sys
.stderr
,flush
=True);
88 umiLength
= int(sys
.argv
[2])
90 Adapters
= getAdapters()
93 with
open(fqFile
) as fqfile
:
96 lines
.append(line
.rstrip())
98 record
= process(lines
)
99 record
['name'] = ' '.join((record
['name'][1:],record
['sequence'][:umiLength
]))
100 record
['sequence'] = record
['sequence'][umiLength
:]
102 maxAlignments
= aliOneSeq(Adapters
,record
['sequence'])
103 printAli(maxAlignments
,record
['sequence'],record
['name'])
105 if __name__
== "__main__":
106 main() # time ./alignkmer.py deumi31-9.out.rnk2 1000000 2>/dev/null |head
108 # GGAGTTGCAAAAGGTCTGCGAGAGTCGAGAGTGCTGTAGTCACAAGATATGACGCCAGACATGTTGCGAACGGGTAAAACTACCCTACACT