added SQLTable pickle test
[pygr.git] / pygr / sequtil.py
blob2cd3d476c20dd0019d81ca22c0cd4a703fde11ac
2 DNA_SEQTYPE=0
3 RNA_SEQTYPE=1
4 PROTEIN_SEQTYPE=2
6 def guess_seqtype(s):
7 dna_letters='AaTtUuGgCcNn'
8 ndna=0
9 nU=0
10 nT=0
11 for l in s:
12 if l in dna_letters:
13 ndna += 1
14 if l=='U' or l=='u':
15 nU += 1
16 elif l=='T' or l=='t':
17 nT += 1
18 ratio=ndna/float(len(s))
19 if ratio>0.85:
20 if nT>nU:
21 return DNA_SEQTYPE
22 else:
23 return RNA_SEQTYPE
24 else:
25 return PROTEIN_SEQTYPE
27 seq_id_counter=0
28 def new_seq_id():
29 global seq_id_counter
30 seq_id_counter += 1
31 return str(seq_id_counter-1)
34 def write_fasta(ofile,s,chunk=60,id=None,reformatter=None):
35 "Trivial FASTA output"
36 if id is None:
37 try:
38 id = str(s.id)
39 except AttributeError:
40 id = new_seq_id()
42 ofile.write('>' + id + '\n')
43 seq = str(s)
44 if reformatter is not None: # APPLY THE DESIRED REFORMATTING
45 seq = reformatter(seq)
46 end = len(seq)
47 pos = 0
48 while 1:
49 ofile.write(seq[pos:pos+chunk] + '\n')
50 pos += chunk
51 if pos >= end:
52 break
53 return id # IN CASE CALLER WANTS TEMP ID WE MAY HAVE ASSIGNED
55 def read_fasta(ifile):
56 "iterate over id,title,seq from stream ifile"
57 id = None
58 isEmpty = True
59 for line in ifile:
60 if '>' == line[0]:
61 if id is not None and len(seq) > 0:
62 yield id,title,seq
63 isEmpty = False
64 id = line[1:].split()[0]
65 title = line[len(id)+2:]
66 seq = ''
67 elif id is not None: # READ SEQUENCE
68 for word in line.split(): # GET RID OF WHITESPACE
69 seq += word
70 if id is not None and len(seq) > 0:
71 yield id,title,seq
72 elif isEmpty:
73 raise IOError('no readable sequence in FASTA file!')
75 def read_fasta_one_line(ifile): # @CTB deprecated; remove
76 "read a single sequence line, return id,title,seq"
77 id = None
78 seq = ''
79 while True:
80 line = ifile.readline(1024) # READ AT MOST 1KB
81 if line == '': # EOF
82 break
83 elif '>' == line[0]:
84 id = line[1:].split()[0]
85 title = line[len(id)+2:]
86 elif id is not None: # READ SEQUENCE
87 for word in line.split(): # GET RID OF WHITESPACE
88 seq += word
89 if len(seq)>0:
90 return id,title,seq
91 raise IOError('no readable sequence in FASTA file!')
93 def read_fasta_lengths(ifile):
94 "Generate sequence ID,length from stream ifile"
95 id = None
96 seqLength = 0
97 isEmpty = True
98 for line in ifile:
99 if '>' == line[0]:
100 if id is not None and seqLength > 0:
101 yield id,seqLength
102 isEmpty = False
103 id = line[1:].split()[0]
104 seqLength = 0
105 elif id is not None: # READ SEQUENCE
106 for word in line.split(): # GET RID OF WHITESPACE
107 seqLength += len(word)
108 if id is not None and seqLength > 0:
109 yield id,seqLength
110 elif isEmpty:
111 raise IOError('no readable sequence in FASTA file!')
113 class AATranslation(object):
114 'customizable translation class'
115 geneticCode = dict(TTY='F', TTR='L', TCN='S', TAY='Y', TGY='C', TGG='W',
116 CTN='L', CCN='P', CAY='H', CAR='Q', CGN='R',
117 ATY='I', ATA='I', ATG='M', ACN='T', AAY='N', AAR='K',
118 AGY='S', AGR='R',
119 GTN='V', GCN='A', GAY='D', GAR='E', GGN='G',
120 TAR='*', TGA='*')
121 def __init__(self):
122 'initialize our translation dictionary by applying N,Y,R codes'
123 geneticCode = self.geneticCode.copy()
124 for codon,aa in self.geneticCode.items():
125 if codon[2] == 'N':
126 geneticCode[codon[:2]+'A'] = aa
127 geneticCode[codon[:2]+'T'] = aa
128 geneticCode[codon[:2]+'G'] = aa
129 geneticCode[codon[:2]+'C'] = aa
130 elif codon[2] == 'Y':
131 geneticCode[codon[:2]+'T'] = aa
132 geneticCode[codon[:2]+'C'] = aa
133 elif codon[2] == 'R':
134 geneticCode[codon[:2]+'A'] = aa
135 geneticCode[codon[:2]+'G'] = aa
136 self.geneticCode = geneticCode
137 def __call__(self, s):
138 'translate nucleotide string s to amino acid string'
139 s = s.upper()
140 s = s.replace('U', 'T')
141 l = []
142 for i in range(0, len(s), 3):
143 try:
144 l.append(self.geneticCode[s[i:i+3]])
145 except KeyError:
146 l.append('X') # uninterpretable
147 return ''.join(l)
149 translate_orf = AATranslation() # default translation function