pygr/sequtil.py

   1
   2 DNA_SEQTYPE=0
   3 RNA_SEQTYPE=1
   4 PROTEIN_SEQTYPE=2
   5
   6 def guess_seqtype(s):
   7     dna_letters='AaTtUuGgCcNn'
   8     ndna=0
   9     nU=0
  10     nT=0
  11     for l in s:
  12         if l in dna_letters:
  13             ndna += 1
  14         if l=='U' or l=='u':
  15             nU += 1
  16         elif l=='T' or l=='t':
  17             nT += 1
  18     ratio=ndna/float(len(s))
  19     if ratio>0.85:
  20         if nT>nU:
  21             return DNA_SEQTYPE
  22         else:
  23             return RNA_SEQTYPE
  24     else:
  25         return PROTEIN_SEQTYPE
  26
  27 seq_id_counter=0
  28 def new_seq_id():
  29     global seq_id_counter
  30     seq_id_counter += 1
  31     return str(seq_id_counter-1)
  32
  33
  34 def write_fasta(ofile,s,chunk=60,id=None,reformatter=None):
  35     "Trivial FASTA output"
  36     if id is None:
  37         try:
  38             id = str(s.id)
  39         except AttributeError:
  40             id = new_seq_id()
  41
  42     ofile.write('>' + id + '\n')
  43     seq = str(s)
  44     if reformatter is not None: # APPLY THE DESIRED REFORMATTING
  45         seq = reformatter(seq)
  46     end = len(seq)
  47     pos = 0
  48     while 1:
  49         ofile.write(seq[pos:pos+chunk] + '\n')
  50         pos += chunk
  51         if pos >= end:
  52             break
  53     return id # IN CASE CALLER WANTS TEMP ID WE MAY HAVE ASSIGNED
  54
  55 def read_fasta(ifile):
  56     "iterate over id,title,seq from stream ifile"
  57     id = None
  58     isEmpty = True
  59     for line in ifile:
  60         if '>' == line[0]:
  61             if id is not None and len(seq) > 0:
  62                 yield id,title,seq
  63                 isEmpty = False
  64             id = line[1:].split()[0]
  65             title = line[len(id)+2:]
  66             seq = ''
  67         elif id is not None: # READ SEQUENCE
  68             for word in line.split(): # GET RID OF WHITESPACE
  69                 seq += word
  70     if id is not None and len(seq) > 0:
  71         yield id,title,seq
  72     elif isEmpty:
  73         raise IOError('no readable sequence in FASTA file!')
  74
  75 def read_fasta_one_line(ifile): # @CTB deprecated; remove
  76     "read a single sequence line, return id,title,seq"
  77     id = None
  78     seq = ''
  79     while True:
  80         line = ifile.readline(1024) # READ AT MOST 1KB
  81         if line == '': # EOF
  82             break
  83         elif '>' == line[0]:
  84             id = line[1:].split()[0]
  85             title = line[len(id)+2:]
  86         elif id is not None: # READ SEQUENCE
  87             for word in line.split(): # GET RID OF WHITESPACE
  88                 seq += word
  89             if len(seq)>0:
  90                 return id,title,seq
  91     raise IOError('no readable sequence in FASTA file!')
  92
  93 def read_fasta_lengths(ifile):
  94     "Generate sequence ID,length from stream ifile"
  95     id = None
  96     seqLength = 0
  97     isEmpty = True
  98     for line in ifile:
  99         if '>' == line[0]:
 100             if id is not None and seqLength > 0:
 101                 yield id,seqLength
 102                 isEmpty = False
 103             id = line[1:].split()[0]
 104             seqLength = 0
 105         elif id is not None: # READ SEQUENCE
 106             for word in line.split(): # GET RID OF WHITESPACE
 107                 seqLength += len(word)
 108     if id is not None and seqLength > 0:
 109         yield id,seqLength
 110     elif isEmpty:
 111         raise IOError('no readable sequence in FASTA file!')
 112
 113 class AATranslation(object):
 114     'customizable translation class'
 115     geneticCode = dict(TTY='F', TTR='L', TCN='S', TAY='Y', TGY='C', TGG='W',
 116                        CTN='L', CCN='P', CAY='H', CAR='Q', CGN='R',
 117                        ATY='I', ATA='I', ATG='M', ACN='T', AAY='N', AAR='K',
 118                        AGY='S', AGR='R',
 119                        GTN='V', GCN='A', GAY='D', GAR='E', GGN='G',
 120                        TAR='*', TGA='*')
 121     def __init__(self):
 122         'initialize our translation dictionary by applying N,Y,R codes'
 123         geneticCode = self.geneticCode.copy()
 124         for codon,aa in self.geneticCode.items():
 125             if codon[2] == 'N':
 126                 geneticCode[codon[:2]+'A'] = aa
 127                 geneticCode[codon[:2]+'T'] = aa
 128                 geneticCode[codon[:2]+'G'] = aa
 129                 geneticCode[codon[:2]+'C'] = aa
 130             elif codon[2] == 'Y':
 131                 geneticCode[codon[:2]+'T'] = aa
 132                 geneticCode[codon[:2]+'C'] = aa
 133             elif codon[2] == 'R':
 134                 geneticCode[codon[:2]+'A'] = aa
 135                 geneticCode[codon[:2]+'G'] = aa
 136         self.geneticCode = geneticCode
 137     def __call__(self, s):
 138         'translate nucleotide string s to amino acid string'
 139         s = s.upper()
 140         s = s.replace('U', 'T')
 141         l = []
 142         for i in range(0, len(s), 3):
 143             try:
 144                 l.append(self.geneticCode[s[i:i+3]])
 145             except KeyError:
 146                 l.append('X') # uninterpretable
 147         return ''.join(l)
 148
 149 translate_orf = AATranslation() # default translation function
 150
 151
 152