#!/usr/bin/env python

"""Create a censor file for the given FASTA sequence database.  This file
notes subsequences that are redundant, and therefore need not be searched.
The locations of the censored subsequences are noted so that correct search
output can still be generated.  The censor file is a gzip'ed Python pickle
file with suffix '.censor'.
"""

from __future__ import with_statement

__copyright__ = '''
    greylag, a collection of programs for MS/MS protein analysis
    Copyright (C) 2006-2008  Stowers Institute for Medical Research

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    Contact: Mike Coleman
             Stowers Institute for Medical Research
             1000 East 50th Street
             Kansas City, Missouri  64110
             USA
'''

import contextlib
import cPickle
import gzip
import optparse
import os.path
import re
import sys

from greylag import VERSION

##FIX
import greylag_chase


def error(s):
    sys.exit('error: ' + s)


def main(args=sys.argv[1:]):
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <fasta-file>",
                                   description=__doc__)
    pa = parser.add_option
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    pa("--version", action="store_true", dest="version",
       help="print version and exit")
    (options, args) = parser.parse_args(args=args)

    if options.copyright:
        print __copyright__
        sys.exit(0)
    if options.version:
        print VERSION
        sys.exit(0)

    if len(args) != 1:
        parser.print_help()
        sys.exit(1)

    # [(locusname, defline, seq, filename), ...]
    fasta_db = list(greylag_chase.read_fasta_files([args[0]]))
    # [(idno, offset, locusname, defline, seq, seq_filename), ...]
    db = []
    for idno, (locusname, defline, sequence, filename) in enumerate(fasta_db):
        db.extend(greylag_chase.split_sequence_into_aa_runs(idno, locusname,
                                                            defline, sequence,
                                                            filename))
    # seq is element 4 of each db tuple (element 3 is the defline)
    db_residue_count = sum(len(dbi[4]) for dbi in db)

    print "# read %s sequences (%s runs, %s residues)" % (len(fasta_db),
                                                          len(db),
                                                          db_residue_count)

    # each node within the tree is a pair: the first element is a 26-ary
    # list holding child nodes; the second is a list of annotations
    LETTERS = 26
    censor_tree = ([None] * LETTERS, [])
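    # For example (illustrative), after inserting just the string 'AC', the
    # tree would be
    #     ([node_A, None, ..., None], [])    # root: child 0 is 'A'
    # where node_A is ([None, None, node_C, ..., None], []) (child 2 is 'C')
    # and node_C is an empty leaf, ([None] * LETTERS, []).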

    # minimum interesting peptide length (for statistics)
    ## FIX
    MIN_PEPTIDE_LENGTH = 0
    MAX_PEPTIDE_LENGTH = 10
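    # (MAX_PEPTIDE_LENGTH caps how deep new nodes are created below,
    # bounding the tree's memory use)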

    # statistics
    peptides_to_search = 0
    peptides_to_skip = 0
    # residues in the above peptides
    residues_to_search = 0
    residues_to_skip = 0

    for idno, offset, locusname, defline, seq, seq_filename in db:
        sys.stderr.write('.')
        for start in xrange(len(seq)):
            suffix = seq[start:]
            sstart = 0
            ctp = censor_tree
            while suffix[sstart:]:
                # walk the previously seen part of suffix
                i = ord(suffix[sstart]) - ord('A')
                if not ctp[0][i]:
                    break               # prefix of suffix unseen
                ctp = ctp[0][i]
                sstart += 1
            #print 'in %s skipping %s' % (suffix, suffix[:sstart])
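            # e.g. (illustrative): if 'PEPTIDE' was inserted earlier, walking
            # the suffix 'PEPSIN' stops with sstart == 3, since only the
            # prefix 'PEP' has been seen before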
            if sstart >= MIN_PEPTIDE_LENGTH:
                peptides_to_skip += 1
                residues_to_skip += sstart
            while suffix[sstart:]:
                # add the rest of suffix (if any) to the censor tree
                i = ord(suffix[sstart]) - ord('A')
                assert not ctp[0][i]
                if sstart <= MAX_PEPTIDE_LENGTH:
                    ctp[0][i] = ([None] * LETTERS, [])
                    ctp = ctp[0][i]
                sstart += 1
            #print 'in %s adding %s' % (suffix, suffix[:sstart])
            if sstart >= MIN_PEPTIDE_LENGTH:
                peptides_to_search += 1
                residues_to_search += sstart

    #from pprint import pprint
    #pprint(censor_tree)

    print ('# search %s (%s residues), skip %s (%s residues)'
           % (peptides_to_search, residues_to_search, peptides_to_skip,
              residues_to_skip))
    print '# average search length %.2f' % (float(residues_to_search)
                                            / peptides_to_search)
    print '# residue search proportion %.3f' % (float(residues_to_search)
                                                / (residues_to_search
                                                   + residues_to_skip))

    sys.exit()

    ## FIX: everything below this point is unreachable -- it appears to be
    ## leftover spectrum-index-writing code, and the '.censor' file promised
    ## in the module docstring is never actually written
    for fn in args:
        with open(fn) as specfile:
            contents = specfile.read()
        specnames = set()
        with contextlib.closing(gzip.open(fn + '.idx', 'w')) as idx:
            ms = [ m for m in re.finditer(r'^:.*$', contents, re.MULTILINE) ]
            specnames = [ m.group() for m in ms ]
            if len(set(specnames)) < len(ms):
                error("duplicate spectrum names not allowed")
            if specnames != sorted(specnames):
                error("spectra must be ordered by name")
            offsets = [ m.start() for m in ms ]
            cPickle.dump({ 'offsets' : offsets,
                           'file size' : os.path.getsize(fn) },
                         idx, cPickle.HIGHEST_PROTOCOL)
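            # for reference, a reader would recover this index along these
            # lines (a sketch, using the same key names as above):
            #     with contextlib.closing(gzip.open(fn + '.idx')) as idx:
            #         index = cPickle.load(idx)
            #         offsets = index['offsets']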


if __name__ == '__main__':
    main()