#!/usr/bin/env python

"""Create a censor file for the given FASTA sequence database.  This file
notes subsequences that are redundant, and therefore need not be searched.
The locations of the censored subsequences are noted so that correct search
output can still be generated.  The censor file is a gzip'ed Python pickle
file with suffix '.censor'.
"""

from __future__ import with_statement

__copyright__ = '''
    greylag, a collection of programs for MS/MS protein analysis
    Copyright (C) 2006-2008  Stowers Institute for Medical Research

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    Contact: Mike Coleman
             Stowers Institute for Medical Research
             1000 East 50th Street
             Kansas City, Missouri  64110
             USA
'''

import contextlib
import cPickle
import gzip
import optparse
import os.path
import re
import sys

from greylag import VERSION

##FIX
import greylag_chase


def error(s):
    sys.exit('error: ' + s)


def main(args=sys.argv[1:]):
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <fasta-file>",
                                   description=__doc__)
    pa = parser.add_option
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    pa("--version", action="store_true", dest="version",
       help="print version and exit")
    (options, args) = parser.parse_args(args=args)

    if options.copyright:
        print __copyright__
        sys.exit(0)
    if options.version:
        print VERSION
        sys.exit(0)

    if len(args) != 1:
        parser.print_help()
        sys.exit(1)

    # [(locusname, defline, seq, filename), ...]
    fasta_db = list(greylag_chase.read_fasta_files([args[0]]))
    # [(idno, offset, locusname, defline, seq, seq_filename), ...]
    db = []
    for idno, (locusname, defline, sequence, filename) in enumerate(fasta_db):
        db.extend(greylag_chase.split_sequence_into_aa_runs(idno, locusname,
                                                            defline, sequence,
                                                            filename))
    # seq is element 4 of each db tuple (element 3 is the defline)
    db_residue_count = sum(len(dbi[4]) for dbi in db)

    print "# read %s sequences (%s runs, %s residues)" % (len(fasta_db),
                                                          len(db),
                                                          db_residue_count)

    # each node within the tree is a pair: the first element is a 26-ary
    # list holding child nodes; the second is a list of annotations
    LETTERS = 26
    censor_tree = ([None] * LETTERS, [])
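    # For example (illustrative), after inserting just the string 'AC', the
    # tree would be
    #     ([node_A, None, ..., None], [])    # root: child 0 is 'A'
    # where node_A is ([None, None, node_C, ..., None], []) (child 2 is 'C')
    # and node_C is an empty leaf, ([None] * LETTERS, []).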

    # minimum interesting peptide length (for statistics)
    ## FIX
    MIN_PEPTIDE_LENGTH = 0
    MAX_PEPTIDE_LENGTH = 10
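    # (MAX_PEPTIDE_LENGTH caps how deep new nodes are created below,
    # bounding the tree's memory use)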

    # statistics
    peptides_to_search = 0
    peptides_to_skip = 0
    # residues in the above peptides
    residues_to_search = 0
    residues_to_skip = 0

    for idno, offset, locusname, defline, seq, seq_filename in db:
        sys.stderr.write('.')
        for start in xrange(len(seq)):
            suffix = seq[start:]
            sstart = 0
            ctp = censor_tree
            while suffix[sstart:]:
                # walk the previously seen part of suffix
                i = ord(suffix[sstart]) - ord('A')
                if not ctp[0][i]:
                    break               # prefix of suffix unseen
                ctp = ctp[0][i]
                sstart += 1
            #print 'in %s skipping %s' % (suffix, suffix[:sstart])
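            # e.g. (illustrative): if 'PEPTIDE' was inserted earlier, walking
            # the suffix 'PEPSIN' stops with sstart == 3, since only the
            # prefix 'PEP' has been seen before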
            if sstart >= MIN_PEPTIDE_LENGTH:
                peptides_to_skip += 1
                residues_to_skip += sstart
            while suffix[sstart:]:
                # add the rest of suffix (if any) to the censor tree
                i = ord(suffix[sstart]) - ord('A')
                assert not ctp[0][i]
                if sstart <= MAX_PEPTIDE_LENGTH:
                    ctp[0][i] = ([None] * LETTERS, [])
                    ctp = ctp[0][i]
                sstart += 1
            #print 'in %s adding %s' % (suffix, suffix[:sstart])
            if sstart >= MIN_PEPTIDE_LENGTH:
                peptides_to_search += 1
                residues_to_search += sstart

    #from pprint import pprint
    #pprint(censor_tree)

    print ('# search %s (%s residues), skip %s (%s residues)'
           % (peptides_to_search, residues_to_search, peptides_to_skip,
              residues_to_skip))
    print '# average search length %.2f' % (float(residues_to_search)
                                            / peptides_to_search)
    print '# residue search proportion %.3f' % (float(residues_to_search)
                                                / (residues_to_search
                                                   + residues_to_skip))

    sys.exit()

    ## FIX: everything below this point is unreachable -- it appears to be
    ## leftover spectrum-index-writing code, and the '.censor' file promised
    ## in the module docstring is never actually written
    for fn in args:
        with open(fn) as specfile:
            contents = specfile.read()
        specnames = set()
        with contextlib.closing(gzip.open(fn + '.idx', 'w')) as idx:
            ms = [ m for m in re.finditer(r'^:.*$', contents, re.MULTILINE) ]
            specnames = [ m.group() for m in ms ]
            if len(set(specnames)) < len(ms):
                error("duplicate spectrum names not allowed")
            if specnames != sorted(specnames):
                error("spectra must be ordered by name")
            offsets = [ m.start() for m in ms ]
            cPickle.dump({ 'offsets' : offsets,
                           'file size' : os.path.getsize(fn) },
                         idx, cPickle.HIGHEST_PROTOCOL)
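            # for reference, a reader would recover this index along these
            # lines (a sketch, using the same key names as above):
            #     with contextlib.closing(gzip.open(fn + '.idx')) as idx:
            #         index = cPickle.load(idx)
            #         offsets = index['offsets']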


if __name__ == '__main__':
    main()