"""Create a censor file for the given FASTA sequence database. This file
notes subsequences that are redundant, and therefore need not be searched.
The locations of the censored subsequences are noted so that correct search
output can still be generated. The censor file is a gzip'ed Python pickle
file with suffix '.censor'.
10 from __future__
import with_statement
greylag, a collection of programs for MS/MS protein analysis
Copyright (C) 2006-2008  Stowers Institute for Medical Research

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Stowers Institute for Medical Research
Kansas City, Missouri  64110
45 from greylag
import VERSION
52 sys
.exit('error: ' + s
)
55 def main(args
=sys
.argv
[1:]):
56 parser
= optparse
.OptionParser(usage
=
57 "usage: %prog [options] <fasta-file>",
59 pa
= parser
.add_option
60 pa("--copyright", action
="store_true", dest
="copyright",
61 help="print copyright and exit")
62 pa("--version", action
="store_true", dest
="version",
63 help="print version and exit")
64 (options
, args
) = parser
.parse_args(args
=args
)
78 # [(locusname, defline, seq, filename), ...]
79 fasta_db
= list(greylag_chase
.read_fasta_files([args
[0]]))
80 # [(idno, offset, locusname, defline, seq, seq_filename), ...]
82 for idno
, (locusname
, defline
, sequence
, filename
) in enumerate(fasta_db
):
83 db
.extend(greylag_chase
.split_sequence_into_aa_runs(idno
, locusname
,
86 db_residue_count
= sum(len(dbi
[3]) for dbi
in db
)
88 print "# read %s sequences (%s runs, %s residues)" % (len(fasta_db
), len(db
),
91 # each node within the tree is pair: the first element is a 26-ary list
92 # holding child nodes; the second is a list of annotations
94 censor_tree
= ([None] * LETTERS
, [])
96 # minimum interesting peptide length (for statistics)
98 MIN_PEPTIDE_LENGTH
= 0
99 MAX_PEPTIDE_LENGTH
= 10
102 peptides_to_search
= 0
104 # residues in the above peptides
105 residues_to_search
= 0
108 for idno
, offset
, locusname
, defline
, seq
, seq_filename
in db
:
109 sys
.stderr
.write('.')
110 for start
in xrange(len(seq
)):
115 while suffix
[sstart
:]:
116 # walk the previously seen part of suffix
117 i
= ord(suffix
[sstart
]) - ord('A')
119 break # prefix of suffix unseen
122 #print 'in %s skipping %s' % (suffix, suffix[:sstart])
123 if sstart
>= MIN_PEPTIDE_LENGTH
:
124 peptides_to_skip
+= 1
125 residues_to_skip
+= sstart
127 while suffix
[sstart
:]:
128 # add the rest of suffix (if any) to the censor tree
129 i
= ord(suffix
[sstart
]) - ord('A')
131 if sstart
<= MAX_PEPTIDE_LENGTH
:
132 ctp
[0][i
] = ([None] * LETTERS
, [])
135 #print 'in %s adding %s' % (suffix, suffix[:sstart])
136 if sstart
>= MIN_PEPTIDE_LENGTH
:
137 peptides_to_search
+= 1
138 residues_to_search
+= sstart
140 #from pprint import pprint
143 print ('# search %s (%s residues), skip %s (%s residues)'
144 % (peptides_to_search
, residues_to_search
, peptides_to_skip
,
146 print '# average search length %.2f' % (float(residues_to_search
)
147 / peptides_to_search
)
149 print '# residue search proportion %.3f' % (float(residues_to_search
)
150 / (residues_to_search
157 with
open(fn
) as specfile
:
158 contents
= specfile
.read()
160 with contextlib
.closing(gzip
.open(fn
+ '.idx', 'w')) as idx
:
161 ms
= [ m
for m
in re
.finditer(r
'^:.*$', contents
, re
.MULTILINE
) ]
162 specnames
= [ m
.group() for m
in ms
]
163 if len(set(specnames
)) < len(ms
):
164 error("duplicate spectrum names not allowed")
165 if specnames
!= sorted(specnames
):
166 error("spectra must be ordered by name")
167 offsets
= [ m
.start() for m
in ms
]
168 cPickle
.dump({ 'offsets' : offsets
,
169 'file size' : os
.path
.getsize(fn
) },
170 idx
, cPickle
.HIGHEST_PROTOCOL
)
173 if __name__
== '__main__':