Check sequence db locus name uniqueness
[greylag.git] / greylag-index-spectra.py
blob152aacac5bfe0bd82ed8ebd127d17f765e599bdc
1 #!/usr/bin/env python
3 '''Create a trivial index giving the starting point of each spectrum, as a
4 byte offset from the file beginning. (The index is stored in Python pickle
5 format, compressed with gzip.) Also checks that spectra names are unique
6 and that spectra are ordered by name, which other greylag programs assume.
7 '''
9 from __future__ import with_statement
11 __copyright__ = '''
12 greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
14 This program is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 2 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License along
25 with this program; if not, write to the Free Software Foundation, Inc.,
26 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 '''
29 __version__ = "0.0"
32 import contextlib
33 import cPickle
34 import gzip
35 import optparse
36 import os.path
37 import re
38 import sys
def error(s):
    '''Terminate the program, reporting the message s prefixed by "error: ".

    The message is handed to sys.exit, which raises SystemExit carrying it.
    '''
    message = 'error: ' + s
    sys.exit(message)
def main(args=sys.argv[1:]):
    '''Write a spectrum-offset index next to each given .ms2 file.

    args is the list of command-line arguments (options followed by one or
    more file names ending in '.ms2').  For each file FN, every line that
    starts with ':' is taken as a spectrum name line.  The names must be
    unique and in sorted order; otherwise the program exits through
    error().  On success, FN.idx is written as a gzip-compressed pickle of
    a dict with the keys 'offsets' (start offset of each name line) and
    'file size' (size of FN as reported by os.path.getsize).

    Exits with status 0 after --copyright or --version, and with status 1
    (after printing help) if no files are given or any file name does not
    end in '.ms2'.
    '''
    parser = optparse.OptionParser(usage=
                                   "usage: %prog [options] <ms2-file>...",
                                   description=__doc__)
    pa = parser.add_option
    pa("--copyright", action="store_true", dest="copyright",
       help="print copyright and exit")
    pa("--version", action="store_true", dest="version",
       help="print version and exit")
    (options, args) = parser.parse_args(args=args)

    # Single-argument parenthesized print behaves the same as the bare
    # print statement under Python 2.
    if options.copyright:
        print(__copyright__)
        sys.exit(0)
    if options.version:
        print(__version__)
        sys.exit(0)

    if not args or any(not f.endswith('.ms2') for f in args):
        parser.print_help()
        sys.exit(1)

    for fn in args:
        # Read in binary mode so that match positions are true byte offsets
        # even on platforms where text mode translates line endings.
        with open(fn, 'rb') as specfile:
            contents = specfile.read()

        ms = list(re.finditer(r'^:.*$', contents, re.MULTILINE))
        specnames = [m.group() for m in ms]

        # Validate before opening the index for writing, so that a failed
        # check neither clobbers an existing index nor leaves an empty one.
        if len(set(specnames)) < len(ms):
            error("duplicate spectrum names not allowed")
        if specnames != sorted(specnames):
            error("spectra must be ordered by name")

        offsets = [m.start() for m in ms]
        index = {'offsets': offsets,
                 'file size': os.path.getsize(fn)}
        # contextlib.closing is used because older GzipFile objects are not
        # context managers themselves.
        with contextlib.closing(gzip.open(fn + '.idx', 'w')) as idx:
            cPickle.dump(index, idx, cPickle.HIGHEST_PROTOCOL)
# Run the indexer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()