greylag_index_spectra.py

   1 #!/usr/bin/env greylag-python
   2
   3 '''Create a trivial index giving the starting point of each spectrum, as a
   4    byte offset from the file beginning.  (The index is stored in Python pickle
   5    format, compressed with gzip.)  Also checks that spectra names are unique
   6    and that spectra are ordered by name, which other greylag programs assume.
   7 '''
   8
   9 from __future__ import with_statement
  10
  11 __copyright__ = '''
  12     greylag, Copyright (C) 2006-2007, Stowers Institute for Medical Research
  13
  14     This program is free software; you can redistribute it and/or modify
  15     it under the terms of the GNU General Public License as published by
  16     the Free Software Foundation; either version 2 of the License, or
  17     (at your option) any later version.
  18
  19     This program is distributed in the hope that it will be useful,
  20     but WITHOUT ANY WARRANTY; without even the implied warranty of
  21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22     GNU General Public License for more details.
  23
  24     You should have received a copy of the GNU General Public License along
  25     with this program; if not, write to the Free Software Foundation, Inc.,
  26     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  27 '''
  28
  29 __version__ = "0.0"
  30
  31
  32 import contextlib
  33 import cPickle
  34 import gzip
  35 import optparse
  36 import os.path
  37 import re
  38 import sys
  39
  40
  41 def error(s):
  42     sys.exit('error: ' + s)
  43
  44
  45 def main(args=sys.argv[1:]):
  46     parser = optparse.OptionParser(usage=
  47                                    "usage: %prog [options] <ms2-file>...",
  48                                    description=__doc__)
  49     pa = parser.add_option
  50     pa("--copyright", action="store_true", dest="copyright",
  51        help="print copyright and exit")
  52     pa("--version", action="store_true", dest="version",
  53        help="print version and exit")
  54     (options, args) = parser.parse_args(args=args)
  55
  56     if options.copyright:
  57         print __copyright__
  58         sys.exit(0)
  59     if options.version:
  60         print __version__
  61         sys.exit(0)
  62
  63     if (len(args) < 1
  64         or any(True for f in args if not f.endswith('.ms2'))):
  65         parser.print_help()
  66         sys.exit(1)
  67
  68     for fn in args:
  69         with open(fn) as specfile:
  70             contents = specfile.read()
  71         specnames = set()
  72         prevname = ''
  73         offset = 0
  74         with contextlib.closing(gzip.open(fn + '.idx', 'w')) as idx:
  75             ms = [ m for m in re.finditer('^:.*$', contents, re.MULTILINE) ]
  76             specnames = [ m.group() for m in ms ]
  77             if len(set(specnames)) < len(ms):
  78                 error("duplicate spectrum names not allowed")
  79             if specnames != sorted(specnames):
  80                 error("spectra must be ordered by name")
  81             offsets = [ m.start() for m in ms ]
  82             cPickle.dump({ 'offsets' : offsets,
  83                            'file size' : os.path.getsize(fn) },
  84                          idx, cPickle.HIGHEST_PROTOCOL)
  85
  86
# Run as a script: process the ms2 files named on the command line.
if __name__ == '__main__':
    main()