Merge commit 'v0.1.2'
[greylag.git] / greylag_flatten_fasta.py
blobc9eb62dc7f9ab3196cdeebda4a17409aae8ee237
1 #!/usr/bin/env python
3 """
4 Convert between FASTA-formatted input and one-sequence-per-line output so that
5 the sequences can be easily manipulated with UNIX text tools (e.g., grep,
6 head, wc, split, sort, etc.).
8 In order for '--inverse' to work correctly, the same flags must be supplied as
9 were supplied during the forward conversion (the script does not try to
10 guess). With '--defline=after', the conversion should be perfectly
11 invertible, modulo whitespace and wrapping. For '--defline=omit', an
12 artificial defline will be constructed based on the filename and line number.
14 """
16 __copyright__ = '''
17 greylag, a collection of programs for MS/MS protein analysis
18 Copyright (C) 2006-2008 Stowers Institute for Medical Research
20 This program is free software: you can redistribute it and/or modify
21 it under the terms of the GNU General Public License as published by
22 the Free Software Foundation, either version 3 of the License, or
23 (at your option) any later version.
25 This program is distributed in the hope that it will be useful,
26 but WITHOUT ANY WARRANTY; without even the implied warranty of
27 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 GNU General Public License for more details.
30 You should have received a copy of the GNU General Public License
31 along with this program. If not, see <http://www.gnu.org/licenses/>.
33 Contact: Mike Coleman
34 Stowers Institute for Medical Research
35 1000 East 50th Street
36 Kansas City, Missouri 64110
37 USA
38 '''
41 import fileinput
42 import optparse
43 import re
44 import sys
46 import greylag
# No backtrace on SIGPIPE: restore the default disposition for SIGPIPE.
# This is strictly best-effort, so a failure to import the signal module or
# to install the handler is deliberately ignored.
try:
    import signal
except Exception:
    pass
else:
    try:
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
    except Exception:
        pass
def error(s, *args):
    """Report a fatal error; `s` is a %-style format applied to `args`.

    When this module is not the main program (e.g., during unit testing), an
    Exception carrying the formatted message is raised.  When run as a
    script, the message is written to standard error and the process exits
    with status 1.
    """
    if __name__ == "__main__":
        sys.stderr.write(("error: " + s) % args + "\n")
        sys.exit(1)
    # not the main program (e.g., unit testing): just throw an exception
    raise Exception((s + " (fatal error)") % args)
# errors are fatal: install this script's error() as greylag.chase_error
# (presumably the hook the greylag module uses to report errors; its use is
# not visible from this file -- TODO confirm)
greylag.chase_error = error
def warn(message):
    """Write a warning to standard error, tagged with the input position.

    `message` is the warning text.  If a fileinput iteration is active, the
    current filename and per-file line number are appended as
    " [at FILE:LINE]".  Otherwise the location is omitted.
    """
    try:
        location = " [at %s:%s]" % (fileinput.filename(),
                                    fileinput.filelineno())
    except RuntimeError:
        # The fileinput module-level functions raise RuntimeError when no
        # fileinput iteration is active.  That is the case during the
        # forward conversion (which does not read through fileinput), so a
        # warning issued there must not abort the program.
        location = ""
    sys.stderr.write("warning: %s%s\n" % (message, location))
def write_flattened_locus(options, defline, sequence):
    """Write one locus to standard output as a single flattened line.

    `options.defline` selects the layout:
      'after'  -- sequence, delimiter, then '>' + defline
      'before' -- '>' + defline, delimiter, then sequence (a warning is
                  issued if the delimiter also occurs within the defline)
      otherwise -- the sequence alone
    `options.delimiter` is the separator placed between the two fields.
    """
    position = options.defline
    if position == 'before':
        if options.delimiter in defline:
            warn("delimiter present in defline")
        record = '>%s%s%s' % (defline, options.delimiter, sequence)
    elif position == 'after':
        record = '%s%s>%s' % (sequence, options.delimiter, defline)
    else:
        record = sequence
    sys.stdout.write('%s\n' % (record,))
def _main():
    """Command-line entry point: flatten FASTA input, or invert a flattening.

    Options are parsed from sys.argv.  In the forward direction, each file
    named on the command line (or standard input if none is given) is read
    with greylag.read_fasta_file and each locus is written as one line by
    write_flattened_locus.  With --inverse, each flattened input line is
    split back into a defline and a sequence and written as a FASTA record,
    with the sequence wrapped to --wrap columns (0 disables wrapping).

    Exits with status 1 (after printing help) if --wrap is negative, and
    with status 0 after printing the copyright notice for --copyright.
    """
    parser = optparse.OptionParser(usage="usage: %prog [options] [<file>...]",
                                   description=__doc__)
    parser.add_option("-d", "--delimiter", dest="delimiter", default='\t',
                      help="delimiter between defline and sequence"
                      " [default TAB]", metavar="STRING")
    parser.add_option("-D", "--defline", dest="defline",
                      choices=('before', 'after', 'omit'), default="after",
                      help="position of defline with respect to sequence, one"
                      " of 'before', 'after' [default], or 'omit'",
                      metavar="POSITION")
    parser.add_option("-i", "--inverse", dest="inverse", action="store_true",
                      help="do the inverse transformation (flat to FASTA)")
    DEFAULT_WRAP = 80
    parser.add_option("-w", "--wrap", dest="wrap", type="int",
                      default=DEFAULT_WRAP,
                      help="for --inverse, wrap sequence to specified width"
                      " [default %s, 0 means don't wrap at all]" % DEFAULT_WRAP,
                      metavar="COLUMNS")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                      help="be verbose")
    parser.add_option("--copyright", action="store_true", dest="copyright",
                      help="print copyright and exit")
    options, args = parser.parse_args()

    # --copyright is advertised as "print copyright and exit", so honor it
    # before doing any other work
    if options.copyright:
        sys.stdout.write(__copyright__ + "\n")
        sys.exit(0)

    if options.wrap < 0:
        parser.print_help()
        sys.exit(1)

    write = sys.stdout.write

    if not options.inverse:
        # Forward: FASTA -> one locus per line.  Files are opened one at a
        # time and closed as soon as they have been processed, so that a long
        # argument list does not hold every file open at once.  (None stands
        # for standard input, which is never closed here.)
        for fn in (args or [None]):
            if fn is None:
                f = sys.stdin
            else:
                f = open(fn)
            try:
                for _locusname, defline, sequence \
                        in greylag.read_fasta_file(f):
                    write_flattened_locus(options, defline, sequence)
            finally:
                if fn is not None:
                    f.close()
    else:
        # Inverse: one locus per line -> FASTA
        for line in fileinput.input(args):
            if options.defline == 'omit':
                sequence = line
                # No defline was stored, so construct an artificial one from
                # the input position.  It needs the leading '>' to be a valid
                # FASTA defline (stored deflines already carry theirs).
                defline = ">%s:%s" % (fileinput.filename(),
                                      fileinput.filelineno())
            else:
                parts = line.split(options.delimiter, 1)
                if len(parts) < 2:
                    error("input line lacks delimiter")
                if options.defline == 'before':
                    defline, sequence = parts
                else:
                    sequence, defline = parts
            sequence = sequence.strip()
            write(defline.strip() + "\n")
            if options.wrap:
                for start in range(0, len(sequence), options.wrap):
                    write(sequence[start:start+options.wrap] + "\n")
            else:
                write(sequence + "\n")
# run the command-line interface only when executed as a script, not when
# this module is imported
if __name__ == '__main__':
    _main()