5 This script will accept FASTA formatted input and generate a decoy sequence
6 for each input sequence. (Thus, the output will have twice as many sequences
7 as the input.) A prefix (e.g. 'SHUFFLED_') will be added to the defline of
8 each decoy sequence. Also, all sequences will be re-wrapped.
10 The script will strip duplicate loci in the input, giving an error if the
11 duplicates, which have the same locus name, do not also have the same
14 For shuffling, the randomness seed defaults to zero. Use different values if
15 you want different shuffles for a given input. Note that the random number
16 generator is reseeded after each sequence is generated. This means that a
17 particular input sequence will always be mapped to the same shuffled sequence,
18 independent of what other sequences exist in the input, which is desirable for
24 greylag, a collection of programs for MS/MS protein analysis
25 Copyright (C) 2006-2008 Stowers Institute for Medical Research
27 This program is free software: you can redistribute it and/or modify
28 it under the terms of the GNU General Public License as published by
29 the Free Software Foundation, either version 3 of the License, or
30 (at your option) any later version.
32 This program is distributed in the hope that it will be useful,
33 but WITHOUT ANY WARRANTY; without even the implied warranty of
34 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35 GNU General Public License for more details.
37 You should have received a copy of the GNU General Public License
38 along with this program. If not, see <http://www.gnu.org/licenses/>.
41 Stowers Institute for Medical Research
43 Kansas City, Missouri 64110
48 from collections
import defaultdict
59 print >> sys
.stderr
, "warning: %s" % message
62 sys
.exit('error: ' + s
)
65 greylag
.chase_error
= error
68 class abstract_decoy_maker
:
70 """Return a decoy sequence of the same length. (Argument is a list of
71 residues, and may be modified by this function.)
73 error("abstract method not implemented")
75 class reverse_decoy_maker(abstract_decoy_maker
):
80 class shuffle_decoy_maker(abstract_decoy_maker
):
81 def __init__(self
, random_seed
):
82 self
.random_seed
= random_seed
85 random
.seed(self
.random_seed
)
89 class markov_decoy_maker(abstract_decoy_maker
):
90 def __init__(self
, random_seed
, length
, original_sequence_file
):
91 random
.seed(random_seed
)
94 # for order 0 through self.length:
95 # [ length-mer -> subsequent residues -> count, ... ]
96 self
.transition
= [ defaultdict(lambda: defaultdict(int))
97 for i
in range(length
+1) ]
98 for locusname
, defline
, sequence
, filename \
99 in greylag
.read_fasta_files([original_sequence_file
]):
100 for order
in range(length
+1):
101 seq
= '-' * order
+ sequence
102 for i
in xrange(len(sequence
)):
103 self
.transition
[order
][seq
[i
:i
+order
]][seq
[i
+order
]] += 1
106 #for n, t in enumerate(self.transition):
107 # print '### order', n
108 # for k in sorted(t.keys()):
109 # pprint.pprint((k, dict(t[k])))
112 # FIX: this could be cached for better performance
113 def _choose_next(self
, hist
):
114 histpairs
= hist
.items()
115 total
= sum(x
[1] for x
in histpairs
)
116 choice
= random
.randrange(total
)
118 for r
, count
in histpairs
:
125 key
= '-' * self
.length
127 for i
in xrange(len(s
)):
128 for order
in range(self
.length
, -1, -1):
129 k
= key
[-order
:] if order
else ''
130 if k
not in self
.transition
[order
]:
132 r
= self
._choose
_next
(self
.transition
[order
][k
])
138 if key
: # if self.length==0, key is ''
143 def add_sixmers(sixmer_set
, sequence
):
144 for i
in range(len(sequence
) - 6 + 1):
145 sixmer_set
.add(sequence
[i
:i
+6])
148 def write_locus(options
, decoy_maker
, seen
, sixmers
, locusname
, defline
,
152 sequence_hash
= h
.digest()
154 if locusname
in seen
:
155 seen_defline
, seen_hash
= seen
[locusname
]
156 if seen_hash
!= sequence_hash
:
157 error("differing sequence for locus '%s'" % locusname
)
158 if options
.verbose
and seen_defline
!= defline
:
159 warn("differing deflines for locus '%s'" % locusname
)
161 seen
[locusname
] = (defline
, sequence_hash
)
163 decoy_sequence
= ''.join(decoy_maker
.make(list(sequence
)))
164 decoy_defline
= greylag
.DEFAULT_DECOY_PREFIX
+ locusname
+ ' FALSE POSITIVE'
166 add_sixmers(sixmers
[0], sequence
)
167 add_sixmers(sixmers
[1], decoy_sequence
)
169 for d
, s
in [(defline
, sequence
), (decoy_defline
, decoy_sequence
)]:
170 if options
.no_original
and d
== defline
:
174 for start
in range(0, len(s
), options
.wrap
):
175 print s
[start
:start
+options
.wrap
]
181 parser
= optparse
.OptionParser(usage
="usage: %prog [options] <file>",
183 parser
.add_option("-r", "--reverse", action
="store_true",
185 help="reverse the sequences, instead of shuffling")
186 parser
.add_option("-m", "--markov", type="int", dest
="markov_length",
187 help="generate Markov sequences, with memory of length"
188 " N, instead of shuffling", default
=None, metavar
="N")
189 parser
.add_option("-s", "--seed", type="int", dest
="seed",
190 help="seed for randomness [default 0]",
191 default
=0, metavar
="N")
192 parser
.add_option("-n", "--no-original", action
="store_true",
194 help="don't output original sequences")
195 parser
.add_option("-v", "--verbose", action
="store_true",
196 dest
="verbose", help="be verbose")
198 parser
.add_option("-w", "--wrap", dest
="wrap", type="int",
199 default
=DEFAULT_WRAP
,
200 help="wrap sequence to specified width"
201 " [default %s, 0 means don't wrap at all]" % DEFAULT_WRAP
,
203 parser
.add_option("--copyright", action
="store_true", dest
="copyright",
204 help="print copyright and exit")
205 (options
, args
) = parser
.parse_args()
208 or options
.markov_length
!= None and options
.reverse
209 or options
.markov_length
!= None and options
.markov_length
< 0):
213 if options
.markov_length
!= None:
214 decoy_maker
= markov_decoy_maker(options
.seed
, options
.markov_length
,
216 elif options
.reverse
:
217 decoy_maker
= reverse_decoy_maker()
219 decoy_maker
= shuffle_decoy_maker(options
.seed
)
221 # locus id -> (defline, hash of sequence)
224 # real and decoy 6-mers seen
225 sixmers
= (set(), set())
227 for locusname
, defline
, sequence
, filename \
228 in greylag
.read_fasta_files([args
[0]]):
229 write_locus(options
, decoy_maker
, seen
, sixmers
,
230 locusname
, defline
, sequence
)
232 common_sixmers
= sixmers
[0] & sixmers
[1]
233 print >> sys
.stderr
, ("six-mers: %s real %s decoy %s both"
234 % (len(sixmers
[0]) - len(common_sixmers
),
235 len(sixmers
[1]) - len(common_sixmers
),
236 len(common_sixmers
)))
239 if __name__
== '__main__':