speelink fixes, patch courtesy Charles Plessy, fixes #3256
[bioperl-run.git] / lib / Bio / Tools / Run / TigrAssembler.pm
blob757d130f373c1809324f1cb3e8778ecb658f10cc
1 # BioPerl module for Bio::Tools::Run::TigrAssembler
3 # Copyright Florent E Angly <florent-dot-angly-at-gmail-dot-com>
5 # You may distribute this module under the same terms as perl itself
7 # POD documentation - main docs before the code
9 =head1 NAME
11 Bio::Tools::Run::TigrAssembler - Wrapper for local execution of TIGR Assembler
14 =head1 SYNOPSIS
16 use Bio::Tools::Run::TigrAssembler;
17 # Run TIGR Assembler using an input FASTA file
18 my $factory = Bio::Tools::Run::TigrAssembler->new( -minimum_overlap_length => 35 );
19 my $asm_obj = $factory->run($fasta_file, $qual_file);
20 # An assembly object is returned by default
21 for my $contig ($assembly->all_contigs) {
22 ... do something ...
25 # Read some sequences
26 use Bio::SeqIO;
27 my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta');
28 my @seqs;
29 while (my $seq = $sio->next_seq()) {
30 push @seqs,$seq;
33 # Run TIGR Assembler with input sequence objects and return an assembly file
34 my $asm_file = 'results.tigr';
35 $factory->out_type($asm_file);
36 $factory->run(\@seqs);
38 # Use LIGR Assembler instead
39 my $ligr = Bio::Tools::Run::TigrAssembler->new(
40 -program_name => 'LIGR_Assembler',
41 -trimmed_seq => 1
43 $ligr->run(\@seqs);
45 =head1 DESCRIPTION
47 Wrapper module for the local execution of the DNA assembly program TIGR
48 Assembler v2.0. TIGR Assembler is open source software under The Artistic
49 License and available at: http://www.tigr.org/software/assembler/
51 This module runs TIGR Assembler by feeding it a FASTA file or sequence objects
52 and returning an assembly file or assembly and IO objects. When the input is
53 Bioperl object, sequences less than 39 bp long are filtered out since they are
54 not supported by TIGR Assembler.
56 If provided in the following way, TIGR Assembler will use additional
57 information present in the sequence descriptions for assembly:
58 >seq_name minimum_clone_length maximum_clone_length median_clone_length
59 clear_end5 clear_end3
61 >db|seq_name minimum_clone_length maximum_clone_length median_clone_length
62 clear_end5 clear_end3
63 e.g.
64 >GHIBF57F 500 3000 1750 33 587
66 This module also supports LIGR Assembler, a variant of TIGR Assembler:
67 http://sourceforge.net/projects/ligr-assembler/
69 =head1 FEEDBACK
71 =head2 Mailing Lists
73 User feedback is an integral part of the evolution of this and other Bioperl
74 modules. Send your comments and suggestions preferably to one of the Bioperl
75 mailing lists. Your participation is much appreciated.
77 bioperl-l@bioperl.org - General discussion
78 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
80 =head2 Support
82 Please direct usage questions or support issues to the mailing list:
84 I<bioperl-l@bioperl.org>
86 rather than to the module maintainer directly. Many experienced and
87 reponsive experts will be able look at the problem and quickly
88 address it. Please include a thorough description of the problem
89 with code and data examples if at all possible.
91 =head2 Reporting Bugs
93 Report bugs to the Bioperl bug tracking system to help us keep track the bugs
94 and their resolution. Bug reports can be submitted via the web:
96 http://redmine.open-bio.org/projects/bioperl/
98 =head1 AUTHOR - Florent E Angly
100 Email: florent-dot-angly-at-gmail-dot-com
102 =head1 APPENDIX
104 The rest of the documentation details each of the object methods. Internal
105 methods are usually preceded with a _
107 =cut
110 package Bio::Tools::Run::TigrAssembler;
112 use strict;
113 use IPC::Run;
115 use base qw( Bio::Root::Root Bio::Tools::Run::AssemblerBase );
117 our $program_name = 'TIGR_Assembler'; # name of the executable
118 our @program_params = (qw( minimum_percent minimum_length max_err_32 quality_file
119 maximum_end resort_after ));
120 our @program_switches = (qw( include_singlets consider_low_scores safe_merging_stop
121 ignore_tandem_32mers use_tandem_32mers not_random incl_bad_seq trimmed_seq ));
122 our %param_translation = (
123 'quality_file' => 'q',
124 'minimum_percent' => 'p',
125 'minimum_length' => 'l',
126 'include_singlets' => 's',
127 'max_err_32' => 'g',
128 'consider_low_scores' => 'L',
129 'maximum_end' => 'e',
130 'ignore_tandem_32mers' => 't',
131 'use_tandem_32mers' => 'u',
132 'safe_merging_stop' => 'X',
133 'not_random' => 'N',
134 'resort_after' => 'r',
135 'incl_bad_seq' => 'b',
136 'trimmed_seq' => 'i'
138 our $qual_param = 'quality_file';
139 our $use_dash = 1;
140 our $join = ' ';
141 our $asm_format = 'tigr';
142 our $min_len = 39;
145 =head2 new
147 Title : new
148 Usage : $factory->new( -minimum_percent => 95,
149 -minimum_length => 50,
150 -include_singlets => 1 );
151 Function: Create a TIGR Assembler factory
152 Returns : A Bio::Tools::Run::TigrAssembler object
153 Args :
155 TIGR Assembler options available in this module:
157 minimum_percent / minimum_overlap_similarity: the minimum percent identity
158 that two DNA fragments must achieve over their entire region of overlap in
159 order to be considered as a possible assembly. Adjustments are made by the
160 program to take into account that the ends of sequences are lower quality
161 and doubled base calls are the most frequent sequencing error.
162 minimum_length / minimum_overlap_length: the minimum length two DNA fragments
163 must overlap to be considered as a possible assembly (warning: this option
164 is not strictly respected by TIGR Assembler...)
165 include_singlets: a flag which indicates that singletons (assemblies made up
166 of a single DNA fragment) should be included in the lassie output_file - the
167 default is to not include singletons.
168 max_err_32: the maximum number + 1 of alignment errors (mismatches or gaps)
169 allowed within any contiguous 32 base pairs in the overlap region between
170 two DNA fragments in the same assembly. This is meant to split apart splice
171 variants which have short splice differences and would not be disqualified
172 by the -p minimum_percent parameter.
173 consider_low_scores: a flag which causes even very LOW pairwise scores to be
174 considered - caution using this flag may cause longer run time and a worse
175 assembly.
176 maximum_end: the maximum length at the end of a DNA fragment that does not
177 match another overlapping DNA fragment (sometimes referred to as overhang)
178 that will not disqualify a DNA fragment from becoming part of an assembly.
179 ignore_tandem_32mers: a flag which causes tandem 32mers (a tandem 32mer is a
180 32mer which occurs more than once in at least one sequence read) to be
181 ignored (this is now the default behavior and this flag is for backward
182 compatibility)
183 use_tandem_32mers: a flag which causes tandem 32mers to be used for pairwise
184 comparison opposite of the -t flag which is now the default).
185 safe_merging_stop: a flag which causes merging to stop when only sequences
186 which appear to be repeats are left and these cannot be merged based on
187 clone length constraints.
188 not_random: a flag which indicates that the DNA fragments in the input_file
189 should not be treated as random genomic fragments for the purpose of
190 determining repeat regions.
191 resort_after: specifies how many sequences should be merged before resorting
192 the possible merges based on clone constraints.
194 LIGR Assembler has the same options as TIGR Assembler, and the following:
196 incl_bad_seq: keep all sequences including potential chimeras and splice variants
197 trimmed_seq: indicates that the sequences are trimmed. High quality scores will be
198 given on the whole sequence length instead of just in the middle)
200 =cut
202 sub new {
203 my ($class,@args) = @_;
204 my $self = $class->SUPER::new(@args);
205 $self->_set_program_options(\@args, \@program_params, \@program_switches,
206 \%param_translation, $qual_param, $use_dash, $join);
207 *minimum_overlap_length = \&minimum_length;
208 *minimum_overlap_similarity = \&minimum_percent;
209 $self->program_name($program_name) if not defined $self->program_name();
210 $self->_assembly_format($asm_format);
211 return $self;
215 =head2 out_type
217 Title : out_type
218 Usage : $factory->out_type('Bio::Assembly::ScaffoldI')
219 Function: Get/set the desired type of output
220 Returns : The type of results to return
221 Args : Desired type of results to return (optional):
222 'Bio::Assembly::IO' object
223 'Bio::Assembly::ScaffoldI' object (default)
224 The name of a file to save the results in
226 =cut
229 =head2 run
231 Title : run
232 Usage : $factory->run($fasta_file);
233 Function: Run TIGR Assembler
234 Returns : - a Bio::Assembly::ScaffoldI object, a Bio::Assembly::IO
235 object, a filename, or undef if all sequences were too small to
236 be usable
237 Returns : Assembly results (file, IO object or assembly object)
238 Args : - sequence input (FASTA file or sequence object arrayref)
239 - optional quality score input (QUAL file or quality score object
240 arrayref)
241 =cut
244 =head2 _run
246 Title : _run
247 Usage : $assembler->_run()
248 Function: Make a system call and run Newbler
249 Returns : An assembly file
250 Args : - FASTA file, SFF file and MID, or analysis dir and MID
251 - optional QUAL file
253 =cut
255 sub _run {
256 my ($self, $fasta_file, $qual_file) = @_;
258 # Setup needed files and filehandles first
259 my ($output_fh, $output_file ) = $self->_prepare_output_file( );
260 my ($scratch_fh, $scratch_file) = $self->io->tempfile( -dir => $self->tempdir() );
261 my ($stderr_fh, $stderr_file ) = $self->io->tempfile( -dir => $self->tempdir() );
263 # Get program executable
264 my $exe = $self->executable;
266 # Get command-line options
267 my $options = $self->_translate_params();
269 # Usage: TIGR_Assembler [options] scratch_file < input_file > output_file
270 my @program_args = ( $exe, @$options, $scratch_file );
271 my $stdin = $fasta_file;
272 my $stdout = $output_file;
273 my $stderr = $stderr_file;
275 my @ipc_args = ( \@program_args,
276 '<', $fasta_file,
277 '>', $output_file,
278 '2>', $stderr_file );
280 # Print command for debugging
281 if ($self->verbose() >= 0) {
282 my $cmd = '';
283 $cmd .= join ( ' ', @program_args );
284 for ( my $i = 1 ; $i < scalar @ipc_args ; $i++ ) {
285 my $element = $ipc_args[$i];
286 my $ref = ref($element);
287 my $value;
288 if ( $ref && $ref eq 'SCALAR') {
289 $value = $$element;
290 } else {
291 $value = $element;
293 $cmd .= " $value";
295 $self->debug( "$exe command = $cmd\n" );
298 # Execute command
299 eval {
300 IPC::Run::run(@ipc_args) || die("There was a problem running $exe: $!");
302 if ($@) {
303 $self->throw("$exe call crashed: $@");
305 $self->debug(join("\n", "$exe STDERR", $stderr_file)) if $stderr_file;
306 # TIGR Assembler's stderr reports a lot more than just errors
308 # Close filehandles
309 close($scratch_fh);
310 close($output_fh);
311 close($stderr_fh);
313 # Import assembly
314 return $output_file;
318 =head2 _remove_small_sequences
320 Title : _remove_small_sequences
321 Usage : $assembler->_remove_small_sequences(\@seqs, \@quals)
322 Function: Remove sequences below a threshold length
323 Returns : a new sequence object array reference
324 a new quality score object array reference
325 Args : sequence object array reference
326 quality score object array reference (optional)
328 =cut
330 # Aliasing function _prepare_input_sequences to _remove_small_sequences
331 *_prepare_input_sequences = \&_remove_small_sequences;
333 sub _remove_small_sequences {
334 my ($self, $seqs, $quals) = @_;
335 # The threshold length, $min_len, has been registered as a global variable
336 my @new_seqs;
337 my @new_quals;
339 if (ref($seqs) =~ m/ARRAY/i) {
340 my @removed;
341 my $nof_seqs = scalar @$seqs;
342 for my $i (1 .. $nof_seqs) {
343 my $seq = $$seqs[$i-1];
344 if ($seq->length >= $min_len) {
345 push @new_seqs, $seq;
346 if ($quals) {
347 my $qual = $$quals[$i-1];
348 push @new_quals, $qual;
350 } else {
351 push @removed, $seq->id;
354 if (scalar @removed > 0) {
355 $self->warn("The following sequences were removed because they are smaller".
356 " than $min_len bp: @removed\n");
359 return \@new_seqs, \@new_quals;