3 # BioPerl module for Bio::Tools::Run::Maq
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Mark A. Jensen <maj@fortinbras.us>
9 # Copyright Mark A. Jensen
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::Run::Maq - Run wrapper for the Maq short-read assembler *BETA*
22 $maq_fac = Bio::Tools::Run::Maq->new();
23 $maq_assy = $maq_fac->run( 'reads.fastq', 'refseq.fas' );
24 # if IO::Uncompress::Gunzip is available...
25 $maq_assy = $maq_fac->run( 'reads.fastq.gz', 'refseq.gz');
27 $maq_assy = $maq_fac->run( 'reads.fastq', 'refseq.fas', 'paired-reads.fastq');
29 $maq_fac->set_parameters( -c2q_min_map_quality => 60 );
30 $maq_assy = $maq_fac->run( 'reads.fastq', 'refseq.fas', 'paired-reads.fastq');
32 # run maq commands separately
33 $maq_fac = Bio::Tools::Run::Maq->new(
35 -single_end_quality => 1 );
36 $maq_fac->run_maq( -bfa => 'refseq.bfa',
37 -map => 'maq_assy.map',
38 -txt => 'maq_assy.pup.txt' );
42 This module provides a wrapper interface for Heng Li's
43 reference-directed short read assembly suite C<maq> (see
44 L<http://maq.sourceforge.net/maq-man.shtml> for manuals and
47 There are two modes of action.
53 The first is a simple pipeline through the C<maq> commands, taking
54 your read data in and squirting out an assembly object of type
55 L<Bio::Assembly::IO::maq>. The pipeline is based on the one performed
60 data conversion to fasta2bfa, fastq2bfq
63 map sequence reads map
66 assemble, creating assemble
69 convert map & cns mapview, cns2fq
73 Command-line options can be directed to the C<map>, C<assemble>, and
74 C<cns2fq> steps. See L</OPTIONS> below.
78 The second mode is direct access to C<maq> commands. To run a C<maq>
79 command, construct a run factory, specifying the desired command using
80 the C<-command> argument in the factory constructor, along with
81 options specific to that command (see L</OPTIONS>):
83 $maqfac->Bio::Tools::Run::Maq->new( -command => 'fasta2bfa' );
85 To execute, use the C<run_maq> methods. Input and output files are
86 specified in the arguments of C<run_maq> (see L</FILES>):
88 $maqfac->run_maq( -fas => "myref.fas", -bfa => "myref.bfa" );
94 C<maq> is complex, with many subprograms (commands) and command-line
95 options and file specs for each. This module attempts to provide
96 commands and options comprehensively. You can browse the choices like so:
98 $maqfac = Bio::Tools::Run::Maq->new( -command => 'assemble' );
100 @all_commands = $maqfac->available_parameters('commands');
101 @all_commands = $maqfac->available_commands; # alias
103 @assemble_params = $maqfac->available_parameters('params');
104 @assemble_switches = $maqfac->available_parameters('switches');
105 @assemble_all_options = $maqfac->available_parameters();
107 Reasonably mnemonic names have been assigned to the single-letter
108 command line options. These are the names returned by
109 C<available_parameters>, and can be used in the factory constructor
110 like typical BioPerl named parameters.
112 See L<http://maq.sourceforge.net/maq-manpage.shtml> for the gory details.
116 When a command requires filenames, these are provided to the C<run_maq> method, not
117 the constructor (C<new()>). To see the set of files required by a command, use
118 C<available_parameters('filespec')> or the alias C<filespec()>:
120 $maqfac = Bio::Tools::Run::Maq->new( -command => 'map' );
121 @filespec = $maqfac->filespec;
123 This example returns the following array:
131 This indicates that map (C<maq> binary mapfile), bfa (C<maq> binary
132 fasta), and bfq (C<maq> binary fastq) files MUST be specified, another
133 bfq file MAY be specified, and a log file receiving STDERR also MAY be
134 specified. Use these in the C<run_maq> call like so:
136 $maqfac->run_maq( -map => 'my.map', -bfa => 'myrefseq.bfa',
137 -bfq1 => 'reads1.bfq', -bfq2 => 'reads2.bfq' );
139 Here, the C<log> parameter was unspecified. Therefore, the object will store
140 the programs STDERR output for you in the C<stderr()> attribute:
142 handle_map_warning($maqfac) if ($maqfac->stderr =~ /warning/);
144 STDOUT for a run is also saved, in C<stdout()>, unless a file is specified
145 to slurp it according to the filespec. C<maq> STDOUT usually contains useful
146 information on the run.
152 User feedback is an integral part of the evolution of this and other
153 Bioperl modules. Send your comments and suggestions preferably to
154 the Bioperl mailing list. Your participation is much appreciated.
156 bioperl-l@bioperl.org - General discussion
157 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
161 Please direct usage questions or support issues to the mailing list:
163 L<bioperl-l@bioperl.org>
165 rather than to the module maintainer directly. Many experienced and
166 reponsive experts will be able look at the problem and quickly
167 address it. Please include a thorough description of the problem
168 with code and data examples if at all possible.
170 =head2 Reporting Bugs
172 Report bugs to the Bioperl bug tracking system to help us keep track
173 of the bugs and their resolution. Bug reports can be submitted via
176 http://bugzilla.open-bio.org/
178 =head1 AUTHOR - Mark A. Jensen
180 Email maj -at- fortinbras -dot- us
184 The rest of the documentation details each of the object methods.
185 Internal methods are usually preceded with a _
189 # Let the code begin...
192 package Bio
::Tools
::Run
::Maq
;
194 our $HAVE_IO_UNCOMPRESS;
197 eval {require IO
::Uncompress
::Gunzip
; $HAVE_IO_UNCOMPRESS = 1};
202 # Object preamble - inherits from Bio::Root::Root
206 use Bio
::Tools
::Run
::Maq
::Config
;
207 use Bio
::Tools
::GuessSeqFormat
;
208 use File
::Basename
qw(fileparse);
210 use base
qw(Bio::Root::Root Bio::Tools::Run::AssemblerBase );
213 our $program_name = 'maq'; # name of the executable
216 # other globals required by Bio::Tools::Run::AssemblerBase are
217 # imported from Bio::Tools::Run::Maq::Config
219 our $qual_param = 'quality_file';
223 our $asm_format = 'maq';
228 Usage : my $obj = new Bio::Tools::Run::Maq();
229 Function: Builds a new Bio::Tools::Run::Maq object
230 Returns : an instance of Bio::Tools::Run::Maq
236 my ($class,@args) = @_;
237 my $self = $class->SUPER::new
(@args);
238 $self->parameters_changed(1);
239 $self->_register_program_commands( \
@program_commands, \
%command_prefixes );
240 unless (grep /command/, @args) {
241 push @args, '-command', 'run';
243 $self->_set_program_options(\
@args, \
@program_params, \
@program_switches,
244 \
%param_translation, $qual_param, $use_dash, $join);
245 $self->program_name($program_name) if not defined $self->program_name();
246 if ($^O
=~ /cygwin/) {
247 my @kludge = `PATH=\$PATH:/usr/bin:/usr/local/bin which $program_name`;
249 $self->program_name($kludge[0]);
251 $self->parameters_changed(1); # set on instantiation, per Bio::ParameterBaseI
252 $self->_assembly_format($asm_format);
259 Usage : $assembly = $maq_assembler->run($read1_fastq_file,
262 Function: Run the maq assembly pipeline.
263 Returns : Assembly results (file, IO object or Assembly object)
264 Args : - fastq file containing single-end reads
265 - fasta file containing the reference sequence
266 - [optional] fastq file containing paired-end reads
267 Note : gzipped inputs are allowed if IO::Uncompress::Gunzip
273 my ($self, $rd1_file, $ref_file, $rd2_file) = @_;
276 $self->_check_executable();
277 $rd1_file or $self->throw("Fastq reads file required at arg 1");
278 $ref_file or $self->throw("Fasta refseq file required at arg 2");
279 # expand gzipped files as nec.
280 for ($rd1_file, $ref_file, $rd2_file) {
283 unless ($HAVE_IO_UNCOMPRESS) {
284 croak
( "IO::Uncompress::Gunzip not available, can't expand '$_'" );
286 my ($tfh, $tf) = $self->io->tempfile;
287 my $z = IO
::Uncompress
::Gunzip
->new($_);
288 while (<$z>) { print $tfh $_ }
293 my $guesser = Bio
::Tools
::GuessSeqFormat
->new(-file
=>$rd1_file);
295 $guesser->guess eq 'fastq' or $self->throw("Reads file doesn't look like fastq at arg 1");
296 $guesser = Bio
::Tools
::GuessSeqFormat
->new(-file
=>$ref_file);
297 $guesser->guess eq 'fasta' or $self->throw("Refseq file doesn't look like fasta at arg 2");
299 $guesser = Bio
::Tools
::GuessSeqFormat
->new(-file
=>$rd2_file);
300 $guesser->guess eq 'fastq' or $self->throw("Reads file doesn't look like fastq at arg 3");
303 # maq format conversion
304 ($rd1_file, $ref_file, $rd2_file) = $self->_prepare_input_sequences($rd1_file, $ref_file, $rd2_file);
307 my ($maq_file, $faq_file) = $self->_run($rd1_file, $ref_file, $rd2_file);
309 # Export results in desired object type
310 my $asm = $self->_export_results($maq_file);
317 Usage : $obj->run_maq( @file_args )
318 Function: Run a maq command as specified during object contruction
320 Args : a specification of the files to operate on:
325 my ($self, @args) = @_;
326 # _translate_params will provide an array of command/parameters/switches
327 # -- these are set at object construction
328 # to set up the run, need to add the files to the call
329 # -- provide these as arguments to this function
331 my $cmd = $self->command if $self->can('command');
332 $self->throw("No maq command specified for the object") unless $cmd;
333 # setup files necessary for this command
334 my $filespec = $command_files{$cmd};
335 $self->throw("No command-line file specification is defined for command '$cmd'; check Bio::Tools::Run::Maq::Config") unless $filespec;
337 # parse args based on filespec
339 $self->throw("Named args are required") unless !(@args % 2);
345 $s =~ s/^[012]?[<>]//;
346 $s =~ s/[^a-zA-Z0-9_]//g;
348 } grep !/[#]/, @
$filespec;
349 !defined($args{$_}) && $self->throw("Required filearg '$_' not specified") for @req;
351 my ($in, $out, $err);
354 defined($args{$1}) && ( open($out,">", $args{$1}) or $self->throw("Open for write error : $!"));
358 defined($args{$1}) && (open($err, ">", $args{$1}) or $self->throw("Open for write error : $!"));
362 defined($args{$1}) && (open($in, "<", $args{$1}) or $self->throw("Open for read error : $!"));
367 $in || ($in = \
$dum);
368 $out || ($out = \
$self->{'stdout'});
369 $err || ($err = \
$self->{'stderr'});
371 # Get program executable
372 my $exe = $self->executable;
373 # Get command-line options
374 my $options = $self->_translate_params();
375 # Get file specs sans redirects in correct order
378 $s =~ s/[^a-zA-Z0-9_]//g;
380 } grep !/[<>]/, @
$filespec;
381 my @files = @args{@specs};
385 splice(@files, $_, 1, @
{$files[$_]}) if (ref($files[$_]) eq 'ARRAY');
387 @files = map { defined $_ ?
$_ : () } @files; # squish undefs
388 my @ipc_args = ( $exe, @
$options, @files );
391 IPC
::Run
::run
(\
@ipc_args, $in, $out, $err) or
392 die ("There was a problem running $exe : $!");
395 $self->throw("$exe call crashed: $@");
398 # return arguments as specified on call
405 Usage : $fac->stdout()
406 Function: store the output from STDOUT for the run,
407 if no file specified in run_maq()
409 Returns : scalar string
410 Args : on set, new value (a scalar or undef, optional)
417 return $self->{'stdout'} = shift if @_;
418 return $self->{'stdout'};
424 Usage : $fac->stderr()
425 Function: store the output from STDERR for the run,
426 if no file is specified in run_maq()
428 Returns : scalar string
429 Args : on set, new value (a scalar or undef, optional)
436 return $self->{'stderr'} = shift if @_;
437 return $self->{'stderr'};
442 =head1 Bio::Tools::Run::AssemblerBase overrides
444 =head2 _check_sequence_input()
450 sub _check_sequence_input
{
454 =head2 _check_optional_quality_input()
460 sub _check_optional_quality_input
{
464 =head2 _prepare_input_sequences
466 Convert input fastq and fasta to maq format.
470 sub _prepare_input_sequences
{
472 my ($self, @args) = @_;
473 my (%args, $read1, $read2, $refseq);
474 if (grep /^-/, @args) { # named parms
475 $self->throw("Input args not an even number") unless !(@args % 2);
477 ($read1, $refseq, $read2) = @args{qw( -read1 -refseq -read2 )};
480 ($read1, $refseq, $read2) = @args;
482 # just handle file input for now...
483 $self->throw("maq requires at least one FASTQ read file and one FASTA reference sequence")
484 unless (defined $read1 && defined $refseq);
485 $self->throw("File cannot be found")
486 unless ( -e
$read1 && -e
$refseq && (!defined $read2 || -e
$read2) );
488 # maq needs its own fasta/fastq format. Use its own converters to
489 # create tempfiles in bfa, bfq format.
490 my ($ref_h, $ref_file, $rd1_h, $rd1_file, $rd2_h, $rd2_file);
491 ($ref_h, $ref_file) = $self->io->tempfile( -dir
=> $self->tempdir() );
492 ($rd1_h, $rd1_file) = $self->io->tempfile( -dir
=> $self->tempdir() );
495 my $fac = Bio
::Tools
::Run
::Maq
->new( -command
=> 'fasta2bfa' );
496 $fac->run_maq( -bfa
=> $ref_file, -fas
=> $refseq );
497 $fac->set_parameters( -command
=> 'fastq2bfq' );
498 $fac->run_maq( -bfq
=> $rd1_file, -faq
=> $read1 );
499 if (defined $read2) {
500 ($rd2_h, $rd2_file) = $self->io->tempfile( -dir
=> $self->tempdir() );
502 $fac->run_maq( -bfq
=> $rd2_file, -faq
=> $read2);
504 return ($rd1_file, $ref_file, $rd2_file);
507 =head2 _collate_subcmd_args()
509 Title : _collate_subcmd_args
510 Usage : $args_hash = $self->_collate_subcmd_args
511 Function: collate parameters and switches into command-specific
512 arg lists for passing to new()
513 Returns : hash of named argument lists
514 Args : [optional] composite cmd prefix (scalar string)
519 sub _collate_subcmd_args
{
523 # default command is 'run'
525 my @subcmds = @
{$composite_commands{$cmd}};
527 my $cur_options = $self->{'_options'};
530 foreach my $subcmd (@subcmds) {
531 # find the composite cmd form of the argument in
532 # the current params and switches
533 # e.g., map_max_mismatches
534 my @params = grep /^${subcmd}_/, @
{$$cur_options{'_params'}};
535 my @switches = grep /^${subcmd}_/, @
{$$cur_options{'_switches'}};
537 # create an argument list suitable for passing to new() of
538 # the subcommand factory...
539 foreach my $opt (@params, @switches) {
541 $subopt =~ s/^${subcmd}_//;
542 push(@
{$ret{$subcmd}}, '-'.$subopt => $self->$opt) if defined $self->$opt;
551 Usage : $factory->_run()
552 Function: Run a maq assembly pipeline
553 Returns : depends on call (An assembly file)
554 Args : - single end read file in maq bfq format
555 - reference seq file in maq bfa format
556 - [optional] paired end read file in maq bfq format
561 my ($self, $rd1_file, $ref_file, $rd2_file) = @_;
562 my ($cmd, $filespec, @ipc_args);
563 # Get program executable
564 my $exe = $self->executable;
566 # treat run() as a separate command and duplicate the component-specific
567 # parameters in the config globals
569 # Setup needed files and filehandles first
570 my $tdir = $self->tempdir();
571 my ($maph, $mapf) = $self->io->tempfile( -template
=> 'mapXXXX', -dir
=> $tdir ); #map
572 my ($cnsh, $cnsf) = $self->io->tempfile( -template
=> 'cnsXXXX', -dir
=> $tdir ); #consensus
573 my ($maqh, $maqf) = $self->_prepare_output_file();
574 my ($nm,$dr,$suf) = fileparse
($maqf,".maq");
575 my $faqf = $dr.$nm.".cns.fastq";
577 $_->close for ($maph, $cnsh, $maqh);
579 # Get command-line options for the component commands:
580 my $subcmd_args = $self->_collate_subcmd_args();
581 # map reads to ref seq
582 # set up subcommand options
584 my $maq = Bio
::Tools
::Run
::Maq
->new(
586 @
{$subcmd_args->{map}}
588 $maq->run_maq( -map => $mapf, -bfa
=> $ref_file, -bfq1
=> $rd1_file,
589 -bfq2
=> $rd2_file );
590 # assemble reads into consensus
591 $maq = Bio
::Tools
::Run
::Maq
->new(
592 -command
=> 'assemble',
593 @
{$subcmd_args->{asm
}}
595 $maq->run_maq( -cns
=> $cnsf, -bfa
=> $ref_file, -map => $mapf );
596 # convert map into plain text
597 $maq = Bio
::Tools
::Run
::Maq
->new(
598 -command
=> 'mapview'
600 $maq->run_maq( -map => $mapf, -txt
=> $maqf );
602 # convert consensus into plain text fastq
603 $maq = Bio
::Tools
::Run
::Maq
->new(
604 -command
=> 'cns2fq',
605 @
{$subcmd_args->{c2q
}}
607 $maq->run_maq( -cns
=> $cnsf, -faq
=> $faqf );
609 return ($maqf, $faqf);
613 =head2 available_parameters()
615 Title : available_parameters
616 Usage : @cmds = $fac->available_commands('commands');
617 Function: Use to browse available commands, params, or switches
618 Returns : array of scalar strings
619 Args : 'commands' : all maq commands
620 'params' : parameters for this object's command
621 'switches' : boolean switches for this object's command
622 'filespec' : the filename spec for this object's command
623 4Geeks : Overrides Bio::ParameterBaseI via
624 Bio::Tools::Run::AssemblerBase
628 sub available_parameters
{
631 for ($subset) { # get commands
632 !defined && do { # delegate
633 return $self->SUPER::available_parameters
($subset);
636 return grep !/^run$/, @program_commands;
638 m/^f/i && do { # get file spec
639 return @
{$command_files{$self->command}};
641 do { #else delegate...
642 return $self->SUPER::available_parameters
($subset);
647 sub available_commands
{ shift->available_parameters('commands') };
649 sub filespec
{ shift->available_parameters('filespec') };