allow other SearchIO formats and parameters (though axt and sim4 are not working...
[bioperl-run.git] / lib / Bio / Tools / Run / Alignment / Blat.pm
blob13f8e00d2b30faa0114d4ba6cac53b4ffda3d59e
1 # $Id$
3 # Copyright Balamurugan Kumarasamy
5 # You may distribute this module under the same terms as perl itself
6 # POD documentation - main docs before the code
8 =head1 NAME
10 Bio::Tools::Run::Alignment::Blat
12 =head1 SYNOPSIS
14 Build a Blat factory.
16 use Bio::Tools::Run::Alignment::Blat;
18 my $factory = Bio::Tools::Run::Alignment::Blat->new();
20 # Pass the factory a Bio::Seq object
21 # @feats is an array of Bio::SeqFeature::Generic objects
22 my @feats = $factory->run($seq,$DB);
24 =head1 DESCRIPTION
26 Wrapper module for Blat program. This newer version allows for all
27 parameters to be set.
29 Key bits not implemented yet (TODO):
31 =over 3
33 =item * Implement all needed L<Bio::Tools::Run::WrapperBase> methods
35 Missing are a few, including version().
37 =item * Re-implement using L<IPC::Run>
39 Would like to get this running under something less reliant on OS-dependent
40 changes within code.
42 =item * No .2bit or .nib conversions yet
44 These require callouts to faToNib or faTwoTwoBit, which may or may not be
45 installed on a user's machine. We can possibly add functionality to
46 check for faToTwoBit/faToNib and other UCSC tools in the future.
48 =back
50 =head1 FEEDBACK
52 =head2 Mailing Lists
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to one
56 of the Bioperl mailing lists. Your participation is much appreciated.
58 bioperl-l@bioperl.org - General discussion
59 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61 =head2 Support
63 Please direct usage questions or support issues to the mailing list:
65 I<bioperl-l@bioperl.org>
67 rather than to the module maintainer directly. Many experienced and
68 reponsive experts will be able look at the problem and quickly
69 address it. Please include a thorough description of the problem
70 with code and data examples if at all possible.
72 =head2 Reporting Bugs
74 Report bugs to the Bioperl bug tracking system to help us keep track
75 the bugs and their resolution. Bug reports can be submitted via the
76 web:
78 http://bugzilla.open-bio.org/
80 =head1 AUTHOR - Bala
82 Email bala@tll.org.sg
84 =head1 APPENDIX
86 The rest of the documentation details each of the object
87 methods. Internal methods are usually preceded with a _
89 =cut
91 package Bio::Tools::Run::Alignment::Blat;
93 use strict;
94 use warnings;
95 use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
97 use Bio::SeqIO;
98 use Bio::Root::Root;
99 use Bio::Factory::ApplicationFactoryI;
100 use Bio::SearchIO;
101 use Bio::Tools::Run::WrapperBase;
103 our ($PROGRAM, $PROGRAMDIR, $PROGRAMNAME);
105 our %BLAT_PARAMS = map {$_ => 1} qw(ooc t q tileSize stepSize oneOff
106 minMatch minScore minIdentity maxGap makeOoc repmatch mask qMask repeats
107 minRepeatsDivergence dots out maxIntron);
108 our %BLAT_SWITCHES = map {$_ => 1} qw(prot noHead trimT noTrimA trimHardA
109 fastMap fine extendThroughN);
111 our %LOCAL_ATTRIBUTES = map {$_ => 1} qw(db DB qsegment hsegment
112 outfile_name quiet);
114 #psl - Default. Tab separated format, no sequence
115 #pslx - Tab separated format with sequence
116 #axt - blastz-associated axt format
117 #maf - multiz-associated maf format
118 #sim4 - similar to sim4 format
119 #wublast - similar to wublast format
120 #blast - similar to NCBI blast format
121 #blast8- NCBI blast tabular format
122 #blast9 - NCBI blast tabular format with comments
125 our %searchio_map = (
126 'psl' => 'psl',
127 'pslx' => 'psl', # I don't think there is support for this yet
128 'axt' => 'axt',
129 'blast' => 'blast',
130 'sim4' => 'sim4',
131 'wublast' => 'blast',
132 'blast8' => 'blasttable',
133 'blast9' => 'blasttable'
136 =head2 new
138 Title : new
139 Usage : $blat->new(@params)
140 Function: creates a new Blat factory
141 Returns : Bio::Tools::Run::Alignment::Blat
142 Args :
144 =cut
146 sub new {
147 my ($class,@args) = @_;
148 my $self = $class->SUPER::new(@args);
149 $self->io->_initialize_io();
150 $self->set_parameters(@args);
151 return $self;
154 =head2 program_name
156 Title : program_name
157 Usage : $factory->program_name()
158 Function: holds the program name
159 Returns : string
160 Args : None
162 =cut
164 sub program_name {
165 return 'blat';
168 =head2 program_dir
170 Title : program_dir
171 Usage : $factory->program_dir(@params)
172 Function: returns the program directory, obtained from ENV variable.
173 Returns : string
174 Args :
176 =cut
178 sub program_dir {
179 return Bio::Root::IO->catfile($ENV{BLATDIR}) if $ENV{BLATDIR};
182 =head2 run
184 Title : run()
185 Usage : $obj->run($query)
186 Function: Runs Blat and creates an array of featrues
187 Returns : An array of Bio::SeqFeature::Generic objects
188 Args : A Bio::PrimarySeqI or a file name
190 =cut
192 sub run {
193 my ($self,$query) = @_;
194 my @feats;
196 if (ref($query) ) { # it is an object
197 if (ref($query) =~ /GLOB/) {
198 $self->throw("Cannot use filehandle as argument to run()");
200 my $infile = $self->_writeSeqFile($query);
201 return $self->_run($infile);
202 } else {
203 return $self->_run($query);
207 =head2 align
209 Title : align
210 Usage : $obj->align($query)
211 Function: Alias to run()
213 =cut
215 sub align {
216 return shift->run(@_);
219 =head2 db
221 =cut
223 sub db {
224 my $self = shift;
225 return $self->{blat_db} = shift if @_;
226 return $self->{blat_db};
229 # this is a kludge for tests (so one might expect this to be used elsewhere).
230 # None of the other parameters worked in the past, so not replacing them
232 *DB = \&db;
234 =head2 qsegment
236 Title : qsegment
237 Usage : $obj->qsegment('sequence_a:0-1000')
238 Function : pass in a B<UCSC-compliant> string for the query sequence(s)
239 Returns : string
240 Args : string
241 Status : New
242 Note : Requires the sequence(s) in question be 2bit or nib format
243 Reminder : UCSC segment/regions coordinates are 0-based half-open (sequence
244 begins at 0, but start isn't counted with length), whereas BioPerl
245 coordinates are 1-based closed (sequence begins with 1, both start
246 and end are counted in the length of the segment). For example, a
247 segment that is 'sequence_a:0-1000' will have BioPerl coordinates of
248 'sequence_a:1-1000', both with the same length (1000).
250 =cut
252 sub qsegment {
253 my $self = shift;
254 return $self->{blat_qsegment} = shift if @_;
255 return $self->{blat_qsegment};
258 =head2 tsegment
260 Title : tsegment
261 Usage : $obj->tsegment('sequence_a:0-1000')
262 Function : pass in a B<UCSC-compliant> string for the target sequence(s)
263 Returns : string
264 Args : string
265 Status : New
266 Note : Requires the sequence(s) in question be 2bit or nib format
267 Reminder : UCSC segment/regions coordinates are 0-based half-open (sequence
268 begins at 0, but start isn't counted with length), whereas BioPerl
269 coordinates are 1-based closed (sequence begins with 1, both start
270 and end are counted in the length of the segment). For example, a
271 segment that is 'sequence_a:0-1000' will have BioPerl coordinates of
272 'sequence_a:1-1000', both with the same length (1000).
274 =cut
276 sub tsegment {
277 my $self = shift;
278 return $self->{blat_tsegment} = shift if @_;
279 return $self->{blat_tsegment};
282 # override this, otherwise one gets a default of 'mlc'
283 sub outfile_name {
284 my $self = shift;
285 return $self->{blat_outfile} = shift if @_;
286 return $self->{blat_outfile};
289 =head2 searchio
291 Title : searchio
292 Usage : $obj->searchio{-writer => $writer}
293 Function : Pass in additional parameters to the returned Bio::SearchIO parser
294 Returns : Hash reference with Bio::SearchIO parameters
295 Args : Hash reference
296 Status : New
297 Note : Currently, this implementation overrides any passed -format
298 parameter based on whether the output is changed ('out'). This
299 may change if requested, but we can't see the utility of doing so,
300 as requesting mismatched output/parser combinations is just a recipe
301 for disaster
303 =cut
305 sub searchio {
306 my ($self, $params) = @_;
307 if ($params && ref $params eq 'HASH') {
308 if ($self->{parameters}->{out} && exists $searchio_map{$self->{parameters}->{out}}) {
309 $params->{-format} = $searchio_map{$self->{parameters}->{out}};
310 } else {
311 $params->{-format} = 'psl';
313 $self->{blat_searchio} = $params;
316 return $self->{blat_searchio} ||
317 {-format => exists($self->{parameters}->{out}) ?
318 $searchio_map{$self->{parameters}->{out}} : 'psl'};
321 =head1 Bio::ParameterBaseI-specific methods
323 These methods are part of the Bio::ParameterBaseI interface
325 =cut
327 =head2 set_parameters
329 Title : set_parameters
330 Usage : $pobj->set_parameters(%params);
331 Function: sets the parameters listed in the hash or array
332 Returns : None
333 Args : [optional] hash or array of parameter/values. These can optionally
334 be hash or array references
335 Note : This only sets parameters; to set methods use the method name
336 =cut
338 sub set_parameters {
339 my $self = shift;
340 # circumvent any issues arising from passing in refs
341 my %args = (ref($_[0]) eq 'HASH') ? %{$_[0]} :
342 (ref($_[0]) eq 'ARRAY') ? @{$_[0]} :
344 # set the parameters passed in, but only ones supported for the program
345 %args = map { my $a = $_;
346 $a =~ s{^-}{};
347 $a => $args{$_};
348 } sort keys %args;
350 while (my ($key, $val) = each %args) {
351 if (exists $BLAT_PARAMS{$key}) {
352 $self->{parameters}->{$key} = $val;
353 } elsif (exists $BLAT_SWITCHES{$key}) {
354 $self->{parameters}->{$key} = $BLAT_SWITCHES{$key} ? 1 : 0;
355 } elsif ($LOCAL_ATTRIBUTES{$key} && $self->can($key)) {
356 $self->$key($val);
361 =head2 reset_parameters
363 Title : reset_parameters
364 Usage : resets values
365 Function: resets parameters to either undef or value in passed hash
366 Returns : none
367 Args : [optional] hash of parameter-value pairs
369 =cut
371 sub reset_parameters {
372 my $self = shift;
373 delete $self->{parameters};
374 if (@_) {
375 $self->set_parameters(@_);
379 =head2 validate_parameters
381 Title : validate_parameters
382 Usage : $pobj->validate_parameters(1);
383 Function: sets a flag indicating whether to validate parameters via
384 set_parameters() or reset_parameters()
385 Returns : Bool
386 Args : [optional] value evaluating to True/False
387 Note : NYI
389 =cut
391 sub validate_parameters { 0 }
393 =head2 parameters_changed
395 Title : parameters_changed
396 Usage : if ($pobj->parameters_changed) {...}
397 Function: Returns boolean true (1) if parameters have changed
398 Returns : Boolean (0 or 1)
399 Args : None
400 Note : This module does not run state checks, so this always returns True
402 =cut
404 sub parameters_changed { 1 }
406 =head2 available_parameters
408 Title : available_parameters
409 Usage : @params = $pobj->available_parameters()
410 Function: Returns a list of the available parameters
411 Returns : Array of parameters
412 Args : [optional] name of executable being used; defaults to returning all
413 available parameters
415 =cut
417 sub available_parameters {
418 my ($self, $exec) = @_;
419 my @params = (sort keys %BLAT_PARAMS, sort keys %BLAT_SWITCHES);
420 return @params;
423 =head2 get_parameters
425 Title : get_parameters
426 Usage : %params = $pobj->get_parameters;
427 Function: Returns list of set key-value pairs, parameter => value
428 Returns : List of key-value pairs
429 Args : none
431 =cut
433 sub get_parameters {
434 my ($self, $option) = @_;
435 $option ||= ''; # no option
436 my %params;
437 if (exists $self->{parameters}) {
438 %params = map {$_ => $self->{parameters}->{$_}} sort keys %{$self->{parameters}};
439 } else {
440 %params = ();
442 return %params;
445 =head1 to_* methods
447 All to_* methods are implementation-specific
449 =cut
451 =head2 to_exe_string
453 Title : to_exe_string
454 Usage : $string = $pobj->to_exe_string;
455 Function: Returns string (command line string in this case)
456 Returns : String
457 Args :
459 =cut
461 sub to_exe_string {
462 my ($self, @passed) = @_;
463 my ($seq) = $self->_rearrange([qw(SEQ_FILE)], @passed);
464 $self->throw("Must provide a seq_file") unless defined $seq;
465 my %params = $self->get_parameters();
467 my ($exe, $db, $qseg, $tseg) = ($self->executable,
468 $self->db,
469 $self->qsegment,
470 $self->tsegment);
472 $self->throw("Executable not found") unless defined($exe);
474 if ($tseg) {
475 $db .= ":$tseg";
478 if ($qseg) {
479 $seq .= ":$qseg";
482 my @params;
484 for my $p (sort keys %BLAT_SWITCHES) {
485 if (exists $params{$p}) {
486 push @params, "-$p"
490 for my $p (sort keys %BLAT_PARAMS) {
491 if (exists $params{$p}) {
492 push @params, "-$p=$params{$p}"
496 # this only passes in the first seq file (no globs are allow AFAIK)
498 push @params, ($db, $seq);
500 # quiet! Unfortunately, it is NYI
502 my $string = "$exe ".join(' ',@params);
504 $string;
507 #=head2 _input
509 # Title : _input
510 # Usage : obj->_input($seqFile)
511 # Function: Internal (not to be used directly)
512 # Returns :
513 # Args :
515 #=cut
517 sub _input() {
518 my ($self,$infile1) = @_;
519 if(defined $infile1){
520 $self->{'input'}=$infile1;
522 return $self->{'input'};
525 #=head2 _database
527 # Title : _database
528 # Usage : obj->_database($seqFile)
529 # Function: Internal (not to be used directly)
530 # Returns :
531 # Args :
533 #=cut
535 sub _database() {
536 my ($self,$infile1) = @_;
537 $self->{'db'} = $infile1 if(defined $infile1);
538 return $self->{'db'};
542 #=head2 _run
544 # Title : _run
545 # Usage : $obj->_run()
546 # Function: Internal (not to be used directly)
547 # Returns : An array of Bio::SeqFeature::Generic objects
548 # Args :
550 #=cut
552 sub _run {
553 my ($self)= shift;
554 my $str = $self->to_exe_string(-seq_file => shift);
556 my $out = $self->outfile_name || $self->_tempfile;
558 $str .= " $out".$self->_quiet;
559 $self->debug($str."\n") if( $self->verbose > 0 );
561 my $status = system($str);
562 $self->throw( "Blat call ($str) crashed: $? \n") unless $status==0;
564 my $blat_obj;
565 if (ref ($out) !~ /GLOB/) {
566 $blat_obj = Bio::SearchIO->new(%{$self->searchio},
567 -file => $out);
568 } else {
569 $blat_obj = Bio::SearchIO->new(%{$self->searchio},
570 -fh => $out);
572 return $blat_obj;
576 #=head2 _writeSeqFile
578 # Title : _writeSeqFile
579 # Usage : obj->_writeSeqFile($seq)
580 # Function: Internal (not to be used directly)
581 # Returns :
582 # Args :
584 #=cut
586 sub _writeSeqFile {
587 my ($self,$seq) = @_;
588 my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$Bio::Root::IO::TEMPDIR);
589 my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta');
590 $in->write_seq($seq);
591 $in->close();
592 return $inputfile;
595 sub _tempfile {
596 my $self = shift;
597 my ($tfh,$outfile) = $self->io->tempfile(-dir=>$Bio::Root::IO::TEMPDIR);
598 # this is because we only want a unique filename
599 close($tfh);
600 return $outfile;
603 sub _quiet {
604 my $self = shift;
605 my $q = '';
606 # BLAT output goes to a file, all other output is STDOUT
607 if ($self->quiet) {
608 $q = $^O =~ /Win/i ? ' 2>&1 NUL' : ' > /dev/null 2>&1';