From 651990afdfab6e165b0660cd43976d0af2ef8d40 Mon Sep 17 00:00:00 2001 From: Chris Fields Date: Fri, 5 Nov 2010 13:26:32 -0500 Subject: [PATCH] allow other SearchIO formats and parameters (though axt and sim4 are not working as expected) --- lib/Bio/Tools/Run/Alignment/Blat.pm | 122 ++++++++++++++++++++++++++++-------- t/Blat.t | 86 +++++++++++++++++-------- 2 files changed, 157 insertions(+), 51 deletions(-) diff --git a/lib/Bio/Tools/Run/Alignment/Blat.pm b/lib/Bio/Tools/Run/Alignment/Blat.pm index 67328ae..13f8e00 100755 --- a/lib/Bio/Tools/Run/Alignment/Blat.pm +++ b/lib/Bio/Tools/Run/Alignment/Blat.pm @@ -42,17 +42,8 @@ changes within code. =item * No .2bit or .nib conversions yet These require callouts to faToNib or faTwoTwoBit, which may or may not be -installed on a user's machine. - -=item * Optional parsing of other L formats. - -The default is PSL; no built-in checks are being performed yet for other -formats. - -Blat can output other formats potentially parsable via BioPerl, including -BLAST, WU-BLAST, AXT, etc. Not to mention, we should have the ability to -pass on additional arguments to any parser instance (optional flags, such as -parsing PSL for distinct hit-based contiguous regions) +installed on a user's machine. We can possibly add functionality to +check for faToTwoBit/faToNib and other UCSC tools in the future. =back @@ -118,7 +109,29 @@ our %BLAT_SWITCHES = map {$_ => 1} qw(prot noHead trimT noTrimA trimHardA fastMap fine extendThroughN); our %LOCAL_ATTRIBUTES = map {$_ => 1} qw(db DB qsegment hsegment - outfile_name searchio quiet); + outfile_name quiet); + + #psl - Default. Tab separated format, no sequence + #pslx - Tab separated format with sequence + #axt - blastz-associated axt format + #maf - multiz-associated maf format + #sim4 - similar to sim4 format + #wublast - similar to wublast format + #blast - similar to NCBI blast format + #blast8- NCBI blast tabular format + #blast9 - NCBI blast tabular format with comments + + +our %searchio_map = ( + 'psl' => 'psl', + 'pslx' => 'psl', # I don't think there is support for this yet + 'axt' => 'axt', + 'blast' => 'blast', + 'sim4' => 'sim4', + 'wublast' => 'blast', + 'blast8' => 'blasttable', + 'blast9' => 'blasttable' +); =head2 new @@ -172,7 +185,7 @@ sub program_dir { Usage : $obj->run($query) Function: Runs Blat and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects - Args : A Bio::PrimarySeqI + Args : A Bio::PrimarySeqI or a file name =cut @@ -218,12 +231,48 @@ sub db { *DB = \&db; +=head2 qsegment + + Title : qsegment + Usage : $obj->qsegment('sequence_a:0-1000') + Function : pass in a B string for the query sequence(s) + Returns : string + Args : string + Status : New + Note : Requires the sequence(s) in question be 2bit or nib format + Reminder : UCSC segment/regions coordinates are 0-based half-open (sequence + begins at 0, but start isn't counted with length), whereas BioPerl + coordinates are 1-based closed (sequence begins with 1, both start + and end are counted in the length of the segment). For example, a + segment that is 'sequence_a:0-1000' will have BioPerl coordinates of + 'sequence_a:1-1000', both with the same length (1000). + +=cut + sub qsegment { my $self = shift; return $self->{blat_qsegment} = shift if @_; return $self->{blat_qsegment}; } +=head2 tsegment + + Title : tsegment + Usage : $obj->tsegment('sequence_a:0-1000') + Function : pass in a B string for the target sequence(s) + Returns : string + Args : string + Status : New + Note : Requires the sequence(s) in question be 2bit or nib format + Reminder : UCSC segment/regions coordinates are 0-based half-open (sequence + begins at 0, but start isn't counted with length), whereas BioPerl + coordinates are 1-based closed (sequence begins with 1, both start + and end are counted in the length of the segment). For example, a + segment that is 'sequence_a:0-1000' will have BioPerl coordinates of + 'sequence_a:1-1000', both with the same length (1000). + +=cut + sub tsegment { my $self = shift; return $self->{blat_tsegment} = shift if @_; @@ -237,12 +286,36 @@ sub outfile_name { return $self->{blat_outfile}; } +=head2 searchio + + Title : searchio + Usage : $obj->searchio{-writer => $writer} + Function : Pass in additional parameters to the returned Bio::SearchIO parser + Returns : Hash reference with Bio::SearchIO parameters + Args : Hash reference + Status : New + Note : Currently, this implementation overrides any passed -format + parameter based on whether the output is changed ('out'). This + may change if requested, but we can't see the utility of doing so, + as requesting mismatched output/parser combinations is just a recipe + for disaster + +=cut + sub searchio { - my $self = shift; - $self->{blat_searchio} = shift if @_; + my ($self, $params) = @_; + if ($params && ref $params eq 'HASH') { + if ($self->{parameters}->{out} && exists $searchio_map{$self->{parameters}->{out}}) { + $params->{-format} = $searchio_map{$self->{parameters}->{out}}; + } else { + $params->{-format} = 'psl'; + } + $self->{blat_searchio} = $params; + } - # TODO: This needs to check for - return $self->{blat_searchio} || 'psl'; + return $self->{blat_searchio} || + {-format => exists($self->{parameters}->{out}) ? + $searchio_map{$self->{parameters}->{out}} : 'psl'}; } =head1 Bio::ParameterBaseI-specific methods @@ -391,20 +464,19 @@ sub to_exe_string { $self->throw("Must provide a seq_file") unless defined $seq; my %params = $self->get_parameters(); - my ($exe, $prog, $db, $qseg, $tseg) = ($self->executable, - $self->program_name, + my ($exe, $db, $qseg, $tseg) = ($self->executable, $self->db, $self->qsegment, $self->tsegment); $self->throw("Executable not found") unless defined($exe); - if ($qseg) { - $db .= ":$qseg"; + if ($tseg) { + $db .= ":$tseg"; } - if ($tseg) { - $seq .= ":$tseg"; + if ($qseg) { + $seq .= ":$qseg"; } my @params; @@ -491,10 +563,10 @@ sub _run { my $blat_obj; if (ref ($out) !~ /GLOB/) { - $blat_obj = Bio::SearchIO->new(-format => $self->searchio, + $blat_obj = Bio::SearchIO->new(%{$self->searchio}, -file => $out); } else { - $blat_obj = Bio::SearchIO->new(-format => $self->searchio, + $blat_obj = Bio::SearchIO->new(%{$self->searchio}, -fh => $out); } return $blat_obj; diff --git a/t/Blat.t b/t/Blat.t index c88adc5..a947f68 100644 --- a/t/Blat.t +++ b/t/Blat.t @@ -5,7 +5,7 @@ use strict; BEGIN { use Bio::Root::Test; - test_begin(-tests => 20); + test_begin(-tests => 33); use_ok('Bio::Tools::Run::Alignment::Blat'); use_ok('Bio::SeqIO'); @@ -23,31 +23,65 @@ ok $factory->isa('Bio::Tools::Run::Alignment::Blat'); my $blat_present = $factory->executable(); SKIP: { - test_skip(-requires_executable => $factory, - -tests => 10); - - my $searchio = $factory->align($query); - my $result = $searchio->next_result; - my $hit = $result->next_hit; - my $hsp = $hit->next_hsp; - isa_ok($hsp, "Bio::Search::HSP::HSPI"); - is($hsp->query->start,1); - is($hsp->query->end,1775); - is($hsp->hit->start,1); - is($hsp->hit->end,1775); - my $sio = Bio::SeqIO->new(-file=>$query,-format=>'fasta'); - - my $seq = $sio->next_seq ; - - $searchio = $factory->align($seq); - $result = $searchio->next_result; - $hit = $result->next_hit; - $hsp = $hit->next_hsp; - isa_ok($hsp, "Bio::Search::HSP::HSPI"); - is($hsp->query->start,1); - is($hsp->query->end,1775); - is($hsp->hit->start,1); - is($hsp->hit->end,1775); + test_skip(-requires_executable => $factory, + -tests => 10); + + my $searchio = $factory->align($query); + my $result = $searchio->next_result; + my $hit = $result->next_hit; + my $hsp = $hit->next_hsp; + isa_ok($hsp, "Bio::Search::HSP::HSPI"); + is($hsp->query->start,1); + is($hsp->query->end,1775); + is($hsp->hit->start,1); + is($hsp->hit->end,1775); + my $sio = Bio::SeqIO->new(-file=>$query,-format=>'fasta'); + + my $seq = $sio->next_seq ; + + $searchio = $factory->align($seq); + like($searchio, qr/psl/, 'PSL parser (default)'); + $result = $searchio->next_result; + $hit = $result->next_hit; + $hsp = $hit->next_hsp; + isa_ok($hsp, "Bio::Search::HSP::HSPI"); + is($hsp->query->start,1); + is($hsp->query->end,1775); + is($hsp->hit->start,1); + is($hsp->hit->end,1775); + + # test alternate formats (not all of these work!) + $factory->reset_parameters('quiet' => 1, + "DB" => $db, + out => 'blast'); + $searchio = $factory->align($query); + + like($searchio, qr/blast/, 'BLAST parser'); + + $result = $searchio->next_result; + $hit = $result->next_hit; + $hsp = $hit->next_hsp; + isa_ok($hsp, "Bio::Search::HSP::HSPI"); + is($hsp->query->start,1); + is($hsp->query->end,1775); + is($hsp->hit->start,1); + is($hsp->hit->end,1775); + + $factory->reset_parameters('quiet' => 1, + "DB" => $db, + out => 'blast9'); + $searchio = $factory->align($query); + + like($searchio, qr/blasttable/, 'Tabuar BLAST parser'); + + $result = $searchio->next_result; + $hit = $result->next_hit; + $hsp = $hit->next_hsp; + isa_ok($hsp, "Bio::Search::HSP::HSPI"); + is($hsp->query->start,1); + is($hsp->query->end,1775); + is($hsp->hit->start,1); + is($hsp->hit->end,1775); } # new wrapper; regions -- 2.11.4.GIT