3 # BioPerl module for Bio::Tools::Run::RemoteBlast
5 # FORMERLY Cared for by Jason Stajich, Mat Wiepert
7 # Somewhat cared for by Roger Hall, Chris Fields (when they have time)
9 # Copyright Jason Stajich, Bioperl
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::Run::RemoteBlast - Object for remote execution of the NCBI Blast
22 #Remote-blast "factory object" creation and blast-parameter initialization
24 use Bio::Tools::Run::RemoteBlast;
30 my @params = ( '-prog' => $prog,
33 '-readmethod' => 'SearchIO' );
35 my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
37 #change a query paramter
38 $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]';
40 #change a retrieval parameter
41 $Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'DESCRIPTIONS'} = 1000;
44 delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};
46 #$v is just to turn on and off the messages
49 my $str = Bio::SeqIO->new(-file=>'amino.fa' , -format => 'fasta' );
51 while (my $input = $str->next_seq()){
52 #Blast a sequence against a database:
54 #Alternatively, you could pass in a file with many
55 #sequences rather than loop through sequence one at a time
56 #Remove the loop starting 'while (my $input = $str->next_seq())'
57 #and swap the two lines below for an example of that.
58 my $r = $factory->submit_blast($input);
59 #my $r = $factory->submit_blast('amino.fa');
61 print STDERR "waiting..." if( $v > 0 );
62 while ( my @rids = $factory->each_rid ) {
63 foreach my $rid ( @rids ) {
64 my $rc = $factory->retrieve_blast($rid);
67 $factory->remove_rid($rid);
69 print STDERR "." if ( $v > 0 );
72 my $result = $rc->next_result();
74 my $filename = $result->query_name()."\.out";
75 $factory->save_output($filename);
76 $factory->remove_rid($rid);
77 print "\nQuery Name: ", $result->query_name(), "\n";
78 while ( my $hit = $result->next_hit ) {
79 next unless ( $v > 0);
80 print "\thit name is ", $hit->name, "\n";
81 while( my $hsp = $hit->next_hsp ) {
82 print "\t\tscore is ", $hsp->score, "\n";
90 # This example shows how to change a CGI parameter:
91 $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = 'BLOSUM45';
92 $Bio::Tools::Run::RemoteBlast::HEADER{'GAPCOSTS'} = '15 2';
94 # And this is how to delete a CGI parameter:
95 delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};
100 Class for remote execution of the NCBI Blast via HTTP.
102 For a description of the many CGI parameters see:
103 http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html
105 Various additional options and input formats are available.
111 User feedback is an integral part of the evolution of this and other
112 Bioperl modules. Send your comments and suggestions preferably to one
113 of the Bioperl mailing lists. Your participation is much appreciated.
115 bioperl-l@bioperl.org - General discussion
116 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
118 =head2 Reporting Bugs
120 Report bugs to the Bioperl bug tracking system to help us keep track
121 the bugs and their resolution. Bug reports can be submitted via the
124 http://bugzilla.bioperl.org
128 Please do NOT contact Jason directly about this module. Please post to
129 the bioperl mailing list (L<FEEDBACK>). If you would like to be the
130 official maintainer of this module, please volunteer on the list and
131 we will make it official in this POD.
133 First written by Jason Stajich, many others have helped keep it running.
137 The rest of the documentation details each of the object
138 methods. Internal methods are usually preceded with a _
142 package Bio
::Tools
::Run
::RemoteBlast
;
144 use vars
qw($AUTOLOAD $URLBASE %HEADER %RETRIEVALHEADER
145 $RIDLINE $MODVERSION %PUTPARAMS %GETPARAMS);
150 use Bio::Tools::BPlite;
153 use HTTP::Request::Common;
155 use base qw(Bio::Root::Root Bio::Root::IO);
158 $MODVERSION = $Bio::Root
::Version
::VERSION
;
159 $URLBASE = 'http://blast.ncbi.nlm.nih.gov/Blast.cgi';
161 # In GET/PUTPARAMS the values are regexes which validate the input.
163 'AUTO_FORMAT' => '(Off|(Semi|Full)auto)', # Off, Semiauto, Fullauto
164 'COMPOSITION_BASED_STATISTICS' => '(yes|no)', # yes, no
166 'DB_GENETIC_CODE' => '([1-9]|1[1-6]|2(1|2))', # 1..16,21,22
167 'DISPLAY_SORT' => '\d',
168 'ENDPOINTS' => '(yes|no)', # yes,no
169 'ENTREZ_QUERY' => '.*',
170 'EXPECT' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double
171 'FILTER' => '[LRm]', # L or R or m
172 'GAPCOSTS' => '-?\d+(\.\d+)\s+i-?\d+(\.\d+)',
173 # Two space separated float values
174 'GENETIC_CODE' => '([1-9]|1[1-6]|2(1|2))', # 1..16,21,22
175 'HITLIST_SIZE' => '\d+', # Positive integer
176 'I_THRESH' => '-?\d+(\.\d+)([eE]-\d+)?', # float
177 'LAYOUT' => '(One|Two)Windows?', # onewindow, twowindows
178 'LCASE_MASK' => '(yes|no)', # yes, no
179 'MATRIX_NAME' => '.*',
180 'NUCL_PENALTY' => '-\d+', # Negative integer
181 'NUCL_REWARD' => '-?\d+', # Integer
182 'OTHER_ADVANCED' => '.*',
183 'PERC_IDENT' => '\d\d+', # Integer, 0-99 inclusive
184 'PHI_PATTERN' => '.*',
185 'PROGRAM' => 't?blast[pnx]',
186 # tblastp, tblastn, tblastx, blastp, blastn, blastx
188 'QUERY_FILE' => '.*',
189 'QUERY_BELIEVE_DEFLINE' => '(yes|no)', # yes, no
190 'QUERY_FROM' => '\d+', # Positive integer
191 'QUERY_TO' => '\d+', # Positive integer
192 'SEARCHSP_EFF' => '\d+', # Positive integer
193 'SERVICE' => '(plain|p[sh]i|(rps|mega)blast)',
194 # plain,psi,phi,rpsblast,megablast
195 'SHORT_QUERY_ADJUST' => '(true|false)',
196 'THRESHOLD' => '-?\d+', # Integer
197 'UNGAPPED_ALIGNMENT' => '(yes|no)', # yes, no
198 'WORD_SIZE' => '\d+' # Positive integer
201 'ALIGNMENTS' => '\d+', # Positive integer
203 '(Pairwise|(Flat)?QueryAnchored(NoIdentities)?|Tabular)',
204 # Pairwise, QueryAnchored, QueryAnchoredNoIdentities,
205 # FlatQueryAnchored, FlatQueryAnchoredNoIdentities, Tabular
206 'DESCRIPTIONS' => '\d+', # Positive integer
207 'ENTREZ_LINKS_NEW_WINDOW' => '(yes|no)', # yes, no
208 'EXPECT_LOW' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double
209 'EXPECT_HIGH' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double
210 'FORMAT_ENTREZ_QUERY' => '',
212 '(Alignment|Neighbors|PSSM|SearchInfo|TaxBlast(Parent|MultiFrame)?)',
213 # Alignment, Neighbors, PSSM, SearchInfo
214 # TaxBlast, TaxblastParent, TaxBlastMultiFrame
215 'FORMAT_TYPE' => '((HT|X)ML|ASN\.1|Text)',
216 # HTML, Text, ASN.1, XML
217 'NCBI_GI' => '(yes|no)', # yes, no
219 'RESULTS_FILE' => '(yes|no)', # yes, no
220 'SERVICE' => '(plain|p[sh]i|(rps|mega)blast)',
221 # plain,psi,phi,rpsblast,megablast
222 'SHOW_OVERVIEW' => '(yes|no)' # yes, no
225 # Default values go in here for PUT
228 'FORMAT_OBJECT' => 'Alignment',
229 'COMPOSITION_BASED_STATISTICS' => 'off',
233 'PROGRAM' => 'blastp',
237 # Default values go in here for GET
240 'ALIGNMENTS' => '50',
241 'ALIGNMENT_VIEW' => 'Pairwise',
242 'DESCRIPTIONS' => '100',
243 'FORMAT_TYPE' => 'Text',
246 $RIDLINE = 'RID\s+=\s+(\S+)';
250 my ($caller, @args) = @_;
252 my $self = $caller->SUPER::new
(@args);
253 # so that tempfiles are cleaned up
254 $self->_initialize_io();
255 my ($prog, $data, $readmethod, $url_base) =
256 $self->_rearrange([qw(PROG DATA READMETHOD URL_BASE)],
258 # Use these two parameters for backward-compatibility.
259 # Overridden by PROGRAM and DATABASE if supplied.
260 $self->submit_parameter('PROGRAM',$prog) if $prog;
261 $self->submit_parameter('DATABASE',$data) if $data;
263 $readmethod = 'SearchIO' unless defined $readmethod;
264 $self->readmethod($readmethod);
266 # Now read the rest of the parameters and set them all
268 # PUT parameters first
269 my @putValues = $self->_rearrange([keys %PUTPARAMS],@args);
271 @putNames{keys %PUTPARAMS} = @putValues;
272 foreach my $putName (keys %putNames) {
273 $self->submit_parameter($putName,$putNames{$putName});
275 # GET parameters second
276 my @getValues = $self->_rearrange([keys %GETPARAMS],@args);
278 @getNames{keys %GETPARAMS} = @getValues;
279 foreach my $getName (keys %getNames) {
280 $self->retrieve_parameter($getName,$getNames{$getName});
282 # private variable to keep track of total rids
283 $self->{'_total_rids'} = 0;
284 $url_base ||= $URLBASE; # default to regular NCBI BLAST URL
285 $self->set_url_base($url_base);
289 =head2 retrieve_parameter
291 Title : retrieve_parameter
292 Usage : my $db = $self->retrieve_parameter
293 Function: Get/Set the named parameter for the retrieve_blast operation.
295 Args : $name : name of GET parameter
296 $val : optional value to set the parameter to
300 sub retrieve_parameter
{
301 my ($self, $name, $val) = @_;
303 $self->throw($name." is not a valid GET parameter.") unless
304 exists $GETPARAMS{$name};
306 my $regex = $GETPARAMS{$name};
307 $val =~ m/^$regex$/i or
308 $self->throw("Value ".$val." for GET parameter ".$name." does not match expression ".$regex.". Rejecting.");
309 $RETRIEVALHEADER{$name} = $val;
311 return $RETRIEVALHEADER{$name};
314 =head2 submit_parameter
316 Title : submit_parameter
317 Usage : my $db = $self->submit_parameter
318 Function: Get/Set the named parameter for the submit_blast operation.
320 Args : $name : name of PUT parameter
321 $val : optional value to set the parameter to
325 sub submit_parameter
{
326 my ($self, $name, $val) = @_;
328 $self->throw($name." is not a valid PUT parameter.") unless
329 exists $PUTPARAMS{$name};
331 my $regex = $PUTPARAMS{$name};
332 $val =~ m/^$regex$/i or
333 $self->throw("Value ".$val." for PUT parameter ".$name." does not match expression ".$regex.". Rejecting.");
334 $HEADER{$name} = $val;
336 return $HEADER{$name};
342 Usage : my $header = $self->header
343 Function: Get HTTP header for blast query
357 Usage : my $readmethod = $self->readmethod
358 Function: Get/Set the method to read the blast report
360 Args : string [ Blast, BPlite, blasttable, xml ]
365 my ($self, $val) = @_;
367 $self->{'_readmethod'} = $val;
369 return $self->{'_readmethod'};
376 Usage : my $prog = $self->program
377 Function: Get/Set the program to run. Retained for backwards-compatibility.
379 Args : string [ blastp, blastn, blastx, tblastn, tblastx ]
384 my ($self, $val) = @_;
385 return $self->submit_parameter('PROGRAM',$val);
392 Usage : my $db = $self->database
393 Function: Get/Set the database to search. Retained for backwards-compatibility.
395 Args : string [ swissprot, nr, nt, etc... ]
400 my ($self, $val) = @_;
401 return $self->submit_parameter('DATABASE',$val);
408 Usage : my $expect = $self->expect
409 Function: Get/Set the E value cutoff. Retained for backwards-compatibility.
411 Args : string [ '1e-4' ]
416 my ($self, $val) = @_;
417 return $self->submit_parameter('EXPECT',$val);
423 Usage : my $ua = $self->ua or
425 Function: Get/Set a LWP::UserAgent for use
426 Returns : reference to LWP::UserAgent Object
428 Comments: Will create a UserAgent if none has been requested before.
433 my ($self, $value) = @_;
434 if( ! defined $self->{'_ua'} ) {
435 $self->{'_ua'} = LWP
::UserAgent
->new(env_proxy
=> 1, parse_head
=> 0);
438 $self->{'_ua'}->agent("bioperl-$nm/$MODVERSION");
440 return $self->{'_ua'};
446 Usage : $httpproxy = $db->proxy('http') or
447 $db->proxy(['http','ftp'], 'http://myproxy' )
448 Function: Get/Set a proxy for use of proxy
449 Returns : a string indicating the proxy
450 Args : $protocol : an array ref of the protocol(s) to set/get
451 $proxyurl : url of the proxy to use for the specified protocol
456 my ($self,$protocol,$proxy) = @_;
457 return if ( !defined $self->ua || !defined $protocol
458 || !defined $proxy );
459 return $self->ua->proxy($protocol,$proxy);
463 my ($self, @vals) = @_;
465 $self->{'_rids'}->{$_} = $self->{'_total_rids'};
466 $self->{'_total_rids'}++;
468 return scalar keys %{$self->{'_rids'}};
472 my ($self, @vals) = @_;
474 delete $self->{'_rids'}->{$_};
476 return scalar keys %{$self->{'_rids'}};
481 # sort on key value, a little tricky...
482 my @sort_rids = sort {$self->{'_rids'}->{$a} <=> $self->{'_rids'}->{$b}} keys %{$self->{'_rids'}};
489 Usage : $self->submit_blast([$seq1,$seq2]);
490 Function: Submit blast jobs to ncbi blast queue on sequence(s)
491 Returns : Blast report object as defined by $self->readmethod
494 * array ref of sequence objects
495 * filename of file containing fasta formatted sequences
500 my ($self, $input) = @_;
501 my @seqs = $self->_load_input($input);
502 my $url_base = $self->get_url_base;
503 return 0 unless ( @seqs );
505 my %header = $self->header;
506 $header{$_} ||= $RETRIEVALHEADER{$_} foreach (keys %RETRIEVALHEADER);
507 foreach my $seq ( @seqs ) {
508 #If query has a fasta header, the output has the query line.
509 $header{'QUERY'} = ">".(defined $seq->display_id() ?
$seq->display_id() : "").
510 " ".(defined $seq->desc() ?
$seq->desc() : "")."\n".$seq->seq();
511 my $request = POST
$url_base, [%header];
512 $self->warn($request->as_string) if ( $self->verbose > 0);
513 my $response = $self->ua->request( $request);
515 if( $response->is_success ) {
516 my @subdata = split(/\n/, $response->content );
518 foreach ( @subdata ) {
521 $self->debug("RID: $1\n");
527 $self->warn("req was ". $request->as_string() . "\n");
528 $self->warn(join('', @subdata));
532 # should try and be a little more verbose here
533 $self->warn("req was ". $request->as_string() . "\n" .
534 $response->error_as_HTML);
541 =head2 retrieve_blast
543 Title : retrieve_blast
544 Usage : my $blastreport = $blastfactory->retrieve_blast($rid);
545 Function: Attempts to retrieve a blast report from remote blast queue
546 Returns : -1 on error,
547 0 on 'job not finished',
549 Args : Remote Blast ID (RID)
554 my($self, $rid) = @_;
555 my ($fh,$tempfile) = $self->tempfile();
556 close $fh; #explicit close
557 my $url_base = $self->get_url_base;
558 my %hdr = %RETRIEVALHEADER;
560 my $req = POST
$url_base, [%hdr];
561 $self->debug("retrieve request is " . $req->as_string());
562 my $response = $self->ua->request($req, $tempfile);
563 if( $response->is_success ) {
564 if( $self->verbose > 0 ) {
565 #print content of reply if verbose > 1
566 open(my $DEBUG, $tempfile) || $self->throw("cannot open $tempfile");
567 while(<$DEBUG>) { print $_; }
570 open(my $TMP, $tempfile) || $self->throw("Error opening $tempfile");
576 while(my $line = <$TMP>) {
580 if($line =~ /<\?xml version=/ ) { # xml time
582 $self->readmethod('blastxml');
585 if($line =~ /QBlastInfoBegin/i ) {
588 if($line =~ /Status=(WAITING|ERROR|READY)/i ) {
590 if( $status eq 'WAITING' ) {
592 } elsif( $status eq 'ERROR' ) {
594 open(my $ERR, "<$tempfile") or $self->throw("cannot open file $tempfile");
595 $self->warn(join("", <$ERR>));
598 } elsif( $status eq 'READY' ) {
602 $self->warn("Unknown status $1:\n");
606 } elsif ($line =~ /ERROR/i ) {
608 open(my $ERR, "<$tempfile") or $self->throw("cannot open file $tempfile");
609 $self->warn(join("", <$ERR>));
618 my $mthd = $self->readmethod;
619 $mthd = ($mthd =~ /blasttable/i) ?
'blasttable' :
620 ($mthd =~ /xml/i) ?
'blastxml' :
621 ($mthd =~ /pull/i) ?
'blast_pull' :
623 $blastobj = Bio
::SearchIO
->new( -file
=> $tempfile,
625 ## store filename in object ##
626 $self->file($tempfile);
628 } elsif (!$got_content) {
629 # server returned no content, can't be good
630 $self->warn("Server failed to return any data");
632 } else { # still working
637 $self->warn($response->error_as_HTML);
645 Usage : my $saveoutput = $self->save_output($filename)
646 Function: Method to save the blast report
647 Returns : 1 (throws error otherwise)
648 Args : string [rid, filename]
653 my ($self, $filename) = @_;
654 if( ! defined $filename ) {
655 $self->throw("Can't save blast output. You must specify a filename to save to.");
657 my $blastfile = $self->file;
658 #open temp file and output file, have to filter out some HTML
659 open(my $TMP, $blastfile) or $self->throw("cannot open $blastfile");
661 open(my $SAVEOUT, ">", $filename) or $self->throw("cannot open $filename");
665 if(/^(?:[T]?BLAST[NPX])\s*.+$/i ||
666 /^RPS-BLAST\s*.+$/i ||
667 /<\?xml\sversion=/ ||
668 /^#\s+(?:[T]?BLAST[NPX])\s*.+$/) {
680 my ($self, $input) = @_;
682 if( ! defined $input ) {
683 $self->throw("Calling remote blast with no input");
688 my $seqio = Bio
::SeqIO
->new(-format
=> 'fasta',
690 while( my $seq = $seqio->next_seq ) {
694 $self->throw("Input $input was not a valid filename");
696 } elsif( ref($input) =~ /ARRAY/i ) {
697 foreach ( @
$input ) {
698 if( ref($_) && $_->isa('Bio::PrimarySeqI') ) {
701 $self->warn("Trying to add a " . ref($_) .
702 " but expected a Bio::PrimarySeqI");
706 $self->throw("Did not pass in valid input -- no sequence objects found");
708 } elsif( $input->isa('Bio::PrimarySeqI') ) {
719 Usage : $self->set_url_base($url)
720 Function: Method to override the default NCBI BLAST database
722 Args : string (database url like
723 NOTE : This is highly experimental; we cannot maintain support on
724 databases other than the default NCBI database at this time
730 $self->{'_urlbase'} = shift if @_;
736 Usage : my $url = $self->set_url_base
737 Function: Get the current URL for BLAST database searching
738 Returns : string (URL used for remote blast searches)
745 return $self->{'_urlbase'};