1 # BioPerl module for Bio::DB::SeqHound
3 # You may distribute this module under the same terms as perl itself
4 # POD documentation - main docs before the code
9 Bio::DB::SeqHound - Database object interface to SeqHound
13 use Bio::DB::SeqHound;
14 $sh = Bio::DB::SeqHound->new();
16 $seq = $sh->get_Seq_by_acc("CAA28783"); # Accession Number
20 $seq = $sh->get_Seq_by_gi(4557225); # GI Number
28 SeqHound is a database of biological sequences and structures. This
29 script allows the retrieval of sequence objects (Bio::Seq) from the
30 SeqHound database at the Blueprint Initiative.
32 Bioperl extension permitting use of the SeqHound Database System
33 developed by researchers at
35 The Blueprint Initiative
36 Samuel Lunenfeld Research Institute
43 known bugs: fail to get sequences for some RefSeq record with CONTIG,
46 E<lt>seqhound@blueprint.orgE<gt>
50 User feedback is an integral part of the evolution of this Bioperl module. Send
51 your comments and suggestions preferably to seqhound.usergroup mailing lists.
52 Your participation is much appreciated.
54 E<lt>seqhound.usergroup@lists.blueprint.orgE<gt>
58 For more information on SeqHound http://dogboxonline.unleashedinformatics.com/
62 This software is provided 'as is' without warranty of any kind.
66 Rong Yao, Hao Lieu, Ian Donaldson
68 E<lt>seqhound@blueprint.orgE<gt>
72 The rest of the documentation details each of the object
73 methods. Internal methods are usually preceded with a _
77 # Let the code begin...
79 package Bio
::DB
::SeqHound
;
81 use vars
qw($HOSTBASE $CGILOCATION $LOGFILENAME);
86 use POSIX qw(strftime);
88 use base
qw(Bio::DB::WebDBSeqI Bio::Root::Root);
91 $HOSTBASE = 'http://dogboxonline.unleashedinformatics.com';
92 $CGILOCATION = '/cgi-bin/seqrem?fnct=';
93 $LOGFILENAME = 'shoundlog';
97 # helper method to get db specific options
102 Usage : $sh = Bio::DB::SeqHound->new(@options);
103 Function: Creates a new seqhound handle
104 Returns : New seqhound handle
110 my ($class, @args ) = @_;
111 my $self = $class->SUPER::new
(@args);
112 if ($self->_init_SeqHound eq "TRUE"){
120 =head1 Routines Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
124 Title : get_Seq_by_id
125 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN');
126 Function: Gets a Bio::Seq object by its name
127 Returns : a Bio::Seq object
128 Args : the id (as a string) of a sequence
129 Throws : "id does not exist" exception
130 Example : Each of these calls retrieves the same sequence record
131 $seq = $db->get_Seq_by_id(56); #retrieval by GI
132 $seq = $db->get_Seq_by_id("X02597"); #retrieval by NCBI accession
133 $seq = $db->get_Seq_by_id("BTACHRE"); #retrieval by sequence "name"
134 a sequence "name" is a secondary identifier (usually assigned by the
135 submitting database external to the NCBI) that may not be visible in
136 the GenBank flat file version of the record but is always present in
138 Note : Since in GenBank.pm, this function accepts a gi, an accession number
139 or a sequence name, SeqHound also satisfies these inputs.
140 If the input uid is a number, it is treated as a gi, if the uid is a
141 string, it is treated as an accession number first. If the search still
142 fails, it is treated as a sequence name.
143 Since SeqHound stores biological data from different source sequence
144 databases like: GenBank, GenPept, SwissProt, EMBL, RefSeq,
145 you can pass ids from the above databases to this function.
146 The Bio::Seq object returned by this function is identical to the
147 Bio::Seq generated by the GenBank.pm and GenPept.pm.
148 The Bio::Seq object returned by this function sometimes has minor
149 difference in the SeqFeature from the Bio::Seq object generated
151 The Bio::Seq objects created from this function will have the NCBI
152 versions of the SwissProt and EMBL sequence data information.
159 my $seqio= $self-> _get_Seq_from_gbff
($id);
161 return $seqio->next_seq;
164 elsif ($id =~ /^\S+$/){
165 #print "id is string, try search by accession or name\n";
166 my $gi = $self ->_get_gi_from_acc ($id);
168 my $gi = $self->_get_gi_from_name($id);
170 my $seqio = $self->_get_Seq_from_gbff($gi);
172 return $seqio->next_seq;
177 my $seqio = $self->_get_Seq_from_gbff($gi);
179 return $seqio->next_seq;
182 my $gi = $self->_get_gi_from_name($id);
184 my $seqio = $self->_get_Seq_from_gbff($gi);
186 return $seqio->next_seq;
194 $self->warn("[get_Seq_by_id]: invalid input id.");
197 $self->warn("[get_Seq_by_id]: id $id does not exist");
202 =head2 get_Seq_by_acc
204 Title : get_Seq_by_acc
205 Usage : $seq = $db->get_Seq_by_acc('M34830');
206 Function: Gets a Seq object by accession numbers
207 Returns : a Bio::Seq object
208 Args : the accession number as a string
209 Throws : "id does not exist" exception
210 Note : Since in GenBank.pm, this function accepts an accession number
211 or a sequence name, SeqHound also satisfies these inputs.
212 If the input uid is a string, it is treated as an accession number first.
213 If the search fails, it is treated as a sequence name.
214 Since SeqHound stores biological data from different source sequence
215 databases like: GenBank, GenPept, SwissProt, EMBL, RefSeq,
216 you can pass ids from the above databases to this function.
217 The Bio::Seq object returned by this function is identical to the
218 Bio::Seq generated by the GenBank.pm and GenPept.pm.
219 The Bio::Seq object returned by this function sometimes has minor
220 difference in the SeqFeature from the Bio::Seq object generated
222 The Bio::Seq objects created from this function will have the NCBI
223 versions of the SwissProt and EMBL sequence data information.
228 my ($self, $acc) = @_;
229 #exclude $acc is a number, since function does not accept gi as input
230 if ($acc =~ /^\d+$/) {
231 $self->warn ("[get_Seq_by_acc]: id $acc does not exist");
235 $gi= $self->_get_gi_from_acc($acc);
236 #print "get_Seq_by_acc: gi = $gi\n";
238 my $seqio = $self->_get_Seq_from_gbff($gi);
240 return $seqio->next_seq;
243 #else, treat input as sequence name
245 $gi = $self->_get_gi_from_name($acc);
246 #print "in get_Seq_by_acc: else gi = $gi\n";
248 my $seqio = $self->_get_Seq_from_gbff($gi);
250 return $seqio->next_seq;
254 $self->warn("[get_Seq_by_acc]: id $acc does not exist.");
261 Title : get_Seq_by_gi
262 Usage : $seq = $sh->get_Seq_by_gi('405830');
263 Function: Gets a Bio::Seq object by gi number
264 Returns : A Bio::Seq object
265 Args : gi number (as a string)
266 Throws : "gi does not exist" exception
267 Note : call the same code get_Seq_by_id
273 my ($self, $gi) = @_;
274 return get_Seq_by_id
($self, $gi);
277 =head2 get_Seq_by_version
279 Title : get_Seq_by_version
280 Usage : $seq = $db->get_Seq_by_version('X77802');
281 Function: Gets a Bio::Seq object by sequence version
282 Returns : A Bio::Seq object
283 Args : accession.version (as a string)
284 Throws : "acc.version does not exist" exception
285 Note : SeqHound only keeps the most up-to-date version of a sequence. So
286 for the above example, use
287 $seq = $db->get_Seq_by_acc('X77802');
291 =head2 get_Stream_by_query
293 Title : get_Stream_by_query
294 Usage : $seq = $db->get_Stream_by_query($query);
295 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
296 at a time. For large numbers of sequences, this is far superior
297 than get_Stream_by_[id/acc]().
298 Example : $query_string = 'Candida maltosa 26S ribosomal RNA gene';
299 $query = Bio::DB::Query::GenBank->new(-db=>'nucleotide',
300 -query=>$query_string);
301 $stream = $sh->get_Stream_by_query($query);
303 $query = Bio::DB::Query::GenBank->new (-db=> 'nucleotide',
304 -ids=>['X02597', 'X63732', 11002, 4557284]);
305 $stream = $sh->get_Stream_by_query($query);
306 Returns : a Bio::SeqIO stream object
307 Args : $query : A Bio::DB::Query::GenBank object. It is suggested that
308 you create a Bio::DB::Query::GenBank object and get the entry
309 count before you fetch a potentially large stream.
313 sub get_Stream_by_query
{
314 my ($self, $query) = @_;
315 my @ids = $query->ids;
316 #print join ",", @ids, "\n";
317 return get_Stream_by_id
($self, \
@ids);
321 =head2 get_Stream_by_id
323 Title : get_Stream_by_id
324 Usage : $stream = $db->get_Stream_by_id(['J05128', 'S43442', 34996479]);
325 Function: Gets a series of Seq objects by unique identifiers
326 Returns : a Bio::SeqIO stream object
327 Args : $ref : a reference to an array of unique identifiers for
328 the desired sequence entries, according to genbank.pm
329 this function accepts gi, accession number
331 Note : Since in GenBank.pm, this function accepts a gi, an accession number
332 or a sequence name, SeqHound also satisfies these inputs.
333 If the input uid is a number, it is treated as a gi, if the uid is a
334 string, it is treated as an accession number first. If the search still
335 fails, it is treated as a sequence name.
336 Since SeqHound stores biological data from different source sequence
337 databases like: GenBank, GenPept, SwissProt, EMBL, RefSeq,
338 you can pass ids from the above databases to this function.
339 The Bio::Seq object returned by this function is identical to the
340 Bio::Seq generated by the GenBank.pm and GenPept.pm.
341 The Bio::Seq object returned by this function sometimes has minor
342 difference in the SeqFeature from the Bio::Seq object generated
344 The Bio::Seq objects created from this function will have the NCBI
345 versions of the SwissProt and EMBL sequence data information.
351 my ($self, $id) = @_;
352 my (@gilist, @not_exist);
354 $self->warn("[get_Stream_by_id]: undefined input id");
357 if (ref($id)=~ /array/i){
358 foreach my $i (@
$id){
362 elsif ($i =~ /^\S+$/) {
363 my $gi = _get_gi_from_acc
($self, $i);
365 $gi = _get_gi_from_name
($self, $i);
367 $self->warn("[get_Stream_by_id]: id $i does not exist.");
368 push (@not_exist, $i);
379 $self->warn("[get_Stream_by_id]: id $i does not exist.");
380 push (@not_exist, $i);
383 my $seqio = _get_Seq_from_gbff
($self, \
@gilist);
392 =head2 get_Stream_by_acc
394 Title : get_Stream_by_acc
395 Usage : $seq = $db->get_Stream_by_acc(['M98777', 'M34830']);
396 Function: Gets a series of Seq objects by accession numbers
397 Returns : a Bio::SeqIO stream object
398 Args : $ref : a reference to an array of accession numbers for
399 the desired sequence entries
400 Note : For SeqHound, this just calls the same code for get_Stream_by_id()
404 sub get_Stream_by_acc
406 my ($self, $acc) = @_;
407 return get_Stream_by_id
($self, $acc);
410 =head2 get_Stream_by_gi
412 Title : get_Stream_by_gi
413 Usage : $seq = $db->get_Seq_by_gi([161966, 255064]);
414 Function: Gets a series of Seq objects by gi numbers
415 Returns : a Bio::SeqIO stream object
416 Args : $ref : a reference to an array of gi numbers for
417 the desired sequence entries
418 Note : For SeqHound, this just calls the same code for get_Stream_by_id()
422 sub get_Stream_by_gi
{
423 my ($self, $gi) = @_;
424 return get_Stream_by_id
($self, $gi);
430 Usage : my $lcontent = $self->get_request;
431 Function: get the output from SeqHound API http call
432 Returns : the result of the remote call from SeqHound
433 Args : %qualifiers = a hash of qualifiers
434 (SeqHound function name, id, query etc)
435 Example : $lcontent = $self->get_request(-funcname=>'SeqHoundGetGenBankff',
438 Note : this function overrides the implementation in Bio::DB::WebDBSeqI.
444 my ( @qualifiers) = @_;
445 my ($funcname, $query, $uids, $other) = $self->_rearrange([qw(FUNCNAME QUERY UIDS OTHER)],
447 # print ("get funcname = $funcname, query = $query, uids= $uids\n");
448 unless( defined $funcname ne '') {
449 $self->throw("please specify the SeqHound function for query");
451 my $url = $HOSTBASE . $CGILOCATION . $funcname;
452 unless( defined $uids ne '') {
453 $self->throw("please specify a uid or a list of uids to fetch");
455 unless ( defined $query && $query ne '') {
456 $self->throw("please specify a valid query field");
459 if (defined $uids && defined $query) {
460 if( ref($uids) =~ /array/i ) {
461 $uids = join(",", @
$uids);
463 $url=$url."&".$query."=".$uids;
465 $url=$url."&".$other;
467 my $ua = LWP
::UserAgent
->new(env_proxy
=> 1);
468 my $req = HTTP
::Request
->new ('GET', $url);
469 my $res = $ua->request($req);
470 if ($res->is_success){
471 return $res->content;
474 my $result = "HTTP::Request error: ".$res->status_line."\n";
475 $self->warn("$result");
482 =head2 postprocess_data
484 Title : postprocess_data
485 Usage : $self->postprocess_data (-funcname => $funcname,
486 -lcontent => $lcontent,
487 -outtype => $outtype);
488 Function: process return String from http seqrem call
489 output type can be a string or a Bio::SeqIO object.
491 Args : $funcname is the API function name of SeqHound
492 $lcontent is a string output from SeqHound server http call
493 $outtype is a string or a Bio::SeqIO object
494 Example : $seqio = $self->postprocess_data ( -lcontent => $lcontent,
495 -funcname => 'SeqHoundGetGenBankffList',
496 -outtype => 'Bio::SeqIO');
498 $gi = $self->postprocess_data( -lcontent => $lcontent,
499 -funcname => 'SeqHoundFindAcc',
500 -outtype => 'string');
501 Note : this method overrides the method works for genbank/genpept,
508 my ($self, @args) = @_;
509 my ($funcname, $lcontent, $outtype) = $self->_rearrange(
510 [qw(FUNCNAME LCONTENT OUTTYPE)], @args);
512 if (!defined $outtype){
513 $self->throw("please specify the output type, string, Bio::SeqIO etc");
515 if (!defined $lcontent){
516 $self->throw("please provide the result from SeqHound call");
518 if (!defined $funcname){
519 $self->throw("Please provide the function name");
522 #set up verbosity level if need record in the log file
523 my $log_msg = "Writing into '$LOGFILENAME' log file.\n";
524 my $now = strftime
("%a %b %e %H:%M:%S %Y", localtime);
525 if ($lcontent eq "") {
526 $self->debug($log_msg);
527 open my $LOG, '>>', $LOGFILENAME or $self->throw("Could not append file '$LOGFILENAME': $!");
528 print $LOG "$now $funcname. No reply.\n";
532 elsif ($lcontent =~ /HTTP::Request error/) {
533 $self->debug($log_msg);
534 open my $LOG, '>>', $LOGFILENAME or $self->throw("Could not append file '$LOGFILENAME': $!");
535 print $LOG "$now $funcname. Http::Request error problem.\n";
539 elsif ($lcontent =~ /SEQHOUND_ERROR/) {
540 $self->debug($log_msg);
541 open my $LOG, '>>', $LOGFILENAME or $self->throw("Could not append file '$LOGFILENAME': $!");
542 print $LOG "$now $funcname error. SEQHOUND_ERROR found.\n";
546 elsif ($lcontent =~ /SEQHOUND_NULL/) {
547 $self->debug($log_msg);
548 open my $LOG, '>>', $LOGFILENAME or $self->throw("Could not append file '$LOGFILENAME': $!");
549 print $LOG "$now $funcname Value not found in the database. SEQHOUND_NULL found.\n";
555 my @lines = split(/\n/, $lcontent, 2);
556 if ($lines[1] =~ /^-1/) {
557 $self->debug($log_msg);
558 open my $LOG, '>>', $LOGFILENAME or $self->throw("Could not append file '$LOGFILENAME': $!");
559 print $LOG "$now $funcname Value not found in the database. -1 found.\n";
563 elsif ($lines[1] =~ /^0/) {
564 $self->debug($log_msg);
565 open my $LOG, '>>', $LOGFILENAME or $self->throw("Could not append file '$LOGFILENAME': $!");
566 print $LOG "$now $funcname failed.\n";
575 #a list of functions in SeqHound which can wrap into Bio::seqIO object
576 if ($outtype eq 'Bio::SeqIO'){
577 my $buf = IO
::String
->new($result);
578 my $io = Bio
::SeqIO
->new (-format
=> 'genbank', -fh
=> $buf);
579 if (defined $io && $io ne ''){
584 #return a string if outtype is "string"
589 =head2 _get_gi_from_name
591 Title : _get_gi_from_name
592 Usage : $self->_get_gi_from_name('J05128');
593 Function: get the gene identifier from a sequence name
595 Return : gene identifier or undef
596 Args : a string represented sequence name
600 sub _get_gi_from_name
602 my ($self, $name) = @_;
604 $ret = $self->get_request( -funcname
=> 'SeqHoundFindName',
607 #print "_get_gi_from_name: ret = $ret\n";
608 $gi = $self->postprocess_data(-lcontent
=> $ret,
609 -funcname
=> 'SeqHoundFindName',
610 -outtype
=> 'string');
611 #print "_get_gi_from_name: gi = $gi\n";
615 =head2 _get_gi_from_acc
617 Title : _get_gi_from_acc
618 Usage : $self->_get_gi_from_acc('M34830')
619 Function: get the gene identifier from an accession number
621 Return : gene identifier or undef
622 Args : a string represented accession number
628 my ($self, $acc) = @_;
630 $ret = $self->get_request ( -funcname
=> 'SeqHoundFindAcc',
633 #print "_get_gi_from_acc: ret = $ret\n";
634 $gi = $self->postprocess_data( -lcontent
=> $ret,
635 -funcname
=> 'SeqHoundFindAcc',
636 -outtype
=> 'string');
637 #print "_get_gi_from_acc: gi = $gi\n";
641 =head2 _get_Seq_from_gbff
643 Title : _get_Seq_from_gbff
644 Usage : $self->_get_Seq_from_gbff($str)
645 Function: get the Bio::SeqIO stream object from gi or a list of gi
647 Return : Bio::SeqIO or undef
648 Args : a string represented gene identifier or
649 a list of gene identifiers
650 Example : $seq = $self->_get_Seq_from_gbff(141740);
652 $seq = $self->_get_Seq_from_gbff([141740, 255064, 45185482]);
656 sub _get_Seq_from_gbff
658 my ($self, $gi) = @_;
660 $self->warn("[_get_Seq_from_gbff]: undefined input gi");
664 if (ref($gi) =~ /array/i){
667 $lcontent = "SEQHOUND_OK\n";
668 while ($#copyArr != -1){
669 @tempArr =_MaxSizeArray
(\
@copyArr);
670 #in order to keep the correct output order as GenBank does
671 my $gi = join (",", reverse(@tempArr));
673 my $ret = $self->get_request( -funcname
=> 'SeqHoundGetGenBankffList',
677 my @lines = split(/\n/, $ret, 2);
678 if($lines[0] =~ /SEQHOUND_ERROR/ || $lines[0] =~ /SEQHOUND_NULL/){
681 if ($lines[1] =~ /^(null)/ || $lines[1] eq ""){
687 #append genbank flat files for long list
688 $lcontent = $lcontent.$result;
692 #else $gi is a single variable
694 $lcontent = $self->get_request( -funcname
=> 'SeqHoundGetGenBankffList',
698 my $seqio = $self->postprocess_data ( -lcontent
=> $lcontent,
699 -funcname
=> 'SeqHoundGetGenBankffList',
700 -outtype
=> 'Bio::SeqIO');
706 =head2 _init_SeqHound
708 Title : _init_SeqHound
709 Usage : $self->_init_SeqHound();
710 Function: call SeqHoundInit at blueprint server
711 Return : $result (TRUE or FALSE)
719 my $ret = $self->get_request(-funcname
=> 'SeqHoundInit',
720 -query
=> 'NetEntrezOnToo',
722 -other
=> 'appname=Bioperl');
723 my $result = $self->postprocess_data(-lcontent
=> $ret,
724 -funcname
=> 'SeqHoundInit',
725 -outtype
=> 'string');
726 return $result || 'FALSE';
732 Title : _MaxSizeArray
733 Usage : $self->_MaxSizeArray(\@arr)
734 Function: get an array with the limit size
735 Return : an array with the limit size
736 Args : a reference to an array
745 my $len = scalar(@
$argArr);
746 for(my $i = 0; $i < $len;){
747 $copyArr[$i++] = $$argArr[0];