Updates:
[bioperl-live.git] / Bio / Tools / EUtilities.pm
blob5d09f816bee71bae2f51298c11c9b28e1b709919
1 # $Id$
3 # BioPerl module for Bio::Tools::EUtilities
5 # Cared for by Chris Fields
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tools::EUtilities - NCBI eutil XML parsers
17 =head1 SYNOPSIS
19 # from file or fh
21 my $parser = Bio::Tools::EUtilities->new(
22 -eutil => 'einfo',
23 -file => 'output.xml'
26 # or HTTP::Response object...
28 my $parser = Bio::Tools::EUtilities->new(
29 -eutil => 'esearch',
30 -response => $response
33 # esearch, esummary, elink
35 @ids = $parser->get_ids(); # returns array or array ref of IDs
37 # egquery, espell
39 $term = $parser->get_term(); # returns array or array ref of IDs
41 # elink, einfo
43 $db = $parser->get_database(); # returns database
45 # Query-related methods (esearch, egquery, espell data)
46 # eutil data centered on use of search terms
48 my $ct = $parser->get_count; # uses optional database for egquery count
49 my $translation = $parser->get_count;
51 my $corrected = $parser->get_corrected_query; # espell
53 while (my $gquery = $parser->next_GlobalQuery) {
54 # iterates through egquery data
57 # Info-related methods (einfo data)
58 # database-related information
60 my $desc = $parser->get_description;
61 my $update = $parser->get_last_update;
62 my $nm = $parser->get_menu_name;
63 my $ct = $parser->get_record_count;
65 while (my $field = $parser->next_FieldInfo) {
66 # ...
68 while (my $field = $parser->next_LinkInfo) {
69 # ...
72 # History methods (epost data, some data returned from elink)
73 # data which enables one to retrieve and query against user-stored
74 # information on the NCBI server
76 while (my $cookie = $parser->next_History) {
77 # ...
80 my @hists = $parser->get_Histories;
82 # Bio::Tools::EUtilities::Summary (esummary data)
83 # information on a specific database record
85 # retrieve nested docsum data
86 while (my $docsum = $parser->next_DocSum) {
87 print "ID:",$docsum->get_ids,"\n";
88 while (my $item = $docsum->next_Item) {
89 # do stuff here...
90 while (my $listitem = $docsum->next_ListItem) {
91 # do stuff here...
92 while (my $listitem = $docsum->next_Structure) {
93 # do stuff here...
99 # retrieve flattened item list per DocSum
100 while (my $docsum = $parser->next_DocSum) {
101 my @items = $docsum->get_all_DocSum_Items;
104 =head1 DESCRIPTION
106 Parses NCBI eutils XML output for retrieving IDs and other information. Part of
107 the BioPerl EUtilities system.
109 This is a general parser for eutils XML; data from efetch is NOT parsed (this
110 requires separate format-dependent parsers). All other XML for eutils is
111 parsed. These modules can be used independently of Bio::DB::EUtilities
112 and Bio::Tools::EUtilities::EUtilParameters; if used in this way, only data present in the XML
113 will be parsed out (other bits are retrieved from a passed in
114 Bio::Tools::EUtilities::EUtilParameters instance used while querying the database)
116 =head1 TODO
118 This module is largely complete. However there are a few holes which will
119 eventually be filled in. TranslationSets from esearch are not currently parsed,
120 for instance.
122 =head1 FEEDBACK
124 =head2 Mailing Lists
126 User feedback is an integral part of the
127 evolution of this and other Bioperl modules. Send
128 your comments and suggestions preferably to one
129 of the Bioperl mailing lists. Your participation
130 is much appreciated.
132 bioperl-l@lists.open-bio.org - General discussion
133 http://www.bioperl.org/wiki/Mailing_lists - About the mailing lists
135 =head2 Reporting Bugs
137 Report bugs to the Bioperl bug tracking system to
138 help us keep track the bugs and their resolution.
139 Bug reports can be submitted via the web.
141 http://bugzilla.open-bio.org/
143 =head1 AUTHOR
145 Email cjfields at uiuc dot edu
147 =head1 APPENDIX
149 The rest of the documentation details each of the
150 object methods. Internal methods are usually
151 preceded with a _
153 =cut
155 # Let the code begin...
157 package Bio::Tools::EUtilities;
158 use strict;
159 use warnings;
161 use base qw(Bio::Root::IO Bio::Tools::EUtilities::EUtilDataI);
162 use XML::Simple;
164 =head2 Constructor methods
166 =cut
168 =head2 new
170 Title : new
171 Usage : my $parser = Bio::Tools::EUtilities->new(-file => 'my.xml',
172 -eutil => 'esearch');
173 Function : create Bio::Tools::EUtilities instance
174 Returns : new Bio::Tools::EUtilities instance
175 Args : -file/-fh - File or filehandle
176 -eutil - eutil parser to use (supports all but efetch)
177 -response - HTTP::Response object (optional)
179 =cut
183 my %DATA_MODULE = (
184 'esearch' => 'Query',
185 'egquery' => 'Query',
186 'espell' => 'Query',
187 'epost' => 'Query',
188 'elink' => 'Link',
189 'einfo' => 'Info',
190 'esummary' => 'Summary',
193 sub new {
194 my($caller,@args) = @_;
195 my $class = ref $caller || $caller;
196 if ($class =~ m{Bio::Tools::EUtilities::(\S+)}) {
197 my ($self) = $class->SUPER::new(@args);
198 $self->_initialize(@args);
199 return $self;
200 } else {
201 my %param = @args;
202 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
203 my $eutil = $param{'-eutil'} || $class->throw("Need eutil to make instance");
204 return unless( $class->_load_eutil_module( $DATA_MODULE{$eutil}) );
205 return "Bio::Tools::EUtilities::$DATA_MODULE{$eutil}"->new(-datatype => lc $DATA_MODULE{$eutil},
206 -eutil => $eutil,
207 @args);
211 sub _initialize {
212 my ($self, @args) = @_;
213 my ($response, $pobj, $type, $eutil, $cache, $lazy) =
214 $self->_rearrange([qw(RESPONSE
215 PARAMETERS
216 DATATYPE
217 EUTIL
218 CACHE_RESPONSE
219 LAZY)], @args);
220 $lazy ||= 0;
221 $cache ||= 0;
222 $self->datatype($type);
223 $self->eutil($eutil);
224 # lazy parsing only implemented for elink and esummary (where returned data
225 # can be quite long). Also, no point to parsing lazily when the data is
226 # already in memory in an HTTP::Response object, so turn it off and chunk
227 # the Response object after parsing.
228 $response && $self->response($response);
229 $pobj && $self->parameter_base($pobj);
230 $self->cache_response($cache);
231 $lazy = 0 if ($response) || ($eutil ne 'elink' && $eutil ne 'esummary');
232 # setting parser to 'lazy' mode is permanent (can't reset later)
233 $self->{'_lazy'} = $lazy;
234 $self->{'_parsed'} = 0;
239 =head1 Bio::Tools::EUtilities methods
241 =head2 cache_response
243 Title : cache_response
244 Usage : $parser->cache_response(1)
245 Function : sets flag to cache response object (off by default)
246 Returns : value eval'ing to TRUE or FALSE
247 Args : value eval'ing to TRUE or FALSE
248 Note : must be set prior to any parsing run
250 =cut
252 sub cache_response {
253 my ($self, $cache) = @_;
254 if (defined $cache) {
255 $self->{'_cache_response'} = ($cache) ? 1 : 0;
257 return $self->{'_cache_response'};
260 =head2 response
262 Title : response
263 Usage : my $response = $parser->response;
264 Function : Get/Set HTTP::Response object
265 Returns : HTTP::Response
266 Args : HTTP::Response
267 Note : to prevent object from destruction set cache_response() to TRUE
269 =cut
271 sub response {
272 my ($self, $response) = @_;
273 if ($response) {
274 $self->throw('Not an HTTP::Response object') unless (ref $response && $response->isa('HTTP::Response'));
275 $self->{'_response'} = $response;
277 return $self->{'_response'};
280 =head2 parameter_base
282 Title : parameter_base
283 Usage : my $response = $parser->parameter_base;
284 Function : Get/Set Bio::ParameterBaseI object (should be Bio::Tools::EUtilities::EUtilParameters)
285 Returns : Bio::Tools::EUtilities::EUtilParameters || undef
286 Args : (optional) Bio::Tools::EUtilities::EUtilParameters
287 Note : If this object is present, it may be used as a last resort for
288 some data values if parsed XML does not contain said values (for
289 instance, database, term, IDs, etc).
291 =cut
293 sub parameter_base {
294 my ($self, $pb) = @_;
295 if ($pb) {
296 $self->throw('Not an Bio::ParameterBaseI object') unless (ref $pb && $pb->isa('Bio::ParameterBaseI'));
297 $self->warn('Not an Bio::Tools::EUtilities::EUtilParameters object; may experience some turbulence...') unless (ref $pb && $pb->isa('Bio::Tools::EUtilities::EUtilParameters'));
298 $self->{'_parameter_base'} = $pb;
300 return $self->{'_parameter_base'};
303 =head2 data_parsed
305 Title : data_parsed
306 Usage : if ($parser->data_parsed) {...}
307 Function : returns TRUE if data has been parsed
308 Returns : value eval'ing to TRUE or FALSE
309 Args : none (set within parser)
310 Note : mainly internal method (set in case user wants to check
311 whether parser is exhausted).
313 =cut
315 sub data_parsed {
316 return shift->{'_parsed'};
319 =head2 is_lazy
321 Title : is_lazy
322 Usage : if ($parser->is_lazy) {...}
323 Function : returns TRUE if parser is set to lazy parsing mode
324 (only affects elink/esummary)
325 Returns : Boolean
326 Args : none
327 Note : Permanently set in constructor. Still highly experimental.
328 Don't stare directly at happy fun ball...
330 =cut
332 sub is_lazy {
333 return shift->{'_lazy'};
336 =head2 parse_data
338 Title : parse_data
339 Usage : $parser->parse_data
340 Function : direct call to parse data; normally implicitly called
341 Returns : none
342 Args : none
344 =cut
347 my %EUTIL_DATA = (
348 'esummary' => [qw(DocSum Item)],
349 'epost' => [],
350 'egquery' => [],
351 'einfo' => [qw(Field Link)],
352 'elink' => [qw(LinkSet LinkSetDb LinkSetDbHistory IdUrlSet
353 Id IdLinkSet ObjUrl Link LinkInfo)],
354 'espell' => [qw(Original Replaced)],
355 'esearch' => [qw(Id ErrorList WarningList)],
358 sub parse_data {
359 my $self = shift;
360 my $eutil = $self->eutil;
361 my $xs = XML::Simple->new();
362 my $response = $self->response ? $self->response :
363 $self->_fh ? $self->_fh :
364 $self->throw('No response or stream specified');
365 my $simple = ($eutil eq 'espell') ?
366 $xs->XMLin($self->_fix_espell($response), forcearray => $EUTIL_DATA{$eutil}) :
367 ($response && $response->isa("HTTP::Response")) ?
368 $xs->XMLin($response->content, forcearray => $EUTIL_DATA{$eutil}) :
369 $xs->XMLin($response, forcearray => $EUTIL_DATA{$eutil});
370 # check for errors
371 if ($simple->{ERROR}) {
372 my $error = $simple->{ERROR};
373 $self->throw("NCBI $eutil fatal error: ".$error) unless ref $error;
375 if ($simple->{InvalidIdList}) {
376 $self->warn("NCBI $eutil error: Invalid ID List".$simple->{InvalidIdList});
377 return;
379 if ($simple->{ErrorList} || $simple->{WarningList}) {
380 my @errorlist = @{ $simple->{ErrorList} } if $simple->{ErrorList};
381 my @warninglist = @{ $simple->{WarningList} } if $simple->{WarningList};
382 my ($err_warn);
383 for my $error (@errorlist) {
384 my $messages = join("\n\t",map {"$_ [".$error->{$_}.']'}
385 grep {!ref $error->{$_}} keys %$error);
386 $err_warn .= "Error : $messages";
388 for my $warn (@warninglist) {
389 my $messages = join("\n\t",map {"$_ [".$warn->{$_}.']'}
390 grep {!ref $warn->{$_}} keys %$warn);
391 $err_warn .= "Warnings : $messages";
393 chomp($err_warn);
394 $self->warn("NCBI $eutil Errors/Warnings:\n".$err_warn)
395 # don't return as some data may still be useful
397 delete $self->{'_response'} unless $self->cache_response;
398 $self->{'_parsed'} = 1;
399 $self->_add_data($simple);
402 # implemented only for elink/esummary, still experimental
404 sub parse_chunk {
405 my $self = shift;
406 my $eutil = $self->eutil;
407 my $tag = $eutil eq 'elink' ? 'LinkSet' :
408 $eutil eq 'esummary' ? 'DocSum' :
409 $self->throw("Only eutil elink/esummary use parse_chunk()");
410 my $xs = XML::Simple->new();
411 if ($self->response) {
412 $self->throw("Lazy parsing not implemented for HTTP::Response data yet");
413 delete $self->{'_response'} if !$self->cache_response && $self->data_parsed;
414 } else { # has to be a file/filehandle
415 my $fh = $self->_fh;
416 my ($chunk, $seendoc, $line);
417 CHUNK:
418 while ($line = <$fh>) {
419 next unless $seendoc || $line =~ m{^<$tag>};
420 $seendoc = 1;
421 $chunk .= $line;
422 last if $line =~ m{^</$tag>};
424 if (!defined $line) {
425 $self->{'_parsed'} = 1;
426 return;
428 $self->_add_data(
429 $xs->XMLin($chunk, forcearray => $EUTIL_DATA{$eutil}, KeepRoot => 1)
436 =head2 to_string
438 Title : to_string
439 Usage : $foo->to_string()
440 Function : converts current object to string
441 Returns : none
442 Args : (optional) simple data for text formatting
443 Note : Implemented in plugins
445 =cut
447 sub to_string {
448 my $self = shift;
449 $self->parse_data if ($self->can('parse_data') && !$self->data_parsed);
450 return sprintf("%-20s:%s\n\n", 'EUtil', $self->eutil);
453 =head2 print_all
455 Title : print_all
456 Usage : $info->print_all();
457 $info->print_all(-fh => $fh, -cb => $coderef);
458 Function : prints (dumps) all data in parser. Unless a coderef is supplied,
459 this just dumps the parser-specific to_string method to either a
460 file/fh or STDOUT
461 Returns : none
462 Args : [optional]
463 -file : file to print to
464 -fh : filehandle to print to (cannot be used concurrently with file)
465 -cb : coderef to use in place of default print method. This is
466 passed in the parser object
467 -wrap : number of columns to wrap default text output to (def = 80)
468 Notes : only applicable for einfo. If -file or -fh are not defined,
469 prints to STDOUT
471 =cut
473 sub print_all {
474 my ($self, @args) = @_;
475 $self->_print_handler(@args);
478 =head1 Bio::Tools::EUtilities::EUtilDataI methods
480 =head2 eutil
482 Title : eutil
483 Usage : $eutil->$foo->eutil
484 Function : Get/Set eutil
485 Returns : string
486 Args : string (eutil)
487 Throws : on invalid eutil
489 =cut
491 =head2 datatype
493 Title : datatype
494 Usage : $type = $foo->datatype;
495 Function : Get/Set data object type
496 Returns : string
497 Args : string
499 =cut
501 =head1 Methods useful for multiple eutils
503 =head2 get_ids
505 Title : get_ids
506 Usage : my @ids = $parser->get_ids
507 Function : returns array of requested IDs (see Notes for more specifics)
508 Returns : array
509 Args : [conditional] not required except when running elink queries against
510 multiple databases. In case of the latter, the database name is
511 optional but recommended when retrieving IDs as the ID list will
512 be globbed together. In such cases, if a db name isn't provided a
513 warning is issued as a reminder.
514 Notes : esearch : returned ID list
515 elink : returned ID list (see Args above for caveats)
516 all others : from parameter_base->id or undef
518 =cut
520 sub get_ids {
521 my ($self, $request) = @_;
522 my $eutil = $self->eutil;
523 if ($self->is_lazy) {
524 $self->warn('get_ids() not implemented when using lazy mode');
525 return;
527 $self->parse_data unless $self->data_parsed;
528 if ($eutil eq 'esearch') {
529 return $self->{'_id'} ? @{ $self->{'_id'} } : ();
530 } elsif ($eutil eq 'elink') {
531 my @ids;
532 if ($request) {
533 if (ref $request eq 'CODE') {
534 push @ids, map {$_->get_ids }
535 grep { $request->($_) } $self->get_LinkSets;
536 } else {
537 push @ids,
538 map { @{$_->[0]} }
539 grep {grep { $_ eq $request } @{$_->[1]}}
540 map {[[$_->get_ids], [$_->get_databases]]} $self->get_LinkSets;
542 } else {
543 $self->warn('Multiple database present, IDs will be globbed together')
544 if $self->get_linked_databases > 1;
545 push @ids, map {$_->get_ids } $self->get_LinkSets;
547 return @ids;
548 } elsif ($eutil eq 'esummary') {
549 unless (exists $self->{'_id'}) {
550 push @{$self->{'_id'}}, map {$_->get_id } $self->get_DocSums;
552 return @{$self->{'_id'}};
553 } elsif (my $pb = $self->parameter_base) {
554 my $ids = $pb->id;
555 return $ids ? @{$ids} : ();
556 } else {
557 return ()
561 =head2 get_database
563 Title : get_database
564 Usage : my $db = $info->get_database;
565 Function : returns single database name (eutil-compatible). This is the
566 queried database. For most eutils this is straightforward. For
567 elinks (which have 'db' and 'dbfrom') this is db/dbto, for egquery,
568 it is the first db in the list (you probably want get_databases
569 instead)
570 Returns : string
571 Args : none
572 Notes : egquery : first db in the query (you probably want get_databases)
573 einfo : the queried database
574 espell : the queried database
575 all others : from parameter_base->db or undef
577 =cut
579 sub get_database {
580 return ($_[0]->get_databases)[0];
583 =head2 get_db (alias for get_database)
585 =cut
587 sub get_db {
588 return shift->get_database;
591 =head2 get_databases
593 Title : get_databases
594 Usage : my @dbs = $parser->get_databases
595 Function : returns list of databases
596 Returns : array of strings
597 Args : none
598 Notes : This is guaranteed to return a list of databases. For a single
599 database use the convenience method get_db/get_database
601 egquery : list of all databases in the query
602 einfo : the queried database, or the available databases
603 espell : the queried database
604 elink : collected from each LinkSet
605 all others : from parameter_base->db or undef
607 =cut
609 sub get_databases {
610 my ($self, $db) = @_;
611 $self->parse_data unless $self->data_parsed;
612 my $eutil = $self->eutil;
613 my @dbs;
614 if ($eutil eq 'einfo' || $eutil eq 'espell') {
615 @dbs = $self->{'_dbname'} ||
616 $self->{'_database'} ||
617 $self->get_available_databases;
618 } elsif ($eutil eq 'egquery') {
619 @dbs = map {$_->get_database} ($self->get_GlobalQueries);
620 } elsif ($eutil eq 'elink') {
621 # only unique dbs
622 my %tmp;
623 @dbs = sort grep {!$tmp{$_}++}
624 map {($_->get_databases)} $self->get_LinkSets;
625 } elsif ($self->parameter_base) {
626 if ($self->parameter_base->eutil eq 'elink') {
627 @dbs = $self->parameter_base->dbfrom;
628 } else {
629 @dbs = $self->parameter_base->db;
632 return @dbs;
635 =head2 get_dbs (alias for get_databases)
637 =cut
639 sub get_dbs {
640 return shift->get_databases;
643 =head2 next_History
645 Title : next_History
646 Usage : while (my $hist=$parser->next_History) {...}
647 Function : returns next HistoryI (if present).
648 Returns : Bio::Tools::EUtilities::HistoryI (Cookie or LinkSet)
649 Args : none
650 Note : esearch, epost, and elink are all capable of returning data which
651 indicates search results (in the form of UIDs) is stored on the
652 remote server. Access to this data is wrapped up in simple interface
653 (HistoryI), which is implemented in two classes:
654 Bio::DB::EUtilities::History (the simplest) and
655 Bio::DB::EUtilities::LinkSet. In general, calls to epost and esearch
656 will only return a single HistoryI object (formerly known as a
657 Cookie), but calls to elink can generate many depending on the
658 number of IDs, the correspondence, etc. Hence this iterator, which
659 allows one to retrieve said data one piece at a time.
661 =cut
663 sub next_History {
664 my $self = shift;
665 $self->parse_data unless $self->data_parsed;
666 $self->{'_histories_it'} = $self->generate_iterator('histories')
667 if (!exists $self->{'_histories_it'});
668 my $hist = $self->{'_histories_it'}->();
671 =head2 next_cookie (alias for next_History)
673 =cut
675 sub next_cookie {
676 return shift->next_History;
679 =head2 get_Histories
681 Title : get_Histories
682 Usage : my @hists = $parser->get_Histories
683 Function : returns list of HistoryI objects.
684 Returns : list of Bio::Tools::EUtilities::HistoryI (History or LinkSet)
685 Args : none
687 =cut
689 sub get_Histories {
690 my $self = shift;
691 $self->parse_data unless $self->data_parsed;
692 ref $self->{'_histories'} ? return @{ $self->{'_histories'} } : return ();
695 =head1 Query-related methods
697 =head2 get_count
699 Title : get_count
700 Usage : my $ct = $parser->get_count
701 Function : returns the count (hits for a search)
702 Returns : integer
703 Args : [CONDITIONAL] string with database name - used to retrieve
704 count from specific database when using egquery
705 Notes : egquery : count for specified database (specified above)
706 esearch : count for last search
707 all others : undef
709 =cut
711 sub get_count {
712 my ($self, $db) = @_;
713 $self->parse_data unless $self->data_parsed;
714 # egquery
715 if ($self->datatype eq 'multidbquery') {
716 if (!$db) {
717 $self->warn('Must specify database to get count from');
718 return;
720 my ($gq) = grep {$_->get_database eq $db} $self->get_GlobalQueries;
721 $gq && return $gq->get_count;
722 $self->warn("Unknown database $db");
723 return;
724 } else {
725 return $self->{'_count'};
729 =head2 get_term
731 Title : get_term
732 Usage : $st = $qd->get_term;
733 Function : retrieve the term for the global search
734 Returns : string
735 Args : none
736 Notes : egquery : search term
737 espell : search term
738 esearch : from parameter_base->term or undef
739 all others : undef
741 =cut
743 sub get_term {
744 my ($self, @args) = @_;
745 $self->parse_data unless $self->data_parsed;
746 $self->{'_term'} ? $self->{'_term'} :
747 $self->{'_query'} ? $self->{'_query'} :
748 $self->parameter_base ? $self->parameter_base->term :
749 return;
752 =head2 get_translation_from
754 Title : get_translation_from
755 Usage : $string = $qd->get_translation_from();
756 Function: portion of the original query replaced with translated_to()
757 Returns : string
758 Args : none
759 Note : only applicable for esearch
761 =cut
763 sub get_translation_from {
764 my $self = shift;
765 $self->parse_data unless $self->data_parsed;
766 return $self->{'_translation'}->{'From'};
769 =head2 get_translation_to
771 Title : get_translation_to
772 Usage : $string = $qd->get_translation_to();
773 Function: replaced string used in place of the original query term in translation_from()
774 Returns : string
775 Args : none
776 Note : only applicable for esearch
778 =cut
780 sub get_translation_to {
781 my $self = shift;
782 $self->parse_data unless $self->data_parsed;
783 return $self->{'_translation'}->{'To'};
786 =head2 get_retstart
788 Title : get_retstart
789 Usage : $start = $qd->get_retstart();
790 Function : retstart setting for the query (either set or NCBI default)
791 Returns : Integer
792 Args : none
793 Notes : esearch : retstart
794 esummary : retstart
795 all others : from parameter_base->retstart or undef
797 =cut
799 sub get_retstart {
800 my $self = shift;
801 $self->parse_data unless $self->data_parsed;
802 return $self->{'_retstart'};
805 =head2 get_retmax
807 Title : get_retmax
808 Usage : $max = $qd->get_retmax();
809 Function : retmax setting for the query (either set or NCBI default)
810 Returns : Integer
811 Args : none
812 Notes : esearch : retmax
813 esummary : retmax
814 all others : from parameter_base->retmax or undef
816 =cut
818 sub get_retmax {
819 my $self = shift;
820 $self->parse_data unless $self->data_parsed;
821 return $self->{'_retmax'};
824 =head2 get_query_translation
826 Title : get_query_translation
827 Usage : $string = $qd->get_query_translation();
828 Function: returns the translated query used for the search (if any)
829 Returns : string
830 Args : none
831 Notes : only applicable for esearch. This is the actual term used for
832 esearch.
834 =cut
836 sub get_query_translation {
837 my $self = shift;
838 $self->parse_data unless $self->data_parsed;
839 return $self->{'_querytranslation'};
842 =head2 get_corrected_query
844 Title : get_corrected_query
845 Usage : my $cor = $eutil->get_corrected_query;
846 Function : retrieves the corrected query when using espell
847 Returns : string
848 Args : none
849 Notes : only applicable for espell.
851 =cut
853 sub get_corrected_query {
854 my $self = shift;
855 $self->parse_data unless $self->data_parsed;
856 return $self->{'_correctedquery'};
859 =head2 get_replaced_terms
861 Title : get_replaced_terms
862 Usage : my $term = $eutil->get_replaced_term
863 Function : returns array of strings replaced in the query
864 Returns : string
865 Args : none
866 Notes : only applicable for espell
868 =cut
870 sub get_replaced_terms {
871 my $self = shift;
872 $self->parse_data unless $self->data_parsed;
873 if ($self->{'_spelledquery'} && $self->{'_spelledquery'}->{Replaced}) {
874 ref $self->{'_spelledquery'}->{Replaced} ?
875 return @{ $self->{'_spelledquery'}->{Replaced} } : return ();
879 =head2 next_GlobalQuery
881 Title : next_GlobalQuery
882 Usage : while (my $query = $eutil->next_GlobalQuery) {...}
883 Function : iterates through the queries returned from an egquery search
884 Returns : GlobalQuery object
885 Args : none
886 Notes : only applicable for egquery
888 =cut
890 sub next_GlobalQuery {
891 my $self = shift;
892 $self->parse_data unless $self->data_parsed;
893 $self->{'_globalqueries_it'} = $self->generate_iterator('globalqueries')
894 if (!exists $self->{'_globalqueries_it'});
895 $self->{'_globalqueries_it'}->();
898 =head2 get_GlobalQueries
900 Title : get_GlobalQueries
901 Usage : @queries = $eutil->get_GlobalQueries
902 Function : returns list of GlobalQuery objects
903 Returns : array of GlobalQuery objects
904 Args : none
905 Notes : only applicable for egquery
907 =cut
909 sub get_GlobalQueries {
910 my $self = shift;
911 $self->parse_data unless $self->data_parsed;
912 ref $self->{'_globalqueries'} ? return @{ $self->{'_globalqueries'} } : return ();
915 =head2 print_GlobalQueries
917 Title : print_GlobalQueries
918 Usage : $docsum->print_GlobalQueries();
919 $docsum->print_GlobalQueries(-fh => $fh, -callback => $coderef);
920 Function : prints item data for all global queries. The default printing
921 method is each item per DocSum is printed with relevant values if
922 present in a simple table using Text::Wrap.
923 Returns : none
924 Args : [optional]
925 -file : file to print to
926 -fh : filehandle to print to (cannot be used concurrently with file)
927 -cb : coderef to use in place of default print method. This is passed
928 in a GlobalQuery object;
929 -wrap : number of columns to wrap default text output to (def = 80)
930 Notes : only applicable for esummary. If -file or -fh are not defined,
931 prints to STDOUT
933 =cut
935 sub print_GlobalQueries {
936 my ($self, @args) = @_;
937 $self->_print_handler(@args, -type => 'GlobalQuery');
940 =head1 Summary-related methods
942 =head2 next_DocSum
944 Title : next_DocSum
945 Usage : while (my $ds = $esum->next_DocSum) {...}
946 Function : iterate through DocSum instances
947 Returns : single Bio::Tools::EUtilities::Summary::DocSum
948 Args : none yet
949 Notes : only applicable for esummary
951 =cut
953 sub next_DocSum {
954 my $self = shift;
955 if(!$self->data_parsed && !$self->is_lazy) {
956 $self->parse_data;
958 $self->{'_docsums_it'} = $self->generate_iterator('docsums')
959 if (!exists $self->{'_docsums_it'});
960 $self->{'_docsums_it'}->();
963 =head2 get_DocSums
965 Title : get_DocSums
966 Usage : my @docsums = $esum->get_DocSums
967 Function : retrieve a list of DocSum instances
968 Returns : array of Bio::Tools::EUtilities::Summary::DocSum
969 Args : none
970 Notes : only applicable for esummary
972 =cut
974 sub get_DocSums {
975 my $self = shift;
976 if ($self->is_lazy) {
977 $self->warn('get_DocSums() not implemented when using lazy mode');
978 return ();
980 $self->parse_data unless $self->data_parsed;
981 return ref $self->{'_docsums'} ? @{ $self->{'_docsums'} } : return ();
984 =head2 print_DocSums
986 Title : print_DocSums
987 Usage : $docsum->print_DocSums();
988 $docsum->print_DocSums(-fh => $fh, -cb => $coderef);
989 Function : prints item data for all docsums. The default data is generated
990 via DocSum::to_string
991 Returns : none
992 Args : [optional]
993 -file : file to print to
994 -fh : filehandle to print to (cannot be used concurrently with file)
995 -cb : coderef to use in place of default print method. This is passed
996 in a DocSum object
997 -wrap : number of columns to wrap default text output to (def = 80)
998 Notes : only applicable for esummary. If -file or -fh are not defined,
999 prints to STDOUT
1001 =cut
1003 sub print_DocSums {
1004 my ($self, @args) = @_;
1005 $self->_print_handler(@args, -type => 'DocSum');
1008 =head1 Info-related methods
1010 =head2 get_available_databases
1012 Title : get_available_databases
1013 Usage : my @dbs = $info->get_available_databases
1014 Function : returns list of available eutil-compatible database names
1015 Returns : Array of strings
1016 Args : none
1017 Notes : only applicable for einfo.
1019 =cut
1021 sub get_available_databases {
1022 my $self = shift;
1023 $self->parse_data unless $self->data_parsed;
1024 ($self->{'_available_databases'}) ?
1025 return @{($self->{'_available_databases'})} :
1026 return ();
1029 =head2 get_record_count
1031 Title : get_record_count
1032 Usage : my $ct = $eutil->get_record_count;
1033 Function : returns database record count
1034 Returns : integer
1035 Args : none
1036 Notes : only applicable for einfo.
1038 =cut
1040 sub get_record_count {
1041 my $self = shift;
1042 $self->parse_data unless $self->data_parsed;
1043 return $self->{'_count'}
1046 =head2 get_last_update
1048 Title : get_last_update
1049 Usage : my $time = $info->get_last_update;
1050 Function : returns string containing time/date stamp for last database update
1051 Returns : integer
1052 Args : none
1053 Notes : only applicable for einfo.
1055 =cut
1057 sub get_last_update {
1058 my $self = shift;
1059 $self->parse_data unless $self->data_parsed;
1060 return $self->{'_lastupdate'}
1063 =head2 get_menu_name
1065 Title : get_menu_name
1066 Usage : my $nm = $info->get_menu_name;
1067 Function : returns string of database menu name
1068 Returns : string
1069 Args : none
1070 Notes : only applicable for einfo.
1072 =cut
1074 sub get_menu_name {
1075 my $self = shift;
1076 $self->parse_data unless $self->data_parsed;
1077 exists $self->{'_menuname'} ? return $self->{'_menuname'} :
1078 exists $self->{'_menu'} ? return $self->{'_menu'} :
1079 return;
1082 =head2 get_description
1084 Title : get_description
1085 Usage : my $desc = $info->get_description;
1086 Function : returns database description
1087 Returns : string
1088 Args : none
1089 Notes : only applicable for einfo.
1091 =cut
1093 sub get_description {
1094 my $self = shift;
1095 $self->parse_data unless $self->data_parsed;
1096 return $self->{'_description'};
1099 =head2 next_FieldInfo
1101 Title : next_FieldInfo
1102 Usage : while (my $field = $info->next_FieldInfo) {...}
1103 Function : iterate through FieldInfo objects
1104 Returns : Field object
1105 Args : none
1106 Notes : only applicable for einfo. Uses callback() for filtering if defined
1107 for 'fields'
1109 =cut
1111 sub next_FieldInfo {
1112 my $self = shift;
1113 $self->parse_data unless $self->data_parsed;
1114 $self->{'_fieldinfo_it'} = $self->generate_iterator('fieldinfo')
1115 if (!exists $self->{'_fieldinfo_it'});
1116 $self->{'_fieldinfo_it'}->();
1119 =head2 get_FieldInfo
1121 Title : get_FieldInfo
1122 Usage : my @fields = $info->get_FieldInfo;
1123 Function : returns list of FieldInfo objects
1124 Returns : array (FieldInfo objects)
1125 Args : none
1126 Notes : only applicable for einfo.
1128 =cut
1130 sub get_FieldInfo {
1131 my $self = shift;
1132 $self->parse_data unless $self->data_parsed;
1133 return ref $self->{'_fieldinfo'} ? @{ $self->{'_fieldinfo'} } : return ();
1136 *get_FieldInfos = \&get_FieldInfo;
1138 =head2 next_LinkInfo
1140 Title : next_LinkInfo
1141 Usage : while (my $link = $info->next_LinkInfo) {...}
1142 Function : iterate through LinkInfo objects
1143 Returns : LinkInfo object
1144 Args : none
1145 Notes : only applicable for einfo. Uses callback() for filtering if defined
1146 for 'linkinfo'
1148 =cut
1150 sub next_LinkInfo {
1151 my $self = shift;
1152 $self->parse_data unless $self->data_parsed;
1153 $self->{'_linkinfo_it'} = $self->generate_iterator('linkinfo')
1154 if (!exists $self->{'_linkinfo_it'});
1155 $self->{'_linkinfo_it'}->();
1158 =head2 get_LinkInfo
1160 Title : get_LinkInfo
1161 Usage : my @links = $info->get_LinkInfo;
1162 Function : returns list of LinkInfo objects
1163 Returns : array (LinkInfo objects)
1164 Args : none
1165 Notes : only applicable for einfo.
1167 =cut
1169 sub get_LinkInfo {
1170 my $self = shift;
1171 $self->parse_data unless $self->data_parsed;
1172 return ref $self->{'_linkinfo'} ? @{ $self->{'_linkinfo'} } : return ();
1175 *get_LinkInfos = \&get_LinkInfo;
1177 =head2 print_FieldInfo
1179 Title : print_FieldInfo
1180 Usage : $info->print_FieldInfo();
1181 $info->print_FieldInfo(-fh => $fh, -cb => $coderef);
1182 Function : prints link data for each FieldInfo object. The default is generated
1183 via FieldInfo::to_string
1184 Returns : none
1185 Args : [optional]
1186 -file : file to print to
1187 -fh : filehandle to print to (cannot be used concurrently with file)
1188 -cb : coderef to use in place of default print method. This is
1189 passed in a FieldInfo object
1190 -wrap : number of columns to wrap default text output to (def = 80)
1191 Notes : only applicable for einfo. If -file or -fh are not defined,
1192 prints to STDOUT
1194 =cut
1196 sub print_FieldInfo {
1197 my ($self, @args) = @_;
1198 $self->_print_handler(@args, -type => 'FieldInfo');
1201 =head2 print_LinkInfo
1203 Title : print_LinkInfo
1204 Usage : $info->print_LinkInfo();
1205 $info->print_LinkInfo(-fh => $fh, -cb => $coderef);
1206 Function : prints link data for each LinkInfo object. The default is generated
1207 via LinkInfo::to_string
1208 Returns : none
1209 Args : [optional]
1210 -file : file to print to
1211 -fh : filehandle to print to (cannot be used concurrently with file)
1212 -cb : coderef to use in place of default print method. This is passed
1213 in a LinkInfo object
1214 -wrap : number of columns to wrap default text output to (def = 80)
1215 Notes : only applicable for einfo. If -file or -fh are not defined,
1216 prints to STDOUT
1218 =cut
1220 sub print_LinkInfo {
1221 my ($self, @args) = @_;
1222 $self->_print_handler(@args, -type => 'LinkInfo');
1225 =head1 Bio::Tools::EUtilities::Link-related methods
1227 =head2 next_LinkSet
1229 Title : next_LinkSet
1230 Usage : while (my $ls = $eutil->next_LinkSet {...}
1231 Function : iterate through LinkSet objects
1232 Returns : LinkSet object
1233 Args : none
1234 Notes : only applicable for elink. Uses callback() for filtering if defined
1235 for 'linksets'
1237 =cut
1239 sub next_LinkSet {
1240 my $self = shift;
1241 #$self->parse_data unless $self->data_parsed;
1242 if(!$self->data_parsed && !$self->is_lazy) {
1243 $self->parse_data;
1245 $self->{'_linksets_it'} = $self->generate_iterator('linksets')
1246 if (!exists $self->{'_linksets_it'});
1247 $self->{'_linksets_it'}->();
1250 =head2 get_LinkSets
1252 Title : get_LinkSets
1253 Usage : my @links = $info->get_LinkSets;
1254 Function : returns list of LinkSets objects
1255 Returns : array (LinkSet objects)
1256 Args : none
1257 Notes : only applicable for elink.
1259 =cut
1261 # add support for retrieval of data if lazy parsing is enacted
1263 sub get_LinkSets {
1264 my $self = shift;
1265 if ($self->is_lazy) {
1266 $self->warn('get_LinkSets() not implemented when using lazy mode');
1267 return ();
1269 $self->parse_data unless $self->data_parsed;
1270 return ref $self->{'_linksets'} ? @{ $self->{'_linksets'} } : return ();
1273 =head2 print_LinkSets
1275 Title : print_LinkSets
1276 Usage : $info->print_LinkSets();
1277 $info->print_LinkSets(-fh => $fh, -cb => $coderef);
1278 Function : prints link data for each LinkSet object. The default is generated
1279 via LinkSet::to_string
1280 Returns : none
1281 Args : [optional]
1282 -file : file to print to
1283 -fh : filehandle to print to (cannot be used concurrently with file)
1284 -cb : coderef to use in place of default print method. This is passed
1285 in a LinkSet object
1286 -wrap : number of columns to wrap default text output to (def = 80)
1287 Notes : only applicable for einfo. If -file or -fh are not defined,
1288 prints to STDOUT
1290 =cut
1292 sub print_LinkSets {
1293 my ($self, @args) = @_;
1294 $self->_print_handler(@args, -type => 'LinkSet');
1297 =head2 get_linked_databases
1299 Title : get_linked_databases
1300 Usage : my @dbs = $eutil->get_linked_databases
1301 Function : returns list of databases linked to in linksets
1302 Returns : array of databases
1303 Args : none
1304 Notes : only applicable for elink. Now defers to get_databases.
1306 =cut
1308 sub get_linked_databases {
1309 my $self = shift;
1310 return $self->get_databases if $self->eutil eq 'elink';
1311 return ();
1314 =head1 Iterator- and callback-related methods
1316 =cut
1319 my %VALID_ITERATORS = (
1320 'globalqueries' => 'globalqueries',
1321 'fieldinfo' => 'fieldinfo',
1322 'fieldinfos' => 'fieldinfo',
1323 'linkinfo' => 'linkinfo',
1324 'linkinfos' => 'linkinfo',
1325 'linksets' => 'linksets',
1326 'docsums' => 'docsums',
1327 'histories' => 'histories'
1330 =head2 rewind
1332 Title : rewind
1333 Usage : $esum->rewind()
1334 $esum->rewind('recursive')
1335 Function : retrieve a list of DocSum instances
1336 Returns : array of Bio::Tools::EUtilities::Summary::DocSum
1337 Args : [optional] Scalar; string ('all') to reset all iterators, or string
1338 describing the specific main object iterator to reset. The following
1339 are recognized (case-insensitive):
1341 'all' - rewind all objects and also recursively resets nested object interators
1342 (such as LinkSets and DocSums).
1343 'globalqueries' - GlobalQuery objects
1344 'fieldinfo' or 'fieldinfos' - FieldInfo objects
1345 'linkinfo' or 'linkinfos' - LinkInfo objects in this layer
1346 'linksets' - LinkSet objects
1347 'docsums' - DocSum objects
1348 'histories' - HistoryI objects (Cookies, LinkSets)
1350 =cut
1352 sub rewind {
1353 my ($self, $arg) = ($_[0], lc $_[1]);
1354 my $eutil = $self->eutil;
1355 if ($self->is_lazy) {
1356 $self->warn('rewind() not implemented yet when running in lazy mode');
1357 return;
1359 $arg ||= 'all';
1360 if (exists $VALID_ITERATORS{$arg}) {
1361 delete $self->{'_'.$arg.'_it'};
1362 } elsif ($arg eq 'all') {
1363 for my $it (values %VALID_ITERATORS){
1364 delete $self->{'_'.$it.'_it'} if
1365 exists $self->{'_'.$it.'_it'};
1366 map {$_->rewind('all')} $self->get_LinkSets;
1367 map {$_->rewind('all')} $self->get_DocSums;
1372 =head2 generate_iterator
1374 Title : generate_iterator
1375 Usage : my $coderef = $esum->generate_iterator('linkinfo')
1376 Function : generates an iterator (code reference) which iterates through
1377 the relevant object indicated by the args
1378 Returns : code reference
1379 Args : [REQUIRED] Scalar; string describing the specific object to iterate.
1380 The following are currently recognized (case-insensitive):
1382 'globalqueries'
1383 'fieldinfo' or 'fieldinfos' (the latter sounds clumsy, but I alias it JIC)
1384 'linkinfo' or 'linkinfos' (the latter sounds clumsy, but I alias it JIC)
1385 'linksets'
1386 'docsums'
1387 'histories'
1389 Note : This function generates a simple coderef that one can use
1390 independently of the various next_* functions (in fact, the next_*
1391 functions use lazily created iterators generated via this method,
1392 while rewind() merely deletes them so they can be regenerated on the
1393 next call).
1395 A callback specified using callback() will be used to filter objects
1396 for any generated iterator. This behaviour is implemented for both
1397 normal and lazy iterator types and is the default. If you don't want
1398 this, make sure to reset any previously set callbacks via
1399 reset_callback() (which just deletes the code ref). Note that setting
1400 callback() also changes the behavior of the next_* functions as the
1401 iterators are generated here (as described above); this is a feature
1402 and not a bug.
1404 'Lazy' iterators are considered an experimental feature and may be
1405 modified in the future. A 'lazy' iterator, which loops through and
1406 returns objects as they are created (instead of creating all data
1407 instances up front, then iterating through) is returned if the
1408 parser is set to 'lazy' mode. This mode is only present for elink
1409 and esummary output as they are the two formats parsed which can
1410 generate potentially thousands of individual objects (note efetch
1411 isn't parsed, so isn't counted). Use of rewind() with these
1412 iterators is not supported for the time being as we can't guarantee
1413 you can rewind(), as this depends on whether the data source is
1414 seek()able and thus 'rewindable'. We will add rewind() support at a
1415 later time which will work for 'seekable' data or possibly cached
1416 objects via Storable or BDB.
1418 =cut
1420 sub generate_iterator {
1421 my ($self, $obj) = @_;
1422 if (!$obj) {
1423 $self->throw('Must provide object type to iterate');
1424 } elsif (!exists $VALID_ITERATORS{$obj}) {
1425 $self->throw("Unknown object type [$obj]");
1427 my $cb = $self->callback;
1428 if ($self->is_lazy) {
1429 my $type = $self->eutil eq 'esummary' ? '_docsums' : '_linksets';
1430 $self->{$type} = [];
1431 return sub {
1432 if (!@{$self->{$type}}) {
1433 $self->parse_chunk; # fill the queue
1435 while (my $obj = shift @{$self->{$type}}) {
1436 if ($cb) {
1437 ($cb->($obj)) ? return $obj : next;
1438 } else {
1439 return $obj;
1442 undef;
1444 } else {
1445 my $loc = '_'.$VALID_ITERATORS{$obj};
1446 my $index = $#{$self->{$loc}};
1447 my $current = 0;
1448 return sub {
1449 while ($current <= $index) {
1450 if ($cb) {
1451 if (my $d = $cb->($self->{$loc}->[$current])) {
1452 return $self->{$loc}->[$current++] }
1453 else {
1454 $current++;
1455 next;
1457 } else {
1458 return $self->{$loc}->[$current++]
1461 undef;
1468 =head2 callback
1470 Title : callback
1471 Usage : $parser->callback(sub {$_[0]->get_database eq 'protein'});
1472 Function : Get/set callback code ref used to filter returned data objects
1473 Returns : code ref if previously set
1474 Args : single argument:
1475 code ref - evaluates a passed object and returns true or false value
1476 (used in iterators)
1477 'reset' - string, resets the iterator.
1478 returns upon any other args
1480 =cut
1482 sub callback {
1483 my ($self, $cb) = @_;
1484 if ($cb) {
1485 delete $self->{'_cb'} if ($cb eq 'reset');
1486 return if ref $cb ne 'CODE';
1487 $self->{'_cb'} = $cb;
1489 return $self->{'_cb'};
1492 # Object printing methods
1495 my $DEF_HANDLER = sub {
1496 my $obj = shift;
1497 return $obj->to_string."\n";
1500 my %HANDLER = (
1501 'DocSum' => 1,
1502 'FieldInfo' => 1,
1503 'LinkInfo' => 1,
1504 'GlobalQuery' => 1,
1505 'LinkSet' => 1,
1506 'all' => 1,
1509 sub _print_handler {
1510 my $self = shift;
1511 my ($file, $fh, $cb, $wrap, $type, $all) = $self->_rearrange([qw(FILE FH CB WRAP TYPE ALL)], @_);
1512 $type ||= 'all';
1514 # default formatting delegates to_string
1515 if (!$cb) {
1516 $self->throw("Type $type not registered with print handler, exiting...")
1517 if !exists($HANDLER{$type});
1518 $cb = $DEF_HANDLER;
1519 } else {
1520 $self->throw("Callback must be a code reference") if ref $cb ne 'CODE';
1523 $file ||= $fh;
1524 $self->throw("Have defined both file and filehandle; only use one!") if $file && $fh;
1525 my $io = ($file) ? Bio::Root::IO->new(-input => $file, -flush => 1) :
1526 Bio::Root::IO->new(-flush => 1); # defaults to STDOUT
1528 if ($type eq 'all') {
1529 my $string = $cb->($self);
1530 $io->_print($string) if $string;
1531 } else {
1532 # set up iterator
1533 my $it = "next_$type";
1534 $self->throw("Unknown iterator method $it") unless $self->can($it);
1535 while (my $obj = $self->$it) {
1536 my $string = $cb->($obj);
1537 $io->_print($string) if $string;
1540 $io->close;
1544 # Private methods
1546 sub _seekable {
1547 return shift->{'_seekable'}
1550 # fixes odd bad XML issue espell data (still present 6-24-07)
1552 sub _fix_espell {
1553 my ($self, $response) = @_;
1554 my $temp;
1555 my $type = ref($response);
1556 if ($type eq 'GLOB') {
1557 $temp .= $_ for <$response>;
1558 } elsif ($type eq 'HTTP::Response') {
1559 $temp = $response->content;
1560 } else {
1561 $self->throw("Unrecognized ref type $type");
1563 if ($temp =~ m{^<html>}) {
1564 $self->throw("NCBI espell nonrecoverable error: HTML content returned")
1566 $temp =~ s{<ERROR>(.*?)<ERROR>}{<ERROR>$1</ERROR>};
1567 return $temp;
1570 sub _load_eutil_module {
1571 my ($self, $class) = @_;
1572 my $ok;
1573 my $module = "Bio::Tools::EUtilities::" . $class;
1575 eval {
1576 $ok = $self->_load_module($module);
1578 if ( $@ ) {
1579 print STDERR <<END;
1580 $self: data module $module cannot be found
1581 Exception $@
1582 For more information about the EUtilities system please see the EUtilities docs.
1586 return $ok;