[bug 3148] switch default to "expasy" until we can work out REST service interface
[bioperl-live.git] / Bio / Biblio / IO / pubmedxml.pm
blobe898ecfd210feefc4493a2d72c993b19a8d67ec6
2 # BioPerl module Bio::Biblio::IO::pubmedxml.pm
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Martin Senger <senger@ebi.ac.uk>
7 # For copyright and disclaimer see below.
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Biblio::IO::pubmedxml - A converter of XML files with PUBMED citations
15 =head1 SYNOPSIS
17 Do not use this object directly, it is recommended to access it and use
18 it through the I<Bio::Biblio::IO> module:
20 use Bio::Biblio::IO;
21 my $io = Bio::Biblio::IO->new(-format => 'pubmedxml');
23 =head1 DESCRIPTION
25 This object reads bibliographic citations in XML/MEDLINE format and
26 converts them into I<Bio::Biblio::RefI> objects. It is an
27 implementation of methods defined in I<Bio::Biblio::IO>.
29 =head1 FEEDBACK
31 =head2 Mailing Lists
33 User feedback is an integral part of the evolution of this and other
34 Bioperl modules. Send your comments and suggestions preferably to
35 the Bioperl mailing list. Your participation is much appreciated.
37 bioperl-l@bioperl.org - General discussion
38 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
40 =head2 Support
42 Please direct usage questions or support issues to the mailing list:
44 I<bioperl-l@bioperl.org>
46 rather than to the module maintainer directly. Many experienced and
47 responsive experts will be able to look at the problem and quickly
48 address it. Please include a thorough description of the problem
49 with code and data examples if at all possible.
51 =head2 Reporting Bugs
53 Report bugs to the Bioperl bug tracking system to help us keep track
54 of the bugs and their resolution. Bug reports can be submitted via the
55 web:
57 http://bugzilla.open-bio.org/
59 =head1 AUTHOR
61 Martin Senger (senger@ebi.ac.uk)
63 =head1 COPYRIGHT
65 Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved.
67 This module is free software; you can redistribute it and/or modify
68 it under the same terms as Perl itself.
70 =head1 DISCLAIMER
72 This software is provided "as is" without warranty of any kind.
74 =head1 APPENDIX
76 The main documentation details are to be found in
77 L<Bio::Biblio::IO>.
79 Here is the rest of the object methods. Internal methods are preceded
80 with an underscore _.
82 =cut
85 # Let the code begin...
88 package Bio::Biblio::IO::pubmedxml;
89 use vars qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_AND_ADD_DATA_ELEMENT);
91 use strict;
93 use base qw(Bio::Biblio::IO::medlinexml);
# Set up this reader: store the constructor arguments on the object, load
# the requested output converter, create the XML parser and - when a
# '-callback' argument was given - start parsing immediately.
#
# Arguments: the key/value list given to the constructor (e.g. -result,
#            -callback). Keys are case-insensitive; '-Key' is stored in
#            the object as '_key'.
sub _initialize {
    my ($self, @args) = @_;

    # copy all @args into this object (overwriting what may already be
    # there) - changing '-key' into '_key', and making keys lowercase
    my %param = @args;
    foreach my $key (keys %param) {
        (my $attribute = lc $key) =~ s/^-/_/;
        $self->{$attribute} = $param{$key};
    }

    # find the format for output - and put it into a global $Convert
    # because it will be used by the event handlers who know nothing
    # about this object
    my $result = lc( $self->{'_result'} || 'pubmed2ref' );

    # a special case is 'raw' when no converting module is loaded
    # and citations will be returned as a hashtable (the one which
    # is created during parsing XML file/stream)
    if ($result ne 'raw') {

        # load module with output converter - as defined in $result
        if (defined Bio::Biblio::IO::_load_format_module($result)) {
            $Bio::Biblio::IO::medlinexml::Convert =
                "Bio::Biblio::IO::$result"->new(@args);
        }
    }

    # create an instance of the XML parser
    # (unless it is already there...)
    $self->{'_xml_parser'} ||= XML::Parser->new(
        Handlers => {
            Init  => \&Bio::Biblio::IO::medlinexml::handle_doc_start,
            Start => \&handle_start,
            End   => \&handle_end,
            Char  => \&Bio::Biblio::IO::medlinexml::handle_char,
            Final => \&Bio::Biblio::IO::medlinexml::handle_doc_end,
        },
    );

    # if there is an argument '-callback' then start parsing at once -
    # the registered event handlers will use 'callback' to report
    # back after each citation
    #
    # The callback must also be remembered in a global variable because
    # the event handler subroutines know nothing about this object. That
    # global has to be named by its full package name, the same way as
    # $Convert above: 'SUPER::' is resolved only in method calls, so
    # '$SUPER::Callback' would be an unrelated variable in a package
    # literally called 'SUPER'.
    # NOTE(review): assumes the parent module keeps its callback in the
    # package variable $Callback, next to $Convert - confirm there.
    my $callback = $self->{'_callback'};
    $Bio::Biblio::IO::medlinexml::Callback = $callback;
    $self->_parse if $callback;
}
149 # ---------------------------------------------------------------------
151 # Here are the event handlers (they do the real job!)
153 # Note that these methods do not know anything about the object they
154 # are part of - they are called as subroutines, not as methods.
155 # It also means that they need to use global variables to store and
156 # exchange intermediate results.
158 # ---------------------------------------------------------------------
# Lookup tables used by handle_start() and handle_end() below. Only the
# presence of a key matters in the first three tables.

# Elements for which handle_start() pushes an empty string onto the
# PCDATA stack.
our %PCDATA_NAMES = map { $_ => 1 } qw(
    PublicationStatus
    ProviderId
    ArticleId
    URL
);

# Elements for which handle_start() pushes a new, empty hash onto the
# object stack.
our %SIMPLE_TREATMENT = map { $_ => 1 } qw(
    History
    PubMedArticle
    PubmedArticle
    PubmedData
);

# Elements for which handle_end() calls _data2obj() with the element
# name (its first letter changed to lowercase) as the key.
our %POP_DATA_AND_PEEK_OBJ = map { $_ => 1 } qw(
    Year
    Month
    Day
    Hour
    Minute
    Second
    ProviderId
    PublicationStatus
);

# Elements for which handle_end() pops the object stack and hands the
# popped object to _add_element() under the key given here.
our %POP_AND_ADD_DATA_ELEMENT = (
    'PubMedPubDate' => 'pubDates',
    'History'       => 'histories',
);
198 =head2 VERSION and Revision
200 Usage : print $Bio::Biblio::IO::pubmedxml::VERSION;
201 print $Bio::Biblio::IO::pubmedxml::Revision;
203 =cut
# Event handler for the start of an XML element. It deals with the
# elements known to this module and passes every other element on to
# Bio::Biblio::IO::medlinexml::handle_start.
#
# Called as a plain subroutine (not as a method) with the expat object,
# the element name and the element's attributes as name/value pairs.
sub handle_start {
    my ($expat, $e, %attrs) = @_;
    # Bio::Biblio::IO::medlinexml::_debug_object_stack ("START", $e);

    #
    # The #PCDATA elements which have an attribute list come first -
    # they get an entry on the object stack here _and_ (being listed in
    # %PCDATA_NAMES) an entry on the PCDATA stack further below.
    #
    if ($e eq 'ArticleId') {
        # a missing IdType attribute is recorded as 'pubmed'
        my $id_type = defined $attrs{'IdType'} ? $attrs{'IdType'} : 'pubmed';
        push @Bio::Biblio::IO::medlinexml::ObjectStack, { 'idType' => $id_type };

    } elsif ($e eq 'URL') {
        # only attributes with a true value are copied
        my %url;
        foreach my $name ('type', 'lang') {
            $url{$name} = $attrs{$name} if $attrs{$name};
        }
        push @Bio::Biblio::IO::medlinexml::ObjectStack, \%url;
    }

    if (exists $PCDATA_NAMES{$e}) {
        # a #PCDATA element: open an empty entry on the PCDATA stack
        push @Bio::Biblio::IO::medlinexml::PCDataStack, '';

    } elsif (exists $SIMPLE_TREATMENT{$e}) {
        # a non-PCDATA element: open a new object on the object stack
        push @Bio::Biblio::IO::medlinexml::ObjectStack, {};

    } elsif ($e eq 'ArticleIdList') {
        # nothing to do for this element

    } elsif ($e eq 'PubMedPubDate') {
        my %date;
        $date{'pubStatus'} = $attrs{'PubStatus'} if $attrs{'PubStatus'};
        push @Bio::Biblio::IO::medlinexml::ObjectStack, \%date;

    } else {
        # not one of ours - let the parent module handle it
        Bio::Biblio::IO::medlinexml::handle_start($expat, $e, %attrs);
    }
}
# Event handler for the end of an XML element. It deals with the
# elements known to this module and passes every other element on to
# Bio::Biblio::IO::medlinexml::handle_end.
#
# Called as a plain subroutine (not as a method) with the expat object
# and the element name.
sub handle_end {
    my ($expat, $e) = @_;

    #
    # Elements which are both PCDATA (so they are on the PCDATA stack)
    # and have an attribute list (so they also have an object of their
    # own on the object stack). For each of them: the key used with
    # _data2obj() and the key used with _add_element().
    #
    my %attributed_pcdata = (
        'ArticleId' => [ 'id',  'pubmedArticleIds' ],
        'URL'       => [ 'URL', 'pubmedURLs' ],
    );
    if (my $keys = $attributed_pcdata{$e}) {
        my ($data_key, $list_key) = @$keys;
        Bio::Biblio::IO::medlinexml::_data2obj($data_key);
        my $finished = pop @Bio::Biblio::IO::medlinexml::ObjectStack;
        Bio::Biblio::IO::medlinexml::_add_element($list_key, $finished);
        # Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e);
        return;
    }

    #
    # both object and pcdata stacks elements mixed here together
    #
    if (exists $POP_DATA_AND_PEEK_OBJ{$e}) {
        # the key is the element name starting with a lowercase letter
        Bio::Biblio::IO::medlinexml::_data2obj(lcfirst $e);

    } elsif (exists $POP_AND_ADD_DATA_ELEMENT{$e}) {
        my $finished = pop @Bio::Biblio::IO::medlinexml::ObjectStack;
        Bio::Biblio::IO::medlinexml::_add_element($POP_AND_ADD_DATA_ELEMENT{$e}, $finished);

    } elsif ($e eq 'MedlineCitation' || $e eq 'NCBIArticle') {
        Bio::Biblio::IO::medlinexml::_obj2obj('Citation');

    } elsif ($e eq 'PubmedData') {
        Bio::Biblio::IO::medlinexml::_obj2obj('PubmedData');

    } elsif ($e eq 'PubMedArticle' || $e eq 'PubmedArticle') {
        # Here we finally have the whole citation ready.
        my $citation = pop @Bio::Biblio::IO::medlinexml::ObjectStack;
        Bio::Biblio::IO::medlinexml::_process_citation($citation);

    } else {
        # not one of ours - let the parent module handle it
        Bio::Biblio::IO::medlinexml::handle_end($expat, $e);
    }

    # Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e);
}
312 __END__