2 # BioPerl module Bio::Biblio::IO::pubmedxml.pm
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Martin Senger <senger@ebi.ac.uk>
7 # For copyright and disclaimer see below.
9 # POD documentation - main docs before the code
13 Bio::Biblio::IO::pubmedxml - A converter of XML files with PUBMED citations
17 Do not use this object directly, it is recommended to access it and use
18 it through the I<Bio::Biblio::IO> module:
21 my $io = Bio::Biblio::IO->new(-format => 'pubmedxml');
25 This object reads bibliographic citations in XML/MEDLINE format and
26 converts them into I<Bio::Biblio::RefI> objects. It is an
27 implementation of methods defined in I<Bio::Biblio::IO>.
33 User feedback is an integral part of the evolution of this and other
34 Bioperl modules. Send your comments and suggestions preferably to
35 the Bioperl mailing list. Your participation is much appreciated.
37 bioperl-l@bioperl.org - General discussion
38 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42 Please direct usage questions or support issues to the mailing list:
44 I<bioperl-l@bioperl.org>
46 rather than to the module maintainer directly. Many experienced and
47 responsive experts will be able to look at the problem and quickly
48 address it. Please include a thorough description of the problem
49 with code and data examples if at all possible.
53 Report bugs to the Bioperl bug tracking system to help us keep track
54 of the bugs and their resolution. Bug reports can be submitted via the web:
57 http://bugzilla.open-bio.org/
61 Martin Senger (senger@ebi.ac.uk)
65 Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved.
67 This module is free software; you can redistribute it and/or modify
68 it under the same terms as Perl itself.
72 This software is provided "as is" without warranty of any kind.
76 The main documentation details are to be found in L<Bio::Biblio::IO>.
79 Here is the rest of the object methods. Internal methods are preceded with an underscore (_).
85 # Let the code begin...
88 package Bio
::Biblio
::IO
::pubmedxml
;
89 use vars
qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_AND_ADD_DATA_ELEMENT);
93 use base qw(Bio::Biblio::IO::medlinexml);
97 my ($self, @args) = @_;
99 # make a hashtable from @args
101 @param { map { lc $_ } keys %param } = values %param; # lowercase keys
103 # copy all @args into this object (overwriting what may already be
104 # there) - changing '-key' into '_key', and making keys lowercase
106 foreach my $key (keys %param) {
107 ($new_key = $key) =~ s/^-/_/;
108 $self->{ lc $new_key } = $param { $key };
111 # find the format for output - and put it into a global $Convert
112 # because it will be used by the event handler who knows nothing about this object
114 my $result = $self->{'_result'} || 'pubmed2ref';
115 $result = "\L$result"; # normalize capitalization to lower case
117 # a special case is 'raw' when no converting module is loaded
118 # and citations will be returned as a hashtable (the one which
119 # is created during parsing XML file/stream)
120 unless ($result eq 'raw') {
122 # load module with output converter - as defined in $result
123 if (defined &Bio
::Biblio
::IO
::_load_format_module
($result)) {
124 $Bio::Biblio
::IO
::medlinexml
::Convert
= "Bio::Biblio::IO::$result"->new (@args);
128 # create an instance of the XML parser
129 # (unless it is already there...)
130 $self->{'_xml_parser'} = new XML
::Parser
(Handlers
=> {Init
=> \
&Bio
::Biblio
::IO
::medlinexml
::handle_doc_start
,
131 Start
=> \
&handle_start
,
133 Char
=> \
&Bio
::Biblio
::IO
::medlinexml
::handle_char
,
134 Final
=> \
&Bio
::Biblio
::IO
::medlinexml
::handle_doc_end
})
135 unless $self->{'_xml_parser'};
137 # if there is an argument '-callback' then start parsing at once -
138 # the registered event handlers will use 'callback' to report
139 # back after each citation
141 # we need to remember this situation also in a global variable
142 # because the event handler subroutines know nothing about this
143 # object (unfortunately)
144 if ($SUPER::Callback
= $self->{'_callback'}) {
149 # ---------------------------------------------------------------------
151 # Here are the event handlers (they do the real job!)
153 # Note that these methods do not know anything about the object they
154 # are part of - they are called as subroutines, not as methods.
155 # It also means that they need to use global variables to store and
156 # exchange intermediate results.
158 # ---------------------------------------------------------------------
161 # This is a list of #PCDATA elements.
165 'PublicationStatus' => 1,
174 'PubMedArticle' => 1,
175 'PubmedArticle' => 1,
179 %POP_DATA_AND_PEEK_OBJ =
188 'PublicationStatus' => 1,
191 %POP_AND_ADD_DATA_ELEMENT =
193 'PubMedPubDate' => 'pubDates',
194 'History' => 'histories',
198 =head2 VERSION and Revision
200 Usage : print $Bio::Biblio::IO::pubmedxml::VERSION;
201 print $Bio::Biblio::IO::pubmedxml::Revision;
207 my ($expat, $e, %attrs) = @_;
208 # &Bio::Biblio::IO::medlinexml::_debug_object_stack ("START", $e);
211 # The #PCDATA elements which have an attribute list must
212 # be first here - because for them I create entries both on
213 # the @PCDataStack _and_ on @ObjectStack.
215 if ($e eq 'ArticleId') {
217 $p{'idType'} = (defined $attrs{'IdType'} ?
$attrs{'IdType'} : 'pubmed');
218 push (@Bio::Biblio
::IO
::medlinexml
::ObjectStack
, \
%p);
223 $p{'type'} = $attrs{'type'} if $attrs{'type'};
224 $p{'lang'} = $attrs{'lang'} if $attrs{'lang'};
225 push (@Bio::Biblio
::IO
::medlinexml
::ObjectStack
, \
%p);
229 # Then we have #PCDATA elements without an attribute list.
230 # For them I create an entry on @PCDataStack.
232 if (exists $PCDATA_NAMES{$e}) {
233 push (@Bio::Biblio
::IO
::medlinexml
::PCDataStack
, '');
236 # And finally, all non-PCDATA elements go to the objectStack
238 } elsif (exists $SIMPLE_TREATMENT{$e}) {
239 push (@Bio::Biblio
::IO
::medlinexml
::ObjectStack
, {});
241 } elsif ($e eq 'ArticleIdList') {
244 } elsif ($e eq 'PubMedPubDate') {
246 $p{'pubStatus'} = $attrs{'PubStatus'} if $attrs{'PubStatus'};
247 push (@Bio::Biblio
::IO
::medlinexml
::ObjectStack
, \
%p);
250 &Bio
::Biblio
::IO
::medlinexml
::handle_start
($expat, $e, %attrs);
255 my ($expat, $e) = @_;
258 # First I have to deal with those elements which are both PCDATA
259 # (and therefore they are on the pcdataStack) and which have an
260 # attribute list (therefore they are also known as a separate
261 # p-object on the objectStack).
263 if ($e eq 'ArticleId') {
264 &Bio
::Biblio
::IO
::medlinexml
::_data2obj
('id');
265 &Bio
::Biblio
::IO
::medlinexml
::_add_element
('pubmedArticleIds', pop @Bio::Biblio
::IO
::medlinexml
::ObjectStack
);
266 # &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e);
271 &Bio
::Biblio
::IO
::medlinexml
::_data2obj
('URL');
272 &Bio
::Biblio
::IO
::medlinexml
::_add_element
('pubmedURLs', pop @Bio::Biblio
::IO
::medlinexml
::ObjectStack
);
273 # &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e);
279 # both object and pcdata stacks elements mixed here together
282 if (exists $POP_DATA_AND_PEEK_OBJ{$e}) {
283 &Bio
::Biblio
::IO
::medlinexml
::_data2obj
("\l$e");
285 } elsif (exists $POP_AND_ADD_DATA_ELEMENT{$e}) {
286 &Bio
::Biblio
::IO
::medlinexml
::_add_element
($POP_AND_ADD_DATA_ELEMENT{$e}, pop @Bio::Biblio
::IO
::medlinexml
::ObjectStack
);
288 } elsif ($e eq 'MedlineCitation' ||
289 $e eq 'NCBIArticle') {
290 &Bio
::Biblio
::IO
::medlinexml
::_obj2obj
('Citation');
292 } elsif ($e eq 'PubmedData') {
293 &Bio
::Biblio
::IO
::medlinexml
::_obj2obj
('PubmedData');
295 } elsif ($e eq 'PubMedArticle' ||
296 $e eq 'PubmedArticle') {
299 # Here we finally have the whole citation ready.
301 &Bio
::Biblio
::IO
::medlinexml
::_process_citation
(pop @Bio::Biblio
::IO
::medlinexml
::ObjectStack
);
304 &Bio
::Biblio
::IO
::medlinexml
::handle_end
($expat, $e);
307 # &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e);