fix MeSH uninit variable warnings, but this module needs further testing (service...
[bioperl-live.git] / Bio / DB / MeSH.pm
blob12401a81c2bb0a1699cfcf097bdfbd7725433cd4
2 # BioPerl module for Bio::DB::MeSH
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::DB::MeSH - Term retrieval from a Web MeSH database
16 =head1 SYNOPSIS
18 my $mesh = Bio::DB::MeSH->new();
19 my $term = $mesh->get_exact_term('Butter');
20 print $term->description;
22 =head1 DESCRIPTION
24 This class retrieves a term from the Medical Subject Headings database
25 by the National Library of Medicine of USA. See
26 L<http://www.nlm.nih.gov/mesh/meshhome.html>. It uses the latest
27 data available (updates happen on weekdays). If it fails, an archive
28 cgi scripts accessing older data from previous year is used.
30 This class implements L<Bio::SimpleAnalysisI> and wraps its methods under
31 L<get_exact_term>.
33 By default, web access uses L<WWW::Mechanize>, but in its absence
34 falls back to bioperl module L<Bio::WebAgent> which is a subclass of
35 L<LWP::UserAgent>.
37 =head1 SEE ALSO
39 L<Bio::Phenotype::MeSH::Term>
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to the
47 Bioperl mailing lists Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 =head2 Support
54 Please direct usage questions or support issues to the mailing list:
56 I<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
63 =head2 Reporting Bugs
65 report bugs to the Bioperl bug tracking system to help us keep track
66 the bugs and their resolution. Bug reports can be submitted via the
67 web:
69 https://github.com/bioperl/bioperl-live/issues
71 =head1 AUTHOR
73 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
75 =head1 APPENDIX
77 The rest of the documentation details each of the object
78 methods. Internal methods are usually preceded with a _
80 =cut
83 # Let the code begin...
86 package Bio::DB::MeSH;
87 use strict;
89 use Bio::Phenotype::MeSH::Term;
90 use Bio::Phenotype::MeSH::Twig;
92 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
95 my $URL = 'http://www.nlm.nih.gov/mesh/MBrowser.html';
97 my $ANALYSIS_SPEC= {name => 'MeSH term retrival',
98 type => 'Entry retrieval'};
99 my $INPUT_SPEC = [
100 {mandatory=>'true',
101 type => 'scalar',
102 'name'=> 'value',
106 my $RESULT_SPEC =
108 '' => 'Bio::Phenotype::MeSH::Term',
109 raw => 'raw output',
113 sub _init {
114 # Note: Base URL is now set in _webmodule(), depending on which is selected
115 my $self = shift;
116 $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
117 $self->{'_INPUT_SPEC'} = $INPUT_SPEC;
118 $self->{'_RESULT_SPEC'} = $RESULT_SPEC;
119 $self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
120 $self->_webmodule;
121 return $self;
125 sub _webmodule {
126 my ($self) = shift;
127 $self->{'_webmodule'} = '';
129 # Prefer WWW::Mechanize if available and use $URL
130 eval {
131 require WWW::Mechanize;
133 unless ($@) {
134 $self->{'_webmodule'} = 'WWW::Mechanize';
135 $self->url($URL);
136 return;
139 # Bio::WebAgent uses cgi alternative URL
140 $self->_set_cgi_base_url;
142 # Use Bio::WebAgent alternative
143 eval {
144 require LWP::UserAgent;
146 unless ($@) {
147 $self->{'_webmodule'} = 'Bio::WebAgent';
148 return;
151 $self->throw("Bio::DB::MeSH needs either WWW::Mechanize or Bio::WebAgent");
155 sub _set_cgi_base_url {
156 my ($self) = shift;
158 # Try to get webpage corresponding to current year.
159 # If it fails, try to get previous years until success or 2003
160 my $year = 1900 + (localtime)[5];
161 my $pass = 0;
162 while ($pass == 0 and $year > 2003) {
163 my $response;
164 eval {
165 $response = $self->get( "http://www.nlm.nih.gov/cgi/mesh/$year/MB_cgi" )
167 # Note: error 404 is acceptable because it can mean that webpage is not yet
168 # implemented for current year. Absence of internet generates error 500.
169 if ($@ or $response->{'_rc'} > 404) {
170 $self->warn("Could not connect to the server\n") and return;
172 # Success closes the loop, fail makes it try with the another year
173 if ($response->is_success) {
174 $pass = 1;
176 else {
177 $year -= 1;
180 $self->url("http://www.nlm.nih.gov/cgi/mesh/$year/MB_cgi");
183 =head2 get_exact_term
185 Title : get_exact_term
186 Usage : $s = $db->get_exact_term($value);
187 Function: Retrieve a single MeSH term using a unique ID or exact name.
188 Example :
189 Returns : a Bio::Phenotype::MeSH::Term object
190 Args : scalar, UID or name of a MeSH term
192 The returned term object contains information about the immediate
193 vincinity of the term in the terminology hierarchy. See
194 L<Bio::Phenotype::MeSH::Twig>.
196 =cut
199 sub get_exact_term {
200 my ($self, $value) = @_;
201 $self->{'_term'} = undef;
202 $self->run($value) if $value;
203 $self->throw("Could not connect to the server")
204 unless $self->status eq 'COMPLETED';
205 return $self->result;
209 sub run {
210 my ($self, $value) = @_;
212 # check input
213 $self->throw("Need a MeSH name or ID as an input [$value]") if ref $value;
215 # internal run()
216 $self->_run($value);
220 sub _cgi_url {
221 my($self, $field, $term) = @_;
223 # we don't bother to URI::Escape $field and $term as this is an untainted private sub
224 my $base_url = $self->url || '';
225 return "$base_url?field=$field&term=$term";
229 sub _run {
230 my ($self, $value) = @_;
231 $self->throw('Need a value [$value]')
232 unless $value;;
233 # delay repeated calls by default by 3 sec, set delay() to change
234 # $self->sleep;
236 $self->status('TERMINATED_BY_ERROR');
238 if ($self->{'_webmodule'} eq 'WWW::Mechanize') {
239 $self->debug("using WWW::Mechanize...\n");
240 my $agent = WWW::Mechanize->new();
241 $agent->get($self->url);
242 $agent->status == 200
243 or $self->warn("Could not connect to the server\n") and return;
245 $agent->form_name('MB');
247 $agent->field("term", $value);
248 if ($value =~ /\w\d{6}/) {
249 $agent->field("field", 'uid');
250 } else {
251 $agent->field("field", 'entry');
253 $agent->click("exact");
255 $self->{'_content'} = $agent->content();
256 $self->status('COMPLETED');
257 return;
259 elsif ($self->{'_webmodule'} eq 'Bio::WebAgent') {
260 $self->debug("using LWP::UserAgent...\n");
261 my $response;
262 if ($value =~ /\w\d{6}/) {
263 $self->{'_content'} =
264 $response = eval {
265 $self->get( $self->_cgi_url('uid', $value) )
267 $self->warn("Could not connect to the server\n") and return
268 if $@;
269 } else {
270 $self->{'_content'} =
271 eval {
272 $response = $self->get( $self->_cgi_url('entry', $value) )
274 $self->warn("Could not connect to the server\n") and return
275 if $@;
277 if ($response->is_success) {
278 $self->{'_content'} = $response->content;
279 $self->status('COMPLETED');
281 return;
286 sub result {
287 my ($self,$value) = @_;
289 $self->throw("Could not retrieve results") unless $self->status('COMPLETED');
291 # no processing
292 return $self->{'_content'} if $value && $value eq 'raw';
295 # create a MeSH::Term object
296 $_ = $self->{'_content'};
297 $self->debug( substr($_, 0, 100) . "\n");
298 my ($id) = m|Unique \s+ ID </TH>
299 <TD (?: \s+ [^>]+ )? >
300 (.*?) # id
301 </TD> |ix;
302 my ($name) = m|MeSH \s+ Heading </TH>
303 <TD (?: \s+ [^>]+ )? >
304 (.*?) # name
305 </TD> |ix;
306 my ($desc) = m|Scope \s+ Note </TH>
307 <TD (?: \s+ [^>]+ )? >
308 (.*?) # desc
309 </TD>|isx;
310 $self->throw("No description returned: $_") unless defined $desc;
311 $desc =~ s/<.*?>//sg;
312 $desc =~ s/(?:\r)?\n/ /g;
313 $desc =~ s/\( +/\(/g;
314 $desc =~ s/ {2,}/ /g;
316 my $term = Bio::Phenotype::MeSH::Term->new(-id => $id,
317 -name => $name,
318 -description => $desc
320 my ($trees) = $self->{'_content'} =~ /MeSH Tree Structures(.*)/s;
322 while (m|Entry Term</TH><TD(?: [^>]+)?>(.*?)</TD>|ig) {
323 $term->add_synonym($1);
324 $self->debug("Synonym: |$1|\n");
327 foreach (split /<HR>/i, $trees ) {
328 next unless /$name/;
329 s/<TD.*?>/ /sgi;
330 s/<.*?>//sg;
331 s/&nbsp;/ /sg;
332 #print "|$_|";
333 my ($treeno) = /$name \[([^]]+)]/;
334 my ($parent_treeno) = $treeno =~ /(.*)\.\d{3}/;
335 my ($parent) = /\n +(\w.+) \[$parent_treeno\]/;
337 my $twig = Bio::Phenotype::MeSH::Twig->new(-parent => $parent);
338 $term->add_twig($twig);
340 $self->debug("Parent: |$parent|\n");
341 while (/\n +(\w.+) \[$treeno\./g ) {
342 $twig->add_child($1);
343 $self->debug("Child: |$1|\n");
346 while (/\n +(\w.+) \[$parent_treeno\./g ) {
347 next if $name eq $1;
348 $twig->add_sister($1);
349 $self->debug("Sister: |$1|\n");
352 return $term;