tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / MeSH.pm
blob584c1cc4afc7d4612115b297df48eb5f355a11ab
1 # $Id$
3 # BioPerl module for Bio::DB::MeSH
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::DB::MeSH - Term retrieval from a Web MeSH database
17 =head1 SYNOPSIS
19 my $mesh = Bio::DB::MeSH->new();
20 my $term = $mesh->get_exact_term('Butter');
21 print $term->description;
23 =head1 DESCRIPTION
25 This class retrieves a term from the Medical Subject Headings database
26 by the National Library of Medicine of USA.
27 See L<http://www.nlm.nih.gov/mesh/meshhome.html>.
29 This class implements L<Bio::SimpleAnalysisI> and wraps its methods under
30 L<get_exact_term>.
32 By default, web access uses L<WWW::Mechanize>, but in its absense
33 falls back to bioperl module L<Bio::WebAgent> which is a subclass of
34 L<LWP::UserAgent>. If not even that is not installed, it uses
35 L<Bio::Root::HTTPget>.
37 =head1 SEE ALSO
39 L<Bio::Phenotype::MeSH::Term>
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to the
47 Bioperl mailing lists Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 =head2 Support
54 Please direct usage questions or support issues to the mailing list:
56 I<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
63 =head2 Reporting Bugs
65 report bugs to the Bioperl bug tracking system to help us keep track
66 the bugs and their resolution. Bug reports can be submitted via the
67 web:
69 http://bugzilla.open-bio.org/
71 =head1 AUTHOR
73 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
75 =head1 APPENDIX
77 The rest of the documentation details each of the object
78 methods. Internal methods are usually preceded with a _
80 =cut
83 # Let the code begin...
86 package Bio::DB::MeSH;
87 use strict;
89 use Bio::Phenotype::MeSH::Term;
90 use Bio::Phenotype::MeSH::Twig;
92 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
95 my $URL = 'http://www.nlm.nih.gov/mesh/MBrowser.html';
97 my $ANALYSIS_SPEC= {name => 'MeSH term retrival',
98 type => 'Entry retrieval'};
99 my $INPUT_SPEC = [
100 {mandatory=>'true',
101 type => 'scalar',
102 'name'=> 'value',
106 my $RESULT_SPEC =
108 '' => 'Bio::Phenotype::MeSH::Term',
109 raw => 'raw output',
113 sub _init {
114 my $self = shift;
115 $self->url($URL);
116 $self->{'_ANALYSIS_SPEC'} =$ANALYSIS_SPEC;
117 $self->{'_INPUT_SPEC'} =$INPUT_SPEC;
118 $self->{'_RESULT_SPEC'} =$RESULT_SPEC;
119 $self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
120 $self->_webmodule;
121 return $self;
124 sub _webmodule {
125 my ($self) = shift;
126 $self->{'_webmodule'} = '';
127 eval {
128 require WWW::Mechanize;
130 unless ($@) {
131 $self->{'_webmodule'} = 'WWW::Mechanize';
132 return;
134 eval {
135 require LWP::UserAgent;
137 unless ($@) {
138 $self->{'_webmodule'} = 'Bio::WebAgent';
139 return;
141 require Bio::Root::HTTPget;
142 $self->{'_webmodule'} = 'Bio::Root::HTTPget';
146 =head2 get_exact_term
148 Title : get_exact_term
149 Usage : $s = $db->get_exact_term($value);
150 Function: Retrive a single MeSH term using a unique ID or exact name.
151 Example :
152 Returns : a Bio::Phenotype::MeSH::Term object
153 Args : scalar, UID or name of a MeSH term
155 The returned term object contains information about the immediate
156 vincinity of the term in the terminology hierarchy. See
157 L<Bio::Phenotype::MeSH::Twig>.
159 =cut
162 sub get_exact_term {
163 my ($self, $value) = @_;
164 $self->{'_term'} = undef;
165 $self->run($value) if $value;
166 $self->throw("Could not connect to the server")
167 unless $self->status eq 'COMPLETED';
168 return $self->result;
172 sub run {
173 my ($self, $value) = @_;
175 # check input
176 $self->throw("Need a MeSH name or ID as an input [$value]") if ref $value;
178 # internal run()
179 $self->_run($value);
183 sub _cgi_url {
184 my($self, $field, $term) = @_;
185 # we don't bother to URI::Escape $field and $term as this is an untainted private sub
186 return 'http://www.nlm.nih.gov/cgi/mesh/2003/MB_cgi?field='.$field.'&term='.$term;
190 sub _run {
191 my ($self, $value) = @_;
192 $self->throw('Need a value [$value]')
193 unless $value;;
194 # delay repeated calls by default by 3 sec, set delay() to change
195 # $self->sleep;
197 $self->status('TERMINATED_BY_ERROR');
199 if ($self->{'_webmodule'} eq 'WWW::Mechanize') {
200 $self->debug("using WWW::Mechanize...\n");
201 my $agent = WWW::Mechanize->new();
202 $agent->get($self->url);
203 $agent->status == 200
204 or $self->warn("Could not connect to the server\n") and return;
206 $agent->form_name('MB');
208 $agent->field("term", $value);
209 if ($value =~ /\w\d{6}/) {
210 $agent->field("field", 'uid');
211 } else {
212 $agent->field("field", 'entry');
214 $agent->click("exact");
216 $self->{'_content'} = $agent->content();
217 $self->status('COMPLETED');
218 return;
220 elsif ($self->{'_webmodule'} eq 'Bio::WebAgent') {
221 $self->debug("using LWP::UserAgent...\n");
222 my $response;
223 if ($value =~ /\w\d{6}/) {
224 $self->{'_content'} =
225 $response = eval {
226 $self->get( $self->_cgi_url('uid', $value) )
228 $self->warn("Could not connect to the server\n") and return
229 if $@;
230 } else {
231 $self->{'_content'} =
232 eval {
233 $response = $self->get( $self->_cgi_url('entry', $value) )
235 $self->warn("Could not connect to the server\n") and return
236 if $@;
238 if ($response->is_success) {
239 $self->{'_content'} = $response->content;
240 $self->status('COMPLETED');
242 return;
243 } else {
244 $self->debug("using Bio::Root::HTTPget...\n");
245 my $agent = Bio::Root::HTTPget->new();
246 if ($value =~ /\w\d{6}/) {
247 $self->{'_content'} =
248 eval {
249 $agent->get( $self->_cgi_url('uid', $value) )
251 $self->warn("Could not connect to the server\n") and return
252 if $@;
253 } else {
254 $self->{'_content'} =
255 eval {
256 $agent->get( $self->_cgi_url('entry', $value) )
258 $self->debug("Could not connect to the server\n") and return
259 if $@;
261 $self->status('COMPLETED');
265 sub result {
266 my ($self,$value) = @_;
268 $self->throw("Could not retrive results") unless $self->status('COMPLETED');
270 # no processing
271 return $self->{'_content'} if $value && $value eq 'raw';
274 # create a MeSH::Term object
275 $_ = $self->{'_content'};
276 $self->debug( substr($_, 0, 100) . "\n");
277 my ($id) = m|Unique ID</TH><TD>(.*?)</TD>|i;
278 my ($name) = m|MeSH Heading</TH><TD>([^<]+)|i;
279 my ($desc) = m|Scope Note</TH><TD>(.*?)</TD>|is;
280 $self->throw("No description returned: $_") unless defined $desc;
281 $desc =~ s/<.*?>//sg;
282 $desc =~ s/\n/ /g;
284 my $term = Bio::Phenotype::MeSH::Term->new(-id => $id,
285 -name => $name,
286 -description => $desc
288 my ($trees) = $self->{'_content'} =~ /MeSH Tree Structures(.*)/s;
290 while (m|Entry Term</TH><TD>([^<]+)|ig) {
291 $term->add_synonym($1);
292 $self->debug("Synonym: |$1|\n");
295 foreach (split /<HR>/i, $trees ) {
296 next unless /$name/;
297 s/<TD.*?>/ /sgi;
298 s/<.*?>//sg;
299 s/&nbsp;/ /sg;
300 #print "|$_|";
301 my ($treeno) = /$name \[([^]]+)]/;
302 my ($parent_treeno) = $treeno =~ /(.*)\.\d{3}/;
303 my ($parent) = /\n +(\w.+) \[$parent_treeno\]/;
305 my $twig = Bio::Phenotype::MeSH::Twig->new(-parent => $parent);
306 $term->add_twig($twig);
308 $self->debug("Parent: |$parent|\n");
309 while (/\n +(\w.+) \[$treeno\./g ) {
310 $twig->add_child($1);
311 $self->debug("Child: |$1|\n");
314 while (/\n +(\w.+) \[$parent_treeno\./g ) {
315 next if $name eq $1;
316 $twig->add_sister($1);
317 $self->debug("Sister: |$1|\n");
320 return $term;