tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / SeqVersion / gi.pm
blob1daa1e286878f4777824f0d6ab3a9f6757f2ac4c
1 # $Id$
3 # BioPerl module for Bio::DB::SeqVersion::gi
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Brian Osborne
9 # Copyright Brian Osborne 2006
11 # You may distribute this module under the same terms as Perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::DB::SeqVersion::gi - interface to NCBI Sequence Revision History page
19 =head1 SYNOPSIS
21 Do not use this module directly, use Bio::DB::SeqVersion.
23 use Bio::DB::SeqVersion;
25 my $query = Bio::DB::SeqVersion->new(-type => 'gi');
27 # all GIs, which will include the GI used to query
28 my @all_gis = $query->get_all(2);
30 # the most recent GI, which may or may not be the GI used to query
31 my $live_gi = $query->get_recent(2);
33 # get all the visible data on the Sequence Revision page
34 my $array_ref = $query->get_history(11111111);
36 These methods can also take accession numbers as arguments, just like
37 the Sequence Revision page itself.
39 =head1 DESCRIPTION
41 All sequence entries at GenBank are identified by a pair of
42 identifiers, an accession and a numeric identifier, and this number is
43 frequently called a GI number (B<G>enInfo B<I>dentifier). The accession
44 is stable, but each new version of the sequence entry for the accession
45 receives a new GI number (see L<http://www.ncbi.nlm.nih.gov/Sitemap/sequenceIDs.html>
46 for more information on GenBank identifiers). One accession
47 can have one or more GI numbers and the highest of these is the most recent,
48 or "live", GI.
50 Information on an accession and its associated GI numbers is available at
51 the Sequence Revision History page at NCBI,
52 L<http://www.ncbi.nlm.nih.gov/entrez/sutils/girevhist.cgi>; this information is
53 not available in file format. This module queries the Web page and retrieves GI
54 numbers and related data given an accession (e.g. NP_111111, A11111, P12345) or
55 a GI number (e.g. 2, 11111111) as query.
57 =head1 FEEDBACK
59 =head2 Mailing Lists
61 User feedback is an integral part of the evolution of this and other
62 Bioperl modules. Send your comments and suggestions preferably to one
63 of the Bioperl mailing lists. Your participation is much appreciated.
65 bioperl-l@bioperl.org - General discussion
66 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
68 =head2 Support
70 Please direct usage questions or support issues to the mailing list:
72 I<bioperl-l@bioperl.org>
74 rather than to the module maintainer directly. Many experienced and
75 responsive experts will be able to look at the problem and quickly
76 address it. Please include a thorough description of the problem
77 with code and data examples if at all possible.
79 =head2 Reporting Bugs
81 Report bugs to the Bioperl bug tracking system to help us keep track
82 of the bugs and their resolution. Bug reports can be submitted via the
83 web:
85 http://bugzilla.open-bio.org/
87 =head1 AUTHOR - Brian Osborne
89 Email E<lt> osborne at optonline dot net E<gt>
91 =head1 CONTRIBUTORS
93 Torsten Seemann - torsten.seemann AT infotech.monash.edu.au
95 =head1 APPENDIX
97 The rest of the documentation details each of the object
98 methods. Internal methods are usually preceded with a _
100 =cut
102 # Let the code begin...
104 package Bio::DB::SeqVersion::gi;
105 use strict;
107 use base qw(Bio::DB::SeqVersion);
109 # Private class variables: the pieces of the NCBI Sequence Revision History URL
111 my $CGIBASE = 'http://www.ncbi.nlm.nih.gov';        # scheme and host of the NCBI server
112 my $CGIARGS = '/entrez/sutils/girevhist.cgi?val=';  # CGI path; _get_request appends the queried id after "val="
114 =head2 new
116 Title : new
117 Usage : $gb = Bio::DB::SeqVersion::gi->new
118 Function: Creates a new query object
119 Returns : New query object
121 =cut
# Constructor: builds the object through the parent class constructor,
# runs _initialize on it, and returns the new query object.
sub new {
    my $class     = shift;
    my @ctor_args = @_;

    my $self = $class->SUPER::new(@ctor_args);
    $self->_initialize;

    return $self;
}
130 =head2 get_all
132 Title : get_all
133 Usage : my @gis = $q->get_all(2)
134 Function: Get all GI numbers given a GI number
135 Returns : An array of GI numbers in get_history order (element 0 is the most recent GI)
136 Args : A single GI number (string)
138 =cut
# Return every GI number recorded for $id (a GI number or an accession).
#
# Args    : $id - a single identifier (string)
# Returns : list of GI numbers, one per row of the revision history, in the
#           same order as the rows returned by get_history()
#
# The history stored by the last get_history() call is reused when $id is the
# identifier that produced it; otherwise a fresh history is fetched.
sub get_all {
    my ($self, $id) = @_;

    # Decide on the cache separately from the assignment. The former
    # "COND ? $ref = A : $ref = B" is parsed by Perl as
    # "(COND ? $ref = A : $ref) = B", so B (a new get_history() call) was
    # always assigned and the cached result was never used.
    my $is_cached = defined $id
        && defined $self->{_last_id}
        && $id eq $self->{_last_id};

    my $history = $is_cached
        ? $self->{_last_result}
        : $self->get_history($id);

    # Column 0 of each history row is the GI number.
    my @gis;
    for my $row ( @{$history} ) {
        push @gis, $row->[0];
    }
    return @gis;
}
151 =head2 get_recent
153 Title : get_recent
154 Usage : my $newest_gi = $q->get_recent(2)
155 Function: Get most recent GI given a single GI
156 Returns : String
157 Args : A single GI number (string)
159 =cut
# Return the GI number in the first row of the revision history for $id.
# get_history() documents element 0 as the most recent version.
#
# Args    : $id - a single identifier (string)
# Returns : the GI number (column 0) of the first history row
#
# The history stored by the last get_history() call is reused when $id is the
# identifier that produced it; otherwise a fresh history is fetched.
sub get_recent {
    my ($self, $id) = @_;

    # Decide on the cache separately from the assignment. The former
    # "COND ? $ref = A : $ref = B" is parsed by Perl as
    # "(COND ? $ref = A : $ref) = B", so B (a new get_history() call) was
    # always assigned and the cached result was never used.
    my $is_cached = defined $id
        && defined $self->{_last_id}
        && $id eq $self->{_last_id};

    my $history = $is_cached
        ? $self->{_last_result}
        : $self->get_history($id);

    return $history->[0]->[0];
}
169 =head2 get_history
171 Title : get_history
172 Usage : my $ref = $query_obj->get_history($id)
173 Function: Queries the NCBI Revision page, gets the data from the HTML table
174 Returns : Reference to an array of arrays where element 0 refers to the most
175 recent version and the last element refers to the oldest version.
176 In the second dimension the elements are:
178 0 GI number
179 1 Version
180 2 Update Date
181 3 Status
183 For example, to get the GI number of the first version:
185 $ref->[$#{$ref}]->[0]
187 To get the Update Date of the latest version:
189 $ref->[0]->[2]
191 Args : One identifier (string)
193 =cut
# Fetch the revision-history page for $id and parse it into rows.
#
# Args    : $id - one identifier (string)
# Returns : the array-of-arrays reference produced by _process_data
#
# The parsed result and the identifier are stored on the object so that other
# methods called with the same identifier can reuse them.
sub get_history {
    my ($self, $id) = @_;

    my $page    = $self->_get_request($id);
    my $history = $self->_process_data($page);

    # Remember the very last result together with the id that produced it.
    @{$self}{qw(_last_result _last_id)} = ($history, $id);

    return $history;
}
207 =head2 _get_request
209 Title : _get_request
210 Usage : my $html = $self->_get_request($id)
211 Function: GET using NCBI Revision page URL, uses Root::HTTPget
212 Returns : HTML
213 Args : One identifier (string)
215 =cut
# GET the revision-history page for one identifier.
#
# Args    : $id - one identifier (string); a false value or a reference is
#           rejected via throw()
# Returns : the response content on success; on an unsuccessful response a
#           warning is issued and a bare return yields nothing
sub _get_request {
    my ($self, $id) = @_;

    if ( !$id || ref($id) ) {
        $self->throw("Must specify a single id to query");
    }

    # Query URL = base host + CGI path + identifier.
    my $url      = join '', $CGIBASE, $CGIARGS, $id;
    my $response = $self->get($url);

    unless ( $response->is_success ) {
        $self->warn("Can't query $url: ".$response->status_line."\n");
        return;
    }

    $self->debug("Response is:\n",$response->content,"\n");
    return $response->content;
}
232 =head2 _process_data
234 Title : _process_data
235 Usage : $self->_process_data($html)
236 Function: extract data from HTML
237 Args : HTML from Revision History page
238 Returns : reference to an array of arrays
240 =cut
# Extract the revision table from the HTML of the Revision History page.
#
# Args    : $html - HTML text of the page
# Returns : reference to an array of arrays, one inner array per <tr>...</tr>
#           row after the header row, holding the text fragments found after
#           each ">" in that row
# Throws  : via throw() when no "Revision history for ... <table" section is
#           found
sub _process_data {
    my ($self, $html) = @_;

    # Everything from the first <table following the "Revision history for"
    # heading to the end of the document.
    my ($table_html) = $html =~ /Revision \s+ history \s+ for \s+ .+? (<table.+)/sx;
    $self->throw("Could not parse 'Revision history' HTML table") if not defined $table_html;

    my @row_html = $table_html =~ /<tr>(.+?)<\/tr>/g;
    shift @row_html; # get rid of header

    # One array ref per row, holding every run of text that follows a ">".
    my @history = map { [ $_ =~ />([^<>]+)/g ] } @row_html;

    return \@history;
}
260 __END__