trust your DB implementation, particularly if ancestor data are already available
[bioperl-live.git] / Bio / Index / AbstractSeq.pm
blob7dcb93a9aed14354f2a5a824a8e9b7a27ce7f0b1
2 # BioPerl module for Bio::Index::AbstractSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Index::AbstractSeq - base class for AbstractSeq
18 =head1 SYNOPSIS
20 # Make a new sequence file indexing package
22 package MyShinyNewIndexer;
24 use base qw(Bio::Index::AbstractSeq);
26 # Now provide the necessary methods...
28 =head1 DESCRIPTION
30 Provides a common base class for multiple sequence files built using
31 the Bio::Index::Abstract system, and provides a Bio::DB::SeqI
32 interface.
34 =head1 FEEDBACK
36 =head2 Mailing Lists
38 User feedback is an integral part of the evolution of this
39 and other Bioperl modules. Send your comments and suggestions
40 preferably to one of the Bioperl mailing lists.
41 Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
46 =head2 Support
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 responsive experts will be able to look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
57 =head2 Reporting Bugs
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via the
61 web:
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 AUTHOR - Ewan Birney
67 Email birney@ebi.ac.uk
69 =head1 APPENDIX
71 The rest of the documentation details each of the object methods.
72 Internal methods are usually preceded with a _
74 =head1 SEE ALSO
76 L<Bio::Index::Abstract>, which provides dbm indexing for flat files of
77 any type, containing sequence or not. L<Bio::Index::AbstractSeq> inherits
78 from L<Bio::Index::Abstract>
80 =cut
82 # Let's begin the code ...
84 package Bio::Index::AbstractSeq;
85 use strict;
87 use Bio::SeqIO::MultiFile;
89 use base qw(Bio::Index::Abstract Bio::DB::SeqI);
# Constructor: delegates object creation to the parent class, then
# attaches an empty array used to cache one SeqIO object per indexed file.
#
# Args    : whatever the parent constructor accepts (passed through as-is)
# Returns : the newly built object
sub new {
    my $class = shift;

    my $self = $class->SUPER::new(@_);

    # Slots are filled in on demand by _get_SeqIO_object
    $self->{'_seqio_cache'} = [];

    return $self;
}
99 =head2 _file_format
101 Title : _file_format
102 Usage : $self->_file_format
103 Function: Derived classes should override this
104 method (it throws an exception here)
105 to give the file format of the files used
106 Example :
107 Returns :
108 Args :
110 =cut
# Placeholder that derived classes are expected to override with a method
# returning the file format name. This base version always raises an
# exception naming the class of the invocant.
sub _file_format {
    my $self = shift;

    my $class_name = ref $self;
    $self->throw("Class '$class_name' must provide a file format method correctly");
}
119 =head2 fetch
121 Title : fetch
122 Usage : $index->fetch( $id )
123 Function: Returns a Bio::Seq object from the index
124 Example : $seq = $index->fetch( 'dJ67B12' )
125 Returns : Bio::Seq object
126 Args : ID
128 =cut
# Fetch the sequence stored in the index under $id.
#
# The id is looked up in the index hash returned by db(). When a record is
# found it is unpacked into a file number and a byte offset, the SeqIO
# object for that file is obtained from _get_SeqIO_object(), its filehandle
# is positioned at the offset, and the next sequence is read from there.
#
# Args    : $id - key to look up in the index
# Returns : whatever next_seq() returned, or undef when the index holds no
#           (true) record for $id
# Throws  : via $self->throw when the filehandle cannot be positioned at
#           the recorded offset
sub fetch {
    my ( $self, $id ) = @_;
    my $db = $self->db();
    my $seq;

    if ( my $rec = $db->{$id} ) {
        my ( $file, $begin ) = $self->unpack_record($rec);

        # Get the (possibly cached) SeqIO object
        my $seqio = $self->_get_SeqIO_object($file);
        my $fh    = $seqio->_fh();

        # move to start of record
        # $begin-- if( $^O =~ /mswin/i); # workaround for Win DB_File bug
        #
        # A failed seek must not go unnoticed: next_seq() would otherwise
        # read from wherever the handle happens to be and hand back the
        # wrong record.
        seek( $fh, $begin, 0 )
            or $self->throw("Unable to seek to offset $begin for id '$id': $!");

        $seq = $seqio->next_seq();
    }

    # we essentially assume that the primary_id for the database
    # is the display_id
    if (   ref($seq)
        && $seq->isa('Bio::PrimarySeqI')
        && $seq->primary_id =~ /^\D+$/ )
    {
        $seq->primary_id( $seq->display_id() );
    }

    return $seq;
}
158 =head2 _get_SeqIO_object
160 Title : _get_SeqIO_object
161 Usage : $index->_get_SeqIO_object( $file )
162 Function: Returns a Bio::SeqIO object for the file
163 Example : $seq = $index->_get_SeqIO_object( 0 )
164 Returns : Bio::SeqIO object
165 Args : File number (an integer)
167 =cut
# Return the SeqIO object for file number $i, building and caching it in
# $self->{'_seqio_cache'} the first time that file is requested.
#
# Args    : $i - file number used as the cache slot and passed to
#           _file_handle()
# Returns : the cached (or freshly created) Bio::SeqIO object
sub _get_SeqIO_object {
    my ( $self, $i ) = @_;

    # Only build a reader when the slot is still empty
    $self->{'_seqio_cache'}[$i] ||= do {
        my $handle = $self->_file_handle($i);

        # make a new SeqIO object
        Bio::SeqIO->new(
            -Format => $self->_file_format,
            -fh     => $handle,
        );
    };

    return $self->{'_seqio_cache'}[$i];
}
182 =head2 get_Seq_by_id
184 Title : get_Seq_by_id
185 Usage : $seq = $db->get_Seq_by_id($id)
186 Function: retrieves a sequence object, identically to
187 ->fetch, but here behaving as a Bio::DB::SeqI
188 Returns : new Bio::Seq object
189 Args : string represents the id
192 =cut
# Look a sequence up by id; a straight pass-through to fetch().
#
# Args    : $id - identifier handed unchanged to fetch()
# Returns : whatever fetch() returns for that id
sub get_Seq_by_id {
    my $self = shift;
    my ($id) = @_;

    return $self->fetch($id);
}
200 =head2 get_Seq_by_acc
202 Title : get_Seq_by_acc
203 Usage : $seq = $db->get_Seq_by_acc($acc)
204 Function: retrieves a sequence object, identically to
205 ->fetch, but here behaving as a Bio::DB::SeqI
206 Returns : new Bio::Seq object
207 Args : string represents the accession number
210 =cut
# Look a sequence up by accession; a straight pass-through to fetch().
#
# Args    : $id - accession handed unchanged to fetch()
# Returns : whatever fetch() returns for that accession
sub get_Seq_by_acc {
    my $self = shift;
    my ($id) = @_;

    return $self->fetch($id);
}
218 =head2 get_PrimarySeq_stream
220 Title : get_PrimarySeq_stream
221 Usage : $stream = get_PrimarySeq_stream
222 Function: Makes a Bio::DB::SeqStreamI compliant object
223 which provides a single method, next_primary_seq
224 Returns : Bio::DB::SeqStreamI
225 Args : none
228 =cut
# Build a Bio::SeqIO::MultiFile reader over every file recorded in the
# index. File entries are read from the "__FILE_<n>" keys of the index
# hash, for n from 0 up to _file_count() - 1.
#
# Args    : none
# Returns : the Bio::SeqIO::MultiFile object
sub get_PrimarySeq_stream {
    my $self = shift;

    my $file_total = $self->_file_count() || 0;
    my @paths;

    for my $idx ( 0 .. $file_total - 1 ) {

        # Only the first field of the unpacked record is used here
        my ($path) = $self->unpack_record( $self->db->{"__FILE_$idx"} );
        push @paths, $path;
    }

    return Bio::SeqIO::MultiFile->new(
        '-format' => $self->_file_format,
        -files    => \@paths,
    );
}
244 =head2 get_all_primary_ids
246 Title : get_all_primary_ids
247 Usage : @ids = $seqdb->get_all_primary_ids()
248 Function: gives an array of all the primary_ids of the
249 sequence objects in the database. These
250 may be ids (display style) or accession numbers
251 or something else completely different - they
252 *are not* meaningful outside of this database
253 implementation.
254 Example :
255 Returns : an array of strings
256 Args : none
259 =cut
# Return one id per distinct record position in the index.
#
# The same record can be indexed under several keys (for example both a
# name and an accession number), and some databases have no accession
# numbers at all, so the keys cannot be told apart by their form. Instead
# the ids are made unique by record location: each id is filed under
# "<file>:<offset>", and the last id seen for a location wins.
#
# Args    : none
# Returns : the ids that survived de-duplication (values of the location map)
sub get_all_primary_ids {
    my $self  = shift;
    my $index = $self->db;

    my %id_at;
    while ( my ( $key, $packed ) = each %$index ) {

        # keys starting with a double underscore hold internal info
        next if $key =~ /^__/;

        my ( $file_no, $offset ) = $self->unpack_record($packed);
        $id_at{"$file_no:$offset"} = $key;
    }

    return values %id_at;
}
291 =head2 get_Seq_by_primary_id
293 Title : get_Seq_by_primary_id
294 Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string);
295 Function: Gets a Bio::Seq object by the primary id. The primary
296 id in these cases has to come from $db->get_all_primary_ids.
297 There is no other way to get (or guess) the primary_ids
298 in a database.
300 The other possibility is to get Bio::PrimarySeqI objects
301 via the get_PrimarySeq_stream and the primary_id field
302 on these objects are specified as the ids to use here.
303 Returns : A Bio::Seq object
304 Args : primary id (as a string)
305 Throws : "acc does not exist" exception
308 =cut
# Look a sequence up by primary id; a straight pass-through to fetch().
#
# Args    : $id - primary id handed unchanged to fetch()
# Returns : whatever fetch() returns for that id
sub get_Seq_by_primary_id {
    my $self = shift;
    my ($id) = @_;

    return $self->fetch($id);
}