2 # BioPerl module for Bio::Index::AbstractSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Index::AbstractSeq - base class for AbstractSeq
20 # Make a new sequence file indexing package
22 package MyShinyNewIndexer;
24 use base qw(Bio::Index::AbstractSeq);
26 # Now provide the necessary methods...
30 Provides a common base class for multiple sequence files built using
31 the Bio::Index::Abstract system, and provides a Bio::DB::SeqI
38 User feedback is an integral part of the evolution of this
39 and other Bioperl modules. Send your comments and suggestions
40 preferably to one of the Bioperl mailing lists.
41 Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 responsive experts will be able to look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via the
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 AUTHOR - Ewan Birney
67 Email birney@ebi.ac.uk
71 The rest of the documentation details each of the object methods.
72 Internal methods are usually preceded with a _
76 L<Bio::Index::Abstract>, which provides dbm indexing for flat files of
77 any type, containing sequence or not. L<Bio::Index::AbstractSeq> inherits
78 from L<Bio::Index::Abstract>
82 # Let's begin the code ...
84 package Bio
::Index
::AbstractSeq
;
87 use Bio
::SeqIO
::MultiFile
;
89 use base
qw(Bio::Index::Abstract Bio::DB::SeqI);
92 my ($class, @args) = @_;
93 my $self = $class->SUPER::new
(@args);
95 $self->{'_seqio_cache'} = [];
102 Usage : $self->_file_format
103 Function: Derived classes should override this
104 method (it throws an exception here)
105 to give the file format of the files used
113 my ($self,@args) = @_;
115 my $pkg = ref($self);
116 $self->throw("Class '$pkg' must provide a file format method correctly");
122 Usage : $index->fetch( $id )
123 Function: Returns a Bio::Seq object from the index
124 Example : $seq = $index->fetch( 'dJ67B12' )
125 Returns : Bio::Seq object
131 my( $self, $id ) = @_;
132 my $db = $self->db();
135 if (my $rec = $db->{ $id }) {
136 my ($file, $begin) = $self->unpack_record( $rec );
138 # Get the (possibly cached) SeqIO object
139 my $seqio = $self->_get_SeqIO_object( $file );
140 my $fh = $seqio->_fh();
142 # move to start of record
143 # $begin-- if( $^O =~ /mswin/i); # workaround for Win DB_File bug
144 seek($fh, $begin, 0);
146 $seq = $seqio->next_seq();
149 # we essentially assume that the primary_id for the database
151 if (ref($seq) && $seq->isa('Bio::PrimarySeqI') &&
152 $seq->primary_id =~ /^\D+$/) {
153 $seq->primary_id( $seq->display_id() );
158 =head2 _get_SeqIO_object
160 Title : _get_SeqIO_object
161 Usage : $index->_get_SeqIO_object( $file )
162 Function: Returns a Bio::SeqIO object for the file
163 Example : $seq = $index->_get_SeqIO_object( 0 )
164 Returns : Bio::SeqIO object
165 Args : File number (an integer)
# Return the Bio::SeqIO object for file number $i, building it on the first
# request and keeping it in $self->{'_seqio_cache'} for later calls.
169 sub _get_SeqIO_object
{
170 my( $self, $i ) = @_;
# Only construct a reader when nothing is cached yet for this file number.
172 unless ($self->{'_seqio_cache'}[$i]) {
# Handle for file $i; presumably passed to the constructor below, but the
# rest of the argument list is not shown in this excerpt -- TODO confirm.
173 my $fh = $self->_file_handle($i);
174 # make a new SeqIO object
# The format comes from _file_format, which subclasses are expected to
# provide.
175 my $seqio = Bio
::SeqIO
->new( -Format
=> $self->_file_format,
# Store the reader so the next lookup in the same file reuses it.
177 $self->{'_seqio_cache'}[$i] = $seqio;
# Hand back the cached reader (newly created or pre-existing).
179 return $self->{'_seqio_cache'}[$i];
184 Title : get_Seq_by_id
185 Usage : $seq = $db->get_Seq_by_id()
186 Function: retrieves a sequence object, identically to
187 ->fetch, but here behaving as a Bio::DB::SeqI
188 Returns : new Bio::Seq object
189 Args : string represents the id
197 return $self->fetch($id);
200 =head2 get_Seq_by_acc
202 Title : get_Seq_by_acc
203 Usage : $seq = $db->get_Seq_by_acc()
204 Function: retrieves a sequence object, identically to
205 ->fetch, but here behaving as a Bio::DB::SeqI
206 Returns : new Bio::Seq object
207 Args : string represents the accession number
215 return $self->fetch($id);
218 =head2 get_PrimarySeq_stream
220 Title : get_PrimarySeq_stream
221 Usage : $stream = $db->get_PrimarySeq_stream()
222 Function: Makes a Bio::DB::SeqStreamI compliant object
223 which provides a single method, next_primary_seq
224 Returns : Bio::DB::SeqStreamI
# Build a Bio::SeqIO::MultiFile reader over the files known to this index,
# using the format reported by _file_format.
230 sub get_PrimarySeq_stream
{
# Number of indexed files; a false value from _file_count is treated as 0.
232 my $num = $self->_file_count() || 0;
# Each file's record is stored in the db under the key "__FILE_<n>";
# unpack_record splits it into the file name and a stored size.
235 for (my $i = 0; $i < $num; $i++) {
236 my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} );
# The reader is given a reference to @file; presumably that array collects
# the $file values from the loop above -- TODO confirm, the code that fills
# it is not shown in this excerpt.
240 my $out = Bio
::SeqIO
::MultiFile
->new( '-format' => $self->_file_format , -files
=> \
@file);
244 =head2 get_all_primary_ids
246 Title : get_all_primary_ids
247 Usage : @ids = $seqdb->get_all_primary_ids()
248 Function: gives an array of all the primary_ids of the
249 sequence objects in the database. These
250 may be ids (display style) or accession numbers
251 or something else completely different - they
252 *are not* meaningful outside of this database
255 Returns : an array of strings
# Return a list of ids with one entry per distinct record position, so that
# a record indexed under several keys is reported only once.
261 sub get_all_primary_ids
{
262 my ($self,@args) = @_;
265 # the problem is here that we have indexed things both on
266 # accession number and name.
268 # We could take two options
269 # here - loop over the database, returning only one copy of each
270 # id that points to the same byte position, or we rely on semantics
271 # of accession numbers.
273 # someone is going to index a database with no accession numbers.
274 # doh!. We have to uniquify the index...
# Walk every key/record pair in $db (presumably the index hash from
# $self->db -- TODO confirm, its assignment is not shown in this excerpt).
277 while (my($id, $rec) = each %$db) {
# Decode the record into the file it lives in and its start offset.
282 my ($file, $begin) = $self->unpack_record( $rec );
# Key on "file:offset": ids that point at the same record overwrite each
# other, so only the id seen last for a given position is kept.
284 $bytepos{"$file:$begin"} = $id;
# One surviving id per position, in no particular order (hash values).
287 return values %bytepos;
291 =head2 get_Seq_by_primary_id
293 Title : get_Seq_by_primary_id
294 Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string);
295 Function: Gets a Bio::Seq object by the primary id. The primary
296 id in these cases has to come from $db->get_all_primary_ids.
297 There is no other way to get (or guess) the primary_ids
300 The other possibility is to get Bio::PrimarySeqI objects
301 via the get_PrimarySeq_stream and the primary_id field
302 on these objects are specified as the ids to use here.
303 Returns : A Bio::Seq object
304 Args : primary id (as a string)
305 Throws : "acc does not exist" exception
# Look up a sequence by primary id. This is a thin wrapper: the id is passed
# straight to fetch() and whatever fetch() returns is returned unchanged.
310 sub get_Seq_by_primary_id
{
312 return $self->fetch($id);