Bio/Index/AbstractSeq.pm

   1 #
   2 # BioPerl module for Bio::Index::AbstractSeq
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Index::AbstractSeq - base class for sequence file indexes
  17
  18 =head1 SYNOPSIS
  19
  20   # Make a new sequence file indexing package
  21
  22   package MyShinyNewIndexer;
  23
  24   use base qw(Bio::Index::AbstractSeq);
  25
  26   # Now provide the necessary methods...
  27
  28 =head1 DESCRIPTION
  29
  30 Provides a common base class for multiple sequence files built using
  31 the Bio::Index::Abstract system, and provides a Bio::DB::SeqI
  32 interface.
  33
  34 =head1 FEEDBACK
  35
  36 =head2 Mailing Lists
  37
  38 User feedback is an integral part of the evolution of this
  39 and other Bioperl modules. Send your comments and suggestions
  40 preferably to one of the Bioperl mailing lists.
  41 Your participation is much appreciated.
  42
  43   bioperl-l@bioperl.org                  - General discussion
  44   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  45
  46 =head2 Support
  47
  48 Please direct usage questions or support issues to the mailing list:
  49
  50 I<bioperl-l@bioperl.org>
  51
  52 rather than to the module maintainer directly. Many experienced and
  53 responsive experts will be able to look at the problem and quickly
  54 address it. Please include a thorough description of the problem
  55 with code and data examples if at all possible.
  56
  57 =head2 Reporting Bugs
  58
  59 Report bugs to the Bioperl bug tracking system to help us keep track
  60 of the bugs and their resolution.  Bug reports can be submitted via the
  61 web:
  62
  63   https://github.com/bioperl/bioperl-live/issues
  64
  65 =head1 AUTHOR - Ewan Birney
  66
  67 Email birney@ebi.ac.uk
  68
  69 =head1 APPENDIX
  70
  71 The rest of the documentation details each of the object methods.
  72 Internal methods are usually preceded with a _
  73
  74 =head1 SEE ALSO
  75
  76 L<Bio::Index::Abstract>, which provides dbm indexing for flat files of
  77 any type, containing sequence or not. L<Bio::Index::AbstractSeq> inherits
  78 from L<Bio::Index::Abstract>
  79
  80 =cut
  81
  82 # Let's begin the code ...
  83
  84 package Bio::Index::AbstractSeq;
  85 use strict;
  86
  87 use Bio::SeqIO::MultiFile;
  88
  89 use base qw(Bio::Index::Abstract Bio::DB::SeqI);
  90
  # Constructor: builds the object through the parent class constructor
  # and starts it off with an empty per-file cache of SeqIO objects.
  sub new {
      my $class = shift;
      my @args  = @_;

      my $self = $class->SUPER::new(@args);

      # Indexed by file number; see _get_SeqIO_object for how it is filled.
      $self->{'_seqio_cache'} = [];

      return $self;
  }
  98
  99 =head2 _file_format
 100
 101  Title   : _file_format
 102  Usage   : $self->_file_format
 103  Function: Derived classes should override this
 104            method (it throws an exception here)
 105            to give the file format of the files used
 106  Example :
 107  Returns :
 108  Args    :
 109
 110 =cut
 111
 # Abstract hook: concrete index classes override this to name the format
 # of the files they index. This base version always throws, naming the
 # class of the object it was called on.
 sub _file_format {
     my $self = shift;

     my $pkg = ref $self;
     $self->throw("Class '$pkg' must provide a file format method correctly");
 }
 118
 119 =head2 fetch
 120
 121   Title   : fetch
 122   Usage   : $index->fetch( $id )
 123   Function: Returns a Bio::Seq object from the index
 124   Example : $seq = $index->fetch( 'dJ67B12' )
 125   Returns : Bio::Seq object
 126   Args    : ID
 127
 128 =cut
 129
 # Look up $id in the index and return the sequence read from the recorded
 # position in the recorded file.
 #
 # Args    : $id - key to look up in the index database
 # Returns : the result of next_seq() on the file's SeqIO object after
 #           seeking to the record's offset; undef when $id has no (true)
 #           record in the index
 # Throws  : if the file handle cannot be positioned at the record offset
 sub fetch {
     my ($self, $id) = @_;
     my $db = $self->db();
     my $seq;

     if (my $rec = $db->{$id}) {
         my ($file, $begin) = $self->unpack_record($rec);

         # Get the (possibly cached) SeqIO object
         my $seqio = $self->_get_SeqIO_object($file);
         my $fh    = $seqio->_fh();

         # Move to the start of the record. The SeqIO object is reused
         # between calls, so after a failed seek the handle would still sit
         # where the previous read ended and next_seq() would silently
         # return some other record; treat the failure as fatal instead.
         # $begin-- if( $^O =~ /mswin/i); # workaround for Win DB_File bug
         seek($fh, $begin, 0)
             or $self->throw("Cannot seek to offset $begin in file $file for id '$id': $!");

         $seq = $seqio->next_seq();
     }

     # We essentially assume that the primary_id for the database is the
     # display_id: a primary_id made up only of non-digit characters is
     # overwritten with the display_id.
     if (ref($seq) && $seq->isa('Bio::PrimarySeqI') &&
         $seq->primary_id =~ /^\D+$/) {
         $seq->primary_id( $seq->display_id() );
     }
     return $seq;
 }
 157
 158 =head2 _get_SeqIO_object
 159
 160   Title   : _get_SeqIO_object
 161   Usage   : $index->_get_SeqIO_object( $file )
 162   Function: Returns a Bio::SeqIO object for the file
 163   Example : $seq = $index->_get_SeqIO_object( 0 )
 164   Returns : Bio::SeqIO object
 165   Args    : File number (an integer)
 166
 167 =cut
 168
 # Return the SeqIO object for indexed file number $i. It is built from
 # the index's file handle on the first request and reused afterwards.
 sub _get_SeqIO_object {
     my $self = shift;
     my $i    = shift;

     # Cache hit: hand back the object created by an earlier call.
     return $self->{'_seqio_cache'}[$i] if $self->{'_seqio_cache'}[$i];

     # Cache miss: wrap this file's handle in a new SeqIO object of the
     # format reported by _file_format, and remember it.
     my $handle = $self->_file_handle($i);
     my $reader = Bio::SeqIO->new(
         -Format => $self->_file_format,
         -fh     => $handle,
     );
     $self->{'_seqio_cache'}[$i] = $reader;

     return $reader;
 }
 181
 182 =head2 get_Seq_by_id
 183
 184  Title   : get_Seq_by_id
 185  Usage   : $seq = $db->get_Seq_by_id($id)
 186  Function: retrieves a sequence object, identically to
 187            ->fetch, but here behaving as a Bio::DB::SeqI
 188  Returns : new Bio::Seq object
 189  Args    : string represents the id
 190
 191
 192 =cut
 193
 # Retrieve the sequence indexed under $id. This is a plain delegate:
 # the id is handed to fetch() and its result is returned unchanged.
 sub get_Seq_by_id {
     my $self = shift;
     my $id   = shift;

     return $self->fetch($id);
 }
 199
 200 =head2 get_Seq_by_acc
 201
 202  Title   : get_Seq_by_acc
 203  Usage   : $seq = $db->get_Seq_by_acc($acc)
 204  Function: retrieves a sequence object, identically to
 205            ->fetch, but here behaving as a Bio::DB::SeqI
 206  Returns : new Bio::Seq object
 207  Args    : string represents the accession number
 208
 209
 210 =cut
 211
 # Retrieve the sequence indexed under an accession number. The index
 # does not distinguish accessions from other keys here: the value is
 # handed to fetch() and its result is returned unchanged.
 sub get_Seq_by_acc {
     my $self = shift;
     my ($id) = @_;

     return $self->fetch($id);
 }
 217
 218 =head2 get_PrimarySeq_stream
 219
 220  Title   : get_PrimarySeq_stream
 221  Usage   : $stream = $db->get_PrimarySeq_stream
 222  Function: Makes a Bio::DB::SeqStreamI compliant object
 223            which provides a single method, next_primary_seq
 224  Returns : Bio::DB::SeqStreamI
 225  Args    : none
 226
 227
 228 =cut
 229
 # Build a single Bio::SeqIO::MultiFile stream over every file recorded in
 # the index. Files are looked up under the "__FILE_<n>" keys, for n from
 # 0 up to one less than the index's file count.
 sub get_PrimarySeq_stream {
     my ($self) = @_;

     my $file_total = $self->_file_count() || 0;
     my @file;

     foreach my $i (0 .. $file_total - 1) {
         # Only the first field of the unpacked record is used; it is
         # collected into the list handed to MultiFile as -files.
         my ($entry) = $self->unpack_record( $self->db->{"__FILE_$i"} );
         push @file, $entry;
     }

     my $stream = Bio::SeqIO::MultiFile->new(
         '-format' => $self->_file_format,
         -files    => \@file,
     );
     return $stream;
 }
 243
 244 =head2 get_all_primary_ids
 245
 246  Title   : get_all_primary_ids
 247  Usage   : @ids = $seqdb->get_all_primary_ids()
 248  Function: gives an array of all the primary_ids of the
 249            sequence objects in the database. These
 250            may be ids (display style) or accession numbers
 251            or something else completely different - they
 252            *are not* meaningful outside of this database
 253            implementation.
 254  Example :
 255  Returns : an array of strings
 256  Args    : none
 257
 258
 259 =cut
 260
 # Return one id per distinct record position in the index.
 #
 # The same record may be indexed under several keys (for instance both a
 # name and an accession number), and a database may have no accession
 # numbers at all, so the keys cannot be told apart by their form.
 # Instead the ids are collected in a hash keyed by "file:offset", which
 # leaves exactly one id for each position.
 sub get_all_primary_ids {
     my $self  = shift;
     my $index = $self->db;

     my %id_at;
     while (my ($key, $packed) = each %$index) {
         # Keys starting with "__" carry internal info, not sequences.
         next if $key =~ /^__/;

         my ($file, $begin) = $self->unpack_record($packed);
         $id_at{"$file:$begin"} = $key;
     }

     return values %id_at;
 }
 289
 290
 291 =head2 get_Seq_by_primary_id
 292
 293  Title   : get_Seq_by_primary_id
 294  Usage   : $seq = $db->get_Seq_by_primary_id($primary_id_string);
 295  Function: Gets a Bio::Seq object by the primary id. The primary
 296            id in these cases has to come from $db->get_all_primary_ids.
 297            There is no other way to get (or guess) the primary_ids
 298            in a database.
 299
 300            The other possibility is to get Bio::PrimarySeqI objects
 301            via the get_PrimarySeq_stream and the primary_id field
 302            on these objects are specified as the ids to use here.
 303  Returns : A Bio::Seq object
 304  Args    : primary id (as a string)
 305  Throws  : "acc does not exist" exception
 306
 307
 308 =cut
 309
 # Retrieve the sequence indexed under a primary id. As with the other
 # get_Seq_by_* accessors in this class, the id is handed to fetch() and
 # its result is returned unchanged.
 sub get_Seq_by_primary_id {
     my $self = shift;
     my $id   = shift;

     return $self->fetch($id);
 }
 314
 315 1;