Bio/Index/Fasta.pm

   1
   2 #
   3 # BioPerl module for Bio::Index::Abstract
   4 #
   5 # Cared for by James Gilbert <jgrg@sanger.ac.uk>
   6 #
   7 # You may distribute this module under the same terms as perl itself
   8
   9 # POD documentation - main docs before the code
  10
  11 =head1 NAME
  12
  13 Bio::Index::Fasta - Interface for indexing (multiple) fasta files
  14
  15 =head1 SYNOPSIS
  16
  17     # Complete code for making an index for several
  18     # fasta files
  19     use Bio::Index::Fasta;
  20
  21     my $Index_File_Name = shift;
  22     my $inx = Bio::Index::Fasta->new($Index_File_Name, 'WRITE');
  23     $inx->make_index(@ARGV);
  24
  25     # Print out several sequences present in the index
  26     # in gcg format
  27     use Bio::Index::Fasta;
  28
  29     my $Index_File_Name = shift;
  30     my $inx = Bio::Index::Fasta->new($Index_File_Name);
  31
  32     foreach my $id (@ARGV) {
  33         my $seq = $inx->fetch($id); # Returns Bio::Seq object
  34         print $seq->layout('GCG');
  35     }
  36
  37     # or, alternatively
  38
  39     my $seq = $inx->get_Seq_by_id($id); #identical to fetch
  40
  41 =head1 DESCRIPTION
  42
  43 Inherits functions for managing dbm files from Bio::Index::Abstract.pm,
  44 and provides the basic funtionallity for indexing fasta files, and
  45 retrieving the sequence from them.
  46
  47 Bio::Index::Fasta supports the Bio::DB::BioSeqI interface, meaning
  48 it can be used a a Sequence database for other parts of bioperl
  49
  50 =head1 FEED_BACK
  51
  52 =head2 Mailing Lists
  53
  54 User feedback is an integral part of the evolution of this and other
  55 Bioperl modules. Send your comments and suggestions preferably to one
  56 of the Bioperl mailing lists.  Your participation is much appreciated.
  57
  58   vsns-bcd-perl@lists.uni-bielefeld.de          - General discussion
  59   vsns-bcd-perl-guts@lists.uni-bielefeld.de     - Technically-oriented discussion
  60   http://bio.perl.org/MailList.html             - About the mailing lists
  61
  62 =head2 Reporting Bugs
  63
  64 Report bugs to the Bioperl bug tracking system to help us keep track
  65 the bugs and their resolution.  Bug reports can be submitted via
  66 email or the web:
  67
  68   bioperl-bugs@bio.perl.org
  69   http://bio.perl.org/bioperl-bugs/
  70
  71 =head1 AUTHOR - James Gilbert
  72
  73 Email - jgrg@sanger.ac.uk
  74
  75 =head1 APPENDIX
  76
  77 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
  78
  79 =cut
  80
  81
  82 # Let the code begin...
  83
  84
  85 package Bio::Index::Fasta;
  86
  87 use vars qw($VERSION @ISA @EXPORT_OK);
  88 use strict;
  89
  90 use Bio::Index::Abstract;
  91 use Bio::Seq;
  92
  93 @ISA = qw(Bio::Index::Abstract Bio::DB::BioSeqI Exporter);
  94 @EXPORT_OK = qw();
  95
  96 sub _type_stamp {
  97     return '__FASTA__'; # What kind of index are we?
  98 }
  99
 100 #
 101 # Suggested fix by Michael G Schwern <schwern@pobox.com> to
 102 # get around a clash with CPAN shell...
 103 #
 104
 105 BEGIN {
 106     $VERSION = 0.2;
 107 }
 108
 109 sub _version {
 110     return $VERSION;
 111 }
 112
 113
 114 =head2 _initialize
 115
 116   Title   : _initialize
 117   Usage   : $index->_initialize
 118   Function: Calls $index->SUPER::_initialize(), and then adds
 119             the default id parser for fasta files.
 120   Example :
 121   Returns :
 122   Args    :
 123
 124 =cut
 125
 126 sub _initialize {
 127     my($self, $index_file, $write_flag) = @_;
 128
 129     $self->SUPER::_initialize($index_file, $write_flag);
 130     $self->id_parser( \&default_id_parser );
 131 }
 132
 133
 134 =head2 _index_file
 135
 136   Title   : _index_file
 137   Usage   : $index->_index_file( $file_name, $i )
 138   Function: Specialist function to index FASTA format files.
 139             Is provided with a filename and an integer
 140             by make_index in its SUPER class.
 141   Example :
 142   Returns :
 143   Args    :
 144
 145 =cut
 146
 147 sub _index_file {
 148     my( $self,
 149         $file, # File name
 150         $i     # Index-number of file being indexed
 151         ) = @_;
 152
 153     my( $begin, # Offset from start of file of the start
 154                 # of the last found record.
 155         $end,   # Offset from start of file of the end
 156                 # of the last found record.
 157         $id,    # ID of last found record.
 158         );
 159
 160     $begin = 0;
 161     $end   = 0;
 162
 163     open FASTA, $file or $self->throw("Can't open file for read : $file");
 164
 165     # Main indexing loop
 166     while (<FASTA>) {
 167         if (/^>/) {
 168             my $new_begin = tell(FASTA) - length( $_ );
 169             $end = $new_begin - 1;
 170
 171             $self->add_record($id, $i, $begin, $end) if $id;
 172
 173             $begin = $new_begin;
 174             ($id) = $self->record_id( $_ );
 175         }
 176     }
 177     # Don't forget to add the last record
 178     $end = tell(FASTA);
 179     $self->add_record($id, $i, $begin, $end) if $id;
 180
 181     close FASTA;
 182     return 1;
 183 }
 184
 185
 186 # Should there be a prototype for this method in Index::Abstract.pm?
 187 =head2 record_id
 188
 189   Title   : record_id
 190   Usage   : $index->record_id( STRING );
 191   Function: Parses the ID for an entry from the string
 192             supplied, using the code in $index->{'_id_parser'}
 193   Example :
 194   Returns : scalar or exception
 195   Args    : STRING
 196
 197
 198 =cut
 199
 200 sub record_id {
 201     my ($self, $line) = @_;
 202
 203     if (my $id = $self->{'_id_parser'}->( $line )) {
 204         return $id;
 205     } else {
 206         $self->throw("Can't parse ID from line : $line");
 207     }
 208 }
 209
 210
 211 =head2 id_parser
 212
 213   Title   : id_parser
 214   Usage   : $index->id_parser( CODE )
 215   Function: Stores or returns the code used by record_id
 216             to parse the ID for record from a string.  Useful
 217             for (for instance) specifying a different parser
 218             for different flavours of FASTA file.
 219   Example : $index->id_parser( \&my_id_parser )
 220   Returns : ref to CODE if called without arguments
 221   Args    : CODE
 222
 223 =cut
 224
 225 sub id_parser {
 226     my( $self, $code ) = @_;
 227
 228     if ($code) {
 229         $self->{'_id_parser'} = $code;
 230     } else {
 231         return $self->{'_id_parser'};
 232     }
 233 }
 234
 235
 236
 237 =head2 default_id_parser
 238
 239   Title   : default_id_parser
 240   Usage   : $id = default_id_parser( $header )
 241   Function: The default Fasta ID parser for Fasta.pm
 242             Returns $1 from applying the regexp /^>\s*(\S+)/
 243             to $header.
 244   Example :
 245   Returns : ID string
 246   Args    : a fasta header line string
 247
 248 =cut
 249
 250 sub default_id_parser {
 251     my $line = shift;
 252     $line =~ /^>\s*(\S+)/;
 253     return $1;
 254 }
 255
 256
 257 =head2 fetch
 258
 259   Title   : fetch
 260   Usage   : $index->fetch( $id )
 261   Function: Returns a Bio::Seq object from the index
 262   Example : $seq = $index->fetch( 'dJ67B12' )
 263   Returns : Bio::Seq object
 264   Args    : ID
 265
 266 =cut
 267
 268 sub fetch {
 269     my( $self, $id ) = @_;
 270
 271     my $db = $self->db();
 272     if (my $rec = $db->{ $id }) {
 273         my( @record );
 274
 275         my ($file, $begin, $end) = $self->unpack_record( $rec );
 276
 277         # Get the (possibly cached) filehandle
 278         my $fh = $self->_file_handle( $file );
 279
 280         # Accumulate lines in @record until beyond end
 281         seek($fh, $begin, 0);
 282         while (defined(my $line = <$fh>)) {
 283             push(@record, $line);
 284             last if tell($fh) > $end;
 285         }
 286
 287         $self->throw("Can't fetch sequence for record : $id")
 288             unless @record;
 289
 290         # Parse record
 291         my $firstLine = shift @record;
 292         my ($name, $desc) = $firstLine =~ /^>\s*(\S+)\s*(.*?)\s*$/;
 293         chomp( @record );
 294
 295         # Return a shiny Bio::Seq object
 296         return Bio::Seq->new( -ID   => $name,
 297                               -DESC => $desc,
 298                               -SEQ  => uc(join('', @record)) );
 299     } else {
 300         $self->throw("Unable to find a record for $id in Fasta index");
 301         return;
 302     }
 303 }
 304
 305 =head2 get_Seq_by_id
 306
 307  Title   : get_Seq_by_id
 308  Usage   : $seq = $db->get_Seq_by_id()
 309  Function: retrieves a sequence object, identically to
 310            ->fetch, but here behaving as a Bio::DB::BioSeqI
 311  Returns : new Bio::Seq object
 312  Args    : string represents the id
 313
 314
 315 =cut
 316
 317 sub get_Seq_by_id{
 318    my ($self,$id) = @_;
 319
 320    return $self->fetch($id);
 321 }
 322
 323
 324 1;
 325
 326
 327