Bio/Index/Fasta.pm

   1 #
   2 # $Id$
   3 #
   4 # BioPerl module for Bio::Index::Fasta
   5 #
   6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   7 #
   8 # Cared for by James Gilbert <jgrg@sanger.ac.uk>
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Index::Fasta - Interface for indexing (multiple) fasta files
  17
  18 =head1 SYNOPSIS
  19
  20     # Make an index for one or more fasta files
  21     use Bio::Index::Fasta;
  22     use strict;
  23
  24     my $Index_File_Name = shift;
  25     my $inx = Bio::Index::Fasta->new(-filename => $Index_File_Name,
  26                                      -write_flag => 1);
  27     $inx->make_index(@ARGV);
  28
  29
  30     # Once the index is made it can be accessed, either in the
  31     # same script or a different one
  32     use Bio::Index::Fasta;
  33     use strict;
  34
  35     my $Index_File_Name = shift;
  36     my $inx = Bio::Index::Fasta->new(-filename => $Index_File_Name);
  37     my $out = Bio::SeqIO->new(-format => 'Fasta',
  38                               -fh => \*STDOUT);
  39
  40     foreach my $id (@ARGV) {
  41         my $seq = $inx->fetch($id); # Returns Bio::Seq object
  42          $out->write_seq($seq);
  43     }
  44
  45     # or, alternatively
  46     my $id;
  47     my $seq = $inx->get_Seq_by_id($id); # identical to fetch()
  48
  49 =head1 DESCRIPTION
  50
  51 Inherits functions for managing dbm files from Bio::Index::Abstract.pm,
  52 and provides the basic functionality for indexing fasta files, and
  53 retrieving the sequence from them. For best results 'use strict'.
  54
  55 Bio::Index::Fasta supports the Bio::DB::BioSeqI interface, meaning
  56 it can be used as a Sequence database for other parts of bioperl
  57
  58 Additional example code is available in scripts/index/*PLS and in
  59 the Bioperl Tutorial (L<http://www.bioperl.org/wiki/Bptutorial.pl>)
  60
  61 Note that by default the key for the sequence will be the first continuous
  62 string after the 'E<gt>' in the fasta header. If you want to use a specific
  63 substring of the fasta header you must use the id_parser() method.
  64
  65 You can also set or customize the unique key used to retrieve by
  66 writing your own function and calling the id_parser() method.
  67 For example:
  68
  69    $inx->id_parser(\&get_id);
  70    # make the index
  71    $inx->make_index($file_name);
  72
  73    # here is where the retrieval key is specified
  74    sub get_id {
  75       my $line = shift;
  76       $line =~ /^>.+gi\|(\d+)/;
  77       $1;
  78    }
  79
  80
  81 =head1 FEEDBACK
  82
  83 =head2 Mailing Lists
  84
  85 User feedback is an integral part of the evolution of this and other
  86 Bioperl modules. Send your comments and suggestions preferably to one
  87 of the Bioperl mailing lists.  Your participation is much appreciated.
  88
  89   bioperl-l@bioperl.org                  - General discussion
  90   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  91
  92 =head2 Support
  93
  94 Please direct usage questions or support issues to the mailing list:
  95
  96 I<bioperl-l@bioperl.org>
  97
  98 rather than to the module maintainer directly. Many experienced and
  99 responsive experts will be able to look at the problem and quickly
 100 address it. Please include a thorough description of the problem
 101 with code and data examples if at all possible.
 102
 103 =head2 Reporting Bugs
 104
 105 Report bugs to the Bioperl bug tracking system to help us keep track
 106 of the bugs and their resolution.  Bug reports can be submitted via the
 107 web:
 108
 109   http://bugzilla.open-bio.org/
 110
 111 =head1 AUTHOR - James Gilbert
 112
 113 Email - jgrg@sanger.ac.uk
 114
 115 =head1 APPENDIX
 116
 117 The rest of the documentation details each of the object
 118 methods. Internal methods are usually preceded with a _
 119
 120 =cut
 121
 122
 123 # Let the code begin...
 124
 125
 126 package Bio::Index::Fasta;
 127
 128 use strict;
 129 use warnings;
 130
 131 use Bio::Seq;
 132
 133 use base qw(Bio::Index::AbstractSeq);
 134
 135 #
 136 # Suggested fix by Michael G Schwern <schwern@pobox.com> to
 137 # get around a clash with CPAN shell...
 138 #
 139
 # Reports the version number of this module as a plain number (0.2).
 sub _version {
     my $version = 0.2;
     return $version;
 }
 143
 144 =head2 _file_format
 145
 146  Title   : _file_format
 147  Function: The file format for this package, which is needed
 148            by the SeqIO system when reading the sequence.
 149  Returns : 'Fasta'
 150
 151 =cut
 152
 # Names the sequence file format handled by this package: always 'Fasta'.
 sub _file_format {
     my $format = 'Fasta';
     return $format;
 }
 156
 157 =head2 _index_file
 158
 159   Title   : _index_file
 160   Usage   : $index->_index_file( $file_name, $i )
 161   Function: Specialist function to index FASTA format files.
 162             Is provided with a filename and an integer
 163             by make_index in its SUPER class.
 164   Example :
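 165   Returns : 1 on success; calls throw() if the file cannot be opened
 166   Args    : name of the FASTA file, index number of that file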
 167
 168 =cut
 169
 # Scan one FASTA file and register every header line found in it.
 #
 # Args    : $file - name of the FASTA file to read
 #           $i    - index number of that file, passed through to add_record()
 # Returns : 1 after the whole file has been read
 # Throws  : via $self->throw() when the file cannot be opened
 sub _index_file {
     my ( $self, $file, $i ) = @_;

     my $id_parser = $self->id_parser;

     # Include $! so the failure reason (missing file, permissions, ...) is
     # reported, not just the file name.
     open my $FASTA, '<', $file
         or $self->throw("Can't open file for read : $file ($!)");

     # while (<$FASTA>) assigns to the global $_ without localising it;
     # localise it here so the caller's $_ survives, while parsers still see
     # the header line in both $_[0] and $_ as before.
     local $_;

     # Offset of the start of the line that is about to be read.  Taking it
     # from tell() *before* each read gives the real file position of the
     # line whatever the line endings or I/O layers are, instead of
     # reconstructing it from length($_) plus a per-platform correction.
     my $begin = tell($FASTA);

     # Main indexing loop
     while (<$FASTA>) {
         if (/^>/) {

             # $begin is the offset of the '>' that opens this record.
             # One index entry is added per ID returned by the parser.
             foreach my $id ( $id_parser->($_) ) {
                 $self->add_record( $id, $i, $begin );
             }
         }
         $begin = tell($FASTA);
     }
     close $FASTA;
     return 1;
 }
 203
 204 =head2 id_parser
 205
 206   Title   : id_parser
 207   Usage   : $index->id_parser( CODE )
 208   Function: Stores or returns the code used by record_id to
 209             parse the ID for record from a string.  Useful
 210             for (for instance) specifying a different
 211             parser for different flavours of FASTA file.
 212             Returns \&default_id_parser (see below) if not
 213             set. If you supply your own id_parser
 214             subroutine, then it should expect a fasta
 215             description line.  An entry will be added to
 216             the index for each string in the list returned.
 217   Example : $index->id_parser( \&my_id_parser )
 218   Returns : ref to CODE if called without arguments
 219   Args    : CODE
 220
 221 =cut
 222
 # Combined getter/setter for the header-parsing callback.
 #
 # A true second argument is stored in $self->{'_id_parser'}.  The stored
 # callback is returned, or a reference to default_id_parser when none has
 # been stored.
 sub id_parser {
     my $self = shift;
     my $code = shift;

     $self->{'_id_parser'} = $code if $code;

     my $parser = $self->{'_id_parser'};
     return $parser ? $parser : \&default_id_parser;
 }
 231
 232 =head2 default_id_parser
 233
 234   Title   : default_id_parser
 235   Usage   : $id = default_id_parser( $header )
 236   Function: The default Fasta ID parser for Fasta.pm
 237             Returns $1 from applying the regexp /^>\s*(\S+)/
 238             to $header.
 239   Returns : ID string
 240   Args    : a fasta header line string
 241
 242 =cut
 243
 # Default header parser: yields the first run of non-whitespace characters
 # after the leading '>' (optional whitespace in between is skipped).
 # Returns nothing (empty list / undef) when the line does not match.
 sub default_id_parser {
     my ($header) = @_;

     return unless $header =~ /^>\s*(\S+)/;
     return $1;
 }
 251
 252 1;