Bio/Index/Fasta.pm

   1 #
   2 # $Id$
   3 #
   4 # BioPerl module for Bio::Index::Fasta
   5 #
   6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   7 #
   8 # Cared for by James Gilbert <jgrg@sanger.ac.uk>
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Index::Fasta - Interface for indexing (multiple) fasta files
  17
  18 =head1 SYNOPSIS
  19
  20     # Make an index for one or more fasta files
  21     use Bio::Index::Fasta;
  22     use strict;
  23
  24     my $Index_File_Name = shift;
  25     my $inx = Bio::Index::Fasta->new(-filename => $Index_File_Name,
  26                                      -write_flag => 1);
  27     $inx->make_index(@ARGV);
  28
  29
  30     # Once the index is made it can be accessed, either in the
  31     # same script or a different one
  32     use Bio::Index::Fasta;
  33     use strict;
  34
  35     my $Index_File_Name = shift;
  36     my $inx = Bio::Index::Fasta->new(-filename => $Index_File_Name);
  37     my $out = Bio::SeqIO->new(-format => 'Fasta',
  38                               -fh => \*STDOUT);
  39
  40     foreach my $id (@ARGV) {
  41         my $seq = $inx->fetch($id); # Returns Bio::Seq object
  42              $out->write_seq($seq);
  43     }
  44
  45     # or, alternatively
  46     my $id;
  47     my $seq = $inx->get_Seq_by_id($id); # identical to fetch()
  48
  49 =head1 DESCRIPTION
  50
  51 Inherits functions for managing dbm files from Bio::Index::Abstract.pm,
  52 and provides the basic functionality for indexing fasta files, and
  53 retrieving the sequence from them. For best results 'use strict'.
  54
  55 Bio::Index::Fasta supports the Bio::DB::BioSeqI interface, meaning
  56 it can be used as a Sequence database for other parts of bioperl.
  57
  58 Additional example code is available in scripts/index/*PLS and in
  59 the Bioperl Tutorial (L<http://www.bioperl.org/wiki/Bptutorial.pl>)
  60
  61 Note that by default the key for the sequence will be the first continuous
  62 string after the 'E<gt>' in the fasta header. If you want to use a specific
  63 substring of the fasta header you must use the id_parser() method.
  64
  65 You can also set or customize the unique key used to retrieve by
  66 writing your own function and calling the id_parser() method.
  67 For example:
  68
  69    $inx->id_parser(\&get_id);
  70    # make the index
  71    $inx->make_index($file_name);
  72
  73    # here is where the retrieval key is specified
  74    sub get_id {
  75       my $line = shift;
  76       $line =~ /^>.+gi\|(\d+)/;
  77       $1;
  78    }
  79
  80
  81 =head1 FEEDBACK
  82
  83 =head2 Mailing Lists
  84
  85 User feedback is an integral part of the evolution of this and other
  86 Bioperl modules. Send your comments and suggestions preferably to one
  87 of the Bioperl mailing lists.  Your participation is much appreciated.
  88
  89   bioperl-l@bioperl.org                  - General discussion
  90   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  91
  92 =head2 Support
  93
  94 Please direct usage questions or support issues to the mailing list:
  95
  96 L<bioperl-l@bioperl.org>
  97
  98 rather than to the module maintainer directly. Many experienced and
  99 responsive experts will be able to look at the problem and quickly
 100 address it. Please include a thorough description of the problem
 101 with code and data examples if at all possible.
 102
 103 =head2 Reporting Bugs
 104
 105 Report bugs to the Bioperl bug tracking system to help us keep track
 106 of the bugs and their resolution.  Bug reports can be submitted via the
 107 web:
 108
 109   http://bugzilla.open-bio.org/
 110
 111 =head1 AUTHOR - James Gilbert
 112
 113 Email - jgrg@sanger.ac.uk
 114
 115 =head1 APPENDIX
 116
 117 The rest of the documentation details each of the object
 118 methods. Internal methods are usually preceded with a _
 119
 120 =cut
 121
 122
 123 # Let the code begin...
 124
 125
 126 package Bio::Index::Fasta;
 127
 128 use strict;
 129
 130 use Bio::Seq;
 131
 132 use base qw(Bio::Index::AbstractSeq);
 133
 134 #
 135 # Suggested fix by Michael G Schwern <schwern@pobox.com> to
 136 # get around a clash with CPAN shell...
 137 #
 138
 139 sub _version {
 140     return 0.2;
 141 }
 142
 143 =head2 _file_format
 144
 145  Title   : _file_format
 146  Function: The file format for this package, which is needed
 147            by the SeqIO system when reading the sequence.
 148  Returns : 'Fasta'
 149
 150 =cut
 151
 152 sub _file_format {
 153     return 'Fasta';
 154 }
 155
 156 =head2 _index_file
 157
 158   Title   : _index_file
 159   Usage   : $index->_index_file( $file_name, $i )
 160   Function: Specialist function to index FASTA format files.
 161             Is provided with a filename and an integer
 162             by make_index in its SUPER class.
 163   Example :
 164   Returns :
 165   Args    :
 166
 167 =cut
 168
 169 sub _index_file {
 170         my( $self,
 171                  $file, # File name
 172                  $i,    # Index-number of file being indexed
 173           ) = @_;
 174
 175         my( $begin,     # Offset from start of file of the start
 176                              # of the last found record.
 177           );
 178
 179         $begin = 0;
 180
 181         my $id_parser = $self->id_parser;
 182
 183         open my $FASTA, '<', $file or $self->throw("Can't open file for read : $file");
 184
 185         # Main indexing loop
 186         while (<$FASTA>) {
 187                 if (/^>/) {
 188                         # $begin is the position of the first character after the '>'
 189                         my $offset = ( $^O =~ /mswin/i ) ? 0 : 1;
 190                         my $begin = tell($FASTA) - length( $_ ) + $offset;
 191
 192                         foreach my $id (&$id_parser($_)) {
 193                                 $self->add_record($id, $i, $begin);
 194                         }
 195                 }
 196         }
 197         close $FASTA;
 198         return 1;
 199 }
 200
 201 =head2 id_parser
 202
 203   Title   : id_parser
 204   Usage   : $index->id_parser( CODE )
 205   Function: Stores or returns the code used by record_id to
 206             parse the ID for record from a string.  Useful
 207             for (for instance) specifying a different
 208             parser for different flavours of FASTA file.
 209             Returns \&default_id_parser (see below) if not
 210             set. If you supply your own id_parser
 211             subroutine, then it should expect a fasta
 212             description line.  An entry will be added to
 213             the index for each string in the list returned.
 214   Example : $index->id_parser( \&my_id_parser )
 215   Returns : ref to CODE if called without arguments
 216   Args    : CODE
 217
 218 =cut
 219
 220 sub id_parser {
 221         my( $self, $code ) = @_;
 222
 223         if ($code) {
 224                 $self->{'_id_parser'} = $code;
 225         }
 226         return $self->{'_id_parser'} || \&default_id_parser;
 227 }
 228
 229 =head2 default_id_parser
 230
 231   Title   : default_id_parser
 232   Usage   : $id = default_id_parser( $header )
 233   Function: The default Fasta ID parser for Fasta.pm
 234             Returns $1 from applying the regexp /^>\s*(\S+)/
 235             to $header.
 236   Returns : ID string
 237   Args    : a fasta header line string
 238
 239 =cut
 240
 241 sub default_id_parser {
 242         if ($_[0] =~ /^>\s*(\S+)/) {
 243                 return $1;
 244         } else {
 245                 return;
 246         }
 247 }
 248
 249 1;