fixed index bugs, and fixed makefile
[bioperl-live.git] / Bio / Index / Fasta.pm
blob0f4464f0b564bbdd992e2654caaecf1f16d88280
3 # BioPerl module for Bio::Index::Abstract
5 # Cared for by James Gilbert <jgrg@sanger.ac.uk>
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Index::Fasta - Interface for indexing (multiple) fasta files
15 =head1 SYNOPSIS
17 # Complete code for making an index for several
18 # fasta files
19 use Bio::Index::Fasta;
21 my $Index_File_Name = shift;
22 my $inx = Bio::Index::Fasta->new($Index_File_Name, 'WRITE');
23 $inx->make_index(@ARGV);
25 # Print out several sequences present in the index
26 # in gcg format
27 use Bio::Index::Fasta;
29 my $Index_File_Name = shift;
30 my $inx = Bio::Index::Fasta->new($Index_File_Name);
32 foreach my $id (@ARGV) {
33 my $seq = $inx->fetch($id); # Returns Bio::Seq object
34 print $seq->layout('GCG');
37 # or, alternatively
39 my $seq = $inx->get_Seq_by_id($id); #identical to fetch
41 =head1 DESCRIPTION
43 Inherits functions for managing dbm files from Bio::Index::Abstract.pm,
44 and provides the basic funtionallity for indexing fasta files, and
45 retrieving the sequence from them.
47 Bio::Index::Fasta supports the Bio::DB::BioSeqI interface, meaning
48 it can be used a a Sequence database for other parts of bioperl
50 =head1 FEED_BACK
52 =head2 Mailing Lists
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to one
56 of the Bioperl mailing lists. Your participation is much appreciated.
58 vsns-bcd-perl@lists.uni-bielefeld.de - General discussion
59 vsns-bcd-perl-guts@lists.uni-bielefeld.de - Technically-oriented discussion
60 http://bio.perl.org/MailList.html - About the mailing lists
62 =head2 Reporting Bugs
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 the bugs and their resolution. Bug reports can be submitted via
66 email or the web:
68 bioperl-bugs@bio.perl.org
69 http://bio.perl.org/bioperl-bugs/
71 =head1 AUTHOR - James Gilbert
73 Email - jgrg@sanger.ac.uk
75 =head1 APPENDIX
77 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
79 =cut
82 # Let the code begin...
85 package Bio::Index::Fasta;
87 use vars qw($VERSION @ISA @EXPORT_OK);
88 use strict;
90 use Bio::Index::Abstract;
91 use Bio::Seq;
93 @ISA = qw(Bio::Index::Abstract Bio::DB::BioSeqI Exporter);
94 @EXPORT_OK = qw();
96 sub _type_stamp {
97 return '__FASTA__'; # What kind of index are we?
101 # Suggested fix by Michael G Schwern <schwern@pobox.com> to
102 # get around a clash with CPAN shell...
105 BEGIN {
106 $VERSION = 0.2;
109 sub _version {
110 return $VERSION;
114 =head2 _initialize
116 Title : _initialize
117 Usage : $index->_initialize
118 Function: Calls $index->SUPER::_initialize(), and then adds
119 the default id parser for fasta files.
120 Example :
121 Returns :
122 Args :
124 =cut
126 sub _initialize {
127 my($self, $index_file, $write_flag) = @_;
129 $self->SUPER::_initialize($index_file, $write_flag);
130 $self->id_parser( \&default_id_parser );
134 =head2 _index_file
136 Title : _index_file
137 Usage : $index->_index_file( $file_name, $i )
138 Function: Specialist function to index FASTA format files.
139 Is provided with a filename and an integer
140 by make_index in its SUPER class.
141 Example :
142 Returns :
143 Args :
145 =cut
147 sub _index_file {
148 my( $self,
149 $file, # File name
150 $i # Index-number of file being indexed
151 ) = @_;
153 my( $begin, # Offset from start of file of the start
154 # of the last found record.
155 $end, # Offset from start of file of the end
156 # of the last found record.
157 $id, # ID of last found record.
160 $begin = 0;
161 $end = 0;
163 open FASTA, $file or $self->throw("Can't open file for read : $file");
165 # Main indexing loop
166 while (<FASTA>) {
167 if (/^>/) {
168 my $new_begin = tell(FASTA) - length( $_ );
169 $end = $new_begin - 1;
171 $self->add_record($id, $i, $begin, $end) if $id;
173 $begin = $new_begin;
174 ($id) = $self->record_id( $_ );
177 # Don't forget to add the last record
178 $end = tell(FASTA);
179 $self->add_record($id, $i, $begin, $end) if $id;
181 close FASTA;
182 return 1;
186 # Should there be a prototype for this method in Index::Abstract.pm?
187 =head2 record_id
189 Title : record_id
190 Usage : $index->record_id( STRING );
191 Function: Parses the ID for an entry from the string
192 supplied, using the code in $index->{'_id_parser'}
193 Example :
194 Returns : scalar or exception
195 Args : STRING
198 =cut
200 sub record_id {
201 my ($self, $line) = @_;
203 if (my $id = $self->{'_id_parser'}->( $line )) {
204 return $id;
205 } else {
206 $self->throw("Can't parse ID from line : $line");
211 =head2 id_parser
213 Title : id_parser
214 Usage : $index->id_parser( CODE )
215 Function: Stores or returns the code used by record_id
216 to parse the ID for record from a string. Useful
217 for (for instance) specifying a different parser
218 for different flavours of FASTA file.
219 Example : $index->id_parser( \&my_id_parser )
220 Returns : ref to CODE if called without arguments
221 Args : CODE
223 =cut
225 sub id_parser {
226 my( $self, $code ) = @_;
228 if ($code) {
229 $self->{'_id_parser'} = $code;
230 } else {
231 return $self->{'_id_parser'};
237 =head2 default_id_parser
239 Title : default_id_parser
240 Usage : $id = default_id_parser( $header )
241 Function: The default Fasta ID parser for Fasta.pm
242 Returns $1 from applying the regexp /^>\s*(\S+)/
243 to $header.
244 Example :
245 Returns : ID string
246 Args : a fasta header line string
248 =cut
250 sub default_id_parser {
251 my $line = shift;
252 $line =~ /^>\s*(\S+)/;
253 return $1;
257 =head2 fetch
259 Title : fetch
260 Usage : $index->fetch( $id )
261 Function: Returns a Bio::Seq object from the index
262 Example : $seq = $index->fetch( 'dJ67B12' )
263 Returns : Bio::Seq object
264 Args : ID
266 =cut
268 sub fetch {
269 my( $self, $id ) = @_;
271 my $db = $self->db();
272 if (my $rec = $db->{ $id }) {
273 my( @record );
275 my ($file, $begin, $end) = $self->unpack_record( $rec );
277 # Get the (possibly cached) filehandle
278 my $fh = $self->_file_handle( $file );
280 # Accumulate lines in @record until beyond end
281 seek($fh, $begin, 0);
282 while (defined(my $line = <$fh>)) {
283 push(@record, $line);
284 last if tell($fh) > $end;
287 $self->throw("Can't fetch sequence for record : $id")
288 unless @record;
290 # Parse record
291 my $firstLine = shift @record;
292 my ($name, $desc) = $firstLine =~ /^>\s*(\S+)\s*(.*?)\s*$/;
293 chomp( @record );
295 # Return a shiny Bio::Seq object
296 return Bio::Seq->new( -ID => $name,
297 -DESC => $desc,
298 -SEQ => uc(join('', @record)) );
299 } else {
300 $self->throw("Unable to find a record for $id in Fasta index");
301 return;
305 =head2 get_Seq_by_id
307 Title : get_Seq_by_id
308 Usage : $seq = $db->get_Seq_by_id()
309 Function: retrieves a sequence object, identically to
310 ->fetch, but here behaving as a Bio::DB::BioSeqI
311 Returns : new Bio::Seq object
312 Args : string represents the id
315 =cut
317 sub get_Seq_by_id{
318 my ($self,$id) = @_;
320 return $self->fetch($id);