don't enforce strict line length for FASTA, it's good practice but not canonical...
[bioperl-live.git] / Bio / DB / Fasta.pm
blob79d0413a365e46ac20d30bc085507ce89cc209d0
2 # BioPerl module for Bio::DB::Fasta
4 # You may distribute this module under the same terms as perl itself
8 =head1 NAME
10 Bio::DB::Fasta - Fast indexed access to fasta files
12 =head1 SYNOPSIS
14 use Bio::DB::Fasta;
16 # Create database from a directory of Fasta files
17 my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
18 my @ids = $db->get_all_primary_ids;
20 # Simple access
21 my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
22 my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
23 my $length = $db->length('CHROMOSOME_I');
24 my $header = $db->header('CHROMOSOME_I');
25 my $alphabet = $db->alphabet('CHROMOSOME_I');
27 # Access to sequence objects. See Bio::PrimarySeqI.
28 my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29 my $seqstr = $seq->seq;
30 my $subseq = $seq->subseq(4_000_000 => 4_100_000);
31 my $trunc = $seq->trunc(4_000_000 => 4_100_000);
32 my $length = $seq->length;
34 # Loop through sequence objects
35 my $stream = $db->get_PrimarySeq_stream;
36 while (my $seq = $stream->next_seq) {
37 # Bio::PrimarySeqI stuff
40 # Filehandle access
41 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
42 while (my $seq = <$fh>) {
43 # Bio::PrimarySeqI stuff
46 # Tied hash access
47 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
48 print $sequences{'CHROMOSOME_I:1,20000'};
50 =head1 DESCRIPTION
52 Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
53 or a directory of files. It provides persistent random access to each sequence
54 entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
55 subsequences within each entry, allowing you to retrieve portions of very large
56 sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
57 based on Bio::DB::IndexedBase. See this module's documentation for details.
59 The Fasta files may contain any combination of nucleotide and protein sequences;
60 during indexing the module guesses the molecular type. Entries may have any line
61 length up to 65,536 characters, and different line lengths are allowed in the
62 same file. However, within a sequence entry, all lines must be the same length
63 except for the last. An error will be thrown if this is not the case.
65 The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
66 from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
67 routine to reversibly modify this primary ID, e.g. if you wish to extract a
68 specific portion of the gi|gb|abc|xyz GenBank IDs.
70 =head1 DATABASE CREATION AND INDEXING
72 The object-oriented constructor is new(), the filehandle constructor is newFh()
73 and the tied hash constructor is tie(). They all allow to index a single Fasta
74 file, several files, or a directory of files. See Bio::DB::IndexedBase.
76 =head1 SEE ALSO
78 L<Bio::DB::IndexedBase>
80 L<Bio::DB::Qual>
82 L<Bio::PrimarySeqI>
84 =head1 AUTHOR
86 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
88 Copyright (c) 2001 Cold Spring Harbor Laboratory.
90 This library is free software; you can redistribute it and/or modify
91 it under the same terms as Perl itself. See DISCLAIMER.txt for
92 disclaimers of warranty.
94 =head1 APPENDIX
96 The rest of the documentation details each of the object
97 methods. Internal methods are usually preceded with a _
99 For BioPerl-style access, the following methods are provided:
101 =head2 get_Seq_by_id
103 Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
104 Usage : my $seq = $db->get_Seq_by_id($id);
105 Function: Given an ID, fetch the corresponding sequence from the database.
106 Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
107 Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
108 only load the sequence string into memory when requested using seq().
109 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
110 returned from get_Seq_by_id() and get_PrimarySeq_stream().
111 Args : ID
113 =head2 get_PrimarySeq_stream
115 Title : get_PrimarySeq_stream
116 Usage : my $stream = $db->get_PrimarySeq_stream();
117 Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
118 single method, next_seq(). Each call to next_seq() returns a new
119 Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
120 Returns : A Bio::DB::Indexed::Stream object
121 Args : None
123 =head1
125 For simple access, the following methods are provided:
127 =cut
130 package Bio::DB::Fasta;
132 use strict;
133 use IO::File;
134 use File::Spec;
135 use Bio::PrimarySeqI;
137 use base qw(Bio::DB::IndexedBase);
139 our $obj_class = 'Bio::PrimarySeq::Fasta';
140 our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
143 =head2 new
145 Title : new
146 Usage : my $db = Bio::DB::Fasta->new( $path, %options);
147 Function: Initialize a new database object. When indexing a directory, files
148 ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
149 Returns : A new Bio::DB::Fasta object.
150 Args : A single file, or path to dir, or arrayref of files
151 Optional arguments: see Bio::DB::IndexedBase
153 =cut
156 sub _calculate_offsets {
157 # Bio::DB::IndexedBase calls this to calculate offsets
158 my ($self, $fileno, $file, $offsets) = @_;
160 my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
161 binmode $fh;
162 warn "Indexing $file\n" if $self->{debug};
163 my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
164 $last_line, %offsets);
165 my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
167 my $termination_length = $self->{termination_length};
168 while (my $line = <$fh>) {
169 # Account for crlf-terminated Windows files
170 if (index($line, '>') == 0) {
171 if ($line =~ /^>(\S+)/) {
172 print STDERR "Indexed $count sequences...\n"
173 if $self->{debug} && (++$count%1000) == 0;
175 # please, do not enforce arbitrary line length requirements.
176 # It's good practice but not enforced.
178 #$self->_check_linelength($linelen);
179 my $pos = tell($fh);
180 if (@ids) {
181 my $strlen = $pos - $offset - length($line);
182 $strlen -= $termination_length * $seq_lines;
183 my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
184 $linelen, $headerlen, $alphabet, $fileno);
185 $alphabet = Bio::DB::IndexedBase::NA;
186 for my $id (@ids) {
187 $offsets->{$id} = $ppos;
190 @ids = $self->_makeid($line);
191 ($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
192 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
193 } else {
194 # Catch bad header lines, bug 3172
195 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
197 } elsif ($line !~ /\S/) {
198 # Skip blank line
199 $blank_lines++;
200 next;
201 } else {
202 # Need to check every line :(
203 $l3_len = $l2_len;
204 $l2_len = $l_len;
205 $l_len = length $line;
206 if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
207 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
208 my $fap = substr($line, 0, 20)."..";
209 $self->throw("Each line of the fasta entry must be the same ".
210 "length except the last. Line above #$. '$fap' is $l2_len".
211 " != $l3_len chars.");
213 if ($blank_lines) {
214 # Blank lines not allowed in entry
215 $self->throw("Blank lines can only precede header lines, ".
216 "found preceding line #$.");
219 $linelen ||= length $line;
220 $alphabet ||= $self->_guess_alphabet($line);
221 $seq_lines++;
223 $last_line = $line;
226 # Process last entry
227 $self->_check_linelength($linelen);
228 my $pos = tell $fh;
229 if (@ids) {
230 my $strlen = $pos - $offset;
231 if ($linelen == 0) { # yet another pesky empty chr_random.fa file
232 $strlen = 0;
233 } else {
234 if ($last_line !~ /\s$/) {
235 $seq_lines--;
237 $strlen -= $termination_length * $seq_lines;
239 my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
240 $headerlen, $alphabet, $fileno);
241 for my $id (@ids) {
242 $offsets->{$id} = $ppos;
246 return \%offsets;
249 =head2 seq
251 Title : seq, sequence, subseq
252 Usage : # Entire sequence string
253 my $seqstr = $db->seq($id);
254 # Subsequence
255 my $subseqstr = $db->seq($id, $start, $stop, $strand);
256 # or...
257 my $subseqstr = $db->seq($compound_id);
258 Function: Get a subseq of a sequence from the database. For your convenience,
259 the sequence to extract can be specified with any of the following
260 compound IDs:
261 $db->seq("$id:$start,$stop")
262 $db->seq("$id:$start..$stop")
263 $db->seq("$id:$start-$stop")
264 $db->seq("$id:$start,$stop/$strand")
265 $db->seq("$id:$start..$stop/$strand")
266 $db->seq("$id:$start-$stop/$strand")
267 $db->seq("$id/$strand")
268 In the case of DNA or RNA sequence, if $stop is less than $start,
269 then the reverse complement of the sequence is returned. Avoid using
270 it if possible since this goes against Bio::Seq conventions.
271 Returns : A string
272 Args : ID of sequence to retrieve
274 Compound ID of subsequence to fetch
276 ID, optional start (defaults to 1), optional end (defaults to length
277 of sequence) and optional strand (defaults to 1).
279 =cut
281 sub subseq {
282 my ($self, $id, $start, $stop, $strand) = @_;
283 $self->throw('Need to provide a sequence ID') if not defined $id;
284 ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
286 my $data;
288 my $fh = $self->_fh($id) or return;
289 my $filestart = $self->_calc_offset($id, $start);
290 my $filestop = $self->_calc_offset($id, $stop );
292 seek($fh, $filestart,0);
293 read($fh, $data, $filestop-$filestart+1);
295 $data = Bio::DB::IndexedBase::_strip_crnl($data);
297 if ($strand == -1) {
298 # Reverse-complement the sequence
299 $data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
301 return $data;
304 *seq = *sequence = \&subseq;
307 =head2 length
309 Title : length
310 Usage : my $length = $qualdb->length($id);
311 Function: Get the number of residues in the indicated sequence.
312 Returns : Number
313 Args : ID of entry
315 =head2 header
317 Title : header
318 Usage : my $header = $db->header($id);
319 Function: Get the header line (ID and description fields) of the specified
320 sequence.
321 Returns : String
322 Args : ID of sequence
324 =cut
326 sub header {
327 my ($self, $id) = @_;
328 $self->throw('Need to provide a sequence ID') if not defined $id;
329 my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
330 $offset -= $headerlen;
331 my $data;
332 my $fh = $self->_fh($id) or return;
333 seek($fh, $offset, 0);
334 read($fh, $data, $headerlen);
335 # On Windows chomp remove '\n' but leaves '\r'
336 # when reading '\r\n' in binary mode
337 $data = Bio::DB::IndexedBase::_strip_crnl($data);
338 substr($data, 0, 1) = '';
339 return $data;
343 =head2 alphabet
345 Title : alphabet
346 Usage : my $alphabet = $db->alphabet($id);
347 Function: Get the molecular type of the indicated sequence: dna, rna or protein
348 Returns : String
349 Args : ID of sequence
351 =cut
354 #-------------------------------------------------------------
355 # Bio::PrimarySeqI compatibility
357 package Bio::PrimarySeq::Fasta;
358 use overload '""' => 'display_id';
360 use base qw(Bio::Root::Root Bio::PrimarySeqI);
362 sub new {
363 my ($class, @args) = @_;
364 my $self = $class->SUPER::new(@args);
365 my ($db, $id, $start, $stop) = $self->_rearrange(
366 [qw(DATABASE ID START STOP)],
367 @args);
368 $self->{db} = $db;
369 $self->{id} = $id;
370 $self->{stop} = $stop || $db->length($id);
371 $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
372 return $self;
375 sub fetch_sequence {
376 return shift->seq(@_);
379 sub seq {
380 my $self = shift;
381 return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
384 sub subseq {
385 my $self = shift;
386 return $self->trunc(@_)->seq();
389 sub trunc {
390 # Override Bio::PrimarySeqI trunc() method. This way, we create an object
391 # that does not store the sequence in memory.
392 my ($self, $start, $stop) = @_;
393 $self->throw("Stop cannot be smaller than start") if $stop < $start;
394 if ($self->{start} <= $self->{stop}) {
395 $start = $self->{start}+$start-1;
396 $stop = $self->{start}+$stop-1;
397 } else {
398 $start = $self->{start}-($start-1);
399 $stop = $self->{start}-($stop-1);
401 return $self->new( $self->{db}, $self->{id}, $start, $stop );
404 sub is_circular {
405 my $self = shift;
406 return $self->{is_circular};
409 sub display_id {
410 my $self = shift;
411 return $self->{id};
414 sub accession_number {
415 my $self = shift;
416 return 'unknown';
419 sub primary_id {
420 # Following Bio::PrimarySeqI, since this sequence has no accession number,
421 # its primary_id should be a stringified memory location.
422 my $self = shift;
423 return overload::StrVal($self);
426 sub can_call_new {
427 return 0;
430 sub alphabet {
431 my $self = shift;
432 return $self->{db}->alphabet($self->{id});
435 sub revcom {
436 # Override Bio::PrimarySeqI revcom() with optimized method.
437 my $self = shift;
438 return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
441 sub length {
442 # Get length from sequence location, not the sequence string (too expensive)
443 my $self = shift;
444 return $self->{start} < $self->{stop} ?
445 $self->{stop} - $self->{start} + 1 :
446 $self->{start} - $self->{stop} + 1 ;
449 sub description {
450 my $self = shift;
451 my $header = $self->{'db'}->header($self->{id});
452 # Remove the ID from the header
453 return (split(/\s+/, $header, 2))[1];
455 *desc = \&description;