3 # BioPerl module for Bio::DB::Fasta
5 # You may distribute this module under the same terms as perl itself
8 # POD documentation - main docs before the code
12 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
18 # create database from directory of fasta files
19 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
21 # simple access (for those without Bioperl)
22 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
23 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
25 my $length = $db->length('CHROMOSOME_I');
26 my $alphabet = $db->alphabet('CHROMOSOME_I');
27 my $header = $db->header('CHROMOSOME_I');
29 # Bioperl-style access
30 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
32 my $obj = $db->get_Seq_by_id('CHROMOSOME_I');
33 my $seq = $obj->seq; # sequence string
34 my $subseq = $obj->subseq(4_000_000 => 4_100_000); # string
35 my $trunc = $obj->trunc(4_000_000 => 4_100_000); # seq object
36 my $length = $obj->length;
39 # Bio::SeqIO-style access
40 my $stream = Bio::DB::Fasta->new('/path/to/files')->get_PrimarySeq_stream;
41 while (my $seq = $stream->next_seq) {
42 # Bio::PrimarySeqI stuff
45 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
46 while (my $seq = <$fh>) {
47 # Bio::PrimarySeqI stuff
51 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
52 print $sequences{'CHROMOSOME_I:1,20000'};
56 Bio::DB::Fasta provides indexed access to one or more Fasta files. It
57 provides random access to each sequence entry, and to subsequences
58 within each entry, allowing you to retrieve portions of very large
59 sequences without bringing the entire sequence into memory.
61 When you initialize the module, you point it at a single fasta file or
62 a directory of multiple such files. The first time it is run, the
63 module generates an index of the contents of the file or directory
64 using the AnyDBM module (Berkeley DB* preferred, followed by GDBM_File,
65 NDBM_File, and SDBM_File). Thereafter it uses the index file to find
66 the file and offset for any requested sequence. If one of the source
67 fasta files is updated, the module reindexes just that one file. (You
68 can also force reindexing manually). For improved performance, the
69 module keeps a cache of open filehandles, closing less-recently used
70 ones when the cache is full.
72 The fasta files may contain any combination of nucleotide and protein
73 sequences; during indexing the module guesses the molecular type.
74 Entries may have any line length up to 65,536 characters, and
75 different line lengths are allowed in the same file. However, within
76 a sequence entry, all lines must be the same length except for the
79 An error will be thrown if this is not the case.
81 The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
82 from the Fasta header. During indexing, you may pass a callback routine to
83 modify this primary ID. For example, you may wish to extract a
84 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
85 The original header line can be recovered later.
87 This module was developed for use with the C. elegans and human
88 genomes, and has been tested with sequence segments as large as 20
89 megabases. Indexing the C. elegans genome (100 megabases of genomic
90 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
91 laptop. On the same system, average access time for any 200-mer within
92 the C. elegans genome was E<lt>0.02s.
94 *Berkeley DB can be obtained free from www.sleepycat.com. After it is
95 installed you will need to install the BerkeleyDB Perl module.
97 =head1 DATABASE CREATION AND INDEXING
99 The two constructors for this class are new() and newFh(). The former
100 creates a Bio::DB::Fasta object which is accessed via method calls.
101 The latter creates a tied filehandle which can be used Bio::SeqIO
102 style to fetch sequence objects in a stream fashion. There is also a
107 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
109 Create a new Bio::DB::Fasta object from the Fasta file or files
110 indicated by $fasta_path. Indexing will be performed automatically if
111 needed. If successful, new() will return the database accessor
112 object. Otherwise it will return undef.
114 $fasta_path may be an individual Fasta file, or may refer to a
115 directory containing one or more of such files. Following the path,
116 you may pass a series of name=E<gt>value options or a hash with these
117 same name=E<gt>value pairs. Valid options are:
119 Option Name Description Default
120 ----------- ----------- -------
122 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna}
123 for searching for Fasta
124 files in directories.
126 -makeid A code subroutine for None
127 transforming Fasta IDs.
129 -maxopen Maximum size of 32
132 -debug Turn on status 0
135 -reindex Force the index to be 0
138 -dbmargs Additional arguments none
141 (scalar or array ref).
143 -dbmargs can be used to control the format of the index. For example,
144 you can pass $DB_BTREE to this argument so as to force the IDs to be
145 sorted and retrieved alphabetically. Note that you must use the same
146 arguments every time you open the index!
148 -reindex can be used to force the index to be recreated from scratch.
150 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
152 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading
153 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
158 The -makeid option gives you a chance to modify sequence IDs during
159 indexing. The option value should be a code reference that will
160 take a scalar argument and return a scalar result, like this:
162 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
165 my $description_line = shift;
166 # get a different id from the fasta header, e.g.
167 $description_line =~ /(\S+)$/;
171 make_my_id() will be called with the full fasta id line (including the
172 "E<gt>" symbol!). For example:
174 >A12345.3 Predicted C. elegans protein egl-2
176 By default, this module will use the regular expression /^E<gt>(\S+)/
177 to extract "A12345.3" for use as the ID. If you pass a -makeid
178 callback, you can extract any portion of this, such as the "egl-2"
181 The -makeid option is ignored after the index is constructed.
183 =head1 OBJECT METHODS
185 The following object methods are provided.
189 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
191 Return the raw sequence (a string) given an ID and optionally a start
192 and stop position in the sequence. In the case of DNA sequence, if
193 $stop is less than $start, then the reverse complement of the sequence
194 is returned (this violates Bio::Seq conventions).
196 For your convenience, subsequences can be indicated with any of the
197 following compound IDs:
199 $db->seq("$id:$start,$stop")
201 $db->seq("$id:$start..$stop")
203 $db->seq("$id:$start-$stop")
205 =item $length = $db-E<gt>length($id)
207 Return the length of the indicated sequence.
209 =item $header = $db-E<gt>header($id)
211 Return the header line for the ID, including the initial "E<gt>".
213 =item $type = $db-E<gt>alphabet($id)
215 Return the molecular type of the indicated sequence. One of "dna",
218 =item $filename = $db-E<gt>file($id)
220 Return the name of the file in which the indicated sequence can be
223 =item $offset = $db-E<gt>offset($id)
225 Return the offset of the indicated sequence from the beginning of the
226 file in which it is located. The offset points to the beginning of
227 the sequence, not the beginning of the header line.
229 =item $header_length = $db-E<gt>headerlen($id)
231 Return the length of the header line for the indicated sequence.
233 =item $header_offset = $db-E<gt>header_offset($id)
235 Return the offset of the header line for the indicated sequence from
236 the beginning of the file in which it is located.
238 =item $index_name = $db-E<gt>index_name
240 Return the path to the index file.
242 =item $path = $db-E<gt>path
244 Return the path to the Fasta file(s).
248 For BioPerl-style access, the following methods are provided:
252 =item $seq = $db-E<gt>get_Seq_by_id($id)
254 Return a Bio::PrimarySeq::Fasta object, which obeys the
255 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or
256 protein sequence, call $seq-E<gt>seq().
258 Note that get_Seq_by_id() does not bring the entire sequence into
259 memory until requested. Internally, the returned object uses the
260 accessor to generate subsequences as needed.
262 =item $seq = $db-E<gt>get_Seq_by_acc($id)
264 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
266 These methods all do the same thing as get_Seq_by_id().
268 =item $stream = $db-E<gt>get_PrimarySeq_stream()
270 Return a Bio::DB::Fasta::Stream object, which supports a single method
271 next_seq(). Each call to next_seq() returns a new
272 Bio::PrimarySeq::Fasta object, until no more sequences remain.
276 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
277 returned from get_Seq_by_id() and get_PrimarySeq_stream().
279 =head1 TIED INTERFACES
281 This module provides two tied interfaces, one which allows you to
282 treat the sequence database as a hash, and the other which allows you
283 to treat the database as an I/O stream.
285 =head2 Creating a Tied Hash
287 The tied hash interface is very straightforward
291 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
293 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
294 The optional @args list is the same set of named argument/value pairs
295 used by Bio::DB::Fasta-E<gt>new().
297 If successful, tie() will return the tied object. Otherwise it will
302 Once tied, you can use the hash to retrieve an individual sequence by
305 my $seq = $db{CHROMOSOME_I};
307 You may select a subsequence by appending the comma-separated range to
308 the sequence ID in the format "$id:$start,$stop". For example, here
309 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
311 my $seq = $db{'CHROMOSOME_I:1,1000'};
313 (The regular expression used to parse this format allows sequence IDs
316 When selecting subsequences, if $start E<gt> stop, then the reverse
317 complement will be returned for DNA sequences.
319 The keys() and values() functions will return the sequence IDs and
320 their sequences, respectively. In addition, each() can be used to
321 iterate over the entire data set:
323 while (my ($id,$sequence) = each %db) {
324 print "$id => $sequence\n";
327 When dealing with very large sequences, you can avoid bringing them
328 into memory by calling each() in a scalar context. This returns the
329 key only. You can then use tied(%db) to recover the Bio::DB::Fasta
330 object and call its methods.
332 while (my $id = each %db) {
333 print "$id => $db{$sequence:1,100}\n";
334 print "$id => ",tied(%db)->length($id),"\n";
337 You may, in addition invoke Bio::DB::Fasta the FIRSTKEY and NEXTKEY tied
338 hash methods directly.
342 =item $id = $db-E<gt>FIRSTKEY
344 Return the first ID in the database.
346 =item $id = $db-E<gt>NEXTKEY($id)
348 Given an ID, return the next ID in sequence.
352 This allows you to write the following iterative loop using just the
353 object-oriented interface:
355 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
356 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
357 # do something with sequence
360 =head2 Creating a Tied Filehandle
362 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
363 which you can read Bio::PrimarySeq::Fasta sequence objects
364 sequentially. The following bit of code will iterate sequentially
365 over all sequences in the database:
367 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
368 while (my $seq = <$fh>) {
369 print $seq->id,' => ',$seq->length,"\n";
372 When no more sequences remain to be retrieved, the stream will return
377 When a sequence is deleted from one of the Fasta files, this deletion
378 is not detected by the module and removed from the index. As a
379 result, a "ghost" entry will remain in the index and will return
380 garbage results if accessed.
382 Currently, the only way to accomodate deletions is to rebuild the
383 entire index, either by deleting it manually, or by passing
384 -reindex=E<gt>1 to new() when initializing the module.
392 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
394 Copyright (c) 2001 Cold Spring Harbor Laboratory.
396 This library is free software; you can redistribute it and/or modify
397 it under the same terms as Perl itself. See DISCLAIMER.txt for
398 disclaimers of warranty.
403 package Bio
::DB
::Fasta
;
406 @AnyDBM_File::ISA
= qw(DB_File GDBM_File NDBM_File SDBM_File)
413 use File
::Basename
qw(basename dirname);
415 use base
qw(Bio::DB::SeqI Bio::Root::Root);
417 *seq
= *sequence
= \
&subseq
;
418 *ids
= \
&get_all_ids
;
419 *get_seq_by_primary_id
= *get_Seq_by_acc
= \
&get_Seq_by_id
;
421 use constant STRUCT
=>'NNnnCa*';
422 use constant STRUCTBIG
=>'QQnnCa*'; # 64-bit file offset and seq length
423 use constant DNA
=> 1;
424 use constant RNA
=> 2;
425 use constant PROTEIN
=> 3;
426 use constant DIE_ON_MISSMATCHED_LINES
=> 1; # if you want
428 # Bio::DB-like object
429 # providing fast random access to a directory of FASTA files
434 Usage : my $db = Bio::DB::Fasta->new( $path, @options);
435 Function: initialize a new Bio::DB::Fasta object
436 Returns : new Bio::DB::Fasta object
437 Args : path to dir of fasta files or a single filename
439 These are optional arguments to pass in as well.
441 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST}
442 for searching for Fasta
443 files in directories.
445 -makeid A code subroutine for None
446 transforming Fasta IDs.
448 -maxopen Maximum size of 32
451 -debug Turn on status 0
454 -reindex Force the index to be 0
457 -dbmargs Additional arguments none
460 (scalar or array ref).
469 my $self = bless { debug
=> $opts{-debug
},
470 makeid
=> $opts{-makeid
},
471 glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,FNA,fna,FAA,faa,FSA,fsa}',
472 maxopen
=> $opts{-maxfh
} || 32,
473 dbmargs
=> $opts{-dbmargs
} || undef,
481 my ($offsets,$dirname);
484 # because Win32 glob() is broken with respect to long file names
485 # that contain whitespace.
486 $path = Win32
::GetShortPathName
($path)
487 if $^O
=~ /^MSWin/i && eval 'use Win32; 1';
488 $offsets = $self->index_dir($path,$opts{-reindex
}) or return;
491 $offsets = $self->index_file($path,$opts{-reindex
});
492 $dirname = dirname
($path);
494 $self->throw( "$path: Invalid file or dirname");
496 @
{$self}{qw(dirname offsets)} = ($dirname,$offsets);
504 Function: gets a new Fh for a file
505 Example : internal method
513 my $self = $class->new(@_);
515 my $fh = Symbol
::gensym
or return;
516 tie
$$fh,'Bio::DB::Fasta::Stream',$self or return;
522 my ($index,$write) = @_;
524 my $flags = $write ? O_CREAT
|O_RDWR
: O_RDONLY
;
525 my @dbmargs = $self->dbmargs;
527 tie
%offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs
528 or die "Can't open sequence index file $index: $!";
543 Usage : $db->index_dir($dir)
544 Function: set the index dir and load all files in the dir
545 Returns : hashref of seq offsets in each file
546 Args : dirname, boolean to force a reload of all files
553 my $force_reindex = shift;
555 # find all fasta files
556 my @files = glob("$dir/$self->{glob}");
557 # $self->throw( "no fasta files in $dir") unless @files;
558 return unless @files;
561 my $index = $self->index_name($dir,1);
563 # if caller has requested reindexing, then unlink
565 unlink $index if $force_reindex;
567 # get the modification time of the index
569 for my $suffix('','.pag','.dir') {
570 $indextime ||= (stat("${index}${suffix}"))[9];
572 $indextime ||= 0; # prevent some uninit variable warnings
574 # get the most recent modification time of any of the contents
577 $self->set_pack_method( @files );
579 my $m = (stat($_))[9];
581 $modtime = $m if defined $m && $modtime < $m;
584 my $reindex = $force_reindex || $indextime < $modtime;
585 $self->{offsets
} = $self->_open_index($index,$reindex) or return;
588 return $self->{offsets
} unless $reindex;
590 # otherwise reindex contents of changed files
591 $self->{indexing
} = $index;
593 next if( defined $indextime && $modtime{$_} <= $indextime);
594 $self->calculate_offsets($_,$self->{offsets
});
596 delete $self->{indexing
};
598 # we've been having troubles with corrupted index files on Windows systems,
599 # so possibly closing and reopening will help
600 $self->_close_index($self->{offsets
});
602 return $self->{offsets
} = $self->_open_index($index);
607 Title : get_Seq_by_id
608 Usage : my $seq = $db->get_Seq_by_id($id)
609 Function: Bio::DB::RandomAccessI method implemented
610 Returns : Bio::PrimarySeqI object
618 return unless exists $self->{offsets
}{$id};
619 return Bio
::PrimarySeq
::Fasta
->new($self,$id);
622 =head2 set_pack_method
624 Title : set_pack_method
625 Usage : $db->set_pack_method( @files )
626 Function: Determines whether data packing uses 32 or 64 bit integers
628 Args : one or more file paths
632 sub set_pack_method
{
634 # Find the maximum file size:
635 my ($maxsize) = sort { $b <=> $a } map { -s
$_ } @_;
636 my $fourGB = (2 ** 32) - 1;
638 if ($maxsize > $fourGB) {
639 # At least one file exceeds 4Gb - we will need to use 64 bit ints
640 $self->{packmeth
} = \
&_packBig
;
641 $self->{unpackmeth
} = \
&_unpackBig
;
643 $self->{packmeth
} = \
&_pack
;
644 $self->{unpackmeth
} = \
&_unpack
;
651 Usage : $db->index_file($filename)
652 Function: (re)loads a sequence file and indexes sequences offsets in the file
653 Returns : seq offsets in the file
655 boolean to force reloading a file
663 my $force_reindex = shift;
665 $self->set_pack_method( $file );
666 my $index = $self->index_name($file);
667 # if caller has requested reindexing, then unlink the index
668 unlink $index if $force_reindex;
670 # get the modification time of the index
671 my $indextime = (stat($index))[9] || 0;
672 my $modtime = (stat($file))[9] || 0;
674 my $reindex = $force_reindex || $indextime < $modtime;
675 my $offsets = $self->_open_index($index,$reindex) or return;
676 $self->{offsets
} = $offsets;
678 return $self->{offsets
} unless $reindex;
680 $self->{indexing
} = $index;
681 $self->calculate_offsets($file,$offsets);
682 delete $self->{indexing
};
683 return $self->{offsets
};
689 Usage : my @args = $db->dbmargs;
690 Function: gets stored dbm arguments
698 my $args = $self->{dbmargs
} or return;
699 return ref($args) eq 'ARRAY' ? @
$args : $args;
705 Usage : my $indexname = $db->index_name($path,$isdir);
706 Function: returns the name of the index for a specific path
708 Args : path to check,
709 boolean if it is a dir
715 my ($path,$isdir) = @_;
717 my $dir = $self->{dirname
} or return;
718 return $self->index_name($dir,-d
$dir);
720 return "$path/directory.index" if $isdir;
721 return "$path.index";
724 =head2 calculate_offsets
726 Title : calculate_offsets
727 Usage : $db->calculate_offsets($filename,$offsets);
728 Function: calculates the sequence offsets in a file based on id
729 Returns : offset hash for each file
730 Args : file to process
731 $offsets - hashref of id to offset storage
735 sub calculate_offsets
{
737 my ($file,$offsets) = @_;
738 my $base = $self->path2fileno(basename
($file));
740 my $fh = IO
::File
->new($file) or $self->throw( "Can't open $file: $!");
742 warn "indexing $file\n" if $self->{debug
};
743 my ($offset,@id,$linelength,$type,$firstline,$count,
744 $termination_length,$seq_lines,$last_line,%offsets);
745 my ($l3_len,$l2_len,$l_len)=(0,0,0);
747 while (<$fh>) { # don't try this at home
748 $termination_length ||= /\r\n$/ ?
2 : 1; # account for crlf-terminated Windows files
750 print STDERR
"indexed $count sequences...\n"
751 if $self->{debug
} && (++$count%1000) == 0;
754 my $seqlength = $pos - $offset - length($_);
755 $seqlength -= $termination_length * $seq_lines;
756 my $ppos = &{$self->{packmeth
}}($offset,$seqlength,
757 $linelength,$firstline,
759 for my $id (@id) { $offsets->{$id} = $ppos }
761 @id = ref($self->{makeid
}) eq 'CODE' ?
$self->{makeid
}->($_) : $1;
762 ($offset,$firstline,$linelength) = ($pos,length($_),0);
763 $self->_check_linelength($linelength);
764 ($l3_len,$l2_len,$l_len)=(0,0,0);
767 $l3_len= $l2_len; $l2_len= $l_len; $l_len= length($_); # need to check every line :(
768 if (DIE_ON_MISSMATCHED_LINES
&&
769 $l3_len>0 && $l2_len>0 && $l3_len!=$l2_len) {
770 my $fap= substr($_,0,20)."..";
771 $self->throw("Each line of the fasta entry must be the same length except the last.
772 Line above #$. '$fap' is $l2_len != $l3_len chars.");
774 $linelength ||= length($_);
775 $type ||= $self->_type($_);
781 $self->_check_linelength($linelength);
782 # deal with last entry
785 my $seqlength = $pos - $offset;
786 if ($linelength == 0) { # yet another pesky empty chr_random.fa file
789 if ($last_line !~ /\s$/) {
792 $seqlength -= $termination_length * $seq_lines;
794 my $ppos = &{$self->{packmeth
}}($offset,$seqlength,
795 $linelength,$firstline,
797 for my $id (@id) { $offsets->{$id} = $ppos }
799 $offsets->{__termination_length
} = $termination_length;
806 Usage : my @ids = $db->get_all_ids
807 Function: gets all the stored ids in all indexes
808 Returns : list of ids
813 sub get_all_ids
{ grep {!/^__/} keys %{shift->{offsets
}} }
818 my $offset = $self->{offsets
}{$id} or return;
819 (&{$self->{unpackmeth
}}($offset))[0];
825 my $offset = $self->{offsets
}{$id} or return;
826 (&{$self->{unpackmeth
}}($offset))[1];
832 my $offset = $self->{offsets
}{$id} or return;
833 (&{$self->{unpackmeth
}}($offset))[2];
839 my $offset = $self->{offsets
}{$id} or return;
840 (&{$self->{unpackmeth
}}($offset))[3];
846 my $offset = $self->{offsets
}{$id} or return;
847 my $type = (&{$self->{unpackmeth
}}($offset))[4];
848 return $type == DNA ?
'dna'
849 : $type == RNA ?
'rna'
854 sub path
{ shift->{dirname
} }
859 return unless $self->{offsets
}{$id};
860 return $self->offset($id) - $self->headerlen($id);
866 my $offset = $self->{offsets
}{$id} or return;
867 $self->fileno2path((&{$self->{unpackmeth
}}($offset))[5]);
873 return $self->{offsets
}{"__file_$no"};
879 if ( !defined $self->{offsets
}{"__path_$path"} ) {
880 my $fileno = ($self->{offsets
}{"__path_$path"} = 0+ $self->{fileno}++);
881 $self->{offsets
}{"__file_$fileno"} = $path;
883 return $self->{offsets
}{"__path_$path"}
886 sub _check_linelength
{
888 my $linelength = shift;
889 return unless defined $linelength;
890 $self->throw("Each line of the fasta file must be less than 65,536 characters. Line $. is $linelength chars.") if $linelength > 65535.
897 Usage : $seqdb->subseq($id,$start,$stop);
898 Function: returns a subseq of a sequence in the db
899 Returns : subsequence data
900 Args : id of sequence, starting point, ending point
905 my ($self,$id,$start,$stop) = @_;
906 if ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) {
907 ($id,$start,$stop) = ($1,$2,$3);
912 $stop ||= $self->length($id);
915 if (defined $stop && $start > $stop) {
916 ($start,$stop) = ($stop,$start);
922 my $fh = $self->fh($id) or return;
923 my $filestart = $self->caloffset($id,$start);
924 my $filestop = $self->caloffset($id,$stop);
926 seek($fh,$filestart,0);
927 read($fh,$data,$filestop-$filestart+1);
931 $data = reverse $data;
932 $data =~ tr/gatcGATC/ctagCTAG/;
940 my $file = $self->file($id) or return;
941 $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file");
947 my ($offset,$seqlength,$linelength,$firstline,$type,$file)
948 = &{$self->{unpackmeth
}}($self->{offsets
}{$id}) or return;
949 $offset -= $firstline;
951 my $fh = $self->fh($id) or return;
953 read($fh,$data,$firstline);
955 substr($data,0,1) = '';
963 my ($offset,$seqlength,$linelength,$firstline,$type,$file) = &{$self->{unpackmeth
}}($self->{offsets
}{$id});
965 $a = $seqlength-1 if $a >= $seqlength;
966 my $tl = $self->{offsets
}{__termination_length
};
967 $offset + $linelength * int($a/($linelength-$tl)) + $a % ($linelength-$tl);
973 if (!$self->{fhcache
}{$path}) {
974 if ($self->{curopen
} >= $self->{maxopen
}) {
975 my @lru = sort {$self->{cacheseq
}{$a} <=> $self->{cacheseq
}{$b};} keys %{$self->{fhcache
}};
976 splice(@lru, $self->{maxopen
} / 3);
977 $self->{curopen
} -= @lru;
978 for (@lru) { delete $self->{fhcache
}{$_} }
980 $self->{fhcache
}{$path} = IO
::File
->new($path) or return;
981 binmode $self->{fhcache
}{$path};
984 $self->{cacheseq
}{$path}++;
985 $self->{fhcache
}{$path}
1001 unpack STRUCTBIG
,shift;
1007 return /^[gatcnGATCN*-]+$/ ? DNA
1008 : /^[gaucnGAUCN*-]+$/ ? RNA
1012 =head2 get_PrimarySeq_stream
1014 Title : get_PrimarySeq_stream
1023 sub get_PrimarySeq_stream
{
1025 return Bio
::DB
::Fasta
::Stream
->new($self);
1030 return $self->new(@_);
1037 shift->throw("Read-only database");
1040 shift->throw("Read-only database");
1043 shift->throw("Read-only database");
1046 defined shift->offset(@_);
1048 sub FIRSTKEY
{ tied(%{shift->{offsets
}})->FIRSTKEY(@_); }
1049 sub NEXTKEY
{ tied(%{shift->{offsets
}})->NEXTKEY(@_); }
1053 if ($self->{indexing
}) { # killed prematurely, so index file is no good!
1054 warn "indexing was interrupted, so unlinking $self->{indexing}";
1055 unlink $self->{indexing
};
1059 #-------------------------------------------------------------
1060 # Bio::PrimarySeqI compatibility
1062 package Bio
::PrimarySeq
::Fasta
;
1063 use overload
'""' => 'display_id';
1065 use base
qw(Bio::Root::Root Bio::PrimarySeqI);
1069 $class = ref($class) if ref $class;
1070 my ($db,$id,$start,$stop) = @_;
1071 return bless { db
=> $db,
1073 start
=> $start || 1,
1074 stop
=> $stop || $db->length($id)
1080 return $self->{db
}->seq($self->{id
},$self->{start
},$self->{stop
});
1085 $self->trunc(@_)->seq();
1090 my ($start,$stop) = @_;
1091 $self->throw("Stop cannot be smaller than start") unless $start <= $stop;
1092 return $self->{start
} <= $self->{stop
} ?
$self->new($self->{db
},
1094 $self->{start
}+$start-1,
1095 $self->{start
}+$stop-1)
1096 : $self->new($self->{db
},
1098 $self->{start
}-($start-1),
1099 $self->{start
}-($stop-1)
1106 return $self->{is_circular
};
1114 sub accession_number
{
1121 return overload
::StrVal
($self);
1124 sub can_call_new
{ return 0 }
1128 return $self->{db
}->alphabet($self->{id
});
1133 return $self->new(@
{$self}{'db','id','stop','start'});
1138 #return $self->{db}->length($self->{id}); # wrong because ignores sequence start and stop values
1139 return length($self->seq);
1145 my $header = $self->{'db'}->header($self->{id
});
1146 # remove the id from the header
1147 return (split(/\s+/,$header,2))[2];
1150 *desc
= \
&description
;
1152 #-------------------------------------------------------------
1153 # stream-based access to the database
1155 package Bio
::DB
::Fasta
::Stream
;
1156 use base
qw(Tie::Handle Bio::DB::SeqI);
1162 my $key = $db->FIRSTKEY;
1163 return bless { db
=>$db,key
=>$key },$class;
1168 my ($key,$db) = @
{$self}{'key','db'};
1169 while ($key =~ /^__/) {
1170 $key = $db->NEXTKEY($key);
1171 return unless defined $key;
1173 my $value = $db->get_Seq_by_id($key);
1174 $self->{key
} = $db->NEXTKEY($key);
1181 return $class->new($db);