rework to optimize a bit, again for bug #3172
[bioperl-live.git] / Bio / DB / Fasta.pm
blob6a9a05ff0621cef6c5013fd0a9739355e1549c0c
2 # BioPerl module for Bio::DB::Fasta
4 # You may distribute this module under the same terms as perl itself
7 # POD documentation - main docs before the code
9 =head1 NAME
11 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
13 =head1 SYNOPSIS
15 use Bio::DB::Fasta;
17 # create database from directory of fasta files
18 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
20 # simple access (for those without Bioperl)
21 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
22 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
23 my @ids = $db->ids;
24 my $length = $db->length('CHROMOSOME_I');
25 my $alphabet = $db->alphabet('CHROMOSOME_I');
26 my $header = $db->header('CHROMOSOME_I');
28 # Bioperl-style access
29 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
31 my $obj = $db->get_Seq_by_id('CHROMOSOME_I');
32 my $seq = $obj->seq; # sequence string
33 my $subseq = $obj->subseq(4_000_000 => 4_100_000); # string
34 my $trunc = $obj->trunc(4_000_000 => 4_100_000); # seq object
35 my $length = $obj->length;
36 # (etc)
38 # Bio::SeqIO-style access
39 my $stream = Bio::DB::Fasta->new('/path/to/files')->get_PrimarySeq_stream;
40 while (my $seq = $stream->next_seq) {
41 # Bio::PrimarySeqI stuff
44 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
45 while (my $seq = <$fh>) {
46 # Bio::PrimarySeqI stuff
49 # tied hash access
50 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
51 print $sequences{'CHROMOSOME_I:1,20000'};
53 =head1 DESCRIPTION
55 Bio::DB::Fasta provides indexed access to one or more Fasta files. It
56 provides random access to each sequence entry, and to subsequences
57 within each entry, allowing you to retrieve portions of very large
58 sequences without bringing the entire sequence into memory.
60 When you initialize the module, you point it at a single fasta file or
61 a directory of multiple such files. The first time it is run, the
62 module generates an index of the contents of the file or directory
63 using the AnyDBM module (Berkeley DB* preferred, followed by GDBM_File,
64 NDBM_File, and SDBM_File). Thereafter it uses the index file to find
65 the file and offset for any requested sequence. If one of the source
66 fasta files is updated, the module reindexes just that one file. (You
67 can also force reindexing manually). For improved performance, the
68 module keeps a cache of open filehandles, closing less-recently used
69 ones when the cache is full.
71 The fasta files may contain any combination of nucleotide and protein
72 sequences; during indexing the module guesses the molecular type.
73 Entries may have any line length up to 65,536 characters, and
74 different line lengths are allowed in the same file. However, within
75 a sequence entry, all lines must be the same length except for the
76 last.
78 An error will be thrown if this is not the case.
80 The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
81 from the Fasta header. During indexing, you may pass a callback routine to
82 modify this primary ID. For example, you may wish to extract a
83 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
84 The original header line can be recovered later.
86 This module was developed for use with the C. elegans and human
87 genomes, and has been tested with sequence segments as large as 20
88 megabases. Indexing the C. elegans genome (100 megabases of genomic
89 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
90 laptop. On the same system, average access time for any 200-mer within
91 the C. elegans genome was E<lt>0.02s.
93 *Berkeley DB can be obtained free from www.sleepycat.com. After it is
94 installed you will need to install the BerkeleyDB Perl module.
96 =head1 DATABASE CREATION AND INDEXING
98 The two constructors for this class are new() and newFh(). The former
99 creates a Bio::DB::Fasta object which is accessed via method calls.
100 The latter creates a tied filehandle which can be used Bio::SeqIO
101 style to fetch sequence objects in a stream fashion. There is also a
102 tied hash interface.
104 =over 2
106 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
108 Create a new Bio::DB::Fasta object from the Fasta file or files
109 indicated by $fasta_path. Indexing will be performed automatically if
110 needed. If successful, new() will return the database accessor
111 object. Otherwise it will return undef.
113 $fasta_path may be an individual Fasta file, or may refer to a
114 directory containing one or more of such files. Following the path,
115 you may pass a series of name=E<gt>value options or a hash with these
116 same name=E<gt>value pairs. Valid options are:
118 Option Name Description Default
119 ----------- ----------- -------
121 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna}
122 for searching for Fasta
123 files in directories.
125 -makeid A code subroutine for None
126 transforming Fasta IDs.
128 -maxopen Maximum size of 32
129 filehandle cache.
131 -debug Turn on status 0
132 messages.
134 -reindex Force the index to be 0
135 rebuilt.
137 -dbmargs Additional arguments none
138 to pass to the DBM
139 routines when tied
140 (scalar or array ref).
142 -dbmargs can be used to control the format of the index. For example,
143 you can pass $DB_BTREE to this argument so as to force the IDs to be
144 sorted and retrieved alphabetically. Note that you must use the same
145 arguments every time you open the index!
147 -reindex can be used to force the index to be recreated from scratch.
149 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
151 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading
152 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
153 Bio::SeqIO style.
155 =back
157 The -makeid option gives you a chance to modify sequence IDs during
158 indexing. The option value should be a code reference that will
159 take a scalar argument and return a scalar result, like this:
161 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
163 sub make_my_id {
164 my $description_line = shift;
165 # get a different id from the fasta header, e.g.
166 $description_line =~ /(\S+)$/;
167 return $1;
170 make_my_id() will be called with the full fasta id line (including the
171 "E<gt>" symbol!). For example:
173 >A12345.3 Predicted C. elegans protein egl-2
175 By default, this module will use the regular expression /^E<gt>(\S+)/
176 to extract "A12345.3" for use as the ID. If you pass a -makeid
177 callback, you can extract any portion of this, such as the "egl-2"
178 symbol.
180 The -makeid option is ignored after the index is constructed.
182 =head1 OBJECT METHODS
184 The following object methods are provided.
186 =over 10
188 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
190 Return the raw sequence (a string) given an ID and optionally a start
191 and stop position in the sequence. In the case of DNA sequence, if
192 $stop is less than $start, then the reverse complement of the sequence
193 is returned (this violates Bio::Seq conventions).
195 For your convenience, subsequences can be indicated with any of the
196 following compound IDs:
198 $db->seq("$id:$start,$stop")
200 $db->seq("$id:$start..$stop")
202 $db->seq("$id:$start-$stop")
204 =item $length = $db-E<gt>length($id)
206 Return the length of the indicated sequence.
208 =item $header = $db-E<gt>header($id)
210 Return the header line for the ID, including the initial "E<gt>".
212 =item $type = $db-E<gt>alphabet($id)
214 Return the molecular type of the indicated sequence. One of "dna",
215 "rna" or "protein".
217 =item $filename = $db-E<gt>file($id)
219 Return the name of the file in which the indicated sequence can be
220 found.
222 =item $offset = $db-E<gt>offset($id)
224 Return the offset of the indicated sequence from the beginning of the
225 file in which it is located. The offset points to the beginning of
226 the sequence, not the beginning of the header line.
228 =item $header_length = $db-E<gt>headerlen($id)
230 Return the length of the header line for the indicated sequence.
232 =item $header_offset = $db-E<gt>header_offset($id)
234 Return the offset of the header line for the indicated sequence from
235 the beginning of the file in which it is located.
237 =item $index_name = $db-E<gt>index_name
239 Return the path to the index file.
241 =item $path = $db-E<gt>path
243 Return the path to the Fasta file(s).
245 =back
247 For BioPerl-style access, the following methods are provided:
249 =over 4
251 =item $seq = $db-E<gt>get_Seq_by_id($id)
253 Return a Bio::PrimarySeq::Fasta object, which obeys the
254 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or
255 protein sequence, call $seq-E<gt>seq().
257 Note that get_Seq_by_id() does not bring the entire sequence into
258 memory until requested. Internally, the returned object uses the
259 accessor to generate subsequences as needed.
261 =item $seq = $db-E<gt>get_Seq_by_acc($id)
263 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
265 These methods all do the same thing as get_Seq_by_id().
267 =item $stream = $db-E<gt>get_PrimarySeq_stream()
269 Return a Bio::DB::Fasta::Stream object, which supports a single method
270 next_seq(). Each call to next_seq() returns a new
271 Bio::PrimarySeq::Fasta object, until no more sequences remain.
273 =back
275 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
276 returned from get_Seq_by_id() and get_PrimarySeq_stream().
278 =head1 TIED INTERFACES
280 This module provides two tied interfaces, one which allows you to
281 treat the sequence database as a hash, and the other which allows you
282 to treat the database as an I/O stream.
284 =head2 Creating a Tied Hash
286 The tied hash interface is very straightforward
288 =over 1
290 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
292 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
293 The optional @args list is the same set of named argument/value pairs
294 used by Bio::DB::Fasta-E<gt>new().
296 If successful, tie() will return the tied object. Otherwise it will
297 return undef.
299 =back
301 Once tied, you can use the hash to retrieve an individual sequence by
302 its ID, like this:
304 my $seq = $db{CHROMOSOME_I};
306 You may select a subsequence by appending the comma-separated range to
307 the sequence ID in the format "$id:$start,$stop". For example, here
308 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
310 my $seq = $db{'CHROMOSOME_I:1,1000'};
312 (The regular expression used to parse this format allows sequence IDs
313 to contain colons.)
315 When selecting subsequences, if $start E<gt> stop, then the reverse
316 complement will be returned for DNA sequences.
318 The keys() and values() functions will return the sequence IDs and
319 their sequences, respectively. In addition, each() can be used to
320 iterate over the entire data set:
322 while (my ($id,$sequence) = each %db) {
323 print "$id => $sequence\n";
326 When dealing with very large sequences, you can avoid bringing them
327 into memory by calling each() in a scalar context. This returns the
328 key only. You can then use tied(%db) to recover the Bio::DB::Fasta
329 object and call its methods.
331 while (my $id = each %db) {
332 print "$id => $db{$sequence:1,100}\n";
333 print "$id => ",tied(%db)->length($id),"\n";
336 You may, in addition invoke Bio::DB::Fasta the FIRSTKEY and NEXTKEY tied
337 hash methods directly.
339 =over 2
341 =item $id = $db-E<gt>FIRSTKEY
343 Return the first ID in the database.
345 =item $id = $db-E<gt>NEXTKEY($id)
347 Given an ID, return the next ID in sequence.
349 =back
351 This allows you to write the following iterative loop using just the
352 object-oriented interface:
354 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
355 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
356 # do something with sequence
359 =head2 Creating a Tied Filehandle
361 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
362 which you can read Bio::PrimarySeq::Fasta sequence objects
363 sequentially. The following bit of code will iterate sequentially
364 over all sequences in the database:
366 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
367 while (my $seq = <$fh>) {
368 print $seq->id,' => ',$seq->length,"\n";
371 When no more sequences remain to be retrieved, the stream will return
372 undef.
374 =head1 BUGS
376 When a sequence is deleted from one of the Fasta files, this deletion
377 is not detected by the module and removed from the index. As a
378 result, a "ghost" entry will remain in the index and will return
379 garbage results if accessed.
381 Currently, the only way to accomodate deletions is to rebuild the
382 entire index, either by deleting it manually, or by passing
383 -reindex=E<gt>1 to new() when initializing the module.
385 =head1 SEE ALSO
387 L<bioperl>
389 =head1 AUTHOR
391 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
393 Copyright (c) 2001 Cold Spring Harbor Laboratory.
395 This library is free software; you can redistribute it and/or modify
396 it under the same terms as Perl itself. See DISCLAIMER.txt for
397 disclaimers of warranty.
399 =cut
402 package Bio::DB::Fasta;
404 BEGIN {
405 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
408 use strict;
409 use IO::File;
410 use AnyDBM_File;
411 use Fcntl;
412 use File::Glob ':glob';
413 use File::Basename qw(basename dirname);
415 use base qw(Bio::DB::SeqI Bio::Root::Root);
417 *seq = *sequence = \&subseq;
418 *ids = \&get_all_ids;
419 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
421 use constant STRUCT =>'NNnnCa*';
422 use constant STRUCTBIG =>'QQnnCa*'; # 64-bit file offset and seq length
423 use constant DNA => 1;
424 use constant RNA => 2;
425 use constant PROTEIN => 3;
426 use constant DIE_ON_MISSMATCHED_LINES => 1; # if you want
428 # Bio::DB-like object
429 # providing fast random access to a directory of FASTA files
431 =head2 new
433 Title : new
434 Usage : my $db = Bio::DB::Fasta->new( $path, @options);
435 Function: initialize a new Bio::DB::Fasta object
436 Returns : new Bio::DB::Fasta object
437 Args : path to dir of fasta files or a single filename
439 These are optional arguments to pass in as well.
441 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST}
442 for searching for Fasta
443 files in directories.
445 -makeid A code subroutine for none
446 transforming Fasta IDs.
448 -maxopen Maximum size of 32
449 filehandle cache.
451 -debug Turn on status 0
452 messages.
454 -reindex Force the index to be 0
455 rebuilt.
457 -dbmargs Additional arguments none
458 to pass to the DBM
459 routines when tied
460 (scalar or array ref).
462 =cut
464 sub new {
465 my $class = shift;
466 my $path = shift;
467 my %opts = @_;
469 my $self = bless { debug => $opts{-debug},
470 makeid => $opts{-makeid},
471 glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,FNA,fna,FAA,faa,FSA,fsa}',
472 maxopen => $opts{-maxopen} || 32,
473 dbmargs => $opts{-dbmargs} || undef,
474 fhcache => {},
475 cacheseq => {},
476 curopen => 0,
477 openseq => 1,
478 dirname => undef,
479 offsets => undef,
480 }, $class;
481 my ($offsets,$dirname);
483 if (-d $path) {
484 # because Win32 glob() is broken with respect to long file names
485 # that contain whitespace.
486 $path = Win32::GetShortPathName($path)
487 if $^O =~ /^MSWin/i && eval 'use Win32; 1';
488 $offsets = $self->index_dir($path,$opts{-reindex}) or return;
489 $dirname = $path;
490 } elsif (-f _) {
491 $offsets = $self->index_file($path,$opts{-reindex});
492 $dirname = dirname($path);
493 } else {
494 $self->throw( "$path: Invalid file or dirname");
496 @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
498 $self;
501 =head2 newFh
503 Title : newFh
504 Function: gets a new Fh for a file
505 Example : internal method
506 Returns : GLOB
507 Args :
509 =cut
511 sub newFh {
512 my $class = shift;
513 my $self = $class->new(@_);
514 require Symbol;
515 my $fh = Symbol::gensym or return;
516 tie $$fh,'Bio::DB::Fasta::Stream',$self or return;
517 $fh;
520 sub _open_index {
521 my $self = shift;
522 my ($index,$write) = @_;
523 my %offsets;
524 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
525 my @dbmargs = $self->dbmargs;
526 eval {
527 tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs
528 or die "Can't open sequence index file $index: $!";
530 warn $@ if $@;
531 return \%offsets;
534 sub _close_index {
535 my $self = shift;
536 my $index = shift;
537 untie %$index;
540 =head2 index_dir
542 Title : index_dir
543 Usage : $db->index_dir($dir)
544 Function: set the index dir and load all files in the dir
545 Returns : hashref of seq offsets in each file
546 Args : dirname, boolean to force a reload of all files
548 =cut
550 sub index_dir {
551 my $self = shift;
552 my $dir = shift;
553 my $force_reindex = shift;
555 # find all fasta files
556 my @files = glob("$dir/$self->{glob}");
557 return unless @files;
559 # get name of index
560 my $index = $self->index_name($dir,1);
562 # if caller has requested reindexing, then unlink
563 # the index file.
564 unlink $index if $force_reindex;
566 # get the modification time of the index
567 my $indextime = 0;
568 for my $suffix('','.pag','.dir') {
569 $indextime ||= (stat("${index}${suffix}"))[9];
571 $indextime ||= 0; # prevent some uninit variable warnings
573 # get the most recent modification time of any of the contents
574 my $modtime = 0;
575 my %modtime;
576 $self->set_pack_method( @files );
577 foreach (@files) {
578 my $m = (stat($_))[9];
579 $modtime{$_} = $m;
580 $modtime = $m if defined $m && $modtime < $m;
583 my $reindex = $force_reindex || $indextime < $modtime;
584 $self->{offsets} = $self->_open_index($index,$reindex) or return;
586 # no indexing needed
587 return $self->{offsets} unless $reindex;
589 # otherwise reindex contents of changed files
590 $self->{indexing} = $index;
591 foreach (@files) {
592 next if( defined $indextime && $modtime{$_} <= $indextime);
593 $self->calculate_offsets($_,$self->{offsets});
595 delete $self->{indexing};
597 # we've been having troubles with corrupted index files on Windows systems,
598 # so possibly closing and reopening will help
599 $self->_close_index($self->{offsets});
601 return $self->{offsets} = $self->_open_index($index);
604 =head2 get_Seq_by_id
606 Title : get_Seq_by_id
607 Usage : my $seq = $db->get_Seq_by_id($id)
608 Function: Bio::DB::RandomAccessI method implemented
609 Returns : Bio::PrimarySeqI object
610 Args : id
612 =cut
614 sub get_Seq_by_id {
615 my $self = shift;
616 my $id = shift;
617 return unless exists $self->{offsets}{$id};
618 return Bio::PrimarySeq::Fasta->new($self,$id);
621 =head2 set_pack_method
623 Title : set_pack_method
624 Usage : $db->set_pack_method( @files )
625 Function: Determines whether data packing uses 32 or 64 bit integers
626 Returns :
627 Args : one or more file paths
629 =cut
631 sub set_pack_method {
632 my $self = shift;
633 # Find the maximum file size:eq
634 my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
635 my $fourGB = (2 ** 32) - 1;
637 if ($maxsize > $fourGB) {
638 # At least one file exceeds 4Gb - we will need to use 64 bit ints
639 $self->{packmeth} = \&_packBig;
640 $self->{unpackmeth} = \&_unpackBig;
641 } else {
642 $self->{packmeth} = \&_pack;
643 $self->{unpackmeth} = \&_unpack;
647 =head2 index_file
649 Title : index_file
650 Usage : $db->index_file($filename)
651 Function: (re)loads a sequence file and indexes sequences offsets in the file
652 Returns : seq offsets in the file
653 Args : filename,
654 boolean to force reloading a file
656 =cut
659 sub index_file {
660 my $self = shift;
661 my $file = shift;
662 my $force_reindex = shift;
664 $self->set_pack_method( $file );
665 my $index = $self->index_name($file);
666 # if caller has requested reindexing, then unlink the index
667 unlink $index if $force_reindex;
669 # get the modification time of the index
670 my $indextime = (stat($index))[9] || 0;
671 my $modtime = (stat($file))[9] || 0;
673 my $reindex = $force_reindex || $indextime < $modtime;
674 my $offsets = $self->_open_index($index,$reindex) or return;
675 $self->{offsets} = $offsets;
677 return $self->{offsets} unless $reindex;
679 $self->{indexing} = $index;
680 $self->calculate_offsets($file,$offsets);
681 delete $self->{indexing};
682 return $self->{offsets};
685 =head2 dbmargs
687 Title : dbmargs
688 Usage : my @args = $db->dbmargs;
689 Function: gets stored dbm arguments
690 Returns : array
691 Args : none
693 =cut
695 sub dbmargs {
696 my $self = shift;
697 my $args = $self->{dbmargs} or return;
698 return ref($args) eq 'ARRAY' ? @$args : $args;
701 =head2 index_name
703 Title : index_name
704 Usage : my $indexname = $db->index_name($path,$isdir);
705 Function: returns the name of the index for a specific path
706 Returns : string
707 Args : path to check,
708 boolean if it is a dir
710 =cut
712 sub index_name {
713 my $self = shift;
714 my ($path,$isdir) = @_;
715 unless ($path) {
716 my $dir = $self->{dirname} or return;
717 return $self->index_name($dir,-d $dir);
719 return "$path/directory.index" if $isdir;
720 return "$path.index";
723 =head2 calculate_offsets
725 Title : calculate_offsets
726 Usage : $db->calculate_offsets($filename,$offsets);
727 Function: calculates the sequence offsets in a file based on id
728 Returns : offset hash for each file
729 Args : file to process
730 $offsets - hashref of id to offset storage
732 =cut
734 sub calculate_offsets {
735 my $self = shift;
736 my ($file,$offsets) = @_;
737 my $base = $self->path2fileno(basename($file));
739 my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
740 binmode $fh;
741 warn "indexing $file\n" if $self->{debug};
742 my ($offset,@id,$linelength,$type,$firstline,$count,
743 $termination_length,$seq_lines,$last_line,%offsets);
744 my ($l3_len,$l2_len,$l_len)=(0,0,0);
746 while (<$fh>) { # don't try this at home
747 $termination_length ||= /\r\n$/ ? 2 : 1; # account for crlf-terminated Windows files
748 next unless /\S/;
749 if (index($_, ">") == 0) {
750 if (/^>(\S+)/) {
751 print STDERR "indexed $count sequences...\n"
752 if $self->{debug} && (++$count%1000) == 0;
755 my $pos = tell($fh);
756 if (@id) {
757 my $seqlength = $pos - $offset - length($_);
758 $seqlength -= $termination_length * $seq_lines;
759 my $ppos = &{$self->{packmeth}}($offset,$seqlength,
760 $linelength,$firstline,
761 $type,$base);
762 for my $id (@id) { $offsets->{$id} = $ppos }
764 @id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
765 ($offset,$firstline,$linelength) = ($pos,length($_),0);
766 $self->_check_linelength($linelength);
767 ($l3_len,$l2_len,$l_len)=(0,0,0);
768 $seq_lines = 0;
769 } else {
770 # catch bad header lines, bug 3172
771 $self->throw("FASTA header doesn't match '>(\\S+)': $_")
773 } else {
774 $l3_len= $l2_len; $l2_len= $l_len; $l_len= length($_); # need to check every line :(
775 if (DIE_ON_MISSMATCHED_LINES &&
776 $l3_len>0 && $l2_len>0 && $l3_len!=$l2_len) {
777 my $fap= substr($_,0,20)."..";
778 $self->throw("Each line of the fasta entry must be the same length except the last.
779 Line above #$. '$fap' is $l2_len != $l3_len chars.");
781 $linelength ||= length($_);
782 $type ||= $self->_type($_);
783 $seq_lines++;
785 $last_line = $_;
788 $self->_check_linelength($linelength);
789 # deal with last entry
790 if (@id) {
791 my $pos = tell($fh);
792 my $seqlength = $pos - $offset;
793 if ($linelength == 0) { # yet another pesky empty chr_random.fa file
794 $seqlength = 0;
795 } else {
796 if ($last_line !~ /\s$/) {
797 $seq_lines--;
799 $seqlength -= $termination_length * $seq_lines;
801 my $ppos = &{$self->{packmeth}}($offset,$seqlength,
802 $linelength,$firstline,
803 $type,$base);
804 for my $id (@id) { $offsets->{$id} = $ppos }
806 $offsets->{__termination_length} = $termination_length;
807 return \%offsets;
810 =head2 get_all_ids
812 Title : get_all_ids
813 Usage : my @ids = $db->get_all_ids
814 Function: gets all the stored ids in all indexes
815 Returns : list of ids
816 Args : none
818 =cut
820 sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} }
822 sub offset {
823 my $self = shift;
824 my $id = shift;
825 my $offset = $self->{offsets}{$id} or return;
826 (&{$self->{unpackmeth}}($offset))[0];
829 sub length {
830 my $self = shift;
831 my $id = shift;
832 my $offset = $self->{offsets}{$id} or return;
833 (&{$self->{unpackmeth}}($offset))[1];
836 sub linelen {
837 my $self = shift;
838 my $id = shift;
839 my $offset = $self->{offsets}{$id} or return;
840 (&{$self->{unpackmeth}}($offset))[2];
843 sub headerlen {
844 my $self = shift;
845 my $id = shift;
846 my $offset = $self->{offsets}{$id} or return;
847 (&{$self->{unpackmeth}}($offset))[3];
850 sub alphabet {
851 my $self = shift;
852 my $id = shift;
853 my $offset = $self->{offsets}{$id} or return;
854 my $type = (&{$self->{unpackmeth}}($offset))[4];
855 return $type == DNA ? 'dna'
856 : $type == RNA ? 'rna'
857 : 'protein';
861 sub path { shift->{dirname} }
863 sub header_offset {
864 my $self = shift;
865 my $id = shift;
866 return unless $self->{offsets}{$id};
867 return $self->offset($id) - $self->headerlen($id);
870 sub file {
871 my $self = shift;
872 my $id = shift;
873 my $offset = $self->{offsets}{$id} or return;
874 $self->fileno2path((&{$self->{unpackmeth}}($offset))[5]);
877 sub fileno2path {
878 my $self = shift;
879 my $no = shift;
880 return $self->{offsets}{"__file_$no"};
883 sub path2fileno {
884 my $self = shift;
885 my $path = shift;
886 if ( !defined $self->{offsets}{"__path_$path"} ) {
887 my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++);
888 $self->{offsets}{"__file_$fileno"} = $path;
890 return $self->{offsets}{"__path_$path"}
893 sub _check_linelength {
894 my $self = shift;
895 my $linelength = shift;
896 return unless defined $linelength;
897 $self->throw("Each line of the fasta file must be less than 65,536 characters. Line $. is $linelength chars.") if $linelength > 65535.
901 =head2 subseq
903 Title : subseq
904 Usage : $seqdb->subseq($id,$start,$stop);
905 Function: returns a subseq of a sequence in the db
906 Returns : subsequence data
907 Args : id of sequence, starting point, ending point
909 =cut
911 sub subseq {
912 my ($self,$id,$start,$stop) = @_;
913 if ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) {
914 ($id,$start,$stop) = ($1,$2,$3);
915 $start =~ s/_//g;
916 $stop =~ s/_//g;
918 $start ||= 1;
919 $stop ||= $self->length($id);
921 my $reversed;
922 if (defined $stop && $start > $stop) {
923 ($start,$stop) = ($stop,$start);
924 $reversed++;
927 my $data;
929 my $fh = $self->fh($id) or return;
930 my $filestart = $self->caloffset($id,$start);
931 my $filestop = $self->caloffset($id,$stop);
933 seek($fh,$filestart,0);
934 read($fh,$data,$filestop-$filestart+1);
935 $data =~ s/\n//g;
936 $data =~ s/\r//g;
937 if ($reversed) {
938 $data = reverse $data;
939 $data =~ tr/gatcGATC/ctagCTAG/;
941 $data;
944 sub fh {
945 my $self = shift;
946 my $id = shift;
947 my $file = $self->file($id) or return;
948 $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file");
951 sub header {
952 my $self = shift;
953 my $id = shift;
954 my ($offset,$seqlength,$linelength,$firstline,$type,$file)
955 = &{$self->{unpackmeth}}($self->{offsets}{$id}) or return;
956 $offset -= $firstline;
957 my $data;
958 my $fh = $self->fh($id) or return;
959 seek($fh,$offset,0);
960 read($fh,$data,$firstline);
961 chomp $data;
962 substr($data,0,1) = '';
963 $data;
966 sub caloffset {
967 my $self = shift;
968 my $id = shift;
969 my $a = shift()-1;
970 my ($offset,$seqlength,$linelength,$firstline,$type,$file) = &{$self->{unpackmeth}}($self->{offsets}{$id});
971 $a = 0 if $a < 0;
972 $a = $seqlength-1 if $a >= $seqlength;
973 my $tl = $self->{offsets}{__termination_length};
974 $offset + $linelength * int($a/($linelength-$tl)) + $a % ($linelength-$tl);
977 sub fhcache {
978 my $self = shift;
979 my $path = shift;
980 if (!$self->{fhcache}{$path}) {
981 if ($self->{curopen} >= $self->{maxopen}) {
982 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}};
983 splice(@lru, $self->{maxopen} / 3);
984 $self->{curopen} -= @lru;
985 for (@lru) { delete $self->{fhcache}{$_} }
987 $self->{fhcache}{$path} = IO::File->new($path) or return;
988 binmode $self->{fhcache}{$path};
989 $self->{curopen}++;
991 $self->{cacheseq}{$path}++;
992 $self->{fhcache}{$path}
995 sub _pack {
996 pack STRUCT,@_;
999 sub _packBig {
1000 pack STRUCTBIG,@_;
1003 sub _unpack {
1004 unpack STRUCT,shift;
1007 sub _unpackBig {
1008 unpack STRUCTBIG,shift;
1011 sub _type {
1012 shift;
1013 local $_ = shift;
1014 return /^[gatcnGATCN*-]+$/ ? DNA
1015 : /^[gaucnGAUCN*-]+$/ ? RNA
1016 : PROTEIN;
1019 =head2 get_PrimarySeq_stream
1021 Title : get_PrimarySeq_stream
1022 Usage :
1023 Function:
1024 Example :
1025 Returns :
1026 Args :
1028 =cut
1030 sub get_PrimarySeq_stream {
1031 my $self = shift;
1032 return Bio::DB::Fasta::Stream->new($self);
1035 sub TIEHASH {
1036 my $self = shift;
1037 return $self->new(@_);
1040 sub FETCH {
1041 shift->subseq(@_);
1043 sub STORE {
1044 shift->throw("Read-only database");
1046 sub DELETE {
1047 shift->throw("Read-only database");
1049 sub CLEAR {
1050 shift->throw("Read-only database");
1052 sub EXISTS {
1053 defined shift->offset(@_);
1055 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
1056 sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); }
1058 sub DESTROY {
1059 my $self = shift;
1060 if ($self->{indexing}) { # killed prematurely, so index file is no good!
1061 warn "indexing was interrupted, so unlinking $self->{indexing}";
1062 unlink $self->{indexing};
1066 #-------------------------------------------------------------
1067 # Bio::PrimarySeqI compatibility
1069 package Bio::PrimarySeq::Fasta;
1070 use overload '""' => 'display_id';
1072 use base qw(Bio::Root::Root Bio::PrimarySeqI);
1074 sub new {
1075 my $class = shift;
1076 $class = ref($class) if ref $class;
1077 my ($db,$id,$start,$stop) = @_;
1078 return bless { db => $db,
1079 id => $id,
1080 start => $start || 1,
1081 stop => $stop || $db->length($id)
1082 },$class;
1085 sub fetch_sequence { shift->seq(@_) }
1087 sub seq {
1088 my $self = shift;
1089 return $self->{db}->seq($self->{id},$self->{start},$self->{stop});
1092 sub subseq {
1093 my $self = shift;
1094 $self->trunc(@_)->seq();
1097 sub trunc {
1098 my $self = shift;
1099 my ($start,$stop) = @_;
1100 $self->throw("Stop cannot be smaller than start") unless $start <= $stop;
1101 return $self->{start} <= $self->{stop} ? $self->new($self->{db},
1102 $self->{id},
1103 $self->{start}+$start-1,
1104 $self->{start}+$stop-1)
1105 : $self->new($self->{db},
1106 $self->{id},
1107 $self->{start}-($start-1),
1108 $self->{start}-($stop-1)
1113 sub is_circular {
1114 my $self = shift;
1115 return $self->{is_circular};
1118 sub display_id {
1119 my $self = shift;
1120 return $self->{id};
1123 sub accession_number {
1124 my $self = shift;
1125 return "unknown";
1128 sub primary_id {
1129 my $self = shift;
1130 return overload::StrVal($self);
1133 sub can_call_new { return 0 }
1135 sub alphabet {
1136 my $self = shift;
1137 return $self->{db}->alphabet($self->{id});
1140 sub revcom {
1141 my $self = shift;
1142 return $self->new(@{$self}{'db','id','stop','start'});
1145 sub length {
1146 my $self = shift;
1147 #return $self->{db}->length($self->{id}); # wrong because ignores sequence start and stop values
1148 return length($self->seq);
1152 sub description {
1153 my $self = shift;
1154 my $header = $self->{'db'}->header($self->{id});
1155 # remove the id from the header
1156 return (split(/\s+/,$header,2))[1];
1159 *desc = \&description;
1161 #-------------------------------------------------------------
1162 # stream-based access to the database
1164 package Bio::DB::Fasta::Stream;
1165 use base qw(Tie::Handle Bio::DB::SeqI);
1168 sub new {
1169 my $class = shift;
1170 my $db = shift;
1171 my $key = $db->FIRSTKEY;
1172 return bless { db=>$db,key=>$key },$class;
1175 sub next_seq {
1176 my $self = shift;
1177 my ($key,$db) = @{$self}{'key','db'};
1178 while ($key =~ /^__/) {
1179 $key = $db->NEXTKEY($key);
1180 return unless defined $key;
1182 my $value = $db->get_Seq_by_id($key);
1183 $self->{key} = $db->NEXTKEY($key);
1184 $value;
1187 sub TIEHANDLE {
1188 my $class = shift;
1189 my $db = shift;
1190 return $class->new($db);
1192 sub READLINE {
1193 my $self = shift;
1194 $self->next_seq;
1199 __END__