Sync with main trunk
[bioperl-live.git] / Bio / DB / Fasta.pm
blob811ad3ebf89da27857c03afc3b41f7c19b293ccf
1 # $Id$
3 # BioPerl module for Bio::DB::Fasta
5 # You may distribute this module under the same terms as perl itself
8 # POD documentation - main docs before the code
10 =head1 NAME
12 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
14 =head1 SYNOPSIS
16 use Bio::DB::Fasta;
18 # create database from directory of fasta files
19 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
21 # simple access (for those without Bioperl)
22 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
23 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
24 my @ids = $db->ids;
25 my $length = $db->length('CHROMOSOME_I');
26 my $alphabet = $db->alphabet('CHROMOSOME_I');
27 my $header = $db->header('CHROMOSOME_I');
29 # Bioperl-style access
30 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
32 my $obj = $db->get_Seq_by_id('CHROMOSOME_I');
33 my $seq = $obj->seq; # sequence string
34 my $subseq = $obj->subseq(4_000_000 => 4_100_000); # string
35 my $trunc = $obj->trunc(4_000_000 => 4_100_000); # seq object
36 my $length = $obj->length;
37 # (etc)
39 # Bio::SeqIO-style access
40 my $stream = Bio::DB::Fasta->new('/path/to/files')->get_PrimarySeq_stream;
41 while (my $seq = $stream->next_seq) {
42 # Bio::PrimarySeqI stuff
45 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
46 while (my $seq = <$fh>) {
47 # Bio::PrimarySeqI stuff
50 # tied hash access
51 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
52 print $sequences{'CHROMOSOME_I:1,20000'};
54 =head1 DESCRIPTION
56 Bio::DB::Fasta provides indexed access to one or more Fasta files. It
57 provides random access to each sequence entry, and to subsequences
58 within each entry, allowing you to retrieve portions of very large
59 sequences without bringing the entire sequence into memory.
61 When you initialize the module, you point it at a single fasta file or
62 a directory of multiple such files. The first time it is run, the
63 module generates an index of the contents of the file or directory
64 using the AnyDBM module (Berkeley DB* preferred, followed by GDBM_File,
65 NDBM_File, and SDBM_File). Thereafter it uses the index file to find
66 the file and offset for any requested sequence. If one of the source
67 fasta files is updated, the module reindexes just that one file. (You
68 can also force reindexing manually). For improved performance, the
69 module keeps a cache of open filehandles, closing less-recently used
70 ones when the cache is full.
72 The fasta files may contain any combination of nucleotide and protein
73 sequences; during indexing the module guesses the molecular type.
74 Entries may have any line length up to 65,536 characters, and
75 different line lengths are allowed in the same file. However, within
76 a sequence entry, all lines must be the same length except for the
77 last.
79 An error will be thrown if this is not the case.
81 The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
82 from the Fasta header. During indexing, you may pass a callback routine to
83 modify this primary ID. For example, you may wish to extract a
84 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
85 The original header line can be recovered later.
87 This module was developed for use with the C. elegans and human
88 genomes, and has been tested with sequence segments as large as 20
89 megabases. Indexing the C. elegans genome (100 megabases of genomic
90 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
91 laptop. On the same system, average access time for any 200-mer within
92 the C. elegans genome was E<lt>0.02s.
94 *Berkeley DB can be obtained free from www.sleepycat.com. After it is
95 installed you will need to install the BerkeleyDB Perl module.
97 =head1 DATABASE CREATION AND INDEXING
99 The two constructors for this class are new() and newFh(). The former
100 creates a Bio::DB::Fasta object which is accessed via method calls.
101 The latter creates a tied filehandle which can be used Bio::SeqIO
102 style to fetch sequence objects in a stream fashion. There is also a
103 tied hash interface.
105 =over 2
107 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
109 Create a new Bio::DB::Fasta object from the Fasta file or files
110 indicated by $fasta_path. Indexing will be performed automatically if
111 needed. If successful, new() will return the database accessor
112 object. Otherwise it will return undef.
114 $fasta_path may be an individual Fasta file, or may refer to a
115 directory containing one or more of such files. Following the path,
116 you may pass a series of name=E<gt>value options or a hash with these
117 same name=E<gt>value pairs. Valid options are:
119 Option Name Description Default
120 ----------- ----------- -------
122 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna}
123 for searching for Fasta
124 files in directories.
126 -makeid A code subroutine for None
127 transforming Fasta IDs.
129 -maxopen Maximum size of 32
130 filehandle cache.
132 -debug Turn on status 0
133 messages.
135 -reindex Force the index to be 0
136 rebuilt.
138 -dbmargs Additional arguments none
139 to pass to the DBM
140 routines when tied
141 (scalar or array ref).
143 -dbmargs can be used to control the format of the index. For example,
144 you can pass $DB_BTREE to this argument so as to force the IDs to be
145 sorted and retrieved alphabetically. Note that you must use the same
146 arguments every time you open the index!
148 -reindex can be used to force the index to be recreated from scratch.
150 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
152 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading
153 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
154 Bio::SeqIO style.
156 =back
158 The -makeid option gives you a chance to modify sequence IDs during
159 indexing. The option value should be a code reference that will
160 take a scalar argument and return a scalar result, like this:
162 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
164 sub make_my_id {
165 my $description_line = shift;
166 # get a different id from the fasta header, e.g.
167 $description_line =~ /(\S+)$/;
168 return $1;
171 make_my_id() will be called with the full fasta id line (including the
172 "E<gt>" symbol!). For example:
174 >A12345.3 Predicted C. elegans protein egl-2
176 By default, this module will use the regular expression /^E<gt>(\S+)/
177 to extract "A12345.3" for use as the ID. If you pass a -makeid
178 callback, you can extract any portion of this, such as the "egl-2"
179 symbol.
181 The -makeid option is ignored after the index is constructed.
183 =head1 OBJECT METHODS
185 The following object methods are provided.
187 =over 10
189 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
191 Return the raw sequence (a string) given an ID and optionally a start
192 and stop position in the sequence. In the case of DNA sequence, if
193 $stop is less than $start, then the reverse complement of the sequence
194 is returned (this violates Bio::Seq conventions).
196 For your convenience, subsequences can be indicated with any of the
197 following compound IDs:
199 $db->seq("$id:$start,$stop")
201 $db->seq("$id:$start..$stop")
203 $db->seq("$id:$start-$stop")
205 =item $length = $db-E<gt>length($id)
207 Return the length of the indicated sequence.
209 =item $header = $db-E<gt>header($id)
211 Return the header line for the ID, including the initial "E<gt>".
213 =item $type = $db-E<gt>alphabet($id)
215 Return the molecular type of the indicated sequence. One of "dna",
216 "rna" or "protein".
218 =item $filename = $db-E<gt>file($id)
220 Return the name of the file in which the indicated sequence can be
221 found.
223 =item $offset = $db-E<gt>offset($id)
225 Return the offset of the indicated sequence from the beginning of the
226 file in which it is located. The offset points to the beginning of
227 the sequence, not the beginning of the header line.
229 =item $header_length = $db-E<gt>headerlen($id)
231 Return the length of the header line for the indicated sequence.
233 =item $header_offset = $db-E<gt>header_offset($id)
235 Return the offset of the header line for the indicated sequence from
236 the beginning of the file in which it is located.
238 =item $index_name = $db-E<gt>index_name
240 Return the path to the index file.
242 =item $path = $db-E<gt>path
244 Return the path to the Fasta file(s).
246 =back
248 For BioPerl-style access, the following methods are provided:
250 =over 4
252 =item $seq = $db-E<gt>get_Seq_by_id($id)
254 Return a Bio::PrimarySeq::Fasta object, which obeys the
255 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or
256 protein sequence, call $seq-E<gt>seq().
258 Note that get_Seq_by_id() does not bring the entire sequence into
259 memory until requested. Internally, the returned object uses the
260 accessor to generate subsequences as needed.
262 =item $seq = $db-E<gt>get_Seq_by_acc($id)
264 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
266 These methods all do the same thing as get_Seq_by_id().
268 =item $stream = $db-E<gt>get_PrimarySeq_stream()
270 Return a Bio::DB::Fasta::Stream object, which supports a single method
271 next_seq(). Each call to next_seq() returns a new
272 Bio::PrimarySeq::Fasta object, until no more sequences remain.
274 =back
276 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
277 returned from get_Seq_by_id() and get_PrimarySeq_stream().
279 =head1 TIED INTERFACES
281 This module provides two tied interfaces, one which allows you to
282 treat the sequence database as a hash, and the other which allows you
283 to treat the database as an I/O stream.
285 =head2 Creating a Tied Hash
287 The tied hash interface is very straightforward
289 =over 1
291 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
293 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
294 The optional @args list is the same set of named argument/value pairs
295 used by Bio::DB::Fasta-E<gt>new().
297 If successful, tie() will return the tied object. Otherwise it will
298 return undef.
300 =back
302 Once tied, you can use the hash to retrieve an individual sequence by
303 its ID, like this:
305 my $seq = $db{CHROMOSOME_I};
307 You may select a subsequence by appending the comma-separated range to
308 the sequence ID in the format "$id:$start,$stop". For example, here
309 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
311 my $seq = $db{'CHROMOSOME_I:1,1000'};
313 (The regular expression used to parse this format allows sequence IDs
314 to contain colons.)
316 When selecting subsequences, if $start E<gt> stop, then the reverse
317 complement will be returned for DNA sequences.
319 The keys() and values() functions will return the sequence IDs and
320 their sequences, respectively. In addition, each() can be used to
321 iterate over the entire data set:
323 while (my ($id,$sequence) = each %db) {
324 print "$id => $sequence\n";
327 When dealing with very large sequences, you can avoid bringing them
328 into memory by calling each() in a scalar context. This returns the
329 key only. You can then use tied(%db) to recover the Bio::DB::Fasta
330 object and call its methods.
332 while (my $id = each %db) {
333 print "$id => $db{$sequence:1,100}\n";
334 print "$id => ",tied(%db)->length($id),"\n";
337 You may, in addition invoke Bio::DB::Fasta the FIRSTKEY and NEXTKEY tied
338 hash methods directly.
340 =over 2
342 =item $id = $db-E<gt>FIRSTKEY
344 Return the first ID in the database.
346 =item $id = $db-E<gt>NEXTKEY($id)
348 Given an ID, return the next ID in sequence.
350 =back
352 This allows you to write the following iterative loop using just the
353 object-oriented interface:
355 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
356 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
357 # do something with sequence
360 =head2 Creating a Tied Filehandle
362 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
363 which you can read Bio::PrimarySeq::Fasta sequence objects
364 sequentially. The following bit of code will iterate sequentially
365 over all sequences in the database:
367 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
368 while (my $seq = <$fh>) {
369 print $seq->id,' => ',$seq->length,"\n";
372 When no more sequences remain to be retrieved, the stream will return
373 undef.
375 =head1 BUGS
377 When a sequence is deleted from one of the Fasta files, this deletion
378 is not detected by the module and removed from the index. As a
379 result, a "ghost" entry will remain in the index and will return
380 garbage results if accessed.
382 Currently, the only way to accomodate deletions is to rebuild the
383 entire index, either by deleting it manually, or by passing
384 -reindex=E<gt>1 to new() when initializing the module.
386 =head1 SEE ALSO
388 L<bioperl>
390 =head1 AUTHOR
392 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
394 Copyright (c) 2001 Cold Spring Harbor Laboratory.
396 This library is free software; you can redistribute it and/or modify
397 it under the same terms as Perl itself. See DISCLAIMER.txt for
398 disclaimers of warranty.
400 =cut
403 package Bio::DB::Fasta;
405 BEGIN {
406 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
409 use strict;
410 use IO::File;
411 use AnyDBM_File;
412 use Fcntl;
413 use File::Basename qw(basename dirname);
415 use base qw(Bio::DB::SeqI Bio::Root::Root);
417 *seq = *sequence = \&subseq;
418 *ids = \&get_all_ids;
419 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
421 use constant STRUCT =>'NNnnCa*';
422 use constant STRUCTBIG =>'QQnnCa*'; # 64-bit file offset and seq length
423 use constant DNA => 1;
424 use constant RNA => 2;
425 use constant PROTEIN => 3;
426 use constant DIE_ON_MISSMATCHED_LINES => 1; # if you want
428 # Bio::DB-like object
429 # providing fast random access to a directory of FASTA files
431 =head2 new
433 Title : new
434 Usage : my $db = Bio::DB::Fasta->new( $path, @options);
435 Function: initialize a new Bio::DB::Fasta object
436 Returns : new Bio::DB::Fasta object
437 Args : path to dir of fasta files or a single filename
439 These are optional arguments to pass in as well.
441 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST}
442 for searching for Fasta
443 files in directories.
445 -makeid A code subroutine for None
446 transforming Fasta IDs.
448 -maxopen Maximum size of 32
449 filehandle cache.
451 -debug Turn on status 0
452 messages.
454 -reindex Force the index to be 0
455 rebuilt.
457 -dbmargs Additional arguments none
458 to pass to the DBM
459 routines when tied
460 (scalar or array ref).
462 =cut
464 sub new {
465 my $class = shift;
466 my $path = shift;
467 my %opts = @_;
469 my $self = bless { debug => $opts{-debug},
470 makeid => $opts{-makeid},
471 glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,FNA,fna,FAA,faa,FSA,fsa}',
472 maxopen => $opts{-maxfh} || 32,
473 dbmargs => $opts{-dbmargs} || undef,
474 fhcache => {},
475 cacheseq => {},
476 curopen => 0,
477 openseq => 1,
478 dirname => undef,
479 offsets => undef,
480 }, $class;
481 my ($offsets,$dirname);
483 if (-d $path) {
484 # because Win32 glob() is broken with respect to long file names
485 # that contain whitespace.
486 $path = Win32::GetShortPathName($path)
487 if $^O =~ /^MSWin/i && eval 'use Win32; 1';
488 $offsets = $self->index_dir($path,$opts{-reindex}) or return;
489 $dirname = $path;
490 } elsif (-f _) {
491 $offsets = $self->index_file($path,$opts{-reindex});
492 $dirname = dirname($path);
493 } else {
494 $self->throw( "$path: Invalid file or dirname");
496 @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
498 $self;
501 =head2 newFh
503 Title : newFh
504 Function: gets a new Fh for a file
505 Example : internal method
506 Returns : GLOB
507 Args :
509 =cut
511 sub newFh {
512 my $class = shift;
513 my $self = $class->new(@_);
514 require Symbol;
515 my $fh = Symbol::gensym or return;
516 tie $$fh,'Bio::DB::Fasta::Stream',$self or return;
517 $fh;
520 sub _open_index {
521 my $self = shift;
522 my ($index,$write) = @_;
523 my %offsets;
524 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
525 my @dbmargs = $self->dbmargs;
526 eval {
527 tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs
528 or die "Can't open sequence index file $index: $!";
530 warn $@ if $@;
531 return \%offsets;
534 sub _close_index {
535 my $self = shift;
536 my $index = shift;
537 untie %$index;
540 =head2 index_dir
542 Title : index_dir
543 Usage : $db->index_dir($dir)
544 Function: set the index dir and load all files in the dir
545 Returns : hashref of seq offsets in each file
546 Args : dirname, boolean to force a reload of all files
548 =cut
550 sub index_dir {
551 my $self = shift;
552 my $dir = shift;
553 my $force_reindex = shift;
555 # find all fasta files
556 my @files = glob("$dir/$self->{glob}");
557 # $self->throw( "no fasta files in $dir") unless @files;
558 return unless @files;
560 # get name of index
561 my $index = $self->index_name($dir,1);
563 # if caller has requested reindexing, then unlink
564 # the index file.
565 unlink $index if $force_reindex;
567 # get the modification time of the index
568 my $indextime = 0;
569 for my $suffix('','.pag','.dir') {
570 $indextime ||= (stat("${index}${suffix}"))[9];
572 $indextime ||= 0; # prevent some uninit variable warnings
574 # get the most recent modification time of any of the contents
575 my $modtime = 0;
576 my %modtime;
577 $self->set_pack_method( @files );
578 foreach (@files) {
579 my $m = (stat($_))[9];
580 $modtime{$_} = $m;
581 $modtime = $m if defined $m && $modtime < $m;
584 my $reindex = $force_reindex || $indextime < $modtime;
585 $self->{offsets} = $self->_open_index($index,$reindex) or return;
587 # no indexing needed
588 return $self->{offsets} unless $reindex;
590 # otherwise reindex contents of changed files
591 $self->{indexing} = $index;
592 foreach (@files) {
593 next if( defined $indextime && $modtime{$_} <= $indextime);
594 $self->calculate_offsets($_,$self->{offsets});
596 delete $self->{indexing};
598 # we've been having troubles with corrupted index files on Windows systems,
599 # so possibly closing and reopening will help
600 $self->_close_index($self->{offsets});
602 return $self->{offsets} = $self->_open_index($index);
605 =head2 get_Seq_by_id
607 Title : get_Seq_by_id
608 Usage : my $seq = $db->get_Seq_by_id($id)
609 Function: Bio::DB::RandomAccessI method implemented
610 Returns : Bio::PrimarySeqI object
611 Args : id
613 =cut
615 sub get_Seq_by_id {
616 my $self = shift;
617 my $id = shift;
618 return unless exists $self->{offsets}{$id};
619 return Bio::PrimarySeq::Fasta->new($self,$id);
622 =head2 set_pack_method
624 Title : set_pack_method
625 Usage : $db->set_pack_method( @files )
626 Function: Determines whether data packing uses 32 or 64 bit integers
627 Returns :
628 Args : one or more file paths
630 =cut
632 sub set_pack_method {
633 my $self = shift;
634 # Find the maximum file size:
635 my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
636 my $fourGB = (2 ** 32) - 1;
638 if ($maxsize > $fourGB) {
639 # At least one file exceeds 4Gb - we will need to use 64 bit ints
640 $self->{packmeth} = \&_packBig;
641 $self->{unpackmeth} = \&_unpackBig;
642 } else {
643 $self->{packmeth} = \&_pack;
644 $self->{unpackmeth} = \&_unpack;
648 =head2 index_file
650 Title : index_file
651 Usage : $db->index_file($filename)
652 Function: (re)loads a sequence file and indexes sequences offsets in the file
653 Returns : seq offsets in the file
654 Args : filename,
655 boolean to force reloading a file
657 =cut
660 sub index_file {
661 my $self = shift;
662 my $file = shift;
663 my $force_reindex = shift;
665 $self->set_pack_method( $file );
666 my $index = $self->index_name($file);
667 # if caller has requested reindexing, then unlink the index
668 unlink $index if $force_reindex;
670 # get the modification time of the index
671 my $indextime = (stat($index))[9] || 0;
672 my $modtime = (stat($file))[9] || 0;
674 my $reindex = $force_reindex || $indextime < $modtime;
675 my $offsets = $self->_open_index($index,$reindex) or return;
676 $self->{offsets} = $offsets;
678 return $self->{offsets} unless $reindex;
680 $self->{indexing} = $index;
681 $self->calculate_offsets($file,$offsets);
682 delete $self->{indexing};
683 return $self->{offsets};
686 =head2 dbmargs
688 Title : dbmargs
689 Usage : my @args = $db->dbmargs;
690 Function: gets stored dbm arguments
691 Returns : array
692 Args : none
694 =cut
696 sub dbmargs {
697 my $self = shift;
698 my $args = $self->{dbmargs} or return;
699 return ref($args) eq 'ARRAY' ? @$args : $args;
702 =head2 index_name
704 Title : index_name
705 Usage : my $indexname = $db->index_name($path,$isdir);
706 Function: returns the name of the index for a specific path
707 Returns : string
708 Args : path to check,
709 boolean if it is a dir
711 =cut
713 sub index_name {
714 my $self = shift;
715 my ($path,$isdir) = @_;
716 unless ($path) {
717 my $dir = $self->{dirname} or return;
718 return $self->index_name($dir,-d $dir);
720 return "$path/directory.index" if $isdir;
721 return "$path.index";
724 =head2 calculate_offsets
726 Title : calculate_offsets
727 Usage : $db->calculate_offsets($filename,$offsets);
728 Function: calculates the sequence offsets in a file based on id
729 Returns : offset hash for each file
730 Args : file to process
731 $offsets - hashref of id to offset storage
733 =cut
735 sub calculate_offsets {
736 my $self = shift;
737 my ($file,$offsets) = @_;
738 my $base = $self->path2fileno(basename($file));
740 my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
741 binmode $fh;
742 warn "indexing $file\n" if $self->{debug};
743 my ($offset,@id,$linelength,$type,$firstline,$count,
744 $termination_length,$seq_lines,$last_line,%offsets);
745 my ($l3_len,$l2_len,$l_len)=(0,0,0);
747 while (<$fh>) { # don't try this at home
748 $termination_length ||= /\r\n$/ ? 2 : 1; # account for crlf-terminated Windows files
749 if (/^>(\S+)/) {
750 print STDERR "indexed $count sequences...\n"
751 if $self->{debug} && (++$count%1000) == 0;
752 my $pos = tell($fh);
753 if (@id) {
754 my $seqlength = $pos - $offset - length($_);
755 $seqlength -= $termination_length * $seq_lines;
756 my $ppos = &{$self->{packmeth}}($offset,$seqlength,
757 $linelength,$firstline,
758 $type,$base);
759 for my $id (@id) { $offsets->{$id} = $ppos }
761 @id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
762 ($offset,$firstline,$linelength) = ($pos,length($_),0);
763 $self->_check_linelength($linelength);
764 ($l3_len,$l2_len,$l_len)=(0,0,0);
765 $seq_lines = 0;
766 } else {
767 $l3_len= $l2_len; $l2_len= $l_len; $l_len= length($_); # need to check every line :(
768 if (DIE_ON_MISSMATCHED_LINES &&
769 $l3_len>0 && $l2_len>0 && $l3_len!=$l2_len) {
770 my $fap= substr($_,0,20)."..";
771 $self->throw("Each line of the fasta entry must be the same length except the last.
772 Line above #$. '$fap' is $l2_len != $l3_len chars.");
774 $linelength ||= length($_);
775 $type ||= $self->_type($_);
776 $seq_lines++;
778 $last_line = $_;
781 $self->_check_linelength($linelength);
782 # deal with last entry
783 if (@id) {
784 my $pos = tell($fh);
785 my $seqlength = $pos - $offset;
786 if ($linelength == 0) { # yet another pesky empty chr_random.fa file
787 $seqlength = 0;
788 } else {
789 if ($last_line !~ /\s$/) {
790 $seq_lines--;
792 $seqlength -= $termination_length * $seq_lines;
794 my $ppos = &{$self->{packmeth}}($offset,$seqlength,
795 $linelength,$firstline,
796 $type,$base);
797 for my $id (@id) { $offsets->{$id} = $ppos }
799 $offsets->{__termination_length} = $termination_length;
800 return \%offsets;
803 =head2 get_all_ids
805 Title : get_all_ids
806 Usage : my @ids = $db->get_all_ids
807 Function: gets all the stored ids in all indexes
808 Returns : list of ids
809 Args : none
811 =cut
813 sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} }
815 sub offset {
816 my $self = shift;
817 my $id = shift;
818 my $offset = $self->{offsets}{$id} or return;
819 (&{$self->{unpackmeth}}($offset))[0];
822 sub length {
823 my $self = shift;
824 my $id = shift;
825 my $offset = $self->{offsets}{$id} or return;
826 (&{$self->{unpackmeth}}($offset))[1];
829 sub linelen {
830 my $self = shift;
831 my $id = shift;
832 my $offset = $self->{offsets}{$id} or return;
833 (&{$self->{unpackmeth}}($offset))[2];
836 sub headerlen {
837 my $self = shift;
838 my $id = shift;
839 my $offset = $self->{offsets}{$id} or return;
840 (&{$self->{unpackmeth}}($offset))[3];
843 sub alphabet {
844 my $self = shift;
845 my $id = shift;
846 my $offset = $self->{offsets}{$id} or return;
847 my $type = (&{$self->{unpackmeth}}($offset))[4];
848 return $type == DNA ? 'dna'
849 : $type == RNA ? 'rna'
850 : 'protein';
854 sub path { shift->{dirname} }
856 sub header_offset {
857 my $self = shift;
858 my $id = shift;
859 return unless $self->{offsets}{$id};
860 return $self->offset($id) - $self->headerlen($id);
863 sub file {
864 my $self = shift;
865 my $id = shift;
866 my $offset = $self->{offsets}{$id} or return;
867 $self->fileno2path((&{$self->{unpackmeth}}($offset))[5]);
870 sub fileno2path {
871 my $self = shift;
872 my $no = shift;
873 return $self->{offsets}{"__file_$no"};
876 sub path2fileno {
877 my $self = shift;
878 my $path = shift;
879 if ( !defined $self->{offsets}{"__path_$path"} ) {
880 my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++);
881 $self->{offsets}{"__file_$fileno"} = $path;
883 return $self->{offsets}{"__path_$path"}
886 sub _check_linelength {
887 my $self = shift;
888 my $linelength = shift;
889 return unless defined $linelength;
890 $self->throw("Each line of the fasta file must be less than 65,536 characters. Line $. is $linelength chars.") if $linelength > 65535.
894 =head2 subseq
896 Title : subseq
897 Usage : $seqdb->subseq($id,$start,$stop);
898 Function: returns a subseq of a sequence in the db
899 Returns : subsequence data
900 Args : id of sequence, starting point, ending point
902 =cut
904 sub subseq {
905 my ($self,$id,$start,$stop) = @_;
906 if ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) {
907 ($id,$start,$stop) = ($1,$2,$3);
908 $start =~ s/_//g;
909 $stop =~ s/_//g;
911 $start ||= 1;
912 $stop ||= $self->length($id);
914 my $reversed;
915 if (defined $stop && $start > $stop) {
916 ($start,$stop) = ($stop,$start);
917 $reversed++;
920 my $data;
922 my $fh = $self->fh($id) or return;
923 my $filestart = $self->caloffset($id,$start);
924 my $filestop = $self->caloffset($id,$stop);
926 seek($fh,$filestart,0);
927 read($fh,$data,$filestop-$filestart+1);
928 $data =~ s/\n//g;
929 $data =~ s/\r//g;
930 if ($reversed) {
931 $data = reverse $data;
932 $data =~ tr/gatcGATC/ctagCTAG/;
934 $data;
937 sub fh {
938 my $self = shift;
939 my $id = shift;
940 my $file = $self->file($id) or return;
941 $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file");
944 sub header {
945 my $self = shift;
946 my $id = shift;
947 my ($offset,$seqlength,$linelength,$firstline,$type,$file)
948 = &{$self->{unpackmeth}}($self->{offsets}{$id}) or return;
949 $offset -= $firstline;
950 my $data;
951 my $fh = $self->fh($id) or return;
952 seek($fh,$offset,0);
953 read($fh,$data,$firstline);
954 chomp $data;
955 substr($data,0,1) = '';
956 $data;
959 sub caloffset {
960 my $self = shift;
961 my $id = shift;
962 my $a = shift()-1;
963 my ($offset,$seqlength,$linelength,$firstline,$type,$file) = &{$self->{unpackmeth}}($self->{offsets}{$id});
964 $a = 0 if $a < 0;
965 $a = $seqlength-1 if $a >= $seqlength;
966 my $tl = $self->{offsets}{__termination_length};
967 $offset + $linelength * int($a/($linelength-$tl)) + $a % ($linelength-$tl);
970 sub fhcache {
971 my $self = shift;
972 my $path = shift;
973 if (!$self->{fhcache}{$path}) {
974 if ($self->{curopen} >= $self->{maxopen}) {
975 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}};
976 splice(@lru, $self->{maxopen} / 3);
977 $self->{curopen} -= @lru;
978 for (@lru) { delete $self->{fhcache}{$_} }
980 $self->{fhcache}{$path} = IO::File->new($path) or return;
981 binmode $self->{fhcache}{$path};
982 $self->{curopen}++;
984 $self->{cacheseq}{$path}++;
985 $self->{fhcache}{$path}
988 sub _pack {
989 pack STRUCT,@_;
992 sub _packBig {
993 pack STRUCTBIG,@_;
996 sub _unpack {
997 unpack STRUCT,shift;
1000 sub _unpackBig {
1001 unpack STRUCTBIG,shift;
1004 sub _type {
1005 shift;
1006 local $_ = shift;
1007 return /^[gatcnGATCN*-]+$/ ? DNA
1008 : /^[gaucnGAUCN*-]+$/ ? RNA
1009 : PROTEIN;
1012 =head2 get_PrimarySeq_stream
1014 Title : get_PrimarySeq_stream
1015 Usage :
1016 Function:
1017 Example :
1018 Returns :
1019 Args :
1021 =cut
1023 sub get_PrimarySeq_stream {
1024 my $self = shift;
1025 return Bio::DB::Fasta::Stream->new($self);
1028 sub TIEHASH {
1029 my $self = shift;
1030 return $self->new(@_);
1033 sub FETCH {
1034 shift->subseq(@_);
1036 sub STORE {
1037 shift->throw("Read-only database");
1039 sub DELETE {
1040 shift->throw("Read-only database");
1042 sub CLEAR {
1043 shift->throw("Read-only database");
1045 sub EXISTS {
1046 defined shift->offset(@_);
1048 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
1049 sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); }
1051 sub DESTROY {
1052 my $self = shift;
1053 if ($self->{indexing}) { # killed prematurely, so index file is no good!
1054 warn "indexing was interrupted, so unlinking $self->{indexing}";
1055 unlink $self->{indexing};
1059 #-------------------------------------------------------------
1060 # Bio::PrimarySeqI compatibility
1062 package Bio::PrimarySeq::Fasta;
1063 use overload '""' => 'display_id';
1065 use base qw(Bio::Root::Root Bio::PrimarySeqI);
1067 sub new {
1068 my $class = shift;
1069 $class = ref($class) if ref $class;
1070 my ($db,$id,$start,$stop) = @_;
1071 return bless { db => $db,
1072 id => $id,
1073 start => $start || 1,
1074 stop => $stop || $db->length($id)
1075 },$class;
1078 sub seq {
1079 my $self = shift;
1080 return $self->{db}->seq($self->{id},$self->{start},$self->{stop});
1083 sub subseq {
1084 my $self = shift;
1085 $self->trunc(@_)->seq();
1088 sub trunc {
1089 my $self = shift;
1090 my ($start,$stop) = @_;
1091 $self->throw("Stop cannot be smaller than start") unless $start <= $stop;
1092 return $self->{start} <= $self->{stop} ? $self->new($self->{db},
1093 $self->{id},
1094 $self->{start}+$start-1,
1095 $self->{start}+$stop-1)
1096 : $self->new($self->{db},
1097 $self->{id},
1098 $self->{start}-($start-1),
1099 $self->{start}-($stop-1)
1104 sub is_circular {
1105 my $self = shift;
1106 return $self->{is_circular};
1109 sub display_id {
1110 my $self = shift;
1111 return $self->{id};
1114 sub accession_number {
1115 my $self = shift;
1116 return "unknown";
1119 sub primary_id {
1120 my $self = shift;
1121 return overload::StrVal($self);
1124 sub can_call_new { return 0 }
1126 sub alphabet {
1127 my $self = shift;
1128 return $self->{db}->alphabet($self->{id});
1131 sub revcom {
1132 my $self = shift;
1133 return $self->new(@{$self}{'db','id','stop','start'});
1136 sub length {
1137 my $self = shift;
1138 #return $self->{db}->length($self->{id}); # wrong because ignores sequence start and stop values
1139 return length($self->seq);
1143 sub description {
1144 my $self = shift;
1145 my $header = $self->{'db'}->header($self->{id});
1146 # remove the id from the header
1147 return (split(/\s+/,$header,2))[2];
1150 *desc = \&description;
1152 #-------------------------------------------------------------
1153 # stream-based access to the database
1155 package Bio::DB::Fasta::Stream;
1156 use base qw(Tie::Handle Bio::DB::SeqI);
1159 sub new {
1160 my $class = shift;
1161 my $db = shift;
1162 my $key = $db->FIRSTKEY;
1163 return bless { db=>$db,key=>$key },$class;
1166 sub next_seq {
1167 my $self = shift;
1168 my ($key,$db) = @{$self}{'key','db'};
1169 while ($key =~ /^__/) {
1170 $key = $db->NEXTKEY($key);
1171 return unless defined $key;
1173 my $value = $db->get_Seq_by_id($key);
1174 $self->{key} = $db->NEXTKEY($key);
1175 $value;
1178 sub TIEHANDLE {
1179 my $class = shift;
1180 my $db = shift;
1181 return $class->new($db);
1183 sub READLINE {
1184 my $self = shift;
1185 $self->next_seq;
1190 __END__