Test -makeid option
[bioperl-live.git] / Bio / DB / IndexedBase.pm
bloba8c702860d7f3254a107f79875d907481a5ec048
2 # BioPerl module for Bio::DB::IndexedBase
4 # You may distribute this module under the same terms as perl itself
8 =head1 NAME
10 Bio::DB::IndexedBase - Base class for modules using indexed sequence files
12 =head1 SYNOPSIS
14 use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase
16 # 1/ Bio::SeqIO-style access
18 # Index some sequence files
19 my $db = Bio::DB::XXX->new('/path/to/file'); # from a single file
20 my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
21 my $db = Bio::DB::XXX->new('/path/to/files/'); # from a directory
23 # Get IDs of all the sequences in the database
24 my @ids = $db->ids;
26 # Get a specific sequence
27 my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29 # Loop through all sequences
30 my $stream = $db->get_Seq_stream;
31 while (my $seq = $stream->next_seq) {
32 # Do something...
36 # 2/ Access via filehandle
37 my $fh = Bio::DB::XXX->newFh('/path/to/file');
38 while (my $seq = <$fh>) {
39 # Do something...
43 # 3/ Tied-hash access
44 tie %sequences, 'Bio::DB::XXX', '/path/to/file';
45 print $sequences{'CHROMOSOME_I:1,20000'};
47 =head1 DESCRIPTION
49 Bio::DB::IndexedBase provides a base class for modules that want to index
50 and read sequence files and provides persistent, random access to each sequence
51 entry, without bringing the entire file into memory. Bio::DB::Fasta and
52 Bio::DB::Qual both use Bio::DB::IndexedBase.
54 When you initialize the module, you point it at a single file, several files, or
55 a directory of files. The first time it is run, the module generates an index
56 of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
57 followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
58 index file to find the sequence file and offset for any requested sequence. If
59 one of the source files is updated, the module reindexes just that one file. You
60 can also force reindexing manually at any time. For improved performance, the
61 module keeps a cache of open filehandles, closing less-recently used ones when
62 the cache is full.
64 Entries may have any line length up to 65,536 characters, and different line
65 lengths are allowed in the same file. However, within a sequence entry, all
66 lines must be the same length except for the last. An error will be thrown if
67 this is not the case!
69 This module was developed for use with the C. elegans and human genomes, and has
70 been tested with sequence segments as large as 20 megabases. Indexing the C.
71 elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
72 minutes on my 300 MHz pentium laptop. On the same system, average access time
73 for any 200-mer within the C. elegans genome was E<lt>0.02s.
75 =head1 DATABASE CREATION AND INDEXING
77 The two constructors for this class are new() and newFh(). The former creates a
78 Bio::DB::IndexedBase object which is accessed via method calls. The latter
79 creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
80 objects in a stream fashion. There is also a tied hash interface.
82 =over
84 =item $db = Bio::DB::IndexedBase-E<gt>new($path [,%options])
86 Create a new Bio::DB::IndexedBase object from the files designated by $path
87 $path may be a single file, an arrayref of files, or a directory containing
88 such files.
90 After the database is created, you can use methods like ids() and get_Seq_by_id()
91 to retrieve sequence objects.
93 =item $fh = Bio::DB::IndexedBase-E<gt>newFh($path [,%options])
95 Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
96 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
97 Bio::SeqIO style. The path and the options should be specified as for new().
99 =item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
101 Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
102 path to the files. The optional @args list is the same set used by new(). If
103 successful, tie() returns the tied object, undef otherwise.
105 Once tied, you can use the hash to retrieve an individual sequence by
106 its ID, like this:
108 my $seq = $db{CHROMOSOME_I};
110 The keys() and values() functions will return the sequence IDs and
111 their sequences, respectively. In addition, each() can be used to
112 iterate over the entire data set:
114 while (my ($id,$sequence) = each %db) {
115 print "$id => $sequence\n";
119 When dealing with very large sequences, you can avoid bringing them
120 into memory by calling each() in a scalar context. This returns the
121 key only. You can then use tied(%db) to recover the Bio::DB::IndexedBase
122 object and call its methods.
124 while (my $id = each %db) {
125 print "$id: $db{$sequence:1,100}\n";
126 print "$id: ".tied(%db)->length($id)."\n";
129 In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
130 to retrieve the first and next ID in the database, respectively. This allows to
131 write the following iterative loop using just the object-oriented interface:
133 my $db = Bio::DB::IndexedBase->new('/path/to/file');
134 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
135 # do something with sequence
138 =back
140 =head1 INDEX CONTENT
142 Several attributes of each sequence are stored in the index file. Given a
143 sequence ID, these attributes can be retrieved using the following methods:
145 =over
147 =item offset($id)
149 Get the offset of the indicated sequence from the beginning of the file in which
150 it is located. The offset points to the beginning of the sequence, not the
151 beginning of the header line.
153 =item strlen($id)
155 Get the number of characters in the sequence string.
157 =item length($id)
159 Get the number of residues of the sequence.
161 =item linelen($id)
163 Get the length of the line for this sequence. If the sequence is wrapped, then
164 linelen() is likely to be much shorter than strlen().
166 =item headerlen($id)
168 Get the length of the header line for the indicated sequence.
170 =item header_offset
172 Get the offset of the header line for the indicated sequence from the beginning
173 of the file in which it is located. This attribute is not stored. It is
174 calculated from offset() and headerlen().
176 =item alphabet($id)
178 Get the molecular type (alphabet) of the indicated sequence.
180 =item file($id)
182 Get the the name of the file in which the indicated sequence can be found.
184 =back
186 =head1 BUGS
188 When a sequence is deleted from one of the files, this deletion is not detected
189 by the module and removed from the index. As a result, a "ghost" entry will
190 remain in the index and will return garbage results if accessed.
192 Currently, the only way to accommodate sequence removal is to rebuild the entire
193 index, either by deleting it manually, or by passing -reindex=E<gt>1 to new()
194 when initializing the module.
197 =head1 SEE ALSO
199 L<DB_File>
201 L<Bio::DB::Fasta>
203 L<Bio::DB::Qual>
205 =head1 AUTHOR
207 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
209 Copyright (c) 2001 Cold Spring Harbor Laboratory.
211 Florent Angly (for the modularization)
213 This library is free software; you can redistribute it and/or modify
214 it under the same terms as Perl itself. See DISCLAIMER.txt for
215 disclaimers of warranty.
217 =head1 APPENDIX
219 The rest of the documentation details each of the object
220 methods. Internal methods are usually preceded with a _
222 =cut
225 package Bio::DB::IndexedBase;
227 BEGIN {
228 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
231 use strict;
232 use IO::File;
233 use AnyDBM_File;
234 use Fcntl;
235 use File::Spec;
236 use File::Basename qw(dirname);
238 use base qw(Bio::Root::Root);
240 # Store offset, strlen, linelen, headerlen, type and fileno
241 use constant STRUCT => 'NNNnnCa*'; # 32-bit file offset and seq length
242 use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
244 use constant NA => 0;
245 use constant DNA => 1;
246 use constant RNA => 2;
247 use constant PROTEIN => 3;
249 use constant DIE_ON_MISSMATCHED_LINES => 1;
250 # you can avoid dying if you want but you may get incorrect results
252 my ($caller, @fileno2path, %filepath2no);
255 =head2 new
257 Title : new
258 Usage : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
259 Function: Initialize a new database object
260 Returns : A Bio::DB::IndexedBase object
261 Args : A single file, or path to dir, or arrayref of files
262 Optional arguments:
264 Option Description Default
265 ----------- ----------- -------
267 -glob Glob expression to search for files in directories *
269 -makeid A code subroutine for transforming IDs None
271 -maxopen Maximum size of filehandle cache 32
273 -debug Turn on status messages 0
275 -reindex Force the index to be rebuilt 0
277 -dbmargs Additional arguments to pass to the DBM routine None
279 -index_name Name of the file that will hold the indices
281 -clean Remove the index file when finished 0
283 The -dbmargs option can be used to control the format of the index. For example,
284 you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
285 retrieved alphabetically. Note that you must use the same arguments every time
286 you open the index!
288 The -makeid option gives you a chance to modify sequence IDs during indexing.
289 For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
290 that GenBank Fasta files use. The original header line can be recovered later.
291 The option value should be a code reference that takes a scalar argument and
292 returns a scalar result, like this:
294 $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
296 sub extract_gi {
297 # Extract GI from GenBank
298 my $header = shift;
299 my ($id) = ($header =~ /gi\|(\d+)/m);
300 return $id || '';
303 extract_gi() will be called with the full header line, e.g. a Fasta line would
304 include the "E<gt>", the ID and the description:
306 >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
308 The -makeid option is ignored after the index is constructed.
310 =cut
312 sub new {
313 my ($class, $path, %opts) = @_;
315 my $self = bless {
316 debug => $opts{-debug},
317 makeid => $opts{-makeid},
318 glob => $opts{-glob} || eval '$'.$class.'::file_glob' || '*',
319 maxopen => $opts{-maxopen} || 32,
320 clean => $opts{-clean} || 0,
321 dbmargs => $opts{-dbmargs} || undef,
322 fhcache => {},
323 cacheseq => {},
324 curopen => 0,
325 openseq => 1,
326 dirname => undef,
327 offsets => undef,
328 index_name => $opts{-index_name},
329 obj_class => eval '$'.$class.'::obj_class',
330 offset_meth => \&{$class.'::_calculate_offsets'},
331 }, $class;
333 my ($offsets, $dirname);
334 my $ref = ref $path || '';
335 if ( $ref eq 'ARRAY' ) {
336 $offsets = $self->index_files($path, $opts{-reindex});
337 require Cwd;
338 $dirname = Cwd::getcwd();
339 } else {
340 if (-d $path) {
341 # because Win32 glob() is broken with respect to long file names
342 # that contain whitespace.
343 $path = Win32::GetShortPathName($path)
344 if $^O =~ /^MSWin/i && eval 'use Win32; 1';
345 $offsets = $self->index_dir($path, $opts{-reindex});
346 $dirname = $path;
347 } elsif (-f _) {
348 $offsets = $self->index_file($path, $opts{-reindex});
349 $dirname = dirname($path);
350 } else {
351 $self->throw( "$path: Invalid file or dirname");
354 @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
356 return $self;
360 =head2 newFh
362 Title : newFh
363 Usage : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
364 Function: Index and get a new Fh for a single file, several files or a directory
365 Returns : Filehandle object
366 Args : Same as new()
368 =cut
370 sub newFh {
371 my ($class, @args) = @_;
372 my $self = $class->new(@args);
373 require Symbol;
374 my $fh = Symbol::gensym;
375 tie $$fh, 'Bio::DB::Indexed::Stream', $self
376 or $self->throw("Could not tie filehandle: $!");
377 return $fh;
381 =head2 dbmargs
383 Title : dbmargs
384 Usage : my @args = $db->dbmargs;
385 Function: Get stored dbm arguments
386 Returns : Array
387 Args : None
389 =cut
391 sub dbmargs {
392 my $self = shift;
393 my $args = $self->{dbmargs} or return;
394 return ref($args) eq 'ARRAY' ? @$args : $args;
398 =head2 glob
400 Title : glob
401 Usage : my $glob = $db->glob;
402 Function: Get the expression used to match files in directories
403 Returns : String
404 Args : None
406 =cut
408 sub glob {
409 my $self = shift;
410 return $self->{glob};
414 =head2 index_dir
416 Title : index_dir
417 Usage : $db->index_dir($dir);
418 Function: Index the files that match -glob in the given directory
419 Returns : Hashref of offsets
420 Args : Dirname
421 Boolean to force a reindexing the directory
423 =cut
425 sub index_dir {
426 my ($self, $dir, $force_reindex) = @_;
427 my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
428 $self->throw("No suitable files found in $dir") if scalar @files == 0;
429 $self->{index_name} ||= File::Spec->catfile($dir, 'directory.index');
430 my $offsets = $self->_index_files(\@files, $force_reindex);
431 return $offsets;
435 =head2 get_all_ids
437 Title : get_all_ids, ids
438 Usage : my @ids = $db->get_all_ids;
439 Function: Get the IDs stored in all indexes
440 Returns : List of ids
441 Args : None
443 =cut
445 sub get_all_ids {
446 return keys %{shift->{offsets}};
449 *ids = \&get_all_ids;
452 =head2 index_file
454 Title : index_file
455 Usage : $db->index_file($filename);
456 Function: Index the given file
457 Returns : Hashref of offsets
458 Args : Filename
459 Boolean to force reindexing the file
461 =cut
463 sub index_file {
464 my ($self, $file, $force_reindex) = @_;
465 $self->{index_name} ||= "$file.index";
466 my $offsets = $self->_index_files([$file], $force_reindex);
467 return $offsets;
471 =head2 index_files
473 Title : index_files
474 Usage : $db->index_files(\@files);
475 Function: Index the given files
476 Returns : Hashref of offsets
477 Args : Arrayref of filenames
478 Boolean to force reindexing the files
480 =cut
482 sub index_files {
483 my ($self, $files, $force_reindex) = @_;
484 my @paths = map { File::Spec->rel2abs($_) } @$files;
485 require Digest::MD5;
486 my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
487 $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
488 my $offsets = $self->_index_files($files, $force_reindex);
489 return $offsets;
493 =head2 index_name
495 Title : index_name
496 Usage : my $indexname = $db->index_name($path);
497 Function: Get the full name of the index file
498 Returns : String
499 Args : None
501 =cut
503 sub index_name {
504 return shift->{index_name};
508 =head2 path
510 Title : path
511 Usage : my $path = $db->path($path);
512 Function: When a simple file or a directory of files is indexed, this returns
513 the file directory. When indexing an arbitrary list of files, the
514 return value is the path of the current working directory.
515 Returns : String
516 Args : None
518 =cut
520 sub path {
521 return shift->{dirname};
525 =head2 get_Seq_stream
527 Title : get_Seq_stream
528 Usage : my $stream = $db->get_Seq_stream();
529 Function: Get a SeqIO-like stream of sequence objects. The stream supports a
530 single method, next_seq(). Each call to next_seq() returns a new
531 PrimarySeqI compliant sequence object, until no more sequences remain.
532 Returns : A Bio::DB::Indexed::Stream object
533 Args : None
535 =cut
537 sub get_Seq_stream {
538 my $self = shift;
539 return Bio::DB::Indexed::Stream->new($self);
542 *get_PrimarySeq_stream = \&get_Seq_stream;
545 =head2 get_Seq_by_id
547 Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
548 Usage : my $seq = $db->get_Seq_by_id($id);
549 Function: Given an ID, fetch the corresponding sequence from the database.
550 This is a Bio::DB::RandomAccessI method implementation.
551 Returns : A sequence object
552 Args : ID
554 =cut
556 sub get_Seq_by_id {
557 my ($self, $id) = @_;
558 $self->throw('Need to provide a sequence ID') if not defined $id;
559 return if not exists $self->{offsets}{$id};
560 return $self->{obj_class}->new($self, $id);
563 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
566 =head2 _calculate_offsets
568 Title : _calculate_offsets
569 Usage : $db->_calculate_offsets($filename, $offsets);
570 Function: This method calculates the sequence offsets in a file based on ID and
571 should be implemented by classes that use Bio::DB::IndexedBase.
572 Returns : Hash of offsets
573 Args : File to process
574 Hashref of file offsets keyed by IDs.
576 =cut
578 sub _calculate_offsets {
579 my $self = shift;
580 $self->throw_not_implemented();
584 sub _index_files {
585 # Do the indexing of the given files using the index file on record
586 my ($self, $files, $force_reindex) = @_;
588 $self->_set_pack_method( @$files );
590 # get name of index file
591 my $index = $self->index_name;
593 # if caller has requested reindexing, unlink the index file.
594 unlink $index if $force_reindex;
596 # get the modification time of the index
597 my $indextime = (stat $index)[9] || 0;
599 # list recently updated files
600 my $modtime = 0;
601 my @updated;
602 for my $file (@$files) {
603 my $m = (stat $file)[9] || 0;
604 if ($m > $modtime) {
605 $modtime = $m;
607 if ($m > $indextime) {
608 push @updated, $file;
612 my $reindex = $force_reindex || (scalar @updated > 0);
613 $self->{offsets} = $self->_open_index($index, $reindex) or return;
615 if ($reindex) {
616 # reindex contents of changed files
617 $self->{indexing} = $index;
618 for my $file (@updated) {
619 &{$self->{offset_meth}}($self, $file, $self->{offsets});
621 delete $self->{indexing};
624 # closing and reopening might help corrupted index file problem on Windows
625 $self->_close_index($self->{offsets});
627 return $self->{offsets} = $self->_open_index($index);
631 sub _open_index {
632 # Open index file in read-only or write mode
633 my ($self, $index_file, $write) = @_;
634 my %offsets;
635 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
636 my @dbmargs = $self->dbmargs;
637 tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
638 or $self->throw( "Could not open index file $index_file: $!");
639 return \%offsets;
643 sub _close_index {
644 # Close index file
645 my ($self, $index) = @_;
646 untie %$index;
647 return 1;
651 sub _parse_compound_id {
652 # Handle compound IDs:
653 # $db->seq($id)
654 # $db->seq($id, $start, $stop)
655 # $db->seq("$id:$start,$stop")
656 # $db->seq("$id:$start..$stop")
657 # $db->seq("$id:$start-$stop")
658 my ($self, $id, $start, $stop, $strand) = @_;
660 if ( (not defined $start ) &&
661 (not defined $stop ) &&
662 (not defined $strand) &&
663 ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) ) {
664 # Start, stop and strand not provided and ID looks like a compound ID
665 ($id, $start, $stop) = ($1, $2, $3);
666 $start =~ s/_//g;
667 $stop =~ s/_//g;
670 $start ||= 1;
671 $stop ||= $self->length($id) || 0; # 0 if sequence not found in database
672 $strand ||= 1;
674 if ($start > $stop) {
675 # Change the strand
676 ($start, $stop) = ($stop, $start);
677 $strand *= -1;
680 return $id, $start, $stop, $strand;
684 sub _guess_alphabet {
685 # Determine the molecular type of the given a sequence string: dna, rna or protein
686 my ($self, $string) = @_;
687 return $string =~ m/^[gatcnGATCN*-]+$/ ? DNA
688 : $string =~ m/^[gaucnGAUCN*-]+$/ ? RNA
689 : PROTEIN;
693 sub _check_linelength {
694 # Check that the line length is valid. Generate an error otherwise.
695 my ($self, $linelength) = @_;
696 return if not defined $linelength;
697 $self->throw(
698 "Each line of the qual file must be less than 65,536 characters. Line ".
699 "$. is $linelength chars."
700 ) if $linelength > 65535;
704 sub _caloffset {
705 # Get the offset of the n-th residue of the sequence with the given id
706 # and termination length (tl)
707 my ($self, $id, $n, $tl) = @_;
708 $n--;
709 my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
710 $n = 0 if $n < 0;
711 $n = $seqlen-1 if $n >= $seqlen;
712 return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
716 sub _fh {
717 # Given a sequence ID, return the filehandle on which to find this sequence
718 my ($self, $id) = @_;
719 $self->throw('Need to provide a sequence ID') if not defined $id;
720 my $file = $self->file($id) or return;
721 return $self->_fhcache( File::Spec->catfile($self->{dirname}, $file) ) or
722 $self->throw( "Can't open file $file");
726 sub _fhcache {
727 my ($self, $path) = @_;
728 if (!$self->{fhcache}{$path}) {
729 if ($self->{curopen} >= $self->{maxopen}) {
730 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
731 keys %{$self->{fhcache}};
732 splice(@lru, $self->{maxopen} / 3);
733 $self->{curopen} -= @lru;
734 for (@lru) {
735 delete $self->{fhcache}{$_};
738 $self->{fhcache}{$path} = IO::File->new($path) || return;
739 binmode $self->{fhcache}{$path};
740 $self->{curopen}++;
742 $self->{cacheseq}{$path}++;
743 return $self->{fhcache}{$path};
747 #-------------------------------------------------------------
748 # Methods to store and retrieve data from indexed file
751 =head2 offset
753 Title : offset
754 Usage : my $offset = $db->offset($id);
755 Function: Get the offset of the indicated sequence from the beginning of the
756 file in which it is located. The offset points to the beginning of
757 the sequence, not the beginning of the header line.
758 Returns : String
759 Args : ID of sequence
761 =cut
763 sub offset {
764 my ($self, $id) = @_;
765 $self->throw('Need to provide a sequence ID') if not defined $id;
766 my $offset = $self->{offsets}{$id} or return;
767 return (&{$self->{unpackmeth}}($offset))[0];
771 =head2 strlen
773 Title : strlen
774 Usage : my $length = $db->strlen($id);
775 Function: Get the number of characters in the sequence string.
776 Returns : Integer
777 Args : ID of sequence
779 =cut
781 sub strlen {
782 my ($self, $id) = @_;
783 $self->throw('Need to provide a sequence ID') if not defined $id;
784 my $offset = $self->{offsets}{$id} or return;
785 return (&{$self->{unpackmeth}}($offset))[1];
789 =head2 length
791 Title : length
792 Usage : my $length = $db->length($id);
793 Function: Get the number of residues of the sequence.
794 Returns : Integer
795 Args : ID of sequence
797 =cut
799 sub length {
800 my ($self, $id) = @_;
801 $self->throw('Need to provide a sequence ID') if not defined $id;
802 my $offset = $self->{offsets}{$id} or return;
803 return (&{$self->{unpackmeth}}($offset))[2];
807 =head2 linelen
809 Title : linelen
810 Usage : my $linelen = $db->linelen($id);
811 Function: Get the length of the line for this sequence.
812 Returns : Integer
813 Args : ID of sequence
815 =cut
817 sub linelen {
818 my ($self, $id) = @_;
819 $self->throw('Need to provide a sequence ID') if not defined $id;
820 my $offset = $self->{offsets}{$id} or return;
821 return (&{$self->{unpackmeth}}($offset))[3];
825 =head2 headerlen
827 Title : headerlen
828 Usage : my $length = $db->headerlen($id);
829 Function: Get the length of the header line for the indicated sequence.
830 Returns : Integer
831 Args : ID of sequence
833 =cut
835 sub headerlen {
836 my ($self, $id) = @_;
837 $self->throw('Need to provide a sequence ID') if not defined $id;
838 my $offset = $self->{offsets}{$id} or return;
839 return (&{$self->{unpackmeth}}($offset))[4];
843 =head2 header_offset
845 Title : header_offset
846 Usage : my $offset = $db->header_offset($id);
847 Function: Get the offset of the header line for the indicated sequence from
848 the beginning of the file in which it is located.
849 Returns : String
850 Args : ID of sequence
852 =cut
854 sub header_offset {
855 my ($self, $id) = @_;
856 $self->throw('Need to provide a sequence ID') if not defined $id;
857 return if not $self->{offsets}{$id};
858 return $self->offset($id) - $self->headerlen($id);
862 =head2 alphabet
864 Title : alphabet
865 Usage : my $alphabet = $db->alphabet($id);
866 Function: Get the molecular type of the indicated sequence: dna, rna or protein
867 Returns : String
868 Args : ID of sequence
870 =cut
872 sub alphabet {
873 my ($self, $id) = @_;
874 $self->throw('Need to provide a sequence ID') if not defined $id;
875 my $offset = $self->{offsets}{$id} or return;
876 my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
877 return : $alphabet == Bio::DB::IndexedBase::DNA ? 'dna'
878 : $alphabet == Bio::DB::IndexedBase::RNA ? 'rna'
879 : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
880 : '';
884 =head2 file
886 Title : file
887 Usage : my $file = $db->file($id);
888 Function: Get the the name of the file in which the indicated sequence can be
889 found.
890 Returns : String
891 Args : ID of sequence
893 =cut
895 sub file {
896 my ($self, $id) = @_;
897 $self->throw('Need to provide a sequence ID') if not defined $id;
898 my $offset = $self->{offsets}{$id} or return;
899 return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]);
903 sub _fileno2path {
904 my ($self, $fileno) = @_;
905 return $fileno2path[$fileno];
909 sub _path2fileno {
910 my ($self, $path) = @_;
911 if ( not exists $filepath2no{$path} ) {
912 my $fileno = ($filepath2no{$path} = 0+ $self->{fileno}++);
913 $fileno2path[$fileno] = $path;
915 return $filepath2no{$path};
919 sub _packSmall {
920 return pack STRUCT, @_;
924 sub _packBig {
925 return pack STRUCTBIG, @_;
929 sub _unpackSmall {
930 return unpack STRUCT, shift;
934 sub _unpackBig {
935 return unpack STRUCTBIG, shift;
939 sub _set_pack_method {
940 # Determine whether to use 32 or 64 bit integers for the given files.
941 my $self = shift;
942 # Find the maximum file size:
943 my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
944 my $fourGB = (2 ** 32) - 1;
946 if ($maxsize > $fourGB) {
947 # At least one file exceeds 4Gb - we will need to use 64 bit ints
948 $self->{packmeth} = \&_packBig;
949 $self->{unpackmeth} = \&_unpackBig;
950 } else {
951 $self->{packmeth} = \&_packSmall;
952 $self->{unpackmeth} = \&_unpackSmall;
954 return 1;
958 #-------------------------------------------------------------
959 # Tied hash logic
962 sub TIEHASH {
963 return shift->new(@_);
967 sub FETCH {
968 return shift->subseq(@_);
972 sub STORE {
973 shift->throw("Read-only database");
977 sub DELETE {
978 shift->throw("Read-only database");
982 sub CLEAR {
983 shift->throw("Read-only database");
987 sub EXISTS {
988 return defined shift->offset(@_);
992 sub FIRSTKEY {
993 return tied(%{shift->{offsets}})->FIRSTKEY(@_);
997 sub NEXTKEY {
998 return tied(%{shift->{offsets}})->NEXTKEY(@_);
1002 sub DESTROY {
1003 my $self = shift;
1004 if ( $self->{clean} || $self->{indexing} ) {
1005 # Indexing aborted or cleaning requested. Delete the index file.
1006 unlink $self->{index_name};
1008 return 1;
1012 #-------------------------------------------------------------
1013 # stream-based access to the database
1016 package Bio::DB::Indexed::Stream;
1017 use base qw(Tie::Handle Bio::DB::SeqI);
1020 sub new {
1021 my ($class, $db) = @_;
1022 my $key = $db->FIRSTKEY;
1023 return bless {
1024 db => $db,
1025 key => $key
1026 }, $class;
1029 sub next_seq {
1030 my $self = shift;
1031 my ($key, $db) = @{$self}{'key', 'db'};
1032 return if not defined $key;
1033 my $value = $db->get_Seq_by_id($key);
1034 $self->{key} = $db->NEXTKEY($key);
1035 return $value;
1038 sub TIEHANDLE {
1039 my ($class, $db) = @_;
1040 return $class->new($db);
1043 sub READLINE {
1044 my $self = shift;
1045 return $self->next_seq;