Bio/DB/IndexedBase.pm

   1 #
   2 # BioPerl module for Bio::DB::IndexedBase
   3 #
   4 # You may distribute this module under the same terms as perl itself
   5 #
   6
   7
   8 =head1 NAME
   9
  10 Bio::DB::IndexedBase - Base class for modules using indexed sequence files
  11
  12 =head1 SYNOPSIS
  13
  14   use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase
  15
  16   # 1/ Bio::SeqIO-style access
  17
  18   # Index some sequence files
  19   my $db = Bio::DB::XXX->new('/path/to/file');    # from a single file
  20   my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
  21   my $db = Bio::DB::XXX->new('/path/to/files/');  # from a directory
  22
  23   # Get IDs of all the sequences in the database
  24   my @ids = $db->ids;
  25
  26   # Get a specific sequence
  27   my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
  28
  29   # Loop through all sequences
  30   my $stream = $db->get_Seq_stream;
  31   while (my $seq = $stream->next_seq) {
  32     # Do something...
  33   }
  34
  35
  36   # 2/ Access via filehandle
  37   my $fh = Bio::DB::XXX->newFh('/path/to/file');
  38   while (my $seq = <$fh>) {
  39     # Do something...
  40   }
  41
  42
  43   # 3/ Tied-hash access
  44   tie %sequences, 'Bio::DB::XXX', '/path/to/file';
  45   print $sequences{'CHROMOSOME_I:1,20000'};
  46
  47 =head1 DESCRIPTION
  48
  49 Bio::DB::IndexedBase provides a base class for modules that want to index
  50 and read sequence files and provides persistent, random access to each sequence
  51 entry, without bringing the entire file into memory. Bio::DB::Fasta and
  52 Bio::DB::Qual both use Bio::DB::IndexedBase.
  53
  54 When you initialize the module, you point it at a single file, several files, or
  55 a directory of files. The first time it is run, the module generates an index
  56 of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
  57 followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
  58 index file to find the sequence file and offset for any requested sequence. If
  59 one of the source files is updated, the module reindexes just that one file. You
  60 can also force reindexing manually at any time. For improved performance, the
  61 module keeps a cache of open filehandles, closing less-recently used ones when
  62 the cache is full.
  63
  64 Entries may have any line length up to 65,536 characters, and different line
  65 lengths are allowed in the same file.  However, within a sequence entry, all
  66 lines must be the same length except for the last. An error will be thrown if
  67 this is not the case!
  68
  69 This module was developed for use with the C. elegans and human genomes, and has
  70 been tested with sequence segments as large as 20 megabases. Indexing the C.
  71 elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
  72 minutes on my 300 MHz pentium laptop. On the same system, average access time
  73 for any 200-mer within the C. elegans genome was E<lt>0.02s.
  74
  75 =head1 DATABASE CREATION AND INDEXING
  76
  77 The two constructors for this class are new() and newFh(). The former creates a
  78 Bio::DB::IndexedBase object which is accessed via method calls. The latter
  79 creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
  80 objects in a stream fashion. There is also a tied hash interface.
  81
  82 =over
  83
  84 =item $db = Bio::DB::IndexedBase-E<gt>new($path [,%options])
  85
  86 Create a new Bio::DB::IndexedBase object from the files designated by $path
  87 $path may be a single file, an arrayref of files, or a directory containing
  88 such files.
  89
  90 After the database is created, you can use methods like ids() and get_Seq_by_id()
  91 to retrieve sequence objects.
  92
  93 =item $fh = Bio::DB::IndexedBase-E<gt>newFh($path [,%options])
  94
  95 Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
  96 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
  97 Bio::SeqIO style. The path and the options should be specified as for new().
  98
  99 =item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
 100
 101 Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
 102 path to the files. The optional @args list is the same set used by new(). If
 103 successful, tie() returns the tied object, undef otherwise.
 104
 105 Once tied, you can use the hash to retrieve an individual sequence by
 106 its ID, like this:
 107
 108   my $seq = $db{CHROMOSOME_I};
 109
 110 The keys() and values() functions will return the sequence IDs and
 111 their sequences, respectively.  In addition, each() can be used to
 112 iterate over the entire data set:
 113
 114  while (my ($id,$sequence) = each %db) {
 115     print "$id => $sequence\n";
 116  }
 117
 118
 119 When dealing with very large sequences, you can avoid bringing them
 120 into memory by calling each() in a scalar context.  This returns the
 121 key only.  You can then use tied(%db) to recover the Bio::DB::IndexedBase
 122 object and call its methods.
 123
 124  while (my $id = each %db) {
 125     print "$id: $db{$sequence:1,100}\n";
 126     print "$id: ".tied(%db)->length($id)."\n";
 127  }
 128
 129 In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
 130 to retrieve the first and next ID in the database, respectively. This allows to
 131 write the following iterative loop using just the object-oriented interface:
 132
 133  my $db = Bio::DB::IndexedBase->new('/path/to/file');
 134  for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
 135     # do something with sequence
 136  }
 137
 138 =back
 139
 140 =head1 INDEX CONTENT
 141
 142 Several attributes of each sequence are stored in the index file. Given a
 143 sequence ID, these attributes can be retrieved using the following methods:
 144
 145 =over
 146
 147 =item offset($id)
 148
 149 Get the offset of the indicated sequence from the beginning of the file in which
 150 it is located. The offset points to the beginning of the sequence, not the
 151 beginning of the header line.
 152
 153 =item strlen($id)
 154
 155 Get the number of characters in the sequence string.
 156
 157 =item length($id)
 158
 159 Get the number of residues of the sequence.
 160
 161 =item linelen($id)
 162
 163 Get the length of the line for this sequence. If the sequence is wrapped, then
 164 linelen() is likely to be much shorter than strlen().
 165
 166 =item headerlen($id)
 167
 168 Get the length of the header line for the indicated sequence.
 169
 170 =item header_offset
 171
 172 Get the offset of the header line for the indicated sequence from the beginning
 173 of the file in which it is located. This attribute is not stored. It is
 174 calculated from offset() and headerlen().
 175
 176 =item alphabet($id)
 177
 178 Get the molecular type (alphabet) of the indicated sequence.
 179
 180 =item file($id)
 181
 182 Get the the name of the file in which the indicated sequence can be found.
 183
 184 =back
 185
 186 =head1 BUGS
 187
 188 When a sequence is deleted from one of the files, this deletion is not detected
 189 by the module and removed from the index. As a result, a "ghost" entry will
 190 remain in the index and will return garbage results if accessed.
 191
 192 Currently, the only way to accommodate sequence removal is to rebuild the entire
 193 index, either by deleting it manually, or by passing -reindex=E<gt>1 to new()
 194 when initializing the module.
 195
 196
 197 =head1 SEE ALSO
 198
 199 L<DB_File>
 200
 201 L<Bio::DB::Fasta>
 202
 203 L<Bio::DB::Qual>
 204
 205 =head1 AUTHOR
 206
 207 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
 208
 209 Copyright (c) 2001 Cold Spring Harbor Laboratory.
 210
 211 Florent Angly (for the modularization)
 212
 213 This library is free software; you can redistribute it and/or modify
 214 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 215 disclaimers of warranty.
 216
 217 =head1 APPENDIX
 218
 219 The rest of the documentation details each of the object
 220 methods. Internal methods are usually preceded with a _
 221
 222 =cut
 223
 224
 225 package Bio::DB::IndexedBase;
 226
 227 BEGIN {
 228     @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
 229 }
 230
 231 use strict;
 232 use IO::File;
 233 use AnyDBM_File;
 234 use Fcntl;
 235 use File::Spec;
 236 use File::Basename qw(dirname);
 237
 238 use base qw(Bio::Root::Root);
 239
 240 # Store offset, strlen, linelen, headerlen, type and fileno
 241 use constant STRUCT    => 'NNNnnCa*'; # 32-bit file offset and seq length
 242 use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
 243
 244 use constant NA        => 0;
 245 use constant DNA       => 1;
 246 use constant RNA       => 2;
 247 use constant PROTEIN   => 3;
 248
 249 use constant DIE_ON_MISSMATCHED_LINES => 1;
 250 # you can avoid dying if you want but you may get incorrect results
 251
 252 my ($caller, @fileno2path, %filepath2no);
 253
 254
 255 =head2 new
 256
 257  Title   : new
 258  Usage   : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
 259  Function: Initialize a new database object
 260  Returns : A Bio::DB::IndexedBase object
 261  Args    : A single file, or path to dir, or arrayref of files
 262            Optional arguments:
 263
 264  Option        Description                                         Default
 265  -----------   -----------                                         -------
 266
 267  -glob         Glob expression to search for files in directories  *
 268
 269  -makeid       A code subroutine for transforming IDs              None
 270
 271  -maxopen      Maximum size of filehandle cache                    32
 272
 273  -debug        Turn on status messages                             0
 274
 275  -reindex      Force the index to be rebuilt                       0
 276
 277  -dbmargs      Additional arguments to pass to the DBM routine     None
 278
 279  -index_name   Name of the file that will hold the indices
 280
 281  -clean        Remove the index file when finished                 0
 282
 283 The -dbmargs option can be used to control the format of the index. For example,
 284 you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
 285 retrieved alphabetically. Note that you must use the same arguments every time
 286 you open the index!
 287
 288 The -makeid option gives you a chance to modify sequence IDs during indexing.
 289 For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
 290 that GenBank Fasta files use. The original header line can be recovered later.
 291 The option value should be a code reference that takes a scalar argument and
 292 returns a scalar result, like this:
 293
 294   $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
 295
 296   sub extract_gi {
 297       # Extract GI from GenBank
 298       my $header = shift;
 299       my ($id) = ($header =~ /gi\|(\d+)/m);
 300       return $id || '';
 301   }
 302
 303 extract_gi() will be called with the full header line, e.g. a Fasta line would
 304 include the "E<gt>", the ID and the description:
 305
 306  >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
 307
 308 The -makeid option is ignored after the index is constructed.
 309
 310 =cut
 311
 312 sub new {
 313     my ($class, $path, %opts) = @_;
 314
 315     my $self = bless {
 316         debug       => $opts{-debug},
 317         makeid      => $opts{-makeid},
 318         glob        => $opts{-glob}    || eval '$'.$class.'::file_glob' || '*',
 319         maxopen     => $opts{-maxopen} || 32,
 320         clean       => $opts{-clean}   || 0,
 321         dbmargs     => $opts{-dbmargs} || undef,
 322         fhcache     => {},
 323         cacheseq    => {},
 324         curopen     => 0,
 325         openseq     => 1,
 326         dirname     => undef,
 327         offsets     => undef,
 328         index_name  => $opts{-index_name},
 329         obj_class   => eval '$'.$class.'::obj_class',
 330         offset_meth => \&{$class.'::_calculate_offsets'},
 331     }, $class;
 332
 333     my ($offsets, $dirname);
 334     my $ref = ref $path || '';
 335     if ( $ref eq 'ARRAY' ) {
 336         $offsets = $self->index_files($path, $opts{-reindex});
 337         require Cwd;
 338         $dirname = Cwd::getcwd();
 339     } else {
 340         if (-d $path) {
 341             # because Win32 glob() is broken with respect to long file names
 342             # that contain whitespace.
 343             $path = Win32::GetShortPathName($path)
 344                 if $^O =~ /^MSWin/i && eval 'use Win32; 1';
 345             $offsets = $self->index_dir($path, $opts{-reindex});
 346             $dirname = $path;
 347         } elsif (-f _) {
 348             $offsets = $self->index_file($path, $opts{-reindex});
 349             $dirname = dirname($path);
 350         } else {
 351             $self->throw( "$path: Invalid file or dirname");
 352         }
 353     }
 354     @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
 355
 356     return $self;
 357 }
 358
 359
 360 =head2 newFh
 361
 362  Title   : newFh
 363  Usage   : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
 364  Function: Index and get a new Fh for a single file, several files or a directory
 365  Returns : Filehandle object
 366  Args    : Same as new()
 367
 368 =cut
 369
 370 sub newFh {
 371     my ($class, @args) = @_;
 372     my $self = $class->new(@args);
 373     require Symbol;
 374     my $fh = Symbol::gensym;
 375     tie $$fh, 'Bio::DB::Indexed::Stream', $self
 376         or $self->throw("Could not tie filehandle: $!");
 377     return $fh;
 378 }
 379
 380
 381 =head2 dbmargs
 382
 383  Title   : dbmargs
 384  Usage   : my @args = $db->dbmargs;
 385  Function: Get stored dbm arguments
 386  Returns : Array
 387  Args    : None
 388
 389 =cut
 390
 391 sub dbmargs {
 392     my $self = shift;
 393     my $args = $self->{dbmargs} or return;
 394     return ref($args) eq 'ARRAY' ? @$args : $args;
 395 }
 396
 397
 398 =head2 glob
 399
 400  Title   : glob
 401  Usage   : my $glob = $db->glob;
 402  Function: Get the expression used to match files in directories
 403  Returns : String
 404  Args    : None
 405
 406 =cut
 407
 408 sub glob {
 409     my $self = shift;
 410     return $self->{glob};
 411 }
 412
 413
 414 =head2 index_dir
 415
 416  Title   : index_dir
 417  Usage   : $db->index_dir($dir);
 418  Function: Index the files that match -glob in the given directory
 419  Returns : Hashref of offsets
 420  Args    : Dirname
 421            Boolean to force a reindexing the directory
 422
 423 =cut
 424
 425 sub index_dir {
 426     my ($self, $dir, $force_reindex) = @_;
 427     my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
 428     $self->throw("No suitable files found in $dir") if scalar @files == 0;
 429     $self->{index_name} ||= File::Spec->catfile($dir, 'directory.index');
 430     my $offsets = $self->_index_files(\@files, $force_reindex);
 431     return $offsets;
 432 }
 433
 434
 435 =head2 get_all_ids
 436
 437  Title   : get_all_ids, ids
 438  Usage   : my @ids = $db->get_all_ids;
 439  Function: Get the IDs stored in all indexes
 440  Returns : List of ids
 441  Args    : None
 442
 443 =cut
 444
 445 sub get_all_ids  {
 446     return keys %{shift->{offsets}};
 447 }
 448
 449 *ids = \&get_all_ids;
 450
 451
 452 =head2 index_file
 453
 454  Title   : index_file
 455  Usage   : $db->index_file($filename);
 456  Function: Index the given file
 457  Returns : Hashref of offsets
 458  Args    : Filename
 459            Boolean to force reindexing the file
 460
 461 =cut
 462
 463 sub index_file {
 464     my ($self, $file, $force_reindex) = @_;
 465     $self->{index_name} ||= "$file.index";
 466     my $offsets = $self->_index_files([$file], $force_reindex);
 467     return $offsets;
 468 }
 469
 470
 471 =head2 index_files
 472
 473  Title   : index_files
 474  Usage   : $db->index_files(\@files);
 475  Function: Index the given files
 476  Returns : Hashref of offsets
 477  Args    : Arrayref of filenames
 478            Boolean to force reindexing the files
 479
 480 =cut
 481
 482 sub index_files {
 483     my ($self, $files, $force_reindex) = @_;
 484     my @paths = map { File::Spec->rel2abs($_) } @$files;
 485     require Digest::MD5;
 486     my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
 487     $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
 488     my $offsets = $self->_index_files($files, $force_reindex);
 489     return $offsets;
 490 }
 491
 492
 493 =head2 index_name
 494
 495  Title   : index_name
 496  Usage   : my $indexname = $db->index_name($path);
 497  Function: Get the full name of the index file
 498  Returns : String
 499  Args    : None
 500
 501 =cut
 502
 503 sub index_name {
 504     return shift->{index_name};
 505 }
 506
 507
 508 =head2 path
 509
 510  Title   : path
 511  Usage   : my $path = $db->path($path);
 512  Function: When a simple file or a directory of files is indexed, this returns
 513            the file directory. When indexing an arbitrary list of files, the
 514            return value is the path of the current working directory.
 515  Returns : String
 516  Args    : None
 517
 518 =cut
 519
 520 sub path {
 521     return shift->{dirname};
 522 }
 523
 524
 525 =head2 get_Seq_stream
 526
 527  Title   : get_Seq_stream
 528  Usage   : my $stream = $db->get_Seq_stream();
 529  Function: Get a SeqIO-like stream of sequence objects. The stream supports a
 530            single method, next_seq(). Each call to next_seq() returns a new
 531            PrimarySeqI compliant sequence object, until no more sequences remain.
 532  Returns : A Bio::DB::Indexed::Stream object
 533  Args    : None
 534
 535 =cut
 536
 537 sub get_Seq_stream {
 538   my $self = shift;
 539   return Bio::DB::Indexed::Stream->new($self);
 540 }
 541
 542 *get_PrimarySeq_stream = \&get_Seq_stream;
 543
 544
 545 =head2 get_Seq_by_id
 546
 547  Title   : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
 548  Usage   : my $seq = $db->get_Seq_by_id($id);
 549  Function: Given an ID, fetch the corresponding sequence from the database.
 550            This is a Bio::DB::RandomAccessI method implementation.
 551  Returns : A sequence object
 552  Args    : ID
 553
 554 =cut
 555
 556 sub get_Seq_by_id {
 557   my ($self, $id) = @_;
 558   $self->throw('Need to provide a sequence ID') if not defined $id;
 559   return if not exists $self->{offsets}{$id};
 560   return $self->{obj_class}->new($self, $id);
 561 }
 562
 563 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
 564
 565
 566 =head2 _calculate_offsets
 567
 568  Title   : _calculate_offsets
 569  Usage   : $db->_calculate_offsets($filename, $offsets);
 570  Function: This method calculates the sequence offsets in a file based on ID and
 571            should be implemented by classes that use Bio::DB::IndexedBase.
 572  Returns : Hash of offsets
 573  Args    : File to process
 574            Hashref of file offsets keyed by IDs.
 575
 576 =cut
 577
 578 sub _calculate_offsets {
 579    my $self = shift;
 580    $self->throw_not_implemented();
 581 }
 582
 583
 584 sub _index_files {
 585     # Do the indexing of the given files using the index file on record
 586     my ($self, $files, $force_reindex) = @_;
 587
 588     $self->_set_pack_method( @$files );
 589
 590     # get name of index file
 591     my $index = $self->index_name;
 592
 593     # if caller has requested reindexing, unlink the index file.
 594     unlink $index if $force_reindex;
 595
 596     # get the modification time of the index
 597     my $indextime = (stat $index)[9] || 0;
 598
 599     # list recently updated files
 600     my $modtime = 0;
 601     my @updated;
 602     for my $file (@$files) {
 603         my $m = (stat $file)[9] || 0;
 604         if ($m > $modtime) {
 605            $modtime = $m;
 606         }
 607         if ($m > $indextime) {
 608            push @updated, $file;
 609         }
 610     }
 611
 612     my $reindex      = $force_reindex || (scalar @updated > 0);
 613     $self->{offsets} = $self->_open_index($index, $reindex) or return;
 614
 615     if ($reindex) {
 616         # reindex contents of changed files
 617         $self->{indexing} = $index;
 618         for my $file (@updated) {
 619             &{$self->{offset_meth}}($self, $file, $self->{offsets});
 620         }
 621         delete $self->{indexing};
 622     }
 623
 624     # closing and reopening might help corrupted index file problem on Windows
 625     $self->_close_index($self->{offsets});
 626
 627     return $self->{offsets} = $self->_open_index($index);
 628 }
 629
 630
 631 sub _open_index {
 632     # Open index file in read-only or write mode
 633     my ($self, $index_file, $write) = @_;
 634     my %offsets;
 635     my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
 636     my @dbmargs = $self->dbmargs;
 637     tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
 638         or $self->throw( "Could not open index file $index_file: $!");
 639     return \%offsets;
 640 }
 641
 642
 643 sub _close_index {
 644     # Close index file
 645     my ($self, $index) = @_;
 646     untie %$index;
 647     return 1;
 648 }
 649
 650
 651 sub _parse_compound_id {
 652     # Handle compound IDs:
 653     #     $db->seq($id)
 654     #     $db->seq($id, $start, $stop)
 655     #     $db->seq("$id:$start,$stop")
 656     #     $db->seq("$id:$start..$stop")
 657     #     $db->seq("$id:$start-$stop")
 658     my ($self, $id, $start, $stop, $strand) = @_;
 659
 660     if ( (not defined $start ) &&
 661          (not defined $stop  ) &&
 662          (not defined $strand) &&
 663          ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) ) {
 664         # Start, stop and strand not provided and ID looks like a compound ID
 665         ($id, $start, $stop) = ($1, $2, $3);
 666         $start =~ s/_//g;
 667         $stop  =~ s/_//g;
 668     }
 669
 670     $start  ||= 1;
 671     $stop   ||= $self->length($id) || 0; # 0 if sequence not found in database
 672     $strand ||= 1;
 673
 674     if ($start > $stop) {
 675         # Change the strand
 676         ($start, $stop) = ($stop, $start);
 677         $strand *= -1;
 678     }
 679
 680     return $id, $start, $stop, $strand;
 681 }
 682
 683
 684 sub _guess_alphabet {
 685   # Determine the molecular type of the given a sequence string: dna, rna or protein
 686   my ($self, $string) = @_;
 687   return $string =~ m/^[gatcnGATCN*-]+$/   ? DNA
 688          : $string =~ m/^[gaucnGAUCN*-]+$/ ? RNA
 689          : PROTEIN;
 690 }
 691
 692
 693 sub _check_linelength {
 694     # Check that the line length is valid. Generate an error otherwise.
 695     my ($self, $linelength) = @_;
 696     return if not defined $linelength;
 697     $self->throw(
 698         "Each line of the qual file must be less than 65,536 characters. Line ".
 699         "$. is $linelength chars."
 700     ) if $linelength > 65535;
 701 }
 702
 703
 704 sub _caloffset {
 705     # Get the offset of the n-th residue of the sequence with the given id
 706     # and termination length (tl)
 707     my ($self, $id, $n, $tl) = @_;
 708     $n--;
 709     my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
 710     $n = 0            if $n < 0;
 711     $n = $seqlen-1 if $n >= $seqlen;
 712     return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
 713 }
 714
 715
 716 sub _fh {
 717     # Given a sequence ID, return the filehandle on which to find this sequence
 718     my ($self, $id) = @_;
 719     $self->throw('Need to provide a sequence ID') if not defined $id;
 720     my $file = $self->file($id) or return;
 721     return $self->_fhcache( File::Spec->catfile($self->{dirname}, $file) ) or
 722         $self->throw( "Can't open file $file");
 723 }
 724
 725
 726 sub _fhcache {
 727     my ($self, $path) = @_;
 728     if (!$self->{fhcache}{$path}) {
 729         if ($self->{curopen} >= $self->{maxopen}) {
 730             my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
 731                 keys %{$self->{fhcache}};
 732             splice(@lru, $self->{maxopen} / 3);
 733             $self->{curopen} -= @lru;
 734             for (@lru) {
 735                 delete $self->{fhcache}{$_};
 736             }
 737         }
 738         $self->{fhcache}{$path} = IO::File->new($path) || return;
 739         binmode $self->{fhcache}{$path};
 740         $self->{curopen}++;
 741     }
 742     $self->{cacheseq}{$path}++;
 743     return $self->{fhcache}{$path};
 744 }
 745
 746
 747 #-------------------------------------------------------------
 748 # Methods to store and retrieve data from indexed file
 749 #
 750
 751 =head2 offset
 752
 753  Title   : offset
 754  Usage   : my $offset = $db->offset($id);
 755  Function: Get the offset of the indicated sequence from the beginning of the
 756            file in which it is located. The offset points to the beginning of
 757            the sequence, not the beginning of the header line.
 758  Returns : String
 759  Args    : ID of sequence
 760
 761 =cut
 762
 763 sub offset {
 764     my ($self, $id) = @_;
 765     $self->throw('Need to provide a sequence ID') if not defined $id;
 766     my $offset = $self->{offsets}{$id} or return;
 767     return (&{$self->{unpackmeth}}($offset))[0];
 768 }
 769
 770
 771 =head2 strlen
 772
 773  Title   : strlen
 774  Usage   : my $length = $db->strlen($id);
 775  Function: Get the number of characters in the sequence string.
 776  Returns : Integer
 777  Args    : ID of sequence
 778
 779 =cut
 780
 781 sub strlen {
 782     my ($self, $id) = @_;
 783     $self->throw('Need to provide a sequence ID') if not defined $id;
 784     my $offset = $self->{offsets}{$id} or return;
 785     return (&{$self->{unpackmeth}}($offset))[1];
 786 }
 787
 788
 789 =head2 length
 790
 791  Title   : length
 792  Usage   : my $length = $db->length($id);
 793  Function: Get the number of residues of the sequence.
 794  Returns : Integer
 795  Args    : ID of sequence
 796
 797 =cut
 798
 799 sub length {
 800     my ($self, $id) = @_;
 801     $self->throw('Need to provide a sequence ID') if not defined $id;
 802     my $offset = $self->{offsets}{$id} or return;
 803     return (&{$self->{unpackmeth}}($offset))[2];
 804 }
 805
 806
 807 =head2 linelen
 808
 809  Title   : linelen
 810  Usage   : my $linelen = $db->linelen($id);
 811  Function: Get the length of the line for this sequence.
 812  Returns : Integer
 813  Args    : ID of sequence
 814
 815 =cut
 816
 817 sub linelen {
 818     my ($self, $id) = @_;
 819     $self->throw('Need to provide a sequence ID') if not defined $id;
 820     my $offset = $self->{offsets}{$id} or return;
 821     return (&{$self->{unpackmeth}}($offset))[3];
 822 }
 823
 824
 825 =head2 headerlen
 826
 827  Title   : headerlen
 828  Usage   : my $length = $db->headerlen($id);
 829  Function: Get the length of the header line for the indicated sequence.
 830  Returns : Integer
 831  Args    : ID of sequence
 832
 833 =cut
 834
 835 sub headerlen {
 836     my ($self, $id) = @_;
 837     $self->throw('Need to provide a sequence ID') if not defined $id;
 838     my $offset = $self->{offsets}{$id} or return;
 839     return (&{$self->{unpackmeth}}($offset))[4];
 840 }
 841
 842
 843 =head2 header_offset
 844
 845  Title   : header_offset
 846  Usage   : my $offset = $db->header_offset($id);
 847  Function: Get the offset of the header line for the indicated sequence from
 848            the beginning of the file in which it is located.
 849  Returns : String
 850  Args    : ID of sequence
 851
 852 =cut
 853
 854 sub header_offset {
 855     my ($self, $id) = @_;
 856     $self->throw('Need to provide a sequence ID') if not defined $id;
 857     return if not $self->{offsets}{$id};
 858     return $self->offset($id) - $self->headerlen($id);
 859 }
 860
 861
 862 =head2 alphabet
 863
 864  Title   : alphabet
 865  Usage   : my $alphabet = $db->alphabet($id);
 866  Function: Get the molecular type of the indicated sequence: dna, rna or protein
 867  Returns : String
 868  Args    : ID of sequence
 869
 870 =cut
 871
 872 sub alphabet {
 873     my ($self, $id) = @_;
 874     $self->throw('Need to provide a sequence ID') if not defined $id;
 875     my $offset = $self->{offsets}{$id} or return;
 876     my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
 877     return : $alphabet == Bio::DB::IndexedBase::DNA     ? 'dna'
 878            : $alphabet == Bio::DB::IndexedBase::RNA     ? 'rna'
 879            : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
 880            : '';
 881 }
 882
 883
 884 =head2 file
 885
 886  Title   : file
 887  Usage   : my $file = $db->file($id);
 888  Function: Get the the name of the file in which the indicated sequence can be
 889            found.
 890  Returns : String
 891  Args    : ID of sequence
 892
 893 =cut
 894
 895 sub file {
 896   my ($self, $id) = @_;
 897   $self->throw('Need to provide a sequence ID') if not defined $id;
 898   my $offset = $self->{offsets}{$id} or return;
 899   return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]);
 900 }
 901
 902
 903 sub _fileno2path {
 904     my ($self, $fileno) = @_;
 905     return $fileno2path[$fileno];
 906 }
 907
 908
 909 sub _path2fileno {
 910     my ($self, $path) = @_;
 911     if ( not exists $filepath2no{$path} ) {
 912         my $fileno = ($filepath2no{$path} = 0+ $self->{fileno}++);
 913         $fileno2path[$fileno] = $path;
 914     }
 915     return $filepath2no{$path};
 916 }
 917
 918
 919 sub _packSmall {
 920     return pack STRUCT, @_;
 921 }
 922
 923
 924 sub _packBig {
 925     return pack STRUCTBIG, @_;
 926 }
 927
 928
 929 sub _unpackSmall {
 930     return unpack STRUCT, shift;
 931 }
 932
 933
 934 sub _unpackBig {
 935     return unpack STRUCTBIG, shift;
 936 }
 937
 938
 939 sub _set_pack_method {
 940     # Determine whether to use 32 or 64 bit integers for the given files.
 941     my $self = shift;
 942     # Find the maximum file size:
 943     my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
 944     my $fourGB    = (2 ** 32) - 1;
 945
 946     if ($maxsize > $fourGB) {
 947         # At least one file exceeds 4Gb - we will need to use 64 bit ints
 948         $self->{packmeth}   = \&_packBig;
 949         $self->{unpackmeth} = \&_unpackBig;
 950     } else {
 951         $self->{packmeth}   = \&_packSmall;
 952         $self->{unpackmeth} = \&_unpackSmall;
 953     }
 954     return 1;
 955 }
 956
 957
 958 #-------------------------------------------------------------
 959 # Tied hash logic
 960 #
 961
 962 sub TIEHASH {
 963     return shift->new(@_);
 964 }
 965
 966
 967 sub FETCH {
 968     return shift->subseq(@_);
 969 }
 970
 971
 972 sub STORE {
 973     shift->throw("Read-only database");
 974 }
 975
 976
 977 sub DELETE {
 978     shift->throw("Read-only database");
 979 }
 980
 981
 982 sub CLEAR {
 983     shift->throw("Read-only database");
 984 }
 985
 986
 987 sub EXISTS {
 988     return defined shift->offset(@_);
 989 }
 990
 991
 992 sub FIRSTKEY {
 993     return tied(%{shift->{offsets}})->FIRSTKEY(@_);
 994 }
 995
 996
 997 sub NEXTKEY {
 998     return tied(%{shift->{offsets}})->NEXTKEY(@_);
 999 }
1000
1001
1002 sub DESTROY {
1003     my $self = shift;
1004     if ( $self->{clean} || $self->{indexing} ) {
1005       # Indexing aborted or cleaning requested. Delete the index file.
1006       unlink $self->{index_name};
1007     }
1008     return 1;
1009 }
1010
1011
1012 #-------------------------------------------------------------
1013 # stream-based access to the database
1014 #
1015
1016 package Bio::DB::Indexed::Stream;
1017 use base qw(Tie::Handle Bio::DB::SeqI);
1018
1019
1020 sub new {
1021     my ($class, $db) = @_;
1022     my $key = $db->FIRSTKEY;
1023     return bless {
1024         db  => $db,
1025         key => $key
1026     }, $class;
1027 }
1028
1029 sub next_seq {
1030     my $self = shift;
1031     my ($key, $db) = @{$self}{'key', 'db'};
1032     return if not defined $key;
1033     my $value = $db->get_Seq_by_id($key);
1034     $self->{key} = $db->NEXTKEY($key);
1035     return $value;
1036 }
1037
1038 sub TIEHANDLE {
1039     my ($class, $db) = @_;
1040     return $class->new($db);
1041 }
1042
1043 sub READLINE {
1044     my $self = shift;
1045     return $self->next_seq;
1046 }
1047
1048
1049 1;