Bio/DB/IndexedBase.pm

   1 #
   2 # BioPerl module for Bio::DB::IndexedBase
   3 #
   4 # You may distribute this module under the same terms as perl itself
   5 #
   6
   7 =head1 NAME
   8
   9 Bio::DB::IndexedBase - Base class for modules using indexed sequence files
  10
  11 =head1 SYNOPSIS
  12
  13   use Bio::DB::XXX; # a made-up class that uses Bio::DB::IndexedBase
  14
  15   # 1/ Bio::SeqIO-style access
  16
  17   # Index some sequence files
  18   my $db = Bio::DB::XXX->new('/path/to/file');    # from a single file
  19   my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
  20   my $db = Bio::DB::XXX->new('/path/to/files/');  # from a directory
  21
  22   # Get IDs of all the sequences in the database
  23   my @ids = $db->get_all_primary_ids;
  24
  25   # Get a specific sequence
  26   my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
  27
  28   # Loop through all sequences
  29   my $stream = $db->get_PrimarySeq_stream;
  30   while (my $seq = $stream->next_seq) {
  31     # Do something...
  32   }
  33
  34
  35   # 2/ Access via filehandle
  36   my $fh = Bio::DB::XXX->newFh('/path/to/file');
  37   while (my $seq = <$fh>) {
  38     # Do something...
  39   }
  40
  41
  42   # 3/ Tied-hash access
  43   tie %sequences, 'Bio::DB::XXX', '/path/to/file';
  44   print $sequences{'CHROMOSOME_I:1,20000'};
  45
  46 =head1 DESCRIPTION
  47
  48 Bio::DB::IndexedBase provides a base class for modules that want to index
  49 and read sequence files and provides persistent, random access to each sequence
  50 entry, without bringing the entire file into memory. This module is compliant
  51 with the Bio::DB::SeqI interface. Bio::DB::Fasta and Bio::DB::Qual both use
  52 Bio::DB::IndexedBase.
  53
  54 When you initialize the module, you point it at a single file, several files, or
  55 a directory of files. The first time it is run, the module generates an index
  56 of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
  57 followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
  58 index file to find the sequence file and offset for any requested sequence. If
  59 one of the source files is updated, the module reindexes just that one file. You
  60 can also force reindexing manually at any time. For improved performance, the
  61 module keeps a cache of open filehandles, closing less-recently used ones when
  62 the cache is full.
  63
  64 Entries may have any line length up to 65,535 characters, and different line
  65 lengths are allowed in the same file.  However, within a sequence entry, all
  66 lines must be the same length except for the last. An error will be thrown if
  67 this is not the case!
  68
  69 This module was developed for use with the C. elegans and human genomes, and has
  70 been tested with sequence segments as large as 20 megabases. Indexing the C.
  71 elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
  72 minutes on my 300 MHz pentium laptop. On the same system, average access time
  73 for any 200-mer within the C. elegans genome was E<lt>0.02s.
  74
  75 =head1 DATABASE CREATION AND INDEXING
  76
  77 The two constructors for this class are new() and newFh(). The former creates a
  78 Bio::DB::IndexedBase object which is accessed via method calls. The latter
  79 creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
  80 objects in a stream fashion. There is also a tied hash interface.
  81
  82 =over
  83
  84 =item $db = Bio::DB::IndexedBase-E<gt>new($path [,%options])
  85
  86 Create a new Bio::DB::IndexedBase object from the files designated by $path.
  87 $path may be a single file, an arrayref of files, or a directory containing
  88 such files.
  89
  90 After the database is created, you can use methods like get_all_primary_ids()
  91 and get_Seq_by_id() to retrieve sequence objects.
  92
  93 =item $fh = Bio::DB::IndexedBase-E<gt>newFh($path [,%options])
  94
  95 Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
  96 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
  97 Bio::SeqIO style. The path and the options should be specified as for new().
  98
  99 =item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
 100
 101 Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
 102 path to the files. The optional @args list is the same set used by new(). If
 103 successful, tie() returns the tied object, undef otherwise.
 104
 105 Once tied, you can use the hash to retrieve an individual sequence by
 106 its ID, like this:
 107
 108   my $seq = $db{CHROMOSOME_I};
 109
 110 The keys() and values() functions will return the sequence IDs and their
 111 sequences, respectively.  In addition, each() can be used to iterate over the
 112 entire data set:
 113
 114  while (my ($id,$sequence) = each %db) {
 115     print "$id => $sequence\n";
 116  }
 117
 118
 119 When dealing with very large sequences, you can avoid bringing them into memory
 120 by calling each() in a scalar context.  This returns the key only.  You can then
 121 use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods.
 122
 123  while (my $id = each %db) {
 124     print "$id: ", $db{"$id:1,100"}, "\n";
 125     print "$id: ".tied(%db)->length($id)."\n";
 126  }
 127
 128 In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
 129 to retrieve the first and next ID in the database, respectively. This allows you to
 130 write the following iterative loop using just the object-oriented interface:
 131
 132  my $db = Bio::DB::IndexedBase->new('/path/to/file');
 133  for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
 134     # do something with sequence
 135  }
 136
 137 =back
 138
 139 =head1 INDEX CONTENT
 140
 141 Several attributes of each sequence are stored in the index file. Given a
 142 sequence ID, these attributes can be retrieved using the following methods:
 143
 144 =over
 145
 146 =item offset($id)
 147
 148 Get the offset of the indicated sequence from the beginning of the file in which
 149 it is located. The offset points to the beginning of the sequence, not the
 150 beginning of the header line.
 151
 152 =item strlen($id)
 153
 154 Get the number of characters in the sequence string.
 155
 156 =item length($id)
 157
 158 Get the number of residues of the sequence.
 159
 160 =item linelen($id)
 161
 162 Get the length of the line for this sequence. If the sequence is wrapped, then
 163 linelen() is likely to be much shorter than strlen().
 164
 165 =item headerlen($id)
 166
 167 Get the length of the header line for the indicated sequence.
 168
 169 =item header_offset($id)
 170
 171 Get the offset of the header line for the indicated sequence from the beginning
 172 of the file in which it is located. This attribute is not stored. It is
 173 calculated from offset() and headerlen().
 174
 175 =item alphabet($id)
 176
 177 Get the molecular type (alphabet) of the indicated sequence. This method handles
 178 residues according to the IUPAC convention.
 179
 180 =item file($id)
 181
 182 Get the name of the file in which the indicated sequence can be found.
 183
 184 =back
 185
 186 =head1 INTERFACE COMPLIANCE NOTES
 187
 188 Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the
 189 Bio::RandomAccessI interfaces.
 190
 191 Databases do not necessarily provide any meaningful internal primary ID for the
 192 sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are
 193 the IDs of the sequences. This means that the same ID passed to get_Seq_by_id()
 194 and get_Seq_by_primary_id() will return the same sequence.
 195
 196 Since this database index has no notion of sequence version or namespace, the
 197 get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical.
 198
 199 =head1 BUGS
 200
 201 When a sequence is deleted from one of the files, this deletion is not detected
 202 by the module and removed from the index. As a result, a "ghost" entry will
 203 remain in the index and will return garbage results if accessed.
 204
 205 Also, if you are indexing a directory, it is wise to not add or remove files
 206 from it.
 207
 208 In case you have changed the files in a directory, or the sequences in a file,
 209 you can rebuild the entire index, either by deleting it manually, or by
 210 passing -reindex=E<gt>1 to new() when initializing the module.
 211
 212 =head1 SEE ALSO
 213
 214 L<DB_File>
 215
 216 L<Bio::DB::Fasta>
 217
 218 L<Bio::DB::Qual>
 219
 220 =head1 AUTHOR
 221
 222 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
 223
 224 Copyright (c) 2001 Cold Spring Harbor Laboratory.
 225
 226 Florent Angly (for the modularization)
 227
 228 This library is free software; you can redistribute it and/or modify
 229 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 230 disclaimers of warranty.
 231
 232 =head1 APPENDIX
 233
 234 The rest of the documentation details each of the object
 235 methods. Internal methods are usually preceded with a _
 236
 237 =cut
 238
 239
 240 package Bio::DB::IndexedBase;
 241
 242 BEGIN {
 243     @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
 244         if(!$INC{'AnyDBM_File.pm'});
 245 }
 246
 247 use strict;
 248 use IO::File;
 249 use AnyDBM_File;
 250 use Fcntl;
 251 use File::Spec;
 252 use File::Basename qw(basename dirname);
 253 use Bio::PrimarySeq;
 254
 255 use base qw(Bio::DB::SeqI);
 256
 257 # Store offset, strlen, linelen, headerlen, type and fileno
 258 use constant STRUCT    => 'NNNnnCa*'; # 32-bit file offset and seq length
 259 use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
 260
 261 use constant NA        => 0;
 262 use constant DNA       => 1;
 263 use constant RNA       => 2;
 264 use constant PROTEIN   => 3;
 265
 266 # You can avoid dying if you want but you may get incorrect results
 267 use constant DIE_ON_MISSMATCHED_LINES => 1;
 268
 269 # Remove carriage returns (\r) and newlines (\n) from a string.  When
 270 # called from subseq, this can take a signficiant portion of time, in
 271 # Variant Effect Prediction. Therefore we compile the match portion.
 272 sub _strip_crnl {
 273     eval 'require Inline::C';
 274     if ( $INC{'Inline/C.pm'} ) {
 275         # C can do _strip_crnl much faster. But this requires the
 276         # Inline::C module which we don't require people to have. So we make
 277         # this optional by wrapping the C code in an eval. If the eval works,
 278         # the Perl strip_crnl() function is overwritten.
 279         Inline->bind(
 280             C => q(
 281         /*
 282         Strip all newlines (\n) and carriage returns (\r) from the string
 283         */
 284         char* _strip_crnl(char* str) {
 285           char *s;
 286           char *s2 = str;
 287           for (s = str; *s; *s++) {
 288             if (*s != '\n' && *s != '\r') {
 289               *s2++ = *s;
 290             }
 291           }
 292           *s2 = '\0';
 293           return str;
 294         }
 295         )
 296         );
 297     } else {
 298         # "tr" is much faster than the regex, with "s"
 299         *Bio::DB::IndexedBase::_strip_crnl = sub {
 300             my $str = shift;
 301             $str =~ tr/\n\r//d;
 302             return $str;
 303         };
 304     }
 305
 306     return _strip_crnl(@_);
 307 }
 308
 309 =head2 new
 310
 311  Title   : new
 312  Usage   : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
 313  Function: Initialize a new database object
 314  Returns : A Bio::DB::IndexedBase object
 315  Args    : A single file, or path to dir, or arrayref of files
 316            Optional arguments:
 317
 318  Option        Description                                         Default
 319  -----------   -----------                                         -------
 320  -glob         Glob expression to search for files in directories  *
 321  -makeid       A code subroutine for transforming IDs              None
 322  -maxopen      Maximum size of filehandle cache                    32
 323  -debug        Turn on status messages                             0
 324  -reindex      Force the index to be rebuilt                       0
 325  -dbmargs      Additional arguments to pass to the DBM routine     None
 326  -index_name   Name of the file that will hold the indices
 327  -clean        Remove the index file when finished                 0
 328
 329 The -dbmargs option can be used to control the format of the index. For example,
 330 you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
 331 retrieved alphabetically. Note that you must use the same arguments every time
 332 you open the index!
 333
 334 The -makeid option gives you a chance to modify sequence IDs during indexing.
 335 For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
 336 that GenBank Fasta files use. The original header line can be recovered later.
 337 The option value for -makeid should be a code reference that takes a scalar
 338 argument (the full header line) and returns a scalar or an array of scalars (the
 339 ID or IDs you want to assign). For example:
 340
 341   $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
 342
 343   sub extract_gi {
 344       # Extract GI from GenBank
 345       my $header = shift;
 346       my ($id) = ($header =~ /gi\|(\d+)/m);
 347       return $id || '';
 348   }
 349
 350 extract_gi() will be called with the full header line, e.g. a Fasta line would
 351 include the "E<gt>", the ID and the description:
 352
 353  >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
 354
 355 In the database, this sequence can now be retrieved by its GI instead of its
 356 complete ID:
 357
 358  my $seq = $db->get_Seq_by_id(352962132);
 359
 360 The -makeid option is ignored after the index is constructed.
 361
 362 =cut
 363
 364 sub new {
 365     my ($class, $path, %opts) = @_;
 366
 367     my $self = bless {
 368         debug       => $opts{-debug}   || 0,
 369         makeid      => $opts{-makeid},
 370         glob        => $opts{-glob}    || eval '$'.$class.'::file_glob' || '*',
 371         maxopen     => $opts{-maxopen} || 32,
 372         clean       => $opts{-clean}   || 0,
 373         dbmargs     => $opts{-dbmargs} || undef,
 374         fhcache     => {},
 375         cacheseq    => {},
 376         curopen     => 0,
 377         openseq     => 1,
 378         dirname     => undef,
 379         offsets     => undef,
 380         index_name  => $opts{-index_name},
 381         obj_class   => eval '$'.$class.'::obj_class',
 382         offset_meth => \&{$class.'::_calculate_offsets'},
 383         fileno2path => [],
 384         filepath2no => {},
 385     }, $class;
 386
 387     my ($offsets, $dirname);
 388     my $ref = ref $path || '';
 389     if ( $ref eq 'ARRAY' ) {
 390         $offsets = $self->index_files($path, $opts{-reindex});
 391         require Cwd;
 392         $dirname = Cwd::getcwd();
 393     } else {
 394   $self->{index_name} ||= $self->_default_index_name($path);
 395         if (-d $path) {
 396             # because Win32 glob() is broken with respect to long file names
 397             # that contain whitespace.
 398             $path = Win32::GetShortPathName($path)
 399                 if $^O =~ /^MSWin/i && eval 'use Win32; 1';
 400             $offsets = $self->index_dir($path, $opts{-reindex});
 401             $dirname = $path;
 402         } elsif (-f _) {
 403             $offsets = $self->index_file($path, $opts{-reindex});
 404             $dirname = dirname($path);
 405         } else {
 406             $self->throw( "No file or directory called '$path'");
 407         }
 408     }
 409     @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
 410
 411     return $self;
 412 }
 413
 414
 415 =head2 newFh
 416
 417  Title   : newFh
 418  Usage   : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
 419  Function: Index and get a new Fh for a single file, several files or a directory
 420  Returns : Filehandle object
 421  Args    : Same as new()
 422
 423 =cut
 424
 425 sub newFh {
 426     my ($class, @args) = @_;
 427     my $self = $class->new(@args);
 428     require Symbol;
 429     my $fh = Symbol::gensym;
 430     tie $$fh, 'Bio::DB::Indexed::Stream', $self
 431         or $self->throw("Could not tie filehandle: $!");
 432     return $fh;
 433 }
 434
 435
 436 =head2 dbmargs
 437
 438  Title   : dbmargs
 439  Usage   : my @args = $db->dbmargs;
 440  Function: Get stored dbm arguments
 441  Returns : Array
 442  Args    : None
 443
 444 =cut
 445
 446 sub dbmargs {
 447     my $self = shift;
 448     my $args = $self->{dbmargs} or return;
 449     return ref($args) eq 'ARRAY' ? @$args : $args;
 450 }
 451
 452
 453 =head2 glob
 454
 455  Title   : glob
 456  Usage   : my $glob = $db->glob;
 457  Function: Get the expression used to match files in directories
 458  Returns : String
 459  Args    : None
 460
 461 =cut
 462
 463 sub glob {
 464     my $self = shift;
 465     return $self->{glob};
 466 }
 467
 468
 469 =head2 index_dir
 470
 471  Title   : index_dir
 472  Usage   : $db->index_dir($dir);
 473  Function: Index the files that match -glob in the given directory
 474  Returns : Hashref of offsets
 475  Args    : Dirname
 476            Boolean to force a reindexing the directory
 477
 478 =cut
 479
 480 sub index_dir {
 481     my ($self, $dir, $force_reindex) = @_;
 482     my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
 483     return if scalar @files == 0;
 484     $self->{index_name} ||= $self->_default_index_name($dir);
 485     my $offsets = $self->_index_files(\@files, $force_reindex);
 486     return $offsets;
 487 }
 488
 489
 490 =head2 get_all_primary_ids
 491
 492  Title   : get_all_primary_ids, get_all_ids, ids
 493  Usage   : my @ids = $db->get_all_primary_ids;
 494  Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method
 495            implementation. Note that in this implementation, the internal
 496            database primary IDs are also the sequence IDs.
 497  Returns : List of ids
 498  Args    : None
 499
 500 =cut
 501
 502 sub get_all_primary_ids  {
 503     return keys %{shift->{offsets}};
 504 }
 505
 506 *ids = *get_all_ids = \&get_all_primary_ids;
 507
 508
 509 =head2 index_file
 510
 511  Title   : index_file
 512  Usage   : $db->index_file($filename);
 513  Function: Index the given file
 514  Returns : Hashref of offsets
 515  Args    : Filename
 516            Boolean to force reindexing the file
 517
 518 =cut
 519
 520 sub index_file {
 521     my ($self, $file, $force_reindex) = @_;
 522     $self->{index_name} ||= $self->_default_index_name($file);
 523     my $offsets = $self->_index_files([$file], $force_reindex);
 524     return $offsets;
 525 }
 526
 527 sub _default_index_name {
 528     my ($self,$path) = @_;
 529     return File::Spec->catfile($path,'directory.index') if -d $path;
 530     return "$path.index";
 531 }
 532
 533 =head2 index_files
 534
 535  Title   : index_files
 536  Usage   : $db->index_files(\@files);
 537  Function: Index the given files
 538  Returns : Hashref of offsets
 539  Args    : Arrayref of filenames
 540            Boolean to force reindexing the files
 541
 542 =cut
 543
 544 sub index_files {
 545     my ($self, $files, $force_reindex) = @_;
 546     my @paths = map { File::Spec->rel2abs($_) } @$files;
 547     require Digest::MD5;
 548     my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
 549     $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
 550     my $offsets = $self->_index_files($files, $force_reindex);
 551     return $offsets;
 552 }
 553
 554
 555 =head2 index_name
 556
 557  Title   : index_name
 558  Usage   : my $indexname = $db->index_name;
 559  Function: Get the full name of the index file
 560  Returns : String
 561  Args    : None
 562
 563 =cut
 564
 565 sub index_name {
 566     return shift->{index_name};
 567 }
 568
 569
 570 =head2 path
 571
 572  Title   : path
 573  Usage   : my $path = $db->path;
 574  Function: When a single file or a directory of files is indexed, this returns
 575            the file directory. When indexing an arbitrary list of files, the
 576            return value is the path of the current working directory.
 577  Returns : String
 578  Args    : None
 579
 580 =cut
 581
 582 sub path {
 583     return shift->{dirname};
 584 }
 585
 586
 587 =head2 get_PrimarySeq_stream
 588
 589  Title   : get_PrimarySeq_stream
 590  Usage   : my $stream = $db->get_PrimarySeq_stream();
 591  Function: Get a SeqIO-like stream of sequence objects. The stream supports a
 592            single method, next_seq(). Each call to next_seq() returns a new
 593            PrimarySeqI compliant sequence object, until no more sequences remain.
 594            This is a Bio::DB::SeqI method implementation.
 595  Returns : A Bio::DB::Indexed::Stream object
 596  Args    : None
 597
 598 =cut
 599
 600 sub get_PrimarySeq_stream {
 601     my $self = shift;
 602     return Bio::DB::Indexed::Stream->new($self);
 603 }
 604
 605
 606 =head2 get_Seq_by_id
 607
 608  Title   : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id
 609  Usage   : my $seq = $db->get_Seq_by_id($id);
 610  Function: Given an ID, fetch the corresponding sequence from the database.
 611            This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation.
 612  Returns : A sequence object
 613  Args    : ID
 614
 615 =cut
 616
 617 sub get_Seq_by_id {
 618     my ($self, $id) = @_;
 619     $self->throw('Need to provide a sequence ID') if not defined $id;
 620     return if not exists $self->{offsets}{$id};
 621     return $self->{obj_class}->new($self, $id);
 622 }
 623
 624 *get_Seq_by_version = *get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
 625
 626
 627 =head2 _calculate_offsets
 628
 629  Title   : _calculate_offsets
 630  Usage   : $db->_calculate_offsets($fileno, $filename, $offsets);
 631  Function: This method calculates the sequence offsets in a file based on ID and
 632            should be implemented by classes that use Bio::DB::IndexedBase.
 633  Returns : Hash of offsets
 634  Args    : File number and file to process
 635            Hashref of file offsets keyed by IDs.
 636
 637 =cut
 638
 639 sub _calculate_offsets {
 640     my $self = shift;
 641     $self->throw_not_implemented();
 642 }
 643
 644
 645 sub _index_files {
 646     # Do the indexing of the given files using the index file on record
 647     my ($self, $files, $force_reindex) = @_;
 648
 649     $self->_set_pack_method( @$files );
 650
 651     # Get name of index file
 652     my $index = $self->index_name;
 653
 654     # If caller has requested reindexing, unlink the index file.
 655     if ($force_reindex) {
 656         # Tied-hash in Strawberry Perl creates "$file.index"
 657         unlink $index if -e $index;
 658         # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
 659         unlink "$index.dir" if -e "$index.dir";
 660         unlink "$index.pag" if -e "$index.pag";
 661     }
 662
 663     # Get the modification time of the index
 664     my $indextime = (stat $index)[9] || 0;
 665
 666     # Register files and find if there has been any update
 667     my $modtime = 0;
 668     my @updated;
 669     for my $file (@$files) {
 670         # Register file
 671         $self->_path2fileno(basename($file));
 672         # Any update?
 673         my $m = (stat $file)[9] || 0;
 674         if ($m > $modtime) {
 675            $modtime = $m;
 676         }
 677         if ($m > $indextime) {
 678            push @updated, $file;
 679         }
 680     }
 681
 682     # Get termination length from first file
 683     $self->{termination_length} = $self->_calc_termination_length( $files->[0] );
 684
 685     # Reindex contents of changed files if needed
 686     my $reindex      = $force_reindex || (scalar @updated > 0);
 687     $self->{offsets} = $self->_open_index($index, $reindex) or return;
 688     if ($reindex) {
 689         $self->{indexing} = $index;
 690         for my $file (@updated) {
 691             my $fileno = $self->_path2fileno(basename($file));
 692             &{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
 693         }
 694         delete $self->{indexing};
 695     }
 696
 697     # Closing and reopening might help corrupted index file problem on Windows
 698     $self->_close_index($self->{offsets});
 699
 700     return $self->{offsets} = $self->_open_index($index);
 701 }
 702
 703
 704 sub _open_index {
 705     # Open index file in read-only or write mode
 706     my ($self, $index_file, $write) = @_;
 707     my %offsets;
 708     my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
 709     my @dbmargs = $self->dbmargs;
 710     tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
 711         or $self->throw( "Could not open index file $index_file: $!");
 712     return \%offsets;
 713 }
 714
 715
 716 sub _close_index {
 717     # Close index file
 718     my ($self, $index) = @_;
 719     untie %$index;
 720     return 1;
 721 }
 722
 723 # Compiling the below regular expression speeds up _parse_compound_id
 724 my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x;
 725
 726 sub _parse_compound_id {
 727     # Handle compound IDs:
 728     #     $db->seq($id)
 729     #     $db->seq($id, $start, $stop, $strand)
 730     #     $db->seq("$id:$start,$stop")
 731     #     $db->seq("$id:$start..$stop")
 732     #     $db->seq("$id:$start-$stop")
 733     #     $db->seq("$id:$start,$stop/$strand")
 734     #     $db->seq("$id:$start..$stop/$strand")
 735     #     $db->seq("$id:$start-$stop/$strand")
 736     #     $db->seq("$id/$strand")
 737     my ($self, $id, $start, $stop, $strand) = @_;
 738
 739     if ( (not defined $start ) &&
 740          (not defined $stop  ) &&
 741          (not defined $strand) &&
 742          ($id =~ m{$compound_id}) ) {
 743         # Start, stop and strand not provided and ID looks like a compound ID
 744         ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
 745     }
 746
 747     # Start, stop and strand defaults
 748     $stop   ||= $self->length($id) || 0; # 0 if sequence not found in database
 749     $start  ||= ($stop > 0) ? 1 : 0;
 750     $strand ||= 1;
 751
 752     # Convert numbers such as 1_000_000 to 1000000
 753     $start =~ s/_//g;
 754     $stop  =~ s/_//g;
 755
 756     if ($start > $stop) {
 757         # Change the strand
 758         ($start, $stop) = ($stop, $start);
 759         $strand *= -1;
 760     }
 761
 762     return $id, $start, $stop, $strand;
 763 }
 764
 765
 766 sub _guess_alphabet {
 767     # Determine the molecular type of the given sequence string:
 768     #    'dna', 'rna', 'protein' or '' (unknown/empty)
 769     my ($self, $string) = @_;
 770     # Handle IUPAC residues like PrimarySeq does
 771     my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
 772     return $alphabet eq 'dna' ? DNA
 773            : $alphabet eq 'rna' ? RNA
 774            : $alphabet eq 'protein' ? PROTEIN
 775            : NA;
 776 }
 777
 778
 779 sub _makeid {
 780     # Process the header line by applying any transformation given in -makeid
 781     my ($self, $header_line) = @_;
 782     return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
 783 }
 784
 785
 786 sub _check_linelength {
 787     # Check that the line length is valid. Generate an error otherwise.
 788     my ($self, $linelength) = @_;
 789     return if not defined $linelength;
 790     $self->throw(
 791         "Each line of the file must be less than 65,536 characters. Line ".
 792         "$. is $linelength chars."
 793     ) if $linelength > 65535;
 794 }
 795
 796
 797 sub _calc_termination_length {
 798     # Try the beginning of the file to determine termination length
 799     # Account for crlf-terminated Windows and Mac files
 800     my ($self, $file) = @_;
 801     my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
 802
 803     # In Windows, text files have '\r\n' as line separator, but when reading in
 804     # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
 805     # "length $_" will report 4 although the line is 5 bytes in length.
 806     # We assume that all lines have the same line separator and only read current line.
 807     my $init_pos   = tell($fh);
 808     my $curr_line  = <$fh>;
 809     my $pos_diff   = tell($fh) - $init_pos;
 810     my $correction = $pos_diff - length $curr_line;
 811     close $fh;
 812
 813     $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
 814     return $self->{termination_length};
 815 }
 816
 817
 818 sub _calc_offset {
 819     # Get the offset of the n-th residue of the sequence with the given ID
 820     # and termination length (tl)
 821     my ($self, $id, $n) = @_;
 822     my $tl = $self->{termination_length};
 823     $n--;
 824     my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
 825     $n = 0            if $n < 0;
 826     $n = $seqlen-1 if $n >= $seqlen;
 827     return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
 828 }
 829
 830
 831 sub _fh {
 832     # Given a sequence ID, return the filehandle on which to find this sequence
 833     my ($self, $id) = @_;
 834     $self->throw('Need to provide a sequence ID') if not defined $id;
 835     my $file = $self->file($id) or return;
 836     return $self->_fhcache( File::Spec->catfile($self->{dirname}, $file) ) or
 837         $self->throw( "Can't open file $file");
 838 }
 839
 840
 841 sub _fhcache {
 842     my ($self, $path) = @_;
 843     if (!$self->{fhcache}{$path}) {
 844         if ($self->{curopen} >= $self->{maxopen}) {
 845             my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
 846                 keys %{$self->{fhcache}};
 847             splice(@lru, $self->{maxopen} / 3);
 848             $self->{curopen} -= @lru;
 849             for (@lru) {
 850                 delete $self->{fhcache}{$_};
 851             }
 852         }
 853         $self->{fhcache}{$path} = IO::File->new($path) || return;
 854         binmode $self->{fhcache}{$path};
 855         $self->{curopen}++;
 856     }
 857     $self->{cacheseq}{$path}++;
 858     return $self->{fhcache}{$path};
 859 }
 860
 861
 862 #-------------------------------------------------------------
 863 # Methods to store and retrieve data from indexed file
 864 #
 865
 866 =head2 offset
 867
 868  Title   : offset
 869  Usage   : my $offset = $db->offset($id);
 870  Function: Get the offset of the indicated sequence from the beginning of the
 871            file in which it is located. The offset points to the beginning of
 872            the sequence, not the beginning of the header line.
 873  Returns : String
 874  Args    : ID of sequence
 875
 876 =cut
 877
 878 sub offset {
 879     my ($self, $id) = @_;
 880     $self->throw('Need to provide a sequence ID') if not defined $id;
 881     my $offset = $self->{offsets}{$id} or return;
 882     return (&{$self->{unpackmeth}}($offset))[0];
 883 }
 884
 885
 886 =head2 strlen
 887
 888  Title   : strlen
 889  Usage   : my $length = $db->strlen($id);
 890  Function: Get the number of characters in the sequence string.
 891  Returns : Integer
 892  Args    : ID of sequence
 893
 894 =cut
 895
 896 sub strlen {
 897     my ($self, $id) = @_;
 898     $self->throw('Need to provide a sequence ID') if not defined $id;
 899     my $offset = $self->{offsets}{$id} or return;
 900     return (&{$self->{unpackmeth}}($offset))[1];
 901 }
 902
 903
 904 =head2 length
 905
 906  Title   : length
 907  Usage   : my $length = $db->length($id);
 908  Function: Get the number of residues of the sequence.
 909  Returns : Integer
 910  Args    : ID of sequence
 911
 912 =cut
 913
 914 sub length {
 915     my ($self, $id) = @_;
 916     $self->throw('Need to provide a sequence ID') if not defined $id;
 917     my $offset = $self->{offsets}{$id} or return;
 918     return (&{$self->{unpackmeth}}($offset))[2];
 919 }
 920
 921
 922 =head2 linelen
 923
 924  Title   : linelen
 925  Usage   : my $linelen = $db->linelen($id);
 926  Function: Get the length of the line for this sequence.
 927  Returns : Integer
 928  Args    : ID of sequence
 929
 930 =cut
 931
 932 sub linelen {
 933     my ($self, $id) = @_;
 934     $self->throw('Need to provide a sequence ID') if not defined $id;
 935     my $offset = $self->{offsets}{$id} or return;
 936     return (&{$self->{unpackmeth}}($offset))[3];
 937 }
 938
 939
 940 =head2 headerlen
 941
 942  Title   : headerlen
 943  Usage   : my $length = $db->headerlen($id);
 944  Function: Get the length of the header line for the indicated sequence.
 945  Returns : Integer
 946  Args    : ID of sequence
 947
 948 =cut
 949
 950 sub headerlen {
 951     my ($self, $id) = @_;
 952     $self->throw('Need to provide a sequence ID') if not defined $id;
 953     my $offset = $self->{offsets}{$id} or return;
 954     return (&{$self->{unpackmeth}}($offset))[4];
 955 }
 956
 957
 958 =head2 header_offset
 959
 960  Title   : header_offset
 961  Usage   : my $offset = $db->header_offset($id);
 962  Function: Get the offset of the header line for the indicated sequence from
 963            the beginning of the file in which it is located.
 964  Returns : String
 965  Args    : ID of sequence
 966
 967 =cut
 968
 969 sub header_offset {
 970     my ($self, $id) = @_;
 971     $self->throw('Need to provide a sequence ID') if not defined $id;
 972     return if not $self->{offsets}{$id};
 973     return $self->offset($id) - $self->headerlen($id);
 974 }
 975
 976
 977 =head2 alphabet
 978
 979  Title   : alphabet
 980  Usage   : my $alphabet = $db->alphabet($id);
 981  Function: Get the molecular type of the indicated sequence: dna, rna or protein
 982  Returns : String
 983  Args    : ID of sequence
 984
 985 =cut
 986
 987 sub alphabet {
 988     my ($self, $id) = @_;
 989     $self->throw('Need to provide a sequence ID') if not defined $id;
 990     my $offset = $self->{offsets}{$id} or return;
 991     my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
 992     return : $alphabet == Bio::DB::IndexedBase::DNA     ? 'dna'
 993            : $alphabet == Bio::DB::IndexedBase::RNA     ? 'rna'
 994            : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
 995            : '';
 996 }
 997
 998
 999 =head2 file
1000
1001  Title   : file
1002  Usage   : my $file = $db->file($id);
 Function: Get the name of the file in which the indicated sequence can be
           found.
1005  Returns : String
1006  Args    : ID of sequence
1007
1008 =cut
1009
sub file {
    my ($self, $id) = @_;
    defined $id or $self->throw('Need to provide a sequence ID');
    # Packed index record for this ID; nothing is returned for unknown IDs
    my $packed = $self->{offsets}{$id};
    return if not $packed;
    # Field 6 of the unpacked record is the file number; translate it to a path
    return $self->_fileno2path( ($self->{unpackmeth}->($packed))[6] );
}
1016
1017
# Look up the path that was registered under the given file number.
sub _fileno2path {
    my $self   = shift;
    my $fileno = shift;
    return $self->{fileno2path}[$fileno];
}
1022
1023
# Return the file number for a path, assigning the next free number (and
# recording the reverse mapping) the first time the path is seen.
sub _path2fileno {
    my ($self, $path) = @_;
    # Already registered: hand back the number assigned earlier
    return $self->{filepath2no}{$path} if exists $self->{filepath2no}{$path};

    # First sighting: take the next counter value, forced to a number
    my $fileno = 0 + $self->{fileno}++;
    $self->{filepath2no}{$path}   = $fileno;
    $self->{fileno2path}[$fileno] = $path; # Save path
    return $fileno;
}
1033
1034
# Pack the given fields into an index record using the STRUCT template.
sub _packSmall {
    my @fields = @_;
    return pack(STRUCT, @fields);
}
1038
1039
# Pack the given fields into an index record using the STRUCTBIG template.
sub _packBig {
    my @fields = @_;
    return pack(STRUCTBIG, @fields);
}
1043
1044
# Unpack an index record that was packed with the STRUCT template.
sub _unpackSmall {
    my ($packed) = @_;
    return unpack(STRUCT, $packed);
}
1048
1049
# Unpack an index record that was packed with the STRUCTBIG template.
sub _unpackBig {
    my ($packed) = @_;
    return unpack(STRUCTBIG, $packed);
}
1053
1054
sub _set_pack_method {
    # Determine whether to use 32 or 64 bit integers for the given files.
    # Args    : list of file paths
    # Returns : 1, after storing the chosen pack/unpack code references in
    #           $self->{packmeth} and $self->{unpackmeth}
    my ($self, @files) = @_;

    # Find the maximum file size. Files for which -s yields nothing (missing
    # or empty) are skipped, so an empty or partly invalid list simply
    # selects the small format without comparing an undefined size.
    my $maxsize = 0;
    for my $file (@files) {
        my $size = -s $file;
        $maxsize = $size if $size and $size > $maxsize;
    }
    my $fourGB = (2 ** 32) - 1;

    if ($maxsize > $fourGB) {
        # At least one file exceeds 4Gb - we will need to use 64 bit ints
        $self->{packmeth}   = \&_packBig;
        $self->{unpackmeth} = \&_unpackBig;
    } else {
        $self->{packmeth}   = \&_packSmall;
        $self->{unpackmeth} = \&_unpackSmall;
    }
    return 1;
}
1072
1073
1074 #-------------------------------------------------------------
1075 # Tied hash logic
1076 #
1077
# tie() entry point: build the object through the class constructor.
sub TIEHASH {
    my ($class, @args) = @_;
    return $class->new(@args);
}
1081
1082
# Hash read access: the key is handed to subseq().
sub FETCH {
    my ($self, @args) = @_;
    return $self->subseq(@args);
}
1086
1087
# Hash write access is refused.
sub STORE {
    my ($self) = @_;
    $self->throw("Read-only database");
}
1091
1092
# Hash element deletion is refused.
sub DELETE {
    my ($self) = @_;
    $self->throw("Read-only database");
}
1096
1097
# Emptying the hash is refused.
sub CLEAR {
    my ($self) = @_;
    $self->throw("Read-only database");
}
1101
1102
# A key exists when offset() yields a defined value for it.
sub EXISTS {
    my ($self, @args) = @_;
    my $offset = $self->offset(@args);
    return defined $offset;
}
1106
1107
# Start key iteration by delegating to the object tied to the index hash.
sub FIRSTKEY {
    my ($self, @args) = @_;
    my $index = tied %{ $self->{offsets} };
    return $index->FIRSTKEY(@args);
}
1111
1112
# Continue key iteration by delegating to the object tied to the index hash.
sub NEXTKEY {
    my ($self, @args) = @_;
    my $index = tied %{ $self->{offsets} };
    return $index->NEXTKEY(@args);
}
1116
1117
# Destructor: close cached filehandles, close the index, and remove the index
# file(s) when cleaning was requested or indexing did not finish.
sub DESTROY {
    my $self = shift;

    # Close filehandles
    for my $fh (values %{ $self->{fhcache} }) {
        $fh->close if defined $fh;
    }
    $self->_close_index($self->{offsets});

    # Keep the index unless indexing was aborted or cleaning was requested
    return 1 unless $self->{clean} || $self->{indexing};

    my $index = $self->{index_name};
    # Tied-hash in Strawberry Perl creates "$file.index";
    # tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
    for my $candidate ($index, "$index.dir", "$index.pag") {
        unlink $candidate if -e $candidate;
    }
    return 1;
}
1141
1142
1143 #-------------------------------------------------------------
1144 # stream-based access to the database
1145 #
1146
1147 package Bio::DB::Indexed::Stream;
1148 use base qw(Tie::Handle Bio::DB::SeqI);
1149
1150
# Create a stream over $db, positioned at the first key of the database.
sub new {
    my ($class, $db) = @_;
    my $first_key = $db->FIRSTKEY;
    my %state = (
        db  => $db,
        key => $first_key,
    );
    return bless \%state, $class;
}
1159
# Return the sequence for the current key and advance to the next key.
# Returns nothing once the keys are exhausted.
sub next_seq {
    my ($self) = @_;
    my $db      = $self->{db};
    my $current = $self->{key};
    defined $current or return;
    my $seq = $db->get_Seq_by_id($current);
    # Remember where to resume on the next call
    $self->{key} = $db->NEXTKEY($current);
    return $seq;
}
1168
# tie() entry point for filehandle access: same as constructing a stream.
sub TIEHANDLE {
    my $class = shift;
    my $db    = shift;
    return $class->new($db);
}
1173
# <$fh> on the tied handle.
# Scalar context: the next sequence, or undef at the end of the stream.
# List context  : all remaining sequences (an empty list at the end).
sub READLINE {
    my $self = shift;
    if (wantarray) {
        # Drain the stream so that e.g. "my @seqs = <$fh>" gets every record
        my @seqs;
        while (my $seq = $self->next_seq) {
            push @seqs, $seq;
        }
        return @seqs;
    }
    return $self->next_seq || undef;
}
1178
1179
1180 1;