Bio/DB/IndexedBase.pm

   1 #
   2 # BioPerl module for Bio::DB::IndexedBase
   3 #
   4 # You may distribute this module under the same terms as perl itself
   5 #
   6
   7
   8 =head1 NAME
   9
  10 Bio::DB::IndexedBase - Base class for modules using indexed sequence files
  11
  12 =head1 SYNOPSIS
  13
  14   use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase
  15
  16   # 1/ Bio::SeqIO-style access
  17
  18   # Index some sequence files
  19   my $db = Bio::DB::XXX->new('/path/to/file');    # from a single file
  20   my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
  21   my $db = Bio::DB::XXX->new('/path/to/files/');  # from a directory
  22
  23   # Get IDs of all the sequences in the database
  24   my @ids = $db->get_all_primary_ids;
  25
  26   # Get a specific sequence
  27   my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
  28
  29   # Loop through all sequences
  30   my $stream = $db->get_PrimarySeq_stream;
  31   while (my $seq = $stream->next_seq) {
  32     # Do something...
  33   }
  34
  35
  36   # 2/ Access via filehandle
  37   my $fh = Bio::DB::XXX->newFh('/path/to/file');
  38   while (my $seq = <$fh>) {
  39     # Do something...
  40   }
  41
  42
  43   # 3/ Tied-hash access
  44   tie %sequences, 'Bio::DB::XXX', '/path/to/file';
  45   print $sequences{'CHROMOSOME_I:1,20000'};
  46
  47 =head1 DESCRIPTION
  48
  49 Bio::DB::IndexedBase provides a base class for modules that want to index
  50 and read sequence files and provides persistent, random access to each sequence
  51 entry, without bringing the entire file into memory. This module is compliant
  52 with the Bio::SeqI interface and both. Bio::DB::Fasta and Bio::DB::Qual both use
  53 Bio::DB::IndexedBase.
  54
  55 When you initialize the module, you point it at a single file, several files, or
  56 a directory of files. The first time it is run, the module generates an index
  57 of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
  58 followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
  59 index file to find the sequence file and offset for any requested sequence. If
  60 one of the source files is updated, the module reindexes just that one file. You
  61 can also force reindexing manually at any time. For improved performance, the
  62 module keeps a cache of open filehandles, closing less-recently used ones when
  63 the cache is full.
  64
  65 Entries may have any line length up to 65,536 characters, and different line
  66 lengths are allowed in the same file.  However, within a sequence entry, all
  67 lines must be the same length except for the last. An error will be thrown if
  68 this is not the case!
  69
  70 This module was developed for use with the C. elegans and human genomes, and has
  71 been tested with sequence segments as large as 20 megabases. Indexing the C.
  72 elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
  73 minutes on my 300 MHz pentium laptop. On the same system, average access time
  74 for any 200-mer within the C. elegans genome was E<lt>0.02s.
  75
  76 =head1 DATABASE CREATION AND INDEXING
  77
  78 The two constructors for this class are new() and newFh(). The former creates a
  79 Bio::DB::IndexedBase object which is accessed via method calls. The latter
  80 creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
  81 objects in a stream fashion. There is also a tied hash interface.
  82
  83 =over
  84
  85 =item $db = Bio::DB::IndexedBase-E<gt>new($path [,%options])
  86
  87 Create a new Bio::DB::IndexedBase object from the files designated by $path
  88 $path may be a single file, an arrayref of files, or a directory containing
  89 such files.
  90
  91 After the database is created, you can use methods like get_all_primary_ids()
  92 and get_Seq_by_id() to retrieve sequence objects.
  93
  94 =item $fh = Bio::DB::IndexedBase-E<gt>newFh($path [,%options])
  95
  96 Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
  97 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
  98 Bio::SeqIO style. The path and the options should be specified as for new().
  99
 100 =item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
 101
 102 Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
 103 path to the files. The optional @args list is the same set used by new(). If
 104 successful, tie() returns the tied object, undef otherwise.
 105
 106 Once tied, you can use the hash to retrieve an individual sequence by
 107 its ID, like this:
 108
 109   my $seq = $db{CHROMOSOME_I};
 110
 111 The keys() and values() functions will return the sequence IDs and their
 112 sequences, respectively.  In addition, each() can be used to iterate over the
 113 entire data set:
 114
 115  while (my ($id,$sequence) = each %db) {
 116     print "$id => $sequence\n";
 117  }
 118
 119
 120 When dealing with very large sequences, you can avoid bringing them into memory
 121 by calling each() in a scalar context.  This returns the key only.  You can then
 122 use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods.
 123
 124  while (my $id = each %db) {
 125     print "$id: $db{$sequence:1,100}\n";
 126     print "$id: ".tied(%db)->length($id)."\n";
 127  }
 128
 129 In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
 130 to retrieve the first and next ID in the database, respectively. This allows to
 131 write the following iterative loop using just the object-oriented interface:
 132
 133  my $db = Bio::DB::IndexedBase->new('/path/to/file');
 134  for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
 135     # do something with sequence
 136  }
 137
 138 =back
 139
 140 =head1 INDEX CONTENT
 141
 142 Several attributes of each sequence are stored in the index file. Given a
 143 sequence ID, these attributes can be retrieved using the following methods:
 144
 145 =over
 146
 147 =item offset($id)
 148
 149 Get the offset of the indicated sequence from the beginning of the file in which
 150 it is located. The offset points to the beginning of the sequence, not the
 151 beginning of the header line.
 152
 153 =item strlen($id)
 154
 155 Get the number of characters in the sequence string.
 156
 157 =item length($id)
 158
 159 Get the number of residues of the sequence.
 160
 161 =item linelen($id)
 162
 163 Get the length of the line for this sequence. If the sequence is wrapped, then
 164 linelen() is likely to be much shorter than strlen().
 165
 166 =item headerlen($id)
 167
 168 Get the length of the header line for the indicated sequence.
 169
 170 =item header_offset
 171
 172 Get the offset of the header line for the indicated sequence from the beginning
 173 of the file in which it is located. This attribute is not stored. It is
 174 calculated from offset() and headerlen().
 175
 176 =item alphabet($id)
 177
 178 Get the molecular type (alphabet) of the indicated sequence. This method handles
 179 residues according to the IUPAC convention.
 180
 181 =item file($id)
 182
 183 Get the the name of the file in which the indicated sequence can be found.
 184
 185 =back
 186
 187 =head1 INTERFACE COMPLIANCE NOTES
 188
 189 Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the
 190 Bio::RandomAccessI interfaces.
 191
 192 Database do not necessarily provide any meaningful internal primary ID for the
 193 sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are
 194 the IDs of the sequences. This means that the same ID passed to get_Seq_by_id()
 195 and get_Seq_by_primary_id() will return the same sequence.
 196
 197 Since this database index has no notion of sequence version or namespace, the
 198 get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical.
 199
 200 =head1 BUGS
 201
 202 When a sequence is deleted from one of the files, this deletion is not detected
 203 by the module and removed from the index. As a result, a "ghost" entry will
 204 remain in the index and will return garbage results if accessed.
 205
 206 Also, if you are indexing a directory, it is wise to not add or remove files
 207 from it.
 208
 209 In case you have changed the files in a directory, or the sequences in a file,
 210 you can to rebuild the entire index, either by deleting it manually, or by
 211 passing -reindex=E<gt>1 to new() when initializing the module.
 212
 213 =head1 SEE ALSO
 214
 215 L<DB_File>
 216
 217 L<Bio::DB::Fasta>
 218
 219 L<Bio::DB::Qual>
 220
 221 =head1 AUTHOR
 222
 223 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
 224
 225 Copyright (c) 2001 Cold Spring Harbor Laboratory.
 226
 227 Florent Angly (for the modularization)
 228
 229 This library is free software; you can redistribute it and/or modify
 230 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 231 disclaimers of warranty.
 232
 233 =head1 APPENDIX
 234
 235 The rest of the documentation details each of the object
 236 methods. Internal methods are usually preceded with a _
 237
 238 =cut
 239
 240
 241 package Bio::DB::IndexedBase;
 242
 243 BEGIN {
 244     @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
 245         if(!$INC{'AnyDBM_File.pm'});
 246 }
 247
 248 use strict;
 249 use IO::File;
 250 use AnyDBM_File;
 251 use Fcntl;
 252 use File::Spec;
 253 use File::Basename qw(basename dirname);
 254 use Bio::PrimarySeq;
 255
 256 use base qw(Bio::DB::SeqI);
 257
 258 # Store offset, strlen, linelen, headerlen, type and fileno
 259 use constant STRUCT    => 'NNNnnCa*'; # 32-bit file offset and seq length
 260 use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
 261
 262 use constant NA        => 0;
 263 use constant DNA       => 1;
 264 use constant RNA       => 2;
 265 use constant PROTEIN   => 3;
 266
 267 use constant DIE_ON_MISSMATCHED_LINES => 1;
 268 # you can avoid dying if you want but you may get incorrect results
 269
 270
 271 # Compiling the below regular expressions speeds up the Pure Perl
 272 # seq/subseq() from Bio::DB::Fasta by about 7% from 7.76s to 7.22s
 273 # over 32358 calls on Variant Effect Prediction data.
 274 my $nl = qr/\n/;
 275 my $cr = qr/\r/;
 276
 277 # Remove carriage returns (\r) and newlines (\n) from a string.  When
 278 # called from subseq, this can take a signficiant portion of time, in
 279 # Variant Effect Prediction. Therefore we compile the match portion.
 280 sub _strip_crnl {
 281     my $str = shift;
 282     $str =~ s/$nl//g;
 283     $str =~ s/$cr//g;
 284     return $str;
 285 }
 286
 287 # C can do perfrom _strip_crnl much faster. But this requires the
 288 # Inline::C module which we don't require people to have. So we make
 289 # this optional by wrapping the C code in an eval. If the eval works,
 290 # the Perl strip_crnl() function is overwritten.
 291 eval q{
 292     use Inline C  => <<'END_OF_C_CODE';
 293     /* Strip all new line (\n) and carriage return (\r) characters
 294        from string str
 295     */
 296     char* _strip_crnl(char* str) {
 297         char *s;
 298         char *s2 = str;
 299         for (s = str; *s; *s++) {
 300             if (*s != '\n' && *s != '\r') {
 301               *s2++ = *s;
 302             }
 303         }
 304         *s2 = '\0';
 305         return str;
 306     }
 307 END_OF_C_CODE
 308 };
 309
 310
 311 =head2 new
 312
 313  Title   : new
 314  Usage   : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
 315  Function: Initialize a new database object
 316  Returns : A Bio::DB::IndexedBase object
 317  Args    : A single file, or path to dir, or arrayref of files
 318            Optional arguments:
 319
 320  Option        Description                                         Default
 321  -----------   -----------                                         -------
 322  -glob         Glob expression to search for files in directories  *
 323  -makeid       A code subroutine for transforming IDs              None
 324  -maxopen      Maximum size of filehandle cache                    32
 325  -debug        Turn on status messages                             0
 326  -reindex      Force the index to be rebuilt                       0
 327  -dbmargs      Additional arguments to pass to the DBM routine     None
 328  -index_name   Name of the file that will hold the indices
 329  -clean        Remove the index file when finished                 0
 330
 331 The -dbmargs option can be used to control the format of the index. For example,
 332 you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
 333 retrieved alphabetically. Note that you must use the same arguments every time
 334 you open the index!
 335
 336 The -makeid option gives you a chance to modify sequence IDs during indexing.
 337 For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
 338 that GenBank Fasta files use. The original header line can be recovered later.
 339 The option value for -makeid should be a code reference that takes a scalar
 340 argument (the full header line) and returns a scalar or an array of scalars (the
 341 ID or IDs you want to assign). For example:
 342
 343   $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
 344
 345   sub extract_gi {
 346       # Extract GI from GenBank
 347       my $header = shift;
 348       my ($id) = ($header =~ /gi\|(\d+)/m);
 349       return $id || '';
 350   }
 351
 352 extract_gi() will be called with the full header line, e.g. a Fasta line would
 353 include the "E<gt>", the ID and the description:
 354
 355  >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
 356
 357 In the database, this sequence can now be retrieved by its GI instead of its
 358 complete ID:
 359
 360  my $seq = $db->get_Seq_by_id(352962132);
 361
 362 The -makeid option is ignored after the index is constructed.
 363
 364 =cut
 365
 366 sub new {
 367     my ($class, $path, %opts) = @_;
 368
 369     my $self = bless {
 370         debug       => $opts{-debug}   || 0,
 371         makeid      => $opts{-makeid},
 372         glob        => $opts{-glob}    || eval '$'.$class.'::file_glob' || '*',
 373         maxopen     => $opts{-maxopen} || 32,
 374         clean       => $opts{-clean}   || 0,
 375         dbmargs     => $opts{-dbmargs} || undef,
 376         fhcache     => {},
 377         cacheseq    => {},
 378         curopen     => 0,
 379         openseq     => 1,
 380         dirname     => undef,
 381         offsets     => undef,
 382         index_name  => $opts{-index_name},
 383         obj_class   => eval '$'.$class.'::obj_class',
 384         offset_meth => \&{$class.'::_calculate_offsets'},
 385         fileno2path => [],
 386         filepath2no => {},
 387     }, $class;
 388
 389     my ($offsets, $dirname);
 390     my $ref = ref $path || '';
 391     if ( $ref eq 'ARRAY' ) {
 392         $offsets = $self->index_files($path, $opts{-reindex});
 393         require Cwd;
 394         $dirname = Cwd::getcwd();
 395     } else {
 396         $self->{index_name} ||= $self->_default_index_name($path);
 397         if (-d $path) {
 398             # because Win32 glob() is broken with respect to long file names
 399             # that contain whitespace.
 400             $path = Win32::GetShortPathName($path)
 401                 if $^O =~ /^MSWin/i && eval 'use Win32; 1';
 402             $offsets = $self->index_dir($path, $opts{-reindex});
 403             $dirname = $path;
 404         } elsif (-f _) {
 405             $offsets = $self->index_file($path, $opts{-reindex});
 406             $dirname = dirname($path);
 407         } else {
 408             $self->throw( "No file or directory called '$path'");
 409         }
 410     }
 411     @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
 412
 413     return $self;
 414 }
 415
 416
 417 =head2 newFh
 418
 419  Title   : newFh
 420  Usage   : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
 421  Function: Index and get a new Fh for a single file, several files or a directory
 422  Returns : Filehandle object
 423  Args    : Same as new()
 424
 425 =cut
 426
 427 sub newFh {
 428     my ($class, @args) = @_;
 429     my $self = $class->new(@args);
 430     require Symbol;
 431     my $fh = Symbol::gensym;
 432     tie $$fh, 'Bio::DB::Indexed::Stream', $self
 433         or $self->throw("Could not tie filehandle: $!");
 434     return $fh;
 435 }
 436
 437
 438 =head2 dbmargs
 439
 440  Title   : dbmargs
 441  Usage   : my @args = $db->dbmargs;
 442  Function: Get stored dbm arguments
 443  Returns : Array
 444  Args    : None
 445
 446 =cut
 447
 448 sub dbmargs {
 449     my $self = shift;
 450     my $args = $self->{dbmargs} or return;
 451     return ref($args) eq 'ARRAY' ? @$args : $args;
 452 }
 453
 454
 455 =head2 glob
 456
 457  Title   : glob
 458  Usage   : my $glob = $db->glob;
 459  Function: Get the expression used to match files in directories
 460  Returns : String
 461  Args    : None
 462
 463 =cut
 464
 465 sub glob {
 466     my $self = shift;
 467     return $self->{glob};
 468 }
 469
 470
 471 =head2 index_dir
 472
 473  Title   : index_dir
 474  Usage   : $db->index_dir($dir);
 475  Function: Index the files that match -glob in the given directory
 476  Returns : Hashref of offsets
 477  Args    : Dirname
 478            Boolean to force a reindexing the directory
 479
 480 =cut
 481
 482 sub index_dir {
 483     my ($self, $dir, $force_reindex) = @_;
 484     my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
 485     return if scalar @files == 0;
 486     $self->{index_name} ||= $self->_default_index_name($dir);
 487     my $offsets = $self->_index_files(\@files, $force_reindex);
 488     return $offsets;
 489 }
 490
 491
 492 =head2 get_all_primary_ids
 493
 494  Title   : get_all_primary_ids, get_all_ids, ids
 495  Usage   : my @ids = $db->get_all_primary_ids;
 496  Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method
 497            implementation. Note that in this implementation, the internal
 498            database primary IDs are also the sequence IDs.
 499  Returns : List of ids
 500  Args    : None
 501
 502 =cut
 503
 504 sub get_all_primary_ids  {
 505     return keys %{shift->{offsets}};
 506 }
 507
 508 *ids = *get_all_ids = \&get_all_primary_ids;
 509
 510
 511 =head2 index_file
 512
 513  Title   : index_file
 514  Usage   : $db->index_file($filename);
 515  Function: Index the given file
 516  Returns : Hashref of offsets
 517  Args    : Filename
 518            Boolean to force reindexing the file
 519
 520 =cut
 521
 522 sub index_file {
 523     my ($self, $file, $force_reindex) = @_;
 524     $self->{index_name} ||= $self->_default_index_name($file);
 525     my $offsets = $self->_index_files([$file], $force_reindex);
 526     return $offsets;
 527 }
 528
 529 sub _default_index_name {
 530     my ($self,$path) = @_;
 531     return File::Spec->catfile($path,'directory.index') if -d $path;
 532     return "$path.index";
 533 }
 534
 535 =head2 index_files
 536
 537  Title   : index_files
 538  Usage   : $db->index_files(\@files);
 539  Function: Index the given files
 540  Returns : Hashref of offsets
 541  Args    : Arrayref of filenames
 542            Boolean to force reindexing the files
 543
 544 =cut
 545
 546 sub index_files {
 547     my ($self, $files, $force_reindex) = @_;
 548     my @paths = map { File::Spec->rel2abs($_) } @$files;
 549     require Digest::MD5;
 550     my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
 551     $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
 552     my $offsets = $self->_index_files($files, $force_reindex);
 553     return $offsets;
 554 }
 555
 556
 557 =head2 index_name
 558
 559  Title   : index_name
 560  Usage   : my $indexname = $db->index_name($path);
 561  Function: Get the full name of the index file
 562  Returns : String
 563  Args    : None
 564
 565 =cut
 566
 567 sub index_name {
 568     return shift->{index_name};
 569 }
 570
 571
 572 =head2 path
 573
 574  Title   : path
 575  Usage   : my $path = $db->path($path);
 576  Function: When a single file or a directory of files is indexed, this returns
 577            the file directory. When indexing an arbitrary list of files, the
 578            return value is the path of the current working directory.
 579  Returns : String
 580  Args    : None
 581
 582 =cut
 583
 584 sub path {
 585     return shift->{dirname};
 586 }
 587
 588
 589 =head2 get_PrimarySeq_stream
 590
 591  Title   : get_PrimarySeq_stream
 592  Usage   : my $stream = $db->get_PrimarySeq_stream();
 593  Function: Get a SeqIO-like stream of sequence objects. The stream supports a
 594            single method, next_seq(). Each call to next_seq() returns a new
 595            PrimarySeqI compliant sequence object, until no more sequences remain.
 596            This is a Bio::DB::SeqI method implementation.
 597  Returns : A Bio::DB::Indexed::Stream object
 598  Args    : None
 599
 600 =cut
 601
 602 sub get_PrimarySeq_stream {
 603     my $self = shift;
 604     return Bio::DB::Indexed::Stream->new($self);
 605 }
 606
 607
 608 =head2 get_Seq_by_id
 609
 610  Title   : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id
 611  Usage   : my $seq = $db->get_Seq_by_id($id);
 612  Function: Given an ID, fetch the corresponding sequence from the database.
 613            This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation.
 614  Returns : A sequence object
 615  Args    : ID
 616
 617 =cut
 618
 619 sub get_Seq_by_id {
 620     my ($self, $id) = @_;
 621     $self->throw('Need to provide a sequence ID') if not defined $id;
 622     return if not exists $self->{offsets}{$id};
 623     return $self->{obj_class}->new($self, $id);
 624 }
 625
 626 *get_Seq_by_version = *get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
 627
 628
 629 =head2 _calculate_offsets
 630
 631  Title   : _calculate_offsets
 632  Usage   : $db->_calculate_offsets($filename, $offsets);
 633  Function: This method calculates the sequence offsets in a file based on ID and
 634            should be implemented by classes that use Bio::DB::IndexedBase.
 635  Returns : Hash of offsets
 636  Args    : File to process
 637            Hashref of file offsets keyed by IDs.
 638
 639 =cut
 640
 641 sub _calculate_offsets {
 642     my $self = shift;
 643     $self->throw_not_implemented();
 644 }
 645
 646
 647 sub _index_files {
 648     # Do the indexing of the given files using the index file on record
 649     my ($self, $files, $force_reindex) = @_;
 650
 651     $self->_set_pack_method( @$files );
 652
 653     # Get name of index file
 654     my $index = $self->index_name;
 655
 656     # If caller has requested reindexing, unlink the index file.
 657     if ($force_reindex) {
 658         # Tied-hash in Strawberry Perl creates "$file.index"
 659         unlink $index if -e $index;
 660         # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
 661         unlink "$index.dir" if -e "$index.dir";
 662         unlink "$index.pag" if -e "$index.pag";
 663     }
 664
 665     # Get the modification time of the index
 666     my $indextime = (stat $index)[9] || 0;
 667
 668     # Register files and find if there has been any update
 669     my $modtime = 0;
 670     my @updated;
 671     for my $file (@$files) {
 672         # Register file
 673         $self->_path2fileno(basename($file));
 674         # Any update?
 675         my $m = (stat $file)[9] || 0;
 676         if ($m > $modtime) {
 677            $modtime = $m;
 678         }
 679         if ($m > $indextime) {
 680            push @updated, $file;
 681         }
 682     }
 683
 684     # Get termination length from first file
 685     $self->{termination_length} = $self->_calc_termination_length( $files->[0] );
 686
 687     # Reindex contents of changed files if needed
 688     my $reindex      = $force_reindex || (scalar @updated > 0);
 689     $self->{offsets} = $self->_open_index($index, $reindex) or return;
 690     if ($reindex) {
 691         $self->{indexing} = $index;
 692         for my $file (@updated) {
 693             my $fileno = $self->_path2fileno(basename($file));
 694             &{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
 695         }
 696         delete $self->{indexing};
 697     }
 698
 699     # Closing and reopening might help corrupted index file problem on Windows
 700     $self->_close_index($self->{offsets});
 701
 702     return $self->{offsets} = $self->_open_index($index);
 703 }
 704
 705
 706 sub _open_index {
 707     # Open index file in read-only or write mode
 708     my ($self, $index_file, $write) = @_;
 709     my %offsets;
 710     my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
 711     my @dbmargs = $self->dbmargs;
 712     tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
 713         or $self->throw( "Could not open index file $index_file: $!");
 714     return \%offsets;
 715 }
 716
 717
 718 sub _close_index {
 719     # Close index file
 720     my ($self, $index) = @_;
 721     untie %$index;
 722     return 1;
 723 }
 724
 725 # Compiling the below regular expression speeds up _parse_compound_id
 726 my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x;
 727
 728 sub _parse_compound_id {
 729     # Handle compound IDs:
 730     #     $db->seq($id)
 731     #     $db->seq($id, $start, $stop, $strand)
 732     #     $db->seq("$id:$start,$stop")
 733     #     $db->seq("$id:$start..$stop")
 734     #     $db->seq("$id:$start-$stop")
 735     #     $db->seq("$id:$start,$stop/$strand")
 736     #     $db->seq("$id:$start..$stop/$strand")
 737     #     $db->seq("$id:$start-$stop/$strand")
 738     #     $db->seq("$id/$strand")
 739     my ($self, $id, $start, $stop, $strand) = @_;
 740
 741     if ( (not defined $start ) &&
 742          (not defined $stop  ) &&
 743          (not defined $strand) &&
 744          ($id =~ m{$compound_id}) ) {
 745         # Start, stop and strand not provided and ID looks like a compound ID
 746         ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
 747     }
 748
 749     # Start, stop and strand defaults
 750     $stop   ||= $self->length($id) || 0; # 0 if sequence not found in database
 751     $start  ||= ($stop > 0) ? 1 : 0;
 752     $strand ||= 1;
 753
 754     # Convert numbers such as 1_000_000 to 1000000
 755     $start =~ s/_//g;
 756     $stop  =~ s/_//g;
 757
 758     if ($start > $stop) {
 759         # Change the strand
 760         ($start, $stop) = ($stop, $start);
 761         $strand *= -1;
 762     }
 763
 764     return $id, $start, $stop, $strand;
 765 }
 766
 767
 768 sub _guess_alphabet {
 769     # Determine the molecular type of the given sequence string:
 770     #    'dna', 'rna', 'protein' or '' (unknown/empty)
 771     my ($self, $string) = @_;
 772     # Handle IUPAC residues like PrimarySeq does
 773     my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
 774     return $alphabet eq 'dna' ? DNA
 775            : $alphabet eq 'rna' ? RNA
 776            : $alphabet eq 'protein' ? PROTEIN
 777            : NA;
 778 }
 779
 780
 781 sub _makeid {
 782     # Process the header line by applying any transformation given in -makeid
 783     my ($self, $header_line) = @_;
 784     return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
 785 }
 786
 787
 788 sub _check_linelength {
 789     # Check that the line length is valid. Generate an error otherwise.
 790     my ($self, $linelength) = @_;
 791     return if not defined $linelength;
 792     $self->throw(
 793         "Each line of the file must be less than 65,536 characters. Line ".
 794         "$. is $linelength chars."
 795     ) if $linelength > 65535;
 796 }
 797
 798
 799 sub _calc_termination_length {
 800     # Try the beginning of the file to determine termination length
 801     # Account for crlf-terminated Windows and Mac files
 802     my ($self, $file) = @_;
 803     my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
 804
 805     # In Windows, text files have '\r\n' as line separator, but when reading in
 806     # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
 807     # "length $_" will report 4 although the line is 5 bytes in length.
 808     # We assume that all lines have the same line separator and only read current line.
 809     my $init_pos   = tell($fh);
 810     my $curr_line  = <$fh>;
 811     my $pos_diff   = tell($fh) - $init_pos;
 812     my $correction = $pos_diff - length $curr_line;
 813     close $fh;
 814
 815     $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
 816     return $self->{termination_length};
 817 }
 818
 819
 820 sub _calc_offset {
 821     # Get the offset of the n-th residue of the sequence with the given ID
 822     # and termination length (tl)
 823     my ($self, $id, $n) = @_;
 824     my $tl = $self->{termination_length};
 825     $n--;
 826     my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
 827     $n = 0            if $n < 0;
 828     $n = $seqlen-1 if $n >= $seqlen;
 829     return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
 830 }
 831
 832
 833 sub _fh {
 834     # Given a sequence ID, return the filehandle on which to find this sequence
 835     my ($self, $id) = @_;
 836     $self->throw('Need to provide a sequence ID') if not defined $id;
 837     my $file = $self->file($id) or return;
 838     return $self->_fhcache( File::Spec->catfile($self->{dirname}, $file) ) or
 839         $self->throw( "Can't open file $file");
 840 }
 841
 842
 843 sub _fhcache {
 844     my ($self, $path) = @_;
 845     if (!$self->{fhcache}{$path}) {
 846         if ($self->{curopen} >= $self->{maxopen}) {
 847             my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
 848                 keys %{$self->{fhcache}};
 849             splice(@lru, $self->{maxopen} / 3);
 850             $self->{curopen} -= @lru;
 851             for (@lru) {
 852                 delete $self->{fhcache}{$_};
 853             }
 854         }
 855         $self->{fhcache}{$path} = IO::File->new($path) || return;
 856         binmode $self->{fhcache}{$path};
 857         $self->{curopen}++;
 858     }
 859     $self->{cacheseq}{$path}++;
 860     return $self->{fhcache}{$path};
 861 }
 862
 863
 864 #-------------------------------------------------------------
 865 # Methods to store and retrieve data from indexed file
 866 #
 867
 868 =head2 offset
 869
 870  Title   : offset
 871  Usage   : my $offset = $db->offset($id);
 872  Function: Get the offset of the indicated sequence from the beginning of the
 873            file in which it is located. The offset points to the beginning of
 874            the sequence, not the beginning of the header line.
 875  Returns : String
 876  Args    : ID of sequence
 877
 878 =cut
 879
 880 sub offset {
 881     my ($self, $id) = @_;
 882     $self->throw('Need to provide a sequence ID') if not defined $id;
 883     my $offset = $self->{offsets}{$id} or return;
 884     return (&{$self->{unpackmeth}}($offset))[0];
 885 }
 886
 887
 888 =head2 strlen
 889
 890  Title   : strlen
 891  Usage   : my $length = $db->strlen($id);
 892  Function: Get the number of characters in the sequence string.
 893  Returns : Integer
 894  Args    : ID of sequence
 895
 896 =cut
 897
 898 sub strlen {
 899     my ($self, $id) = @_;
 900     $self->throw('Need to provide a sequence ID') if not defined $id;
 901     my $offset = $self->{offsets}{$id} or return;
 902     return (&{$self->{unpackmeth}}($offset))[1];
 903 }
 904
 905
 906 =head2 length
 907
 908  Title   : length
 909  Usage   : my $length = $db->length($id);
 910  Function: Get the number of residues of the sequence.
 911  Returns : Integer
 912  Args    : ID of sequence
 913
 914 =cut
 915
 916 sub length {
 917     my ($self, $id) = @_;
 918     $self->throw('Need to provide a sequence ID') if not defined $id;
 919     my $offset = $self->{offsets}{$id} or return;
 920     return (&{$self->{unpackmeth}}($offset))[2];
 921 }
 922
 923
 924 =head2 linelen
 925
 926  Title   : linelen
 927  Usage   : my $linelen = $db->linelen($id);
 928  Function: Get the length of the line for this sequence.
 929  Returns : Integer
 930  Args    : ID of sequence
 931
 932 =cut
 933
 934 sub linelen {
 935     my ($self, $id) = @_;
 936     $self->throw('Need to provide a sequence ID') if not defined $id;
 937     my $offset = $self->{offsets}{$id} or return;
 938     return (&{$self->{unpackmeth}}($offset))[3];
 939 }
 940
 941
 942 =head2 headerlen
 943
 944  Title   : headerlen
 945  Usage   : my $length = $db->headerlen($id);
 946  Function: Get the length of the header line for the indicated sequence.
 947  Returns : Integer
 948  Args    : ID of sequence
 949
 950 =cut
 951
 952 sub headerlen {
 953     my ($self, $id) = @_;
 954     $self->throw('Need to provide a sequence ID') if not defined $id;
 955     my $offset = $self->{offsets}{$id} or return;
 956     return (&{$self->{unpackmeth}}($offset))[4];
 957 }
 958
 959
 960 =head2 header_offset
 961
 962  Title   : header_offset
 963  Usage   : my $offset = $db->header_offset($id);
 964  Function: Get the offset of the header line for the indicated sequence from
 965            the beginning of the file in which it is located.
 966  Returns : String
 967  Args    : ID of sequence
 968
 969 =cut
 970
 971 sub header_offset {
 972     my ($self, $id) = @_;
 973     $self->throw('Need to provide a sequence ID') if not defined $id;
 974     return if not $self->{offsets}{$id};
 975     return $self->offset($id) - $self->headerlen($id);
 976 }
 977
 978
 979 =head2 alphabet
 980
 981  Title   : alphabet
 982  Usage   : my $alphabet = $db->alphabet($id);
 983  Function: Get the molecular type of the indicated sequence: dna, rna or protein
 984  Returns : String
 985  Args    : ID of sequence
 986
 987 =cut
 988
 989 sub alphabet {
 990     my ($self, $id) = @_;
 991     $self->throw('Need to provide a sequence ID') if not defined $id;
 992     my $offset = $self->{offsets}{$id} or return;
 993     my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
 994     return : $alphabet == Bio::DB::IndexedBase::DNA     ? 'dna'
 995            : $alphabet == Bio::DB::IndexedBase::RNA     ? 'rna'
 996            : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
 997            : '';
 998 }
 999
1000
1001 =head2 file
1002
1003  Title   : file
1004  Usage   : my $file = $db->file($id);
1005  Function: Get the the name of the file in which the indicated sequence can be
1006            found.
1007  Returns : String
1008  Args    : ID of sequence
1009
1010 =cut
1011
1012 sub file {
1013     my ($self, $id) = @_;
1014     $self->throw('Need to provide a sequence ID') if not defined $id;
1015     my $offset = $self->{offsets}{$id} or return;
1016     return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]);
1017 }
1018
1019
1020 sub _fileno2path {
1021     my ($self, $fileno) = @_;
1022     return $self->{fileno2path}->[$fileno];
1023 }
1024
1025
1026 sub _path2fileno {
1027     my ($self, $path) = @_;
1028     if ( not exists $self->{filepath2no}->{$path} ) {
1029         my $fileno = ($self->{filepath2no}->{$path} = 0+ $self->{fileno}++);
1030         $self->{fileno2path}->[$fileno] = $path; # Save path
1031     }
1032     return $self->{filepath2no}->{$path};
1033
1034 }
1035
1036
1037 sub _packSmall {
1038     return pack STRUCT, @_;
1039 }
1040
1041
1042 sub _packBig {
1043     return pack STRUCTBIG, @_;
1044 }
1045
1046
1047 sub _unpackSmall {
1048     return unpack STRUCT, shift;
1049 }
1050
1051
1052 sub _unpackBig {
1053     return unpack STRUCTBIG, shift;
1054 }
1055
1056
1057 sub _set_pack_method {
1058     # Determine whether to use 32 or 64 bit integers for the given files.
1059     my $self = shift;
1060     # Find the maximum file size:
1061     my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
1062     my $fourGB    = (2 ** 32) - 1;
1063
1064     if ($maxsize > $fourGB) {
1065         # At least one file exceeds 4Gb - we will need to use 64 bit ints
1066         $self->{packmeth}   = \&_packBig;
1067         $self->{unpackmeth} = \&_unpackBig;
1068     } else {
1069         $self->{packmeth}   = \&_packSmall;
1070         $self->{unpackmeth} = \&_unpackSmall;
1071     }
1072     return 1;
1073 }
1074
1075
1076 #-------------------------------------------------------------
1077 # Tied hash logic
1078 #
1079
1080 sub TIEHASH {
1081     return shift->new(@_);
1082 }
1083
1084
1085 sub FETCH {
1086     return shift->subseq(@_);
1087 }
1088
1089
1090 sub STORE {
1091     shift->throw("Read-only database");
1092 }
1093
1094
1095 sub DELETE {
1096     shift->throw("Read-only database");
1097 }
1098
1099
1100 sub CLEAR {
1101     shift->throw("Read-only database");
1102 }
1103
1104
1105 sub EXISTS {
1106     return defined shift->offset(@_);
1107 }
1108
1109
1110 sub FIRSTKEY {
1111     return tied(%{shift->{offsets}})->FIRSTKEY(@_);
1112 }
1113
1114
1115 sub NEXTKEY {
1116     return tied(%{shift->{offsets}})->NEXTKEY(@_);
1117 }
1118
1119
1120 sub DESTROY {
1121     my $self = shift;
1122
1123     # Close filehandles
1124     while (my ($file, $fh) = each %{ $self->{fhcache} }) {
1125         if (defined $fh) {
1126             $fh->close;
1127         }
1128     }
1129     $self->_close_index($self->{offsets});
1130
1131     if ( $self->{clean} || $self->{indexing} ) {
1132         # Indexing aborted or cleaning requested. Delete the index file.
1133         my $index = $self->{index_name};
1134
1135         # Tied-hash in Strawberry Perl creates "$file.index"
1136         unlink $index if -e $index;
1137         # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
1138         unlink "$index.dir" if -e "$index.dir";
1139         unlink "$index.pag" if -e "$index.pag";
1140     }
1141     return 1;
1142 }
1143
1144
1145 #-------------------------------------------------------------
1146 # stream-based access to the database
1147 #
1148
1149 package Bio::DB::Indexed::Stream;
1150 use base qw(Tie::Handle Bio::DB::SeqI);
1151
1152
1153 sub new {
1154     my ($class, $db) = @_;
1155     my $key = $db->FIRSTKEY;
1156     return bless {
1157         db  => $db,
1158         key => $key
1159     }, $class;
1160 }
1161
1162 sub next_seq {
1163     my $self = shift;
1164     my ($key, $db) = @{$self}{'key', 'db'};
1165     return if not defined $key;
1166     my $value = $db->get_Seq_by_id($key);
1167     $self->{key} = $db->NEXTKEY($key);
1168     return $value;
1169 }
1170
1171 sub TIEHANDLE {
1172     my ($class, $db) = @_;
1173     return $class->new($db);
1174 }
1175
1176 sub READLINE {
1177     my $self = shift;
1178     return $self->next_seq || undef;
1179 }
1180
1181
1182 1;