Bio/DB/Fasta.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::DB::Fasta
   4 #
   5 # You may distribute this module under the same terms as perl itself
   6 #
   7
   8 # POD documentation - main docs before the code
   9
  10 =head1 NAME
  11
  12 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
  13
  14 =head1 SYNOPSIS
  15
  16   use Bio::DB::Fasta;
  17
  18   # create database from directory of fasta files
  19   my $db      = Bio::DB::Fasta->new('/path/to/fasta/files');
  20
  21   # simple access (for those without Bioperl)
  22   my $seq      = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
  23   my $revseq   = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
  24   my @ids     = $db->ids;
  25   my $length   = $db->length('CHROMOSOME_I');
  26   my $alphabet = $db->alphabet('CHROMOSOME_I');
  27   my $header   = $db->header('CHROMOSOME_I');
  28
  29   # Bioperl-style access
  30   my $db      = Bio::DB::Fasta->new('/path/to/fasta/files');
  31
  32   my $obj     = $db->get_Seq_by_id('CHROMOSOME_I');
  33   my $seq     = $obj->seq; # sequence string
  34   my $subseq  = $obj->subseq(4_000_000 => 4_100_000); # string
  35   my $trunc   = $obj->trunc(4_000_000 => 4_100_000); # seq object
  36   my $length  = $obj->length;
  37   # (etc)
  38
  39   # Bio::SeqIO-style access
  40   my $stream  = Bio::DB::Fasta->new('/path/to/files')->get_PrimarySeq_stream;
  41   while (my $seq = $stream->next_seq) {
  42     # Bio::PrimarySeqI stuff
  43   }
  44
  45   my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
  46   while (my $seq = <$fh>) {
  47     # Bio::PrimarySeqI stuff
  48   }
  49
  50   # tied hash access
  51   tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
  52   print $sequences{'CHROMOSOME_I:1,20000'};
  53
  54 =head1 DESCRIPTION
  55
  56 Bio::DB::Fasta provides indexed access to one or more Fasta files.  It
  57 provides random access to each sequence entry, and to subsequences
  58 within each entry, allowing you to retrieve portions of very large
  59 sequences without bringing the entire sequence into memory.
  60
  61 When you initialize the module, you point it at a single fasta file or
  62 a directory of multiple such files.  The first time it is run, the
  63 module generates an index of the contents of the file or directory
  64 using the AnyDBM module (Berkeley DB* preferred, followed by GDBM_File,
  65 NDBM_File, and SDBM_File).  Thereafter it uses the index file to find
  66 the file and offset for any requested sequence.  If one of the source
  67 fasta files is updated, the module reindexes just that one file.  (You
  68 can also force reindexing manually).  For improved performance, the
  69 module keeps a cache of open filehandles, closing less-recently used
  70 ones when the cache is full.
  71
  72 The fasta files may contain any combination of nucleotide and protein
  73 sequences; during indexing the module guesses the molecular type.
  74 Entries may have any line length up to 65,536 characters, and
  75 different line lengths are allowed in the same file.  However, within
  76 a sequence entry, all lines must be the same length except for the
  77 last.
  78
  79 An error will be thrown if this is not the case.
  80
  81 The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
  82 from the Fasta header.  During indexing, you may pass a callback routine to
  83 modify this primary ID.  For example, you may wish to extract a
  84 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
  85 The original header line can be recovered later.
  86
  87 This module was developed for use with the C. elegans and human
  88 genomes, and has been tested with sequence segments as large as 20
  89 megabases.  Indexing the C. elegans genome (100 megabases of genomic
  90 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
  91 laptop. On the same system, average access time for any 200-mer within
  92 the C. elegans genome was E<lt>0.02s.
  93
  94 *Berkeley DB can be obtained free from www.sleepycat.com. After it is
  95 installed you will need to install the BerkeleyDB Perl module.
  96
  97 =head1 DATABASE CREATION AND INDEXING
  98
  99 The two constructors for this class are new() and newFh().  The former
 100 creates a Bio::DB::Fasta object which is accessed via method calls.
 101 The latter creates a tied filehandle which can be used Bio::SeqIO
 102 style to fetch sequence objects in a stream fashion.  There is also a
 103 tied hash interface.
 104
 105 =over 2
 106
 107 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
 108
 109 Create a new Bio::DB::Fasta object from the Fasta file or files
 110 indicated by $fasta_path.  Indexing will be performed automatically if
 111 needed.  If successful, new() will return the database accessor
 112 object.  Otherwise it will return undef.
 113
 114 $fasta_path may be an individual Fasta file, or may refer to a
 115 directory containing one or more of such files.  Following the path,
 116 you may pass a series of name=E<gt>value options or a hash with these
 117 same name=E<gt>value pairs.  Valid options are:
 118
 119  Option Name   Description               Default
 120  -----------   -----------               -------
 121
 122  -glob         Glob expression to use    *.{fa,fasta,fast,FA,FASTA,FAST,dna}
 123                for searching for Fasta
 124                     files in directories.
 125
 126  -makeid       A code subroutine for     None
 127                     transforming Fasta IDs.
 128
 129  -maxopen      Maximum size of               32
 130                     filehandle cache.
 131
 132  -debug        Turn on status                   0
 133                     messages.
 134
 135  -reindex      Force the index to be     0
 136                rebuilt.
 137
 138  -dbmargs      Additional arguments      none
 139                to pass to the DBM
 140                routines when tied
 141                (scalar or array ref).
 142
 143 -dbmargs can be used to control the format of the index.  For example,
 144 you can pass $DB_BTREE to this argument so as to force the IDs to be
 145 sorted and retrieved alphabetically.  Note that you must use the same
 146 arguments every time you open the index!
 147
 148 -reindex can be used to force the index to be recreated from scratch.
 149
 150 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
 151
 152 Create a tied filehandle opened on a Bio::DB::Fasta object.  Reading
 153 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
 154 Bio::SeqIO style.
 155
 156 =back
 157
 158 The -makeid option gives you a chance to modify sequence IDs during
 159 indexing.  The option value should be a code reference that will
 160 take a scalar argument and return a scalar result, like this:
 161
 162   $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
 163
 164   sub make_my_id {
 165     my $description_line = shift;
 166     # get a different id from the fasta header, e.g.
 167          $description_line =~ /(\S+)$/;
 168     return $1;
 169   }
 170
 171 make_my_id() will be called with the full fasta id line (including the
 172 "E<gt>" symbol!).  For example:
 173
 174  >A12345.3 Predicted C. elegans protein egl-2
 175
 176 By default, this module will use the regular expression /^E<gt>(\S+)/
 177 to extract "A12345.3" for use as the ID.  If you pass a -makeid
 178 callback, you can extract any portion of this, such as the "egl-2"
 179 symbol.
 180
 181 The -makeid option is ignored after the index is constructed.
 182
 183 =head1 OBJECT METHODS
 184
 185 The following object methods are provided.
 186
 187 =over 10
 188
 189 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
 190
 191 Return the raw sequence (a string) given an ID and optionally a start
 192 and stop position in the sequence.  In the case of DNA sequence, if
 193 $stop is less than $start, then the reverse complement of the sequence
 194 is returned (this violates Bio::Seq conventions).
 195
 196 For your convenience, subsequences can be indicated with any of the
 197 following compound IDs:
 198
 199    $db->seq("$id:$start,$stop")
 200
 201    $db->seq("$id:$start..$stop")
 202
 203    $db->seq("$id:$start-$stop")
 204
 205 =item $length = $db-E<gt>length($id)
 206
 207 Return the length of the indicated sequence.
 208
 209 =item $header = $db-E<gt>header($id)
 210
 211 Return the header line for the ID, including the initial "E<gt>".
 212
 213 =item $type  = $db-E<gt>alphabet($id)
 214
 215 Return the molecular type of the indicated sequence.  One of "dna",
 216 "rna" or "protein".
 217
 218 =item $filename  = $db-E<gt>file($id)
 219
 220 Return the name of the file in which the indicated sequence can be
 221 found.
 222
 223 =item $offset    = $db-E<gt>offset($id)
 224
 225 Return the offset of the indicated sequence from the beginning of the
 226 file in which it is located.  The offset points to the beginning of
 227 the sequence, not the beginning of the header line.
 228
 229 =item $header_length = $db-E<gt>headerlen($id)
 230
 231 Return the length of the header line for the indicated sequence.
 232
 233 =item $header_offset = $db-E<gt>header_offset($id)
 234
 235 Return the offset of the header line for the indicated sequence from
 236 the beginning of the file in which it is located.
 237
 238 =item $index_name  = $db-E<gt>index_name
 239
 240 Return the path to the index file.
 241
 242 =item $path = $db-E<gt>path
 243
 244 Return the path to the Fasta file(s).
 245
 246 =back
 247
 248 For BioPerl-style access, the following methods are provided:
 249
 250 =over 4
 251
 252 =item $seq = $db-E<gt>get_Seq_by_id($id)
 253
 254 Return a Bio::PrimarySeq::Fasta object, which obeys the
 255 Bio::PrimarySeqI conventions.  For example, to recover the raw DNA or
 256 protein sequence, call $seq-E<gt>seq().
 257
 258 Note that get_Seq_by_id() does not bring the entire sequence into
 259 memory until requested.  Internally, the returned object uses the
 260 accessor to generate subsequences as needed.
 261
 262 =item $seq = $db-E<gt>get_Seq_by_acc($id)
 263
 264 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
 265
 266 These methods all do the same thing as get_Seq_by_id().
 267
 268 =item $stream = $db-E<gt>get_PrimarySeq_stream()
 269
 270 Return a Bio::DB::Fasta::Stream object, which supports a single method
 271 next_seq(). Each call to next_seq() returns a new
 272 Bio::PrimarySeq::Fasta object, until no more sequences remain.
 273
 274 =back
 275
 276 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
 277 returned from get_Seq_by_id() and get_PrimarySeq_stream().
 278
 279 =head1 TIED INTERFACES
 280
 281 This module provides two tied interfaces, one which allows you to
 282 treat the sequence database as a hash, and the other which allows you
 283 to treat the database as an I/O stream.
 284
 285 =head2 Creating a Tied Hash
 286
 287 The tied hash interface is very straightforward
 288
 289 =over 1
 290
 291 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
 292
 293 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
 294 The optional @args list is the same set of named argument/value pairs
 295 used by Bio::DB::Fasta-E<gt>new().
 296
 297 If successful, tie() will return the tied object.  Otherwise it will
 298 return undef.
 299
 300 =back
 301
 302 Once tied, you can use the hash to retrieve an individual sequence by
 303 its ID, like this:
 304
 305   my $seq = $db{CHROMOSOME_I};
 306
 307 You may select a subsequence by appending the comma-separated range to
 308 the sequence ID in the format "$id:$start,$stop".  For example, here
 309 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
 310
 311   my $seq = $db{'CHROMOSOME_I:1,1000'};
 312
 313 (The regular expression used to parse this format allows sequence IDs
 314 to contain colons.)
 315
 316 When selecting subsequences, if $start E<gt> stop, then the reverse
 317 complement will be returned for DNA sequences.
 318
 319 The keys() and values() functions will return the sequence IDs and
 320 their sequences, respectively.  In addition, each() can be used to
 321 iterate over the entire data set:
 322
 323  while (my ($id,$sequence) = each %db) {
 324     print "$id => $sequence\n";
 325  }
 326
 327 When dealing with very large sequences, you can avoid bringing them
 328 into memory by calling each() in a scalar context.  This returns the
 329 key only.  You can then use tied(%db) to recover the Bio::DB::Fasta
 330 object and call its methods.
 331
 332  while (my $id = each %db) {
 333     print "$id => $db{$sequence:1,100}\n";
 334     print "$id => ",tied(%db)->length($id),"\n";
 335  }
 336
 337 You may, in addition invoke Bio::DB::Fasta the FIRSTKEY and NEXTKEY tied
 338 hash methods directly.
 339
 340 =over 2
 341
 342 =item $id = $db-E<gt>FIRSTKEY
 343
 344 Return the first ID in the database.
 345
 346 =item $id = $db-E<gt>NEXTKEY($id)
 347
 348 Given an ID, return the next ID in sequence.
 349
 350 =back
 351
 352 This allows you to write the following iterative loop using just the
 353 object-oriented interface:
 354
 355  my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
 356  for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
 357     # do something with sequence
 358  }
 359
 360 =head2 Creating a Tied Filehandle
 361
 362 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
 363 which you can read Bio::PrimarySeq::Fasta sequence objects
 364 sequentially.  The following bit of code will iterate sequentially
 365 over all sequences in the database:
 366
 367  my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
 368  while (my $seq = <$fh>) {
 369    print $seq->id,' => ',$seq->length,"\n";
 370  }
 371
 372 When no more sequences remain to be retrieved, the stream will return
 373 undef.
 374
 375 =head1 BUGS
 376
 377 When a sequence is deleted from one of the Fasta files, this deletion
 378 is not detected by the module and removed from the index.  As a
 379 result, a "ghost" entry will remain in the index and will return
 380 garbage results if accessed.
 381
 382 Currently, the only way to accomodate deletions is to rebuild the
 383 entire index, either by deleting it manually, or by passing
 384 -reindex=E<gt>1 to new() when initializing the module.
 385
 386 =head1 SEE ALSO
 387
 388 L<bioperl>
 389
 390 =head1 AUTHOR
 391
 392 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
 393
 394 Copyright (c) 2001 Cold Spring Harbor Laboratory.
 395
 396 This library is free software; you can redistribute it and/or modify
 397 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 398 disclaimers of warranty.
 399
 400 =cut
 401
 402 #'
 403 package Bio::DB::Fasta;
 404
 405 BEGIN {
 406   @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
 407 }
 408
 409 use strict;
 410 use IO::File;
 411 use AnyDBM_File;
 412 use Fcntl;
 413 use File::Basename qw(basename dirname);
 414
 415 use base qw(Bio::DB::SeqI Bio::Root::Root);
 416
 417 *seq = *sequence = \&subseq;
 418 *ids = \&get_all_ids;
 419 *get_seq_by_primary_id = *get_Seq_by_acc  = \&get_Seq_by_id;
 420
 421 use constant STRUCT =>'NNnnCa*';
 422 use constant STRUCTBIG =>'QQnnCa*'; # 64-bit file offset and seq length
 423 use constant DNA     => 1;
 424 use constant RNA     => 2;
 425 use constant PROTEIN => 3;
 426 use constant DIE_ON_MISSMATCHED_LINES => 1; # if you want
 427
 428 # Bio::DB-like object
 429 # providing fast random access to a directory of FASTA files
 430
 431 =head2 new
 432
 433  Title   : new
 434  Usage   : my $db = Bio::DB::Fasta->new( $path, @options);
 435  Function: initialize a new Bio::DB::Fasta object
 436  Returns : new Bio::DB::Fasta object
 437  Args    : path to dir of fasta files or a single filename
 438
 439 These are optional arguments to pass in as well.
 440
 441  -glob         Glob expression to use    *.{fa,fasta,fast,FA,FASTA,FAST}
 442                for searching for Fasta
 443                     files in directories.
 444
 445  -makeid       A code subroutine for     None
 446                     transforming Fasta IDs.
 447
 448  -maxopen      Maximum size of               32
 449                     filehandle cache.
 450
 451  -debug        Turn on status                   0
 452                     messages.
 453
 454  -reindex      Force the index to be     0
 455                rebuilt.
 456
 457  -dbmargs      Additional arguments      none
 458                to pass to the DBM
 459                routines when tied
 460                (scalar or array ref).
 461
 462 =cut
 463
 464 sub new {
 465   my $class = shift;
 466   my $path  = shift;
 467   my %opts  = @_;
 468
 469   my $self = bless { debug      => $opts{-debug},
 470           makeid     => $opts{-makeid},
 471           glob       => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,FNA,fna,FAA,faa,FSA,fsa}',
 472           maxopen    => $opts{-maxfh}   || 32,
 473           dbmargs    => $opts{-dbmargs} || undef,
 474           fhcache    => {},
 475           cacheseq   => {},
 476           curopen    => 0,
 477           openseq    => 1,
 478           dirname    => undef,
 479           offsets    => undef,
 480                    }, $class;
 481   my ($offsets,$dirname);
 482
 483   if (-d $path) {
 484     # because Win32 glob() is broken with respect to long file names
 485     # that contain whitespace.
 486     $path = Win32::GetShortPathName($path)
 487       if $^O =~ /^MSWin/i && eval 'use Win32; 1';
 488     $offsets = $self->index_dir($path,$opts{-reindex}) or return;
 489     $dirname = $path;
 490   } elsif (-f _) {
 491     $offsets = $self->index_file($path,$opts{-reindex});
 492     $dirname = dirname($path);
 493   } else {
 494     $self->throw( "$path: Invalid file or dirname");
 495   }
 496   @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
 497
 498   $self;
 499 }
 500
 501 =head2 newFh
 502
 503  Title   : newFh
 504  Function: gets a new Fh for a file
 505  Example : internal method
 506  Returns : GLOB
 507  Args    :
 508
 509 =cut
 510
 511 sub newFh {
 512   my $class = shift;
 513   my $self  = $class->new(@_);
 514   require Symbol;
 515   my $fh = Symbol::gensym or return;
 516   tie $$fh,'Bio::DB::Fasta::Stream',$self or return;
 517   $fh;
 518 }
 519
 520 sub _open_index {
 521   my $self = shift;
 522   my ($index,$write) = @_;
 523   my %offsets;
 524   my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
 525   my @dbmargs = $self->dbmargs;
 526   eval {
 527       tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs
 528           or die "Can't open sequence index file $index: $!";
 529   };
 530   warn $@ if $@;
 531   return \%offsets;
 532 }
 533
 534 sub _close_index {
 535   my $self = shift;
 536   my $index = shift;
 537   untie %$index;
 538 }
 539
 540 =head2 index_dir
 541
 542  Title   : index_dir
 543  Usage   : $db->index_dir($dir)
 544  Function: set the index dir and load all files in the dir
 545  Returns : hashref of seq offsets in each file
 546  Args    : dirname, boolean to force a reload of all files
 547
 548 =cut
 549
 550 sub index_dir {
 551   my $self = shift;
 552   my $dir  = shift;
 553   my $force_reindex = shift;
 554
 555   # find all fasta files
 556   my @files = glob("$dir/$self->{glob}");
 557 #  $self->throw( "no fasta files in $dir") unless @files;
 558   return unless @files;
 559
 560   # get name of index
 561   my $index = $self->index_name($dir,1);
 562
 563   # if caller has requested reindexing, then unlink
 564   # the index file.
 565   unlink $index if $force_reindex;
 566
 567   # get the modification time of the index
 568   my $indextime   = 0;
 569   for my $suffix('','.pag','.dir') {
 570     $indextime ||= (stat("${index}${suffix}"))[9];
 571   }
 572   $indextime ||= 0;  # prevent some uninit variable warnings
 573
 574   # get the most recent modification time of any of the contents
 575   my $modtime = 0;
 576   my %modtime;
 577   $self->set_pack_method( @files );
 578   foreach (@files) {
 579     my $m = (stat($_))[9];
 580     $modtime{$_} = $m;
 581     $modtime = $m if defined $m && $modtime < $m;
 582   }
 583
 584   my $reindex      = $force_reindex || $indextime < $modtime;
 585   $self->{offsets} = $self->_open_index($index,$reindex) or return;
 586
 587   # no indexing needed
 588   return $self->{offsets} unless $reindex;
 589
 590   # otherwise reindex contents of changed files
 591   $self->{indexing} = $index;
 592   foreach (@files) {
 593     next if( defined $indextime && $modtime{$_} <= $indextime);
 594     $self->calculate_offsets($_,$self->{offsets});
 595   }
 596   delete $self->{indexing};
 597
 598   # we've been having troubles with corrupted index files on Windows systems,
 599   # so possibly closing and reopening will help
 600   $self->_close_index($self->{offsets});
 601
 602   return $self->{offsets}  = $self->_open_index($index);
 603 }
 604
 605 =head2 get_Seq_by_id
 606
 607  Title   : get_Seq_by_id
 608  Usage   : my $seq = $db->get_Seq_by_id($id)
 609  Function: Bio::DB::RandomAccessI method implemented
 610  Returns : Bio::PrimarySeqI object
 611  Args    : id
 612
 613 =cut
 614
 615 sub get_Seq_by_id {
 616   my $self = shift;
 617   my $id   = shift;
 618   return unless exists $self->{offsets}{$id};
 619   return Bio::PrimarySeq::Fasta->new($self,$id);
 620 }
 621
 622 =head2 set_pack_method
 623
 624  Title   : set_pack_method
 625  Usage   : $db->set_pack_method( @files )
 626  Function: Determines whether data packing uses 32 or 64 bit integers
 627  Returns :
 628  Args    : one or more file paths
 629
 630 =cut
 631
 632 sub set_pack_method {
 633   my $self = shift;
 634   # Find the maximum file size:
 635   my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
 636   my $fourGB    = (2 ** 32) - 1;
 637
 638   if ($maxsize > $fourGB) {
 639       # At least one file exceeds 4Gb - we will need to use 64 bit ints
 640       $self->{packmeth}   = \&_packBig;
 641       $self->{unpackmeth} = \&_unpackBig;
 642   } else {
 643       $self->{packmeth}   = \&_pack;
 644       $self->{unpackmeth} = \&_unpack;
 645   }
 646 }
 647
 648 =head2 index_file
 649
 650  Title   : index_file
 651  Usage   : $db->index_file($filename)
 652  Function: (re)loads a sequence file and indexes sequences offsets in the file
 653  Returns : seq offsets in the file
 654  Args    : filename,
 655            boolean to force reloading a file
 656
 657 =cut
 658
 659
 660 sub index_file {
 661   my $self = shift;
 662   my $file = shift;
 663   my $force_reindex = shift;
 664
 665   $self->set_pack_method( $file );
 666   my $index = $self->index_name($file);
 667   # if caller has requested reindexing, then unlink the index
 668   unlink $index if $force_reindex;
 669
 670   # get the modification time of the index
 671   my $indextime = (stat($index))[9] || 0;
 672   my $modtime   = (stat($file))[9]  || 0;
 673
 674   my $reindex = $force_reindex || $indextime < $modtime;
 675   my $offsets = $self->_open_index($index,$reindex) or return;
 676   $self->{offsets} = $offsets;
 677
 678   return $self->{offsets} unless $reindex;
 679
 680   $self->{indexing} = $index;
 681   $self->calculate_offsets($file,$offsets);
 682   delete $self->{indexing};
 683   return $self->{offsets};
 684 }
 685
 686 =head2 dbmargs
 687
 688  Title   : dbmargs
 689  Usage   : my @args = $db->dbmargs;
 690  Function: gets stored dbm arguments
 691  Returns : array
 692  Args    : none
 693
 694 =cut
 695
 696 sub dbmargs {
 697   my $self = shift;
 698   my $args = $self->{dbmargs} or return;
 699   return ref($args) eq 'ARRAY' ? @$args : $args;
 700 }
 701
 702 =head2 index_name
 703
 704  Title   : index_name
 705  Usage   : my $indexname = $db->index_name($path,$isdir);
 706  Function: returns the name of the index for a specific path
 707  Returns : string
 708  Args    : path to check,
 709            boolean if it is a dir
 710
 711 =cut
 712
 713 sub index_name {
 714   my $self  = shift;
 715   my ($path,$isdir) = @_;
 716   unless ($path) {
 717     my $dir = $self->{dirname} or return;
 718     return $self->index_name($dir,-d $dir);
 719   }
 720   return "$path/directory.index" if $isdir;
 721   return "$path.index";
 722 }
 723
 724 =head2 calculate_offsets
 725
 726  Title   : calculate_offsets
 727  Usage   : $db->calculate_offsets($filename,$offsets);
 728  Function: calculates the sequence offsets in a file based on id
 729  Returns : offset hash for each file
 730  Args    : file to process
 731            $offsets - hashref of id to offset storage
 732
 733 =cut
 734
 735 sub calculate_offsets {
 736   my $self = shift;
 737   my ($file,$offsets) = @_;
 738   my $base = $self->path2fileno(basename($file));
 739
 740   my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
 741   binmode $fh;
 742   warn "indexing $file\n" if $self->{debug};
 743   my ($offset,@id,$linelength,$type,$firstline,$count,
 744       $termination_length,$seq_lines,$last_line,%offsets);
 745   my ($l3_len,$l2_len,$l_len)=(0,0,0);
 746
 747   while (<$fh>) {               # don't try this at home
 748     $termination_length ||= /\r\n$/ ? 2 : 1; # account for crlf-terminated Windows files
 749     if (/^>(\S+)/) {
 750       print STDERR "indexed $count sequences...\n"
 751         if $self->{debug} && (++$count%1000) == 0;
 752       my $pos = tell($fh);
 753       if (@id) {
 754         my $seqlength    = $pos - $offset - length($_);
 755         $seqlength      -= $termination_length * $seq_lines;
 756         my $ppos = &{$self->{packmeth}}($offset,$seqlength,
 757                                        $linelength,$firstline,
 758                                        $type,$base);
 759         for my $id (@id) { $offsets->{$id}  = $ppos }
 760       }
 761       @id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
 762       ($offset,$firstline,$linelength) = ($pos,length($_),0);
 763       $self->_check_linelength($linelength);
 764       ($l3_len,$l2_len,$l_len)=(0,0,0);
 765       $seq_lines = 0;
 766     } else {
 767       $l3_len= $l2_len; $l2_len= $l_len; $l_len= length($_); # need to check every line :(
 768       if (DIE_ON_MISSMATCHED_LINES &&
 769           $l3_len>0 && $l2_len>0 && $l3_len!=$l2_len) {
 770         my $fap= substr($_,0,20)."..";
 771         $self->throw("Each line of the fasta entry must be the same length except the last.
 772     Line above #$. '$fap' is $l2_len != $l3_len chars.");
 773   }
 774       $linelength ||= length($_);
 775       $type       ||= $self->_type($_);
 776       $seq_lines++;
 777     }
 778     $last_line = $_;
 779   }
 780
 781   $self->_check_linelength($linelength);
 782   # deal with last entry
 783   if (@id) {
 784     my $pos = tell($fh);
 785     my $seqlength   = $pos - $offset;
 786     if ($linelength == 0) { # yet another pesky empty chr_random.fa file
 787       $seqlength = 0;
 788     } else {
 789       if ($last_line !~ /\s$/) {
 790         $seq_lines--;
 791       }
 792       $seqlength -= $termination_length * $seq_lines;
 793     };
 794     my $ppos = &{$self->{packmeth}}($offset,$seqlength,
 795                                            $linelength,$firstline,
 796                                            $type,$base);
 797     for my $id (@id) { $offsets->{$id}  = $ppos }
 798   }
 799   $offsets->{__termination_length} = $termination_length;
 800   return \%offsets;
 801 }
 802
 803 =head2 get_all_ids
 804
 805  Title   : get_all_ids
 806  Usage   : my @ids = $db->get_all_ids
 807  Function: gets all the stored ids in all indexes
 808  Returns : list of ids
 809  Args    : none
 810
 811 =cut
 812
 813 sub get_all_ids  { grep {!/^__/} keys %{shift->{offsets}} }
 814
 815 sub offset {
 816   my $self = shift;
 817   my $id   = shift;
 818   my $offset = $self->{offsets}{$id} or return;
 819   (&{$self->{unpackmeth}}($offset))[0];
 820 }
 821
 822 sub length {
 823   my $self = shift;
 824   my $id   = shift;
 825   my $offset = $self->{offsets}{$id} or return;
 826   (&{$self->{unpackmeth}}($offset))[1];
 827 }
 828
 829 sub linelen {
 830   my $self = shift;
 831   my $id   = shift;
 832   my $offset = $self->{offsets}{$id} or return;
 833   (&{$self->{unpackmeth}}($offset))[2];
 834 }
 835
 836 sub headerlen {
 837   my $self = shift;
 838   my $id   = shift;
 839   my $offset = $self->{offsets}{$id} or return;
 840   (&{$self->{unpackmeth}}($offset))[3];
 841 }
 842
 843 sub alphabet {
 844   my $self = shift;
 845   my $id   = shift;
 846   my $offset = $self->{offsets}{$id} or return;
 847   my $type = (&{$self->{unpackmeth}}($offset))[4];
 848   return $type == DNA ? 'dna'
 849          : $type == RNA ? 'rna'
 850          : 'protein';
 851
 852 }
 853
 854 sub path { shift->{dirname} }
 855
 856 sub header_offset {
 857     my $self = shift;
 858     my $id   = shift;
 859     return unless $self->{offsets}{$id};
 860     return $self->offset($id) - $self->headerlen($id);
 861 }
 862
 863 sub file {
 864   my $self = shift;
 865   my $id   = shift;
 866   my $offset = $self->{offsets}{$id} or return;
 867   $self->fileno2path((&{$self->{unpackmeth}}($offset))[5]);
 868 }
 869
 870 sub fileno2path {
 871   my $self = shift;
 872   my $no   = shift;
 873   return $self->{offsets}{"__file_$no"};
 874 }
 875
 876 sub path2fileno {
 877   my $self = shift;
 878   my $path = shift;
 879   if ( !defined $self->{offsets}{"__path_$path"} ) {
 880     my $fileno  = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++);
 881     $self->{offsets}{"__file_$fileno"} = $path;
 882   }
 883   return $self->{offsets}{"__path_$path"}
 884 }
 885
 886 sub _check_linelength {
 887   my $self       = shift;
 888   my $linelength = shift;
 889   return unless defined $linelength;
 890   $self->throw("Each line of the fasta file must be less than 65,536 characters.  Line $. is $linelength chars.")       if $linelength > 65535.
 891
 892 }
 893
 894 =head2 subseq
 895
 896  Title   : subseq
 897  Usage   : $seqdb->subseq($id,$start,$stop);
 898  Function: returns a subseq of a sequence in the db
 899  Returns : subsequence data
 900  Args    : id of sequence, starting point, ending point
 901
 902 =cut
 903
 904 sub subseq {
 905   my ($self,$id,$start,$stop) = @_;
 906   if ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) {
 907     ($id,$start,$stop) = ($1,$2,$3);
 908     $start =~ s/_//g;
 909     $stop =~ s/_//g;
 910   }
 911   $start ||= 1;
 912   $stop  ||= $self->length($id);
 913
 914   my $reversed;
 915   if (defined $stop && $start > $stop) {
 916     ($start,$stop) = ($stop,$start);
 917     $reversed++;
 918   }
 919
 920   my $data;
 921
 922   my $fh = $self->fh($id) or return;
 923   my $filestart = $self->caloffset($id,$start);
 924   my $filestop  = $self->caloffset($id,$stop);
 925
 926   seek($fh,$filestart,0);
 927   read($fh,$data,$filestop-$filestart+1);
 928   $data =~ s/\n//g;
 929   $data =~ s/\r//g;
 930   if ($reversed) {
 931     $data = reverse $data;
 932     $data =~ tr/gatcGATC/ctagCTAG/;
 933   }
 934   $data;
 935 }
 936
 937 sub fh {
 938   my $self = shift;
 939   my $id   = shift;
 940   my $file = $self->file($id) or return;
 941   $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file");
 942 }
 943
 944 sub header {
 945   my $self = shift;
 946   my $id   = shift;
 947   my ($offset,$seqlength,$linelength,$firstline,$type,$file)
 948     = &{$self->{unpackmeth}}($self->{offsets}{$id}) or return;
 949   $offset -= $firstline;
 950   my $data;
 951   my $fh = $self->fh($id) or return;
 952   seek($fh,$offset,0);
 953   read($fh,$data,$firstline);
 954   chomp $data;
 955   substr($data,0,1) = '';
 956   $data;
 957 }
 958
 959 sub caloffset {
 960   my $self = shift;
 961   my $id   = shift;
 962   my $a    = shift()-1;
 963   my ($offset,$seqlength,$linelength,$firstline,$type,$file) = &{$self->{unpackmeth}}($self->{offsets}{$id});
 964   $a = 0            if $a < 0;
 965   $a = $seqlength-1 if $a >= $seqlength;
 966   my $tl = $self->{offsets}{__termination_length};
 967   $offset + $linelength * int($a/($linelength-$tl)) + $a % ($linelength-$tl);
 968 }
 969
 970 sub fhcache {
 971   my $self = shift;
 972   my $path = shift;
 973   if (!$self->{fhcache}{$path}) {
 974     if ($self->{curopen} >= $self->{maxopen}) {
 975       my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}};
 976       splice(@lru, $self->{maxopen} / 3);
 977       $self->{curopen} -= @lru;
 978       for (@lru) { delete $self->{fhcache}{$_} }
 979     }
 980     $self->{fhcache}{$path} = IO::File->new($path) or return;
 981     binmode $self->{fhcache}{$path};
 982     $self->{curopen}++;
 983   }
 984   $self->{cacheseq}{$path}++;
 985   $self->{fhcache}{$path}
 986 }
 987
 988 sub _pack {
 989   pack STRUCT,@_;
 990 }
 991
 992 sub _packBig {
 993   pack STRUCTBIG,@_;
 994 }
 995
 996 sub _unpack {
 997   unpack STRUCT,shift;
 998 }
 999
1000 sub _unpackBig {
1001   unpack STRUCTBIG,shift;
1002 }
1003
1004 sub _type {
1005   shift;
1006   local $_ = shift;
1007   return /^[gatcnGATCN*-]+$/   ? DNA
1008          : /^[gaucnGAUCN*-]+$/ ? RNA
1009          : PROTEIN;
1010 }
1011
1012 =head2 get_PrimarySeq_stream
1013
1014  Title   : get_PrimarySeq_stream
1015  Usage   :
1016  Function:
1017  Example :
1018  Returns :
1019  Args    :
1020
1021 =cut
1022
1023 sub get_PrimarySeq_stream {
1024   my $self = shift;
1025   return Bio::DB::Fasta::Stream->new($self);
1026 }
1027
1028 sub TIEHASH {
1029   my $self = shift;
1030   return $self->new(@_);
1031 }
1032
1033 sub FETCH {
1034   shift->subseq(@_);
1035 }
1036 sub STORE {
1037     shift->throw("Read-only database");
1038 }
1039 sub DELETE {
1040     shift->throw("Read-only database");
1041 }
1042 sub CLEAR {
1043     shift->throw("Read-only database");
1044 }
1045 sub EXISTS {
1046   defined shift->offset(@_);
1047 }
1048 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
1049 sub NEXTKEY  { tied(%{shift->{offsets}})->NEXTKEY(@_);  }
1050
1051 sub DESTROY {
1052   my $self = shift;
1053   if ($self->{indexing}) {  # killed prematurely, so index file is no good!
1054     warn "indexing was interrupted, so unlinking $self->{indexing}";
1055     unlink $self->{indexing};
1056   }
1057 }
1058
1059 #-------------------------------------------------------------
1060 # Bio::PrimarySeqI compatibility
1061 #
1062 package Bio::PrimarySeq::Fasta;
1063 use overload '""' => 'display_id';
1064
1065 use base qw(Bio::Root::Root Bio::PrimarySeqI);
1066
1067 sub new {
1068   my $class = shift;
1069   $class = ref($class) if ref $class;
1070   my ($db,$id,$start,$stop) = @_;
1071   return bless { db    => $db,
1072                  id    => $id,
1073                  start => $start || 1,
1074                  stop  => $stop  || $db->length($id)
1075                },$class;
1076 }
1077
1078 sub seq {
1079   my $self = shift;
1080   return $self->{db}->seq($self->{id},$self->{start},$self->{stop});
1081 }
1082
1083 sub subseq {
1084   my $self = shift;
1085   $self->trunc(@_)->seq();
1086 }
1087
1088 sub trunc {
1089   my $self = shift;
1090   my ($start,$stop) = @_;
1091   $self->throw("Stop cannot be smaller than start")  unless $start <= $stop;
1092   return $self->{start} <= $self->{stop} ?  $self->new($self->{db},
1093                                                        $self->{id},
1094                                                        $self->{start}+$start-1,
1095                                                        $self->{start}+$stop-1)
1096                                          :  $self->new($self->{db},
1097                                                        $self->{id},
1098                                                        $self->{start}-($start-1),
1099                                                        $self->{start}-($stop-1)
1100                                                       );
1101
1102 }
1103
1104 sub is_circular {
1105   my $self = shift;
1106   return $self->{is_circular};
1107 }
1108
1109 sub display_id {
1110   my $self = shift;
1111   return $self->{id};
1112 }
1113
1114 sub accession_number {
1115   my $self = shift;
1116   return "unknown";
1117 }
1118
1119 sub primary_id {
1120   my $self = shift;
1121   return overload::StrVal($self);
1122 }
1123
1124 sub can_call_new { return 0 }
1125
1126 sub alphabet {
1127   my $self = shift;
1128   return $self->{db}->alphabet($self->{id});
1129 }
1130
1131 sub revcom {
1132   my $self = shift;
1133   return $self->new(@{$self}{'db','id','stop','start'});
1134 }
1135
1136 sub length {
1137   my $self = shift;
1138   #return $self->{db}->length($self->{id}); # wrong because ignores sequence start and stop values
1139   return length($self->seq);
1140
1141 }
1142
1143 sub description  {
1144     my $self = shift;
1145     my $header = $self->{'db'}->header($self->{id});
1146     # remove the id from the header
1147     return (split(/\s+/,$header,2))[2];
1148 }
1149
1150 *desc = \&description;
1151
1152 #-------------------------------------------------------------
1153 # stream-based access to the database
1154 #
1155 package Bio::DB::Fasta::Stream;
1156 use base qw(Tie::Handle Bio::DB::SeqI);
1157
1158
1159 sub new {
1160   my $class = shift;
1161   my $db    = shift;
1162   my $key = $db->FIRSTKEY;
1163   return bless { db=>$db,key=>$key },$class;
1164 }
1165
1166 sub next_seq {
1167   my $self = shift;
1168   my ($key,$db) = @{$self}{'key','db'};
1169   while ($key =~ /^__/) {
1170     $key = $db->NEXTKEY($key);
1171     return unless defined $key;
1172   }
1173   my $value = $db->get_Seq_by_id($key);
1174   $self->{key} = $db->NEXTKEY($key);
1175   $value;
1176 }
1177
1178 sub TIEHANDLE {
1179   my $class = shift;
1180   my $db    = shift;
1181   return $class->new($db);
1182 }
1183 sub READLINE {
1184   my $self = shift;
1185   $self->next_seq;
1186 }
1187
1188 1;
1189
1190 __END__
1191