Bio/DB/Fasta.pm

   1 #
   2 # BioPerl module for Bio::DB::Fasta
   3 #
   4 # You may distribute this module under the same terms as perl itself
   5 #
   6
   7 # POD documentation - main docs before the code
   8
   9 =head1 NAME
  10
  11 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
  12
  13 =head1 SYNOPSIS
  14
  15   use Bio::DB::Fasta;
  16
  17   # create database from directory of fasta files
  18   my $db      = Bio::DB::Fasta->new('/path/to/fasta/files');
  19
  20   # simple access (for those without Bioperl)
  21   my $seq      = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
  22   my $revseq   = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
  23   my @ids     = $db->ids;
  24   my $length   = $db->length('CHROMOSOME_I');
  25   my $alphabet = $db->alphabet('CHROMOSOME_I');
  26   my $header   = $db->header('CHROMOSOME_I');
  27
  28   # Bioperl-style access
  29   my $db      = Bio::DB::Fasta->new('/path/to/fasta/files');
  30
  31   my $obj     = $db->get_Seq_by_id('CHROMOSOME_I');
  32   my $seq     = $obj->seq; # sequence string
  33   my $subseq  = $obj->subseq(4_000_000 => 4_100_000); # string
  34   my $trunc   = $obj->trunc(4_000_000 => 4_100_000); # seq object
  35   my $length  = $obj->length;
  36   # (etc)
  37
  38   # Bio::SeqIO-style access
  39   my $stream  = Bio::DB::Fasta->new('/path/to/files')->get_PrimarySeq_stream;
  40   while (my $seq = $stream->next_seq) {
  41     # Bio::PrimarySeqI stuff
  42   }
  43
  44   my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
  45   while (my $seq = <$fh>) {
  46     # Bio::PrimarySeqI stuff
  47   }
  48
  49   # tied hash access
  50   tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
  51   print $sequences{'CHROMOSOME_I:1,20000'};
  52
  53 =head1 DESCRIPTION
  54
  55 Bio::DB::Fasta provides indexed access to one or more Fasta files.  It
  56 provides random access to each sequence entry, and to subsequences
  57 within each entry, allowing you to retrieve portions of very large
  58 sequences without bringing the entire sequence into memory.
  59
  60 When you initialize the module, you point it at a single fasta file or
  61 a directory of multiple such files.  The first time it is run, the
  62 module generates an index of the contents of the file or directory
  63 using the AnyDBM module (Berkeley DB* preferred, followed by GDBM_File,
  64 NDBM_File, and SDBM_File).  Thereafter it uses the index file to find
  65 the file and offset for any requested sequence.  If one of the source
  66 fasta files is updated, the module reindexes just that one file.  (You
  67 can also force reindexing manually).  For improved performance, the
  68 module keeps a cache of open filehandles, closing less-recently used
  69 ones when the cache is full.
  70
  71 The fasta files may contain any combination of nucleotide and protein
  72 sequences; during indexing the module guesses the molecular type.
  73 Entries may have any line length up to 65,536 characters, and
  74 different line lengths are allowed in the same file.  However, within
  75 a sequence entry, all lines must be the same length except for the
  76 last.
  77
  78 An error will be thrown if this is not the case.
  79
  80 The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
  81 from the Fasta header.  During indexing, you may pass a callback routine to
  82 modify this primary ID.  For example, you may wish to extract a
  83 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
  84 The original header line can be recovered later.
  85
  86 This module was developed for use with the C. elegans and human
  87 genomes, and has been tested with sequence segments as large as 20
  88 megabases.  Indexing the C. elegans genome (100 megabases of genomic
  89 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
  90 laptop. On the same system, average access time for any 200-mer within
  91 the C. elegans genome was E<lt>0.02s.
  92
  93 *Berkeley DB can be obtained free from www.sleepycat.com. After it is
  94 installed you will need to install the BerkeleyDB Perl module.
  95
  96 =head1 DATABASE CREATION AND INDEXING
  97
  98 The two constructors for this class are new() and newFh().  The former
  99 creates a Bio::DB::Fasta object which is accessed via method calls.
 100 The latter creates a tied filehandle which can be used Bio::SeqIO
 101 style to fetch sequence objects in a stream fashion.  There is also a
 102 tied hash interface.
 103
 104 =over 2
 105
 106 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
 107
 108 Create a new Bio::DB::Fasta object from the Fasta file or files
 109 indicated by $fasta_path.  Indexing will be performed automatically if
 110 needed.  If successful, new() will return the database accessor
 111 object.  Otherwise it will return undef.
 112
 113 $fasta_path may be an individual Fasta file, or may refer to a
 114 directory containing one or more of such files.  Following the path,
 115 you may pass a series of name=E<gt>value options or a hash with these
 116 same name=E<gt>value pairs.  Valid options are:
 117
 118  Option Name   Description               Default
 119  -----------   -----------               -------
 120
 121  -glob         Glob expression to use    *.{fa,fasta,fast,FA,FASTA,FAST,dna}
 122                for searching for Fasta
 123                     files in directories.
 124
 125  -makeid       A code subroutine for     None
 126                     transforming Fasta IDs.
 127
 128  -maxopen      Maximum size of               32
 129                     filehandle cache.
 130
 131  -debug        Turn on status                   0
 132                     messages.
 133
 134  -reindex      Force the index to be     0
 135                rebuilt.
 136
 137  -dbmargs      Additional arguments      none
 138                to pass to the DBM
 139                routines when tied
 140                (scalar or array ref).
 141
 142 -dbmargs can be used to control the format of the index.  For example,
 143 you can pass $DB_BTREE to this argument so as to force the IDs to be
 144 sorted and retrieved alphabetically.  Note that you must use the same
 145 arguments every time you open the index!
 146
 147 -reindex can be used to force the index to be recreated from scratch.
 148
 149 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
 150
 151 Create a tied filehandle opened on a Bio::DB::Fasta object.  Reading
 152 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
 153 Bio::SeqIO style.
 154
 155 =back
 156
 157 The -makeid option gives you a chance to modify sequence IDs during
 158 indexing.  The option value should be a code reference that will
 159 take a scalar argument and return a scalar result, like this:
 160
 161   $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
 162
 163   sub make_my_id {
 164     my $description_line = shift;
 165     # get a different id from the fasta header, e.g.
 166          $description_line =~ /(\S+)$/;
 167     return $1;
 168   }
 169
 170 make_my_id() will be called with the full fasta id line (including the
 171 "E<gt>" symbol!).  For example:
 172
 173  >A12345.3 Predicted C. elegans protein egl-2
 174
 175 By default, this module will use the regular expression /^E<gt>(\S+)/
 176 to extract "A12345.3" for use as the ID.  If you pass a -makeid
 177 callback, you can extract any portion of this, such as the "egl-2"
 178 symbol.
 179
 180 The -makeid option is ignored after the index is constructed.
 181
 182 =head1 OBJECT METHODS
 183
 184 The following object methods are provided.
 185
 186 =over 10
 187
 188 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
 189
 190 Return the raw sequence (a string) given an ID and optionally a start
 191 and stop position in the sequence.  In the case of DNA sequence, if
 192 $stop is less than $start, then the reverse complement of the sequence
 193 is returned (this violates Bio::Seq conventions).
 194
 195 For your convenience, subsequences can be indicated with any of the
 196 following compound IDs:
 197
 198    $db->seq("$id:$start,$stop")
 199
 200    $db->seq("$id:$start..$stop")
 201
 202    $db->seq("$id:$start-$stop")
 203
 204 =item $length = $db-E<gt>length($id)
 205
 206 Return the length of the indicated sequence.
 207
 208 =item $header = $db-E<gt>header($id)
 209
 210 Return the header line for the ID, including the initial "E<gt>".
 211
 212 =item $type  = $db-E<gt>alphabet($id)
 213
 214 Return the molecular type of the indicated sequence.  One of "dna",
 215 "rna" or "protein".
 216
 217 =item $filename  = $db-E<gt>file($id)
 218
 219 Return the name of the file in which the indicated sequence can be
 220 found.
 221
 222 =item $offset    = $db-E<gt>offset($id)
 223
 224 Return the offset of the indicated sequence from the beginning of the
 225 file in which it is located.  The offset points to the beginning of
 226 the sequence, not the beginning of the header line.
 227
 228 =item $header_length = $db-E<gt>headerlen($id)
 229
 230 Return the length of the header line for the indicated sequence.
 231
 232 =item $header_offset = $db-E<gt>header_offset($id)
 233
 234 Return the offset of the header line for the indicated sequence from
 235 the beginning of the file in which it is located.
 236
 237 =item $index_name  = $db-E<gt>index_name
 238
 239 Return the path to the index file.
 240
 241 =item $path = $db-E<gt>path
 242
 243 Return the path to the Fasta file(s).
 244
 245 =back
 246
 247 For BioPerl-style access, the following methods are provided:
 248
 249 =over 4
 250
 251 =item $seq = $db-E<gt>get_Seq_by_id($id)
 252
 253 Return a Bio::PrimarySeq::Fasta object, which obeys the
 254 Bio::PrimarySeqI conventions.  For example, to recover the raw DNA or
 255 protein sequence, call $seq-E<gt>seq().
 256
 257 Note that get_Seq_by_id() does not bring the entire sequence into
 258 memory until requested.  Internally, the returned object uses the
 259 accessor to generate subsequences as needed.
 260
 261 =item $seq = $db-E<gt>get_Seq_by_acc($id)
 262
 263 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
 264
 265 These methods all do the same thing as get_Seq_by_id().
 266
 267 =item $stream = $db-E<gt>get_PrimarySeq_stream()
 268
 269 Return a Bio::DB::Fasta::Stream object, which supports a single method
 270 next_seq(). Each call to next_seq() returns a new
 271 Bio::PrimarySeq::Fasta object, until no more sequences remain.
 272
 273 =back
 274
 275 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
 276 returned from get_Seq_by_id() and get_PrimarySeq_stream().
 277
 278 =head1 TIED INTERFACES
 279
 280 This module provides two tied interfaces, one which allows you to
 281 treat the sequence database as a hash, and the other which allows you
 282 to treat the database as an I/O stream.
 283
 284 =head2 Creating a Tied Hash
 285
 286 The tied hash interface is very straightforward
 287
 288 =over 1
 289
 290 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
 291
 292 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
 293 The optional @args list is the same set of named argument/value pairs
 294 used by Bio::DB::Fasta-E<gt>new().
 295
 296 If successful, tie() will return the tied object.  Otherwise it will
 297 return undef.
 298
 299 =back
 300
 301 Once tied, you can use the hash to retrieve an individual sequence by
 302 its ID, like this:
 303
 304   my $seq = $db{CHROMOSOME_I};
 305
 306 You may select a subsequence by appending the comma-separated range to
 307 the sequence ID in the format "$id:$start,$stop".  For example, here
 308 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
 309
 310   my $seq = $db{'CHROMOSOME_I:1,1000'};
 311
 312 (The regular expression used to parse this format allows sequence IDs
 313 to contain colons.)
 314
 315 When selecting subsequences, if $start E<gt> stop, then the reverse
 316 complement will be returned for DNA sequences.
 317
 318 The keys() and values() functions will return the sequence IDs and
 319 their sequences, respectively.  In addition, each() can be used to
 320 iterate over the entire data set:
 321
 322  while (my ($id,$sequence) = each %db) {
 323     print "$id => $sequence\n";
 324  }
 325
 326 When dealing with very large sequences, you can avoid bringing them
 327 into memory by calling each() in a scalar context.  This returns the
 328 key only.  You can then use tied(%db) to recover the Bio::DB::Fasta
 329 object and call its methods.
 330
 331  while (my $id = each %db) {
 332     print "$id => $db{$sequence:1,100}\n";
 333     print "$id => ",tied(%db)->length($id),"\n";
 334  }
 335
 336 You may, in addition invoke Bio::DB::Fasta the FIRSTKEY and NEXTKEY tied
 337 hash methods directly.
 338
 339 =over 2
 340
 341 =item $id = $db-E<gt>FIRSTKEY
 342
 343 Return the first ID in the database.
 344
 345 =item $id = $db-E<gt>NEXTKEY($id)
 346
 347 Given an ID, return the next ID in sequence.
 348
 349 =back
 350
 351 This allows you to write the following iterative loop using just the
 352 object-oriented interface:
 353
 354  my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
 355  for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
 356     # do something with sequence
 357  }
 358
 359 =head2 Creating a Tied Filehandle
 360
 361 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
 362 which you can read Bio::PrimarySeq::Fasta sequence objects
 363 sequentially.  The following bit of code will iterate sequentially
 364 over all sequences in the database:
 365
 366  my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
 367  while (my $seq = <$fh>) {
 368    print $seq->id,' => ',$seq->length,"\n";
 369  }
 370
 371 When no more sequences remain to be retrieved, the stream will return
 372 undef.
 373
 374 =head1 BUGS
 375
 376 When a sequence is deleted from one of the Fasta files, this deletion
 377 is not detected by the module and removed from the index.  As a
 378 result, a "ghost" entry will remain in the index and will return
 379 garbage results if accessed.
 380
 381 Currently, the only way to accomodate deletions is to rebuild the
 382 entire index, either by deleting it manually, or by passing
 383 -reindex=E<gt>1 to new() when initializing the module.
 384
 385 =head1 SEE ALSO
 386
 387 L<bioperl>
 388
 389 =head1 AUTHOR
 390
 391 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
 392
 393 Copyright (c) 2001 Cold Spring Harbor Laboratory.
 394
 395 This library is free software; you can redistribute it and/or modify
 396 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 397 disclaimers of warranty.
 398
 399 =cut
 400
 401 #'
 402 package Bio::DB::Fasta;
 403
 404 BEGIN {
 405   @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
 406 }
 407
 408 use strict;
 409 use IO::File;
 410 use AnyDBM_File;
 411 use Fcntl;
 412 use File::Glob ':glob';
 413 use File::Basename qw(basename dirname);
 414
 415 use base qw(Bio::DB::SeqI Bio::Root::Root);
 416
 417 *seq = *sequence = \&subseq;
 418 *ids = \&get_all_ids;
 419 *get_seq_by_primary_id = *get_Seq_by_acc  = \&get_Seq_by_id;
 420
 421 use constant STRUCT =>'NNnnCa*';
 422 use constant STRUCTBIG =>'QQnnCa*'; # 64-bit file offset and seq length
 423 use constant DNA     => 1;
 424 use constant RNA     => 2;
 425 use constant PROTEIN => 3;
 426 use constant DIE_ON_MISSMATCHED_LINES => 1; # if you want
 427
 428 # Bio::DB-like object
 429 # providing fast random access to a directory of FASTA files
 430
 431 =head2 new
 432
 433  Title   : new
 434  Usage   : my $db = Bio::DB::Fasta->new( $path, @options);
 435  Function: initialize a new Bio::DB::Fasta object
 436  Returns : new Bio::DB::Fasta object
 437  Args    : path to dir of fasta files or a single filename
 438
 439 These are optional arguments to pass in as well.
 440
 441  -glob         Glob expression to use    *.{fa,fasta,fast,FA,FASTA,FAST}
 442                for searching for Fasta
 443                      files in directories.
 444
 445  -makeid       A code subroutine for     none
 446                      transforming Fasta IDs.
 447
 448  -maxopen      Maximum size of                 32
 449                      filehandle cache.
 450
 451  -debug        Turn on status                    0
 452                      messages.
 453
 454  -reindex      Force the index to be     0
 455                rebuilt.
 456
 457  -dbmargs      Additional arguments      none
 458                to pass to the DBM
 459                routines when tied
 460                (scalar or array ref).
 461
 462 =cut
 463
 464 sub new {
 465   my $class = shift;
 466   my $path  = shift;
 467   my %opts  = @_;
 468
 469   my $self = bless { debug      => $opts{-debug},
 470           makeid     => $opts{-makeid},
 471           glob       => $opts{-glob}    || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,FNA,fna,FAA,faa,FSA,fsa}',
 472           maxopen    => $opts{-maxopen} || 32,
 473           dbmargs    => $opts{-dbmargs} || undef,
 474           fhcache    => {},
 475           cacheseq   => {},
 476           curopen    => 0,
 477           openseq    => 1,
 478           dirname    => undef,
 479           offsets    => undef,
 480                    }, $class;
 481   my ($offsets,$dirname);
 482
 483   if (-d $path) {
 484     # because Win32 glob() is broken with respect to long file names
 485     # that contain whitespace.
 486     $path = Win32::GetShortPathName($path)
 487       if $^O =~ /^MSWin/i && eval 'use Win32; 1';
 488     $offsets = $self->index_dir($path,$opts{-reindex}) or return;
 489     $dirname = $path;
 490   } elsif (-f _) {
 491     $offsets = $self->index_file($path,$opts{-reindex});
 492     $dirname = dirname($path);
 493   } else {
 494     $self->throw( "$path: Invalid file or dirname");
 495   }
 496   @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
 497
 498   $self;
 499 }
 500
 501 =head2 newFh
 502
 503  Title   : newFh
 504  Function: gets a new Fh for a file
 505  Example : internal method
 506  Returns : GLOB
 507  Args    :
 508
 509 =cut
 510
 511 sub newFh {
 512   my $class = shift;
 513   my $self  = $class->new(@_);
 514   require Symbol;
 515   my $fh = Symbol::gensym or return;
 516   tie $$fh,'Bio::DB::Fasta::Stream',$self or return;
 517   $fh;
 518 }
 519
 520 sub _open_index {
 521   my $self = shift;
 522   my ($index,$write) = @_;
 523   my %offsets;
 524   my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
 525   my @dbmargs = $self->dbmargs;
 526   eval {
 527       tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs
 528           or die "Can't open sequence index file $index: $!";
 529   };
 530   warn $@ if $@;
 531   return \%offsets;
 532 }
 533
 534 sub _close_index {
 535   my $self = shift;
 536   my $index = shift;
 537   untie %$index;
 538 }
 539
 540 =head2 index_dir
 541
 542  Title   : index_dir
 543  Usage   : $db->index_dir($dir)
 544  Function: set the index dir and load all files in the dir
 545  Returns : hashref of seq offsets in each file
 546  Args    : dirname, boolean to force a reload of all files
 547
 548 =cut
 549
 550 sub index_dir {
 551   my $self = shift;
 552   my $dir  = shift;
 553   my $force_reindex = shift;
 554
 555   # find all fasta files
 556   my @files = glob("$dir/$self->{glob}");
 557   return unless @files;
 558
 559   # get name of index
 560   my $index = $self->index_name($dir,1);
 561
 562   # if caller has requested reindexing, then unlink
 563   # the index file.
 564   unlink $index if $force_reindex;
 565
 566   # get the modification time of the index
 567   my $indextime   = 0;
 568   for my $suffix('','.pag','.dir') {
 569     $indextime ||= (stat("${index}${suffix}"))[9];
 570   }
 571   $indextime ||= 0;  # prevent some uninit variable warnings
 572
 573   # get the most recent modification time of any of the contents
 574   my $modtime = 0;
 575   my %modtime;
 576   $self->set_pack_method( @files );
 577   foreach (@files) {
 578     my $m = (stat($_))[9];
 579     $modtime{$_} = $m;
 580     $modtime = $m if defined $m && $modtime < $m;
 581   }
 582
 583   my $reindex      = $force_reindex || $indextime < $modtime;
 584   $self->{offsets} = $self->_open_index($index,$reindex) or return;
 585
 586   # no indexing needed
 587   return $self->{offsets} unless $reindex;
 588
 589   # otherwise reindex contents of changed files
 590   $self->{indexing} = $index;
 591   foreach (@files) {
 592     next if( defined $indextime && $modtime{$_} <= $indextime);
 593     $self->calculate_offsets($_,$self->{offsets});
 594   }
 595   delete $self->{indexing};
 596
 597   # we've been having troubles with corrupted index files on Windows systems,
 598   # so possibly closing and reopening will help
 599   $self->_close_index($self->{offsets});
 600
 601   return $self->{offsets}  = $self->_open_index($index);
 602 }
 603
 604 =head2 get_Seq_by_id
 605
 606  Title   : get_Seq_by_id
 607  Usage   : my $seq = $db->get_Seq_by_id($id)
 608  Function: Bio::DB::RandomAccessI method implemented
 609  Returns : Bio::PrimarySeqI object
 610  Args    : id
 611
 612 =cut
 613
 614 sub get_Seq_by_id {
 615   my $self = shift;
 616   my $id   = shift;
 617   return unless exists $self->{offsets}{$id};
 618   return Bio::PrimarySeq::Fasta->new($self,$id);
 619 }
 620
 621 =head2 set_pack_method
 622
 623  Title   : set_pack_method
 624  Usage   : $db->set_pack_method( @files )
 625  Function: Determines whether data packing uses 32 or 64 bit integers
 626  Returns :
 627  Args    : one or more file paths
 628
 629 =cut
 630
 631 sub set_pack_method {
 632   my $self = shift;
 633   # Find the maximum file size:eq
 634   my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
 635   my $fourGB    = (2 ** 32) - 1;
 636
 637   if ($maxsize > $fourGB) {
 638       # At least one file exceeds 4Gb - we will need to use 64 bit ints
 639       $self->{packmeth}   = \&_packBig;
 640       $self->{unpackmeth} = \&_unpackBig;
 641   } else {
 642       $self->{packmeth}   = \&_pack;
 643       $self->{unpackmeth} = \&_unpack;
 644   }
 645 }
 646
 647 =head2 index_file
 648
 649  Title   : index_file
 650  Usage   : $db->index_file($filename)
 651  Function: (re)loads a sequence file and indexes sequences offsets in the file
 652  Returns : seq offsets in the file
 653  Args    : filename,
 654            boolean to force reloading a file
 655
 656 =cut
 657
 658
 659 sub index_file {
 660   my $self = shift;
 661   my $file = shift;
 662   my $force_reindex = shift;
 663
 664   $self->set_pack_method( $file );
 665   my $index = $self->index_name($file);
 666   # if caller has requested reindexing, then unlink the index
 667   unlink $index if $force_reindex;
 668
 669   # get the modification time of the index
 670   my $indextime = (stat($index))[9] || 0;
 671   my $modtime   = (stat($file))[9]  || 0;
 672
 673   my $reindex = $force_reindex || $indextime < $modtime;
 674   my $offsets = $self->_open_index($index,$reindex) or return;
 675   $self->{offsets} = $offsets;
 676
 677   return $self->{offsets} unless $reindex;
 678
 679   $self->{indexing} = $index;
 680   $self->calculate_offsets($file,$offsets);
 681   delete $self->{indexing};
 682   return $self->{offsets};
 683 }
 684
 685 =head2 dbmargs
 686
 687  Title   : dbmargs
 688  Usage   : my @args = $db->dbmargs;
 689  Function: gets stored dbm arguments
 690  Returns : array
 691  Args    : none
 692
 693 =cut
 694
 695 sub dbmargs {
 696   my $self = shift;
 697   my $args = $self->{dbmargs} or return;
 698   return ref($args) eq 'ARRAY' ? @$args : $args;
 699 }
 700
 701 =head2 index_name
 702
 703  Title   : index_name
 704  Usage   : my $indexname = $db->index_name($path,$isdir);
 705  Function: returns the name of the index for a specific path
 706  Returns : string
 707  Args    : path to check,
 708            boolean if it is a dir
 709
 710 =cut
 711
 712 sub index_name {
 713   my $self  = shift;
 714   my ($path,$isdir) = @_;
 715   unless ($path) {
 716     my $dir = $self->{dirname} or return;
 717     return $self->index_name($dir,-d $dir);
 718   }
 719   return "$path/directory.index" if $isdir;
 720   return "$path.index";
 721 }
 722
 723 =head2 calculate_offsets
 724
 725  Title   : calculate_offsets
 726  Usage   : $db->calculate_offsets($filename,$offsets);
 727  Function: calculates the sequence offsets in a file based on id
 728  Returns : offset hash for each file
 729  Args    : file to process
 730            $offsets - hashref of id to offset storage
 731
 732 =cut
 733
 734 sub calculate_offsets {
 735   my $self = shift;
 736   my ($file,$offsets) = @_;
 737   my $base = $self->path2fileno(basename($file));
 738
 739   my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
 740   binmode $fh;
 741   warn "indexing $file\n" if $self->{debug};
 742   my ($offset,@id,$linelength,$type,$firstline,$count,
 743       $termination_length,$seq_lines,$last_line,%offsets);
 744   my ($l3_len,$l2_len,$l_len)=(0,0,0);
 745
 746   while (<$fh>) {               # don't try this at home
 747     $termination_length ||= /\r\n$/ ? 2 : 1; # account for crlf-terminated Windows files
 748     next unless /\S/;
 749     if (index($_, ">") == 0) {
 750         if (/^>(\S+)/) {
 751           print STDERR "indexed $count sequences...\n"
 752         if $self->{debug} && (++$count%1000) == 0;
 753
 754
 755           my $pos = tell($fh);
 756           if (@id) {
 757         my $seqlength    = $pos - $offset - length($_);
 758         $seqlength      -= $termination_length * $seq_lines;
 759         my $ppos = &{$self->{packmeth}}($offset,$seqlength,
 760                            $linelength,$firstline,
 761                            $type,$base);
 762         for my $id (@id) { $offsets->{$id}  = $ppos }
 763           }
 764           @id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
 765           ($offset,$firstline,$linelength) = ($pos,length($_),0);
 766           $self->_check_linelength($linelength);
 767           ($l3_len,$l2_len,$l_len)=(0,0,0);
 768           $seq_lines = 0;
 769         } else {
 770           # catch bad header lines, bug 3172
 771           $self->throw("FASTA header doesn't match '>(\\S+)': $_")
 772         }
 773     } else {
 774       $l3_len= $l2_len; $l2_len= $l_len; $l_len= length($_); # need to check every line :(
 775       if (DIE_ON_MISSMATCHED_LINES &&
 776           $l3_len>0 && $l2_len>0 && $l3_len!=$l2_len) {
 777           my $fap= substr($_,0,20)."..";
 778           $self->throw("Each line of the fasta entry must be the same length except the last.
 779    Line above #$. '$fap' is $l2_len != $l3_len chars.");
 780       }
 781       $linelength ||= length($_);
 782       $type       ||= $self->_type($_);
 783       $seq_lines++;
 784     }
 785     $last_line = $_;
 786   }
 787
 788   $self->_check_linelength($linelength);
 789   # deal with last entry
 790   if (@id) {
 791     my $pos = tell($fh);
 792     my $seqlength   = $pos - $offset;
 793     if ($linelength == 0) { # yet another pesky empty chr_random.fa file
 794       $seqlength = 0;
 795     } else {
 796       if ($last_line !~ /\s$/) {
 797         $seq_lines--;
 798       }
 799       $seqlength -= $termination_length * $seq_lines;
 800     };
 801     my $ppos = &{$self->{packmeth}}($offset,$seqlength,
 802                                            $linelength,$firstline,
 803                                            $type,$base);
 804     for my $id (@id) { $offsets->{$id}  = $ppos }
 805   }
 806   $offsets->{__termination_length} = $termination_length;
 807   return \%offsets;
 808 }
 809
 810 =head2 get_all_ids
 811
 812  Title   : get_all_ids
 813  Usage   : my @ids = $db->get_all_ids
 814  Function: gets all the stored ids in all indexes
 815  Returns : list of ids
 816  Args    : none
 817
 818 =cut
 819
 820 sub get_all_ids  { grep {!/^__/} keys %{shift->{offsets}} }
 821
 822 sub offset {
 823   my $self = shift;
 824   my $id   = shift;
 825   my $offset = $self->{offsets}{$id} or return;
 826   (&{$self->{unpackmeth}}($offset))[0];
 827 }
 828
 829 sub length {
 830   my $self = shift;
 831   my $id   = shift;
 832   my $offset = $self->{offsets}{$id} or return;
 833   (&{$self->{unpackmeth}}($offset))[1];
 834 }
 835
 836 sub linelen {
 837   my $self = shift;
 838   my $id   = shift;
 839   my $offset = $self->{offsets}{$id} or return;
 840   (&{$self->{unpackmeth}}($offset))[2];
 841 }
 842
 843 sub headerlen {
 844   my $self = shift;
 845   my $id   = shift;
 846   my $offset = $self->{offsets}{$id} or return;
 847   (&{$self->{unpackmeth}}($offset))[3];
 848 }
 849
 850 sub alphabet {
 851   my $self = shift;
 852   my $id   = shift;
 853   my $offset = $self->{offsets}{$id} or return;
 854   my $type = (&{$self->{unpackmeth}}($offset))[4];
 855   return $type == DNA ? 'dna'
 856          : $type == RNA ? 'rna'
 857          : 'protein';
 858
 859 }
 860
 861 sub path { shift->{dirname} }
 862
 863 sub header_offset {
 864     my $self = shift;
 865     my $id   = shift;
 866     return unless $self->{offsets}{$id};
 867     return $self->offset($id) - $self->headerlen($id);
 868 }
 869
 870 sub file {
 871   my $self = shift;
 872   my $id   = shift;
 873   my $offset = $self->{offsets}{$id} or return;
 874   $self->fileno2path((&{$self->{unpackmeth}}($offset))[5]);
 875 }
 876
 877 sub fileno2path {
 878   my $self = shift;
 879   my $no   = shift;
 880   return $self->{offsets}{"__file_$no"};
 881 }
 882
 883 sub path2fileno {
 884   my $self = shift;
 885   my $path = shift;
 886   if ( !defined $self->{offsets}{"__path_$path"} ) {
 887     my $fileno  = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++);
 888     $self->{offsets}{"__file_$fileno"} = $path;
 889   }
 890   return $self->{offsets}{"__path_$path"}
 891 }
 892
 893 sub _check_linelength {
 894   my $self       = shift;
 895   my $linelength = shift;
 896   return unless defined $linelength;
 897   $self->throw("Each line of the fasta file must be less than 65,536 characters.  Line $. is $linelength chars.")       if $linelength > 65535.
 898
 899 }
 900
 901 =head2 subseq
 902
 903  Title   : subseq
 904  Usage   : $seqdb->subseq($id,$start,$stop);
 905  Function: returns a subseq of a sequence in the db
 906  Returns : subsequence data
 907  Args    : id of sequence, starting point, ending point
 908
 909 =cut
 910
 911 sub subseq {
 912   my ($self,$id,$start,$stop) = @_;
 913   if ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) {
 914     ($id,$start,$stop) = ($1,$2,$3);
 915     $start =~ s/_//g;
 916     $stop =~ s/_//g;
 917   }
 918   $start ||= 1;
 919   $stop  ||= $self->length($id);
 920
 921   my $reversed;
 922   if (defined $stop && $start > $stop) {
 923     ($start,$stop) = ($stop,$start);
 924     $reversed++;
 925   }
 926
 927   my $data;
 928
 929   my $fh = $self->fh($id) or return;
 930   my $filestart = $self->caloffset($id,$start);
 931   my $filestop  = $self->caloffset($id,$stop);
 932
 933   seek($fh,$filestart,0);
 934   read($fh,$data,$filestop-$filestart+1);
 935   $data =~ s/\n//g;
 936   $data =~ s/\r//g;
 937   if ($reversed) {
 938     $data = reverse $data;
 939     $data =~ tr/gatcGATC/ctagCTAG/;
 940   }
 941   $data;
 942 }
 943
 944 sub fh {
 945   my $self = shift;
 946   my $id   = shift;
 947   my $file = $self->file($id) or return;
 948   $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file");
 949 }
 950
 951 sub header {
 952   my $self = shift;
 953   my $id   = shift;
 954   my ($offset,$seqlength,$linelength,$firstline,$type,$file)
 955     = &{$self->{unpackmeth}}($self->{offsets}{$id}) or return;
 956   $offset -= $firstline;
 957   my $data;
 958   my $fh = $self->fh($id) or return;
 959   seek($fh,$offset,0);
 960   read($fh,$data,$firstline);
 961   chomp $data;
 962   substr($data,0,1) = '';
 963   $data;
 964 }
 965
 966 sub caloffset {
 967   my $self = shift;
 968   my $id   = shift;
 969   my $a    = shift()-1;
 970   my ($offset,$seqlength,$linelength,$firstline,$type,$file) = &{$self->{unpackmeth}}($self->{offsets}{$id});
 971   $a = 0            if $a < 0;
 972   $a = $seqlength-1 if $a >= $seqlength;
 973   my $tl = $self->{offsets}{__termination_length};
 974   $offset + $linelength * int($a/($linelength-$tl)) + $a % ($linelength-$tl);
 975 }
 976
 977 sub fhcache {
 978   my $self = shift;
 979   my $path = shift;
 980   if (!$self->{fhcache}{$path}) {
 981     if ($self->{curopen} >= $self->{maxopen}) {
 982       my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}};
 983       splice(@lru, $self->{maxopen} / 3);
 984       $self->{curopen} -= @lru;
 985       for (@lru) { delete $self->{fhcache}{$_} }
 986     }
 987     $self->{fhcache}{$path} = IO::File->new($path) or return;
 988     binmode $self->{fhcache}{$path};
 989     $self->{curopen}++;
 990   }
 991   $self->{cacheseq}{$path}++;
 992   $self->{fhcache}{$path}
 993 }
 994
 995 sub _pack {
 996   pack STRUCT,@_;
 997 }
 998
 999 sub _packBig {
1000   pack STRUCTBIG,@_;
1001 }
1002
1003 sub _unpack {
1004   unpack STRUCT,shift;
1005 }
1006
1007 sub _unpackBig {
1008   unpack STRUCTBIG,shift;
1009 }
1010
1011 sub _type {
1012   shift;
1013   local $_ = shift;
1014   return /^[gatcnGATCN*-]+$/   ? DNA
1015          : /^[gaucnGAUCN*-]+$/ ? RNA
1016          : PROTEIN;
1017 }
1018
1019 =head2 get_PrimarySeq_stream
1020
1021  Title   : get_PrimarySeq_stream
1022  Usage   :
1023  Function:
1024  Example :
1025  Returns :
1026  Args    :
1027
1028 =cut
1029
1030 sub get_PrimarySeq_stream {
1031   my $self = shift;
1032   return Bio::DB::Fasta::Stream->new($self);
1033 }
1034
1035 sub TIEHASH {
1036   my $self = shift;
1037   return $self->new(@_);
1038 }
1039
1040 sub FETCH {
1041   shift->subseq(@_);
1042 }
1043 sub STORE {
1044     shift->throw("Read-only database");
1045 }
1046 sub DELETE {
1047     shift->throw("Read-only database");
1048 }
1049 sub CLEAR {
1050     shift->throw("Read-only database");
1051 }
1052 sub EXISTS {
1053   defined shift->offset(@_);
1054 }
1055 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
1056 sub NEXTKEY  { tied(%{shift->{offsets}})->NEXTKEY(@_);  }
1057
1058 sub DESTROY {
1059   my $self = shift;
1060   if ($self->{indexing}) {  # killed prematurely, so index file is no good!
1061     warn "indexing was interrupted, so unlinking $self->{indexing}";
1062     unlink $self->{indexing};
1063   }
1064 }
1065
1066 #-------------------------------------------------------------
1067 # Bio::PrimarySeqI compatibility
1068 #
1069 package Bio::PrimarySeq::Fasta;
1070 use overload '""' => 'display_id';
1071
1072 use base qw(Bio::Root::Root Bio::PrimarySeqI);
1073
1074 sub new {
1075   my $class = shift;
1076   $class = ref($class) if ref $class;
1077   my ($db,$id,$start,$stop) = @_;
1078   return bless { db    => $db,
1079                  id    => $id,
1080                  start => $start || 1,
1081                  stop  => $stop  || $db->length($id)
1082                },$class;
1083 }
1084
1085 sub fetch_sequence { shift->seq(@_) }
1086
1087 sub seq {
1088   my $self = shift;
1089   return $self->{db}->seq($self->{id},$self->{start},$self->{stop});
1090 }
1091
1092 sub subseq {
1093   my $self = shift;
1094   $self->trunc(@_)->seq();
1095 }
1096
1097 sub trunc {
1098   my $self = shift;
1099   my ($start,$stop) = @_;
1100   $self->throw("Stop cannot be smaller than start")  unless $start <= $stop;
1101   return $self->{start} <= $self->{stop} ?  $self->new($self->{db},
1102                                                        $self->{id},
1103                                                        $self->{start}+$start-1,
1104                                                        $self->{start}+$stop-1)
1105                                          :  $self->new($self->{db},
1106                                                        $self->{id},
1107                                                        $self->{start}-($start-1),
1108                                                        $self->{start}-($stop-1)
1109                                                       );
1110
1111 }
1112
1113 sub is_circular {
1114   my $self = shift;
1115   return $self->{is_circular};
1116 }
1117
1118 sub display_id {
1119   my $self = shift;
1120   return $self->{id};
1121 }
1122
1123 sub accession_number {
1124   my $self = shift;
1125   return "unknown";
1126 }
1127
1128 sub primary_id {
1129   my $self = shift;
1130   return overload::StrVal($self);
1131 }
1132
1133 sub can_call_new { return 0 }
1134
1135 sub alphabet {
1136   my $self = shift;
1137   return $self->{db}->alphabet($self->{id});
1138 }
1139
1140 sub revcom {
1141   my $self = shift;
1142   return $self->new(@{$self}{'db','id','stop','start'});
1143 }
1144
1145 sub length {
1146   my $self = shift;
1147   #return $self->{db}->length($self->{id}); # wrong because ignores sequence start and stop values
1148   return length($self->seq);
1149
1150 }
1151
1152 sub description  {
1153     my $self = shift;
1154     my $header = $self->{'db'}->header($self->{id});
1155     # remove the id from the header
1156     return (split(/\s+/,$header,2))[1];
1157 }
1158
1159 *desc = \&description;
1160
1161 #-------------------------------------------------------------
1162 # stream-based access to the database
1163 #
1164 package Bio::DB::Fasta::Stream;
1165 use base qw(Tie::Handle Bio::DB::SeqI);
1166
1167
1168 sub new {
1169   my $class = shift;
1170   my $db    = shift;
1171   my $key = $db->FIRSTKEY;
1172   return bless { db=>$db,key=>$key },$class;
1173 }
1174
1175 sub next_seq {
1176   my $self = shift;
1177   my ($key,$db) = @{$self}{'key','db'};
1178   while ($key =~ /^__/) {
1179     $key = $db->NEXTKEY($key);
1180     return unless defined $key;
1181   }
1182   my $value = $db->get_Seq_by_id($key);
1183   $self->{key} = $db->NEXTKEY($key);
1184   $value;
1185 }
1186
1187 sub TIEHANDLE {
1188   my $class = shift;
1189   my $db    = shift;
1190   return $class->new($db);
1191 }
1192 sub READLINE {
1193   my $self = shift;
1194   $self->next_seq;
1195 }
1196
1197 1;
1198
1199 __END__
1200