Bio/DB/Qual.pm

   1 #
   2 #
   3 # BioPerl module for Bio::DB::Qual
   4 #
   5 # You may distribute this module under the same terms as perl itself
   6 #
   7
   8 # POD documentation - main docs before the code
   9
  10 =head1 NAME
  11
  12 Bio::DB::Qual -- Fast indexed access to a directory of quality files
  13
  14 =head1 SYNOPSIS
  15
  16   use Bio::DB::Qual;
  17
  18   # create database from directory of qual files
  19   my $db      = Bio::DB::Qual->new('/path/to/qual/files');
  20   my @ids     = $db->ids;
  21
  22   # simple access (for those without Bioperl)
  23   my @qual    = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
  24   my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
  25   my $length  = $db->length('CHROMOSOME_I');
  26   my $header  = $db->header('CHROMOSOME_I');
  27
  28   # Bioperl-style access
  29   my $obj     = $db->get_Qual_by_id('CHROMOSOME_I');
  30   my @qual    = @{$obj->qual};
  31   my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
  32   my $length  = $obj->length;
  33   # (etc)
  34
  35   # Bio::SeqIO-style access
  36   my $stream  = $db->get_PrimaryQual_stream;
  37   while (my $qual = $stream->next_seq) {
  38     # Bio::Seq::PrimaryQual operations
  39   }
  40
  41   my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
  42   while (my $qual = <$fh>) {
  43     # Bio::Seq::PrimaryQual operations
  44   }
  45
  46   # tied hash access
  47   tie %qualities,'Bio::DB::Qual','/path/to/qual/files';
  48   print $qualities{'CHROMOSOME_I:1,20000'};
  49
  50 =head1 DESCRIPTION
  51
  52 Bio::DB::Qual provides indexed access to one or more qual files. It provides
  53 random access to each quality score entry without having to read the file from
  54 the beginning. Access to subqualities (portions of a quality score) is provided,
  55 although contrary to Bio::DB::Fasta, the full quality score has to be brought in
  56 memory.
  57
  58 When you initialize the module, you point it at a single qual file or a
  59 directory of multiple such files. The first time it is run, the module generates
  60 an index of the contents of the file or directory using the AnyDBM module
  61 (Berkeley DB* preferred, followed by GDBM_File, NDBM_File, and SDBM_File).
  62 Thereafter it uses the index file to find the file and offset for any requested
  63 quality score. If one of the source qual files is updated, the module reindexes
  64 just that one file. (You can also force reindexing manually). For improved
  65 performance, the module keeps a cache of open filehandles, closing less-recently
  66 used ones when the cache is full.
  67
  68 The qual files may contain decimal quality scores. Entries may have any line
  69 length up to 65,536 characters, and different line lengths are allowed in the
  70 same file. However, within a quality score entry, all lines must be the same
  71 length except for the last. An error will be thrown if this is not the case.
  72
  73 The module uses /^E<gt>(\S+)/ to extract the primary ID of each quality score
  74 from the qual header. During indexing, you may pass a callback routine to modify
  75 this primary ID.  For example, you may wish to extract a portion of the
  76 gi|gb|abc|xyz prefixes that are commonly used. The original header line can be
  77 recovered later.
  78
  79 *Berkeley DB can be obtained free from www.sleepycat.com. After it is installed
  80 you will need to install the BerkeleyDB Perl module.
  81
  82 =head1 DATABASE CREATION AND INDEXING
  83
  84 The two constructors for this class are new() and newFh(). The former creates a
  85 Bio::DB::Qual object which is accessed via method calls. The latter creates a
  86 tied filehandle which can be used Bio::SeqIO-style to fetch quality score
  87 objects in a data stream. There is also a tied hash interface.
  88
  89 =over 2
  90
  91 =item $db = Bio::DB::Qual-E<gt>new($qual_path [,%options])
  92
  93 Create a new Bio::DB::Qual object from the Qual file or files indicated by
  94 $qual_path. Indexing will be performed automatically if needed. If successful,
  95 new() will return the database accessor object. Otherwise it will return undef.
  96
  97 $qual_path may be an individual qual file, or may refer to a directory
  98 containing one or more of such files. Following the path, you may pass a series
  99 of name=E<gt>value options or a hash with these same name=E<gt>value pairs.
 100 Valid options are:
 101
 102  Option Name   Description               Default
 103  -----------   -----------               -------
 104
 105  -glob         Glob expression to use    *.{qa,QA,qual,QUAL}
 106                for searching for qual
 107                files in directories.
 108
 109  -makeid       A code subroutine for     None
 110                transforming qual IDs.
 111
 112  -maxopen      Maximum size of           32
 113                filehandle cache.
 114
 115  -debug        Turn on status            0
 116                messages.
 117
 118  -reindex      Force the index to be     0
 119                rebuilt.
 120
 121  -dbmargs      Additional arguments      none
 122                to pass to the DBM
 123                routines when tied
 124                (scalar or array ref).
 125
 126 -dbmargs can be used to control the format of the index. For example, you can
 127 pass $DB_BTREE to this argument so as to force the IDs to be sorted and
 128 retrieved alphabetically. Note that you must use the same arguments every time
 129 you open the index!
 130
 131 -reindex can be used to force the index to be recreated from scratch.
 132
 133 =item $fh = Bio::DB::Qual-E<gt>newFh($qual_path [,%options])
 134
 135 Create a tied filehandle opened on a Bio::DB::Qual object. Reading from this
 136 filehandle with E<lt>E<gt> will return a stream of quality objects,
 137 Bio::SeqIO-style.
 138
 139 =back
 140
 141 The -makeid option gives you a chance to modify quality score IDs during
 142 indexing. The option value should be a code reference that will take a scalar
 143 argument and return a scalar result, like this:
 144
 145   $db = Bio::DB::Qual->new("file.qual",-makeid=>\&make_my_id);
 146
 147   sub make_my_id {
 148     my $description_line = shift;
 149     # get a different id from the quality header, e.g.
 150     $description_line =~ /(\S+)$/;
 151     return $1;
 152   }
 153
 154 make_my_id() will be called with the full qual id line (including the "E<gt>"
 155 symbol!). For example:
 156
 157   >A12345.3 Predicted C. elegans protein egl-2
 158
 159 By default, this module will use the regular expression /^E<gt>(\S+)/ to extract
 160 "A12345.3" for use as the ID.If you pass a -makeid callback, you can extract any
 161 portion of this, such as the "egl-2" symbol.
 162
 163 The -makeid option is ignored after the index is constructed.
 164
 165 =head1 OBJECT METHODS
 166
 167 The following object methods are provided.
 168
 169 =over 10
 170
 171 =item $raw_qual = $db-E<gt>qual($id [,$start, $stop])
 172
 173 Return a quality score array reference given an ID and optionally a start and
 174 stop position (the quality value number) in the quality score. If $stop is less
 175 than $start, then the reverse complement of the quality score is returned (this
 176 violates Bio::Seq conventions).
 177
 178 For your convenience, subqualities can be indicated with any of the following
 179 compound IDs:
 180
 181    $db->qual("$id:$start,$stop")
 182
 183    $db->qual("$id:$start..$stop")
 184
 185    $db->qual("$id:$start-$stop")
 186
 187 =item $length = $db-E<gt>length($id)
 188
 189 Return the length of the indicated quality score, i.e. the number of quality
 190 values.
 191
 192 =item $header = $db-E<gt>header($id)
 193
 194 Return the header line for the ID, including the initial "E<gt>".
 195
 196 =item $filename  = $db-E<gt>file($id)
 197
 198 Return the name of the file in which the indicated quality score can be found.
 199
 200 =item $offset    = $db-E<gt>offset($id)
 201
 202 Return the offset of the indicated quality score from the beginning of the file
 203 in which it is located.  The offset points to the beginning of the quality
 204 score, not the beginning of the header line.
 205
 206 =item $header_length = $db-E<gt>headerlen($id)
 207
 208 Return the length of the header line for the indicated quality score.
 209
 210 =item $header_offset = $db-E<gt>header_offset($id)
 211
 212 Return the offset of the header line for the indicated quality score from the
 213 beginning of the file in which it is located.
 214
 215 =item $index_name  = $db-E<gt>index_name
 216
 217 Return the path to the index file.
 218
 219 =item $path = $db-E<gt>path
 220
 221 Return the path to the Qual file(s).
 222
 223 =back
 224
 225 For BioPerl-style access, the following methods are provided:
 226
 227 =over 4
 228
 229 =item $qual = $db-E<gt>get_Qual_by_id($id)
 230
 231 Return a Bio::Seq::PrimaryQual object, which obeys the Bio::PrimarySeqI
 232 conventions. To recover the quality score, call $qual-E<gt>qual().
 233
 234 Note that get_Qual_by_id() does not bring the entire quality score into memory
 235 until requested. Internally, the returned object uses the accessor to generate
 236 subqualities as needed.
 237
 238 =item $qual = $db-E<gt>get_Qual_by_acc($id)
 239
 240 =item $qual = $db-E<gt>get_Qual_by_primary_id($id)
 241
 242 These methods all do the same thing as get_Qual_by_id().
 243
 244 =item $stream = $db-E<gt>get_PrimaryQual_stream()
 245
 246 Return a Bio::DB::Qual::Stream object, which supports a single method
 247 next_seq(). Each call to next_seq() returns a new Bio::Seq::PrimaryQual object,
 248 until no more quality scores remain.
 249
 250 =back
 251
 252 See L<Bio::Seq::PrimaryQual> and L<Bio::PrimarySeqI> for methods provided by the
 253 quality objects returned from get_Qual_by_id() and get_PrimaryQual_stream().
 254
 255 =head1 TIED INTERFACES
 256
 257 This module provides two tied interfaces, one which allows you to treat the
 258 quality score database as a hash, and the other which allows you to treat the
 259 database as an I/O stream.
 260
 261 =head2 Creating a Tied Hash
 262
 263 The tied hash interface is very straightforward.
 264
 265 =over 1
 266
 267 =item $obj = tie %db,'Bio::DB::Qual','/path/to/qual/files' [,@args]
 268
 269 Tie %db to Bio::DB::Qual using the indicated path to the Qual files. The
 270 optional @args list is the same set of named argument/value pairs used by
 271 Bio::DB::Qual-E<gt>new().
 272
 273 If successful, tie() will return the tied object.  Otherwise it will return
 274 undef.
 275
 276 =back
 277
 278 Once tied, you can use the hash to retrieve an individual quality score by its
 279 ID, like this:
 280
 281   my $qual = $db{CHROMOSOME_I};
 282
 283 You may select a subquality by appending the comma-separated range to the
 284 quality score ID in the format "$id:$start,$stop". For example, here is the
 285 first 1000 quality values of the quality score with ID "CHROMOSOME_I":
 286
 287   my $qual = $db{'CHROMOSOME_I:1,1000'};
 288
 289 (The regular expression used to parse this format allows quality score IDs to
 290 contain colons.)
 291
 292 When selecting subqualities, if $start E<gt> stop, then the reverse complement
 293 will be returned.
 294
 295 The keys() and values() functions will return the IDs and their quality scores,
 296 respectively. In addition, each() can be used to iterate over the entire data
 297 set:
 298
 299  while (my ($id,$quality) = each %db) {
 300     print "$id => $quality\n";
 301  }
 302
 303 When dealing with very large quality scores, you can avoid bringing them into
 304 memory by calling each() in a scalar context. This returns the key only. You can
 305 then use tied(%db) to recover the Bio::DB::Qual object and call its methods.
 306
 307  while (my $id = each %db) {
 308     print "$id => $db{$quality:1,100}\n";
 309     print "$id => ",tied(%db)->length($id),"\n";
 310  }
 311
 312 You may, in addition invoke Bio::DB::Qual the FIRSTKEY and NEXTKEY tied hash
 313 methods directly.
 314
 315 =over 2
 316
 317 =item $id = $db-E<gt>FIRSTKEY
 318
 319 Return the first ID in the database.
 320
 321 =item $id = $db-E<gt>NEXTKEY($id)
 322
 323 Given an ID, return the next quality score ID.
 324
 325 =back
 326
 327 This allows you to write the following iterative loop using just the object-
 328 oriented interface:
 329
 330  my $db = Bio::DB::Qual->new('/path/to/qual/files');
 331  for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
 332     # do something with quality
 333  }
 334
 335 =head2 Creating a Tied Filehandle
 336
 337 The Bio::DB::Qual-E<gt>newFh() method creates a tied filehandle from which you
 338 can read Bio::Seq::PrimaryQual quality score objects sequentially. The following
 339 bit of code will iterate sequentially over all quality scores in the database:
 340
 341  my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
 342  while (my $qual = <$fh>) {
 343    print $qual->id,' => ',$qual->length,"\n";
 344  }
 345
 346 When no more quality scores remain to be retrieved, the stream will return
 347 undef.
 348
 349 =head1 LIMITATIONS
 350
 351 When a quality score is deleted from one of the qual files, this deletion is not
 352 detected by the module and removed from the index. As a result, a "ghost" entry
 353 will remain in the index and will return garbage results if accessed. Currently,
 354 the only way to accommodate deletions is to rebuild the entire index, either by
 355 deleting it manually, or by passing -reindex=E<gt>1 to new() when
 356 initializing the module.
 357
 358 All quality score lines for a given quality score must have the same length
 359 except for the last (not sure why there is this limitation). This is not
 360 problematic for sequences but could be annoying for quality scores. A workaround
 361 is to make sure the your quality scores fit on no more than 2 lines. Another
 362 solution could be to padd them with blank spaces so that each line has the same
 363 number of characters (maybe this padding should be implemented in
 364 Bio::SeqIO::qual?).
 365
 366 =head1 AUTHOR
 367
 368 Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
 369
 370 Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
 371
 372 Copyright (c) 2007 Florent E Angly.
 373
 374 This library is free software; you can redistribute it and/or modify
 375 it under the same terms as Perl itself.
 376
 377 =cut
 378
 379 package Bio::DB::Qual;
 380
 381 BEGIN {
 382   @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
 383 }
 384
 385 use strict;
 386 use IO::File;
 387 use AnyDBM_File;
 388 use Fcntl;
 389 use File::Basename qw(basename dirname);
 390
 391 use base qw(Bio::DB::SeqI Bio::Root::Root);
 392
 393 *qual = *quality = \&subqual;
 394 *ids = \&get_all_ids;
 395 *get_qual_by_primary_id = *get_qual_by_acc  = \&get_Qual_by_id;
 396
 397 use constant STRUCT =>'NNnna*';
 398 use constant STRUCTBIG =>'QQnna*'; # 64-bit file offset and quality score length
 399 use constant DIE_ON_MISSMATCHED_LINES => 1; # you can avoid dying if you want
 400                                             # but you're likely to get bad
 401                                             # results
 402
 403 # Bio::DB-like object
 404 # providing fast random access to a directory of qual files
 405
 406 =head2 new
 407
 408  Title   : new
 409  Usage   : my $db = Bio::DB::Qual->new( $path, @options);
 410  Function: initialize a new Bio::DB::Qual object
 411  Returns : new Bio::DB::Qual object
 412  Args    : path to dir of qual files or a single qual filename
 413
 414 These are optional arguments to pass in as well.
 415
 416  -glob         Glob expression to use    *.{qual,QUAL,qa,QA}
 417                for searching for qual
 418                files in directories.
 419
 420  -makeid       A code subroutine for     none
 421                transforming qual IDs.
 422
 423  -maxopen      Maximum size of           32
 424                filehandle cache.
 425
 426  -debug        Turn on status            0
 427                messages.
 428
 429  -reindex      Force the index to be     0
 430                rebuilt.
 431
 432  -dbmargs      Additional arguments      none
 433                to pass to the DBM
 434                routines when tied
 435                (scalar or array ref).
 436
 437 =cut
 438
 439 sub new {
 440     my ($class, $path) = @_;
 441     my %opts  = @_;
 442
 443     my $self = bless {
 444         debug      => $opts{-debug},
 445         makeid     => $opts{-makeid},
 446         glob       => $opts{-glob}    || '*.{qual,QUAL,qa,QA}',
 447         maxopen    => $opts{-maxopen} || 32,
 448         dbmargs    => $opts{-dbmargs} || undef,
 449         fhcache    => {},
 450         cacheseq   => {},
 451         curopen    => 0,
 452         openseq    => 1,
 453         dirname    => undef,
 454         offsets    => undef,
 455     }, $class;
 456     my ($offsets,$dirname);
 457
 458     if (-d $path) {
 459         # because Win32 glob() is broken with respect to long file names that
 460         # contain whitespace.
 461         $path = Win32::GetShortPathName($path)
 462         if $^O =~ /^MSWin/i && eval 'use Win32; 1';
 463         $offsets = $self->index_dir($path,$opts{-reindex});
 464         $dirname = $path;
 465     } elsif (-f _) {
 466         $offsets = $self->index_file($path,$opts{-reindex});
 467         $dirname = dirname($path);
 468     } else {
 469         $self->throw( "$path: invalid file or dirname");
 470     }
 471     @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
 472     $self;
 473 }
 474
 475 =head2 newFh
 476
 477  Title   : newFh
 478  Usage   : my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
 479  Function: gets a new Fh for a file or directory containing several files
 480  Returns : filehandle object
 481  Args    : none
 482
 483 =cut
 484
 485 sub newFh {
 486     my $class = shift;
 487     my $self  = $class->new(@_);
 488     require Symbol;
 489     my $fh = Symbol::gensym or return;
 490     tie $$fh,'Bio::DB::Qual::Stream',$self or return;
 491     $fh;
 492 }
 493
 494 sub _open_index {
 495     my $self = shift;
 496     my ($index,$write) = @_;
 497     my %offsets;
 498     my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
 499     my @dbmargs = $self->dbmargs;
 500     tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs
 501         or $self->throw( "Can't open cache file $index: $!");
 502     return \%offsets;
 503 }
 504
 505 sub _close_index {
 506     my ($self, $index) = @_;
 507     untie %$index;
 508 }
 509
 510 =head2 index_dir
 511
 512  Title   : index_dir
 513  Usage   : $db->index_dir($dir)
 514  Function: set the index dir and load all files in the dir
 515  Returns : hashref of qual offsets in each file
 516  Args    : dirname, boolean to force a reload of all files
 517
 518 =cut
 519
 520 sub index_dir {
 521     my ($self, $dir, $force_reindex) = @_;
 522
 523     # find all qual files
 524     my @files = glob("$dir/$self->{glob}");
 525     $self->throw("No qual files found in $dir") unless @files;
 526
 527     # get name of index
 528     my $index = $self->index_name($dir,1);
 529
 530     # if caller has requested reindexing, then unlink
 531     # the index file.
 532     unlink $index if $force_reindex;
 533
 534     # get the modification time of the index
 535     my $indextime   = 0;
 536     for my $suffix('','.pag','.dir') {
 537         $indextime ||= (stat("${index}${suffix}"))[9];
 538     }
 539     $indextime ||= 0;  # prevent some uninit variable warnings
 540
 541     # get the most recent modification time of any of the contents
 542     my $modtime = 0;
 543     my %modtime;
 544     $self->set_pack_method( @files );
 545     foreach (@files) {
 546         my $m = (stat($_))[9];
 547         $modtime{$_} = $m;
 548         $modtime = $m if defined $m && $modtime < $m;
 549     }
 550
 551     my $reindex      = $force_reindex || $indextime < $modtime;
 552     $self->{offsets} = $self->_open_index($index,$reindex) or return;
 553
 554     # no indexing needed
 555     return $self->{offsets} unless $reindex;
 556
 557     # otherwise reindex contents of changed files
 558     $self->{indexing} = $index;
 559     foreach (@files) {
 560         next if( defined $indextime && $modtime{$_} <= $indextime);
 561         $self->calculate_offsets($_,$self->{offsets});
 562     }
 563     delete $self->{indexing};
 564
 565     # we've been having troubles with corrupted index files on Windows systems,
 566     # so possibly closing and reopening will help
 567     $self->_close_index($self->{offsets});
 568
 569     return $self->{offsets}  = $self->_open_index($index);
 570 }
 571
 572 =head2 get_Qual_by_id
 573
 574  Title   : get_Qual_by_id
 575  Usage   : my $qual = $db->get_Qual_by_id($id)
 576  Function: Bio::DB::RandomAccessI method implemented
 577  Returns : Bio::PrimarySeqI object
 578  Args    : id
 579
 580 =cut
 581
 582 sub get_Qual_by_id {
 583     my ($self, $id) = @_;
 584     return unless exists $self->{offsets}{$id};
 585     return Bio::Seq::PrimaryQual::Qual->new($self,$id);
 586 }
 587
 588 =head2 set_pack_method
 589
 590  Title   : set_pack_method
 591  Usage   : $db->set_pack_method( @files )
 592  Function: Determines whether data packing uses 32 or 64 bit integers
 593  Returns : 1 for success
 594  Args    : one or more file paths
 595
 596 =cut
 597
 598 sub set_pack_method {
 599     my $self = shift;
 600     # Find the maximum file size:
 601     my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
 602     my $fourGB    = (2 ** 32) - 1;
 603
 604     if ($maxsize > $fourGB) {
 605         # At least one file exceeds 4Gb - we will need to use 64 bit ints
 606         $self->{packmeth}   = \&_packBig;
 607         $self->{unpackmeth} = \&_unpackBig;
 608     } else {
 609         $self->{packmeth}   = \&_pack;
 610         $self->{unpackmeth} = \&_unpack;
 611     }
 612     return 1;
 613 }
 614
 615 =head2 index_file
 616
 617  Title   : index_file
 618  Usage   : $db->index_file($filename)
 619  Function: (re)loads a quality score file and indexes quality score offsets in
 620            the file
 621  Returns : qual offsets in the file
 622  Args    : filename, boolean to force reloading a file
 623
 624 =cut
 625
 626 sub index_file {
 627     my ($self, $file, $force_reindex) = @_;
 628
 629     $self->set_pack_method( $file );
 630     my $index = $self->index_name($file);
 631     # if caller has requested reindexing, then unlink the index
 632     unlink $index if $force_reindex;
 633
 634     # get the modification time of the index
 635     my $indextime = (stat($index))[9] || 0;
 636     my $modtime   = (stat($file) )[9] || 0;
 637
 638     my $reindex = $force_reindex || $indextime < $modtime;
 639     my $offsets = $self->_open_index($index,$reindex) or return;
 640     $self->{offsets} = $offsets;
 641
 642     return $self->{offsets} unless $reindex;
 643
 644     $self->{indexing} = $index;
 645     $self->calculate_offsets($file,$offsets);
 646     delete $self->{indexing};
 647     return $self->{offsets};
 648 }
 649
 650 =head2 dbmargs
 651
 652  Title   : dbmargs
 653  Usage   : my @args = $db->dbmargs;
 654  Function: gets stored dbm arguments
 655  Returns : array
 656  Args    : none
 657
 658 =cut
 659
 660 sub dbmargs {
 661     my $self = shift;
 662     my $args = $self->{dbmargs} or return;
 663     return ref($args) eq 'ARRAY' ? @$args : $args;
 664 }
 665
 666 =head2 index_name
 667
 668  Title   : index_name
 669  Usage   : my $indexname = $db->index_name($path,$isdir);
 670  Function: returns the name of the index for a specific path
 671  Returns : string
 672  Args    : path to check, boolean if it is a dir
 673
 674 =cut
 675
 676 sub index_name {
 677     my $self  = shift;
 678     my ($path,$isdir) = @_;
 679     unless ($path) {
 680         my $dir = $self->{dirname} or return;
 681         return $self->index_name($dir,-d $dir);
 682     }
 683     return "$path/directory.index" if $isdir;
 684     return "$path.index";
 685 }
 686
 687 =head2 calculate_offsets
 688
 689  Title   : calculate_offsets
 690  Usage   : $db->calculate_offsets($filename,$offsets);
 691  Function: calculates the quality score offsets in a file based on ID
 692  Returns : offset hash for each file
 693  Args    : file to process, $offsets - hashref of id to offset storage
 694
 695 =cut
 696
 697 sub calculate_offsets {
 698     my $self = shift;
 699     my ($file,$offsets) = @_;
 700     my $base = $self->path2fileno(basename($file));
 701
 702     my $fh = IO::File->new($file) or $self->throw("Can't open $file: $!");
 703     binmode $fh;
 704     warn "Indexing $file\n" if $self->{debug};
 705     my ( $offset,$id, $linelength, $firstline, $count, $termination_length,
 706          $qual_lines, $last_line,  %offsets );
 707     my ( $l3_len, $l2_len, $l_len ) = ( 0, 0, 0 );
 708
 709     while (<$fh>) { # don't try this at home
 710         # account for crlf-terminated Windows files
 711         $termination_length ||= /\r\n$/ ? 2 : 1;
 712         if (/^>(\S+)/) {
 713             print STDERR "indexed $count quality scores...\n"
 714             if $self->{debug} && (++$count%1000) == 0;
 715             my $pos = tell($fh);
 716             if ($id) {
 717                 my $qualstrlength = $pos - $offset - length($_);
 718                 $qualstrlength -= $termination_length * $qual_lines;
 719                 $offsets->{$id} = &{$self->{packmeth}}(
 720                     $offset,
 721                     $qualstrlength,
 722                     $linelength,
 723                     $firstline,
 724                     $base
 725                 );
 726             }
 727             $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
 728             ($offset, $firstline, $linelength) = ($pos, length($_), 0);
 729             $self->_check_linelength($linelength);
 730             ($l3_len, $l2_len, $l_len) = (0, 0, 0);
 731             $qual_lines = 0;
 732         } else {
 733             $l3_len = $l2_len;
 734             $l2_len = $l_len;
 735             $l_len = length($_);
 736             # need to check every line :(
 737             if (DIE_ON_MISSMATCHED_LINES &&
 738                 $l3_len > 0 &&
 739                 $l2_len > 0 &&
 740                 $l3_len != $l2_len
 741             ) {
 742                 my $fap = substr($_, 0, 20)."..";
 743                 $self->throw("Each line of the qual entry must be the same ".
 744                 "length except the last. Line above #$. '$fap' is $l2_len != ".
 745                 "$l3_len chars.");
 746             }
 747             $linelength ||= length($_);
 748             $qual_lines++;
 749         }
 750         $last_line = $_;
 751     }
 752
 753     $self->_check_linelength($linelength);
 754     # deal with last entry
 755     if ($id) {
 756         my $pos = tell($fh);
 757         my $qualstrlength = $pos - $offset;
 758
 759         if ($linelength == 0) {
 760             $qualstrlength = 0;
 761         } else {
 762             if ($last_line !~ /\s$/) {
 763                 $qual_lines--;
 764             }
 765             $qualstrlength -= $termination_length * $qual_lines;
 766         }
 767         $offsets->{$id} = &{$self->{packmeth}}(
 768             $offset,
 769             $qualstrlength,
 770             $linelength,
 771             $firstline,
 772             $base
 773         );
 774     }
 775     $offsets->{__termination_length} = $termination_length;
 776     return \%offsets;
 777 }
 778
 779 =head2 get_all_ids
 780
 781  Title   : get_all_ids
 782  Usage   : my @ids = $db->get_all_ids
 783  Function: gets all the stored ids in all indexes
 784  Returns : list of ids
 785  Args    : none
 786
 787 =cut
 788
 789 sub get_all_ids  { grep {!/^__/} keys %{shift->{offsets}} }
 790
 791 sub offset {
 792     my ($self, $id) = @_;
 793     my $offset = $self->{offsets}{$id} or return;
 794     (&{$self->{unpackmeth}}($offset))[0];
 795 }
 796
 797 =head2 length
 798
 799  Title   : length
 800  Usage   : $qualdb->length($seqid);
 801  Function: gets the number of quality values in a quality score
 802  Returns : scalar
 803  Args    : ID of a quality score
 804
 805 =cut
 806
 807 sub length {
 808     # the NUMBER of quality values
 809     my ($self, $id) = @_;
 810     my $len = scalar(@{$self->subqual($id)});
 811     return $len;
 812 }
 813
 814 sub lengthstr {
 815     # the length of the quality STRING
 816     my ($self, $id) = @_;
 817     my $offset = $self->{offsets}{$id} or return;
 818     (&{$self->{unpackmeth}}($offset))[1];
 819 }
 820
 821 sub linelen {
 822     my ($self, $id) = @_;
 823     my $offset = $self->{offsets}{$id} or return;
 824     (&{$self->{unpackmeth}}($offset))[2];
 825 }
 826
 827 sub headerlen {
 828     my ($self, $id) = @_;
 829     my $offset = $self->{offsets}{$id} or return;
 830     (&{$self->{unpackmeth}}($offset))[3];
 831 }
 832
 833 sub path { shift->{dirname} }
 834
 835 sub header_offset {
 836     my ($self, $id) = @_;
 837     return unless $self->{offsets}{$id};
 838     return $self->offset($id) - $self->headerlen($id);
 839 }
 840
 841 sub file {
 842     my ($self, $id) = @_;
 843     my $offset = $self->{offsets}{$id} or return;
 844     $self->fileno2path((&{$self->{unpackmeth}}($offset))[4]);
 845 }
 846
 847 sub fileno2path {
 848     my ($self, $no) = @_;
 849     return $self->{offsets}{"__file_$no"};
 850 }
 851
 852 sub path2fileno {
 853     my ($self, $path) = @_;
 854     if ( !defined $self->{offsets}{"__path_$path"} ) {
 855         my $fileno  = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++);
 856         $self->{offsets}{"__file_$fileno"} = $path;
 857     }
 858     return $self->{offsets}{"__path_$path"}
 859 }
 860
 861 sub _check_linelength {
 862     my ($self, $linelength) = @_;
 863     return unless defined $linelength;
 864     $self->throw(
 865         "Each line of the qual file must be less than 65,536 characters.Line ".
 866         "$. is $linelength chars."
 867     ) if $linelength > 65535;
 868 }
 869
 870 =head2 subqual
 871
 872  Title   : subqual
 873  Usage   : my @qualarr = @{$qualdb->subqual($id,$start,$stop)};
 874  Function: returns a subqual of a quality score in the database
 875  Returns : subquality array reference
 876  Args    : id of quality score, starting quality value number, ending quality
 877            value number
 878
 879 =cut
 880
 881 sub subqual {
 882     my ($self, $id, $start, $stop) = @_;
 883
 884     # Quality values in a quality score can have 1 or 2 digits and are separated
 885     # by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
 886     # is no easy way match the position of a quality value to its position in
 887     # the quality string.
 888     # As a consequence, if a subqual of the quality is requested, we still need
 889     # to grab the full quality string first - performance penalty for big
 890     # quality scores :(
 891     # I think there is no way around starting at the begining of the quality
 892     # score but maybe there is a resource-efficient way of starting at the
 893     # begining of the quality score and stopping when the the position of the
 894     # last quality value requested is reached??
 895
 896     # position of the quality values
 897     if ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) {
 898         ($id, $start, $stop) = ($1,$2,$3);
 899         $start =~ s/_//g;
 900         $stop  =~ s/_//g;
 901     }
 902
 903     # position in quality string
 904     my $string_start = 1;
 905     my $string_stop = $self->lengthstr($id);
 906
 907     # fetch full quality string
 908     my $fh = $self->fh($id) or return;
 909     my $filestart = $self->caloffset($id, $string_start);
 910     my $filestop  = $self->caloffset($id, $string_stop);
 911     seek($fh,$filestart,0);
 912     my $data;
 913     read($fh, $data, $filestop-$filestart+1);
 914
 915     # process quality score
 916     $data =~ s/\n//g;
 917     $data =~ s/\r//g;
 918     my $reverse = 0;
 919     if ($stop && $start && $stop < $start) {
 920         $reverse = 1;
 921         my $tmp = $start;
 922         $start = $stop;
 923         $stop = $tmp;
 924     }
 925     my $subqual = 0;
 926     $subqual = 1 if ( $start || $stop );
 927     my @data;
 928     if ( $subqual || $reverse ) {
 929         @data = split / /, $data, $stop+1;
 930         my $length = scalar(@data);
 931         $start = 1       if $start < 1;
 932         $stop  = $length if $stop  > $length;
 933         pop @data if ($stop != $length);
 934         splice @data, 0, $start-1;
 935         @data = reverse(@data) if $reverse;
 936         $data = join ' ', @data;
 937     } else {
 938         @data = split / /, $data;
 939     }
 940
 941     return \@data;
 942 }
 943
 944 sub fh {
 945     my ($self, $id) = @_;
 946     my $file = $self->file($id) or return;
 947     $self->fhcache("$self->{dirname}/$file") ||
 948         $self->throw("Can't open file $file");
 949 }
 950
 951 =head2 header
 952
 953  Title   : header
 954  Usage   : $qualdb->header($id);
 955  Function: returns the header of a quality score in the database
 956  Returns : header string
 957  Args    : id of quality score
 958
 959 =cut
 960
 961 sub header {
 962     my ($self, $id) = @_;
 963     my ($offset,$seqlength,$linelength,$firstline,$file)
 964         = &{$self->{unpackmeth}}($self->{offsets}{$id}) or return;
 965     $offset -= $firstline;
 966     my $data;
 967     my $fh = $self->fh($id) or return;
 968     seek($fh,$offset,0);
 969     read($fh,$data,$firstline);
 970     chomp $data;
 971     substr($data,0,1) = '';
 972     $data;
 973 }
 974
 975 sub caloffset {
 976     my ($self, $id, $a) = @_;
 977     $a--;
 978     my ($offset,$seqlength,$linelength,$firstline,$file)
 979         = &{$self->{unpackmeth}}($self->{offsets}{$id});
 980     $a = 0            if $a < 0;
 981     $a = $seqlength-1 if $a >= $seqlength;
 982     my $tl = $self->{offsets}{__termination_length};
 983     $offset + $linelength * int($a/($linelength-$tl)) + $a % ($linelength-$tl);
 984 }
 985
 986 sub fhcache {
 987     my ($self, $path) = @_;
 988     if (!$self->{fhcache}{$path}) {
 989         if ($self->{curopen} >= $self->{maxopen}) {
 990             my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
 991                 keys %{$self->{fhcache}};
 992             splice(@lru, $self->{maxopen} / 3);
 993             $self->{curopen} -= @lru;
 994             for (@lru) {
 995                 delete $self->{fhcache}{$_};
 996             }
 997         }
 998         $self->{fhcache}{$path} = IO::File->new($path) || return;
 999         binmode $self->{fhcache}{$path};
1000         $self->{curopen}++;
1001     }
1002     $self->{cacheseq}{$path}++;
1003     $self->{fhcache}{$path}
1004 }
1005
1006 sub _pack {
1007     pack STRUCT, @_;
1008 }
1009
1010 sub _packBig {
1011     pack STRUCTBIG, @_;
1012 }
1013
1014 sub _unpack {
1015     unpack STRUCT, shift;
1016 }
1017
1018 sub _unpackBig {
1019     unpack STRUCTBIG, shift;
1020 }
1021
1022 =head2 get_PrimaryQual_stream
1023
1024  Title   : get_PrimaryQual_stream
1025  Usage   : $qualdb->get_PrimaryQual_stream
1026  Function: get a SeqIO-like stream of quality scores
1027  Returns : stream object
1028  Args    : none
1029
1030 =cut
1031
1032 sub get_PrimaryQual_stream {
1033     my $self = shift;
1034     return Bio::DB::Qual::Stream->new($self);
1035 }
1036
1037 sub TIEHASH {
1038     my $self = shift;
1039     return $self->new(@_);
1040 }
1041
1042 sub FETCH {
1043     shift->subqual(@_);
1044 }
1045
1046 sub STORE {
1047     shift->throw("Read-only database");
1048 }
1049
1050 sub DELETE {
1051     shift->throw("Read-only database");
1052 }
1053
1054 sub CLEAR {
1055     shift->throw("Read-only database");
1056 }
1057
1058 sub EXISTS {
1059     defined shift->offset(@_);
1060 }
1061
1062 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
1063
1064 sub NEXTKEY  { tied(%{shift->{offsets}})->NEXTKEY(@_);  }
1065
1066 sub DESTROY {
1067     my $self = shift;
1068     if ($self->{indexing}) {  # killed prematurely, so index file is no good!
1069         warn "indexing was interrupted, so deleting $self->{indexing}";
1070         unlink $self->{indexing};
1071     }
1072 }
1073
1074 #-------------------------------------------------------------
1075 # Bio::Seq::PrimaryQual compatibility
1076 #
1077 # Usage is the same as in Bio::Seq::PrimaryQual
1078
1079 package Bio::Seq::PrimaryQual::Qual;
1080 use overload '""' => 'display_id';
1081
1082 use base qw(Bio::Root::Root Bio::Seq::PrimaryQual);
1083
1084 sub new {
1085     my ($class, @args) = @_;
1086     my $self = $class->SUPER::new(@args);
1087     my ($db, $id, $start, $stop) = $self->_rearrange(
1088                                  [qw(DATABASE ID START STOP)],
1089                                  @args);
1090     $self->{db}    = $db;
1091     $self->{id}    = $id;
1092     $self->{start} = $start || 1;
1093     $self->{stop}  = $stop  || $db->length($id);
1094     return $self;
1095 }
1096
1097 sub qual {
1098     my $self = shift;
1099     my $qual = $self->{db}->qual($self->{id},$self->{start},$self->{stop});
1100     return $qual;
1101 }
1102
1103 sub subqual {
1104     my ($self, $start, $stop) = @_;
1105     return $self->trunc($start, $stop)->qual;
1106 }
1107
1108 sub trunc {
1109     my ($self, $start, $stop) = @_;
1110     $self->throw(
1111         "$stop is smaller than $stop. If you want to truncate and reverse ".
1112         "complement, you must call trunc followed by revcom."
1113     ) if $start > $stop;
1114     my ($left, $right);
1115     if ($self->{start} <= $self->{stop}) {
1116         $left = $self->{start}+$start-1;
1117         $right  = $self->{start}+$stop-1;
1118     } else {
1119         $left = $self->{start}-($start-1);
1120         $right  = $self->{start}-($stop-1);
1121     }
1122     my $obj = $self->new( -database => $self->{db},
1123                           -id       => $self->{id},
1124                           -start    => $left,
1125                           -stop     => $right
1126                         );
1127     return $obj;
1128 }
1129
1130 sub display_id {
1131     my $self = shift;
1132     return $self->{id};
1133 }
1134
1135 sub primary_id {
1136     my $self = shift;
1137     return overload::StrVal($self);
1138 }
1139
1140 sub length {
1141     # number of quality scores
1142     my $self = shift;
1143     return scalar(@{$self->qual});
1144 }
1145
1146 sub description  {
1147     my $self = shift;
1148     my $header = $self->{'db'}->header($self->{id});
1149     # remove the id from the header
1150     $header = (split(/\s+/,$header,2))[2];
1151     return $header;
1152 }
1153 *desc = \&description;
1154
1155 #-------------------------------------------------------------
1156 # stream-based access to the database
1157 #
1158
1159 package Bio::DB::Qual::Stream;
1160 use base qw(Tie::Handle Bio::DB::SeqI);
1161
1162 sub new {
1163     my ($class, $db) = @_;
1164     my $key = $db->FIRSTKEY;
1165     return bless {
1166         db  => $db,
1167         key => $key
1168     }, $class;
1169 }
1170
1171 sub next_seq {
1172     my $self = shift;
1173     my ($key, $db) = @{$self}{'key', 'db'};
1174     while ($key =~ /^__/) {
1175         $key = $db->NEXTKEY($key);
1176         return unless defined $key;
1177     }
1178     my $value = $db->get_Qual_by_id($key);
1179     $self->{key} = $db->NEXTKEY($key);
1180     $value;
1181 }
1182
1183 sub TIEHANDLE {
1184     my ($class, $db) = @_;
1185     return $class->new($db);
1186 }
1187
1188 sub READLINE {
1189     my $self = shift;
1190     $self->next_seq;
1191 }
1192
1193 1;
1194
1195 __END__
1196