Bio/DB/Qual.pm

   1 #
   2 # BioPerl module for Bio::DB::Qual
   3 #
   4 # You may distribute this module under the same terms as perl itself
   5 #
   6
   7 =head1 NAME
   8
   9 Bio::DB::Qual - Fast indexed access to quality files
  10
  11 =head1 SYNOPSIS
  12
  13   use Bio::DB::Qual;
  14
  15   # create database from directory of qual files
  16   my $db      = Bio::DB::Qual->new('/path/to/qual/files/');
  17   my @ids     = $db->get_all_primary_ids;
  18
  19   # Simple access
  20   my @qualarr = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
  21   my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
  22   my $length  = $db->length('CHROMOSOME_I');
  23   my $header  = $db->header('CHROMOSOME_I');
  24
  25   # Access to sequence objects. See Bio::PrimarySeqI.
  26   my $obj     = $db->get_Qual_by_id('CHROMOSOME_I');
  27   my @qual    = @{$obj->qual};
  28   my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
  29   my $length  = $obj->length;
  30
  31   # Loop through sequence objects
  32   my $stream  = $db->get_PrimarySeq_stream;
  33   while (my $qual = $stream->next_seq) {
  34     # Bio::Seq::PrimaryQual operations
  35   }
  36
  37   # Filehandle access
  38   my $fh = Bio::DB::Qual->newFh('/path/to/qual/files/');
  39   while (my $qual = <$fh>) {
  40     # Bio::Seq::PrimaryQual operations
  41   }
  42
  43   # Tied hash access
  44   tie %qualities,'Bio::DB::Qual','/path/to/qual/files/';
  45   print $qualities{'CHROMOSOME_I:1,20000'};
  46
  47 =head1 DESCRIPTION
  48
  49 Bio::DB::Qual provides indexed access to a single Fasta file, several files,
  50 or a directory of files. It provides random access to each quality score entry
  51 without having to read the file from the beginning. Access to subqualities
  52 (portions of a quality score) is provided, although contrary to Bio::DB::Fasta,
  53 the full quality score has to be brought in memory. Bio::DB::Qual is based on
  54 Bio::DB::IndexedBase. See this module's documentation for details.
  55
  56 The qual files should contain decimal quality scores. Entries may have any line
  57 length up to 65,536 characters, and different line lengths are allowed in the
  58 same file. However, within a quality score entry, all lines must be the same
  59 length except for the last. An error will be thrown if this is not the case.
  60
  61 The module uses /^E<gt>(\S+)/ to extract the primary ID of each quality score
  62 from the qual header. See -makeid in Bio::DB::IndexedBase to pass a callback
  63 routine to reversibly modify this primary ID, e.g. if you wish to extract a
  64 specific portion of the gi|gb|abc|xyz GenBank IDs.
  65
  66 =head1 DATABASE CREATION AND INDEXING
  67
  68 The object-oriented constructor is new(), the filehandle constructor is newFh()
  69 and the tied hash constructor is tie(). They all allow one to index a single Fasta
  70 file, several files, or a directory of files. See Bio::DB::IndexedBase.
  71
  72 =head1 SEE ALSO
  73
  74 L<Bio::DB::IndexedBase>
  75
  76 L<Bio::DB::Fasta>
  77
  78 L<Bio::Seq::PrimaryQual>
  79
  80 =head1 LIMITATIONS
  81
  82 When a quality score is deleted from one of the qual files, this deletion is not
  83 detected by the module and removed from the index. As a result, a "ghost" entry
  84 will remain in the index and will return garbage results if accessed. Currently,
  85 the only way to accommodate deletions is to rebuild the entire index, either by
  86 deleting it manually, or by passing -reindex=E<gt>1 to new() when
  87 initializing the module.
  88
  89 All quality score lines for a given quality score must have the same length
  90 except for the last (not sure why there is this limitation). This is not
  91 problematic for sequences but could be annoying for quality scores. A workaround
  92 is to make sure that your quality scores fit on no more than 2 lines. Another
  93 solution could be to padd them with blank spaces so that each line has the same
  94 number of characters (maybe this padding should be implemented in
  95 Bio::SeqIO::qual?).
  96
  97 =head1 AUTHOR
  98
  99 Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
 100
 101 Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
 102
 103 Copyright (c) 2007 Florent E Angly.
 104
 105 This library is free software; you can redistribute it and/or modify
 106 it under the same terms as Perl itself.
 107
 108 =head1 APPENDIX
 109
 110 The rest of the documentation details each of the object
 111 methods. Internal methods are usually preceded with a _
 112
 113 For BioPerl-style access, the following methods are provided:
 114
 115 =head2 get_Seq_by_id
 116
 117  Title   : get_Seq_by_id,  get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id,
 118            get_Qual_by_id, get_qual_by_acc, get_qual_by_version, get_qual_by_primary_id,
 119  Usage   : my $seq = $db->get_Seq_by_id($id);
 120  Function: Given an ID, fetch the corresponding sequence from the database.
 121  Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
 122            Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
 123            only load the sequence string into memory when requested using seq().
 124            See L<Bio::PrimarySeqI> for methods provided by the sequence objects
 125            returned from get_Seq_by_id() and get_PrimarySeq_stream().
 126  Args    : ID
 127
 128 =head2 get_PrimarySeq_stream
 129
 130  Title   : get_Seq_stream, get_PrimarySeq_stream
 131  Usage   : my $stream = $db->get_Seq_stream();
 132  Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
 133            single method, next_seq(). Each call to next_seq() returns a new
 134            Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
 135  Returns : A Bio::DB::Indexed::Stream object
 136  Args    : None
 137
 138 =head1
 139
 140 For simple access, the following methods are provided:
 141
 142 =cut
 143
 144
 145 package Bio::DB::Qual;
 146
 147 use strict;
 148 use IO::File;
 149 use File::Spec;
 150
 151 use base qw(Bio::DB::IndexedBase);
 152
 153 our $obj_class = 'Bio::Seq::PrimaryQual::Qual';
 154 our $file_glob = '*.{qual,QUAL,qa,QA}';
 155
 156
 157 =head2 new
 158
 159  Title   : new
 160  Usage   : my $db = Bio::DB::Qual->new( $path, %options);
 161  Function: Initialize a new database object. When indexing a directory, files
 162            ending in .qual,qa are indexed by default.
 163  Returns : A new Bio::DB::Qual object
 164  Args    : A single file, or path to dir, or arrayref of files
 165            Optional arguments: see Bio::DB::IndexedBase
 166
 167 =cut
 168
 169
 170 sub _calculate_offsets {
 171     # Bio::DB::IndexedBase calls this to calculate offsets
 172     my ($self, $fileno, $file, $offsets) = @_;
 173
 174     my $fh = IO::File->new($file) or $self->throw("Could not open $file: $!");
 175     binmode $fh;
 176     warn "Indexing $file\n" if $self->{debug};
 177     my ($offset, @ids, $linelen, $headerlen, $count, $qual_lines, $last_line,
 178         $numres, %offsets);
 179     my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
 180
 181     my $termination_length = $self->{termination_length};
 182     while (my $line = <$fh>) {
 183         # Account for crlf-terminated Windows files
 184         if (index($line, '>') == 0) {
 185             if ($line =~ /^>(\S+)/) {
 186                 print STDERR "Indexed $count quality scores...\n"
 187                     if $self->{debug} && (++$count%1000) == 0;
 188                 $self->_check_linelength($linelen);
 189                 my $pos = tell($fh);
 190                 if (@ids) {
 191                     my $strlen = $pos - $offset - length($line);
 192                     $strlen   -= $termination_length * $qual_lines;
 193                     my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres,
 194                         $linelen, $headerlen, Bio::DB::IndexedBase::NA, $fileno);
 195                     for my $id (@ids) {
 196                         $offsets->{$id} = $ppos;
 197                     }
 198                     $numres = 0;
 199                 }
 200                 @ids = $self->_makeid($line);
 201                 ($offset, $headerlen, $linelen, $qual_lines) = ($pos, length $line, 0, 0);
 202                 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
 203             } else {
 204                 # Catch bad header lines, bug 3172
 205                 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
 206             }
 207         } elsif ($line !~ /\S/) {
 208             # Skip blank line
 209             $blank_lines++;
 210             next;
 211         } else {
 212             # Need to check every line :(
 213             $l3_len = $l2_len;
 214             $l2_len = $l_len;
 215             $l_len  = length $line;
 216             if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
 217                 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
 218                     my $fap = substr($line, 0, 20)."..";
 219                     $self->throw("Each line of the qual entry must be the same ".
 220                         "length except the last. Line above #$. '$fap' is $l2_len".
 221                         " != $l3_len chars.");
 222                 }
 223                 if ($blank_lines) {
 224                     # Blank lines not allowed in entry
 225                     $self->throw("Blank lines can only precede header lines, ".
 226                         "found preceding line #$.");
 227                 }
 228             }
 229             $linelen ||= length $line;
 230             $qual_lines++;
 231             $numres += scalar(split /\s+/, $line);
 232         }
 233         $last_line = $line;
 234     }
 235
 236     # Process last entry
 237     $self->_check_linelength($linelen);
 238     my $pos = tell($fh);
 239     if (@ids) {
 240         my $strlen = $pos - $offset;
 241         if ($linelen == 0) {
 242             $strlen = 0;
 243         } else {
 244             if ($last_line !~ /\s$/) {
 245                 $qual_lines--;
 246             }
 247             $strlen -= $termination_length * $qual_lines;
 248         }
 249         my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres, $linelen,
 250             $headerlen, Bio::DB::IndexedBase::NA, $fileno);
 251         for my $id (@ids) {
 252             $offsets->{$id} = $ppos;
 253         }
 254     }
 255
 256     return \%offsets;
 257 }
 258
 259
 260 # for backward compatibility
 261 sub get_PrimaryQual_stream {
 262    my $self = shift;
 263    return $self->get_PrimarySeq_stream;
 264 }
 265
 266
 267 # for backward compatibility
 268 sub get_Qual_by_id {
 269     my ($self, $id) = @_;
 270     return $self->get_Seq_by_id($id);
 271 }
 272
 273 *get_qual_by_version = *get_qual_by_primary_id = *get_qual_by_acc = \&get_Qual_by_id;
 274
 275
 276 =head2 qual
 277
 278  Title   : qual, quality, subqual
 279  Usage   : # All quality scores
 280            my @qualarr = @{$qualdb->subqual($id)};
 281            # Subset of the quality scores
 282            my @subqualarr = @{$qualdb->subqual($id, $start, $stop, $strand)};
 283            # or...
 284            my @subqualarr = @{$qualdb->subqual($compound_id)};
 285  Function: Get a subqual of an entry in the database. For your convenience,
 286            the sequence to extract can be specified with any of the following
 287            compound IDs:
 288               $db->qual("$id:$start,$stop")
 289               $db->qual("$id:$start..$stop")
 290               $db->qual("$id:$start-$stop")
 291               $db->qual("$id:$start,$stop/$strand")
 292               $db->qual("$id:$start..$stop/$strand")
 293               $db->qual("$id:$start-$stop/$strand")
 294               $db->qual("$id/$strand")
 295            If $stop is less than $start, then the reverse complement of the
 296            sequence is returned. Avoid using it if possible since this goes
 297            against Bio::Seq conventions.
 298  Returns : Reference to an array of quality scores
 299  Args    : Compound ID of entry to retrieve
 300              or
 301            ID, optional start (defaults to 1), optional end (defaults to the
 302            number of quality scores for this sequence), and strand (defaults to
 303            1).
 304
 305 =cut
 306
 307 sub subqual {
 308     my ($self, $id, $start, $stop, $strand) = @_;
 309
 310     # Quality values in a quality score can have 1 or 2 digits and are separated
 311     # by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
 312     # is no easy way match the position of a quality value to its position in
 313     # the quality string.
 314     # As a consequence, if a subqual of the quality is requested, we still need
 315     # to grab the full quality string first - performance penalty for big
 316     # quality scores :(
 317     # I think there is no way around starting at the begining of the quality
 318     # score but maybe there is a resource-efficient way of starting at the
 319     # begining of the quality score and stopping when the the position of the
 320     # last quality value requested is reached??
 321
 322     $self->throw('Need to provide a sequence ID') if not defined $id;
 323     ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
 324
 325     # Position in quality string
 326     my $string_start = 1;
 327     my $string_stop = $self->strlen($id);
 328
 329     # Fetch full quality string
 330     my $fh = $self->_fh($id) or return;
 331     my $filestart = $self->_calc_offset($id, $string_start);
 332     my $filestop  = $self->_calc_offset($id, $string_stop );
 333     seek($fh, $filestart,0);
 334     my $data;
 335     read($fh, $data, $filestop-$filestart+1);
 336
 337     # Process quality score
 338     Bio::DB::IndexedBase::_strip_crnl($data);
 339     my $subqual = 0;
 340     $subqual = 1 if ( $start || $stop );
 341     my @data;
 342     if ( $subqual || ($strand == -1) ) {
 343         @data = split / /, $data, $stop+1;
 344         my $length = scalar(@data);
 345         $start = 1       if $start < 1;
 346         $stop  = $length if $stop  > $length;
 347         pop @data if ($stop != $length);
 348         splice @data, 0, $start-1;
 349         @data = reverse(@data) if $strand == -1;
 350         $data = join ' ', @data;
 351     } else {
 352         @data = split / /, $data;
 353     }
 354
 355     return \@data;
 356 }
 357
 358 *qual = *quality = \&subqual;
 359
 360
 361 =head2 header
 362
 363  Title   : header
 364  Usage   : my $header = $db->header($id);
 365  Function: Get the header line (ID and description fields) of the specified entry.
 366  Returns : String
 367  Args    : ID of entry
 368
 369 =cut
 370
 371 sub header {
 372     my ($self, $id) = @_;
 373     $self->throw('Need to provide a sequence ID') if not defined $id;
 374     my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
 375     $offset -= $headerlen;
 376     my $data;
 377     my $fh = $self->_fh($id) or return;
 378     seek($fh, $offset, 0);
 379     read($fh, $data, $headerlen);
 380     # On Windows chomp remove '\n' but leaves '\r'
 381     # when reading '\r\n' in binary mode,
 382     # _strip_crnl removes both
 383     $data = Bio::DB::IndexedBase::_strip_crnl($data);
 384     substr($data, 0, 1) = '';
 385     return $data;
 386 }
 387
 388
 389 #-------------------------------------------------------------
 390 # Tied hash overrides
 391 #
 392
 393 sub FETCH {
 394     return shift->subqual(@_);
 395 }
 396
 397
 398 #-------------------------------------------------------------
 399 # Bio::Seq::PrimaryQual compatibility
 400 #
 401 # Usage is the same as in Bio::Seq::PrimaryQual
 402
 403 package Bio::Seq::PrimaryQual::Qual;
 404 use overload '""' => 'display_id';
 405
 406 use base qw(Bio::Root::Root Bio::Seq::PrimaryQual);
 407
 408 sub new {
 409     my ($class, @args) = @_;
 410     my $self = $class->SUPER::new(@args);
 411     my ($db, $id, $start, $stop) = $self->_rearrange(
 412                                  [qw(DATABASE ID START STOP)],
 413                                  @args);
 414     $self->{db}    = $db;
 415     $self->{id}    = $id;
 416     $self->{stop}  = $stop  || $db->length($id);
 417     $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
 418     return $self;
 419 }
 420
 421
 422 sub qual {
 423     my $self = shift;
 424     my $qual = $self->{db}->qual($self->{id}, $self->{start}, $self->{stop});
 425     return $qual;
 426 }
 427
 428
 429 sub subqual {
 430     my ($self, $start, $stop) = @_;
 431     return $self->trunc($start, $stop)->qual;
 432 }
 433
 434
 435 sub trunc {
 436     # Override Bio::Seq::QualI trunc() method. This way, we create an object
 437     # that does not store the quality array in memory.
 438     my ($self, $start, $stop) = @_;
 439     $self->throw(
 440         "$stop is smaller than $stop. If you want to truncate and reverse ".
 441         "complement, you must call trunc followed by revcom."
 442     ) if $start > $stop;
 443     if ($self->{start} <= $self->{stop}) {
 444         $start = $self->{start}+$start-1;
 445         $stop  = $self->{start}+$stop-1;
 446     } else {
 447         $start = $self->{start}-($start-1);
 448         $stop  = $self->{start}-($stop-1);
 449     }
 450     my $obj = $self->new( -database => $self->{db},
 451                           -id       => $self->{id},
 452                           -start    => $start,
 453                           -stop     => $stop
 454                         );
 455     return $obj;
 456 }
 457
 458
 459 sub display_id {
 460     my $self = shift;
 461     return $self->{id};
 462 }
 463
 464
 465 sub primary_id {
 466     my $self = shift;
 467     return overload::StrVal($self);
 468 }
 469
 470 sub revcom {
 471     # Override Bio::QualI revcom() with optimized method.
 472     my $self = shift;
 473     return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
 474 }
 475
 476 sub length {
 477     # Get length from quality location, not the quality array (too expensive)
 478     my $self = shift;
 479     return $self->{start} < $self->{stop}   ?
 480         $self->{stop}  - $self->{start} + 1 :
 481         $self->{start} - $self->{stop}  + 1 ;
 482 }
 483
 484
 485 sub description  {
 486     my $self = shift;
 487     my $header = $self->{'db'}->header($self->{id});
 488     # remove the id from the header
 489     $header = (split(/\s+/, $header, 2))[2];
 490     return $header;
 491 }
 492 *desc = \&description;
 493
 494
 495 1;