fix spelling errors, fixes #3228
[bioperl-live.git] / Bio / DB / Qual.pm
blobfeeed7187c5c0cefd2fb69b86ce38865b5763326
3 # BioPerl module for Bio::DB::Qual
5 # You may distribute this module under the same terms as perl itself
8 # POD documentation - main docs before the code
10 =head1 NAME
12 Bio::DB::Qual -- Fast indexed access to a directory of quality files
14 =head1 SYNOPSIS
16 use Bio::DB::Qual;
18 # create database from directory of qual files
19 my $db = Bio::DB::Qual->new('/path/to/qual/files');
20 my @ids = $db->ids;
22 # simple access (for those without Bioperl)
23 my @qual = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
24 my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
25 my $length = $db->length('CHROMOSOME_I');
26 my $header = $db->header('CHROMOSOME_I');
28 # Bioperl-style access
29 my $obj = $db->get_Qual_by_id('CHROMOSOME_I');
30 my @qual = @{$obj->qual};
31 my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
32 my $length = $obj->length;
33 # (etc)
35 # Bio::SeqIO-style access
36 my $stream = $db->get_PrimaryQual_stream;
37 while (my $qual = $stream->next_seq) {
38 # Bio::Seq::PrimaryQual operations
41 my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
42 while (my $qual = <$fh>) {
43 # Bio::Seq::PrimaryQual operations
46 # tied hash access
47 tie %qualities,'Bio::DB::Qual','/path/to/qual/files';
48 print $qualities{'CHROMOSOME_I:1,20000'};
50 =head1 DESCRIPTION
52 Bio::DB::Qual provides indexed access to one or more qual files. It provides
53 random access to each quality score entry without having to read the file from
54 the beginning. Access to subqualities (portions of a quality score) is provided,
55 although contrary to Bio::DB::Fasta, the full quality score has to be brought in
56 memory.
58 When you initialize the module, you point it at a single qual file or a
59 directory of multiple such files. The first time it is run, the module generates
60 an index of the contents of the file or directory using the AnyDBM module
61 (Berkeley DB* preferred, followed by GDBM_File, NDBM_File, and SDBM_File).
62 Thereafter it uses the index file to find the file and offset for any requested
63 quality score. If one of the source qual files is updated, the module reindexes
64 just that one file. (You can also force reindexing manually). For improved
65 performance, the module keeps a cache of open filehandles, closing less-recently
66 used ones when the cache is full.
68 The qual files may contain decimal quality scores. Entries may have any line
69 length up to 65,536 characters, and different line lengths are allowed in the
70 same file. However, within a quality score entry, all lines must be the same
71 length except for the last. An error will be thrown if this is not the case.
73 The module uses /^E<gt>(\S+)/ to extract the primary ID of each quality score
74 from the qual header. During indexing, you may pass a callback routine to modify
75 this primary ID. For example, you may wish to extract a portion of the
76 gi|gb|abc|xyz prefixes that are commonly used. The original header line can be
77 recovered later.
79 *Berkeley DB can be obtained free from www.sleepycat.com. After it is installed
80 you will need to install the BerkeleyDB Perl module.
82 =head1 DATABASE CREATION AND INDEXING
84 The two constructors for this class are new() and newFh(). The former creates a
85 Bio::DB::Qual object which is accessed via method calls. The latter creates a
86 tied filehandle which can be used Bio::SeqIO-style to fetch quality score
87 objects in a data stream. There is also a tied hash interface.
89 =over 2
91 =item $db = Bio::DB::Qual-E<gt>new($qual_path [,%options])
93 Create a new Bio::DB::Qual object from the Qual file or files indicated by
94 $qual_path. Indexing will be performed automatically if needed. If successful,
95 new() will return the database accessor object. Otherwise it will return undef.
97 $qual_path may be an individual qual file, or may refer to a directory
98 containing one or more of such files. Following the path, you may pass a series
99 of name=E<gt>value options or a hash with these same name=E<gt>value pairs.
100 Valid options are:
102 Option Name Description Default
103 ----------- ----------- -------
105 -glob Glob expression to use *.{qa,QA,qual,QUAL}
106 for searching for qual
107 files in directories.
109 -makeid A code subroutine for None
110 transforming qual IDs.
112 -maxopen Maximum size of 32
113 filehandle cache.
115 -debug Turn on status 0
116 messages.
118 -reindex Force the index to be 0
119 rebuilt.
121 -dbmargs Additional arguments none
122 to pass to the DBM
123 routines when tied
124 (scalar or array ref).
126 -dbmargs can be used to control the format of the index. For example, you can
127 pass $DB_BTREE to this argument so as to force the IDs to be sorted and
128 retrieved alphabetically. Note that you must use the same arguments every time
129 you open the index!
131 -reindex can be used to force the index to be recreated from scratch.
133 =item $fh = Bio::DB::Qual-E<gt>newFh($qual_path [,%options])
135 Create a tied filehandle opened on a Bio::DB::Qual object. Reading from this
136 filehandle with E<lt>E<gt> will return a stream of quality objects,
137 Bio::SeqIO-style.
139 =back
141 The -makeid option gives you a chance to modify quality score IDs during
142 indexing. The option value should be a code reference that will take a scalar
143 argument and return a scalar result, like this:
145 $db = Bio::DB::Qual->new("file.qual",-makeid=>\&make_my_id);
147 sub make_my_id {
148 my $description_line = shift;
149 # get a different id from the quality header, e.g.
150 $description_line =~ /(\S+)$/;
151 return $1;
154 make_my_id() will be called with the full qual id line (including the "E<gt>"
155 symbol!). For example:
157 >A12345.3 Predicted C. elegans protein egl-2
159 By default, this module will use the regular expression /^E<gt>(\S+)/ to extract
160 "A12345.3" for use as the ID.If you pass a -makeid callback, you can extract any
161 portion of this, such as the "egl-2" symbol.
163 The -makeid option is ignored after the index is constructed.
165 =head1 OBJECT METHODS
167 The following object methods are provided.
169 =over 10
171 =item $raw_qual = $db-E<gt>qual($id [,$start, $stop])
173 Return a quality score array reference given an ID and optionally a start and
174 stop position (the quality value number) in the quality score. If $stop is less
175 than $start, then the reverse complement of the quality score is returned (this
176 violates Bio::Seq conventions).
178 For your convenience, subqualities can be indicated with any of the following
179 compound IDs:
181 $db->qual("$id:$start,$stop")
183 $db->qual("$id:$start..$stop")
185 $db->qual("$id:$start-$stop")
187 =item $length = $db-E<gt>length($id)
189 Return the length of the indicated quality score, i.e. the number of quality
190 values.
192 =item $header = $db-E<gt>header($id)
194 Return the header line for the ID, including the initial "E<gt>".
196 =item $filename = $db-E<gt>file($id)
198 Return the name of the file in which the indicated quality score can be found.
200 =item $offset = $db-E<gt>offset($id)
202 Return the offset of the indicated quality score from the beginning of the file
203 in which it is located. The offset points to the beginning of the quality
204 score, not the beginning of the header line.
206 =item $header_length = $db-E<gt>headerlen($id)
208 Return the length of the header line for the indicated quality score.
210 =item $header_offset = $db-E<gt>header_offset($id)
212 Return the offset of the header line for the indicated quality score from the
213 beginning of the file in which it is located.
215 =item $index_name = $db-E<gt>index_name
217 Return the path to the index file.
219 =item $path = $db-E<gt>path
221 Return the path to the Qual file(s).
223 =back
225 For BioPerl-style access, the following methods are provided:
227 =over 4
229 =item $qual = $db-E<gt>get_Qual_by_id($id)
231 Return a Bio::Seq::PrimaryQual object, which obeys the Bio::PrimarySeqI
232 conventions. To recover the quality score, call $qual-E<gt>qual().
234 Note that get_Qual_by_id() does not bring the entire quality score into memory
235 until requested. Internally, the returned object uses the accessor to generate
236 subqualities as needed.
238 =item $qual = $db-E<gt>get_Qual_by_acc($id)
240 =item $qual = $db-E<gt>get_Qual_by_primary_id($id)
242 These methods all do the same thing as get_Qual_by_id().
244 =item $stream = $db-E<gt>get_PrimaryQual_stream()
246 Return a Bio::DB::Qual::Stream object, which supports a single method
247 next_seq(). Each call to next_seq() returns a new Bio::Seq::PrimaryQual object,
248 until no more quality scores remain.
250 =back
252 See L<Bio::Seq::PrimaryQual> and L<Bio::PrimarySeqI> for methods provided by the
253 quality objects returned from get_Qual_by_id() and get_PrimaryQual_stream().
255 =head1 TIED INTERFACES
257 This module provides two tied interfaces, one which allows you to treat the
258 quality score database as a hash, and the other which allows you to treat the
259 database as an I/O stream.
261 =head2 Creating a Tied Hash
263 The tied hash interface is very straightforward.
265 =over 1
267 =item $obj = tie %db,'Bio::DB::Qual','/path/to/qual/files' [,@args]
269 Tie %db to Bio::DB::Qual using the indicated path to the Qual files. The
270 optional @args list is the same set of named argument/value pairs used by
271 Bio::DB::Qual-E<gt>new().
273 If successful, tie() will return the tied object. Otherwise it will return
274 undef.
276 =back
278 Once tied, you can use the hash to retrieve an individual quality score by its
279 ID, like this:
281 my $qual = $db{CHROMOSOME_I};
283 You may select a subquality by appending the comma-separated range to the
284 quality score ID in the format "$id:$start,$stop". For example, here is the
285 first 1000 quality values of the quality score with ID "CHROMOSOME_I":
287 my $qual = $db{'CHROMOSOME_I:1,1000'};
289 (The regular expression used to parse this format allows quality score IDs to
290 contain colons.)
292 When selecting subqualities, if $start E<gt> stop, then the reverse complement
293 will be returned.
295 The keys() and values() functions will return the IDs and their quality scores,
296 respectively. In addition, each() can be used to iterate over the entire data
297 set:
299 while (my ($id,$quality) = each %db) {
300 print "$id => $quality\n";
303 When dealing with very large quality scores, you can avoid bringing them into
304 memory by calling each() in a scalar context. This returns the key only. You can
305 then use tied(%db) to recover the Bio::DB::Qual object and call its methods.
307 while (my $id = each %db) {
308 print "$id => $db{$quality:1,100}\n";
309 print "$id => ",tied(%db)->length($id),"\n";
312 You may, in addition invoke Bio::DB::Qual the FIRSTKEY and NEXTKEY tied hash
313 methods directly.
315 =over 2
317 =item $id = $db-E<gt>FIRSTKEY
319 Return the first ID in the database.
321 =item $id = $db-E<gt>NEXTKEY($id)
323 Given an ID, return the next quality score ID.
325 =back
327 This allows you to write the following iterative loop using just the object-
328 oriented interface:
330 my $db = Bio::DB::Qual->new('/path/to/qual/files');
331 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
332 # do something with quality
335 =head2 Creating a Tied Filehandle
337 The Bio::DB::Qual-E<gt>newFh() method creates a tied filehandle from which you
338 can read Bio::Seq::PrimaryQual quality score objects sequentially. The following
339 bit of code will iterate sequentially over all quality scores in the database:
341 my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
342 while (my $qual = <$fh>) {
343 print $qual->id,' => ',$qual->length,"\n";
346 When no more quality scores remain to be retrieved, the stream will return
347 undef.
349 =head1 LIMITATIONS
351 When a quality score is deleted from one of the qual files, this deletion is not
352 detected by the module and removed from the index. As a result, a "ghost" entry
353 will remain in the index and will return garbage results if accessed. Currently,
354 the only way to accommodate deletions is to rebuild the entire index, either by
355 deleting it manually, or by passing -reindex=E<gt>1 to new() when
356 initializing the module.
358 All quality score lines for a given quality score must have the same length
359 except for the last (not sure why there is this limitation). This is not
360 problematic for sequences but could be annoying for quality scores. A workaround
361 is to make sure the your quality scores fit on no more than 2 lines. Another
362 solution could be to padd them with blank spaces so that each line has the same
363 number of characters (maybe this padding should be implemented in
364 Bio::SeqIO::qual?).
366 =head1 AUTHOR
368 Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
370 Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
372 Copyright (c) 2007 Florent E Angly.
374 This library is free software; you can redistribute it and/or modify
375 it under the same terms as Perl itself.
377 =cut
379 package Bio::DB::Qual;
381 BEGIN {
382 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
385 use strict;
386 use IO::File;
387 use AnyDBM_File;
388 use Fcntl;
389 use File::Basename qw(basename dirname);
391 use base qw(Bio::DB::SeqI Bio::Root::Root);
393 *qual = *quality = \&subqual;
394 *ids = \&get_all_ids;
395 *get_qual_by_primary_id = *get_qual_by_acc = \&get_Qual_by_id;
397 use constant STRUCT =>'NNnna*';
398 use constant STRUCTBIG =>'QQnna*'; # 64-bit file offset and quality score length
399 use constant DIE_ON_MISSMATCHED_LINES => 1; # you can avoid dying if you want
400 # but you're likely to get bad
401 # results
403 # Bio::DB-like object
404 # providing fast random access to a directory of qual files
406 =head2 new
408 Title : new
409 Usage : my $db = Bio::DB::Qual->new( $path, @options);
410 Function: initialize a new Bio::DB::Qual object
411 Returns : new Bio::DB::Qual object
412 Args : path to dir of qual files or a single qual filename
414 These are optional arguments to pass in as well.
416 -glob Glob expression to use *.{qual,QUAL,qa,QA}
417 for searching for qual
418 files in directories.
420 -makeid A code subroutine for none
421 transforming qual IDs.
423 -maxopen Maximum size of 32
424 filehandle cache.
426 -debug Turn on status 0
427 messages.
429 -reindex Force the index to be 0
430 rebuilt.
432 -dbmargs Additional arguments none
433 to pass to the DBM
434 routines when tied
435 (scalar or array ref).
437 =cut
439 sub new {
440 my ($class, $path) = @_;
441 my %opts = @_;
443 my $self = bless {
444 debug => $opts{-debug},
445 makeid => $opts{-makeid},
446 glob => $opts{-glob} || '*.{qual,QUAL,qa,QA}',
447 maxopen => $opts{-maxopen} || 32,
448 dbmargs => $opts{-dbmargs} || undef,
449 fhcache => {},
450 cacheseq => {},
451 curopen => 0,
452 openseq => 1,
453 dirname => undef,
454 offsets => undef,
455 }, $class;
456 my ($offsets,$dirname);
458 if (-d $path) {
459 # because Win32 glob() is broken with respect to long file names that
460 # contain whitespace.
461 $path = Win32::GetShortPathName($path)
462 if $^O =~ /^MSWin/i && eval 'use Win32; 1';
463 $offsets = $self->index_dir($path,$opts{-reindex});
464 $dirname = $path;
465 } elsif (-f _) {
466 $offsets = $self->index_file($path,$opts{-reindex});
467 $dirname = dirname($path);
468 } else {
469 $self->throw( "$path: invalid file or dirname");
471 @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
472 $self;
475 =head2 newFh
477 Title : newFh
478 Usage : my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
479 Function: gets a new Fh for a file or directory containing several files
480 Returns : filehandle object
481 Args : none
483 =cut
485 sub newFh {
486 my $class = shift;
487 my $self = $class->new(@_);
488 require Symbol;
489 my $fh = Symbol::gensym or return;
490 tie $$fh,'Bio::DB::Qual::Stream',$self or return;
491 $fh;
494 sub _open_index {
495 my $self = shift;
496 my ($index,$write) = @_;
497 my %offsets;
498 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
499 my @dbmargs = $self->dbmargs;
500 tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs
501 or $self->throw( "Can't open cache file $index: $!");
502 return \%offsets;
505 sub _close_index {
506 my ($self, $index) = @_;
507 untie %$index;
510 =head2 index_dir
512 Title : index_dir
513 Usage : $db->index_dir($dir)
514 Function: set the index dir and load all files in the dir
515 Returns : hashref of qual offsets in each file
516 Args : dirname, boolean to force a reload of all files
518 =cut
520 sub index_dir {
521 my ($self, $dir, $force_reindex) = @_;
523 # find all qual files
524 my @files = glob("$dir/$self->{glob}");
525 $self->throw("No qual files found in $dir") unless @files;
527 # get name of index
528 my $index = $self->index_name($dir,1);
530 # if caller has requested reindexing, then unlink
531 # the index file.
532 unlink $index if $force_reindex;
534 # get the modification time of the index
535 my $indextime = 0;
536 for my $suffix('','.pag','.dir') {
537 $indextime ||= (stat("${index}${suffix}"))[9];
539 $indextime ||= 0; # prevent some uninit variable warnings
541 # get the most recent modification time of any of the contents
542 my $modtime = 0;
543 my %modtime;
544 $self->set_pack_method( @files );
545 foreach (@files) {
546 my $m = (stat($_))[9];
547 $modtime{$_} = $m;
548 $modtime = $m if defined $m && $modtime < $m;
551 my $reindex = $force_reindex || $indextime < $modtime;
552 $self->{offsets} = $self->_open_index($index,$reindex) or return;
554 # no indexing needed
555 return $self->{offsets} unless $reindex;
557 # otherwise reindex contents of changed files
558 $self->{indexing} = $index;
559 foreach (@files) {
560 next if( defined $indextime && $modtime{$_} <= $indextime);
561 $self->calculate_offsets($_,$self->{offsets});
563 delete $self->{indexing};
565 # we've been having troubles with corrupted index files on Windows systems,
566 # so possibly closing and reopening will help
567 $self->_close_index($self->{offsets});
569 return $self->{offsets} = $self->_open_index($index);
572 =head2 get_Qual_by_id
574 Title : get_Qual_by_id
575 Usage : my $qual = $db->get_Qual_by_id($id)
576 Function: Bio::DB::RandomAccessI method implemented
577 Returns : Bio::PrimarySeqI object
578 Args : id
580 =cut
582 sub get_Qual_by_id {
583 my ($self, $id) = @_;
584 return unless exists $self->{offsets}{$id};
585 return Bio::Seq::PrimaryQual::Qual->new($self,$id);
588 =head2 set_pack_method
590 Title : set_pack_method
591 Usage : $db->set_pack_method( @files )
592 Function: Determines whether data packing uses 32 or 64 bit integers
593 Returns : 1 for success
594 Args : one or more file paths
596 =cut
598 sub set_pack_method {
599 my $self = shift;
600 # Find the maximum file size:
601 my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
602 my $fourGB = (2 ** 32) - 1;
604 if ($maxsize > $fourGB) {
605 # At least one file exceeds 4Gb - we will need to use 64 bit ints
606 $self->{packmeth} = \&_packBig;
607 $self->{unpackmeth} = \&_unpackBig;
608 } else {
609 $self->{packmeth} = \&_pack;
610 $self->{unpackmeth} = \&_unpack;
612 return 1;
615 =head2 index_file
617 Title : index_file
618 Usage : $db->index_file($filename)
619 Function: (re)loads a quality score file and indexes quality score offsets in
620 the file
621 Returns : qual offsets in the file
622 Args : filename, boolean to force reloading a file
624 =cut
626 sub index_file {
627 my ($self, $file, $force_reindex) = @_;
629 $self->set_pack_method( $file );
630 my $index = $self->index_name($file);
631 # if caller has requested reindexing, then unlink the index
632 unlink $index if $force_reindex;
634 # get the modification time of the index
635 my $indextime = (stat($index))[9] || 0;
636 my $modtime = (stat($file) )[9] || 0;
638 my $reindex = $force_reindex || $indextime < $modtime;
639 my $offsets = $self->_open_index($index,$reindex) or return;
640 $self->{offsets} = $offsets;
642 return $self->{offsets} unless $reindex;
644 $self->{indexing} = $index;
645 $self->calculate_offsets($file,$offsets);
646 delete $self->{indexing};
647 return $self->{offsets};
650 =head2 dbmargs
652 Title : dbmargs
653 Usage : my @args = $db->dbmargs;
654 Function: gets stored dbm arguments
655 Returns : array
656 Args : none
658 =cut
660 sub dbmargs {
661 my $self = shift;
662 my $args = $self->{dbmargs} or return;
663 return ref($args) eq 'ARRAY' ? @$args : $args;
666 =head2 index_name
668 Title : index_name
669 Usage : my $indexname = $db->index_name($path,$isdir);
670 Function: returns the name of the index for a specific path
671 Returns : string
672 Args : path to check, boolean if it is a dir
674 =cut
676 sub index_name {
677 my $self = shift;
678 my ($path,$isdir) = @_;
679 unless ($path) {
680 my $dir = $self->{dirname} or return;
681 return $self->index_name($dir,-d $dir);
683 return "$path/directory.index" if $isdir;
684 return "$path.index";
687 =head2 calculate_offsets
689 Title : calculate_offsets
690 Usage : $db->calculate_offsets($filename,$offsets);
691 Function: calculates the quality score offsets in a file based on ID
692 Returns : offset hash for each file
693 Args : file to process, $offsets - hashref of id to offset storage
695 =cut
697 sub calculate_offsets {
698 my $self = shift;
699 my ($file,$offsets) = @_;
700 my $base = $self->path2fileno(basename($file));
702 my $fh = IO::File->new($file) or $self->throw("Can't open $file: $!");
703 binmode $fh;
704 warn "Indexing $file\n" if $self->{debug};
705 my ( $offset,$id, $linelength, $firstline, $count, $termination_length,
706 $qual_lines, $last_line, %offsets );
707 my ( $l3_len, $l2_len, $l_len ) = ( 0, 0, 0 );
709 while (<$fh>) { # don't try this at home
710 # account for crlf-terminated Windows files
711 $termination_length ||= /\r\n$/ ? 2 : 1;
712 if (/^>(\S+)/) {
713 print STDERR "indexed $count quality scores...\n"
714 if $self->{debug} && (++$count%1000) == 0;
715 my $pos = tell($fh);
716 if ($id) {
717 my $qualstrlength = $pos - $offset - length($_);
718 $qualstrlength -= $termination_length * $qual_lines;
719 $offsets->{$id} = &{$self->{packmeth}}(
720 $offset,
721 $qualstrlength,
722 $linelength,
723 $firstline,
724 $base
727 $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
728 ($offset, $firstline, $linelength) = ($pos, length($_), 0);
729 $self->_check_linelength($linelength);
730 ($l3_len, $l2_len, $l_len) = (0, 0, 0);
731 $qual_lines = 0;
732 } else {
733 $l3_len = $l2_len;
734 $l2_len = $l_len;
735 $l_len = length($_);
736 # need to check every line :(
737 if (DIE_ON_MISSMATCHED_LINES &&
738 $l3_len > 0 &&
739 $l2_len > 0 &&
740 $l3_len != $l2_len
742 my $fap = substr($_, 0, 20)."..";
743 $self->throw("Each line of the qual entry must be the same ".
744 "length except the last. Line above #$. '$fap' is $l2_len != ".
745 "$l3_len chars.");
747 $linelength ||= length($_);
748 $qual_lines++;
750 $last_line = $_;
753 $self->_check_linelength($linelength);
754 # deal with last entry
755 if ($id) {
756 my $pos = tell($fh);
757 my $qualstrlength = $pos - $offset;
759 if ($linelength == 0) {
760 $qualstrlength = 0;
761 } else {
762 if ($last_line !~ /\s$/) {
763 $qual_lines--;
765 $qualstrlength -= $termination_length * $qual_lines;
767 $offsets->{$id} = &{$self->{packmeth}}(
768 $offset,
769 $qualstrlength,
770 $linelength,
771 $firstline,
772 $base
775 $offsets->{__termination_length} = $termination_length;
776 return \%offsets;
779 =head2 get_all_ids
781 Title : get_all_ids
782 Usage : my @ids = $db->get_all_ids
783 Function: gets all the stored ids in all indexes
784 Returns : list of ids
785 Args : none
787 =cut
789 sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} }
791 sub offset {
792 my ($self, $id) = @_;
793 my $offset = $self->{offsets}{$id} or return;
794 (&{$self->{unpackmeth}}($offset))[0];
797 =head2 length
799 Title : length
800 Usage : $qualdb->length($seqid);
801 Function: gets the number of quality values in a quality score
802 Returns : scalar
803 Args : ID of a quality score
805 =cut
807 sub length {
808 # the NUMBER of quality values
809 my ($self, $id) = @_;
810 my $len = scalar(@{$self->subqual($id)});
811 return $len;
814 sub lengthstr {
815 # the length of the quality STRING
816 my ($self, $id) = @_;
817 my $offset = $self->{offsets}{$id} or return;
818 (&{$self->{unpackmeth}}($offset))[1];
821 sub linelen {
822 my ($self, $id) = @_;
823 my $offset = $self->{offsets}{$id} or return;
824 (&{$self->{unpackmeth}}($offset))[2];
827 sub headerlen {
828 my ($self, $id) = @_;
829 my $offset = $self->{offsets}{$id} or return;
830 (&{$self->{unpackmeth}}($offset))[3];
833 sub path { shift->{dirname} }
835 sub header_offset {
836 my ($self, $id) = @_;
837 return unless $self->{offsets}{$id};
838 return $self->offset($id) - $self->headerlen($id);
841 sub file {
842 my ($self, $id) = @_;
843 my $offset = $self->{offsets}{$id} or return;
844 $self->fileno2path((&{$self->{unpackmeth}}($offset))[4]);
847 sub fileno2path {
848 my ($self, $no) = @_;
849 return $self->{offsets}{"__file_$no"};
852 sub path2fileno {
853 my ($self, $path) = @_;
854 if ( !defined $self->{offsets}{"__path_$path"} ) {
855 my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++);
856 $self->{offsets}{"__file_$fileno"} = $path;
858 return $self->{offsets}{"__path_$path"}
861 sub _check_linelength {
862 my ($self, $linelength) = @_;
863 return unless defined $linelength;
864 $self->throw(
865 "Each line of the qual file must be less than 65,536 characters.Line ".
866 "$. is $linelength chars."
867 ) if $linelength > 65535;
870 =head2 subqual
872 Title : subqual
873 Usage : my @qualarr = @{$qualdb->subqual($id,$start,$stop)};
874 Function: returns a subqual of a quality score in the database
875 Returns : subquality array reference
876 Args : id of quality score, starting quality value number, ending quality
877 value number
879 =cut
881 sub subqual {
882 my ($self, $id, $start, $stop) = @_;
884 # Quality values in a quality score can have 1 or 2 digits and are separated
885 # by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
886 # is no easy way match the position of a quality value to its position in
887 # the quality string.
888 # As a consequence, if a subqual of the quality is requested, we still need
889 # to grab the full quality string first - performance penalty for big
890 # quality scores :(
891 # I think there is no way around starting at the begining of the quality
892 # score but maybe there is a resource-efficient way of starting at the
893 # begining of the quality score and stopping when the the position of the
894 # last quality value requested is reached??
896 # position of the quality values
897 if ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) {
898 ($id, $start, $stop) = ($1,$2,$3);
899 $start =~ s/_//g;
900 $stop =~ s/_//g;
903 # position in quality string
904 my $string_start = 1;
905 my $string_stop = $self->lengthstr($id);
907 # fetch full quality string
908 my $fh = $self->fh($id) or return;
909 my $filestart = $self->caloffset($id, $string_start);
910 my $filestop = $self->caloffset($id, $string_stop);
911 seek($fh,$filestart,0);
912 my $data;
913 read($fh, $data, $filestop-$filestart+1);
915 # process quality score
916 $data =~ s/\n//g;
917 $data =~ s/\r//g;
918 my $reverse = 0;
919 if ($stop && $start && $stop < $start) {
920 $reverse = 1;
921 my $tmp = $start;
922 $start = $stop;
923 $stop = $tmp;
925 my $subqual = 0;
926 $subqual = 1 if ( $start || $stop );
927 my @data;
928 if ( $subqual || $reverse ) {
929 @data = split / /, $data, $stop+1;
930 my $length = scalar(@data);
931 $start = 1 if $start < 1;
932 $stop = $length if $stop > $length;
933 pop @data if ($stop != $length);
934 splice @data, 0, $start-1;
935 @data = reverse(@data) if $reverse;
936 $data = join ' ', @data;
937 } else {
938 @data = split / /, $data;
941 return \@data;
944 sub fh {
945 my ($self, $id) = @_;
946 my $file = $self->file($id) or return;
947 $self->fhcache("$self->{dirname}/$file") ||
948 $self->throw("Can't open file $file");
951 =head2 header
953 Title : header
954 Usage : $qualdb->header($id);
955 Function: returns the header of a quality score in the database
956 Returns : header string
957 Args : id of quality score
959 =cut
961 sub header {
962 my ($self, $id) = @_;
963 my ($offset,$seqlength,$linelength,$firstline,$file)
964 = &{$self->{unpackmeth}}($self->{offsets}{$id}) or return;
965 $offset -= $firstline;
966 my $data;
967 my $fh = $self->fh($id) or return;
968 seek($fh,$offset,0);
969 read($fh,$data,$firstline);
970 chomp $data;
971 substr($data,0,1) = '';
972 $data;
975 sub caloffset {
976 my ($self, $id, $a) = @_;
977 $a--;
978 my ($offset,$seqlength,$linelength,$firstline,$file)
979 = &{$self->{unpackmeth}}($self->{offsets}{$id});
980 $a = 0 if $a < 0;
981 $a = $seqlength-1 if $a >= $seqlength;
982 my $tl = $self->{offsets}{__termination_length};
983 $offset + $linelength * int($a/($linelength-$tl)) + $a % ($linelength-$tl);
986 sub fhcache {
987 my ($self, $path) = @_;
988 if (!$self->{fhcache}{$path}) {
989 if ($self->{curopen} >= $self->{maxopen}) {
990 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
991 keys %{$self->{fhcache}};
992 splice(@lru, $self->{maxopen} / 3);
993 $self->{curopen} -= @lru;
994 for (@lru) {
995 delete $self->{fhcache}{$_};
998 $self->{fhcache}{$path} = IO::File->new($path) || return;
999 binmode $self->{fhcache}{$path};
1000 $self->{curopen}++;
1002 $self->{cacheseq}{$path}++;
1003 $self->{fhcache}{$path}
1006 sub _pack {
1007 pack STRUCT, @_;
1010 sub _packBig {
1011 pack STRUCTBIG, @_;
1014 sub _unpack {
1015 unpack STRUCT, shift;
1018 sub _unpackBig {
1019 unpack STRUCTBIG, shift;
1022 =head2 get_PrimaryQual_stream
1024 Title : get_PrimaryQual_stream
1025 Usage : $qualdb->get_PrimaryQual_stream
1026 Function: get a SeqIO-like stream of quality scores
1027 Returns : stream object
1028 Args : none
1030 =cut
1032 sub get_PrimaryQual_stream {
1033 my $self = shift;
1034 return Bio::DB::Qual::Stream->new($self);
1037 sub TIEHASH {
1038 my $self = shift;
1039 return $self->new(@_);
1042 sub FETCH {
1043 shift->subqual(@_);
1046 sub STORE {
1047 shift->throw("Read-only database");
1050 sub DELETE {
1051 shift->throw("Read-only database");
1054 sub CLEAR {
1055 shift->throw("Read-only database");
1058 sub EXISTS {
1059 defined shift->offset(@_);
1062 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
1064 sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); }
1066 sub DESTROY {
1067 my $self = shift;
1068 if ($self->{indexing}) { # killed prematurely, so index file is no good!
1069 warn "indexing was interrupted, so deleting $self->{indexing}";
1070 unlink $self->{indexing};
1074 #-------------------------------------------------------------
1075 # Bio::Seq::PrimaryQual compatibility
1077 # Usage is the same as in Bio::Seq::PrimaryQual
1079 package Bio::Seq::PrimaryQual::Qual;
1080 use overload '""' => 'display_id';
1082 use base qw(Bio::Root::Root Bio::Seq::PrimaryQual);
1084 sub new {
1085 my ($class, @args) = @_;
1086 my $self = $class->SUPER::new(@args);
1087 my ($db, $id, $start, $stop) = $self->_rearrange(
1088 [qw(DATABASE ID START STOP)],
1089 @args);
1090 $self->{db} = $db;
1091 $self->{id} = $id;
1092 $self->{start} = $start || 1;
1093 $self->{stop} = $stop || $db->length($id);
1094 return $self;
1097 sub qual {
1098 my $self = shift;
1099 my $qual = $self->{db}->qual($self->{id},$self->{start},$self->{stop});
1100 return $qual;
1103 sub subqual {
1104 my ($self, $start, $stop) = @_;
1105 return $self->trunc($start, $stop)->qual;
1108 sub trunc {
1109 my ($self, $start, $stop) = @_;
1110 $self->throw(
1111 "$stop is smaller than $stop. If you want to truncate and reverse ".
1112 "complement, you must call trunc followed by revcom."
1113 ) if $start > $stop;
1114 my ($left, $right);
1115 if ($self->{start} <= $self->{stop}) {
1116 $left = $self->{start}+$start-1;
1117 $right = $self->{start}+$stop-1;
1118 } else {
1119 $left = $self->{start}-($start-1);
1120 $right = $self->{start}-($stop-1);
1122 my $obj = $self->new( -database => $self->{db},
1123 -id => $self->{id},
1124 -start => $left,
1125 -stop => $right
1127 return $obj;
1130 sub display_id {
1131 my $self = shift;
1132 return $self->{id};
1135 sub primary_id {
1136 my $self = shift;
1137 return overload::StrVal($self);
1140 sub length {
1141 # number of quality scores
1142 my $self = shift;
1143 return scalar(@{$self->qual});
1146 sub description {
1147 my $self = shift;
1148 my $header = $self->{'db'}->header($self->{id});
1149 # remove the id from the header
1150 $header = (split(/\s+/,$header,2))[2];
1151 return $header;
1153 *desc = \&description;
1155 #-------------------------------------------------------------
1156 # stream-based access to the database
1159 package Bio::DB::Qual::Stream;
1160 use base qw(Tie::Handle Bio::DB::SeqI);
1162 sub new {
1163 my ($class, $db) = @_;
1164 my $key = $db->FIRSTKEY;
1165 return bless {
1166 db => $db,
1167 key => $key
1168 }, $class;
1171 sub next_seq {
1172 my $self = shift;
1173 my ($key, $db) = @{$self}{'key', 'db'};
1174 while ($key =~ /^__/) {
1175 $key = $db->NEXTKEY($key);
1176 return unless defined $key;
1178 my $value = $db->get_Qual_by_id($key);
1179 $self->{key} = $db->NEXTKEY($key);
1180 $value;
1183 sub TIEHANDLE {
1184 my ($class, $db) = @_;
1185 return $class->new($db);
1188 sub READLINE {
1189 my $self = shift;
1190 $self->next_seq;
1195 __END__