3 # BioPerl module for Bio::DB::Qual
5 # You may distribute this module under the same terms as perl itself
8 # POD documentation - main docs before the code
12 Bio::DB::Qual -- Fast indexed access to a directory of quality files
18 # create database from directory of qual files
19 my $db = Bio::DB::Qual->new('/path/to/qual/files');
22 # simple access (for those without Bioperl)
23 my @qual = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
24 my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
25 my $length = $db->length('CHROMOSOME_I');
26 my $header = $db->header('CHROMOSOME_I');
28 # Bioperl-style access
29 my $obj = $db->get_Qual_by_id('CHROMOSOME_I');
30 my @qual = @{$obj->qual};
31 my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
32 my $length = $obj->length;
35 # Bio::SeqIO-style access
36 my $stream = $db->get_PrimaryQual_stream;
37 while (my $qual = $stream->next_seq) {
38 # Bio::Seq::PrimaryQual operations
41 my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
42 while (my $qual = <$fh>) {
43 # Bio::Seq::PrimaryQual operations
47 tie %qualities,'Bio::DB::Qual','/path/to/qual/files';
48 print $qualities{'CHROMOSOME_I:1,20000'};
52 Bio::DB::Qual provides indexed access to one or more qual files. It provides
53 random access to each quality score entry without having to read the file from
54 the beginning. Access to subqualities (portions of a quality score) is provided,
55 although contrary to Bio::DB::Fasta, the full quality score has to be brought in
58 When you initialize the module, you point it at a single qual file or a
59 directory of multiple such files. The first time it is run, the module generates
60 an index of the contents of the file or directory using the AnyDBM module
61 (Berkeley DB* preferred, followed by GDBM_File, NDBM_File, and SDBM_File).
62 Thereafter it uses the index file to find the file and offset for any requested
63 quality score. If one of the source qual files is updated, the module reindexes
64 just that one file. (You can also force reindexing manually). For improved
65 performance, the module keeps a cache of open filehandles, closing less-recently
66 used ones when the cache is full.
68 The qual files may contain decimal quality scores. Entries may have any line
69 length up to 65,536 characters, and different line lengths are allowed in the
70 same file. However, within a quality score entry, all lines must be the same
71 length except for the last. An error will be thrown if this is not the case.
73 The module uses /^E<gt>(\S+)/ to extract the primary ID of each quality score
74 from the qual header. During indexing, you may pass a callback routine to modify
75 this primary ID. For example, you may wish to extract a portion of the
76 gi|gb|abc|xyz prefixes that are commonly used. The original header line can be
79 *Berkeley DB can be obtained free from www.sleepycat.com. After it is installed
80 you will need to install the BerkeleyDB Perl module.
82 =head1 DATABASE CREATION AND INDEXING
84 The two constructors for this class are new() and newFh(). The former creates a
85 Bio::DB::Qual object which is accessed via method calls. The latter creates a
86 tied filehandle which can be used Bio::SeqIO-style to fetch quality score
87 objects in a data stream. There is also a tied hash interface.
91 =item $db = Bio::DB::Qual-E<gt>new($qual_path [,%options])
93 Create a new Bio::DB::Qual object from the Qual file or files indicated by
94 $qual_path. Indexing will be performed automatically if needed. If successful,
95 new() will return the database accessor object. Otherwise it will return undef.
97 $qual_path may be an individual qual file, or may refer to a directory
98 containing one or more of such files. Following the path, you may pass a series
99 of name=E<gt>value options or a hash with these same name=E<gt>value pairs.
102 Option Name Description Default
103 ----------- ----------- -------
105 -glob Glob expression to use *.{qa,QA,qual,QUAL}
106 for searching for qual
107 files in directories.
109 -makeid A code subroutine for None
110 transforming qual IDs.
112 -maxopen Maximum size of 32
115 -debug Turn on status 0
118 -reindex Force the index to be 0
121 -dbmargs Additional arguments none
124 (scalar or array ref).
126 -dbmargs can be used to control the format of the index. For example, you can
127 pass $DB_BTREE to this argument so as to force the IDs to be sorted and
128 retrieved alphabetically. Note that you must use the same arguments every time
131 -reindex can be used to force the index to be recreated from scratch.
133 =item $fh = Bio::DB::Qual-E<gt>newFh($qual_path [,%options])
135 Create a tied filehandle opened on a Bio::DB::Qual object. Reading from this
136 filehandle with E<lt>E<gt> will return a stream of quality objects,
141 The -makeid option gives you a chance to modify quality score IDs during
142 indexing. The option value should be a code reference that will take a scalar
143 argument and return a scalar result, like this:
145 $db = Bio::DB::Qual->new("file.qual",-makeid=>\&make_my_id);
148 my $description_line = shift;
149 # get a different id from the quality header, e.g.
150 $description_line =~ /(\S+)$/;
154 make_my_id() will be called with the full qual id line (including the "E<gt>"
155 symbol!). For example:
157 >A12345.3 Predicted C. elegans protein egl-2
159 By default, this module will use the regular expression /^E<gt>(\S+)/ to extract
160 "A12345.3" for use as the ID.If you pass a -makeid callback, you can extract any
161 portion of this, such as the "egl-2" symbol.
163 The -makeid option is ignored after the index is constructed.
165 =head1 OBJECT METHODS
167 The following object methods are provided.
171 =item $raw_qual = $db-E<gt>qual($id [,$start, $stop])
173 Return a quality score array reference given an ID and optionally a start and
174 stop position (the quality value number) in the quality score. If $stop is less
175 than $start, then the reverse complement of the quality score is returned (this
176 violates Bio::Seq conventions).
178 For your convenience, subqualities can be indicated with any of the following
181 $db->qual("$id:$start,$stop")
183 $db->qual("$id:$start..$stop")
185 $db->qual("$id:$start-$stop")
187 =item $length = $db-E<gt>length($id)
189 Return the length of the indicated quality score, i.e. the number of quality
192 =item $header = $db-E<gt>header($id)
194 Return the header line for the ID, including the initial "E<gt>".
196 =item $filename = $db-E<gt>file($id)
198 Return the name of the file in which the indicated quality score can be found.
200 =item $offset = $db-E<gt>offset($id)
202 Return the offset of the indicated quality score from the beginning of the file
203 in which it is located. The offset points to the beginning of the quality
204 score, not the beginning of the header line.
206 =item $header_length = $db-E<gt>headerlen($id)
208 Return the length of the header line for the indicated quality score.
210 =item $header_offset = $db-E<gt>header_offset($id)
212 Return the offset of the header line for the indicated quality score from the
213 beginning of the file in which it is located.
215 =item $index_name = $db-E<gt>index_name
217 Return the path to the index file.
219 =item $path = $db-E<gt>path
221 Return the path to the Qual file(s).
225 For BioPerl-style access, the following methods are provided:
229 =item $qual = $db-E<gt>get_Qual_by_id($id)
231 Return a Bio::Seq::PrimaryQual object, which obeys the Bio::PrimarySeqI
232 conventions. To recover the quality score, call $qual-E<gt>qual().
234 Note that get_Qual_by_id() does not bring the entire quality score into memory
235 until requested. Internally, the returned object uses the accessor to generate
236 subqualities as needed.
238 =item $qual = $db-E<gt>get_Qual_by_acc($id)
240 =item $qual = $db-E<gt>get_Qual_by_primary_id($id)
242 These methods all do the same thing as get_Qual_by_id().
244 =item $stream = $db-E<gt>get_PrimaryQual_stream()
246 Return a Bio::DB::Qual::Stream object, which supports a single method
247 next_seq(). Each call to next_seq() returns a new Bio::Seq::PrimaryQual object,
248 until no more quality scores remain.
252 See L<Bio::Seq::PrimaryQual> and L<Bio::PrimarySeqI> for methods provided by the
253 quality objects returned from get_Qual_by_id() and get_PrimaryQual_stream().
255 =head1 TIED INTERFACES
257 This module provides two tied interfaces, one which allows you to treat the
258 quality score database as a hash, and the other which allows you to treat the
259 database as an I/O stream.
261 =head2 Creating a Tied Hash
263 The tied hash interface is very straightforward.
267 =item $obj = tie %db,'Bio::DB::Qual','/path/to/qual/files' [,@args]
269 Tie %db to Bio::DB::Qual using the indicated path to the Qual files. The
270 optional @args list is the same set of named argument/value pairs used by
271 Bio::DB::Qual-E<gt>new().
273 If successful, tie() will return the tied object. Otherwise it will return
278 Once tied, you can use the hash to retrieve an individual quality score by its
281 my $qual = $db{CHROMOSOME_I};
283 You may select a subquality by appending the comma-separated range to the
284 quality score ID in the format "$id:$start,$stop". For example, here is the
285 first 1000 quality values of the quality score with ID "CHROMOSOME_I":
287 my $qual = $db{'CHROMOSOME_I:1,1000'};
289 (The regular expression used to parse this format allows quality score IDs to
292 When selecting subqualities, if $start E<gt> stop, then the reverse complement
295 The keys() and values() functions will return the IDs and their quality scores,
296 respectively. In addition, each() can be used to iterate over the entire data
299 while (my ($id,$quality) = each %db) {
300 print "$id => $quality\n";
303 When dealing with very large quality scores, you can avoid bringing them into
304 memory by calling each() in a scalar context. This returns the key only. You can
305 then use tied(%db) to recover the Bio::DB::Qual object and call its methods.
307 while (my $id = each %db) {
308 print "$id => $db{$quality:1,100}\n";
309 print "$id => ",tied(%db)->length($id),"\n";
312 You may, in addition invoke Bio::DB::Qual the FIRSTKEY and NEXTKEY tied hash
317 =item $id = $db-E<gt>FIRSTKEY
319 Return the first ID in the database.
321 =item $id = $db-E<gt>NEXTKEY($id)
323 Given an ID, return the next quality score ID.
327 This allows you to write the following iterative loop using just the object-
330 my $db = Bio::DB::Qual->new('/path/to/qual/files');
331 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
332 # do something with quality
335 =head2 Creating a Tied Filehandle
337 The Bio::DB::Qual-E<gt>newFh() method creates a tied filehandle from which you
338 can read Bio::Seq::PrimaryQual quality score objects sequentially. The following
339 bit of code will iterate sequentially over all quality scores in the database:
341 my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
342 while (my $qual = <$fh>) {
343 print $qual->id,' => ',$qual->length,"\n";
346 When no more quality scores remain to be retrieved, the stream will return
351 When a quality score is deleted from one of the qual files, this deletion is not
352 detected by the module and removed from the index. As a result, a "ghost" entry
353 will remain in the index and will return garbage results if accessed. Currently,
354 the only way to accommodate deletions is to rebuild the entire index, either by
355 deleting it manually, or by passing -reindex=E<gt>1 to new() when
356 initializing the module.
358 All quality score lines for a given quality score must have the same length
359 except for the last (not sure why there is this limitation). This is not
360 problematic for sequences but could be annoying for quality scores. A workaround
361 is to make sure the your quality scores fit on no more than 2 lines. Another
362 solution could be to padd them with blank spaces so that each line has the same
363 number of characters (maybe this padding should be implemented in
368 Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
370 Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
372 Copyright (c) 2007 Florent E Angly.
374 This library is free software; you can redistribute it and/or modify
375 it under the same terms as Perl itself.
379 package Bio
::DB
::Qual
;
382 @AnyDBM_File::ISA
= qw(DB_File GDBM_File NDBM_File SDBM_File)
389 use File
::Basename
qw(basename dirname);
391 use base
qw(Bio::DB::SeqI Bio::Root::Root);
393 *qual
= *quality
= \
&subqual
;
394 *ids
= \
&get_all_ids
;
395 *get_qual_by_primary_id
= *get_qual_by_acc
= \
&get_Qual_by_id
;
397 use constant STRUCT
=>'NNnna*';
398 use constant STRUCTBIG
=>'QQnna*'; # 64-bit file offset and quality score length
399 use constant DIE_ON_MISSMATCHED_LINES
=> 1; # you can avoid dying if you want
400 # but you're likely to get bad
403 # Bio::DB-like object
404 # providing fast random access to a directory of qual files
409 Usage : my $db = Bio::DB::Qual->new( $path, @options);
410 Function: initialize a new Bio::DB::Qual object
411 Returns : new Bio::DB::Qual object
412 Args : path to dir of qual files or a single qual filename
414 These are optional arguments to pass in as well.
416 -glob Glob expression to use *.{qual,QUAL,qa,QA}
417 for searching for qual
418 files in directories.
420 -makeid A code subroutine for none
421 transforming qual IDs.
423 -maxopen Maximum size of 32
426 -debug Turn on status 0
429 -reindex Force the index to be 0
432 -dbmargs Additional arguments none
435 (scalar or array ref).
440 my ($class, $path) = @_;
444 debug
=> $opts{-debug
},
445 makeid
=> $opts{-makeid
},
446 glob => $opts{-glob} || '*.{qual,QUAL,qa,QA}',
447 maxopen
=> $opts{-maxopen
} || 32,
448 dbmargs
=> $opts{-dbmargs
} || undef,
456 my ($offsets,$dirname);
459 # because Win32 glob() is broken with respect to long file names that
460 # contain whitespace.
461 $path = Win32
::GetShortPathName
($path)
462 if $^O
=~ /^MSWin/i && eval 'use Win32; 1';
463 $offsets = $self->index_dir($path,$opts{-reindex
});
466 $offsets = $self->index_file($path,$opts{-reindex
});
467 $dirname = dirname
($path);
469 $self->throw( "$path: invalid file or dirname");
471 @
{$self}{qw(dirname offsets)} = ($dirname,$offsets);
478 Usage : my $fh = Bio::DB::Qual->newFh('/path/to/qual/files');
479 Function: gets a new Fh for a file or directory containing several files
480 Returns : filehandle object
487 my $self = $class->new(@_);
489 my $fh = Symbol
::gensym
or return;
490 tie
$$fh,'Bio::DB::Qual::Stream',$self or return;
496 my ($index,$write) = @_;
498 my $flags = $write ? O_CREAT
|O_RDWR
: O_RDONLY
;
499 my @dbmargs = $self->dbmargs;
500 tie
%offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs
501 or $self->throw( "Can't open cache file $index: $!");
506 my ($self, $index) = @_;
513 Usage : $db->index_dir($dir)
514 Function: set the index dir and load all files in the dir
515 Returns : hashref of qual offsets in each file
516 Args : dirname, boolean to force a reload of all files
521 my ($self, $dir, $force_reindex) = @_;
523 # find all qual files
524 my @files = glob("$dir/$self->{glob}");
525 $self->throw("No qual files found in $dir") unless @files;
528 my $index = $self->index_name($dir,1);
530 # if caller has requested reindexing, then unlink
532 unlink $index if $force_reindex;
534 # get the modification time of the index
536 for my $suffix('','.pag','.dir') {
537 $indextime ||= (stat("${index}${suffix}"))[9];
539 $indextime ||= 0; # prevent some uninit variable warnings
541 # get the most recent modification time of any of the contents
544 $self->set_pack_method( @files );
546 my $m = (stat($_))[9];
548 $modtime = $m if defined $m && $modtime < $m;
551 my $reindex = $force_reindex || $indextime < $modtime;
552 $self->{offsets
} = $self->_open_index($index,$reindex) or return;
555 return $self->{offsets
} unless $reindex;
557 # otherwise reindex contents of changed files
558 $self->{indexing
} = $index;
560 next if( defined $indextime && $modtime{$_} <= $indextime);
561 $self->calculate_offsets($_,$self->{offsets
});
563 delete $self->{indexing
};
565 # we've been having troubles with corrupted index files on Windows systems,
566 # so possibly closing and reopening will help
567 $self->_close_index($self->{offsets
});
569 return $self->{offsets
} = $self->_open_index($index);
572 =head2 get_Qual_by_id
574 Title : get_Qual_by_id
575 Usage : my $qual = $db->get_Qual_by_id($id)
576 Function: Bio::DB::RandomAccessI method implemented
577 Returns : Bio::PrimarySeqI object
583 my ($self, $id) = @_;
584 return unless exists $self->{offsets
}{$id};
585 return Bio
::Seq
::PrimaryQual
::Qual
->new($self,$id);
588 =head2 set_pack_method
590 Title : set_pack_method
591 Usage : $db->set_pack_method( @files )
592 Function: Determines whether data packing uses 32 or 64 bit integers
593 Returns : 1 for success
594 Args : one or more file paths
598 sub set_pack_method
{
600 # Find the maximum file size:
601 my ($maxsize) = sort { $b <=> $a } map { -s
$_ } @_;
602 my $fourGB = (2 ** 32) - 1;
604 if ($maxsize > $fourGB) {
605 # At least one file exceeds 4Gb - we will need to use 64 bit ints
606 $self->{packmeth
} = \
&_packBig
;
607 $self->{unpackmeth
} = \
&_unpackBig
;
609 $self->{packmeth
} = \
&_pack
;
610 $self->{unpackmeth
} = \
&_unpack
;
618 Usage : $db->index_file($filename)
619 Function: (re)loads a quality score file and indexes quality score offsets in
621 Returns : qual offsets in the file
622 Args : filename, boolean to force reloading a file
627 my ($self, $file, $force_reindex) = @_;
629 $self->set_pack_method( $file );
630 my $index = $self->index_name($file);
631 # if caller has requested reindexing, then unlink the index
632 unlink $index if $force_reindex;
634 # get the modification time of the index
635 my $indextime = (stat($index))[9] || 0;
636 my $modtime = (stat($file) )[9] || 0;
638 my $reindex = $force_reindex || $indextime < $modtime;
639 my $offsets = $self->_open_index($index,$reindex) or return;
640 $self->{offsets
} = $offsets;
642 return $self->{offsets
} unless $reindex;
644 $self->{indexing
} = $index;
645 $self->calculate_offsets($file,$offsets);
646 delete $self->{indexing
};
647 return $self->{offsets
};
653 Usage : my @args = $db->dbmargs;
654 Function: gets stored dbm arguments
662 my $args = $self->{dbmargs
} or return;
663 return ref($args) eq 'ARRAY' ? @
$args : $args;
669 Usage : my $indexname = $db->index_name($path,$isdir);
670 Function: returns the name of the index for a specific path
672 Args : path to check, boolean if it is a dir
678 my ($path,$isdir) = @_;
680 my $dir = $self->{dirname
} or return;
681 return $self->index_name($dir,-d
$dir);
683 return "$path/directory.index" if $isdir;
684 return "$path.index";
687 =head2 calculate_offsets
689 Title : calculate_offsets
690 Usage : $db->calculate_offsets($filename,$offsets);
691 Function: calculates the quality score offsets in a file based on ID
692 Returns : offset hash for each file
693 Args : file to process, $offsets - hashref of id to offset storage
697 sub calculate_offsets
{
699 my ($file,$offsets) = @_;
700 my $base = $self->path2fileno(basename
($file));
702 my $fh = IO
::File
->new($file) or $self->throw("Can't open $file: $!");
704 warn "Indexing $file\n" if $self->{debug
};
705 my ( $offset,$id, $linelength, $firstline, $count, $termination_length,
706 $qual_lines, $last_line, %offsets );
707 my ( $l3_len, $l2_len, $l_len ) = ( 0, 0, 0 );
709 while (<$fh>) { # don't try this at home
710 # account for crlf-terminated Windows files
711 $termination_length ||= /\r\n$/ ?
2 : 1;
713 print STDERR
"indexed $count quality scores...\n"
714 if $self->{debug
} && (++$count%1000) == 0;
717 my $qualstrlength = $pos - $offset - length($_);
718 $qualstrlength -= $termination_length * $qual_lines;
719 $offsets->{$id} = &{$self->{packmeth
}}(
727 $id = ref($self->{makeid
}) eq 'CODE' ?
$self->{makeid
}->($_) : $1;
728 ($offset, $firstline, $linelength) = ($pos, length($_), 0);
729 $self->_check_linelength($linelength);
730 ($l3_len, $l2_len, $l_len) = (0, 0, 0);
736 # need to check every line :(
737 if (DIE_ON_MISSMATCHED_LINES
&&
742 my $fap = substr($_, 0, 20)."..";
743 $self->throw("Each line of the qual entry must be the same ".
744 "length except the last. Line above #$. '$fap' is $l2_len != ".
747 $linelength ||= length($_);
753 $self->_check_linelength($linelength);
754 # deal with last entry
757 my $qualstrlength = $pos - $offset;
759 if ($linelength == 0) {
762 if ($last_line !~ /\s$/) {
765 $qualstrlength -= $termination_length * $qual_lines;
767 $offsets->{$id} = &{$self->{packmeth
}}(
775 $offsets->{__termination_length
} = $termination_length;
782 Usage : my @ids = $db->get_all_ids
783 Function: gets all the stored ids in all indexes
784 Returns : list of ids
789 sub get_all_ids
{ grep {!/^__/} keys %{shift->{offsets
}} }
792 my ($self, $id) = @_;
793 my $offset = $self->{offsets
}{$id} or return;
794 (&{$self->{unpackmeth
}}($offset))[0];
800 Usage : $qualdb->length($seqid);
801 Function: gets the number of quality values in a quality score
803 Args : ID of a quality score
808 # the NUMBER of quality values
809 my ($self, $id) = @_;
810 my $len = scalar(@
{$self->subqual($id)});
815 # the length of the quality STRING
816 my ($self, $id) = @_;
817 my $offset = $self->{offsets
}{$id} or return;
818 (&{$self->{unpackmeth
}}($offset))[1];
822 my ($self, $id) = @_;
823 my $offset = $self->{offsets
}{$id} or return;
824 (&{$self->{unpackmeth
}}($offset))[2];
828 my ($self, $id) = @_;
829 my $offset = $self->{offsets
}{$id} or return;
830 (&{$self->{unpackmeth
}}($offset))[3];
833 sub path
{ shift->{dirname
} }
836 my ($self, $id) = @_;
837 return unless $self->{offsets
}{$id};
838 return $self->offset($id) - $self->headerlen($id);
842 my ($self, $id) = @_;
843 my $offset = $self->{offsets
}{$id} or return;
844 $self->fileno2path((&{$self->{unpackmeth
}}($offset))[4]);
848 my ($self, $no) = @_;
849 return $self->{offsets
}{"__file_$no"};
853 my ($self, $path) = @_;
854 if ( !defined $self->{offsets
}{"__path_$path"} ) {
855 my $fileno = ($self->{offsets
}{"__path_$path"} = 0+ $self->{fileno}++);
856 $self->{offsets
}{"__file_$fileno"} = $path;
858 return $self->{offsets
}{"__path_$path"}
861 sub _check_linelength
{
862 my ($self, $linelength) = @_;
863 return unless defined $linelength;
865 "Each line of the qual file must be less than 65,536 characters.Line ".
866 "$. is $linelength chars."
867 ) if $linelength > 65535;
873 Usage : my @qualarr = @{$qualdb->subqual($id,$start,$stop)};
874 Function: returns a subqual of a quality score in the database
875 Returns : subquality array reference
876 Args : id of quality score, starting quality value number, ending quality
882 my ($self, $id, $start, $stop) = @_;
884 # Quality values in a quality score can have 1 or 2 digits and are separated
885 # by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
886 # is no easy way match the position of a quality value to its position in
887 # the quality string.
888 # As a consequence, if a subqual of the quality is requested, we still need
889 # to grab the full quality string first - performance penalty for big
891 # I think there is no way around starting at the begining of the quality
892 # score but maybe there is a resource-efficient way of starting at the
893 # begining of the quality score and stopping when the the position of the
894 # last quality value requested is reached??
896 # position of the quality values
897 if ($id =~ /^(.+):([\d_]+)(?:,|-|\.\.)([\d_]+)$/) {
898 ($id, $start, $stop) = ($1,$2,$3);
903 # position in quality string
904 my $string_start = 1;
905 my $string_stop = $self->lengthstr($id);
907 # fetch full quality string
908 my $fh = $self->fh($id) or return;
909 my $filestart = $self->caloffset($id, $string_start);
910 my $filestop = $self->caloffset($id, $string_stop);
911 seek($fh,$filestart,0);
913 read($fh, $data, $filestop-$filestart+1);
915 # process quality score
919 if ($stop && $start && $stop < $start) {
926 $subqual = 1 if ( $start || $stop );
928 if ( $subqual || $reverse ) {
929 @data = split / /, $data, $stop+1;
930 my $length = scalar(@data);
931 $start = 1 if $start < 1;
932 $stop = $length if $stop > $length;
933 pop @data if ($stop != $length);
934 splice @data, 0, $start-1;
935 @data = reverse(@data) if $reverse;
936 $data = join ' ', @data;
938 @data = split / /, $data;
945 my ($self, $id) = @_;
946 my $file = $self->file($id) or return;
947 $self->fhcache("$self->{dirname}/$file") ||
948 $self->throw("Can't open file $file");
954 Usage : $qualdb->header($id);
955 Function: returns the header of a quality score in the database
956 Returns : header string
957 Args : id of quality score
962 my ($self, $id) = @_;
963 my ($offset,$seqlength,$linelength,$firstline,$file)
964 = &{$self->{unpackmeth
}}($self->{offsets
}{$id}) or return;
965 $offset -= $firstline;
967 my $fh = $self->fh($id) or return;
969 read($fh,$data,$firstline);
971 substr($data,0,1) = '';
976 my ($self, $id, $a) = @_;
978 my ($offset,$seqlength,$linelength,$firstline,$file)
979 = &{$self->{unpackmeth
}}($self->{offsets
}{$id});
981 $a = $seqlength-1 if $a >= $seqlength;
982 my $tl = $self->{offsets
}{__termination_length
};
983 $offset + $linelength * int($a/($linelength-$tl)) + $a % ($linelength-$tl);
987 my ($self, $path) = @_;
988 if (!$self->{fhcache
}{$path}) {
989 if ($self->{curopen
} >= $self->{maxopen
}) {
990 my @lru = sort {$self->{cacheseq
}{$a} <=> $self->{cacheseq
}{$b};}
991 keys %{$self->{fhcache
}};
992 splice(@lru, $self->{maxopen
} / 3);
993 $self->{curopen
} -= @lru;
995 delete $self->{fhcache
}{$_};
998 $self->{fhcache
}{$path} = IO
::File
->new($path) || return;
999 binmode $self->{fhcache
}{$path};
1002 $self->{cacheseq
}{$path}++;
1003 $self->{fhcache
}{$path}
1015 unpack STRUCT
, shift;
1019 unpack STRUCTBIG
, shift;
1022 =head2 get_PrimaryQual_stream
1024 Title : get_PrimaryQual_stream
1025 Usage : $qualdb->get_PrimaryQual_stream
1026 Function: get a SeqIO-like stream of quality scores
1027 Returns : stream object
1032 sub get_PrimaryQual_stream
{
1034 return Bio
::DB
::Qual
::Stream
->new($self);
1039 return $self->new(@_);
1047 shift->throw("Read-only database");
1051 shift->throw("Read-only database");
1055 shift->throw("Read-only database");
1059 defined shift->offset(@_);
1062 sub FIRSTKEY
{ tied(%{shift->{offsets
}})->FIRSTKEY(@_); }
1064 sub NEXTKEY
{ tied(%{shift->{offsets
}})->NEXTKEY(@_); }
1068 if ($self->{indexing
}) { # killed prematurely, so index file is no good!
1069 warn "indexing was interrupted, so deleting $self->{indexing}";
1070 unlink $self->{indexing
};
1074 #-------------------------------------------------------------
1075 # Bio::Seq::PrimaryQual compatibility
1077 # Usage is the same as in Bio::Seq::PrimaryQual
1079 package Bio
::Seq
::PrimaryQual
::Qual
;
1080 use overload
'""' => 'display_id';
1082 use base
qw(Bio::Root::Root Bio::Seq::PrimaryQual);
1085 my ($class, @args) = @_;
1086 my $self = $class->SUPER::new
(@args);
1087 my ($db, $id, $start, $stop) = $self->_rearrange(
1088 [qw(DATABASE ID START STOP)],
1092 $self->{start
} = $start || 1;
1093 $self->{stop
} = $stop || $db->length($id);
1099 my $qual = $self->{db
}->qual($self->{id
},$self->{start
},$self->{stop
});
1104 my ($self, $start, $stop) = @_;
1105 return $self->trunc($start, $stop)->qual;
1109 my ($self, $start, $stop) = @_;
1111 "$stop is smaller than $stop. If you want to truncate and reverse ".
1112 "complement, you must call trunc followed by revcom."
1113 ) if $start > $stop;
1115 if ($self->{start
} <= $self->{stop
}) {
1116 $left = $self->{start
}+$start-1;
1117 $right = $self->{start
}+$stop-1;
1119 $left = $self->{start
}-($start-1);
1120 $right = $self->{start
}-($stop-1);
1122 my $obj = $self->new( -database
=> $self->{db
},
1137 return overload
::StrVal
($self);
1141 # number of quality scores
1143 return scalar(@
{$self->qual});
1148 my $header = $self->{'db'}->header($self->{id
});
1149 # remove the id from the header
1150 $header = (split(/\s+/,$header,2))[2];
1153 *desc
= \
&description
;
1155 #-------------------------------------------------------------
1156 # stream-based access to the database
1159 package Bio
::DB
::Qual
::Stream
;
1160 use base
qw(Tie::Handle Bio::DB::SeqI);
1163 my ($class, $db) = @_;
1164 my $key = $db->FIRSTKEY;
1173 my ($key, $db) = @
{$self}{'key', 'db'};
1174 while ($key =~ /^__/) {
1175 $key = $db->NEXTKEY($key);
1176 return unless defined $key;
1178 my $value = $db->get_Qual_by_id($key);
1179 $self->{key
} = $db->NEXTKEY($key);
1184 my ($class, $db) = @_;
1185 return $class->new($db);