2 # BioPerl module for Bio::Seq::Quality
4 # Please direct questions and support issues to
5 # <bioperl-l@bioperl.org>
7 # Cared for by Heikki Lehvaslaiho
9 # Copyright Heikki Lehvaslaiho
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Seq::Quality - Implementation of sequence with residue quality
22 use Bio::Seq::Quality;
24 # input can be space delimited string or array ref
25 my $qual = '0 1 2 3 4 5 6 7 8 9 11 12';
26 my $trace = '0 5 10 15 20 25 30 35 40 45 50 55';
28 my $seq = Bio::Seq::Quality->new
30 -trace_indices => $trace,
31 -seq => 'atcgatcgatcg',
33 -accession_number => 'S000012',
34 -verbose => -1 # to silence deprecated methods
37 my $quals = $seq->qual; # array ref
38 my $traces = $seq->trace; # array ref
40 my $quals = $seq->qual_text; # string
41 my $traces = $seq->trace_text; # string
45 $quals = $seq->subqual(2, 3); # array ref
46 $traces = $seq->subtrace(2, 3); # array ref
47 $quals = $seq->subqual_text(2, 3); # string
48 $quals = $seq->subtrace_text(2, 3); # string
51 $seq->subqual(2, 3, "9 9");
52 $seq->subtrace(2, 3, "9 9");
58 This object stores base quality values together with the sequence
61 The implementation is based on Bio::Seq::Meta::Array. qual() and
62 trace() are base methods to store and retrieve information that have
63 extensions to retrieve values as a scalar (e.g. qual_text() ), or get
64 or set subvalues (e.g. subqual() ). See L<Bio::Seq::MetaI> for more
67 All the functional code is in Bio::Seq::Meta::Array.
69 There deprecated methods that are included for compatibility with
70 Bio::Seq::SeqWithQuality. These will print a warning unless verbosity
71 of the object is set to be less than zero.
76 L<Bio::Seq::Meta::Array>
82 User feedback is an integral part of the evolution of this and other
83 Bioperl modules. Send your comments and suggestions preferably to one
84 of the Bioperl mailing lists. Your participation is much appreciated.
86 bioperl-l@bioperl.org - General discussion
87 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
91 Please direct usage questions or support issues to the mailing list:
93 I<bioperl-l@bioperl.org>
95 rather than to the module maintainer directly. Many experienced and
96 reponsive experts will be able look at the problem and quickly
97 address it. Please include a thorough description of the problem
98 with code and data examples if at all possible.
100 =head2 Reporting Bugs
102 Report bugs to the Bioperl bug tracking system to help us keep track
103 the bugs and their resolution. Bug reports can be submitted via the
106 https://github.com/bioperl/bioperl-live/issues
108 =head1 AUTHOR - Heikki Lehvaslaiho
110 Email heikki-at-bioperl-dot-org
114 Chad Matsalla, bioinformatics at dieselwurks dot com
116 Dan Bolser, dan dot bolser at gmail dot com
120 The rest of the documentation details each of the object methods.
121 Internal methods are usually preceded with a _
126 # Let the code begin...
129 package Bio
::Seq
::Quality
;
132 use base
qw(Bio::Seq::Meta::Array);
134 ## Is this the right place (and way) to define this?
135 our $MASK_CHAR = 'X';
136 our $DEFAULT_NAME = 'DEFAULT';
143 Usage : $metaseq = Bio::Seq::Quality->new
144 ( -qual => '0 1 2 3 4 5 6 7 8 9 11 12',
145 -trace => '0 5 10 15 20 25 30 35 40 45 50 55',
146 -seq => 'atcgatcgatcg',
148 -accession_number => 'S000012',
150 Function: Constructor for Bio::Seq::Quality class. Note that you can
151 provide an empty quality and trace strings.
153 Returns : a new Bio::Seq::Quality object
159 my ($class, @args) = @_;
161 my $self = $class->SUPER::new
(@args);
163 my($meta, $qual, $trace, $trace_indices, $trace_data) =
164 $self->_rearrange([qw(META
171 $self->{'_meta'}->{$DEFAULT_NAME} = [];
172 $self->{'_meta'}->{'trace'} = [];
173 $self->{trace_data
} = $trace_data;
175 $meta && $self->meta($meta);
176 $qual && $self->qual($qual);
177 $trace && $self->named_meta('trace', $trace);
178 $trace_indices && $self->named_meta('trace', $trace_indices);
190 Usage : $qual_values = $obj->qual($values_string);
193 Get and set method for the meta data starting from residue
194 position one. Since it is dependent on the length of the
195 sequence, it needs to be manipulated after the sequence.
197 The length of the returned value always matches the length
200 Returns : reference to an array of meta data
201 Args : new value, string or array ref or Bio::Seq::PrimaryQual, optional
203 Setting quality values resets the clear range.
210 $value = $value->qual
211 if ref($value) and ref($value) ne 'ARRAY' and
212 $value->isa('Bio::Seq::PrimaryQual');
213 $self->_empty_cache if $value;
214 return $self->named_meta($DEFAULT_NAME, $value);
220 Usage : $qual_values = $obj->qual_text($values_arrayref);
221 Function: Variant of meta() and qual() guarantied to return a string
222 representation of meta data. For details, see L<meta>.
224 Args : new value, optional
229 return join ' ', @
{shift->named_submeta($DEFAULT_NAME, @_)};
235 Usage : $subset_of_qual_values = $obj->subqual(10, 20, $value_string);
236 $subset_of_qual_values = $obj->subqual(10, undef, $value_string);
239 Get and set method for meta data for subsequences.
241 Numbering starts from 1 and the number is inclusive, ie 1-2
242 are the first two residue of the sequence. Start cannot be
243 larger than end but can be equal.
245 If the second argument is missing the returned values
246 should extend to the end of the sequence.
248 Returns : A reference to an array
249 Args : integer, start position
250 integer, end position, optional when a third argument present
256 shift->named_submeta($DEFAULT_NAME, @_);
262 Usage : $meta_values = $obj->subqual_text(20, $value_string);
263 Function: Variant of subqual() returning a stringified
264 representation of meta data. For details, see L<Bio::Seq::MetaI>.
266 Args : new value, optional
271 return join ' ', @
{shift->named_submeta($DEFAULT_NAME, @_)};
274 =head2 quality_length
276 Title : quality_length()
277 Usage : $qual_len = $obj->quality_length();
278 Function: return the number of elements in the quality array
286 return $self->named_meta_length($DEFAULT_NAME);
289 =head2 quality_is_flush
291 Title : quality_is_flush
292 Usage : $quality_is_flush = $obj->quality_is_flush()
293 Function: Boolean to tell if the trace length equals the sequence length.
294 Returns true if force_flush() is set.
295 Returns : boolean 1 or 0
300 sub quality_is_flush
{
301 return shift->is_flush('quality');
311 Usage : $trace_values = $obj->trace($values_string);
314 Get and set method for the meta data starting from residue
315 position one. Since it is dependent on the length of the
316 sequence, it needs to be manipulated after the sequence.
318 The length of the returned value always matches the length
321 Returns : reference to an array of meta data
322 Args : new value, string or array ref, optional
329 return $self->named_meta('trace', $value);
335 Usage : $trace_values = $obj->trace_text($values_arrayref);
336 Function: Variant of meta() and trace() guarantied to return a string
337 representation of meta data. For details, see L<meta>.
339 Args : new value, optional
344 return join ' ', @
{shift->named_submeta('trace', @_)};
350 Usage : $subset_of_trace_values = $obj->subtrace(10, 20, $value_string);
351 $subset_of_trace_values = $obj->subtrace(10, undef, $value_string);
354 Get and set method for meta data for subsequences.
356 Numbering starts from 1 and the number is inclusive, ie 1-2
357 are the first two residue of the sequence. Start cannot be
358 larger than end but can be equal.
360 If the second argument is missing the returned values
361 should extend to the end of the sequence.
363 Returns : A reference to an array
364 Args : integer, start position
365 integer, end position, optional when a third argument present
372 return shift->named_submeta('trace', @_);
377 Title : subtrace_text
378 Usage : $meta_values = $obj->subtrace_text(20, $value_string);
379 Function: Variant of subtrace() returning a stringified
380 representation of meta data. For details, see L<Bio::Seq::MetaI>.
382 Args : new value, optional
387 return join ' ', @
{shift->named_submeta('trace', @_)};
392 Title : trace_length()
393 Usage : $trace_len = $obj->trace_length();
394 Function: return the number of elements in the trace set
402 return $self->named_meta_length('trace');
405 =head2 trace_is_flush
407 Title : trace_is_flush
408 Usage : $trace_is_flush = $obj->trace_is_flush()
409 Function: Boolean to tell if the trace length equals the sequence length.
410 Returns true if force_flush() is set.
411 Returns : boolean 1 or 0
417 return shift->is_flush('trace');
422 =head2 get_trace_graph
424 Title : get_trace_graph
425 Usage : @trace_values = $obj->get_trace_graph( -trace => 'a',
427 Function : Returns array of raw trace values for a trace file, or
428 false if no trace data exists. Requires a value for trace
429 to get either the a, g, c or t trace information, and an
430 optional value for scale, which rescales the data between
431 0 and the provided value, a scale value of '0' performs no
434 Args : string, trace to retrieve, one of a, g, c or t integer,
435 scale, for scaling of trace between 0 and scale, or 0 for
444 $self->_rearrange([qw(TRACE
448 unless (defined($self->{trace_data
})) { return 0 }
449 unless (grep { lc($trace) eq $_ } ('a', 'g', 'c', 't')) { return 0 }
450 $trace = lc($trace) . "_trace";
451 my @trace_data = exists $self->{trace_data
}->{$trace} &&
452 ref $self->{trace_data
}->{$trace} eq 'ARRAY' ?
453 @
{$self->{trace_data
}->{$trace}} : ();
454 my $max = $self->{trace_data
}->{max_height
};
455 if (defined($scale) and $scale != 0)
457 @trace_data = map { $_ / $max * $scale } @trace_data;
466 Usage : $qual->threshold($value);
467 Function: Sets the quality threshold.
469 Args : new value, optional
471 Value used by *clear_range* method below.
478 if (defined $value) {
479 $self->throw("Threshold needs to be an integer [$value]")
480 unless $value =~ /^[-+]?\d+$/;
482 if defined $self->{_threshold
} and $self->{_threshold
} ne $value;
483 $self->{_threshold
} = $value;
485 return $self->{_threshold
};
489 =head2 mask_below_threshold
491 Title : mask_below_threshold
492 Usage : $count = $obj->count_clear_ranges($threshold);
493 Function: Counts number of ranges in the sequence where quality
494 values are above the threshold
495 Returns : count integer
496 Args : threshold integer, optional
498 Set threshold first using method L<threshold>.
502 sub mask_below_threshold
{
504 my $threshold = shift;
506 $self->threshold($threshold) if defined $threshold;
508 # populate the cache if needed
509 $self->_find_clear_ranges unless defined $self->{_ranges
};
511 my $maskSeq = $self->seq;
512 my $maskQual = $self->qual;
514 ## There must be a more efficient way than this!
515 for(my $i=0; $i<length($maskSeq); $i++){
516 #print join ("\t", $i, $maskQual->[$i]), "\n";
517 substr($maskSeq, $i, 1, $MASK_CHAR)
518 if $maskQual->[$i] < $self->{_threshold
};
521 ## This is the *wrong* way to do it!
522 #for my $r (@{$self->{_ranges}} ){
523 # substr($maskSeq, $r->{start}, $r->{length}, $MASK_CHAR x $r->{length});
529 =head2 count_clear_ranges
531 Title : count_clear_ranges
532 Usage : $count = $obj->count_clear_ranges($threshold);
533 Function: Counts number of ranges in the sequence where quality
534 values are above the threshold
535 Returns : count integer
536 Args : threshold integer, optional
538 Set threshold first using method L<threshold>.
542 sub count_clear_ranges
{
544 my $threshold = shift;
545 $self->threshold($threshold) if defined $threshold;
547 # populate the cache if needed
548 $self->_find_clear_ranges unless defined $self->{_ranges
};
550 return scalar @
{$self->{_ranges
}};
553 =head2 clear_ranges_length
555 Title : clear_ranges_length
556 Usage : $total_lenght = $obj->clear_ranges_length($threshold);
557 Function: Return number of residues with quality values above
558 the threshold in all clear ranges
560 Args : threshold, optional
562 Set threshold first using method L<threshold>.
564 I think this method needs a better name! count_high_quality_bases? or
569 sub clear_ranges_length
{
571 my $threshold = shift;
572 $self->threshold($threshold) if defined $threshold;
574 # populate the cache if needed
575 $self->_find_clear_ranges unless defined $self->{_ranges
};
578 map {$sum += $_->{length}} @
{$self->{_ranges
}};
582 =head2 get_clear_range
584 Title : get_clear_range
585 Usage : $newqualobj = $obj->get_clear_range($threshold);
586 Function: Return longest subsequence that has quality values above
587 the given threshold, or a default value of 13
588 Returns : a new Bio::Seq::Quality object
589 Args : threshold, optional
591 Set threshold first using method L<threshold>.
593 Note, this method could be implemented using some gaussian smoothing
594 of the quality scores. Currently one base below the threshold is
595 enough to end the clear range.
599 sub get_clear_range
{
601 my $threshold = shift;
602 $self->threshold($threshold) if defined $threshold;
604 # populate the cache if needed
605 $self->_find_clear_ranges unless defined $self->{_ranges
};
608 return unless defined $self->{_ranges
};
611 for (sort {$b->{length} <=> $a->{length} } @
{$self->{_ranges
}} ){
612 my $newqualobj = Bio
::Seq
::Quality
->new(
613 -seq
=> $self->subseq( $_->{start
}, $_->{end
}),
614 -qual
=> $self->subqual($_->{start
}, $_->{end
}),
617 $newqualobj->threshold($threshold);
625 =head2 get_all_clean_ranges
627 Title : get_all_clean_ranges
628 Usage : @ranges = $obj->get_all_clean_ranges($minlength);
629 Function: Return all ranges where quality values are above
630 the threshold. Original ordering.
631 Returns : an ordered array of new Bio::Seq::Quality objects
632 Args : minimum length , optional
634 Set threshold first using method L<threshold>.
638 sub get_all_clean_ranges
{
640 my $minl = shift || 0;
642 $self->throw("Mimimum length needs to be zero or a positive integer [$minl]")
643 unless $minl =~ /^\+?\d+$/;
645 # populate the cache if needed
646 $self->_find_clear_ranges unless defined $self->{_ranges
};
648 # return in the order of occurrence
650 for my $r (sort {$b->{start
} <=> $a->{start
} } @
{$self->{_ranges
}} ){
651 next if $r->{length} < $minl;
653 ## Constructor should allow "-threshold => ..."!
654 push @ranges, Bio
::Seq
::Quality
->new
655 ( -seq
=> $self->subseq( $r->{start
}, $r->{end
}),
656 -qual
=> $self->subqual($r->{start
}, $r->{end
}),
665 # _find_clear_ranges: where range/threshold calculations happen
668 sub _find_clear_ranges
{
670 my $qual = $self->qual;
672 $self->throw("You need to set the threshold value first")
673 unless defined $self->threshold;
675 my $threshold = $self->threshold;
679 for(my $i=0; $i<@
$qual; $i++){
680 ## Are we currently within a clear range or not?
682 ## Did we just leave the clear range?
683 if($qual->[$i]<$threshold){
686 $range->{end
} = $i-1;
687 $range->{start
} = $rangeFlag;
688 $range->{length} = $i - $rangeFlag;
689 push @
{$self->{_ranges
}}, $range;
690 ## and reset the range flag.
693 ## else nothing changes
696 ## Did we just enter a clear range?
697 if($qual->[$i]>=$threshold){
698 ## then set the range flag!
701 ## else nothing changes
704 ## Did we exit the last clear range?
706 my $i = scalar(@
$qual);
709 $range->{end
} = $i-1;
710 $range->{start
} = $rangeFlag;
711 $range->{length} = $i - $rangeFlag;
712 push @
{$self->{_ranges
}}, $range;
721 undef $self->{_ranges
};
725 ################## deprecated methods ##################
729 return $self->named_meta('trace');
733 my ($self, $val) =@_;
734 return shift @
{$self->named_submeta('trace', $val, $val)};
738 sub sub_trace_index
{
740 return $self->named_submeta('trace', @_);
745 my ($self, $val) =@_;
746 return shift @
{$self->submeta($val, $val)};
751 my ($self,$val) = @_;
752 return $self->subseq($val,$val);