2 # BioPerl module for Bio::Seq::Quality
4 # Please direct questions and support issues to
5 # <bioperl-l@bioperl.org>
7 # Cared for by Heikki Lehvaslaiho
9 # Copyright Heikki Lehvaslaiho
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Seq::Quality - Implementation of sequence with residue quality
22 use Bio::Seq::Quality;
24 # input can be space delimited string or array ref
25 my $qual = '0 1 2 3 4 5 6 7 8 9 11 12';
26 my $trace = '0 5 10 15 20 25 30 35 40 45 50 55';
28 my $seq = Bio::Seq::Quality->new
30 -trace_indices => $trace,
31 -seq => 'atcgatcgatcg',
33 -accession_number => 'S000012',
34 -verbose => -1 # to silence deprecated methods
37 my $quals = $seq->qual; # array ref
38 my $traces = $seq->trace; # array ref
40 my $quals = $seq->qual_text; # string
41 my $traces = $seq->trace_text; # string
45 $quals = $seq->subqual(2, 3); # array ref
46 $traces = $seq->subtrace(2, 3); # array ref
47 $quals = $seq->subqual_text(2, 3); # string
48 $quals = $seq->subtrace_text(2, 3); # string
51 $seq->subqual(2, 3, "9 9");
52 $seq->subtrace(2, 3, "9 9");
58 This object stores base quality values together with the sequence
61 It is a reimplementation of Chad Matsalla's Bio::Seq::SeqWithQuality
62 module using Bio::Seq::MetaI.
64 The implementation is based on Bio::Seq::Meta::Array. qual() and
65 trace() are base methods to store and retrieve information that have
66 extensions to retrieve values as a scalar (e.g. qual_text() ), or get
67 or set subvalues (e.g. subqual() ). See L<Bio::Seq::MetaI> for more
70 All the functional code is in Bio::Seq::Meta::Array.
72 There deprecated methods that are included for compatibility with
73 Bio::Seq::SeqWithQuality. These will print a warning unless verbosity
74 of the object is set to be less than zero.
76 =head2 Differences from Bio::Seq::SeqWithQuality
78 It is not possible to fully follow the interface of
79 Bio::Seq::SeqWithQuality since internally a Bio::Seq::SeqWithQuality
80 object is a composite of two independent objects: a Bio::PrimarySeq
81 object and Bio::Seq::PrimaryQual object. Both of these objects can be
82 created separately and merged into Bio::Seq::SeqWithQuality.
84 This implementation is based on Bio::Seq::Meta::Array that is a
85 subclass of Bio::PrimarySeq that stores any number of meta information
88 Here we assume that two meta sets, called 'qual' and 'trace_indices'
89 are attached to a sequence. (But there is nothing that prevents you to
90 add as many named meta sets as you need using normal meta() methods).
92 qual() is an alias to meta(), qualat($loc) is an alias to
95 trace_indices() in Bio::Seq::SeqWithQuality has been abbreviated to
96 trace() and is an alias to named_meta('trace').
98 You can create an object without passing any arguments to the
99 constructor (Bio::Seq::SeqWithQuality fails without alphabet). It will
100 warn about not being able to set alphabet unless you set verbosity of
101 the object to a negative value.
103 After the latest rewrite, the meta information sets (quality and
104 trace) no longer cover all the residues automatically. Methods to
105 check the length of meta information (L<quality_length>,
106 L<trace_length>)and to see if the ends are flushed to the sequence
107 have been added (L<quality_is_flush>, L<trace_is_flush>). To force the
108 old functinality, set L<force_flush> to true.
110 qual_obj() and seq_obj() methods do not exist!
112 Finally, there is only one set of descriptors (primary_id, display_id,
113 accession_number) for the object.
119 L<Bio::Seq::Meta::Array>
125 User feedback is an integral part of the evolution of this and other
126 Bioperl modules. Send your comments and suggestions preferably to one
127 of the Bioperl mailing lists. Your participation is much appreciated.
129 bioperl-l@bioperl.org - General discussion
130 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
134 Please direct usage questions or support issues to the mailing list:
136 I<bioperl-l@bioperl.org>
138 rather than to the module maintainer directly. Many experienced and
139 reponsive experts will be able look at the problem and quickly
140 address it. Please include a thorough description of the problem
141 with code and data examples if at all possible.
143 =head2 Reporting Bugs
145 Report bugs to the Bioperl bug tracking system to help us keep track
146 the bugs and their resolution. Bug reports can be submitted via the
149 https://github.com/bioperl/bioperl-live/issues
151 =head1 AUTHOR - Heikki Lehvaslaiho
153 Email heikki-at-bioperl-dot-org
157 Chad Matsalla, bioinformatics at dieselwurks dot com
159 Dan Bolser, dan dot bolser at gmail dot com
163 The rest of the documentation details each of the object methods.
164 Internal methods are usually preceded with a _
169 # Let the code begin...
172 package Bio
::Seq
::Quality
;
175 use base
qw(Bio::Seq::Meta::Array);
177 ## Is this the right place (and way) to define this?
178 our $MASK_CHAR = 'X';
179 our $DEFAULT_NAME = 'DEFAULT';
186 Usage : $metaseq = Bio::Seq::Quality->new
187 ( -qual => '0 1 2 3 4 5 6 7 8 9 11 12',
188 -trace => '0 5 10 15 20 25 30 35 40 45 50 55',
189 -seq => 'atcgatcgatcg',
191 -accession_number => 'S000012',
193 Function: Constructor for Bio::Seq::Quality class. Note that you can
194 provide an empty quality and trace strings.
196 Returns : a new Bio::Seq::Quality object
202 my ($class, @args) = @_;
204 my $self = $class->SUPER::new
(@args);
206 my($meta, $qual, $trace, $trace_indices, $trace_data) =
207 $self->_rearrange([qw(META
214 $self->{'_meta'}->{$DEFAULT_NAME} = [];
215 $self->{'_meta'}->{'trace'} = [];
216 $self->{trace_data
} = $trace_data;
218 $meta && $self->meta($meta);
219 $qual && $self->qual($qual);
220 $trace && $self->named_meta('trace', $trace);
221 $trace_indices && $self->named_meta('trace', $trace_indices);
233 Usage : $qual_values = $obj->qual($values_string);
236 Get and set method for the meta data starting from residue
237 position one. Since it is dependent on the length of the
238 sequence, it needs to be manipulated after the sequence.
240 The length of the returned value always matches the length
243 Returns : reference to an array of meta data
244 Args : new value, string or array ref or Bio::Seq::PrimaryQual, optional
246 Setting quality values resets the clear range.
253 $value = $value->qual
254 if ref($value) and ref($value) ne 'ARRAY' and
255 $value->isa('Bio::Seq::PrimaryQual');
256 $self->_empty_cache if $value;
257 return $self->named_meta($DEFAULT_NAME, $value);
263 Usage : $qual_values = $obj->qual_text($values_arrayref);
264 Function: Variant of meta() and qual() guarantied to return a string
265 representation of meta data. For details, see L<meta>.
267 Args : new value, optional
272 return join ' ', @
{shift->named_submeta($DEFAULT_NAME, @_)};
278 Usage : $subset_of_qual_values = $obj->subqual(10, 20, $value_string);
279 $subset_of_qual_values = $obj->subqual(10, undef, $value_string);
282 Get and set method for meta data for subsequences.
284 Numbering starts from 1 and the number is inclusive, ie 1-2
285 are the first two residue of the sequence. Start cannot be
286 larger than end but can be equal.
288 If the second argument is missing the returned values
289 should extend to the end of the sequence.
291 Returns : A reference to an array
292 Args : integer, start position
293 integer, end position, optional when a third argument present
299 shift->named_submeta($DEFAULT_NAME, @_);
305 Usage : $meta_values = $obj->subqual_text(20, $value_string);
306 Function: Variant of subqual() returning a stringified
307 representation of meta data. For details, see L<Bio::Seq::MetaI>.
309 Args : new value, optional
314 return join ' ', @
{shift->named_submeta($DEFAULT_NAME, @_)};
317 =head2 quality_length
319 Title : quality_length()
320 Usage : $qual_len = $obj->quality_length();
321 Function: return the number of elements in the quality array
329 return $self->named_meta_length($DEFAULT_NAME);
332 =head2 quality_is_flush
334 Title : quality_is_flush
335 Usage : $quality_is_flush = $obj->quality_is_flush()
336 Function: Boolean to tell if the trace length equals the sequence length.
337 Returns true if force_flush() is set.
338 Returns : boolean 1 or 0
343 sub quality_is_flush
{
344 return shift->is_flush('quality');
354 Usage : $trace_values = $obj->trace($values_string);
357 Get and set method for the meta data starting from residue
358 position one. Since it is dependent on the length of the
359 sequence, it needs to be manipulated after the sequence.
361 The length of the returned value always matches the length
364 Returns : reference to an array of meta data
365 Args : new value, string or array ref, optional
372 return $self->named_meta('trace', $value);
378 Usage : $trace_values = $obj->trace_text($values_arrayref);
379 Function: Variant of meta() and trace() guarantied to return a string
380 representation of meta data. For details, see L<meta>.
382 Args : new value, optional
387 return join ' ', @
{shift->named_submeta('trace', @_)};
393 Usage : $subset_of_trace_values = $obj->subtrace(10, 20, $value_string);
394 $subset_of_trace_values = $obj->subtrace(10, undef, $value_string);
397 Get and set method for meta data for subsequences.
399 Numbering starts from 1 and the number is inclusive, ie 1-2
400 are the first two residue of the sequence. Start cannot be
401 larger than end but can be equal.
403 If the second argument is missing the returned values
404 should extend to the end of the sequence.
406 Returns : A reference to an array
407 Args : integer, start position
408 integer, end position, optional when a third argument present
415 return shift->named_submeta('trace', @_);
420 Title : subtrace_text
421 Usage : $meta_values = $obj->subtrace_text(20, $value_string);
422 Function: Variant of subtrace() returning a stringified
423 representation of meta data. For details, see L<Bio::Seq::MetaI>.
425 Args : new value, optional
430 return join ' ', @
{shift->named_submeta('trace', @_)};
435 Title : trace_length()
436 Usage : $trace_len = $obj->trace_length();
437 Function: return the number of elements in the trace set
445 return $self->named_meta_length('trace');
448 =head2 trace_is_flush
450 Title : trace_is_flush
451 Usage : $trace_is_flush = $obj->trace_is_flush()
452 Function: Boolean to tell if the trace length equals the sequence length.
453 Returns true if force_flush() is set.
454 Returns : boolean 1 or 0
460 return shift->is_flush('trace');
465 =head2 get_trace_graph
467 Title : get_trace_graph
468 Usage : @trace_values = $obj->get_trace_graph( -trace => 'a',
470 Function : Returns array of raw trace values for a trace file, or
471 false if no trace data exists. Requires a value for trace
472 to get either the a, g, c or t trace information, and an
473 optional value for scale, which rescales the data between
474 0 and the provided value, a scale value of '0' performs no
477 Args : string, trace to retrieve, one of a, g, c or t integer,
478 scale, for scaling of trace between 0 and scale, or 0 for
487 $self->_rearrange([qw(TRACE
491 unless (defined($self->{trace_data
})) { return 0 }
492 unless (grep { lc($trace) eq $_ } ('a', 'g', 'c', 't')) { return 0 }
493 $trace = lc($trace) . "_trace";
494 my @trace_data = exists $self->{trace_data
}->{$trace} &&
495 ref $self->{trace_data
}->{$trace} eq 'ARRAY' ?
496 @
{$self->{trace_data
}->{$trace}} : ();
497 my $max = $self->{trace_data
}->{max_height
};
498 if (defined($scale) and $scale != 0)
500 @trace_data = map { $_ / $max * $scale } @trace_data;
509 Usage : $qual->threshold($value);
510 Function: Sets the quality threshold.
512 Args : new value, optional
514 Value used by *clear_range* method below.
521 if (defined $value) {
522 $self->throw("Threshold needs to be an integer [$value]")
523 unless $value =~ /^[-+]?\d+$/;
525 if defined $self->{_threshold
} and $self->{_threshold
} ne $value;
526 $self->{_threshold
} = $value;
528 return $self->{_threshold
};
532 =head2 mask_below_threshold
534 Title : mask_below_threshold
535 Usage : $count = $obj->count_clear_ranges($threshold);
536 Function: Counts number of ranges in the sequence where quality
537 values are above the threshold
538 Returns : count integer
539 Args : threshold integer, optional
541 Set threshold first using method L<threshold>.
545 sub mask_below_threshold
{
547 my $threshold = shift;
549 $self->threshold($threshold) if defined $threshold;
551 # populate the cache if needed
552 $self->_find_clear_ranges unless defined $self->{_ranges
};
554 my $maskSeq = $self->seq;
555 my $maskQual = $self->qual;
557 ## There must be a more efficient way than this!
558 for(my $i=0; $i<length($maskSeq); $i++){
559 #print join ("\t", $i, $maskQual->[$i]), "\n";
560 substr($maskSeq, $i, 1, $MASK_CHAR)
561 if $maskQual->[$i] < $self->{_threshold
};
564 ## This is the *wrong* way to do it!
565 #for my $r (@{$self->{_ranges}} ){
566 # substr($maskSeq, $r->{start}, $r->{length}, $MASK_CHAR x $r->{length});
572 =head2 count_clear_ranges
574 Title : count_clear_ranges
575 Usage : $count = $obj->count_clear_ranges($threshold);
576 Function: Counts number of ranges in the sequence where quality
577 values are above the threshold
578 Returns : count integer
579 Args : threshold integer, optional
581 Set threshold first using method L<threshold>.
585 sub count_clear_ranges
{
587 my $threshold = shift;
588 $self->threshold($threshold) if defined $threshold;
590 # populate the cache if needed
591 $self->_find_clear_ranges unless defined $self->{_ranges
};
593 return scalar @
{$self->{_ranges
}};
596 =head2 clear_ranges_length
598 Title : clear_ranges_length
599 Usage : $total_lenght = $obj->clear_ranges_length($threshold);
600 Function: Return number of residues with quality values above
601 the threshold in all clear ranges
603 Args : threshold, optional
605 Set threshold first using method L<threshold>.
607 I think this method needs a better name! count_high_quality_bases? or
612 sub clear_ranges_length
{
614 my $threshold = shift;
615 $self->threshold($threshold) if defined $threshold;
617 # populate the cache if needed
618 $self->_find_clear_ranges unless defined $self->{_ranges
};
621 map {$sum += $_->{length}} @
{$self->{_ranges
}};
625 =head2 get_clear_range
627 Title : get_clear_range
628 Usage : $newqualobj = $obj->get_clear_range($threshold);
629 Function: Return longest subsequence that has quality values above
630 the given threshold, or a default value of 13
631 Returns : a new Bio::Seq::Quality object
632 Args : threshold, optional
634 Set threshold first using method L<threshold>.
636 Note, this method could be implemented using some gaussian smoothing
637 of the quality scores. Currently one base below the threshold is
638 enough to end the clear range.
642 sub get_clear_range
{
644 my $threshold = shift;
645 $self->threshold($threshold) if defined $threshold;
647 # populate the cache if needed
648 $self->_find_clear_ranges unless defined $self->{_ranges
};
651 return unless defined $self->{_ranges
};
654 for (sort {$b->{length} <=> $a->{length} } @
{$self->{_ranges
}} ){
655 my $newqualobj = Bio
::Seq
::Quality
->new(
656 -seq
=> $self->subseq( $_->{start
}, $_->{end
}),
657 -qual
=> $self->subqual($_->{start
}, $_->{end
}),
660 $newqualobj->threshold($threshold);
668 =head2 get_all_clean_ranges
670 Title : get_all_clean_ranges
671 Usage : @ranges = $obj->get_all_clean_ranges($minlength);
672 Function: Return all ranges where quality values are above
673 the threshold. Original ordering.
674 Returns : an ordered array of new Bio::Seq::Quality objects
675 Args : minimum length , optional
677 Set threshold first using method L<threshold>.
681 sub get_all_clean_ranges
{
683 my $minl = shift || 0;
685 $self->throw("Mimimum length needs to be zero or a positive integer [$minl]")
686 unless $minl =~ /^\+?\d+$/;
688 # populate the cache if needed
689 $self->_find_clear_ranges unless defined $self->{_ranges
};
691 # return in the order of occurrence
693 for my $r (sort {$b->{start
} <=> $a->{start
} } @
{$self->{_ranges
}} ){
694 next if $r->{length} < $minl;
696 ## Constructor should allow "-threshold => ..."!
697 push @ranges, Bio
::Seq
::Quality
->new
698 ( -seq
=> $self->subseq( $r->{start
}, $r->{end
}),
699 -qual
=> $self->subqual($r->{start
}, $r->{end
}),
708 # _find_clear_ranges: where range/threshold calculations happen
711 sub _find_clear_ranges
{
713 my $qual = $self->qual;
715 $self->throw("You need to set the threshold value first")
716 unless defined $self->threshold;
718 my $threshold = $self->threshold;
722 for(my $i=0; $i<@
$qual; $i++){
723 ## Are we currently within a clear range or not?
725 ## Did we just leave the clear range?
726 if($qual->[$i]<$threshold){
729 $range->{end
} = $i-1;
730 $range->{start
} = $rangeFlag;
731 $range->{length} = $i - $rangeFlag;
732 push @
{$self->{_ranges
}}, $range;
733 ## and reset the range flag.
736 ## else nothing changes
739 ## Did we just enter a clear range?
740 if($qual->[$i]>=$threshold){
741 ## then set the range flag!
744 ## else nothing changes
747 ## Did we exit the last clear range?
749 my $i = scalar(@
$qual);
752 $range->{end
} = $i-1;
753 $range->{start
} = $rangeFlag;
754 $range->{length} = $i - $rangeFlag;
755 push @
{$self->{_ranges
}}, $range;
764 undef $self->{_ranges
};
770 ################## deprecated methods ##################
775 return $self->named_meta('trace');
779 my ($self, $val) =@_;
780 return shift @
{$self->named_submeta('trace', $val, $val)};
784 sub sub_trace_index
{
786 return $self->named_submeta('trace', @_);
791 my ($self, $val) =@_;
792 return shift @
{$self->submeta($val, $val)};
797 my ($self,$val) = @_;
798 return $self->subseq($val,$val);