t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / Tools / Alignment / Trim.pm
blob550dc30f7a1b60202198157276128e5975ec14b2
1 # Bio::Tools::Alignment::Trim.pm
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Chad Matsalla
7 # Copyright Chad Matsalla
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tools::Alignment::Trim - A kludge to do specialized trimming of
16 sequence based on quality.
18 =head1 SYNOPSIS
20 use Bio::Tools::Alignment::Trim;
21 $o_trim = Bio::Tools::Alignment::Trim->new();
22 $o_trim->set_reverse_designator("R");
23 $o_trim->set_forward_designator("F");
26 =head1 DESCRIPTION
28 This is a specialized module designed by Chad for Chad to trim sequences
29 based on a highly specialized list of requirements. In other words, write
30 something that will trim sequences 'just like the people in the lab would
31 do manually'.
33 I settled on a sliding-window-average style of search which is ugly and
34 slow but does _exactly_ what I want it to do.
36 Mental note: rewrite this.
38 It is very important to keep in mind the context in which this module was
39 written: strictly to support the projects for which Consed.pm was
40 designed.
42 =head1 FEEDBACK
44 =head2 Mailing Lists
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing
52 lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 the bugs and their resolution. Bug reports can be submitted via the
69 web:
71 https://github.com/bioperl/bioperl-live/issues
73 =head1 AUTHOR - Chad Matsalla
75 Email bioinformatics-at-dieselwurks.com
77 =head1 APPENDIX
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
82 =cut
84 package Bio::Tools::Alignment::Trim;
86 use strict;
87 use Dumpvalue;
89 use vars qw(%DEFAULTS);
91 use base qw(Bio::Root::Root);
93 BEGIN {
94 %DEFAULTS = ( 'f_designator' => 'f',
95 'r_designator' => 'r',
96 'windowsize' => '10',
97 'phreds' => '20');
100 =head2 new()
102 Title : new()
103 Usage : $o_trim = Bio::Tools::Alignment::Trim->new();
104 Function: Construct the Bio::Tools::Alignment::Trim object. No parameters
105 are required to create this object. It is strictly a bundle of
106 functions, as far as I am concerned.
107 Returns : A reference to a Bio::Tools::Alignment::Trim object.
108 Args : (optional)
109 -windowsize (default 10)
110 -phreds (default 20)
113 =cut
115 sub new {
116 my ($class,@args) = @_;
117 my $self = $class->SUPER::new(@args);
118 my($windowsize,$phreds) =
119 $self->_rearrange([qw(
120 WINDOWSIZE
121 PHREDS
123 @args);
124 $self->{windowsize} = $windowsize || $DEFAULTS{'windowsize'};
125 $self->{phreds} = $phreds || $DEFAULTS{'phreds'};
126 # print("Constructor set phreds to ".$self->{phreds}."\n") if $self->verbose > 0;
127 $self->set_designators($DEFAULTS{'f_designator'},
128 $DEFAULTS{'r_designator'});
129 return $self;
132 =head2 set_designators($forward_designator,$reverse_designator)
134 Title : set_designators(<forward>,<reverse>)
135 Usage : $o_trim->set_designators("F","R")
136 Function: Set the string by which the system determines whether a given
137 sequence represents a forward or a reverse read.
138 Returns : Nothing.
139 Args : two scalars: one representing the forward designator and one
140 representing the reverse designator
142 =cut
144 sub set_designators {
145 my $self = shift;
146 ($self->{'f_designator'},$self->{'r_designator'}) = @_;
149 =head2 set_forward_designator($designator)
151 Title : set_forward_designator($designator)
152 Usage : $o_trim->set_forward_designator("F")
153 Function: Set the string by which the system determines if a given
154 sequence is a forward read.
155 Returns : Nothing.
156 Args : A string representing the forward designator of this project.
158 =cut
160 sub set_forward_designator {
161 my ($self,$desig) = @_;
162 $self->{'f_designator'} = $desig;
165 =head2 set_reverse_designator($reverse_designator)
167 Title : set_reverse_designator($reverse_designator)
168 Function: Set the string by which the system determines if a given
169 sequence is a reverse read.
170 Usage : $o_trim->set_reverse_designator("R")
171 Returns : Nothing.
172 Args : A string representing the forward designator of this project.
174 =cut
176 sub set_reverse_designator {
177 my ($self,$desig) = @_;
178 $self->{'r_designator'} = $desig;
181 =head2 get_designators()
183 Title : get_designators()
184 Usage : $o_trim->get_designators()
185 Returns : A string describing the current designators.
186 Args : None
187 Notes : Really for informational purposes only. Duh.
189 =cut
191 sub get_designators {
192 my $self = shift;
193 return("forward: ".$self->{'f_designator'}." reverse: ".$self->{'r_designator'});
196 =head2 trim_leading_polys()
198 Title : trim_leading_polys()
199 Usage : $o_trim->trim_leading_polys()
200 Function: Not implemented. Does nothing.
201 Returns : Nothing.
202 Args : None.
203 Notes : This function is not implemented. Part of something I wanted to
204 do but never got around to doing.
206 =cut
208 sub trim_leading_polys {
209 my ($self, $sequence) = @_;
212 =head2 dump_hash()
214 Title : dump_hash()
215 Usage : $o_trim->dump_hash()
216 Function: Unimplemented.
217 Returns : Nothing.
218 Args : None.
219 Notes : Does nothing.
221 =cut
223 sub dump_hash {
224 my $self = shift;
225 my %hash = %{$self->{'qualities'}};
226 } # end dump_hash
228 =head2 trim_singlet($sequence,$quality,$name,$class)
230 Title : trim_singlet($sequence,$quality,$name,$class)
231 Usage : ($r_trim_points,$trimmed_sequence) =
232 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
233 Function: Trim a singlet based on its quality.
234 Returns : a reference to an array containing the forward and reverse
235 trim points and the trimmed sequence.
236 Args : $sequence : A sequence (SCALAR, please)
237 $quality : A _scalar_ of space-delimited quality values.
238 $name : the name of the sequence
239 $class : The class of the sequence. One of qw(singlet
240 singleton doublet pair multiplet)
241 Notes : At the time this was written the bioperl objects SeqWithQuality
242 and PrimaryQual did not exist. This is what is with the clumsy
243 passing of references and so on. I will rewrite this next time I
244 have to work with it. I also wasn't sure whether this function
245 should return just the trim points or the points and the sequence.
246 I decided that I always wanted both so that's how I implemented
248 - Note that the size of the sliding windows is set during construction of
249 the Bio::Tools::Alignment::Trim object.
251 =cut
253 sub trim_singlet {
254 my ($self,$sequence,$quality,$name,$class) = @_;
255 # this split is done because I normally store quality values in a
256 # space-delimited scalar rather then in an array.
257 # I do this because serialization of the arrays is tough.
258 my @qual = split(' ',$quality);
259 my @points;
260 my $sequence_length = length($sequence);
261 my ($returnstring,$processed_sequence);
262 # smooth out the qualities
263 my $r_windows = &_sliding_window(\@qual,$self->{windowsize});
264 # find out the leading and trailing trimpoints
265 my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds});
266 my (@new_points,$trimmed_sequence);
267 # do you think that any sequence shorter then 100 should be
268 # discarded? I don't think that this should be the decision of this
269 # module.
270 # removed, 020926
271 $points[0] = $start_base;
272 # whew! now for the end base
273 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
274 my $end_base = &_get_end($r_windows,$self->{windowsize},
275 $self->{phreds},$start_base);
276 $points[1] = $end_base;
277 # now do the actual trimming
278 # CHAD : I don't think that it is a good idea to call chop_sequence here
279 # because chop_sequence also removes X's and N's and things
280 # and that is not always what is wanted
281 return \@points;
284 =head2 trim_doublet($sequence,$quality,$name,$class)
286 Title : trim_doublet($sequence,$quality,$name,$class)
287 Usage : ($r_trim_points,$trimmed_sequence) =
288 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
289 Function: Trim a singlet based on its quality.
290 Returns : a reference to an array containing the forward and reverse
291 Args : $sequence : A sequence
292 $quality : A _scalar_ of space-delimited quality values.
293 $name : the name of the sequence
294 $class : The class of the sequence. One of qw(singlet
295 singleton doublet pair multiplet)
296 Notes : At the time this was written the bioperl objects SeqWithQuality
297 and PrimaryQual did not exist. This is what is with the clumsy
298 passing of references and so on. I will rewrite this next time I
299 have to work with it. I also wasn't sure whether this function
300 should return just the trim points or the points and the sequence.
301 I decided that I always wanted both so that's how I implemented
304 =cut
307 sub trim_doublet {
308 my ($self,$sequence,$quality,$name,$class) = @_;
309 my @qual = split(' ',$quality);
310 my @points;
311 my $sequence_length = length($sequence);
312 my ($returnstring,$processed_sequence);
313 # smooth out the qualities
314 my $r_windows = &_sliding_window(\@qual,$self->{windowsize});
315 # determine where the consensus sequence starts
316 my $offset = 0;
317 for (my $current = 0; $current<$sequence_length;$current++) {
318 if ($qual[$current] != 0) {
319 $offset = $current;
320 last;
323 # start_base required: r_quality,$windowsize,$phredvalue
324 my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds},$offset);
325 if ($start_base > ($sequence_length - 100)) {
326 $points[0] = ("FAILED");
327 $points[1] = ("FAILED");
328 return @points;
330 $points[0] = $start_base;
332 # whew! now for the end base
334 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
335 # |
336 # 010420 NOTE: We will no longer get the end base to avoid the Q/--\___/-- syndrome
337 my $end_base = $sequence_length;
338 my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
339 $points[1] = $end_base;
340 # CHAD : I don't think that it is a good idea to call chop_sequence here
341 # because chop_sequence also removes X's and N's and things
342 # and that is not always what is wanted
343 return @points;
344 } # end trim_doublet
346 =head2 chop_sequence($name,$class,$sequence,@points)
348 Title : chop_sequence($name,$class,$sequence,@points)
349 Usage : ($start_point,$end_point,$chopped_sequence) =
350 $o_trim->chop_sequence($name,$class,$sequence,@points);
351 Function: Chop a sequence based on its name, class, and sequence.
352 Returns : an array containing three scalars:
353 1- the start trim point
354 2- the end trim point
355 3- the chopped sequence
356 Args :
357 $name : the name of the sequence
358 $class : The class of the sequence. One of qw(singlet
359 singleton doublet pair multiplet)
360 $sequence : A sequence
361 @points : An array containing two elements- the first contains
362 the start trim point and the second conatines the end trim
363 point.
365 =cut
367 sub chop_sequence {
368 my ($self,$name,$class,$sequence,@points) = @_;
369 print("Coming into chop_sequence, \@points are @points\n");
370 my $fdesig = $self->{'f_designator'};
371 my $rdesig = $self->{'r_designator'};
372 if (!$points[0] && !$points[1]) {
373 $sequence = "junk";
374 return $sequence;
376 if ($class eq "singlet" && $name =~ /$fdesig$/) {
377 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
379 elsif ($class eq "singlet" && $name =~ /$rdesig$/) {
380 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
382 elsif ($class eq "singleton" && $name =~ /$fdesig$/) {
383 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
385 elsif ($class eq "singleton" && $name =~ /$rdesig$/) {
386 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
388 elsif ($class eq "doublet") {
389 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
391 # this is a _terrible_ to do this! i couldn't seem to find a better way
392 # i thought something like s/(^.*[Xx]{5,})//g; might work, but no go
393 # no time to find a fix!
394 my $length_before_trimming = length($sequence);
395 my $subs_Xs = $sequence =~ s/^.*[Xx]{5,}//g;
396 if ($subs_Xs) {
397 my $length_after_trimming = length($sequence);
398 my $number_Xs_trimmed = $length_before_trimming - $length_after_trimming;
399 $points[0] += $number_Xs_trimmed;
401 $length_before_trimming = length($sequence);
402 my $subs_Ns = $sequence =~ s/[Nn]{1,}$//g;
403 if ($subs_Ns) {
404 my $length_after_trimming = length($sequence);
405 my $number_Ns_trimmed = $length_before_trimming - $length_after_trimming;
406 $points[1] -= $number_Ns_trimmed;
407 $points[1] -= 1;
409 push @points,$sequence;
410 print("chop_sequence \@points are @points\n");
411 return @points;
414 =head2 _get_start($r_quals,$windowsize,$phreds,$offset)
416 Title : _get_start($r_quals,$windowsize,$phreds,$offset)
417 Usage : $start_base = $self->_get_start($r_windows,5,20);
418 Function: Provide the start trim point for this sequence.
419 Returns : a scalar representing the start of the sequence
420 Args :
421 $r_quals : A reference to an array containing quality values. In
422 context, this array of values has been smoothed by then
423 sliding window-look ahead algorithm.
424 $windowsize : The size of the window used when the sliding window
425 look-ahead average was calculated.
426 $phreds : <fill in what this does here>
427 $offset : <fill in what this does here>
429 =cut
431 sub _get_start {
432 my ($self,$r_quals,$windowsize,$phreds,$offset) = @_;
433 print("Using $phreds phreds\n") if $self->verbose > 0;
434 # this is to help determine whether the sequence is good at all
435 my @quals = @$r_quals;
436 my ($count,$count2,$qualsum);
437 if ($offset) { $count = $offset; } else { $count = 0; }
438 # search along the length of the sequence
439 for (; ($count+$windowsize) <= scalar(@quals); $count++) {
440 # sum all of the quality values in this window.
441 my $cumulative=0;
442 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
443 if (!$quals[$count2]) {
444 # print("Quals don't exist here!\n");
446 else {
447 $qualsum += $quals[$count2];
448 # print("Incremented qualsum to ($qualsum)\n");
450 $cumulative++;
452 # print("The sum of this window (starting at $count) is $qualsum. I counted $cumulative bases.\n");
453 # if the total of windowsize * phreds is
454 if ($qualsum && $qualsum >= $windowsize*$phreds) { return $count; }
455 $qualsum = 0;
457 # if ($count > scalar(@quals)-$windowsize) { return; }
458 return $count;
461 =head2 _get_end($r_qual,$windowsize,$phreds,$count)
463 Title : _get_end($r_qual,$windowsize,$phreds,$count)
464 Usage : my $end_base = &_get_end($r_windows,20,20,$start_base);
465 Function: Get the end trim point for this sequence.
466 Returns : A scalar representing the end trim point for this sequence.
467 Args :
468 $r_qual : A reference to an array containing quality values. In
469 context, this array of values has been smoothed by then
470 sliding window-look ahead algorithm.
471 $windowsize : The size of the window used when the sliding window
472 look-ahead average was calculated.
473 $phreds : <fill in what this does here>
474 $count : Start looking for the end of the sequence here.
476 =cut
478 sub _get_end {
479 my ($r_qual,$windowsize,$phreds,$count) = @_;
480 my @quals = @$r_qual;
481 my $total_bases = scalar(@quals);
482 my ($count2,$qualsum,$end_of_quals,$bases_counted);
483 if (!$count) { $count=0; }
484 BASE: for (; $count < $total_bases; $count++) {
485 $bases_counted = 0;
486 $qualsum = 0;
487 POSITION: for($count2 = $count; $count2 < $total_bases; $count2++) {
488 $bases_counted++;
490 if ($count2 == $total_bases-1) {
491 $qualsum += $quals[$count2];
492 $bases_counted++;
493 last BASE;
495 elsif ($bases_counted == $windowsize) {
496 $qualsum += $quals[$count2];
497 if ($qualsum < $bases_counted*$phreds) {
498 return $count+$bases_counted+$windowsize;
500 next BASE;
502 else {
503 $qualsum += $quals[$count2];
506 if ($qualsum < $bases_counted*$phreds) {
507 return $count+$bases_counted+$windowsize;
509 else { }
510 $qualsum = 0;
511 } # end for
512 if ($end_of_quals) {
513 my $bases_for_average = $total_bases-$count2;
514 return $count2;
516 else { }
517 if ($qualsum) { } # print ("$qualsum\n");
518 return $total_bases;
519 } # end get_end
521 =head2 count_doublet_trailing_zeros($r_qual)
523 Title : count_doublet_trailing_zeros($r_qual)
524 Usage : my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
525 Function: Find out when the trailing zero qualities start.
526 Returns : A scalar representing where the zeros start.
527 Args : A reference to an array of quality values.
528 Notes : Again, this should be rewritten to use PrimaryQual objects.
529 A more detailed explanation of why phrap puts these zeros here should
530 be written and placed here. Please email and hassle the author.
533 =cut
535 sub count_doublet_trailing_zeros {
536 my ($r_qual) = shift;
537 my $number_of_trailing_zeros = 0;
538 my @qualities = @$r_qual;
539 for (my $current=scalar(@qualities);$current>0;$current--) {
540 if ($qualities[$current] && $qualities[$current] != 0) {
541 $number_of_trailing_zeros = scalar(@qualities)-$current;
542 return $current+1;
545 return scalar(@qualities);
546 } # end count_doublet_trailing_zeros
548 =head2 _sliding_window($r_quals,$windowsize)
550 Title : _sliding_window($r_quals,$windowsize)
551 Usage : my $r_windows = &_sliding_window(\@qual,$windowsize);
552 Function: Create a sliding window, look-forward-average on an array
553 of quality values. Used to smooth out differences in qualities.
554 Returns : A reference to an array containing the smoothed values.
555 Args : $r_quals: A reference to an array containing quality values.
556 $windowsize : The size of the sliding window.
557 Notes : This was written before PrimaryQual objects existed. They
558 should use that object but I haven't rewritten this yet.
560 =cut
563 sub _sliding_window {
564 my ($r_quals,$windowsize) = @_;
565 my (@window,@quals,$qualsum,$count,$count2,$average,@averages,$bases_counted);
566 @quals = @$r_quals;
567 my $size_of_quality = scalar(@quals);
568 # do this loop for all of the qualities
569 for ($count=0; $count <= $size_of_quality; $count++) {
570 $bases_counted = 0;
571 BASE: for($count2 = $count; $count2 < $size_of_quality; $count2++) {
572 $bases_counted++;
573 # if the search hits the end of the averages, stop
574 # this is for the case near the end where bases remaining < windowsize
575 if ($count2 == $size_of_quality) {
576 $qualsum += $quals[$count2];
577 last BASE;
579 # if the search hits the size of the window
580 elsif ($bases_counted == $windowsize) {
581 $qualsum += $quals[$count2];
582 last BASE;
584 # otherwise add the quality value
585 unless (!$quals[$count2]) {
586 $qualsum += $quals[$count2];
589 unless (!$qualsum || !$windowsize) {
590 $average = $qualsum / $bases_counted;
591 if (!$average) { $average = "0"; }
592 push @averages,$average;
594 $qualsum = 0;
596 # 02101 Yes, I repaired the mismatching numbers between averages and windows.
597 # print("There are ".scalar(@$r_quals)." quality values. They are @$r_quals\n");
598 # print("There are ".scalar(@averages)." average values. They are @averages\n");
599 return \@averages;
603 =head2 _print_formatted_qualities
605 Title : _print_formatted_qualities(\@quals)
606 Usage : &_print_formatted_qualities(\@quals);
607 Returns : Nothing. Prints.
608 Args : A reference to an array containing quality values.
609 Notes : An internal procedure used in debugging. Prints out an array nicely.
611 =cut
613 sub _print_formatted_qualities {
614 my $rquals = shift;
615 my @qual = @$rquals;
616 for (my $count=0; $count<scalar(@qual) ; $count++) {
617 if (($count%10)==0) { print("\n$count\t"); }
618 if ($qual[$count]) { print ("$qual[$count]\t");}
619 else { print("0\t"); }
621 print("\n");
624 =head2 _get_end_old($r_qual,$windowsize,$phreds,$count)
626 Title : _get_end_old($r_qual,$windowsize,$phreds,$count)
627 Usage : Deprecated. Don't use this!
628 Returns : Deprecated. Don't use this!
629 Args : Deprecated. Don't use this!
631 =cut
634 sub _get_end_old {
635 my ($r_qual,$windowsize,$phreds,$count) = @_;
636 warn("Do Not Use this function (_get_end_old)");
637 my $target = $windowsize*$phreds;
638 my @quals = @$r_qual;
639 my $total_bases = scalar(@quals);
640 my ($count2,$qualsum,$end_of_quals);
641 if (!$count) { $count=0; }
642 BASE: for (; $count < $total_bases; $count++) {
643 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
644 if ($count2 == scalar(@quals)-1) {
645 $qualsum += $quals[$count2];
646 $end_of_quals = 1;
647 last BASE;
650 $qualsum += $quals[$count2];
652 if ($qualsum < $windowsize*$phreds) {
653 return $count+$windowsize;
655 $qualsum = 0;
656 } # end for
657 } # end get_end_old
660 # Autoload methods go after =cut, and are processed by the autosplit program.
663 __END__