tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Tools / Alignment / Trim.pm
blob626cddac07254f87327dc5cea59f27029c710584
1 # $Id$
2 # Bio::Tools::Alignment::Trim.pm
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chad Matsalla
8 # Copyright Chad Matsalla
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Alignment::Trim - A kludge to do specialized trimming of
17 sequence based on quality.
19 =head1 SYNOPSIS
21 use Bio::Tools::Alignment::Trim;
22 $o_trim = Bio::Tools::Alignment::Trim->new();
23 $o_trim->set_reverse_designator("R");
24 $o_trim->set_forward_designator("F");
27 =head1 DESCRIPTION
29 This is a specialized module designed by Chad for Chad to trim sequences
30 based on a highly specialized list of requirements. In other words, write
31 something that will trim sequences 'just like the people in the lab would
32 do manually'.
34 I settled on a sliding-window-average style of search which is ugly and
35 slow but does _exactly_ what I want it to do.
37 Mental note: rewrite this.
39 It is very important to keep in mind the context in which this module was
40 written: strictly to support the projects for which Consed.pm was
41 designed.
43 =head1 FEEDBACK
45 =head2 Mailing Lists
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to one
49 of the Bioperl mailing lists. Your participation is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing
53 lists
55 =head2 Support
57 Please direct usage questions or support issues to the mailing list:
59 I<bioperl-l@bioperl.org>
61 rather than to the module maintainer directly. Many experienced and
62 reponsive experts will be able look at the problem and quickly
63 address it. Please include a thorough description of the problem
64 with code and data examples if at all possible.
66 =head2 Reporting Bugs
68 Report bugs to the Bioperl bug tracking system to help us keep track
69 the bugs and their resolution. Bug reports can be submitted via the
70 web:
72 http://bugzilla.open-bio.org/
74 =head1 AUTHOR - Chad Matsalla
76 Email bioinformatics-at-dieselwurks.com
78 =head1 APPENDIX
80 The rest of the documentation details each of the object methods.
81 Internal methods are usually preceded with a _
83 =cut
85 package Bio::Tools::Alignment::Trim;
87 use strict;
88 use Dumpvalue;
90 use vars qw(%DEFAULTS);
92 use base qw(Bio::Root::Root);
94 BEGIN {
95 %DEFAULTS = ( 'f_designator' => 'f',
96 'r_designator' => 'r',
97 'windowsize' => '10',
98 'phreds' => '20');
101 =head2 new()
103 Title : new()
104 Usage : $o_trim = Bio::Tools::Alignment::Trim->new();
105 Function: Construct the Bio::Tools::Alignment::Trim object. No parameters
106 are required to create this object. It is strictly a bundle of
107 functions, as far as I am concerned.
108 Returns : A reference to a Bio::Tools::Alignment::Trim object.
109 Args : (optional)
110 -windowsize (default 10)
111 -phreds (default 20)
114 =cut
116 sub new {
117 my ($class,@args) = @_;
118 my $self = $class->SUPER::new(@args);
119 my($windowsize,$phreds) =
120 $self->_rearrange([qw(
121 WINDOWSIZE
122 PHREDS
124 @args);
125 $self->{windowsize} = $windowsize || $DEFAULTS{'windowsize'};
126 $self->{phreds} = $phreds || $DEFAULTS{'phreds'};
127 # print("Constructor set phreds to ".$self->{phreds}."\n") if $self->verbose > 0;
128 $self->set_designators($DEFAULTS{'f_designator'},
129 $DEFAULTS{'r_designator'});
130 return $self;
133 =head2 set_designators($forward_designator,$reverse_designator)
135 Title : set_designators(<forward>,<reverse>)
136 Usage : $o_trim->set_designators("F","R")
137 Function: Set the string by which the system determines whether a given
138 sequence represents a forward or a reverse read.
139 Returns : Nothing.
140 Args : two scalars: one representing the forward designator and one
141 representing the reverse designator
143 =cut
145 sub set_designators {
146 my $self = shift;
147 ($self->{'f_designator'},$self->{'r_designator'}) = @_;
150 =head2 set_forward_designator($designator)
152 Title : set_forward_designator($designator)
153 Usage : $o_trim->set_forward_designator("F")
154 Function: Set the string by which the system determines if a given
155 sequence is a forward read.
156 Returns : Nothing.
157 Args : A string representing the forward designator of this project.
159 =cut
161 sub set_forward_designator {
162 my ($self,$desig) = @_;
163 $self->{'f_designator'} = $desig;
166 =head2 set_reverse_designator($reverse_designator)
168 Title : set_reverse_designator($reverse_designator)
169 Function: Set the string by which the system determines if a given
170 sequence is a reverse read.
171 Usage : $o_trim->set_reverse_designator("R")
172 Returns : Nothing.
173 Args : A string representing the forward designator of this project.
175 =cut
177 sub set_reverse_designator {
178 my ($self,$desig) = @_;
179 $self->{'r_designator'} = $desig;
182 =head2 get_designators()
184 Title : get_designators()
185 Usage : $o_trim->get_designators()
186 Returns : A string describing the current designators.
187 Args : None
188 Notes : Really for informational purposes only. Duh.
190 =cut
192 sub get_designators {
193 my $self = shift;
194 return("forward: ".$self->{'f_designator'}." reverse: ".$self->{'r_designator'});
197 =head2 trim_leading_polys()
199 Title : trim_leading_polys()
200 Usage : $o_trim->trim_leading_polys()
201 Function: Not implemented. Does nothing.
202 Returns : Nothing.
203 Args : None.
204 Notes : This function is not implemented. Part of something I wanted to
205 do but never got around to doing.
207 =cut
209 sub trim_leading_polys {
210 my ($self, $sequence) = @_;
213 =head2 dump_hash()
215 Title : dump_hash()
216 Usage : $o_trim->dump_hash()
217 Function: Unimplemented.
218 Returns : Nothing.
219 Args : None.
220 Notes : Does nothing.
222 =cut
224 sub dump_hash {
225 my $self = shift;
226 my %hash = %{$self->{'qualities'}};
227 } # end dump_hash
229 =head2 trim_singlet($sequence,$quality,$name,$class)
231 Title : trim_singlet($sequence,$quality,$name,$class)
232 Usage : ($r_trim_points,$trimmed_sequence) =
233 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
234 Function: Trim a singlet based on its quality.
235 Returns : a reference to an array containing the forward and reverse
236 trim points and the trimmed sequence.
237 Args : $sequence : A sequence (SCALAR, please)
238 $quality : A _scalar_ of space-delimited quality values.
239 $name : the name of the sequence
240 $class : The class of the sequence. One of qw(singlet
241 singleton doublet pair multiplet)
242 Notes : At the time this was written the bioperl objects SeqWithQuality
243 and PrimaryQual did not exist. This is what is with the clumsy
244 passing of references and so on. I will rewrite this next time I
245 have to work with it. I also wasn't sure whether this function
246 should return just the trim points or the points and the sequence.
247 I decided that I always wanted both so that's how I implemented
249 - Note that the size of the sliding windows is set during construction of
250 the Bio::Tools::Alignment::Trim object.
252 =cut
254 sub trim_singlet {
255 my ($self,$sequence,$quality,$name,$class) = @_;
256 # this split is done because I normally store quality values in a
257 # space-delimited scalar rather then in an array.
258 # I do this because serialization of the arrays is tough.
259 my @qual = split(' ',$quality);
260 my @points;
261 my $sequence_length = length($sequence);
262 my ($returnstring,$processed_sequence);
263 # smooth out the qualities
264 my $r_windows = &_sliding_window(\@qual,$self->{windowsize});
265 # find out the leading and trailing trimpoints
266 my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds});
267 my (@new_points,$trimmed_sequence);
268 # do you think that any sequence shorter then 100 should be
269 # discarded? I don't think that this should be the decision of this
270 # module.
271 # removed, 020926
272 $points[0] = $start_base;
273 # whew! now for the end base
274 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
275 my $end_base = &_get_end($r_windows,$self->{windowsize},
276 $self->{phreds},$start_base);
277 $points[1] = $end_base;
278 # now do the actual trimming
279 # CHAD : I don't think that it is a good idea to call chop_sequence here
280 # because chop_sequence also removes X's and N's and things
281 # and that is not always what is wanted
282 return \@points;
285 =head2 trim_doublet($sequence,$quality,$name,$class)
287 Title : trim_doublet($sequence,$quality,$name,$class)
288 Usage : ($r_trim_points,$trimmed_sequence) =
289 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
290 Function: Trim a singlet based on its quality.
291 Returns : a reference to an array containing the forward and reverse
292 Args : $sequence : A sequence
293 $quality : A _scalar_ of space-delimited quality values.
294 $name : the name of the sequence
295 $class : The class of the sequence. One of qw(singlet
296 singleton doublet pair multiplet)
297 Notes : At the time this was written the bioperl objects SeqWithQuality
298 and PrimaryQual did not exist. This is what is with the clumsy
299 passing of references and so on. I will rewrite this next time I
300 have to work with it. I also wasn't sure whether this function
301 should return just the trim points or the points and the sequence.
302 I decided that I always wanted both so that's how I implemented
305 =cut
308 sub trim_doublet {
309 my ($self,$sequence,$quality,$name,$class) = @_;
310 my @qual = split(' ',$quality);
311 my @points;
312 my $sequence_length = length($sequence);
313 my ($returnstring,$processed_sequence);
314 # smooth out the qualities
315 my $r_windows = &_sliding_window(\@qual,$self->{windowsize});
316 # determine where the consensus sequence starts
317 my $offset = 0;
318 for (my $current = 0; $current<$sequence_length;$current++) {
319 if ($qual[$current] != 0) {
320 $offset = $current;
321 last;
324 # start_base required: r_quality,$windowsize,$phredvalue
325 my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds},$offset);
326 if ($start_base > ($sequence_length - 100)) {
327 $points[0] = ("FAILED");
328 $points[1] = ("FAILED");
329 return @points;
331 $points[0] = $start_base;
333 # whew! now for the end base
335 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
336 # |
337 # 010420 NOTE: We will no longer get the end base to avoid the Q/--\___/-- syndrome
338 my $end_base = $sequence_length;
339 my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
340 $points[1] = $end_base;
341 # CHAD : I don't think that it is a good idea to call chop_sequence here
342 # because chop_sequence also removes X's and N's and things
343 # and that is not always what is wanted
344 return @points;
345 } # end trim_doublet
347 =head2 chop_sequence($name,$class,$sequence,@points)
349 Title : chop_sequence($name,$class,$sequence,@points)
350 Usage : ($start_point,$end_point,$chopped_sequence) =
351 $o_trim->chop_sequence($name,$class,$sequence,@points);
352 Function: Chop a sequence based on its name, class, and sequence.
353 Returns : an array containing three scalars:
354 1- the start trim point
355 2- the end trim point
356 3- the chopped sequence
357 Args :
358 $name : the name of the sequence
359 $class : The class of the sequence. One of qw(singlet
360 singleton doublet pair multiplet)
361 $sequence : A sequence
362 @points : An array containing two elements- the first contains
363 the start trim point and the second conatines the end trim
364 point.
366 =cut
368 sub chop_sequence {
369 my ($self,$name,$class,$sequence,@points) = @_;
370 print("Coming into chop_sequence, \@points are @points\n");
371 my $fdesig = $self->{'f_designator'};
372 my $rdesig = $self->{'r_designator'};
373 if (!$points[0] && !$points[1]) {
374 $sequence = "junk";
375 return $sequence;
377 if ($class eq "singlet" && $name =~ /$fdesig$/) {
378 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
380 elsif ($class eq "singlet" && $name =~ /$rdesig$/) {
381 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
383 elsif ($class eq "singleton" && $name =~ /$fdesig$/) {
384 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
386 elsif ($class eq "singleton" && $name =~ /$rdesig$/) {
387 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
389 elsif ($class eq "doublet") {
390 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
392 # this is a _terrible_ to do this! i couldn't seem to find a better way
393 # i thought something like s/(^.*[Xx]{5,})//g; might work, but no go
394 # no time to find a fix!
395 my $length_before_trimming = length($sequence);
396 my $subs_Xs = $sequence =~ s/^.*[Xx]{5,}//g;
397 if ($subs_Xs) {
398 my $length_after_trimming = length($sequence);
399 my $number_Xs_trimmed = $length_before_trimming - $length_after_trimming;
400 $points[0] += $number_Xs_trimmed;
402 $length_before_trimming = length($sequence);
403 my $subs_Ns = $sequence =~ s/[Nn]{1,}$//g;
404 if ($subs_Ns) {
405 my $length_after_trimming = length($sequence);
406 my $number_Ns_trimmed = $length_before_trimming - $length_after_trimming;
407 $points[1] -= $number_Ns_trimmed;
408 $points[1] -= 1;
410 push @points,$sequence;
411 print("chop_sequence \@points are @points\n");
412 return @points;
415 =head2 _get_start($r_quals,$windowsize,$phreds,$offset)
417 Title : _get_start($r_quals,$windowsize,$phreds,$offset)
418 Usage : $start_base = $self->_get_start($r_windows,5,20);
419 Function: Provide the start trim point for this sequence.
420 Returns : a scalar representing the start of the sequence
421 Args :
422 $r_quals : A reference to an array containing quality values. In
423 context, this array of values has been smoothed by then
424 sliding window-look ahead algorithm.
425 $windowsize : The size of the window used when the sliding window
426 look-ahead average was calculated.
427 $phreds : <fill in what this does here>
428 $offset : <fill in what this does here>
430 =cut
432 sub _get_start {
433 my ($self,$r_quals,$windowsize,$phreds,$offset) = @_;
434 print("Using $phreds phreds\n") if $self->verbose > 0;
435 # this is to help determine whether the sequence is good at all
436 my @quals = @$r_quals;
437 my ($count,$count2,$qualsum);
438 if ($offset) { $count = $offset; } else { $count = 0; }
439 # search along the length of the sequence
440 for (; ($count+$windowsize) <= scalar(@quals); $count++) {
441 # sum all of the quality values in this window.
442 my $cumulative=0;
443 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
444 if (!$quals[$count2]) {
445 # print("Quals don't exist here!\n");
447 else {
448 $qualsum += $quals[$count2];
449 # print("Incremented qualsum to ($qualsum)\n");
451 $cumulative++;
453 # print("The sum of this window (starting at $count) is $qualsum. I counted $cumulative bases.\n");
454 # if the total of windowsize * phreds is
455 if ($qualsum && $qualsum >= $windowsize*$phreds) { return $count; }
456 $qualsum = 0;
458 # if ($count > scalar(@quals)-$windowsize) { return; }
459 return $count;
462 =head2 _get_end($r_qual,$windowsize,$phreds,$count)
464 Title : _get_end($r_qual,$windowsize,$phreds,$count)
465 Usage : my $end_base = &_get_end($r_windows,20,20,$start_base);
466 Function: Get the end trim point for this sequence.
467 Returns : A scalar representing the end trim point for this sequence.
468 Args :
469 $r_qual : A reference to an array containing quality values. In
470 context, this array of values has been smoothed by then
471 sliding window-look ahead algorithm.
472 $windowsize : The size of the window used when the sliding window
473 look-ahead average was calculated.
474 $phreds : <fill in what this does here>
475 $count : Start looking for the end of the sequence here.
477 =cut
479 sub _get_end {
480 my ($r_qual,$windowsize,$phreds,$count) = @_;
481 my @quals = @$r_qual;
482 my $total_bases = scalar(@quals);
483 my ($count2,$qualsum,$end_of_quals,$bases_counted);
484 if (!$count) { $count=0; }
485 BASE: for (; $count < $total_bases; $count++) {
486 $bases_counted = 0;
487 $qualsum = 0;
488 POSITION: for($count2 = $count; $count2 < $total_bases; $count2++) {
489 $bases_counted++;
491 if ($count2 == $total_bases-1) {
492 $qualsum += $quals[$count2];
493 $bases_counted++;
494 last BASE;
496 elsif ($bases_counted == $windowsize) {
497 $qualsum += $quals[$count2];
498 if ($qualsum < $bases_counted*$phreds) {
499 return $count+$bases_counted+$windowsize;
501 next BASE;
503 else {
504 $qualsum += $quals[$count2];
507 if ($qualsum < $bases_counted*$phreds) {
508 return $count+$bases_counted+$windowsize;
510 else { }
511 $qualsum = 0;
512 } # end for
513 if ($end_of_quals) {
514 my $bases_for_average = $total_bases-$count2;
515 return $count2;
517 else { }
518 if ($qualsum) { } # print ("$qualsum\n");
519 return $total_bases;
520 } # end get_end
522 =head2 count_doublet_trailing_zeros($r_qual)
524 Title : count_doublet_trailing_zeros($r_qual)
525 Usage : my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
526 Function: Find out when the trailing zero qualities start.
527 Returns : A scalar representing where the zeros start.
528 Args : A reference to an array of quality values.
529 Notes : Again, this should be rewritten to use PrimaryQual objects.
530 A more detailed explanation of why phrap puts these zeros here should
531 be written and placed here. Please email and hassle the author.
534 =cut
536 sub count_doublet_trailing_zeros {
537 my ($r_qual) = shift;
538 my $number_of_trailing_zeros = 0;
539 my @qualities = @$r_qual;
540 for (my $current=scalar(@qualities);$current>0;$current--) {
541 if ($qualities[$current] && $qualities[$current] != 0) {
542 $number_of_trailing_zeros = scalar(@qualities)-$current;
543 return $current+1;
546 return scalar(@qualities);
547 } # end count_doublet_trailing_zeros
549 =head2 _sliding_window($r_quals,$windowsize)
551 Title : _sliding_window($r_quals,$windowsize)
552 Usage : my $r_windows = &_sliding_window(\@qual,$windowsize);
553 Function: Create a sliding window, look-forward-average on an array
554 of quality values. Used to smooth out differences in qualities.
555 Returns : A reference to an array containing the smoothed values.
556 Args : $r_quals: A reference to an array containing quality values.
557 $windowsize : The size of the sliding window.
558 Notes : This was written before PrimaryQual objects existed. They
559 should use that object but I haven't rewritten this yet.
561 =cut
564 sub _sliding_window {
565 my ($r_quals,$windowsize) = @_;
566 my (@window,@quals,$qualsum,$count,$count2,$average,@averages,$bases_counted);
567 @quals = @$r_quals;
568 my $size_of_quality = scalar(@quals);
569 # do this loop for all of the qualities
570 for ($count=0; $count <= $size_of_quality; $count++) {
571 $bases_counted = 0;
572 BASE: for($count2 = $count; $count2 < $size_of_quality; $count2++) {
573 $bases_counted++;
574 # if the search hits the end of the averages, stop
575 # this is for the case near the end where bases remaining < windowsize
576 if ($count2 == $size_of_quality) {
577 $qualsum += $quals[$count2];
578 last BASE;
580 # if the search hits the size of the window
581 elsif ($bases_counted == $windowsize) {
582 $qualsum += $quals[$count2];
583 last BASE;
585 # otherwise add the quality value
586 unless (!$quals[$count2]) {
587 $qualsum += $quals[$count2];
590 unless (!$qualsum || !$windowsize) {
591 $average = $qualsum / $bases_counted;
592 if (!$average) { $average = "0"; }
593 push @averages,$average;
595 $qualsum = 0;
597 # 02101 Yes, I repaired the mismatching numbers between averages and windows.
598 # print("There are ".scalar(@$r_quals)." quality values. They are @$r_quals\n");
599 # print("There are ".scalar(@averages)." average values. They are @averages\n");
600 return \@averages;
604 =head2 _print_formatted_qualities
606 Title : _print_formatted_qualities(\@quals)
607 Usage : &_print_formatted_qualities(\@quals);
608 Returns : Nothing. Prints.
609 Args : A reference to an array containing quality values.
610 Notes : An internal procedure used in debugging. Prints out an array nicely.
612 =cut
614 sub _print_formatted_qualities {
615 my $rquals = shift;
616 my @qual = @$rquals;
617 for (my $count=0; $count<scalar(@qual) ; $count++) {
618 if (($count%10)==0) { print("\n$count\t"); }
619 if ($qual[$count]) { print ("$qual[$count]\t");}
620 else { print("0\t"); }
622 print("\n");
625 =head2 _get_end_old($r_qual,$windowsize,$phreds,$count)
627 Title : _get_end_old($r_qual,$windowsize,$phreds,$count)
628 Usage : Deprecated. Don't use this!
629 Returns : Deprecated. Don't use this!
630 Args : Deprecated. Don't use this!
632 =cut
635 sub _get_end_old {
636 my ($r_qual,$windowsize,$phreds,$count) = @_;
637 warn("Do Not Use this function (_get_end_old)");
638 my $target = $windowsize*$phreds;
639 my @quals = @$r_qual;
640 my $total_bases = scalar(@quals);
641 my ($count2,$qualsum,$end_of_quals);
642 if (!$count) { $count=0; }
643 BASE: for (; $count < $total_bases; $count++) {
644 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
645 if ($count2 == scalar(@quals)-1) {
646 $qualsum += $quals[$count2];
647 $end_of_quals = 1;
648 last BASE;
651 $qualsum += $quals[$count2];
653 if ($qualsum < $windowsize*$phreds) {
654 return $count+$windowsize;
656 $qualsum = 0;
657 } # end for
658 } # end get_end_old
661 # Autoload methods go after =cut, and are processed by the autosplit program.
664 __END__