tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Assembly / ContigAnalysis.pm
blob5094c41b1a7d82f53a28ee7862bf1a6ed66cef69
1 # $Id$
3 # BioPerl module for Bio::Assembly::ContigAnalysis
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
9 # Copyright Robson Francisco de Souza
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Assembly::ContigAnalysis -
18 Perform analysis on sequence assembly contigs.
20 =head1 SYNOPSIS
22 # Module loading
23 use Bio::Assembly::ContigAnalysis;
25 # Assembly loading methods
26 my $ca = Bio::Assembly::ContigAnalysis->new( -contig=>$contigOBJ );
28 my @lcq = $ca->low_consensus_quality;
29 my @hqd = $ca->high_quality_discrepancies;
30 my @ss = $ca->single_strand_regions;
32 =head1 DESCRIPTION
34 A contig is as a set of sequences, locally aligned to each other, when
35 the sequences in a pair may be aligned. It may also include a
36 consensus sequence. Bio::Assembly::ContigAnalysis is a module
37 holding a collection of methods to analyze contig objects. It was
38 developed around the Bio::Assembly::Contig implementation of contigs and
39 can not work with another contig interface.
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to the
47 Bioperl mailing lists Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 =head2 Support
54 Please direct usage questions or support issues to the mailing list:
56 I<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
63 =head2 Reporting Bugs
65 Report bugs to the Bioperl bug tracking system to help us keep track
66 the bugs and their resolution. Bug reports can be submitted via the
67 web:
69 http://bugzilla.open-bio.org/
71 =head1 AUTHOR - Robson Francisco de Souza
73 Email: rfsouza@citri.iq.usp.br
75 =head1 APPENDIX
77 The rest of the documentation details each of the object
78 methods. Internal methods are usually preceded with a _
80 =cut
82 package Bio::Assembly::ContigAnalysis;
84 use strict;
86 use base qw(Bio::Root::Root);
88 =head1 Object creator
90 =head2 new
92 Title : new
93 Usage : my $contig = Bio::Assembly::ContigAnalysis->new(-contig=>$contigOBJ);
94 Function : Creates a new contig analysis object
95 Returns : Bio::Assembly::ContigAnalysis
96 Args :
97 -contig : a Bio::Assembly::Contig object
99 =cut
101 sub new {
102 my($class,@args) = @_;
103 my $self = $class->SUPER::new(@args);
105 my ($contigOBJ) = $self->_rearrange([qw(CONTIG)],@args);
106 unless ($contigOBJ->isa("Bio::Assembly::Contig")) {
107 $self->throw("ContigAnal works only on Bio::Assembly::Contig objects\n");
110 $self->{'_objref'} = $contigOBJ;
111 return $self;
114 =head1 Analysis methods
116 =head2 high_quality_discrepancies
118 Title : high_quality_discrepancies
119 Usage : my $sfc = $ContigAnal->high_quality_discrepancies();
120 Function :
122 Locates all high quality discrepancies among aligned
123 sequences and the consensus sequence.
125 Note: see Bio::Assembly::Contig POD documentation,
126 section "Coordinate System", for a definition of
127 available types. Default coordinate system type is
128 "gapped consensus", i.e. consensus sequence (with gaps)
129 coordinates. If limits are not specified, the entire
130 alignment is analyzed.
132 Returns : Bio::SeqFeature::Collection
133 Args : optional arguments are
134 -threshold : cutoff value for low quality (minimum high quality)
135 Default: 40
136 -ignore : number of bases that will not be analysed at
137 both ends of contig aligned elements
138 Default: 5
139 -start : start of interval that will be analyzed
140 -end : start of interval that will be analyzed
141 -type : coordinate system type for interval
143 =cut
145 sub high_quality_discrepancies {
146 my ($self,@args) = shift; # Package reference
148 my ($threshold,$ignore,$start,$end,$type) =
149 $self->_rearrange([qw(THRESHOLD IGNORE START END TYPE)],@args);
151 # Defining default threhold and HQD_ignore
152 $threshold = 40 unless (defined($threshold));
153 $ignore = 5 unless (defined($ignore));
154 $type = 'gapped consensus' unless (defined($type));
156 # Changing input coordinates system (if needed)
157 if (defined $start && $type ne 'gapped consensus') {
158 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
159 } elsif (!defined $start) {
160 $start = 1;
162 if (defined $end && $type ne 'gapped consensus') {
163 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end);
164 } elsif (!defined $end) {
165 $end = $self->{'_objref'}->get_consensus_length();
168 # Scanning each read sequence and the contig sequence and
169 # adding discrepancies to Bio::SeqFeature::Collection
170 my @seqIDs = $self->{'_objref'}->get_seq_ids(-start=>$start,
171 -end=>$end,
172 -type=>$type);
173 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
175 my @HQD = ();
176 foreach my $seqID (@seqIDs) {
177 # Setting aligned read sub-sequence limits and loading data
178 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
179 my $qual = $self->{'_objref'}->get_qual_by_name($seqID);
180 unless (defined $qual) {
181 $self->warn("Can't correctly evaluate HQD without aligned sequence qualities for $seqID");
182 next;
184 my $sequence = $seq->seq;
185 my @quality = @{ $qual->qual };
187 # Scanning the aligned region of each read
188 my $seq_ix = 0;
189 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
190 if (!$coord) {
191 $self->warn("Read $seqID has no alignment coordinates; considered low quality.\nSkipping...");
192 next;
194 my ($astart,$aend) = ($coord->start,$coord->end);
195 $astart = $astart + $ignore; # Redefining limits to evaluate HQDs (jump $ignore at start)
196 $aend = $aend - $ignore; # Redefining limits to evaluate HQDs (stop $ignore before end)
198 my ($d_start,$d_end,$i);
199 for ($i=$astart-1; $i<=$aend-1; $i++) {
200 # Changing coordinate $i+1 from 'gapped consensus' mode to "aligned $seqID" (coordinate $seq_ix)
201 $seq_ix = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$i+1);
202 next unless (($i >= $start) && ($i <= $end));
204 my $r_base = uc(substr($sequence,$seq_ix-1,1));
205 my $c_base = uc(substr($consensus,$i,1));
207 # Discrepant region start: store $d_start and $type
208 (!defined($d_start) &&
209 ($r_base ne $c_base) &&
210 ($quality[$seq_ix-1] >= $threshold)) && do {
211 $d_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i+1);
212 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
213 next;
216 # Quality change or end of discrepant region: store limits and undef $d_start
217 if (defined($d_start) &&
218 (($quality[$seq_ix-1] < $threshold) ||
219 (uc($r_base) eq uc($c_base)))) {
220 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
221 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
222 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID",
223 -start=>$d_start,
224 -end=>$d_end,
225 -strand=>$seq->strand()) );
226 $d_start = undef;
227 next;
229 } # for ($i=$astart-1; $i<=$aend-1; $i++)
231 # Loading discrepancies located at sub-sequence end, if any.
232 if (defined($d_start)) {
233 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
234 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID",
235 -start=>$d_start,
236 -end=>$d_end,
237 -strand=>$seq->strand()) );
239 } # foreach my $seqID (@seqIDs)
241 return @HQD;
244 =head2 low_consensus_quality
246 Title : low_consensus_quality
247 Usage : my $sfc = $ContigAnal->low_consensus_quality();
248 Function : Locates all low quality regions in the consensus
249 Returns : an array of Bio::SeqFeature::Generic objects
250 Args : optional arguments are
251 -threshold : cutoff value for low quality (minimum high quality)
252 Default: 25
253 -start : start of interval that will be analyzed
254 -end : start of interval that will be analyzed
255 -type : coordinate system type for interval
257 =cut
259 sub low_consensus_quality {
260 my ($self,@args) = shift; # Packege reference
262 my ($threshold,$start,$end,$type) =
263 $self->_rearrange([qw(THRESHOLD START END TYPE)],@args);
265 # Setting default value for threshold
266 $threshold = 25 unless (defined($threshold));
268 # Loading qualities
269 my @quality = @{ $self->{'_objref'}->get_consensus_quality()->qual };
271 # Changing coordinates to gap mode noaln (consed: consensus without alignments)
272 $start = 1 unless (defined($start));
273 if (defined $start && defined $type && ($type ne 'gapped consensus')) {
274 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
275 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
277 $end = $self->{'_objref'}->get_consensus_length unless (defined $end);
279 # Scanning @quality vector and storing intervals limits with base qualities less then
280 # the threshold value
281 my ($lcq_start);
282 my ($i,@LCQ);
283 for ($i=$start-1; $i<=$end-1; $i++) {
284 # print $quality[$i],"\t",$i,"\n";
285 if (!defined($lcq_start) && (($quality[$i] <= $threshold) || ($quality[$i] == 98))) {
286 $lcq_start = $i+1;
287 } elsif (defined($lcq_start) && ($quality[$i] > $threshold)) {
288 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
289 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
290 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start,
291 -end=>$lcq_end,
292 -primary=>'low_consensus_quality') );
293 $lcq_start = undef;
297 if (defined $lcq_start) {
298 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
299 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
300 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start,
301 -end=>$lcq_end,
302 -primary=>'low_consensus_quality') );
305 return @LCQ;
308 =head2 not_confirmed_on_both_strands
310 Title : low_quality_consensus
311 Usage : my $sfc = $ContigAnal->low_quality_consensus();
312 Function :
314 Locates all regions whose consensus bases were not
315 confirmed by bases from sequences aligned in both
316 orientations, i.e., in such regions, no bases in aligned
317 sequences of either +1 or -1 strand agree with the
318 consensus bases.
320 Returns : an array of Bio::SeqFeature::Generic objects
321 Args : optional arguments are
322 -start : start of interval that will be analyzed
323 -end : start of interval that will be analyzed
324 -type : coordinate system type for interval
326 =cut
328 sub not_confirmed_on_both_strands {
329 my ($self,@args) = shift; # Package reference
331 my ($start,$end,$type) =
332 $self->_rearrange([qw(START END TYPE)],@args);
334 # Changing coordinates to default system 'align' (contig sequence with alignments)
335 $start = 1 unless (defined($start));
336 if (defined($type) && ($type ne 'gapped consensus')) {
337 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
338 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
340 $end = $self->{'_objref'}->get_consensus_length unless (defined($end));
342 # Scanning alignment
343 my %confirmed = (); # If ($confirmed{$orientation}[$i] > 0) then $i is confirmed in $orientation strand
344 my ($i);
345 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
346 foreach my $seqID ($self->{'_objref'}->get_seq_ids) {
347 # Setting aligned read sub-sequence limits and loading data
348 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
349 my $sequence = $seq->seq;
351 # Scanning the aligned regions of each read and registering confirmed sites
352 my $contig_ix = 0;
353 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
354 my ($astart,$aend,$orientation) = ($coord->start,$coord->end,$coord->strand);
355 $astart = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$astart);
356 $aend = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$aend);
357 for ($i=$astart-1; $i<=$aend-1; $i++) {
358 # $i+1 in 'align' mode is $contig_ix
359 $contig_ix = $self->{'_objref'}->change_coord("aligned $seqID",'gapped consensus',$i+1);
360 next unless (($contig_ix >= $start) && ($contig_ix <= $end));
361 my $r_base = uc(substr($sequence,$i,1));
362 my $c_base = uc(substr($consensus,$contig_ix-1,1));
363 if ($c_base eq '-') {
364 $confirmed{$orientation}[$contig_ix] = -1;
365 } elsif (uc($r_base) eq uc($c_base)) { # Non discrepant region found
366 $confirmed{$orientation}[$contig_ix]++;
368 } # for ($i=$astart-1; $i<=$aend-1; $i++)
369 } # foreach $seqID (@reads)
371 # Locating non-confirmed aligned regions for each orientation in $confirmed registry
372 my ($orientation);
373 my @NCBS = ();
374 foreach $orientation (keys %confirmed) {
375 my ($ncbs_start,$ncbs_end);
377 for ($i=$start; $i<=$end; $i++) {
378 if (!defined($ncbs_start) &&
379 (!defined($confirmed{$orientation}[$i]) || ($confirmed{$orientation}[$i] == 0))) {
380 $ncbs_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
381 } elsif (defined($ncbs_start) &&
382 defined($confirmed{$orientation}[$i]) &&
383 ($confirmed{$orientation}[$i] > 0)) {
384 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i-1);
385 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start,
386 -end=>$ncbs_end,
387 -strand=>$orientation,
388 -primary=>"not_confirmed_on_both_strands") );
389 $ncbs_start = undef;
393 if (defined($ncbs_start)) { # NCBS at the end of contig
394 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$end);
395 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start,
396 -end=>$ncbs_end,
397 -strand=>$orientation,
398 -primary=>'not_confirmed_on_both_strands') );
402 return @NCBS;
405 =head2 single_strand
407 Title : single_strand
408 Usage : my $sfc = $ContigAnal->single_strand();
409 Function :
411 Locates all regions covered by aligned sequences only in
412 one of the two strands, i.e., regions for which aligned
413 sequence's strand() method returns +1 or -1 for all
414 sequences.
416 Returns : an array of Bio::SeqFeature::Generic objects
417 Args : optional arguments are
418 -start : start of interval that will be analyzed
419 -end : start of interval that will be analyzed
420 -type : coordinate system type for interval
422 =cut
425 sub single_strand {
426 my ($self,@args) = shift; # Package reference
428 my ($start,$end,$type) =
429 $self->_rearrange([qw(START END TYPE)],@args);
431 # Changing coordinates to gap mode align (consed: consensus sequence with alignments)
432 $type = 'gapped consensus' unless(defined($type));
433 $start = 1 unless (defined($start));
434 if (defined($type) && $type ne 'gapped consensus') {
435 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
436 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
438 ($end) = $self->{'_objref'}->get_consensus_length unless (defined($end));
440 # Loading complete list of coordinates for aligned sequences
441 my $sfc = $self->{'_objref'}->get_features_collection();
442 my @forward = grep { $_->primary_tag =~ /^_aligned_coord:/ }
443 $sfc->features_in_range(-start=>$start,
444 -end=>$end,
445 -contain=>0,
446 -strand=>1,
447 -strandmatch=>'strong');
448 my @reverse = grep { $_->primary_tag =~ /^_aligned_coord:/ }
449 $sfc->features_in_range(-start=>$start,
450 -end=>$end,
451 -contain=>0,
452 -strand=>-1,
453 -strandmatch=>'strong');
454 # Merging overlapping features
455 @forward = $self->_merge_overlapping_features(@forward);
456 @reverse = $self->_merge_overlapping_features(@reverse);
458 # Finding single stranded regions
459 my ($length) = $self->{'_objref'}->get_consensus_length;
460 $length = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$length);
461 @forward = $self->_complementary_features_list(1,$length,@forward);
462 @reverse = $self->_complementary_features_list(1,$length,@reverse);
464 my @SS = ();
465 foreach my $feat (@forward, @reverse) {
466 $feat->primary_tag('single_strand_region');
467 push(@SS,$feat);
470 return @SS;
473 =head1 Internal Methods
475 =head2 _merge_overlapping_features
477 Title : _merge_overlapping_features
478 Usage : my @feat = $ContigAnal->_merge_overlapping_features(@features);
479 Function : Merge all overlapping features into features
480 that hold original features as sub-features
481 Returns : array of Bio::SeqFeature::Generic objects
482 Args : array of Bio::SeqFeature::Generic objects
484 =cut
486 sub _merge_overlapping_features {
487 my ($self,@feat) = @_;
489 $self->throw_not_implemented();
492 =head2 _complementary_features_list
494 Title : _complementary_features_list
495 Usage : @feat = $ContigAnal->_complementary_features_list($start,$end,@features);
496 Function : Build a list of features for regions
497 not covered by features in @features array
498 Returns : array of Bio::SeqFeature::Generic objects
499 Args :
500 $start : [integer] start of first output feature
501 $end : [integer] end of last output feature
502 @features : array of Bio::SeqFeature::Generic objects
504 =cut
506 sub _complementary_features_list {
507 my ($self,$start,$end,@feat) = @_;
509 $self->throw_not_implemented();
514 __END__