[bug 2714]
[bioperl-live.git] / Bio / Assembly / ContigAnalysis.pm
blob0ce45f7117876bcaf5b22f394dabd5c47b030b04
1 # $Id$
3 # BioPerl module for Bio::Assembly::ContigAnalysis
5 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
7 # Copyright Robson Francisco de Souza
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Assembly::ContigAnalysis -
16 Perform analysis on sequence assembly contigs.
18 =head1 SYNOPSIS
20 # Module loading
21 use Bio::Assembly::ContigAnalysis;
23 # Assembly loading methods
24 my $ca = Bio::Assembly::ContigAnalysis->new( -contig=>$contigOBJ );
26 my @lcq = $ca->low_consensus_quality;
27 my @hqd = $ca->high_quality_discrepancies;
28 my @ss = $ca->single_strand_regions;
30 =head1 DESCRIPTION
32 A contig is as a set of sequences, locally aligned to each other, when
33 the sequences in a pair may be aligned. It may also include a
34 consensus sequence. Bio::Assembly::ContigAnalysis is a module
35 holding a collection of methods to analyze contig objects. It was
36 developed around the Bio::Assembly::Contig implementation of contigs and
37 can not work with another contig interface.
39 =head1 FEEDBACK
41 =head2 Mailing Lists
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to the
45 Bioperl mailing lists Your participation is much appreciated.
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50 =head2 Reporting Bugs
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 the bugs and their resolution. Bug reports can be submitted via the
54 web:
56 http://bugzilla.open-bio.org/
58 =head1 AUTHOR - Robson Francisco de Souza
60 Email: rfsouza@citri.iq.usp.br
62 =head1 APPENDIX
64 The rest of the documentation details each of the object
65 methods. Internal methods are usually preceded with a _
67 =cut
69 package Bio::Assembly::ContigAnalysis;
71 use strict;
73 use base qw(Bio::Root::Root);
75 =head1 Object creator
77 =head2 new
79 Title : new
80 Usage : my $contig = Bio::Assembly::ContigAnalysis->new(-contig=>$contigOBJ);
81 Function : Creates a new contig analysis object
82 Returns : Bio::Assembly::ContigAnalysis
83 Args :
84 -contig : a Bio::Assembly::Contig object
86 =cut
88 sub new {
89 my($class,@args) = @_;
90 my $self = $class->SUPER::new(@args);
92 my ($contigOBJ) = $self->_rearrange([qw(CONTIG)],@args);
93 unless ($contigOBJ->isa("Bio::Assembly::Contig")) {
94 $self->throw("ContigAnal works only on Bio::Assembly::Contig objects\n");
97 $self->{'_objref'} = $contigOBJ;
98 return $self;
101 =head1 Analysis methods
103 =head2 high_quality_discrepancies
105 Title : high_quality_discrepancies
106 Usage : my $sfc = $ContigAnal->high_quality_discrepancies();
107 Function :
109 Locates all high quality discrepancies among aligned
110 sequences and the consensus sequence.
112 Note: see Bio::Assembly::Contig POD documentation,
113 section "Coordinate System", for a definition of
114 available types. Default coordinate system type is
115 "gapped consensus", i.e. consensus sequence (with gaps)
116 coordinates. If limits are not specified, the entire
117 alignment is analyzed.
119 Returns : Bio::SeqFeature::Collection
120 Args : optional arguments are
121 -threshold : cutoff value for low quality (minimum high quality)
122 Default: 40
123 -ignore : number of bases that will not be analysed at
124 both ends of contig aligned elements
125 Default: 5
126 -start : start of interval that will be analyzed
127 -end : start of interval that will be analyzed
128 -type : coordinate system type for interval
130 =cut
132 sub high_quality_discrepancies {
133 my ($self,@args) = shift; # Package reference
135 my ($threshold,$ignore,$start,$end,$type) =
136 $self->_rearrange([qw(THRESHOLD IGNORE START END TYPE)],@args);
138 # Defining default threhold and HQD_ignore
139 $threshold = 40 unless (defined($threshold));
140 $ignore = 5 unless (defined($ignore));
141 $type = 'gapped consensus' unless (defined($type));
143 # Changing input coordinates system (if needed)
144 if (defined $start && $type ne 'gapped consensus') {
145 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
146 } elsif (!defined $start) {
147 $start = 1;
149 if (defined $end && $type ne 'gapped consensus') {
150 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end);
151 } elsif (!defined $end) {
152 $end = $self->{'_objref'}->get_consensus_length();
155 # Scanning each read sequence and the contig sequence and
156 # adding discrepancies to Bio::SeqFeature::Collection
157 my @seqIDs = $self->{'_objref'}->get_seq_ids(-start=>$start,
158 -end=>$end,
159 -type=>$type);
160 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
162 my @HQD = ();
163 foreach my $seqID (@seqIDs) {
164 # Setting aligned read sub-sequence limits and loading data
165 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
166 my $qual = $self->{'_objref'}->get_qual_by_name($seqID);
167 unless (defined $qual) {
168 $self->warn("Can't correctly evaluate HQD without aligned sequence qualities for $seqID");
169 next;
171 my $sequence = $seq->seq;
172 my @quality = @{ $qual->qual };
174 # Scanning the aligned region of each read
175 my $seq_ix = 0;
176 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
177 if (!$coord) {
178 $self->warn("Read $seqID has no alignment coordinates; considered low quality.\nSkipping...");
179 next;
181 my ($astart,$aend) = ($coord->start,$coord->end);
182 $astart = $astart + $ignore; # Redefining limits to evaluate HQDs (jump $ignore at start)
183 $aend = $aend - $ignore; # Redefining limits to evaluate HQDs (stop $ignore before end)
185 my ($d_start,$d_end,$i);
186 for ($i=$astart-1; $i<=$aend-1; $i++) {
187 # Changing coordinate $i+1 from 'gapped consensus' mode to "aligned $seqID" (coordinate $seq_ix)
188 $seq_ix = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$i+1);
189 next unless (($i >= $start) && ($i <= $end));
191 my $r_base = uc(substr($sequence,$seq_ix-1,1));
192 my $c_base = uc(substr($consensus,$i,1));
194 # Discrepant region start: store $d_start and $type
195 (!defined($d_start) &&
196 ($r_base ne $c_base) &&
197 ($quality[$seq_ix-1] >= $threshold)) && do {
198 $d_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i+1);
199 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
200 next;
203 # Quality change or end of discrepant region: store limits and undef $d_start
204 if (defined($d_start) &&
205 (($quality[$seq_ix-1] < $threshold) ||
206 (uc($r_base) eq uc($c_base)))) {
207 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
208 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
209 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID",
210 -start=>$d_start,
211 -end=>$d_end,
212 -strand=>$seq->strand()) );
213 $d_start = undef;
214 next;
216 } # for ($i=$astart-1; $i<=$aend-1; $i++)
218 # Loading discrepancies located at sub-sequence end, if any.
219 if (defined($d_start)) {
220 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
221 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID",
222 -start=>$d_start,
223 -end=>$d_end,
224 -strand=>$seq->strand()) );
226 } # foreach my $seqID (@seqIDs)
228 return @HQD;
231 =head2 low_consensus_quality
233 Title : low_consensus_quality
234 Usage : my $sfc = $ContigAnal->low_consensus_quality();
235 Function : Locates all low quality regions in the consensus
236 Returns : an array of Bio::SeqFeature::Generic objects
237 Args : optional arguments are
238 -threshold : cutoff value for low quality (minimum high quality)
239 Default: 25
240 -start : start of interval that will be analyzed
241 -end : start of interval that will be analyzed
242 -type : coordinate system type for interval
244 =cut
246 sub low_consensus_quality {
247 my ($self,@args) = shift; # Packege reference
249 my ($threshold,$start,$end,$type) =
250 $self->_rearrange([qw(THRESHOLD START END TYPE)],@args);
252 # Setting default value for threshold
253 $threshold = 25 unless (defined($threshold));
255 # Loading qualities
256 my @quality = @{ $self->{'_objref'}->get_consensus_quality()->qual };
258 # Changing coordinates to gap mode noaln (consed: consensus without alignments)
259 $start = 1 unless (defined($start));
260 if (defined $start && defined $type && ($type ne 'gapped consensus')) {
261 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
262 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
264 $end = $self->{'_objref'}->get_consensus_length unless (defined $end);
266 # Scanning @quality vector and storing intervals limits with base qualities less then
267 # the threshold value
268 my ($lcq_start);
269 my ($i,@LCQ);
270 for ($i=$start-1; $i<=$end-1; $i++) {
271 # print $quality[$i],"\t",$i,"\n";
272 if (!defined($lcq_start) && (($quality[$i] <= $threshold) || ($quality[$i] == 98))) {
273 $lcq_start = $i+1;
274 } elsif (defined($lcq_start) && ($quality[$i] > $threshold)) {
275 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
276 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
277 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start,
278 -end=>$lcq_end,
279 -primary=>'low_consensus_quality') );
280 $lcq_start = undef;
284 if (defined $lcq_start) {
285 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
286 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
287 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start,
288 -end=>$lcq_end,
289 -primary=>'low_consensus_quality') );
292 return @LCQ;
295 =head2 not_confirmed_on_both_strands
297 Title : low_quality_consensus
298 Usage : my $sfc = $ContigAnal->low_quality_consensus();
299 Function :
301 Locates all regions whose consensus bases were not
302 confirmed by bases from sequences aligned in both
303 orientations, i.e., in such regions, no bases in aligned
304 sequences of either +1 or -1 strand agree with the
305 consensus bases.
307 Returns : an array of Bio::SeqFeature::Generic objects
308 Args : optional arguments are
309 -start : start of interval that will be analyzed
310 -end : start of interval that will be analyzed
311 -type : coordinate system type for interval
313 =cut
315 sub not_confirmed_on_both_strands {
316 my ($self,@args) = shift; # Package reference
318 my ($start,$end,$type) =
319 $self->_rearrange([qw(START END TYPE)],@args);
321 # Changing coordinates to default system 'align' (contig sequence with alignments)
322 $start = 1 unless (defined($start));
323 if (defined($type) && ($type ne 'gapped consensus')) {
324 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
325 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
327 $end = $self->{'_objref'}->get_consensus_length unless (defined($end));
329 # Scanning alignment
330 my %confirmed = (); # If ($confirmed{$orientation}[$i] > 0) then $i is confirmed in $orientation strand
331 my ($i);
332 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
333 foreach my $seqID ($self->{'_objref'}->get_seq_ids) {
334 # Setting aligned read sub-sequence limits and loading data
335 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
336 my $sequence = $seq->seq;
338 # Scanning the aligned regions of each read and registering confirmed sites
339 my $contig_ix = 0;
340 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
341 my ($astart,$aend,$orientation) = ($coord->start,$coord->end,$coord->strand);
342 $astart = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$astart);
343 $aend = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$aend);
344 for ($i=$astart-1; $i<=$aend-1; $i++) {
345 # $i+1 in 'align' mode is $contig_ix
346 $contig_ix = $self->{'_objref'}->change_coord("aligned $seqID",'gapped consensus',$i+1);
347 next unless (($contig_ix >= $start) && ($contig_ix <= $end));
348 my $r_base = uc(substr($sequence,$i,1));
349 my $c_base = uc(substr($consensus,$contig_ix-1,1));
350 if ($c_base eq '-') {
351 $confirmed{$orientation}[$contig_ix] = -1;
352 } elsif (uc($r_base) eq uc($c_base)) { # Non discrepant region found
353 $confirmed{$orientation}[$contig_ix]++;
355 } # for ($i=$astart-1; $i<=$aend-1; $i++)
356 } # foreach $seqID (@reads)
358 # Locating non-confirmed aligned regions for each orientation in $confirmed registry
359 my ($orientation);
360 my @NCBS = ();
361 foreach $orientation (keys %confirmed) {
362 my ($ncbs_start,$ncbs_end);
364 for ($i=$start; $i<=$end; $i++) {
365 if (!defined($ncbs_start) &&
366 (!defined($confirmed{$orientation}[$i]) || ($confirmed{$orientation}[$i] == 0))) {
367 $ncbs_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
368 } elsif (defined($ncbs_start) &&
369 defined($confirmed{$orientation}[$i]) &&
370 ($confirmed{$orientation}[$i] > 0)) {
371 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i-1);
372 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start,
373 -end=>$ncbs_end,
374 -strand=>$orientation,
375 -primary=>"not_confirmed_on_both_strands") );
376 $ncbs_start = undef;
380 if (defined($ncbs_start)) { # NCBS at the end of contig
381 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$end);
382 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start,
383 -end=>$ncbs_end,
384 -strand=>$orientation,
385 -primary=>'not_confirmed_on_both_strands') );
389 return @NCBS;
392 =head2 single_strand
394 Title : single_strand
395 Usage : my $sfc = $ContigAnal->single_strand();
396 Function :
398 Locates all regions covered by aligned sequences only in
399 one of the two strands, i.e., regions for which aligned
400 sequence's strand() method returns +1 or -1 for all
401 sequences.
403 Returns : an array of Bio::SeqFeature::Generic objects
404 Args : optional arguments are
405 -start : start of interval that will be analyzed
406 -end : start of interval that will be analyzed
407 -type : coordinate system type for interval
409 =cut
412 sub single_strand {
413 my ($self,@args) = shift; # Package reference
415 my ($start,$end,$type) =
416 $self->_rearrange([qw(START END TYPE)],@args);
418 # Changing coordinates to gap mode align (consed: consensus sequence with alignments)
419 $type = 'gapped consensus' unless(defined($type));
420 $start = 1 unless (defined($start));
421 if (defined($type) && $type ne 'gapped consensus') {
422 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
423 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
425 ($end) = $self->{'_objref'}->get_consensus_length unless (defined($end));
427 # Loading complete list of coordinates for aligned sequences
428 my $sfc = $self->{'_objref'}->get_features_collection();
429 my @forward = grep { $_->primary_tag =~ /^_aligned_coord:/ }
430 $sfc->features_in_range(-start=>$start,
431 -end=>$end,
432 -contain=>0,
433 -strand=>1,
434 -strandmatch=>'strong');
435 my @reverse = grep { $_->primary_tag =~ /^_aligned_coord:/ }
436 $sfc->features_in_range(-start=>$start,
437 -end=>$end,
438 -contain=>0,
439 -strand=>-1,
440 -strandmatch=>'strong');
441 # Merging overlapping features
442 @forward = $self->_merge_overlapping_features(@forward);
443 @reverse = $self->_merge_overlapping_features(@reverse);
445 # Finding single stranded regions
446 my ($length) = $self->{'_objref'}->get_consensus_length;
447 $length = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$length);
448 @forward = $self->_complementary_features_list(1,$length,@forward);
449 @reverse = $self->_complementary_features_list(1,$length,@reverse);
451 my @SS = ();
452 foreach my $feat (@forward, @reverse) {
453 $feat->primary_tag('single_strand_region');
454 push(@SS,$feat);
457 return @SS;
460 =head1 Internal Methods
462 =head2 _merge_overlapping_features
464 Title : _merge_overlapping_features
465 Usage : my @feat = $ContigAnal->_merge_overlapping_features(@features);
466 Function : Merge all overlapping features into features
467 that hold original features as sub-features
468 Returns : array of Bio::SeqFeature::Generic objects
469 Args : array of Bio::SeqFeature::Generic objects
471 =cut
473 sub _merge_overlapping_features {
474 my ($self,@feat) = @_;
476 $self->throw_not_implemented();
479 =head2 _complementary_features_list
481 Title : _complementary_features_list
482 Usage : @feat = $ContigAnal->_complementary_features_list($start,$end,@features);
483 Function : Build a list of features for regions
484 not covered by features in @features array
485 Returns : array of Bio::SeqFeature::Generic objects
486 Args :
487 $start : [integer] start of first output feature
488 $end : [integer] end of last output feature
489 @features : array of Bio::SeqFeature::Generic objects
491 =cut
493 sub _complementary_features_list {
494 my ($self,$start,$end,@feat) = @_;
496 $self->throw_not_implemented();
501 __END__