Couple more minor edits ...
[bioperl-live.git] / Bio / Assembly / ContigAnalysis.pm
blob934d7f9a340f117201a2f50b4fb0976343e368d8
2 # BioPerl module for Bio::Assembly::ContigAnalysis
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
8 # Copyright Robson Francisco de Souza
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Assembly::ContigAnalysis -
17 Perform analysis on sequence assembly contigs.
19 =head1 SYNOPSIS
21 # Module loading
22 use Bio::Assembly::ContigAnalysis;
24 # Assembly loading methods
25 my $ca = Bio::Assembly::ContigAnalysis->new( -contig=>$contigOBJ );
27 my @lcq = $ca->low_consensus_quality;
28 my @hqd = $ca->high_quality_discrepancies;
29 my @ss = $ca->single_strand_regions;
31 =head1 DESCRIPTION
33 A contig is as a set of sequences, locally aligned to each other, when
34 the sequences in a pair may be aligned. It may also include a
35 consensus sequence. Bio::Assembly::ContigAnalysis is a module
36 holding a collection of methods to analyze contig objects. It was
37 developed around the Bio::Assembly::Contig implementation of contigs and
38 can not work with another contig interface.
40 =head1 FEEDBACK
42 =head2 Mailing Lists
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to the
46 Bioperl mailing lists Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
51 =head2 Support
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 reponsive experts will be able look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
62 =head2 Reporting Bugs
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 the bugs and their resolution. Bug reports can be submitted via the
66 web:
68 https://github.com/bioperl/bioperl-live/issues
70 =head1 AUTHOR - Robson Francisco de Souza
72 Email: rfsouza@citri.iq.usp.br
74 =head1 APPENDIX
76 The rest of the documentation details each of the object
77 methods. Internal methods are usually preceded with a _
79 =cut
81 package Bio::Assembly::ContigAnalysis;
83 use strict;
85 use base qw(Bio::Root::Root);
87 =head1 Object creator
89 =head2 new
91 Title : new
92 Usage : my $contig = Bio::Assembly::ContigAnalysis->new(-contig=>$contigOBJ);
93 Function : Creates a new contig analysis object
94 Returns : Bio::Assembly::ContigAnalysis
95 Args :
96 -contig : a Bio::Assembly::Contig object
98 =cut
100 sub new {
101 my($class,@args) = @_;
102 my $self = $class->SUPER::new(@args);
104 my ($contigOBJ) = $self->_rearrange([qw(CONTIG)],@args);
105 unless ($contigOBJ->isa("Bio::Assembly::Contig")) {
106 $self->throw("ContigAnal works only on Bio::Assembly::Contig objects\n");
109 $self->{'_objref'} = $contigOBJ;
110 return $self;
113 =head1 Analysis methods
115 =head2 high_quality_discrepancies
117 Title : high_quality_discrepancies
118 Usage : my $sfc = $ContigAnal->high_quality_discrepancies();
119 Function :
121 Locates all high quality discrepancies among aligned
122 sequences and the consensus sequence.
124 Note: see Bio::Assembly::Contig POD documentation,
125 section "Coordinate System", for a definition of
126 available types. Default coordinate system type is
127 "gapped consensus", i.e. consensus sequence (with gaps)
128 coordinates. If limits are not specified, the entire
129 alignment is analyzed.
131 Returns : Bio::SeqFeature::Collection
132 Args : optional arguments are
133 -threshold : cutoff value for low quality (minimum high quality)
134 Default: 40
135 -ignore : number of bases that will not be analysed at
136 both ends of contig aligned elements
137 Default: 5
138 -start : start of interval that will be analyzed
139 -end : start of interval that will be analyzed
140 -type : coordinate system type for interval
142 =cut
144 sub high_quality_discrepancies {
145 my ($self,@args) = shift; # Package reference
147 my ($threshold,$ignore,$start,$end,$type) =
148 $self->_rearrange([qw(THRESHOLD IGNORE START END TYPE)],@args);
150 # Defining default threhold and HQD_ignore
151 $threshold = 40 unless (defined($threshold));
152 $ignore = 5 unless (defined($ignore));
153 $type = 'gapped consensus' unless (defined($type));
155 # Changing input coordinates system (if needed)
156 if (defined $start && $type ne 'gapped consensus') {
157 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
158 } elsif (!defined $start) {
159 $start = 1;
161 if (defined $end && $type ne 'gapped consensus') {
162 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end);
163 } elsif (!defined $end) {
164 $end = $self->{'_objref'}->get_consensus_length();
167 # Scanning each read sequence and the contig sequence and
168 # adding discrepancies to Bio::SeqFeature::Collection
169 my @seqIDs = $self->{'_objref'}->get_seq_ids(-start=>$start,
170 -end=>$end,
171 -type=>$type);
172 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
174 my @HQD = ();
175 foreach my $seqID (@seqIDs) {
176 # Setting aligned read sub-sequence limits and loading data
177 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
178 my $qual = $self->{'_objref'}->get_qual_by_name($seqID);
179 unless (defined $qual) {
180 $self->warn("Can't correctly evaluate HQD without aligned sequence qualities for $seqID");
181 next;
183 my $sequence = $seq->seq;
184 my @quality = @{ $qual->qual };
186 # Scanning the aligned region of each read
187 my $seq_ix = 0;
188 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
189 if (!$coord) {
190 $self->warn("Read $seqID has no alignment coordinates; considered low quality.\nSkipping...");
191 next;
193 my ($astart,$aend) = ($coord->start,$coord->end);
194 $astart = $astart + $ignore; # Redefining limits to evaluate HQDs (jump $ignore at start)
195 $aend = $aend - $ignore; # Redefining limits to evaluate HQDs (stop $ignore before end)
197 my ($d_start,$d_end,$i);
198 for ($i=$astart-1; $i<=$aend-1; $i++) {
199 # Changing coordinate $i+1 from 'gapped consensus' mode to "aligned $seqID" (coordinate $seq_ix)
200 $seq_ix = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$i+1);
201 next unless (($i >= $start) && ($i <= $end));
203 my $r_base = uc(substr($sequence,$seq_ix-1,1));
204 my $c_base = uc(substr($consensus,$i,1));
206 # Discrepant region start: store $d_start and $type
207 (!defined($d_start) &&
208 ($r_base ne $c_base) &&
209 ($quality[$seq_ix-1] >= $threshold)) && do {
210 $d_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i+1);
211 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
212 next;
215 # Quality change or end of discrepant region: store limits and undef $d_start
216 if (defined($d_start) &&
217 (($quality[$seq_ix-1] < $threshold) ||
218 (uc($r_base) eq uc($c_base)))) {
219 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
220 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
221 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID",
222 -start=>$d_start,
223 -end=>$d_end,
224 -strand=>$seq->strand()) );
225 $d_start = undef;
226 next;
228 } # for ($i=$astart-1; $i<=$aend-1; $i++)
230 # Loading discrepancies located at sub-sequence end, if any.
231 if (defined($d_start)) {
232 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
233 push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID",
234 -start=>$d_start,
235 -end=>$d_end,
236 -strand=>$seq->strand()) );
238 } # foreach my $seqID (@seqIDs)
240 return @HQD;
243 =head2 low_consensus_quality
245 Title : low_consensus_quality
246 Usage : my $sfc = $ContigAnal->low_consensus_quality();
247 Function : Locates all low quality regions in the consensus
248 Returns : an array of Bio::SeqFeature::Generic objects
249 Args : optional arguments are
250 -threshold : cutoff value for low quality (minimum high quality)
251 Default: 25
252 -start : start of interval that will be analyzed
253 -end : start of interval that will be analyzed
254 -type : coordinate system type for interval
256 =cut
258 sub low_consensus_quality {
259 my ($self,@args) = shift; # Packege reference
261 my ($threshold,$start,$end,$type) =
262 $self->_rearrange([qw(THRESHOLD START END TYPE)],@args);
264 # Setting default value for threshold
265 $threshold = 25 unless (defined($threshold));
267 # Loading qualities
268 my @quality = @{ $self->{'_objref'}->get_consensus_quality()->qual };
270 # Changing coordinates to gap mode noaln (consed: consensus without alignments)
271 $start = 1 unless (defined($start));
272 if (defined $start && defined $type && ($type ne 'gapped consensus')) {
273 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
274 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
276 $end = $self->{'_objref'}->get_consensus_length unless (defined $end);
278 # Scanning @quality vector and storing intervals limits with base qualities less then
279 # the threshold value
280 my ($lcq_start);
281 my ($i,@LCQ);
282 for ($i=$start-1; $i<=$end-1; $i++) {
283 # print $quality[$i],"\t",$i,"\n";
284 if (!defined($lcq_start) && (($quality[$i] <= $threshold) || ($quality[$i] == 98))) {
285 $lcq_start = $i+1;
286 } elsif (defined($lcq_start) && ($quality[$i] > $threshold)) {
287 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
288 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
289 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start,
290 -end=>$lcq_end,
291 -primary=>'low_consensus_quality') );
292 $lcq_start = undef;
296 if (defined $lcq_start) {
297 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
298 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
299 push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start,
300 -end=>$lcq_end,
301 -primary=>'low_consensus_quality') );
304 return @LCQ;
307 =head2 not_confirmed_on_both_strands
309 Title : low_quality_consensus
310 Usage : my $sfc = $ContigAnal->low_quality_consensus();
311 Function :
313 Locates all regions whose consensus bases were not
314 confirmed by bases from sequences aligned in both
315 orientations, i.e., in such regions, no bases in aligned
316 sequences of either +1 or -1 strand agree with the
317 consensus bases.
319 Returns : an array of Bio::SeqFeature::Generic objects
320 Args : optional arguments are
321 -start : start of interval that will be analyzed
322 -end : start of interval that will be analyzed
323 -type : coordinate system type for interval
325 =cut
327 sub not_confirmed_on_both_strands {
328 my ($self,@args) = shift; # Package reference
330 my ($start,$end,$type) =
331 $self->_rearrange([qw(START END TYPE)],@args);
333 # Changing coordinates to default system 'align' (contig sequence with alignments)
334 $start = 1 unless (defined($start));
335 if (defined($type) && ($type ne 'gapped consensus')) {
336 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
337 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
339 $end = $self->{'_objref'}->get_consensus_length unless (defined($end));
341 # Scanning alignment
342 my %confirmed = (); # If ($confirmed{$orientation}[$i] > 0) then $i is confirmed in $orientation strand
343 my ($i);
344 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
345 foreach my $seqID ($self->{'_objref'}->get_seq_ids) {
346 # Setting aligned read sub-sequence limits and loading data
347 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
348 my $sequence = $seq->seq;
350 # Scanning the aligned regions of each read and registering confirmed sites
351 my $contig_ix = 0;
352 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
353 my ($astart,$aend,$orientation) = ($coord->start,$coord->end,$coord->strand);
354 $astart = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$astart);
355 $aend = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$aend);
356 for ($i=$astart-1; $i<=$aend-1; $i++) {
357 # $i+1 in 'align' mode is $contig_ix
358 $contig_ix = $self->{'_objref'}->change_coord("aligned $seqID",'gapped consensus',$i+1);
359 next unless (($contig_ix >= $start) && ($contig_ix <= $end));
360 my $r_base = uc(substr($sequence,$i,1));
361 my $c_base = uc(substr($consensus,$contig_ix-1,1));
362 if ($c_base eq '-') {
363 $confirmed{$orientation}[$contig_ix] = -1;
364 } elsif (uc($r_base) eq uc($c_base)) { # Non discrepant region found
365 $confirmed{$orientation}[$contig_ix]++;
367 } # for ($i=$astart-1; $i<=$aend-1; $i++)
368 } # foreach $seqID (@reads)
370 # Locating non-confirmed aligned regions for each orientation in $confirmed registry
371 my ($orientation);
372 my @NCBS = ();
373 foreach $orientation (keys %confirmed) {
374 my ($ncbs_start,$ncbs_end);
376 for ($i=$start; $i<=$end; $i++) {
377 if (!defined($ncbs_start) &&
378 (!defined($confirmed{$orientation}[$i]) || ($confirmed{$orientation}[$i] == 0))) {
379 $ncbs_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
380 } elsif (defined($ncbs_start) &&
381 defined($confirmed{$orientation}[$i]) &&
382 ($confirmed{$orientation}[$i] > 0)) {
383 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i-1);
384 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start,
385 -end=>$ncbs_end,
386 -strand=>$orientation,
387 -primary=>"not_confirmed_on_both_strands") );
388 $ncbs_start = undef;
392 if (defined($ncbs_start)) { # NCBS at the end of contig
393 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$end);
394 push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start,
395 -end=>$ncbs_end,
396 -strand=>$orientation,
397 -primary=>'not_confirmed_on_both_strands') );
401 return @NCBS;
404 =head2 single_strand
406 Title : single_strand
407 Usage : my $sfc = $ContigAnal->single_strand();
408 Function :
410 Locates all regions covered by aligned sequences only in
411 one of the two strands, i.e., regions for which aligned
412 sequence's strand() method returns +1 or -1 for all
413 sequences.
415 Returns : an array of Bio::SeqFeature::Generic objects
416 Args : optional arguments are
417 -start : start of interval that will be analyzed
418 -end : start of interval that will be analyzed
419 -type : coordinate system type for interval
421 =cut
424 sub single_strand {
425 my ($self,@args) = shift; # Package reference
427 my ($start,$end,$type) =
428 $self->_rearrange([qw(START END TYPE)],@args);
430 # Changing coordinates to gap mode align (consed: consensus sequence with alignments)
431 $type = 'gapped consensus' unless(defined($type));
432 $start = 1 unless (defined($start));
433 if (defined($type) && $type ne 'gapped consensus') {
434 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
435 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
437 ($end) = $self->{'_objref'}->get_consensus_length unless (defined($end));
439 # Loading complete list of coordinates for aligned sequences
440 my $sfc = $self->{'_objref'}->get_features_collection();
441 my @forward = grep { $_->primary_tag =~ /^_aligned_coord:/ }
442 $sfc->features_in_range(-start=>$start,
443 -end=>$end,
444 -contain=>0,
445 -strand=>1,
446 -strandmatch=>'strong');
447 my @reverse = grep { $_->primary_tag =~ /^_aligned_coord:/ }
448 $sfc->features_in_range(-start=>$start,
449 -end=>$end,
450 -contain=>0,
451 -strand=>-1,
452 -strandmatch=>'strong');
453 # Merging overlapping features
454 @forward = $self->_merge_overlapping_features(@forward);
455 @reverse = $self->_merge_overlapping_features(@reverse);
457 # Finding single stranded regions
458 my ($length) = $self->{'_objref'}->get_consensus_length;
459 $length = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$length);
460 @forward = $self->_complementary_features_list(1,$length,@forward);
461 @reverse = $self->_complementary_features_list(1,$length,@reverse);
463 my @SS = ();
464 foreach my $feat (@forward, @reverse) {
465 $feat->primary_tag('single_strand_region');
466 push(@SS,$feat);
469 return @SS;
472 =head1 Internal Methods
474 =head2 _merge_overlapping_features
476 Title : _merge_overlapping_features
477 Usage : my @feat = $ContigAnal->_merge_overlapping_features(@features);
478 Function : Merge all overlapping features into features
479 that hold original features as sub-features
480 Returns : array of Bio::SeqFeature::Generic objects
481 Args : array of Bio::SeqFeature::Generic objects
483 =cut
485 sub _merge_overlapping_features {
486 my ($self,@feat) = @_;
488 $self->throw_not_implemented();
491 =head2 _complementary_features_list
493 Title : _complementary_features_list
494 Usage : @feat = $ContigAnal->_complementary_features_list($start,$end,@features);
495 Function : Build a list of features for regions
496 not covered by features in @features array
497 Returns : array of Bio::SeqFeature::Generic objects
498 Args :
499 $start : [integer] start of first output feature
500 $end : [integer] end of last output feature
501 @features : array of Bio::SeqFeature::Generic objects
503 =cut
505 sub _complementary_features_list {
506 my ($self,$start,$end,@feat) = @_;
508 $self->throw_not_implemented();
513 __END__