3 # BioPerl module for Bio::Assembly::ContigAnalysis
5 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
7 # Copyright Robson Francisco de Souza
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Assembly::ContigAnalysis -
16 Perform analysis on sequence assembly contigs.
21 use Bio::Assembly::ContigAnalysis;
23 # Assembly loading methods
24 my $ca = Bio::Assembly::ContigAnalysis->new( -contig=>$contigOBJ );
26 my @lcq = $ca->low_consensus_quality;
27 my @hqd = $ca->high_quality_discrepancies;
28 my @ss = $ca->single_strand_regions;
32 A contig is as a set of sequences, locally aligned to each other, when
33 the sequences in a pair may be aligned. It may also include a
34 consensus sequence. Bio::Assembly::ContigAnalysis is a module
35 holding a collection of methods to analyze contig objects. It was
36 developed around the Bio::Assembly::Contig implementation of contigs and
37 can not work with another contig interface.
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to the
45 Bioperl mailing lists Your participation is much appreciated.
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 the bugs and their resolution. Bug reports can be submitted via the
56 http://bugzilla.open-bio.org/
58 =head1 AUTHOR - Robson Francisco de Souza
60 Email: rfsouza@citri.iq.usp.br
64 The rest of the documentation details each of the object
65 methods. Internal methods are usually preceded with a _
69 package Bio
::Assembly
::ContigAnalysis
;
73 use base
qw(Bio::Root::Root);
80 Usage : my $contig = Bio::Assembly::ContigAnalysis->new(-contig=>$contigOBJ);
81 Function : Creates a new contig analysis object
82 Returns : Bio::Assembly::ContigAnalysis
84 -contig : a Bio::Assembly::Contig object
89 my($class,@args) = @_;
90 my $self = $class->SUPER::new
(@args);
92 my ($contigOBJ) = $self->_rearrange([qw(CONTIG)],@args);
93 unless ($contigOBJ->isa("Bio::Assembly::Contig")) {
94 $self->throw("ContigAnal works only on Bio::Assembly::Contig objects\n");
97 $self->{'_objref'} = $contigOBJ;
101 =head1 Analysis methods
103 =head2 high_quality_discrepancies
105 Title : high_quality_discrepancies
106 Usage : my $sfc = $ContigAnal->high_quality_discrepancies();
109 Locates all high quality discrepancies among aligned
110 sequences and the consensus sequence.
112 Note: see Bio::Assembly::Contig POD documentation,
113 section "Coordinate System", for a definition of
114 available types. Default coordinate system type is
115 "gapped consensus", i.e. consensus sequence (with gaps)
116 coordinates. If limits are not specified, the entire
117 alignment is analyzed.
119 Returns : Bio::SeqFeature::Collection
120 Args : optional arguments are
121 -threshold : cutoff value for low quality (minimum high quality)
123 -ignore : number of bases that will not be analysed at
124 both ends of contig aligned elements
126 -start : start of interval that will be analyzed
127 -end : start of interval that will be analyzed
128 -type : coordinate system type for interval
132 sub high_quality_discrepancies
{
133 my ($self,@args) = shift; # Package reference
135 my ($threshold,$ignore,$start,$end,$type) =
136 $self->_rearrange([qw(THRESHOLD IGNORE START END TYPE)],@args);
138 # Defining default threhold and HQD_ignore
139 $threshold = 40 unless (defined($threshold));
140 $ignore = 5 unless (defined($ignore));
141 $type = 'gapped consensus' unless (defined($type));
143 # Changing input coordinates system (if needed)
144 if (defined $start && $type ne 'gapped consensus') {
145 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
146 } elsif (!defined $start) {
149 if (defined $end && $type ne 'gapped consensus') {
150 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end);
151 } elsif (!defined $end) {
152 $end = $self->{'_objref'}->get_consensus_length();
155 # Scanning each read sequence and the contig sequence and
156 # adding discrepancies to Bio::SeqFeature::Collection
157 my @seqIDs = $self->{'_objref'}->get_seq_ids(-start
=>$start,
160 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
163 foreach my $seqID (@seqIDs) {
164 # Setting aligned read sub-sequence limits and loading data
165 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
166 my $qual = $self->{'_objref'}->get_qual_by_name($seqID);
167 unless (defined $qual) {
168 $self->warn("Can't correctly evaluate HQD without aligned sequence qualities for $seqID");
171 my $sequence = $seq->seq;
172 my @quality = @
{ $qual->qual };
174 # Scanning the aligned region of each read
176 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
178 $self->warn("Read $seqID has no alignment coordinates; considered low quality.\nSkipping...");
181 my ($astart,$aend) = ($coord->start,$coord->end);
182 $astart = $astart + $ignore; # Redefining limits to evaluate HQDs (jump $ignore at start)
183 $aend = $aend - $ignore; # Redefining limits to evaluate HQDs (stop $ignore before end)
185 my ($d_start,$d_end,$i);
186 for ($i=$astart-1; $i<=$aend-1; $i++) {
187 # Changing coordinate $i+1 from 'gapped consensus' mode to "aligned $seqID" (coordinate $seq_ix)
188 $seq_ix = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$i+1);
189 next unless (($i >= $start) && ($i <= $end));
191 my $r_base = uc(substr($sequence,$seq_ix-1,1));
192 my $c_base = uc(substr($consensus,$i,1));
194 # Discrepant region start: store $d_start and $type
195 (!defined($d_start) &&
196 ($r_base ne $c_base) &&
197 ($quality[$seq_ix-1] >= $threshold)) && do {
198 $d_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i+1);
199 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
203 # Quality change or end of discrepant region: store limits and undef $d_start
204 if (defined($d_start) &&
205 (($quality[$seq_ix-1] < $threshold) ||
206 (uc($r_base) eq uc($c_base)))) {
207 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
208 #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n";
209 push(@HQD, Bio
::SeqFeature
::Generic
->new(-primary
=>"high_quality_discrepancy:$seqID",
212 -strand
=>$seq->strand()) );
216 } # for ($i=$astart-1; $i<=$aend-1; $i++)
218 # Loading discrepancies located at sub-sequence end, if any.
219 if (defined($d_start)) {
220 $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
221 push(@HQD, Bio
::SeqFeature
::Generic
->new(-primary
=>"high_quality_discrepancy:$seqID",
224 -strand
=>$seq->strand()) );
226 } # foreach my $seqID (@seqIDs)
231 =head2 low_consensus_quality
233 Title : low_consensus_quality
234 Usage : my $sfc = $ContigAnal->low_consensus_quality();
235 Function : Locates all low quality regions in the consensus
236 Returns : an array of Bio::SeqFeature::Generic objects
237 Args : optional arguments are
238 -threshold : cutoff value for low quality (minimum high quality)
240 -start : start of interval that will be analyzed
241 -end : start of interval that will be analyzed
242 -type : coordinate system type for interval
246 sub low_consensus_quality
{
247 my ($self,@args) = shift; # Packege reference
249 my ($threshold,$start,$end,$type) =
250 $self->_rearrange([qw(THRESHOLD START END TYPE)],@args);
252 # Setting default value for threshold
253 $threshold = 25 unless (defined($threshold));
256 my @quality = @
{ $self->{'_objref'}->get_consensus_quality()->qual };
258 # Changing coordinates to gap mode noaln (consed: consensus without alignments)
259 $start = 1 unless (defined($start));
260 if (defined $start && defined $type && ($type ne 'gapped consensus')) {
261 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
262 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
264 $end = $self->{'_objref'}->get_consensus_length unless (defined $end);
266 # Scanning @quality vector and storing intervals limits with base qualities less then
267 # the threshold value
270 for ($i=$start-1; $i<=$end-1; $i++) {
271 # print $quality[$i],"\t",$i,"\n";
272 if (!defined($lcq_start) && (($quality[$i] <= $threshold) || ($quality[$i] == 98))) {
274 } elsif (defined($lcq_start) && ($quality[$i] > $threshold)) {
275 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
276 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
277 push(@LCQ, Bio
::SeqFeature
::Generic
->new(-start
=>$lcq_start,
279 -primary
=>'low_consensus_quality') );
284 if (defined $lcq_start) {
285 $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start);
286 my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
287 push(@LCQ, Bio
::SeqFeature
::Generic
->new(-start
=>$lcq_start,
289 -primary
=>'low_consensus_quality') );
295 =head2 not_confirmed_on_both_strands
297 Title : low_quality_consensus
298 Usage : my $sfc = $ContigAnal->low_quality_consensus();
301 Locates all regions whose consensus bases were not
302 confirmed by bases from sequences aligned in both
303 orientations, i.e., in such regions, no bases in aligned
304 sequences of either +1 or -1 strand agree with the
307 Returns : an array of Bio::SeqFeature::Generic objects
308 Args : optional arguments are
309 -start : start of interval that will be analyzed
310 -end : start of interval that will be analyzed
311 -type : coordinate system type for interval
315 sub not_confirmed_on_both_strands
{
316 my ($self,@args) = shift; # Package reference
318 my ($start,$end,$type) =
319 $self->_rearrange([qw(START END TYPE)],@args);
321 # Changing coordinates to default system 'align' (contig sequence with alignments)
322 $start = 1 unless (defined($start));
323 if (defined($type) && ($type ne 'gapped consensus')) {
324 $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start);
325 $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
327 $end = $self->{'_objref'}->get_consensus_length unless (defined($end));
330 my %confirmed = (); # If ($confirmed{$orientation}[$i] > 0) then $i is confirmed in $orientation strand
332 my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq;
333 foreach my $seqID ($self->{'_objref'}->get_seq_ids) {
334 # Setting aligned read sub-sequence limits and loading data
335 my $seq = $self->{'_objref'}->get_seq_by_name($seqID);
336 my $sequence = $seq->seq;
338 # Scanning the aligned regions of each read and registering confirmed sites
340 my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID");
341 my ($astart,$aend,$orientation) = ($coord->start,$coord->end,$coord->strand);
342 $astart = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$astart);
343 $aend = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$aend);
344 for ($i=$astart-1; $i<=$aend-1; $i++) {
345 # $i+1 in 'align' mode is $contig_ix
346 $contig_ix = $self->{'_objref'}->change_coord("aligned $seqID",'gapped consensus',$i+1);
347 next unless (($contig_ix >= $start) && ($contig_ix <= $end));
348 my $r_base = uc(substr($sequence,$i,1));
349 my $c_base = uc(substr($consensus,$contig_ix-1,1));
350 if ($c_base eq '-') {
351 $confirmed{$orientation}[$contig_ix] = -1;
352 } elsif (uc($r_base) eq uc($c_base)) { # Non discrepant region found
353 $confirmed{$orientation}[$contig_ix]++;
355 } # for ($i=$astart-1; $i<=$aend-1; $i++)
356 } # foreach $seqID (@reads)
358 # Locating non-confirmed aligned regions for each orientation in $confirmed registry
361 foreach $orientation (keys %confirmed) {
362 my ($ncbs_start,$ncbs_end);
364 for ($i=$start; $i<=$end; $i++) {
365 if (!defined($ncbs_start) &&
366 (!defined($confirmed{$orientation}[$i]) || ($confirmed{$orientation}[$i] == 0))) {
367 $ncbs_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i);
368 } elsif (defined($ncbs_start) &&
369 defined($confirmed{$orientation}[$i]) &&
370 ($confirmed{$orientation}[$i] > 0)) {
371 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i-1);
372 push(@NCBS, Bio
::SeqFeature
::Generic
->new(-start
=>$ncbs_start,
374 -strand
=>$orientation,
375 -primary
=>"not_confirmed_on_both_strands") );
380 if (defined($ncbs_start)) { # NCBS at the end of contig
381 $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$end);
382 push(@NCBS, Bio
::SeqFeature
::Generic
->new(-start
=>$ncbs_start,
384 -strand
=>$orientation,
385 -primary
=>'not_confirmed_on_both_strands') );
394 Title : single_strand
395 Usage : my $sfc = $ContigAnal->single_strand();
398 Locates all regions covered by aligned sequences only in
399 one of the two strands, i.e., regions for which aligned
400 sequence's strand() method returns +1 or -1 for all
403 Returns : an array of Bio::SeqFeature::Generic objects
404 Args : optional arguments are
405 -start : start of interval that will be analyzed
406 -end : start of interval that will be analyzed
407 -type : coordinate system type for interval
413 my ($self,@args) = shift; # Package reference
415 my ($start,$end,$type) =
416 $self->_rearrange([qw(START END TYPE)],@args);
418 # Changing coordinates to gap mode align (consed: consensus sequence with alignments)
419 $type = 'gapped consensus' unless(defined($type));
420 $start = 1 unless (defined($start));
421 if (defined($type) && $type ne 'gapped consensus') {
422 $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start);
423 $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end));
425 ($end) = $self->{'_objref'}->get_consensus_length unless (defined($end));
427 # Loading complete list of coordinates for aligned sequences
428 my $sfc = $self->{'_objref'}->get_features_collection();
429 my @forward = grep { $_->primary_tag =~ /^_aligned_coord:/ }
430 $sfc->features_in_range(-start
=>$start,
434 -strandmatch
=>'strong');
435 my @reverse = grep { $_->primary_tag =~ /^_aligned_coord:/ }
436 $sfc->features_in_range(-start
=>$start,
440 -strandmatch
=>'strong');
441 # Merging overlapping features
442 @forward = $self->_merge_overlapping_features(@forward);
443 @reverse = $self->_merge_overlapping_features(@reverse);
445 # Finding single stranded regions
446 my ($length) = $self->{'_objref'}->get_consensus_length;
447 $length = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$length);
448 @forward = $self->_complementary_features_list(1,$length,@forward);
449 @reverse = $self->_complementary_features_list(1,$length,@reverse);
452 foreach my $feat (@forward, @reverse) {
453 $feat->primary_tag('single_strand_region');
460 =head1 Internal Methods
462 =head2 _merge_overlapping_features
464 Title : _merge_overlapping_features
465 Usage : my @feat = $ContigAnal->_merge_overlapping_features(@features);
466 Function : Merge all overlapping features into features
467 that hold original features as sub-features
468 Returns : array of Bio::SeqFeature::Generic objects
469 Args : array of Bio::SeqFeature::Generic objects
473 sub _merge_overlapping_features
{
474 my ($self,@feat) = @_;
476 $self->throw_not_implemented();
479 =head2 _complementary_features_list
481 Title : _complementary_features_list
482 Usage : @feat = $ContigAnal->_complementary_features_list($start,$end,@features);
483 Function : Build a list of features for regions
484 not covered by features in @features array
485 Returns : array of Bio::SeqFeature::Generic objects
487 $start : [integer] start of first output feature
488 $end : [integer] end of last output feature
489 @features : array of Bio::SeqFeature::Generic objects
493 sub _complementary_features_list
{
494 my ($self,$start,$end,@feat) = @_;
496 $self->throw_not_implemented();