2 # BioPerl module for Bio::Assembly::Tools::ContigSpectrum
4 # Copyright by Florent Angly
6 # You may distribute this module under the same terms as Perl itself
8 # POD documentation - main docs before the code
12 Bio::Assembly::Tools::ContigSpectrum - create and manipulate contig spectra
16 # Simple contig spectrum creation
17 my $csp1 = Bio::Assembly::Tools::ContigSpectrum->new(
19 -spectrum => { 1 => 10,
23 # ...or another way to create a simple contig spectrum
24 my $csp2 = Bio::Assembly::Tools::ContigSpectrum->new;
26 $csp2->spectrum({ 1 => 20, 2 => 1, 4 => 1 });
28 # Get some information
29 print "This is contig spectrum ".$csp->id."\n";
30 print "It contains ".$csp->nof_seq." sequences\n";
31 print "The largest contig has ".$csp->max_size." sequences\n";
32 print "The spectrum is: ".$csp->to_string($csp->spectrum)."\n";
34 # Let's add the contig spectra
35 my $summed_csp = Bio::Assembly::Tools::ContigSpectrum->new;
36 $summed_csp->add($csp1);
37 $summed_csp->add($csp2);
38 print "The summed contig spectrum is ".$summed_csp->to_string."\n";
41 my $avg_csp = Bio::Assembly::Tools::ContigSpectrum->new;
42 $avg_csp = $avg_csp->average([$csp1, $csp2]);
43 print "The average contig spectrum is ".$avg_csp->to_string."\n";
45 # Get a contig spectrum from an assembly
46 my $from_assembly = Bio::Assembly::Tools::ContigSpectrum->new(
47 -assembly => $assembly_object,
48 -eff_asm_params => 1);
49 print "The contig spectrum from assembly is ".$from_assembly->to_string."\n";
51 # Report advanced information (possible because eff_asm_params = 1)
52 print "Average sequence length: ".$from_assembly->avg_seq_len." bp\n";
53 print "Minimum overlap length: ".$from_assembly->min_overlap." bp\n";
54 print "Average overlap length: ".$from_assembly->avg_overlap." bp\n";
55 print "Minimum overlap match: ".$from_assembly->min_identity." %\n";
56 print "Average overlap match: ".$from_assembly->avg_identity." %\n";
58 # Assuming the assembly object contains sequences from several different
59 # metagenomes, we have a mixed contig spectrum from which a cross contig
60 # spectrum and dissolved contig spectra can be obtained
61 my $mixed_csp = $from_assembly;
63 # Calculate a dissolved contig spectrum
64 my $meta1_dissolved = Bio::Assembly::Tools::ContigSpectrum->new(
65 -dissolve => [$mixed_csp, 'metagenome1'] );
66 my $meta2_dissolved = Bio::Assembly::Tools::ContigSpectrum->new(
67 -dissolve => [$mixed_csp, 'metagenome2'] );
68 print "The dissolved contig spectra are:\n".
69 $meta1_dissolved->to_string."\n".
70 $meta2_dissolved->to_string."\n";
72 # Determine a cross contig spectrum
73 my $cross_csp = Bio::Assembly::Tools::ContigSpectrum->new(
74 -cross => $mixed_csp );
75 print "The cross contig spectrum is ".$cross_csp->to_string."\n";
77 # Score a contig spectrum (the more abundant the contigs and the larger their
78 # size, the larger the score)
79 my $csp_score = $csp->score( $csp->nof_seq );
83 The Bio::Assembly::Tools::ContigSpectrum Perl module enables to
84 manually create contig spectra, import them from assemblies,
85 manipulate them, transform between different types of contig spectra
88 Bio::Assembly::Tools::ContigSpectrum is a module to create, manipulate
89 and output contig spectra, assembly-derived data used in metagenomics
90 (community genomics) for diversity estimation.
94 A contig spectrum is the count of the number of contigs of different
95 size in an assembly. For example, the contig spectrum [100 5 1 0 0
96 ...] means that there were 100 singlets (1-contigs), 5 contigs of 2
97 sequences (2-contigs), 1 contig of 3 sequences (3-contig) and no
100 An assembly can be produced from a mixture of sequences from different
101 metagenomes. The contig obtained from this assembly is a mixed contig
102 spectrum. The contribution of each metagenome in this mixed contig
103 spectrum can be obtained by determining a dissolved contig spectrum.
105 Finally, based on a mixed contig spectrum, a cross contig spectrum can
106 be determined. In a cross contig spectrum, only contigs containing
107 sequences from different metagenomes are kept; "pure" contigs are
108 excluded. Additionally, the total number of singletons (1-contigs)
109 from each region that assembles with any fragments from other regions
110 is the number of 1-contigs in the cross contig spectrum.
112 =head2 Implementation
114 The simplest representation of a contig spectrum is as a hash
115 representation where the key is the contig size (number of sequences
116 making up the contig) and the value the number of contigs of this
119 In fact, it is useful to have more information associated with the
120 contig spectrum, hence the Bio::Assembly::Tools::ContigSpectrum module
121 implements an object containing a contig spectrum hash and additional
122 information. The get/set methods to access them are:
124 id contig spectrum ID
125 nof_rep number of repetitions (assemblies) used
126 max_size size of (number of sequences in) the largest contig
127 spectrum hash representation of a contig spectrum
129 nof_seq number of sequences
130 avg_seq_len average sequence length
132 eff_asm_params reports effective assembly parameters
134 nof_overlaps number of overlaps (needs eff_asm_params)
135 min_overlap minimum overlap length in a contig (needs eff_asm_params)
136 min_identity minimum sequence identity percentage (needs eff_asm_params)
137 avg_overlap average overlap length (needs eff_asm_params)
138 avg_identity average overlap identity percentage (needs eff_asm_params)
140 Operations on the contig spectra:
142 to_string create a string representation of the spectrum
143 spectrum import a hash contig spectrum
144 assembly determine a contig spectrum from an assembly, contig or singlet
145 dissolve calculate a dissolved contig spectrum (depends on assembly)
146 cross produce a cross contig spectrum (depends on assembly)
147 add add a contig spectrum to an existing one
148 average make an average of several contig spectra
149 score score a contig spectrum: the higher the number of contigs
150 and the larger their size, the higher the score.
152 When using operations that rely on knowing "where" (from what
153 metagenomes) a sequence came from (i.e. when creating a dissolved or
154 cross contig spectrum), make sure that the sequences used for the
155 assembly have a name header, e.g. E<gt>metagenome1|seq1,
156 E<gt>metagenome2|seq1, ...
158 Note: The following operations require the C<Graph::Undirected> module:
159 eff_asm_params, cross, dissolve
165 User feedback is an integral part of the evolution of this and other
166 Bioperl modules. Send your comments and suggestions preferably to the
167 Bioperl mailing lists Your participation is much appreciated.
169 bioperl-l@bioperl.org - General discussion
170 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
175 Please direct usage questions or support issues to the mailing list:
177 I<bioperl-l@bioperl.org>
179 rather than to the module maintainer directly. Many experienced and
180 reponsive experts will be able look at the problem and quickly
181 address it. Please include a thorough description of the problem
182 with code and data examples if at all possible.
184 =head2 Reporting Bugs
186 Report bugs to the BioPerl bug tracking system to help us keep track
187 the bugs and their resolution. Bug reports can be submitted via email
190 bioperl-bugs@bio.perl.org
191 https://github.com/bioperl/bioperl-live/issues
193 =head1 AUTHOR - Florent E Angly
195 Email florent_dot_angly_at_gmail_dot_com
199 The rest of the documentation details each of the object
200 methods. Internal methods are usually preceded with a "_".
204 package Bio
::Assembly
::Tools
::ContigSpectrum
;
209 use Bio
::Assembly
::Scaffold
;
210 use Bio
::SimpleAlign
;
211 use Bio
::LocatableSeq
;
213 use base
'Bio::Root::Root';
219 Usage : my $csp = Bio::Assembly::Tools::ContigSpectrum->new();
221 my $csp = Bio::Assembly::Tools::ContigSpectrum->new(
223 -spectrum => { 1 => 90 , 2 => 3 , 4 => 1 },
226 my $csp = Bio::Assembly::Tools::ContigSpectrum->new(
227 -assembly => $assembly_obj
229 Function: create a new contig spectrum object
230 Returns : reference to a contig spectrum object
236 my ($class, @args) = @_;
237 my $self = $class->SUPER::new
(@args);
238 my ( $id, $nof_seq, $nof_rep, $max_size, $nof_overlaps, $min_overlap,
239 $min_identity, $avg_overlap, $avg_identity, $avg_seq_len, $spectrum,
240 $assembly, $eff_asm_params, $dissolve, $cross) = $self->_rearrange(
241 [qw(ID NOF_SEQ NOF_REP MAX_SIZE NOF_OVERLAPS MIN_OVERLAP MIN_IDENTITY
242 AVG_OVERLAP AVG_IDENTITY AVG_SEQ_LEN SPECTRUM ASSEMBLY EFF_ASM_PARAMS
243 DISSOLVE CROSS)], @args );
245 # First set up some defauts
246 $self->{'_id'} = 'NoName';
247 $self->{'_nof_seq'} = 0;
248 $self->{'_nof_rep'} = 0;
249 $self->{'_max_size'} = 0;
250 $self->{'_nof_overlaps'} = 0;
251 $self->{'_min_overlap'} = undef;
252 $self->{'_min_identity'} = undef;
253 $self->{'_avg_overlap'} = 0;
254 $self->{'_avg_identity'} = 0;
255 $self->{'_avg_seq_len'} = 0;
256 $self->{'_eff_asm_params'} = 0;
257 $self->{'_spectrum'} = {1 => 0}; # contig spectrum hash representation
258 $self->{'_assembly'} = []; # list of assembly, contigs and singlet objects
260 # Then, according to user desires, override defaults
261 $self->{'_id'} = $id if (defined $id);
262 $self->{'_nof_seq'} = $nof_seq if (defined $nof_seq);
263 $self->{'_nof_rep'} = $nof_rep if (defined $nof_rep);
264 $self->{'_max_size'} = $max_size if (defined $max_size);
265 $self->{'_nof_overlaps'} = $nof_overlaps if (defined $nof_overlaps);
266 $self->{'_min_overlap'} = $min_overlap if (defined $min_overlap);
267 $self->{'_avg_overlap'} = $avg_overlap if (defined $avg_overlap);
268 $self->{'_min_identity'} = $min_identity if (defined $min_identity);
269 $self->{'_avg_identity'} = $avg_identity if (defined $avg_identity);
270 $self->{'_avg_seq_len'} = $avg_seq_len if (defined $avg_seq_len);
271 $self->{'_eff_asm_params'} = $eff_asm_params if (defined $eff_asm_params);
273 # Finally get stuff that can be obtained in an automated way
274 $self->_import_spectrum($spectrum) if defined($spectrum);
275 $self->_import_assembly($assembly) if defined($assembly);
276 $self->_import_cross_csp($cross) if defined($cross);
277 if (defined($dissolve)) {
278 my ($mixed_csp, $header) = (@
$dissolve[0], @
$dissolve[1]);
279 $self->_import_dissolved_csp($mixed_csp, $header);
290 Function: get/set contig spectrum id
292 Args : string [optional]
297 my ($self, $id) = @_;
299 $self->{'_id'} = $id;
301 $id = $self->{'_id'};
309 Usage : $csp->nof_seq
310 Function: get/set the number of sequences making up the contig spectrum
312 Args : integer [optional]
317 my ($self, $nof_seq) = @_;
318 if (defined $nof_seq) {
319 $self->throw("The number of sequences must be strictly positive. Got ".
320 "'$nof_seq'") if $nof_seq < 1;
321 $self->{'_nof_seq'} = $nof_seq;
323 $nof_seq = $self->{'_nof_seq'};
331 Usage : $csp->nof_rep
332 Function: Get/Set the number of repetitions (assemblies) used to create the
335 Args : integer [optional]
340 my ($self, $nof_rep) = @_;
341 if (defined $nof_rep) {
342 $self->throw("The number of repetitions must be strictly positive. Got ".
343 "'$nof_rep'") if $nof_rep < 1;
344 $self->{'_nof_rep'} = $nof_rep;
346 $nof_rep = $self->{'_nof_rep'};
354 Usage : $csp->max_size
355 Function: get/set the size of (number of sequences in) the largest contig
357 Args : integer [optional]
362 my ($self, $max_size) = @_;
363 if (defined $max_size) {
364 $self->throw("The contig maximum size must be strictly positive. Got ".
365 "'$max_size'") if $max_size < 1;
366 $self->{'_max_size'} = $max_size;
368 $max_size = $self->{'_max_size'};
376 Usage : $csp->nof_overlaps
377 Function: Get/Set the number of overlaps in the assembly.
379 Args : integer [optional]
384 my ($self, $nof_overlaps) = @_;
385 if (defined $nof_overlaps) {
386 $self->throw("The number of overlaps must be strictly positive. Got ".
387 "'$nof_overlaps'") if $nof_overlaps < 1;
388 $self->{'_nof_overlaps'} = $nof_overlaps;
390 $nof_overlaps = $self->{'_nof_overlaps'};
391 return $nof_overlaps;
398 Usage : $csp->min_overlap
399 Function: get/set the assembly minimum overlap length
401 Args : integer [optional]
406 my ($self, $min_overlap) = @_;
407 if (defined $min_overlap) {
408 $self->throw("The minimum of overlap length must be strictly positive. Got".
409 " '$min_overlap'") if $min_overlap < 1;
410 $self->{'_min_overlap'} = $min_overlap;
412 $min_overlap = $self->{'_min_overlap'};
420 Usage : $csp->avg_overlap
421 Function: get/set the assembly average overlap length
423 Args : decimal [optional]
428 my ($self, $avg_overlap) = @_;
429 if (defined $avg_overlap) {
430 $self->throw("The average overlap length must be strictly positive. Got ".
431 "'$avg_overlap'") if $avg_overlap < 1;
432 $self->{'_avg_overlap'} = $avg_overlap;
434 $avg_overlap = $self->{'_avg_overlap'};
442 Usage : $csp->min_identity
443 Function: get/set the assembly minimum overlap identity percent
444 Returns : 0 < decimal < 100
445 Args : 0 < decimal < 100 [optional]
450 my ($self, $min_identity) = @_;
451 if (defined $min_identity) {
452 $self->throw("The minimum overlap percent identity must be strictly ".
453 "positive. Got '$min_identity'") if $min_identity < 1;
454 $self->{'_min_identity'} = $min_identity;
456 $min_identity = $self->{'_min_identity'};
457 return $min_identity;
464 Usage : $csp->avg_identity
465 Function: get/set the assembly average overlap identity percent
466 Returns : 0 < decimal < 100
467 Args : 0 < decimal < 100 [optional]
472 my ($self, $avg_identity) = @_;
473 if (defined $avg_identity) {
474 $self->throw("The average overlap percent identity must be strictly ".
475 "positive. Got '$avg_identity'") if $avg_identity < 1;
476 $self->{'_avg_identity'} = $avg_identity;
478 $avg_identity = $self->{'_avg_identity'};
479 return $avg_identity;
486 Usage : $csp->avg_seq_len
487 Function: get/set the assembly average sequence length
488 Returns : avg_seq_len
489 Args : real [optional]
494 my ($self, $avg_seq_len) = @_;
495 if (defined $avg_seq_len) {
496 $self->throw("The average sequence length must be strictly positive. Got ".
497 "'$avg_seq_len'") if $avg_seq_len < 1;
498 $self->{'_avg_seq_len'} = $avg_seq_len;
500 $avg_seq_len = $self->{'_avg_seq_len'};
505 =head2 eff_asm_params
507 Title : eff_asm_params
508 Usage : $csp->eff_asm_params(1)
509 Function: Get/set the effective assembly parameters option. It defines if the
510 effective assembly parameters should be determined when a contig
511 spectrum based or derived from an assembly is calculated. The
512 effective assembly parameters include avg_seq_length, nof_overlaps,
513 min_overlap, avg_overlap, min_identity and avg_identity.
514 1 = get them, 0 = don't.
516 Args : integer [optional]
521 my ($self, $eff_asm_params) = @_;
522 if (defined $eff_asm_params) {
523 $self->throw("eff_asm_params can only take values 0 or 1. Input value was ".
524 "'$eff_asm_params'") unless $eff_asm_params == 0 || $eff_asm_params == 1;
525 $self->{'_eff_asm_params'} = $eff_asm_params;
527 $eff_asm_params = $self->{'_eff_asm_params'};
528 return $eff_asm_params;
535 Usage : my $spectrum = $csp->spectrum({1=>10, 2=>2, 3=>1});
536 Function: Get the current contig spectrum represented as a hash / Update a
537 contig spectrum object based on a contig spectrum represented as a
539 The hash representation of a contig spectrum is as following:
540 key -> contig size (in number of sequences)
541 value -> number of contigs of this size
542 Returns : contig spectrum as a hash reference
543 Args : contig spectrum as a hash reference [optional]
548 my ($self, $spectrum) = @_;
549 if (defined $spectrum) {
550 $self->_import_spectrum($spectrum);
552 $spectrum = $self->{'_spectrum'};
560 Usage : my @obj_list = $csp->assembly();
561 Function: get/set the contig spectrum object by adding an assembly, contig or
562 singlet object to it, or get the list of objects associated with it
563 Returns : arrayref of assembly, contig and singlet objects used in the contig
564 spectrum object (Bio::Assembly::Scaffold, Bio::Assembly::Contig and
565 Bio::Assembly::Singlet objects)
566 Args : Bio::Assembly::Scaffold, Contig or Singlet object
571 my ($self, $assembly) = @_;
572 if (defined $assembly) {
573 $self->_import_assembly($assembly);
575 my @obj_list = @
{$self->{'_assembly'}} if defined $self->{'_assembly'};
582 Title : drop_assembly
583 Usage : $csp->drop_assembly();
584 Function: Remove all assembly objects associated with a contig spectrum.
585 Assembly objects can take a lot of memory, which can be freed by
586 calling this method. Don't call this method if you need the assembly
587 object later on, for example for creating a dissolved or cross
589 Returns : 1 for success
596 $self->{'_assembly'} = [];
604 Usage : $dissolved_csp->dissolve($mixed_csp, $seq_header);
605 Function: Dissolve a mixed contig spectrum for the set of sequences that
606 contain the specified header, i.e. determine the contribution of
607 these sequences to the mixed contig spectrum. The mixed contig
608 spectrum object must have one or several assembly object(s). In
609 addition, min_overlap, min_identity and eff_asm_params are taken
610 from the mixed contig spectrum, unless they are specified manually
611 for the dissolved contig spectrum. The dissolved contigs underlying
612 the contig spectrum can be obtained by calling the assembly() method.
613 Returns : 1 for success
614 Args : Bio::Assembly::Tools::ContigSpectrum reference
615 sequence header string
620 my ($self, $mixed_csp, $seq_header) = @_;
621 $self->_import_dissolved_csp($mixed_csp, $seq_header);
629 Usage : $cross_csp->cross($mixed_csp);
630 Function: Calculate a cross contig_spectrum based on a mixed contig_spectrum.
631 The underlying cross-contigs themselves can be obtained by calling
632 the assembly() method.
633 Returns : 1 for success
634 Args : Bio::Assembly::Tools::ContigSpectrum reference
639 my ($self, $mixed_csp) = @_;
640 $self->_import_cross_csp($mixed_csp);
647 Usage : my $csp_string = $csp->to_string;
648 Function: Convert the contig spectrum into a string (easy to print!!).
650 Args : element separator (integer) [optional]
653 3 -> newline-separated
658 my ($self, $element_separator) = @_;
659 return 0 if $self->{'_max_size'} == 0;
660 $element_separator ||= 1;
661 if ($element_separator == 1) {
662 $element_separator = ' ';
663 } elsif ($element_separator == 2) {
664 $element_separator = "\t";
665 } elsif ($element_separator == 3) {
666 $element_separator = "\n";
668 $self->throw("Unknown separator type '$element_separator'\n");
671 for (my $q = 1 ; $q <= $self->{'_max_size'} ; $q++) {
673 if (exists $self->{'_spectrum'}{$q}) {
674 $val = $self->{'_spectrum'}{$q};
676 $str .= $val.$element_separator;
686 Usage : $csp->add($additional_csp);
687 Function: Add a contig spectrum to an existing one: sums the spectra, update
688 the number of sequences, number of repetitions, ...
689 Returns : 1 for success
690 Args : Bio::Assembly::Tools::ContigSpectrum object
695 my ($self, $csp) = @_;
697 if( !ref $csp || ! $csp->isa('Bio::Assembly::Tools::ContigSpectrum') ) {
698 $self->throw("Unable to process non Bio::Assembly::Tools::ContigSpectrum ".
699 "object [".ref($csp)."]");
701 # Update overlap statistics
702 if ( $self->{'_eff_asm_params'} > 0 ) {
705 if ( $csp->{'_eff_asm_params'} == 0 ) {
706 $self->warn("The parent contig spectrum needs effective assembly ".
707 "parameters (eff_asm_params = ".$self->{'_eff_asm_params'}.") but the ".
708 "child contig spectrum doesn't have them (eff_asm_params = ".
709 $csp->{'_eff_asm_params'}."). Skipping them...");
710 } elsif ( $csp->{'_eff_asm_params'} != $self->{'_eff_asm_params'} ) {
711 $self->warn("The parent contig spectrum needs a different method for ".
712 "detecting the effective assembly parameters (eff_asm_params = ".
713 $self->{'_eff_asm_params'}.") than the one specified for the child ".
714 "contig spectrum (eff_asm_params = ".$csp->{'_eff_asm_params'}."). ".
715 "Ignoring the differences...");
718 # Update existing stats
719 ( $self->{'_avg_overlap'} , $self->{'_avg_identity'}, $self->{'_min_overlap'},
720 $self->{'_min_identity'}, $self->{'_nof_overlaps'} ) = $self->_update_overlap_stats(
721 $self->{'_avg_overlap'} , $self->{'_avg_identity'}, $self->{'_min_overlap'},
722 $self->{'_min_identity'}, $self->{'_nof_overlaps'},
723 $csp->{'_avg_overlap'} , $csp->{'_avg_identity'} , $csp->{'_min_overlap'},
724 $csp->{'_min_identity'} , $csp->{'_nof_overlaps'} );
728 # Update sequence average length (not number of sequences)
729 ( $self->{'_avg_seq_len'} ) = $self->_update_seq_stats(
730 $self->{'_avg_seq_len'}, $self->{'_nof_seq'}, $csp->{'_avg_seq_len'},
731 $csp->{'_nof_seq'} );
733 # Update spectrum (and nof_seq, max_size, and increment nof_rep by 1)
734 $self->_import_spectrum($csp->{'_spectrum'});
737 $self->{'_nof_rep'}--;
738 $self->{'_nof_rep'} += $csp->{'_nof_rep'};
739 # Update list of assembly objects used
740 push @
{$self->{'_assembly'}}, @
{$csp->{'_assembly'}}
741 if defined $csp->{'_assembly'};
750 Usage : my $avg_csp = $csp->average([$csp1, $csp2, $csp3]);
751 Function: Average one contig spectrum or the sum of several contig spectra by
752 the number of repetitions
753 Returns : Bio::Assembly::Tools::ContigSpectrum
754 Args : Bio::Assembly::Tools::ContigSpectrum array reference
760 my ($self, $list) = @_;
762 if ( ! ref $list || ! ref $list eq 'ARRAY') {
763 $self->throw("Average takes an array reference but got [".ref($list)."]");
765 # New average contig spectrum object
766 my $avg = Bio
::Assembly
::Tools
::ContigSpectrum
->new;
767 $avg->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
769 # Cycle through contig spectra
771 for my $csp (@
$list) {
773 if (not $csp->isa('Bio::Assembly::Tools::ContigSpectrum')) {
774 $csp->throw("Unable to process non Bio::Assembly::Tools::ContigSpectrum ".
775 "object [".ref($csp)."]");
777 # Import contig spectrum
781 # Average sum of contig spectra by number of repetitions
782 for (my $q = 1 ; $q <= $avg->{'_max_size'} ; $q++) {
783 $avg->{'_spectrum'}{$q} /= $avg->{'_nof_rep'}
784 if (defined $avg->{'_spectrum'}{$q});
786 # Average number of sequences
787 $avg->{'_nof_seq'} /= $avg->{'_nof_rep'};
788 # Average number of overlaps
789 $avg->{'_nof_overlaps'} /= $avg->{'_nof_rep'};
798 Usage : my $score = $csp->score();
799 Function: Score a contig spectrum (or cross-contig spectrum) such that the
800 higher the number of contigs (or cross-contigs) and the larger their
801 size, the higher the score.
802 Let n : total number of sequences
803 c_q : number of contigs of size q
804 q : number of sequence in a contig
805 We define: score = n/(n-1) * (X - 1/n)
806 where X = sum ( c_q * q^2 ) / n**2
807 The score ranges from 0 (singlets only) to 1 (a single large contig)
808 It is possible to specify a value for the number of sequences to
809 assume in the contig spectrum.
810 Returns : contig score, or undef if there were no sequences in the contig spectrum
811 Args : number of total sequences to assume [optional]
816 my ($self, $nof_seqs) = @_;
818 my $n = $self->nof_seq;
819 return undef if ($n <= 0);
822 my $q_max = $self->max_size;
823 my $spec = $self->spectrum;
824 for my $q ( 1 .. $q_max ) {
825 my $c_q = $spec->{$q};
826 if ( $q == 1 && $nof_seqs ) {
827 $c_q += $nof_seqs - $n;
831 $score += $c_q * $q ** 2;
834 # Rescale X to obtain the score
835 $score = $n/($n-1) * ($score - 1/$n);
840 =head2 _naive_assembler
842 Title : _naive_assembler
844 Function: Reassemble the specified sequences only based on their position in
845 the contig. This naive assembly only verifies that the minimum
846 overlap length and percentage identity are respected. No actual
848 Returns : arrayref of contigs and singlets
849 Args : Bio::Assembly::Contig
850 array reference of sequence IDs to use [optional]
851 minimum overlap length (integer) [optional]
852 minimum percentage identity (integer) [optional]
856 sub _naive_assembler
{
857 my ($self, $contig, $seqlist, $min_overlap, $min_identity) = @_;
859 # Use all reads if none was specified:
860 if (not defined $seqlist) {
861 for my $seq ($contig->each_seq) {
862 push @
$seqlist, $seq->id;
867 if ( ! ref $seqlist || ! ref($seqlist) eq 'ARRAY') {
868 $self->throw('Expecting an array reference. Got ['.ref($seqlist)."] \n");
870 my $max = scalar @
$seqlist;
871 $self->throw("Expecting at least 2 sequences as input for _naive_assembler")
875 my %seq_hash = map { $_ => undef } (@
$seqlist) if (scalar @
$seqlist > 0);
876 my ($g, $overlaps) = $self->_contig_graph($contig, \
%seq_hash, $min_overlap, $min_identity);
878 # Construct sub-contigs
882 for my $connected_reads ($g->connected_components) { # reads that belong in contigs
883 my $sub_id = $contig->id.'_'.$num;
884 my $sub_contig = $self->_create_subcontig($contig, $connected_reads, $sub_id);
885 push @contig_objs, $sub_contig;
887 for my $read_id ( @
$connected_reads ) {
888 delete $seq_hash{$read_id};
893 # Construct sub-singlets
895 for my $read_id ( keys %seq_hash ) {
896 my $read = $contig->get_seq_by_name($read_id);
897 my $sub_singlet = Bio
::Assembly
::Singlet
->new(
898 -id
=> $contig->id.'_'.$num,
899 -seqref
=> $self->_obj_copy($read)
902 push @singlet_objs, $sub_singlet;
905 return [@contig_objs, @singlet_objs];
909 =head2 _create_subcontig
911 Title : _create_subcontig
913 Function: Create a subcontig from another contig
914 Returns : Bio::Assembly::Contig object
915 Args : Bio::Assembly::Contig
916 arrayref of the IDs of the reads to includes in the subcontig
917 ID to give to the subcontig
921 sub _create_subcontig
{
922 my ($self, $contig, $read_ids, $sub_contig_id) = @_;
924 my $sub_contig = Bio
::Assembly
::Contig
->new( -id
=> $sub_contig_id );
926 # Get min and max read coordinates
927 my ($min, $max) = (undef, undef);
928 for my $read_id ( @
$read_ids ) {
929 my ($aln_coord) = $contig->get_features_collection->get_features_by_type("_aligned_coord:$read_id");
930 my $seq_start = $aln_coord->location->start;
931 my $seq_end = $aln_coord->location->end;
932 $min = $seq_start if (not defined $min) || ((defined $min) && ($seq_start < $min));
933 $max = $seq_end if (not defined $max) || ((defined $max) && ($seq_end > $max));
936 # Add reads to subcontig
937 for my $read_id (@
$read_ids) {
938 my $read = $self->_obj_copy($contig->get_seq_by_name($read_id));
939 my $coord = $self->_obj_copy($contig->get_seq_coord($read));
941 # adjust read coordinates
942 $coord->start( $coord->start - $min + 1 );
943 $coord->end( $coord->end - $min + 1 );
945 $sub_contig->set_seq_coord($coord, $read);
948 # Truncate copy of original consensus to new boundaries
949 my $cons_seq = $contig->get_consensus_sequence;
950 $sub_contig->set_consensus_sequence( $self->_obj_copy($cons_seq, $min, $max) );
951 my $cons_qual = $contig->get_consensus_quality;
953 $sub_contig->set_consensus_quality( $self->_obj_copy($cons_qual, $min, $max) );
963 Function: Copy (most of) an object, and optionally truncate it
964 Returns : another a Bio::LocatableSeq, Bio::Seq::PrimaryQual, or
965 Bio::SeqFeature::Generic object
966 Args : a Bio::LocatableSeq, Bio::Seq::PrimaryQual, or
967 Bio::SeqFeature::Generic object
974 my ($self, $obj, $start, $end) = @_;
976 if ($obj->isa('Bio::Seq::PrimaryQual')) {
977 my $qual = [@
{$obj->qual}]; # copy of the quality scores
978 if (defined $start && defined $end && $start !=1 && $end != scalar(@
$qual)) {
979 # Truncate the quality scores
980 $qual = [ splice @
$qual, $start - 1, $end - $start + 1 ];
982 $new = Bio
::Seq
::PrimaryQual
->new(
987 } elsif ($obj->isa('Bio::LocatableSeq')) {
989 if (defined $start && defined $end && $start != 1 && $end != length($seq)) {
990 # Truncate the aligned sequence
991 $seq = substr $seq, $start - 1, $end - $start + 1;
993 $new = Bio
::LocatableSeq
->new(
996 -start
=> $obj->start,
997 -strand
=> $obj->strand,
998 -alphabet
=> $obj->alphabet,
1001 } elsif ($obj->isa('Bio::SeqFeature::Generic')) {
1002 $new = Bio
::SeqFeature
::Generic
->new(
1003 -start
=> $obj->start,
1012 =head2 _new_from_assembly
1014 Title : _new_from_assembly
1016 Function: Creates a new contig spectrum object based solely on the result of
1017 an assembly, contig or singlet
1018 Returns : Bio::Assembly::Tools::ContigSpectrum object
1019 Args : Bio::Assembly::Scaffold, Contig or Singlet object
1023 sub _new_from_assembly
{
1024 # Create new contig spectrum object based purely on what we can get from the
1026 my ($self, $assemblyobj) = @_;
1027 my $csp = Bio
::Assembly
::Tools
::ContigSpectrum
->new();
1029 $csp->{'_id'} = $assemblyobj->id;
1030 # 2: Set overlap statistics: nof_overlaps, min_overlap, avg_overlap,
1031 # min_identity and avg_identity
1032 $csp->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
1033 $csp->{'_min_overlap'} = $self->{'_min_overlap'};
1034 $csp->{'_min_identity'} = $self->{'_min_identity'};
1035 if ( $csp->{'_eff_asm_params'} > 0 ) {
1036 ( $csp->{'_avg_overlap'}, $csp->{'_avg_identity'}, $csp->{'_min_overlap'},
1037 $csp->{'_min_identity'}, $csp->{'_nof_overlaps'} )
1038 = $csp->_get_assembly_overlap_stats($assemblyobj);
1040 # 3: Set sequence statistics: nof_seq and avg_seq_len
1041 ($csp->{'_avg_seq_len'}, $csp->{'_nof_seq'}) = $self->_get_assembly_seq_stats($assemblyobj);
1042 ### any use in using _naive_assembler here to re-assemble with specific minmum criteria?
1043 # 4: Set the spectrum: spectrum and max_size
1044 for my $contigobj ( $self->_get_contig_like($assemblyobj) ) {
1045 my $size = $contigobj->num_sequences;
1046 if (defined $csp->{'_spectrum'}{$size}) {
1047 $csp->{'_spectrum'}{$size}++;
1049 $csp->{'_spectrum'}{$size} = 1;
1051 $csp->{'_max_size'} = $size if $size > $csp->{'_max_size'};
1053 # 5: Set list of assembly objects used
1054 push @
{$csp->{'_assembly'}}, $assemblyobj;
1055 # 6: Set number of repetitions
1056 $csp->{'_nof_rep'} = 1;
1061 =head2 _new_dissolved_csp
1063 Title : _new_dissolved_csp
1065 Function: create a dissolved contig spectrum object
1066 Returns : dissolved contig spectrum
1067 Args : mixed contig spectrum
1068 header of sequences to keep in this contig spectrum
1072 sub _new_dissolved_csp
{
1073 my ($self, $mixed_csp, $seq_header) = @_;
1074 # Sanity checks on the mixed contig spectrum
1076 # min_overlap and min_identity must be specified if there are some overlaps
1077 # in the mixed contig
1078 if ($mixed_csp->{'_nof_overlaps'} > 0) {
1079 unless ( defined $self->{'_min_overlap'} ||
1080 defined $mixed_csp->{'_min_overlap'} ) {
1081 $self->throw("min_overlap must be defined in the dissolved contig spectrum".
1082 " or mixed contig spectrum to dissolve a contig");
1084 unless ( defined $self->{'_min_identity'} ||
1085 defined $mixed_csp->{'_min_identity'} ) {
1086 $self->throw("min_identity must be defined in the dissolved contig spectrum".
1087 " or mixed contig spectrum");
1091 # there must be at least one assembly in mixed contig spectrum
1092 if (!defined $mixed_csp->{'_assembly'} ||
1093 scalar @
{$mixed_csp->{'_assembly'}} < 1) {
1094 $self->throw("The mixed contig spectrum must be based on at least one
1098 # New dissolved contig spectrum object
1099 my $dissolved = Bio
::Assembly
::Tools
::ContigSpectrum
->new();
1101 # Take attributes of the parent contig spectrum if they exist, or those of the
1102 # mixed contig spectrum otherwise
1103 my ($eff_asm_params, $min_overlap, $min_identity);
1104 if ($self->{'_eff_asm_params'}) {
1105 $eff_asm_params = $self->{'_eff_asm_params'};
1107 $eff_asm_params = $mixed_csp->{'_eff_asm_params'};
1109 if ($self->{'_min_overlap'}) {
1110 $min_overlap = $self->{'_min_overlap'};
1112 $min_overlap = $mixed_csp->{'_min_overlap'};
1114 if ($self->{'_min_identity'}) {
1115 $min_identity = $self->{'_min_identity'};
1117 $min_identity = $mixed_csp->{'_min_identity'};
1119 ($dissolved->{'_eff_asm_params'}, $dissolved->{'_min_overlap'},
1120 $dissolved->{'_min_identity'}) = ($eff_asm_params, $min_overlap,
1123 # Dissolve each assembly
1124 for my $obj (@
{$mixed_csp->{'_assembly'}}) {
1125 for my $contig ( $self->_get_contig_like($obj) ) {
1127 # Dissolve this assembly/contig/singlet for the given sequences
1128 my $dissolved_objs = $self->_dissolve_contig( $contig, $seq_header,
1129 $min_overlap, $min_identity );
1131 # Add dissolved contigs to contig spectrum
1132 for my $dissolved_obj (@
$dissolved_objs) {
1133 $dissolved->assembly($dissolved_obj);
1134 $dissolved->{'_nof_rep'}--;
1141 $dissolved->{'_nof_rep'} += $mixed_csp->{'_nof_rep'};
1147 =head2 _dissolve_contig
1149 Title : _dissolve_contig
1151 Function: dissolve a contig
1152 Returns : arrayref of contigs and singlets
1153 Args : mixed contig spectrum
1154 header of sequences to keep in this contig spectrum
1160 sub _dissolve_contig
{
1161 my ($self, $contig, $wanted_origin, $min_overlap, $min_identity) = @_;
1165 if ($contig->isa('Bio::Assembly::Singlet')) {
1166 @seqs = $contig->seqref;
1167 } elsif ($contig->isa('Bio::Assembly::Contig')) {
1168 @seqs = $contig->each_seq;
1171 # Get sequences from the desired metagenome
1173 for my $seq (@seqs) {
1174 my $seq_id = $seq->id;
1175 # get sequence origin
1176 next unless $self->_seq_origin($seq_id) eq $wanted_origin;
1178 push @contig_seqs, $seq_id;
1182 my $size = scalar @contig_seqs;
1185 # create a singlet and add it to list of objects
1186 my $id = $contig_seqs[0];
1187 my $seq = $contig->get_seq_by_name($id);
1188 push @
$objs, Bio
::Assembly
::Singlet
->new(-id
=> $contig->id, -seqref
=> $self->_obj_copy($seq) );
1189 } elsif ($size > 1) {
1190 # Reassemble good sequences
1191 my $contig_objs = $self->_naive_assembler( $contig, \
@contig_seqs,
1192 $min_overlap, $min_identity );
1193 push @
$objs, @
$contig_objs;
1200 =head2 _new_cross_csp
1202 Title : _new_cross_csp
1204 Function: create a cross contig spectrum object
1205 Returns : cross-contig spectrum
1206 Args : mixed contig spectrum
1210 sub _new_cross_csp
{
1211 my ($self, $mixed_csp) = @_;
1212 # Sanity check on the mixed contig spectrum
1213 # There must be at least one assembly
1214 if (!defined $mixed_csp->{'_assembly'} ||
1215 scalar @
{$mixed_csp->{'_assembly'}} < 1) {
1216 $self->throw("The mixed contig spectrum must be based on at least one ".
1220 # New dissolved contig spectrum object
1221 my $cross = Bio
::Assembly
::Tools
::ContigSpectrum
->new();
1223 # Take attributes from parent or from mixed contig spectrums
1224 my ($eff_asm_params, $min_overlap, $min_identity);
1225 if ($self->{'_eff_asm_params'}) {
1226 $eff_asm_params = $self->{'_eff_asm_params'};
1228 $eff_asm_params = $mixed_csp->{'_eff_asm_params'};
1230 if ($self->{'_min_overlap'}) {
1231 $min_overlap = $self->{'_min_overlap'};
1233 $min_overlap = $mixed_csp->{'_min_overlap'};
1235 if ($self->{'_min_identity'}) {
1236 $min_identity = $self->{'_min_identity'};
1238 $min_identity = $mixed_csp->{'_min_identity'};
1240 ($cross->{'_eff_asm_params'},$cross->{'_min_overlap'},$cross->{'_min_identity'})
1241 = ($eff_asm_params, $min_overlap, $min_identity);
1243 # Get cross contig spectrum for each assembly
1244 for my $obj ( @
{$mixed_csp->{'_assembly'}} ) {
1245 for my $contig ( $self->_get_contig_like($obj) ) {
1247 # Go through contigs and skip the pure ones
1248 my ($cross_contigs, $nof_cross_singlets) = $self->_cross_contig($contig,
1249 $min_overlap, $min_identity);
1252 for my $cross_contig ( @
$cross_contigs ) {
1253 $cross->assembly($cross_contig);
1254 $cross->{'_nof_rep'}--;
1257 # Add cross-singlets
1258 $cross->{'_spectrum'}->{'1'} += $nof_cross_singlets;
1264 $cross->{'_nof_rep'} += $mixed_csp->{'_nof_rep'};
1270 =head2 _cross_contig
1272 Title : _cross_contig
1274 Function: calculate cross contigs
1275 Returns : arrayref of cross-contigs
1276 number of cross-singlets
1284 my ($self, $contig, $min_overlap, $min_identity) = @_;
1286 my $nof_cross_singlets = 0;
1289 # Weed out pure contigs
1291 for my $seq ($contig->each_seq) {
1292 my $seq_id = $seq->id;
1293 my $seq_origin = $self->_seq_origin($seq_id);
1294 if (not defined $seq_origin) {
1295 $self->warn("Sequence $seq_id doesn't have any header. Skipping it...");
1298 if ( scalar keys %all_origins > 1 ) {
1299 # a cross-contig spectrum
1302 $all_origins{$seq_origin} = undef;
1304 if ( scalar keys %all_origins <= 1 ) {
1306 return \
@cross_contigs, $nof_cross_singlets;
1310 # Break the cross-contigs using the specified stringency
1311 my $test_contigs = $self->_naive_assembler($contig, undef, $min_overlap, $min_identity);
1313 # Find cross contigs and singlets
1314 for my $test_contig ( @
$test_contigs ) {
1316 # Find cross-contigs
1318 for my $seq ($test_contig->each_seq) {
1319 my $seq_id = $seq->id;
1320 my $seq_origin = $self->_seq_origin($seq_id);
1321 next if not defined $seq_origin;
1322 push @
{$origins{$seq_origin}}, $seq_id;
1324 if (scalar keys %origins > 1) {
1325 # Found a cross-contig
1326 push @cross_contigs, $test_contig;
1331 # Find cross-singlets
1332 for my $origin (keys %origins) {
1333 my @ori_ids = @
{$origins{$origin}};
1334 if (scalar @ori_ids == 1) {
1335 $nof_cross_singlets++;
1336 } elsif (scalar @ori_ids > 1) {
1337 # Dissolve contig for the given origin
1339 ### consider using the minimum overlap and identity here again?
1340 my $ori_contigs = $self->_naive_assembler($test_contig, \
@ori_ids, undef, undef);
1342 for my $ori_contig (@
$ori_contigs) {
1343 $nof_cross_singlets++ if $ori_contig->num_sequences == 1;
1351 return \
@cross_contigs, $nof_cross_singlets;
1359 Function: determines where a sequence comes from using its header. For example
1360 the origin of the sequence 'metagenome1|gi|9626988|ref|NC_001508.1|'
1368 # Current sequence origin. Example: sequence with ID
1369 # 'metagenome1|gi|9626988|ref|NC_001508.1|' has header 'metagenome1'
1370 my ($self, $seq_id) = @_;
1372 if ( $seq_id =~ m/^(.+?)\|/ ) {
1379 =head2 _import_assembly
1381 Title : _import_assembly
1382 Usage : $csp->_import_assembly($assemblyobj);
1383 Function: Update a contig spectrum object based on an assembly, contig or
1385 Returns : 1 for success
1386 Args : Bio::Assembly::Scaffold, Contig or Singlet object
1390 sub _import_assembly
{
1391 my ($self, $assemblyobj) = @_;
1393 if ( ! ref $assemblyobj ||
1394 ( ! $assemblyobj->isa('Bio::Assembly::ScaffoldI') &&
1395 ! $assemblyobj->isa('Bio::Assembly::Contig') )) {
1396 $self->throw("Unable to process non Bio::Assembly::ScaffoldI, Contig or ".
1397 "Singlet object [".ref($assemblyobj)."]");
1399 # Create new object from assembly
1400 my $csp = $self->_new_from_assembly($assemblyobj);
1402 # Update current contig spectrum object with new one
1409 =head2 _import_spectrum
1411 Title : _import_spectrum
1412 Usage : $csp->_import_spectrum({ 1 => 90 , 2 => 3 , 4 => 1 })
1413 Function: update a contig spectrum object based on a contig spectrum
1414 represented as a hash (key: contig size, value: number of contigs of
1416 Returns : 1 for success
1417 Args : contig spectrum as a hash reference
1421 sub _import_spectrum
{
1422 my ($self, $spectrum) = @_;
1424 if( ! ref $spectrum || ! ref $spectrum eq 'HASH') {
1425 $self->throw("Spectrum should be a hash reference, but it is [".
1426 ref($spectrum)."]");
1429 # Update the spectrum (+ nof_rep, max_size and nof_seq)
1430 for my $size (keys %$spectrum) {
1431 # Get the number of contigs of different size
1432 if (defined $self->{'_spectrum'}{$size}) {
1433 $self->{'_spectrum'}{$size} += $$spectrum{$size};
1435 $self->{'_spectrum'}{$size} = $$spectrum{$size};
1438 $self->{'_nof_seq'} += $size * $$spectrum{$size};
1440 $self->{'_max_size'} = $size if $size > $self->{'_max_size'};
1443 # If the contig spectrum has only zero 1-contigs, max_size is zero
1444 $self->{'_max_size'} = 0 if scalar keys %{$self->{'_spectrum'}} == 1 &&
1445 defined $self->{'_spectrum'}{'1'} && $self->{'_spectrum'}{'1'} == 0;
1448 $self->{'_nof_rep'}++;
1452 =head2 _import_dissolved_csp
1454 Title : _import_dissolved_csp
1455 Usage : $csp->_import_dissolved_csp($mixed_csp, $seq_header);
1456 Function: Update a contig spectrum object by dissolving a mixed contig
1457 spectrum based on the header of the sequences
1458 Returns : 1 for success
1459 Args : Bio::Assembly::Tools::ContigSpectrum
1460 sequence header string
1464 sub _import_dissolved_csp
{
1465 my ($self, $mixed_csp, $seq_header) = @_;
1467 if (not defined $mixed_csp || not defined $seq_header) {
1468 $self->throw("Expecting a contig spectrum reference and sequence header as".
1471 # Create new object from assembly
1472 my $dissolved_csp = $self->_new_dissolved_csp($mixed_csp, $seq_header);
1473 # Update current contig spectrum object with new one
1474 $self->add($dissolved_csp);
1479 =head2 _import_cross_csp
1481 Title : _import_cross_csp
1482 Usage : $csp->_import_cross_csp($mixed_csp);
1483 Function: Update a contig spectrum object by calculating the cross contig
1484 spectrum based on a mixed contig spectrum
1485 Returns : 1 for success
1486 Args : Bio::Assembly::Tools::ContigSpectrum
1490 sub _import_cross_csp
{
1491 my ($self, $mixed_csp) = @_;
1493 if (not defined $mixed_csp) {
1494 $self->throw("Expecting a contig spectrum reference as argument");
1497 # Create new object from assembly
1498 my $cross_csp = $self->_new_cross_csp($mixed_csp);
1499 my $nof_1_cross_contigs = $cross_csp->spectrum->{1};
1501 # Update current contig spectrum object with new one
1502 $self->add($cross_csp);
1505 $self->{'_nof_seq'} -= $nof_1_cross_contigs;
1510 =head2 _get_contig_like
1512 Title : _get_contig_like
1513 Usage : my @contig_like_objs = $csp->_get_contig_like($assembly_obj);
1514 Function: Get contigs and singlets from an assembly, contig or singlet
1515 Returns : array of Bio::Assembly::Contig and Singlet objects
1516 Args : a Bio::Assembly::Scaffold, Contig or singlet object
1520 sub _get_contig_like
{
1521 my ($self, $assembly_obj) = @_;
1523 if ($assembly_obj->isa('Bio::Assembly::ScaffoldI')) {
1524 # all contigs and singlets in the scaffold
1525 push @contig_objs, ($assembly_obj->all_contigs, $assembly_obj->all_singlets);
1527 # a contig or singlet
1528 @contig_objs = $assembly_obj;
1530 return @contig_objs;
1534 =head2 _get_assembly_seq_stats
1536 Title : _get_assembly_seq_stats
1537 Usage : my $seqlength = $csp->_get_assembly_seq_stats($assemblyobj);
1538 Function: Get sequence statistics from an assembly:
1539 average sequence length, number of sequences
1540 Returns : average sequence length (decimal)
1541 number of sequences (integer)
1542 Args : Bio::Assembly::Scaffold, Contig or singlet object
1543 hash reference with the IDs of the sequences to consider [optional]
1547 sub _get_assembly_seq_stats
{
1548 my ($self, $assemblyobj, $seq_hash) = @_;
1551 if ( !defined $assemblyobj ||
1552 ( !$assemblyobj->isa('Bio::Assembly::ScaffoldI') &&
1553 !$assemblyobj->isa('Bio::Assembly::Contig') ) ) {
1554 $self->throw("Must provide a Bio::Assembly::Scaffold, Contig or Singlet object");
1556 $self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
1557 if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');
1559 # Update sequence stats
1560 my @asm_stats = (0,0);
1561 # asm_stats = (avg_seq_len, nof_seq)
1562 for my $contigobj ( $self->_get_contig_like($assemblyobj) ) {
1563 @asm_stats = $self->_update_seq_stats( @asm_stats,
1564 $self->_get_contig_seq_stats($contigobj, $seq_hash) );
1570 =head2 _get_contig_seq_stats
1572 Title : _get_contig_seq_stats
1573 Usage : my $seqlength = $csp->_get_contig_seq_stats($contigobj);
1574 Function: Get sequence statistics from a contig:
1575 average sequence length, number of sequences
1576 Returns : average sequence length (decimal)
1577 number of sequences (integer)
1578 Args : contig object reference
1579 hash reference with the IDs of the sequences to consider [optional]
1583 sub _get_contig_seq_stats
{
1584 my ($self, $contigobj, $seq_hash) = @_;
1585 my @contig_stats = (0, 0);
1586 # contig_stats = (avg_length, nof_seq)
1587 for my $seqobj ($contigobj->each_seq) {
1588 next if defined $seq_hash && !defined $$seq_hash{$seqobj->id};
1590 if ($contigobj->isa('Bio::Assembly::Singlet')) { # a singlet
1591 $seq_string = $contigobj->seqref->seq;
1593 $seq_string = $seqobj->seq;
1595 # Number of non-gap characters in the sequence
1596 my $seq_len = $seqobj->_ungapped_len;
1597 my @seq_stats = ($seq_len);
1598 @contig_stats = $self->_update_seq_stats(@contig_stats, @seq_stats);
1600 return @contig_stats;
1604 =head2 _update_seq_stats
1606 Title : _update_seq_stats
1608 Function: Update the number of sequences and their average length 1
1612 number of overlaps 1 average sequence length
1613 Returns : average sequence length
1615 Args : average sequence length 1
1616 number of sequences 1
1617 average sequence length 2
1618 number of sequences 2
1622 sub _update_seq_stats
{
1623 my ($self, $p_avg_length, $p_nof_seq, $n_avg_length, $n_nof_seq) = @_;
1625 if (not defined $n_nof_seq) {
1628 # Update overlap statistics
1630 my $nof_seq = $p_nof_seq + $n_nof_seq;
1631 if ($nof_seq != 0) {
1632 $avg_length = ($p_avg_length * $p_nof_seq + $n_avg_length * $n_nof_seq) / $nof_seq;
1634 return $avg_length, $nof_seq;
1638 =head2 _get_assembly_overlap_stats
1640 Title : _get_assembly_overlap_stats
1641 Usage : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
1642 = $csp->_get_assembly_overlap_stats($assemblyobj);
1643 Function: Get statistics about pairwise overlaps in contigs of an assembly
1644 Returns : average overlap length
1645 average identity percent
1646 minimum overlap length
1647 minimum identity percent
1649 Args : Bio::Assembly::Scaffold, Contig or Singlet object
1650 hash reference with the IDs of the sequences to consider [optional]
1654 sub _get_assembly_overlap_stats
{
1655 my ($self, $assembly_obj, $seq_hash) = @_;
1658 if ( !defined $assembly_obj ||
1659 ( !$assembly_obj->isa('Bio::Assembly::ScaffoldI') &&
1660 !$assembly_obj->isa('Bio::Assembly::Contig') ) ) {
1661 $self->throw("Must provide a Bio::Assembly::ScaffoldI, Contig or Singlet object");
1663 $self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
1664 if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');
1666 # Look at all the contigs (no singlets!)
1667 my @asm_stats = (0, 0, undef, undef, 0);
1668 # asm_stats = (avg_length, avg_identity, min_length, min_identity, nof_overlaps)
1669 for my $contig_obj ( $self->_get_contig_like($assembly_obj) ) {
1670 @asm_stats = $self->_update_overlap_stats( @asm_stats,
1671 $self->_get_contig_overlap_stats($contig_obj, $seq_hash) );
1678 =head2 _get_contig_overlap_stats
1680 Title : _get_contig_overlap_stats
1681 Usage : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
1682 = $csp->_get_contig_overlap_stats($contigobj);
1683 Function: Get statistics about pairwise overlaps in a contig or singlet. The
1684 statistics are obtained using graph theory: each read is a node
1685 and the edges between 2 reads are weighted by minus the number of
1686 conserved residues in the alignment between the 2 reads. The
1687 minimum spanning tree of this graph represents the overlaps that
1688 form the contig. Overlaps that do not satisfy the minimum overlap
1689 length and similarity get a malus on their score.
1690 Note: This function requires the optional BioPerl dependency
1691 module called 'Graph'
1692 Returns : average overlap length
1693 average identity percent
1694 minimum overlap length
1695 minimum identity percent
1697 Args : Bio::Assembly::Contig or Singlet object
1698 hash reference with the IDs of the sequences to consider [optional]
1702 sub _get_contig_overlap_stats
{
1703 my ($self, $contig_obj, $seq_hash) = @_;
1706 $self->throw("Must provide a Bio::Assembly::Contig object")
1707 if (!defined $contig_obj || !$contig_obj->isa("Bio::Assembly::Contig"));
1708 $self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
1709 if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');
1711 my @contig_stats = (0, 0, undef, undef, 0);
1712 # contig_stats = (avg_length, avg_identity, min_length, min_identity, nof_overlaps)
1714 # Build contig graph
1715 ### consider providing the minima to _contig_graph here too?
1716 my ($g, $overlaps) = $self->_contig_graph($contig_obj, $seq_hash);
1719 # Graph minimum spanning tree (tree that goes through strongest overlaps)
1720 $g = $g->MST_Kruskal();
1722 # Calculate minimum overlap length and identity for this contig
1723 for my $edge ( $g->edges ) {
1724 # Retrieve overlap information
1725 my ($id1, $id2) = @
$edge;
1726 if (not exists $$overlaps{$id1}{$id2}) {
1727 ($id2, $id1) = @
$edge;
1729 my ($score, $length, $identity) = @
{$$overlaps{$id1}{$id2}};
1730 # Update contig stats
1731 my @overlap_stats = ($length, $identity);
1732 @contig_stats = $self->_update_overlap_stats(@contig_stats, @overlap_stats);
1736 return @contig_stats;
1740 =head2 _update_overlap_stats
1742 Title : _update_overlap_stats
1744 Function: update the number of overlaps and their minimum and average length
1747 Args : average length 1
1751 number of overlaps 1
1756 number of overlaps 2
1760 sub _update_overlap_stats
{
1762 $p_avg_length, $p_avg_identity, $p_min_length, $p_min_identity, $p_nof_overlaps,
1763 $n_avg_length, $n_avg_identity, $n_min_length, $n_min_identity, $n_nof_overlaps)
1767 if (not defined $n_nof_overlaps) { $n_nof_overlaps = 1 };
1768 if ((not defined $n_min_length) && ($n_avg_length != 0)) { $n_min_length = $n_avg_length };
1769 if ((not defined $n_min_identity) && ($n_avg_identity != 0)) { $n_min_identity = $n_avg_identity };
1771 # Update overlap statistics
1772 my ($avg_length, $avg_identity, $min_length, $min_identity, $nof_overlaps)
1773 = (0, 0, undef, undef, 0);
1774 $nof_overlaps = $p_nof_overlaps + $n_nof_overlaps;
1775 if ($nof_overlaps > 0) {
1776 $avg_length = ($p_avg_length * $p_nof_overlaps + $n_avg_length * $n_nof_overlaps) / $nof_overlaps;
1777 $avg_identity = ($p_avg_identity * $p_nof_overlaps + $n_avg_identity * $n_nof_overlaps) / $nof_overlaps;
1780 if ( not defined $p_min_length ) {
1781 $min_length = $n_min_length;
1782 } elsif ( not defined $n_min_length ) {
1783 $min_length = $p_min_length;
1784 } else { # both values are defined
1785 if ($n_min_length < $p_min_length) {
1786 $min_length = $n_min_length;
1788 $min_length = $p_min_length;
1792 if ( not defined $p_min_identity ) {
1793 $min_identity = $n_min_identity;
1794 } elsif ( not defined $n_min_identity ) {
1795 $min_identity = $p_min_identity;
1796 } else { # both values are defined
1797 if ($n_min_identity < $p_min_identity) {
1798 $min_identity = $n_min_identity;
1800 $min_identity = $p_min_identity;
1804 return $avg_length, $avg_identity, $min_length, $min_identity, $nof_overlaps;
1808 =head2 _overlap_alignment
1810 Title : _overlap_alignment
1812 Function: Produce an alignment of the overlapping section of two sequences of
1813 a contig. Minimum overlap length and percentage identity can be
1814 specified. Return undef if the sequences do not overlap or do not
1815 meet the minimum overlap criteria.
1816 Return : Bio::SimpleAlign object reference
1817 alignment overlap length
1818 alignment overlap identity
1819 Args : Bio::Assembly::Contig object reference
1820 Bio::LocatableSeq contig sequence 1
1821 Bio::LocatableSeq contig sequence 2
1822 minium overlap length [optional]
1823 minimum overlap identity percentage[optional]
1827 sub _overlap_alignment
{
1828 my ($self, $contig, $qseq, $tseq, $min_overlap, $min_identity) = @_;
1829 # get query and target sequence position
1830 my $qpos = $contig->get_seq_coord($qseq);
1831 my $tpos = $contig->get_seq_coord($tseq);
1832 # check that there is an overlap
1833 my $qend = $qpos->end;
1834 my $tstart = $tpos->start;
1835 return if $qend < $tstart;
1836 my $qstart = $qpos->start;
1837 my $tend = $tpos->end;
1838 return if $qstart > $tend;
1839 # get overlap boundaries and check overlap length
1841 if ($qstart >= $tstart) {
1847 if ($qend > $tend) {
1852 my $overlap = $right - $left + 1;
1853 return if defined $min_overlap && $overlap < $min_overlap;
1854 # slice query and target sequence to overlap boundaries
1856 $contig->change_coord('gapped consensus', "aligned ".$qseq->id, $left);
1857 my $qstring = substr($qseq->seq, $qleft - 1, $overlap);
1859 $contig->change_coord('gapped consensus', "aligned ".$tseq->id, $left);
1860 my $tstring = substr($tseq->seq, $tleft - 1, $overlap);
1861 # remove gaps present in both sequences at the same position
1862 for (my $pos = 0 ; $pos < $overlap ; $pos++) {
1863 my $qnt = substr($qstring, $pos, 1);
1865 my $tnt = substr($tstring, $pos, 1);
1867 substr($qstring, $pos, 1, '');
1868 substr($tstring, $pos, 1, '');
1874 return if defined $min_overlap && $overlap < $min_overlap;
1875 # count the number of gaps remaining in each sequence
1876 my $qgaps = ($qstring =~ tr/-//);
1877 my $tgaps = ($tstring =~ tr/-//);
1878 # make an alignment object with the query and target sequences
1879 my $aln = Bio
::SimpleAlign
->new;
1880 my $alseq = Bio
::LocatableSeq
->new(
1884 -end
=> $overlap - $qgaps,
1887 $aln->add_seq($alseq);
1888 $alseq = Bio
::LocatableSeq
->new(
1892 -end
=> $overlap - $tgaps,
1895 $aln->add_seq($alseq);
1897 # check overlap percentage identity
1898 my $identity = $aln->overall_percentage_identity;
1899 return if defined $min_identity && $identity < $min_identity;
1901 # all checks passed, return alignment
1902 return $aln, $overlap, $identity;
1906 =head2 _contig_graph
1908 Title : _contig_graph
1910 Function: Creates a graph data structure of the contig.The graph is undirected.
1911 The vertices are the reads of the contig and edges are the overlap
1912 between the reads. The edges are weighted by the opposite of the
1913 overlap, so it is negative and the better the overlap, the lower the
1915 Return : Graph object or undef
1916 hashref of overlaps (score, length, identity) for each read pair
1917 Args : Bio::Assembly::Contig object reference
1918 hash reference with the IDs of the sequences to consider [optional]
1919 minimum overlap length (integer) [optional]
1920 minimum percentage identity (integer) [optional]
1925 my ($self, $contig_obj, $seq_hash, $min_overlap, $min_identity) = @_;
1928 if( !ref $contig_obj || ! $contig_obj->isa('Bio::Assembly::Contig') ) {
1929 $self->throw("Unable to process non Bio::Assembly::Contig ".
1930 "object [".ref($contig_obj)."]");
1933 if (not eval { require Graph
::Undirected
}) {
1934 $self->throw("Error: the module 'Graph' is needed by the method ".
1935 "_contig_graph but could not be found\n$@");
1938 # Skip contigs of 1 sequence (they have no overlap)
1939 my @seq_objs = $contig_obj->each_seq;
1940 my $nof_seqs = scalar @seq_objs;
1942 return if ($nof_seqs <= 1);
1944 # Calculate alignment between all pairs of reads
1946 for my $i (0 .. $nof_seqs-1) {
1947 my $seq_obj = $seq_objs[$i];
1948 my $seq_id = $seq_obj->id;
1950 # Skip this read if not in list of wanted sequences
1951 next if defined $seq_hash && !exists $$seq_hash{$seq_id};
1953 # What is the best sequence to align to?
1954 my ($best_score, $best_length, $best_identity);
1955 for my $j ($i+1 .. $nof_seqs-1) {
1957 # Skip this sequence if not in list of wanted sequences
1958 my $target_obj = $seq_objs[$j];
1959 my $target_id = $target_obj->id;
1960 next if defined $seq_hash && !exists $$seq_hash{$target_id};
1962 # How much overlap with this sequence?
1963 my ($aln_obj, $length, $identity)
1964 = $self->_overlap_alignment($contig_obj, $seq_obj, $target_obj, $min_overlap, $min_identity);
1965 next if ! defined $aln_obj; # there was no sequence overlap or overlap not good enough
1967 # Score the overlap as the number of conserved residues. In practice, it
1968 # seems to work better than giving +1 for match and -3 for errors
1969 # (mismatch or indels)
1970 my $score = $length * $identity / 100;
1972 # Apply a malus (square root) for scores that do not satisfy the minimum
1973 # overlap length similarity. It is necessary for overlaps that get a high
1974 # score without satisfying both the minimum values.
1975 if ( ( $min_overlap && ($length < $min_overlap ) ) ||
1976 ( $min_identity && ($identity < $min_identity) ) ) {
1977 $score = sqrt($score);
1979 $overlaps{$seq_id}{$target_id} = [$score, $length, $identity];
1986 my $g; # the Graph object
1987 if (scalar keys %overlaps >= 1) {
1988 # At least 1 overlap. Create a weighted undirected graph
1989 $g = Graph
::Undirected
->new();
1990 for my $seq_id (keys %overlaps) {
1991 for my $target_id (keys %{$overlaps{$seq_id}}) {
1992 my $score = @
{$overlaps{$seq_id}{$target_id}}[0];
1993 my $weight = -$score;
1994 $g->add_weighted_edge($seq_id, $target_id, $weight);
2000 return $g, \
%overlaps;
2009 Function: Generates a PNG picture of the contig graph. It is mostly for
2011 Return : 1 for success
2012 Args : a Graph object
2013 hashref of overlaps (score, length, identity) for each read pair
2015 overlap info to display: 'score' (default), 'length' or 'identity'
2020 my ($self, $g, $overlaps, $outfile, $edge_type) = @_;
2022 $self->throw("Error: need to provide a graph as input\n") if not defined $g;
2024 if (not eval { require GraphViz
}) {
2025 $self->throw("Error: the module 'GraphViz' is needed by the method ".
2026 "_draw_graph but could not be found\n$@");
2029 $edge_type ||= 'score';
2031 my $viz = GraphViz
->new( directed
=> 0 );
2033 for my $edge ( $g->edges ) {
2034 # Retrieve overlap information
2035 my ($id1, $id2) = @
$edge;
2036 if (not exists $$overlaps{$id1}{$id2}) {
2037 ($id2, $id1) = @
$edge;
2039 my ($score, $length, $identity) = @
{$$overlaps{$id1}{$id2}};
2042 if ($edge_type eq 'score') {
2044 } elsif ($edge_type eq 'length') {
2045 $edge_val = $length;
2046 } elsif ($edge_type eq 'identity') {
2047 $edge_val = $identity;
2049 $self->throw("Error: invalid edge type to display, '$edge_val'");
2051 $viz->add_edge($id1 => $id2, label
=> $edge_val);
2053 open my $fh, '>', $outfile or $self->throw("Error: Could not write file '$outfile': $!");
2054 print $fh $viz->as_png;