changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / Assembly / Tools / ContigSpectrum.pm
blob05cdb60edbc9645b33b731146176b5c288b42195
2 # BioPerl module for Bio::Assembly::Tools::ContigSpectrum
4 # Copyright by Florent Angly
6 # You may distribute this module under the same terms as Perl itself
8 # POD documentation - main docs before the code
10 =head1 NAME
12 Bio::Assembly::Tools::ContigSpectrum - create and manipulate contig spectra
14 =head1 SYNOPSIS
16 # Simple contig spectrum creation
17 my $csp1 = Bio::Assembly::Tools::ContigSpectrum->new(
18 -id => 'csp1',
19 -spectrum => { 1 => 10,
20 2 => 2,
21 3 => 1 } );
23 # ...or another way to create a simple contig spectrum
24 my $csp2 = Bio::Assembly::Tools::ContigSpectrum->new;
25 $csp2->id('csp2');
26 $csp2->spectrum({ 1 => 20, 2 => 1, 4 => 1 });
28 # Get some information
29 print "This is contig spectrum ".$csp->id."\n";
30 print "It contains ".$csp->nof_seq." sequences\n";
31 print "The largest contig has ".$csp->max_size." sequences\n";
32 print "The spectrum is: ".$csp->to_string($csp->spectrum)."\n";
34 # Let's add the contig spectra
35 my $summed_csp = Bio::Assembly::Tools::ContigSpectrum->new;
36 $summed_csp->add($csp1);
37 $summed_csp->add($csp2);
38 print "The summed contig spectrum is ".$summed_csp->to_string."\n";
40 # Make an average
41 my $avg_csp = Bio::Assembly::Tools::ContigSpectrum->new;
42 $avg_csp = $avg_csp->average([$csp1, $csp2]);
43 print "The average contig spectrum is ".$avg_csp->to_string."\n";
45 # Get a contig spectrum from an assembly
46 my $from_assembly = Bio::Assembly::Tools::ContigSpectrum->new(
47 -assembly => $assembly_object,
48 -eff_asm_params => 1);
49 print "The contig spectrum from assembly is ".$from_assembly->to_string."\n";
51 # Report advanced information (possible because eff_asm_params = 1)
52 print "Average sequence length: ".$from_assembly->avg_seq_len." bp\n";
53 print "Minimum overlap length: ".$from_assembly->min_overlap." bp\n";
54 print "Average overlap length: ".$from_assembly->avg_overlap." bp\n";
55 print "Minimum overlap match: ".$from_assembly->min_identity." %\n";
56 print "Average overlap match: ".$from_assembly->avg_identity." %\n";
58 # Assuming the assembly object contains sequences from several different
59 # metagenomes, we have a mixed contig spectrum from which a cross contig
60 # spectrum and dissolved contig spectra can be obtained
61 my $mixed_csp = $from_assembly;
63 # Calculate a dissolved contig spectrum
64 my $meta1_dissolved = Bio::Assembly::Tools::ContigSpectrum->new(
65 -dissolve => [$mixed_csp, 'metagenome1'] );
66 my $meta2_dissolved = Bio::Assembly::Tools::ContigSpectrum->new(
67 -dissolve => [$mixed_csp, 'metagenome2'] );
68 print "The dissolved contig spectra are:\n".
69 $meta1_dissolved->to_string."\n".
70 $meta2_dissolved->to_string."\n";
72 # Determine a cross contig spectrum
73 my $cross_csp = Bio::Assembly::Tools::ContigSpectrum->new(
74 -cross => $mixed_csp );
75 print "The cross contig spectrum is ".$cross_csp->to_string."\n";
77 # Score a contig spectrum (the more abundant the contigs and the larger their
78 # size, the larger the score)
79 my $csp_score = $csp->score( $csp->nof_seq );
81 =head1 DESCRIPTION
83 The Bio::Assembly::Tools::ContigSpectrum Perl module enables to
84 manually create contig spectra, import them from assemblies,
85 manipulate them, transform between different types of contig spectra
86 and output them.
88 Bio::Assembly::Tools::ContigSpectrum is a module to create, manipulate
89 and output contig spectra, assembly-derived data used in metagenomics
90 (community genomics) for diversity estimation.
92 =head2 Background
94 A contig spectrum is the count of the number of contigs of different
95 size in an assembly. For example, the contig spectrum [100 5 1 0 0
96 ...] means that there were 100 singlets (1-contigs), 5 contigs of 2
97 sequences (2-contigs), 1 contig of 3 sequences (3-contig) and no
98 larger contigs.
100 An assembly can be produced from a mixture of sequences from different
101 metagenomes. The contig obtained from this assembly is a mixed contig
102 spectrum. The contribution of each metagenome in this mixed contig
103 spectrum can be obtained by determining a dissolved contig spectrum.
105 Finally, based on a mixed contig spectrum, a cross contig spectrum can
106 be determined. In a cross contig spectrum, only contigs containing
107 sequences from different metagenomes are kept; "pure" contigs are
108 excluded. Additionally, the total number of singletons (1-contigs)
109 from each region that assembles with any fragments from other regions
110 is the number of 1-contigs in the cross contig spectrum.
112 =head2 Implementation
114 The simplest representation of a contig spectrum is as a hash
115 representation where the key is the contig size (number of sequences
116 making up the contig) and the value the number of contigs of this
117 size.
119 In fact, it is useful to have more information associated with the
120 contig spectrum, hence the Bio::Assembly::Tools::ContigSpectrum module
121 implements an object containing a contig spectrum hash and additional
122 information. The get/set methods to access them are:
124 id contig spectrum ID
125 nof_rep number of repetitions (assemblies) used
126 max_size size of (number of sequences in) the largest contig
127 spectrum hash representation of a contig spectrum
129 nof_seq number of sequences
130 avg_seq_len average sequence length
132 eff_asm_params reports effective assembly parameters
134 nof_overlaps number of overlaps (needs eff_asm_params)
135 min_overlap minimum overlap length in a contig (needs eff_asm_params)
136 min_identity minimum sequence identity percentage (needs eff_asm_params)
137 avg_overlap average overlap length (needs eff_asm_params)
138 avg_identity average overlap identity percentage (needs eff_asm_params)
140 Operations on the contig spectra:
142 to_string create a string representation of the spectrum
143 spectrum import a hash contig spectrum
144 assembly determine a contig spectrum from an assembly, contig or singlet
145 dissolve calculate a dissolved contig spectrum (depends on assembly)
146 cross produce a cross contig spectrum (depends on assembly)
147 add add a contig spectrum to an existing one
148 average make an average of several contig spectra
149 score score a contig spectrum: the higher the number of contigs
150 and the larger their size, the higher the score.
152 When using operations that rely on knowing "where" (from what
153 metagenomes) a sequence came from (i.e. when creating a dissolved or
154 cross contig spectrum), make sure that the sequences used for the
155 assembly have a name header, e.g. E<gt>metagenome1|seq1,
156 E<gt>metagenome2|seq1, ...
158 Note: The following operations require the C<Graph::Undirected> module:
159 eff_asm_params, cross, dissolve
161 =head1 FEEDBACK
163 =head2 Mailing Lists
165 User feedback is an integral part of the evolution of this and other
166 Bioperl modules. Send your comments and suggestions preferably to the
167 Bioperl mailing lists Your participation is much appreciated.
169 bioperl-l@bioperl.org - General discussion
170 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
173 =head2 Support
175 Please direct usage questions or support issues to the mailing list:
177 I<bioperl-l@bioperl.org>
179 rather than to the module maintainer directly. Many experienced and
180 reponsive experts will be able look at the problem and quickly
181 address it. Please include a thorough description of the problem
182 with code and data examples if at all possible.
184 =head2 Reporting Bugs
186 Report bugs to the BioPerl bug tracking system to help us keep track
187 the bugs and their resolution. Bug reports can be submitted via email
188 or the web:
190 bioperl-bugs@bio.perl.org
191 https://github.com/bioperl/bioperl-live/issues
193 =head1 AUTHOR - Florent E Angly
195 Email florent_dot_angly_at_gmail_dot_com
197 =head1 APPENDIX
199 The rest of the documentation details each of the object
200 methods. Internal methods are usually preceded with a "_".
202 =cut
204 package Bio::Assembly::Tools::ContigSpectrum;
206 use strict;
208 use Bio::Root::Root;
209 use Bio::Assembly::Scaffold;
210 use Bio::SimpleAlign;
211 use Bio::LocatableSeq;
213 use base 'Bio::Root::Root';
216 =head2 new
218 Title : new
219 Usage : my $csp = Bio::Assembly::Tools::ContigSpectrum->new();
221 my $csp = Bio::Assembly::Tools::ContigSpectrum->new(
222 -id => 'some_name',
223 -spectrum => { 1 => 90 , 2 => 3 , 4 => 1 },
226 my $csp = Bio::Assembly::Tools::ContigSpectrum->new(
227 -assembly => $assembly_obj
229 Function: create a new contig spectrum object
230 Returns : reference to a contig spectrum object
231 Args : none
233 =cut
235 sub new {
236 my ($class, @args) = @_;
237 my $self = $class->SUPER::new(@args);
238 my ( $id, $nof_seq, $nof_rep, $max_size, $nof_overlaps, $min_overlap,
239 $min_identity, $avg_overlap, $avg_identity, $avg_seq_len, $spectrum,
240 $assembly, $eff_asm_params, $dissolve, $cross) = $self->_rearrange(
241 [qw(ID NOF_SEQ NOF_REP MAX_SIZE NOF_OVERLAPS MIN_OVERLAP MIN_IDENTITY
242 AVG_OVERLAP AVG_IDENTITY AVG_SEQ_LEN SPECTRUM ASSEMBLY EFF_ASM_PARAMS
243 DISSOLVE CROSS)], @args );
245 # First set up some defauts
246 $self->{'_id'} = 'NoName';
247 $self->{'_nof_seq'} = 0;
248 $self->{'_nof_rep'} = 0;
249 $self->{'_max_size'} = 0;
250 $self->{'_nof_overlaps'} = 0;
251 $self->{'_min_overlap'} = undef;
252 $self->{'_min_identity'} = undef;
253 $self->{'_avg_overlap'} = 0;
254 $self->{'_avg_identity'} = 0;
255 $self->{'_avg_seq_len'} = 0;
256 $self->{'_eff_asm_params'} = 0;
257 $self->{'_spectrum'} = {1 => 0}; # contig spectrum hash representation
258 $self->{'_assembly'} = []; # list of assembly, contigs and singlet objects
260 # Then, according to user desires, override defaults
261 $self->{'_id'} = $id if (defined $id);
262 $self->{'_nof_seq'} = $nof_seq if (defined $nof_seq);
263 $self->{'_nof_rep'} = $nof_rep if (defined $nof_rep);
264 $self->{'_max_size'} = $max_size if (defined $max_size);
265 $self->{'_nof_overlaps'} = $nof_overlaps if (defined $nof_overlaps);
266 $self->{'_min_overlap'} = $min_overlap if (defined $min_overlap);
267 $self->{'_avg_overlap'} = $avg_overlap if (defined $avg_overlap);
268 $self->{'_min_identity'} = $min_identity if (defined $min_identity);
269 $self->{'_avg_identity'} = $avg_identity if (defined $avg_identity);
270 $self->{'_avg_seq_len'} = $avg_seq_len if (defined $avg_seq_len);
271 $self->{'_eff_asm_params'} = $eff_asm_params if (defined $eff_asm_params);
273 # Finally get stuff that can be obtained in an automated way
274 $self->_import_spectrum($spectrum) if defined($spectrum);
275 $self->_import_assembly($assembly) if defined($assembly);
276 $self->_import_cross_csp($cross) if defined($cross);
277 if (defined($dissolve)) {
278 my ($mixed_csp, $header) = (@$dissolve[0], @$dissolve[1]);
279 $self->_import_dissolved_csp($mixed_csp, $header);
282 return $self;
286 =head2 id
288 Title : id
289 Usage : $csp->id
290 Function: get/set contig spectrum id
291 Returns : string
292 Args : string [optional]
294 =cut
296 sub id {
297 my ($self, $id) = @_;
298 if (defined $id) {
299 $self->{'_id'} = $id;
301 $id = $self->{'_id'};
302 return $id;
306 =head2 nof_seq
308 Title : nof_seq
309 Usage : $csp->nof_seq
310 Function: get/set the number of sequences making up the contig spectrum
311 Returns : integer
312 Args : integer [optional]
314 =cut
316 sub nof_seq {
317 my ($self, $nof_seq) = @_;
318 if (defined $nof_seq) {
319 $self->throw("The number of sequences must be strictly positive. Got ".
320 "'$nof_seq'") if $nof_seq < 1;
321 $self->{'_nof_seq'} = $nof_seq;
323 $nof_seq = $self->{'_nof_seq'};
324 return $nof_seq;
328 =head2 nof_rep
330 Title : nof_rep
331 Usage : $csp->nof_rep
332 Function: Get/Set the number of repetitions (assemblies) used to create the
333 contig spectrum
334 Returns : integer
335 Args : integer [optional]
337 =cut
339 sub nof_rep {
340 my ($self, $nof_rep) = @_;
341 if (defined $nof_rep) {
342 $self->throw("The number of repetitions must be strictly positive. Got ".
343 "'$nof_rep'") if $nof_rep < 1;
344 $self->{'_nof_rep'} = $nof_rep;
346 $nof_rep = $self->{'_nof_rep'};
347 return $nof_rep;
351 =head2 max_size
353 Title : max_size
354 Usage : $csp->max_size
355 Function: get/set the size of (number of sequences in) the largest contig
356 Returns : integer
357 Args : integer [optional]
359 =cut
361 sub max_size {
362 my ($self, $max_size) = @_;
363 if (defined $max_size) {
364 $self->throw("The contig maximum size must be strictly positive. Got ".
365 "'$max_size'") if $max_size < 1;
366 $self->{'_max_size'} = $max_size;
368 $max_size = $self->{'_max_size'};
369 return $max_size;
373 =head2 nof_overlaps
375 Title : nof_overlaps
376 Usage : $csp->nof_overlaps
377 Function: Get/Set the number of overlaps in the assembly.
378 Returns : integer
379 Args : integer [optional]
381 =cut
383 sub nof_overlaps {
384 my ($self, $nof_overlaps) = @_;
385 if (defined $nof_overlaps) {
386 $self->throw("The number of overlaps must be strictly positive. Got ".
387 "'$nof_overlaps'") if $nof_overlaps < 1;
388 $self->{'_nof_overlaps'} = $nof_overlaps;
390 $nof_overlaps = $self->{'_nof_overlaps'};
391 return $nof_overlaps;
395 =head2 min_overlap
397 Title : min_overlap
398 Usage : $csp->min_overlap
399 Function: get/set the assembly minimum overlap length
400 Returns : integer
401 Args : integer [optional]
403 =cut
405 sub min_overlap {
406 my ($self, $min_overlap) = @_;
407 if (defined $min_overlap) {
408 $self->throw("The minimum of overlap length must be strictly positive. Got".
409 " '$min_overlap'") if $min_overlap < 1;
410 $self->{'_min_overlap'} = $min_overlap;
412 $min_overlap = $self->{'_min_overlap'};
413 return $min_overlap;
417 =head2 avg_overlap
419 Title : avg_overlap
420 Usage : $csp->avg_overlap
421 Function: get/set the assembly average overlap length
422 Returns : decimal
423 Args : decimal [optional]
425 =cut
427 sub avg_overlap {
428 my ($self, $avg_overlap) = @_;
429 if (defined $avg_overlap) {
430 $self->throw("The average overlap length must be strictly positive. Got ".
431 "'$avg_overlap'") if $avg_overlap < 1;
432 $self->{'_avg_overlap'} = $avg_overlap;
434 $avg_overlap = $self->{'_avg_overlap'};
435 return $avg_overlap;
439 =head2 min_identity
441 Title : min_identity
442 Usage : $csp->min_identity
443 Function: get/set the assembly minimum overlap identity percent
444 Returns : 0 < decimal < 100
445 Args : 0 < decimal < 100 [optional]
447 =cut
449 sub min_identity {
450 my ($self, $min_identity) = @_;
451 if (defined $min_identity) {
452 $self->throw("The minimum overlap percent identity must be strictly ".
453 "positive. Got '$min_identity'") if $min_identity < 1;
454 $self->{'_min_identity'} = $min_identity;
456 $min_identity = $self->{'_min_identity'};
457 return $min_identity;
461 =head2 avg_identity
463 Title : avg_identity
464 Usage : $csp->avg_identity
465 Function: get/set the assembly average overlap identity percent
466 Returns : 0 < decimal < 100
467 Args : 0 < decimal < 100 [optional]
469 =cut
471 sub avg_identity {
472 my ($self, $avg_identity) = @_;
473 if (defined $avg_identity) {
474 $self->throw("The average overlap percent identity must be strictly ".
475 "positive. Got '$avg_identity'") if $avg_identity < 1;
476 $self->{'_avg_identity'} = $avg_identity;
478 $avg_identity = $self->{'_avg_identity'};
479 return $avg_identity;
483 =head2 avg_seq_len
485 Title : avg_seq_len
486 Usage : $csp->avg_seq_len
487 Function: get/set the assembly average sequence length
488 Returns : avg_seq_len
489 Args : real [optional]
491 =cut
493 sub avg_seq_len {
494 my ($self, $avg_seq_len) = @_;
495 if (defined $avg_seq_len) {
496 $self->throw("The average sequence length must be strictly positive. Got ".
497 "'$avg_seq_len'") if $avg_seq_len < 1;
498 $self->{'_avg_seq_len'} = $avg_seq_len;
500 $avg_seq_len = $self->{'_avg_seq_len'};
501 return $avg_seq_len;
505 =head2 eff_asm_params
507 Title : eff_asm_params
508 Usage : $csp->eff_asm_params(1)
509 Function: Get/set the effective assembly parameters option. It defines if the
510 effective assembly parameters should be determined when a contig
511 spectrum based or derived from an assembly is calculated. The
512 effective assembly parameters include avg_seq_length, nof_overlaps,
513 min_overlap, avg_overlap, min_identity and avg_identity.
514 1 = get them, 0 = don't.
515 Returns : integer
516 Args : integer [optional]
518 =cut
520 sub eff_asm_params {
521 my ($self, $eff_asm_params) = @_;
522 if (defined $eff_asm_params) {
523 $self->throw("eff_asm_params can only take values 0 or 1. Input value was ".
524 "'$eff_asm_params'") unless $eff_asm_params == 0 || $eff_asm_params == 1;
525 $self->{'_eff_asm_params'} = $eff_asm_params;
527 $eff_asm_params = $self->{'_eff_asm_params'};
528 return $eff_asm_params;
532 =head2 spectrum
534 Title : spectrum
535 Usage : my $spectrum = $csp->spectrum({1=>10, 2=>2, 3=>1});
536 Function: Get the current contig spectrum represented as a hash / Update a
537 contig spectrum object based on a contig spectrum represented as a
538 hash
539 The hash representation of a contig spectrum is as following:
540 key -> contig size (in number of sequences)
541 value -> number of contigs of this size
542 Returns : contig spectrum as a hash reference
543 Args : contig spectrum as a hash reference [optional]
545 =cut
547 sub spectrum {
548 my ($self, $spectrum) = @_;
549 if (defined $spectrum) {
550 $self->_import_spectrum($spectrum);
552 $spectrum = $self->{'_spectrum'};
553 return $spectrum;
557 =head2 assembly
559 Title : assembly
560 Usage : my @obj_list = $csp->assembly();
561 Function: get/set the contig spectrum object by adding an assembly, contig or
562 singlet object to it, or get the list of objects associated with it
563 Returns : arrayref of assembly, contig and singlet objects used in the contig
564 spectrum object (Bio::Assembly::Scaffold, Bio::Assembly::Contig and
565 Bio::Assembly::Singlet objects)
566 Args : Bio::Assembly::Scaffold, Contig or Singlet object
568 =cut
570 sub assembly {
571 my ($self, $assembly) = @_;
572 if (defined $assembly) {
573 $self->_import_assembly($assembly);
575 my @obj_list = @{$self->{'_assembly'}} if defined $self->{'_assembly'};
576 return @obj_list;
580 =head2 drop_assembly
582 Title : drop_assembly
583 Usage : $csp->drop_assembly();
584 Function: Remove all assembly objects associated with a contig spectrum.
585 Assembly objects can take a lot of memory, which can be freed by
586 calling this method. Don't call this method if you need the assembly
587 object later on, for example for creating a dissolved or cross
588 contig spectrum.
589 Returns : 1 for success
590 Args : none
592 =cut
594 sub drop_assembly {
595 my ($self) = @_;
596 $self->{'_assembly'} = [];
597 return 1;
601 =head2 dissolve
603 Title : dissolve
604 Usage : $dissolved_csp->dissolve($mixed_csp, $seq_header);
605 Function: Dissolve a mixed contig spectrum for the set of sequences that
606 contain the specified header, i.e. determine the contribution of
607 these sequences to the mixed contig spectrum. The mixed contig
608 spectrum object must have one or several assembly object(s). In
609 addition, min_overlap, min_identity and eff_asm_params are taken
610 from the mixed contig spectrum, unless they are specified manually
611 for the dissolved contig spectrum. The dissolved contigs underlying
612 the contig spectrum can be obtained by calling the assembly() method.
613 Returns : 1 for success
614 Args : Bio::Assembly::Tools::ContigSpectrum reference
615 sequence header string
617 =cut
619 sub dissolve {
620 my ($self, $mixed_csp, $seq_header) = @_;
621 $self->_import_dissolved_csp($mixed_csp, $seq_header);
622 return 1;
626 =head2 cross
628 Title : cross
629 Usage : $cross_csp->cross($mixed_csp);
630 Function: Calculate a cross contig_spectrum based on a mixed contig_spectrum.
631 The underlying cross-contigs themselves can be obtained by calling
632 the assembly() method.
633 Returns : 1 for success
634 Args : Bio::Assembly::Tools::ContigSpectrum reference
636 =cut
638 sub cross {
639 my ($self, $mixed_csp) = @_;
640 $self->_import_cross_csp($mixed_csp);
641 return 1;
644 =head2 to_string
646 Title : to_string
647 Usage : my $csp_string = $csp->to_string;
648 Function: Convert the contig spectrum into a string (easy to print!!).
649 Returns : string
650 Args : element separator (integer) [optional]
651 1 -> space-separated
652 2 -> tab-separated
653 3 -> newline-separated
655 =cut
657 sub to_string {
658 my ($self, $element_separator) = @_;
659 return 0 if $self->{'_max_size'} == 0;
660 $element_separator ||= 1;
661 if ($element_separator == 1) {
662 $element_separator = ' ';
663 } elsif ($element_separator == 2) {
664 $element_separator = "\t";
665 } elsif ($element_separator == 3) {
666 $element_separator = "\n";
667 } else {
668 $self->throw("Unknown separator type '$element_separator'\n");
670 my $str = '';
671 for (my $q = 1 ; $q <= $self->{'_max_size'} ; $q++) {
672 my $val = 0;
673 if (exists $self->{'_spectrum'}{$q}) {
674 $val = $self->{'_spectrum'}{$q};
676 $str .= $val.$element_separator;
678 $str =~ s/\s$//;
679 return $str;
683 =head2 add
685 Title : add
686 Usage : $csp->add($additional_csp);
687 Function: Add a contig spectrum to an existing one: sums the spectra, update
688 the number of sequences, number of repetitions, ...
689 Returns : 1 for success
690 Args : Bio::Assembly::Tools::ContigSpectrum object
692 =cut
694 sub add {
695 my ($self, $csp) = @_;
696 # Sanity check
697 if( !ref $csp || ! $csp->isa('Bio::Assembly::Tools::ContigSpectrum') ) {
698 $self->throw("Unable to process non Bio::Assembly::Tools::ContigSpectrum ".
699 "object [".ref($csp)."]");
701 # Update overlap statistics
702 if ( $self->{'_eff_asm_params'} > 0 ) {
704 # Warnings
705 if ( $csp->{'_eff_asm_params'} == 0 ) {
706 $self->warn("The parent contig spectrum needs effective assembly ".
707 "parameters (eff_asm_params = ".$self->{'_eff_asm_params'}.") but the ".
708 "child contig spectrum doesn't have them (eff_asm_params = ".
709 $csp->{'_eff_asm_params'}."). Skipping them...");
710 } elsif ( $csp->{'_eff_asm_params'} != $self->{'_eff_asm_params'} ) {
711 $self->warn("The parent contig spectrum needs a different method for ".
712 "detecting the effective assembly parameters (eff_asm_params = ".
713 $self->{'_eff_asm_params'}.") than the one specified for the child ".
714 "contig spectrum (eff_asm_params = ".$csp->{'_eff_asm_params'}."). ".
715 "Ignoring the differences...");
718 # Update existing stats
719 ( $self->{'_avg_overlap'} , $self->{'_avg_identity'}, $self->{'_min_overlap'},
720 $self->{'_min_identity'}, $self->{'_nof_overlaps'} ) = $self->_update_overlap_stats(
721 $self->{'_avg_overlap'} , $self->{'_avg_identity'}, $self->{'_min_overlap'},
722 $self->{'_min_identity'}, $self->{'_nof_overlaps'},
723 $csp->{'_avg_overlap'} , $csp->{'_avg_identity'} , $csp->{'_min_overlap'},
724 $csp->{'_min_identity'} , $csp->{'_nof_overlaps'} );
728 # Update sequence average length (not number of sequences)
729 ( $self->{'_avg_seq_len'} ) = $self->_update_seq_stats(
730 $self->{'_avg_seq_len'}, $self->{'_nof_seq'}, $csp->{'_avg_seq_len'},
731 $csp->{'_nof_seq'} );
733 # Update spectrum (and nof_seq, max_size, and increment nof_rep by 1)
734 $self->_import_spectrum($csp->{'_spectrum'});
736 # Update nof_rep
737 $self->{'_nof_rep'}--;
738 $self->{'_nof_rep'} += $csp->{'_nof_rep'};
739 # Update list of assembly objects used
740 push @{$self->{'_assembly'}}, @{$csp->{'_assembly'}}
741 if defined $csp->{'_assembly'};
743 return 1;
747 =head2 average
749 Title : average
750 Usage : my $avg_csp = $csp->average([$csp1, $csp2, $csp3]);
751 Function: Average one contig spectrum or the sum of several contig spectra by
752 the number of repetitions
753 Returns : Bio::Assembly::Tools::ContigSpectrum
754 Args : Bio::Assembly::Tools::ContigSpectrum array reference
755 eff_asm_params
757 =cut
759 sub average {
760 my ($self, $list) = @_;
761 # Sanity check
762 if ( ! ref $list || ! ref $list eq 'ARRAY') {
763 $self->throw("Average takes an array reference but got [".ref($list)."]");
765 # New average contig spectrum object
766 my $avg = Bio::Assembly::Tools::ContigSpectrum->new;
767 $avg->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
769 # Cycle through contig spectra
770 my $tot_nof_rep = 0;
771 for my $csp (@$list) {
772 # Sanity check
773 if (not $csp->isa('Bio::Assembly::Tools::ContigSpectrum')) {
774 $csp->throw("Unable to process non Bio::Assembly::Tools::ContigSpectrum ".
775 "object [".ref($csp)."]");
777 # Import contig spectrum
778 $avg->add($csp);
781 # Average sum of contig spectra by number of repetitions
782 for (my $q = 1 ; $q <= $avg->{'_max_size'} ; $q++) {
783 $avg->{'_spectrum'}{$q} /= $avg->{'_nof_rep'}
784 if (defined $avg->{'_spectrum'}{$q});
786 # Average number of sequences
787 $avg->{'_nof_seq'} /= $avg->{'_nof_rep'};
788 # Average number of overlaps
789 $avg->{'_nof_overlaps'} /= $avg->{'_nof_rep'};
791 return $avg;
795 =head2 score
797 Title : score
798 Usage : my $score = $csp->score();
799 Function: Score a contig spectrum (or cross-contig spectrum) such that the
800 higher the number of contigs (or cross-contigs) and the larger their
801 size, the higher the score.
802 Let n : total number of sequences
803 c_q : number of contigs of size q
804 q : number of sequence in a contig
805 We define: score = n/(n-1) * (X - 1/n)
806 where X = sum ( c_q * q^2 ) / n**2
807 The score ranges from 0 (singlets only) to 1 (a single large contig)
808 It is possible to specify a value for the number of sequences to
809 assume in the contig spectrum.
810 Returns : contig score, or undef if there were no sequences in the contig spectrum
811 Args : number of total sequences to assume [optional]
813 =cut
815 sub score {
816 my ($self, $nof_seqs) = @_;
817 # Sanity check
818 my $n = $self->nof_seq;
819 return undef if ($n <= 0);
820 # Calculate X
821 my $score = 0;
822 my $q_max = $self->max_size;
823 my $spec = $self->spectrum;
824 for my $q ( 1 .. $q_max ) {
825 my $c_q = $spec->{$q};
826 if ( $q == 1 && $nof_seqs ) {
827 $c_q += $nof_seqs - $n;
828 $n = $nof_seqs;
830 next if not $c_q;
831 $score += $c_q * $q ** 2;
833 $score /= $n ** 2;
834 # Rescale X to obtain the score
835 $score = $n/($n-1) * ($score - 1/$n);
836 return $score;
840 =head2 _naive_assembler
842 Title : _naive_assembler
843 Usage :
844 Function: Reassemble the specified sequences only based on their position in
845 the contig. This naive assembly only verifies that the minimum
846 overlap length and percentage identity are respected. No actual
847 alignment is done
848 Returns : arrayref of contigs and singlets
849 Args : Bio::Assembly::Contig
850 array reference of sequence IDs to use [optional]
851 minimum overlap length (integer) [optional]
852 minimum percentage identity (integer) [optional]
854 =cut
856 sub _naive_assembler {
857 my ($self, $contig, $seqlist, $min_overlap, $min_identity) = @_;
859 # Use all reads if none was specified:
860 if (not defined $seqlist) {
861 for my $seq ($contig->each_seq) {
862 push @$seqlist, $seq->id;
866 # Sanity checks
867 if ( ! ref $seqlist || ! ref($seqlist) eq 'ARRAY') {
868 $self->throw('Expecting an array reference. Got ['.ref($seqlist)."] \n");
870 my $max = scalar @$seqlist;
871 $self->throw("Expecting at least 2 sequences as input for _naive_assembler")
872 if ($max < 2);
874 # Build contig graph
875 my %seq_hash = map { $_ => undef } (@$seqlist) if (scalar @$seqlist > 0);
876 my ($g, $overlaps) = $self->_contig_graph($contig, \%seq_hash, $min_overlap, $min_identity);
878 # Construct sub-contigs
879 my @contig_objs;
880 my $num = 1;
881 if (defined $g) {
882 for my $connected_reads ($g->connected_components) { # reads that belong in contigs
883 my $sub_id = $contig->id.'_'.$num;
884 my $sub_contig = $self->_create_subcontig($contig, $connected_reads, $sub_id);
885 push @contig_objs, $sub_contig;
886 $num++;
887 for my $read_id ( @$connected_reads ) {
888 delete $seq_hash{$read_id};
893 # Construct sub-singlets
894 my @singlet_objs;
895 for my $read_id ( keys %seq_hash ) {
896 my $read = $contig->get_seq_by_name($read_id);
897 my $sub_singlet = Bio::Assembly::Singlet->new(
898 -id => $contig->id.'_'.$num,
899 -seqref => $self->_obj_copy($read)
901 $num++;
902 push @singlet_objs, $sub_singlet;
905 return [@contig_objs, @singlet_objs];
909 =head2 _create_subcontig
911 Title : _create_subcontig
912 Usage :
913 Function: Create a subcontig from another contig
914 Returns : Bio::Assembly::Contig object
915 Args : Bio::Assembly::Contig
916 arrayref of the IDs of the reads to includes in the subcontig
917 ID to give to the subcontig
919 =cut
921 sub _create_subcontig {
922 my ($self, $contig, $read_ids, $sub_contig_id) = @_;
924 my $sub_contig = Bio::Assembly::Contig->new( -id => $sub_contig_id );
926 # Get min and max read coordinates
927 my ($min, $max) = (undef, undef);
928 for my $read_id ( @$read_ids ) {
929 my ($aln_coord) = $contig->get_features_collection->get_features_by_type("_aligned_coord:$read_id");
930 my $seq_start = $aln_coord->location->start;
931 my $seq_end = $aln_coord->location->end;
932 $min = $seq_start if (not defined $min) || ((defined $min) && ($seq_start < $min));
933 $max = $seq_end if (not defined $max) || ((defined $max) && ($seq_end > $max));
936 # Add reads to subcontig
937 for my $read_id (@$read_ids) {
938 my $read = $self->_obj_copy($contig->get_seq_by_name($read_id));
939 my $coord = $self->_obj_copy($contig->get_seq_coord($read));
940 if ($min > 1) {
941 # adjust read coordinates
942 $coord->start( $coord->start - $min + 1 );
943 $coord->end( $coord->end - $min + 1 );
945 $sub_contig->set_seq_coord($coord, $read);
948 # Truncate copy of original consensus to new boundaries
949 my $cons_seq = $contig->get_consensus_sequence;
950 $sub_contig->set_consensus_sequence( $self->_obj_copy($cons_seq, $min, $max) );
951 my $cons_qual = $contig->get_consensus_quality;
952 if ($cons_qual) {
953 $sub_contig->set_consensus_quality( $self->_obj_copy($cons_qual, $min, $max) );
956 return $sub_contig;
959 =head2 _obj_copy
961 Title : _obj_copy
962 Usage :
963 Function: Copy (most of) an object, and optionally truncate it
964 Returns : another a Bio::LocatableSeq, Bio::Seq::PrimaryQual, or
965 Bio::SeqFeature::Generic object
966 Args : a Bio::LocatableSeq, Bio::Seq::PrimaryQual, or
967 Bio::SeqFeature::Generic object
968 a start position
969 an end position
971 =cut
973 sub _obj_copy {
974 my ($self, $obj, $start, $end) = @_;
975 my $new;
976 if ($obj->isa('Bio::Seq::PrimaryQual')) {
977 my $qual = [@{$obj->qual}]; # copy of the quality scores
978 if (defined $start && defined $end && $start !=1 && $end != scalar(@$qual)) {
979 # Truncate the quality scores
980 $qual = [ splice @$qual, $start - 1, $end - $start + 1 ];
982 $new = Bio::Seq::PrimaryQual->new(
983 -qual => $qual,
984 -id => $obj->id,
987 } elsif ($obj->isa('Bio::LocatableSeq')) {
988 my $seq = $obj->seq;
989 if (defined $start && defined $end && $start != 1 && $end != length($seq)) {
990 # Truncate the aligned sequence
991 $seq = substr $seq, $start - 1, $end - $start + 1;
993 $new = Bio::LocatableSeq->new(
994 -seq => $seq,
995 -id => $obj->id,
996 -start => $obj->start,
997 -strand => $obj->strand,
998 -alphabet => $obj->alphabet,
1001 } elsif ($obj->isa('Bio::SeqFeature::Generic')) {
1002 $new = Bio::SeqFeature::Generic->new(
1003 -start => $obj->start,
1004 -end => $obj->end
1008 return $new;
1012 =head2 _new_from_assembly
1014 Title : _new_from_assembly
1015 Usage :
1016 Function: Creates a new contig spectrum object based solely on the result of
1017 an assembly, contig or singlet
1018 Returns : Bio::Assembly::Tools::ContigSpectrum object
1019 Args : Bio::Assembly::Scaffold, Contig or Singlet object
1021 =cut
1023 sub _new_from_assembly {
1024 # Create new contig spectrum object based purely on what we can get from the
1025 # assembly object
1026 my ($self, $assemblyobj) = @_;
1027 my $csp = Bio::Assembly::Tools::ContigSpectrum->new();
1028 # 1: Set id
1029 $csp->{'_id'} = $assemblyobj->id;
1030 # 2: Set overlap statistics: nof_overlaps, min_overlap, avg_overlap,
1031 # min_identity and avg_identity
1032 $csp->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
1033 $csp->{'_min_overlap'} = $self->{'_min_overlap'};
1034 $csp->{'_min_identity'} = $self->{'_min_identity'};
1035 if ( $csp->{'_eff_asm_params'} > 0 ) {
1036 ( $csp->{'_avg_overlap'}, $csp->{'_avg_identity'}, $csp->{'_min_overlap'},
1037 $csp->{'_min_identity'}, $csp->{'_nof_overlaps'} )
1038 = $csp->_get_assembly_overlap_stats($assemblyobj);
1040 # 3: Set sequence statistics: nof_seq and avg_seq_len
1041 ($csp->{'_avg_seq_len'}, $csp->{'_nof_seq'}) = $self->_get_assembly_seq_stats($assemblyobj);
1042 ### any use in using _naive_assembler here to re-assemble with specific minmum criteria?
1043 # 4: Set the spectrum: spectrum and max_size
1044 for my $contigobj ( $self->_get_contig_like($assemblyobj) ) {
1045 my $size = $contigobj->num_sequences;
1046 if (defined $csp->{'_spectrum'}{$size}) {
1047 $csp->{'_spectrum'}{$size}++;
1048 } else {
1049 $csp->{'_spectrum'}{$size} = 1;
1051 $csp->{'_max_size'} = $size if $size > $csp->{'_max_size'};
1053 # 5: Set list of assembly objects used
1054 push @{$csp->{'_assembly'}}, $assemblyobj;
1055 # 6: Set number of repetitions
1056 $csp->{'_nof_rep'} = 1;
1057 return $csp;
1061 =head2 _new_dissolved_csp
1063 Title : _new_dissolved_csp
1064 Usage :
1065 Function: create a dissolved contig spectrum object
1066 Returns : dissolved contig spectrum
1067 Args : mixed contig spectrum
1068 header of sequences to keep in this contig spectrum
1070 =cut
1072 sub _new_dissolved_csp {
1073 my ($self, $mixed_csp, $seq_header) = @_;
1074 # Sanity checks on the mixed contig spectrum
1076 # min_overlap and min_identity must be specified if there are some overlaps
1077 # in the mixed contig
1078 if ($mixed_csp->{'_nof_overlaps'} > 0) {
1079 unless ( defined $self->{'_min_overlap'} ||
1080 defined $mixed_csp->{'_min_overlap'} ) {
1081 $self->throw("min_overlap must be defined in the dissolved contig spectrum".
1082 " or mixed contig spectrum to dissolve a contig");
1084 unless ( defined $self->{'_min_identity'} ||
1085 defined $mixed_csp->{'_min_identity'} ) {
1086 $self->throw("min_identity must be defined in the dissolved contig spectrum".
1087 " or mixed contig spectrum");
1091 # there must be at least one assembly in mixed contig spectrum
1092 if (!defined $mixed_csp->{'_assembly'} ||
1093 scalar @{$mixed_csp->{'_assembly'}} < 1) {
1094 $self->throw("The mixed contig spectrum must be based on at least one
1095 assembly");
1098 # New dissolved contig spectrum object
1099 my $dissolved = Bio::Assembly::Tools::ContigSpectrum->new();
1101 # Take attributes of the parent contig spectrum if they exist, or those of the
1102 # mixed contig spectrum otherwise
1103 my ($eff_asm_params, $min_overlap, $min_identity);
1104 if ($self->{'_eff_asm_params'}) {
1105 $eff_asm_params = $self->{'_eff_asm_params'};
1106 } else {
1107 $eff_asm_params = $mixed_csp->{'_eff_asm_params'};
1109 if ($self->{'_min_overlap'}) {
1110 $min_overlap = $self->{'_min_overlap'};
1111 } else {
1112 $min_overlap = $mixed_csp->{'_min_overlap'};
1114 if ($self->{'_min_identity'}) {
1115 $min_identity = $self->{'_min_identity'};
1116 } else {
1117 $min_identity = $mixed_csp->{'_min_identity'};
1119 ($dissolved->{'_eff_asm_params'}, $dissolved->{'_min_overlap'},
1120 $dissolved->{'_min_identity'}) = ($eff_asm_params, $min_overlap,
1121 $min_identity);
1123 # Dissolve each assembly
1124 for my $obj (@{$mixed_csp->{'_assembly'}}) {
1125 for my $contig ( $self->_get_contig_like($obj) ) {
1127 # Dissolve this assembly/contig/singlet for the given sequences
1128 my $dissolved_objs = $self->_dissolve_contig( $contig, $seq_header,
1129 $min_overlap, $min_identity );
1131 # Add dissolved contigs to contig spectrum
1132 for my $dissolved_obj (@$dissolved_objs) {
1133 $dissolved->assembly($dissolved_obj);
1134 $dissolved->{'_nof_rep'}--;
1140 # Update nof_rep
1141 $dissolved->{'_nof_rep'} += $mixed_csp->{'_nof_rep'};
1143 return $dissolved;
1147 =head2 _dissolve_contig
1149 Title : _dissolve_contig
1150 Usage :
1151 Function: dissolve a contig
1152 Returns : arrayref of contigs and singlets
1153 Args : mixed contig spectrum
1154 header of sequences to keep in this contig spectrum
1155 minimum overlap
1156 minimum identity
1158 =cut
1160 sub _dissolve_contig {
1161 my ($self, $contig, $wanted_origin, $min_overlap, $min_identity) = @_;
1163 # List of reads
1164 my @seqs;
1165 if ($contig->isa('Bio::Assembly::Singlet')) {
1166 @seqs = $contig->seqref;
1167 } elsif ($contig->isa('Bio::Assembly::Contig')) {
1168 @seqs = $contig->each_seq;
1171 # Get sequences from the desired metagenome
1172 my @contig_seqs;
1173 for my $seq (@seqs) {
1174 my $seq_id = $seq->id;
1175 # get sequence origin
1176 next unless $self->_seq_origin($seq_id) eq $wanted_origin;
1177 # add it to hash
1178 push @contig_seqs, $seq_id;
1181 # Update spectrum
1182 my $size = scalar @contig_seqs;
1183 my $objs;
1184 if ($size == 1) {
1185 # create a singlet and add it to list of objects
1186 my $id = $contig_seqs[0];
1187 my $seq = $contig->get_seq_by_name($id);
1188 push @$objs, Bio::Assembly::Singlet->new(-id => $contig->id, -seqref => $self->_obj_copy($seq) );
1189 } elsif ($size > 1) {
1190 # Reassemble good sequences
1191 my $contig_objs = $self->_naive_assembler( $contig, \@contig_seqs,
1192 $min_overlap, $min_identity );
1193 push @$objs, @$contig_objs;
1196 return $objs;
1200 =head2 _new_cross_csp
1202 Title : _new_cross_csp
1203 Usage :
1204 Function: create a cross contig spectrum object
1205 Returns : cross-contig spectrum
1206 Args : mixed contig spectrum
1208 =cut
1210 sub _new_cross_csp {
1211 my ($self, $mixed_csp) = @_;
1212 # Sanity check on the mixed contig spectrum
1213 # There must be at least one assembly
1214 if (!defined $mixed_csp->{'_assembly'} ||
1215 scalar @{$mixed_csp->{'_assembly'}} < 1) {
1216 $self->throw("The mixed contig spectrum must be based on at least one ".
1217 "assembly.");
1220 # New dissolved contig spectrum object
1221 my $cross = Bio::Assembly::Tools::ContigSpectrum->new();
1223 # Take attributes from parent or from mixed contig spectrums
1224 my ($eff_asm_params, $min_overlap, $min_identity);
1225 if ($self->{'_eff_asm_params'}) {
1226 $eff_asm_params = $self->{'_eff_asm_params'};
1227 } else {
1228 $eff_asm_params = $mixed_csp->{'_eff_asm_params'};
1230 if ($self->{'_min_overlap'}) {
1231 $min_overlap = $self->{'_min_overlap'};
1232 } else {
1233 $min_overlap = $mixed_csp->{'_min_overlap'};
1235 if ($self->{'_min_identity'}) {
1236 $min_identity = $self->{'_min_identity'};
1237 } else {
1238 $min_identity = $mixed_csp->{'_min_identity'};
1240 ($cross->{'_eff_asm_params'},$cross->{'_min_overlap'},$cross->{'_min_identity'})
1241 = ($eff_asm_params, $min_overlap, $min_identity);
1243 # Get cross contig spectrum for each assembly
1244 for my $obj ( @{$mixed_csp->{'_assembly'}} ) {
1245 for my $contig ( $self->_get_contig_like($obj) ) {
1247 # Go through contigs and skip the pure ones
1248 my ($cross_contigs, $nof_cross_singlets) = $self->_cross_contig($contig,
1249 $min_overlap, $min_identity);
1251 # Add cross-contig
1252 for my $cross_contig ( @$cross_contigs ) {
1253 $cross->assembly($cross_contig);
1254 $cross->{'_nof_rep'}--;
1257 # Add cross-singlets
1258 $cross->{'_spectrum'}->{'1'} += $nof_cross_singlets;
1263 # Update nof_rep
1264 $cross->{'_nof_rep'} += $mixed_csp->{'_nof_rep'};
1266 return $cross;
1270 =head2 _cross_contig
1272 Title : _cross_contig
1273 Usage :
1274 Function: calculate cross contigs
1275 Returns : arrayref of cross-contigs
1276 number of cross-singlets
1277 Args : contig
1278 minimum overlap
1279 minimum identity
1281 =cut
1283 sub _cross_contig {
1284 my ($self, $contig, $min_overlap, $min_identity) = @_;
1286 my $nof_cross_singlets = 0;
1287 my @cross_contigs;
1289 # Weed out pure contigs
1290 my %all_origins;
1291 for my $seq ($contig->each_seq) {
1292 my $seq_id = $seq->id;
1293 my $seq_origin = $self->_seq_origin($seq_id);
1294 if (not defined $seq_origin) {
1295 $self->warn("Sequence $seq_id doesn't have any header. Skipping it...");
1296 next;
1298 if ( scalar keys %all_origins > 1 ) {
1299 # a cross-contig spectrum
1300 last;
1302 $all_origins{$seq_origin} = undef;
1304 if ( scalar keys %all_origins <= 1 ) {
1305 # a pure contig
1306 return \@cross_contigs, $nof_cross_singlets;
1308 %all_origins = ();
1310 # Break the cross-contigs using the specified stringency
1311 my $test_contigs = $self->_naive_assembler($contig, undef, $min_overlap, $min_identity);
1313 # Find cross contigs and singlets
1314 for my $test_contig ( @$test_contigs ) {
1316 # Find cross-contigs
1317 my %origins;
1318 for my $seq ($test_contig->each_seq) {
1319 my $seq_id = $seq->id;
1320 my $seq_origin = $self->_seq_origin($seq_id);
1321 next if not defined $seq_origin;
1322 push @{$origins{$seq_origin}}, $seq_id;
1324 if (scalar keys %origins > 1) {
1325 # Found a cross-contig
1326 push @cross_contigs, $test_contig;
1327 } else {
1328 next;
1331 # Find cross-singlets
1332 for my $origin (keys %origins) {
1333 my @ori_ids = @{$origins{$origin}};
1334 if (scalar @ori_ids == 1) {
1335 $nof_cross_singlets++;
1336 } elsif (scalar @ori_ids > 1) {
1337 # Dissolve contig for the given origin
1339 ### consider using the minimum overlap and identity here again?
1340 my $ori_contigs = $self->_naive_assembler($test_contig, \@ori_ids, undef, undef);
1342 for my $ori_contig (@$ori_contigs) {
1343 $nof_cross_singlets++ if $ori_contig->num_sequences == 1;
1351 return \@cross_contigs, $nof_cross_singlets;
1355 =head2 _seq_origin
1357 Title : _seq_origin
1358 Usage :
1359 Function: determines where a sequence comes from using its header. For example
1360 the origin of the sequence 'metagenome1|gi|9626988|ref|NC_001508.1|'
1361 is 'metagenome1'
1362 Returns : origin
1363 Args : sequence ID
1365 =cut
1367 sub _seq_origin {
1368 # Current sequence origin. Example: sequence with ID
1369 # 'metagenome1|gi|9626988|ref|NC_001508.1|' has header 'metagenome1'
1370 my ($self, $seq_id) = @_;
1371 my $origin;
1372 if ( $seq_id =~ m/^(.+?)\|/ ) {
1373 $origin = $1;
1375 return $origin;
1379 =head2 _import_assembly
1381 Title : _import_assembly
1382 Usage : $csp->_import_assembly($assemblyobj);
1383 Function: Update a contig spectrum object based on an assembly, contig or
1384 singlet object
1385 Returns : 1 for success
1386 Args : Bio::Assembly::Scaffold, Contig or Singlet object
1388 =cut
1390 sub _import_assembly {
1391 my ($self, $assemblyobj) = @_;
1392 # Sanity check
1393 if ( ! ref $assemblyobj ||
1394 ( ! $assemblyobj->isa('Bio::Assembly::ScaffoldI') &&
1395 ! $assemblyobj->isa('Bio::Assembly::Contig') )) {
1396 $self->throw("Unable to process non Bio::Assembly::ScaffoldI, Contig or ".
1397 "Singlet object [".ref($assemblyobj)."]");
1399 # Create new object from assembly
1400 my $csp = $self->_new_from_assembly($assemblyobj);
1402 # Update current contig spectrum object with new one
1403 $self->add($csp);
1405 return 1;
1409 =head2 _import_spectrum
1411 Title : _import_spectrum
1412 Usage : $csp->_import_spectrum({ 1 => 90 , 2 => 3 , 4 => 1 })
1413 Function: update a contig spectrum object based on a contig spectrum
1414 represented as a hash (key: contig size, value: number of contigs of
1415 this size)
1416 Returns : 1 for success
1417 Args : contig spectrum as a hash reference
1419 =cut
1421 sub _import_spectrum {
1422 my ($self, $spectrum) = @_;
1423 # Sanity check
1424 if( ! ref $spectrum || ! ref $spectrum eq 'HASH') {
1425 $self->throw("Spectrum should be a hash reference, but it is [".
1426 ref($spectrum)."]");
1429 # Update the spectrum (+ nof_rep, max_size and nof_seq)
1430 for my $size (keys %$spectrum) {
1431 # Get the number of contigs of different size
1432 if (defined $self->{'_spectrum'}{$size}) {
1433 $self->{'_spectrum'}{$size} += $$spectrum{$size};
1434 } else {
1435 $self->{'_spectrum'}{$size} = $$spectrum{$size};
1437 # Update nof_seq
1438 $self->{'_nof_seq'} += $size * $$spectrum{$size};
1439 # Update max_size
1440 $self->{'_max_size'} = $size if $size > $self->{'_max_size'};
1443 # If the contig spectrum has only zero 1-contigs, max_size is zero
1444 $self->{'_max_size'} = 0 if scalar keys %{$self->{'_spectrum'}} == 1 &&
1445 defined $self->{'_spectrum'}{'1'} && $self->{'_spectrum'}{'1'} == 0;
1447 # Update nof_rep
1448 $self->{'_nof_rep'}++;
1449 return 1;
1452 =head2 _import_dissolved_csp
1454 Title : _import_dissolved_csp
1455 Usage : $csp->_import_dissolved_csp($mixed_csp, $seq_header);
1456 Function: Update a contig spectrum object by dissolving a mixed contig
1457 spectrum based on the header of the sequences
1458 Returns : 1 for success
1459 Args : Bio::Assembly::Tools::ContigSpectrum
1460 sequence header string
1462 =cut
1464 sub _import_dissolved_csp {
1465 my ($self, $mixed_csp, $seq_header) = @_;
1466 # Sanity check
1467 if (not defined $mixed_csp || not defined $seq_header) {
1468 $self->throw("Expecting a contig spectrum reference and sequence header as".
1469 " arguments");
1471 # Create new object from assembly
1472 my $dissolved_csp = $self->_new_dissolved_csp($mixed_csp, $seq_header);
1473 # Update current contig spectrum object with new one
1474 $self->add($dissolved_csp);
1475 return 1;
1479 =head2 _import_cross_csp
1481 Title : _import_cross_csp
1482 Usage : $csp->_import_cross_csp($mixed_csp);
1483 Function: Update a contig spectrum object by calculating the cross contig
1484 spectrum based on a mixed contig spectrum
1485 Returns : 1 for success
1486 Args : Bio::Assembly::Tools::ContigSpectrum
1488 =cut
1490 sub _import_cross_csp {
1491 my ($self, $mixed_csp) = @_;
1492 # Sanity check
1493 if (not defined $mixed_csp) {
1494 $self->throw("Expecting a contig spectrum reference as argument");
1497 # Create new object from assembly
1498 my $cross_csp = $self->_new_cross_csp($mixed_csp);
1499 my $nof_1_cross_contigs = $cross_csp->spectrum->{1};
1501 # Update current contig spectrum object with new one
1502 $self->add($cross_csp);
1504 # Remove 1-contigs
1505 $self->{'_nof_seq'} -= $nof_1_cross_contigs;
1507 return 1;
1510 =head2 _get_contig_like
1512 Title : _get_contig_like
1513 Usage : my @contig_like_objs = $csp->_get_contig_like($assembly_obj);
1514 Function: Get contigs and singlets from an assembly, contig or singlet
1515 Returns : array of Bio::Assembly::Contig and Singlet objects
1516 Args : a Bio::Assembly::Scaffold, Contig or singlet object
1518 =cut
1520 sub _get_contig_like {
1521 my ($self, $assembly_obj) = @_;
1522 my @contig_objs;
1523 if ($assembly_obj->isa('Bio::Assembly::ScaffoldI')) {
1524 # all contigs and singlets in the scaffold
1525 push @contig_objs, ($assembly_obj->all_contigs, $assembly_obj->all_singlets);
1526 } else {
1527 # a contig or singlet
1528 @contig_objs = $assembly_obj;
1530 return @contig_objs;
1534 =head2 _get_assembly_seq_stats
1536 Title : _get_assembly_seq_stats
1537 Usage : my $seqlength = $csp->_get_assembly_seq_stats($assemblyobj);
1538 Function: Get sequence statistics from an assembly:
1539 average sequence length, number of sequences
1540 Returns : average sequence length (decimal)
1541 number of sequences (integer)
1542 Args : Bio::Assembly::Scaffold, Contig or singlet object
1543 hash reference with the IDs of the sequences to consider [optional]
1545 =cut
1547 sub _get_assembly_seq_stats {
1548 my ($self, $assemblyobj, $seq_hash) = @_;
1550 # Sanity checks
1551 if ( !defined $assemblyobj ||
1552 ( !$assemblyobj->isa('Bio::Assembly::ScaffoldI') &&
1553 !$assemblyobj->isa('Bio::Assembly::Contig') ) ) {
1554 $self->throw("Must provide a Bio::Assembly::Scaffold, Contig or Singlet object");
1556 $self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
1557 if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');
1559 # Update sequence stats
1560 my @asm_stats = (0,0);
1561 # asm_stats = (avg_seq_len, nof_seq)
1562 for my $contigobj ( $self->_get_contig_like($assemblyobj) ) {
1563 @asm_stats = $self->_update_seq_stats( @asm_stats,
1564 $self->_get_contig_seq_stats($contigobj, $seq_hash) );
1567 return @asm_stats;
1570 =head2 _get_contig_seq_stats
1572 Title : _get_contig_seq_stats
1573 Usage : my $seqlength = $csp->_get_contig_seq_stats($contigobj);
1574 Function: Get sequence statistics from a contig:
1575 average sequence length, number of sequences
1576 Returns : average sequence length (decimal)
1577 number of sequences (integer)
1578 Args : contig object reference
1579 hash reference with the IDs of the sequences to consider [optional]
1581 =cut
1583 sub _get_contig_seq_stats {
1584 my ($self, $contigobj, $seq_hash) = @_;
1585 my @contig_stats = (0, 0);
1586 # contig_stats = (avg_length, nof_seq)
1587 for my $seqobj ($contigobj->each_seq) {
1588 next if defined $seq_hash && !defined $$seq_hash{$seqobj->id};
1589 my $seq_string;
1590 if ($contigobj->isa('Bio::Assembly::Singlet')) { # a singlet
1591 $seq_string = $contigobj->seqref->seq;
1592 } else { # a contig
1593 $seq_string = $seqobj->seq;
1595 # Number of non-gap characters in the sequence
1596 my $seq_len = $seqobj->_ungapped_len;
1597 my @seq_stats = ($seq_len);
1598 @contig_stats = $self->_update_seq_stats(@contig_stats, @seq_stats);
1600 return @contig_stats;
1604 =head2 _update_seq_stats
1606 Title : _update_seq_stats
1607 Usage :
1608 Function: Update the number of sequences and their average length 1
1609 average identity 1
1610 minimum length 1
1611 minimum identity 1
1612 number of overlaps 1 average sequence length
1613 Returns : average sequence length
1614 number of sequences
1615 Args : average sequence length 1
1616 number of sequences 1
1617 average sequence length 2
1618 number of sequences 2
1620 =cut
1622 sub _update_seq_stats {
1623 my ($self, $p_avg_length, $p_nof_seq, $n_avg_length, $n_nof_seq) = @_;
1624 # Defaults
1625 if (not defined $n_nof_seq) {
1626 $n_nof_seq = 1;
1628 # Update overlap statistics
1629 my $avg_length = 0;
1630 my $nof_seq = $p_nof_seq + $n_nof_seq;
1631 if ($nof_seq != 0) {
1632 $avg_length = ($p_avg_length * $p_nof_seq + $n_avg_length * $n_nof_seq) / $nof_seq;
1634 return $avg_length, $nof_seq;
1638 =head2 _get_assembly_overlap_stats
1640 Title : _get_assembly_overlap_stats
1641 Usage : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
1642 = $csp->_get_assembly_overlap_stats($assemblyobj);
1643 Function: Get statistics about pairwise overlaps in contigs of an assembly
1644 Returns : average overlap length
1645 average identity percent
1646 minimum overlap length
1647 minimum identity percent
1648 number of overlaps
1649 Args : Bio::Assembly::Scaffold, Contig or Singlet object
1650 hash reference with the IDs of the sequences to consider [optional]
1652 =cut
1654 sub _get_assembly_overlap_stats {
1655 my ($self, $assembly_obj, $seq_hash) = @_;
1657 # Sanity check
1658 if ( !defined $assembly_obj ||
1659 ( !$assembly_obj->isa('Bio::Assembly::ScaffoldI') &&
1660 !$assembly_obj->isa('Bio::Assembly::Contig') ) ) {
1661 $self->throw("Must provide a Bio::Assembly::ScaffoldI, Contig or Singlet object");
1663 $self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
1664 if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');
1666 # Look at all the contigs (no singlets!)
1667 my @asm_stats = (0, 0, undef, undef, 0);
1668 # asm_stats = (avg_length, avg_identity, min_length, min_identity, nof_overlaps)
1669 for my $contig_obj ( $self->_get_contig_like($assembly_obj) ) {
1670 @asm_stats = $self->_update_overlap_stats( @asm_stats,
1671 $self->_get_contig_overlap_stats($contig_obj, $seq_hash) );
1674 return @asm_stats;
1678 =head2 _get_contig_overlap_stats
1680 Title : _get_contig_overlap_stats
1681 Usage : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
1682 = $csp->_get_contig_overlap_stats($contigobj);
1683 Function: Get statistics about pairwise overlaps in a contig or singlet. The
1684 statistics are obtained using graph theory: each read is a node
1685 and the edges between 2 reads are weighted by minus the number of
1686 conserved residues in the alignment between the 2 reads. The
1687 minimum spanning tree of this graph represents the overlaps that
1688 form the contig. Overlaps that do not satisfy the minimum overlap
1689 length and similarity get a malus on their score.
1690 Note: This function requires the optional BioPerl dependency
1691 module called 'Graph'
1692 Returns : average overlap length
1693 average identity percent
1694 minimum overlap length
1695 minimum identity percent
1696 number of overlaps
1697 Args : Bio::Assembly::Contig or Singlet object
1698 hash reference with the IDs of the sequences to consider [optional]
1700 =cut
1702 sub _get_contig_overlap_stats {
1703 my ($self, $contig_obj, $seq_hash) = @_;
1705 # Sanity check
1706 $self->throw("Must provide a Bio::Assembly::Contig object")
1707 if (!defined $contig_obj || !$contig_obj->isa("Bio::Assembly::Contig"));
1708 $self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
1709 if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');
1711 my @contig_stats = (0, 0, undef, undef, 0);
1712 # contig_stats = (avg_length, avg_identity, min_length, min_identity, nof_overlaps)
1714 # Build contig graph
1715 ### consider providing the minima to _contig_graph here too?
1716 my ($g, $overlaps) = $self->_contig_graph($contig_obj, $seq_hash);
1718 if ( defined $g ) {
1719 # Graph minimum spanning tree (tree that goes through strongest overlaps)
1720 $g = $g->MST_Kruskal();
1722 # Calculate minimum overlap length and identity for this contig
1723 for my $edge ( $g->edges ) {
1724 # Retrieve overlap information
1725 my ($id1, $id2) = @$edge;
1726 if (not exists $$overlaps{$id1}{$id2}) {
1727 ($id2, $id1) = @$edge;
1729 my ($score, $length, $identity) = @{$$overlaps{$id1}{$id2}};
1730 # Update contig stats
1731 my @overlap_stats = ($length, $identity);
1732 @contig_stats = $self->_update_overlap_stats(@contig_stats, @overlap_stats);
1736 return @contig_stats;
1740 =head2 _update_overlap_stats
1742 Title : _update_overlap_stats
1743 Usage :
1744 Function: update the number of overlaps and their minimum and average length
1745 and identity
1746 Returns :
1747 Args : average length 1
1748 average identity 1
1749 minimum length 1
1750 minimum identity 1
1751 number of overlaps 1
1752 average length 2
1753 average identity 2
1754 minimum length 2
1755 minimum identity 2
1756 number of overlaps 2
1758 =cut
1760 sub _update_overlap_stats {
1761 my ($self,
1762 $p_avg_length, $p_avg_identity, $p_min_length, $p_min_identity, $p_nof_overlaps,
1763 $n_avg_length, $n_avg_identity, $n_min_length, $n_min_identity, $n_nof_overlaps)
1764 = @_;
1766 # Defaults
1767 if (not defined $n_nof_overlaps) { $n_nof_overlaps = 1 };
1768 if ((not defined $n_min_length) && ($n_avg_length != 0)) { $n_min_length = $n_avg_length };
1769 if ((not defined $n_min_identity) && ($n_avg_identity != 0)) { $n_min_identity = $n_avg_identity };
1771 # Update overlap statistics
1772 my ($avg_length, $avg_identity, $min_length, $min_identity, $nof_overlaps)
1773 = (0, 0, undef, undef, 0);
1774 $nof_overlaps = $p_nof_overlaps + $n_nof_overlaps;
1775 if ($nof_overlaps > 0) {
1776 $avg_length = ($p_avg_length * $p_nof_overlaps + $n_avg_length * $n_nof_overlaps) / $nof_overlaps;
1777 $avg_identity = ($p_avg_identity * $p_nof_overlaps + $n_avg_identity * $n_nof_overlaps) / $nof_overlaps;
1780 if ( not defined $p_min_length ) {
1781 $min_length = $n_min_length;
1782 } elsif ( not defined $n_min_length ) {
1783 $min_length = $p_min_length;
1784 } else { # both values are defined
1785 if ($n_min_length < $p_min_length) {
1786 $min_length = $n_min_length;
1787 } else {
1788 $min_length = $p_min_length;
1792 if ( not defined $p_min_identity ) {
1793 $min_identity = $n_min_identity;
1794 } elsif ( not defined $n_min_identity ) {
1795 $min_identity = $p_min_identity;
1796 } else { # both values are defined
1797 if ($n_min_identity < $p_min_identity) {
1798 $min_identity = $n_min_identity;
1799 } else {
1800 $min_identity = $p_min_identity;
1804 return $avg_length, $avg_identity, $min_length, $min_identity, $nof_overlaps;
1808 =head2 _overlap_alignment
1810 Title : _overlap_alignment
1811 Usage :
1812 Function: Produce an alignment of the overlapping section of two sequences of
1813 a contig. Minimum overlap length and percentage identity can be
1814 specified. Return undef if the sequences do not overlap or do not
1815 meet the minimum overlap criteria.
1816 Return : Bio::SimpleAlign object reference
1817 alignment overlap length
1818 alignment overlap identity
1819 Args : Bio::Assembly::Contig object reference
1820 Bio::LocatableSeq contig sequence 1
1821 Bio::LocatableSeq contig sequence 2
1822 minium overlap length [optional]
1823 minimum overlap identity percentage[optional]
1825 =cut
1827 sub _overlap_alignment {
1828 my ($self, $contig, $qseq, $tseq, $min_overlap, $min_identity) = @_;
1829 # get query and target sequence position
1830 my $qpos = $contig->get_seq_coord($qseq);
1831 my $tpos = $contig->get_seq_coord($tseq);
1832 # check that there is an overlap
1833 my $qend = $qpos->end;
1834 my $tstart = $tpos->start;
1835 return if $qend < $tstart;
1836 my $qstart = $qpos->start;
1837 my $tend = $tpos->end;
1838 return if $qstart > $tend;
1839 # get overlap boundaries and check overlap length
1840 my $left;
1841 if ($qstart >= $tstart) {
1842 $left = $qstart
1843 } else {
1844 $left = $tstart;
1846 my $right;
1847 if ($qend > $tend) {
1848 $right = $tend;
1849 } else {
1850 $right = $qend;
1852 my $overlap = $right - $left + 1;
1853 return if defined $min_overlap && $overlap < $min_overlap;
1854 # slice query and target sequence to overlap boundaries
1855 my $qleft =
1856 $contig->change_coord('gapped consensus', "aligned ".$qseq->id, $left);
1857 my $qstring = substr($qseq->seq, $qleft - 1, $overlap);
1858 my $tleft =
1859 $contig->change_coord('gapped consensus', "aligned ".$tseq->id, $left);
1860 my $tstring = substr($tseq->seq, $tleft - 1, $overlap);
1861 # remove gaps present in both sequences at the same position
1862 for (my $pos = 0 ; $pos < $overlap ; $pos++) {
1863 my $qnt = substr($qstring, $pos, 1);
1864 if ($qnt eq '-') {
1865 my $tnt = substr($tstring, $pos, 1);
1866 if ($tnt eq '-') {
1867 substr($qstring, $pos, 1, '');
1868 substr($tstring, $pos, 1, '');
1869 $pos--;
1870 $overlap--;
1874 return if defined $min_overlap && $overlap < $min_overlap;
1875 # count the number of gaps remaining in each sequence
1876 my $qgaps = ($qstring =~ tr/-//);
1877 my $tgaps = ($tstring =~ tr/-//);
1878 # make an alignment object with the query and target sequences
1879 my $aln = Bio::SimpleAlign->new;
1880 my $alseq = Bio::LocatableSeq->new(
1881 -id => 1,
1882 -seq => $qstring,
1883 -start => 1,
1884 -end => $overlap - $qgaps,
1885 -alphabet => 'dna',
1887 $aln->add_seq($alseq);
1888 $alseq = Bio::LocatableSeq->new(
1889 -id => 2,
1890 -seq => $tstring,
1891 -start => 1,
1892 -end => $overlap - $tgaps,
1893 -alphabet => 'dna',
1895 $aln->add_seq($alseq);
1897 # check overlap percentage identity
1898 my $identity = $aln->overall_percentage_identity;
1899 return if defined $min_identity && $identity < $min_identity;
1901 # all checks passed, return alignment
1902 return $aln, $overlap, $identity;
1906 =head2 _contig_graph
1908 Title : _contig_graph
1909 Usage :
1910 Function: Creates a graph data structure of the contig.The graph is undirected.
1911 The vertices are the reads of the contig and edges are the overlap
1912 between the reads. The edges are weighted by the opposite of the
1913 overlap, so it is negative and the better the overlap, the lower the
1914 weight.
1915 Return : Graph object or undef
1916 hashref of overlaps (score, length, identity) for each read pair
1917 Args : Bio::Assembly::Contig object reference
1918 hash reference with the IDs of the sequences to consider [optional]
1919 minimum overlap length (integer) [optional]
1920 minimum percentage identity (integer) [optional]
1922 =cut
1924 sub _contig_graph {
1925 my ($self, $contig_obj, $seq_hash, $min_overlap, $min_identity) = @_;
1927 # Sanity checks
1928 if( !ref $contig_obj || ! $contig_obj->isa('Bio::Assembly::Contig') ) {
1929 $self->throw("Unable to process non Bio::Assembly::Contig ".
1930 "object [".ref($contig_obj)."]");
1933 if (not eval { require Graph::Undirected }) {
1934 $self->throw("Error: the module 'Graph' is needed by the method ".
1935 "_contig_graph but could not be found\n$@");
1938 # Skip contigs of 1 sequence (they have no overlap)
1939 my @seq_objs = $contig_obj->each_seq;
1940 my $nof_seqs = scalar @seq_objs;
1942 return if ($nof_seqs <= 1);
1944 # Calculate alignment between all pairs of reads
1945 my %overlaps;
1946 for my $i (0 .. $nof_seqs-1) {
1947 my $seq_obj = $seq_objs[$i];
1948 my $seq_id = $seq_obj->id;
1950 # Skip this read if not in list of wanted sequences
1951 next if defined $seq_hash && !exists $$seq_hash{$seq_id};
1953 # What is the best sequence to align to?
1954 my ($best_score, $best_length, $best_identity);
1955 for my $j ($i+1 .. $nof_seqs-1) {
1957 # Skip this sequence if not in list of wanted sequences
1958 my $target_obj = $seq_objs[$j];
1959 my $target_id = $target_obj->id;
1960 next if defined $seq_hash && !exists $$seq_hash{$target_id};
1962 # How much overlap with this sequence?
1963 my ($aln_obj, $length, $identity)
1964 = $self->_overlap_alignment($contig_obj, $seq_obj, $target_obj, $min_overlap, $min_identity);
1965 next if ! defined $aln_obj; # there was no sequence overlap or overlap not good enough
1967 # Score the overlap as the number of conserved residues. In practice, it
1968 # seems to work better than giving +1 for match and -3 for errors
1969 # (mismatch or indels)
1970 my $score = $length * $identity / 100;
1972 # Apply a malus (square root) for scores that do not satisfy the minimum
1973 # overlap length similarity. It is necessary for overlaps that get a high
1974 # score without satisfying both the minimum values.
1975 if ( ( $min_overlap && ($length < $min_overlap ) ) ||
1976 ( $min_identity && ($identity < $min_identity) ) ) {
1977 $score = sqrt($score);
1979 $overlaps{$seq_id}{$target_id} = [$score, $length, $identity];
1985 # Process overlaps
1986 my $g; # the Graph object
1987 if (scalar keys %overlaps >= 1) {
1988 # At least 1 overlap. Create a weighted undirected graph
1989 $g = Graph::Undirected->new();
1990 for my $seq_id (keys %overlaps) {
1991 for my $target_id (keys %{$overlaps{$seq_id}}) {
1992 my $score = @{$overlaps{$seq_id}{$target_id}}[0];
1993 my $weight = -$score;
1994 $g->add_weighted_edge($seq_id, $target_id, $weight);
2000 return $g, \%overlaps;
2005 =head2 _draw_graph
2007 Title : _draw_graph
2008 Usage :
2009 Function: Generates a PNG picture of the contig graph. It is mostly for
2010 debugging purposes.
2011 Return : 1 for success
2012 Args : a Graph object
2013 hashref of overlaps (score, length, identity) for each read pair
2014 name of output file
2015 overlap info to display: 'score' (default), 'length' or 'identity'
2017 =cut
2019 sub _draw_graph {
2020 my ($self, $g, $overlaps, $outfile, $edge_type) = @_;
2022 $self->throw("Error: need to provide a graph as input\n") if not defined $g;
2024 if (not eval { require GraphViz }) {
2025 $self->throw("Error: the module 'GraphViz' is needed by the method ".
2026 "_draw_graph but could not be found\n$@");
2029 $edge_type ||= 'score';
2031 my $viz = GraphViz->new( directed => 0 );
2033 for my $edge ( $g->edges ) {
2034 # Retrieve overlap information
2035 my ($id1, $id2) = @$edge;
2036 if (not exists $$overlaps{$id1}{$id2}) {
2037 ($id2, $id1) = @$edge;
2039 my ($score, $length, $identity) = @{$$overlaps{$id1}{$id2}};
2041 my $edge_val;
2042 if ($edge_type eq 'score') {
2043 $edge_val = $score;
2044 } elsif ($edge_type eq 'length') {
2045 $edge_val = $length;
2046 } elsif ($edge_type eq 'identity') {
2047 $edge_val = $identity;
2048 } else {
2049 $self->throw("Error: invalid edge type to display, '$edge_val'");
2051 $viz->add_edge($id1 => $id2, label => $edge_val);
2053 open my $fh, '>', $outfile or $self->throw("Error: Could not write file '$outfile': $!");
2054 print $fh $viz->as_png;
2055 close $fh;
2056 return 1;
2062 __END__