sync w/ main trunk
[bioperl-live.git] / Bio / Assembly / Scaffold.pm
blob12c6260ed404e7b371bebdc98502db32bbceca0a
1 # $Id$
3 # BioPerl module for Bio::Assembly::Scaffold
5 # Copyright by Robson F. de Souza
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Assembly::Scaffold - Perl module to hold and manipulate sequence assembly
14 data.
16 =head1 SYNOPSIS
18 # Module loading
19 use Bio::Assembly::IO;
21 # Assembly loading methods
22 my $aio = Bio::Assembly::IO->new(-file=>"test.ace.1", -format=>'phrap');
23 my $assembly = $aio->next_assembly;
25 foreach my $contig ($assembly->all_contigs) {
26 # do something... (see Bio::Assembly::Contig)
29 =head1 DESCRIPTION
31 Bio::Assembly::Scaffold was developed to store and manipulate data
32 from sequence assembly programs like Phrap. It implements the
33 ScaffoldI interface and intends to be generic enough to be used by
34 Bio::Assembly::IO drivers written to programs other than Phrap.
36 =head1 FEEDBACK
38 =head2 Mailing Lists
40 User feedback is an integral part of the evolution of this and other
41 Bioperl modules. Send your comments and suggestions preferably to the
42 Bioperl mailing lists Your participation is much appreciated.
44 bioperl-l@bioperl.org - General discussion
45 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47 =head2 Support
49 Please direct usage questions or support issues to the mailing list:
51 L<bioperl-l@bioperl.org>
53 rather than to the module maintainer directly. Many experienced and
54 reponsive experts will be able look at the problem and quickly
55 address it. Please include a thorough description of the problem
56 with code and data examples if at all possible.
58 =head2 Reporting Bugs
60 Report bugs to the Bioperl bug tracking system to help us keep track
61 the bugs and their resolution. Bug reports can be submitted via the
62 web:
64 http://bugzilla.open-bio.org/
66 =head1 AUTHOR - Robson Francisco de Souza
68 rfsouza@citri.iq.usp.br
70 =head1 APPENDIX
72 The rest of the documentation details each of the object
73 methods. Internal methods are usually preceded with a _
75 =cut
77 package Bio::Assembly::Scaffold;
79 use strict;
80 use Bio::Annotation::Collection;
82 use base qw(Bio::Root::Root Bio::Assembly::ScaffoldI);
84 =head2 new ()
86 Title : new
87 Usage : $scaffold = new ( -id => "assembly 1",
88 -source => 'program_name',
89 -contigs => \@contigs,
90 -singlets => \@singlets );
91 Function: creates a new scaffold object
92 Returns : Bio::Assembly::Scaffold
93 Args : -id : [string] scaffold name
94 -source : [string] sequence assembly program
95 -contigs : reference to array of Bio::Assembly::Contig objects
96 -singlets : reference to array of Bio::Assembly::Singlet objects
99 =cut
101 sub new {
102 my($class, @args) = @_;
103 my $self = $class->SUPER::new(@args);
104 my ($id, $src, $contigs, $singlets) = $self->_rearrange(
105 [qw(ID SOURCE CONTIGS SINGLETS)], @args);
107 # Scaffold defaults
108 $self->{'_id'} = 'NoName';
109 $self->{'_source'} = 'Unknown';
110 $self->{'_contigs'} = {};
111 $self->{'_singlets'} = {};
112 $self->{'_seqs'} = {};
113 $self->{'_annotation'} = Bio::Annotation::Collection->new();
115 # Import manual info
116 $self->{'_id'} = $id if (defined $id);
117 $self->{'_source'} = $src if (defined $src);
119 # Add contigs and singlets to scaffold
120 if (defined $contigs && ref($contigs = 'ARRAY')) {
121 for my $contig (@{$contigs}) {
122 if( ! ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
123 $self->throw("Bio::Assembly::Scaffold::new is unable to process non".
124 "Bio::Assembly::Contig object [", ref($contig), "]");
126 $self->add_contig($contig);
129 if (defined $singlets && ref($singlets = 'ARRAY')) {
130 for my $singlet (@{$singlets}) {
131 if( ! ref $singlet || ! $singlet->isa('Bio::Assembly::Singlet') ) {
132 $self->throw("Bio::Assembly::Scaffold::new is unable to process non".
133 "Bio::Assembly::Singlet object [", ref($singlet), "]");
135 $self->add_singlet($singlet);
139 return $self;
142 =head1 Accessing general assembly data
144 =cut
146 =head2 id
148 Title : id
149 Usage : $assembly->id()
150 Function: Get/Set assembly ID
151 Returns : string or undef
152 Args : string
154 =cut
156 sub id {
157 my ($self, $id) = @_;
158 return $self->{'_id'} = $id if (defined $id);
159 return $self->{'_id'};
162 =head2 annotation
164 Title : annotation
165 Usage : $assembly->annotation()
166 Function: Get/Set assembly annotation object
167 Returns : Bio::Annotation::Collection
168 Args : none
170 =cut
172 sub annotation {
173 my ($self, $ref) = shift;
174 $self->{'_annotation'} = $ref if (defined $ref);
175 return $self->{'_annotation'};
178 =head2 get_nof_contigs
180 Title : get_nof_contigs
181 Usage : $assembly->get_nof_contigs()
182 Function: Get the number of contigs included in the scaffold
183 Returns : integer
184 Args : none
186 =cut
188 sub get_nof_contigs {
189 my $self = shift;
190 return scalar( $self->get_contig_ids() );
193 =head2 get_nof_contig_seqs
195 Title : get_nof_contig_seqs
196 Usage : $assembly->get_nof_contig_seqs()
197 Function: Get the number of sequences included in contigs of the
198 scaffold (no consensus sequences or singlets)
199 Returns : integer
200 Args : none
202 =cut
204 sub get_nof_contig_seqs {
205 my $self = shift;
207 my $nof_seqs = 0;
208 foreach my $contig ($self->all_contigs) {
209 $nof_seqs += scalar( $contig->get_seq_ids() );
212 return $nof_seqs;
214 # function alias for backward compatibility
215 *get_nof_sequences_in_contigs = \&get_nof_contig_seqs;
218 =head2 get_nof_singlets (get_nof_singlet_seqs)
220 Title : nof_singlets
221 Usage : $assembly->nof_singlets()
222 Function: Get the number of singlets included in the assembly
223 Returns : integer
224 Args : none
226 =cut
228 sub get_nof_singlets {
229 my $self = shift;
230 return scalar( $self->get_singlet_ids() );
232 *get_nof_singlet_seqs = \&get_nof_singlets;
234 =head2 get_all_seq_ids
236 Title : get_all_seq_ids
237 Usage : $assembly->get_all_seq_ids()
238 Function: Get the ID of all sequences making up the scaffold
239 (sequences from contigs and singlets, not consensus).
240 Returns : array of strings
241 Args : none
243 =cut
245 sub get_all_seq_ids {
246 my $self = shift;
247 return keys %{ $self->{'_seqs'} };
250 =head2 get_nof_seqs
252 Title : get_nof_seqs
253 Usage : $assembly->get_nof_seqs()
254 Function: Get total number of sequences making up the scaffold
255 (sequences from contigs and singlets, not consensus).
256 Returns : integer
257 Args : none
259 =cut
261 sub get_nof_seqs {
262 my $self = shift;
263 return scalar $self->get_all_seq_ids;
266 =head2 get_contig_seq_ids
268 Title : get_contig_seq_ids
269 Usage : $assembly->get_contig_seq_ids()
270 Function: Get the ID of all sequences in contigs
271 Returns : array of strings
272 Args : none
274 =cut
276 sub get_contig_seq_ids {
277 my $self = shift;
278 my @ids;
279 for my $contig ( $self->all_contigs ) {
280 push @ids, $contig->get_seq_ids;
282 return @ids;
284 # function alias for backward compatibility
285 *get_seq_ids = \&get_contig_seq_ids;
287 =head2 get_contig_ids
289 Title : get_contig_ids
290 Usage : $assembly->get_contig_ids()
291 Function: Access list of contig IDs from assembly
292 Returns : an array, if there are any contigs in the
293 assembly. An empty array otherwise
294 Args : none
296 =cut
298 sub get_contig_ids {
299 my $self = shift;
301 return wantarray
302 ? sort keys %{$self->{'_contigs'}}
303 : scalar keys %{$self->{'_contigs'}};
306 =head2 get_singlet_ids (get_singlet_seq_ids)
308 Title : get_singlet_ids
309 Usage : $assembly->get_singlet_ids()
310 Function: Access list of singlet IDs from assembly
311 Returns : array of strings if there are any singlets
312 otherwise an empty array
313 Args : none
315 =cut
317 sub get_singlet_ids {
318 my $self = shift;
320 return wantarray
321 ? sort keys %{$self->{'_singlets'}}
322 : scalar keys %{$self->{'_singlets'}};
324 *get_singlet_seq_ids = \&get_singlet_ids;
326 =head2 get_seq_by_id
328 Title : get_seq_by_id
329 Usage : $assembly->get_seq_by_id($id)
330 Function: Get a reference for an sequence making up the scaffold
331 (from a contig or singlet, not consensus)
332 Returns : a Bio::LocatableSeq object
333 undef if sequence $id is not found in the scaffold
334 Args : [string] sequence identifier (id)
336 =cut
338 sub get_seq_by_id {
339 my $self = shift;
340 my $seqID = shift;
342 return unless (exists $self->{'_seqs'}{$seqID});
344 return $self->{'_seqs'}{$seqID}->get_seq_by_name($seqID);
347 =head2 get_contig_by_id
349 Title : get_contig_by_id
350 Usage : $assembly->get_contig_by_id($id)
351 Function: Get a reference for a contig
352 Returns : a Bio::Assembly::Contig object or undef
353 Args : [string] contig unique identifier (ID)
355 =cut
357 sub get_contig_by_id {
358 my $self = shift;
359 my $contigID = shift;
361 return unless (exists $self->{'_contigs'}{$contigID});
363 return $self->{'_contigs'}{$contigID};
366 =head2 get_singlet_by_id
368 Title : get_singlet_by_id
369 Usage : $assembly->get_singlet_by_id()
370 Function: Get a reference for a singlet
371 Returns : Bio::Assembly::Singlet object or undef
372 Args : [string] a singlet ID
374 =cut
376 sub get_singlet_by_id {
377 my $self = shift;
379 my $singletID = shift;
381 return unless (exists $self->{'_singlets'}{$singletID});
383 return $self->{'_singlets'}{$singletID};
386 =head1 Modifier methods
388 =cut
390 =head2 add_contig
392 Title : add_contig
393 Usage : $assembly->add_contig($contig)
394 Function: Add a contig to the assembly
395 Returns : 1 on success
396 Args : a Bio::Assembly::Contig object
397 order (optional)
399 =cut
401 sub add_contig {
402 my ($self, $contig) = @_;
404 # Input check
405 if( !ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
406 $self->throw("Bio::Assembly::Scaffold::add_contig is unable to process".
407 " non Bio::Assembly::Contig object [", ref($contig), "]");
410 # Create and attribute contig ID
411 my $contigID = $contig->id();
412 if( !defined $contigID ) {
413 $contigID = 'Unknown_' . ($self->get_nof_contigs() + 1);
414 $contig->id($contigID);
415 $self->warn("Attributing ID $contigID to unnamed Bio::Assembly::Contig".
416 " object.");
419 # Adding contig to scaffold
420 $self->warn("Replacing contig $contigID with a new contig object")
421 if (exists $self->{'_contigs'}{$contigID});
422 $self->{'_contigs'}{$contigID} = $contig;
423 $contig->assembly($self); # weak circular reference
425 # Put contig sequences in the list of sequences belonging to the scaffold
426 foreach my $seqID ($contig->get_seq_ids()) {
427 if (exists $self->{'_seqs'}{$seqID} &&
428 not($self->{'_seqs'}{$seqID} eq $contig) ) {
429 $self->warn( "Sequence $seqID already assigned to object ".
430 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID");
432 $self->{'_seqs'}{$seqID} = $contig;
435 return 1;
438 =head2 add_singlet
440 Title : add_singlet
441 Usage : $assembly->add_singlet($seq)
442 Function: Add a singlet to the assembly
443 Returns : 1 on success
444 Args : a Bio::Assembly::Singlet object
445 order (optional)
447 =cut
449 sub add_singlet {
450 my ($self, $singlet) = @_;
452 # Input check
453 if ( !ref $singlet || ! $singlet->isa('Bio::Assembly::Singlet') ) {
454 $self->throw("Bio::Assembly::Scaffold::add_singlet is unable to process".
455 " non Bio::Assembly::Singlet object [", ref($singlet), "]");
458 # Create and attribute singlet ID
459 my $singletID = $singlet->id();
460 if( !defined $singletID ) {
461 $singletID = 'Unknown_' . ($self->get_nof_singlets() + 1);
462 $singlet->id($singletID);
463 $self->warn("Attributing ID $singletID to unnamed Bio::Assembly::".
464 "Singlet object.");
467 # Adding singlet to scaffold
468 $self->warn("Replacing singlet $singletID with a new singlet object")
469 if (exists $self->{'_singlets'}{$singletID});
470 $self->{'_singlets'}{$singletID} = $singlet;
471 $singlet->assembly($self); # weak circular reference
473 # Put singlet sequence in the list of sequences belonging to the scaffold
474 my $seqID = $singlet->id();
475 if (exists $self->{'_seqs'}{$seqID} &&
476 not($self->{'_seqs'}{$seqID} eq $singlet) ) {
477 $self->warn( "Sequence $seqID already assigned to object ".
478 $self->{'_seqs'}{$seqID}->id().". Moving to singlet $singletID");
480 $self->{'_seqs'}{$seqID} = $singlet;
481 return 1;
484 =head2 update_seq_list
486 Title : update_seq_list
487 Usage : $assembly->update_seq_list()
488 Function:
490 Synchronizes the assembly registry for sequences in
491 contigs and contig actual aligned sequences content. You
492 probably want to run this after you remove/add a
493 sequence from/to a contig in the assembly.
495 Returns : 1 for success
496 Args : none
498 =cut
500 sub update_seq_list {
501 my $self = shift;
503 $self->{'_seqs'} = {};
505 # Put sequences in contigs in list of sequences belonging to the scaffold
506 foreach my $contig ($self->all_contigs) {
507 my $contigID = $contig->id();
508 foreach my $seqID ($contig->get_seq_ids) {
509 if (exists $self->{'_seqs'}{$seqID} &&
510 not($self->{'_seqs'}{$seqID} eq $contig) ) {
511 $self->warn( "Sequence $seqID already assigned to object ".
512 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID");
514 $self->{'_seqs'}{$seqID} = $contig;
518 # Put singlet sequences in the list of sequences belonging to the scaffold
519 foreach my $singlet ($self->all_singlets) {
520 my $seqID = $singlet->id();
521 my $singletID = $singlet->id();
522 if (exists $self->{'_seqs'}{$seqID} &&
523 not($self->{'_seqs'}{$seqID} eq $singlet) ) {
524 $self->warn( "Sequence $seqID already assigned to object ".
525 $self->{'_seqs'}{$seqID}->id().". Moving to singlet $singletID");
527 $self->{'_seqs'}{$seqID} = $singlet;
530 return 1;
533 =head2 remove_contigs
535 Title : remove_contigs
536 Usage : $assembly->remove_contigs(1..4)
537 Function: Remove contig from assembly object
538 Returns : an array of removed Bio::Assembly::Contig
539 objects
540 Args : an array of contig IDs
542 See function get_contig_ids() above
544 =cut
546 sub remove_contigs {
547 my ($self, @args) = @_;
549 my @ret = ();
550 foreach my $contigID (@args) {
551 foreach my $seqID ($self->get_contig_by_id($contigID)->get_seq_ids()) {
552 delete $self->{'_seqs'}{$seqID};
554 push(@ret, $self->{'_contigs'}{$contigID});
555 delete $self->{'_contigs'}{$contigID};
558 return @ret;
561 =head2 remove_singlets
563 Title : remove_singlets
564 Usage : $assembly->remove_singlets(@singlet_ids)
565 Function: Remove singlet from assembly object
566 Returns : the Bio::Assembly::Singlet objects removed
567 Args : a list of singlet IDs
569 See function get_singlet_ids() above
571 =cut
573 sub remove_singlets {
574 my ($self,@args) = @_;
576 my @ret = ();
577 foreach my $singletID (@args) {
578 push(@ret,$self->{'_singlets'}{$singletID});
579 delete $self->{'_singlets'}{$singletID};
582 return @ret;
585 =head2 remove_features_collection
587 Title : remove_features_collection
588 Usage : $assembly->remove_features_collection()
589 Function: Removes the collection of features associated to every
590 contig and singlet of the scaffold. This can be useful
591 to save some memory (when contig and singlet features are
592 not needed).
593 Returns : none
594 Argument : none
596 =cut
598 sub remove_features_collection {
599 my ($self) = @_;
600 for my $obj ( $self->all_contigs, $self->all_singlets ) {
601 $obj->remove_features_collection;
603 return;
607 =head1 Contig and singlet selection methods
609 =cut
611 =head2 select_contigs
613 Title : select_contigs
614 Usage : $assembly->select_contigs(@list)
615 Function: Select an array of contigs from the assembly
616 Returns : an array of Bio::Assembly::Contig objects
617 Args : an array of contig ids
619 See function get_contig_ids() above
621 =cut
623 sub select_contigs {
624 my ($self,@args) = @_;
626 my @contigs = ();
627 foreach my $contig (@args) {
628 unless (exists $self->{'_contigs'}{$contig}) {
629 $self->warn("$contig contig not found. Ignoring...");
630 next;
632 push(@contigs, $self->{'_contigs'}{$contig});
635 return @contigs;
638 =head2 select_singlets
640 Title : select_singlets
641 Usage : $assembly->select_singlets(@list)
642 Function: Selects an array of singlets from the assembly
643 Returns : an array of Bio::Assembly::Singlet objects
644 Args : an array of singlet ids
646 See function get_singlet_ids() above
648 =cut
650 sub select_singlets {
651 my ($self,@args) = @_;
653 my @singlets = ();
654 foreach my $singlet (@args) {
655 unless (exists $self->{'_singlets'}{$singlet}) {
656 $self->warn("$singlet singlet not found. Ignoring...");
657 next;
659 push(@singlets, $self->{'_singlets'}{$singlet});
662 return @singlets;
665 =head2 all_contigs
667 Title : all_contigs
668 Usage : my @contigs = $assembly->all_contigs
669 Function:
671 Returns a list of all contigs in this assembly. Contigs
672 are both clusters and alignments of one or more reads,
673 with an associated consensus sequence.
675 Returns : array of Bio::Assembly::Contig (in lexical id order)
676 Args : none
678 =cut
680 sub all_contigs {
681 my ($self) = @_;
683 my @contigs = ();
684 foreach my $contig (sort { $a cmp $b } keys %{ $self->{'_contigs'} }) {
685 push(@contigs, $self->{'_contigs'}{$contig});
688 return @contigs;
691 =head2 all_singlets
693 Title : all_singlets
694 Usage : my @singlets = $assembly->all_singlets
695 Function:
697 Returns a list of all singlets in this assembly.
698 Singlets are isolated reads, without non-vector
699 matches to any other read in the assembly.
701 Returns : array of Bio::Assembly::Singlet objects (in lexical order by id)
702 Args : none
704 =cut
706 sub all_singlets {
707 my ($self) = @_;
709 my @singlets = ();
710 foreach my $singlet (sort { $a cmp $b } keys %{ $self->{'_singlets'} }) {
711 push(@singlets, $self->{'_singlets'}{$singlet});
714 return @singlets;