3 # BioPerl module for Bio::Assembly::Scaffold
5 # Copyright by Robson F. de Souza
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
13 Bio::Assembly::Scaffold - Perl module to hold and manipulate sequence assembly
19 use Bio::Assembly::IO;
21 # Assembly loading methods
22 my $aio = Bio::Assembly::IO->new(-file=>"test.ace.1", -format=>'phrap');
23 my $assembly = $aio->next_assembly;
25 foreach my $contig ($assembly->all_contigs) {
26 # do something... (see Bio::Assembly::Contig)
31 Bio::Assembly::Scaffold was developed to store and manipulate data
32 from sequence assembly programs like Phrap. It implements the
33 ScaffoldI interface and intends to be generic enough to be used by
34 Bio::Assembly::IO drivers written to programs other than Phrap.
40 User feedback is an integral part of the evolution of this and other
41 Bioperl modules. Send your comments and suggestions preferably to the
42 Bioperl mailing lists Your participation is much appreciated.
44 bioperl-l@bioperl.org - General discussion
45 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
49 Please direct usage questions or support issues to the mailing list:
51 L<bioperl-l@bioperl.org>
53 rather than to the module maintainer directly. Many experienced and
54 reponsive experts will be able look at the problem and quickly
55 address it. Please include a thorough description of the problem
56 with code and data examples if at all possible.
60 Report bugs to the Bioperl bug tracking system to help us keep track
61 the bugs and their resolution. Bug reports can be submitted via the
64 http://bugzilla.open-bio.org/
66 =head1 AUTHOR - Robson Francisco de Souza
68 rfsouza@citri.iq.usp.br
72 The rest of the documentation details each of the object
73 methods. Internal methods are usually preceded with a _
77 package Bio
::Assembly
::Scaffold
;
80 use Bio
::Annotation
::Collection
;
82 use base
qw(Bio::Root::Root Bio::Assembly::ScaffoldI);
87 Usage : $scaffold = new ( -id => "assembly 1",
88 -source => 'program_name',
89 -contigs => \@contigs,
90 -singlets => \@singlets );
91 Function: creates a new scaffold object
92 Returns : Bio::Assembly::Scaffold
93 Args : -id : [string] scaffold name
94 -source : [string] sequence assembly program
95 -contigs : reference to array of Bio::Assembly::Contig objects
96 -singlets : reference to array of Bio::Assembly::Singlet objects
102 my($class, @args) = @_;
103 my $self = $class->SUPER::new
(@args);
104 my ($id, $src, $contigs, $singlets) = $self->_rearrange(
105 [qw(ID SOURCE CONTIGS SINGLETS)], @args);
108 $self->{'_id'} = 'NoName';
109 $self->{'_source'} = 'Unknown';
110 $self->{'_contigs'} = {};
111 $self->{'_singlets'} = {};
112 $self->{'_seqs'} = {};
113 $self->{'_annotation'} = Bio
::Annotation
::Collection
->new();
116 $self->{'_id'} = $id if (defined $id);
117 $self->{'_source'} = $src if (defined $src);
119 # Add contigs and singlets to scaffold
120 if (defined $contigs && ref($contigs = 'ARRAY')) {
121 for my $contig (@
{$contigs}) {
122 if( ! ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
123 $self->throw("Bio::Assembly::Scaffold::new is unable to process non".
124 "Bio::Assembly::Contig object [", ref($contig), "]");
126 $self->add_contig($contig);
129 if (defined $singlets && ref($singlets = 'ARRAY')) {
130 for my $singlet (@
{$singlets}) {
131 if( ! ref $singlet || ! $singlet->isa('Bio::Assembly::Singlet') ) {
132 $self->throw("Bio::Assembly::Scaffold::new is unable to process non".
133 "Bio::Assembly::Singlet object [", ref($singlet), "]");
135 $self->add_singlet($singlet);
142 =head1 Accessing general assembly data
149 Usage : $assembly->id()
150 Function: Get/Set assembly ID
151 Returns : string or undef
157 my ($self, $id) = @_;
158 return $self->{'_id'} = $id if (defined $id);
159 return $self->{'_id'};
165 Usage : $assembly->annotation()
166 Function: Get/Set assembly annotation object
167 Returns : Bio::Annotation::Collection
173 my ($self, $ref) = shift;
174 $self->{'_annotation'} = $ref if (defined $ref);
175 return $self->{'_annotation'};
178 =head2 get_nof_contigs
180 Title : get_nof_contigs
181 Usage : $assembly->get_nof_contigs()
182 Function: Get the number of contigs included in the scaffold
188 sub get_nof_contigs
{
190 return scalar( $self->get_contig_ids() );
193 =head2 get_nof_contig_seqs
195 Title : get_nof_contig_seqs
196 Usage : $assembly->get_nof_contig_seqs()
197 Function: Get the number of sequences included in contigs of the
198 scaffold (no consensus sequences or singlets)
204 sub get_nof_contig_seqs
{
208 foreach my $contig ($self->all_contigs) {
209 $nof_seqs += scalar( $contig->get_seq_ids() );
214 # function alias for backward compatibility
215 *get_nof_sequences_in_contigs
= \
&get_nof_contig_seqs
;
218 =head2 get_nof_singlets (get_nof_singlet_seqs)
221 Usage : $assembly->nof_singlets()
222 Function: Get the number of singlets included in the assembly
228 sub get_nof_singlets
{
230 return scalar( $self->get_singlet_ids() );
232 *get_nof_singlet_seqs
= \
&get_nof_singlets
;
234 =head2 get_all_seq_ids
236 Title : get_all_seq_ids
237 Usage : $assembly->get_all_seq_ids()
238 Function: Get the ID of all sequences making up the scaffold
239 (sequences from contigs and singlets, not consensus).
240 Returns : array of strings
245 sub get_all_seq_ids
{
247 return keys %{ $self->{'_seqs'} };
253 Usage : $assembly->get_nof_seqs()
254 Function: Get total number of sequences making up the scaffold
255 (sequences from contigs and singlets, not consensus).
263 return scalar $self->get_all_seq_ids;
266 =head2 get_contig_seq_ids
268 Title : get_contig_seq_ids
269 Usage : $assembly->get_contig_seq_ids()
270 Function: Get the ID of all sequences in contigs
271 Returns : array of strings
276 sub get_contig_seq_ids
{
279 for my $contig ( $self->all_contigs ) {
280 push @ids, $contig->get_seq_ids;
284 # function alias for backward compatibility
285 *get_seq_ids
= \
&get_contig_seq_ids
;
287 =head2 get_contig_ids
289 Title : get_contig_ids
290 Usage : $assembly->get_contig_ids()
291 Function: Access list of contig IDs from assembly
292 Returns : an array, if there are any contigs in the
293 assembly. An empty array otherwise
302 ?
sort keys %{$self->{'_contigs'}}
303 : scalar keys %{$self->{'_contigs'}};
306 =head2 get_singlet_ids (get_singlet_seq_ids)
308 Title : get_singlet_ids
309 Usage : $assembly->get_singlet_ids()
310 Function: Access list of singlet IDs from assembly
311 Returns : array of strings if there are any singlets
312 otherwise an empty array
317 sub get_singlet_ids
{
321 ?
sort keys %{$self->{'_singlets'}}
322 : scalar keys %{$self->{'_singlets'}};
324 *get_singlet_seq_ids
= \
&get_singlet_ids
;
328 Title : get_seq_by_id
329 Usage : $assembly->get_seq_by_id($id)
330 Function: Get a reference for an sequence making up the scaffold
331 (from a contig or singlet, not consensus)
332 Returns : a Bio::LocatableSeq object
333 undef if sequence $id is not found in the scaffold
334 Args : [string] sequence identifier (id)
342 return unless (exists $self->{'_seqs'}{$seqID});
344 return $self->{'_seqs'}{$seqID}->get_seq_by_name($seqID);
347 =head2 get_contig_by_id
349 Title : get_contig_by_id
350 Usage : $assembly->get_contig_by_id($id)
351 Function: Get a reference for a contig
352 Returns : a Bio::Assembly::Contig object or undef
353 Args : [string] contig unique identifier (ID)
357 sub get_contig_by_id
{
359 my $contigID = shift;
361 return unless (exists $self->{'_contigs'}{$contigID});
363 return $self->{'_contigs'}{$contigID};
366 =head2 get_singlet_by_id
368 Title : get_singlet_by_id
369 Usage : $assembly->get_singlet_by_id()
370 Function: Get a reference for a singlet
371 Returns : Bio::Assembly::Singlet object or undef
372 Args : [string] a singlet ID
376 sub get_singlet_by_id
{
379 my $singletID = shift;
381 return unless (exists $self->{'_singlets'}{$singletID});
383 return $self->{'_singlets'}{$singletID};
386 =head1 Modifier methods
393 Usage : $assembly->add_contig($contig)
394 Function: Add a contig to the assembly
395 Returns : 1 on success
396 Args : a Bio::Assembly::Contig object
402 my ($self, $contig) = @_;
405 if( !ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
406 $self->throw("Bio::Assembly::Scaffold::add_contig is unable to process".
407 " non Bio::Assembly::Contig object [", ref($contig), "]");
410 # Create and attribute contig ID
411 my $contigID = $contig->id();
412 if( !defined $contigID ) {
413 $contigID = 'Unknown_' . ($self->get_nof_contigs() + 1);
414 $contig->id($contigID);
415 $self->warn("Attributing ID $contigID to unnamed Bio::Assembly::Contig".
419 # Adding contig to scaffold
420 $self->warn("Replacing contig $contigID with a new contig object")
421 if (exists $self->{'_contigs'}{$contigID});
422 $self->{'_contigs'}{$contigID} = $contig;
423 $contig->assembly($self); # weak circular reference
425 # Put contig sequences in the list of sequences belonging to the scaffold
426 foreach my $seqID ($contig->get_seq_ids()) {
427 if (exists $self->{'_seqs'}{$seqID} &&
428 not($self->{'_seqs'}{$seqID} eq $contig) ) {
429 $self->warn( "Sequence $seqID already assigned to object ".
430 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID");
432 $self->{'_seqs'}{$seqID} = $contig;
441 Usage : $assembly->add_singlet($seq)
442 Function: Add a singlet to the assembly
443 Returns : 1 on success
444 Args : a Bio::Assembly::Singlet object
450 my ($self, $singlet) = @_;
453 if ( !ref $singlet || ! $singlet->isa('Bio::Assembly::Singlet') ) {
454 $self->throw("Bio::Assembly::Scaffold::add_singlet is unable to process".
455 " non Bio::Assembly::Singlet object [", ref($singlet), "]");
458 # Create and attribute singlet ID
459 my $singletID = $singlet->id();
460 if( !defined $singletID ) {
461 $singletID = 'Unknown_' . ($self->get_nof_singlets() + 1);
462 $singlet->id($singletID);
463 $self->warn("Attributing ID $singletID to unnamed Bio::Assembly::".
467 # Adding singlet to scaffold
468 $self->warn("Replacing singlet $singletID with a new singlet object")
469 if (exists $self->{'_singlets'}{$singletID});
470 $self->{'_singlets'}{$singletID} = $singlet;
471 $singlet->assembly($self); # weak circular reference
473 # Put singlet sequence in the list of sequences belonging to the scaffold
474 my $seqID = $singlet->id();
475 if (exists $self->{'_seqs'}{$seqID} &&
476 not($self->{'_seqs'}{$seqID} eq $singlet) ) {
477 $self->warn( "Sequence $seqID already assigned to object ".
478 $self->{'_seqs'}{$seqID}->id().". Moving to singlet $singletID");
480 $self->{'_seqs'}{$seqID} = $singlet;
484 =head2 update_seq_list
486 Title : update_seq_list
487 Usage : $assembly->update_seq_list()
490 Synchronizes the assembly registry for sequences in
491 contigs and contig actual aligned sequences content. You
492 probably want to run this after you remove/add a
493 sequence from/to a contig in the assembly.
495 Returns : 1 for success
500 sub update_seq_list
{
503 $self->{'_seqs'} = {};
505 # Put sequences in contigs in list of sequences belonging to the scaffold
506 foreach my $contig ($self->all_contigs) {
507 my $contigID = $contig->id();
508 foreach my $seqID ($contig->get_seq_ids) {
509 if (exists $self->{'_seqs'}{$seqID} &&
510 not($self->{'_seqs'}{$seqID} eq $contig) ) {
511 $self->warn( "Sequence $seqID already assigned to object ".
512 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID");
514 $self->{'_seqs'}{$seqID} = $contig;
518 # Put singlet sequences in the list of sequences belonging to the scaffold
519 foreach my $singlet ($self->all_singlets) {
520 my $seqID = $singlet->id();
521 my $singletID = $singlet->id();
522 if (exists $self->{'_seqs'}{$seqID} &&
523 not($self->{'_seqs'}{$seqID} eq $singlet) ) {
524 $self->warn( "Sequence $seqID already assigned to object ".
525 $self->{'_seqs'}{$seqID}->id().". Moving to singlet $singletID");
527 $self->{'_seqs'}{$seqID} = $singlet;
533 =head2 remove_contigs
535 Title : remove_contigs
536 Usage : $assembly->remove_contigs(1..4)
537 Function: Remove contig from assembly object
538 Returns : an array of removed Bio::Assembly::Contig
540 Args : an array of contig IDs
542 See function get_contig_ids() above
547 my ($self, @args) = @_;
550 foreach my $contigID (@args) {
551 foreach my $seqID ($self->get_contig_by_id($contigID)->get_seq_ids()) {
552 delete $self->{'_seqs'}{$seqID};
554 push(@ret, $self->{'_contigs'}{$contigID});
555 delete $self->{'_contigs'}{$contigID};
561 =head2 remove_singlets
563 Title : remove_singlets
564 Usage : $assembly->remove_singlets(@singlet_ids)
565 Function: Remove singlet from assembly object
566 Returns : the Bio::Assembly::Singlet objects removed
567 Args : a list of singlet IDs
569 See function get_singlet_ids() above
573 sub remove_singlets
{
574 my ($self,@args) = @_;
577 foreach my $singletID (@args) {
578 push(@ret,$self->{'_singlets'}{$singletID});
579 delete $self->{'_singlets'}{$singletID};
585 =head2 remove_features_collection
587 Title : remove_features_collection
588 Usage : $assembly->remove_features_collection()
589 Function: Removes the collection of features associated to every
590 contig and singlet of the scaffold. This can be useful
591 to save some memory (when contig and singlet features are
598 sub remove_features_collection
{
600 for my $obj ( $self->all_contigs, $self->all_singlets ) {
601 $obj->remove_features_collection;
607 =head1 Contig and singlet selection methods
611 =head2 select_contigs
613 Title : select_contigs
614 Usage : $assembly->select_contigs(@list)
615 Function: Select an array of contigs from the assembly
616 Returns : an array of Bio::Assembly::Contig objects
617 Args : an array of contig ids
619 See function get_contig_ids() above
624 my ($self,@args) = @_;
627 foreach my $contig (@args) {
628 unless (exists $self->{'_contigs'}{$contig}) {
629 $self->warn("$contig contig not found. Ignoring...");
632 push(@contigs, $self->{'_contigs'}{$contig});
638 =head2 select_singlets
640 Title : select_singlets
641 Usage : $assembly->select_singlets(@list)
642 Function: Selects an array of singlets from the assembly
643 Returns : an array of Bio::Assembly::Singlet objects
644 Args : an array of singlet ids
646 See function get_singlet_ids() above
650 sub select_singlets
{
651 my ($self,@args) = @_;
654 foreach my $singlet (@args) {
655 unless (exists $self->{'_singlets'}{$singlet}) {
656 $self->warn("$singlet singlet not found. Ignoring...");
659 push(@singlets, $self->{'_singlets'}{$singlet});
668 Usage : my @contigs = $assembly->all_contigs
671 Returns a list of all contigs in this assembly. Contigs
672 are both clusters and alignments of one or more reads,
673 with an associated consensus sequence.
675 Returns : array of Bio::Assembly::Contig (in lexical id order)
684 foreach my $contig (sort { $a cmp $b } keys %{ $self->{'_contigs'} }) {
685 push(@contigs, $self->{'_contigs'}{$contig});
694 Usage : my @singlets = $assembly->all_singlets
697 Returns a list of all singlets in this assembly.
698 Singlets are isolated reads, without non-vector
699 matches to any other read in the assembly.
701 Returns : array of Bio::Assembly::Singlet objects (in lexical order by id)
710 foreach my $singlet (sort { $a cmp $b } keys %{ $self->{'_singlets'} }) {
711 push(@singlets, $self->{'_singlets'}{$singlet});