[bug 2714]
[bioperl-live.git] / Bio / Assembly / Contig.pm
blob3581a2bfa4dad3ae7e6e9a81a890df6335a7c2b3
1 # $Id$
3 # BioPerl module for Bio::Assembly::Contig
4 # Mostly based on Bio::SimpleAlign by Ewan Birney
6 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
8 # Copyright Robson Francisco de Souza
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Assembly::Contig - Perl module to hold and manipulate
17 sequence assembly contigs.
19 =head1 SYNOPSIS
21 # Module loading
22 use Bio::Assembly::IO;
24 # Assembly loading methods
25 $aio = Bio::Assembly::IO->new(-file=>"test.ace.1",
26 -format=>'phrap');
28 $assembly = $aio->next_assembly;
29 foreach $contig ($assembly->all_contigs) {
30 # do something
33 # OR, if you want to build the contig yourself,
35 use Bio::Assembly::Contig;
36 $c = Bio::Assembly::Contig->new(-id=>"1");
38 $ls = Bio::LocatableSeq->new(-seq=>"ACCG-T",
39 -id=>"r1",
40 -alphabet=>'dna');
41 $ls2 = Bio::LocatableSeq->new(-seq=>"ACA-CG-T",
42 -id=>"r2",
43 -alphabet=>'dna');
45 $ls_coord = Bio::SeqFeature::Generic->new(-start=>3,
46 -end=>8,
47 -strand=>1);
48 $ls2_coord = Bio::SeqFeature::Generic->new(-start=>1,
49 -end=>8,
50 -strand=>1);
51 $c->add_seq($ls);
52 $c->add_seq($ls2);
53 $c->set_seq_coord($ls_coord,$ls);
54 $c->set_seq_coord($ls2_coord,$ls2);
56 $con = Bio::LocatableSeq->new(-seq=>"ACACCG-T",
57 -alphabet=>'dna');
58 $c->set_consensus_sequence($con);
60 $l = $c->change_coord('unaligned r2','ungapped consensus',6);
61 print "6 in unaligned r2 => $l in ungapped consensus\n";
64 =head1 DESCRIPTION
66 A contig is as a set of sequences, locally aligned to each other, so
67 that every sequence has overlapping regions with at least one sequence
68 in the contig, such that a continuous of overlapping sequences is
69 formed, allowing the deduction of a consensus sequence which may be
70 longer than any of the sequences from which it was deduced.
72 In this documentation we refer to the overlapping sequences used to
73 build the contig as "aligned sequences" and to the sequence deduced
74 from the overlap of aligned sequences as the "consensus". Methods to
75 deduce the consensus sequence from aligned sequences were not yet
76 implemented in this module, but its posssible to add a consensus
77 sequence deduced by other means, e.g, by the assembly program used to
78 build the alignment.
80 All aligned sequences in a Bio::Assembly::Contig must be Bio::Assembly::Locatable
81 objects and have a unique ID. The unique ID restriction is due to the
82 nature of the module's internal data structures and is also a request
83 of some assembly programs. If two sequences with the same ID are added
84 to a contig, the first sequence added is replaced by the second one.
86 =head2 Coordinate_systems
88 There are four base coordinate systems in Bio::Assembly::Contig. When
89 you need to access contig elements or data that exists on a certain
90 range or location, you may be specifying coordinates in relation to
91 different sequences, which may be either the contig consensus or one
92 of the aligned sequences that were used to do the assembly.
94 =========================================================
95 Name | Referenced sequence
96 ---------------------------------------------------------
97 "gapped consensus" | Contig (with gaps)
98 "ungapped consensus" | Contig (without gaps)
99 "aligned $seqID" | sequence $seqID (with gaps)
100 "unaligned $seqID" | sequence $seqID (without gaps)
101 =========================================================
103 "gapped consensus" refers to positions in the aligned consensus
104 sequence, which is the consensus sequence including the gaps inserted
105 to align it agains the aligned sequences that were used to assemble
106 the contig. So, its limits are [ 1, (consensus length + number of gaps
107 in consensus) ]
109 "ungapped consensus" is a coordinate system based on the consensus
110 sequence, but excluding consensus gaps. This is just the coordinate
111 system that you have when considering the consensus sequence alone,
112 instead of aligned to other sequences.
114 "aligned $seqID" refers to locations in the sequence $seqID after
115 alignment of $seqID against the consensus sequence (reverse
116 complementing the original sequence, if needed). Coordinate 1 in
117 "aligned $seqID" is equivalent to the start location (first base) of
118 $seqID in the consensus sequence, just like if the aligned sequence
119 $seqID was a feature of the consensus sequence.
121 "unaligned $seqID" is equivalent to a location in the isolated
122 sequence, just like you would have when considering the sequence
123 alone, out of an alignment. When changing coordinates from "aligned
124 $seq2" to "unaligned $seq2", if $seq2 was reverse complemented when
125 included in the alignment, the output coordinates will be reversed to
126 fit that fact, i.e. 1 will be changed to length($seq2), 2 will be
127 length($seq)-1 and so on.
129 An important note: when you change gap coordinates from a gapped
130 system ("gapped consensus" or "aligned $seqID") to a system that does
131 not include gaps ("ungapped consensus" or "unaligned $seqID"), the
132 position returned will be the first location before all gaps
133 neighboring the input location.
135 =head2 Feature_collection
137 Bio::Assembly::Contig stores much information about a contig in a
138 Bio::Assembly::SeqFeature::Collection object. Relevant information on the
139 alignment is accessed by selecting features based on their primary
140 tags (e.g. all features which have a primary tag of the form
141 '_aligned_coord:$seqID', where $seqID is an aligned sequence ID, are
142 coordinates for sequences in the contig alignment) and, by using
143 methods from Bio::Assembly::SeqFeature::Collection, it's possible to select
144 features by overlap with other features.
146 We suggest that you use the primary tags of features as identifiers
147 for feature classes. By convention, features with primary tags
148 starting with a '_' are generated by modules that populate the contig
149 data structure and return the contig object, maybe as part of an
150 assembly object, e.g. drivers from the Bio::Assembly::IO set.
152 Features in the features collection may be associated with particular
153 aligned sequences. To obtain this, you must attach the sequence to the
154 feature, using attach() seq from Bio::Assembly::SeqFeatureI, before you add the
155 feature to the feature collection. We also suggest to add the sequence
156 id to the primary tag, so that is easy to select feature for a
157 particular sequence.
159 There is only one feature class that some methods in
160 Bio::Assembly::Contig expect to find in the feature collection: features
161 with primary tags of the form '_aligned_coord:$seqID', where $seqID is
162 the aligned sequence id (like returned by $seq-E<gt>id()). These features
163 describe the position (in "gapped consensus" coordinates) of aligned
164 sequences, and the method set_seq_coord() automatically changes a
165 feature's primary tag to this form whenever the feature is added to
166 the collection by this method. Only two methods in Bio::Assembly::Contig
167 will not work unless there are features from this class:
168 change_coord() and get_seq_coord().
170 Other feature classes will be automatically available only when
171 Bio::Assembly::Contig objects are created by a specific module. Such
172 feature classes are (or should be) documented in the documentation of
173 the module which create them, to which the user should refer.
175 =head1 FEEDBACK
177 =head2 Mailing Lists
179 User feedback is an integral part of the evolution of this and other
180 Bioperl modules. Send your comments and suggestions preferably to the
181 Bioperl mailing lists Your participation is much appreciated.
183 bioperl-l@bioperl.org - General discussion
184 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
186 =head2 Reporting Bugs
188 Report bugs to the Bioperl bug tracking system to help us keep track
189 the bugs and their resolution. Bug reports can be submitted via the
190 web:
192 http://bugzilla.open-bio.org/
194 =head1 AUTHOR - Robson Francisco de Souza
196 rfsouza@citri.iq.usp.br
198 =head1 APPENDIX
200 The rest of the documentation details each of the object
201 methods. Internal methods are usually preceded with a _
203 =cut
206 package Bio::Assembly::Contig;
208 use strict;
210 use Bio::SeqFeature::Collection;
211 use Bio::Seq::PrimaryQual;
213 use Scalar::Util qw(weaken);
215 use base qw(Bio::Root::Root Bio::Align::AlignI);
217 =head1 Object creator
219 =head2 new
221 Title : new
222 Usage : my $contig = Bio::Assembly::Contig->new();
223 Function : Creates a new contig object
224 Returns : Bio::Assembly::Contig
225 Args : -id => contig unique ID
226 -source => string for the sequence assembly program used
227 -collection => Bio::SeqFeature::Collection instance
229 =cut
231 #-----------
232 sub new {
233 #-----------
234 my ($class, @args) = @_;
236 my $self = $class->SUPER::new(@args);
238 my ($src, $id, $collection) = $self->_rearrange([qw(SOURCE ID COLLECTION)], @args);
239 $src && $self->source($src);
240 ($id && $self->id($id)) || ($self->{'_id'} = 'NoName'); # Alignment (contig) name
241 ($id && $self->id($id)) || ($self->{'_source'} = 'Unknown'); # Program used to build the contig
242 # we need to set up internal hashes first!
244 # Bio::SimpleAlign derived fields (check which ones are needed for AlignI compatibility)
245 $self->{'_elem'} = {}; # contig elements: aligned sequence objects (keyed by ID)
246 $self->{'_order'} = {}; # store sequence order
247 # $self->{'start_end_lists'} = {}; # References to entries in {'_seq'}. Keyed by seq ids.
248 # $self->{'_dis_name'} = {}; # Display names for each sequence
249 $self->{'_symbols'} = {}; # List of symbols
251 #Contig specific slots
252 $self->{'_consensus_sequence'} = undef;
253 $self->{'_consensus_quality'} = undef;
254 $self->{'_nof_residues'} = 0;
255 $self->{'_nof_seqs'} = 0;
256 # $self->{'_nof_segments'} = 0; # Let's not make it heavier than needed by now...
258 # for cases where SF::Collection is shared between Bio::Assembly::Contig
259 if ($collection) {
260 $self->throw("Collection must implement Bio::SeqFeature::CollectionI") unless $collection->isa('Bio::SeqFeature::CollectionI');
261 $self->{'_sfc'} = $collection;
262 } else {
263 $self->{'_sfc'} = Bio::SeqFeature::Collection->new()
266 # Assembly specifics
267 $self->{'_assembly'} = undef; # Reference to a Bio::Assembly::Scaffold object, if contig belongs to one.
268 $self->{'_strand'} = 0; # Reverse (-1) or forward (1), if contig is in a scaffold. 0 otherwise
269 $self->{'_neighbor_start'} = undef; # Will hold a reference to another contig
270 $self->{'_neighbor_end'} = undef; # Will hold a reference to another contig
272 return $self; # success - we hope!
275 =head1 Assembly related methods
277 These methods exist to enable adding information about possible
278 relations among contigs, e.g. when you already have a scaffold for
279 your assembly, describing the ordering of contigs in the final
280 assembly, but no sequences covering the gaps between neighboring
281 contigs.
283 =head2 source
285 Title : source
286 Usage : $contig->source($program);
287 Function : Get/Set program used to build this contig
288 Returns : string
289 Argument : [optional] string
291 =cut
293 sub source {
294 my $self = shift;
295 my $source = shift;
297 $self->{'_source'} = $source if (defined $source);
298 return $self->{'_source'};
301 =head2 assembly
303 Title : assembly
304 Usage : $contig->assembly($assembly);
305 Function : Get/Set assembly object for this contig
306 Returns : a Bio::Assembly::Scaffold object
307 Argument : a Bio::Assembly::Scaffold object
309 =cut
311 sub assembly {
312 my $self = shift;
313 my $assembly = shift;
315 $self->throw("Using non Bio::Assembly::Scaffold object when assign contig to assembly")
316 if (defined $assembly && ! $assembly->isa("Bio::Assembly::Scaffold"));
317 # We create a circular reference to a Scaffold object. It is made weak
318 # to prevent memory leaks.
319 $self->{'_assembly'} = $assembly if (defined $assembly);
320 weaken($self->{'_assembly'});
322 return $self->{'_assembly'};
325 =head2 strand
327 Title : strand
328 Usage : $contig->strand($num);
329 Function : Get/Set contig orientation in a scaffold/assembly.
330 Its equivalent to the strand property of sequence
331 objects and sets whether the contig consensus should
332 be reversed and complemented before being added to a
333 scaffold or assembly.
334 Returns : integer
335 Argument : 1 if orientaion is forward, -1 if reverse and
336 0 if none
338 =cut
340 sub strand {
341 my $self = shift;
342 my $ori = shift;
344 if (defined $ori) {
345 $self->throw("Contig strand must be either 1, -1 or 0")
346 unless $ori == 1 || $ori == 0 || $ori == -1;
347 $self->{'_strand'} = $ori;
350 return $self->{'_strand'};
353 =head2 upstream_neighbor
355 Title : upstream_neighbor
356 Usage : $contig->upstream_neighbor($contig);
357 Function : Get/Set a contig neighbor for the current contig when
358 building a scaffold. The upstream neighbor is
359 located before $contig first base
360 Returns : nothing
361 Argument : Bio::Assembly::Contig
363 =cut
365 sub upstream_neighbor {
366 my $self = shift;
367 my $ref = shift;
369 $self->throw("Trying to assign a non Bio::Assembly::Contig object to upstream contig")
370 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
372 $self->{'_neighbor_start'} = $ref if (defined $ref);
373 return $self->{'_neighbor_start'};
376 =head2 downstream_neighbor
378 Title : downstream_neighbor
379 Usage : $contig->downstream_neighbor($num);
380 Function : Get/Set a contig neighbor for the current contig when
381 building a scaffold. The downstream neighbor is
382 located after $contig last base
383 Returns : nothing
384 Argument : Bio::Assembly::Contig
386 =cut
388 sub downstream_neighbor {
389 my $self = shift;
390 my $ref = shift;
392 $self->throw("Trying to assign a non Bio::Assembly::Contig object to downstream contig")
393 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
394 $self->{'_neighbor_end'} = $ref if (defined $ref);
395 return $self->{'_neighbor_end'};
398 =head1 Contig feature collection methods
400 =head2 add_features
402 Title : add_features
403 Usage : $contig->add_features($feat,$flag)
404 Function :
406 Add an array of features to the contig feature
407 collection. The consensus sequence may be attached to the
408 added feature, if $flag is set to 1. If $flag is 0 and
409 the feature attached to one of the contig aligned
410 sequences, the feature is registered as an aligned
411 sequence feature. If $flag is 0 and the feature is not
412 attched to any sequence in the contig, the feature is
413 simply added to the feature collection and no attachment
414 or registration is made.
416 Note: You must attach aligned sequences to their features
417 prior to calling add_features, otherwise you won't be
418 able to access the feature through get_seq_feat_by_tag()
419 method.
421 Returns : number of features added.
422 Argument :
423 $feat : A reference to an array of Bio::SeqFeatureI
424 $flag : boolean - true if consensus sequence object
425 should be attached to this feature, false if
426 no consensus attachment should be made.
427 Default: false.
429 =cut
431 sub add_features {
432 my ($self, $args, $flag) = @_;
434 # Adding shortcuts for aligned sequence features
435 $flag = 0 unless (defined $flag);
436 if ($flag && defined $self->{'_consensus_sequence'}) {
437 foreach my $feat (@$args) {
438 next if (defined $feat->seq);
439 $feat->attach_seq($self->{'_consensus_sequence'});
441 } elsif (!$flag) { # Register aligned sequence features
442 foreach my $feat (@$args) {
443 if (my $seq = $feat->entire_seq()) {
444 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
445 $self->warn("Adding contig feature attached to unknown sequence $seqID!")
446 unless (exists $self->{'_elem'}{$seqID});
447 my $tag = $feat->primary_tag;
448 $self->{'_elem'}{$seqID}{'_feat'}{$tag} = $feat;
453 # Add feature to feature collection
454 my $nof_added = $self->{'_sfc'}->add_features($args);
456 return $nof_added;
459 =head2 remove_features
461 Title : remove_features
462 Usage : $contig->remove_features(@feat)
463 Function : Remove an array of contig features
464 Returns : number of features removed.
465 Argument : An array of Bio::SeqFeatureI
467 =cut
469 sub remove_features {
470 my ($self, @args) = @_;
472 # Removing shortcuts for aligned sequence features
473 foreach my $feat (@args) {
474 if (my $seq = $feat->entire_seq()) {
475 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
476 my $tag = $feat->primary_tag;
477 $tag =~ s/:$seqID$/$1/g;
478 delete( $self->{'_elem'}{$seqID}{'_feat'}{$tag} )
479 if (exists $self->{'_elem'}{$seqID}{'_feat'}{$tag} &&
480 $self->{'_elem'}{$seqID}{'_feat'}{$tag} eq $feat);
484 # Removing Bio::SeqFeature::Collection features
485 return $self->{'_sfc'}->remove_features(\@args);
488 =head2 get_features_collection
490 Title : get_features_collection
491 Usage : $contig->get_features_collection()
492 Function : Get the collection of all contig features
493 Returns : Bio::SeqFeature::Collection
494 Argument : none
496 =cut
498 sub get_features_collection {
499 my $self = shift;
500 return $self->{'_sfc'};
503 =head2 remove_features_collection
505 Title : remove_features_collection
506 Usage : $contig->remove_features_collection()
507 Function : Remove the collection of all contig features. It is useful
508 to save some memory (when contig features are not needed).
509 Returns : none
510 Argument : none
512 =cut
514 sub remove_features_collection {
515 my $self = shift;
516 # Removing shortcuts for aligned sequence features
517 for my $seqID (keys %{$self->{'_elem'}}) {
518 delete $self->{'_elem'}{$seqID};
520 # Removing Bio::SeqFeature::Collection features
521 $self->{'_sfc'} = {};
522 return;
525 =head1 Coordinate system's related methods
527 See L<Coordinate_Systems> above.
529 =head2 change_coord
531 Title : change_coord
532 Usage : $contig->change_coord($in,$out,$query)
533 Function :
535 Change coordinate system for $query. This method
536 transforms locations between coordinate systems described
537 in section "Coordinate Systems" of this document.
539 Note: this method will throw an exception when changing
540 coordinates between "ungapped consensus" and other
541 systems if consensus sequence was not set. It will also
542 throw exceptions when changing coordinates among aligned
543 sequence, either with or without gaps, and other systems
544 if sequence locations were not set with set_seq_coord().
546 Returns : integer
547 Argument :
548 $in : [string] input coordinate system
549 $out : [string] output coordinate system
550 $query : [integer] a position in a sequence
552 =cut
554 sub change_coord {
555 my $self = shift;
556 my $type_in = shift;
557 my $type_out = shift;
558 my $query = shift;
560 # Parsing arguments
561 # Loading read objects (these calls will throw exceptions whether $read_in or
562 # $read_out is not found
563 my ($read_in,$read_out) = (undef,undef);
564 my $in_ID = ( split(' ',$type_in) )[1];
565 my $out_ID = ( split(' ',$type_out) )[1];
567 if ($in_ID ne 'consensus') {
568 $read_in = $self->get_seq_coord( $self->get_seq_by_name($in_ID) );
569 $self->throw("Can't change coordinates without sequence location for $in_ID")
570 unless (defined $read_in);
572 if ($out_ID ne 'consensus') {
573 $read_out = $self->get_seq_coord( $self->get_seq_by_name($out_ID) );
574 $self->throw("Can't change coordinates without sequence location for $out_ID")
575 unless (defined $read_out);
578 # Performing transformation between coordinates
579 SWITCH1: {
581 # Transformations between contig padded and contig unpadded
582 (($type_in eq 'gapped consensus') && ($type_out eq 'ungapped consensus')) && do {
583 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
584 unless (defined $self->{'_consensus_sequence'});
585 $query = &_padded_unpadded($self->{'_consensus_gaps'}, $query);
586 last SWITCH1;
588 (($type_in eq 'ungapped consensus') && ($type_out eq 'gapped consensus')) && do {
589 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
590 unless (defined $self->{'_consensus_sequence'});
591 $query = &_unpadded_padded($self->{'_consensus_gaps'},$query);
592 last SWITCH1;
595 # Transformations between contig (padded) and read (padded)
596 (($type_in eq 'gapped consensus') &&
597 ($type_out =~ /^aligned /) && defined($read_out)) && do {
598 $query = $query - $read_out->start() + 1;
599 last SWITCH1;
601 (($type_in =~ /^aligned /) && defined($read_in) &&
602 ($type_out eq 'gapped consensus')) && do {
603 $query = $query + $read_in->start() - 1;
604 last SWITCH1;
607 # Transformations between contig (unpadded) and read (padded)
608 (($type_in eq 'ungapped consensus') &&
609 ($type_out =~ /^aligned /) && defined($read_out)) && do {
610 $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
611 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
612 last SWITCH1;
614 (($type_in =~ /^aligned /) && defined($read_in) &&
615 ($type_out eq 'ungapped consensus')) && do {
616 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
617 $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
618 last SWITCH1;
621 # Transformations between seq $read_in padded and seq $read_out padded
622 (defined($read_in) && ($type_in =~ /^aligned /) &&
623 defined($read_out) && ($type_out =~ /^aligned /)) && do {
624 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
625 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
626 last SWITCH1;
629 # Transformations between seq $read_in padded and seq $read_out unpadded
630 (defined($read_in) && ($type_in =~ /^aligned /) &&
631 defined($read_out) && ($type_out =~ /^unaligned /)) && do {
632 if ($read_in ne $read_out) {
633 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
634 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
636 my $list_out = $self->{'_elem'}{$out_ID}{'_gaps'};
637 $query = &_padded_unpadded($list_out,$query);
638 # Changing read orientation if read was reverse complemented when aligned
639 if ($read_out->strand == -1) {
640 my ($length) = $read_out->length();
641 $length = $length - &_nof_gaps($list_out,$length);
642 $query = $length - $query + 1;
644 last SWITCH1;
646 (defined($read_in) && ($type_in =~ /^unaligned /) &&
647 defined($read_out) && ($type_out =~ /^aligned /)) && do {
648 my $list_in = $self->{'_elem'}{$in_ID}{'_gaps'};
649 # Changing read orientation if read was reverse complemented when aligned
650 if ($read_in->strand == -1) {
651 my ($length) = $read_in->length();
652 $length = $length - &_nof_gaps($list_in,$length);
653 $query = $length - $query + 1;
655 $query = &_unpadded_padded($list_in,$query);
656 if ($read_in ne $read_out) {
657 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
658 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
660 last SWITCH1;
663 # Transformations between seq $read_in unpadded and seq $read_out unpadded
664 (defined($read_in) && ($type_in =~ /^unaligned /) &&
665 defined($read_out) && ($type_out =~ /^unaligned /)) && do {
666 $query = $self->change_coord("unaligned $in_ID","aligned $out_ID",$query);
667 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
668 last SWITCH1;
671 # Transformations between contig (padded) and read (unpadded)
672 (($type_in eq 'gapped consensus') &&
673 ($type_out =~ /^unaligned /) && defined($read_out)) && do {
674 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
675 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
676 last SWITCH1;
678 (($type_in =~ /^unaligned /) && defined($read_in) &&
679 ($type_out eq 'gapped consensus')) && do {
680 $query = $self->change_coord("unaligned $in_ID","aligned $in_ID",$query);
681 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
682 last SWITCH1;
685 # Transformations between contig (unpadded) and read (unpadded)
686 (($type_in eq 'ungapped consensus') &&
687 ($type_out =~ /^unaligned /) && defined($read_out)) && do {
688 $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
689 $query = $self->change_coord('gapped consensus',"unaligned $out_ID",$query);
690 last SWITCH1;
692 (($type_in =~ /^unaligned /) && defined($read_in) &&
693 ($type_out eq 'ungapped consensus')) && do {
694 $query = $self->change_coord("unaligned $in_ID",'gapped consensus',$query);
695 $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
696 last SWITCH1;
699 $self->throw("Unknow coordinate system. Args: $type_in, $type_out.");
700 $query = undef; # If a coordinate systems just requested is unknown
703 return $query;
706 =head2 get_seq_coord
708 Title : get_seq_coord
709 Usage : $contig->get_seq_coord($seq);
710 Function : Get "gapped consensus" location for aligned sequence
711 Returns : Bio::SeqFeature::Generic for coordinates or undef.
712 A warning is printed if sequence coordinates were not set.
713 Argument : Bio::LocatabaleSeq object
715 =cut
717 sub get_seq_coord {
718 my ($self,$seq) = @_;
720 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
721 $self->throw("$seq is not a Bio::LocatableSeq");
723 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
725 unless (exists( $self->{'_elem'}{$seqID} )) {
726 $self->warn("No such sequence ($seqID) in contig ".$self->id);
727 return;
729 unless (exists( $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} )) {
730 # $self->warn("Chad. Location not set for sequence ($seqID) in contig ".$self->id);
731 return;
734 return $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"};
737 =head2 set_seq_coord
739 Title : set_seq_coord
740 Usage : $contig->set_seq_coord($feat,$seq);
741 Function :
743 Set "gapped consensus" location for an aligned
744 sequence. If the sequence was previously added using
745 add_seq, its coordinates are changed/set. Otherwise,
746 add_seq is called and the sequence is added to the
747 contig.
749 Returns : Bio::SeqFeature::Generic for old coordinates or undef.
750 Argument :
751 $feat : a Bio::SeqFeature::Generic object
752 representing a location for the
753 aligned sequence, in "gapped
754 consensus" coordinates.
756 Note: the original feature primary tag will
757 be lost.
759 $seq : a Bio::LocatabaleSeq object
761 =cut
763 sub set_seq_coord {
764 my ($self,$feat,$seq) = @_;
766 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
767 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
770 # Complaining about inadequate feature object
771 $self->throw("Coordinates must be a Bio::SeqFeature::Generic object!")
772 unless ( $feat->isa("Bio::SeqFeature::Generic") );
773 $self->throw("Sequence coordinates must have an end!")
774 unless (defined $feat->end);
775 $self->throw("Sequence coordinates must have a start!")
776 unless (defined $feat->start);
778 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
779 if (exists( $self->{'_elem'}{$seqID} ) &&
780 exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
781 defined( $self->{'_elem'}{$seqID}{'_seq'} ) &&
782 ($seq ne $self->{'_elem'}{$seqID}{'_seq'}) ) {
783 $self->warn("Replacing sequence $seqID\n");
784 $self->remove_seq($self->{'_elem'}{$seqID}{'_seq'});
786 $self->add_seq($seq);
788 # Remove previous coordinates, if any
789 $self->remove_features($feat);
791 # Add new Bio::Generic::SeqFeature
792 $feat->add_tag_value('contig',$self->id)
793 unless ( $feat->has_tag('contig') );
794 $feat->primary_tag("_aligned_coord:$seqID");
795 $feat->attach_seq($seq);
796 $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} = $feat;
797 $self->add_features([ $feat ]);
800 =head1 Bio::Assembly::Contig consensus methods
802 =head2 set_consensus_sequence
804 Title : set_consensus_sequence
805 Usage : $contig->set_consensus_sequence($seq)
806 Function : Set the consensus sequence object for this contig
807 Returns : consensus length
808 Argument : Bio::LocatableSeq
810 =cut
812 sub set_consensus_sequence {
813 my $self = shift;
814 my $seq = shift;
816 $self->throw("Consensus sequence must be a Bio::LocatableSeq!")
817 unless ($seq->isa("Bio::LocatableSeq"));
819 $self->{'_consensus_gaps'} = []; # Consensus Gap registry
820 $self->_register_gaps( $seq->seq, $self->{'_consensus_gaps'} );
821 $self->{'_consensus_sequence'} = $seq;
823 $seq->start(1);
824 $seq->end($seq->_ungapped_len);
826 my $con_len = $seq->length;
828 return $con_len;
831 =head2 set_consensus_quality
833 Title : set_consensus_quality
834 Usage : $contig->set_consensus_quality($qual)
835 Function : Set the quality object for consensus sequence
836 Returns : nothing
837 Argument : Bio::Seq::QualI object
839 =cut
841 sub set_consensus_quality {
842 my $self = shift;
843 my $qual = shift;
845 $self->throw("Consensus quality must be a Bio::Seq::Quality object!")
846 unless ( $qual->isa("Bio::Seq::Quality") );
848 $self->throw("Consensus quality can't be added before you set the consensus sequence!")
849 unless (defined $self->{'_consensus_sequence'});
851 $self->{'_consensus_quality'} = $qual;
854 =head2 get_consensus_length
856 Title : get_consensus_length
857 Usage : $contig->get_consensus_length()
858 Function : Get consensus sequence length
859 Returns : integer
860 Argument : none
862 =cut
864 sub get_consensus_length {
865 my $self = shift;
867 return $self->{'_consensus_sequence'}->length();
870 =head2 get_consensus_sequence
872 Title : get_consensus_sequence
873 Usage : $contig->get_consensus_sequence()
874 Function : Get a reference to the consensus sequence object
875 for this contig
876 Returns : Bio::SeqI object
877 Argument : none
879 =cut
881 sub get_consensus_sequence {
882 my ($self, @args) = @_;
884 return $self->{'_consensus_sequence'};
887 =head2 get_consensus_quality
889 Title : get_consensus_quality
890 Usage : $contig->get_consensus_quality()
891 Function : Get a reference to the consensus quality object
892 for this contig.
893 Returns : A Bio::QualI object
894 Argument : none
896 =cut
898 sub get_consensus_quality {
899 my ($self, @args) = @_;
901 return $self->{'_consensus_quality'};
904 =head1 Bio::Assembly::Contig aligned sequences methods
906 =head2 set_seq_qual
908 Title : set_seq_qual
909 Usage : $contig->set_seq_qual($seq,$qual);
910 Function : Adds quality to an aligned sequence.
911 Returns : nothing
912 Argument : a Bio::LocatableSeq object and
913 a Bio::Seq::QualI object
915 See L<Bio::LocatableSeq> for more information.
917 =cut
919 sub set_seq_qual {
920 my ($self,$seq,$qual) = @_;
922 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
923 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
925 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
927 $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
928 unless ( $qual->isa("Bio::Seq::QualI") );
929 $self->throw("Use add_seq first: aligned sequence qualities can't be added before you load the sequence!")
930 unless (exists $self->{'_elem'}{$seqID}{'_seq'});
931 $self->throw("Use set_seq_coord first: aligned sequence qualities can't be added before you add coordinates for the sequence!") unless (defined( $self->get_seq_coord($seq) ));
933 # Adding gaps to quality object
934 my $sequence = $self->{'_elem'}{$seqID}{'_seq'}->seq();
935 my $tmp = $qual->qual();
936 @{$tmp} = reverse(@{$tmp}) if ($self->get_seq_coord($seq)->strand() == -1);
937 my @quality = ();
938 my $previous = 0;
939 my $next = 0;
940 my $i = 0; my $j = 0;
941 while ($i<=$#{$tmp}) {
942 # IF base is a gap, quality is the average for neighbouring sites
943 if (substr($sequence,$j,1) eq '-') {
944 $previous = $tmp->[$i-1] unless ($i == 0);
945 if ($i < $#{$tmp}) {
946 $next = $tmp->[$i+1];
947 } else {
948 $next = 0;
950 push(@quality,int( ($previous+$next)/2 ));
951 } else {
952 push(@quality,$tmp->[$i]);
953 $i++;
955 $j++;
958 $self->{'_elem'}{$seqID}{'_qual'} = Bio::Seq::PrimaryQual->new(
959 -qual=>join(" ",@quality), -id=>$seqID );
962 =head2 get_seq_ids
964 Title : get_seq_ids
965 Usage : $contig->get_seq_ids(-start=>$start,
966 -end=>$end,
967 -type=>"gapped A0QR67B08.b");
968 Function : Get list of sequence IDs overlapping interval [$start, $end]
969 The default interval is [1,$contig->length]
970 Default coordinate system is "gapped contig"
971 Returns : An array
972 Argument : A hash with optional elements:
973 -start : consensus subsequence start
974 -end : consensus subsequence end
975 -type : the coordinate system type for $start and $end arguments
976 Coordinate system avaliable are:
977 "gapped consensus" : consensus coordinates with gaps
978 "ungapped consensus" : consensus coordinates without gaps
979 "aligned $ReadID" : read $ReadID coordinates with gaps
980 "unaligned $ReadID" : read $ReadID coordinates without gaps
983 =cut
985 sub get_seq_ids {
986 my ($self, @args) = @_;
988 my ($type,$start,$end) =
989 $self->_rearrange([qw(TYPE START END)], @args);
991 if (defined($start) && defined($end)) {
992 if (defined($type) && ($type ne 'gapped consensus')) {
993 $start = $self->change_coord($type,'gapped consensus',$start);
994 $end = $self->change_coord($type,'gapped consensus',$end);
997 my @list = grep { $_->isa("Bio::SeqFeature::Generic") &&
998 ($_->primary_tag =~ /^_aligned_coord:/) }
999 $self->{'_sfc'}->features_in_range( -start=>$start,
1000 -end=>$end,
1001 -contain=>0,
1002 -strandmatch=>'ignore' );
1003 @list = map { $_->entire_seq->id } @list;
1004 return @list;
1007 # Entire aligned sequences list
1008 return map { $self->{'_order'}{$_} } sort { $a cmp $b } keys %{ $self->{'_order'} };
1011 =head2 get_seq_feat_by_tag
1013 Title : get_seq_feat_by_tag
1014 Usage : $seq = $contig->get_seq_feat_by_tag($seq,"_aligned_coord:$seqID")
1015 Function :
1017 Get a sequence feature based on its primary_tag.
1018 When you add
1020 Returns : a Bio::SeqFeature object
1021 Argument : a Bio::LocatableSeq and a string (feature primary tag)
1023 =cut
1025 sub get_seq_feat_by_tag {
1026 my ($self,$seq,$tag) = @_;
1028 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
1029 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
1031 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
1033 return $self->{'_elem'}{$seqID}{'_feat'}{$tag};
1036 =head2 get_seq_by_name
1038 Title : get_seq_by_name
1039 Usage : $seq = $contig->get_seq_by_name('Seq1')
1040 Function : Gets a sequence based on its id.
1041 Returns : a Bio::LocatableSeq object
1042 undef if name is not found
1043 Argument : string
1045 =cut
1047 sub get_seq_by_name {
1048 my $self = shift;
1049 my ($seqID) = @_;
1051 unless (exists $self->{'_elem'}{$seqID}{'_seq'}) {
1052 $self->throw("Could not find sequence $seqID in contig ".$self->id);
1053 return;
1056 return $self->{'_elem'}{$seqID}{'_seq'};
1059 =head2 get_qual_by_name
1061 Title : get_qual_by_name
1062 Usage : $seq = $contig->get_qual_by_name('Seq1')
1063 Function :
1065 Gets Bio::Seq::QualI object for a sequence
1066 through its id ( as given by $qual->id() ).
1068 Returns : a Bio::Seq::QualI object.
1069 undef if name is not found
1070 Argument : string
1072 =cut
1074 sub get_qual_by_name {
1075 my $self = shift;
1076 my ($seqID) = @_;
1078 unless (exists $self->{'_elem'}{$seqID}{'_qual'}) {
1079 $self->warn("Could not find quality for $seqID in contig!");
1080 return;
1083 return $self->{'_elem'}{$seqID}{'_qual'};
1086 =head1 Bio::Align::AlignI compatible methods
1088 =head2 Modifier methods
1090 These methods modify the MSE by adding, removing or shuffling complete
1091 sequences.
1093 =head2 add_seq
1095 Title : add_seq
1096 Usage : $contig->add_seq($newseq);
1097 Function :
1099 Adds a sequence to the contig. *Does*
1100 *not* align it - just adds it to the
1101 hashes.
1103 Returns : nothing
1104 Argument : a Bio::LocatableSeq object
1106 See L<Bio::LocatableSeq> for more information.
1108 =cut
1110 sub add_seq {
1111 my $self = shift;
1112 my $seq = shift;
1114 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
1115 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
1118 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
1119 $self->{'_elem'}{$seqID} = {} unless (exists $self->{'_elem'}{$seqID});
1121 if (exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
1122 ($seq eq $self->{'_elem'}{$seqID}{'_seq'}) ) {
1123 $self->warn("Adding sequence $seqID, which has already been added");
1126 # Our locatable sequences are always considered to be complete sequences
1127 $seq->start(1);
1128 $seq->end($seq->_ungapped_len);
1130 $self->warn("Adding non-nucleotidic sequence ".$seqID)
1131 if (lc($seq->alphabet) ne 'dna' && lc($seq->alphabet) ne 'rna');
1133 # build the symbol list for this sequence,
1134 # will prune out the gap and missing/match chars
1135 # when actually asked for the symbol list in the
1136 # symbol_chars
1137 if (defined $seq->seq) {
1138 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1139 } else {
1140 $self->{'_symbols'} = {};
1143 my $seq_no = ++$self->{'_nof_seqs'};
1145 if (ref( $self->{'_elem'}{$seqID}{'_seq'} )) {
1146 $self->warn("Replacing one sequence [$seqID]\n");
1147 } else {
1148 #print STDERR "Assigning $seqID to $order\n";
1149 $self->{'_order'}->{$seq_no} = $seqID;
1150 # $self->{'_start_end_lists'}->{$id} = []
1151 # unless(exists $self->{'_start_end_lists'}->{$id});
1152 # push @{$self->{'_start_end_lists'}->{$id}}, $seq;
1155 $self->{'_elem'}{$seqID}{'_seq'} = $seq;
1156 $self->{'_elem'}{$seqID}{'_feat'} = {};
1157 $self->{'_elem'}{$seqID}{'_gaps'} = [];
1158 my $dbref = $self->{'_elem'}{$seqID}{'_gaps'};
1159 my $nofgaps = $self->_register_gaps($seq->seq,$dbref);
1161 # Updating residue count
1162 $self->{'_nof_residues'} += $seq->length - $nofgaps;
1164 return 1;
1167 =head2 remove_seq
1169 Title : remove_seq
1170 Usage : $contig->remove_seq($seq);
1171 Function : Removes a single sequence from a contig
1172 Returns : 1 on success, 0 otherwise
1173 Argument : a Bio::LocatableSeq object
1175 =cut
1177 sub remove_seq {
1178 my ($self,$seq) = @_;
1180 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
1181 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
1184 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
1185 unless (exists $self->{'_elem'}{$seqID} ) {
1186 $self->warn("No sequence named $seqID [$seq]");
1187 return 0;
1190 # Updating residue count
1191 $self->{'_nof_residues'} -= $seq->length() +
1192 &_nof_gaps( $self->{'_elem'}{$seqID}{'_gaps'}, $seq->length );
1194 # Update number of sequences
1195 $self->{'_nof_seqs'}--;
1197 # Update order of sequences (order starts at 1)
1198 my $max_order = $self->{'_nof_seqs'} + 1;
1199 my $target_order = $max_order + 1;
1200 for (my $order = 1 ; $order <= $max_order ; $order++) {
1201 if ($self->{'_order'}->{$order} eq $seqID) {
1202 # Found the wanted sequence order
1203 $target_order = $order;
1205 if ($order > $target_order) {
1206 # Decrement this sequence order by one order
1207 $self->{'_order'}->{$order-1} = $self->{'_order'}->{$order};
1209 if ($order == $max_order) {
1210 # Remove last order
1211 delete $self->{'_order'}->{$order};
1215 # Remove all references to features of this sequence
1216 my @feats = ();
1217 for my $tag (keys %{ $self->{'_elem'}{$seqID}{'_feat'} }) {
1218 push(@feats, $self->{'_elem'}{$seqID}{'_feat'}{$tag});
1220 $self->{'_sfc'}->remove_features(\@feats);
1221 delete $self->{'_elem'}{$seqID};
1223 return 1;
1226 =head2 purge
1228 Title : purge
1229 Usage : $contig->purge(0.7);
1230 Function:
1232 Removes sequences above whatever %id.
1234 This function will grind on large alignments. Beware!
1235 (perhaps not ideally implemented)
1237 Example :
1238 Returns : An array of the removed sequences
1239 Argument:
1242 =cut
1244 sub purge {
1245 my ($self) = @_;
1246 $self->throw_not_implemented();
1249 =head2 sort_alphabetically
1251 Title : sort_alphabetically
1252 Usage : $contig->sort_alphabetically
1253 Function :
1255 Changes the order of the alignemnt to alphabetical on name
1256 followed by numerical by number.
1258 Returns :
1259 Argument :
1261 =cut
1263 sub sort_alphabetically {
1264 my ($self) = @_;
1265 $self->throw_not_implemented();
1268 =head2 Sequence selection methods
1270 Methods returning one or more sequences objects.
1272 =head2 each_seq
1274 Title : each_seq
1275 Usage : foreach $seq ( $contig->each_seq() )
1276 Function : Gets an array of Seq objects from the alignment
1277 Returns : an array
1278 Argument :
1280 =cut
1282 sub each_seq {
1283 my ($self) = @_;
1285 my (@arr,$seqID);
1287 foreach $seqID ( map { $self->{'_order'}{$_} } sort { $a <=> $b } keys %{$self->{'_order'}} ) {
1288 push(@arr,$self->{'_elem'}{$seqID}{'_seq'});
1291 return @arr;
1294 =head2 each_alphabetically
1296 Title : each_alphabetically
1297 Usage : foreach $seq ( $contig->each_alphabetically() )
1298 Function :
1300 Returns an array of sequence object sorted alphabetically
1301 by name and then by start point.
1302 Does not change the order of the alignment
1304 Returns :
1305 Argument :
1307 =cut
1309 sub each_alphabetically {
1310 my($self) = @_;
1311 $self->throw_not_implemented();
1314 =head2 each_seq_with_id
1316 Title : each_seq_with_id
1317 Usage : foreach $seq ( $contig->each_seq_with_id() )
1318 Function :
1320 Gets an array of Seq objects from the
1321 alignment, the contents being those sequences
1322 with the given name (there may be more than one)
1324 Returns : an array
1325 Argument : a seq name
1327 =cut
1329 sub each_seq_with_id {
1330 my ($self) = @_;
1331 $self->throw_not_implemented();
1334 =head2 get_seq_by_pos
1336 Title : get_seq_by_pos
1337 Usage : $seq = $contig->get_seq_by_pos(3)
1338 Function :
1340 Gets a sequence based on its position in the alignment.
1341 Numbering starts from 1. Sequence positions larger than
1342 no_sequences() will thow an error.
1344 Returns : a Bio::LocatableSeq object
1345 Argument : positive integer for the sequence osition
1347 =cut
1349 sub get_seq_by_pos {
1350 my $self = shift;
1351 my ($pos) = @_;
1353 $self->throw("Sequence position has to be a positive integer, not [$pos]")
1354 unless $pos =~ /^\d+$/ and $pos > 0;
1355 $self->throw("No sequence at position [$pos]")
1356 unless $pos <= $self->no_sequences ;
1358 my $seqID = $self->{'_order'}->{--$pos};
1359 return $self->{'_elem'}{$seqID}{'_seq'};
1362 =head2 Create new alignments
1364 The result of these methods are horizontal or vertical subsets of the
1365 current MSE.
1367 =head2 select
1369 Title : select
1370 Usage : $contig2 = $contig->select(1, 3) # three first sequences
1371 Function :
1373 Creates a new alignment from a continuous subset of
1374 sequences. Numbering starts from 1. Sequence positions
1375 larger than no_sequences() will thow an error.
1377 Returns : a Bio::Assembly::Contig object
1378 Argument : positive integer for the first sequence
1379 positive integer for the last sequence to include (optional)
1381 =cut
1383 sub select {
1384 my ($self) = @_;
1385 $self->throw_not_implemented();
1389 =head2 select_noncont
1391 Title : select_noncont
1392 Usage : $contig2 = $contig->select_noncont(1, 3) # first and 3rd sequences
1393 Function :
1395 Creates a new alignment from a subset of
1396 sequences. Numbering starts from 1. Sequence positions
1397 larger than no_sequences() will thow an error.
1399 Returns : a Bio::Assembly::Contig object
1400 Args : array of integers for the sequences
1402 =cut
1404 sub select_noncont {
1405 my ($self) = @_;
1406 $self->throw_not_implemented();
1409 =head2 slice
1411 Title : slice
1412 Usage : $contig2 = $contig->slice(20, 30)
1413 Function :
1415 Creates a slice from the alignment inclusive of start and
1416 end columns. Sequences with no residues in the slice are
1417 excluded from the new alignment and a warning is printed.
1418 Slice beyond the length of the sequence does not do
1419 padding.
1421 Returns : a Bio::Assembly::Contig object
1422 Argument : positive integer for start column
1423 positive integer for end column
1425 =cut
1427 sub slice {
1428 my ($self) = @_;
1429 $self->throw_not_implemented();
1432 =head2 Change sequences within the MSE
1434 These methods affect characters in all sequences without changeing the
1435 alignment.
1438 =head2 map_chars
1440 Title : map_chars
1441 Usage : $contig->map_chars('\.','-')
1442 Function :
1444 Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1445 characters
1447 Notice that the from (arg1) is interpretted as a regex,
1448 so be careful about quoting meta characters (eg
1449 $contig->map_chars('.','-') wont do what you want)
1451 Returns :
1452 Argument : 'from' rexexp
1453 'to' string
1455 =cut
1457 sub map_chars {
1458 my ($self) = @_;
1459 $self->throw_not_implemented();
1462 =head2 uppercase
1464 Title : uppercase()
1465 Usage : $contig->uppercase()
1466 Function : Sets all the sequences to uppercase
1467 Returns :
1468 Argument :
1470 =cut
1472 sub uppercase {
1473 my ($self) = @_;
1474 $self->throw_not_implemented();
1477 =head2 match_line
1479 Title : match_line()
1480 Usage : $contig->match_line()
1481 Function : Generates a match line - much like consensus string
1482 except that a line indicating the '*' for a match.
1483 Argument : (optional) Match line characters ('*' by default)
1484 (optional) Strong match char (':' by default)
1485 (optional) Weak match char ('.' by default)
1487 =cut
1489 sub match_line {
1490 my ($self) = @_;
1491 $self->throw_not_implemented();
1494 =head2 match
1496 Title : match()
1497 Usage : $contig->match()
1498 Function :
1500 Goes through all columns and changes residues that are
1501 identical to residue in first sequence to match '.'
1502 character. Sets match_char.
1504 USE WITH CARE: Most MSE formats do not support match
1505 characters in sequences, so this is mostly for output
1506 only. NEXUS format (Bio::AlignIO::nexus) can handle
1509 Returns : 1
1510 Argument : a match character, optional, defaults to '.'
1512 =cut
1514 sub match {
1515 my ($self) = @_;
1516 $self->throw_not_implemented();
1519 =head2 unmatch
1521 Title : unmatch()
1522 Usage : $contig->unmatch()
1523 Function :
1525 Undoes the effect of method match. Unsets match_char.
1527 Returns : 1
1528 Argument : a match character, optional, defaults to '.'
1530 =cut
1532 sub unmatch {
1533 my ($self) = @_;
1534 $self->throw_not_implemented();
1538 =head2 MSE attibutes
1540 Methods for setting and reading the MSE attributes.
1542 Note that the methods defining character semantics depend on the user
1543 to set them sensibly. They are needed only by certain input/output
1544 methods. Unset them by setting to an empty string ('').
1546 =head2 id
1548 Title : id
1549 Usage : $contig->id("Ig")
1550 Function : Gets/sets the id field of the alignment
1551 Returns : An id string
1552 Argument : An id string (optional)
1554 =cut
1556 sub id {
1557 my ($self, $contig_name) = @_;
1559 if (defined( $contig_name )) {
1560 $self->{'_id'} = $contig_name;
1563 return $self->{'_id'};
1566 =head2 missing_char
1568 Title : missing_char
1569 Usage : $contig->missing_char("?")
1570 Function : Gets/sets the missing_char attribute of the alignment
1571 It is generally recommended to set it to 'n' or 'N'
1572 for nucleotides and to 'X' for protein.
1573 Returns : An missing_char string,
1574 Argument : An missing_char string (optional)
1576 =cut
1578 sub missing_char {
1579 my ($self) = @_;
1580 $self->throw_not_implemented();
1583 =head2 match_char
1585 Title : match_char
1586 Usage : $contig->match_char('.')
1587 Function : Gets/sets the match_char attribute of the alignment
1588 Returns : An match_char string,
1589 Argument : An match_char string (optional)
1591 =cut
1593 sub match_char {
1594 my ($self) = @_;
1595 $self->throw_not_implemented();
1598 =head2 gap_char
1600 Title : gap_char
1601 Usage : $contig->gap_char('-')
1602 Function : Gets/sets the gap_char attribute of the alignment
1603 Returns : An gap_char string, defaults to '-'
1604 Argument : An gap_char string (optional)
1606 =cut
1608 sub gap_char {
1609 my ($self) = @_;
1610 $self->throw_not_implemented();
1613 =head2 symbol_chars
1615 Title : symbol_chars
1616 Usage : my @symbolchars = $contig->symbol_chars;
1617 Function: Returns all the seen symbols (other than gaps)
1618 Returns : array of characters that are the seen symbols
1619 Argument: boolean to include the gap/missing/match characters
1621 =cut
1623 sub symbol_chars{
1624 my ($self) = @_;
1625 $self->throw_not_implemented();
1628 =head2 Alignment descriptors
1630 These read only methods describe the MSE in various ways.
1633 =head2 consensus_string
1635 Title : consensus_string
1636 Usage : $str = $contig->consensus_string($threshold_percent)
1637 Function : Makes a strict consensus
1638 Returns :
1639 Argument : Optional treshold ranging from 0 to 100.
1640 The consensus residue has to appear at least threshold %
1641 of the sequences at a given location, otherwise a '?'
1642 character will be placed at that location.
1643 (Default value = 0%)
1645 =cut
1647 sub consensus_string {
1648 my ($self) = @_;
1649 $self->throw_not_implemented();
1652 =head2 consensus_iupac
1654 Title : consensus_iupac
1655 Usage : $str = $contig->consensus_iupac()
1656 Function :
1658 Makes a consensus using IUPAC ambiguity codes from DNA
1659 and RNA. The output is in upper case except when gaps in
1660 a column force output to be in lower case.
1662 Note that if your alignment sequences contain a lot of
1663 IUPAC ambiquity codes you often have to manually set
1664 alphabet. Bio::PrimarySeq::_guess_type thinks they
1665 indicate a protein sequence.
1667 Returns : consensus string
1668 Argument : none
1669 Throws : on protein sequences
1672 =cut
1674 sub consensus_iupac {
1675 my ($self) = @_;
1676 $self->throw_not_implemented();
1679 =head2 is_flush
1681 Title : is_flush
1682 Usage : if( $contig->is_flush() )
1685 Function : Tells you whether the alignment
1686 : is flush, ie all of the same length
1689 Returns : 1 or 0
1690 Argument :
1692 =cut
1694 sub is_flush {
1695 my ($self) = @_;
1696 $self->throw_not_implemented();
1699 =head2 length
1701 Title : length()
1702 Usage : $len = $contig->length()
1703 Function : Returns the maximum length of the alignment.
1704 To be sure the alignment is a block, use is_flush
1705 Returns :
1706 Argument :
1708 =cut
1710 sub length {
1711 my ($self) = @_;
1713 $self->throw_not_implemented();
1716 =head2 maxdname_length
1718 Title : maxname_length
1719 Usage : $contig->maxname_length()
1720 Function :
1722 Gets the maximum length of the displayname in the
1723 alignment. Used in writing out various MSE formats.
1725 Returns : integer
1726 Argument :
1728 =cut
1730 sub maxname_length {
1731 my ($self) = @_;
1732 $self->throw_not_implemented();
1735 =head2 no_residues
1737 Title : no_residues
1738 Usage : $no = $contig->no_residues
1739 Function : number of residues in total in the alignment
1740 Returns : integer
1741 Argument :
1743 =cut
1745 sub no_residues {
1746 my ($self) = @_;
1748 return $self->{'_nof_residues'};
1751 =head2 no_sequences
1753 Title : no_sequences
1754 Usage : $depth = $contig->no_sequences
1755 Function : number of sequence in the sequence alignment
1756 Returns : integer
1757 Argument : None
1759 =cut
1761 sub no_sequences {
1762 my ($self) = @_;
1764 return scalar( keys %{ $self->{'_elem'} } );
1767 =head2 percentage_identity
1769 Title : percentage_identity
1770 Usage : $id = $contig->percentage_identity
1771 Function: The function calculates the percentage identity of the alignment
1772 Returns : The percentage identity of the alignment (as defined by the
1773 implementation)
1774 Argument: None
1776 =cut
1778 sub percentage_identity{
1779 my ($self) = @_;
1780 $self->throw_not_implemented();
1783 =head2 overall_percentage_identity
1785 Title : percentage_identity
1786 Usage : $id = $contig->percentage_identity
1787 Function: The function calculates the percentage identity of
1788 the conserved columns
1789 Returns : The percentage identity of the conserved columns
1790 Args : None
1792 =cut
1794 sub overall_percentage_identity{
1795 my ($self) = @_;
1796 $self->throw_not_implemented();
1800 =head2 average_percentage_identity
1802 Title : average_percentage_identity
1803 Usage : $id = $contig->average_percentage_identity
1804 Function: The function uses a fast method to calculate the average
1805 percentage identity of the alignment
1806 Returns : The average percentage identity of the alignment
1807 Args : None
1809 =cut
1811 sub average_percentage_identity {
1812 my ($self) = @_;
1813 $self->throw_not_implemented();
1816 =head2 Alignment positions
1818 Methods to map a sequence position into an alignment column and back.
1819 column_from_residue_number() does the former. The latter is really a
1820 property of the sequence object and can done using
1821 L<Bio::LocatableSeq::location_from_column>:
1823 # select somehow a sequence from the alignment, e.g.
1824 my $seq = $contig->get_seq_by_pos(1);
1825 #$loc is undef or Bio::LocationI object
1826 my $loc = $seq->location_from_column(5);
1829 =head2 column_from_residue_number
1831 Title : column_from_residue_number
1832 Usage : $col = $contig->column_from_residue_number( $seqname, $resnumber)
1833 Function:
1835 This function gives the position in the alignment
1836 (i.e. column number) of the given residue number in the
1837 sequence with the given name. For example, for the
1838 alignment
1840 Seq1/91-97 AC..DEF.GH
1841 Seq2/24-30 ACGG.RTY..
1842 Seq3/43-51 AC.DDEFGHI
1844 column_from_residue_number( "Seq1", 94 ) returns 5.
1845 column_from_residue_number( "Seq2", 25 ) returns 2.
1846 column_from_residue_number( "Seq3", 50 ) returns 9.
1848 An exception is thrown if the residue number would lie
1849 outside the length of the aligment
1850 (e.g. column_from_residue_number( "Seq2", 22 )
1852 Note: If the the parent sequence is represented by more than
1853 one alignment sequence and the residue number is present in
1854 them, this method finds only the first one.
1856 Returns : A column number for the position in the alignment of the
1857 given residue in the given sequence (1 = first column)
1858 Args : A sequence id/name (not a name/start-end)
1859 A residue number in the whole sequence (not just that
1860 segment of it in the alignment)
1862 =cut
1864 sub column_from_residue_number {
1865 my ($self) = @_;
1866 $self->throw_not_implemented();
1869 =head2 Sequence names
1871 Methods to manipulate the display name. The default name based on the
1872 sequence id and subsequence positions can be overridden in various
1873 ways.
1875 =head2 displayname
1877 Title : displayname
1878 Usage : $contig->displayname("Ig", "IgA")
1879 Function : Gets/sets the display name of a sequence in the alignment
1881 Returns : A display name string
1882 Argument : name of the sequence
1883 displayname of the sequence (optional)
1885 =cut
1887 sub displayname { # Do nothing
1890 =head2 set_displayname_count
1892 Title : set_displayname_count
1893 Usage : $contig->set_displayname_count
1894 Function :
1896 Sets the names to be name_# where # is the number of
1897 times this name has been used.
1899 Returns : None
1900 Argument : None
1902 =cut
1904 sub set_displayname_count {
1905 my ($self) = @_;
1906 $self->throw_not_implemented();
1909 =head2 set_displayname_flat
1911 Title : set_displayname_flat
1912 Usage : $contig->set_displayname_flat()
1913 Function : Makes all the sequences be displayed as just their name,
1914 not name/start-end
1915 Returns : 1
1916 Argument : None
1918 =cut
1920 sub set_displayname_flat { # Do nothing!
1923 =head2 set_displayname_normal
1925 Title : set_displayname_normal
1926 Usage : $contig->set_displayname_normal()
1927 Function : Makes all the sequences be displayed as name/start-end
1928 Returns : None
1929 Argument : None
1931 =cut
1933 sub set_displayname_normal { # Do nothing!
1936 =head1 Internal Methods
1938 =head2 _binary_search
1940 Title : _binary_search
1941 Usage : _binary_search($list,$query)
1942 Function :
1944 Find a number in a sorted list of numbers. Return values
1945 may be on or two integers. One positive integer or zero
1946 (>=0) is the index of the element that stores the queried
1947 value. Two positive integers (or zero and another
1948 number) are the indexes of elements among which the
1949 queried value should be placed. Negative single values
1950 mean:
1952 -1: $query is smaller than smallest element in list
1953 -2: $query is greater than greatest element in list
1955 Returns : array of integers
1956 Argument :
1957 $list : array reference
1958 $query : integer
1960 =cut
1962 sub _binary_search {
1963 my $list = shift;
1964 my $query = shift;
1966 # If there is only one element in list
1967 if (!$#{$list} && ($query == $list->[0])) { return (0) }
1968 # If there are others...
1969 my $start = 0;
1970 my $end = $#{$list};
1971 (&_compare($query,$list->[$start]) == 0) && do { return ($start) };
1972 (&_compare($query,$list->[$end]) == 0) && do { return ($end) };
1973 (&_compare($query,$list->[$start]) < 0) && do { return (-1) };
1974 (&_compare($query,$list->[$end]) > 0) && do { return (-2) };
1975 my $middle = 0;
1976 while ($end - $start > 1) {
1977 $middle = int(($end+$middle)/2);
1978 (&_compare($query,$list->[$middle]) == 0) && return ($middle);
1979 (&_compare($query,$list->[$middle]) < 0) && do { $end = $middle ; $middle = 0; next };
1980 $start = $middle; # If &_compare() > 0, move region beggining
1982 return ($start,$end);
1985 =head2 _compare
1987 Title : _compare
1988 Usage : _compare($arg1,$arg2)
1989 Function: Perform numeric or string comparisons
1990 Returns : integer (0, 1 or -1)
1991 Args : values to be compared
1993 =cut
1995 sub _compare {
1996 my $arg1 = shift;
1997 my $arg2 = shift;
1999 if (($arg1 =~ /^\d+$/) && ($arg2 =~ /^\d+$/)) { return $arg1 <=> $arg2 }
2000 else { return $arg1 cmp $arg2 }
2003 =head2 _nof_gaps
2005 Title : _nof_gaps
2006 Usage : _nof_gaps($array_ref, $query)
2007 Function: number of gaps found before position $query
2008 Returns : integer
2009 Args :
2010 $array_ref : gap registry reference
2011 $query : [integer] a position in a sequence
2013 =cut
2015 #' emacs...
2016 sub _nof_gaps {
2017 my $list = shift;
2018 my $query = shift;
2019 # If there are no gaps in this contig
2020 return 0 unless (defined($list) && scalar(@{$list}));
2021 # Locate query index in gap list (if any)
2022 my @index = &_binary_search($list,$query);
2023 # If after all alignments, correct using total number of align
2024 if ($index[0] == -2) { $query = scalar(@{$list}) }
2025 # If before any alignment, return 0
2026 elsif ($index[0] == -1) { $query = 0 }
2027 elsif ($index[0] >= 0) {
2028 # If query is between alignments, translate coordinates
2029 if ($#index > 0) { $query = $index[0] + 1 }
2030 # If query sits upon an alignment, do another correction
2031 elsif ($#index == 0) { $query = $index[0] }
2034 return $query;
2037 =head2 _padded_unpadded
2039 Title : _padded_unpadded
2040 Usage : _padded_unpadded($array_ref, $query)
2041 Function:
2043 Returns a coordinate corresponding to
2044 position $query after gaps were
2045 removed from a sequence.
2047 Returns : integer
2048 Args :
2049 $array_ref : reference to this gap registry
2050 $query : [integer] coordionate to change
2052 =cut
2054 sub _padded_unpadded {
2055 my $list = shift;
2056 my $query = shift;
2058 my $align = &_nof_gaps($list,$query);
2059 $query-- if (defined($list->[$align]) && ($list->[$align] == $query));
2060 $query = $query - $align;
2062 return $query;
2065 =head2 _unpadded_padded
2067 Title : _unpadded_padded
2068 Usage : _unpadded_padded($array_ref, $query)
2069 Function:
2071 Returns the value corresponding to
2072 ungapped position $query when gaps are
2073 counted as valid sites in a sequence
2075 Returns :
2076 Args : $array_ref = a reference to this sequence's gap registry
2077 $query = [integer] location to change
2079 =cut
2082 sub _unpadded_padded {
2083 my $list = shift;
2084 my $query = shift;
2086 my $align = &_nof_gaps($list,$query);
2087 $query = $query + $align;
2088 my $new_align = &_nof_gaps($list,$query);
2089 while ($new_align - $align > 0) {
2090 $query = $query + $new_align - $align;
2091 $align = $new_align;
2092 $new_align = &_nof_gaps($list,$query);
2094 # If current position is also a align, look for the first upstream base
2095 while (defined($list->[$align]) && ($list->[$align] == $query)) {
2096 $query++; $align++;
2099 return $query;
2102 =head2 _register_gaps
2104 Title : _register_gaps
2105 Usage : $self->_register_gaps($seq, $array_ref)
2106 Function: stores gap locations for a sequence
2107 Returns : number of gaps found
2108 Args :
2109 $seq : sequence string
2110 $array_ref : a reference to an array,
2111 where gap locations will
2112 be stored
2114 =cut
2116 sub _register_gaps {
2117 my $self = shift;
2118 my $sequence = shift;
2119 my $dbref = shift;
2121 $self->throw("Not an aligned sequence string to register gaps")
2122 if (ref($sequence));
2124 $self->throw("Not an array reference for gap registry")
2125 unless (ref($dbref) eq 'ARRAY');
2127 # Registering alignments
2128 @{$dbref} = (); # Cleaning registry
2129 if (defined $sequence) {
2130 my $i = -1;
2131 while(1) {
2132 $i = index($sequence,"-",$i+1);
2133 last if ($i == -1);
2134 push(@{$dbref},$i+1);
2136 } else {
2137 # $self->warn("Found undefined sequence while registering gaps");
2138 return 0;
2141 return scalar(@{$dbref});