2 # BioPerl module for Bio::Assembly::Contig
3 # Mostly based on Bio::SimpleAlign by Ewan Birney
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Robson Francisco de Souza <rfsouza@citri.iq.usp.br>
9 # Copyright Robson Francisco de Souza
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Assembly::Contig - Perl module to hold and manipulate
18 sequence assembly contigs.
23 use Bio::Assembly::IO;
25 # Assembly loading methods
26 $aio = Bio::Assembly::IO->new(-file=>"test.ace.1",
29 $assembly = $aio->next_assembly;
30 foreach $contig ($assembly->all_contigs) {
34 # OR, if you want to build the contig yourself,
36 use Bio::Assembly::Contig;
37 $c = Bio::Assembly::Contig->new(-id=>"1");
39 $ls = Bio::LocatableSeq->new(-seq=>"ACCG-T",
42 $ls2 = Bio::LocatableSeq->new(-seq=>"ACA-CG-T",
46 $ls_coord = Bio::SeqFeature::Generic->new(-start=>3,
49 $ls2_coord = Bio::SeqFeature::Generic->new(-start=>1,
54 $c->set_seq_coord($ls_coord,$ls);
55 $c->set_seq_coord($ls2_coord,$ls2);
57 $con = Bio::LocatableSeq->new(-seq=>"ACACCG-T",
59 $c->set_consensus_sequence($con);
61 $l = $c->change_coord('unaligned r2','ungapped consensus',6);
62 print "6 in unaligned r2 => $l in ungapped consensus\n";
67 A contig is as a set of sequences, locally aligned to each other, so
68 that every sequence has overlapping regions with at least one sequence
69 in the contig, such that a continuous of overlapping sequences is
70 formed, allowing the deduction of a consensus sequence which may be
71 longer than any of the sequences from which it was deduced.
73 In this documentation we refer to the overlapping sequences used to
74 build the contig as "aligned sequences" and to the sequence deduced
75 from the overlap of aligned sequences as the "consensus". Methods to
76 deduce the consensus sequence from aligned sequences were not yet
77 implemented in this module, but its posssible to add a consensus
78 sequence deduced by other means, e.g, by the assembly program used to
81 All aligned sequences in a Bio::Assembly::Contig must be Bio::Assembly::Locatable
82 objects and have a unique ID. The unique ID restriction is due to the
83 nature of the module's internal data structures and is also a request
84 of some assembly programs. If two sequences with the same ID are added
85 to a contig, the first sequence added is replaced by the second one.
87 =head2 Coordinate_systems
89 There are four base coordinate systems in Bio::Assembly::Contig. When
90 you need to access contig elements or data that exists on a certain
91 range or location, you may be specifying coordinates in relation to
92 different sequences, which may be either the contig consensus or one
93 of the aligned sequences that were used to do the assembly.
95 =========================================================
96 Name | Referenced sequence
97 ---------------------------------------------------------
98 "gapped consensus" | Contig (with gaps)
99 "ungapped consensus" | Contig (without gaps)
100 "aligned $seqID" | sequence $seqID (with gaps)
101 "unaligned $seqID" | sequence $seqID (without gaps)
102 =========================================================
104 "gapped consensus" refers to positions in the aligned consensus
105 sequence, which is the consensus sequence including the gaps inserted
106 to align it agains the aligned sequences that were used to assemble
107 the contig. So, its limits are [ 1, (consensus length + number of gaps
110 "ungapped consensus" is a coordinate system based on the consensus
111 sequence, but excluding consensus gaps. This is just the coordinate
112 system that you have when considering the consensus sequence alone,
113 instead of aligned to other sequences.
115 "aligned $seqID" refers to locations in the sequence $seqID after
116 alignment of $seqID against the consensus sequence (reverse
117 complementing the original sequence, if needed). Coordinate 1 in
118 "aligned $seqID" is equivalent to the start location (first base) of
119 $seqID in the consensus sequence, just like if the aligned sequence
120 $seqID was a feature of the consensus sequence.
122 "unaligned $seqID" is equivalent to a location in the isolated
123 sequence, just like you would have when considering the sequence
124 alone, out of an alignment. When changing coordinates from "aligned
125 $seq2" to "unaligned $seq2", if $seq2 was reverse complemented when
126 included in the alignment, the output coordinates will be reversed to
127 fit that fact, i.e. 1 will be changed to length($seq2), 2 will be
128 length($seq)-1 and so on.
130 An important note: when you change gap coordinates from a gapped
131 system ("gapped consensus" or "aligned $seqID") to a system that does
132 not include gaps ("ungapped consensus" or "unaligned $seqID"), the
133 position returned will be the first location before all gaps
134 neighboring the input location.
136 =head2 Feature_collection
138 Bio::Assembly::Contig stores much information about a contig in a
139 Bio::Assembly::SeqFeature::Collection object. Relevant information on the
140 alignment is accessed by selecting features based on their primary
141 tags (e.g. all features which have a primary tag of the form
142 '_aligned_coord:$seqID', where $seqID is an aligned sequence ID, are
143 coordinates for sequences in the contig alignment) and, by using
144 methods from Bio::Assembly::SeqFeature::Collection, it's possible to select
145 features by overlap with other features.
147 We suggest that you use the primary tags of features as identifiers
148 for feature classes. By convention, features with primary tags
149 starting with a '_' are generated by modules that populate the contig
150 data structure and return the contig object, maybe as part of an
151 assembly object, e.g. drivers from the Bio::Assembly::IO set.
153 Features in the features collection may be associated with particular
154 aligned sequences. To obtain this, you must attach the sequence to the
155 feature, using attach() seq from Bio::Assembly::SeqFeatureI, before you add the
156 feature to the feature collection. We also suggest to add the sequence
157 id to the primary tag, so that is easy to select feature for a
160 There is only one feature class that some methods in
161 Bio::Assembly::Contig expect to find in the feature collection: features
162 with primary tags of the form '_aligned_coord:$seqID', where $seqID is
163 the aligned sequence id (like returned by $seq-E<gt>id()). These features
164 describe the position (in "gapped consensus" coordinates) of aligned
165 sequences, and the method set_seq_coord() automatically changes a
166 feature's primary tag to this form whenever the feature is added to
167 the collection by this method. Only two methods in Bio::Assembly::Contig
168 will not work unless there are features from this class:
169 change_coord() and get_seq_coord().
171 Other feature classes will be automatically available only when
172 Bio::Assembly::Contig objects are created by a specific module. Such
173 feature classes are (or should be) documented in the documentation of
174 the module which create them, to which the user should refer.
180 User feedback is an integral part of the evolution of this and other
181 Bioperl modules. Send your comments and suggestions preferably to the
182 Bioperl mailing lists Your participation is much appreciated.
184 bioperl-l@bioperl.org - General discussion
185 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
189 Please direct usage questions or support issues to the mailing list:
191 I<bioperl-l@bioperl.org>
193 rather than to the module maintainer directly. Many experienced and
194 reponsive experts will be able look at the problem and quickly
195 address it. Please include a thorough description of the problem
196 with code and data examples if at all possible.
198 =head2 Reporting Bugs
200 Report bugs to the Bioperl bug tracking system to help us keep track
201 the bugs and their resolution. Bug reports can be submitted via the
204 https://github.com/bioperl/bioperl-live/issues
206 =head1 AUTHOR - Robson Francisco de Souza
208 rfsouza@citri.iq.usp.br
212 The rest of the documentation details each of the object
213 methods. Internal methods are usually preceded with a _
218 package Bio
::Assembly
::Contig
;
222 use Bio
::DB
::SeqFeature
::Store
; # isa Bio::SeqFeature::CollectionI
223 use Bio
::Seq
::PrimaryQual
; # isa Bio::Seq::QualI
225 use Scalar
::Util
qw(weaken);
227 use base
qw(Bio::Root::Root Bio::Align::AlignI);
229 =head1 Object creator
234 Usage : my $contig = Bio::Assembly::Contig->new();
235 Function : Creates a new contig object
236 Returns : Bio::Assembly::Contig
237 Args : -id => unique contig ID
238 -source => string for the sequence assembly program used
239 -collection => Bio::SeqFeature::CollectionI instance
246 my ($class, @args) = @_;
248 my $self = $class->SUPER::new
(@args);
250 my ($src, $id, $collection) = $self->_rearrange([qw(SOURCE ID COLLECTION)], @args);
251 $src && $self->source($src);
252 ($id && $self->id($id)) || ($self->{'_id'} = 'NoName'); # Alignment (contig) name
253 ($id && $self->id($id)) || ($self->{'_source'} = 'Unknown'); # Program used to build the contig
254 # we need to set up internal hashes first!
256 # Bio::SimpleAlign derived fields (check which ones are needed for AlignI compatibility)
257 $self->{'_elem'} = {}; # contig elements: aligned sequence objects (keyed by ID)
258 $self->{'_order'} = {}; # store sequence order
259 # $self->{'start_end_lists'} = {}; # References to entries in {'_seq'}. Keyed by seq ids.
260 # $self->{'_dis_name'} = {}; # Display names for each sequence
261 $self->{'_symbols'} = {}; # List of symbols
263 #Contig specific slots
264 $self->{'_consensus_sequence'} = undef;
265 $self->{'_consensus_quality'} = undef;
266 $self->{'_nof_residues'} = 0;
267 $self->{'_nof_seqs'} = 0;
268 # $self->{'_nof_segments'} = 0; # Let's not make it heavier than needed by now...
270 # for cases where SF::Collection is shared between Bio::Assembly::Contig
272 $self->throw("Collection must implement Bio::SeqFeature::CollectionI") unless $collection->isa('Bio::SeqFeature::CollectionI');
273 $self->{'_sfc'} = $collection;
275 $self->{'_sfc'} = Bio
::DB
::SeqFeature
::Store
->new(
276 -adaptor
=> 'memory',
277 -index_subfeatures
=> 1,
282 $self->{'_assembly'} = undef; # Bio::Assembly::Scaffold the contig belongs to
283 $self->{'_strand'} = 0; # Reverse (-1) or forward (1), if contig is in a scaffold. 0 otherwise
284 $self->{'_neighbor_start'} = undef; # Neighbor Bio::Assembly::Contig
285 $self->{'_neighbor_end'} = undef; # Neighbor Bio::Assembly::Contig
287 return $self; # success - we hope!
290 =head1 Assembly related methods
292 These methods exist to enable adding information about possible
293 relations among contigs, e.g. when you already have a scaffold for
294 your assembly, describing the ordering of contigs in the final
295 assembly, but no sequences covering the gaps between neighboring
301 Usage : $contig->source($program);
302 Function : Get/Set program used to build this contig
304 Argument : [optional] string
312 $self->{'_source'} = $source if (defined $source);
313 return $self->{'_source'};
319 Usage : $contig->assembly($assembly);
320 Function : Get/Set assembly object for this contig
321 Returns : a Bio::Assembly::Scaffold object
322 Argument : a Bio::Assembly::Scaffold object
328 my $assembly = shift;
330 $self->throw("Using non Bio::Assembly::Scaffold object when assign contig to assembly")
331 if (defined $assembly && ! $assembly->isa("Bio::Assembly::Scaffold"));
332 # We create a circular reference to a Scaffold object. It is made weak
333 # to prevent memory leaks.
334 $self->{'_assembly'} = $assembly if (defined $assembly);
335 weaken
($self->{'_assembly'});
337 return $self->{'_assembly'};
343 Usage : $contig->strand($num);
344 Function : Get/Set contig orientation in a scaffold/assembly.
345 Its equivalent to the strand property of sequence
346 objects and sets whether the contig consensus should
347 be reversed and complemented before being added to a
348 scaffold or assembly.
350 Argument : 1 if orientaion is forward, -1 if reverse and
360 $self->throw("Contig strand must be either 1, -1 or 0")
361 unless $ori == 1 || $ori == 0 || $ori == -1;
362 $self->{'_strand'} = $ori;
365 return $self->{'_strand'};
368 =head2 upstream_neighbor
370 Title : upstream_neighbor
371 Usage : $contig->upstream_neighbor($contig);
372 Function : Get/Set a contig neighbor for the current contig when
373 building a scaffold. The upstream neighbor is
374 located before $contig first base
376 Argument : Bio::Assembly::Contig
380 sub upstream_neighbor
{
384 $self->throw("Trying to assign a non Bio::Assembly::Contig object to upstream contig")
385 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
387 $self->{'_neighbor_start'} = $ref if (defined $ref);
388 return $self->{'_neighbor_start'};
391 =head2 downstream_neighbor
393 Title : downstream_neighbor
394 Usage : $contig->downstream_neighbor($num);
395 Function : Get/Set a contig neighbor for the current contig when
396 building a scaffold. The downstream neighbor is
397 located after $contig last base
399 Argument : Bio::Assembly::Contig
403 sub downstream_neighbor
{
407 $self->throw("Trying to assign a non Bio::Assembly::Contig object to downstream contig")
408 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
409 $self->{'_neighbor_end'} = $ref if (defined $ref);
410 return $self->{'_neighbor_end'};
413 =head1 Contig feature collection methods
418 Usage : $contig->add_features($feat,$flag)
421 Add an array of features to the contig feature
422 collection. The consensus sequence may be attached to the
423 added feature, if $flag is set to 1. If $flag is 0 and
424 the feature attached to one of the contig aligned
425 sequences, the feature is registered as an aligned
426 sequence feature. If $flag is 0 and the feature is not
427 attched to any sequence in the contig, the feature is
428 simply added to the feature collection and no attachment
429 or registration is made.
431 Note: You must attach aligned sequences to their features
432 prior to calling add_features, otherwise you won't be
433 able to access the feature through get_seq_feat_by_tag()
436 Returns : number of features added.
438 $feat : A reference to an array of Bio::SeqFeatureI
439 $flag : boolean - true if consensus sequence object
440 should be attached to this feature, false if
441 no consensus attachment should be made.
447 my ($self, $args, $flag) = @_;
449 # Adding shortcuts for aligned sequence features
450 $flag = 0 unless (defined $flag);
451 if ($flag && defined $self->{'_consensus_sequence'}) {
452 foreach my $feat (@
$args) {
453 next if (defined $feat->seq);
454 $feat->attach_seq($self->{'_consensus_sequence'});
456 } elsif (!$flag) { # Register aligned sequence features
457 foreach my $feat (@
$args) {
458 if (my $seq = $feat->entire_seq()) {
459 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
460 $self->warn("Adding contig feature attached to unknown sequence $seqID!")
461 unless (exists $self->{'_elem'}{$seqID});
462 my $tag = $feat->primary_tag;
463 $self->{'_elem'}{$seqID}{'_feat'}{$tag} = $feat;
468 # Add feature to feature collection
469 my $nof_added = $self->get_features_collection->add_features($args);
474 =head2 remove_features
476 Title : remove_features
477 Usage : $contig->remove_features(@feat)
478 Function : Remove an array of contig features
479 Returns : true if successful
480 Argument : An array of Bio::SeqFeature::Generic (Bio::SeqFeatureI)
484 sub remove_features
{
485 my ($self, @args) = @_;
487 # Removing shortcuts for aligned sequence features
488 for my $feat (@args) {
489 if (my $seq = $feat->entire_seq()) {
490 my $seqID = $seq->id || $seq->display_id || $seq->primary_id;
491 my $tag = $feat->primary_tag;
492 $tag =~ s/:$seqID$/$1/g;
493 delete( $self->{'_elem'}{$seqID}{'_feat'}{$tag} )
494 if (exists $self->{'_elem'}{$seqID}{'_feat'}{$tag} &&
495 $self->{'_elem'}{$seqID}{'_feat'}{$tag} eq $feat);
499 # Removing Bio::SeqFeature objects
500 return $self->get_features_collection->delete(@args);
503 =head2 get_features_collection
505 Title : get_features_collection
506 Usage : $contig->get_features_collection()
507 Function : Get the collection of all contig features and seqfeatures
508 Returns : Bio::DB::SeqFeature::Store (Bio::SeqFeature::CollectionI)
513 sub get_features_collection
{
515 return $self->{'_sfc'};
518 =head2 remove_features_collection
520 Title : remove_features_collection
521 Usage : $contig->remove_features_collection()
522 Function : Remove the collection of all contig features. It is useful
523 to save some memory (when contig features are not needed).
529 sub remove_features_collection
{
531 # Removing shortcuts for aligned sequence features
532 for my $seqID (keys %{$self->{'_elem'}}) {
533 delete $self->{'_elem'}{$seqID};
535 # Removing Bio::SeqFeature::Collection features
536 $self->{'_sfc'} = {};
540 =head1 Coordinate system's related methods
542 See L<Coordinate_Systems> above.
547 Usage : $contig->change_coord($in,$out,$query)
550 Change coordinate system for $query. This method
551 transforms locations between coordinate systems described
552 in section "Coordinate Systems" of this document.
554 Note: this method will throw an exception when changing
555 coordinates between "ungapped consensus" and other
556 systems if consensus sequence was not set. It will also
557 throw exceptions when changing coordinates among aligned
558 sequence, either with or without gaps, and other systems
559 if sequence locations were not set with set_seq_coord().
563 $in : [string] input coordinate system
564 $out : [string] output coordinate system
565 $query : [integer] a position in a sequence
572 my $type_out = shift;
576 # Loading read objects (these calls will throw exceptions whether $read_in or
577 # $read_out is not found
578 my ($read_in,$read_out) = (undef,undef);
579 my $in_ID = ( split(' ',$type_in) )[1];
580 my $out_ID = ( split(' ',$type_out) )[1];
582 if ($in_ID ne 'consensus') {
583 $read_in = $self->get_seq_coord( $self->get_seq_by_name($in_ID) );
584 $self->throw("Can't change coordinates without sequence location for $in_ID")
585 unless (defined $read_in);
587 if ($out_ID ne 'consensus') {
588 $read_out = $self->get_seq_coord( $self->get_seq_by_name($out_ID) );
589 $self->throw("Can't change coordinates without sequence location for $out_ID")
590 unless (defined $read_out);
593 # Performing transformation between coordinates
596 # Transformations between contig padded and contig unpadded
597 (($type_in eq 'gapped consensus') && ($type_out eq 'ungapped consensus')) && do {
598 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
599 unless (defined $self->{'_consensus_sequence'});
600 $query = &_padded_unpadded
($self->{'_consensus_gaps'}, $query);
603 (($type_in eq 'ungapped consensus') && ($type_out eq 'gapped consensus')) && do {
604 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
605 unless (defined $self->{'_consensus_sequence'});
606 $query = &_unpadded_padded
($self->{'_consensus_gaps'},$query);
610 # Transformations between contig (padded) and read (padded)
611 (($type_in eq 'gapped consensus') &&
612 ($type_out =~ /^aligned /) && defined($read_out)) && do {
613 $query = $query - $read_out->start() + 1;
616 (($type_in =~ /^aligned /) && defined($read_in) &&
617 ($type_out eq 'gapped consensus')) && do {
618 $query = $query + $read_in->start() - 1;
622 # Transformations between contig (unpadded) and read (padded)
623 (($type_in eq 'ungapped consensus') &&
624 ($type_out =~ /^aligned /) && defined($read_out)) && do {
625 $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
626 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
629 (($type_in =~ /^aligned /) && defined($read_in) &&
630 ($type_out eq 'ungapped consensus')) && do {
631 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
632 $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
636 # Transformations between seq $read_in padded and seq $read_out padded
637 (defined($read_in) && ($type_in =~ /^aligned /) &&
638 defined($read_out) && ($type_out =~ /^aligned /)) && do {
639 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
640 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
644 # Transformations between seq $read_in padded and seq $read_out unpadded
645 (defined($read_in) && ($type_in =~ /^aligned /) &&
646 defined($read_out) && ($type_out =~ /^unaligned /)) && do {
647 if ($read_in ne $read_out) {
648 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
649 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
651 my $list_out = $self->{'_elem'}{$out_ID}{'_gaps'};
652 $query = &_padded_unpadded
($list_out,$query);
653 # Changing read orientation if read was reverse complemented when aligned
654 if ($read_out->strand == -1) {
655 my ($length) = $read_out->length();
656 $length = $length - &_nof_gaps
($list_out,$length);
657 $query = $length - $query + 1;
661 (defined($read_in) && ($type_in =~ /^unaligned /) &&
662 defined($read_out) && ($type_out =~ /^aligned /)) && do {
663 my $list_in = $self->{'_elem'}{$in_ID}{'_gaps'};
664 # Changing read orientation if read was reverse complemented when aligned
665 if ($read_in->strand == -1) {
666 my ($length) = $read_in->length();
667 $length = $length - &_nof_gaps
($list_in,$length);
668 $query = $length - $query + 1;
670 $query = &_unpadded_padded
($list_in,$query);
671 if ($read_in ne $read_out) {
672 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
673 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
678 # Transformations between seq $read_in unpadded and seq $read_out unpadded
679 (defined($read_in) && ($type_in =~ /^unaligned /) &&
680 defined($read_out) && ($type_out =~ /^unaligned /)) && do {
681 $query = $self->change_coord("unaligned $in_ID","aligned $out_ID",$query);
682 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
686 # Transformations between contig (padded) and read (unpadded)
687 (($type_in eq 'gapped consensus') &&
688 ($type_out =~ /^unaligned /) && defined($read_out)) && do {
689 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
690 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
693 (($type_in =~ /^unaligned /) && defined($read_in) &&
694 ($type_out eq 'gapped consensus')) && do {
695 $query = $self->change_coord("unaligned $in_ID","aligned $in_ID",$query);
696 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
700 # Transformations between contig (unpadded) and read (unpadded)
701 (($type_in eq 'ungapped consensus') &&
702 ($type_out =~ /^unaligned /) && defined($read_out)) && do {
703 $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
704 $query = $self->change_coord('gapped consensus',"unaligned $out_ID",$query);
707 (($type_in =~ /^unaligned /) && defined($read_in) &&
708 ($type_out eq 'ungapped consensus')) && do {
709 $query = $self->change_coord("unaligned $in_ID",'gapped consensus',$query);
710 $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
714 $self->throw("Unknow coordinate system. Args: $type_in, $type_out.");
715 $query = undef; # If a coordinate systems just requested is unknown
723 Title : get_seq_coord
724 Usage : $contig->get_seq_coord($seq);
725 Function : Get "gapped consensus" location for aligned sequence
726 Returns : Bio::SeqFeature::Generic for coordinates or undef.
727 A warning is printed if sequence coordinates were not set.
728 Argument : Bio::LocatableSeq object
733 my ($self,$seq) = @_;
735 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
736 $self->throw("$seq is not a Bio::LocatableSeq");
738 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
740 unless (exists( $self->{'_elem'}{$seqID} )) {
741 $self->warn("No such sequence ($seqID) in contig ".$self->id);
744 unless (exists( $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} )) {
745 # $self->warn("Chad. Location not set for sequence ($seqID) in contig ".$self->id);
749 return $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"};
754 Title : set_seq_coord
755 Usage : $contig->set_seq_coord($feat,$seq);
758 Set "gapped consensus" location for an aligned
759 sequence. If the sequence was previously added using
760 add_seq, its coordinates are changed/set. Otherwise,
761 add_seq is called and the sequence is added to the
764 Returns : Bio::SeqFeature::Generic for old coordinates or undef.
766 $feat : a Bio::SeqFeature::Generic object
767 representing a location for the
768 aligned sequence, in "gapped
769 consensus" coordinates.
771 Note: the original feature primary tag will
774 $seq : a Bio::LocatableSeq object
779 my ($self,$feat,$seq) = @_;
781 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
782 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
785 # Complaining about inadequate feature object
786 $self->throw("Coordinates must be a Bio::SeqFeature::Generic object!")
787 unless ( $feat->isa("Bio::SeqFeature::Generic") );
788 $self->throw("Sequence coordinates must have an end!")
789 unless (defined $feat->end);
790 $self->throw("Sequence coordinates must have a start!")
791 unless (defined $feat->start);
793 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
794 if ( exists( $self->{'_elem'}{$seqID} ) &&
795 exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
796 defined( $self->{'_elem'}{$seqID}{'_seq'} ) &&
797 ($seq ne $self->{'_elem'}{$seqID}{'_seq'}) ) {
798 $self->warn("Replacing sequence $seqID\n");
799 $self->remove_seq($self->{'_elem'}{$seqID}{'_seq'});
800 $self->remove_features($feat);
803 # Add new sequence and Bio::Generic::SeqFeature
804 $self->add_seq($seq);
806 $feat->add_tag_value('contig',$self->id) unless ( $feat->has_tag('contig') );
807 $feat->primary_tag("_aligned_coord");
808 $feat->source_tag($seqID);
809 $feat->attach_seq($seq);
811 $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} = $feat;
812 $self->add_features([ $feat ]);
815 =head1 Bio::Assembly::Contig consensus methods
817 =head2 set_consensus_sequence
819 Title : set_consensus_sequence
820 Usage : $contig->set_consensus_sequence($seq)
821 Function : Set the consensus sequence object for this contig
822 Returns : consensus length
823 Argument : Bio::LocatableSeq
827 sub set_consensus_sequence
{
831 $self->throw("Consensus sequence must be a Bio::LocatableSeq!")
832 unless ($seq->isa("Bio::LocatableSeq"));
834 $self->{'_consensus_gaps'} = []; # Consensus Gap registry
835 $self->_register_gaps( $seq->seq, $self->{'_consensus_gaps'} );
836 $self->{'_consensus_sequence'} = $seq;
839 $seq->end($seq->_ungapped_len);
841 my $con_len = $seq->length;
846 =head2 set_consensus_quality
848 Title : set_consensus_quality
849 Usage : $contig->set_consensus_quality($qual)
850 Function : Set the quality object for consensus sequence
852 Argument : Bio::Seq::QualI object
856 sub set_consensus_quality
{
857 my ($self, $qual) = @_;
859 $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
860 unless ( $qual->isa("Bio::Seq::QualI") );
862 $self->throw("Consensus quality can't be added before you set the consensus sequence!")
863 unless (defined $self->{'_consensus_sequence'});
865 $self->{'_consensus_quality'} = $qual;
868 =head2 get_consensus_length
870 Title : get_consensus_length
871 Usage : $contig->get_consensus_length()
872 Function : Get consensus sequence length
878 sub get_consensus_length
{
881 return $self->{'_consensus_sequence'}->length();
884 =head2 get_consensus_sequence
886 Title : get_consensus_sequence
887 Usage : $contig->get_consensus_sequence()
888 Function : Get a reference to the consensus sequence object
890 Returns : Bio::SeqI object
895 sub get_consensus_sequence
{
896 my ($self, @args) = @_;
898 return $self->{'_consensus_sequence'};
901 =head2 get_consensus_quality
903 Title : get_consensus_quality
904 Usage : $contig->get_consensus_quality()
905 Function : Get a reference to the consensus quality object
907 Returns : A Bio::Seq::QualI object
912 sub get_consensus_quality
{
913 my ($self, @args) = @_;
915 return $self->{'_consensus_quality'};
918 =head1 Bio::Assembly::Contig aligned sequences methods
923 Usage : $contig->set_seq_qual($seq,$qual);
924 Function : Adds quality to an aligned sequence.
926 Argument : a Bio::LocatableSeq object and
927 a Bio::Seq::QualI object
929 See L<Bio::LocatableSeq> for more information.
934 my ($self,$seq,$qual) = @_;
936 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
937 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
939 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
941 $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
942 unless ( $qual->isa("Bio::Seq::QualI") );
943 $self->throw("Use add_seq first: aligned sequence qualities can't be added before you load the sequence!")
944 unless (exists $self->{'_elem'}{$seqID}{'_seq'});
945 $self->throw("Use set_seq_coord first: aligned sequence qualities can't be added before you add coordinates for the sequence!") unless (defined( $self->get_seq_coord($seq) ));
947 # Adding gaps to quality object
948 my $sequence = $self->{'_elem'}{$seqID}{'_seq'}->seq();
949 my $tmp = $qual->qual();
950 @
{$tmp} = reverse(@
{$tmp}) if ($self->get_seq_coord($seq)->strand() == -1);
954 my $i = 0; my $j = 0;
955 while ($i <= $#{$tmp}) {
956 # IF base is a gap, quality is the average for neighbouring sites
957 if ($j > $i && substr($sequence,$j,1) eq '-') {
958 $previous = $tmp->[$i-1] unless ($i == 0);
960 $next = $tmp->[$i+1];
964 push(@quality,int( ($previous+$next)/2 ));
966 push(@quality,$tmp->[$i]);
972 $self->{'_elem'}{$seqID}{'_qual'} = Bio
::Seq
::PrimaryQual
->new(
973 -qual
=>join(" ",@quality), -id
=>$seqID );
979 Usage : $contig->get_seq_ids( -start => $start,
981 -type => "gapped A0QR67B08.b" );
982 Function : Get list of sequence IDs overlapping interval [$start, $end]
983 The default interval is [1,$contig->length]
984 Default coordinate system is "gapped contig"
986 Argument : A hash with optional elements:
987 -start : consensus subsequence start
988 -end : consensus subsequence end
989 -type : the coordinate system type for $start and $end arguments
990 Coordinate system available are:
991 "gapped consensus" : consensus coordinates with gaps
992 "ungapped consensus" : consensus coordinates without gaps
993 "aligned $ReadID" : read $ReadID coordinates with gaps
994 "unaligned $ReadID" : read $ReadID coordinates without gaps
1000 my ($self, @args) = @_;
1002 my ($type, $start, $end) = $self->_rearrange([qw(TYPE START END)], @args);
1005 if (defined($start) && defined($end)) {
1006 if (defined($type) && ($type ne 'gapped consensus')) {
1007 $start = $self->change_coord($type,'gapped consensus',$start);
1008 $end = $self->change_coord($type,'gapped consensus',$end);
1010 @list = $self->get_features_collection->features(
1011 -type
=> '_aligned_coord', # primary tag
1015 #-strandmatch => 'ignore',
1017 @list = map { $_->entire_seq->id } @list;
1019 # Entire aligned sequences list
1020 @list = map { $self->{'_order'}{$_} } sort { $a cmp $b } keys %{ $self->{'_order'} };
1026 =head2 get_seq_feat_by_tag
1028 Title : get_seq_feat_by_tag
1029 Usage : $seq = $contig->get_seq_feat_by_tag($seq,"_aligned_coord:$seqID")
1030 Function : Get a sequence feature based on its primary_tag.
1031 Returns : a Bio::SeqFeature object
1032 Argument : a Bio::LocatableSeq and a string (feature primary tag)
1036 sub get_seq_feat_by_tag
{
1037 my ($self,$seq,$tag) = @_;
1039 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
1040 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
1042 my $seqID = $seq->id || $seq->display_id || $seq->primary_id;
1044 return $self->{'_elem'}{$seqID}{'_feat'}{$tag};
1047 =head2 get_seq_by_name
1049 Title : get_seq_by_name
1050 Usage : $seq = $contig->get_seq_by_name('Seq1')
1051 Function : Gets a sequence based on its id.
1052 Returns : a Bio::LocatableSeq object
1053 undef if name is not found
1058 sub get_seq_by_name
{
1062 unless (exists $self->{'_elem'}{$seqID}{'_seq'}) {
1063 $self->throw("Could not find sequence $seqID in contig ".$self->id);
1067 return $self->{'_elem'}{$seqID}{'_seq'};
1070 =head2 get_qual_by_name
1072 Title : get_qual_by_name
1073 Usage : $seq = $contig->get_qual_by_name('Seq1')
1076 Gets Bio::Seq::QualI object for a sequence
1077 through its id ( as given by $qual->id() ).
1079 Returns : a Bio::Seq::QualI object.
1080 undef if name is not found
1085 sub get_qual_by_name
{
1089 unless (exists $self->{'_elem'}{$seqID}{'_qual'}) {
1090 $self->warn("Could not find quality for $seqID in contig!");
1094 return $self->{'_elem'}{$seqID}{'_qual'};
1097 =head1 Bio::Align::AlignI compatible methods
1099 =head2 Modifier methods
1101 These methods modify the MSE by adding, removing or shuffling complete
1107 Usage : $contig->add_seq($newseq);
1110 Adds a sequence to the contig. *Does*
1111 *not* align it - just adds it to the
1115 Argument : a Bio::LocatableSeq object
1117 See L<Bio::LocatableSeq> for more information.
1125 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
1126 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
1129 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
1130 $self->{'_elem'}{$seqID} = {} unless (exists $self->{'_elem'}{$seqID});
1132 if (exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
1133 ($seq eq $self->{'_elem'}{$seqID}{'_seq'}) ) {
1134 $self->warn("Adding sequence $seqID, which has already been added");
1137 # Our locatable sequences are always considered to be complete sequences
1139 $seq->end($seq->_ungapped_len);
1141 my $alphabet = $seq->alphabet;
1143 $alphabet = lc($alphabet) if defined $alphabet;
1145 $self->warn("Adding non-nucleotidic sequence ".$seqID)
1146 if (!$alphabet || ($alphabet ne 'dna' && $alphabet ne 'rna'));
1148 # build the symbol list for this sequence,
1149 # will prune out the gap and missing/match chars
1150 # when actually asked for the symbol list in the
1152 if (defined $seq->seq) {
1153 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1155 $self->{'_symbols'} = {};
1158 my $seq_no = ++$self->{'_nof_seqs'};
1160 if (ref( $self->{'_elem'}{$seqID}{'_seq'} )) {
1161 $self->warn("Replacing one sequence [$seqID]\n");
1163 #print STDERR "Assigning $seqID to $order\n";
1164 $self->{'_order'}->{$seq_no} = $seqID;
1165 # $self->{'_start_end_lists'}->{$id} = []
1166 # unless(exists $self->{'_start_end_lists'}->{$id});
1167 # push @{$self->{'_start_end_lists'}->{$id}}, $seq;
1170 $self->{'_elem'}{$seqID}{'_seq'} = $seq;
1171 $self->{'_elem'}{$seqID}{'_feat'} = {};
1172 $self->{'_elem'}{$seqID}{'_gaps'} = [];
1173 my $dbref = $self->{'_elem'}{$seqID}{'_gaps'};
1174 my $nofgaps = $self->_register_gaps($seq->seq,$dbref);
1176 # Updating residue count
1177 $self->{'_nof_residues'} += $seq->length - $nofgaps;
1185 Usage : $contig->remove_seq($seq);
1186 Function : Removes a single sequence from a contig
1187 Returns : 1 on success, 0 otherwise
1188 Argument : a Bio::LocatableSeq object
1193 my ($self,$seq) = @_;
1195 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
1196 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
1199 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
1200 unless (exists $self->{'_elem'}{$seqID} ) {
1201 $self->warn("No sequence named $seqID [$seq]");
1205 # Updating residue count
1206 $self->{'_nof_residues'} -= $seq->length() +
1207 &_nof_gaps
( $self->{'_elem'}{$seqID}{'_gaps'}, $seq->length );
1209 # Update number of sequences
1210 $self->{'_nof_seqs'}--;
1212 # Update order of sequences (order starts at 1)
1213 my $max_order = $self->{'_nof_seqs'} + 1;
1214 my $target_order = $max_order + 1;
1215 for (my $order = 1 ; $order <= $max_order ; $order++) {
1216 if ($self->{'_order'}->{$order} eq $seqID) {
1217 # Found the wanted sequence order
1218 $target_order = $order;
1220 if ($order > $target_order) {
1221 # Decrement this sequence order by one order
1222 $self->{'_order'}->{$order-1} = $self->{'_order'}->{$order};
1224 if ($order == $max_order) {
1226 delete $self->{'_order'}->{$order};
1230 # Remove all references to features of this sequence
1232 for my $tag (keys %{ $self->{'_elem'}{$seqID}{'_feat'} }) {
1233 push(@feats, $self->{'_elem'}{$seqID}{'_feat'}{$tag});
1235 $self->{'_sfc'}->remove_features(\
@feats);
1236 delete $self->{'_elem'}{$seqID};
1244 Usage : $contig->purge(0.7);
1247 Removes sequences above whatever %id.
1249 This function will grind on large alignments. Beware!
1250 (perhaps not ideally implemented)
1253 Returns : An array of the removed sequences
1261 $self->throw_not_implemented();
1264 =head2 sort_alphabetically
1266 Title : sort_alphabetically
1267 Usage : $contig->sort_alphabetically
1270 Changes the order of the alignemnt to alphabetical on name
1271 followed by numerical by number.
1278 sub sort_alphabetically
{
1280 $self->throw_not_implemented();
1283 =head2 Sequence selection methods
1285 Methods returning one or more sequences objects.
1290 Usage : foreach $seq ( $contig->each_seq() )
1291 Function : Gets an array of Seq objects from the alignment
1302 foreach $seqID ( map { $self->{'_order'}{$_} } sort { $a <=> $b } keys %{$self->{'_order'}} ) {
1303 push(@arr,$self->{'_elem'}{$seqID}{'_seq'});
1309 =head2 each_alphabetically
1311 Title : each_alphabetically
1312 Usage : foreach $seq ( $contig->each_alphabetically() )
1315 Returns an array of sequence object sorted alphabetically
1316 by name and then by start point.
1317 Does not change the order of the alignment
1324 sub each_alphabetically
{
1326 $self->throw_not_implemented();
1329 =head2 each_seq_with_id
1331 Title : each_seq_with_id
1332 Usage : foreach $seq ( $contig->each_seq_with_id() )
1335 Gets an array of Seq objects from the
1336 alignment, the contents being those sequences
1337 with the given name (there may be more than one)
1340 Argument : a seq name
1344 sub each_seq_with_id
{
1346 $self->throw_not_implemented();
1349 =head2 get_seq_by_pos
1351 Title : get_seq_by_pos
1352 Usage : $seq = $contig->get_seq_by_pos(3)
1355 Gets a sequence based on its position in the alignment.
1356 Numbering starts from 1. Sequence positions larger than
1357 num_sequences() will thow an error.
1359 Returns : a Bio::LocatableSeq object
1360 Argument : positive integer for the sequence osition
1364 sub get_seq_by_pos
{
1368 $self->throw("Sequence position has to be a positive integer, not [$pos]")
1369 unless $pos =~ /^\d+$/ and $pos > 0;
1370 $self->throw("No sequence at position [$pos]")
1371 unless $pos <= $self->num_sequences ;
1373 my $seqID = $self->{'_order'}->{--$pos};
1374 return $self->{'_elem'}{$seqID}{'_seq'};
1377 =head2 Create new alignments
1379 The result of these methods are horizontal or vertical subsets of the
1385 Usage : $contig2 = $contig->select(1, 3) # three first sequences
1388 Creates a new alignment from a continuous subset of
1389 sequences. Numbering starts from 1. Sequence positions
1390 larger than num_sequences() will thow an error.
1392 Returns : a Bio::Assembly::Contig object
1393 Argument : positive integer for the first sequence
1394 positive integer for the last sequence to include (optional)
1400 $self->throw_not_implemented();
1404 =head2 select_noncont
1406 Title : select_noncont
1407 Usage : $contig2 = $contig->select_noncont(1, 3) # first and 3rd sequences
1410 Creates a new alignment from a subset of
1411 sequences. Numbering starts from 1. Sequence positions
1412 larger than num_sequences() will throw an error.
1414 Returns : a Bio::Assembly::Contig object
1415 Args : array of integers for the sequences
1419 sub select_noncont
{
1421 $self->throw_not_implemented();
1427 Usage : $contig2 = $contig->slice(20, 30)
1430 Creates a slice from the alignment inclusive of start and
1431 end columns. Sequences with no residues in the slice are
1432 excluded from the new alignment and a warning is printed.
1433 Slice beyond the length of the sequence does not do
1436 Returns : a Bio::Assembly::Contig object
1437 Argument : positive integer for start column
1438 positive integer for end column
1444 $self->throw_not_implemented();
1447 =head2 Change sequences within the MSE
1449 These methods affect characters in all sequences without changeing the
1456 Usage : $contig->map_chars('\.','-')
1459 Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1462 Notice that the from (arg1) is interpretted as a regex,
1463 so be careful about quoting meta characters (eg
1464 $contig->map_chars('.','-') wont do what you want)
1467 Argument : 'from' rexexp
1474 $self->throw_not_implemented();
1480 Usage : $contig->uppercase()
1481 Function : Sets all the sequences to uppercase
1489 $self->throw_not_implemented();
1494 Title : match_line()
1495 Usage : $contig->match_line()
1496 Function : Generates a match line - much like consensus string
1497 except that a line indicating the '*' for a match.
1498 Argument : (optional) Match line characters ('*' by default)
1499 (optional) Strong match char (':' by default)
1500 (optional) Weak match char ('.' by default)
1506 $self->throw_not_implemented();
1512 Usage : $contig->match()
1515 Goes through all columns and changes residues that are
1516 identical to residue in first sequence to match '.'
1517 character. Sets match_char.
1519 USE WITH CARE: Most MSE formats do not support match
1520 characters in sequences, so this is mostly for output
1521 only. NEXUS format (Bio::AlignIO::nexus) can handle
1525 Argument : a match character, optional, defaults to '.'
1531 $self->throw_not_implemented();
1537 Usage : $contig->unmatch()
1540 Undoes the effect of method match. Unsets match_char.
1543 Argument : a match character, optional, defaults to '.'
1549 $self->throw_not_implemented();
1553 =head2 MSE attibutes
1555 Methods for setting and reading the MSE attributes.
1557 Note that the methods defining character semantics depend on the user
1558 to set them sensibly. They are needed only by certain input/output
1559 methods. Unset them by setting to an empty string ('').
1564 Usage : $contig->id("Ig")
1565 Function : Gets/sets the id field of the alignment
1566 Returns : An id string
1567 Argument : An id string (optional)
1572 my ($self, $contig_name) = @_;
1574 if (defined( $contig_name )) {
1575 $self->{'_id'} = $contig_name;
1578 return $self->{'_id'};
1583 Title : missing_char
1584 Usage : $contig->missing_char("?")
1585 Function : Gets/sets the missing_char attribute of the alignment
1586 It is generally recommended to set it to 'n' or 'N'
1587 for nucleotides and to 'X' for protein.
1588 Returns : An missing_char string,
1589 Argument : An missing_char string (optional)
1595 $self->throw_not_implemented();
1601 Usage : $contig->match_char('.')
1602 Function : Gets/sets the match_char attribute of the alignment
1603 Returns : An match_char string,
1604 Argument : An match_char string (optional)
1610 $self->throw_not_implemented();
1616 Usage : $contig->gap_char('-')
1617 Function : Gets/sets the gap_char attribute of the alignment
1618 Returns : An gap_char string, defaults to '-'
1619 Argument : An gap_char string (optional)
1625 $self->throw_not_implemented();
1630 Title : symbol_chars
1631 Usage : my @symbolchars = $contig->symbol_chars;
1632 Function: Returns all the seen symbols (other than gaps)
1633 Returns : array of characters that are the seen symbols
1634 Argument: boolean to include the gap/missing/match characters
1640 $self->throw_not_implemented();
1643 =head2 Alignment descriptors
1645 These read only methods describe the MSE in various ways.
1648 =head2 consensus_string
1650 Title : consensus_string
1651 Usage : $str = $contig->consensus_string($threshold_percent)
1652 Function : Makes a strict consensus
1654 Argument : Optional threshold ranging from 0 to 100.
1655 The consensus residue has to appear at least threshold %
1656 of the sequences at a given location, otherwise a '?'
1657 character will be placed at that location.
1658 (Default value = 0%)
1662 sub consensus_string
{
1664 $self->throw_not_implemented();
1667 =head2 consensus_iupac
1669 Title : consensus_iupac
1670 Usage : $str = $contig->consensus_iupac()
1673 Makes a consensus using IUPAC ambiguity codes from DNA
1674 and RNA. The output is in upper case except when gaps in
1675 a column force output to be in lower case.
1677 Note that if your alignment sequences contain a lot of
1678 IUPAC ambiquity codes you often have to manually set
1679 alphabet. Bio::PrimarySeq::_guess_type thinks they
1680 indicate a protein sequence.
1682 Returns : consensus string
1684 Throws : on protein sequences
1689 sub consensus_iupac
{
1691 $self->throw_not_implemented();
1697 Usage : if( $contig->is_flush() )
1700 Function : Tells you whether the alignment
1701 : is flush, ie all of the same length
1711 $self->throw_not_implemented();
1717 Usage : $len = $contig->length()
1718 Function : Returns the maximum length of the alignment.
1719 To be sure the alignment is a block, use is_flush
1728 $self->throw_not_implemented();
1731 =head2 maxname_length
1733 Title : maxname_length
1734 Usage : $contig->maxname_length()
1737 Gets the maximum length of the displayname in the
1738 alignment. Used in writing out various MSE formats.
1745 sub maxname_length
{
1747 $self->throw_not_implemented();
1752 Title : num_residues
1753 Usage : $no = $contig->num_residues
1754 Function : number of residues in total in the alignment
1757 Note : replaces no_residues
1764 return $self->{'_nof_residues'};
1767 =head2 num_sequences
1769 Title : num_sequences
1770 Usage : $depth = $contig->num_sequences
1771 Function : number of sequence in the sequence alignment
1774 Note : replaces no_sequences
1781 return scalar( keys %{ $self->{'_elem'} } );
1784 =head2 percentage_identity
1786 Title : percentage_identity
1787 Usage : $id = $contig->percentage_identity
1788 Function: The function calculates the percentage identity of the alignment
1789 Returns : The percentage identity of the alignment (as defined by the
1795 sub percentage_identity
{
1797 $self->throw_not_implemented();
1800 =head2 overall_percentage_identity
1802 Title : percentage_identity
1803 Usage : $id = $contig->percentage_identity
1804 Function: The function calculates the percentage identity of
1805 the conserved columns
1806 Returns : The percentage identity of the conserved columns
1811 sub overall_percentage_identity
{
1813 $self->throw_not_implemented();
1817 =head2 average_percentage_identity
1819 Title : average_percentage_identity
1820 Usage : $id = $contig->average_percentage_identity
1821 Function: The function uses a fast method to calculate the average
1822 percentage identity of the alignment
1823 Returns : The average percentage identity of the alignment
1828 sub average_percentage_identity
{
1830 $self->throw_not_implemented();
1833 =head2 Alignment positions
1835 Methods to map a sequence position into an alignment column and back.
1836 column_from_residue_number() does the former. The latter is really a
1837 property of the sequence object and can done using
1838 L<Bio::LocatableSeq::location_from_column>:
1840 # select somehow a sequence from the alignment, e.g.
1841 my $seq = $contig->get_seq_by_pos(1);
1842 #$loc is undef or Bio::LocationI object
1843 my $loc = $seq->location_from_column(5);
1846 =head2 column_from_residue_number
1848 Title : column_from_residue_number
1849 Usage : $col = $contig->column_from_residue_number( $seqname, $resnumber)
1852 This function gives the position in the alignment
1853 (i.e. column number) of the given residue number in the
1854 sequence with the given name. For example, for the
1857 Seq1/91-97 AC..DEF.GH
1858 Seq2/24-30 ACGG.RTY..
1859 Seq3/43-51 AC.DDEFGHI
1861 column_from_residue_number( "Seq1", 94 ) returns 5.
1862 column_from_residue_number( "Seq2", 25 ) returns 2.
1863 column_from_residue_number( "Seq3", 50 ) returns 9.
1865 An exception is thrown if the residue number would lie
1866 outside the length of the aligment
1867 (e.g. column_from_residue_number( "Seq2", 22 )
1869 Note: If the the parent sequence is represented by more than
1870 one alignment sequence and the residue number is present in
1871 them, this method finds only the first one.
1873 Returns : A column number for the position in the alignment of the
1874 given residue in the given sequence (1 = first column)
1875 Args : A sequence id/name (not a name/start-end)
1876 A residue number in the whole sequence (not just that
1877 segment of it in the alignment)
1881 sub column_from_residue_number
{
1883 $self->throw_not_implemented();
1886 =head2 Sequence names
1888 Methods to manipulate the display name. The default name based on the
1889 sequence id and subsequence positions can be overridden in various
1895 Usage : $contig->displayname("Ig", "IgA")
1896 Function : Gets/sets the display name of a sequence in the alignment
1898 Returns : A display name string
1899 Argument : name of the sequence
1900 displayname of the sequence (optional)
1904 sub displayname
{ # Do nothing
1907 =head2 set_displayname_count
1909 Title : set_displayname_count
1910 Usage : $contig->set_displayname_count
1913 Sets the names to be name_# where # is the number of
1914 times this name has been used.
1921 sub set_displayname_count
{
1923 $self->throw_not_implemented();
1926 =head2 set_displayname_flat
1928 Title : set_displayname_flat
1929 Usage : $contig->set_displayname_flat()
1930 Function : Makes all the sequences be displayed as just their name,
1937 sub set_displayname_flat
{ # Do nothing!
1940 =head2 set_displayname_normal
1942 Title : set_displayname_normal
1943 Usage : $contig->set_displayname_normal()
1944 Function : Makes all the sequences be displayed as name/start-end
1950 sub set_displayname_normal
{ # Do nothing!
1953 =head1 Internal Methods
1955 =head2 _binary_search
1957 Title : _binary_search
1958 Usage : _binary_search($list,$query)
1961 Find a number in a sorted list of numbers. Return values
1962 may be on or two integers. One positive integer or zero
1963 (>=0) is the index of the element that stores the queried
1964 value. Two positive integers (or zero and another
1965 number) are the indexes of elements among which the
1966 queried value should be placed. Negative single values
1969 -1: $query is smaller than smallest element in list
1970 -2: $query is greater than greatest element in list
1972 Returns : array of integers
1974 $list : array reference
1979 sub _binary_search
{
1983 # If there is only one element in list
1984 if (!$#{$list} && ($query == $list->[0])) { return (0) }
1985 # If there are others...
1987 my $end = $#{$list};
1988 (&_compare
($query,$list->[$start]) == 0) && do { return ($start) };
1989 (&_compare
($query,$list->[$end]) == 0) && do { return ($end) };
1990 (&_compare
($query,$list->[$start]) < 0) && do { return (-1) };
1991 (&_compare
($query,$list->[$end]) > 0) && do { return (-2) };
1993 while ($end - $start > 1) {
1994 $middle = int(($end+$middle)/2);
1995 (&_compare
($query,$list->[$middle]) == 0) && return ($middle);
1996 (&_compare
($query,$list->[$middle]) < 0) && do { $end = $middle ; $middle = 0; next };
1997 $start = $middle; # If &_compare() > 0, move region beggining
1999 return ($start,$end);
2005 Usage : _compare($arg1,$arg2)
2006 Function: Perform numeric or string comparisons
2007 Returns : integer (0, 1 or -1)
2008 Args : values to be compared
2016 if (($arg1 =~ /^\d+$/) && ($arg2 =~ /^\d+$/)) { return $arg1 <=> $arg2 }
2017 else { return $arg1 cmp $arg2 }
2023 Usage : _nof_gaps($array_ref, $query)
2024 Function: number of gaps found before position $query
2027 $array_ref : gap registry reference
2028 $query : [integer] a position in a sequence
2036 # If there are no gaps in this contig
2037 return 0 unless (defined($list) && scalar(@
{$list}));
2038 # Locate query index in gap list (if any)
2039 my @index = &_binary_search
($list,$query);
2040 # If after all alignments, correct using total number of align
2041 if ($index[0] == -2) { $query = scalar(@
{$list}) }
2042 # If before any alignment, return 0
2043 elsif ($index[0] == -1) { $query = 0 }
2044 elsif ($index[0] >= 0) {
2045 # If query is between alignments, translate coordinates
2046 if ($#index > 0) { $query = $index[0] + 1 }
2047 # If query sits upon an alignment, do another correction
2048 elsif ($#index == 0) { $query = $index[0] }
2054 =head2 _padded_unpadded
2056 Title : _padded_unpadded
2057 Usage : _padded_unpadded($array_ref, $query)
2060 Returns a coordinate corresponding to
2061 position $query after gaps were
2062 removed from a sequence.
2066 $array_ref : reference to this gap registry
2067 $query : [integer] coordionate to change
2071 sub _padded_unpadded
{
2075 my $align = &_nof_gaps
($list,$query);
2076 $query-- if (defined($list->[$align]) && ($list->[$align] == $query));
2077 $query = $query - $align;
2082 =head2 _unpadded_padded
2084 Title : _unpadded_padded
2085 Usage : _unpadded_padded($array_ref, $query)
2088 Returns the value corresponding to
2089 ungapped position $query when gaps are
2090 counted as valid sites in a sequence
2093 Args : $array_ref = a reference to this sequence's gap registry
2094 $query = [integer] location to change
2099 sub _unpadded_padded
{
2103 my $align = &_nof_gaps
($list,$query);
2104 $query = $query + $align;
2105 my $new_align = &_nof_gaps
($list,$query);
2106 while ($new_align - $align > 0) {
2107 $query = $query + $new_align - $align;
2108 $align = $new_align;
2109 $new_align = &_nof_gaps
($list,$query);
2111 # If current position is also a align, look for the first upstream base
2112 while (defined($list->[$align]) && ($list->[$align] == $query)) {
2119 =head2 _register_gaps
2121 Title : _register_gaps
2122 Usage : $self->_register_gaps($seq, $array_ref)
2123 Function: stores gap locations for a sequence
2124 Returns : number of gaps found
2126 $seq : sequence string
2127 $array_ref : a reference to an array,
2128 where gap locations will
2133 sub _register_gaps
{
2135 my $sequence = shift;
2138 $self->throw("Not an aligned sequence string to register gaps")
2139 if (ref($sequence));
2141 $self->throw("Not an array reference for gap registry")
2142 unless (ref($dbref) eq 'ARRAY');
2144 # Registering alignments
2145 @
{$dbref} = (); # Cleaning registry
2146 if (defined $sequence) {
2149 $i = index($sequence,"-",$i+1);
2151 push(@
{$dbref},$i+1);
2154 # $self->warn("Found undefined sequence while registering gaps");
2158 return scalar(@
{$dbref});
2161 =head1 Deprecated methods
2168 Usage : $no = $ali->no_residues
2169 Function : number of residues in total in the alignment
2172 Note : deprecated in favor of num_residues()
2178 $self->deprecated(-warn_version
=> 1.0069,
2179 -throw_version
=> 1.0075);
2180 $self->num_residues(@_);
2185 Title : no_sequences
2186 Usage : $depth = $ali->no_sequences
2187 Function : number of sequence in the sequence alignment
2190 Note : deprecated in favor of num_sequences()
2196 $self->deprecated(-warn_version
=> 1.0069,
2197 -throw_version
=> 1.0075);
2198 $self->num_sequences(@_);