Sync with main trunk
[bioperl-live.git] / Bio / Align / AlignI.pm
blob871b9da7bc1c53b9cd3aa1caf61fb2b3d95630d3
1 # $Id$
3 # BioPerl module for Bio::Align::AlignI
5 # Cared for by Jason Stajich <jason@bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Align::AlignI - An interface for describing sequence alignments.
17 =head1 SYNOPSIS
19 # get a Bio::Align::AlignI somehow - typically using Bio::AlignIO system
20 # some descriptors
21 print $aln->length, "\n";
22 print $aln->no_residues, "\n";
23 print $aln->is_flush, "\n";
24 print $aln->no_sequences, "\n";
25 print $aln->percentage_identity, "\n";
26 print $aln->consensus_string(50), "\n";
28 # find the position in the alignment for a sequence location
29 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
31 # extract sequences and check values for the alignment column $pos
32 foreach $seq ($aln->each_seq) {
33 $res = $seq->subseq($pos, $pos);
34 $count{$res}++;
36 foreach $res (keys %count) {
37 printf "Res: %s Count: %2d\n", $res, $count{$res};
40 =head1 DESCRIPTION
42 This interface describes the basis for alignment objects.
44 =head1 FEEDBACK
46 =head2 Mailing Lists
48 User feedback is an integral part of the evolution of this and other
49 Bioperl modules. Send your comments and suggestions preferably to
50 the Bioperl mailing list. Your participation is much appreciated.
52 bioperl-l@bioperl.org - General discussion
53 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55 =head2 Reporting Bugs
57 Report bugs to the Bioperl bug tracking system to help us keep track
58 of the bugs and their resolution. Bug reports can be submitted via the
59 web:
61 http://bugzilla.open-bio.org/
63 =head1 AUTHOR - Jason Stajich
65 Email jason@bioperl.org
67 =head1 CONTRIBUTORS
69 Ewan Birney, birney@ebi.ac.uk
70 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
72 =head1 APPENDIX
74 The rest of the documentation details each of the object methods.
75 Internal methods are usually preceded with a _
77 =cut
80 # Let the code begin...
83 package Bio::Align::AlignI;
84 use strict;
87 use base qw(Bio::Root::RootI);
89 =head1 Modifier methods
91 These methods modify the MSE by adding, removing or shuffling complete
92 sequences.
94 =head2 add_seq
96 Title : add_seq
97 Usage : $myalign->add_seq($newseq);
98 Function : Adds another sequence to the alignment. *Does not* align
99 it - just adds it to the hashes.
100 Returns : None
101 Argument : a Bio::LocatableSeq object
102 order (optional)
104 See L<Bio::LocatableSeq> for more information.
106 =cut
108 sub add_seq {
109 my ($self) = @_;
110 $self->throw_not_implemented();
113 =head2 remove_seq
115 Title : remove_seq
116 Usage : $aln->remove_seq($seq);
117 Function : Removes a single sequence from an alignment
118 Returns :
119 Argument : a Bio::LocatableSeq object
121 =cut
123 sub remove_seq {
124 my ($self) = @_;
125 $self->throw_not_implemented();
128 =head2 purge
130 Title : purge
131 Usage : $aln->purge(0.7);
132 Function:
134 Removes sequences above whatever %id.
136 This function will grind on large alignments. Beware!
137 (perhaps not ideally implemented)
139 Example :
140 Returns : An array of the removed sequences
141 Argument:
144 =cut
146 sub purge {
147 my ($self) = @_;
148 $self->throw_not_implemented();
151 =head2 sort_alphabetically
153 Title : sort_alphabetically
154 Usage : $ali->sort_alphabetically
155 Function :
157 Changes the order of the alignment to alphabetical on name
158 followed by numerical by number.
160 Returns : an array
161 Argument :
163 =cut
165 sub sort_alphabetically {
166 my ($self) = @_;
167 $self->throw_not_implemented();
170 =head1 Sequence selection methods
172 Methods returning one or more sequence objects.
174 =head2 each_seq
176 Title : each_seq
177 Usage : foreach $seq ( $align->each_seq() )
178 Function : Gets an array of Seq objects from the alignment
179 Returns : an array
180 Argument :
182 =cut
184 sub each_seq {
185 my ($self) = @_;
186 $self->throw_not_implemented();
189 =head2 each_alphabetically
191 Title : each_alphabetically
192 Usage : foreach $seq ( $ali->each_alphabetically() )
193 Function :
195 Returns an array of sequence objects sorted alphabetically
196 by name and then by start point.
197 Does not change the order of the alignment
199 Returns :
200 Argument :
202 =cut
204 sub each_alphabetically {
205 my($self) = @_;
206 $self->throw_not_implemented();
209 =head2 each_seq_with_id
211 Title : each_seq_with_id
212 Usage : foreach $seq ( $align->each_seq_with_id() )
213 Function :
215 Gets an array of Seq objects from the
216 alignment, the contents being those sequences
217 with the given name (there may be more than one)
219 Returns : an array
220 Argument : a seq name
222 =cut
224 sub each_seq_with_id {
225 my ($self) = @_;
226 $self->throw_not_implemented();
229 =head2 get_seq_by_pos
231 Title : get_seq_by_pos
232 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
233 Function :
235 Gets a sequence based on its position in the alignment.
236 Numbering starts from 1. Sequence positions larger than
237 no_sequences() will throw an error.
239 Returns : a Bio::LocatableSeq object
240 Argument : positive integer for the sequence position
242 =cut
244 sub get_seq_by_pos {
245 my ($self) = @_;
246 $self->throw_not_implemented();
249 =head1 Create new alignments
251 The results of these methods are horizontal or vertical subsets of the
252 current MSE.
254 =head2 select
256 Title : select
257 Usage : $aln2 = $aln->select(1, 3) # first three sequences
258 Function :
260 Creates a new alignment from a continuous subset of
261 sequences. Numbering starts from 1. Sequence positions
262 larger than no_sequences() will throw an error.
264 Returns : a Bio::SimpleAlign object
265 Argument : positive integer for the first sequence
266 positive integer for the last sequence to include (optional)
268 =cut
270 sub select {
271 my ($self) = @_;
272 $self->throw_not_implemented();
276 =head2 select_noncont
278 Title : select_noncont
279 Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences
280 Function :
282 Creates a new alignment from a subset of
283 sequences. Numbering starts from 1. Sequence positions
284 larger than no_sequences() will throw an error.
286 Returns : a Bio::SimpleAlign object
287 Args : array of integers for the sequences
289 =cut
291 sub select_noncont {
292 my ($self) = @_;
293 $self->throw_not_implemented();
296 =head2 slice
298 Title : slice
299 Usage : $aln2 = $aln->slice(20, 30)
300 Function :
302 Creates a slice from the alignment inclusive of start and
303 end columns. Sequences with no residues in the slice are
304 excluded from the new alignment and a warning is printed.
305 Slice beyond the length of the sequence does not do
306 padding.
308 Returns : a Bio::SimpleAlign object
309 Argument : positive integer for start column
310 positive integer for end column
312 =cut
314 sub slice {
315 my ($self) = @_;
316 $self->throw_not_implemented();
319 =head1 Change sequences within the MSE
321 These methods affect characters in all sequences without changing the
322 alignment.
325 =head2 map_chars
327 Title : map_chars
328 Usage : $ali->map_chars('\.','-')
329 Function :
331 Does a s/$arg1/$arg2/ on the sequences. Useful for gap
332 characters
334 Notice that the from (arg1) is interpreted as a regex,
335 so be careful about quoting meta characters (eg
336 $ali->map_chars('.','-') won't do what you want)
338 Returns : None
339 Argument : 'from' regexp
340 'to' string
342 =cut
344 sub map_chars {
345 my ($self) = @_;
346 $self->throw_not_implemented();
349 =head2 uppercase
351 Title : uppercase()
352 Usage : $ali->uppercase()
353 Function : Sets all the sequences to uppercase
354 Returns :
355 Argument :
357 =cut
359 sub uppercase {
360 my ($self) = @_;
361 $self->throw_not_implemented();
364 =head2 match_line
366 Title : match_line()
367 Usage : $align->match_line()
368 Function : Generates a match line - much like a consensus string,
369 except that it is a line with a '*' indicating each match.
370 Argument : (optional) Match line characters ('*' by default)
371 (optional) Strong match char (':' by default)
372 (optional) Weak match char ('.' by default)
374 =cut
376 sub match_line {
377 my ($self) = @_;
378 $self->throw_not_implemented();
381 =head2 match
383 Title : match()
384 Usage : $ali->match()
385 Function :
387 Goes through all columns and changes residues that are
388 identical to residue in first sequence to match '.'
389 character. Sets match_char.
391 USE WITH CARE: Most MSE formats do not support match
392 characters in sequences, so this is mostly for output
393 only. NEXUS format (Bio::AlignIO::nexus) can handle it.
396 Returns : 1
397 Argument : a match character, optional, defaults to '.'
399 =cut
401 sub match {
402 my ($self) = @_;
403 $self->throw_not_implemented();
406 =head2 unmatch
408 Title : unmatch()
409 Usage : $ali->unmatch()
410 Function :
412 Undoes the effect of method match. Unsets match_char.
414 Returns : 1
415 Argument : a match character, optional, defaults to '.'
417 =cut
419 sub unmatch {
420 my ($self) = @_;
421 $self->throw_not_implemented();
425 =head1 MSE attributes
427 Methods for setting and reading the MSE attributes.
429 Note that the methods defining character semantics depend on the user
430 to set them sensibly. They are needed only by certain input/output
431 methods. Unset them by setting to an empty string ('').
433 =head2 id
435 Title : id
436 Usage : $myalign->id("Ig")
437 Function : Gets/sets the id field of the alignment
438 Returns : An id string
439 Argument : An id string (optional)
441 =cut
443 sub id {
444 my ($self) = @_;
445 $self->throw_not_implemented();
448 =head2 missing_char
450 Title : missing_char
451 Usage : $myalign->missing_char("?")
452 Function : Gets/sets the missing_char attribute of the alignment
453 It is generally recommended to set it to 'n' or 'N'
454 for nucleotides and to 'X' for protein.
455 Returns : A missing_char string
456 Argument : A missing_char string (optional)
458 =cut
460 sub missing_char {
461 my ($self) = @_;
462 $self->throw_not_implemented();
465 =head2 match_char
467 Title : match_char
468 Usage : $myalign->match_char('.')
469 Function : Gets/sets the match_char attribute of the alignment
470 Returns : A match_char string
471 Argument : A match_char string (optional)
473 =cut
475 sub match_char {
476 my ($self) = @_;
477 $self->throw_not_implemented();
480 =head2 gap_char
482 Title : gap_char
483 Usage : $myalign->gap_char('-')
484 Function : Gets/sets the gap_char attribute of the alignment
485 Returns : A gap_char string, defaults to '-'
486 Argument : A gap_char string (optional)
488 =cut
490 sub gap_char {
491 my ($self) = @_;
492 $self->throw_not_implemented();
495 =head2 symbol_chars
497 Title : symbol_chars
498 Usage : my @symbolchars = $aln->symbol_chars;
499 Function: Returns all the seen symbols (other than gaps)
500 Returns : array of characters that are the seen symbols
501 Argument: boolean to include the gap/missing/match characters
503 =cut
505 sub symbol_chars{
506 my ($self) = @_;
507 $self->throw_not_implemented();
510 =head1 Alignment descriptors
512 These read only methods describe the MSE in various ways.
515 =head2 consensus_string
517 Title : consensus_string
518 Usage : $str = $ali->consensus_string($threshold_percent)
519 Function : Makes a strict consensus
520 Returns : consensus string
521 Argument : Optional threshold ranging from 0 to 100.
522 The consensus residue has to appear in at least threshold %
523 of the sequences at a given location, otherwise a '?'
524 character will be placed at that location.
525 (Default value = 0%)
527 =cut
529 sub consensus_string {
530 my ($self) = @_;
531 $self->throw_not_implemented();
534 =head2 consensus_iupac
536 Title : consensus_iupac
537 Usage : $str = $ali->consensus_iupac()
538 Function :
540 Makes a consensus using IUPAC ambiguity codes from DNA
541 and RNA. The output is in upper case except when gaps in
542 a column force output to be in lower case.
544 Note that if your alignment sequences contain a lot of
545 IUPAC ambiguity codes you often have to manually set
546 alphabet. Bio::PrimarySeq::_guess_type thinks they
547 indicate a protein sequence.
549 Returns : consensus string
550 Argument : none
551 Throws : on protein sequences
554 =cut
556 sub consensus_iupac {
557 my ($self) = @_;
558 $self->throw_not_implemented();
561 =head2 is_flush
563 Title : is_flush
564 Usage : if( $ali->is_flush() )
567 Function : Tells you whether the alignment
568 : is flush, ie all of the same length
571 Returns : 1 or 0
572 Argument :
574 =cut
576 sub is_flush {
577 my ($self) = @_;
578 $self->throw_not_implemented();
581 =head2 length
583 Title : length()
584 Usage : $len = $ali->length()
585 Function : Returns the maximum length of the alignment.
586 To be sure the alignment is a block, use is_flush
587 Returns : integer
588 Argument :
590 =cut
592 sub length {
593 my ($self) = @_;
594 $self->throw_not_implemented();
597 =head2 maxname_length
599 Title : maxname_length
600 Usage : $ali->maxname_length()
601 Function :
603 Gets the maximum length of the displayname in the
604 alignment. Used in writing out various MSE formats.
606 Returns : integer
607 Argument :
609 =cut
611 sub maxname_length {
612 my ($self) = @_;
613 $self->throw_not_implemented();
616 =head2 no_residues
618 Title : no_residues
619 Usage : $no = $ali->no_residues
620 Function : number of residues in total in the alignment
621 Returns : integer
622 Argument :
624 =cut
626 sub no_residues {
627 my ($self) = @_;
628 $self->throw_not_implemented();
631 =head2 no_sequences
633 Title : no_sequences
634 Usage : $depth = $ali->no_sequences
635 Function : number of sequences in the sequence alignment
636 Returns : integer
637 Argument : None
639 =cut
641 sub no_sequences {
642 my ($self) = @_;
643 $self->throw_not_implemented();
646 =head2 percentage_identity
648 Title : percentage_identity
649 Usage : $id = $align->percentage_identity
650 Function: The function calculates the percentage identity of the alignment
651 Returns : The percentage identity of the alignment (as defined by the
652 implementation)
653 Argument: None
655 =cut
657 sub percentage_identity{
658 my ($self) = @_;
659 $self->throw_not_implemented();
662 =head2 overall_percentage_identity
664 Title : overall_percentage_identity
665 Usage : $id = $align->overall_percentage_identity
666 Function: The function calculates the percentage identity of
667 the conserved columns
668 Returns : The percentage identity of the conserved columns
669 Args : None
671 =cut
673 sub overall_percentage_identity{
674 my ($self) = @_;
675 $self->throw_not_implemented();
679 =head2 average_percentage_identity
681 Title : average_percentage_identity
682 Usage : $id = $align->average_percentage_identity
683 Function: The function uses a fast method to calculate the average
684 percentage identity of the alignment
685 Returns : The average percentage identity of the alignment
686 Args : None
688 =cut
690 sub average_percentage_identity{
691 my ($self) = @_;
692 $self->throw_not_implemented();
695 =head1 Alignment positions
697 Methods to map a sequence position into an alignment column and back.
698 column_from_residue_number() does the former. The latter is really a
699 property of the sequence object and can be done using
700 L<Bio::LocatableSeq::location_from_column>:
702 # select somehow a sequence from the alignment, e.g.
703 my $seq = $aln->get_seq_by_pos(1);
704 #$loc is undef or Bio::LocationI object
705 my $loc = $seq->location_from_column(5);
708 =head2 column_from_residue_number
710 Title : column_from_residue_number
711 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
712 Function:
714 This function gives the position in the alignment
715 (i.e. column number) of the given residue number in the
716 sequence with the given name. For example, for the
717 alignment
719 Seq1/91-97 AC..DEF.GH
720 Seq2/24-30 ACGG.RTY..
721 Seq3/43-51 AC.DDEFGHI
723 column_from_residue_number( "Seq1", 94 ) returns 6.
724 column_from_residue_number( "Seq2", 25 ) returns 2.
725 column_from_residue_number( "Seq3", 50 ) returns 9.
727 An exception is thrown if the residue number would lie
728 outside the length of the alignment
729 (e.g. column_from_residue_number( "Seq2", 22 )).
731 Note: If the parent sequence is represented by more than one
732 alignment sequence and the residue number is present in
733 them, this method finds only the first one.
735 Returns : A column number for the position in the alignment of the
736 given residue in the given sequence (1 = first column)
737 Args : A sequence id/name (not a name/start-end)
738 A residue number in the whole sequence (not just that
739 segment of it in the alignment)
741 =cut
743 sub column_from_residue_number {
744 my ($self) = @_;
745 $self->throw_not_implemented();
748 =head1 Sequence names
750 Methods to manipulate the display name. The default name based on the
751 sequence id and subsequence positions can be overridden in various
752 ways.
754 =head2 displayname
756 Title : displayname
757 Usage : $myalign->displayname("Ig", "IgA")
758 Function : Gets/sets the display name of a sequence in the alignment
760 Returns : A display name string
761 Argument : name of the sequence
762 displayname of the sequence (optional)
764 =cut
766 sub displayname {
767 my ($self) = @_;
768 $self->throw_not_implemented();
771 =head2 set_displayname_count
773 Title : set_displayname_count
774 Usage : $ali->set_displayname_count
775 Function :
777 Sets the names to be name_# where # is the number of
778 times this name has been used.
780 Returns : None
781 Argument : None
783 =cut
785 sub set_displayname_count {
786 my ($self) = @_;
787 $self->throw_not_implemented();
790 =head2 set_displayname_flat
792 Title : set_displayname_flat
793 Usage : $ali->set_displayname_flat()
794 Function : Makes all the sequences be displayed as just their name,
795 not name/start-end
796 Returns : 1
797 Argument : None
799 =cut
801 sub set_displayname_flat {
802 my ($self) = @_;
803 $self->throw_not_implemented();
806 =head2 set_displayname_normal
808 Title : set_displayname_normal
809 Usage : $ali->set_displayname_normal()
810 Function : Makes all the sequences be displayed as name/start-end
811 Returns : None
812 Argument : None
814 =cut
816 sub set_displayname_normal {
817 my ($self) = @_;
818 $self->throw_not_implemented();