bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / PopGen / Utilities.pm
bloba9cee8ad309dbe1f9f749acc445d245171ffceb7
1 # $Id$
3 # BioPerl module for Bio::PopGen::Utilities
5 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::PopGen::Utilities - Utilities for working with PopGen data and objects
17 =head1 SYNOPSIS
19 use Bio::PopGen::Utilities;
20 use Bio::AlignIO;
22 my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
23 -format => 'clustalw');
24 my $aln = $in->next_aln;
25 # get a population, each sequence is an individual and
26 # for the default case, every site which is not monomorphic
27 # is a 'marker'. Each individual will have a 'genotype' for the
28 # site which will be the specific base in the alignment at that
29 # site
30 my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
32 # get the synonymous sites from the alignemt only as the 'genotypes'
33 # for the population
34 my $synpop = Bio::PopGen::Utilities->aln_to_population(-site_model => 'syn',
35 -alignment => $aln);
38 =head1 DESCRIPTION
40 This object provides some convience function to turn sequence
41 alignments into usable objects for the Population genetics modules
42 (Bio::PopGen).
44 =head1 FEEDBACK
46 =head2 Mailing Lists
48 User feedback is an integral part of the evolution of this and other
49 Bioperl modules. Send your comments and suggestions preferably to
50 the Bioperl mailing list. Your participation is much appreciated.
52 bioperl-l@bioperl.org - General discussion
53 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55 =head2 Reporting Bugs
57 Report bugs to the Bioperl bug tracking system to help us keep track
58 of the bugs and their resolution. Bug reports can be submitted via
59 the web:
61 http://bugzilla.open-bio.org/
63 =head1 AUTHOR - Jason Stajich
65 Email jason-at-bioperl-dot-org
67 =head1 APPENDIX
69 The rest of the documentation details each of the object methods.
70 Internal methods are usually preceded with a _
72 =cut
75 # Let the code begin...
78 package Bio::PopGen::Utilities;
79 use strict;
81 use Bio::Align::DNAStatistics;
82 use Bio::PopGen::Population;
83 use Bio::PopGen::Individual;
85 use base qw(Bio::Root::Root);
86 use constant CodonLen => 3;
89 =head2 aln_to_population
91 Title : aln_to_population
92 Usage : my $pop = Bio::PopGen::Utilities->aln_to_population($aln);
93 Function: Turn and alignment into a set of L<Bio::PopGen::Individual>
94 objects grouped in a L<Bio::PopGen::Population> object
96 Sites are treated as 'Markers' in the Bioperl PopGen object
97 model in the sense that a site is a unique location for which
98 an individual will have a genotype (a set of alleles).
99 In this implementation we are assuming that each individual
100 has a single entry in the alignment file.
102 Specify a site model as one of those listed
103 'all' -- every base in the alignment is considered a site
104 'cod' -- codons
106 The option -site_model
107 for All sites : 'all'
108 Codon sites : 'cod' or 'codon'
110 To see all sites, including those which are fixed in the population
111 add -include_monomorphic => 1
112 to the arguments
113 Returns :
114 Args : -include_monomorphic => 1 to specify all sites,
115 even those which are monomorphic
116 in the population
117 (useful for HKA test mostly)
118 [default is false]
119 -phase => specify a phase for the data, this is only
120 used if the site_mode is codon
121 [default is 0]
122 -site_model => one-of 'all', 'codon'
123 to specify a site model for the data extraction
124 from the alignment
125 [default is all]
126 -alignment => provide a L<Bio::SimpleAlign> object [required]
128 =cut
130 sub aln_to_population{
131 my ($self,@args) = @_;
132 my ($aln,
133 $sitemodel,$phase,
134 $includefixed,$checkisa) = $self->_rearrange([qw(ALIGNMENT
135 SITE_MODEL
136 PHASE
137 INCLUDE_MONOMORPHIC
138 CHECKISA)],
139 @args);
140 if( ! defined $aln ) {
141 $self->warn("Must provide a valid Bio::SimpleAlign object to run aln_to_population");
142 return;
145 if( ! $aln->is_flush ) {
146 $self->warn("Must provide a Bio::SimpleAlign object with aligned sequences to aln_to_population!");
147 return;
149 $phase = 0 unless defined $phase;
150 if( $phase != 0 && $phase != 1 && $phase != 2 ) {
151 warn("phase must be 0,1, or 2");
152 return;
154 my $alength = $aln->length;
155 my @inds;
156 if( ! defined $sitemodel || $sitemodel =~ /all/i ) {
157 my $ct = 0;
158 my @seqs;
159 for my $seq ( $aln->each_seq ) {
160 push @seqs, $seq->seq;
161 push @inds, Bio::PopGen::Individual->new(-unique_id => $seq->display_id);
164 for( my $i = 0; $i < $alength; $i++ ) {
165 my $nm = "Site-$i";
166 my (@genotypes,%set);
168 # do we skip indels?
169 # slicing vertically
170 for my $seq ( @seqs ) {
171 my $site = substr($seq,$i,1);
172 $set{$site}++;
173 push @genotypes, $site;
175 if( keys %set > 1 || $includefixed ) {
176 my $genoct = scalar @genotypes;
177 for( my $j = 0; $j < $genoct; $j++ ) {
178 $inds[$j]->add_Genotype(Bio::PopGen::Genotype->new
179 (-marker_name => $nm,
180 -individual_id=> $inds[$j]->unique_id,
181 -alleles => [$genotypes[$j]]));
185 } elsif( $sitemodel =~ /cod(on)?/i ) {
186 my $ct = 0;
187 my @seqs;
188 for my $seq ( $aln->each_seq ) {
189 push @seqs, $seq->seq;
190 push @inds, Bio::PopGen::Individual->new(-unique_id => $seq->display_id);
192 my $codonct = 0;
193 for( my $i = $phase; $i < $alength; $i += CodonLen ) {
194 my $nm = "Codon-$codonct-$i";
195 my (@genotypes,%set,$genoct);
197 for my $seq ( @seqs ) {
198 my $site = substr($seq,$i,CodonLen);
199 if( length($site) < CodonLen ) {
200 # at end of alignment and this is not in phase
201 $self->debug("phase was $phase, but got to end of alignment with overhang of $site");
202 next;
204 # do we check for gaps/indels here?
205 $set{$site}++;
206 push @genotypes, $site;
208 $genoct = scalar @genotypes;
210 # do we include fixed sites? yes I think so since this is
211 # typically being used by MK
212 for( my $j = 0; $j < $genoct; $j++ ) {
213 $inds[$j]->add_Genotype(Bio::PopGen::Genotype->new
214 (-marker_name => $nm,
215 -individual_id=> $inds[$j]->unique_id,
216 -alleles => [$genotypes[$j]]));
218 $codonct++;
220 } else {
221 $self->throw("Can only build sites based on all the data right now!");
223 return Bio::PopGen::Population->new(-checkisa => 0,
224 -source => 'alignment',
225 -individuals=> \@inds);