3 # BioPerl module for Bio::PopGen::Utilities
5 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::PopGen::Utilities - Utilities for working with PopGen data and objects
19 use Bio::PopGen::Utilities;
22 my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
23 -format => 'clustalw');
24 my $aln = $in->next_aln;
25 # get a population, each sequence is an individual and
26 # for the default case, every site which is not monomorphic
27 # is a 'marker'. Each individual will have a 'genotype' for the
28 # site which will be the specific base in the alignment at that
30 my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
32 # get the synonymous sites from the alignemt only as the 'genotypes'
34 my $synpop = Bio::PopGen::Utilities->aln_to_population(-site_model => 'syn',
40 This object provides some convience function to turn sequence
41 alignments into usable objects for the Population genetics modules
48 User feedback is an integral part of the evolution of this and other
49 Bioperl modules. Send your comments and suggestions preferably to
50 the Bioperl mailing list. Your participation is much appreciated.
52 bioperl-l@bioperl.org - General discussion
53 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
57 Report bugs to the Bioperl bug tracking system to help us keep track
58 of the bugs and their resolution. Bug reports can be submitted via
61 http://bugzilla.open-bio.org/
63 =head1 AUTHOR - Jason Stajich
65 Email jason-at-bioperl-dot-org
69 The rest of the documentation details each of the object methods.
70 Internal methods are usually preceded with a _
75 # Let the code begin...
78 package Bio
::PopGen
::Utilities
;
81 use Bio
::Align
::DNAStatistics
;
82 use Bio
::PopGen
::Population
;
83 use Bio
::PopGen
::Individual
;
85 use base
qw(Bio::Root::Root);
86 use constant CodonLen
=> 3;
89 =head2 aln_to_population
91 Title : aln_to_population
92 Usage : my $pop = Bio::PopGen::Utilities->aln_to_population($aln);
93 Function: Turn and alignment into a set of L<Bio::PopGen::Individual>
94 objects grouped in a L<Bio::PopGen::Population> object
96 Sites are treated as 'Markers' in the Bioperl PopGen object
97 model in the sense that a site is a unique location for which
98 an individual will have a genotype (a set of alleles).
99 In this implementation we are assuming that each individual
100 has a single entry in the alignment file.
102 Specify a site model as one of those listed
103 'all' -- every base in the alignment is considered a site
106 The option -site_model
107 for All sites : 'all'
108 Codon sites : 'cod' or 'codon'
110 To see all sites, including those which are fixed in the population
111 add -include_monomorphic => 1
114 Args : -include_monomorphic => 1 to specify all sites,
115 even those which are monomorphic
117 (useful for HKA test mostly)
119 -phase => specify a phase for the data, this is only
120 used if the site_mode is codon
122 -site_model => one-of 'all', 'codon'
123 to specify a site model for the data extraction
126 -alignment => provide a L<Bio::SimpleAlign> object [required]
130 sub aln_to_population
{
131 my ($self,@args) = @_;
134 $includefixed,$checkisa) = $self->_rearrange([qw(ALIGNMENT
140 if( ! defined $aln ) {
141 $self->warn("Must provide a valid Bio::SimpleAlign object to run aln_to_population");
145 if( ! $aln->is_flush ) {
146 $self->warn("Must provide a Bio::SimpleAlign object with aligned sequences to aln_to_population!");
149 $phase = 0 unless defined $phase;
150 if( $phase != 0 && $phase != 1 && $phase != 2 ) {
151 warn("phase must be 0,1, or 2");
154 my $alength = $aln->length;
156 if( ! defined $sitemodel || $sitemodel =~ /all/i ) {
159 for my $seq ( $aln->each_seq ) {
160 push @seqs, $seq->seq;
161 push @inds, Bio
::PopGen
::Individual
->new(-unique_id
=> $seq->display_id);
164 for( my $i = 0; $i < $alength; $i++ ) {
166 my (@genotypes,%set);
170 for my $seq ( @seqs ) {
171 my $site = substr($seq,$i,1);
173 push @genotypes, $site;
175 if( keys %set > 1 || $includefixed ) {
176 my $genoct = scalar @genotypes;
177 for( my $j = 0; $j < $genoct; $j++ ) {
178 $inds[$j]->add_Genotype(Bio
::PopGen
::Genotype
->new
179 (-marker_name
=> $nm,
180 -individual_id
=> $inds[$j]->unique_id,
181 -alleles
=> [$genotypes[$j]]));
185 } elsif( $sitemodel =~ /cod(on)?/i ) {
188 for my $seq ( $aln->each_seq ) {
189 push @seqs, $seq->seq;
190 push @inds, Bio
::PopGen
::Individual
->new(-unique_id
=> $seq->display_id);
193 for( my $i = $phase; $i < $alength; $i += CodonLen
) {
194 my $nm = "Codon-$codonct-$i";
195 my (@genotypes,%set,$genoct);
197 for my $seq ( @seqs ) {
198 my $site = substr($seq,$i,CodonLen
);
199 if( length($site) < CodonLen
) {
200 # at end of alignment and this is not in phase
201 $self->debug("phase was $phase, but got to end of alignment with overhang of $site");
204 # do we check for gaps/indels here?
206 push @genotypes, $site;
208 $genoct = scalar @genotypes;
210 # do we include fixed sites? yes I think so since this is
211 # typically being used by MK
212 for( my $j = 0; $j < $genoct; $j++ ) {
213 $inds[$j]->add_Genotype(Bio
::PopGen
::Genotype
->new
214 (-marker_name
=> $nm,
215 -individual_id
=> $inds[$j]->unique_id,
216 -alleles
=> [$genotypes[$j]]));
221 $self->throw("Can only build sites based on all the data right now!");
223 return Bio
::PopGen
::Population
->new(-checkisa
=> 0,
224 -source
=> 'alignment',
225 -individuals
=> \
@inds);