2 # BioPerl module for Bio::PopGen::Utilities
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::PopGen::Utilities - Utilities for working with PopGen data and objects
20 use Bio::PopGen::Utilities;
23 my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
24 -format => 'clustalw');
25 my $aln = $in->next_aln;
26 # get a population, each sequence is an individual and
27 # for the default case, every site which is not monomorphic
28 # is a 'marker'. Each individual will have a 'genotype' for the
29 # site which will be the specific base in the alignment at that
31 my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
33 # get the synonymous sites from the alignemt only as the 'genotypes'
35 my $synpop = Bio::PopGen::Utilities->aln_to_population(-site_model => 'cod',
41 This object provides some convience function to turn sequence
42 alignments into usable objects for the Population genetics modules
49 User feedback is an integral part of the evolution of this and other
50 Bioperl modules. Send your comments and suggestions preferably to
51 the Bioperl mailing list. Your participation is much appreciated.
53 bioperl-l@bioperl.org - General discussion
54 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
58 Please direct usage questions or support issues to the mailing list:
60 I<bioperl-l@bioperl.org>
62 rather than to the module maintainer directly. Many experienced and
63 reponsive experts will be able look at the problem and quickly
64 address it. Please include a thorough description of the problem
65 with code and data examples if at all possible.
69 Report bugs to the Bioperl bug tracking system to help us keep track
70 of the bugs and their resolution. Bug reports can be submitted via
73 https://github.com/bioperl/bioperl-live/issues
75 =head1 AUTHOR - Jason Stajich
77 Email jason-at-bioperl-dot-org
81 The rest of the documentation details each of the object methods.
82 Internal methods are usually preceded with a _
87 # Let the code begin...
90 package Bio
::PopGen
::Utilities
;
93 use Bio
::Align
::DNAStatistics
;
94 use Bio
::PopGen
::Population
;
95 use Bio
::PopGen
::Individual
;
97 use base
qw(Bio::Root::Root);
98 use constant CodonLen
=> 3;
101 =head2 aln_to_population
103 Title : aln_to_population
104 Usage : my $pop = Bio::PopGen::Utilities->aln_to_population($aln);
105 Function: Turn and alignment into a set of L<Bio::PopGen::Individual>
106 objects grouped in a L<Bio::PopGen::Population> object
108 Sites are treated as 'Markers' in the Bioperl PopGen object
109 model in the sense that a site is a unique location for which
110 an individual will have a genotype (a set of alleles).
111 In this implementation we are assuming that each individual
112 has a single entry in the alignment file.
114 Specify a site model as one of those listed
115 'all' -- every base in the alignment is considered a site
118 The option -site_model
119 for All sites : 'all'
120 Codon sites : 'cod' or 'codon'
122 To see all sites, including those which are fixed in the population
123 add -include_monomorphic => 1
126 Args : -include_monomorphic => 1 to specify all sites,
127 even those which are monomorphic
129 (useful for HKA test mostly)
131 -phase => specify a phase for the data, this is only
132 used if the site_mode is codon
134 -site_model => one-of 'all', 'codon'
135 to specify a site model for the data extraction
138 -alignment => provide a L<Bio::SimpleAlign> object [required]
142 sub aln_to_population
{
143 my ($self,@args) = @_;
146 $includefixed,$checkisa) = $self->_rearrange([qw(ALIGNMENT
153 my %ambig_code = ('?' => ['?','?'],
167 if( ! defined $aln ) {
168 $self->warn("Must provide a valid Bio::SimpleAlign object to run aln_to_population");
172 if( ! $aln->is_flush ) {
173 $self->warn("Must provide a Bio::SimpleAlign object with aligned sequences to aln_to_population!");
176 $phase = 0 unless defined $phase;
177 if( $phase != 0 && $phase != 1 && $phase != 2 ) {
178 warn("phase must be 0,1, or 2");
181 my $alength = $aln->length;
183 if( ! defined $sitemodel || $sitemodel =~ /all/i ) {
186 for my $seq ( $aln->each_seq ) {
187 push @seqs, $seq->seq;
188 push @inds, Bio
::PopGen
::Individual
->new(-unique_id
=> $seq->display_id);
191 for( my $i = 0; $i < $alength; $i++ ) {
192 my (@genotypes,%set);
196 for my $seq ( @seqs ) {
197 my $site = uc(substr($seq,$i,1));
198 push @genotypes, $ambig_code{$site};
201 if( keys %set > 1 || $includefixed ) {
202 my $genoct = scalar @genotypes;
203 for( my $j = 0; $j < $genoct; $j++ ) {
204 $inds[$j]->add_Genotype(Bio
::PopGen
::Genotype
->new
205 (-marker_name
=> ($i+1),
206 -individual_id
=> $inds[$j]->unique_id,
207 -alleles
=> $genotypes[$j]));
211 } elsif( $sitemodel =~ /cod(on)?/i ) {
214 for my $seq ( $aln->each_seq ) {
215 push @seqs, $seq->seq;
216 push @inds, Bio
::PopGen
::Individual
->new(-unique_id
=> $seq->display_id);
219 for( my $i = $phase; $i < $alength; $i += CodonLen
) {
220 my (@genotypes,%set,$genoct);
222 for my $seq ( @seqs ) {
224 my $site = uc(substr($seq,$i,CodonLen
));
225 if( length($site) < CodonLen
) {
226 # at end of alignment and this is not in phase
227 $self->debug("phase was $phase, but got to end of alignment with overhang of $site");
230 # do we check for gaps/indels here?
231 for (my $pos=0; $pos<CodonLen
; $pos++)
233 $unambig_site[0] .= $ambig_code{substr($site, $pos, 1)}[0];
234 $unambig_site[1] .= $ambig_code{substr($site, $pos, 1)}[1];
236 push @genotypes, [@unambig_site];
239 $genoct = scalar @genotypes;
241 # do we include fixed sites? I think we should leave it to the user.
242 if( keys %set > 1 || $includefixed ) {
243 for( my $j = 0; $j < $genoct; $j++ ) {
244 $inds[$j]->add_Genotype(Bio
::PopGen
::Genotype
->new
245 (-marker_name
=> ($i/CodonLen
),
246 -individual_id
=> $inds[$j]->unique_id,
247 -alleles
=> $genotypes[$j]));
253 $self->throw("Can only build sites based on all the data right now!");
255 return Bio
::PopGen
::Population
->new(-checkisa
=> 0,
256 -source
=> 'alignment',
257 -individuals
=> \
@inds);