nexml.t: Added a missing XML::Twig requirement.
[bioperl-live.git] / Bio / PopGen / Utilities.pm
blob5a2cbc4e888d3ec79107d7b8aeb901871b12d8de
2 # BioPerl module for Bio::PopGen::Utilities
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::PopGen::Utilities - Utilities for working with PopGen data and objects
18 =head1 SYNOPSIS
20 use Bio::PopGen::Utilities;
21 use Bio::AlignIO;
23 my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
24 -format => 'clustalw');
25 my $aln = $in->next_aln;
26 # get a population, each sequence is an individual and
27 # for the default case, every site which is not monomorphic
28 # is a 'marker'. Each individual will have a 'genotype' for the
29 # site which will be the specific base in the alignment at that
30 # site
31 my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
33 # get the synonymous sites from the alignemt only as the 'genotypes'
34 # for the population
35 my $synpop = Bio::PopGen::Utilities->aln_to_population(-site_model => 'cod',
36 -alignment => $aln);
39 =head1 DESCRIPTION
41 This object provides some convience function to turn sequence
42 alignments into usable objects for the Population genetics modules
43 (Bio::PopGen).
45 =head1 FEEDBACK
47 =head2 Mailing Lists
49 User feedback is an integral part of the evolution of this and other
50 Bioperl modules. Send your comments and suggestions preferably to
51 the Bioperl mailing list. Your participation is much appreciated.
53 bioperl-l@bioperl.org - General discussion
54 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
56 =head2 Support
58 Please direct usage questions or support issues to the mailing list:
60 I<bioperl-l@bioperl.org>
62 rather than to the module maintainer directly. Many experienced and
63 reponsive experts will be able look at the problem and quickly
64 address it. Please include a thorough description of the problem
65 with code and data examples if at all possible.
67 =head2 Reporting Bugs
69 Report bugs to the Bioperl bug tracking system to help us keep track
70 of the bugs and their resolution. Bug reports can be submitted via
71 the web:
73 https://github.com/bioperl/bioperl-live/issues
75 =head1 AUTHOR - Jason Stajich
77 Email jason-at-bioperl-dot-org
79 =head1 APPENDIX
81 The rest of the documentation details each of the object methods.
82 Internal methods are usually preceded with a _
84 =cut
87 # Let the code begin...
90 package Bio::PopGen::Utilities;
91 use strict;
93 use Bio::Align::DNAStatistics;
94 use Bio::PopGen::Population;
95 use Bio::PopGen::Individual;
97 use base qw(Bio::Root::Root);
98 use constant CodonLen => 3;
101 =head2 aln_to_population
103 Title : aln_to_population
104 Usage : my $pop = Bio::PopGen::Utilities->aln_to_population($aln);
105 Function: Turn and alignment into a set of L<Bio::PopGen::Individual>
106 objects grouped in a L<Bio::PopGen::Population> object
108 Sites are treated as 'Markers' in the Bioperl PopGen object
109 model in the sense that a site is a unique location for which
110 an individual will have a genotype (a set of alleles).
111 In this implementation we are assuming that each individual
112 has a single entry in the alignment file.
114 Specify a site model as one of those listed
115 'all' -- every base in the alignment is considered a site
116 'cod' -- codons
118 The option -site_model
119 for All sites : 'all'
120 Codon sites : 'cod' or 'codon'
122 To see all sites, including those which are fixed in the population
123 add -include_monomorphic => 1
124 to the arguments
125 Returns :
126 Args : -include_monomorphic => 1 to specify all sites,
127 even those which are monomorphic
128 in the population
129 (useful for HKA test mostly)
130 [default is false]
131 -phase => specify a phase for the data, this is only
132 used if the site_mode is codon
133 [default is 0]
134 -site_model => one-of 'all', 'codon'
135 to specify a site model for the data extraction
136 from the alignment
137 [default is all]
138 -alignment => provide a L<Bio::SimpleAlign> object [required]
140 =cut
142 sub aln_to_population{
143 my ($self,@args) = @_;
144 my ($aln,
145 $sitemodel,$phase,
146 $includefixed,$checkisa) = $self->_rearrange([qw(ALIGNMENT
147 SITE_MODEL
148 PHASE
149 INCLUDE_MONOMORPHIC
150 CHECKISA)],
151 @args);
153 my %ambig_code = ('?' => ['?','?'],
154 'N' => ['?','?'],
155 '-' => ['?','?'],
156 'G' => ['G','G'],
157 'A' => ['A','A'],
158 'T' => ['T','T'],
159 'C' => ['C','C'],
160 'R' => ['A','G'],
161 'Y' => ['C','T'],
162 'W' => ['T','A'],
163 'M' => ['C','A'],
164 'S' => ['C','G'],
165 'K' => ['G','T']);
167 if( ! defined $aln ) {
168 $self->warn("Must provide a valid Bio::SimpleAlign object to run aln_to_population");
169 return;
172 if( ! $aln->is_flush ) {
173 $self->warn("Must provide a Bio::SimpleAlign object with aligned sequences to aln_to_population!");
174 return;
176 $phase = 0 unless defined $phase;
177 if( $phase != 0 && $phase != 1 && $phase != 2 ) {
178 warn("phase must be 0,1, or 2");
179 return;
181 my $alength = $aln->length;
182 my @inds;
183 if( ! defined $sitemodel || $sitemodel =~ /all/i ) {
184 my $ct = 0;
185 my @seqs;
186 for my $seq ( $aln->each_seq ) {
187 push @seqs, $seq->seq;
188 push @inds, Bio::PopGen::Individual->new(-unique_id => $seq->display_id);
191 for( my $i = 0; $i < $alength; $i++ ) {
192 my (@genotypes,%set);
194 # do we skip indels?
195 # slicing vertically
196 for my $seq ( @seqs ) {
197 my $site = uc(substr($seq,$i,1));
198 push @genotypes, $ambig_code{$site};
199 $set{$site}++;
201 if( keys %set > 1 || $includefixed ) {
202 my $genoct = scalar @genotypes;
203 for( my $j = 0; $j < $genoct; $j++ ) {
204 $inds[$j]->add_Genotype(Bio::PopGen::Genotype->new
205 (-marker_name => ($i+1),
206 -individual_id=> $inds[$j]->unique_id,
207 -alleles => $genotypes[$j]));
211 } elsif( $sitemodel =~ /cod(on)?/i ) {
212 my $ct = 0;
213 my @seqs;
214 for my $seq ( $aln->each_seq ) {
215 push @seqs, $seq->seq;
216 push @inds, Bio::PopGen::Individual->new(-unique_id => $seq->display_id);
218 my $codonct = 0;
219 for( my $i = $phase; $i < $alength; $i += CodonLen ) {
220 my (@genotypes,%set,$genoct);
222 for my $seq ( @seqs ) {
223 my @unambig_site;
224 my $site = uc(substr($seq,$i,CodonLen));
225 if( length($site) < CodonLen ) {
226 # at end of alignment and this is not in phase
227 $self->debug("phase was $phase, but got to end of alignment with overhang of $site");
228 next;
230 # do we check for gaps/indels here?
231 for (my $pos=0; $pos<CodonLen; $pos++)
233 $unambig_site[0] .= $ambig_code{substr($site, $pos, 1)}[0];
234 $unambig_site[1] .= $ambig_code{substr($site, $pos, 1)}[1];
236 push @genotypes, [@unambig_site];
237 $set{$site}++;
239 $genoct = scalar @genotypes;
241 # do we include fixed sites? I think we should leave it to the user.
242 if( keys %set > 1 || $includefixed ) {
243 for( my $j = 0; $j < $genoct; $j++ ) {
244 $inds[$j]->add_Genotype(Bio::PopGen::Genotype->new
245 (-marker_name => ($i/CodonLen),
246 -individual_id=> $inds[$j]->unique_id,
247 -alleles => $genotypes[$j]));
249 $codonct++;
252 } else {
253 $self->throw("Can only build sites based on all the data right now!");
255 return Bio::PopGen::Population->new(-checkisa => 0,
256 -source => 'alignment',
257 -individuals=> \@inds);