A test to ensure Bio::PrimarySeqI->trunc() doesn't use clone() for a Bio::Seq::RichSe...
[bioperl-live.git] / Bio / PopGen / Population.pm
blob9149802e659ca1b931dc0f1b9c6516bc2fe067be
2 # BioPerl module for Bio::PopGen::Population
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::PopGen::Population - A population of individuals
18 =head1 SYNOPSIS
20 use Bio::PopGen::Population;
21 use Bio::PopGen::Individual;
22 my $population = Bio::PopGen::Population->new();
23 my $ind = Bio::PopGen::Individual->new(-unique_id => 'id');
24 $population->add_Individual($ind);
26 for my $ind ( $population->get_Individuals ) {
27 # iterate through the individuals
30 for my $name ( $population->get_marker_names ) {
31 my $marker = $population->get_Marker($name);
34 my $num_inds = $population->get_number_individuals;
36 my $homozygote_f = $population->get_Frequency_Homozygotes;
37 my $heterozygote_f = $population->get_Frequency_Heterozygotes;
39 # make a population haploid by making fake chromosomes through
40 # haplotypes -- ala allele 1 is on chrom 1 and allele 2 is on chrom 2
41 # the number of individuals created will thus be 2 x number in
42 # population
43 my $happop = $population->haploid_population;
46 =head1 DESCRIPTION
48 This is a collection of individuals. We'll have ways of generating
49 L<Bio::PopGen::MarkerI> objects out so we can calculate allele_frequencies
50 for implementing the various statistical tests.
52 =head1 FEEDBACK
54 =head2 Mailing Lists
56 User feedback is an integral part of the evolution of this and other
57 Bioperl modules. Send your comments and suggestions preferably to
58 the Bioperl mailing list. Your participation is much appreciated.
60 bioperl-l@bioperl.org - General discussion
61 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
63 =head2 Support
65 Please direct usage questions or support issues to the mailing list:
67 I<bioperl-l@bioperl.org>
69 rather than to the module maintainer directly. Many experienced and
70 reponsive experts will be able look at the problem and quickly
71 address it. Please include a thorough description of the problem
72 with code and data examples if at all possible.
74 =head2 Reporting Bugs
76 Report bugs to the Bioperl bug tracking system to help us keep track
77 of the bugs and their resolution. Bug reports can be submitted via
78 email or the web:
80 https://github.com/bioperl/bioperl-live/issues
82 =head1 AUTHOR - Jason Stajich
84 Email jason-at-bioperl.org
86 =head1 CONTRIBUTORS
88 Matthew Hahn, matthew.hahn-at-duke.edu
90 =head1 APPENDIX
92 The rest of the documentation details each of the object methods.
93 Internal methods are usually preceded with a _
95 =cut
98 # Let the code begin...
101 package Bio::PopGen::Population;
102 use strict;
104 # Object preamble - inherits from Bio::Root::Root
106 use Bio::PopGen::Marker;
107 use Bio::PopGen::Genotype;
108 our $CheckISA = 1;
109 use base qw(Bio::Root::Root Bio::PopGen::PopulationI);
111 =head2 new
113 Title : new
114 Usage : my $obj = Bio::PopGen::Population->new();
115 Function: Builds a new Bio::PopGen::Population object
116 Returns : an instance of Bio::PopGen::Population
117 Args : -individuals => array ref of individuals (optional)
118 -name => population name (optional)
119 -source => a source tag (optional)
120 -description => a short description string of the population (optional)
122 =cut
124 sub new {
125 my($class,@args) = @_;
127 my $self = $class->SUPER::new(@args);
128 $self->{'_individuals'} = [];
129 my ($name,$source,$description,
130 $inds,$checkisa) = $self->_rearrange([qw(NAME
131 SOURCE
132 DESCRIPTION
133 INDIVIDUALS
134 CHECKISA)], @args);
135 if( defined $inds ) {
136 if( ref($inds) !~ /ARRAY/i ) {
137 $self->warn("Need to provide a value array ref for the -individuals initialization flag");
138 } else {
139 $self->add_Individual(@$inds);
143 defined $name && $self->name($name);
144 defined $source && $self->source($source);
145 defined $description && $self->description($description);
146 $self->{'_checkisa'} = defined $checkisa ? $checkisa : $CheckISA;
147 return $self;
151 =head2 name
153 Title : name
154 Usage : my $name = $pop->name
155 Function: Get the population name
156 Returns : string representing population name
157 Args : [optional] string representing population name
160 =cut
162 sub name{
163 my $self = shift;
164 return $self->{'_name'} = shift if @_;
165 return $self->{'_name'};
168 =head2 description
170 Title : description
171 Usage : my $description = $pop->description
172 Function: Get the population description
173 Returns : string representing population description
174 Args : [optional] string representing population description
177 =cut
179 sub description{
180 my $self = shift;
181 return $self->{'_description'} = shift if @_;
182 return $self->{'_description'};
185 =head2 source
187 Title : source
188 Usage : my $source = $pop->source
189 Function: Get the population source
190 Returns : string representing population source
191 Args : [optional] string representing population source
194 =cut
196 sub source{
197 my $self = shift;
198 return $self->{'_source'} = shift if @_;
199 return $self->{'_source'};
202 =head2 annotation
204 Title : annotation
205 Usage : my $annotation_collection = $pop->annotation;
206 Function: Get/set a Bio::AnnotationCollectionI for this population
207 Returns : Bio::AnnotationCollectionI object
208 Args : [optional set] Bio::AnnotationCollectionI object
210 =cut
212 sub annotation{
213 my ($self, $arg) = @_;
214 return $self->{_annotation} unless $arg;
215 $self->throw("Bio::AnnotationCollectionI required for argument") unless
216 ref($arg) && $arg->isa('Bio::AnnotationCollectionI');
217 return $self->{_annotation} = $arg;
220 =head2 set_Allele_Frequency
222 Title : set_Allele_Frequency
223 Usage : $population->set_Allele_Frequency('marker' => { 'allele1' => 0.1});
224 Function: Sets an allele frequency for a Marker for this Population
225 This allows the Population to not have individual individual
226 genotypes but rather a set of overall allele frequencies
227 Returns : Count of the number of markers
228 Args : -name => (string) marker name
229 -allele => (string) allele name
230 -frequency => (double) allele frequency - must be between 0 and 1
232 -frequencies => { 'marker1' => { 'allele1' => 0.01,
233 'allele2' => 0.99},
234 'marker2' => ...
237 =cut
239 sub set_Allele_Frequency {
240 my ($self,@args) = @_;
241 my ($name,$allele, $frequency,
242 $frequencies) = $self->_rearrange([qw(NAME
243 ALLELE
244 FREQUENCY
245 FREQUENCIES
246 )], @args);
247 if( defined $frequencies ) { # this supercedes the res
248 if( ref($frequencies) =~ /HASH/i ) {
249 my ($markername,$alleles);
250 while( ($markername,$alleles) = each %$frequencies ) {
251 $self->{'_allele_freqs'}->{$markername} =
252 Bio::PopGen::Marker->new(-name => $markername,
253 -allele_freq => $alleles);
255 } else {
256 $self->throw("Must provide a valid hashref for the -frequencies option");
258 } else {
259 unless( defined $self->{'_allele_freqs'}->{$name} ) {
260 $self->{'_allele_freqs'}->{$name} =
261 Bio::PopGen::Marker->new(-name => $name);
263 $self->{'_allele_freqs'}->{$name}->add_Allele_Frequency($allele,$frequency);
265 return scalar keys %{$self->{'_allele_freqs'}};
269 =head2 add_Individual
271 Title : add_Individual
272 Usage : $population->add_Individual(@individuals);
273 Function: Add individuals to a population
274 Returns : count of the current number in the object
275 Args : Array of Individuals
278 =cut
280 sub add_Individual{
281 my ($self,@inds) = @_;
282 foreach my $i ( @inds ) {
283 next if ! defined $i;
285 unless( $self->{'_checkisa'} ? $i->isa('Bio::PopGen::IndividualI') : 1 ) {
286 $self->warn("cannot add an individual ($i) which is not a Bio::PopGen::IndividualI");
287 next;
290 push @{$self->{'_individuals'}}, @inds;
291 $self->{'_cached_markernames'} = undef;
292 $self->{'_allele_freqs'} = {};
293 return scalar @{$self->{'_individuals'} || []};
297 =head2 remove_Individuals
299 Title : remove_Individuals
300 Usage : $population->remove_Individuals(@ids);
301 Function: Remove individual(s) to a population
302 Returns : count of the current number in the object
303 Args : Array of ids
305 =cut
307 sub remove_Individuals {
308 my ($self,@names) = @_;
309 my $i = 0;
310 my %namehash; # O(1) lookup will be faster I think
311 foreach my $n ( @names ) { $namehash{$n}++ }
312 my @tosplice;
313 foreach my $ind ( @{$self->{'_individuals'} || []} ) {
314 unshift @tosplice, $i if( $namehash{$ind->unique_id} );
315 $i++;
317 foreach my $index ( @tosplice ) {
318 splice(@{$self->{'_individuals'}}, $index,1);
320 $self->{'_cached_markernames'} = undef;
321 $self->{'_allele_freqs'} = {};
322 return scalar @{$self->{'_individuals'} || []};
325 =head2 get_Individuals
327 Title : get_Individuals
328 Usage : my @inds = $pop->get_Individuals();
329 Function: Return the individuals, alternatively restrict by a criteria
330 Returns : Array of Bio::PopGen::IndividualI objects
331 Args : none if want all the individuals OR,
332 -unique_id => To get an individual with a specific id
333 -marker => To only get individuals which have a genotype specific
334 for a specific marker name
337 =cut
339 sub get_Individuals{
340 my ($self,@args) = @_;
341 my @inds = @{$self->{'_individuals'} || []};
342 return unless @inds;
343 if( @args ) { # save a little time here if @args is empty
344 my ($id,$marker) = $self->_rearrange([qw(UNIQUE_ID MARKER)], @args);
347 if( defined $id ) {
348 @inds = grep { $_->unique_id eq $id } @inds;
349 } elsif (defined $marker) {
350 @inds = grep { $_->has_Marker($marker) } @inds;
353 return @inds;
356 =head2 get_Genotypes
358 Title : get_Genotypes
359 Usage : my @genotypes = $pop->get_Genotypes(-marker => $name)
360 Function: Get the genotypes for all the individuals for a specific
361 marker name
362 Returns : Array of Bio::PopGen::GenotypeI objects
363 Args : -marker => name of the marker
366 =cut
368 sub get_Genotypes{
369 my ($self,@args) = @_;
370 my ($name) = $self->_rearrange([qw(MARKER)],@args);
371 if( defined $name ) {
372 return grep { defined $_ } map { $_->get_Genotypes(-marker => $name) }
373 @{$self->{'_individuals'} || []}
375 $self->warn("You needed to have provided a valid -marker value");
376 return ();
380 =head2 get_marker_names
382 Title : get_marker_names
383 Usage : my @names = $pop->get_marker_names;
384 Function: Get the names of the markers
385 Returns : Array of strings
386 Args : [optional] boolean flag to ignore internal cache status
389 =cut
391 sub get_marker_names {
392 my ($self,$force) = @_;
393 return @{$self->{'_cached_markernames'} || []}
394 if( ! $force && defined $self->{'_cached_markernames'});
395 my %unique;
396 foreach my $n ( map { $_->get_marker_names } $self->get_Individuals() ) {
397 $unique{$n}++;
399 my @nms = keys %unique;
400 if( $nms[0] =~ /^(Site|Codon)/ ) {
401 # sort by site or codon number and do it in
402 # a schwartzian transformation baby!
403 @nms = map { $_->[1] }
404 sort { $a->[0] <=> $b->[0] }
405 map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @nms;
407 $self->{'_cached_markernames'} = [ @nms ];
408 return @{$self->{'_cached_markernames'} || []};
412 =head2 get_Marker
414 Title : get_Marker
415 Usage : my $marker = $population->get_Marker($name)
416 Function: Get a Bio::PopGen::Marker object based on this population
417 Returns : Bio::PopGen::MarkerI object
418 Args : name of the marker
421 =cut
423 sub get_Marker{
424 my ($self,$markername) = @_;
425 my $marker;
426 # setup some caching too
427 if( defined $self->{'_allele_freqs'} &&
428 defined ($marker = $self->{'_allele_freqs'}->{$markername}) ) {
429 # marker is now set to the stored value
430 } else {
431 my @genotypes = $self->get_Genotypes(-marker => $markername);
432 $marker = Bio::PopGen::Marker->new(-name => $markername);
434 if( ! @genotypes ) {
435 $self->warn("No genotypes for Marker $markername in the population");
436 } else {
437 my %alleles;
438 my $count;
439 for my $al ( map { $_->get_Alleles} @genotypes ) {
440 next if($al eq '?');
441 $count++;
442 $alleles{$al}++
444 foreach my $allele ( keys %alleles ) {
445 $marker->add_Allele_Frequency($allele, $alleles{$allele}/$count);
446 $marker->{_marker_coverage} = $count/2;
449 $self->{'_allele_freqs'}->{$markername} = $marker;
451 return $marker;
455 =head2 get_number_individuals
457 Title : get_number_individuals
458 Usage : my $count = $pop->get_number_individuals;
459 Function: Get the count of the number of individuals
460 Returns : integer >= 0
461 Args : none
464 =cut
466 sub get_number_individuals{
467 my ($self,$markername) = @_;
469 if( $self->{'_forced_set_individuals'} ) {
470 return $self->{'_forced_set_individuals'};
473 unless( defined $markername ) {
474 return scalar @{$self->{'_individuals'} || []};
475 } else {
476 my $number =0;
477 foreach my $individual ( @{$self->{'_individuals'} || []} ) {
478 $number++ if( $individual->has_Marker($markername));
480 return $number;
484 =head2 set_number_individuals
486 Title : set_number_individuals
487 Usage : $pop->set_number_individuals($num);
488 Function: Fixes the number of individuals, call this with
489 0 to unset.
490 Only use this if you know what you are doing,
491 this is only relavent when you are just adding
492 allele frequency data for a population and want to
493 calculate something like theta
494 Returns : none
495 Args : individual count, calling it with undef or 0
496 will reset the value to return a number
497 calculated from the number of individuals
498 stored for this population.
500 =cut
502 sub set_number_individuals{
503 my ($self,$indcount) = @_;
504 return $self->{'_forced_set_individuals'} = $indcount;
508 =head2 get_Frequency_Homozygotes
510 Title : get_Frequency_Homozygotes
511 Usage : my $freq = $pop->get_Frequency_Homozygotes;
512 Function: Calculate the frequency of homozygotes in the population
513 Returns : fraction between 0 and 1
514 Args : $markername
517 =cut
519 sub get_Frequency_Homozygotes{
520 my ($self,$marker,$allelename) = @_;
521 my ($homozygote_count) = 0;
522 return 0 if ! defined $marker || ! defined $allelename;
523 $marker = $marker->name if( defined $marker &&
524 ref($marker) &&
525 ( $self->{'_checkisa'} ?
526 $marker->isa('Bio::PopGen::MarkerI') : 1));
527 my $total = $self->get_number_individuals($marker);
528 foreach my $genotype ( $self->get_Genotypes($marker) ) {
529 my %alleles = map { $_ => 1} $genotype->get_Alleles();
530 # what to do for non-diploid situations?
531 if( $alleles{$allelename} ) {
532 $homozygote_count++ if( keys %alleles == 1);
535 return $total ? $homozygote_count / $total : 0;
538 =head2 get_Frequency_Heterozygotes
540 Title : get_Frequency_Heterozygotes
541 Usage : my $freq = $pop->get_Frequency_Homozygotes;
542 Function: Calculate the frequency of homozygotes in the population
543 Returns : fraction between 0 and 1
544 Args : $markername
547 =cut
549 sub get_Frequency_Heterozygotes{
550 my ($self,$marker,$allelename) = @_;
551 my ($heterozygote_count) = 0;
552 return 0 if ! defined $marker || ! defined $allelename;
553 $marker = $marker->name if( defined $marker && ref($marker) &&
554 ($self->{'_checkisa'} ?
555 $marker->isa('Bio::PopGen::MarkerI') : 1));
556 if( ref($marker) ) {
557 $self->warn("Passed in a ".ref($marker). " to has_Marker, expecting either a string or a Bio::PopGen::MarkerI");
558 return 0;
560 my $total = $self->get_number_individuals($marker);
562 foreach my $genotype ( $self->get_Genotypes($marker) ) {
563 my %alleles = map { $_ => 1} $genotype->get_Alleles();
564 # what to do for non-diploid situations?
565 if( $alleles{$allelename} ) {
566 $heterozygote_count++ if( keys %alleles == 2);
569 return $total ? $heterozygote_count / $total : 0;
572 =head2 haploid_population
574 Title : haploid_population
575 Usage : my $pop = $population->haploid_population;
576 Function: Make a new population where all the individuals
577 are haploid - effectively an individual out of each
578 chromosome an individual has.
579 Returns : L<Bio::PopGen::PopulationI>
580 Args : None
583 =cut
585 sub haploid_population{
586 my ($self) = @_;
587 my @inds;
588 my @marker_names = $self->get_marker_names;
590 for my $ind ( $self->get_Individuals ) {
591 my @chromosomes;
592 my $id = $ind->unique_id;
593 # separate genotypes into 'chromosomes'
594 for my $marker_name( @marker_names ) {
595 my ($genotype) = $ind->get_Genotypes(-marker => $marker_name);
596 my $i =0;
597 for my $allele ( $genotype->get_Alleles ) {
598 push @{$chromosomes[$i]},
599 Bio::PopGen::Genotype->new(-marker_name => $marker_name,
600 -individual_id => $id.".$i",
601 -alleles => [$allele]);
602 $i++;
605 for my $chrom ( @chromosomes ) {
606 my $copyind = ref($ind)->new(-unique_id => $id.".1",
607 -genotypes => $chrom);
608 push @inds, $ind;
611 my $population = ref($self)->new(-name => $self->name,
612 -source => $self->source,
613 -description => $self->description,
614 -individuals => \@inds);