sync w/ main trunk
[bioperl-live.git] / Bio / PopGen / IO / phase.pm
blob54ff063517c43d2b0110c8c71a6d0db3933d9afb
1 # $Id$
3 # BioPerl module for Bio::PopGen::IO::phase
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Rich Dobson <r.j.dobson-at-qmul.ac.uk>
9 # Copyright Rich Dobson
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::PopGen::IO::phase - A parser for Phase format data
19 =head1 SYNOPSIS
21 # Do not use directly, use through the Bio::PopGen::IO driver
23 use Bio::PopGen::IO;
24 my $io = Bio::PopGen::IO->new(-format => 'phase',
25 -file => 'data.phase');
27 # Some IO might support reading in a population at a time
29 my @population;
30 while( my $ind = $io->next_individual ) {
31 push @population, $ind;
35 =head1 DESCRIPTION
37 A driver module for Bio::PopGen::IO for parsing phase data.
39 PHASE is defined in http://www.stat.washington.edu/stephens/instruct2.1.pdf
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to
47 the Bioperl mailing list. Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 =head2 Support
54 Please direct usage questions or support issues to the mailing list:
56 L<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
63 =head2 Reporting Bugs
65 Report bugs to the Bioperl bug tracking system to help us keep track
66 of the bugs and their resolution. Bug reports can be submitted via
67 the web:
69 http://bugzilla.open-bio.org/
71 =head1 AUTHOR - Rich Dobson
73 Email r.j.dobson-at-qmul.ac.uk
75 =head1 CONTRIBUTORS
77 Jason Stajich, jason-at-bioperl.org
79 =head1 APPENDIX
81 The rest of the documentation details each of the object methods.
82 Internal methods are usually preceded with a _
84 =cut
87 # Let the code begin...
90 package Bio::PopGen::IO::phase;
91 use vars qw($FieldDelim $AlleleDelim $NoHeader);
92 use strict;
94 ($FieldDelim,$AlleleDelim,$NoHeader) =( ',', '\s+',0);
99 use Bio::PopGen::Individual;
100 use Bio::PopGen::Population;
101 use Bio::PopGen::Genotype;
103 use base qw(Bio::PopGen::IO);
105 =head2 new
107 Title : new
108 Usage : my $obj = Bio::PopGen::IO::hapmap->new();
109 Function: Builds a new Bio::PopGen::IO::hapmap object
110 Returns : an instance of Bio::PopGen::IO::hapmap
111 Args : [optional, these are the current defaults]
112 -field_delimiter => ','
113 -allele_delimiter=> '\s+'
114 -no_header => 0,
117 =cut
120 sub _initialize {
122 my($self, @args) = @_;
124 $Bio::PopGen::Genotype::BlankAlleles='';
126 my ($fieldsep,$all_sep,
127 $noheader) = $self->_rearrange([qw(FIELD_DELIMITER
128 ALLELE_DELIMITER
129 NO_HEADER)],@args);
131 $self->flag('no_header', defined $noheader ? $noheader : $NoHeader);
132 $self->flag('field_delimiter',defined $fieldsep ? $fieldsep : $FieldDelim);
133 $self->flag('allele_delimiter',defined $all_sep ? $all_sep : $AlleleDelim);
135 $self->{'_header'} = undef;
136 return 1;
140 =head2 flag
142 Title : flag
143 Usage : $obj->flag($flagname,$newval)
144 Function: Get/Set the flag value
145 Returns : value of a flag (a boolean)
146 Args : A flag name, currently we expect
147 'no_header', 'field_delimiter', or 'allele_delimiter'
148 on set, new value (a boolean or undef, optional)
151 =cut
153 sub flag {
155 my $self = shift;
156 my $fieldname = shift;
157 return unless defined $fieldname;
159 return $self->{'_flag'}->{$fieldname} = shift if @_;
160 return $self->{'_flag'}->{$fieldname};
164 =head2 next_individual
166 Title : next_individual
167 Usage : my $ind = $popgenio->next_individual;
168 Function: Retrieve the next individual from a dataset
169 Returns : L<Bio::PopGen::IndividualI> object
170 Args : none
173 =cut
175 sub next_individual {
176 my ($self) = @_;
178 my ($sam,@marker_results,$number_of_ids,$number_of_markers,
179 $marker_positions,$micro_snp);
181 while( defined( $_ = $self->_readline) ) {
182 next if( /^\s+$/ || ! length($_) );
183 last;
186 return unless defined $_;
187 if( $self->flag('no_header') || defined $self->{'_header'} ) {
189 ####### sometimes there is some marker info at the start of a phase input file
190 ####### we collect it in the next few lines if there is. Should this info be held in a marker object?
192 if(!$self->{'_count'} && /^\s*\d+$/){
193 $self->flag('number_of_ids',$_);
194 #print "number_of_ids : $number_of_ids\n";
195 $self->{'_count'}++;
196 return $self->next_individual;
197 } elsif($self->{'_count'} == 1 && /^\s*\d+$/){
198 $self->flag('number_of_markers',$_);
199 #print "number_of_markers : $number_of_markers\n";
200 $self->{'_count'}++;
201 return $self->next_individual;
202 } elsif($self->{'_count'} == 2 && /^\s*P\s\d/){
203 $self->flag('marker_positions',$_);
204 #print "marker_position : $marker_positions\n";
205 $self->{'_count'}++;
206 return $self->next_individual;
207 } elsif($self->{'_count'} == 3 && /^\s*(M|S)+\s*$/i){
208 $self->flag('micro_snp',$_);
209 #print "microsat or snp : $micro_snp\n";
210 $self->{'_count'}++;
211 return $self->next_individual;
212 } elsif(/^\s*\#/){
213 ($self->{'_sam'}) = /^\s*\#(.+)/;
214 #print "sample : $self->{'_sam'}\n";
215 $self->{'_count'}++;
216 return $self->next_individual;
217 } else {
218 chomp $_;
219 if( $self->{'_row1'} ) {
220 # if we are looking at the 2nd row of alleles for this id
222 @{$self->{'_second_row'}} =
223 split($self->flag('field_delimiter'),$_);
225 for my $i(0 .. $#{$self->{'_first_row'}}){
227 push(@{$self->{'_marker_results'}},
228 $self->{'_first_row'}->[$i].
229 $self->flag('field_delimiter').
230 $self->{'_second_row'}->[$i]);
232 $self->{'_row1'} = 0;
233 } else {
234 # if we are looking at the first row of alleles for this id
235 @{$self->{'_marker_results'}} = ();
236 @{$self->{'_first_row'}} = split($self->flag('field_delimiter'),$_);
237 $self->{'_row1'} = 1;
238 return $self->next_individual;
242 my $i = 1;
243 foreach my $m ( @{$self->{'_marker_results'}} ) {
244 $m =~ s/^\s+//;
245 $m =~ s/\s+$//;
246 my $markername;
247 if( defined $self->{'_header'} ) {
248 $markername = $self->{'_header'}->[$i] || "Marker$i";
249 } else {
250 $markername = "Marker$i";
252 $self->debug( "markername is $markername alleles are $m\n");
253 my @alleles = split($self->flag('allele_delimiter'), $m);
255 $m = Bio::PopGen::Genotype->new(-alleles =>\@alleles,
256 -marker_name => $markername,
257 -individual_id=> $self->{'_sam'});
258 $i++;
260 return Bio::PopGen::Individual->new(-unique_id => $self->{'_sam'},
261 -genotypes =>\@{$self->{'_marker_results'}},
264 } else {
265 chomp;
266 $self->{'_header'} = [split($self->flag('field_delimiter'),$_)];
267 return $self->next_individual; # rerun loop again
269 return;
272 =head2 next_population
274 Title : next_population
275 Usage : my $ind = $popgenio->next_population;
276 Function: Retrieve the next population from a dataset
277 Returns : L<Bio::PopGen::PopulationI> object
278 Args : none
279 Note : Many implementation will not implement this
281 =cut
283 sub next_population{
284 my ($self) = @_;
285 my @inds;
286 while( my $ind = $self->next_individual ) {
287 push @inds, $ind;
289 Bio::PopGen::Population->new(-individuals => \@inds);
292 =head2 write_individual
294 Title : write_individual
295 Usage : $popgenio->write_individual($ind);
296 Function: Write an individual out in the file format
297 Returns : none
298 Args : L<Bio::PopGen::PopulationI> object(s)
300 =cut
303 sub write_individual {
304 my ($self,@inds) = @_;
305 my $fielddelim = $self->flag('field_delimiter');
306 my $alleledelim = $self->flag('allele_delimiter');
308 foreach my $ind ( @inds ) {
309 if (! ref($ind) || ! $ind->isa('Bio::PopGen::IndividualI') ) {
310 $self->warn("Cannot write an object that is not a Bio::PopGen::IndividualI object ($ind)");
311 next;
313 # we'll go ahead and sort these until
314 # we have a better way to insure a consistent order
315 my @marker_names = sort $ind->get_marker_names;
316 if( ! $self->flag('no_header') &&
317 ! $self->flag('header_written') ) {
318 $self->_print(join($fielddelim, ('SAM', @marker_names)), "\n");
319 $self->flag('header_written',1);
322 my(@row1,@row2);
324 for (@marker_names){
325 my $geno = $ind->get_Genotypes(-marker => $_);
326 my @alleles = $geno->get_Alleles();
327 push(@row1,$alleles[0]);
328 push(@row2,$alleles[1]);
330 $self->_print("#",$ind->unique_id,"\n",
331 join($fielddelim,@row1),"\n",
332 join($fielddelim,@row2),"\n");
336 =head2 write_population
338 Title : write_population
339 Usage : $popgenio->write_population($pop);
340 Function: Write a population out in the file format
341 Returns : none
342 Args : L<Bio::PopGen::PopulationI> object(s)
343 Note : Many implementation will not implement this
345 =cut
348 sub write_population {
349 my ($self,@pops) = @_;
350 my $fielddelim = $self->flag('field_delimiter');
351 my $alleledelim = $self->flag('allele_delimiter');
353 foreach my $pop ( @pops ) {
354 if (! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
355 $self->warn("Cannot write an object that is not a Bio::PopGen::PopulationI object");
356 next;
358 # we'll go ahead and sort these until
359 # we have a better way to insure a consistent order
360 my @marker_names = sort $pop->get_marker_names;
361 if( ! $self->flag('no_header') &&
362 ! $self->flag('header_written') ) {
363 $self->_print( join($fielddelim, ('SAM', @marker_names)),
364 "\n");
365 $self->flag('header_written',1);
367 foreach my $ind ( $pop->get_Individuals ) {
368 my(@row1,@row2);
369 for (@marker_names){
370 my $geno = $ind->get_Genotypes(-marker => $_);
371 my @alleles = $geno->get_Alleles();
372 push (@row1,$alleles[0]);
373 push (@row2,$alleles[1]);
375 $self->_print("#",$ind->unique_id,"\n",
376 join($fielddelim,@row1),"\n",
377 join($fielddelim,@row2),"\n");