sync w/ main trunk
[bioperl-live.git] / Bio / PopGen / IO / csv.pm
blob8b84498c7d60bf0ed027a28db22d372e1de73fcc
1 # $Id$
3 # BioPerl module for Bio::PopGen::IO::csv
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-at-bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::PopGen::IO::csv -Extract individual allele data from a CSV parser
19 =head1 SYNOPSIS
21 #Do not use directly, use through the Bio::PopGen::IO driver
23 use Bio::PopGen::IO;
24 my $io = Bio::PopGen::IO->new(-format => 'csv',
25 -file => 'data.csv');
27 # Some IO might support reading in a population at a time
29 my @population;
30 while( my $ind = $io->next_individual ) {
31 push @population, $ind;
34 =head1 DESCRIPTION
36 This object will parse comma delimited format (CSV) or whatever
37 delimiter you specify. It currently doesn't handle the more complex
38 quote escaped CSV format. There are 3 initialization parameters,
39 the delimiter (-field_delimiter) [default ','], (-allele_delimiter)
40 [default ' ']. The third initialization parameter is a boolean
41 -no_header which specifies if there is no header line to read in. All lines starting with '#' will be skipped
43 When no_header is not specific the data is assumed to be of the following form.
44 Having a header line this
45 SAMPLE,MARKERNAME1,MARKERNAME2,...
47 and each data line having the form (diploid data)
48 SAMP1,101 102,100 90,a b
49 or for haploid data
50 SAMP1,101,100,a
52 =head1 FEEDBACK
54 =head2 Mailing Lists
56 User feedback is an integral part of the evolution of this and other
57 Bioperl modules. Send your comments and suggestions preferably to
58 the Bioperl mailing list. Your participation is much appreciated.
60 bioperl-l@bioperl.org - General discussion
61 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
63 =head2 Support
65 Please direct usage questions or support issues to the mailing list:
67 L<bioperl-l@bioperl.org>
69 rather than to the module maintainer directly. Many experienced and
70 reponsive experts will be able look at the problem and quickly
71 address it. Please include a thorough description of the problem
72 with code and data examples if at all possible.
74 =head2 Reporting Bugs
76 Report bugs to the Bioperl bug tracking system to help us keep track
77 of the bugs and their resolution. Bug reports can be submitted via
78 the web:
80 http://bugzilla.open-bio.org/
82 =head1 AUTHOR - Jason Stajich
84 Email jason-at-bioperl.org
86 =head1 CONTRIBUTORS
88 Matthew Hahn, matthew.hahn-at-duke.edu
90 =head1 APPENDIX
92 The rest of the documentation details each of the object methods.
93 Internal methods are usually preceded with a _
95 =cut
98 # Let the code begin...
101 package Bio::PopGen::IO::csv;
102 use vars qw($FieldDelim $AlleleDelim $NoHeader);
103 use strict;
105 ($FieldDelim,$AlleleDelim,$NoHeader) =( ',', '\s+',0);
107 # Object preamble - inherits from Bio::Root::Root
110 use Bio::PopGen::Individual;
111 use Bio::PopGen::Population;
112 use Bio::PopGen::Genotype;
114 use base qw(Bio::PopGen::IO);
116 =head2 new
118 Title : new
119 Usage : my $obj = Bio::PopGen::IO::csv->new();
120 Function: Builds a new Bio::PopGen::IO::csv object
121 Returns : an instance of Bio::PopGen::IO::csv
122 Args : [optional, these are the current defaults]
123 -field_delimiter => ','
124 -allele_delimiter=> '\s+'
125 -no_header => 0,
128 =cut
130 sub _initialize {
131 my($self, @args) = @_;
132 my ($fieldsep,$all_sep,
133 $noheader) = $self->_rearrange([qw(FIELD_DELIMITER
134 ALLELE_DELIMITER
135 NO_HEADER)],@args);
138 $self->flag('no_header', defined $noheader ? $noheader : $NoHeader);
139 $self->flag('field_delimiter',defined $fieldsep ? $fieldsep : $FieldDelim);
140 $self->flag('allele_delimiter',defined $all_sep ? $all_sep : $AlleleDelim);
142 $self->{'_header'} = undef;
143 return 1;
146 =head2 flag
148 Title : flag
149 Usage : $obj->flag($flagname,$newval)
150 Function: Get/Set the flag value
151 Returns : value of a flag (a boolean)
152 Args : A flag name, currently we expect
153 'no_header', 'field_delimiter', or 'allele_delimiter'
154 on set, new value (a boolean or undef, optional)
157 =cut
159 sub flag{
160 my $self = shift;
161 my $fieldname = shift;
162 return unless defined $fieldname;
164 return $self->{'_flag'}->{$fieldname} = shift if @_;
165 return $self->{'_flag'}->{$fieldname};
169 =head2 next_individual
171 Title : next_individual
172 Usage : my $ind = $popgenio->next_individual;
173 Function: Retrieve the next individual from a dataset
174 Returns : L<Bio::PopGen::IndividualI> object
175 Args : none
178 =cut
180 sub next_individual{
181 my ($self) = @_;
182 while( defined( $_ = $self->_readline) ) {
183 next if( /^\s*\#/ || /^\s+$/ || ! length($_) );
184 last;
186 return if ! defined $_;
187 if( $self->flag('no_header') ||
188 defined $self->{'_header'} ) {
190 #########new (allows field delim to be the same as the allele delim
192 my ($samp,@marker_results);
194 if($self->flag('field_delimiter') ne $self->flag('allele_delimiter')){
196 ($samp,@marker_results) = split($self->flag('field_delimiter'),$_);
198 else{
200 my $fielddelim = $self->flag('field_delimiter');
201 my $alleledelim = $self->flag('allele_delimiter');
203 ($samp) = /(^.+?)$fielddelim/;
204 s/^.+?$fielddelim//;
206 (@marker_results) = /([\d|\w]+$alleledelim[\d|\w]+)/g;
210 #########end new
212 my $i = 1;
213 foreach my $m ( @marker_results ) {
214 $m =~ s/^\s+//;
215 $m =~ s/\s+$//;
216 my $markername;
217 if( defined $self->{'_header'} ) {
218 $markername = $self->{'_header'}->[$i];
219 } else {
220 $markername = "Marker$i";
222 $self->debug( "markername is $markername alleles are $m\n");
224 my @alleles = split($self->flag('allele_delimiter'), $m);
226 $m = Bio::PopGen::Genotype->new(-alleles => \@alleles,
227 -marker_name => $markername,
228 -individual_id=> $samp);
229 $i++;
231 return Bio::PopGen::Individual->new(-unique_id => $samp,
232 -genotypes => \@marker_results);
233 } else {
234 chomp;
235 $self->{'_header'} = [split($self->flag('field_delimiter'),$_)];
236 return $self->next_individual; # rerun loop again
238 return;
242 =head2 next_population
244 Title : next_population
245 Usage : my $ind = $popgenio->next_population;
246 Function: Retrieve the next population from a dataset
247 Returns : L<Bio::PopGen::PopulationI> object
248 Args : none
249 Note : Many implementation will not implement this
251 =cut
253 # Plan is to just return the whole dataset as a single population by
254 # default I think - people would then have each population in a separate
255 # file.
257 sub next_population{
258 my ($self) = @_;
259 my @inds;
260 while( my $ind = $self->next_individual ) {
261 push @inds, $ind;
263 Bio::PopGen::Population->new(-individuals => \@inds);
269 =head2 write_individual
271 Title : write_individual
272 Usage : $popgenio->write_individual($ind);
273 Function: Write an individual out in the file format
274 Returns : none
275 Args : L<Bio::PopGen::PopulationI> object(s)
277 =cut
279 sub write_individual{
280 my ($self,@inds) = @_;
281 my $fielddelim = $self->flag('field_delimiter');
282 my $alleledelim= $self->flag('allele_delimiter');
284 foreach my $ind ( @inds ) {
285 if (! ref($ind) || ! $ind->isa('Bio::PopGen::IndividualI') ) {
286 $self->warn("Cannot write an object that is not a Bio::PopGen::IndividualI object ($ind)");
287 next;
289 # we'll go ahead and sort these until
290 # we have a better way to insure a consistent order
291 my @marker_names = sort $ind->get_marker_names;
292 if( ! $self->flag('no_header') &&
293 ! $self->flag('header_written') ) {
294 $self->_print(join($fielddelim, ('SAMPLE', @marker_names)), "\n");
295 $self->flag('header_written',1);
297 $self->_print( join($fielddelim, $ind->unique_id,
298 # we're chaining map here, pay attention and read
299 # starting with the last map
301 # we'll turn genotypes into allele pairs
302 # which will be separated by the allele delimiter
303 map { join($alleledelim,$_->get_Alleles) }
304 # marker names will be sorted so we don't
305 # have to worry about this between individuals
306 # unless the individual set you pass in has
307 # a mixed set of markers...
308 # this will turn marker names into Genotypes
309 map {$ind->get_Genotypes(-marker => $_)}
310 @marker_names), "\n")
314 =head2 write_population
316 Title : write_population
317 Usage : $popgenio->write_population($pop);
318 Function: Write a population out in the file format
319 Returns : none
320 Args : L<Bio::PopGen::PopulationI> object(s)
321 Note : Many implementation will not implement this
323 =cut
325 sub write_population{
326 my ($self,@pops) = @_;
327 my $fielddelim = $self->flag('field_delimiter');
328 # my $alleledelim= $self->flag('allele_delimiter');
329 my $alleledelim = ' ';
330 foreach my $pop ( @pops ) {
331 if (! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
332 $self->warn("Cannot write an object that is not a Bio::PopGen::PopulationI object");
333 next;
335 # we'll go ahead and sort these until
336 # we have a better way to insure a consistent order
337 my @marker_names = sort $pop->get_marker_names;
338 if( ! $self->flag('no_header') &&
339 ! $self->flag('header_written') ) {
340 $self->_print( join($fielddelim, ('SAMPLE', @marker_names)),
341 "\n");
342 $self->flag('header_written',1);
344 foreach my $ind ( $pop->get_Individuals ) {
345 $self->_print( join($fielddelim, $ind->unique_id,
346 # we're chaining map here, pay attention
347 # and read starting with the last map
349 # we'll turn genotypes into allele pairs
350 # which will be separated by the allele
351 # delimiter
352 map { join($alleledelim,$_->get_Alleles) }
353 # marker names will be sorted so we don't
354 # have to worry about this between individuals
355 # unless the individual set you pass in has
356 # a mixed set of markers...
357 # this will turn marker names into Genotypes
358 map {$ind->get_Genotypes(-marker => $_)}
359 @marker_names), "\n");