Bio/PopGen/IO/csv.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::PopGen::IO::csv
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Jason Stajich <jason-at-bioperl.org>
   8 #
   9 # Copyright Jason Stajich
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::PopGen::IO::csv -Extract individual allele data from a CSV parser
  18
  19 =head1 SYNOPSIS
  20
  21 #Do not use directly, use through the Bio::PopGen::IO driver
  22
  23   use Bio::PopGen::IO;
  24   my $io = Bio::PopGen::IO->new(-format => 'csv',
  25                                -file   => 'data.csv');
  26
  27   # Some IO might support reading in a population at a time
  28
  29   my @population;
  30   while( my $ind = $io->next_individual ) {
  31       push @population, $ind;
  32   }
  33
  34 =head1 DESCRIPTION
  35
  36 This object will parse comma delimited format (CSV) or whatever
  37 delimiter you specify. It currently doesn't handle the more complex
  38 quote escaped CSV format.  There are 3 initialization parameters,
  39 the delimiter (-field_delimiter) [default ','], (-allele_delimiter)
  40 [default ' '].    The third initialization parameter is a boolean
  41 -no_header which specifies if there is no header line to read in.  All lines starting with '#' will be skipped
  42
  43 When no_header is not specific the data is assumed to be of the following form.
  44 Having a header line this
  45 SAMPLE,MARKERNAME1,MARKERNAME2,...
  46
  47 and each data line having the form (diploid data)
  48 SAMP1,101 102,100 90,a b
  49 or for haploid data
  50 SAMP1,101,100,a
  51
  52 =head1 FEEDBACK
  53
  54 =head2 Mailing Lists
  55
  56 User feedback is an integral part of the evolution of this and other
  57 Bioperl modules. Send your comments and suggestions preferably to
  58 the Bioperl mailing list.  Your participation is much appreciated.
  59
  60   bioperl-l@bioperl.org                  - General discussion
  61   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  62
  63 =head2 Support
  64
  65 Please direct usage questions or support issues to the mailing list:
  66
  67 L<bioperl-l@bioperl.org>
  68
  69 rather than to the module maintainer directly. Many experienced and
  70 reponsive experts will be able look at the problem and quickly
  71 address it. Please include a thorough description of the problem
  72 with code and data examples if at all possible.
  73
  74 =head2 Reporting Bugs
  75
  76 Report bugs to the Bioperl bug tracking system to help us keep track
  77 of the bugs and their resolution. Bug reports can be submitted via
  78 the web:
  79
  80   http://bugzilla.open-bio.org/
  81
  82 =head1 AUTHOR - Jason Stajich
  83
  84 Email jason-at-bioperl.org
  85
  86 =head1 CONTRIBUTORS
  87
  88 Matthew Hahn, matthew.hahn-at-duke.edu
  89
  90 =head1 APPENDIX
  91
  92 The rest of the documentation details each of the object methods.
  93 Internal methods are usually preceded with a _
  94
  95 =cut
  96
  97
  98 # Let the code begin...
  99
 100
 101 package Bio::PopGen::IO::csv;
 102 use vars qw($FieldDelim $AlleleDelim $NoHeader);
 103 use strict;
 104
 105 ($FieldDelim,$AlleleDelim,$NoHeader) =( ',', '\s+',0);
 106
 107 # Object preamble - inherits from Bio::Root::Root
 108
 109
 110 use Bio::PopGen::Individual;
 111 use Bio::PopGen::Population;
 112 use Bio::PopGen::Genotype;
 113
 114 use base qw(Bio::PopGen::IO);
 115
 116 =head2 new
 117
 118  Title   : new
 119  Usage   : my $obj = Bio::PopGen::IO::csv->new();
 120  Function: Builds a new Bio::PopGen::IO::csv object
 121  Returns : an instance of Bio::PopGen::IO::csv
 122  Args    : [optional, these are the current defaults]
 123            -field_delimiter => ','
 124            -allele_delimiter=> '\s+'
 125            -no_header       => 0,
 126
 127
 128 =cut
 129
 130 sub _initialize {
 131     my($self, @args) = @_;
 132     my ($fieldsep,$all_sep,
 133         $noheader) = $self->_rearrange([qw(FIELD_DELIMITER
 134                                            ALLELE_DELIMITER
 135                                            NO_HEADER)],@args);
 136
 137
 138     $self->flag('no_header', defined $noheader ? $noheader : $NoHeader);
 139     $self->flag('field_delimiter',defined $fieldsep ? $fieldsep : $FieldDelim);
 140     $self->flag('allele_delimiter',defined $all_sep ? $all_sep : $AlleleDelim);
 141
 142     $self->{'_header'} = undef;
 143     return 1;
 144 }
 145
 146 =head2 flag
 147
 148  Title   : flag
 149  Usage   : $obj->flag($flagname,$newval)
 150  Function: Get/Set the flag value
 151  Returns : value of a flag (a boolean)
 152  Args    : A flag name, currently we expect
 153            'no_header', 'field_delimiter', or 'allele_delimiter'
 154            on set, new value (a boolean or undef, optional)
 155
 156
 157 =cut
 158
 159 sub flag{
 160     my $self = shift;
 161     my $fieldname = shift;
 162     return unless defined $fieldname;
 163
 164     return $self->{'_flag'}->{$fieldname} = shift if @_;
 165     return $self->{'_flag'}->{$fieldname};
 166 }
 167
 168
 169 =head2 next_individual
 170
 171  Title   : next_individual
 172  Usage   : my $ind = $popgenio->next_individual;
 173  Function: Retrieve the next individual from a dataset
 174  Returns : L<Bio::PopGen::IndividualI> object
 175  Args    : none
 176
 177
 178 =cut
 179
 180 sub next_individual{
 181     my ($self) = @_;
 182     while( defined( $_ = $self->_readline) ) {
 183         next if( /^\s*\#/ || /^\s+$/ || ! length($_) );
 184         last;
 185     }
 186     return if ! defined $_;
 187     if( $self->flag('no_header') ||
 188         defined $self->{'_header'} ) {
 189
 190         #########new (allows field delim to be the same as the allele delim
 191
 192         my ($samp,@marker_results);
 193
 194         if($self->flag('field_delimiter') ne $self->flag('allele_delimiter')){
 195
 196                 ($samp,@marker_results) = split($self->flag('field_delimiter'),$_);
 197         }
 198         else{
 199
 200                 my $fielddelim = $self->flag('field_delimiter');
 201                 my $alleledelim = $self->flag('allele_delimiter');
 202
 203                 ($samp) = /(^.+?)$fielddelim/;
 204                 s/^.+?$fielddelim//;
 205
 206                 (@marker_results) = /([\d|\w]+$alleledelim[\d|\w]+)/g;
 207
 208         }
 209
 210         #########end new
 211
 212         my $i = 1;
 213         foreach my $m ( @marker_results ) {
 214             $m =~ s/^\s+//;
 215             $m =~ s/\s+$//;
 216             my $markername;
 217             if( defined $self->{'_header'} ) {
 218                 $markername = $self->{'_header'}->[$i];
 219             } else {
 220                 $markername = "Marker$i";
 221             }
 222             $self->debug( "markername is $markername alleles are $m\n");
 223
 224             my @alleles = split($self->flag('allele_delimiter'), $m);
 225
 226             $m = Bio::PopGen::Genotype->new(-alleles      => \@alleles,
 227                                            -marker_name  => $markername,
 228                                            -individual_id=> $samp);
 229             $i++;
 230         }
 231         return Bio::PopGen::Individual->new(-unique_id => $samp,
 232                                            -genotypes => \@marker_results);
 233     } else {
 234         chomp;
 235         $self->{'_header'} = [split($self->flag('field_delimiter'),$_)];
 236         return $self->next_individual; # rerun loop again
 237     }
 238     return;
 239 }
 240
 241
 242 =head2 next_population
 243
 244  Title   : next_population
 245  Usage   : my $ind = $popgenio->next_population;
 246  Function: Retrieve the next population from a dataset
 247  Returns : L<Bio::PopGen::PopulationI> object
 248  Args    : none
 249  Note    : Many implementation will not implement this
 250
 251 =cut
 252
 253 # Plan is to just return the whole dataset as a single population by
 254 # default I think - people would then have each population in a separate
 255 # file.
 256
 257 sub next_population{
 258     my ($self) = @_;
 259     my @inds;
 260     while( my $ind = $self->next_individual ) {
 261         push @inds, $ind;
 262     }
 263     Bio::PopGen::Population->new(-individuals => \@inds);
 264 }
 265
 266
 267
 268
 269 =head2 write_individual
 270
 271  Title   : write_individual
 272  Usage   : $popgenio->write_individual($ind);
 273  Function: Write an individual out in the file format
 274  Returns : none
 275  Args    : L<Bio::PopGen::PopulationI> object(s)
 276
 277 =cut
 278
 279 sub write_individual{
 280     my ($self,@inds) = @_;
 281     my $fielddelim  = $self->flag('field_delimiter');
 282     my $alleledelim= $self->flag('allele_delimiter');
 283
 284     foreach my $ind ( @inds ) {
 285         if (! ref($ind) || ! $ind->isa('Bio::PopGen::IndividualI') ) {
 286             $self->warn("Cannot write an object that is not a Bio::PopGen::IndividualI object ($ind)");
 287             next;
 288         }
 289         # we'll go ahead and sort these until
 290         # we have a better way to insure a consistent order
 291         my @marker_names = sort $ind->get_marker_names;
 292         if( ! $self->flag('no_header') &&
 293             ! $self->flag('header_written') ) {
 294             $self->_print(join($fielddelim, ('SAMPLE', @marker_names)), "\n");
 295             $self->flag('header_written',1);
 296         }
 297         $self->_print( join($fielddelim, $ind->unique_id,
 298                             # we're chaining map here, pay attention and read
 299                             # starting with the last map
 300
 301                             # we'll turn genotypes into allele pairs
 302                             # which will be separated by the allele delimiter
 303                             map { join($alleledelim,$_->get_Alleles) }
 304                             # marker names will be sorted so we don't
 305                             # have to worry about this between individuals
 306                             # unless the individual set you pass in has
 307                             # a mixed set of markers...
 308                             # this will turn marker names into Genotypes
 309                             map {$ind->get_Genotypes(-marker => $_)}
 310                             @marker_names), "\n")
 311     }
 312 }
 313
 314 =head2 write_population
 315
 316  Title   : write_population
 317  Usage   : $popgenio->write_population($pop);
 318  Function: Write a population out in the file format
 319  Returns : none
 320  Args    : L<Bio::PopGen::PopulationI> object(s)
 321  Note    : Many implementation will not implement this
 322
 323 =cut
 324
 325 sub write_population{
 326     my ($self,@pops) = @_;
 327     my $fielddelim  = $self->flag('field_delimiter');
 328 #     my $alleledelim= $self->flag('allele_delimiter');
 329     my $alleledelim = ' ';
 330     foreach my $pop ( @pops ) {
 331         if (! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
 332             $self->warn("Cannot write an object that is not a Bio::PopGen::PopulationI object");
 333             next;
 334         }
 335         # we'll go ahead and sort these until
 336         # we have a better way to insure a consistent order
 337         my @marker_names = sort $pop->get_marker_names;
 338         if( ! $self->flag('no_header') &&
 339             ! $self->flag('header_written') ) {
 340             $self->_print( join($fielddelim, ('SAMPLE', @marker_names)),
 341                            "\n");
 342             $self->flag('header_written',1);
 343         }
 344         foreach my $ind ( $pop->get_Individuals ) {
 345            $self->_print( join($fielddelim, $ind->unique_id,
 346                                # we're chaining map here, pay attention
 347                                # and read starting with the last map
 348
 349                                # we'll turn genotypes into allele pairs
 350                                # which will be separated by the allele
 351                                # delimiter
 352                                map { join($alleledelim,$_->get_Alleles) }
 353                                # marker names will be sorted so we don't
 354                                # have to worry about this between individuals
 355                                # unless the individual set you pass in has
 356                                # a mixed set of markers...
 357                                # this will turn marker names into Genotypes
 358                                map {$ind->get_Genotypes(-marker => $_)}
 359                                @marker_names), "\n");
 360        }
 361     }
 362 }
 363
 364 1;