[bug 2714]
[bioperl-live.git] / Bio / SeqIO / table.pm
blobe4eaf14530f6e9588e6516db3fe28e8c75892fa4
1 # $Id$
3 # BioPerl module for Bio::SeqIO::table
5 # Cared for by Hilmar Lapp <hlapp at gmx.net>
9 # (c) Hilmar Lapp, hlapp at gmx.net, 2005.
10 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2005.
12 # You may distribute this module under the same terms as perl itself.
13 # Refer to the Perl Artistic License (see the license accompanying this
14 # software package, or see http://www.perl.com/language/misc/Artistic.html)
15 # for the terms under which you may use, modify, and redistribute this module.
17 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
18 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22 # POD documentation - main docs before the code
24 =head1 NAME
26 Bio::SeqIO::table - sequence input/output stream from a delimited table
28 =head1 SYNOPSIS
30 #It is probably best not to use this object directly, but
31 #rather go through the SeqIO handler system. Go:
33 $stream = Bio::SeqIO->new(-file => $filename, -format => 'table');
35 while ( my $seq = $stream->next_seq() ) {
36 # do something with $seq
39 =head1 DESCRIPTION
41 This class transforms records in a table-formatted text file into
42 Bio::Seq objects.
44 A table-formatted text file of sequence records for the purposes of
45 this module is defined as a text file with each row corresponding to a
46 sequence, and the attributes of the sequence being in different
47 columns. Columns are delimited by a common delimiter, for instance tab
48 or comma.
50 The module permits specifying which columns hold which type of
51 annotation. The semantics of certain attributes, if present, are
52 pre-defined, e.g., accession number and sequence. Additional
53 attributes may be added to the annotation bundle.
55 =head1 FEEDBACK
57 =head2 Mailing Lists
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to one
61 of the Bioperl mailing lists. Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 =head2 Reporting Bugs
68 Report bugs to the Bioperl bug tracking system to help us keep track
69 the bugs and their resolution.
71 Bug reports can be submitted via email or the web:
73 http://bugzilla.open-bio.org/
75 =head1 AUTHOR - Hilmar Lapp
77 Email hlapp at gmx.net
79 =head1 APPENDIX
81 The rest of the documentation details each of the object
82 methods. Internal methods are usually preceded with a _
84 =cut
86 # Let the code begin...
88 package Bio::SeqIO::table;
89 use strict;
91 use Bio::Species;
92 use Bio::Seq::SeqFactory;
93 use Bio::Annotation::Collection;
94 use Bio::Annotation::SimpleValue;
96 use base qw(Bio::SeqIO);
98 =head2 new
100 Title : new
101 Usage : $stream = Bio::SeqIO->new(-file => $filename, -format => 'table')
102 Function: Returns a new seqstream
103 Returns : A Bio::SeqIO stream for a table format
104 Args : Named parameters:
106 -file name of file to read
107 -fh filehandle to attach to
108 -comment leading character(s) introducing a comment line
109 -header the number of header lines to skip; the first
110 non-comment header line will be used to obtain
111 column names; column names will be used as the
112 default tags for attaching annotation.
113 -delim the delimiter for columns as a regular expression;
114 consecutive occurrences of the delimiter will
115 not be collapsed.
116 -display_id the one-based index of the column containing
117 the display ID of the sequence
118 -accession_number the one-based index of the column
119 containing the accession number of the sequence
120 -seq the one-based index of the column containing
121 the sequence string of the sequence
122 -species the one-based index of the column containing the
123 species for the sequence record; if not a
124 number, will be used as the static species
125 common to all records
126 -annotation if provided and a scalar (but see below), a
127 flag whether or not all additional columns are
128 to be preserved as annotation, the tags used
129 will either be 'colX' if there is no column
130 header and where X is the one-based column
131 index, and otherwise the column headers will be
132 used as tags;
134 if a reference to an array, or a square
135 bracket-enclosed string of comma-delimited
136 values, only those columns (one-based index)
137 will be preserved as annotation, tags as before;
139 if a reference to a hash, or a curly
140 braces-enclosed string of comma-delimited key
141 and value pairs in alternating order, the keys
142 are one-based column indexes to be preserved,
143 and the values are the tags under which the
144 annotation is to be attached; if not provided or
145 supplied as undef, no additional annotation will
146 be preserved.
147 -colnames a reference to an array of column labels, or a
148 string of comma-delimited labels, denoting the
149 columns to be converted into annotation; this is
150 an alternative to -annotation and will be
151 ignored if -annotation is also supplied with a
152 valid value.
153 -trim flag determining whether or not all values should
154 be trimmed of leading and trailing white space
155 and double quotes
157 Additional arguments may be used to e.g. set factories and
158 builders involved in the sequence object creation (see the
159 POD of Bio::SeqIO).
162 =cut
164 sub _initialize {
165 my($self,@args) = @_;
167 # chained initialization
168 $self->SUPER::_initialize(@args);
170 # our own parameters
171 my ($cmtchars,
172 $header,
173 $delim,
174 $display_id,
175 $accnr,
176 $seq,
177 $taxon,
178 $useann,
179 $colnames,
180 $trim) =
181 $self->_rearrange([qw(COMMENT
182 HEADER
183 DELIM
184 DISPLAY_ID
185 ACCESSION_NUMBER
187 SPECIES
188 ANNOTATION
189 COLNAMES
190 TRIM)
191 ], @args);
193 # store options and apply defaults
194 $self->comment_char(defined($cmtchars) ? $cmtchars : "#")
195 if (!defined($self->comment_char)) || defined($cmtchars);
196 $self->delimiter(defined($delim) ? $delim : "\t")
197 if (!defined($self->delimiter)) || defined($delim);
198 $self->header($header) if defined($header);
199 $self->trim_values($trim) if defined($trim);
201 # attribute columns
202 my $attrs = {};
203 $attrs->{-display_id} = $display_id if defined($display_id);
204 $attrs->{-accession_number} = $accnr if defined($accnr);
205 $attrs->{-seq} = $seq if defined($seq);
206 if (defined($taxon)) {
207 if (ref($taxon) || ($taxon =~ /^\d+$/)) {
208 # either a static object, or a column reference
209 $attrs->{-species} = $taxon;
210 } else {
211 # static species as a string
212 $attrs->{-species} = Bio::Species->new(
213 -classification => [reverse(split(' ',$taxon))]);
216 $self->attribute_map($attrs);
218 # annotation columns, if any
219 if ($useann && !ref($useann)) {
220 # it's a scalar; check whether this is in fact an array or
221 # hash as a string rather than just a flag
222 if ($useann =~ /^\[(.*)\]$/) {
223 $useann = [split(/[,;]/,$1)];
224 } elsif ($useann =~ /^{(.*)}$/) {
225 $useann = {split(/[,;]/,$1)};
226 } # else it is probably indeed just a flag
228 if (ref($useann)) {
229 my $ann_map;
230 if (ref($useann) eq "ARRAY") {
231 my $has_header = ($self->header > 0);
232 $ann_map = {};
233 foreach my $i (@$useann) {
234 $ann_map->{$i} = $has_header ? undef : "col$i";
236 } else {
237 # no special handling necessary
238 $ann_map = $useann;
240 $self->annotation_map($ann_map);
241 } else {
242 $self->keep_annotation($useann || $colnames);
243 # annotation columns, if any
244 if ($colnames && !ref($colnames)) {
245 # an array as a string
246 $colnames =~ s/^\[(.*)\]$/$1/;
247 $colnames = [split(/[,;]/,$colnames)];
249 $self->annotation_columns($colnames) if ref($colnames);
252 # make sure we have a factory defined
253 if(!defined($self->sequence_factory)) {
254 $self->sequence_factory(
255 Bio::Seq::SeqFactory->new(-verbose => $self->verbose(),
256 -type => 'Bio::Seq::RichSeq'));
260 =head2 next_seq
262 Title : next_seq
263 Usage : $seq = $stream->next_seq()
264 Function: returns the next sequence in the stream
265 Returns : Bio::Seq::RichSeq object
266 Args :
268 =cut
270 sub next_seq {
271 my $self = shift;
273 # skip until not a comment and not an empty line
274 my $line_ok = $self->_next_record();
276 # if there is a header but we haven't read past it yet then do so now
277 if ($line_ok && (! $self->_header_skipped) && $self->header) {
278 $line_ok = $self->_parse_header();
279 $self->_header_skipped(1);
282 # return if we reached end-of-file
283 return unless $line_ok;
285 # otherwise, parse the record
287 # split into columns
288 my @cols = $self->_get_row_values();
289 # trim leading and trailing whitespace and quotes if desired
290 if ($self->trim_values) {
291 for(my $i = 0; $i < scalar(@cols); $i++) {
292 if ($cols[$i]) {
293 # trim off whitespace
294 $cols[$i] =~ s/^\s+//;
295 $cols[$i] =~ s/\s+$//;
296 # trim off double quotes
297 $cols[$i] =~ s/^"//;
298 $cols[$i] =~ s/"$//;
303 # assign values for columns in the attribute map
304 my $attrmap = $self->_attribute_map;
305 my %params = ();
306 foreach my $attr (keys %$attrmap) {
307 if ((!ref($attrmap->{$attr})) && ($attrmap->{$attr} =~ /^\d+$/)) {
308 # this is a column index, add to instantiation parameters
309 $params{$attr} = $cols[$attrmap->{$attr}];
310 } else {
311 # not a column index; we assume it's a static value
312 $params{$attr} = $attrmap->{$attr};
316 # add annotation columns to the annotation bundle
317 my $annmap = $self->_annotation_map;
318 if ($annmap && %$annmap) {
319 my $anncoll = Bio::Annotation::Collection->new();
320 foreach my $col (keys %$annmap) {
321 next unless $cols[$col]; # skip empty columns!
322 $anncoll->add_Annotation(
323 Bio::Annotation::SimpleValue->new(-value => $cols[$col],
324 -tagname=> $annmap->{$col}));
326 $params{'-annotation'} = $anncoll;
329 # ask the object builder to add the slots that we've gathered
330 my $builder = $self->sequence_builder();
331 $builder->add_slot_value(%params);
332 # and instantiate the object
333 my $seq = $builder->make_object();
335 # done!
336 return $seq;
339 =head2 comment_char
341 Title : comment_char
342 Usage : $obj->comment_char($newval)
343 Function: Get/set the leading character(s) designating a line as
344 a comment-line.
345 Example :
346 Returns : value of comment_char (a scalar)
347 Args : on set, new value (a scalar or undef, optional)
350 =cut
352 sub comment_char{
353 my $self = shift;
355 return $self->{'comment_char'} = shift if @_;
356 return $self->{'comment_char'};
359 =head2 header
361 Title : header
362 Usage : $obj->header($newval)
363 Function: Get/set the number of header lines to skip before the
364 rows containing actual sequence records.
366 If set to zero or undef, means that there is no header and
367 therefore also no column headers.
369 Example :
370 Returns : value of header (a scalar)
371 Args : on set, new value (a scalar or undef, optional)
374 =cut
376 sub header{
377 my $self = shift;
379 return $self->{'header'} = shift if @_;
380 return $self->{'header'};
383 =head2 delimiter
385 Title : delimiter
386 Usage : $obj->delimiter($newval)
387 Function: Get/set the column delimiter. This will in fact be
388 treated as a regular expression. Consecutive occurrences
389 will not be collapsed to a single one.
391 Example :
392 Returns : value of delimiter (a scalar)
393 Args : on set, new value (a scalar or undef, optional)
396 =cut
398 sub delimiter{
399 my $self = shift;
401 return $self->{'delimiter'} = shift if @_;
402 return $self->{'delimiter'};
405 =head2 attribute_map
407 Title : attribute_map
408 Usage : $obj->attribute_map($newval)
409 Function: Get/set the map of sequence object initialization
410 attributes (keys) to one-based column index.
412 Attributes will usually need to be prefixed by a dash, just
413 as if they were passed to the new() method of the sequence
414 class.
416 Example :
417 Returns : value of attribute_map (a reference to a hash)
418 Args : on set, new value (a reference to a hash or undef, optional)
421 =cut
423 sub attribute_map{
424 my $self = shift;
426 # internally we store zero-based maps - so we need to convert back
427 # and forth here
428 if (@_) {
429 my $arg = shift;
430 # allow for and protect against undef
431 return delete $self->{'_attribute_map'} unless defined($arg);
432 # copy to avoid side-effects
433 my $attr_map = {%$arg};
434 foreach my $key (keys %$attr_map) {
435 if ((!ref($attr_map->{$key})) && ($attr_map->{$key} =~ /^\d+$/)) {
436 $attr_map->{$key}--;
439 $self->{'_attribute_map'} = $attr_map;
441 # there may not be a map
442 return unless exists($self->{'_attribute_map'});
443 # we need to copy in order not to override the stored map!
444 my %attr_map = %{$self->{'_attribute_map'}};
445 foreach my $key (keys %attr_map) {
446 if ((!ref($attr_map{$key})) && ($attr_map{$key} =~ /^\d+$/)) {
447 $attr_map{$key}++;
450 return \%attr_map;
453 =head2 annotation_map
455 Title : annotation_map
456 Usage : $obj->annotation_map($newval)
457 Function: Get/set the mapping between one-based column indexes
458 (keys) and annotation tags (values).
460 Note that the map returned by this method may change after
461 the first next_seq() call if the file contains a column
462 header and no annotation keys have been predefined in the
463 map, because upon reading the column header line the tag
464 names will be set automatically.
466 Note also that the map may reference columns that are used
467 as well in the sequence attribute map.
469 Example :
470 Returns : value of annotation_map (a reference to a hash)
471 Args : on set, new value (a reference to a hash or undef, optional)
474 =cut
476 sub annotation_map{
477 my $self = shift;
479 # internally we store zero-based maps - so we need to convert back
480 # and forth here
481 if (@_) {
482 my $arg = shift;
483 # allow for and protect against undef
484 return delete $self->{'_annotation_map'} unless defined($arg);
485 # copy to avoid side-effects
486 my $ann_map = {%$arg};
487 # make sure we sort the keys numerically or otherwise we may
488 # clobber a key with a higher index
489 foreach my $key (sort { $a <=> $b } keys(%$ann_map)) {
490 $ann_map->{$key-1} = $ann_map->{$key};
491 delete $ann_map->{$key};
493 $self->{'_annotation_map'} = $ann_map;
494 # also make a note that we want to keep annotation
495 $self->keep_annotation(1);
497 # there may not be a map
498 return unless exists($self->{'_annotation_map'});
499 # we need to copy in order not to override the stored map!
500 my %ann_map = %{$self->{'_annotation_map'}};
501 # here we need to sort numerically in reverse order ...
502 foreach my $key (sort { $b <=> $a } keys(%ann_map)) {
503 $ann_map{$key+1} = $ann_map{$key};
504 delete $ann_map{$key};
506 return \%ann_map;
509 =head2 keep_annotation
511 Title : keep_annotation
512 Usage : $obj->keep_annotation($newval)
513 Function: Get/set flag whether or not to keep values from
514 additional columns as annotation.
516 Additional columns are all those columns in the input file
517 that aren't referenced in the attribute map.
519 Example :
520 Returns : value of keep_annotation (a scalar)
521 Args : on set, new value (a scalar or undef, optional)
524 =cut
526 sub keep_annotation{
527 my $self = shift;
529 return $self->{'keep_annotation'} = shift if @_;
530 return $self->{'keep_annotation'};
533 =head2 annotation_columns
535 Title : annotation_columns
536 Usage : $obj->annotation_columns($newval)
537 Function: Get/set the names (labels) of the columns to be used for
538 annotation.
540 This is an alternative to using annotation_map. In order to
541 have any effect, it must be set before the first call of
542 next_seq(), and obviously there must be a header line (or
543 row) too giving the column labels.
545 Example :
546 Returns : value of annotation_columns (a reference to an array)
547 Args : on set, new value (a reference to an array of undef, optional)
550 =cut
552 sub annotation_columns{
553 my $self = shift;
555 return $self->{'annotation_columns'} = shift if @_;
556 return $self->{'annotation_columns'};
559 =head2 trim_values
561 Title : trim_values
562 Usage : $obj->trim_values($newval)
563 Function: Get/set whether or not to trim leading and trailing
564 whitespace off all column values.
565 Example :
566 Returns : value of trim_values (a scalar)
567 Args : on set, new value (a scalar or undef, optional)
570 =cut
572 sub trim_values{
573 my $self = shift;
575 return $self->{'trim_values'} = shift if @_;
576 return $self->{'trim_values'};
579 =head1 Internal methods
581 All methods with a leading underscore are not meant to be part of the
582 'official' API. They are for use by this module only, consider them
583 private unless you are a developer trying to modify this module.
585 =cut
587 =head2 _attribute_map
589 Title : _attribute_map
590 Usage : $obj->_attribute_map($newval)
591 Function: Get only. Same as attribute_map, but zero-based indexes.
593 Note that any changes made to the returned map will change
594 the map used by this instance. You should know what you are
595 doing if you modify the returned value (or if you call this
596 method in the first place).
598 Example :
599 Returns : value of _attribute_map (a reference to a hash)
600 Args : none
603 =cut
605 sub _attribute_map{
606 my $self = shift;
608 return $self->{'_attribute_map'};
611 =head2 _annotation_map
613 Title : _annotation_map
614 Usage : $obj->_annotation_map($newval)
615 Function: Get only. Same as annotation_map, but with zero-based indexes.
617 Note that any changes made to the returned map will change
618 the map used by this instance. You should know what you are
619 doing if you modify the returned value (or if you call this
620 method in the first place).
622 Example :
623 Returns : value of _annotation_map (a reference to a hash)
624 Args : none
627 =cut
629 sub _annotation_map{
630 my $self = shift;
632 return $self->{'_annotation_map'};
635 =head2 _header_skipped
637 Title : _header_skipped
638 Usage : $obj->_header_skipped($newval)
639 Function: Get/set the flag whether the header was already
640 read (and skipped) or not.
641 Example :
642 Returns : value of _header_skipped (a scalar)
643 Args : on set, new value (a scalar or undef, optional)
646 =cut
648 sub _header_skipped{
649 my $self = shift;
651 return $self->{'_header_skipped'} = shift if @_;
652 return $self->{'_header_skipped'};
655 =head2 _next_record
657 Title : _next_record
658 Usage :
659 Function: Navigates the underlying file to the next record.
661 For row-based records in delimited text files, this will
662 skip all empty lines and lines with a leading comment
663 character.
665 This method is here is to serve as a hook for other formats
666 that conceptually also represent tables but aren't
667 formatted as row-based text files.
669 Example :
670 Returns : TRUE if the navigation was successful and FALSE
671 otherwise. Unsuccessful navigation will usually be treated
672 as an end-of-file condition.
673 Args :
676 =cut
678 sub _next_record{
679 my $self = shift;
681 my $cmtcc = $self->comment_char;
682 my $line = $self->_readline();
684 # skip until not a comment and not an empty line
685 while (defined($line)
686 && (($cmtcc && ($line =~ /^\s*$cmtcc/))
687 || ($line =~ /^\s*$/))) {
688 $line = $self->_readline();
691 return $self->{'_line'} = $line;
694 =head2 _parse_header
696 Title : _parse_header
697 Usage :
698 Function: Parse the table header and navigate past it.
700 This method is called if the number of header rows has been
701 specified equal to or greater than one, and positioned at
702 the first header line (row). By default the first header
703 line (row) is used for setting column names, but additional
704 lines (rows) may be skipped too. Empty lines and comment
705 lines do not count as header lines (rows).
707 This method will call _next_record() to navigate to the
708 next header line (row), if there is more than one header
709 line (row). Upon return, the file is presumed to be
710 positioned at the first record after the header.
712 This method is here is to serve as a hook for other formats
713 that conceptually also represent tables but aren't
714 formatted as row-based text files.
716 Note however that the only methods used to access file
717 content or navigate the position are _get_row_values() and
718 _next_record(), so it should usually suffice to override
719 those.
721 Example :
722 Returns : TRUE if navigation past the header was successful and FALSE
723 otherwise. Unsuccessful navigation will usually be treated
724 as an end-of-file condition.
725 Args :
728 =cut
730 sub _parse_header{
731 my $self = shift;
733 # the first header line contains the column headers, see whether
734 # we need them
735 if ($self->keep_annotation) {
736 my @colnames = $self->_get_row_values();
737 # trim leading and trailing whitespace if desired
738 if ($self->trim_values) {
739 # trim off whitespace
740 @colnames = map { $_ =~ s/^\s+//; $_ =~ s/\s+$//; $_; } @colnames;
741 # trim off double quotes
742 @colnames = map { $_ =~ s/^"//; $_ =~ s/"$//; $_; } @colnames;
744 # build or complete annotation column map
745 my $annmap = $self->annotation_map || {};
746 if (! %$annmap) {
747 # check whether columns have been defined by name rather than index
748 if (my $anncols = $self->annotation_columns) {
749 # first sanity check: all column names must map
750 my %colmap = map { ($_,1); } @colnames;
751 foreach my $col (@$anncols) {
752 if (!exists($colmap{$col})) {
753 $self->throw("no such column labeled '$col'");
756 # now map to the column indexes
757 %colmap = map { ($_,1); } @$anncols;
758 for (my $i = 0; $i < scalar(@colnames); $i++) {
759 if (exists($colmap{$colnames[$i]})) {
760 $annmap->{$i+1} = $colnames[$i];
763 } else {
764 # no columns specified, default to all non-attribute columns
765 for (my $i = 0; $i < scalar(@colnames); $i++) {
766 $annmap->{$i+1} = $colnames[$i];
768 # subtract all attribute-referenced columns
769 foreach my $attrcol (values %{$self->attribute_map}) {
770 if ((!ref($attrcol)) && ($attrcol =~ /^\d+$/)) {
771 delete $annmap->{$attrcol};
775 } else {
776 # fill in where the tag names weren't pre-defined
777 for (my $i = 0; $i < scalar(@colnames); $i++) {
778 if (exists($annmap->{$i+1}) && ! defined($annmap->{$i+1})) {
779 $annmap->{$i+1} = $colnames[$i];
783 $self->annotation_map($annmap);
786 # now read past the header
787 my $header_lines = $self->header;
788 my $line_ok = 1;
789 while (defined($line_ok) && ($header_lines > 0)) {
790 $line_ok = $self->_next_record();
791 $header_lines--;
794 return $line_ok;
797 =head2 _get_row_values
799 Title : _get_row_values
800 Usage :
801 Function: Get the values for the current line (or row) as an array in
802 the order of columns.
804 This method is here is to serve as a hook for other formats
805 that conceptually also represent tables but aren't
806 formatted as row-based text files.
808 Example :
809 Returns : An array of column values for the current row.
810 Args :
813 =cut
815 sub _get_row_values{
816 my $self = shift;
817 my $delim = $self->delimiter;
818 my $line = $self->{'_line'};
819 chomp($line);
820 my @cols = split(/$delim/,$line);
821 return @cols;