bump rc version
[bioperl-live.git] / Bio / SeqIO / table.pm
blob64ff8825a1523a2042001decebfc3b043a69d5f5
2 # BioPerl module for Bio::SeqIO::table
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp
8 # You may distribute this module under the same terms as perl itself.
9 # Refer to the Perl Artistic License (see the license accompanying this
10 # software package, or see http://www.perl.com/language/misc/Artistic.html)
11 # for the terms under which you may use, modify, and redistribute this module.
13 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
14 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
15 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
18 # POD documentation - main docs before the code
20 =head1 NAME
22 Bio::SeqIO::table - sequence input/output stream from a delimited table
24 =head1 SYNOPSIS
26 # Do not to use this object directly, use Bio::SeqIO, for example:
28 $in = Bio::SeqIO->new(-file => $filename, -format => 'table');
30 while ( my $seq = $in->next_seq() ) {
31 # do something with $seq
34 =head1 DESCRIPTION
36 This class transforms records in a table-formatted text file into
37 Bio::Seq objects.
39 A table-formatted text file of sequence records for the purposes of
40 this module is defined as a text file with each row corresponding to a
41 sequence, and the attributes of the sequence being in different
42 columns. Columns are delimited by a common delimiter, for instance tab
43 or comma.
45 The module permits specifying which columns hold which type of
46 annotation. The semantics of certain attributes, if present, are
47 pre-defined, e.g., accession number and sequence. Additional
48 attributes may be added to the annotation bundle.
50 =head1 FEEDBACK
52 =head2 Mailing Lists
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to one
56 of the Bioperl mailing lists. Your participation is much appreciated.
58 bioperl-l@bioperl.org - General discussion
59 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61 =head2 Support
63 Please direct usage questions or support issues to the mailing list:
65 I<bioperl-l@bioperl.org>
67 rather than to the module maintainer directly. Many experienced and
68 reponsive experts will be able look at the problem and quickly
69 address it. Please include a thorough description of the problem
70 with code and data examples if at all possible.
72 =head2 Reporting Bugs
74 Report bugs to the Bioperl bug tracking system to help us keep track
75 the bugs and their resolution.
77 Bug reports can be submitted via email or the web:
79 https://github.com/bioperl/bioperl-live/issues
81 =head1 AUTHOR - Hilmar Lapp
83 Email hlapp at gmx.net
85 =head1 APPENDIX
87 The rest of the documentation details each of the object
88 methods. Internal methods are usually preceded with a _
90 =cut
92 # Let the code begin...
94 package Bio::SeqIO::table;
95 use strict;
97 use Bio::Species;
98 use Bio::Seq::SeqFactory;
99 use Bio::Annotation::Collection;
100 use Bio::Annotation::SimpleValue;
102 use base qw(Bio::SeqIO);
104 =head2 new
106 Title : new
107 Usage : $stream = Bio::SeqIO->new(-file => $filename, -format => 'table')
108 Function: Returns a new seqstream
109 Returns : A Bio::SeqIO stream for a table format
110 Args : Named parameters:
112 -file Name of file to read
113 -fh Filehandle to attach to
114 -comment Leading character(s) introducing a comment line
115 -header the number of header lines to skip; the first
116 non-comment header line will be used to obtain
117 column names; column names will be used as the
118 default tags for attaching annotation.
119 -delim The delimiter for columns as a regular expression;
120 consecutive occurrences of the delimiter will
121 not be collapsed.
122 -display_id The one-based index of the column containing
123 the display ID of the sequence
124 -accession_number The one-based index of the column
125 containing the accession number of the sequence
126 -seq The one-based index of the column containing
127 the sequence string of the sequence
128 -desc The one-based index of the column containing
129 the description of the sequence
130 -species The one-based index of the column containing the
131 species for the sequence record; if not a
132 number, will be used as the static species
133 common to all records
134 -annotation If provided and a scalar (but see below), a
135 flag whether or not all additional columns are
136 to be preserved as annotation, the tags used
137 will either be 'colX' if there is no column
138 header and where X is the one-based column
139 index, and otherwise the column headers will be
140 used as tags;
142 If a reference to an array, or a square
143 bracket-enclosed string of comma-delimited
144 values, only those columns (one-based index)
145 will be preserved as annotation, tags as before;
147 If a reference to a hash, or a curly
148 braces-enclosed string of comma-delimited key
149 and value pairs in alternating order, the keys
150 are one-based column indexes to be preserved,
151 and the values are the tags under which the
152 annotation is to be attached; if not provided or
153 supplied as undef, no additional annotation will
154 be preserved.
155 -colnames A reference to an array of column labels, or a
156 string of comma-delimited labels, denoting the
157 columns to be converted into annotation; this is
158 an alternative to -annotation and will be
159 ignored if -annotation is also supplied with a
160 valid value.
161 -trim Flag determining whether or not all values should
162 be trimmed of leading and trailing white space
163 and double quotes
165 Additional arguments may be used to e.g. set factories and
166 builders involved in the sequence object creation (see the
167 POD of Bio::SeqIO).
169 =cut
171 sub _initialize {
172 my($self,@args) = @_;
174 # chained initialization
175 $self->SUPER::_initialize(@args);
177 # our own parameters
178 my ($cmtchars,
179 $header,
180 $delim,
181 $display_id,
182 $desc,
183 $accnr,
184 $seq,
185 $taxon,
186 $useann,
187 $colnames,
188 $trim) =
189 $self->_rearrange([qw(COMMENT
190 HEADER
191 DELIM
192 DISPLAY_ID
193 DESC
194 ACCESSION_NUMBER
196 SPECIES
197 ANNOTATION
198 COLNAMES
199 TRIM)
200 ], @args);
202 # store options and apply defaults
203 $self->comment_char(defined($cmtchars) ? $cmtchars : "#")
204 if (!defined($self->comment_char)) || defined($cmtchars);
205 $self->delimiter(defined($delim) ? $delim : "\t")
206 if (!defined($self->delimiter)) || defined($delim);
207 $self->header($header) if defined($header);
208 $self->trim_values($trim) if defined($trim);
210 # attribute columns
211 my $attrs = {};
212 $attrs->{-display_id} = $display_id if defined($display_id);
213 $attrs->{-accession_number} = $accnr if defined($accnr);
214 $attrs->{-seq} = $seq if defined($seq);
215 $attrs->{-desc} = $desc if defined($desc);
216 if (defined($taxon)) {
217 if (ref($taxon) || ($taxon =~ /^\d+$/)) {
218 # either a static object, or a column reference
219 $attrs->{-species} = $taxon;
220 } else {
221 # static species as a string
222 $attrs->{-species} = Bio::Species->new(
223 -classification => [reverse(split(' ',$taxon))]);
226 $self->attribute_map($attrs);
228 # annotation columns, if any
229 if ($useann && !ref($useann)) {
230 # it's a scalar; check whether this is in fact an array or
231 # hash as a string rather than just a flag
232 if ($useann =~ /^\[(.*)\]$/) {
233 $useann = [split(/[,;]/,$1)];
234 } elsif ($useann =~ /^{(.*)}$/) {
235 $useann = {split(/[,;]/,$1)};
236 } # else it is probably indeed just a flag
238 if (ref($useann)) {
239 my $ann_map;
240 if (ref($useann) eq "ARRAY") {
241 my $has_header = ($self->header > 0);
242 $ann_map = {};
243 foreach my $i (@$useann) {
244 $ann_map->{$i} = $has_header ? undef : "col$i";
246 } else {
247 # no special handling necessary
248 $ann_map = $useann;
250 $self->annotation_map($ann_map);
251 } else {
252 $self->keep_annotation($useann || $colnames);
253 # annotation columns, if any
254 if ($colnames && !ref($colnames)) {
255 # an array as a string
256 $colnames =~ s/^\[(.*)\]$/$1/;
257 $colnames = [split(/[,;]/,$colnames)];
259 $self->annotation_columns($colnames) if ref($colnames);
262 # make sure we have a factory defined
263 if(!defined($self->sequence_factory)) {
264 $self->sequence_factory(
265 Bio::Seq::SeqFactory->new(-verbose => $self->verbose(),
266 -type => 'Bio::Seq::RichSeq'));
270 =head2 next_seq
272 Title : next_seq
273 Usage : $seq = $stream->next_seq()
274 Function: returns the next sequence in the stream
275 Returns : Bio::Seq::RichSeq object
276 Args :
278 =cut
280 sub next_seq {
281 my $self = shift;
283 # skip until not a comment and not an empty line
284 my $line_ok = $self->_next_record();
286 # if there is a header but we haven't read past it yet then do so now
287 if ($line_ok && (! $self->_header_skipped) && $self->header) {
288 $line_ok = $self->_parse_header();
289 $self->_header_skipped(1);
292 # return if we reached end-of-file
293 return unless $line_ok;
295 # otherwise, parse the record
297 # split into columns
298 my @cols = $self->_get_row_values();
299 # trim leading and trailing whitespace and quotes if desired
300 if ($self->trim_values) {
301 for(my $i = 0; $i < scalar(@cols); $i++) {
302 if ($cols[$i]) {
303 # trim off whitespace
304 $cols[$i] =~ s/^\s+//;
305 $cols[$i] =~ s/\s+$//;
306 # trim off double quotes
307 $cols[$i] =~ s/^"//;
308 $cols[$i] =~ s/"$//;
313 # assign values for columns in the attribute map
314 my $attrmap = $self->_attribute_map;
315 my %params = ();
316 foreach my $attr (keys %$attrmap) {
317 if ((!ref($attrmap->{$attr})) && ($attrmap->{$attr} =~ /^\d+$/)) {
318 # this is a column index, add to instantiation parameters
319 $params{$attr} = $cols[$attrmap->{$attr}];
320 } else {
321 # not a column index; we assume it's a static value
322 $params{$attr} = $attrmap->{$attr};
326 # add annotation columns to the annotation bundle
327 my $annmap = $self->_annotation_map;
328 if ($annmap && %$annmap) {
329 my $anncoll = Bio::Annotation::Collection->new();
330 foreach my $col (keys %$annmap) {
331 next unless $cols[$col]; # skip empty columns!
332 $anncoll->add_Annotation(
333 Bio::Annotation::SimpleValue->new(-value => $cols[$col],
334 -tagname=> $annmap->{$col}));
336 $params{'-annotation'} = $anncoll;
339 # ask the object builder to add the slots that we've gathered
340 my $builder = $self->sequence_builder();
341 $builder->add_slot_value(%params);
342 # and instantiate the object
343 my $seq = $builder->make_object();
345 # done!
346 return $seq;
349 =head2 comment_char
351 Title : comment_char
352 Usage : $obj->comment_char($newval)
353 Function: Get/set the leading character(s) designating a line as
354 a comment-line.
355 Example :
356 Returns : value of comment_char (a scalar)
357 Args : on set, new value (a scalar or undef, optional)
360 =cut
362 sub comment_char{
363 my $self = shift;
365 return $self->{'comment_char'} = shift if @_;
366 return $self->{'comment_char'};
369 =head2 header
371 Title : header
372 Usage : $obj->header($newval)
373 Function: Get/set the number of header lines to skip before the
374 rows containing actual sequence records.
376 If set to zero or undef, means that there is no header and
377 therefore also no column headers.
379 Example :
380 Returns : value of header (a scalar)
381 Args : on set, new value (a scalar or undef, optional)
384 =cut
386 sub header{
387 my $self = shift;
389 return $self->{'header'} = shift if @_;
390 return $self->{'header'};
393 =head2 delimiter
395 Title : delimiter
396 Usage : $obj->delimiter($newval)
397 Function: Get/set the column delimiter. This will in fact be
398 treated as a regular expression. Consecutive occurrences
399 will not be collapsed to a single one.
401 Example :
402 Returns : value of delimiter (a scalar)
403 Args : on set, new value (a scalar or undef, optional)
406 =cut
408 sub delimiter{
409 my $self = shift;
411 return $self->{'delimiter'} = shift if @_;
412 return $self->{'delimiter'};
415 =head2 attribute_map
417 Title : attribute_map
418 Usage : $obj->attribute_map($newval)
419 Function: Get/set the map of sequence object initialization
420 attributes (keys) to one-based column index.
422 Attributes will usually need to be prefixed by a dash, just
423 as if they were passed to the new() method of the sequence
424 class.
426 Example :
427 Returns : value of attribute_map (a reference to a hash)
428 Args : on set, new value (a reference to a hash or undef, optional)
431 =cut
433 sub attribute_map{
434 my $self = shift;
436 # internally we store zero-based maps - so we need to convert back
437 # and forth here
438 if (@_) {
439 my $arg = shift;
440 # allow for and protect against undef
441 return delete $self->{'_attribute_map'} unless defined($arg);
442 # copy to avoid side-effects
443 my $attr_map = {%$arg};
444 foreach my $key (keys %$attr_map) {
445 if ((!ref($attr_map->{$key})) && ($attr_map->{$key} =~ /^\d+$/)) {
446 $attr_map->{$key}--;
449 $self->{'_attribute_map'} = $attr_map;
451 # there may not be a map
452 return unless exists($self->{'_attribute_map'});
453 # we need to copy in order not to override the stored map!
454 my %attr_map = %{$self->{'_attribute_map'}};
455 foreach my $key (keys %attr_map) {
456 if ((!ref($attr_map{$key})) && ($attr_map{$key} =~ /^\d+$/)) {
457 $attr_map{$key}++;
460 return \%attr_map;
463 =head2 annotation_map
465 Title : annotation_map
466 Usage : $obj->annotation_map($newval)
467 Function: Get/set the mapping between one-based column indexes
468 (keys) and annotation tags (values).
470 Note that the map returned by this method may change after
471 the first next_seq() call if the file contains a column
472 header and no annotation keys have been predefined in the
473 map, because upon reading the column header line the tag
474 names will be set automatically.
476 Note also that the map may reference columns that are used
477 as well in the sequence attribute map.
479 Example :
480 Returns : value of annotation_map (a reference to a hash)
481 Args : on set, new value (a reference to a hash or undef, optional)
484 =cut
486 sub annotation_map{
487 my $self = shift;
489 # internally we store zero-based maps - so we need to convert back
490 # and forth here
491 if (@_) {
492 my $arg = shift;
493 # allow for and protect against undef
494 return delete $self->{'_annotation_map'} unless defined($arg);
495 # copy to avoid side-effects
496 my $ann_map = {%$arg};
497 # make sure we sort the keys numerically or otherwise we may
498 # clobber a key with a higher index
499 foreach my $key (sort { $a <=> $b } keys(%$ann_map)) {
500 $ann_map->{$key-1} = $ann_map->{$key};
501 delete $ann_map->{$key};
503 $self->{'_annotation_map'} = $ann_map;
504 # also make a note that we want to keep annotation
505 $self->keep_annotation(1);
507 # there may not be a map
508 return unless exists($self->{'_annotation_map'});
509 # we need to copy in order not to override the stored map!
510 my %ann_map = %{$self->{'_annotation_map'}};
511 # here we need to sort numerically in reverse order ...
512 foreach my $key (sort { $b <=> $a } keys(%ann_map)) {
513 $ann_map{$key+1} = $ann_map{$key};
514 delete $ann_map{$key};
516 return \%ann_map;
519 =head2 keep_annotation
521 Title : keep_annotation
522 Usage : $obj->keep_annotation($newval)
523 Function: Get/set flag whether or not to keep values from
524 additional columns as annotation.
526 Additional columns are all those columns in the input file
527 that aren't referenced in the attribute map.
529 Example :
530 Returns : value of keep_annotation (a scalar)
531 Args : on set, new value (a scalar or undef, optional)
534 =cut
536 sub keep_annotation{
537 my $self = shift;
539 return $self->{'keep_annotation'} = shift if @_;
540 return $self->{'keep_annotation'};
543 =head2 annotation_columns
545 Title : annotation_columns
546 Usage : $obj->annotation_columns($newval)
547 Function: Get/set the names (labels) of the columns to be used for
548 annotation.
550 This is an alternative to using annotation_map. In order to
551 have any effect, it must be set before the first call of
552 next_seq(), and obviously there must be a header line (or
553 row) too giving the column labels.
555 Example :
556 Returns : value of annotation_columns (a reference to an array)
557 Args : on set, new value (a reference to an array of undef, optional)
560 =cut
562 sub annotation_columns{
563 my $self = shift;
565 return $self->{'annotation_columns'} = shift if @_;
566 return $self->{'annotation_columns'};
569 =head2 trim_values
571 Title : trim_values
572 Usage : $obj->trim_values($newval)
573 Function: Get/set whether or not to trim leading and trailing
574 whitespace off all column values.
575 Example :
576 Returns : value of trim_values (a scalar)
577 Args : on set, new value (a scalar or undef, optional)
580 =cut
582 sub trim_values{
583 my $self = shift;
585 return $self->{'trim_values'} = shift if @_;
586 return $self->{'trim_values'};
589 =head2 write_seq
591 Title: write_seq
592 Usage: write_seq() is not implemented for table format output.
594 =cut
596 sub write_seq {
597 shift->throw("write_seq() not implemented for 'table' format");
600 =head1 Internal methods
602 All methods with a leading underscore are not meant to be part of the
603 'official' API. They are for use by this module only, consider them
604 private unless you are a developer trying to modify this module.
606 =cut
608 =head2 _attribute_map
610 Title : _attribute_map
611 Usage : $obj->_attribute_map($newval)
612 Function: Get only. Same as attribute_map, but zero-based indexes.
614 Note that any changes made to the returned map will change
615 the map used by this instance. You should know what you are
616 doing if you modify the returned value (or if you call this
617 method in the first place).
619 Example :
620 Returns : value of _attribute_map (a reference to a hash)
621 Args : none
624 =cut
626 sub _attribute_map{
627 my $self = shift;
629 return $self->{'_attribute_map'};
632 =head2 _annotation_map
634 Title : _annotation_map
635 Usage : $obj->_annotation_map($newval)
636 Function: Get only. Same as annotation_map, but with zero-based indexes.
638 Note that any changes made to the returned map will change
639 the map used by this instance. You should know what you are
640 doing if you modify the returned value (or if you call this
641 method in the first place).
643 Example :
644 Returns : value of _annotation_map (a reference to a hash)
645 Args : none
648 =cut
650 sub _annotation_map{
651 my $self = shift;
653 return $self->{'_annotation_map'};
656 =head2 _header_skipped
658 Title : _header_skipped
659 Usage : $obj->_header_skipped($newval)
660 Function: Get/set the flag whether the header was already
661 read (and skipped) or not.
662 Example :
663 Returns : value of _header_skipped (a scalar)
664 Args : on set, new value (a scalar or undef, optional)
667 =cut
669 sub _header_skipped{
670 my $self = shift;
672 return $self->{'_header_skipped'} = shift if @_;
673 return $self->{'_header_skipped'};
676 =head2 _next_record
678 Title : _next_record
679 Usage :
680 Function: Navigates the underlying file to the next record.
682 For row-based records in delimited text files, this will
683 skip all empty lines and lines with a leading comment
684 character.
686 This method is here is to serve as a hook for other formats
687 that conceptually also represent tables but aren't
688 formatted as row-based text files.
690 Example :
691 Returns : TRUE if the navigation was successful and FALSE
692 otherwise. Unsuccessful navigation will usually be treated
693 as an end-of-file condition.
694 Args :
697 =cut
699 sub _next_record{
700 my $self = shift;
702 my $cmtcc = $self->comment_char;
703 my $line = $self->_readline();
705 # skip until not a comment and not an empty line
706 while (defined($line)
707 && (($cmtcc && ($line =~ /^\s*$cmtcc/))
708 || ($line =~ /^\s*$/))) {
709 $line = $self->_readline();
712 return $self->{'_line'} = $line;
715 =head2 _parse_header
717 Title : _parse_header
718 Usage :
719 Function: Parse the table header and navigate past it.
721 This method is called if the number of header rows has been
722 specified equal to or greater than one, and positioned at
723 the first header line (row). By default the first header
724 line (row) is used for setting column names, but additional
725 lines (rows) may be skipped too. Empty lines and comment
726 lines do not count as header lines (rows).
728 This method will call _next_record() to navigate to the
729 next header line (row), if there is more than one header
730 line (row). Upon return, the file is presumed to be
731 positioned at the first record after the header.
733 This method is here is to serve as a hook for other formats
734 that conceptually also represent tables but aren't
735 formatted as row-based text files.
737 Note however that the only methods used to access file
738 content or navigate the position are _get_row_values() and
739 _next_record(), so it should usually suffice to override
740 those.
742 Example :
743 Returns : TRUE if navigation past the header was successful and FALSE
744 otherwise. Unsuccessful navigation will usually be treated
745 as an end-of-file condition.
746 Args :
749 =cut
751 sub _parse_header{
752 my $self = shift;
754 # the first header line contains the column headers, see whether
755 # we need them
756 if ($self->keep_annotation) {
757 my @colnames = $self->_get_row_values();
758 # trim leading and trailing whitespace if desired
759 if ($self->trim_values) {
760 # trim off whitespace
761 @colnames = map { $_ =~ s/^\s+//; $_ =~ s/\s+$//; $_; } @colnames;
762 # trim off double quotes
763 @colnames = map { $_ =~ s/^"//; $_ =~ s/"$//; $_; } @colnames;
765 # build or complete annotation column map
766 my $annmap = $self->annotation_map || {};
767 if (! %$annmap) {
768 # check whether columns have been defined by name rather than index
769 if (my $anncols = $self->annotation_columns) {
770 # first sanity check: all column names must map
771 my %colmap = map { ($_,1); } @colnames;
772 foreach my $col (@$anncols) {
773 if (!exists($colmap{$col})) {
774 $self->throw("no such column labeled '$col'");
777 # now map to the column indexes
778 %colmap = map { ($_,1); } @$anncols;
779 for (my $i = 0; $i < scalar(@colnames); $i++) {
780 if (exists($colmap{$colnames[$i]})) {
781 $annmap->{$i+1} = $colnames[$i];
784 } else {
785 # no columns specified, default to all non-attribute columns
786 for (my $i = 0; $i < scalar(@colnames); $i++) {
787 $annmap->{$i+1} = $colnames[$i];
789 # subtract all attribute-referenced columns
790 foreach my $attrcol (values %{$self->attribute_map}) {
791 if ((!ref($attrcol)) && ($attrcol =~ /^\d+$/)) {
792 delete $annmap->{$attrcol};
796 } else {
797 # fill in where the tag names weren't pre-defined
798 for (my $i = 0; $i < scalar(@colnames); $i++) {
799 if (exists($annmap->{$i+1}) && ! defined($annmap->{$i+1})) {
800 $annmap->{$i+1} = $colnames[$i];
804 $self->annotation_map($annmap);
807 # now read past the header
808 my $header_lines = $self->header;
809 my $line_ok = 1;
810 while (defined($line_ok) && ($header_lines > 0)) {
811 $line_ok = $self->_next_record();
812 $header_lines--;
815 return $line_ok;
818 =head2 _get_row_values
820 Title : _get_row_values
821 Usage :
822 Function: Get the values for the current line (or row) as an array in
823 the order of columns.
825 This method is here is to serve as a hook for other formats
826 that conceptually also represent tables but aren't
827 formatted as row-based text files.
829 Example :
830 Returns : An array of column values for the current row.
831 Args :
834 =cut
836 sub _get_row_values{
837 my $self = shift;
838 my $delim = $self->delimiter;
839 my $line = $self->{'_line'};
840 chomp($line);
841 my @cols = split(/$delim/,$line);
842 return @cols;