2 # BioPerl module for Bio::SeqIO::table
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp <hlapp at gmx.net>
10 # (c) Hilmar Lapp, hlapp at gmx.net, 2005.
11 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2005.
13 # You may distribute this module under the same terms as perl itself.
14 # Refer to the Perl Artistic License (see the license accompanying this
15 # software package, or see http://www.perl.com/language/misc/Artistic.html)
16 # for the terms under which you may use, modify, and redistribute this module.
18 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
19 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
20 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
23 # POD documentation - main docs before the code
27 Bio::SeqIO::table - sequence input/output stream from a delimited table
31 #It is probably best not to use this object directly, but
32 #rather go through the SeqIO handler system. Go:
34 $stream = Bio::SeqIO->new(-file => $filename, -format => 'table');
36 while ( my $seq = $stream->next_seq() ) {
37 # do something with $seq
42 This class transforms records in a table-formatted text file into
45 A table-formatted text file of sequence records for the purposes of
46 this module is defined as a text file with each row corresponding to a
47 sequence, and the attributes of the sequence being in different
48 columns. Columns are delimited by a common delimiter, for instance tab
51 The module permits specifying which columns hold which type of
52 annotation. The semantics of certain attributes, if present, are
53 pre-defined, e.g., accession number and sequence. Additional
54 attributes may be added to the annotation bundle.
60 User feedback is an integral part of the evolution of this and other
61 Bioperl modules. Send your comments and suggestions preferably to one
62 of the Bioperl mailing lists. Your participation is much appreciated.
64 bioperl-l@bioperl.org - General discussion
65 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
69 Please direct usage questions or support issues to the mailing list:
71 I<bioperl-l@bioperl.org>
73 rather than to the module maintainer directly. Many experienced and
74 reponsive experts will be able look at the problem and quickly
75 address it. Please include a thorough description of the problem
76 with code and data examples if at all possible.
80 Report bugs to the Bioperl bug tracking system to help us keep track
81 the bugs and their resolution.
83 Bug reports can be submitted via email or the web:
85 https://github.com/bioperl/bioperl-live/issues
87 =head1 AUTHOR - Hilmar Lapp
89 Email hlapp at gmx.net
93 The rest of the documentation details each of the object
94 methods. Internal methods are usually preceded with a _
98 # Let the code begin...
100 package Bio
::SeqIO
::table
;
104 use Bio
::Seq
::SeqFactory
;
105 use Bio
::Annotation
::Collection
;
106 use Bio
::Annotation
::SimpleValue
;
108 use base
qw(Bio::SeqIO);
113 Usage : $stream = Bio::SeqIO->new(-file => $filename, -format => 'table')
114 Function: Returns a new seqstream
115 Returns : A Bio::SeqIO stream for a table format
116 Args : Named parameters:
118 -file name of file to read
119 -fh filehandle to attach to
120 -comment leading character(s) introducing a comment line
121 -header the number of header lines to skip; the first
122 non-comment header line will be used to obtain
123 column names; column names will be used as the
124 default tags for attaching annotation.
125 -delim the delimiter for columns as a regular expression;
126 consecutive occurrences of the delimiter will
128 -display_id the one-based index of the column containing
129 the display ID of the sequence
130 -accession_number the one-based index of the column
131 containing the accession number of the sequence
132 -seq the one-based index of the column containing
133 the sequence string of the sequence
134 -species the one-based index of the column containing the
135 species for the sequence record; if not a
136 number, will be used as the static species
137 common to all records
138 -annotation if provided and a scalar (but see below), a
139 flag whether or not all additional columns are
140 to be preserved as annotation, the tags used
141 will either be 'colX' if there is no column
142 header and where X is the one-based column
143 index, and otherwise the column headers will be
146 if a reference to an array, or a square
147 bracket-enclosed string of comma-delimited
148 values, only those columns (one-based index)
149 will be preserved as annotation, tags as before;
151 if a reference to a hash, or a curly
152 braces-enclosed string of comma-delimited key
153 and value pairs in alternating order, the keys
154 are one-based column indexes to be preserved,
155 and the values are the tags under which the
156 annotation is to be attached; if not provided or
157 supplied as undef, no additional annotation will
159 -colnames a reference to an array of column labels, or a
160 string of comma-delimited labels, denoting the
161 columns to be converted into annotation; this is
162 an alternative to -annotation and will be
163 ignored if -annotation is also supplied with a
165 -trim flag determining whether or not all values should
166 be trimmed of leading and trailing white space
169 Additional arguments may be used to e.g. set factories and
170 builders involved in the sequence object creation (see the
177 my($self,@args) = @_;
179 # chained initialization
180 $self->SUPER::_initialize
(@args);
193 $self->_rearrange([qw(COMMENT
205 # store options and apply defaults
206 $self->comment_char(defined($cmtchars) ?
$cmtchars : "#")
207 if (!defined($self->comment_char)) || defined($cmtchars);
208 $self->delimiter(defined($delim) ?
$delim : "\t")
209 if (!defined($self->delimiter)) || defined($delim);
210 $self->header($header) if defined($header);
211 $self->trim_values($trim) if defined($trim);
215 $attrs->{-display_id
} = $display_id if defined($display_id);
216 $attrs->{-accession_number
} = $accnr if defined($accnr);
217 $attrs->{-seq
} = $seq if defined($seq);
218 if (defined($taxon)) {
219 if (ref($taxon) || ($taxon =~ /^\d+$/)) {
220 # either a static object, or a column reference
221 $attrs->{-species
} = $taxon;
223 # static species as a string
224 $attrs->{-species
} = Bio
::Species
->new(
225 -classification
=> [reverse(split(' ',$taxon))]);
228 $self->attribute_map($attrs);
230 # annotation columns, if any
231 if ($useann && !ref($useann)) {
232 # it's a scalar; check whether this is in fact an array or
233 # hash as a string rather than just a flag
234 if ($useann =~ /^\[(.*)\]$/) {
235 $useann = [split(/[,;]/,$1)];
236 } elsif ($useann =~ /^{(.*)}$/) {
237 $useann = {split(/[,;]/,$1)};
238 } # else it is probably indeed just a flag
242 if (ref($useann) eq "ARRAY") {
243 my $has_header = ($self->header > 0);
245 foreach my $i (@
$useann) {
246 $ann_map->{$i} = $has_header ?
undef : "col$i";
249 # no special handling necessary
252 $self->annotation_map($ann_map);
254 $self->keep_annotation($useann || $colnames);
255 # annotation columns, if any
256 if ($colnames && !ref($colnames)) {
257 # an array as a string
258 $colnames =~ s/^\[(.*)\]$/$1/;
259 $colnames = [split(/[,;]/,$colnames)];
261 $self->annotation_columns($colnames) if ref($colnames);
264 # make sure we have a factory defined
265 if(!defined($self->sequence_factory)) {
266 $self->sequence_factory(
267 Bio
::Seq
::SeqFactory
->new(-verbose
=> $self->verbose(),
268 -type
=> 'Bio::Seq::RichSeq'));
275 Usage : $seq = $stream->next_seq()
276 Function: returns the next sequence in the stream
277 Returns : Bio::Seq::RichSeq object
285 # skip until not a comment and not an empty line
286 my $line_ok = $self->_next_record();
288 # if there is a header but we haven't read past it yet then do so now
289 if ($line_ok && (! $self->_header_skipped) && $self->header) {
290 $line_ok = $self->_parse_header();
291 $self->_header_skipped(1);
294 # return if we reached end-of-file
295 return unless $line_ok;
297 # otherwise, parse the record
300 my @cols = $self->_get_row_values();
301 # trim leading and trailing whitespace and quotes if desired
302 if ($self->trim_values) {
303 for(my $i = 0; $i < scalar(@cols); $i++) {
305 # trim off whitespace
306 $cols[$i] =~ s/^\s+//;
307 $cols[$i] =~ s/\s+$//;
308 # trim off double quotes
315 # assign values for columns in the attribute map
316 my $attrmap = $self->_attribute_map;
318 foreach my $attr (keys %$attrmap) {
319 if ((!ref($attrmap->{$attr})) && ($attrmap->{$attr} =~ /^\d+$/)) {
320 # this is a column index, add to instantiation parameters
321 $params{$attr} = $cols[$attrmap->{$attr}];
323 # not a column index; we assume it's a static value
324 $params{$attr} = $attrmap->{$attr};
328 # add annotation columns to the annotation bundle
329 my $annmap = $self->_annotation_map;
330 if ($annmap && %$annmap) {
331 my $anncoll = Bio
::Annotation
::Collection
->new();
332 foreach my $col (keys %$annmap) {
333 next unless $cols[$col]; # skip empty columns!
334 $anncoll->add_Annotation(
335 Bio
::Annotation
::SimpleValue
->new(-value
=> $cols[$col],
336 -tagname
=> $annmap->{$col}));
338 $params{'-annotation'} = $anncoll;
341 # ask the object builder to add the slots that we've gathered
342 my $builder = $self->sequence_builder();
343 $builder->add_slot_value(%params);
344 # and instantiate the object
345 my $seq = $builder->make_object();
354 Usage : $obj->comment_char($newval)
355 Function: Get/set the leading character(s) designating a line as
358 Returns : value of comment_char (a scalar)
359 Args : on set, new value (a scalar or undef, optional)
367 return $self->{'comment_char'} = shift if @_;
368 return $self->{'comment_char'};
374 Usage : $obj->header($newval)
375 Function: Get/set the number of header lines to skip before the
376 rows containing actual sequence records.
378 If set to zero or undef, means that there is no header and
379 therefore also no column headers.
382 Returns : value of header (a scalar)
383 Args : on set, new value (a scalar or undef, optional)
391 return $self->{'header'} = shift if @_;
392 return $self->{'header'};
398 Usage : $obj->delimiter($newval)
399 Function: Get/set the column delimiter. This will in fact be
400 treated as a regular expression. Consecutive occurrences
401 will not be collapsed to a single one.
404 Returns : value of delimiter (a scalar)
405 Args : on set, new value (a scalar or undef, optional)
413 return $self->{'delimiter'} = shift if @_;
414 return $self->{'delimiter'};
419 Title : attribute_map
420 Usage : $obj->attribute_map($newval)
421 Function: Get/set the map of sequence object initialization
422 attributes (keys) to one-based column index.
424 Attributes will usually need to be prefixed by a dash, just
425 as if they were passed to the new() method of the sequence
429 Returns : value of attribute_map (a reference to a hash)
430 Args : on set, new value (a reference to a hash or undef, optional)
438 # internally we store zero-based maps - so we need to convert back
442 # allow for and protect against undef
443 return delete $self->{'_attribute_map'} unless defined($arg);
444 # copy to avoid side-effects
445 my $attr_map = {%$arg};
446 foreach my $key (keys %$attr_map) {
447 if ((!ref($attr_map->{$key})) && ($attr_map->{$key} =~ /^\d+$/)) {
451 $self->{'_attribute_map'} = $attr_map;
453 # there may not be a map
454 return unless exists($self->{'_attribute_map'});
455 # we need to copy in order not to override the stored map!
456 my %attr_map = %{$self->{'_attribute_map'}};
457 foreach my $key (keys %attr_map) {
458 if ((!ref($attr_map{$key})) && ($attr_map{$key} =~ /^\d+$/)) {
465 =head2 annotation_map
467 Title : annotation_map
468 Usage : $obj->annotation_map($newval)
469 Function: Get/set the mapping between one-based column indexes
470 (keys) and annotation tags (values).
472 Note that the map returned by this method may change after
473 the first next_seq() call if the file contains a column
474 header and no annotation keys have been predefined in the
475 map, because upon reading the column header line the tag
476 names will be set automatically.
478 Note also that the map may reference columns that are used
479 as well in the sequence attribute map.
482 Returns : value of annotation_map (a reference to a hash)
483 Args : on set, new value (a reference to a hash or undef, optional)
491 # internally we store zero-based maps - so we need to convert back
495 # allow for and protect against undef
496 return delete $self->{'_annotation_map'} unless defined($arg);
497 # copy to avoid side-effects
498 my $ann_map = {%$arg};
499 # make sure we sort the keys numerically or otherwise we may
500 # clobber a key with a higher index
501 foreach my $key (sort { $a <=> $b } keys(%$ann_map)) {
502 $ann_map->{$key-1} = $ann_map->{$key};
503 delete $ann_map->{$key};
505 $self->{'_annotation_map'} = $ann_map;
506 # also make a note that we want to keep annotation
507 $self->keep_annotation(1);
509 # there may not be a map
510 return unless exists($self->{'_annotation_map'});
511 # we need to copy in order not to override the stored map!
512 my %ann_map = %{$self->{'_annotation_map'}};
513 # here we need to sort numerically in reverse order ...
514 foreach my $key (sort { $b <=> $a } keys(%ann_map)) {
515 $ann_map{$key+1} = $ann_map{$key};
516 delete $ann_map{$key};
521 =head2 keep_annotation
523 Title : keep_annotation
524 Usage : $obj->keep_annotation($newval)
525 Function: Get/set flag whether or not to keep values from
526 additional columns as annotation.
528 Additional columns are all those columns in the input file
529 that aren't referenced in the attribute map.
532 Returns : value of keep_annotation (a scalar)
533 Args : on set, new value (a scalar or undef, optional)
541 return $self->{'keep_annotation'} = shift if @_;
542 return $self->{'keep_annotation'};
545 =head2 annotation_columns
547 Title : annotation_columns
548 Usage : $obj->annotation_columns($newval)
549 Function: Get/set the names (labels) of the columns to be used for
552 This is an alternative to using annotation_map. In order to
553 have any effect, it must be set before the first call of
554 next_seq(), and obviously there must be a header line (or
555 row) too giving the column labels.
558 Returns : value of annotation_columns (a reference to an array)
559 Args : on set, new value (a reference to an array of undef, optional)
564 sub annotation_columns
{
567 return $self->{'annotation_columns'} = shift if @_;
568 return $self->{'annotation_columns'};
574 Usage : $obj->trim_values($newval)
575 Function: Get/set whether or not to trim leading and trailing
576 whitespace off all column values.
578 Returns : value of trim_values (a scalar)
579 Args : on set, new value (a scalar or undef, optional)
587 return $self->{'trim_values'} = shift if @_;
588 return $self->{'trim_values'};
591 =head1 Internal methods
593 All methods with a leading underscore are not meant to be part of the
594 'official' API. They are for use by this module only, consider them
595 private unless you are a developer trying to modify this module.
599 =head2 _attribute_map
601 Title : _attribute_map
602 Usage : $obj->_attribute_map($newval)
603 Function: Get only. Same as attribute_map, but zero-based indexes.
605 Note that any changes made to the returned map will change
606 the map used by this instance. You should know what you are
607 doing if you modify the returned value (or if you call this
608 method in the first place).
611 Returns : value of _attribute_map (a reference to a hash)
620 return $self->{'_attribute_map'};
623 =head2 _annotation_map
625 Title : _annotation_map
626 Usage : $obj->_annotation_map($newval)
627 Function: Get only. Same as annotation_map, but with zero-based indexes.
629 Note that any changes made to the returned map will change
630 the map used by this instance. You should know what you are
631 doing if you modify the returned value (or if you call this
632 method in the first place).
635 Returns : value of _annotation_map (a reference to a hash)
644 return $self->{'_annotation_map'};
647 =head2 _header_skipped
649 Title : _header_skipped
650 Usage : $obj->_header_skipped($newval)
651 Function: Get/set the flag whether the header was already
652 read (and skipped) or not.
654 Returns : value of _header_skipped (a scalar)
655 Args : on set, new value (a scalar or undef, optional)
663 return $self->{'_header_skipped'} = shift if @_;
664 return $self->{'_header_skipped'};
671 Function: Navigates the underlying file to the next record.
673 For row-based records in delimited text files, this will
674 skip all empty lines and lines with a leading comment
677 This method is here is to serve as a hook for other formats
678 that conceptually also represent tables but aren't
679 formatted as row-based text files.
682 Returns : TRUE if the navigation was successful and FALSE
683 otherwise. Unsuccessful navigation will usually be treated
684 as an end-of-file condition.
693 my $cmtcc = $self->comment_char;
694 my $line = $self->_readline();
696 # skip until not a comment and not an empty line
697 while (defined($line)
698 && (($cmtcc && ($line =~ /^\s*$cmtcc/))
699 || ($line =~ /^\s*$/))) {
700 $line = $self->_readline();
703 return $self->{'_line'} = $line;
708 Title : _parse_header
710 Function: Parse the table header and navigate past it.
712 This method is called if the number of header rows has been
713 specified equal to or greater than one, and positioned at
714 the first header line (row). By default the first header
715 line (row) is used for setting column names, but additional
716 lines (rows) may be skipped too. Empty lines and comment
717 lines do not count as header lines (rows).
719 This method will call _next_record() to navigate to the
720 next header line (row), if there is more than one header
721 line (row). Upon return, the file is presumed to be
722 positioned at the first record after the header.
724 This method is here is to serve as a hook for other formats
725 that conceptually also represent tables but aren't
726 formatted as row-based text files.
728 Note however that the only methods used to access file
729 content or navigate the position are _get_row_values() and
730 _next_record(), so it should usually suffice to override
734 Returns : TRUE if navigation past the header was successful and FALSE
735 otherwise. Unsuccessful navigation will usually be treated
736 as an end-of-file condition.
745 # the first header line contains the column headers, see whether
747 if ($self->keep_annotation) {
748 my @colnames = $self->_get_row_values();
749 # trim leading and trailing whitespace if desired
750 if ($self->trim_values) {
751 # trim off whitespace
752 @colnames = map { $_ =~ s/^\s+//; $_ =~ s/\s+$//; $_; } @colnames;
753 # trim off double quotes
754 @colnames = map { $_ =~ s/^"//; $_ =~ s/"$//; $_; } @colnames;
756 # build or complete annotation column map
757 my $annmap = $self->annotation_map || {};
759 # check whether columns have been defined by name rather than index
760 if (my $anncols = $self->annotation_columns) {
761 # first sanity check: all column names must map
762 my %colmap = map { ($_,1); } @colnames;
763 foreach my $col (@
$anncols) {
764 if (!exists($colmap{$col})) {
765 $self->throw("no such column labeled '$col'");
768 # now map to the column indexes
769 %colmap = map { ($_,1); } @
$anncols;
770 for (my $i = 0; $i < scalar(@colnames); $i++) {
771 if (exists($colmap{$colnames[$i]})) {
772 $annmap->{$i+1} = $colnames[$i];
776 # no columns specified, default to all non-attribute columns
777 for (my $i = 0; $i < scalar(@colnames); $i++) {
778 $annmap->{$i+1} = $colnames[$i];
780 # subtract all attribute-referenced columns
781 foreach my $attrcol (values %{$self->attribute_map}) {
782 if ((!ref($attrcol)) && ($attrcol =~ /^\d+$/)) {
783 delete $annmap->{$attrcol};
788 # fill in where the tag names weren't pre-defined
789 for (my $i = 0; $i < scalar(@colnames); $i++) {
790 if (exists($annmap->{$i+1}) && ! defined($annmap->{$i+1})) {
791 $annmap->{$i+1} = $colnames[$i];
795 $self->annotation_map($annmap);
798 # now read past the header
799 my $header_lines = $self->header;
801 while (defined($line_ok) && ($header_lines > 0)) {
802 $line_ok = $self->_next_record();
809 =head2 _get_row_values
811 Title : _get_row_values
813 Function: Get the values for the current line (or row) as an array in
814 the order of columns.
816 This method is here is to serve as a hook for other formats
817 that conceptually also represent tables but aren't
818 formatted as row-based text files.
821 Returns : An array of column values for the current row.
829 my $delim = $self->delimiter;
830 my $line = $self->{'_line'};
832 my @cols = split(/$delim/,$line);