3 # BioPerl module for Bio::Seq::Meta
5 # Cared for by Heikki Lehvaslaiho
7 # Copyright Heikki Lehvaslaiho
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Seq::Meta - Generic superclass for sequence objects with
16 residue-based meta information
20 use Bio::LocatableSeq;
22 use Bio::Tools::OddCodes;
25 my $seq = Bio::Seq::Meta->new(-id=>'test',
30 -verbose=>1, # to see warnings
33 # the existing sequence object can be a Bio::PrimarySeq, too
35 # to test this is a meta seq object
36 $seq->isa("Bio::Seq::Meta")
37 || $seq->throw("$seq is not a Bio::Seq::Meta");
40 $seq->meta('1234567890');
41 $seq = Bio::Seq::Meta->new(-id=>'test',
47 -verbose=>1, # to see warnings
51 $string = $seq->meta_text();
52 $substring = $seq->submeta_text(2,5);
53 $unique_key = $seq->accession_number();
55 # storing output from Bio::Tools::OddCodes as meta data
56 my $protcodes = Bio::Tools::OddCodes->new(-seq => $seq);
57 my @codes = qw(structural chemical functional charge hydrophobic);
58 map { $seq->named_meta($_, ${$protcodes->$_($seq) } )} @codes;
60 my $out = Bio::SeqIO->new(-format=>'metafasta');
61 $out->write_seq($seq);
65 This class implements generic methods for sequences with residue-based
66 meta information. Meta sequences with meta data are Bio::LocatableSeq
67 objects with additional methods to store that meta information. See
68 L<Bio::LocatableSeq> and L<Bio::Seq::MetaI>.
70 The meta information in this class is always one character per residue
71 long and blank values are space characters (ASCII 32).
73 After the latest rewrite, the meta information no longer covers all
74 the residues automatically. Methods to check the length of meta
75 information (L<meta_length>)and to see if the ends are flushed to the
76 sequence have been added (L<is_flush>). To force the old
77 functionality, set L<force_flush> to true.
79 It is assumed that meta data values do not depend on the nucleotide
80 sequence strand value.
82 Application specific implementations should inherit from this class to
83 override and add to these methods.
85 L<Bio::Seq::Meta::Array> allows for more complex meta values (scalars
86 or objects) to be used.
90 Character based meta data is read and set by method meta() and its
91 variants. These are the suffixes and prefixes used in the variants:
93 [named_] [sub] meta [_text]
99 Suffix B<_text> guaranties that output is a string. Note that it does
102 In this implementation, the output is always text, so these methods
107 Prefix B<sub>, like in subseq(), means that the method applies to sub
108 region of the sequence range and takes start and end as arguments.
109 Unlike subseq(), these methods are able to set values. If the range
110 is not defined, it defaults to the complete sequence.
114 Prefix B<named_> in method names allows the used to attach multiple
115 meta strings to one sequence by explicitly naming them. The name is
116 always the first argument to the method. The "unnamed" methods use the
117 class wide default name for the meta data and are thus special cases
120 Note that internally names are keys in a hash and any misspelling of a
121 name will silently store the data under a wrong name. The used names
122 (keys) can be retrieved using method meta_names(). See L<meta_names>.
128 This Bio::Seq::MetaI implementation inherits from Bio::LocatableSeq, which
129 itself inherits from Bio::PrimarySeq. It is not a Bio::SeqI, so bless-ing
130 objects of this class into a Bio::SeqI or vice versa and will not work as
131 expected (see bug 2262). This may be addressed in a future refactor of
137 L<Bio::LocatableSeq>,
139 L<Bio::Seq::Meta::Array>
145 User feedback is an integral part of the evolution of this and other
146 Bioperl modules. Send your comments and suggestions preferably to one
147 of the Bioperl mailing lists. Your participation is much appreciated.
149 bioperl-l@bioperl.org - General discussion
150 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
152 =head2 Reporting Bugs
154 Report bugs to the Bioperl bug tracking system to help us keep track
155 the bugs and their resolution. Bug reports can be submitted via the
158 http://bugzilla.open-bio.org/
160 =head1 AUTHOR - Heikki Lehvaslaiho
162 Email heikki-at-bioperl-dot-org
166 Chad Matsalla, bioinformatics@dieselwurks.com
168 Aaron Mackey, amackey@virginia.edu
172 The rest of the documentation details each of the object methods.
173 Internal methods are usually preceded with a _
178 # Let the code begin...
181 package Bio
::Seq
::Meta
;
182 use vars
qw($DEFAULT_NAME $GAP $META_GAP);
185 #use overload '""' => \&to_string;
187 use base qw(Bio::LocatableSeq Bio::Seq::MetaI);
192 $DEFAULT_NAME = 'DEFAULT';
200 Usage : $metaseq = Bio::Seq::Meta->new
201 ( -meta => 'aaaaaaaabbbbbbbb',
202 -seq => 'TKLMILVSHIVILSRM'
204 -accession_number => 'S000012',
206 Function: Constructor for Bio::Seq::Meta class, meta data being in a
207 string. Note that you can provide an empty quality string.
208 Returns : a new Bio::Seq::Meta object
214 my ($class, @args) = @_;
216 my $self = $class->SUPER::new
(@args);
218 my($meta, $forceflush, $nm) =
219 $self->_rearrange([qw(META
224 #$self->{'_meta'} = {};
225 $self->{'_meta'}->{$DEFAULT_NAME} = "";
227 $meta && $self->meta($meta);
228 if ($nm && ref($nm) eq 'HASH') {
229 while (my ($name, $meta) = each %$nm) {
230 $self->named_meta($name, $meta);
233 $forceflush && $self->force_flush($forceflush);
242 Usage : $meta_values = $obj->meta($values_string);
245 Get and set method for the meta data starting from residue
246 position one. Since it is dependent on the length of the
247 sequence, it needs to be manipulated after the sequence.
249 The length of the returned value always matches the length
250 of the sequence, if force_flush() is set. See L<force_flush>.
252 Returns : meta data in a string
253 Args : new value, string, optional
258 shift->named_meta($DEFAULT_NAME, shift);
264 Usage : $meta_values = $obj->meta_text($values_arrayref);
265 Function: Variant of meta() guarantied to return a textual
266 representation of meta data. For details, see L<meta>.
268 Args : new value, optional
279 Usage : $meta_values = $obj->named_meta($name, $values_arrayref);
280 Function: A more general version of meta(). Each meta data set needs
281 to be named. See also L<meta_names>.
283 Args : scalar, name of the meta data set
289 my ($self, $name, $value) = @_;
291 $name ||= $DEFAULT_NAME;
292 if( defined $value) {
294 $self->throw("I need a scalar value, not [". ref($value). "]")
298 my $diff = $self->length - CORE
::length($value);
300 $value .= (" " x
$diff);
303 $self->{'_meta'}->{$name} = $value;
305 #$self->_test_gap_positions($name) if $self->verbose > 0;
308 return " " x
$self->length
309 if $self->force_flush && not defined $self->{'_meta'}->{$name};
312 $self->_do_flush if $self->force_flush;
314 return $self->{'_meta'}->{$name};
317 =head2 _test_gap_positions
319 Title : _test_gap_positions
320 Usage : $meta_values = $obj->_test_gap_positions($name);
321 Function: Internal test for correct position of gap characters.
322 Gap being only '-' this time.
324 This method is called from named_meta() when setting meta
325 data but only if verbose is positive as this can be an
326 expensive process on very long sequences. Set verbose(1) to
327 see warnings when gaps do not align in sequence and meta
328 data and turn them into errors by setting verbose(2).
330 Returns : true on success, prints warnings
335 sub _test_gap_positions
{
340 $self->seq || return $success;
341 my $len = CORE
::length($self->seq);
342 for (my $i=0; $i < $len; $i++) {
343 my $s = substr $self->{seq
}, $i, 1;
344 my $m = substr $self->{_meta
}->{$name}, $i, 1;
345 $self->warn("Gap mismatch [$m/$s] in column [". ($i+1). "] of [$name] meta data in seq [". $self->id. "]")
347 if ($s eq $META_GAP) && $s ne $m;
352 =head2 named_meta_text
354 Title : named_meta_text()
355 Usage : $meta_values = $obj->named_meta_text($name, $values_arrayref);
356 Function: Variant of named_meta() guarantied to return a textual
357 representation of the named meta data.
358 For details, see L<meta>.
360 Args : scalar, name of the meta data set
365 sub named_meta_text
{
366 shift->named_meta(@_);
372 Usage : $subset_of_meta_values = $obj->submeta(10, 20, $value_string);
373 $subset_of_meta_values = $obj->submeta(10, undef, $value_string);
376 Get and set method for meta data for subsequences.
378 Numbering starts from 1 and the number is inclusive, ie 1-2
379 are the first two residue of the sequence. Start cannot be
380 larger than end but can be equal.
382 If the second argument is missing the returned values
383 should extend to the end of the sequence.
385 The return value may be a string or an array reference,
386 depending on the implementation. If in doubt, use
387 submeta_text() which is a variant guarantied to return a
388 string. See L<submeta_text>.
390 Returns : A reference to an array or a string
391 Args : integer, start position
392 integer, end position, optional when a third argument present
398 shift->named_submeta($DEFAULT_NAME, @_);
404 Usage : $meta_values = $obj->submeta_text(20, $value_string);
405 Function: Variant of submeta() guarantied to return a textual
406 representation of meta data. For details, see L<meta>.
408 Args : new value, optional
419 Title : named_submeta
420 Usage : $subset_of_meta_values = $obj->named_submeta($name, 10, 20, $value_string);
421 $subset_of_meta_values = $obj->named_submeta($name, 10);
422 Function: Variant of submeta() guarantied to return a textual
423 representation of meta data. For details, see L<meta>.
424 Returns : A reference to an array or a string
425 Args : scalar, name of the meta data set
426 integer, start position
427 integer, end position, optional when a third argument present
433 my ($self, $name, $start, $end, $value) = @_;
435 $name ||= $DEFAULT_NAME;
439 $start =~ /^[+]?\d+$/ and $start > 0 or
440 $self->throw("Need at least a positive integer start value");
443 $end ||= $start+length($value)-1;
444 $self->warn("You are setting meta values beyond the length of the sequence\n".
445 "[$start > ". length($self->seq)."] in sequence ". $self->id)
446 if $start > length $self->seq;
448 # pad meta data if needed
449 $self->{_meta
}->{$name} = () unless defined $self->{_meta
}->{$name};
450 if (length($self->{_meta
}->{$name}) < $start) {
451 $self->{'_meta'}->{$name} .= " " x
( $start - length($self->{'_meta'}->{$name}) -1);
455 $tail = substr ($self->{_meta
}->{$name}, $start-1+length($value))
456 if length($self->{_meta
}->{$name}) >= $start-1+length($value);
458 substr ($self->{_meta
}->{$name}, --$start) = $value;
459 $self->{_meta
}->{$name} .= $tail;
461 return substr ($self->{_meta
}->{$name}, $start, $end - $start + 1);
465 $end or $end = length $self->seq;
467 # pad meta data if needed
468 if (length($self->{_meta
}->{$name}) < $end) {
469 $self->{'_meta'}->{$name} .= " " x
( $start - length($self->{'_meta'}->{$name}));
472 return substr ($self->{_meta
}->{$name}, $start-1, $end - $start + 1)
477 =head2 named_submeta_text
479 Title : named_submeta_text
480 Usage : $meta_values = $obj->named_submeta_text($name, 20, $value_string);
481 Function: Variant of submeta() guarantied to return a textual
482 representation of meta data. For details, see L<meta>.
484 Args : scalar, name of the meta data
485 Args : integer, start position, optional
486 integer, end position, optional
491 sub named_submeta_text
{
492 shift->named_submeta(@_);
498 Usage : @meta_names = $obj->meta_names()
499 Function: Retrieves an array of meta data set names. The default
500 (unnamed) set name is guarantied to be the first name.
501 Returns : an array of names
510 foreach ( sort keys %{$self->{'_meta'}} ) {
511 push (@r, $_) unless $_ eq $DEFAULT_NAME;
513 unshift @r, $DEFAULT_NAME if $self->{'_meta'}->{$DEFAULT_NAME};
520 Title : meta_length()
521 Usage : $meeta_len = $obj->meta_length();
522 Function: return the number of elements in the meta set
530 return $self->named_meta_length($DEFAULT_NAME);
534 =head2 named_meta_length
536 Title : named_meta_length()
537 Usage : $meta_len = $obj->named_meta_length($name);
538 Function: return the number of elements in the named meta set
544 sub named_meta_length
{
545 my ($self, $name) = @_;
546 $name ||= $DEFAULT_NAME;
547 return length ($self->{'_meta'}->{$name});
553 Title : force_flush()
554 Usage : $force_flush = $obj->force_flush(1);
555 Function: Automatically pad with empty values or truncate meta values
556 to sequence length. Not done by default.
557 Returns : boolean 1 or 0
558 Args : optional boolean value
560 Note that if you turn this forced padding off, the previously padded
561 values are not changed.
566 my ($self, $value) = @_;
568 if (defined $value) {
570 $self->{force_flush
} = 1;
573 $self->{force_flush
} = 0;
576 return $self->{force_flush
};
584 Function: internal method to do the force that meta values are same
585 length as the sequence . Called from L<force_flush>
595 foreach my $name ( ('DEFAULT', $self->meta_names) ) {
598 if ($self->length > $self->named_meta_length($name)) {
599 $self->{'_meta'}->{$name} .= $META_GAP x
($self->length - $self->named_meta_length($name)) ;
602 elsif ( $self->length < $self->named_meta_length($name) ) {
603 $self->{_meta
}->{$name} = substr($self->{_meta
}->{$name}, 0, $self->length-1);
613 Usage : $is_flush = $obj->is_flush()
614 or $is_flush = $obj->is_flush($my_meta_name)
615 Function: Boolean to tell if all meta values are in
616 flush with the sequence length.
617 Returns true if force_flush() is set
618 Set verbosity to a positive value to see failed meta sets
619 Returns : boolean 1 or 0
620 Args : optional name of the meta set
627 my ($self, $name) = shift;
629 return 1 if $self->force_flush;
635 $sticky .= "$name " if $self->length != $self->named_meta_length($name);
637 foreach my $m ($self->meta_names) {
638 $sticky .= "$m " if $self->length != $self->named_meta_length($m);
643 print "These meta set are not flush: $sticky\n" if $self->verbose;
651 =head1 Bio::PrimarySeqI methods
656 Usage : $newseq = $seq->revcom();
657 Function: Produces a new Bio::Seq::MetaI implementing object where
658 the order of residues and their meta information is reversed.
659 Returns : A new (fresh) Bio::Seq::Meta object
661 Throws : if the object returns false on is_flush()
663 Note: The method does nothing to meta values, it reorders them, only.
670 $self->throw("Can not get a reverse complement. The object is not flush.")
671 unless $self->is_flush;
673 my $new = $self->SUPER::revcom
;
674 foreach (keys %{$self->{_meta
}}) {
675 $new->named_meta($_, scalar reverse $self->{_meta
}->{$_} );
683 Usage : $subseq = $seq->trunc(10,100);
684 Function: Provides a truncation of a sequence together with meta data
685 Returns : a fresh Bio::Seq::Meta implementing object
686 Args : Two integers denoting first and last residue of the sub-sequence.
691 my ($self, $start, $end) = @_;
694 $start =~ /^[+]?\d+$/ and $start > 0 or
695 $self->throw("Need at least a positive integer start value as start");
696 $end =~ /^[+]?\d+$/ and $end > 0 or
697 $self->throw("Need at least a positive integer start value as end");
699 $self->throw("End position has to be larger or equal to start");
700 $end <= $self->length or
701 $self->throw("End position can not be larger than sequence length");
703 my $new = $self->SUPER::trunc
($start, $end);
705 foreach (keys %{$self->{_meta
}}) {
707 substr($self->{_meta
}->{$_}, $start, $end - $start)
716 my $out = Bio
::SeqIO
->new(-format
=>'metafasta');
717 $out->write_seq($self);