2 # BioPerl module for Bio::Seq::Meta
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Seq::Meta - Generic superclass for sequence objects with
17 residue-based meta information
21 use Bio::LocatableSeq;
23 use Bio::Tools::OddCodes;
26 my $seq = Bio::Seq::Meta->new(-id=>'test',
31 -verbose=>1, # to see warnings
34 # the existing sequence object can be a Bio::PrimarySeq, too
36 # to test this is a meta seq object
37 $seq->isa("Bio::Seq::Meta")
38 || $seq->throw("$seq is not a Bio::Seq::Meta");
41 $seq->meta('1234567890');
42 $seq = Bio::Seq::Meta->new(-id=>'test',
48 -verbose=>1, # to see warnings
52 $string = $seq->meta_text();
53 $substring = $seq->submeta_text(2,5);
54 $unique_key = $seq->accession_number();
56 # storing output from Bio::Tools::OddCodes as meta data
57 my $protcodes = Bio::Tools::OddCodes->new(-seq => $seq);
58 my @codes = qw(structural chemical functional charge hydrophobic);
59 map { $seq->named_meta($_, ${$protcodes->$_($seq) } )} @codes;
61 my $out = Bio::SeqIO->new(-format=>'metafasta');
62 $out->write_seq($seq);
66 This class implements generic methods for sequences with residue-based
67 meta information. Meta sequences with meta data are Bio::LocatableSeq
68 objects with additional methods to store that meta information. See
69 L<Bio::LocatableSeq> and L<Bio::Seq::MetaI>.
71 The meta information in this class is always one character per residue
72 long and blank values are space characters (ASCII 32).
74 After the latest rewrite, the meta information no longer covers all
75 the residues automatically. Methods to check the length of meta
76 information (L<meta_length>)and to see if the ends are flushed to the
77 sequence have been added (L<is_flush>). To force the old
78 functionality, set L<force_flush> to true.
80 It is assumed that meta data values do not depend on the nucleotide
81 sequence strand value.
83 Application specific implementations should inherit from this class to
84 override and add to these methods.
86 L<Bio::Seq::Meta::Array> allows for more complex meta values (scalars
87 or objects) to be used.
91 Character based meta data is read and set by method meta() and its
92 variants. These are the suffixes and prefixes used in the variants:
94 [named_] [sub] meta [_text]
100 Suffix B<_text> guaranties that output is a string. Note that it does
103 In this implementation, the output is always text, so these methods
108 Prefix B<sub>, like in subseq(), means that the method applies to sub
109 region of the sequence range and takes start and end as arguments.
110 Unlike subseq(), these methods are able to set values. If the range
111 is not defined, it defaults to the complete sequence.
115 Prefix B<named_> in method names allows the used to attach multiple
116 meta strings to one sequence by explicitly naming them. The name is
117 always the first argument to the method. The "unnamed" methods use the
118 class wide default name for the meta data and are thus special cases
121 Note that internally names are keys in a hash and any misspelling of a
122 name will silently store the data under a wrong name. The used names
123 (keys) can be retrieved using method meta_names(). See L<meta_names>.
129 This Bio::Seq::MetaI implementation inherits from Bio::LocatableSeq, which
130 itself inherits from Bio::PrimarySeq. It is not a Bio::SeqI, so bless-ing
131 objects of this class into a Bio::SeqI or vice versa and will not work as
132 expected (see bug 2262). This may be addressed in a future refactor of
138 L<Bio::LocatableSeq>,
140 L<Bio::Seq::Meta::Array>
146 User feedback is an integral part of the evolution of this and other
147 Bioperl modules. Send your comments and suggestions preferably to one
148 of the Bioperl mailing lists. Your participation is much appreciated.
150 bioperl-l@bioperl.org - General discussion
151 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
155 Please direct usage questions or support issues to the mailing list:
157 I<bioperl-l@bioperl.org>
159 rather than to the module maintainer directly. Many experienced and
160 reponsive experts will be able look at the problem and quickly
161 address it. Please include a thorough description of the problem
162 with code and data examples if at all possible.
164 =head2 Reporting Bugs
166 Report bugs to the Bioperl bug tracking system to help us keep track
167 the bugs and their resolution. Bug reports can be submitted via the
170 https://github.com/bioperl/bioperl-live/issues
172 =head1 AUTHOR - Heikki Lehvaslaiho
174 Email heikki-at-bioperl-dot-org
178 Chad Matsalla, bioinformatics@dieselwurks.com
180 Aaron Mackey, amackey@virginia.edu
184 The rest of the documentation details each of the object methods.
185 Internal methods are usually preceded with a _
190 # Let the code begin...
193 package Bio
::Seq
::Meta
;
194 use vars
qw($DEFAULT_NAME $GAP $META_GAP);
197 #use overload '""' => \&to_string;
199 use base qw(Bio::LocatableSeq Bio::Seq::MetaI);
204 $DEFAULT_NAME = 'DEFAULT';
212 Usage : $metaseq = Bio::Seq::Meta->new
213 ( -meta => 'aaaaaaaabbbbbbbb',
214 -seq => 'TKLMILVSHIVILSRM'
216 -accession_number => 'S000012',
218 Function: Constructor for Bio::Seq::Meta class, meta data being in a
219 string. Note that you can provide an empty quality string.
220 Returns : a new Bio::Seq::Meta object
226 my ($class, @args) = @_;
228 my $self = $class->SUPER::new
(@args);
230 my($meta, $forceflush, $nm) =
231 $self->_rearrange([qw(META
236 #$self->{'_meta'} = {};
237 $self->{'_meta'}->{$DEFAULT_NAME} = "";
239 $meta && $self->meta($meta);
240 if ($nm && ref($nm) eq 'HASH') {
241 while (my ($name, $meta) = each %$nm) {
242 $self->named_meta($name, $meta);
245 $forceflush && $self->force_flush($forceflush);
254 Usage : $meta_values = $obj->meta($values_string);
257 Get and set method for the meta data starting from residue
258 position one. Since it is dependent on the length of the
259 sequence, it needs to be manipulated after the sequence.
261 The length of the returned value always matches the length
262 of the sequence, if force_flush() is set. See L<force_flush>.
264 Returns : meta data in a string
265 Args : new value, string, optional
270 shift->named_meta($DEFAULT_NAME, shift);
276 Usage : $meta_values = $obj->meta_text($values_arrayref);
277 Function: Variant of meta() guarantied to return a textual
278 representation of meta data. For details, see L<meta>.
280 Args : new value, optional
291 Usage : $meta_values = $obj->named_meta($name, $values_arrayref);
292 Function: A more general version of meta(). Each meta data set needs
293 to be named. See also L<meta_names>.
295 Args : scalar, name of the meta data set
301 my ($self, $name, $value) = @_;
303 $name ||= $DEFAULT_NAME;
304 if( defined $value) {
306 $self->throw("I need a scalar value, not [". ref($value). "]")
310 my $diff = $self->length - CORE
::length($value);
312 $value .= (" " x
$diff);
315 $self->{'_meta'}->{$name} = $value;
317 #$self->_test_gap_positions($name) if $self->verbose > 0;
320 return " " x
$self->length
321 if $self->force_flush && not defined $self->{'_meta'}->{$name};
324 $self->_do_flush if $self->force_flush;
326 return $self->{'_meta'}->{$name};
329 =head2 _test_gap_positions
331 Title : _test_gap_positions
332 Usage : $meta_values = $obj->_test_gap_positions($name);
333 Function: Internal test for correct position of gap characters.
334 Gap being only '-' this time.
336 This method is called from named_meta() when setting meta
337 data but only if verbose is positive as this can be an
338 expensive process on very long sequences. Set verbose(1) to
339 see warnings when gaps do not align in sequence and meta
340 data and turn them into errors by setting verbose(2).
342 Returns : true on success, prints warnings
347 sub _test_gap_positions
{
352 $self->seq || return $success;
353 my $len = CORE
::length($self->seq);
354 for (my $i=0; $i < $len; $i++) {
355 my $s = substr $self->{seq
}, $i, 1;
356 my $m = substr $self->{_meta
}->{$name}, $i, 1;
357 $self->warn("Gap mismatch [$m/$s] in column [". ($i+1). "] of [$name] meta data in seq [". $self->id. "]")
359 if ($s eq $META_GAP) && $s ne $m;
364 =head2 named_meta_text
366 Title : named_meta_text()
367 Usage : $meta_values = $obj->named_meta_text($name, $values_arrayref);
368 Function: Variant of named_meta() guarantied to return a textual
369 representation of the named meta data.
370 For details, see L<meta>.
372 Args : scalar, name of the meta data set
377 sub named_meta_text
{
378 shift->named_meta(@_);
384 Usage : $subset_of_meta_values = $obj->submeta(10, 20, $value_string);
385 $subset_of_meta_values = $obj->submeta(10, undef, $value_string);
388 Get and set method for meta data for subsequences.
390 Numbering starts from 1 and the number is inclusive, ie 1-2
391 are the first two residue of the sequence. Start cannot be
392 larger than end but can be equal.
394 If the second argument is missing the returned values
395 should extend to the end of the sequence.
397 The return value may be a string or an array reference,
398 depending on the implementation. If in doubt, use
399 submeta_text() which is a variant guarantied to return a
400 string. See L<submeta_text>.
402 Returns : A reference to an array or a string
403 Args : integer, start position
404 integer, end position, optional when a third argument present
410 shift->named_submeta($DEFAULT_NAME, @_);
416 Usage : $meta_values = $obj->submeta_text(20, $value_string);
417 Function: Variant of submeta() guarantied to return a textual
418 representation of meta data. For details, see L<meta>.
420 Args : new value, optional
431 Title : named_submeta
432 Usage : $subset_of_meta_values = $obj->named_submeta($name, 10, 20, $value_string);
433 $subset_of_meta_values = $obj->named_submeta($name, 10);
434 Function: Variant of submeta() guarantied to return a textual
435 representation of meta data. For details, see L<meta>.
436 Returns : A reference to an array or a string
437 Args : scalar, name of the meta data set
438 integer, start position
439 integer, end position, optional when a third argument present
445 my ($self, $name, $start, $end, $value) = @_;
447 $name ||= $DEFAULT_NAME;
451 $start =~ /^[+]?\d+$/ and $start > 0 or
452 $self->throw("Need at least a positive integer start value");
455 $end ||= $start+length($value)-1;
456 $self->warn("You are setting meta values beyond the length of the sequence\n".
457 "[$start > ". length($self->seq)."] in sequence ". $self->id)
458 if $start > length $self->seq;
460 # pad meta data if needed
461 $self->{_meta
}->{$name} = () unless defined $self->{_meta
}->{$name};
462 if (length($self->{_meta
}->{$name}) < $start) {
463 $self->{'_meta'}->{$name} .= " " x
( $start - length($self->{'_meta'}->{$name}) -1);
467 $tail = substr ($self->{_meta
}->{$name}, $start-1+length($value))
468 if length($self->{_meta
}->{$name}) >= $start-1+length($value);
470 substr ($self->{_meta
}->{$name}, --$start) = $value;
471 $self->{_meta
}->{$name} .= $tail;
473 return substr ($self->{_meta
}->{$name}, $start, $end - $start + 1);
477 $end or $end = length $self->seq;
479 # pad meta data if needed
480 if (length($self->{_meta
}->{$name}) < $end) {
481 $self->{'_meta'}->{$name} .= " " x
( $start - length($self->{'_meta'}->{$name}));
484 return substr ($self->{_meta
}->{$name}, $start-1, $end - $start + 1)
489 =head2 named_submeta_text
491 Title : named_submeta_text
492 Usage : $meta_values = $obj->named_submeta_text($name, 20, $value_string);
493 Function: Variant of submeta() guarantied to return a textual
494 representation of meta data. For details, see L<meta>.
496 Args : scalar, name of the meta data
497 Args : integer, start position, optional
498 integer, end position, optional
503 sub named_submeta_text
{
504 shift->named_submeta(@_);
510 Usage : @meta_names = $obj->meta_names()
511 Function: Retrieves an array of meta data set names. The default
512 (unnamed) set name is guarantied to be the first name.
513 Returns : an array of names
522 foreach ( sort keys %{$self->{'_meta'}} ) {
523 push (@r, $_) unless $_ eq $DEFAULT_NAME;
525 unshift @r, $DEFAULT_NAME if $self->{'_meta'}->{$DEFAULT_NAME};
532 Title : meta_length()
533 Usage : $meeta_len = $obj->meta_length();
534 Function: return the number of elements in the meta set
542 return $self->named_meta_length($DEFAULT_NAME);
546 =head2 named_meta_length
548 Title : named_meta_length()
549 Usage : $meta_len = $obj->named_meta_length($name);
550 Function: return the number of elements in the named meta set
556 sub named_meta_length
{
557 my ($self, $name) = @_;
558 $name ||= $DEFAULT_NAME;
559 return length ($self->{'_meta'}->{$name});
565 Title : force_flush()
566 Usage : $force_flush = $obj->force_flush(1);
567 Function: Automatically pad with empty values or truncate meta values
568 to sequence length. Not done by default.
569 Returns : boolean 1 or 0
570 Args : optional boolean value
572 Note that if you turn this forced padding off, the previously padded
573 values are not changed.
578 my ($self, $value) = @_;
580 if (defined $value) {
582 $self->{force_flush
} = 1;
585 $self->{force_flush
} = 0;
589 return $self->{force_flush
};
597 Function: internal method to do the force that meta values are same
598 length as the sequence . Called from L<force_flush>
608 foreach my $name ( ('DEFAULT', $self->meta_names) ) {
611 if ($self->length > $self->named_meta_length($name)) {
612 $self->{'_meta'}->{$name} .= $META_GAP x
($self->length - $self->named_meta_length($name)) ;
615 elsif ( $self->length < $self->named_meta_length($name) ) {
616 $self->{_meta
}->{$name} = substr($self->{_meta
}->{$name}, 0, $self->length-1);
626 Usage : $is_flush = $obj->is_flush()
627 or $is_flush = $obj->is_flush($my_meta_name)
628 Function: Boolean to tell if all meta values are in
629 flush with the sequence length.
630 Returns true if force_flush() is set
631 Set verbosity to a positive value to see failed meta sets
632 Returns : boolean 1 or 0
633 Args : optional name of the meta set
639 my ($self, $name) = shift;
641 return 1 if $self->force_flush;
647 $sticky .= "$name " if $self->length != $self->named_meta_length($name);
649 foreach my $m ($self->meta_names) {
650 $sticky .= "$m " if ($self->named_meta_length($m) > 0) && ($self->length != $self->named_meta_length($m));
655 print "These meta set are not flush: $sticky\n" if $self->verbose;
663 =head1 Bio::PrimarySeqI methods
668 Usage : $newseq = $seq->revcom();
669 Function: Produces a new Bio::Seq::MetaI implementing object where
670 the order of residues and their meta information is reversed.
671 Returns : A new (fresh) Bio::Seq::Meta object
673 Throws : if the object returns false on is_flush()
675 Note: The method does nothing to meta values, it reorders them, only.
682 $self->throw("Can not get a reverse complement. The object is not flush.")
683 unless $self->is_flush;
685 my $new = $self->SUPER::revcom
;
686 foreach (keys %{$self->{_meta
}}) {
687 $new->named_meta($_, scalar reverse $self->{_meta
}->{$_} );
695 Usage : $subseq = $seq->trunc(10,100);
696 Function: Provides a truncation of a sequence together with meta data
697 Returns : a fresh Bio::Seq::Meta implementing object
698 Args : Two integers denoting first and last residue of the sub-sequence.
703 my ($self, $start, $end) = @_;
706 $start =~ /^[+]?\d+$/ and $start > 0 or
707 $self->throw("Need at least a positive integer start value as start");
708 $end =~ /^[+]?\d+$/ and $end > 0 or
709 $self->throw("Need at least a positive integer start value as end");
711 $self->throw("End position has to be larger or equal to start");
712 $end <= $self->length or
713 $self->throw("End position can not be larger than sequence length");
715 my $new = $self->SUPER::trunc
($start, $end);
717 foreach (keys %{$self->{_meta
}}) {
719 substr($self->{_meta
}->{$_}, $start, $end - $start)
728 my $out = Bio
::SeqIO
->new(-format
=>'metafasta');
729 $out->write_seq($self);