[bug 2450]
[bioperl-live.git] / Bio / Seq / Meta.pm
blob4c262b9415703701b775f6b5a272bec787675a59
1 # $Id$
3 # BioPerl module for Bio::Seq::Meta
5 # Cared for by Heikki Lehvaslaiho
7 # Copyright Heikki Lehvaslaiho
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Seq::Meta - Generic superclass for sequence objects with
16 residue-based meta information
18 =head1 SYNOPSIS
20 use Bio::LocatableSeq;
21 use Bio::Seq::Meta;
22 use Bio::Tools::OddCodes;
23 use Bio::SeqIO;
25 my $seq = Bio::Seq::Meta->new(-id=>'test',
26 -seq=>'ACTGCTAGCT',
27 -start=>2434,
28 -end=>2443,
29 -strand=>1,
30 -verbose=>1, # to see warnings
33 # the existing sequence object can be a Bio::PrimarySeq, too
35 # to test this is a meta seq object
36 $seq->isa("Bio::Seq::Meta")
37 || $seq->throw("$seq is not a Bio::Seq::Meta");
40 $seq->meta('1234567890');
41 $seq = Bio::Seq::Meta->new(-id=>'test',
42 -seq=>'HACILMIFGT',
43 -start=>2434,
44 -end=>2443,
45 -strand=>1,
46 -meta=>'1234567890',
47 -verbose=>1, # to see warnings
50 # accessors
51 $string = $seq->meta_text();
52 $substring = $seq->submeta_text(2,5);
53 $unique_key = $seq->accession_number();
55 # storing output from Bio::Tools::OddCodes as meta data
56 my $protcodes = Bio::Tools::OddCodes->new(-seq => $seq);
57 my @codes = qw(structural chemical functional charge hydrophobic);
58 map { $seq->named_meta($_, ${$protcodes->$_($seq) } )} @codes;
60 my $out = Bio::SeqIO->new(-format=>'metafasta');
61 $out->write_seq($seq);
63 =head1 DESCRIPTION
65 This class implements generic methods for sequences with residue-based
66 meta information. Meta sequences with meta data are Bio::LocatableSeq
67 objects with additional methods to store that meta information. See
68 L<Bio::LocatableSeq> and L<Bio::Seq::MetaI>.
70 The meta information in this class is always one character per residue
71 long and blank values are space characters (ASCII 32).
73 After the latest rewrite, the meta information no longer covers all
74 the residues automatically. Methods to check the length of meta
75 information (L<meta_length>)and to see if the ends are flushed to the
76 sequence have been added (L<is_flush>). To force the old
77 functionality, set L<force_flush> to true.
79 It is assumed that meta data values do not depend on the nucleotide
80 sequence strand value.
82 Application specific implementations should inherit from this class to
83 override and add to these methods.
85 L<Bio::Seq::Meta::Array> allows for more complex meta values (scalars
86 or objects) to be used.
88 =head2 Method naming
90 Character based meta data is read and set by method meta() and its
91 variants. These are the suffixes and prefixes used in the variants:
93 [named_] [sub] meta [_text]
95 =over 3
97 =item _text
99 Suffix B<_text> guaranties that output is a string. Note that it does
100 not limit the input.
102 In this implementation, the output is always text, so these methods
103 are redundant.
105 =item sub
107 Prefix B<sub>, like in subseq(), means that the method applies to sub
108 region of the sequence range and takes start and end as arguments.
109 Unlike subseq(), these methods are able to set values. If the range
110 is not defined, it defaults to the complete sequence.
112 =item named
114 Prefix B<named_> in method names allows the used to attach multiple
115 meta strings to one sequence by explicitly naming them. The name is
116 always the first argument to the method. The "unnamed" methods use the
117 class wide default name for the meta data and are thus special cases
118 "named" methods.
120 Note that internally names are keys in a hash and any misspelling of a
121 name will silently store the data under a wrong name. The used names
122 (keys) can be retrieved using method meta_names(). See L<meta_names>.
124 =back
126 =head1 NOTE
128 This Bio::Seq::MetaI implementation inherits from Bio::LocatableSeq, which
129 itself inherits from Bio::PrimarySeq. It is not a Bio::SeqI, so bless-ing
130 objects of this class into a Bio::SeqI or vice versa and will not work as
131 expected (see bug 2262). This may be addressed in a future refactor of
132 Bio::LocatableSeq.
135 =head1 SEE ALSO
137 L<Bio::LocatableSeq>,
138 L<Bio::Seq::MetaI>,
139 L<Bio::Seq::Meta::Array>
141 =head1 FEEDBACK
143 =head2 Mailing Lists
145 User feedback is an integral part of the evolution of this and other
146 Bioperl modules. Send your comments and suggestions preferably to one
147 of the Bioperl mailing lists. Your participation is much appreciated.
149 bioperl-l@bioperl.org - General discussion
150 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
152 =head2 Reporting Bugs
154 Report bugs to the Bioperl bug tracking system to help us keep track
155 the bugs and their resolution. Bug reports can be submitted via the
156 web:
158 http://bugzilla.open-bio.org/
160 =head1 AUTHOR - Heikki Lehvaslaiho
162 Email heikki-at-bioperl-dot-org
164 =head1 CONTRIBUTORS
166 Chad Matsalla, bioinformatics@dieselwurks.com
168 Aaron Mackey, amackey@virginia.edu
170 =head1 APPENDIX
172 The rest of the documentation details each of the object methods.
173 Internal methods are usually preceded with a _
175 =cut
178 # Let the code begin...
181 package Bio::Seq::Meta;
182 use vars qw($DEFAULT_NAME $GAP $META_GAP);
183 use strict;
185 #use overload '""' => \&to_string;
187 use base qw(Bio::LocatableSeq Bio::Seq::MetaI);
190 BEGIN {
192 $DEFAULT_NAME = 'DEFAULT';
193 $GAP = '-';
194 $META_GAP = ' ';
197 =head2 new
199 Title : new
200 Usage : $metaseq = Bio::Seq::Meta->new
201 ( -meta => 'aaaaaaaabbbbbbbb',
202 -seq => 'TKLMILVSHIVILSRM'
203 -id => 'human_id',
204 -accession_number => 'S000012',
206 Function: Constructor for Bio::Seq::Meta class, meta data being in a
207 string. Note that you can provide an empty quality string.
208 Returns : a new Bio::Seq::Meta object
210 =cut
213 sub new {
214 my ($class, @args) = @_;
216 my $self = $class->SUPER::new(@args);
218 my($meta, $forceflush, $nm) =
219 $self->_rearrange([qw(META
220 FORCE_FLUSH
221 NAMED_META)],
222 @args);
224 #$self->{'_meta'} = {};
225 $self->{'_meta'}->{$DEFAULT_NAME} = "";
227 $meta && $self->meta($meta);
228 if ($nm && ref($nm) eq 'HASH') {
229 while (my ($name, $meta) = each %$nm) {
230 $self->named_meta($name, $meta);
233 $forceflush && $self->force_flush($forceflush);
235 return $self;
239 =head2 meta
241 Title : meta
242 Usage : $meta_values = $obj->meta($values_string);
243 Function:
245 Get and set method for the meta data starting from residue
246 position one. Since it is dependent on the length of the
247 sequence, it needs to be manipulated after the sequence.
249 The length of the returned value always matches the length
250 of the sequence, if force_flush() is set. See L<force_flush>.
252 Returns : meta data in a string
253 Args : new value, string, optional
255 =cut
257 sub meta {
258 shift->named_meta($DEFAULT_NAME, shift);
261 =head2 meta_text
263 Title : meta_text
264 Usage : $meta_values = $obj->meta_text($values_arrayref);
265 Function: Variant of meta() guarantied to return a textual
266 representation of meta data. For details, see L<meta>.
267 Returns : a string
268 Args : new value, optional
270 =cut
272 sub meta_text {
273 shift->meta(shift);
276 =head2 named_meta
278 Title : named_meta()
279 Usage : $meta_values = $obj->named_meta($name, $values_arrayref);
280 Function: A more general version of meta(). Each meta data set needs
281 to be named. See also L<meta_names>.
282 Returns : a string
283 Args : scalar, name of the meta data set
284 new value, optional
286 =cut
288 sub named_meta {
289 my ($self, $name, $value) = @_;
291 $name ||= $DEFAULT_NAME;
292 if( defined $value) {
294 $self->throw("I need a scalar value, not [". ref($value). "]")
295 if ref($value);
297 # test for length
298 my $diff = $self->length - CORE::length($value);
299 if ($diff > 0) {
300 $value .= (" " x $diff);
303 $self->{'_meta'}->{$name} = $value;
305 #$self->_test_gap_positions($name) if $self->verbose > 0;
308 return " " x $self->length
309 if $self->force_flush && not defined $self->{'_meta'}->{$name};
312 $self->_do_flush if $self->force_flush;
314 return $self->{'_meta'}->{$name};
317 =head2 _test_gap_positions
319 Title : _test_gap_positions
320 Usage : $meta_values = $obj->_test_gap_positions($name);
321 Function: Internal test for correct position of gap characters.
322 Gap being only '-' this time.
324 This method is called from named_meta() when setting meta
325 data but only if verbose is positive as this can be an
326 expensive process on very long sequences. Set verbose(1) to
327 see warnings when gaps do not align in sequence and meta
328 data and turn them into errors by setting verbose(2).
330 Returns : true on success, prints warnings
331 Args : none
333 =cut
335 sub _test_gap_positions {
336 my $self = shift;
337 my $name = shift;
338 my $success = 1;
340 $self->seq || return $success;
341 my $len = CORE::length($self->seq);
342 for (my $i=0; $i < $len; $i++) {
343 my $s = substr $self->{seq}, $i, 1;
344 my $m = substr $self->{_meta}->{$name}, $i, 1;
345 $self->warn("Gap mismatch [$m/$s] in column [". ($i+1). "] of [$name] meta data in seq [". $self->id. "]")
346 and $success = 0
347 if ($s eq $META_GAP) && $s ne $m;
349 return $success;
352 =head2 named_meta_text
354 Title : named_meta_text()
355 Usage : $meta_values = $obj->named_meta_text($name, $values_arrayref);
356 Function: Variant of named_meta() guarantied to return a textual
357 representation of the named meta data.
358 For details, see L<meta>.
359 Returns : a string
360 Args : scalar, name of the meta data set
361 new value, optional
363 =cut
365 sub named_meta_text {
366 shift->named_meta(@_);
369 =head2 submeta
371 Title : submeta
372 Usage : $subset_of_meta_values = $obj->submeta(10, 20, $value_string);
373 $subset_of_meta_values = $obj->submeta(10, undef, $value_string);
374 Function:
376 Get and set method for meta data for subsequences.
378 Numbering starts from 1 and the number is inclusive, ie 1-2
379 are the first two residue of the sequence. Start cannot be
380 larger than end but can be equal.
382 If the second argument is missing the returned values
383 should extend to the end of the sequence.
385 The return value may be a string or an array reference,
386 depending on the implementation. If in doubt, use
387 submeta_text() which is a variant guarantied to return a
388 string. See L<submeta_text>.
390 Returns : A reference to an array or a string
391 Args : integer, start position
392 integer, end position, optional when a third argument present
393 new value, optional
395 =cut
397 sub submeta {
398 shift->named_submeta($DEFAULT_NAME, @_);
401 =head2 submeta_text
403 Title : submeta_text
404 Usage : $meta_values = $obj->submeta_text(20, $value_string);
405 Function: Variant of submeta() guarantied to return a textual
406 representation of meta data. For details, see L<meta>.
407 Returns : a string
408 Args : new value, optional
411 =cut
413 sub submeta_text {
414 shift->submeta(@_);
417 =head2 named_submeta
419 Title : named_submeta
420 Usage : $subset_of_meta_values = $obj->named_submeta($name, 10, 20, $value_string);
421 $subset_of_meta_values = $obj->named_submeta($name, 10);
422 Function: Variant of submeta() guarantied to return a textual
423 representation of meta data. For details, see L<meta>.
424 Returns : A reference to an array or a string
425 Args : scalar, name of the meta data set
426 integer, start position
427 integer, end position, optional when a third argument present
428 new value, optional
430 =cut
432 sub named_submeta {
433 my ($self, $name, $start, $end, $value) = @_;
435 $name ||= $DEFAULT_NAME;
436 $start ||=1;
439 $start =~ /^[+]?\d+$/ and $start > 0 or
440 $self->throw("Need at least a positive integer start value");
442 if ($value) {
443 $end ||= $start+length($value)-1;
444 $self->warn("You are setting meta values beyond the length of the sequence\n".
445 "[$start > ". length($self->seq)."] in sequence ". $self->id)
446 if $start > length $self->seq;
448 # pad meta data if needed
449 $self->{_meta}->{$name} = () unless defined $self->{_meta}->{$name};
450 if (length($self->{_meta}->{$name}) < $start) {
451 $self->{'_meta'}->{$name} .= " " x ( $start - length($self->{'_meta'}->{$name}) -1);
454 my $tail = '';
455 $tail = substr ($self->{_meta}->{$name}, $start-1+length($value))
456 if length($self->{_meta}->{$name}) >= $start-1+length($value);
458 substr ($self->{_meta}->{$name}, --$start) = $value;
459 $self->{_meta}->{$name} .= $tail;
461 return substr ($self->{_meta}->{$name}, $start, $end - $start + 1);
463 } else {
465 $end or $end = length $self->seq;
467 # pad meta data if needed
468 if (length($self->{_meta}->{$name}) < $end) {
469 $self->{'_meta'}->{$name} .= " " x ( $start - length($self->{'_meta'}->{$name}));
472 return substr ($self->{_meta}->{$name}, $start-1, $end - $start + 1)
477 =head2 named_submeta_text
479 Title : named_submeta_text
480 Usage : $meta_values = $obj->named_submeta_text($name, 20, $value_string);
481 Function: Variant of submeta() guarantied to return a textual
482 representation of meta data. For details, see L<meta>.
483 Returns : a string
484 Args : scalar, name of the meta data
485 Args : integer, start position, optional
486 integer, end position, optional
487 new value, optional
489 =cut
491 sub named_submeta_text {
492 shift->named_submeta(@_);
495 =head2 meta_names
497 Title : meta_names
498 Usage : @meta_names = $obj->meta_names()
499 Function: Retrieves an array of meta data set names. The default
500 (unnamed) set name is guarantied to be the first name.
501 Returns : an array of names
502 Args : none
504 =cut
506 sub meta_names {
507 my ($self) = @_;
509 my @r;
510 foreach ( sort keys %{$self->{'_meta'}} ) {
511 push (@r, $_) unless $_ eq $DEFAULT_NAME;
513 unshift @r, $DEFAULT_NAME if $self->{'_meta'}->{$DEFAULT_NAME};
514 return @r;
518 =head2 meta_length
520 Title : meta_length()
521 Usage : $meeta_len = $obj->meta_length();
522 Function: return the number of elements in the meta set
523 Returns : integer
524 Args : -
526 =cut
528 sub meta_length {
529 my ($self) = @_;
530 return $self->named_meta_length($DEFAULT_NAME);
534 =head2 named_meta_length
536 Title : named_meta_length()
537 Usage : $meta_len = $obj->named_meta_length($name);
538 Function: return the number of elements in the named meta set
539 Returns : integer
540 Args : -
542 =cut
544 sub named_meta_length {
545 my ($self, $name) = @_;
546 $name ||= $DEFAULT_NAME;
547 return length ($self->{'_meta'}->{$name});
551 =head2 force_flush
553 Title : force_flush()
554 Usage : $force_flush = $obj->force_flush(1);
555 Function: Automatically pad with empty values or truncate meta values
556 to sequence length. Not done by default.
557 Returns : boolean 1 or 0
558 Args : optional boolean value
560 Note that if you turn this forced padding off, the previously padded
561 values are not changed.
563 =cut
565 sub force_flush {
566 my ($self, $value) = @_;
568 if (defined $value) {
569 if ($value) {
570 $self->{force_flush} = 1;
571 $self->_do_flush;
572 } else {
573 $self->{force_flush} = 0;
576 return $self->{force_flush};
580 =head2 _do_flush
582 Title : _do_flush
583 Usage :
584 Function: internal method to do the force that meta values are same
585 length as the sequence . Called from L<force_flush>
586 Returns :
587 Args :
589 =cut
592 sub _do_flush {
593 my ($self) = @_;
595 foreach my $name ( ('DEFAULT', $self->meta_names) ) {
597 # elongnation
598 if ($self->length > $self->named_meta_length($name)) {
599 $self->{'_meta'}->{$name} .= $META_GAP x ($self->length - $self->named_meta_length($name)) ;
601 # truncation
602 elsif ( $self->length < $self->named_meta_length($name) ) {
603 $self->{_meta}->{$name} = substr($self->{_meta}->{$name}, 0, $self->length-1);
610 =head2 is_flush
612 Title : is_flush
613 Usage : $is_flush = $obj->is_flush()
614 or $is_flush = $obj->is_flush($my_meta_name)
615 Function: Boolean to tell if all meta values are in
616 flush with the sequence length.
617 Returns true if force_flush() is set
618 Set verbosity to a positive value to see failed meta sets
619 Returns : boolean 1 or 0
620 Args : optional name of the meta set
622 =cut
625 sub is_flush {
627 my ($self, $name) = shift;
629 return 1 if $self->force_flush;
631 my $sticky = '';
634 if ($name) {
635 $sticky .= "$name " if $self->length != $self->named_meta_length($name);
636 } else {
637 foreach my $m ($self->meta_names) {
638 $sticky .= "$m " if $self->length != $self->named_meta_length($m);
642 if ($sticky) {
643 print "These meta set are not flush: $sticky\n" if $self->verbose;
644 return 0;
647 return 1;
651 =head1 Bio::PrimarySeqI methods
653 =head2 revcom
655 Title : revcom
656 Usage : $newseq = $seq->revcom();
657 Function: Produces a new Bio::Seq::MetaI implementing object where
658 the order of residues and their meta information is reversed.
659 Returns : A new (fresh) Bio::Seq::Meta object
660 Args : none
661 Throws : if the object returns false on is_flush()
663 Note: The method does nothing to meta values, it reorders them, only.
665 =cut
667 sub revcom {
668 my $self = shift;
670 $self->throw("Can not get a reverse complement. The object is not flush.")
671 unless $self->is_flush;
673 my $new = $self->SUPER::revcom;
674 foreach (keys %{$self->{_meta}}) {
675 $new->named_meta($_, scalar reverse $self->{_meta}->{$_} );
677 return $new;
680 =head2 trunc
682 Title : trunc
683 Usage : $subseq = $seq->trunc(10,100);
684 Function: Provides a truncation of a sequence together with meta data
685 Returns : a fresh Bio::Seq::Meta implementing object
686 Args : Two integers denoting first and last residue of the sub-sequence.
688 =cut
690 sub trunc {
691 my ($self, $start, $end) = @_;
693 # test arguments
694 $start =~ /^[+]?\d+$/ and $start > 0 or
695 $self->throw("Need at least a positive integer start value as start");
696 $end =~ /^[+]?\d+$/ and $end > 0 or
697 $self->throw("Need at least a positive integer start value as end");
698 $end >= $start or
699 $self->throw("End position has to be larger or equal to start");
700 $end <= $self->length or
701 $self->throw("End position can not be larger than sequence length");
703 my $new = $self->SUPER::trunc($start, $end);
704 $start--;
705 foreach (keys %{$self->{_meta}}) {
706 $new->named_meta($_,
707 substr($self->{_meta}->{$_}, $start, $end - $start)
710 return $new;
714 sub to_string {
715 my ($self) = @_;
716 my $out = Bio::SeqIO->new(-format=>'metafasta');
717 $out->write_seq($self);
718 return 1;