Merge branch 'master' of github.com:bioperl/bioperl-live
[bioperl-live.git] / Bio / Seq / Meta.pm
blobdd3407355d8837428535014e537874b81a163b78
2 # BioPerl module for Bio::Seq::Meta
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Seq::Meta - Generic superclass for sequence objects with
17 residue-based meta information
19 =head1 SYNOPSIS
21 use Bio::LocatableSeq;
22 use Bio::Seq::Meta;
23 use Bio::Tools::OddCodes;
24 use Bio::SeqIO;
26 my $seq = Bio::Seq::Meta->new(-id=>'test',
27 -seq=>'ACTGCTAGCT',
28 -start=>2434,
29 -end=>2443,
30 -strand=>1,
31 -verbose=>1, # to see warnings
34 # the existing sequence object can be a Bio::PrimarySeq, too
36 # to test this is a meta seq object
37 $seq->isa("Bio::Seq::Meta")
38 || $seq->throw("$seq is not a Bio::Seq::Meta");
41 $seq->meta('1234567890');
42 $seq = Bio::Seq::Meta->new(-id=>'test',
43 -seq=>'HACILMIFGT',
44 -start=>2434,
45 -end=>2443,
46 -strand=>1,
47 -meta=>'1234567890',
48 -verbose=>1, # to see warnings
51 # accessors
52 $string = $seq->meta_text();
53 $substring = $seq->submeta_text(2,5);
54 $unique_key = $seq->accession_number();
56 # storing output from Bio::Tools::OddCodes as meta data
57 my $protcodes = Bio::Tools::OddCodes->new(-seq => $seq);
58 my @codes = qw(structural chemical functional charge hydrophobic);
59 map { $seq->named_meta($_, ${$protcodes->$_($seq) } )} @codes;
61 my $out = Bio::SeqIO->new(-format=>'metafasta');
62 $out->write_seq($seq);
64 =head1 DESCRIPTION
66 This class implements generic methods for sequences with residue-based
67 meta information. Meta sequences with meta data are Bio::LocatableSeq
68 objects with additional methods to store that meta information. See
69 L<Bio::LocatableSeq> and L<Bio::Seq::MetaI>.
71 The meta information in this class is always one character per residue
72 long and blank values are space characters (ASCII 32).
74 After the latest rewrite, the meta information no longer covers all
75 the residues automatically. Methods to check the length of meta
76 information (L<meta_length>)and to see if the ends are flushed to the
77 sequence have been added (L<is_flush>). To force the old
78 functionality, set L<force_flush> to true.
80 It is assumed that meta data values do not depend on the nucleotide
81 sequence strand value.
83 Application specific implementations should inherit from this class to
84 override and add to these methods.
86 L<Bio::Seq::Meta::Array> allows for more complex meta values (scalars
87 or objects) to be used.
89 =head2 Method naming
91 Character based meta data is read and set by method meta() and its
92 variants. These are the suffixes and prefixes used in the variants:
94 [named_] [sub] meta [_text]
96 =over 3
98 =item _text
100 Suffix B<_text> guaranties that output is a string. Note that it does
101 not limit the input.
103 In this implementation, the output is always text, so these methods
104 are redundant.
106 =item sub
108 Prefix B<sub>, like in subseq(), means that the method applies to sub
109 region of the sequence range and takes start and end as arguments.
110 Unlike subseq(), these methods are able to set values. If the range
111 is not defined, it defaults to the complete sequence.
113 =item named
115 Prefix B<named_> in method names allows the used to attach multiple
116 meta strings to one sequence by explicitly naming them. The name is
117 always the first argument to the method. The "unnamed" methods use the
118 class wide default name for the meta data and are thus special cases
119 "named" methods.
121 Note that internally names are keys in a hash and any misspelling of a
122 name will silently store the data under a wrong name. The used names
123 (keys) can be retrieved using method meta_names(). See L<meta_names>.
125 =back
127 =head1 NOTE
129 This Bio::Seq::MetaI implementation inherits from Bio::LocatableSeq, which
130 itself inherits from Bio::PrimarySeq. It is not a Bio::SeqI, so bless-ing
131 objects of this class into a Bio::SeqI or vice versa and will not work as
132 expected (see bug 2262). This may be addressed in a future refactor of
133 Bio::LocatableSeq.
136 =head1 SEE ALSO
138 L<Bio::LocatableSeq>,
139 L<Bio::Seq::MetaI>,
140 L<Bio::Seq::Meta::Array>
142 =head1 FEEDBACK
144 =head2 Mailing Lists
146 User feedback is an integral part of the evolution of this and other
147 Bioperl modules. Send your comments and suggestions preferably to one
148 of the Bioperl mailing lists. Your participation is much appreciated.
150 bioperl-l@bioperl.org - General discussion
151 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
153 =head2 Support
155 Please direct usage questions or support issues to the mailing list:
157 I<bioperl-l@bioperl.org>
159 rather than to the module maintainer directly. Many experienced and
160 reponsive experts will be able look at the problem and quickly
161 address it. Please include a thorough description of the problem
162 with code and data examples if at all possible.
164 =head2 Reporting Bugs
166 Report bugs to the Bioperl bug tracking system to help us keep track
167 the bugs and their resolution. Bug reports can be submitted via the
168 web:
170 https://github.com/bioperl/bioperl-live/issues
172 =head1 AUTHOR - Heikki Lehvaslaiho
174 Email heikki-at-bioperl-dot-org
176 =head1 CONTRIBUTORS
178 Chad Matsalla, bioinformatics@dieselwurks.com
180 Aaron Mackey, amackey@virginia.edu
182 =head1 APPENDIX
184 The rest of the documentation details each of the object methods.
185 Internal methods are usually preceded with a _
187 =cut
190 # Let the code begin...
193 package Bio::Seq::Meta;
194 use vars qw($DEFAULT_NAME $GAP $META_GAP);
195 use strict;
197 #use overload '""' => \&to_string;
199 use base qw(Bio::LocatableSeq Bio::Seq::MetaI);
202 BEGIN {
204 $DEFAULT_NAME = 'DEFAULT';
205 $GAP = '-';
206 $META_GAP = ' ';
209 =head2 new
211 Title : new
212 Usage : $metaseq = Bio::Seq::Meta->new
213 ( -meta => 'aaaaaaaabbbbbbbb',
214 -seq => 'TKLMILVSHIVILSRM'
215 -id => 'human_id',
216 -accession_number => 'S000012',
218 Function: Constructor for Bio::Seq::Meta class, meta data being in a
219 string. Note that you can provide an empty quality string.
220 Returns : a new Bio::Seq::Meta object
222 =cut
225 sub new {
226 my ($class, @args) = @_;
228 my $self = $class->SUPER::new(@args);
230 my($meta, $forceflush, $nm) =
231 $self->_rearrange([qw(META
232 FORCE_FLUSH
233 NAMED_META)],
234 @args);
236 #$self->{'_meta'} = {};
237 $self->{'_meta'}->{$DEFAULT_NAME} = "";
239 $meta && $self->meta($meta);
240 if ($nm && ref($nm) eq 'HASH') {
241 while (my ($name, $meta) = each %$nm) {
242 $self->named_meta($name, $meta);
245 $forceflush && $self->force_flush($forceflush);
247 return $self;
251 =head2 meta
253 Title : meta
254 Usage : $meta_values = $obj->meta($values_string);
255 Function:
257 Get and set method for the meta data starting from residue
258 position one. Since it is dependent on the length of the
259 sequence, it needs to be manipulated after the sequence.
261 The length of the returned value always matches the length
262 of the sequence, if force_flush() is set. See L<force_flush>.
264 Returns : meta data in a string
265 Args : new value, string, optional
267 =cut
269 sub meta {
270 shift->named_meta($DEFAULT_NAME, shift);
273 =head2 meta_text
275 Title : meta_text
276 Usage : $meta_values = $obj->meta_text($values_arrayref);
277 Function: Variant of meta() guarantied to return a textual
278 representation of meta data. For details, see L<meta>.
279 Returns : a string
280 Args : new value, optional
282 =cut
284 sub meta_text {
285 shift->meta(shift);
288 =head2 named_meta
290 Title : named_meta()
291 Usage : $meta_values = $obj->named_meta($name, $values_arrayref);
292 Function: A more general version of meta(). Each meta data set needs
293 to be named. See also L<meta_names>.
294 Returns : a string
295 Args : scalar, name of the meta data set
296 new value, optional
298 =cut
300 sub named_meta {
301 my ($self, $name, $value) = @_;
303 $name ||= $DEFAULT_NAME;
304 if( defined $value) {
306 $self->throw("I need a scalar value, not [". ref($value). "]")
307 if ref($value);
309 # test for length
310 my $diff = $self->length - CORE::length($value);
311 if ($diff > 0) {
312 $value .= (" " x $diff);
315 $self->{'_meta'}->{$name} = $value;
317 #$self->_test_gap_positions($name) if $self->verbose > 0;
320 return " " x $self->length
321 if $self->force_flush && not defined $self->{'_meta'}->{$name};
324 $self->_do_flush if $self->force_flush;
326 return $self->{'_meta'}->{$name};
329 =head2 _test_gap_positions
331 Title : _test_gap_positions
332 Usage : $meta_values = $obj->_test_gap_positions($name);
333 Function: Internal test for correct position of gap characters.
334 Gap being only '-' this time.
336 This method is called from named_meta() when setting meta
337 data but only if verbose is positive as this can be an
338 expensive process on very long sequences. Set verbose(1) to
339 see warnings when gaps do not align in sequence and meta
340 data and turn them into errors by setting verbose(2).
342 Returns : true on success, prints warnings
343 Args : none
345 =cut
347 sub _test_gap_positions {
348 my $self = shift;
349 my $name = shift;
350 my $success = 1;
352 $self->seq || return $success;
353 my $len = CORE::length($self->seq);
354 for (my $i=0; $i < $len; $i++) {
355 my $s = substr $self->{seq}, $i, 1;
356 my $m = substr $self->{_meta}->{$name}, $i, 1;
357 $self->warn("Gap mismatch [$m/$s] in column [". ($i+1). "] of [$name] meta data in seq [". $self->id. "]")
358 and $success = 0
359 if ($s eq $META_GAP) && $s ne $m;
361 return $success;
364 =head2 named_meta_text
366 Title : named_meta_text()
367 Usage : $meta_values = $obj->named_meta_text($name, $values_arrayref);
368 Function: Variant of named_meta() guarantied to return a textual
369 representation of the named meta data.
370 For details, see L<meta>.
371 Returns : a string
372 Args : scalar, name of the meta data set
373 new value, optional
375 =cut
377 sub named_meta_text {
378 shift->named_meta(@_);
381 =head2 submeta
383 Title : submeta
384 Usage : $subset_of_meta_values = $obj->submeta(10, 20, $value_string);
385 $subset_of_meta_values = $obj->submeta(10, undef, $value_string);
386 Function:
388 Get and set method for meta data for subsequences.
390 Numbering starts from 1 and the number is inclusive, ie 1-2
391 are the first two residue of the sequence. Start cannot be
392 larger than end but can be equal.
394 If the second argument is missing the returned values
395 should extend to the end of the sequence.
397 The return value may be a string or an array reference,
398 depending on the implementation. If in doubt, use
399 submeta_text() which is a variant guarantied to return a
400 string. See L<submeta_text>.
402 Returns : A reference to an array or a string
403 Args : integer, start position
404 integer, end position, optional when a third argument present
405 new value, optional
407 =cut
409 sub submeta {
410 shift->named_submeta($DEFAULT_NAME, @_);
413 =head2 submeta_text
415 Title : submeta_text
416 Usage : $meta_values = $obj->submeta_text(20, $value_string);
417 Function: Variant of submeta() guarantied to return a textual
418 representation of meta data. For details, see L<meta>.
419 Returns : a string
420 Args : new value, optional
423 =cut
425 sub submeta_text {
426 shift->submeta(@_);
429 =head2 named_submeta
431 Title : named_submeta
432 Usage : $subset_of_meta_values = $obj->named_submeta($name, 10, 20, $value_string);
433 $subset_of_meta_values = $obj->named_submeta($name, 10);
434 Function: Variant of submeta() guarantied to return a textual
435 representation of meta data. For details, see L<meta>.
436 Returns : A reference to an array or a string
437 Args : scalar, name of the meta data set
438 integer, start position
439 integer, end position, optional when a third argument present
440 new value, optional
442 =cut
444 sub named_submeta {
445 my ($self, $name, $start, $end, $value) = @_;
447 $name ||= $DEFAULT_NAME;
448 $start ||=1;
451 $start =~ /^[+]?\d+$/ and $start > 0 or
452 $self->throw("Need at least a positive integer start value");
454 if ($value) {
455 $end ||= $start+length($value)-1;
456 $self->warn("You are setting meta values beyond the length of the sequence\n".
457 "[$start > ". length($self->seq)."] in sequence ". $self->id)
458 if $start > length $self->seq;
460 # pad meta data if needed
461 $self->{_meta}->{$name} = () unless defined $self->{_meta}->{$name};
462 if (length($self->{_meta}->{$name}) < $start) {
463 $self->{'_meta'}->{$name} .= " " x ( $start - length($self->{'_meta'}->{$name}) -1);
466 my $tail = '';
467 $tail = substr ($self->{_meta}->{$name}, $start-1+length($value))
468 if length($self->{_meta}->{$name}) >= $start-1+length($value);
470 substr ($self->{_meta}->{$name}, --$start) = $value;
471 $self->{_meta}->{$name} .= $tail;
473 return substr ($self->{_meta}->{$name}, $start, $end - $start + 1);
475 } else {
477 $end or $end = length $self->seq;
479 # pad meta data if needed
480 if (length($self->{_meta}->{$name}) < $end) {
481 $self->{'_meta'}->{$name} .= " " x ( $start - length($self->{'_meta'}->{$name}));
484 return substr ($self->{_meta}->{$name}, $start-1, $end - $start + 1)
489 =head2 named_submeta_text
491 Title : named_submeta_text
492 Usage : $meta_values = $obj->named_submeta_text($name, 20, $value_string);
493 Function: Variant of submeta() guarantied to return a textual
494 representation of meta data. For details, see L<meta>.
495 Returns : a string
496 Args : scalar, name of the meta data
497 Args : integer, start position, optional
498 integer, end position, optional
499 new value, optional
501 =cut
503 sub named_submeta_text {
504 shift->named_submeta(@_);
507 =head2 meta_names
509 Title : meta_names
510 Usage : @meta_names = $obj->meta_names()
511 Function: Retrieves an array of meta data set names. The default
512 (unnamed) set name is guarantied to be the first name.
513 Returns : an array of names
514 Args : none
516 =cut
518 sub meta_names {
519 my ($self) = @_;
521 my @r;
522 foreach ( sort keys %{$self->{'_meta'}} ) {
523 push (@r, $_) unless $_ eq $DEFAULT_NAME;
525 unshift @r, $DEFAULT_NAME if $self->{'_meta'}->{$DEFAULT_NAME};
526 return @r;
530 =head2 meta_length
532 Title : meta_length()
533 Usage : $meeta_len = $obj->meta_length();
534 Function: return the number of elements in the meta set
535 Returns : integer
536 Args : -
538 =cut
540 sub meta_length {
541 my ($self) = @_;
542 return $self->named_meta_length($DEFAULT_NAME);
546 =head2 named_meta_length
548 Title : named_meta_length()
549 Usage : $meta_len = $obj->named_meta_length($name);
550 Function: return the number of elements in the named meta set
551 Returns : integer
552 Args : -
554 =cut
556 sub named_meta_length {
557 my ($self, $name) = @_;
558 $name ||= $DEFAULT_NAME;
559 return length ($self->{'_meta'}->{$name});
563 =head2 force_flush
565 Title : force_flush()
566 Usage : $force_flush = $obj->force_flush(1);
567 Function: Automatically pad with empty values or truncate meta values
568 to sequence length. Not done by default.
569 Returns : boolean 1 or 0
570 Args : optional boolean value
572 Note that if you turn this forced padding off, the previously padded
573 values are not changed.
575 =cut
577 sub force_flush {
578 my ($self, $value) = @_;
580 if (defined $value) {
581 if ($value) {
582 $self->{force_flush} = 1;
583 $self->_do_flush;
584 } else {
585 $self->{force_flush} = 0;
589 return $self->{force_flush};
593 =head2 _do_flush
595 Title : _do_flush
596 Usage :
597 Function: internal method to do the force that meta values are same
598 length as the sequence . Called from L<force_flush>
599 Returns :
600 Args :
602 =cut
605 sub _do_flush {
606 my ($self) = @_;
608 foreach my $name ( ('DEFAULT', $self->meta_names) ) {
610 # elongnation
611 if ($self->length > $self->named_meta_length($name)) {
612 $self->{'_meta'}->{$name} .= $META_GAP x ($self->length - $self->named_meta_length($name)) ;
614 # truncation
615 elsif ( $self->length < $self->named_meta_length($name) ) {
616 $self->{_meta}->{$name} = substr($self->{_meta}->{$name}, 0, $self->length-1);
623 =head2 is_flush
625 Title : is_flush
626 Usage : $is_flush = $obj->is_flush()
627 or $is_flush = $obj->is_flush($my_meta_name)
628 Function: Boolean to tell if all meta values are in
629 flush with the sequence length.
630 Returns true if force_flush() is set
631 Set verbosity to a positive value to see failed meta sets
632 Returns : boolean 1 or 0
633 Args : optional name of the meta set
635 =cut
637 sub is_flush {
639 my ($self, $name) = shift;
641 return 1 if $self->force_flush;
643 my $sticky = '';
646 if ($name) {
647 $sticky .= "$name " if $self->length != $self->named_meta_length($name);
648 } else {
649 foreach my $m ($self->meta_names) {
650 $sticky .= "$m " if ($self->named_meta_length($m) > 0) && ($self->length != $self->named_meta_length($m));
654 if ($sticky) {
655 print "These meta set are not flush: $sticky\n" if $self->verbose;
656 return 0;
659 return 1;
663 =head1 Bio::PrimarySeqI methods
665 =head2 revcom
667 Title : revcom
668 Usage : $newseq = $seq->revcom();
669 Function: Produces a new Bio::Seq::MetaI implementing object where
670 the order of residues and their meta information is reversed.
671 Returns : A new (fresh) Bio::Seq::Meta object
672 Args : none
673 Throws : if the object returns false on is_flush()
675 Note: The method does nothing to meta values, it reorders them, only.
677 =cut
679 sub revcom {
680 my $self = shift;
682 $self->throw("Can not get a reverse complement. The object is not flush.")
683 unless $self->is_flush;
685 my $new = $self->SUPER::revcom;
686 foreach (keys %{$self->{_meta}}) {
687 $new->named_meta($_, scalar reverse $self->{_meta}->{$_} );
689 return $new;
692 =head2 trunc
694 Title : trunc
695 Usage : $subseq = $seq->trunc(10,100);
696 Function: Provides a truncation of a sequence together with meta data
697 Returns : a fresh Bio::Seq::Meta implementing object
698 Args : Two integers denoting first and last residue of the sub-sequence.
700 =cut
702 sub trunc {
703 my ($self, $start, $end) = @_;
705 # test arguments
706 $start =~ /^[+]?\d+$/ and $start > 0 or
707 $self->throw("Need at least a positive integer start value as start");
708 $end =~ /^[+]?\d+$/ and $end > 0 or
709 $self->throw("Need at least a positive integer start value as end");
710 $end >= $start or
711 $self->throw("End position has to be larger or equal to start");
712 $end <= $self->length or
713 $self->throw("End position can not be larger than sequence length");
715 my $new = $self->SUPER::trunc($start, $end);
716 $start--;
717 foreach (keys %{$self->{_meta}}) {
718 $new->named_meta($_,
719 substr($self->{_meta}->{$_}, $start, $end - $start)
722 return $new;
726 sub to_string {
727 my ($self) = @_;
728 my $out = Bio::SeqIO->new(-format=>'metafasta');
729 $out->write_seq($self);
730 return 1;