tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / SeqFeature / NormalizedFeature.pm
blob6be38d5ff28b3fc7e8edac5b224583a51ce853dd
1 package Bio::DB::SeqFeature::NormalizedFeature;
3 # $Id$
5 =head1 NAME
7 Bio::DB::SeqFeature::NormalizedFeature -- Normalized feature for use with Bio::DB::SeqFeature::Store
9 =head1 SYNOPSIS
11 use Bio::DB::SeqFeature::Store;
12 # Open the sequence database
13 my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'DBI::mysql',
14 -dsn => 'dbi:mysql:test');
15 my ($feature) = $db->get_features_by_name('ZK909');
16 my @subfeatures = $feature->get_SeqFeatures();
17 my @exons_only = $feature->get_SeqFeatures('exon');
19 # create a new object
20 $db->seqfeature_class('Bio::DB::SeqFeature::NormalizedFeature');
21 my $new = $db->new_feature(-primary_tag=>'gene',
22 -seq_id => 'chr3',
23 -start => 10000,
24 -end => 11000);
26 # add a new exon
27 $feature->add_SeqFeature($db->new_feature(-primary_tag=>'exon',
28 -seq_id => 'chr3',
29 -start => 5000,
30 -end => 5551));
32 =head1 DESCRIPTION
34 The Bio::DB::SeqFeature::NormalizedFeature object is an alternative
35 representation of SeqFeatures for use with Bio::DB::SeqFeature::Store
36 database system. It is identical to Bio::DB::SeqFeature, except that
37 instead of storing feature/subfeature relationships in a database
38 table, the information is stored in the object itself. This actually
39 makes the objects somewhat inconvenient to work with from SQL, but
40 does speed up access somewhat.
42 To use this class, pass the name of the class to the
43 Bio::DB::SeqFeature::Store object's seqfeature_class() method. After
44 this, $db-E<gt>new_feature() will create objects of type
45 Bio::DB::SeqFeature::NormalizedFeature. If you are using the GFF3
46 loader, pass Bio::DB::SeqFeature::Store::GFF3Loader-E<gt>new() the
47 -seqfeature_class argument:
49 use Bio::DB::SeqFeature::Store::GFF3Loader;
51 my $store = connect_to_db_somehow();
52 my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(
53 -store=>$db,
54 -seqfeature_class => 'Bio::DB::SeqFeature::NormalizedFeature'
57 =cut
59 use strict;
60 use Carp 'croak';
61 use base 'Bio::SeqFeature::Lite';
62 use base 'Bio::DB::SeqFeature::NormalizedFeatureI';
63 use overload '""' => \&as_string,
64 eq => \&eq,
65 ne => \&ne,
66 fallback => 1;
68 use vars '$AUTOLOAD';
70 my $USE_OVERLOADED_NAMES = 1;
72 # some of this is my fault and some of it is changing bioperl API
73 *get_all_SeqFeatures = *sub_SeqFeature = *merged_segments = \&segments;
75 ##### CLASS METHODS ####
77 =head2 new
79 Title : new
80 Usage : $feature = Bio::DB::SeqFeature::NormalizedFeature->new(@args)
81 Function: create a new feature
82 Returns : the new seqfeature
83 Args : see below
84 Status : public
86 This method creates and, if possible stores into a database, a new
87 Bio::DB::SeqFeature::NormalizedFeature object using the specialized
88 Bio::DB::SeqFeature class.
90 The arguments are the same to Bio::SeqFeature::Generic-E<gt>new() and
91 Bio::Graphics::Feature-E<gt>new(). The most important difference is the
92 B<-store> option, which if present creates the object in a
93 Bio::DB::SeqFeature::Store database, and he B<-index> option, which
94 controls whether the feature will be indexed for retrieval (default is
95 true). Ordinarily, you would only want to turn indexing on when
96 creating top level features, and off only when storing
97 subfeatures. The default is on.
99 Arguments are as follows:
101 -seq_id the reference sequence
102 -start the start position of the feature
103 -end the stop position of the feature
104 -display_name the feature name (returned by seqname)
105 -primary_tag the feature type (returned by primary_tag)
106 -source the source tag
107 -score the feature score (for GFF compatibility)
108 -desc a description of the feature
109 -segments a list of subfeatures (see Bio::Graphics::Feature)
110 -subtype the type to use when creating subfeatures
111 -strand the strand of the feature (one of -1, 0 or +1)
112 -phase the phase of the feature (0..2)
113 -url a URL to link to when rendered with Bio::Graphics
114 -attributes a hashref of tag value attributes, in which the key is the tag
115 and the value is an array reference of values
116 -store a previously-opened Bio::DB::SeqFeature::Store object
117 -index index this feature if true
119 Aliases:
121 -id an alias for -display_name
122 -seqname an alias for -display_name
123 -display_id an alias for -display_name
124 -name an alias for -display_name
125 -stop an alias for end
126 -type an alias for primary_tag
128 =cut
130 sub new {
131 my $class = shift;
132 my %args = @_;
133 my $db = $args{-store} || $args{-factory};
134 my $index = exists $args{-index} ? $args{-index} : 1;
135 my $self = $class->SUPER::new(@_);
137 if ($db) {
138 if ($index) {
139 $db->store($self); # this will set the primary_id
140 } else {
141 $db->store_noindex($self); # this will set the primary_id
143 $self->object_store($db);
145 $self;
148 =head2 Bio::SeqFeatureI methods
150 The following Bio::SeqFeatureI methods are supported:
152 seq_id(), start(), end(), strand(), get_SeqFeatures(),
153 display_name(), primary_tag(), source_tag(), seq(),
154 location(), primary_id(), overlaps(), contains(), equals(),
155 intersection(), union(), has_tag(), remove_tag(),
156 add_tag_value(), get_tag_values(), get_all_tags()
158 Some methods that do not make sense in the context of a genome
159 annotation database system, such as attach_seq(), are not supported.
161 Please see L<Bio::SeqFeatureI> for more details.
163 =cut
165 sub seq {
166 my $self = shift;
168 require Bio::PrimarySeq unless Bio::PrimarySeq->can('new');
170 my ($start,$end) = ($self->start,$self->end);
171 if ($self->strand < 0) {
172 ($start,$end) = ($end,$start);
175 if (my $store = $self->object_store) {
176 return Bio::PrimarySeq->new(-seq => $store->fetch_sequence($self->seq_id,$start,$end) || '',
177 -id => $self->display_name);
178 } else {
179 return $self->SUPER::seq($self->seq_id,$start,$end);
183 sub subseq {
184 my $self = shift;
185 my ($newstart,$newstop) = @_;
186 my $store = $self->object_store or return;
187 my ($start,$stop) = ($self->start+$newstart-1,$self->end+$newstop-1);
188 if ($self->strand < 0) {
189 ($start,$stop) = ($stop,$start);
191 my $seq = $store->fetch_sequence($self->seq_id,$start,$stop);
192 return Bio::PrimarySeq->new($seq);
195 =head2 add_SeqFeature
197 Title : add_SeqFeature
198 Usage : $flag = $feature->add_SeqFeature(@features)
199 Function: Add subfeatures to the feature
200 Returns : true if successful
201 Args : list of Bio::SeqFeatureI objects
202 Status : public
204 Add one or more subfeatures to the feature. For best results,
205 subfeatures should be of the same class as the parent feature
206 (i.e. don't try mixing Bio::DB::SeqFeature::NormalizedFeature with
207 other feature types).
209 An alias for this method is add_segment().
211 =cut
213 sub add_SeqFeature {
214 my $self = shift;
215 $self->_add_segment(1,@_);
218 =head2 update
220 Title : update
221 Usage : $flag = $feature->update()
222 Function: Update feature in the database
223 Returns : true if successful
224 Args : none
225 Status : public
227 After changing any fields in the feature, call update() to write it to
228 the database. This is not needed for add_SeqFeature() as update() is
229 invoked automatically.
231 =cut
233 sub update {
234 my $self = shift;
235 my $store = $self->object_store or return;
236 $store->store($self);
239 =head2 get_SeqFeatures
241 Title : get_SeqFeature
242 Usage : @subfeatures = $feature->get_SeqFeatures([@types])
243 Function: return subfeatures of this feature
244 Returns : list of subfeatures
245 Args : list of subfeature primary_tags (optional)
246 Status : public
248 This method extends the Bio::SeqFeatureI get_SeqFeatures() slightly by
249 allowing you to pass a list of primary_tags, in which case only
250 subfeatures whose primary_tag is contained on the list will be
251 returned. Without any types passed all subfeatures are returned.
253 =cut
256 # segments can be either normalized IDs or ordinary feature objects
257 sub get_SeqFeatures {
258 my $self = shift;
259 my @types = @_;
261 my $s = $self->{segments} or return;
262 my $store = $self->object_store;
263 my (@ordinary,@ids);
264 for (@$s) {
265 if (ref ($_)) {
266 push @ordinary,$_;
267 } else {
268 push @ids,$_;
271 my @r = grep {$_->type_match(@types)} (@ordinary,$store->fetch_many(\@ids));
272 for my $r (@r) {
273 eval {$r->object_store($store) };
275 return @r;
278 =head2 object_store
280 Title : object_store
281 Usage : $store = $feature->object_store([$new_store])
282 Function: get or set the database handle
283 Returns : current database handle
284 Args : new database handle (optional)
285 Status : public
287 This method will get or set the Bio::DB::SeqFeature::Store object that
288 is associated with the feature. After changing the store, you should
289 probably unset the feature's primary_id() and call update() to ensure
290 that the object is written into the database as a new feature.
292 =cut
294 sub object_store {
295 my $self = shift;
296 my $d = $self->{store};
297 $self->{store} = shift if @_;
302 =head2 overloaded_names
304 Title : overloaded_names
305 Usage : $overload = $feature->overloaded_names([$new_overload])
306 Function: get or set overloading of object strings
307 Returns : current flag
308 Args : new flag (optional)
309 Status : public
311 For convenience, when objects of this class are stringified, they are
312 represented in the form "primary_tag(display_name)". To turn this
313 feature off, call overloaded_names() with a false value. You can
314 invoke this on an individual feature object or on the class:
316 Bio::DB::SeqFeature::NormalizedFeature->overloaded_names(0);
318 =cut
321 sub overloaded_names {
322 my $class = shift;
323 my $d = $USE_OVERLOADED_NAMES;
324 $USE_OVERLOADED_NAMES = shift if @_;
328 =head2 segment
330 Title : segment
331 Usage : $segment = $feature->segment
332 Function: return a Segment object corresponding to feature
333 Returns : a Bio::DB::SeqFeature::Segment
334 Args : none
335 Status : public
337 This turns the feature into a Bio::DB::SeqFeature::Segment object,
338 which you can then use to query for overlapping features. See
339 L<Bio::DB::SeqFeature::Segment>.
341 =cut
343 sub segment {
344 my $self = shift;
345 return Bio::DB::SeqFeature::Segment->new($self);
348 ### instance methods
350 =head2 AUTOLOADED methods
352 @subfeatures = $feature->Exon;
354 If you use an unknown method that begins with a capital letter, then
355 the feature autogenerates a call to get_SeqFeatures() using the
356 lower-cased method name as the primary_tag. In other words
357 $feature-E<gt>Exon is equivalent to:
359 @subfeature s= $feature->get_SeqFeatures('exon')
361 If you use an unknown method that begins with Tag_(tagname),
362 Att_(tagname) Is_(tagname), then it will be the same as calling the
363 each_tag_value() method with the tagname. In a list context, these
364 autogenerated procedures return the list of results. In scalar
365 context, they return the first item in the list!!
367 =cut
370 sub AUTOLOAD {
371 my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/;
372 my $sub = $AUTOLOAD;
373 my $self = $_[0];
375 # ignore DESTROY calls
376 return if $func_name eq 'DESTROY';
378 # call attributes if func_name begins with "Tag_" or "Att_":
379 if ($func_name =~ /^(Tag|Att|Is)_(\w+)/) {
380 my @result = $self->each_tag_value($2);
381 return wantarray ? @result : $result[0];
384 # fetch subfeatures if func_name has an initial cap
385 if ($func_name =~ /^[A-Z]/) {
386 return $self->get_SeqFeatures(lc $func_name);
389 # error message of last resort
390 $self->throw(qq(Can't locate object method "$func_name" via package "$pack"));
394 sub add_segment {
395 my $self = shift;
396 $self->_add_segment(0,@_);
399 # This adds subfeatures. It has the property of converting the
400 # provided features into an object like itself and storing them
401 # into the database. If the feature already has a primary id and
402 # an object_store() method, then it is not stored into the database,
403 # but its primary id is reused.
404 sub _add_segment {
405 my $self = shift;
406 my $normalized = shift;
407 my $store = $self->object_store;
409 my @segments = $self->_create_subfeatures($normalized,@_);
411 # fix boundaries
412 $self->_fix_boundaries(\@segments);
414 # freakish fixing of our non-standard Target attribute
415 $self->_fix_target(\@segments);
417 for my $seg (@segments) {
418 my $id = $normalized ? $seg->primary_id : $seg;
419 defined $id or $self->throw("No primary ID when there should be");
420 push @{$self->{segments}},$id;
423 $self->update if $self->primary_id; # write us back to disk
426 sub _fix_boundaries {
427 my $self = shift;
428 my $segments = shift;
429 my $normalized = shift;
431 my $min_start = $self->start || 999_999_999_999;
432 my $max_stop = $self->end || -999_999_999_999;
434 for my $seg (@$segments) {
435 $min_start = $seg->start if $seg->start < $min_start;
436 $max_stop = $seg->end if $seg->end > $max_stop;
439 # adjust our boundaries, etc.
440 $self->start($min_start) if $min_start < $self->start;
441 $self->end($max_stop) if $max_stop > $self->end;
442 $self->{ref} ||= $segments->[0]->seq_id;
443 $self->{strand} ||= $segments->[0]->strand;
446 sub _fix_target {
447 my $self = shift;
448 my $segs = shift;
449 my $normalized = shift; # ignored for now
451 # freakish fixing of our non-standard Target attribute
452 if (my $t = ($self->attributes('Target'))[0]) {
453 my ($seqid,$tstart,$tend,$strand) = split /\s+/,$t;
454 if (defined $tstart && defined $tend) {
455 my $min_tstart = $tstart;
456 my $max_tend = $tend;
457 for my $seg (@$segs) {
458 my $st = ($seg->attributes('Target'))[0] or next;
459 (undef,$tstart,$tend) = split /\s+/,$st;
460 next unless defined $tstart && defined $tend;
461 $min_tstart = $tstart if $tstart < $min_tstart;
462 $max_tend = $tend if $tend > $max_tend;
464 if ($min_tstart < $tstart or $max_tend > $tend) {
465 $self->{attributes}{Target}[0] = join ' ',($seqid,$min_tstart,$max_tend,$strand||'');
471 # undo the load_id and Target hacks on the way out
472 sub format_attributes {
473 my $self = shift;
474 my $parent = shift;
475 my $fallback_id = shift;
477 my $load_id = $self->load_id || '';
479 my $targobj = ($self->attributes('Target'))[0];
480 # was getting an 'Use of uninitialized value with split' here, changed to cooperate -cjf 7/10/07
481 my ($target) = $targobj ? split /\s+/,($self->attributes('Target'))[0] : ('');
482 my @tags = $self->all_tags;
483 my @result;
484 for my $t (@tags) {
485 my @values = $self->each_tag_value($t);
487 # This line prevents Alias from showing up if it matches the load id, but this is not good
488 # @values = grep {$_ ne $load_id && $_ ne $target} @values if $t eq 'Alias';
490 # these are hacks, which we don't want to appear in the file
491 next if $t eq 'load_id';
492 next if $t eq 'parent_id';
494 foreach (@values) { s/\s+$// } # get rid of trailing whitespace
495 push @result,join '=',$self->escape($t),join(',', map {$self->escape($_)} @values) if @values;
497 my $id = $self->primary_id || $fallback_id;
498 my $parent_id;
499 if (@$parent) {
500 $parent_id = join (',',map {$self->escape($_)} @$parent);
502 my $name = $self->display_name;
503 unshift @result,"ID=".$self->escape($id) if defined $id;
504 unshift @result,"Parent=".$parent_id if defined $parent_id;
505 unshift @result,"Name=".$self->escape($name) if defined $name;
506 return join ';',@result;
509 sub _create_subfeatures {
510 my $self = shift;
511 my $normalized = shift;
513 my $type = $self->{subtype} || $self->{type};
514 my $ref = $self->seq_id;
515 my $name = $self->name;
516 my $class = $self->class;
517 my $store = $self->object_store;
518 my $source = $self->source;
520 if ($normalized) {
521 $store or $self->throw("Feature must be associated with a Bio::DB::SeqFeature::Store database before attempting to add subfeatures to a normalized object");
524 my $index_subfeatures_policy = eval{$store->index_subfeatures};
526 my @segments;
528 for my $seg (@_) {
530 if (UNIVERSAL::isa($seg,ref $self)) {
532 if (!$normalized) { # make sure the object has no lazy behavior
533 $seg->primary_id(undef);
534 $seg->object_store(undef);
536 push @segments,$seg;
539 elsif (ref($seg) eq 'ARRAY') {
540 my ($start,$stop) = @{$seg};
541 next unless defined $start && defined $stop; # fixes an obscure bug somewhere above us
542 my $strand = $self->{strand};
544 if ($start > $stop) {
545 ($start,$stop) = ($stop,$start);
546 $strand = -1;
548 push @segments,$self->new(-start => $start,
549 -stop => $stop,
550 -strand => $strand,
551 -ref => $ref,
552 -type => $type,
553 -name => $name,
554 -class => $class,
555 -source => $source,
560 elsif (UNIVERSAL::isa($seg,'Bio::SeqFeatureI')) {
561 my $score = $seg->score if $seg->can('score');
562 my $f = $self->new(-start => $seg->start,
563 -end => $seg->end,
564 -strand => $seg->strand,
565 -seq_id => $seg->seq_id,
566 -name => $seg->display_name,
567 -primary_tag => $seg->primary_tag,
568 -source_tag => $seg->source,
569 -score => $score,
570 -source => $source,
572 for my $tag ($seg->get_all_tags) {
573 my @values = $seg->get_tag_values($tag);
574 $f->{attributes}{$tag} = \@values;
576 push @segments,$f;
579 else {
580 croak "$seg is neither a Bio::SeqFeatureI object nor an arrayref";
584 return unless @segments;
586 if ($normalized && $store) { # parent/child data is going to be stored in the database
588 my @need_loading = grep {!defined $_->primary_id || $_->object_store ne $store} @segments;
589 if (@need_loading) {
590 my $result;
591 if ($index_subfeatures_policy) {
592 $result = $store->store(@need_loading);
593 } else {
594 $result = $store->store_noindex(@need_loading);
596 $result or croak "Couldn't store one or more subseqfeatures";
600 return @segments;
603 =head2 load_id
605 Title : load_id
606 Usage : $id = $feature->load_id
607 Function: get the GFF3 load ID
608 Returns : the GFF3 load ID (string)
609 Args : none
610 Status : public
612 For features that were originally loaded by the GFF3 loader, this
613 method returns the GFF3 load ID. This method may not be supported in
614 future versions of the module.
616 =cut
618 sub load_id {
619 return (shift->attributes('load_id'))[0];
623 =head2 notes
625 Title : notes
626 Usage : @notes = $feature->notes
627 Function: get contents of the GFF3 Note tag
628 Returns : List of GFF3 Note tags
629 Args : none
630 Status : public
632 For features that were originally loaded by the GFF3 loader, this
633 method returns the contents of the Note tag as a list. This is a
634 convenience for Bio::Graphics, which looks for notes() when it
635 constructs a default description line.
637 =cut
639 sub notes {
640 return shift->attributes('Note');
643 =head2 primary_id
645 Title : primary_id
646 Usage : $id = $feature->primary_id([$new_id])
647 Function: get/set the feature's database ID
648 Returns : the current primary ID
649 Args : none
650 Status : public
652 This method gets or sets the primary ID of the feature in the
653 underlying Bio::DB::SeqFeature::Store database. If you change this
654 field and then call update(), it will have the effect of making a copy
655 of the feature in the database under a new ID.
657 =cut
659 sub primary_id {
660 my $self = shift;
661 my $d = $self->{primary_id};
662 $self->{primary_id} = shift if @_;
666 =head2 target
668 Title : target
669 Usage : $segment = $feature->target
670 Function: return the segment correspondent to the "Target" attribute
671 Returns : a Bio::DB::SeqFeature::Segment object
672 Args : none
673 Status : public
675 For features that are aligned with others via the GFF3 Target
676 attribute, this returns a segment corresponding to the aligned
677 region. The CIGAR gap string is not yet supported.
679 =cut
681 sub target {
682 my $self = shift;
683 my @targets = $self->attributes('Target');
684 my @result;
685 for my $t (@targets) {
686 my ($seqid,$start,$end,$strand) = split /\s+/,$t;
687 $strand ||= '';
688 $strand = $strand eq '+' ? 1
689 : $strand eq '-' ? -1
690 : 0;
691 push @result,Bio::DB::SeqFeature::Segment->new($self->object_store,
692 $seqid,
693 $start,
694 $end,
695 $strand);
697 return wantarray ? @result : $result[0];
700 =head2 Internal methods
702 =over 4
704 =item $feature-E<gt>as_string()
706 Internal method used to implement overloaded stringification.
708 =item $boolean = $feature-E<gt>type_match(@list_of_types)
710 Internal method that will return true if the feature's primary_tag and
711 source_tag match any of the list of types (in primary_tag:source_tag
712 format) provided.
714 =back
716 =cut
718 sub as_string {
719 my $self = shift;
720 return overload::StrVal($self) unless $self->overloaded_names;
721 my $name = $self->display_name || $self->load_id;
722 $name ||= "id=".$self->primary_id if $self->primary_id;
723 $name ||= "<unnamed>";
724 my $method = $self->primary_tag;
725 my $source= $self->source_tag;
726 my $type = $source ? "$method:$source" : $method;
727 return "$type($name)";
730 sub eq {
731 my $self = shift;
732 my $b = shift;
733 my $store1 = $self->object_store;
734 my $store2 = eval {$b->object_store} || '';
735 return $store1 eq $store2 && $self->primary_id eq $b->primary_id;
738 sub ne {
739 my $self = shift;
740 return !$self->eq(shift);
743 # completely case insensitive
744 sub type_match {
745 my $self = shift;
746 my @types = @_;
747 my $method = lc $self->primary_tag;
748 my $source = lc $self->source_tag;
749 for my $t (@types) {
750 my ($m,$s) = map {lc $_} split /:/,$t;
751 return 1 if $method eq $m && (!defined $s || $source eq $s);
753 return;
756 sub segments { shift->get_SeqFeatures(@_) }
761 __END__
763 =head1 BUGS
765 This is an early version, so there are certainly some bugs. Please
766 use the BioPerl bug tracking system to report bugs.
768 =head1 SEE ALSO
770 L<bioperl>,
771 L<Bio::DB::SeqFeature>,
772 L<Bio::DB::SeqFeature::Store>,
773 L<Bio::DB::SeqFeature::Segment>,
774 L<Bio::DB::SeqFeature::GFF3Loader>,
775 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
776 L<Bio::DB::SeqFeature::Store::bdb>
778 =head1 AUTHOR
780 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
782 Copyright (c) 2006 Cold Spring Harbor Laboratory.
784 This library is free software; you can redistribute it and/or modify
785 it under the same terms as Perl itself.
787 =cut