3 # Helper module for Bio::SeqIO::game::featHandler
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sheldon McKay <mckays@cshl.edu>
9 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqIO::game::featHandler -- a class for handling feature elements
20 This module is not used directly
24 Bio::SeqIO::game::featHandler converts game XML E<lt>annotationE<gt>
25 elements into flattened Bio::SeqFeature::Generic objects to be added
32 User feedback is an integral part of the evolution of this
33 and other Bioperl modules. Send your comments and suggestions preferably
34 to one of the Bioperl mailing lists.
36 Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 Please direct usage questions or support issues to the mailing list:
45 I<bioperl-l@bioperl.org>
47 rather than to the module maintainer directly. Many experienced and
48 reponsive experts will be able look at the problem and quickly
49 address it. Please include a thorough description of the problem
50 with code and data examples if at all possible.
54 Report bugs to the Bioperl bug tracking system to help us keep track
55 of the bugs and their resolution. Bug reports can be submitted via the
58 https://github.com/bioperl/bioperl-live/issues
60 =head1 AUTHOR - Sheldon McKay
66 The rest of the documentation details each of the object
67 methods. Internal methods are usually preceded with a _
71 package Bio
::SeqIO
::game
::featHandler
;
73 use Bio
::SeqFeature
::Generic
;
74 use Bio
::Location
::Split
;
80 use base
qw(Bio::SeqIO::game::gameSubs);
85 Usage : my $featHandler = Bio::SeqIO::game::featHandler->new($seq, $seq_h, $ann_l)
86 Function: creates an object to deal with sequence features
87 Returns : a handler object
88 Args : $seq -- a Bio::SeqI compliant object
89 $seq_h -- ref. to a hash of other sequences associated
90 with the main sequence (proteins, etc)
91 $ann_l -- ref. to a list of annotations
96 my ($caller, $seq, $seq_h, $ann_l ) = @_;
97 my $class = ref($caller) || $caller;
113 Usage : $featHandler->add_source($seq->length, \%tags);
114 Function: creates a source feature
115 Returns : a Bio::SeqFeature::Generic object
116 Args : sequence length and a ref. to a hash of tag/value attributes
121 my ($self, $length, $tags) = @_;
122 my $feat = Bio
::SeqFeature
::Generic
->new( -primary
=> 'source',
126 for ( keys %{$tags} ) {
127 for my $val ( @
{$tags->{$_}} ) {
128 $feat->add_tag_value( $_ => $val );
138 Usage : my $gene = $self->_has_gene($gene, $gname, $id)
139 Function: method to get/set the current gene feature
140 Returns : a Bio::SeqFeature::Generic object (if there is a gene)
142 $gene -- an XML element for the annotation
144 $id -- gene ID (not always the same as the name)
149 my ($self, $gene, $gname, $id) = @_;
151 # use name preferentially over id. We can't edit IDs in Apollo
152 # AFAIK, and this will create an orphan CDS for newly created
153 # transcipts -- I think this needs more work
154 #$id = $gname if $id && $gname;
157 if ( defined $self->{curr_gene
} ) {
158 return $self->{curr_gene
};
165 if ( $id && !$self->{curr_ltag
} ) {
166 $self->{curr_ltag
} = $id;
168 if ( $gname && !$self->{curr_gname
} ) {
169 $self->{curr_gname
} = $gname;
174 for my $child ( @
{$gene->{Children
}} ) {
175 my $name = $child->{Name
};
177 if ( $name eq 'dbxref' ) {
178 $tags->{dbxref
} ||= [];
179 push @
{$tags->{dbxref
}}, $self->dbxref( $child );
181 elsif ( $name !~ /name/ ){
182 $self->complain("Unrecognized element '$name'. I don't " .
183 "know what to do with $name elements");
187 my $feat = Bio
::SeqFeature
::Generic
->new(
191 for ( keys %{$tags} ) {
192 for my $val ( @
{$tags->{$_}} ) {
193 $feat->add_tag_value( $_ => $val ) unless ++$seen{$_.$val} > 1;
197 $self->{curr_gene
} = $feat;
205 Usage : my $cds = $self->_has_CDS
206 Function: internal getter/setter for CDS features
207 Returns : a Bio::SeqFeature::Generic transcript object (or nothing)
208 Args : a Bio::SeqFeature::Generic transcript feature
213 my ($self, $transcript) = @_;
215 if ( !$transcript ) {
216 if ( defined $self->{curr_cds
} ) {
217 return $self->{curr_cds
};
224 my $tags = $self->{curr_tags
};
225 $self->{curr_cds
} = $self->_add_CDS( $transcript, $tags );
229 =head2 add_annotation
231 Title : add_annotation
232 Usage : $featHandler->add_annotation($seq, $type, $id, $tags, $feats)
233 Function: converts a containment hierarchy into an ordered list of flat features
235 Args : $seq -- a Bio::SeqI compliant object
236 $type -- the annotation type
237 $id -- the anotation ID
238 $tags -- ref. to a hash of tag/value attributes
239 $feats -- ref to an array of Bio::SeqFeature::Generic objects
244 my ($self, $seq, $type, $id, $tags, $feats) = @_;
246 # is this a generic feature?
247 unless ( $self->has_gene ) {
249 $self->_add_generic_annotation(@_);
255 if ( $type eq 'gene' ) {
256 $feat = $self->has_gene;
257 $feat->add_tag_value( gene
=> ($self->{curr_gname
} || $id) )
258 unless $feat->has_tag('gene');
261 $feat = Bio
::SeqFeature
::Generic
->new;
262 $feat->primary_tag($type);
263 my $gene = $self->has_gene;
264 $gene->add_tag_value( gene
=> ($self->{curr_gname
} || $id) )
265 unless $gene->has_tag('gene');
266 $feat->add_tag_value( gene
=> ($self->{curr_gname
} || $id) )
267 unless $feat->has_tag('gene');;
269 for ( keys %{$tags} ) {
270 # or else add simple tag/value pairs
271 if ( $_ eq 'name' && $tags->{type
}->[0] eq 'gene' ) {
272 $feat->add_tag_value( gene
=> $tags->{name
}->[0] )
273 unless $feat->has_tag( 'gene' );
274 delete $tags->{name
};
277 next if $_ eq 'type' && $tags->{$_}->[0] eq 'gene';
278 next if $_ eq 'gene' && $feat->has_tag( 'gene' );
279 for my $val ( @
{$tags->{$_}} ) {
280 $feat->add_tag_value( $_ => $val );
286 $feat->strand( $self->{curr_strand
} );
287 $feat->start( $self->{curr_coords
}->[0] );
288 $feat->end( $self->{curr_coords
}->[1] );
290 # create an array of features for the annotation (order matters)
291 my @annotations = ( $feat );
293 # add the gene feature if the annotation is not a gene
294 if ( $self->has_gene && $type ne 'gene') {
295 my $gene = $self->has_gene;
296 $gene->strand( $self->{curr_strand
} );
297 $gene->start( $self->{curr_coords
}->[0] );
298 $gene->end( $self->{curr_coords
}->[-1] );
299 push @annotations, $gene;
300 $self->{curr_gene
} = '';
303 # add the subfeatures
305 $self->complain("bad feature $_") unless ref($_) =~ /Bio/;
306 push @annotations, $_;
309 # add the annotation array to the list for this sequence
310 my $seqid = $seq->id;
311 my $list = $self->{ann_l
};
313 # make sure the feature_sets appear in ascending order
314 if ( $list->[0] && $annotations[0]->start < $list->[0]->start ) {
315 unshift @
{$list}, @annotations;
318 push @
{$list}, @annotations;
322 $self->{curr_gene
} = '';
323 $self->{curr_ltag
} = '';
324 $self->{curr_gname
} = '';
325 $self->{curr_coords
} = [];
326 $self->{curr_feats
} = [];
327 $self->{curr_strand
} = 0;
328 $self->{ann_seq
} = $seq;
333 =head2 _add_generic_annotation
335 Title : _add_generic_annotation
336 Usage : $self->_add_generic_annotation($seq, $type, $id, $tags, $feats)
337 Function: an internal method to handle non-gene annotations
339 Args : $seq -- a Bio::SeqI compliant object
340 $type -- the annotation type
341 $id -- the anotation ID
342 $tags -- ref. to a hash of tag/value attributes
343 $feats -- ref to an array of Bio::SeqFeature::Generic objects
347 sub _add_generic_annotation
{
348 my ($self, $seq, $type, $id, $tags, $feats) = @_;
351 $_->primary_tag($type);
354 push @
{$self->{ann_l
}}, @
$feats;
356 $self->{curr_coords
} = [];
357 $self->{curr_feats
} = [];
358 $self->{curr_strand
} = 0;
359 $self->{ann_seq
} = $seq;
367 Usage : push @feats, $featHandler->feature_set($id, $gname, $set, $anntype);
368 Function: handles <feature_span> hierarchies (usually a transcript)
369 Returns : a list of Bio::SeqFeature::Generic objects
370 Args : $id -- ID of the feature set
371 $gname -- name of the gene
372 $set -- the <feature_set> object
373 $anntype -- type of the parent annotation
380 my ($self, $id, $gname, $set, $anntype) = @_;
381 my $stype = $set->{_type
}->{Characters
};
382 $self->{curr_loc
} = [];
383 $self->{curr_tags
} = {};
384 $self->{curr_subfeats
} = [];
385 $self->{curr_strand
} = 0;
387 my $tags = $self->{curr_tags
};
388 my $sname = $set->{_name
}->{Characters
} ||
389 $set->{Attributes
}->{id
};
391 if ( $set->{Attributes
}->{problem
} ) {
392 $tags->{problem
} = [$set->{Attributes
}->{problem
}];
395 my @fcount = grep { $_->{Name
} eq 'feature_span' } @
{$set->{Children
}};
397 if ( @fcount == 1 ) {
398 $self->_build_feature_set($set, 1);
399 my ($feat) = @
{$self->{curr_subfeats
}};
400 $feat->primary_tag('transcript') if $feat->primary_tag eq 'exon';
401 if ( $feat->primary_tag eq 'transcript' ) {
402 $feat->add_tag_value( gene
=> ($gname || $id) )
403 unless $feat->has_tag('gene');
407 for my $tag ( keys %{$tags} ) {
408 for my $val ( @
{$tags->{$tag}} ) {
409 $feat->add_tag_value( $tag => $val )
410 if $val && ++$seen_tag{$tag.$val} < 2;
416 $self->{curr_ltag
} = $id;
417 $self->{curr_cds
} = '';
418 $gname = $id if $gname eq 'gene';
419 $self->{curr_gname
} = $gname;
421 if ( $self->has_gene ) {
422 unless ( $anntype =~/RNA/i ) {
423 $stype =~ s/transcript/mRNA/;
427 $self->{curr_feat
} = Bio
::SeqFeature
::Generic
->new(
431 my $feat = $self->{curr_feat
};
432 $self->_build_feature_set($set);
434 my $gene = $gname || $self->{curr_ltag
};
436 $feat->add_tag_value( gene
=> $gene )
437 unless $feat->has_tag('gene');
439 # if there is an annotated protein product
440 my $cds = $self->_has_CDS( $feat );
443 $feat->primary_tag('mRNA');
445 # we really just want one value here
446 $cds->remove_tag('standard_name') if $cds->has_tag('standard_name');
447 $cds->add_tag_value( standard_name
=> $sname );
448 $cds->remove_tag('gene') if $cds->has_tag('gene');
449 $cds->add_tag_value( gene
=> $gene );
451 # catch empty protein ids
452 if ( $cds->has_tag('protein_id' ) && !$cds->get_tag_values('protein_id') ) {
453 my $pid = $self->protein_id($cds, $sname);
454 $cds->remove_tag('protein_id');
455 $cds->add_tag_value( protein_id
=> $pid );
458 # make sure other subfeats are tied to the transcript
459 # via a 'standard_name' qualifier and the gene via a 'gene' qualifier
460 my @subfeats = @
{$self->{curr_subfeats
}};
461 for my $sf ( @ subfeats
) {
462 $sf->add_tag_value( standard_name
=> $sname )
463 unless $sf->has_tag('standard_name');
464 $sf->add_tag_value( gene
=> $gene )
465 unless $sf->has_tag('gene');
468 $feat->add_tag_value( standard_name
=> $sname )
469 unless $feat->has_tag('standard_name');
470 $feat->add_tag_value( gene
=> $gene )
471 unless $feat->has_tag('gene');
473 # if the mRNA and CDS are the same length, the mRNA is redundant
474 # lose the mRNA, steal its tags and give them to the CDS
476 if ( $feat->length == $cds->length ) {
477 for my $t ( $feat->all_tags ) {
478 next if $t =~ /gene|standard_name/;
479 $cds->add_tag_value( $t => $feat->get_tag_values($t) );
484 @feats = sort { $a->start <=> $b->start } ($cds, @subfeats);
485 unshift @feats, $feat if $feat;
488 if ( @
{$self->{curr_loc
}} > 1 ) {
489 my $loc = Bio
::Location
::Split
->new( -splittype
=> 'JOIN' );
491 # sort the exons in ascending start order
492 my @loc = sort { $a->start <=> $b->start } @
{$self->{curr_loc
}};
494 # then add them to the transcript location
496 $loc->add_sub_Location( $_ )
498 $feat->location( $loc );
501 $feat->location( $self->{curr_loc
}->[0] );
504 for ( keys %$tags ) {
505 # expunge duplicate gene attributes
506 next if /gene/ && $feat->has_tag('gene');
507 for my $v ( @
{$tags->{$_}} ) {
508 $feat->add_tag_value( $_ => $v );
512 # make sure other subfeats are tied to the transcript
513 my @subfeats = @
{$self->{curr_subfeats
}};
514 for my $sf ( @ subfeats
) {
515 $sf->add_tag_value( standard_name
=> $sname )
516 unless $sf->has_tag('standard_name');
517 $sf->add_tag_value( gene
=> $gene )
518 unless $sf->has_tag('gene');
521 @feats = ( $feat, @subfeats );
525 # adjust the maximum extent of the annotated feature
526 # if req'd (ie the <annotation> element)
527 $self->{curr_coords
}->[0] ||= 1000000000000;
528 $self->{curr_coords
}->[1] ||= -1000000000000;
530 if ( $self->{curr_coords
}->[0] > $_->start ) {
531 $self->{curr_coords
}->[0] = $_->start;
533 if ( $self->{curr_coords
}->[1] < $_->end ) {
534 $self->{curr_coords
}->[1] = $_->end;
538 $self->flush( $set );
544 =head2 _build_feature_set
546 Title : _build_feature_set
547 Usage : $self->_build_feature_set($set, 1) # 1 flag means retain the exon as a subfeat
548 Function: an internal method to process attributes and subfeats of a feature set
550 Args : $set -- a <feature_set> element
551 1 -- optional flag to retain exons as subfeats. Otherwise, they will
552 be converted to sublocations of a parent CDS feature
557 sub _build_feature_set
{
558 my ($self, $set, $keep_subfeat) = @_;
560 for my $child ( @
{$set->{Children
}} ) {
561 my $name = $child->{Name
};
563 # these elements require special handling
564 if ( $name eq 'date' ) {
565 $self->date( $child );
567 elsif ( $name eq 'comment' ) {
568 $self->comment( $child );
570 elsif ( $name eq 'evidence' ) {
571 $self->evidence( $child );
573 elsif ( $name eq 'feature_span' ) {
574 $self->_add_feature_span( $child, $keep_subfeat );
576 elsif ( $name eq 'property' ) {
577 $self->property( $child );
580 # need to add the db_xref tags to the gene?
581 # otherwise, simple tag/value pairs
582 elsif ( $name =~ /synonym|author|description/) {
583 $self->{curr_tags
}->{$name} = [$child->{Characters
}];
585 elsif ( $name !~ /name|type|seq/ ){
586 $self->complain("Unrecognized element '$name'. I don't " .
587 "know what to do with $name elements");
593 =head2 _add_feature_span
595 Title : _add_feature_span
596 Usage : $self->_add_feature_span($el, 1)
597 Function: an internal method to process <feature_span> elements
599 Args : $el -- a <feature_span> element
600 1 -- an optional flag to retain exons as subfeatures
606 sub _add_feature_span
{
607 my ($self, $el, $keep_subfeat) = @_;
609 my $tags = $self->{curr_tags
};
610 my $feat = $self->{curr_feat
};
611 my $type = $el->{_type
}->{Characters
} || $el->{Name
};
612 my $id = $el->{Attributes
}->{id
} || $el->{_name
}->{Characters
};
613 my $seqr = $el->{_seq_relationship
};
614 my $start = int $seqr->{_span
}->{_start
}->{Characters
};
615 my $end = int $seqr->{_span
}->{_end
}->{Characters
};
616 my $stype = $seqr->{Attributes
}->{type
};
617 my $seqid = $seqr->{Attributes
}->{seq
};
619 push @
{$self->{seq_l
}}, $self->{seq_h
}->{$seqid};
621 if ( $start > $end ) {
622 $self->{curr_strand
} = -1;
623 ($start, $end) = ($end, $start);
626 $self->{curr_strand
} = 1;
629 # add exons to the transcript
630 if ( $type eq 'exon' ) {
631 my $sl = Bio
::Location
::Simple
->new( -start
=> $start,
633 -strand
=> $self->{curr_strand
} );
634 push @
{$self->{curr_loc
}}, $sl;
637 # apollo and gadfly use different tags for the same thing
638 if ( $type =~ /start_codon|translate offset/ ) {
639 $self->{curr_tags
}->{codon_start
} = [$start];
642 if ( $type eq 'exon' ) {
643 return unless $keep_subfeat;
645 push @
{$self->{curr_subfeats
}},
646 Bio
::SeqFeature
::Generic
->new( -start
=> $start,
648 -strand
=> $self->{curr_strand
},
652 # identify the translation product
653 my $tscript = $el->{Attributes
}->{produces_seq
};
654 if ( $tscript && $tscript ne 'null') {
655 my $subseq = $self->{seq_h
}->{$el->{Attributes
}->{produces_seq
}};
656 $self->{curr_tags
}->{product
} = [$el->{Attributes
}->{produces_seq
}];
657 $self->{curr_tags
}->{translation
} = [$subseq->seq] if $subseq;
666 Usage : my $cds = $self->_add_CDS($transcript, $tags)
667 Function: an internal method to create a CDS feature from a transcript feature
668 Returns : a Bio::SeqFeature::Generic object
669 Args : $transcript -- a Bio::SeqFeature::Generic object for a transcript
670 $tags -- ref. to a hash of tag/value attributes
675 my ($self, $feat, $tags) = @_;
679 if ( @
{$self->{curr_loc
}} > 1 ) {
680 $loc = Bio
::Location
::Split
->new;
682 # sort the exons in ascending start order
683 my @loc = sort { $a->start <=> $b->start } @
{$self->{curr_loc
}};
685 # then add them to the location object
687 $loc->add_sub_Location( $_ );
691 $loc = $self->{curr_loc
}->[0];
696 my @exons = $single ?
$loc : $loc->sub_Location(1);
698 $feat->location($loc);
699 # try to find a peptide
700 my $seq = $self->{seq_h
}->{ $tags->{protein_id
}->[0] };
701 $seq ||= $self->{seq_h
}->{ $tags->{product
}->[0] } ||
702 $self->{seq_h
}->{ $tags->{gene
}->[0] } ||
703 $self->{seq_h
}->{ $tags->{standard_name
}->[0] };
706 # Can we count on the description format being consistent?
707 # Why is CDS coordinate info saved as description text not
708 # specified in the DTD? Anyone have a better idea? Aww,
709 # who am I kidding, I'm the only one who will ever read this!
710 my ($start, $stop, $peptide) = ();
712 $peptide = $seq->display_id;
713 my $desc = $seq->description || '';
715 $desc =~ s/\)(\w)/\) $1/g;
717 if ( $desc =~ /cds_boundaries:.+?(\d+)\.\.(\d+)/ ) {
718 ($start, $stop) = ($1 - $self->{offset
}, $2 - $self->{offset
});
721 # OK, I guess the transcript must be the CDS then
722 $start = $loc->start;
727 $self->warn("I did not find a protein sequence for " . $feat->display_name);
730 delete $tags->{transcript
};
732 # now chop off the UTRs to create a CDS
733 my @exons_to_add = ();
734 #warn scalar(@exons), " exons, $start, $stop\n";
736 my $exon = Bio
::Location
::Simple
->new;
738 if ( $_->end < $start || $_->start > $stop ) {
739 #warn "exon out of range\n";
742 if ( $_->start < $start && $_->end > $start ) {
743 #warn "chopping off left UTR\n";
744 $exon->start( $start );
746 if ( $_->end > $stop && $_->start < $stop ) {
747 #warn "chopping off right UTR\n";
751 unless ($exon->valid_Location) {
752 $exon->start( $_->start );
753 $exon->end( $_->end );
755 $exon->strand ( $self->{curr_strand
} );
756 push @exons_to_add, $exon;
760 if ( @exons_to_add > 1 ) {
761 $cds_loc = Bio
::Location
::Split
->new( -splittype
=> 'JOIN' );
762 for ( @exons_to_add ) {
763 $cds_loc->add_sub_Location( $_ );
767 $cds_loc = $exons_to_add[0];
770 my $parent = $self->{curr_gname
} || $self->{curr_ltag
};
772 # try not to steal too many mRNA attributes for the CDS
774 for my $k ( keys %$tags ) {
775 if ( $k =~ /product|protein|translation|codon_start/ ) {
776 $cds_tags->{$k} = $tags->{$k};
781 for ( keys %$tags ) {
782 for my $v ( @
{$tags->{$_}} ) {
783 $feat->add_tag_value( $_ => $v )
784 unless $feat->has_tag($_);
788 if ( $self->{curr_gname
} ) {
789 $cds_tags->{gene
} = [$self->{curr_gname
}];
792 my $gene = $self->has_gene;
794 my $cds = Bio
::SeqFeature
::Generic
->new(
796 -location
=> $cds_loc,
799 $cds_tags->{translation
} = [$seq->seq];
801 for ( keys %{$cds_tags} ) {
803 for my $val (@
{$cds_tags->{$_}}) {
804 next if ++$seen{$val} > 1;
805 $cds->add_tag_value( $_ => $val );