sync w/ main trunk
[bioperl-live.git] / Bio / SeqFeature / Tools / IDHandler.pm
blob3dc943a9e3d9182ea13acd9b3e7a4c718c9703a5
1 # $Id$
3 # bioperl module for Bio::SeqFeature::Tools::IDHandler
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Chris Mungall <cjm@fruitfly.org>
9 # Copyright Chris Mungall
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SeqFeature::Tools::IDHandler - maps $seq_feature-E<gt>primary_tag
19 =head1 SYNOPSIS
21 use Bio::SeqIO;
22 use Bio::SeqFeature::Tools::IDHandler;
25 =head1 DESCRIPTION
27 Class to map $seq_feature-E<gt>primary_tag
30 =head1 FEEDBACK
32 =head2 Mailing Lists
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to the
36 Bioperl mailing lists Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 =head2 Support
43 Please direct usage questions or support issues to the mailing list:
45 L<bioperl-l@bioperl.org>
47 rather than to the module maintainer directly. Many experienced and
48 reponsive experts will be able look at the problem and quickly
49 address it. Please include a thorough description of the problem
50 with code and data examples if at all possible.
52 =head2 Reporting Bugs
54 report bugs to the Bioperl bug tracking system to help us keep track
55 the bugs and their resolution. Bug reports can be submitted via the
56 web:
58 http://bugzilla.open-bio.org/
60 =head1 AUTHOR - Chris Mungall
62 Email: cjm@fruitfly.org
64 =head1 APPENDIX
66 The rest of the documentation details each of the object
67 methods. Internal methods are usually preceded with a _
69 =cut
72 # Let the code begin...
74 package Bio::SeqFeature::Tools::IDHandler;
75 use strict;
77 # Object preamble - inherits from Bio::Root::Root
79 use base qw(Bio::Root::Root);
81 =head2 new
83 Title : new
84 Usage : $unflattener = Bio::SeqFeature::Tools::IDHandler->new();
85 Function: constructor
86 Example :
87 Returns : a new Bio::SeqFeature::Tools::IDHandler
88 Args : see below
91 =cut
93 sub new {
94 my($class,@args) = @_;
95 my $self = $class->SUPER::new(@args);
97 my($generate_id_sub) =
98 $self->_rearrange([qw(GENERATE_ID_SUB
99 )],
100 @args);
102 return $self; # success - we hope!
105 =head2 set_ParentIDs_from_hierarchy()
107 Title : set_ParentIDs_from_hierarchy()
108 Usage : $idhandler->set_ParentIDs_from_hierarchy($fholder)
109 Function: populates tags Parent and ID via holder hierarchy
110 Example :
111 Returns :
112 Args : Bio::featureHolderI (either a SeqFeature or a Seq)
114 This is mainly for GFF3 export
116 GFF3 uses the tags ID and Parent to represent the feature containment
117 hierarchy; it does NOT use the feature holder tree
119 This method sets Parent (and ID for any parents not set) based on
120 feature holder/containement hierarchy, ready for GFF3 output
122 =cut
124 # method author: cjm@fruitfly.org
125 sub set_ParentIDs_from_hierarchy(){
126 my $self = shift;
127 my ($featholder) = @_;
129 # we will traverse the tree of contained seqfeatures
130 # (a seqfeature is itself a holder)
132 # start with the top-level features
133 my @sfs = $featholder->get_SeqFeatures;
135 # clear existing parent tags
136 # (we assume this is the desired behaviour)
137 my @all_sfs = $featholder->get_all_SeqFeatures;
138 foreach (@all_sfs) {
139 if ($_->has_tag('Parent')) {
140 $_->remove_tag('Parent');
145 # iterate until entire tree traversed
146 while (@sfs) {
147 my $sf = shift @sfs;
148 my @subsfs = $sf->get_SeqFeatures;
150 # see if the ID tag
151 my $id = $sf->primary_id;
152 if (!$id) {
153 # the skolem function feature(seq,start,end,type)
154 # is presumed to uniquely identify this feature, and
155 # to also be persistent
156 $id = $sf->generate_unique_persistent_id;
158 foreach my $subsf (@subsfs) {
159 $subsf->add_tag_value('Parent', $id);
162 # push children on to end of stack (breadth first search)
163 push(@sfs, @subsfs);
165 return;
168 =head2 create_hierarchy_from_ParentIDs
170 Title : create_hierarchy_from_ParentIDs
171 Usage : $idhandler->set_ParentIDs_from_hierarchy($fholder)
172 Function: inverse of set_ParentIDs_from_hierarchy
173 Example :
174 Returns : list of top SeqFeatures
175 Args :
178 =cut
180 sub create_hierarchy_from_ParentIDs{
181 my ($self,$featholder,@args) = @_;
183 my @sfs = $featholder->get_all_SeqFeatures;
184 my %sf_by_ID = ();
185 foreach (@sfs) {
186 my $id = $_->primary_id;
187 next unless $id;
188 if ($sf_by_ID{$id}) {
189 $featholder->throw("DUPLICATE ID: $id");
191 $sf_by_ID{$id} = $_;
192 $_->remove_SeqFeatures; # clear existing hierarchy (assume this is desired)
194 if (!%sf_by_ID) {
195 # warn??
196 # this is actually expected behaviour for some kinds of data;
197 # eg lists of STSs - no containment hierarchy
198 return;
201 my @topsfs =
202 grep {
203 my @parents = $_->get_tagset_values('Parent');
204 foreach my $parent (@parents) {
205 $sf_by_ID{$parent}->add_SeqFeature($_)
206 if exists $sf_by_ID{$parent};
208 !@parents;
209 } @sfs;
210 $featholder->remove_SeqFeatures;
211 $featholder->add_SeqFeature($_) foreach @topsfs;
212 return @topsfs;
216 =head2 generate_unique_persistent_id
218 Title : generate_unique_persistent_id
219 Usage :
220 Function: generates a unique and persistent identifier for this
221 Example :
222 Returns : value of primary_id (a scalar)
223 Args :
225 Will generate an ID, B<and> set primary_id() (see above)
227 The ID is a string generated from
229 seq_id
230 primary_tag
231 start
234 There are three underlying assumptions: that all the above accessors
235 are set; that seq_id is a persistent and unique identifier for the
236 sequence containing this feature; and that
238 (seq_id, primary_tag, start, end)
240 is a "unique constraint" over features
242 The ID is persistent, so long as none of these values change - if they
243 do, it is considered a seperate entity
245 =cut
247 # method author: cjm@fruitfly.org
248 sub generate_unique_persistent_id{
249 my ($self,$sf,@args) = @_;
251 my $id;
252 if (!$sf->isa("Bio::SeqFeatureI")) {
253 $sf->throw("not a Bio::SeqFeatureI");
255 my $seq_id = $sf->seq_id || $sf->throw("seq_id must be set");
256 #my $seq_id = $sf->seq_id || 'unknown_seq';
257 if ($sf->has_tag('transcript_id')) {
258 ($id) = $sf->get_tag_values('transcript_id');
260 elsif ($sf->has_tag('protein_id')) {
261 ($id) = $sf->get_tag_values('protein_id');
263 else {
264 my $source = $sf->source_tag || $sf->throw("source tag must be set");
265 #my $source = $sf->source_tag || 'unknown_source';
266 my $start = $sf->start || $sf->throw("start must be set");
267 my $end = $sf->end || $sf->throw("end must be set");
268 my $type = $sf->primary_tag || $sf->throw("primary_tag must be set");
270 $id = "$source:$type:$seq_id:$start:$end";
272 $sf->primary_id($id);
273 return $id;