2 # BioPerl module for Bio::SeqIO::game::seqHandler
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sheldon McKay <mckays@cshl.edu>
8 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SeqIO::game::seqHandler -- a class for handling game-XML sequences
19 This modules is not used directly
23 Bio::SeqIO::game::seqHandler processes all of the sequences associated with a game record
24 and, via feature handlers, processes the associated annotations
30 User feedback is an integral part of the evolution of this
31 and other Bioperl modules. Send your comments and suggestions preferably
32 to one of the Bioperl mailing lists.
34 Your participation is much appreciated.
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 Please direct usage questions or support issues to the mailing list:
43 I<bioperl-l@bioperl.org>
45 rather than to the module maintainer directly. Many experienced and
46 reponsive experts will be able look at the problem and quickly
47 address it. Please include a thorough description of the problem
48 with code and data examples if at all possible.
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 of the bugs and their resolution. Bug reports can be submitted via the
56 https://github.com/bioperl/bioperl-live/issues
58 =head1 AUTHOR - Sheldon McKay
64 The rest of the documentation details each of the object
65 methods. Internal methods are usually preceded with a _
69 package Bio
::SeqIO
::game
::seqHandler
;
73 use Bio
::SeqIO
::game
::featHandler
;
74 use Bio
::SeqFeature
::Generic
;
75 use Bio
::Seq
::RichSeq
;
81 use base
qw(Bio::SeqIO::game::gameSubs);
86 Usage : my $seqHandler = Bio::SeqIO::game::seqHandler->new($seq, $ann, $comp, $map, $src )
87 Function: constructor method to create a sequence handler
88 Returns : a sequence handler object
89 Args : $seq -- an XML sequence element
90 $ann -- a ref. to a list of <annotation> elements
91 $comp -- a ref. to a list of <computational_analysis> elements (not used yet)
92 $map -- a <map_position> element
93 $src -- a flag to indicate that the sequence already has a source feature
98 my ($caller, $seq, $ann, $comp, $map, $src ) = @_;
100 my $class = ref($caller) || $caller;
118 Usage : @seqs = $seqHandler->convert
119 Function: converts the main XML sequence element and associated annotations to Bio::
120 Returns : a ref. to a an array containing the sequence object and a ref. to a list of features
123 Note : The features and sequence are kept apart to facilitate downstream filtering of features
129 my @ann = @
{$self->{anns
}} if defined $self->{anns
};;
130 my @seq = @
{$self->{seqs
}};
134 if ( $self->{comps
} ) {
135 @comp = @
{$self->{comps
}}
138 # process the sequence elements
140 $self->_add_seq( $_ );
143 # process the annotation elements
145 $self->_annotation( $_ );
148 return $self->_order_feats( $self->{seq_h
} );
154 Usage : $self->_order_feats( $self->{seq_h} )
155 Function: an internal method to ensure the source feature comes first
156 and keep gene, mRNA and CDS features together
157 Returns : a ref. to an array containing the sequence object and a ref. to a list of features
158 Args : a ref. to a hash of sequences
163 my ($self, $seqs) = @_;
164 my $seq = $self->{main_seq
};
166 my $ann = $self->{ann_l
};
168 # make sure source(s) come first
169 my @src = grep { $_->primary_tag =~ /source|origin|\bregion\b/ } @
$ann;
170 # preserve gene->mRNA->CDS or ncRNA->gene->transcript order
171 my @genes = grep { $_->primary_tag =~ /gene|CDS|[a-z]+RNA|transcript/ } @
$ann;
172 my @other = sort { $a->start <=> $b->start || $b->end <=> $a->end }
173 grep { $_->primary_tag !~ /source|origin|\bregion\b/ }
174 grep { $_->primary_tag !~ /gene|mRNA|CDS/ } @
$ann;
176 return [$seq, [@src, @genes, @other]];
182 Usage : $self->_add_seq($seq_element)
183 Function: an internal method to process the sequence elements
185 Args : a sequence element
190 my ($self, $el) = @_;
193 if ($el->{_residues
}) {
194 $residues = $el->{_residues
}->{Characters
};
195 $residues =~ s/[ \n\r]//g;
196 $residues =~ s/\!//g;
197 $residues =~ tr/a-z/A-Z/;
203 my $id = $el->{Attributes
}->{id
};
204 my $ver = $el->{Attributes
}->{version
};
205 my $name = $el->{_name
}->{Characters
};
207 if ($name && $name ne $id) {
208 $self->complain("The sequence name and unique ID do not match. Using ID");
211 # get/set the sequence object
212 my $seq = $self->_seq($id);
214 # get/set the feature handler
215 my $featHandler = $self->_feat_handler;
217 # populate the sequence object
218 $seq->seq($residues);
219 $seq->seq_version($ver) if $ver;
221 # assume the id is the accession number
222 if ( $id =~ /^\w+$/ ) {
223 $seq->accession($id);
226 # If the focus attribute is set to "true", this is the main
229 if ( defined $el->{Attributes
}->{focus
} ) {
230 $self->{main_seq
} = $seq;
234 # make sure real and annotated lengths match
235 my $length = $el->{Attributes
}->{'length'};
236 $length && $seq->length(int($length));
237 if ( $seq->seq && defined($length) && $seq->length != int($length) ) {
238 $self->complain("The specified sequence has length ", $seq->length(),
239 " but the length attribute= ", $length);
241 $seq->length( int($length) );
244 # deal with top-level annotations
246 if ( $el->{Attributes
}->{md5checksum
} ) {
247 $tags->{md5checksum
} = [$el->{Attributes
}->{md5checksum
}];
249 if ($el->{_dbxref
}) {
250 $tags->{dbxref
} ||= [];
251 push @
{$tags->{dbxref
}}, $self->dbxref( $el->{_dbxref
} );
253 if ($el->{_description
}) {
254 my $desc = $el->{_description
}->{Characters
};
255 $seq->description( $desc );
257 if ($el->{_organism
}) {
258 my @organism = split /\s+/, $el->{_organism
}->{Characters
};
260 $self->complain("Species name should have at least two words");
263 my $species = Bio
::Species
->new( -classification
=> [reverse @organism] );
264 $seq->species($species);
267 if ( defined($seq->species) ) {
268 $tags->{organism
} = [$seq->species->binomial];
270 # elsif ($seq eq $self->{main_seq}) {
271 # $self->warn("The source organism for this sequence was\n" .
272 # "not specified. I will guess Drosophila melanogaster.\n" .
273 # "Otherwise, add <organism>Genus species</organism>\n" .
274 # "to the main sequence element");
275 # my @class = qw/ Eukaryota Metazoa Arthropoda Insecta Pterygota
276 # Neoptera Endopterygota Diptera Brachycera
277 # Muscomorpha Ephydroidea Drosophilidae Drosophila melanogaster/;
278 # my $species = Bio::Species->new( -classification => [ reverse @class ],
279 # -common_name => 'fruit fly' );
280 # $seq->species( $species );
283 # convert GAME to bioperl molecule types
284 my $alphabet = $el->{Attributes
}->{type
};
286 $alphabet =~ s/aa/protein/;
287 $alphabet =~ s/cdna/rna/;
288 $seq->alphabet($alphabet);
291 # add a source feature if req'd
292 if ( !$self->{has_source
} && $focus ) {
293 #$self->{source} = $featHandler->add_source($seq->length, $tags);
297 # add the map position
298 $self->_map_position( $self->{map_pos
}, $seq );
299 $featHandler->{offset
} = $self->{offset
};
302 # prune the sequence from the parse tree
308 Title : _map_position
309 Usage : $self->_map_position($map_posn_element)
310 Function: an internal method to process the <map_position> element
312 Args : a map_position element
317 my ($self, $el) = @_;
319 # we can live without it
326 # chromosome and coordinates
327 my $arm = $el->{_arm
}->{Characters
};
328 my $type = $el->{Attributes
}->{type
};
329 my $loc = $el->{_span
};
330 my $start = $loc->{_start
}->{Characters
};
331 my $end = $loc->{_end
}->{Characters
};
333 # define the offset (may be a partial sequence)
334 # The coordinates will be relative but the CDS description
335 # coordinates may be absolute if the game-XML comes from apollo
337 $self->{offset
} = $start - 1;
339 my $seq_id = $el->{Attributes
}->{seq
};
340 my $seq = $self->{seq_h
}->{$seq_id};
343 $self->throw("Map position with no corresponding sequence object");
345 unless ($seq eq $self->{main_seq
}){
346 $self->throw("Map position does not correspond to the main sequence");
351 # create/update the top-level sequence feature if req'd
352 if ( $self->{source
} ) {
353 my $feat = $self->{source
};
355 unless ($feat->has_tag('organism')) {
356 $species = eval {$seq->species->binomial} || 'unknown species';
357 $feat->add_tag_value( organism
=> $species );
360 my %tags = ( mol_type
=> "genomic dna",
362 location
=> "$start..$end",
367 $feat->add_tag_value( $_ => $tags{$_} );
370 $seq->add_SeqFeature($feat);
373 # come up with a description if there is none
374 my $desc = $seq->description;
375 if ( $species && $arm && $start && $end && !$desc) {
376 $seq->description("$species chromosome $arm $start..$end " .
377 "segment of complete sequence");
386 Usage : $self->_annotation($annotation_element)
387 Function: an internal method to process <annotation> elements
389 Args : an annotation element
394 my ($self, $el) = @_;
396 my $id = $el->{Attributes
}->{id
};
397 my $type = $el->{_type
}->{Characters
};
399 my $gname = $el->{_name
}->{Characters
} eq $id ?
'' : $el->{_name
}->{Characters
};
401 # 'transposable element' is too long (breaks Bio::SeqIO::GenBank)
402 # $type =~ s/transposable_element/repeat_region/;
404 # annotations must be on the main sequence
405 my $seqid = $self->{main_seq
}->id;
406 my $featHandler = $self->_feat_handler;
410 for my $child ( @
{$el->{Children
}} ) {
411 my $name = $child->{Name
};
413 # these elements require special handling
414 if ( $name eq 'dbxref' ) {
415 $tags->{dbxref
} ||= [];
416 push @
{$tags->{dbxref
}}, $self->dbxref( $child );
418 elsif ( $name eq 'aspect' ) {
419 $tags->{dbxref
} ||= [];
420 push @
{$tags->{dbxref
}}, $self->dbxref( $child->{_dbxref
} );
422 elsif ( $name eq 'feature_set' ) {
423 push @feats, $featHandler->feature_set( $id, $gname, $child, $type );
425 elsif ( $name eq 'comment' ) {
426 $tags->{comment
} = [$self->comment( $child )];
428 elsif ( $name eq 'property' ) {
429 $self->property( $child, $tags );
431 elsif ( $name eq 'gene' ) {
432 # we may be dealing with an annotation that is not
433 # a gene, so we have to nest the gene inside it
434 $featHandler->has_gene( $child, $gname, $id )
437 # otherwise, tag/value pairs
438 # -- mild dtd enforcement
439 # synonym is not in the dtd but shows up in gadfly
441 elsif ( $name =~ /type|synonym/ ) {
442 $tags->{$name} = [$child->{Characters
}];
444 elsif ( $name ne 'name' ) {
445 $self->complain("Unrecognized element '$name'. I don't " .
446 "know what to do with $name elements in " .
447 "top-level sequence annotations." );
452 # add a gene annotation if required
453 unless ( $featHandler->has_gene || $type ne 'gene' ) {
454 $featHandler->has_gene( $el, $gname, $id )
457 if ( $tags->{symbol
} ) {
458 if ( !$tags->{gene
} ) {
459 $tags->{gene
} = $tags->{symbol
};
461 delete $tags->{symbol
};
465 $featHandler->add_annotation( $self->{main_seq
}, $type, $id, $tags, \
@feats );
469 # get/set the sequence object
474 Usage : my $seq = $self->_seq
475 Function: an internal sequence getter/setter
476 Returns : a Bio::RichSeq object
482 my ($self, $id) = @_;
483 $id || $self->throw("A unique id must be provided for the sequence");
487 if ( defined $self->{seq_h
}->{$id}) {
488 $seq = $self->{seq_h
}->{$id};
490 $seq = Bio
::Seq
::RichSeq
->new( -id
=> $id );
491 $self->{seq_h
}->{$id} = $seq; # store it
497 #get/set the feature handler
501 Title : _feat_handler
502 Usage : my $featHandler = $self->_featHandler
503 Function: an internal getter/setter for feature handling objects
504 Returns : a Bio::SeqIO::game::featHandler object
513 my $seq = $self->{main_seq
};
515 if ( defined $self->{feat_handler
} ) {
516 $handler = $self->{feat_handler
};
519 my @args = ( $seq, $self->{seq_h
}, $self->{ann_l
} );
520 $handler = Bio
::SeqIO
::game
::featHandler
->new( @args );
521 $self->{feat_handler
} = $handler;