changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / SeqIO / game / seqHandler.pm
blobc5b4600dbcf22c609f19c6b2a27bcafce6d33014
2 # BioPerl module for Bio::SeqIO::game::seqHandler
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sheldon McKay <mckays@cshl.edu>
8 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqIO::game::seqHandler -- a class for handling game-XML sequences
17 =head1 SYNOPSIS
19 This modules is not used directly
21 =head1 DESCRIPTION
23 Bio::SeqIO::game::seqHandler processes all of the sequences associated with a game record
24 and, via feature handlers, processes the associated annotations
26 =head1 FEEDBACK
28 =head2 Mailing Lists
30 User feedback is an integral part of the evolution of this
31 and other Bioperl modules. Send your comments and suggestions preferably
32 to one of the Bioperl mailing lists.
34 Your participation is much appreciated.
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
39 =head2 Support
41 Please direct usage questions or support issues to the mailing list:
43 I<bioperl-l@bioperl.org>
45 rather than to the module maintainer directly. Many experienced and
46 reponsive experts will be able look at the problem and quickly
47 address it. Please include a thorough description of the problem
48 with code and data examples if at all possible.
50 =head2 Reporting Bugs
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 of the bugs and their resolution. Bug reports can be submitted via the
54 web:
56 https://github.com/bioperl/bioperl-live/issues
58 =head1 AUTHOR - Sheldon McKay
60 Email mckays@cshl.edu
62 =head1 APPENDIX
64 The rest of the documentation details each of the object
65 methods. Internal methods are usually preceded with a _
67 =cut
69 package Bio::SeqIO::game::seqHandler;
71 use Data::Dumper;
73 use Bio::SeqIO::game::featHandler;
74 use Bio::SeqFeature::Generic;
75 use Bio::Seq::RichSeq;
76 use Bio::Species;
77 use strict;
79 use vars qw {};
81 use base qw(Bio::SeqIO::game::gameSubs);
83 =head2 new
85 Title : new
86 Usage : my $seqHandler = Bio::SeqIO::game::seqHandler->new($seq, $ann, $comp, $map, $src )
87 Function: constructor method to create a sequence handler
88 Returns : a sequence handler object
89 Args : $seq -- an XML sequence element
90 $ann -- a ref. to a list of <annotation> elements
91 $comp -- a ref. to a list of <computational_analysis> elements (not used yet)
92 $map -- a <map_position> element
93 $src -- a flag to indicate that the sequence already has a source feature
95 =cut
97 sub new {
98 my ($caller, $seq, $ann, $comp, $map, $src ) = @_;
100 my $class = ref($caller) || $caller;
102 my $self = bless ( {
103 seqs => $seq,
104 anns => $ann,
105 comps => $comp,
106 map_pos => $map,
107 has_source => $src,
108 seq_h => {},
109 ann_l => []
110 }, $class );
112 return $self;
115 =head2 convert
117 Title : convert
118 Usage : @seqs = $seqHandler->convert
119 Function: converts the main XML sequence element and associated annotations to Bio::
120 Returns : a ref. to a an array containing the sequence object and a ref. to a list of features
121 Args : none
123 Note : The features and sequence are kept apart to facilitate downstream filtering of features
125 =cut
127 sub convert {
128 my $self = shift;
129 my @ann = @{$self->{anns}} if defined $self->{anns};;
130 my @seq = @{$self->{seqs}};
132 # not used yet
133 my @comp;
134 if ( $self->{comps} ) {
135 @comp = @{$self->{comps}}
138 # process the sequence elements
139 for ( @seq ) {
140 $self->_add_seq( $_ );
143 # process the annotation elements
144 for ( @ann ) {
145 $self->_annotation( $_ );
148 return $self->_order_feats( $self->{seq_h} );
151 =head2 _order_feats
153 Title : _order_feats
154 Usage : $self->_order_feats( $self->{seq_h} )
155 Function: an internal method to ensure the source feature comes first
156 and keep gene, mRNA and CDS features together
157 Returns : a ref. to an array containing the sequence object and a ref. to a list of features
158 Args : a ref. to a hash of sequences
160 =cut
162 sub _order_feats {
163 my ($self, $seqs) = @_;
164 my $seq = $self->{main_seq};
165 my $id = $seq->id;
166 my $ann = $self->{ann_l};
168 # make sure source(s) come first
169 my @src = grep { $_->primary_tag =~ /source|origin|\bregion\b/ } @$ann;
170 # preserve gene->mRNA->CDS or ncRNA->gene->transcript order
171 my @genes = grep { $_->primary_tag =~ /gene|CDS|[a-z]+RNA|transcript/ } @$ann;
172 my @other = sort { $a->start <=> $b->start || $b->end <=> $a->end }
173 grep { $_->primary_tag !~ /source|origin|\bregion\b/ }
174 grep { $_->primary_tag !~ /gene|mRNA|CDS/ } @$ann;
176 return [$seq, [@src, @genes, @other]];
179 =head2 _add_seq
181 Title : _add_seq
182 Usage : $self->_add_seq($seq_element)
183 Function: an internal method to process the sequence elements
184 Returns : nothing
185 Args : a sequence element
187 =cut
189 sub _add_seq {
190 my ($self, $el) = @_;
191 my $residues = '';
193 if ($el->{_residues}) {
194 $residues = $el->{_residues}->{Characters};
195 $residues =~ s/[ \n\r]//g;
196 $residues =~ s/\!//g;
197 $residues =~ tr/a-z/A-Z/;
199 else {
200 return 0;
203 my $id = $el->{Attributes}->{id};
204 my $ver = $el->{Attributes}->{version};
205 my $name = $el->{_name}->{Characters};
207 if ($name && $name ne $id) {
208 $self->complain("The sequence name and unique ID do not match. Using ID");
211 # get/set the sequence object
212 my $seq = $self->_seq($id);
214 # get/set the feature handler
215 my $featHandler = $self->_feat_handler;
217 # populate the sequence object
218 $seq->seq($residues);
219 $seq->seq_version($ver) if $ver;
221 # assume the id is the accession number
222 if ( $id =~ /^\w+$/ ) {
223 $seq->accession($id);
226 # If the focus attribute is set to "true", this is the main
227 # sequence
228 my $focus = 0;
229 if ( defined $el->{Attributes}->{focus} ) {
230 $self->{main_seq} = $seq;
231 $focus++;
234 # make sure real and annotated lengths match
235 my $length = $el->{Attributes}->{'length'};
236 $length && $seq->length(int($length));
237 if ( $seq->seq && defined($length) && $seq->length != int($length) ) {
238 $self->complain("The specified sequence has length ", $seq->length(),
239 " but the length attribute= ", $length);
240 $seq->seq( undef );
241 $seq->length( int($length) );
244 # deal with top-level annotations
245 my $tags = {};
246 if ( $el->{Attributes}->{md5checksum} ) {
247 $tags->{md5checksum} = [$el->{Attributes}->{md5checksum}];
249 if ($el->{_dbxref}) {
250 $tags->{dbxref} ||= [];
251 push @{$tags->{dbxref}}, $self->dbxref( $el->{_dbxref} );
253 if ($el->{_description}) {
254 my $desc = $el->{_description}->{Characters};
255 $seq->description( $desc );
257 if ($el->{_organism}) {
258 my @organism = split /\s+/, $el->{_organism}->{Characters};
259 if (@organism < 2) {
260 $self->complain("Species name should have at least two words");
262 else {
263 my $species = Bio::Species->new( -classification => [reverse @organism] );
264 $seq->species($species);
267 if ( defined($seq->species) ) {
268 $tags->{organism} = [$seq->species->binomial];
270 # elsif ($seq eq $self->{main_seq}) {
271 # $self->warn("The source organism for this sequence was\n" .
272 # "not specified. I will guess Drosophila melanogaster.\n" .
273 # "Otherwise, add <organism>Genus species</organism>\n" .
274 # "to the main sequence element");
275 # my @class = qw/ Eukaryota Metazoa Arthropoda Insecta Pterygota
276 # Neoptera Endopterygota Diptera Brachycera
277 # Muscomorpha Ephydroidea Drosophilidae Drosophila melanogaster/;
278 # my $species = Bio::Species->new( -classification => [ reverse @class ],
279 # -common_name => 'fruit fly' );
280 # $seq->species( $species );
283 # convert GAME to bioperl molecule types
284 my $alphabet = $el->{Attributes}->{type};
285 if ( $alphabet ) {
286 $alphabet =~ s/aa/protein/;
287 $alphabet =~ s/cdna/rna/;
288 $seq->alphabet($alphabet);
291 # add a source feature if req'd
292 if ( !$self->{has_source} && $focus ) {
293 #$self->{source} = $featHandler->add_source($seq->length, $tags);
296 if ( $focus ) {
297 # add the map position
298 $self->_map_position( $self->{map_pos}, $seq );
299 $featHandler->{offset} = $self->{offset};
302 # prune the sequence from the parse tree
303 $self->flush;
306 =head2 _map_position
308 Title : _map_position
309 Usage : $self->_map_position($map_posn_element)
310 Function: an internal method to process the <map_position> element
311 Returns : nothing
312 Args : a map_position element
314 =cut
316 sub _map_position {
317 my ($self, $el) = @_;
319 # we can live without it
320 if ( !$el ) {
321 $self->{offset}= 0;
322 return 0;
326 # chromosome and coordinates
327 my $arm = $el->{_arm}->{Characters};
328 my $type = $el->{Attributes}->{type};
329 my $loc = $el->{_span};
330 my $start = $loc->{_start}->{Characters};
331 my $end = $loc->{_end}->{Characters};
333 # define the offset (may be a partial sequence)
334 # The coordinates will be relative but the CDS description
335 # coordinates may be absolute if the game-XML comes from apollo
336 # or gadfly
337 $self->{offset} = $start - 1;
339 my $seq_id = $el->{Attributes}->{seq};
340 my $seq = $self->{seq_h}->{$seq_id};
342 unless ( $seq ) {
343 $self->throw("Map position with no corresponding sequence object");
345 unless ($seq eq $self->{main_seq}){
346 $self->throw("Map position does not correspond to the main sequence");
349 my $species = '';
351 # create/update the top-level sequence feature if req'd
352 if ( $self->{source} ) {
353 my $feat = $self->{source};
355 unless ($feat->has_tag('organism')) {
356 $species = eval {$seq->species->binomial} || 'unknown species';
357 $feat->add_tag_value( organism => $species );
360 my %tags = ( mol_type => "genomic dna",
361 chromosome => $arm,
362 location => "$start..$end",
363 type => $type
366 for (keys %tags) {
367 $feat->add_tag_value( $_ => $tags{$_} );
370 $seq->add_SeqFeature($feat);
373 # come up with a description if there is none
374 my $desc = $seq->description;
375 if ( $species && $arm && $start && $end && !$desc) {
376 $seq->description("$species chromosome $arm $start..$end " .
377 "segment of complete sequence");
380 $self->flush;
383 =head2 _annotation
385 Title : _annotation
386 Usage : $self->_annotation($annotation_element)
387 Function: an internal method to process <annotation> elements
388 Returns : nothing
389 Args : an annotation element
391 =cut
393 sub _annotation {
394 my ($self, $el) = @_;
396 my $id = $el->{Attributes}->{id};
397 my $type = $el->{_type}->{Characters};
398 my $tags = {};
399 my $gname = $el->{_name}->{Characters} eq $id ? '' : $el->{_name}->{Characters};
401 # 'transposable element' is too long (breaks Bio::SeqIO::GenBank)
402 # $type =~ s/transposable_element/repeat_region/;
404 # annotations must be on the main sequence
405 my $seqid = $self->{main_seq}->id;
406 my $featHandler = $self->_feat_handler;
408 my @feats = ();
410 for my $child ( @{$el->{Children}} ) {
411 my $name = $child->{Name};
413 # these elements require special handling
414 if ( $name eq 'dbxref' ) {
415 $tags->{dbxref} ||= [];
416 push @{$tags->{dbxref}}, $self->dbxref( $child );
418 elsif ( $name eq 'aspect' ) {
419 $tags->{dbxref} ||= [];
420 push @{$tags->{dbxref}}, $self->dbxref( $child->{_dbxref} );
422 elsif ( $name eq 'feature_set' ) {
423 push @feats, $featHandler->feature_set( $id, $gname, $child, $type );
425 elsif ( $name eq 'comment' ) {
426 $tags->{comment} = [$self->comment( $child )];
428 elsif ( $name eq 'property' ) {
429 $self->property( $child, $tags );
431 elsif ( $name eq 'gene' ) {
432 # we may be dealing with an annotation that is not
433 # a gene, so we have to nest the gene inside it
434 $featHandler->has_gene( $child, $gname, $id )
437 # otherwise, tag/value pairs
438 # -- mild dtd enforcement
439 # synonym is not in the dtd but shows up in gadfly
440 # annotations
441 elsif ( $name =~ /type|synonym/ ) {
442 $tags->{$name} = [$child->{Characters}];
444 elsif ( $name ne 'name' ) {
445 $self->complain("Unrecognized element '$name'. I don't " .
446 "know what to do with $name elements in " .
447 "top-level sequence annotations." );
452 # add a gene annotation if required
453 unless ( $featHandler->has_gene || $type ne 'gene' ) {
454 $featHandler->has_gene( $el, $gname, $id )
457 if ( $tags->{symbol} ) {
458 if ( !$tags->{gene} ) {
459 $tags->{gene} = $tags->{symbol};
461 delete $tags->{symbol};
465 $featHandler->add_annotation( $self->{main_seq}, $type, $id, $tags, \@feats );
466 $self->flush;
469 # get/set the sequence object
471 =head2 _seq
473 Title : _seq
474 Usage : my $seq = $self->_seq
475 Function: an internal sequence getter/setter
476 Returns : a Bio::RichSeq object
477 Args : a sequence ID
479 =cut
481 sub _seq {
482 my ($self, $id) = @_;
483 $id || $self->throw("A unique id must be provided for the sequence");
485 my $seq = {};
487 if ( defined $self->{seq_h}->{$id}) {
488 $seq = $self->{seq_h}->{$id};
489 } else {
490 $seq = Bio::Seq::RichSeq->new( -id => $id );
491 $self->{seq_h}->{$id} = $seq; # store it
494 return $seq;
497 #get/set the feature handler
499 =head2 _feat_handler
501 Title : _feat_handler
502 Usage : my $featHandler = $self->_featHandler
503 Function: an internal getter/setter for feature handling objects
504 Returns : a Bio::SeqIO::game::featHandler object
505 Args : none
507 =cut
509 sub _feat_handler {
510 my $self = shift;
512 my $handler = {};
513 my $seq = $self->{main_seq};
515 if ( defined $self->{feat_handler} ) {
516 $handler = $self->{feat_handler};
518 else {
519 my @args = ( $seq, $self->{seq_h}, $self->{ann_l} );
520 $handler = Bio::SeqIO::game::featHandler->new( @args );
521 $self->{feat_handler} = $handler;
524 return $handler;