sync w/ main trunk
[bioperl-live.git] / Bio / Variation / IO / xml.pm
blobea8e21b9dfeb4fe68c1a278f8f8669e759e6ec11
1 # $Id$
2 # BioPerl module for Bio::Variation::IO::xml
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Variation::IO::xml - XML sequence variation input/output stream
19 =head1 SYNOPSIS
21 Do not use this module directly. Use it via the Bio::Variation::IO class.
23 =head1 DESCRIPTION
25 This object can transform L<Bio::Variation::SeqDiff> objects to and from XML
26 file databases.
28 The XML format, although consistent, is still evolving. The current
29 DTD for it is at L<http://www.ebi.ac.uk/mutations/DTDE/seqDiff.dtd>.
31 =head1 REQUIREMENTS
33 To use this code you need the module L<XML::Twig> which creates an
34 interface to L<XML::Parser> to read XML and modules L<XML::Writer> and
35 L<IO::String> to write XML out.
37 =head1 FEEDBACK
39 =head2 Mailing Lists
41 User feedback is an integral part of the evolution of this and other
42 Bioperl modules. Send your comments and suggestions preferably to the
43 Bioperl mailing lists Your participation is much appreciated.
45 bioperl-l@bioperl.org - General discussion
46 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 =head2 Support
50 Please direct usage questions or support issues to the mailing list:
52 L<bioperl-l@bioperl.org>
54 rather than to the module maintainer directly. Many experienced and
55 reponsive experts will be able look at the problem and quickly
56 address it. Please include a thorough description of the problem
57 with code and data examples if at all possible.
59 =head2 Reporting Bugs
61 report bugs to the Bioperl bug tracking system to help us keep track
62 the bugs and their resolution. Bug reports can be submitted via the
63 web:
65 http://bugzilla.open-bio.org/
67 =head1 AUTHOR - Heikki Lehvaslaiho
69 Email: heikki-at-bioperl-dot-org
71 =head1 APPENDIX
73 The rest of the documentation details each of the object
74 methods. Internal methods are usually preceded with a _
76 =cut
78 # Let the code begin...
80 package Bio::Variation::IO::xml;
82 use vars qw($seqdiff $var $prevdnaobj $prevrnaobj $prevaaobj);
83 use strict;
85 use XML::Twig;
86 use XML::Writer 0.4;
87 use IO::String;
88 use Bio::Variation::SeqDiff;
89 use Bio::Variation::DNAMutation;
90 use Bio::Variation::RNAChange;
91 use Bio::Variation::AAChange;
92 use Bio::Variation::Allele;
94 use base qw(Bio::Variation::IO);
96 # _initialize is where the heavy stuff will happen when new is called
98 sub new {
99 my ($class,@args) = @_;
100 my $self = bless {}, $class;
101 $self->_initialize(@args);
102 return $self;
105 sub _initialize {
106 my($self,@args) = @_;
107 return unless $self->SUPER::_initialize(@args);
110 =head2 next
112 Title : next
113 Usage : $haplo = $stream->next()
114 Function: returns the next seqDiff in the stream
115 Returns : Bio::Variation::SeqDiff object
116 Args : NONE
118 =cut
121 sub _seqDiff {
122 my ($t, $term)= @_;
123 $seqdiff->id( $term->att('id') );
124 $seqdiff->alphabet( $term->att('moltype') );
125 $seqdiff->offset( $term->att('offset') );
127 foreach my $child ($term->children) {
128 _variant($t, $child);
132 sub _variant {
133 my ($t, $term)= @_;
134 my $var;
135 my $att = $term->atts();
136 my ($variation_number, $change_number) = split /\./, $att->{number};
138 # if more than two alleles
139 if ($variation_number and $change_number and $change_number > 1 ) {
140 my $a3 = Bio::Variation::Allele->new;
141 $a3->seq( $term->first_child_text('allele_mut') )
142 if $term->first_child_text('allele_mut');
143 if ($term->gi eq 'DNA') {
144 $prevdnaobj->add_Allele($a3);
146 elsif ($term->gi eq 'RNA') {
147 $prevrnaobj->add_Allele($a3);
148 } else { # AA
149 $prevaaobj->add_Allele($a3);
151 } else { # create new variants
152 if ($term->gi eq 'DNA') {
153 $var = Bio::Variation::DNAMutation->new();
155 elsif ($term->gi eq 'RNA') {
156 $var = Bio::Variation::RNAChange->new();
157 } else { # AA
158 $var = Bio::Variation::AAChange->new();
161 # these are always present
162 $var->start( $att->{start} );
163 $var->end( $att->{end});
164 $var->length($att->{len});
165 $var->mut_number( $att->{number});
166 $var->upStreamSeq($term->first_child_text('upFlank'));
167 $var->dnStreamSeq($term->first_child_text('dnFlank'));
168 $var->proof($term->first_child_text('proof'));
170 # region
171 my $region = $term->first_child('region');
172 if ($region) {
173 $var->region($region->text);
174 my $region_atts = $region->atts;
175 $var->region_value( $region_atts->{value} )
176 if $region_atts->{value};
177 $var->region_dist( $region_atts->{dist} )
178 if $region_atts->{dist};
181 # alleles
182 my $a1 = Bio::Variation::Allele->new;
183 $a1->seq($term->first_child_text('allele_ori') )
184 if $term->first_child_text('allele_ori');
185 $var->allele_ori($a1);
186 my $a2 = Bio::Variation::Allele->new;
187 $a2->seq($term->first_child_text('allele_mut') )
188 if $term->first_child_text('allele_mut');
189 $var->isMutation(1) if $term->att('isMutation');
190 $var->allele_mut($a2);
191 $var->add_Allele($a2);
192 $var->length( $term->att('length') );
193 $seqdiff->add_Variant($var);
195 # variant specific code
196 if ($term->gi eq 'DNA') {
197 $prevdnaobj = $var;
199 elsif ($term->gi eq 'RNA') {
200 my $codon = $term->first_child('codon');
201 if ($codon) {
202 my $codon_atts = $codon->atts;
203 $var->codon_table( $codon->att('codon_table') )
204 if $codon_atts->{codon_table} and $codon_atts->{codon_table} != 1;
205 $var->codon_pos( $codon->att('codon_pos') )
206 if $codon_atts->{codon_pos};
208 $prevdnaobj->RNAChange($var);
209 $var->DNAMutation($prevdnaobj);
210 $prevrnaobj = $var;
211 } else {
212 $prevrnaobj->AAChange($var);
213 $var->RNAChange($prevrnaobj);
214 $prevaaobj = $var;
219 sub next {
220 my( $self ) = @_;
222 local $/ = "</seqDiff>\n";
223 return unless my $entry = $self->_readline;
224 # print STDERR "|$entry|";
225 return unless $entry =~ /^\W*<seqDiff/;
227 $seqdiff = Bio::Variation::SeqDiff->new;
229 # create new parser object
230 my $twig_handlers = {'seqDiff' => \&_seqDiff };
231 my $t = new XML::Twig ( TwigHandlers => $twig_handlers,
232 KeepEncoding => 1 );
233 $t->parse($entry);
235 return $seqdiff;
238 =head2 write
240 Title : write
241 Usage : $stream->write(@haplos)
242 Function: writes the $seqDiff objects into the stream
243 Returns : 1 for success and 0 for error
244 Args : Bio::Variation::SeqDiff object
246 =cut
248 sub write {
249 my ($self,@h) = @_;
251 if( !defined $h[0] ) {
252 $self->throw("Attempting to write with no information!");
254 my $str;
255 my $output = IO::String->new($str);
256 my $w = new XML::Writer(OUTPUT => $output, DATA_MODE => 1, DATA_INDENT => 4 );
258 foreach my $h (@h) {
260 # seqDiff
262 $h->alphabet || $self->throw("Moltype of the reference sequence is not set!");
263 my $hasAA = 0;
264 foreach my $mut ($h->each_Variant) {
265 $hasAA = 1 if $mut->isa('Bio::Variation::AAChange');
267 if ($hasAA) {
268 $w->startTag("seqDiff",
269 "id" => $h->id,
270 "moltype" => $h->alphabet,
271 "offset" => $h->offset,
272 "sysname" => $h->sysname,
273 "trivname" => $h->trivname
275 } else {
276 $w->startTag("seqDiff",
277 "id" => $h->id,
278 "moltype" => $h->alphabet,
279 "offset" => $h->offset,
280 "sysname" => $h->sysname
283 my @allvariants = $h->each_Variant;
284 #print "allvars:", scalar @allvariants, "\n";
285 my %variants = ();
286 foreach my $mut ($h->each_Variant) {
287 #print STDERR $mut->mut_number, "\t", $mut, "\t",
288 #$mut->proof, "\t", scalar $mut->each_Allele, "\n";
289 push @{$variants{$mut->mut_number} }, $mut;
291 foreach my $var (sort keys %variants) {
292 foreach my $mut (@{$variants{$var}}) {
294 # DNA
296 if( $mut->isa('Bio::Variation::DNAMutation') ) {
297 $mut->isMutation(0) if not $mut->isMutation;
298 my @alleles = $mut->each_Allele;
299 my $count = 0;
300 foreach my $allele (@alleles) {
301 $count++;
302 my ($variation_number, $change_number) = split /\./, $mut->mut_number;
303 if ($change_number and $change_number != $count){
304 $mut->mut_number("$change_number.$count");
306 $mut->allele_mut($allele);
307 $w->startTag("DNA",
308 "number" => $mut->mut_number,
309 "start" => $mut->start,
310 "end" => $mut->end,
311 "length" => $mut->length,
312 "isMutation" => $mut->isMutation
314 if ($mut->label) {
315 foreach my $label (split ', ', $mut->label) {
316 $w->startTag("label");
317 $w->characters($label);
318 $w->endTag;
321 if ($mut->proof) {
322 $w->startTag("proof");
323 $w->characters($mut->proof );
324 $w->endTag;
326 if ($mut->upStreamSeq) {
327 $w->startTag("upFlank");
328 $w->characters($mut->upStreamSeq );
329 $w->endTag;
331 #if ( $mut->isMutation) {
332 #if ($mut->allele_ori) {
333 $w->startTag("allele_ori");
334 $w->characters($mut->allele_ori->seq) if $mut->allele_ori->seq ;
335 $w->endTag;
337 #if ($mut->allele_mut) {
338 $w->startTag("allele_mut");
339 $w->characters($mut->allele_mut->seq) if $mut->allele_mut->seq;
340 $w->endTag;
343 if ($mut->dnStreamSeq) {
344 $w->startTag("dnFlank");
345 $w->characters($mut->dnStreamSeq );
346 $w->endTag;
348 if ($mut->restriction_changes) {
349 $w->startTag("restriction_changes");
350 $w->characters($mut->restriction_changes);
351 $w->endTag;
353 if ($mut->region) {
354 if($mut->region_value and $mut->region_dist) {
355 $w->startTag("region",
356 "value" => $mut->region_value,
357 "dist" => $mut->region_dist
360 elsif($mut->region_value) {
361 $w->startTag("region",
362 "value" => $mut->region_value
365 elsif($mut->region_dist) {
366 $w->startTag("region",
367 "dist" => $mut->region_dist
369 } else {
370 $w->startTag("region");
372 $w->characters($mut->region );
373 $w->endTag;
375 $w->endTag; #DNA
379 # RNA
381 elsif( $mut->isa('Bio::Variation::RNAChange') ) {
382 $mut->isMutation(0) if not $mut->isMutation;
383 my @alleles = $mut->each_Allele;
384 my $count = 0;
385 foreach my $allele (@alleles) {
386 $count++;
387 my ($variation_number, $change_number) = split /\./, $mut->mut_number;
388 if ($change_number and $change_number != $count){
389 $mut->mut_number("$change_number.$count");
391 $mut->allele_mut($allele);
392 $w->startTag("RNA",
393 "number" => $mut->mut_number,
394 "start" => $mut->start,
395 "end" => $mut->end,
396 "length" => $mut->length,
397 "isMutation" => $mut->isMutation
400 if ($mut->label) {
401 foreach my $label (split ', ', $mut->label) {
402 $w->startTag("label");
403 $w->characters($label );
404 $w->endTag;
407 if ($mut->proof) {
408 $w->startTag("proof");
409 $w->characters($mut->proof );
410 $w->endTag;
412 if ($mut->upStreamSeq) {
413 $w->startTag("upFlank");
414 $w->characters($mut->upStreamSeq );
415 $w->endTag;
417 #if ( $mut->isMutation) {
418 if ($mut->allele_ori) {
419 $w->startTag("allele_ori");
420 $w->characters($mut->allele_ori->seq) if $mut->allele_ori->seq ;
421 $w->endTag;
423 if ($mut->allele_mut) {
424 $w->startTag("allele_mut");
425 $w->characters($mut->allele_mut->seq) if $mut->allele_mut->seq ;
426 $w->endTag;
429 if ($mut->dnStreamSeq) {
430 $w->startTag("dnFlank");
431 $w->characters($mut->dnStreamSeq );
432 $w->endTag;
434 if ($mut->region eq 'coding') {
435 if (! $mut->codon_mut) {
436 $w->startTag("codon",
437 "codon_ori" => $mut->codon_ori,
438 "codon_pos" => $mut->codon_pos
440 } else {
441 $w->startTag("codon",
442 "codon_ori" => $mut->codon_ori,
443 "codon_mut" => $mut->codon_mut,
444 "codon_pos" => $mut->codon_pos
447 $w->endTag;
449 if ($mut->codon_table != 1) {
450 $w->startTag("codon_table");
451 $w->characters($mut->codon_table);
452 $w->endTag;
455 if ($mut->restriction_changes) {
456 $w->startTag("restriction_changes");
457 $w->characters($mut->restriction_changes);
458 $w->endTag;
460 if ($mut->region) {
461 if($mut->region_value and $mut->region_dist) {
462 $w->startTag("region",
463 "value" => $mut->region_value,
464 "dist" => $mut->region_dist
467 elsif($mut->region_value) {
468 $w->startTag("region",
469 "value" => $mut->region_value
472 elsif($mut->region_dist) {
473 $w->startTag("region",
474 "dist" => $mut->region_dist
476 } else {
477 $w->startTag("region");
479 $w->characters($mut->region );
480 $w->endTag;
482 $w->endTag; #RNA
486 # AA
488 elsif( $mut->isa('Bio::Variation::AAChange') ) {
489 $mut->isMutation(0) if not $mut->isMutation;
490 my @alleles = $mut->each_Allele;
491 my $count = 0;
492 foreach my $allele (@alleles) {
493 $count++;
494 my ($variation_number, $change_number) = split /\./, $mut->mut_number;
495 if ($change_number and $change_number != $count){
496 $mut->mut_number("$change_number.$count");
498 $mut->allele_mut($allele);
499 $w->startTag("AA",
500 "number" => $mut->mut_number,
501 "start" => $mut->start,
502 "end" => $mut->end,
503 "length" => $mut->length,
504 "isMutation" => $mut->isMutation
507 if ($mut->label) {
508 foreach my $label (split ', ', $mut->label) {
509 $w->startTag("label");
510 $w->characters($label );
511 $w->endTag;
514 if ($mut->proof) {
515 $w->startTag("proof");
516 $w->characters($mut->proof );
517 $w->endTag;
519 #if ( $mut->isMutation) {
520 if ($mut->allele_ori) {
521 $w->startTag("allele_ori");
522 $w->characters($mut->allele_ori->seq) if $mut->allele_ori->seq;
523 $w->endTag;
525 if ($mut->allele_mut) {
526 $w->startTag("allele_mut");
527 $w->characters($mut->allele_mut->seq) if $mut->allele_mut->seq;
528 $w->endTag;
531 if ($mut->region) {
532 if($mut->region_value and $mut->region_dist) {
533 $w->startTag("region",
534 "value" => $mut->region_value,
535 "dist" => $mut->region_dist
538 elsif($mut->region_value) {
539 $w->startTag("region",
540 "value" => $mut->region_value
543 elsif($mut->region_dist) {
544 $w->startTag("region",
545 "dist" => $mut->region_dist
547 } else {
548 $w->startTag("region");
550 $w->characters($mut->region );
551 $w->endTag;
553 $w->endTag; #AA
559 $w->endTag;
562 $w->end;
563 $self->_print($str);
564 $output = undef;
565 return 1;