Bio/SeqIO/agave.pm

   1 # BioPerl module: Bio::SeqIO::agave
   2 #
   3 # AGAVE: Architecture for Genomic Annotation, Visualization and Exchange.
   4 #
   5 # You may distribute this module under the same terms as perl itself
   6 #
   7 # POD documentation - main docs before the code
   8 #
   9 # The original version of the module can be found here:
  10 # http://www.lifecde.com/products/agave/agave.pm
  11 #
  12 # The DTD for AGAVE XML can be located here:
  13 # http://www.lifecde.com/products/agave/schema/v2_3/agave.dtd
  14 #
  15 #
  16 =head1 NAME
  17
  18 Bio::SeqIO::agave - AGAVE sequence input/output stream.
  19
  20 =head1 SYNOPSIS
  21
  22 It is probably best not to use this object directly, but
  23 rather go through the SeqIO handler system. Go:
  24
  25   $in  = Bio::SeqIO->new('-file'   => "$file_in",
  26                          '-format' => 'EMBL');
  27
  28   $out = Bio::SeqIO->new('-file'   => ">$file_out",
  29                          '-format' => 'AGAVE');
  30
  31   while (my $seq = $in->next_seq){
  32         $out->write_seq($seq);
  33   }
  34
  35 =head1 DESCRIPTION
  36
  37 This object can transform Bio::Seq objects to agave xml file and
  38 vice-versa.  I (Simon) coded up this module because I needed a parser
  39 to extract data from AGAVE xml to be utilized by the GenQuire genome
  40 annotation system (See http://www.bioinformatics.org/Genquire).
  41
  42 ***NOTE*** At the moment, not all of the tags are implemented.  In
  43 general, I followed the output format for the XEMBL project
  44 http://www.ebi.ac.uk/xembl/
  45
  46 =cut
  47
  48 =head1 FEEDBACK
  49
  50 =head2 Mailing Lists
  51
  52 User feedback is an integral part of the evolution of this and other
  53 Bioperl modules. Send your comments and suggestions preferably to one
  54 of the Bioperl mailing lists.  Your participation is much appreciated.
  55
  56   bioperl-l@bioperl.org                  - General discussion
  57   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  58
  59 =head2 Reporting Bugs
  60
  61 Report bugs to the Bioperl bug tracking system to help us keep track
  62 of the bugs and their resolution.
  63 Bug reports can be submitted via the web:
  64
  65   http://bugzilla.open-bio.org/
  66
  67 =head1 AUTHOR - Simon K. Chan
  68
  69 Email:
  70
  71 =head1 APPENDIX
  72
  73 The rest of the documentation details each of the object
  74 methods. Internal methods are usually preceded with a _
  75
  76 =cut
  77
  78 # ===================
  79
  80
  81 # Let the code begin...
  82 package Bio::SeqIO::agave;
  83 use strict;
  84
  85 use IO::File;
  86
  87
  88 use Bio::SeqFeature::Generic;
  89 use Bio::Seq;
  90 use Bio::PrimarySeq;
  91 use Bio::Seq::SeqFactory;
  92 use Bio::Annotation::Reference;
  93 use Bio::Species;
  94
  95 use XML::Writer;
  96
  97 use Data::Dumper;
  98
  99 use base qw(Bio::SeqIO);
 100
 101 # ==================================================================================
 # Initialisation hook: chains to the parent class's _initialize, records the
 # '-file' argument and, for input streams, parses the AGAVE XML right away.
 #
 # Args    : the constructor's named arguments (only '-file' is examined here).
 # Returns : the value of the final assignment (0); callers should not rely
 #           on it.
 sub _initialize {
     my ($self, @args) = @_;

     # Run the initialiser of the parent class first.
     $self->SUPER::_initialize(@args);

     my %arg_for = @args;
     my $file    = $arg_for{'-file'};
     $self->{'file'} = $file;

     # A file name starting with '>' means the stream is being written, so
     # there is nothing to parse.  A missing '-file' is still treated as
     # input, exactly as before, but it is now tested with defined() so the
     # pattern match never sees an undefined value.
     if (!defined $file || $file !~ /^>/) {
         $self->_process;

         # Flag telling the rest of the module that the XML has been parsed.
         $self->{'parsed'} = 1;
     }

     $self->{'seqs_stored'} = 0;
 }
 121 # ==================================================================================
 122
 123 =head2 _process
 124
 125   Title    : _process
 126   Usage    : $self->_process
 127   Function : Parses the agave xml file.
 128   Args     : None.
 129   Returns  : Nothing.
 130   Note     : Method(s) that call(s) this method : _initialize
 131              Method(s) that this method calls   : _process_sciobj
 132              FIRST/START sub.
 133
 134 =cut
 135
 # Top-level parser.  Reads the stream line by line until </sciobj> is seen or
 # the input is exhausted, and appends every parsed <sciobj> element to
 # $self->{'sciobj'}.
 #
 # Args    : none.
 # Returns : nothing.
 # Throws  : if a DOCTYPE declaration names anything other than
 #           'sciobj' / 'sciobj.dtd'.
 sub _process {
     my ($self) = @_;

     LINE:
     while (1) {
         my $line = $self->_readline;

         # An undefined line means there is nothing left to read.  Without
         # this check a document lacking </sciobj> kept the loop spinning
         # forever, because the "skip empty line" test below also matches
         # undef.
         last LINE unless defined $line;

         # Skip empty and whitespace-only lines.
         next LINE unless $line;
         next LINE if $line =~ /^\s*$/;

         if ($line =~ /<\?xml version/o) {
             # XML declaration: nothing to record.
         }
         elsif ($line =~ /\<!DOCTYPE (\w+) SYSTEM "([\w\.]+)"\>/) {
             $self->throw("Error: This xml file is not in AGAVE format! DOCTYPE: $1 , SYSTEM: $2\n\n")
                 if $1 ne 'sciobj' || $2 ne 'sciobj.dtd';
         }
         elsif ($line =~ /<sciobj (.*)>/) {
             push @{$self->{'sciobj'}}, $self->_process_sciobj($1);
         }
         elsif ($line =~ /<\/sciobj>/) {
             last LINE;    # It is finished.
         }

         # Every other line (for instance the closing tags left behind by
         # the nested parsers) is deliberately ignored.
     }

     return;
 }
 178 # ==================================================================================
 179
 180 =head2 _process_sciobj
 181
 182   Title    : _process_sciobj
 183   Usage    : $self->_process_sciobj
 184   Function : Parses the data between the <sciobj></sciobj> tags.
 185   Args     : The string that holds the attributes for <sciobj>.
 186   Returns  : Data structure holding the values parsed between
 187              the <sciobj></sciobj> tags.
 188   Note     : Method(s) that call(s) this method : _process
 189              Method(s) that this method calls   :
 190              _helper_store_attribute_list , _process_contig
 191
 192 =cut
 193
 # Builds the data structure for one <sciobj> element: the attributes of the
 # opening tag plus every <contig> child that follows it.
 #
 # Args    : the attribute text of the <sciobj> tag.
 # Returns : the populated data structure.
 sub _process_sciobj {
     my ($self, $attribute_line) = @_;

     my $parsed;
     $self->_helper_store_attribute_list($attribute_line, \$parsed);

     # The contig parser works on a reference to this variable and leaves the
     # first line it did not handle in it, which is what the loop re-tests.
     my $current = $self->_readline;

     # Zero or more <contig> elements.
     while (my ($contig_attributes) = $current =~ /<contig\s?(.*?)\s?>/) {
         my $contig = $self->_process_contig(\$current, $contig_attributes);
         push @{ $parsed->{'contig'} }, $contig;
     }

     return $parsed;
 }
 212 # ==================================================================================
 213
 214 =head2 _process_contig
 215
 216   Title    : _process_contig
 217   Usage    : $self->_process_contig
 218   Function : Parses the data between the <contig></contig> tags.
 219   Args     : 2 scalars:
 220              - reference to a scalar holding the line to be parsed.
 221              - scalar holding the attributes for the <contig> tag
 222                to be parsed.
 223   Returns  : Data structure holding the values parsed between
 224              the <contig></contig> tags.
 225   Note     : Method(s) that call(s) this method : _process_sciobj
 226              Method(s) that this method calls   :
 227              _helper_store_attribute_list, _one_tag , _process_fragment_order
 228
 229 =cut
 230
 # Parses one <contig> element.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - attribute text of the <contig> tag.
 # Returns : data structure with the contig's attributes, its <db_id> and its
 #           <fragment_order> children.
 sub _process_contig {
     my ($self, $line, $attribute_line) = @_;

     my $contig_data;
     my $store = \$contig_data;    # every helper fills the structure through this

     $self->_helper_store_attribute_list($attribute_line, $store);

     # Move on to the first line inside the element.
     ${$line} = $self->_readline;

     # Exactly one <db_id>, then zero or more <fragment_order>.
     $self->_one_tag($line, $store, 'db_id');
     $self->_process_fragment_order($line, $store);

     return $contig_data;
 }
 249 # ==================================================================================
 250
 251 =head2 _process_fragment_order
 252
 253   Title    : _process_fragment_order
 254   Usage    : $self->_process_fragment_order
 255   Function : Parses the data between the <fragment_order></fragment_order> tags.
 256   Args     : 2 scalars:
 257              - reference to a scalar holding the value of the line to be parsed.
 258              - reference to a data structure to store the <fragment_order> data.
 259   Returns  : Nothing.
 260   Note     : Method(s) that call(s) this method : _process_contig
 261              Method(s) that this method calls   :
 262              _helper_store_attribute_list , _process_fragment_orientation
 263
 264 =cut
 265
 # Parses zero or more consecutive <fragment_order> elements and appends each
 # one to the 'fragment_order' list of the caller's data structure.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - reference to the data structure being filled.
 # Returns : nothing; the result is delivered through the second argument.
 sub _process_fragment_order {
     my ($self, $line, $data_structure) = @_;

     while (my ($attributes) = ${$line} =~ /<fragment_order\s?(.*?)\s?>/) {

         # Attributes of the opening tag go into a fresh structure.
         my $order_data;
         $self->_helper_store_attribute_list($attributes, \$order_data);

         # One or more <fragment_orientation> children follow.
         ${$line} = $self->_readline;
         $self->_process_fragment_orientation($line, \$order_data);

         push @{ ${$data_structure}->{'fragment_order'} }, $order_data;
     }

     return;
 }
 294 # ==================================================================================
 295
 296 =head2 _process_fragment_orientation
 297
 298   Title    : _process_fragment_orientation
 299   Usage    : $self->_process_fragment_orientation
 300   Function : Parses the data between the <fragment_orientation> and
 301              </fragment_orientation> tags.
 302   Args     : 2 scalars:
 303              - reference to a scalar holding the value of the line to be parsed.
 304              - reference to a data structure to store the <fragment_orientation> data.
 305   Returns  : Nothing.
 306   Note     : Method(s) that call(s) this method : _process_fragment_order
 307
 308 Method(s) that this method calls : _helper_store_attribute_list ,
 309 _process_bio_sequence
 310
 311 =cut
 312
 # Parses one or more consecutive <fragment_orientation> elements, each of
 # which must contain a <bio_sequence>, and appends them to the
 # 'fragment_orientation' list of the caller's data structure.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - reference to the data structure being filled.
 # Returns : nothing; the result is delivered through the second argument.
 # Throws  : if no <fragment_orientation> is found, or if one of them is not
 #           followed by a <bio_sequence> tag.
 sub _process_fragment_orientation {
     my ($self, $line, $data_structure) = @_;

     # Number of <fragment_orientation> elements handled so far.
     my $count = 0;

     while (defined $$line && $$line =~ /<fragment_orientation\s?(.*?)\s?>/) {

         my $fragment_orientation;
         $self->_helper_store_attribute_list($1, \$fragment_orientation);
         $$line = $self->_readline;

         # Exactly one <bio_sequence>.  The match has to be checked: after a
         # failed match $1 still holds the <fragment_orientation> attributes
         # captured above, which used to be handed on as if they belonged to
         # the <bio_sequence> tag.
         unless (defined $$line && $$line =~ /<bio_sequence\s?(.*?)\s?>/) {
             my $got = defined $$line ? $$line : '';
             $self->throw("Error: Missing <bio_sequence> tag.  Got this: $got\n\n");
         }
         my $bio_sequence = $self->_process_bio_sequence($line, $1);
         $fragment_orientation->{'bio_sequence'} = $bio_sequence;

         push @{$$data_structure->{'fragment_orientation'}}, $fragment_orientation;

         ++$count;
     }

     if ($count == 0) {
         my $got = defined $$line ? $$line : '';
         $self->throw("Error: Missing <fragment_orientation> tag.  Got this: $got\n\n");
     }

     return;
 }
 346 # ==================================================================================
 347
 348 =head2 _process_bio_sequence
 349
 350   Title    : _process_bio_sequence
 351   Usage    : $self->_process_bio_sequence
 352   Function : Parses the data between the <bio_sequence></bio_sequence> tags.
 353   Args     : 2 scalars:
 354              - reference to a scalar holding the value of the line to be parsed.
 355              - scalar holding the value of the attributes for <bio_sequence>
 356   Returns  : data structure holding the values between <bio_sequence></bio_sequence>
 357   Note     : Method(s) that call(s) this method : _process_fragment_orientation
 358
 359 Method(s) that this method calls : _helper_store_attribute_list ,
 360 _one_tag , _question_mark_tag , _star_tag , _process_alt_ids ,
 361 _process_xrefs , _process_sequence_map
 362
 363 =cut
 364
 # Parses the content of one <bio_sequence> element.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - attribute text of the <bio_sequence> tag.
 # Returns : data structure holding the attributes and the parsed children.
 sub _process_bio_sequence {
     my ($self, $line, $attribute_line) = @_;

     my $bioseq;
     my $store = \$bioseq;    # the helpers fill the structure through this

     $self->_helper_store_attribute_list($attribute_line, $store);
     ${$line} = $self->_readline;

     # Simple children, in the order they are expected.  The handler name
     # reflects how often the tag may occur.
     my @simple_children = (
         [ '_one_tag',           'db_id'       ],    # exactly one
         [ '_question_mark_tag', 'note'        ],    # zero or one
         [ '_question_mark_tag', 'description' ],    # handled as zero or one
         [ '_star_tag',          'keyword'     ],    # zero or more
         [ '_question_mark_tag', 'sequence'    ],    # zero or one
     );
     for my $child (@simple_children) {
         my ($handler, $tag) = @{$child};
         $self->$handler($line, $store, $tag);
     }

     # <alt_ids> (zero or one) is NOT IMPLEMENTED.

     # Zero or one <xrefs>; an empty result is recorded as the string 'null'.
     if (${$line} =~ /<xrefs\s?(.*?)\s?>/) {
         my $xrefs = $self->_process_xrefs($line, $store);
         $bioseq->{'xrefs'} = $xrefs || 'null';
     }

     # <sequence_map>: the called parser loops over consecutive elements
     # itself, so a single test is enough here.
     if (${$line} =~ /<sequence_map\s?(.*?)\s?>/) {
         my $sequence_map = $self->_process_sequence_map($line);
         push @{ $bioseq->{'sequence_map'} }, $sequence_map;
     }

     return $bioseq;
 }
 421 # ==================================================================================
 422
 423 =head2 _process_xrefs
 424
 425   Title    : _process_xrefs
 426   Usage    : $self->_process_xrefs
 427   Function : Parse the data between the <xrefs></xrefs> tags.
 428   Args     : reference to a scalar holding the value of the line to be parsed.
 429   Returns  : Data structure holding the parsed <db_id> and <xref> values.
 430   Note     : Method(s) that call(s) this method: _process_bio_sequence
 431              Method(s) that this method calls: _one_tag , _process_xref
 432
 433 =cut
 434
 # Parses the content of an <xrefs> element: one or more <db_id> / <xref>
 # children followed by the closing </xrefs> tag.
 #
 # Args    : reference to the scalar holding the current line (the <xrefs>
 #           opening tag on entry, the line after </xrefs> on return).
 # Returns : data structure with the lists 'db_id' and/or 'xref'.
 # Throws  : if there is no <db_id>/<xref> child or no closing </xrefs> tag.
 sub _process_xrefs {
     my ($self, $line) = @_;

     my $xrefs;
     my $children = 0;

     $$line = $self->_readline;

     # One or more <db_id> or <xref>.  Counting the iterations replaces the
     # former guard /<db_id|xref\s?(.*?)\s?>/, whose alternation was not
     # grouped: it also matched "</xrefs>", so an empty <xrefs> element was
     # never reported.
     while (defined $$line && $$line =~ /<(db_id|xref)\s?(.*?)\s?>/) {

         if ($1 eq 'db_id') {
             my $db_id;
             $self->_one_tag($line, \$db_id, 'db_id');
             push @{$xrefs->{'db_id'}}, $db_id;
         }
         else {
             # The pattern only admits 'db_id' or 'xref', so this is <xref>.
             my $xref;
             $self->_process_xref($line, \$xref);
             push @{$xrefs->{'xref'}}, $xref;
         }

         ++$children;
     }

     my $got = defined $$line ? $$line : '';

     $self->throw("Error: Missing <db_id> or <xref> tag.  Got this: $got\n\n")
         if $children == 0;

     $self->throw("Error: Missing </xrefs> tag.  Got this: $got\n\n")
         unless $got =~ /<\/xrefs>/;

     # Hand the line after </xrefs> to whichever parser runs next.
     $$line = $self->_readline;

     return $xrefs;
 }
 487 # ==================================================================================
 488
 489 =head2 _process_xref
 490
 491   Title    : _process_xref
 492   Usage    : $self->_process_xref
 493   Function : Parses the data between the <xref></xref> tags.
 494   Args     : 2 scalars:
 495              - reference to a scalar holding the value of the line to be parsed.
 496              - reference to a data structure to store the <xref> data.
 497   Returns  : Nothing.
 498   Note     : Method(s) that call(s) this method : _process_xrefs (note the 's' in 'xrefs')
 499              Method(s) that this method calls   : _helper_store_attribute_list , _star_tag
 500
 501 =cut
 502
 # Parses the content of one <xref> element: a mandatory <db_id>, whose
 # attributes are stored in the caller's structure, then any <xref_property>
 # tags.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - reference to the data structure that receives the <xref> data.
 # Returns : nothing; the result is delivered through the second argument.
 # Throws  : if the line following <xref> holds no <db_id> tag.
 sub _process_xref {
     my ($self, $line, $xref) = @_;

     $$line = $self->_readline;

     # One <db_id>
     if (defined $$line && $$line =~ /<db_id\s?(.*?)\s?>/) {
         $self->_helper_store_attribute_list($1, $xref);
     }
     else {
         my $got = defined $$line ? $$line : '';
         $self->throw("Error:  Missing <db_id> tag.  Got this: $got\n\n");
     }

     # Zero or more <xref_property>.  The tag name used to be misspelled
     # ('xref_propery'), so it could never match the element it names.
     # NOTE(review): the current line still holds the <db_id> tag here;
     # confirm whether _star_tag expects it to be advanced first.
     $self->_star_tag($line, $xref, 'xref_property');

     return;
 }
 523 # ==================================================================================
 524
 525 =head2 _process_sequence_map
 526
 527   Title    : _process_sequence_map
 528   Usage    : $self->_process_sequence_map
 529   Function : Parses the data between the <sequence_map></sequence_map> tags.
 530   Args     : Reference to scalar holding the line to be parsed.
 531   Returns  : Data structure that holds the values that were parsed.
 532   Note     : Method(s) that call(s) this method : _process_bio_sequence
 533              Method(s) that this method calls   : _helper_store_attribute_list ,
 534                 _question_mark_tag , _process_annotations
 535
 536 =cut
 537
 # Parses <sequence_map> elements starting at the current line.
 #
 # Args    : reference to the scalar holding the current line.
 # Returns : data structure with the attributes, the optional <note> and the
 #           optional 'annotations' entry.
 # Throws  : if the line reached after parsing is not </sequence_map>.
 sub _process_sequence_map {
     my ($self, $line) = @_;

     my $map_data;
     my $store = \$map_data;    # the helpers fill the structure through this

     # Zero or more <sequence_map>
     while (my ($attributes) = ${$line} =~ /<sequence_map\s?(.*?)\s?>/) {

         $self->_helper_store_attribute_list($attributes, $store);
         ${$line} = $self->_readline;

         # Zero or one <note>
         $self->_question_mark_tag($line, $store, 'note');

         # <computations> is NOT IMPLEMENTED.

         # Zero or one <annotations>
         next unless ${$line} =~ /<annotations\s?(.*?)\s?>/;
         $map_data->{'annotations'} = $self->_process_annotations($line);
     }

     # The closing tag must be the line we are now looking at.
     $self->throw("Error:  Missing </sequence_map> tag.  Got this: $$line\n\n")
         unless ${$line} =~ /<\/sequence_map>/;

     return $map_data;
 }
 578 # ==================================================================================
 579
 580 =head2 _process_annotations
 581
 582   Title    : _process_annotations
 583   Usage    : $self->_process_annotations
 584   Function : Parse the data between the <annotations></annotations> tags.
 585   Args     : Reference to scalar holding the line to be parsed.
 586   Returns  : Data structure that holds the values that were parsed.
 587   Note     : Method(s) that call(s) this method : _process_sequence_map
 588              Method(s) that this method calls   : _process_seq_feature
 589
 590 =cut
 591
 # Parses the content of an <annotations> element:
 # ( seq_feature | gene | comp_result )+ followed by </annotations>.
 #
 # Only <seq_feature> is parsed.  <gene> and <comp_result> are not
 # implemented; such elements are skipped up to and including their closing
 # tag.  (They used to be matched without consuming any input, which made the
 # loop spin forever.)
 #
 # Args    : reference to the scalar holding the current line (the
 #           <annotations> tag on entry, the line after </annotations> on
 #           return).
 # Returns : data structure with the list 'seq_feature'; undefined when only
 #           skipped elements were present.
 # Throws  : if no child element is found or </annotations> is missing.
 sub _process_annotations {
     my ($self, $line) = @_;

     my $annotations;
     my $count = 0;    # number of child elements seen

     $$line = $self->_readline;

     # One or more of these:
     while (defined $$line && $$line =~ /<(seq_feature|gene|comp_result)\s?(.*?)\s?>/) {

         if ($$line =~ /<seq_feature\s?(.*?)\s?>/) {

             my $seq_feature = $self->_process_seq_feature($line, $1);
             push @{$annotations->{'seq_feature'}}, $seq_feature;

         }
         else {

             # Same precedence as before: <gene> is tested before
             # <comp_result>.
             my $tag = $$line =~ /<gene\s?(.*?)\s?>/ ? 'gene' : 'comp_result';

             # Skip the whole element.  Opening tags (self-closing ones
             # excluded) raise the depth and closing tags lower it, so nested
             # elements of the same name do not end the skip early.
             my $depth = 0;
             while (defined $$line) {
                 my $opened = () = $$line =~ m{<\Q$tag\E\b[^>]*(?<!/)>}g;
                 my $closed = () = $$line =~ m{</\Q$tag\E\s*>}g;
                 $depth += $opened - $closed;

                 $$line = $self->_readline;
                 last if $depth <= 0;
             }
         }

         ++$count;
     }

     my $got = defined $$line ? $$line : '';

     $self->throw("Error:  Missing <seq_feature> tag.  Got: $got\n\n") if $count == 0;

     # Match closing tag:
     $self->throw("Error:  Missing </annotations> tag.  Got this: $got\n\n")
         unless $got =~ /<\/annotations/;

     # Hand the line after </annotations> to whichever parser runs next.
     $$line = $self->_readline;

     return $annotations;
 }
 639 # ==================================================================================
 640
 641 =head2 _process_seq_feature
 642
 643   Title    : _process_seq_feature
 644   Usage    : $self->_process_seq_feature
 645   Function : Parses the data between the <seq_feature></seq_feature> tag.
 646   Args     : 2 scalars:
 647              - Reference to scalar holding the line to be parsed.
 648              - Scalar holding the attributes for <seq_feature>.
 649   Returns  : Data structure holding the values parsed.
 650   Note     : Method(s) that call(s) this method: _process_annotations
 651
 652 Method(s) that this method calls: _helper_store_attribute_list ,
 653 _process_classification , _question_mark_tag , _one_tag ,
 654 _process_evidence , _process_qualifier , _process_seq_feature ,
 655 _process_related_annot
 656
 657 =cut
 658
 # Parses one <seq_feature> element.
 #
 # Args    : - reference to the scalar holding the current line (the line
 #             after </seq_feature> on return);
 #           - attribute text of the <seq_feature> tag.
 # Returns : data structure holding the attributes and parsed children.
 # Throws  : if the closing </seq_feature> tag is not found where expected.
 sub _process_seq_feature {
     my ($self, $line, $attribute_line) = @_;

     my $seq_feature;
     $self->_helper_store_attribute_list($attribute_line, \$seq_feature);

     $$line = $self->_readline;

     # Zero or more <classification>
     $self->_process_classification($line, \$seq_feature);

     # Zero or one <note>
     $self->_question_mark_tag($line, \$seq_feature, 'note');

     # One <seq_location>
     $self->_one_tag($line, \$seq_feature, 'seq_location');

     # Zero or one <xrefs>
     $self->_question_mark_tag($line, \$seq_feature, 'xrefs');

     # Zero or one <evidence>
     $self->_process_evidence($line, \$seq_feature);

     # Zero or more <qualifier>
     $self->_process_qualifier($line, \$seq_feature);

     # Zero or more nested <seq_feature>.  The recursive call already leaves
     # the line that follows the nested </seq_feature> in $$line, so no extra
     # read may happen here: the former one skipped a line and made the
     # closing-tag check below fail.
     # NOTE(review): the nested result is still discarded, as before.
     while (defined $$line && $$line =~ /<seq_feature\s?(.*?)\s?>/) {
         $self->_process_seq_feature($line, $1);
     }

     # Zero or more <related_annot>.  _process_related_annot stores its
     # result under the 'related_annot' key of the structure it is given, so
     # it must receive the feature hash itself, not the attribute text of the
     # tag as it used to.
     while (defined $$line && $$line =~ /<related_annot\s?(.*?)\s?>/) {
         $seq_feature ||= {};
         $self->_process_related_annot($line, $seq_feature);

         # That parser stops on </related_annot>; step over it.
         $$line = $self->_readline;
     }

     # Match the closing tag:
     unless (defined $$line && $$line =~ /<\/seq_feature>/) {
         my $got = defined $$line ? $$line : '';
         $self->throw("Error.  Missing </seq_feature> tag.  Got this: $got\n");
     }

     $$line = $self->_readline;    # for the next sub...

     return $seq_feature;
 }
 727 # ==================================================================================
 728
 729 =head2 _process_qualifier
 730
 731   Title    : _process_qualifier
 732   Usage    : $self->_process_qualifier
 733   Function : Parse the data between the <qualifier></qualifier> tags.
 734   Args     : 2 scalars:
 735              - reference to a scalar holding the value of the line to be parsed.
 736              - reference to a data structure to store the <qualifier> data.
 737   Returns  : Nothing.
 738   Note     : Method(s) that call(s) this method : _process_seq_feature
 739              Method(s) that this method calls   : _star_tag
 740
 741 =cut
 742
 # Collects the <qualifier> tags found at the current position and appends
 # what was collected to the 'qualifier' list of the caller's data structure.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - reference to the data structure being filled.
 # Returns : nothing; the caller's structure is modified in place.
 sub _process_qualifier {
     my ($self, $line, $data_structure) = @_;

     # Zero or more <qualifier>, gathered into one value by _star_tag.
     my $parsed_qualifiers;
     $self->_star_tag($line, \$parsed_qualifiers, 'qualifier');

     # The value is appended even when nothing was gathered.
     push @{ ${$data_structure}->{'qualifier'} }, $parsed_qualifiers;

     return;
 }
 756 # ==================================================================================
 757
 758 =head2 _process_classification
 759
 760   Title   : _process_classification
 761   Usage   : $self->_process_classification
 762   Function: Parse the data between the <classification></classification> tags.
 763   Args    :   2 scalars:
 764             - reference to a scalar holding the value of the line to be parsed.
 765             - reference to a data structure to store the <classification> data.
 766   Returns : Nothing.
 767   Note    : Method(s) that call(s) this method: _process_seq_feature
 768
 769   Method(s) that this method calls: _helper_store_attribute_list ,
 770   _question_mark_tag , _star_tag, _process_evidence
 771
 772 =cut
 773
 # Parses zero or more consecutive <classification> elements and stores the
 # result under the 'classification' key of the caller's data structure.
 # Attributes of several elements are merged into that single entry.
 #
 # Earlier this sub never moved past the opening tag (so a <classification>
 # line made it loop forever) and never stored what it parsed.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - reference to the data structure being filled.
 # Returns : nothing; the caller's structure is modified in place.
 # Throws  : if an element that is not closed on its own line is not followed
 #           by </classification> after its children.
 sub _process_classification {
     my ($self, $line, $data_structure) = @_;

     while (defined $$line && $$line =~ /<classification\s?(.*?)\s?>/) {
         my $attributes = $1;
         my $opening    = $$line;

         # Continue with whatever an earlier <classification> left behind.
         my $classification = $$data_structure->{'classification'};
         $self->_helper_store_attribute_list($attributes, \$classification);

         # Step past the opening tag, as every other parser in this file does
         # after storing the attributes.
         $$line = $self->_readline;

         # An element closed on its opening line has no children to parse.
         my $closed_inline = $opening =~ m{/>\s*$} || $opening =~ m{</classification>};

         unless ($closed_inline) {

             # Zero or one <description>
             $self->_question_mark_tag($line, \$classification, 'description');

             # Zero or more <id_alias>
             $self->_star_tag($line, \$classification, 'id_alias');

             # Zero or one <evidence>
             $self->_process_evidence($line, \$classification);

             unless (defined $$line && $$line =~ m{</classification>}) {
                 my $got = defined $$line ? $$line : '';
                 $self->throw("Error.  Missing </classification> tag.  Got this: $got\n");
             }

             # Hand the line after </classification> to the next parser.
             $$line = $self->_readline;
         }

         $$data_structure->{'classification'} = $classification;
     }

     return;
 }
 796 # ==================================================================================
 797
 # Parses an optional <evidence> element (marked "NOT done" by the author).
 # Its <element_id> and <comp_result> children are appended to the lists of
 # the same names in the caller's data structure.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - reference to the data structure being filled.
 sub _process_evidence {
     my ($self, $line, $data_structure) = @_;

     if (${$line} =~ /<evidence>/) {

         ${$line} = $self->_readline;

         # One or more <element_id> OR one or more <comp_result>
         while (${$line} =~ /<(element_id|comp_result)\s?(.*?)\s?>/) {

             if (${$line} =~ /<element_id\s?(.*?)\s?>/) {
                 my $element;
                 $self->_plus_tag($line, \$element, 'element_id');
                 push @{ ${$data_structure}->{'element_id'} }, $element;
             }
             elsif (my ($result_attributes) = ${$line} =~ /<comp_result\s?(.*?)\s?>/) {
                 my $result;
                 $self->_process_comp_result($line, \$result, $result_attributes);
                 push @{ ${$data_structure}->{'comp_result'} }, $result;
             }

             # One more line is read after every child before the loop
             # condition is tested again.
             ${$line} = $self->_readline;
         }
     }
 }
 824 # ==================================================================================
 825
 # Parses the content of one <comp_result> element (marked "NOT IN USE" by
 # the author) by running the child parsers in a fixed order.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - reference to the data structure that receives the data;
 #           - attribute text of the <comp_result> tag.
 sub _process_comp_result {
     my ($self, $line, $comp_result, $attribute_line) = @_;

     $self->_helper_store_attribute_list($attribute_line, $comp_result);
     ${$line} = $self->_readline;

     # Each step is a method name plus the extra arguments it takes after
     # ($line, $comp_result).
     my @steps = (
         [ '_question_mark_tag',    'note'            ],    # zero or one
         [ '_question_mark_tag',    'match_desc'      ],    # zero or one
         [ '_question_mark_tag',    'match_align'     ],    # zero or one
         [ '_process_query_region'                    ],    # zero or one
         [ '_process_match_region'                    ],    # zero or one
         [ '_star_tag',             'result_property' ],    # zero or more
         [ '_process_result_group'                    ],    # zero or more
     );
     for my $step (@steps) {
         my ($method, @extra) = @{$step};
         $self->$method($line, $comp_result, @extra);
     }

     # Zero or more <related_annot>
     $self->_process_related_annot($line, $comp_result);
 }
 859 # ==================================================================================
 860
 # Parses a <related_annot> element: its attributes, one or more <element_id>
 # children and any <sci_property> tags.  The result is appended to the
 # 'related_annot' list of the caller's data structure.  On return the
 # current line is the closing </related_annot> tag, which is left for the
 # caller to step over.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - the structure to fill: either a hash reference or, as the
 #             other parsers in this file pass it, a reference to the scalar
 #             holding that hash.
 # Returns : nothing; the caller's structure is modified in place.
 # Throws  : if <element_id> or </related_annot> is missing, or if the second
 #           argument is not a reference.
 sub _process_related_annot {
     my ($self, $line, $data_structure) = @_;

     while (defined $$line && $$line =~ /<related_annot\s?(.*?)\s?>/) {

         my $related_annot;
         $self->_helper_store_attribute_list($1, \$related_annot);
         $$line = $self->_readline;

         # One or more <element_id>
         my $element_id_count = 0;
         while (defined $$line && $$line =~ /<element_id\s?(.*?)\s?>/) {
             my $element_id;
             $self->_helper_store_attribute_list($1, \$element_id);
             push @{$related_annot->{'element_id'}}, $element_id;
             $$line = $self->_readline;
             ++$element_id_count;
         }

         my $got = defined $$line ? $$line : '';

         if ($element_id_count == 0) {
             $self->throw("Error.  Missing <element_id> tag.  Got: $got");
         }

         # Zero or more <sci_property>
         $self->_star_tag($line, \$related_annot, 'sci_property');

         # The structure used to be dereferenced as a plain hash only, which
         # failed for the reference-to-scalar that _process_comp_result
         # passes.  Both forms are accepted now.
         my $kind = ref $data_structure;
         if ($kind eq 'HASH') {
             push @{ $data_structure->{'related_annot'} }, $related_annot;
         }
         elsif ($kind eq 'REF' || $kind eq 'SCALAR') {
             push @{ $$data_structure->{'related_annot'} }, $related_annot;
         }
         else {
             $self->throw("Error.  Cannot store <related_annot>: the data structure argument is not a reference.\n");
         }

         # The message used to name a non-existent "</related_tag>".
         $got = defined $$line ? $$line : '';
         unless ($got =~ /<\/related_annot>/) {
             $self->throw("Error.  Missing </related_annot>. Got: $got\n");
         }
     }

     return;
 }
 902 # ==================================================================================
 903
 # Parses zero or more consecutive <result_group> elements, each holding one
 # or more <comp_result> children, and appends every group to the
 # 'result_group' list of the caller's data structure.
 #
 # This sub could not work before: it passed a reference to the line
 # reference on to _process_comp_result, tested the closing tag against the
 # reference instead of the line, and never stored what it parsed.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - reference to the scalar holding the structure being filled.
 # Returns : nothing; the caller's structure is modified in place.
 # Throws  : if a group has no <comp_result> or no closing </result_group>.
 sub _process_result_group {
     my ($self, $line, $data_structure) = @_;

     while (defined $$line && $$line =~ /<result_group\s?(.*?)\s?>/) {

         my $result_group;
         $self->_helper_store_attribute_list($1, \$result_group);

         my $count = 0;
         $$line = $self->_readline;

         # One or more <comp_result>.  Each gets its own structure and a line
         # is read after it, the same way _process_evidence handles it.
         while (defined $$line && $$line =~ /<comp_result\s?(.*?)\s?>/) {
             my $comp_result;
             $self->_process_comp_result($line, \$comp_result, $1);
             push @{ $result_group->{'comp_result'} }, $comp_result;
             $$line = $self->_readline;
             ++$count;
         }

         my $got = defined $$line ? $$line : '';

         $self->throw("Error.  No <comp_result></comp_result> tag! Got this: $got")
             if $count == 0;

         # After the inner loop the current line must be the closing tag of
         # the group.
         unless ($got =~ /<\/result_group>/) {
             $self->throw("Error.  No </result_group>!  Got this: $got");
         }

         push @{ $$data_structure->{'result_group'} }, $result_group;

         $$line = $self->_readline;
     }

     return;
 }
 937 # ==================================================================================
 938
 # Parses an optional <match_region> element: its attributes and at most one
 # <db_id>, <element_id> or <bio_sequence> child.  The result is stored under
 # the 'match_region' key of the caller's data structure.
 #
 # Defects repaired here: the opening-tag pattern required two '>' on the
 # line, the attribute helper was called under a misspelled name
 # (_helper_store_attribute_line), the caller's structure was dereferenced at
 # the wrong level, an unconditional extra read stepped over the closing tag,
 # and the parsed data was never stored.
 #
 # Args    : - reference to the scalar holding the current line;
 #           - the structure to fill: a hash reference or a reference to the
 #             scalar holding that hash.
 # Returns : nothing; the caller's structure is modified in place.
 # Throws  : if the closing </match_region> tag is missing.
 sub _process_match_region {
     my ($self, $line, $data_structure) = @_;

     return unless defined $$line && $$line =~ /<match_region\s?(.*?)\s?>/;
     my $attributes = $1;

     # Normalise to "reference to the scalar that holds the hash".
     my $container = ref $data_structure eq 'HASH' ? \$data_structure : $data_structure;

     my $match_region = $$container->{'match_region'};
     $self->_helper_store_attribute_list($attributes, \$match_region);
     $$line = $self->_readline;

     # Zero or one db_id | element_id | bio_sequence
     if (defined $$line) {
         if ($$line =~ /<db_id\s?(.*?)\s?>(.*?)<\/db_id>/) {
             $self->_question_mark_tag($line, \$match_region, 'db_id');
         }
         elsif ($$line =~ /<element_id\s?(.*?)\s?>/) {    # empty...
             $self->_question_mark_tag($line, \$match_region, 'element_id');
         }
         elsif ($$line =~ /<bio_sequence\s?(.*?)\s?>/) {
             $match_region->{'bio_sequence'} = $self->_process_bio_sequence($line, $1);
         }
     }

     $$container->{'match_region'} = $match_region;

     # _process_bio_sequence does not consume the closing tags of what it
     # parsed, so step over lines that hold nothing but a closing tag until
     # </match_region> is reached.
     while (defined $$line
         && $$line !~ m{</match_region>}
         && $$line =~ m{^\s*</[^>]+>\s*$})
     {
         $$line = $self->_readline;
     }

     unless (defined $$line && $$line =~ m{</match_region>}) {
         my $got = defined $$line ? $$line : '';
         $self->throw("No closing tag </match_region>!  Got this: $got\n");
     }

     # Get the next line to be processed by the next sub.
     $$line = $self->_readline;

     return;
 }
 969 # ==================================================================================
 970
 971 sub _process_query_region { # NOT IN USE.
 972
 973     my ($self, $line, $data_structure) = @_;
 974
 975     my $query_region = $data_structure->{'query_region'};
 976     if ($$line =~ /<query_region\s?(.*?)\s?>/) {
 977         $self->_helper_store_attribute_list($1, \$query_region);
 978         $$line = $self->_readline;
 979
 980         # Zero or one <db_id>
 981         $self->_question_mark_tag($line, \$query_region, 'db_id');
 982
 983         if ($$line =~ /<\/query_region>/) {
 984             $$line = $self->_readline; # get the next line to _process.
 985             return;
 986         } else {
 987             $self->throw("No closing tag </query_region>.  Got this: $$line\n");
 988         }
 989
 990     }
 991
 992
 993 }
 994 # ==================================================================================
 995
 996 =head2 _tag_processing_helper
 997
 998   Title    : _tag_processing_helper
 999   Usage    : $self->_tag_processing_helper
1000   Function : Stores the tag value within the data structure.
1001              Also calls _helper_store_attribute_list to store the
1002              attributes and their values in the data structure.
1003   Args     : 5 scalars:
1004              - Scalar holding the value of the attributes
1005              - Reference to a data structure to store the data for <$tag_name>
1006              - Scalar holding the tag name.
1007              - Scalar holding the value of the tag.
1008              - Scalar holding the value of either 'star', 'plus',
1009                or 'question mark' which specifies what type of method
1010                called this method.
1011   Returns  : Nothing.
  Note     : Method(s) that call(s) this method: _one_tag,
             _question_mark_tag, _star_tag, _plus_tag
1013              Method(s) that this method calls: _helper_store_attribute_list
1014
1015 =cut
1016
sub _tag_processing_helper {

    # Stores one parsed tag in the data structure.
    #
    # Args    : - attribute string of the tag (may be undef)
    #           - reference to the data structure receiving the data
    #           - tag name
    #           - tag value
    #           - 'star' / 'plus' for repeatable tags (values are collected
    #             in an array); any other string for single-valued tags
    # Returns : Nothing.

    my ($self, $attribute_list, $data_structure, $tag_name, $tag_value, $caller) = @_;

    # Add the attributes to the $$data_structure if they exist.
    if (defined $attribute_list) {
        $self->_helper_store_attribute_list($attribute_list, $data_structure);
    }

    if ($caller eq 'star' || $caller eq 'plus') {
        # Zero or more tags (*) or one or more (+): keep every value.
        push @{$$data_structure->{$tag_name}}, $tag_value;
    } else {
        # Zero or one tag (?), or exactly one.  Only a missing or empty
        # value is replaced by the 'null' placeholder; a plain truth test
        # would also throw away a legitimate value of "0".
        $$data_structure->{$tag_name} =
            (defined $tag_value && length $tag_value) ? $tag_value : 'null';
    }

    return;

}
1039 # ==================================================================================
1040
1041 =head2 _one_tag
1042
1043   Title    : _one_tag
1044   Usage    : $self->_one_tag
  Function : A method to store data from tags that occur just once.
  Args     : 3 scalars:
             - reference to a scalar holding the value of the line to be parsed.
             - reference to a data structure to store the data for <$tag_name>
             - scalar holding the name of the tag.
1049   Returns  : Nothing.
1050   Note     : Method(s) that call(s) this method : many
1051              Method(s) that this method calls   : _tag_processing_helper
1052
1053 =cut
1054
sub _one_tag {

    # Handles a tag that must occur exactly once: stores its attributes and
    # value through _tag_processing_helper, then advances to the next line.

    my ($self, $line, $data_structure, $tag_name) = @_;

    # The opening tag has to be on the current line.
    unless ($$line =~ /\<$tag_name/) {
        $self->throw("Error:  Missing <$tag_name></$tag_name>.  Got: $$line\n\n");
    }

    if (my ($attributes, $content) =
            $$line =~ /<$tag_name\s?(.*?)\s?\/?>(.*?)<\/$tag_name>/) {

        # Opening and closing tag on the same line: attributes plus value.
        $self->_tag_processing_helper
            ($attributes, $data_structure, $tag_name, $content, 'one');

    } elsif (my ($lone_attributes) = $$line =~ /<$tag_name\s?(.*?)\s?\/?>/) {

        # Only an opening (or self-closing) tag: record an empty value.
        $self->_tag_processing_helper
            ($lone_attributes, $data_structure, $tag_name, '', 'one');

    } else {
        $self->throw("Error:  Cannot parse this line: $$line\n\n");
    }

    # Leave the following line ready for the caller.
    $$line = $self->_readline;

    return;

}
1084 # ==================================================================================
1085
1086 =head2 _question_mark_tag
1087
1088   Title    : _question_mark_tag
1089   Usage    : $self->_question_mark_tag
  Function : Parses values from tags that occur zero or one time. ie: tag_name?
1091   Args     : 3 scalars:
1092              - reference to a scalar holding the value of the line to be parsed.
1093              - reference to a data structure to store the data for <$tag_name>
1094              - scalar holding the name of the tag.
1095   Returns  : Nothing.
1096   Note     : Method(s) that call(s) this method : many.
1097              Method(s) that this method calls   : _tag_processing_helper
1098
1099
1100 =cut
1101
sub _question_mark_tag {

    # Optional tag (tag_name?): when the current line holds a complete
    # <tag>value</tag> element, store it and advance to the next line;
    # otherwise leave everything untouched.

    my ($self, $line, $data_structure, $tag_name) = @_;

    my @captures = $$line =~ /<$tag_name\s?(.*?)\s?>(.*?)<\/$tag_name>/;
    return unless @captures;

    my ($attributes, $content) = @captures;
    $self->_tag_processing_helper
        ($attributes, $data_structure, $tag_name, $content, 'question mark');
    $$line = $self->_readline;

    return;

}
1114 # ==================================================================================
1115
1116 =head2 _star_tag
1117
1118   Title    : _star_tag
1119   Usage    : $self->_star_tag
1120   Function : Parses values from tags that occur zero or more times. ie: tag_name*
1121   Args     : 3 scalars:
1122              - reference to a scalar holding the value of the line to be parsed.
1123              - reference to a data structure to store the data for <$tag_name>
1124              - scalar holding the name of the tag.
1125   Returns  : Nothing.
1126   Note     : Method(s) that call(s) this method : many.
1127              Method(s) that this method calls   : _tag_processing_helper
1128
1129 =cut
1130
sub _star_tag {

    # Repeatable optional tag (tag_name*): consume consecutive lines that
    # each hold a complete <tag>value</tag> element, storing every one of
    # them, and stop at the first line that does not match.

    my ($self, $line, $data_structure, $tag_name) = @_;

    my $element = qr/<$tag_name\s?(.*?)\s?>(.*?)<\/$tag_name>/;

    while (my ($attributes, $content) = $$line =~ $element) {
        # _tag_processing_helper stores both the attributes and the value
        # in $$data_structure.
        $self->_tag_processing_helper
            ($attributes, $data_structure, $tag_name, $content, 'star');
        $$line = $self->_readline;
    }

    return;

}
1151 # ==================================================================================
1152
1153 =head2 _plus_tag
1154
1155   Title    : _plus_tag
1156   Usage    : $self->_plus_tag
1157   Function : Handles 'plus' tags (tags that occur one or more times).  tag_name+
1158   Args     : 3 scalars:
1159              - reference to a scalar holding the value of the line to be parsed.
1160              - reference to a data structure to store the data for <$tag_name>
1161              - scalar holding the name of the tag.
1162   Returns  : Nothing.
1163   Note     : Method(s) that call(s) this method : many.
1164              Method(s) that this method calls   : _star_tag
1165
1166 =cut
1167
sub _plus_tag {

    # Repeatable mandatory tag (tag_name+): the current line must hold a
    # complete <tag>value</tag> element; any further occurrences on the
    # following lines are collected by _star_tag.

    my ($self, $line, $data_structure, $tag_name) = @_;

    my ($attributes, $content) =
        $$line =~ /<$tag_name\s?(.*?)\s?>(.*?)<\/$tag_name>/;

    if (defined $attributes) {

        # Store the first occurrence of $tag_name here ...
        $self->_tag_processing_helper
            ($attributes, $data_structure, $tag_name, $content, 'plus');

        # ... and, now that "at least one" is satisfied, treat whatever
        # follows as a zero-or-more (*) tag.
        $$line = $self->_readline;
        $self->_star_tag($line, $data_structure, $tag_name);

    } else {
        $self->throw("Error:  Missing <$tag_name></$tag_name>.  Got: $$line\n\n");
    }

    return;

}
1195 # ==================================================================================
1196
1197 =head2 _helper_store_attribute_list
1198
1199   Title    : _helper_store_attribute_list
1200   Usage    : $self->_helper_store_attribute_list
1201   Function : A helper method used to store the attributes from
1202              the tags into the data structure.
1203   Args     : 2 scalars:
1204              - scalar holding the attribute values to be parsed.
1205              - reference to a data structure to store the data between the 2 tags.
1206   Returns  : Nothing.
1207   Note     : Method(s) that call(s) this method : Many.
1208              Method(s) that this method call(s) : None.
1209
1210 =cut
1211
sub _helper_store_attribute_list {

    # Extracts every name="value" pair from an attribute string and appends
    # each value to the array stored under that name in $$data_structure.

    my ($self, $attribute_line, $data_structure) = @_;

    # Collect the pairs first; a name repeated within one tag keeps only
    # its last value.
    my %value_of;
    while ($attribute_line =~ /(\w+)\s*=\s*"([^"]*)"/g) {
        $value_of{$1} = $2;
    }

    # Values are kept in arrays because the same tag, and therefore the
    # same attribute name, may occur more than once.  Readers of the data
    # structure consequently use e.g. $bio_sequence->{'molecule_type'}->[0].
    while (my ($name, $value) = each %value_of) {
        push @{$$data_structure->{$name}}, $value;
    }

    return;

}
1234 # ==================================================================================
1235
1236 =head2 _store_seqs
1237
1238   Title    : _store_seqs
1239   Usage    : $self->_store_seqs
1240   Function : This method is called once in the life time of the script.
1241              It stores the data parsed from the agave xml file into
1242              the Bio::Seq object.
1243   Args     : None.
1244   Returns  : Nothing.
1245   Note     : Method(s) that call(s) this method : next_seq
1246              Method(s) that this method calls   : None.
1247
1248 =cut
1249
sub _store_seqs {

    # Walks the parsed AGAVE tree in $self->{'sciobj'}
    #   sciobj -> contig -> fragment_order -> fragment_orientation
    #          -> bio_sequence
    # builds one Bio::Seq per <bio_sequence> (with species, reference
    # annotations and sequence features) and queues it in
    # $self->{'sequence_objects'}.  Sets $self->{'seqs_stored'} when done.
    #
    # Args    : None.
    # Returns : Nothing.

    my ($self) = @_;

    for my $sciobj (@{$self->{'sciobj'}}) {
        for my $contig (@{$sciobj->{'contig'}}) {
            for my $fragment_order (@{$contig->{'fragment_order'}}) {
                for my $fragment_orientation (@{$fragment_order->{'fragment_orientation'}}) {

                    # Each fragment_orientation holds one bio_sequence, which
                    # carries all the interesting data.
                    my $bio_sequence = $fragment_orientation->{'bio_sequence'};

                    # Values that come from tag attributes are stored as
                    # arrays, hence the ->[0].
                    my $sequence         = $bio_sequence->{'sequence'};
                    my $accession_number = $bio_sequence->{'sequence_id'}->[0]; # also used as the id
                    my $description      = $bio_sequence->{'description'};
                    my $molecule_type    = $bio_sequence->{'molecule_type'}->[0];

                    my $primary_seq = Bio::PrimarySeq->new(
                        -id       => $accession_number,
                        -alphabet => $molecule_type,
                        -seq      => $sequence,
                        -desc     => $description,
                    );

                    my $seq = Bio::Seq->new(
                        -display_id       => $accession_number,
                        -accession_number => $accession_number,
                        -primary_seq      => $primary_seq,
                        -seq              => $sequence,
                        -description      => $description,
                    );

                    my $organism_name = $bio_sequence->{organism_name}->[0];
                    if (defined $organism_name) {
                        my @classification = split(' ', $organism_name);
                        my $species = Bio::Species->new();
                        $species->classification(@classification);
                        $seq->species($species);
                    }

                    # Every <keyword> is read as a colon-separated list of
                    # alternating field names and values and turned into a
                    # reference annotation.
                    my $keyword_list = $bio_sequence->{keyword};
                    for my $keyword (@$keyword_list) {

                        # The field map is rebuilt for every keyword so
                        # that fields parsed from an earlier keyword cannot
                        # leak into the reference built for this one.
                        my %field_of;
                        my @words = split(':', $keyword);
                        for (my $i = 0; $i + 1 < @words; $i += 2) {
                            $field_of{$words[$i]} = $words[$i + 1];
                        }

                        my $reference = Bio::Annotation::Reference->new(
                            -authors  => $field_of{authors},
                            -title    => $field_of{title},
                            -database => $field_of{database},
                            -pubmed   => $field_of{pubmed},
                        );
                        $seq->annotation->add_Annotation('reference', $reference);
                    }

                    if (defined $bio_sequence->{'sequence_map'}) {

                        for my $sequence_map (@{$bio_sequence->{'sequence_map'}}) {

                            next unless defined $sequence_map->{annotations}
                                && ref($sequence_map->{annotations}) eq 'HASH';

                            # The sequence features (genes, exons, ...) of this map.
                            for my $seq_feature (@{$sequence_map->{'annotations'}->{'seq_feature'}}) {

                                # NOTE(review): the parsed 'is_on_complement'
                                # value is not applied as a strand here --
                                # confirm whether that is intentional.
                                my $feat = Bio::SeqFeature::Generic->new(
                                    -start       => $seq_feature->{'least_start'}->[0],
                                    -end         => $seq_feature->{'greatest_end'}->[0],
                                    -primary_tag => $seq_feature->{'feature_type'}->[0],
                                );

                                if (defined $seq_feature->{'qualifier'} &&
                                    ref($seq_feature->{'qualifier'}) eq 'ARRAY') {

                                    for my $qualifier (@{$seq_feature->{'qualifier'}}) {

                                        # Parallel arrays: the i-th type
                                        # names the i-th value.
                                        my $values = $qualifier->{'qualifier'};
                                        my $types  = $qualifier->{'qualifier_type'};

                                        for my $i (0 .. scalar(@{$values}) - 1) {
                                            $feat->add_tag_value($types->[$i] => $values->[$i]);
                                        }
                                    }
                                }

                                $seq->add_SeqFeature($feat);
                            }
                        }
                    }

                    # This is where the Bio::Seq objects are stored:
                    push @{$self->{'sequence_objects'}}, $seq;

                }               # close for my $fragment_orientation
            }                   # close for my $fragment_order
        }                       # close for my $contig
    }                           # close for my $sciobj

    # Flag is set so that we know that the sequence objects are now stored in $self.
    $self->{'seqs_stored'} = 1;

    return;

}
1414 # ==================================================================================
1415
1416 =head2 next_seq
1417
1418         Title    : next_seq
1419         Usage    : $seq = $stream->next_seq()
1420         Function : Returns the next sequence in the stream.
1421         Args     : None.
1422         Returns  : Bio::Seq object
1423
1424 Method is called from the script.  Method(s) that this method calls:
1425 _store_seqs (only once throughout the life time of script execution).
1426
1427
1428 =cut
1429
sub next_seq {

    # Hands out the queued Bio::Seq objects one at a time, building the
    # queue on the first call.  Returns 0 once the queue is empty.

    my ($self) = @_;

    # The parsed data is converted into Bio::Seq objects only once.
    if ($self->{'seqs_stored'} == 0) {
        $self->_store_seqs;
    }

    # This should never occur...
    unless (defined $self->{'sequence_objects'}) {
        $self->throw("Error: No Bio::Seq objects stored yet!\n\n");
    }

    my $pending = $self->{'sequence_objects'};

    # All done once the queue is empty: nothing more to hand out.
    return scalar(@$pending) > 0 ? shift @$pending : 0;

}
1451 # ==================================================================================
1452
1453 =head2 next_primary_seq
1454
1455   Title   : next_primary_seq
1456   Usage   : $seq = $stream->next_primary_seq()
1457   Function: returns the next primary sequence (ie no seq_features) in the stream
1458   Returns : Bio::PrimarySeq object
1459   Args    : NONE
1460
1461 =cut
1462
sub next_primary_seq {
    # Primary sequences are not provided by this parser; always returns 0.
    my ($self) = @_;
    return 0;
}
1467 # ==================================================================================
1468
1469 =head2 write_seq
1470
1471   Title   : write_seq
1472   Usage   : Not Yet Implemented! $stream->write_seq(@seq)
1473   Function: writes the $seq object into the stream
1474   Returns : 1 for success and 0 for error
1475   Args    : Bio::Seq object
1476
1477 =cut
1478
sub write_seq {

    # Converts the given Bio::Seq object(s) to AGAVE XML, one record per
    # sequence; the actual work is done in _write_each_record.
    #
    # Returns 1 when every record has been handed to the writer, as the
    # method's documentation promises (it used to return nothing, so
    # callers checking the result always saw a failure).

    my ($self, @seqs) = @_;

    for my $seq (@seqs) {
        $self->_write_each_record($seq);
    }

    return 1;

}
1492 # ==================================================================================
1493
1494 =head2 _write_each_record
1495
1496   Title   : _write_each_record
1497   Usage   : $agave->_write_each_record( $seqI )
1498   Function: change data into agave format
1499   Returns : NONE
1500   Args    : Bio::SeqI object
1501
1502 =cut
1503
sub _write_each_record {

    # Writes one sequence as a complete AGAVE document:
    #   sciobj > contig > fragment_order > fragment_orientation > bio_sequence
    # with db_id, note, description, keywords, sequence, xrefs and one
    # sequence_map per feature source.
    #
    # Args    : Bio::SeqI object
    # Returns : Nothing.

    my ($self, $seq) = @_;

    # NOTE(review): the target is built from $self->{'file'} with an extra
    # ">" prepended and the open is not checked -- confirm what
    # $self->{'file'} contains and what should happen when it cannot be
    # opened.
    my $output = IO::File->new(">" . $self->{'file'});
    my $writer = XML::Writer->new(OUTPUT      => $output,
                                  NAMESPACES  => 0,
                                  DATA_MODE   => 1,
                                  DATA_INDENT => 2);

    $writer->xmlDecl("UTF-8");
    $writer->doctype("sciobj", '', "sciobj.dtd");
    $writer->startTag('sciobj',
                      'version', '2',
                      'release', '2');

    $writer->startTag('contig', 'length', $seq->length);

    my $annotation = $seq->annotation;

    # get_Annotations() is used as a list everywhere else in this sub, so
    # the links are fetched in list context and the first one is picked
    # from the array.  Indexing the scalar-context result with ->[0] does
    # not give the first DBLink object.
    my @dblinks = $annotation->get_Annotations('dblink');
    my ($db_id, $db_code);
    if (@dblinks) {
        $db_id   = $dblinks[0]->primary_id;
        $db_code = $dblinks[0]->database;
    } else {
        $db_id   = $seq->display_id;
        $db_code = 'default';
    }

    # The same <db_id> is written for the contig and for the bio_sequence.
    my $write_db_id = sub {
        $writer->startTag('db_id',
                          'id', $db_id,
                          'db_code', $db_code);
        $writer->endTag('db_id');
        return;
    };

    $write_db_id->();

    $writer->startTag('fragment_order');
    $writer->startTag('fragment_orientation');

    ## start bio_sequence
    # An alphabet set explicitly on the stream still wins; otherwise the
    # molecule type is the alphabet of the sequence being written.
    my $molecule_type = $self->alphabet;
    $molecule_type = $seq->alphabet unless defined $molecule_type;

    $writer->startTag('bio_sequence',
                      'sequence_id', $seq->display_id,
                      'seq_length', $seq->length,
                      'molecule_type', $molecule_type,
                     );

    ## start db_id under bio_sequence
    $write_db_id->();

    ## start note: all comments, one per line
    my $note = "";
    foreach my $comment ( $annotation->get_Annotations('comment') ) {
        # $comment is a Bio::Annotation::Comment object
        $note .= $comment->text() . "\n";
    }

    $writer->startTag('note');
    $writer->characters($note);
    $writer->endTag('note');

    ## start description
    # This is the primary sequence's description (the DEFINITION line of a
    # GenBank file), read through the public accessor.
    my $description = $seq->desc;
    $description = '' unless defined $description;

    $writer->startTag('description');
    $writer->characters($description);
    $writer->endTag('description');

    ## start keywords
    foreach my $genename ( $annotation->get_Annotations('gene_name') ) {
        $writer->startTag('keyword');
        $writer->characters($genename);
        $writer->endTag('keyword');
    }

    # References are serialised as one colon-separated keyword each, with
    # 'null' standing in for missing fields.
    foreach my $ref ( $annotation->get_Annotations('reference') ) {
        # $ref is a Bio::Annotation::Reference object
        my $medline  = $ref->medline  || 'null';
        my $pubmed   = $ref->pubmed   || 'null';
        my $database = $ref->database || 'null';
        my $authors  = $ref->authors  || 'null';
        my $title    = $ref->title    || 'null';

        $writer->startTag('keyword');
        $writer->characters(join(':',
                                 'medline',  $medline,
                                 'pubmed',   $pubmed,
                                 'database', $database,
                                 'authors',  $authors,
                                 'title',    $title));
        $writer->endTag('keyword');
    }

    ## start sequence
    $writer->startTag('sequence');
    $writer->characters($seq->seq);
    $writer->endTag('sequence');

    ## start xrefs
    $writer->startTag('xrefs');
    foreach my $link (@dblinks) {
        # $link is a Bio::Annotation::DBLink object
        $writer->startTag('db_id',
                          'db_code', $link->database,
                          'id', $link->primary_id);
        $writer->characters($link->comment);
        $writer->endTag('db_id');
    }
    $writer->endTag('xrefs');

    ## start sequence map
    ## top_SeqFeatures() rather than all_SeqFeatures keeps the feature
    ## tree intact; the top-level features are clustered by source.
    my %features_by_source;
    foreach my $feature ($seq->top_SeqFeatures) {
        push @{ $features_by_source{ $feature->source_tag } }, $feature;
    }

    ## one sequence_map per source, in a stable (sorted) order
    foreach my $map_type (sort keys %features_by_source) {
        $writer->startTag('sequence_map',
                          'label', $map_type);
        $writer->startTag('annotations');

        foreach my $feature ( @{ $features_by_source{$map_type} } ) {
            $self->_write_seqfeature($feature, $writer);
        }

        $writer->endTag('annotations');
        $writer->endTag('sequence_map');
    }

    $writer->endTag('bio_sequence');
    $writer->endTag('fragment_orientation');
    $writer->endTag('fragment_order');
    $writer->endTag('contig');
    $writer->endTag('sciobj');

    return;

}
1678 # ==================================================================================
1679
1680 =head2 _write_seqfeature
1681
  Usage   : $agave->_write_seqfeature( $seqfeature, $writer )
  Function: change seqfeature data into agave format
  Returns : NONE
  Args    : Bio::SeqFeature object and XML::Writer object
1686
1687 =cut
1688
sub _write_seqfeature{

    # Writes one sequence feature as a <seq_feature> element: its location,
    # one <qualifier> per tag value, and then, recursively, its
    # sub-features.
    #
    # Args    : Bio::SeqFeature object and XML::Writer object
    # Returns : Nothing.

    my ($self, $seqf, $writer) = @_;

    ## now enter seq feature
    $writer->startTag('seq_feature',
                      'feature_type', $seqf->primary_tag());

    # is_on_complement: is the feature found on the complementary strand
    # (true) or not (false)?  Only strand -1 is the complementary strand;
    # strand 1 and an unknown/undefined strand are written as 'false'.
    my $strand = $seqf->strand();
    $strand = 0 if !defined $strand;
    my $is_on_complement = ($strand == -1) ? 'true' : 'false';

    # The attribute is 'least_start' -- the name the parser in this module
    # looks up when reading features back.
    $writer->startTag('seq_location',
                      'least_start', $seqf->start(),
                      'greatest_end', $seqf->end(),
                      'is_on_complement', $is_on_complement);
    $writer->endTag('seq_location');

    ## enter qualifiers: one element per value, so that tags holding
    ## several values are written completely
    foreach my $tag ( $seqf->all_tags() ) {
        my @values = $seqf->each_tag_value($tag);
        @values = ('') unless @values;   # a valueless tag still gets its element

        foreach my $value (@values) {
            $writer->startTag('qualifier',
                              'qualifier_type', $tag);
            $writer->characters($value);
            $writer->endTag('qualifier');
        }
    }

    ## now recursively travel the seqFeature
    foreach my $subfeat ( $seqf->sub_SeqFeature ) {
        $self->_write_seqfeature($subfeat, $writer);
    }

    $writer->endTag('seq_feature');

    return;

}
1735 # ==================================================================================
1736
1737 =head2 _filehandle
1738
1739   Title   : _filehandle
1740   Usage   : $obj->_filehandle($newval)
1741   Function:
1742   Example :
1743   Returns : value of _filehandle
1744   Args    : newvalue (optional)
1745
1746 =cut
1747
sub _filehandle{

    # Combined getter/setter: stores the new value when one is given and
    # always returns the value currently held.

    my ($obj, $value) = @_;

    $obj->{'_filehandle'} = $value if defined $value;

    return $obj->{'_filehandle'};

}
1757 # ==================================================================================
1758
1759 =head2 throw
1760
1761   Title    : throw
1762   Usage    : $self->throw;
  Function : Throws an error message.  Calls SeqIO's throw method.
1764   Args     : Array of string(s), holding error message(s).
1765   Returns  : Nothing.
1766   Note     : Method(s) that call(s) this method: many.
1767              Method(s) that this method calls: Bio::SeqIO's throw method.
1768
1769 =cut
1770
sub throw {

    # Prefixes the message with the current input line number ($.) in
    # square brackets and hands it to the parent class's throw.

    my $self = shift;
    my @parts = @_;

    my $message = join('', "[$.]", @parts);
    $self->SUPER::throw($message);

    return;

}
1779
1780 1;