Bio/SeqIO/chadoxml.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::SeqIO::chadoxml
   4 #
   5 # Peili Zhang   <peili@morgan.harvard.edu>
   6 #
   7 # You may distribute this module under the same terms as perl itself
   8
   9 # POD documentation - main docs before the code
  10
  11 =head1 NAME
  12
  13 Bio::SeqIO::chadoxml - chadoxml sequence output stream
  14
  15 =head1 SYNOPSIS
  16
  17 It is probably best not to use this object directly, but
  18 rather go through the SeqIO handler system:
  19
  20     $writer = Bio::SeqIO->new(-file => ">chado.xml",
  21                               -format => 'chadoxml');
  22
  23     # assume you already have Sequence or SeqFeature objects
  24     $writer->write_seq($seq_obj);
  25
  26     #after writing all seqs
  27     $writer->close_chadoxml();
  28
  29
  30
  31 =head1 DESCRIPTION
  32
  33 This object can transform Bio::Seq objects to chadoxml flat
  34 file databases (for chadoxml DTD, see
  35 http://gmod.cvs.sourceforge.net/gmod/schema/chado/dat/chado.dtd).
  36
  37 This is currently a write-only module.
  38
  39     $seqio = Bio::SeqIO->new(-file => '>outfile.xml',
   40                              -format => 'chadoxml',
  41                              -suppress_residues => 1,
  42                              -allow_residues => 'chromosome',
  43                              );
  44
  45     # we have a Bio::Seq object $seq which is a gene located on
  46     # chromosome arm 'X', to be written out to chadoxml
  47     # before converting to chadoxml, $seq object B<must> be transformed
  48     # so that all the coordinates in $seq are against the source
  49     # feature to be passed into Bio::SeqIO::chadoxml->write_seq()
  50     # -- chromosome arm X in the example below.
  51
  52     $seqio->write_seq(-seq=>$seq,
  53                       -genus   => 'Homo',
  54                       -species => 'sapiens',
  55                       -seq_so_type=>'gene',
  56                       -src_feature=>'X',
  57                       -src_feat_type=>'chromosome_arm',
  58                                 -nounflatten=>1,
  59                       -is_analysis=>'true',
  60                       -data_source=>'GenBank');
  61
  62 The chadoxml output of Bio::SeqIO::chadoxml-E<gt>write_seq() method can be
  63 passed to the loader utility in XORT package
  64 (http://gmod.cvs.sourceforge.net/gmod/schema/XMLTools/XORT/)
  65 to be loaded into chado.
  66
  67 This object is currently implemented to work with sequence and
  68 annotation data from whole genome projects deposited in GenBank. It
  69 may not be able to handle all different types of data from all
  70 different sources.
  71
  72 In converting a Bio::Seq object into chadoxml, a top-level feature is
  73 created to represent the object and all sequence features inside the
  74 Bio::Seq object are treated as subfeatures of the top-level
  75 feature. The Bio::SeqIO::chadoxml object calls
  76 Bio::SeqFeature::Tools::Unflattener to unflatten the flat feature list
  77 contained in the subject Bio::Seq object, to build gene model
  78 containment hierarchy conforming to chado central dogma model: gene
  79 --E<gt> mRNA --E<gt> exons and protein.
  80
   81 Destination of data in the subject Bio::Seq object $seq is as follows:
  82
  83         *$seq->display_id:  name of the top-level feature;
  84
  85         *$seq->accession_number: if defined, uniquename and
  86                                  feature_dbxref of the top-level
   87                                  feature; if not defined,
  88                                  $seq->display_id is used as the
  89                                  uniquename of the top-level feature;
  90
  91         *$seq->molecule: transformed to SO type, used as the feature
   92                         type of the top-level feature; if -seq_so_type
  93                         argument is supplied, use the supplied SO type
  94                         as the feature type of the top-level feature;
  95
  96         *$seq->species: organism of the top-level feature;
  97
  98         *$seq->seq: residues of the top-level feature;
  99
 100         *$seq->is_circular, $seq->division: feature_cvterm;
 101
 102         *$seq->keywords, $seq->desc, comments: featureprop;
 103
 104         *references: pub and feature_pub;
 105                 medline/pubmed ids: pub_dbxref;
 106                 comments: pubprop;
 107
 108         *feature "source" span: featureloc for top-level feature;
 109
 110         *feature "source" db_xref: feature_dbxref for top-level feature;
 111
 112         *feature "source" other tags: featureprop for top-level feature;
 113
 114         *subfeature 'symbol' or 'label' tag: feature uniquename, if
 115                      none of these is present, the chadoxml object
 116                      generates feature uniquenames as:
 117                      <gene>-<feature_type>-<span>
 118                      (e.g. foo-mRNA--1000..3000);
 119
 120         *gene model: feature_relationship built based on the
 121                      containment hierarchy;
 122
 123         *feature span: featureloc;
 124
 125         *feature accession numbers: feature_dbxref;
 126
 127         *feature tags (except db_xref, symbol and gene): featureprop;
 128
 129 Things to watch out for:
 130
 131         *chado schema change: this version works with the chado
 132                                version tagged chado_1_01 in GMOD CVS.
 133
 134         *feature uniquenames: especially important if using XORT
 135                               loader to do incremental load into
 136                               chado. may need pre-processing of the
 137                               source data to put the correct
 138                               uniquenames in place.
 139
 140         *pub uniquenames: chadoxml->write_seq() has the FlyBase policy
 141                           on pub uniquenames hard-coded, it assigns
 142                           pub uniquenames in the following way: for
 143                           journals and books, use ISBN number; for
 144                           published papers, use MEDLINE ID; for
 145                           everything else, use FlyBase unique
 146                           identifier FBrf#. need to modify the code to
 147                           implement your policy. look for the comments
 148                           in the code.
 149
 150         *for pubs possibly existing in chado but with no knowledge of
  151          its uniquename: put "op" as "match", then need to run the
 152                         output chadoxml through a special filter that
 153                         talks to chado database and tries to find the
 154                         pub by matching with the provided information
 155                         instead of looking up by the unique key. after
 156                         matching, the filter also resets the "match"
 157                         operation to either "force" (default), or
 158                         "lookup", or "insert", or "update". the
 159                         "match" operation is for a special FlyBase use
 160                         case. please modify to work according to your
 161                         rules.
 162
 163         *chado initialization for loading:
 164
 165                 cv & cvterm: in the output chadoxml, all cv's and
 166                              cvterm's are lookup only. Therefore,
 167                              before using XORT loader to load the
 168                              output into chado, chado must be
 169                              pre-loaded with all necessary CVs and
 170                              CVterms, including "SO" , "property
 171                              type", "relationship type", "pub type",
 172                              "pubprop type", "pub relationship type",
 173                              "sequence topology", "GenBank feature
 174                              qualifier", "GenBank division". A pub by
 175                              the uniquename 'nullpub' of type 'null
 176                              pub' needs to be inserted.
 177
 178 =head1 FEEDBACK
 179
 180 =head2 Mailing Lists
 181
 182 User feedback is an integral part of the evolution of this and other
 183 Bioperl modules. Send your comments and suggestions preferably to one
 184 of the Bioperl mailing lists.  Your participation is much appreciated.
 185
 186   bioperl-l@bioperl.org                  - General discussion
 187   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 188
 189 =head2 Support
 190
 191 Please direct usage questions or support issues to the mailing list:
 192
 193 L<bioperl-l@bioperl.org>
 194
 195 rather than to the module maintainer directly. Many experienced and
  196 responsive experts will be able to look at the problem and quickly
 197 address it. Please include a thorough description of the problem
 198 with code and data examples if at all possible.
 199
 200 =head2 Reporting Bugs
 201
 202 Report bugs to the Bioperl bug tracking system to help us keep track
  203 of the bugs and their resolution.
 204 Bug reports can be submitted via the web:
 205
 206   http://bugzilla.bioperl.org
 207
 208 =head1 AUTHOR - Peili Zhang
 209
 210 Email peili@morgan.harvard.edu
 211
 212 =head1 APPENDIX
 213
 214 The rest of the documentation details each of the object
 215 methods. Internal methods are usually preceded with a _
 216
 217 =cut
 218
 219 # Let the code begin...
 220
 221 package Bio::SeqIO::chadoxml;
 222 use strict;
 223 use English;
 224
 225 use Carp;
 226 use Data::Dumper;
 227 use XML::Writer;
 228 use IO::File;
 229 use IO::Handle;
 230 use Bio::Seq;
 231 use Bio::Seq::RichSeq;
 232 use Bio::SeqIO::FTHelper;
 233 use Bio::Species;
 234 use Bio::Seq::SeqFactory;
 235 use Bio::Factory::SequenceStreamI;
 236 use Bio::SeqFeature::Generic;
 237 use Bio::Annotation::Collection;
 238 use Bio::Annotation::Comment;
 239 use Bio::Annotation::Reference;
 240 use Bio::Annotation::DBLink;
 241 use Bio::SeqFeature::Tools::Unflattener;
 242
 243 #global variables
  244 undef(my %finaldatahash); #data from Bio::Seq object stored in a hash
      # NOTE(review): %finaldatahash is not referenced in the visible part of
      # this file; presumably it is filled further down -- confirm.
  245 undef(my %datahash); #data from Bio::Seq object stored in a hash
      # %datahash is reassigned on every write_seq() call with the top-level
      # feature's fields (name, uniquename, seqlen, residues, type_id,
      # organism_id, is_analysis) and later gains keys such as 'featureloc'
      # and 'feature_cvterm'.
  246
      # Space-separated list of chado table names. Its consumer is not in the
      # visible part of the file -- presumably used to recognise table-level
      # elements when the data hash is written out; TODO confirm.
  247 my $chadotables = 'feature featureprop feature_relationship featureloc feature_cvterm cvterm cv feature_pub pub pub_dbxref pub_author author pub_relationship pubprop feature_dbxref dbxref db synonym feature_synonym';
 248
  249 my %fkey = (
      # Keys are "table.column" strings, values are table names. Presumably
      # each entry names the table that the foreign-key column points to; the
      # code that reads this map is outside the visible part of the file, so
      # confirm against its caller before changing any entry.
  250         "cvterm.cv_id"                  => "cv",
  251         "cvterm.dbxref_id"              => "dbxref",
  252         "dbxref.db_id"                  => "db",
  253         "feature.type_id"               => "cvterm",
  254         "feature.organism_id"           => "organism",
  255         "feature.dbxref_id"             => "dbxref",
  256         "featureprop.type_id"           => "cvterm",
  257         "feature_pub.pub_id"            => "pub",
  258         "feature_cvterm.cvterm_id"      => "cvterm",
  259         "feature_cvterm.pub_id"         => "pub",
  260         "feature_cvterm.feature_id"     => "feature",
  261         "feature_dbxref.dbxref_id"      => "dbxref",
  262         "feature_relationship.object_id"        => "feature",
  263         "feature_relationship.subject_id"       => "feature",
  264         "feature_relationship.type_id"  => "cvterm",
  265         "featureloc.srcfeature_id"      => "feature",
  266         "pub.type_id"                   => "cvterm",
  267         "pub_dbxref.dbxref_id"          => "dbxref",
  268         "pub_author.author_id"          => "author",
  269         "pub_relationship.obj_pub_id"   => "pub",
  270         "pub_relationship.subj_pub_id"  => "pub",
  271         "pub_relationship.type_id"      => "cvterm",
  272         "pubprop.type_id"               => "cvterm",
  273         "feature_synonym.feature_id"    => "feature",
  274         "feature_synonym.synonym_id"    => "synonym",
  275         "feature_synonym.pub_id"        => "pub",
  276         "synonym.type_id"               => "cvterm",
  277 );
 278
  279 my %cv_name = (
      # Names of the controlled vocabularies (cv) written into the output,
      # looked up by a short key. write_seq() reads
      # $cv_name{'feature_property'} as the cv name for the description,
      # keywords and comment featureprop types.
  280         'relationship'                  => 'relationship',
  281         'sequence'                      => 'sequence',
  282         'feature_property'              => 'feature_property',
  283 );
 284
  285 my %feattype_args2so = (
      # Maps a feature type string to the SO term name used in its place
      # (e.g. "CDS" => "polypeptide"). Entries that are commented out are
      # disabled mappings kept for reference.
      # NOTE(review): the lookup code is outside the visible part of the
      # file; presumably the keys are feature primary tags -- confirm.
  286         "aberr"                         => "aberration_junction",
  287 #       "conflict"                      => "sequence_difference",
  288 #       "polyA_signal"                  => "polyA_signal_sequence",
  289         "variation"                     => "sequence_variant",
  290         "mutation1"                     => "point_mutation",            #for single-base mutation
  291         "mutation2"                     => "sequence_variant",          #for multi-base mutation
  292         "rescue"                        => "rescue_fragment",
  293 #       "rfrag"                         => "restriction_fragment",
  294         "protein_bind"                  => "protein_binding_site",
  295         "misc_feature"                  => "region",
  296 #       "prim_transcript"               => "primary_transcript",
  297         "CDS"                           => "polypeptide",
  298         "reg_element"                   => "regulatory_region",
  299         "seq_variant"                   => "sequence_variant",
  300         "mat_peptide"                   => "mature_peptide",
  301         "sig_peptide"                   => "signal_peptide",
  302 );
 303
  304 undef(my %organism);
      # %organism holds the "genus" and "species" of the record being
      # written. write_seq() sets it from the -genus/-species arguments when
      # -species is given, otherwise from $seq->species().
 305
 306 use base qw(Bio::SeqIO);
 307
 308 sub _initialize {
 309
 310     my($self,%args) = @_;
 311
 312     $self->SUPER::_initialize(%args);
 313     unless( defined $self->sequence_factory ) {
 314         $self->sequence_factory(Bio::Seq::SeqFactory->new
 315                                 (-verbose => $self->verbose(),
 316                                  -type => 'Bio::Seq::RichSeq'));
 317     }
 318     #optional arguments that can be passed in
 319     $self->suppress_residues($args{'-suppress_residues'})
 320         if defined $args{'-suppress_residues'};
 321
 322     $self->allow_residues($args{'-allow_residues'})
 323         if defined $args{'-allow_residues'};
 324     return;
 325 }
 326
 327 =head2 write_seq
 328
 329  Title   : write_seq
 330  Usage   : $stream->write_seq(-seq=>$seq, -seq_so_type=>$seqSOtype,
 331                               -src_feature=>$srcfeature,
 332                               -src_feat_type=>$srcfeattype,
 333                               -nounflatten=>0 or 1,
 334                               -is_analysis=>'true' or 'false',
 335                               -data_source=>$datasource)
 336  Function: writes the $seq object (must be seq) into chadoxml.
 337            Current implementation:
 338            1. for non-mRNA records,
 339            a top-level feature of type $seq->alphabet is
 340            generated for the whole GenBank record, features listed
 341            are unflattened for DNA records to build gene model
 342            feature graph, and for the other types of records all
 343            features in $seq are treated as subfeatures of the top-level
 344            feature.
 345            2. for mRNA records,
 346            if a 'gene' feature is present, it B<must> have a /symbol
 347            or /label tag to contain the uniquename of the gene. a top-
 348            level feature of type 'gene' is generated. the mRNA is written
 349            as a subfeature of the top-level gene feature, and the other
 350            sequence features listed in $seq are treated as subfeatures
 351            of the mRNA feature.
 352  Returns : 1 for success and 0 for error
 353
 354
 355  Args     : A Bio::Seq object $seq, optional $seqSOtype, $srcfeature,
 356                  $srcfeattype, $nounflatten, $is_analysis and $data_source.
 357            when $srcfeature (a string, the uniquename of the source
 358            feature) is given, the location and strand information of
 359            the top-level feature against the source feature will be
 360            derived from the sequence feature called 'source' of the
 361            $seq object, a featureloc record is generated for the top
 362            -level feature on $srcfeature. when $srcfeature is given,
 363            $srcfeattype must also be present. All feature coordinates
 364            in $seq should be against $srcfeature.  $seqSOtype is the
 365            optional SO term to use as the type of the top-level feature.
 366            For example, a GenBank data file for a Drosophila melanogaster
 367            genome scaffold has the molecule type of "DNA", when
 368            converting to chadoxml, a $seqSOtype argument of
 369            "golden_path_region" can be supplied to save the scaffold
 370            as a feature of type "golden_path_region" in chadoxml, instead
 371            of "DNA".  a feature with primary tag of 'source' must be
  372            present in the sequence feature list of $seq, to describe the
 373            whole sequence record.
 374
 375
 376 =cut
 377
 378 sub write_seq {
 379         my $usage = <<EOUSAGE;
 380 Bio::SeqIO::chadoxml->write_seq()
 381 Usage   : \$stream->write_seq(-seq=>\$seq,
 382                               -seq_so_type=>\$SOtype,
 383                               -src_feature=>\$srcfeature,
 384                               -src_feat_type=>\$srcfeattype,
 385                               -nounflatten=>0 or 1,
 386                               -is_analysis=>'true' or 'false',
 387                               -data_source=>\$datasource)
 388 Args    : \$seq         : a Bio::Seq object
 389           \$SOtype      : the SO term to use as the feature type of
 390                           the \$seq record, optional
 391           \$srcfeature  : unique name of the source feature, a string
 392                           containing at least one alphabetical letter
 393                           (a-z, A-Z), optional
 394           \$srcfeattype : feature type of \$srcfeature. one of SO terms.
 395                           optional
 396           when \$srcfeature is given, \$srcfeattype becomes mandatory,
 397           \$datasource  : source of the sequence annotation data,
 398                           e.g. 'GenBank' or 'GFF'.
 399 EOUSAGE
 400
 401         my ($self,@args) = @_;
 402
 403         my ($seq, $seq_so_type, $srcfeature, $srcfeattype, $nounflatten, $isanalysis, $datasource, $genus, $species) =
 404            $self->_rearrange([qw(SEQ
 405                                  SEQ_SO_TYPE
 406                                  SRC_FEATURE
 407                                  SRC_FEAT_TYPE
 408                                  NOUNFLATTEN
 409                                  IS_ANALYSIS
 410                                  DATA_SOURCE
 411                                  GENUS
 412                                  SPECIES
 413                                  )],
 414                               @args);
 415         #print "$seq_so_type, $srcfeature, $srcfeattype\n";
 416
 417         if( !defined $seq ) {
 418             $self->throw("Attempting to write with no seq!");
 419         }
 420
 421         if( ! ref $seq || ! $seq->isa('Bio::Seq::RichSeqI') ) {
 422            ## FIXME $self->warn(" $seq is not a RichSeqI compliant module. Attempting to dump, but may fail!");
 423         }
 424
 425         # try to get the srcfeature from the seqFeature object
 426         # for this to work, the user has to pass in the srcfeature type
 427         if (!$srcfeature) {
 428             if ($seq->can('seq_id')) {
 429                 $srcfeature=$seq->seq_id if ($seq->seq_id ne $seq->display_name);
 430             }
 431         }
 432
 433         #$srcfeature, when provided, should contain at least one alphabetical letter
 434         if (defined $srcfeature)
 435         {
 436             if ($srcfeature =~ /[a-zA-Z]/)
 437             {
 438                 chomp($srcfeature);
 439             } else {
 440                 $self->throw( $usage );
 441             }
 442
 443             #check for mandatory $srcfeattype
 444             if (! defined $srcfeattype)
 445             {
 446                 $self->throw( $usage );
 447                 #$srcfeattype must be a string of non-whitespace characters
 448             } else {
 449                 if ($srcfeattype =~ /\S+/) {
 450                     chomp($srcfeattype);
 451                 } else {
 452                     $self->throw( $usage );
 453                 }
 454             }
 455         }
 456
 457         # variables local to write_seq()
 458         my $div = undef;
 459         my $hkey = undef;
 460         undef(my @top_featureprops);
 461         undef(my @featuresyns);
 462         undef(my @top_featurecvterms);
 463         my $name = $seq->display_id if $seq->can('display_id');
 464         $name = $seq->display_name  if $seq->can('display_name');
 465         undef(my @feature_cvterms);
 466         undef(my %sthash);
 467         undef(my %dvhash);
 468         undef(my %h1);
 469         undef(my %h2);
 470         my $temp = undef;
 471         my $ann = undef;
 472         undef(my @references);
 473         undef(my @feature_pubs);
 474         my $ref = undef;
 475         my $location = undef;
 476         my $fbrf = undef;
 477         my $journal = undef;
 478         my $issue = undef;
 479         my $volume = undef;
 480         my $volumeissue = undef;
 481         my $pages = undef;
 482         my $year = undef;
 483         my $pubtype = undef;
 484 #       my $miniref= undef;
 485         my $uniquename = undef;
 486         my $refhash = undef;
 487         my $feat = undef;
 488         my $tag = undef;
 489         my $tag_cv = undef;
 490         my $ftype = undef;
 491         my $subfeatcnt = undef;
 492         undef(my @top_featrels);
 493         undef (my %srcfhash);
 494
 495         local($^W) = 0; # supressing warnings about uninitialized fields.
 496
 497         if (!$name && $seq->can('attributes') ) {
 498             ($name) = $seq->attributes('Alias');
 499         }
 500
 501         if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') {
 502                 $uniquename = $seq->accession_number;
 503         } elsif ($seq->can('accession') && defined $seq->accession && $seq->accession ne 'unknown') {
 504                 $uniquename = $seq->accession;
 505         } elsif ($seq->can('attributes')) {
 506                 ($uniquename) = $seq->attributes('load_id');
 507         } else {
 508                 $uniquename = $name;
 509         }
 510         my $len = $seq->length();
 511         if ($len == 0) {
 512                 $len = undef;
 513         }
 514
 515         undef(my $gb_type);
 516         if (!$seq->can('molecule') || ! defined ($gb_type = $seq->molecule()) ) {
 517                 $gb_type = $seq->can('alphabet') ? $seq->alphabet : 'DNA';
 518         }
 519         $gb_type = 'DNA' if $ftype eq 'dna';
 520         $gb_type = 'RNA' if $ftype eq 'rna';
 521
 522         if(length $seq_so_type > 0) {
 523                 if (defined $seq_so_type) {
 524                         $ftype = $seq_so_type;
 525                 }
 526                 elsif ($seq->type) {
 527                         $ftype = ($seq->type =~ /(.*):/)
 528                                  ? $1
 529                                  : $seq->type;
 530                 }
 531                 else {
 532                         $ftype = $gb_type;
 533                 }
 534         }
 535         else {
 536                 $ftype = $gb_type;
 537         }
 538
 539         my %ftype_hash = $self->return_ftype_hash($ftype);
 540
 541         if ($species) {
 542             %organism = ("genus"=>$genus, "species" => $species);
 543         }
 544         else {
 545             my $spec = $seq->species();
 546             if (!defined $spec) {
 547                 $self->throw("$seq does not know what organism it is from, which is required by chado. cannot proceed!\n");
 548             } else {
 549                 %organism = ("genus"=>$spec->genus(), "species" => $spec->species());
 550             }
 551         }
 552
 553         my $residues;
 554         if (!$self->suppress_residues ||
 555             ($self->suppress_residues && $self->allow_residues eq $ftype)) {
 556             $residues = $seq->seq->isa('Bio::PrimarySeq')
 557                         ? $seq->seq->seq
 558                         : $seq->seq;
 559         }
 560         else {
 561             $residues = '';
 562         }
 563
 564         #set is_analysis flag for gene model features
 565         undef(my $isanal);
 566         if ($ftype eq 'gene' || $ftype eq 'mRNA' || $ftype eq 'exon' || $ftype eq 'protein' || $ftype eq 'polypeptide') {
 567                 $isanal = $isanalysis;
 568                 $isanal = 'false' if !defined $isanal;
 569         }
 570
 571         %datahash = (
 572                 "name"          => $name,
 573                 "uniquename"    => $uniquename,
 574                 "seqlen"        => $len,
 575                 "residues"      => $residues,
 576                 "type_id"       => \%ftype_hash,
 577                 "organism_id"   => \%organism,
 578                 "is_analysis"   => $isanal || 'false',
 579                 );
 580
 581         if (defined $srcfeature) {
 582                 %srcfhash = $self->_srcf_hash($srcfeature,
 583                                               $srcfeattype,
 584                                               \%organism);
 585
 586                 my ($phase,$strand);
 587                 if ($seq->can('phase')) {
 588                     $phase = $seq->phase;
 589                 }
 590
 591                 if ($seq->can('strand')) {
 592                     $strand = $seq->strand;
 593                 }
 594                 my %fl = (
 595                                 "srcfeature_id" => \%srcfhash,
 596                                 "fmin"          => $seq->start - 1,
 597                                 "fmax"          => $seq->end,
 598                                 "strand"        => $strand,
 599                                 "phase"         => $phase,
 600                                 );
 601
 602                 $datahash{'featureloc'} = \%fl;
 603
 604         }
 605
 606
 607         #if $srcfeature is not given, use the Bio::Seq object itself as the srcfeature for featureloc's
 608         if (!defined $srcfeature) {
 609                 $srcfeature = $uniquename;
 610                 $srcfeattype = $ftype;
 611         }
 612
 613         #default data source is 'GenBank'
 614         if (!defined $datasource) {
 615                 $datasource = 'GenBank';
 616         }
 617
 618         if ($datasource =~ /GenBank/i) {
 619                 #sequence topology as feature_cvterm
 620                 if ($seq->can('is_circular') && $seq->is_circular) {
 621                         %sthash = (
 622                                 "cvterm_id"     => {'name' => 'circular',
 623                                                     'cv_id' => {
 624                                                         'name' => 'sequence topology',
 625                                                     },
 626                                                 },
 627                                    "pub_id"     => {'uniquename' => 'nullpub',
 628                                                     'type_id' => {
 629                                                         'name' => 'null pub',
 630                                                         'cv_id' => {
 631                                                             'name'=> 'pub type',
 632                                                         },
 633                                                     },
 634                                                 },
 635                                 );
 636                 } else {
 637                         %sthash = (
 638                                 "cvterm_id"     => { 'name' => 'linear',
 639                                                      'cv_id' => {
 640                                                          'name' => 'sequence topology',
 641                                                      }
 642                                                  },
 643                                 "pub_id"        => {'uniquename' => 'nullpub',
 644                                                     'type_id' => {
 645                                                         'name' => 'null pub',
 646                                                         'cv_id' => {
 647                                                             'name'=> 'pub type',
 648                                                         },
 649                                                     },
 650                                                 },
 651                                    );
 652                 }
 653                 push(@feature_cvterms, \%sthash);
 654
 655                 #division as feature_cvterm
 656                 if ($seq->can('division') && defined $seq->division()) {
 657                         $div = $seq->division();
 658                         %dvhash = (
 659                                 "cvterm_id"     => {'name' => $div,
 660                                                     'cv_id' => {
 661                                                         'name' => 'GenBank division'}},
 662                                 "pub_id"        => {'uniquename' => 'nullpub',
 663                                                     'type_id' => {
 664                                                         'name' => 'null pub',
 665                                                         'cv_id' => {
 666                                                             'name'=> 'pub type'},
 667                                                         }},
 668                                 );
 669                         push(@feature_cvterms, \%dvhash);
 670                 }
 671
 672                 $datahash{'feature_cvterm'} = \@feature_cvterms;
 673         } # closes if GenBank
 674
 675         #featureprop's
 676         #DEFINITION
 677         if ($seq->can('desc') && defined $seq->desc()) {
 678                 $temp = $seq->desc();
 679
 680                 my %prophash = (
 681                         "type_id"       => {'name' => 'description',
 682                                             'cv_id' => {
 683                                                 'name' =>
 684                                                  $cv_name{'feature_property'}
 685                                                        },
 686                                            },
 687                         "value"         => $temp,
 688                         );
 689
 690                 push(@top_featureprops, \%prophash);
 691         }
 692
 693         #KEYWORDS
 694         if ($seq->can('keywords')) {
 695             $temp = $seq->keywords();
 696
 697             if (defined $temp && $temp ne '.' && $temp ne '') {
 698                 my %prophash = (
 699                                 "type_id"   => {'name' => 'keywords',
 700                                                 'cv_id' => {
 701                                                   'name' =>
 702                                                    $cv_name{'feature_property'}
 703                                                            }
 704                                                 },
 705                                 "value"     => $temp,
 706                                 );
 707
 708                 push(@top_featureprops, \%prophash);
 709             }
 710         }
 711
 712         #COMMENT
 713         if ($seq->can('annotation')) {
 714                 $ann = $seq->annotation();
 715                 foreach my $comment ($ann->get_Annotations('comment')) {
 716                         $temp = $comment->as_text();
 717                         #print "fcomment: $temp\n";
 718                         my %prophash = (
 719                                 "type_id"   => {'name' => 'comment',
 720                                                 'cv_id' => {
 721                                                   'name' =>
 722                                                    $cv_name{'feature_property'}
 723                                                            }
 724                                                },
 725                                 "value"     => $temp,
 726                                 );
 727
 728                         push(@top_featureprops, \%prophash);
 729                 }
 730         }
 731
 732         my @top_dbxrefs = ();
 733         #feature object from Bio::DB::SeqFeature::Store
 734         if ($seq->can('attributes')) {
 735                 my %attributes = $seq->attributes;
 736                 for my $key (keys %attributes) {
 737                     next if ($key eq 'parent_id');
 738                     next if ($key eq 'load_id');
 739
 740                     if ($key eq 'Alias') {
 741                         @featuresyns = $self->handle_Alias_tag($seq,@featuresyns);
 742                     }
 743
 744                     ###FIXME deal with Dbxref, Ontology_term,source,
 745                     elsif ($key eq 'Ontology_term') {
 746                         @top_featurecvterms = $self->handle_Ontology_tag($seq,@top_featurecvterms);
 747                     }
 748
 749                     elsif ($key eq 'dbxref' or $key eq 'Dbxref') {
 750                         @top_dbxrefs = $self->handle_dbxref($seq, $key, @top_dbxrefs);
 751                     }
 752
 753                     elsif ($key =~ /^[a-z]/) {
 754                         @top_featureprops
 755                              = $self->handle_unreserved_tags($seq,$key,@top_featureprops);
 756                     }
 757                 }
 758         }
 759         $datahash{'feature_synonym'} = \@featuresyns;
 760
 761         if ($seq->can('source')) {
 762                 @top_dbxrefs = $self->handle_source($seq,@top_dbxrefs);
 763         }
 764
 765         #accession and version as feature_dbxref
 766         if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') {
 767             my $db = $self->_guess_acc_db($seq, $seq->accession_number);
 768             my %acchash = (
 769                            "db_id"      => {'name' => $db},
 770                            "accession"  => $seq->accession_number,
 771                            "version"    => $seq->seq_version,
 772                            );
 773             my %fdbx = ('dbxref_id' => \%acchash);
 774             push(@top_dbxrefs, \%fdbx);
 775         }
 776
 777         if( $seq->isa('Bio::Seq::RichSeqI') && defined $seq->get_secondary_accessions() ) {
 778                 my @secacc = $seq->get_secondary_accessions();
 779                 my $acc;
 780                 foreach $acc (@secacc) {
 781                         my %acchash = (
 782                                 "db_id"         => {'name' => 'GB'},
 783                                 "accession"     => $acc,
 784                                 );
 785                         my %fdbx = ('dbxref_id' => \%acchash);
 786                         push(@top_dbxrefs, \%fdbx);
 787                 }
 788         }
 789
 790         #GI number
 791         if( $seq->isa('Bio::Seq::RichSeqI') && defined ($seq->pid)) {
 792                 my $id = $seq->pid;
 793                 #print "reftype: ", ref($id), "\n";
 794
 795                 #if (ref($id) eq 'HASH') {
 796                 my %acchash = (
 797                         "db_id"         => {'name' => 'GI'},
 798                         "accession"     => $id,
 799                         );
 800                 my %fdbx = ('dbxref_id' => \%acchash);
 801                 push (@top_dbxrefs, \%fdbx);
 802         }
 803
 804         #REFERENCES as feature_pub
 805         if (defined $ann) {
 806             #get the references
 807             @references = $ann->get_Annotations('reference');
 808             foreach $ref (@references) {
 809                 undef(my %pubhash);
 810                 $refhash = $ref->hash_tree();
 811                 $location = $ref->location || $refhash->{'location'};
 812                 #print "location: $location\n";
 813
 814                 #get FBrf#, special for FlyBase SEAN loading
 815                 if (index($location, ' ==') >= 0) {
 816                     $location =~ /\s==/;
 817                                 #print "match: $MATCH\n";
 818                                 #print "prematch: $PREMATCH\n";
 819                                 #print "postmatch: $POSTMATCH\n";
 820                     $fbrf = $PREMATCH;
 821                     $location = $POSTMATCH;
 822                     $location =~ s/^\s//;
 823                 }
 824
 825                 #print "location: $location\n";
 826                 #unpublished reference
 827                 if ($location =~ /Unpublished/) {
 828                     $pubtype = 'unpublished';
 829                     %pubhash = (
 830                                 "title"         => $ref->title || $refhash->{'title'},
 831                                 #"miniref"      => substr($location, 0, 255),
 832                                 #"uniquename"   => $fbrf,
 833                                 "type_id"       => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}}
 834                                 );
 835                 }
 836                 #submitted
 837                 elsif ($location =~ /Submitted/) {
 838                     $pubtype = 'submitted';
 839
 840                     %pubhash = (
 841                                 "title"         => $ref->title || $refhash->{'title'},
 842                                 #"miniref"      => substr($location, 0, 255),
 843                                 #"uniquename"   => $fbrf,
 844                                 "type_id"       => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}}
 845                                 );
 846
 847                     undef(my $pyear);
 848                     $pyear = $self->_getSubmitYear($location);
 849                     if (defined $pyear) {
 850                         $pubhash{'pyear'} = $pyear;
 851                     }
 852                 }
 853
 854                 #published journal paper
 855                 elsif ($location =~ /\D+\s\d+\s\((\d+|\d+-\d+)\),\s(\d+-\d+|\d+--\d+)\s\(\d\d\d\d\)$/) {
 856                     $pubtype = 'paper';
 857
 858                                 #parse location to get journal, volume, issue, pages & year
 859                     $location =~ /\(\d\d\d\d\)$/;
 860
 861                     $year = $MATCH;
 862                     my $stuff = $PREMATCH;
 863                     $year =~ s/\(//; #remove the leading parenthesis
 864                     $year =~ s/\)//; #remove the trailing parenthesis
 865
 866                     $stuff =~ /,\s(\d+-\d+|\d+--\d+)\s$/;
 867
 868                     $pages = $MATCH;
 869                     $stuff = $PREMATCH;
 870                     $pages =~ s/^, //; #remove the leading comma and space
 871                     $pages =~ s/ $//; #remove the last space
 872
 873                     $stuff =~ /\s\d+\s\((\d+|\d+-\d+)\)$/;
 874
 875                     $volumeissue = $MATCH;
 876                     $journal = $PREMATCH;
 877                     $volumeissue =~ s/^ //; #remove the leading space
 878                     $volumeissue =~ /\((\d+|\d+-\d+)\)$/;
 879                     $issue = $MATCH;
 880                     $volume = $PREMATCH;
 881                     $issue =~ s/^\(//; #remove the leading parentheses
 882                     $issue =~ s/\)$//; #remove the last parentheses
 883                     $volume =~ s/^\s//; #remove the leading space
 884                     $volume =~ s/\s$//; #remove the last space
 885
 886                     %pubhash = (
 887                                 "title"         => $ref->title || $refhash->{'title'},
 888                                 "volume"        => $volume,
 889                                 "issue"         => $issue,
 890                                 "pyear"         => $year,
 891                                 "pages"         => $pages,
 892                                 #"miniref"      => substr($location, 0, 255),
 893                                 #"miniref"      => ' ',
 894                                 #"uniquename"   => $fbrf,
 895                                 "type_id"       => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}},
 896                                 "pub_relationship" => {
 897                                     'obj_pub_id' => {
 898                                         'uniquename' => $journal,
 899                                         'title' => $journal,
 900                                         #'miniref' => substr($journal, 0, 255),
 901                                         'type_id' =>{'name' => 'journal',
 902                                                      'cv_id' =>
 903                                                      {'name' => 'pub type'
 904                                                       },
 905                                                  },
 906                                                      #'pubprop' =>{'value'=> $journal,
 907                                                      #       'type_id'=>{'name' => 'abbreviation', 'cv_id' => {'name' => 'pubprop type'}},
 908                                                      #      },
 909                                                  },
 910                                            'type_id' => {
 911                                                'name' => 'published_in',
 912                                                'cv_id' => {
 913                                                    'name' => 'pub relationship type'},
 914                                            },
 915                                 },
 916                                 );
 917                 }
 918
 919                 #other references
 920                 else {
 921                     $pubtype = 'other';
 922                     %pubhash = (
 923                                 "title"         => $ref->title || $refhash->{'title'},
 924                                 #"miniref"      => $fbrf,
 925                                 "type_id"       => {
 926                                     'name' => $pubtype,
 927                                     'cv_id' => {'name' =>'pub type'}
 928                                 }
 929                                 );
 930                 }
 931
 932                 #pub_author
 933                 my $autref = $self->_getRefAuthors($ref);
 934                 if (defined $autref) {
 935                     $pubhash{'pub_author'} = $autref;
 936                 }
 937                 # if no author and is type 'submitted' and has submitter address, use the first 100 characters of submitter address as the author lastname.
 938                 else {
 939                     if ($pubtype eq 'submitted') {
 940                         my $autref = $self->_getSubmitAddr($ref);
 941                         if (defined $autref) {
 942                             $pubhash{'pub_author'} = $autref;
 943                         }
 944                     }
 945                 }
 946
 947                 #$ref->comment as pubprop
 948                 #print "ref comment: ", $ref->comment, "\n";
 949                 #print "ref comment: ", $refhash->{'comment'}, "\n";
 950                 if (defined $ref->comment || defined $refhash->{'comment'}) {
 951                     my $comnt = $ref->comment || $refhash->{'comment'};
 952                                 #print "remark: ", $comnt, "\n";
 953                     $pubhash{'pubprop'} = {
 954                         "type_id"       => {'name' => 'comment', 'cv_id' => {'name' => 'pubprop type'}},
 955                         "value"         => $comnt,
 956                     };
 957                 }
 958
 959                 #pub_dbxref
 960                 undef(my @pub_dbxrefs);
 961                 if (defined $fbrf) {
 962                     push(@pub_dbxrefs, {dbxref_id => {accession => $fbrf, db_id => {'name' => 'FlyBase'}}});
 963                 }
 964                 if (defined ($temp = $ref->medline)) {
 965                     push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'MEDLINE'}}});
 966                                 #use medline # as the pub's uniquename
 967                     $pubhash{'uniquename'} = $temp;
 968                 }
 969                 if (defined ($temp = $ref->pubmed)) {
 970                     push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'PUBMED'}}});
 971                 }
 972                 $pubhash{'pub_dbxref'} = \@pub_dbxrefs;
 973
 974                 #if the pub uniquename is not defined or blank, put its FBrf# as its uniquename
 975                 #this is unique to FlyBase
 976                 #USERS OF THIS MODULE: PLEASE MODIFY HERE TO IMPLEMENT YOUR POLICY
 977                 # ON PUB UNIQUENAME!!!
 978                 if (!defined $pubhash{'uniquename'} || $pubhash{'uniquename'} eq '') {
 979                     if (defined $fbrf) {
 980                         $pubhash{'uniquename'} = $fbrf;
 981                     }
 982                                 #else {
 983                                 #       $pubhash{'uniquename'} = $self->_CreatePubUname($ref);
 984                                 #}
 985                 }
 986
 987                 #add to collection of references
 988                 #if the pub covers the entire sequence of the top-level feature, add it to feature_pubs
 989                 if (($ref->start == 1 && $ref->end == $len) || (!defined $ref->start && !defined $ref->end)) {
 990                     push(@feature_pubs, {"pub_id" => \%pubhash});
 991                 }
 992                 #the pub is about a sub-sequence of the top-level feature
 993                 #create a feature for the sub-sequence and add pub as its feature_pub
 994                 #featureloc of this sub-sequence is against the top-level feature, in interbase coordinates.
 995                 else {
 996                     my %parf = (
 997                                 'uniquename'    => $uniquename . ':' . $ref->start . "\.\." . $ref->end,
 998                                 'organism_id'   =>\%organism,
 999                                 'type_id'       =>{'name' =>'region', 'cv_id' => {'name' => $cv_name{'sequence'} }},
1000                                 );
1001                     my %parfsrcf = (
1002                                     'uniquename'        => $uniquename,
1003                                     'organism_id'       =>\%organism,
1004                                     );
1005                     my %parfloc = (
1006                                    'srcfeature_id'      => \%parfsrcf,
1007                                    'fmin'               => $ref->start - 1,
1008                                    'fmax'               => $ref->end,
1009                                    );
1010                     $parf{'featureloc'} = \%parfloc;
1011                     $parf{'feature_pub'} = {'pub_id' => \%pubhash};
1012                     my %ffr = (
1013                                'subject_id'     => \%parf,
1014                                'type_id'                => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'}}},
1015                                );
1016                     push(@top_featrels, \%ffr);
1017                 }
1018             }
1019             $datahash{'feature_pub'} = \@feature_pubs;
1020         }
1021
1022         ##construct srcfeature hash for use in featureloc
1023         if (defined $srcfeature) {
1024                 %srcfhash = $self->_srcf_hash($srcfeature,
1025                                               $srcfeattype,
1026                                               \%organism);
1027         #       my %fr = (
1028         #               "object_id"     => \%srcfhash,
1029         #               "type_id"       => { 'name' => 'partof', 'cv_id' => { 'name' => 'relationship type'}},
1030         #               );
1031
1032         #       push (@top_featrels, \%fr);
1033         }
1034
1035         #unflatten the seq features in $seq if $seq is a gene or a DNA sequence
1036         if (($gb_type eq 'gene' || $gb_type eq 'DNA') &&
1037             !$nounflatten) {
1038                 my $u = Bio::SeqFeature::Tools::Unflattener->new;
1039                 $u->unflatten_seq(-seq=>$seq, -use_magic=>1);
1040         }
1041
1042         my @top_sfs = $seq->get_SeqFeatures;
1043         #print $#top_sfs, "\n";
1044
1045         #SUBFEATURES
1046
1047         if ($datasource =~ /GenBank/i) {
1048                 $tag_cv = 'GenBank feature qualifier';
1049         } elsif ($datasource =~ /GFF/i) {
1050                 $tag_cv = 'feature_property';
1051         } else {
1052                 $tag_cv = $cv_name{'feature_property'};
1053         }
1054
1055         my $si = 0;
1056         foreach $feat (@top_sfs) {
1057                 #$feat = $top_sfs[$si];
1058                 #print "si: $si\n";
1059                 my $prim_tag = $feat->primary_tag;
1060                 #print $prim_tag, "\n";
1061
1062                 # get all qualifiers of the 'source' feature, load these as top_featureprops of the top level feature
1063                 if ($prim_tag eq 'source') {
1064                         foreach $tag ($feat->all_tags()) {
1065                                 #db_xref
1066                                 if ($tag eq 'db_xref'
1067                                  or $tag eq 'Dbxref'
1068                                  or $tag eq 'dbxref')   {
1069                                         my @t1 = $feat->each_tag_value($tag);
1070                                         foreach $temp (@t1) {
1071                                            $temp =~ /([^:]*?):(.*)/;
1072                                            my $db = $1;
1073                                            my $xref = $2;
1074                                            #PRE/POST very inefficent
1075                                            #my $db = $PREMATCH;
1076                                            #my $xref = $POSTMATCH;
1077                                            my %acchash = (
1078                                                 "db_id"         => {'name' => $db},
1079                                                 "accession"     => $xref,
1080                                                 );
1081                                            my %fdbx = ('dbxref_id' => \%acchash);
1082                                            push (@top_dbxrefs, \%fdbx);
1083                                         }
1084                                 #Ontology_term
1085                                 } elsif ($tag eq 'Ontology_term') {
1086                                         my @t1 = $feat->each_tag_value($tag);
1087                                         foreach $temp (@t1) {
1088                                             ###FIXME
1089                                         }
1090                                 #other tags as featureprop
1091                                 } elsif ($tag ne 'gene') {
1092                                         my %prophash = undef;
1093                                         %prophash = (
1094                                                 "type_id"       => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}},
1095                                                 "value"         => join(' ',$feat->each_tag_value($tag)),
1096                                                 );
1097                                         push(@top_featureprops, \%prophash);
1098                                 }
1099                         }
1100
1101                         if ($feat->can('source')) {
1102                             my $source = $feat->source();
1103                             @top_dbxrefs = $self->handle_source($feat, @top_dbxrefs);
1104                         }
1105
1106                         #featureloc for the top-level feature
1107                         my $fmin = undef;
1108                         my $fmax = undef;
1109                         my $strand = undef;
1110                         my $phase = undef;
1111                         my %fl = undef;
1112
1113                         $fmin = $feat->start - 1;
1114                         $fmax = $feat->end;
1115                         $strand = $feat->strand;
1116
1117                         if ($feat->can('phase')) {
1118                             $phase = $feat->phase;
1119                         }
1120
1121                         %fl = (
1122                                 "srcfeature_id" => \%srcfhash,
1123                                 "fmin"          => $fmin,
1124                                 "fmax"          => $fmax,
1125                                 "strand"        => $strand,
1126                                 "phase"         => $phase,
1127                                 );
1128
1129                         $datahash{'featureloc'} = \%fl;
1130
1131                         #delete 'source' feature from @top_sfs
1132                         splice(@top_sfs, $si, 1);
1133                 }
1134                 $si ++;
1135         #close loop over top_sfs
1136         }
1137
1138         #the top-level features other than 'source'
1139         foreach $feat (@top_sfs) {
1140                 #print $feat->primary_tag, "\n";
1141
1142                 my $r = $self->_subfeat2featrelhash($name, $ftype, $feat, \%srcfhash, $tag_cv, $isanalysis);
1143
1144                 if (!($ftype eq 'mRNA' && $feat->primary_tag eq 'gene')) {
1145                         my %fr = %$r;
1146                         push(@top_featrels, \%fr);
1147                 } else {
1148                         %finaldatahash = %$r;
1149                 }
1150         }
1151
1152         if (@top_dbxrefs) {
1153                 $datahash{'feature_dbxref'} = \@top_dbxrefs;
1154         }
1155
1156         if (@top_featureprops) {
1157                 $datahash{'featureprop'} = \@top_featureprops;
1158         }
1159
1160         if (@top_featrels) {
1161                 $datahash{'feature_relationship'} = \@top_featrels;
1162         }
1163
1164         if (@top_featurecvterms) {
1165                 $datahash{'feature_cvterm'} = \@top_featurecvterms;
1166         }
1167
1168         if ($ftype eq 'mRNA' && %finaldatahash) {
1169                 $finaldatahash{'feature_relationship'} = {
1170                                                 'subject_id'    => \%datahash,
1171                                                 'type_id'       => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'} }},
1172                                                          };
1173         } else {
1174                 %finaldatahash = %datahash;
1175         }
1176
1177         my $mainTag = 'feature';
1178         $self->_hash2xml(undef, $mainTag, \%finaldatahash);
1179
1180         return 1;
1181 }
1182
# Recursively write one chado record (a hash) as XML through $self->{'writer'}.
#
# Args   : $isMatch - true when the record is nested inside a pub that is being
#                     matched; every tag written then carries op="match"
#          $ult     - table name, used as the tag enclosing the record
#          $ref     - hash reference describing the record. A key that is found
#                     in the file-level $chadotables string is a nested table
#                     (value: hash ref, or array ref of hash refs); any other
#                     key is a column of $ult. A column listed in %fkey under
#                     "<table>.<column>" is a foreign key whose value is the
#                     hash ref of the referenced record.
#          A trailing fourth argument (the old "root" flag) is still accepted
#          but is not read.
# Returns: the value of the closing endTag() call; the callers visible in this
#          file do not use it.
sub _hash2xml {
    my ($self, $isMatch, $ult, $ref) = @_;
    my %mh = %$ref;    # shallow copy; the caller's hash is never modified

    # Create the XML writer lazily on first use.
    if (!defined $self->{'writer'}) {
        $self->_create_writer();
    }
    my $writer = $self->{'writer'};

    # Look up the pub type without autovivifying $mh{'type_id'}: an
    # autovivified entry would later be emitted as an empty type_id column.
    my $pub_type;
    if ($ult eq 'pub' && ref($mh{'type_id'}) eq 'HASH') {
        $pub_type = $mh{'type_id'}{'name'};
    }

    # start opening tag
    # if pub record of type 'journal', form the 'ref' attribute for special pub
    # lookup; requires that the journal name itself is also stored as a pubprop
    # record for the journal with value equal to the journal name and type of
    # 'abbreviation'.
    if (defined $pub_type && $pub_type eq 'journal') {
        $writer->startTag($ult, 'ref' => $mh{'title'} . ':journal:abbreviation');
    }
    # special pub match if pub uniquename not known
    elsif ($ult eq 'pub' && !defined $mh{'uniquename'}) {
        $writer->startTag($ult, 'op' => 'match');
        # set the match flag, all the sub tags should also have "op"="match"
        $isMatch = 1;
    }
    # if cvterm or cv, lookup only
    elsif ($ult eq 'cvterm' || $ult eq 'cv') {
        $writer->startTag($ult, 'op' => 'lookup');
    }
    # if nested tables of match table, match too
    elsif ($isMatch) {
        $writer->startTag($ult, 'op' => 'match');
    }
    else {
        $writer->startTag($ult);
    }

    # Split the keys into columns and nested tables with ONE test, so that the
    # two groups are exact complements. The previous code tested "< 0" for
    # columns and "> 0" for tables, which silently dropped any key matching at
    # offset 0 of $chadotables.
    my @columns;
    my @tables;
    foreach my $key (keys %mh) {
        my $is_table = index($chadotables, ' ' . $key) >= 0
                    || index($chadotables, $key . ' ') >= 0;
        if ($is_table) {
            push @tables, $key;
        } else {
            push @columns, $key;
        }
    }

    # first pass: produce xml for all the table columns
    foreach my $key (@columns) {
        if ($isMatch) {
            $writer->startTag($key, 'op' => 'match');
        } else {
            $writer->startTag($key);
        }

        my $fk_table = $fkey{$ult . '.' . $key};
        if (defined $fk_table) {
            # the column is a foreign key: nest the referenced record
            $self->_hash2xml($isMatch, $fk_table, $mh{$key}, 0);
        } else {
            $writer->characters($mh{$key});
        }
        $writer->endTag($key);
    }

    # second pass: produce xml for all the nested tables
    foreach my $key (@tables) {
        my $nested = $mh{$key};
        my $kind   = ref($nested);
        if ($kind eq 'HASH') {
            $self->_hash2xml($isMatch, $key, $nested, 0);
        } elsif ($kind eq 'ARRAY') {
            foreach my $record (@$nested) {
                $self->_hash2xml($isMatch, $key, $record, 0);
            }
        }
    }

    # end tag
    $writer->endTag($ult);
}
1298
# Guess the database an accession number comes from, based on its format.
#
# Args   : $seq - sequence object; when it provides molecule(), a value of
#                 'protein' enables the protein-only patterns (PIR, PRF)
#          $acc - accession string
# Returns: one of "RefSeq", "GB", "PIR", "PRF", "LocusID", "FlyBase" or
#          "unknown"
sub _guess_acc_db {
        my $self = shift;
        my $seq = shift;
        my $acc = shift;

        return "unknown" unless defined $acc;

        # RefSeq known (NM/NP/NT/NC) and model (XM/XP/XR) records
        return "RefSeq" if $acc =~ /^(?:NM|NP|NT|NC|XM|XP|XR)_\d{6}/;

        return "GB" if $acc =~ /^[a-zA-Z]{1,2}\d{5,6}/;

        # Only consult molecule() when the object really has it; not every
        # object offering accession_number() does, and calling it blindly
        # would abort the whole write.
        my $molecule;
        {
                local $@;
                if (eval { $seq->can('molecule') }) {
                        $molecule = $seq->molecule();
                }
        }

        if (defined $molecule && $molecule eq 'protein') {
                # The class used to read [a-zA-z], which also accepted the
                # punctuation characters between 'Z' and 'a'.
                # NOTE(review): every string matching this pattern also matches
                # the GB pattern above, so this branch cannot be reached as the
                # tests are ordered; left in place to keep the original
                # precedence.
                return "PIR" if $acc =~ /^[a-zA-Z]\d{5}/;
                return "PRF" if $acc =~ /^\d{6,7}[a-zA-Z]/;
        }

        # contains digits and no letters at all
        return "LocusID" if $acc =~ /\d+/ && $acc !~ /[a-zA-Z]/;

        return "FlyBase" if $acc =~ /^CG\d+/ || $acc =~ /^FB[a-z][a-z]\d+/;

        return "unknown";
}
1323
1324 sub _subfeat2featrelhash {
1325         my $self = shift;
1326         my $genename = shift;
1327         my $seqtype = shift;
1328         my $feat = shift;
1329         my $r = shift;
1330         my %srcf = %$r;         #srcfeature hash for featureloc.srcfeature_id
1331         my $tag_cv = shift;
1332         my $isanalysis = shift;
1333
1334         my $prim_tag = $feat->primary_tag;
1335
1336         my $sfunique = undef;           #subfeature uniquename
1337         my $sfname = undef;             #subfeature name
1338         my $sftype = undef;             #subfeature type
1339
1340         if ($feat->has_tag('symbol')) {
1341                 ($sfunique) = $feat->each_tag_value("symbol");
1342         } elsif ($feat->has_tag('label')) {
1343                 ($sfunique) = $feat->each_tag_value("label");
1344         } else {
1345                 #$self->throw("$prim_tag at " . $feat->start . "\.\." . $feat->end . " does not have symbol or label! To convert into chadoxml, a seq feature must have a /symbol or /label tag holding its unique name.");
1346                 #generate feature unique name as <genename>-<feature-type>-<span>
1347                 $sfunique = $self->_genFeatUniqueName($genename, $feat);
1348         }
1349
1350         if ($feat->has_tag('Name')) {
1351                 ($sfname) = $feat->each_tag_value("Name");
1352         }
1353
1354         #feature type translation
1355         if (defined $feattype_args2so{$prim_tag}) {
1356                 $sftype = $feattype_args2so{$prim_tag};
1357         } else {
1358                 $sftype = $prim_tag;
1359         }
1360
1361         if ($prim_tag eq 'mutation') {
1362                 if ($feat->start == $feat->end) {
1363                         $sftype = $feattype_args2so{'mutation1'};
1364                 } else {
1365                         $sftype = $feattype_args2so{'mutation2'};
1366                 }
1367         }
1368
1369         #set is_analysis flag for gene model features
1370         undef(my $isanal);
1371         if ($sftype eq 'gene' || $sftype eq 'mRNA' || $sftype eq 'exon' || $sftype eq 'protein' || $sftype eq 'polypeptide') {
1372                 $isanal = $isanalysis;
1373         }
1374
1375         my %sfhash = (
1376                 "name"                  => $sfname,
1377                 "uniquename"            => $sfunique,
1378                 "organism_id"           => \%organism,
1379                 "type_id"               => { 'name' => $sftype, 'cv_id' => { 'name' => $cv_name{'sequence'} }},
1380                 "is_analysis"           => $isanal || 'false',
1381                 );
1382
1383         #make a copy of %sfhash for passing to this method when recursively called
1384         #my %srcfeat = (
1385         #        "name"                  => $sfname,
1386         #        "uniquename"            => $sfunique,
1387         #        "organism_id"           => \%organism,
1388         #        "type_id"               => { 'name' => $sftype, 'cv_id' => { 'name' => 'SO'}},
1389         #        );
1390
1391         #featureloc for subfeatures
1392         undef(my $sfmin);
1393         undef(my $sfmax);
1394         undef(my $is_sfmin_partial);
1395         undef(my $is_sfmax_partial);
1396         undef(my $sfstrand);
1397         undef(my $sfphase);
1398         $sfmin = $feat->start - 1;
1399         $sfmax = $feat->end;
1400         $sfstrand = $feat->strand();
1401
1402         if ($feat->can('phase')) {
1403             $sfphase = $feat->phase;
1404         }
1405
1406         #if the gene feature in an mRNA record, cannot use its coordinates, omit featureloc
1407         if ($seqtype eq 'mRNA' && $sftype eq 'gene') {
1408         } else {
1409                 if ($feat->location->isa('Bio::Location::FuzzyLocationI')) {
1410                         if ($feat->location->start_pos_type() ne 'EXACT') {
1411                                 $is_sfmin_partial = 'true';
1412                         }
1413                         if ($feat->location->end_pos_type() ne 'EXACT') {
1414                                 $is_sfmax_partial = 'true';
1415                         }
1416                 }
1417
1418                 my %sfl = (
1419                         "srcfeature_id" => \%srcf,
1420                         "fmin"          => $sfmin,
1421                         "is_fmin_partial" => $is_sfmin_partial || 'false',
1422                         "fmax"          => $sfmax,
1423                         "is_fmax_partial" => $is_sfmax_partial || 'false',
1424                         "strand"        => $sfstrand,
1425                         "phase"         => $sfphase,
1426                         );
1427
1428                 $sfhash{'featureloc'} = \%sfl;
1429         }
1430
1431
1432         #subfeature tags
1433         undef(my @sfdbxrefs);           #subfeature dbxrefs
1434         undef(my @sub_featureprops);    #subfeature props
1435         undef(my @sub_featuresyns);     #subfeature synonyms
1436         undef(my @sub_featurecvterms);  #subfeature cvterms
1437         foreach my $tag ($feat->all_tags()) {
1438                 #feature_dbxref for features
1439                 if ($tag eq 'db_xref' or $tag eq 'dbxref' or $tag eq 'Dbxref')   {
1440                         my @t1 = $feat->each_tag_value($tag);
1441                         #print "# of dbxref: @t1\n";
1442                         for my $temp (@t1) {
1443                            $temp =~ /:/;
1444                            my $db = $PREMATCH;
1445                            my $xref = $POSTMATCH;
1446                            #print "db: $db; xref: $xref\n";
1447                            my %acchash = (
1448                                 "db_id"         => {'name' => $db},
1449                                 "accession"     => $xref,
1450                                 );
1451                            my %sfdbx = ('dbxref_id' => \%acchash);
1452                            push (@sfdbxrefs, \%sfdbx);
1453                         }
1454                 #Alias tags
1455                 } elsif ($tag eq 'Alias') {
1456                         @sub_featuresyns = $self->handle_Alias_tag($feat, @sub_featuresyns);
1457                 } elsif ($tag eq 'Ontology_term') {
1458                         @sub_featurecvterms = $self->handle_Ontology_tag($feat, @sub_featurecvterms);
1459                 #featureprop for features, excluding GFF Name & Parent tags
1460                 } elsif ($tag ne 'gene' && $tag ne 'symbol' && $tag ne 'Name' && $tag ne 'Parent') {
1461                         next if ($tag eq 'parent_id');
1462                         next if ($tag eq 'load_id');
1463                         foreach my $val ($feat->each_tag_value($tag)) {
1464                                 my %prophash = undef;
1465                                 %prophash = (
1466                                         "type_id"       => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}},
1467                                         "value"         => $val,
1468                                 );
1469                                 push(@sub_featureprops, \%prophash);
1470                         }
1471                 }
1472         }
1473
1474         if ($feat->can('source')) {
1475                 @sfdbxrefs = $self->handle_source($feat,@sfdbxrefs);
1476         }
1477
1478         if (@sub_featureprops) {
1479                 $sfhash{'featureprop'} = \@sub_featureprops;
1480         }
1481         if (@sfdbxrefs) {
1482                 $sfhash{'feature_dbxref'} = \@sfdbxrefs;
1483         }
1484         if (@sub_featuresyns) {
1485                 $sfhash{'feature_synonym'} = \@sub_featuresyns;
1486         }
1487         if (@sub_featurecvterms) {
1488                 $sfhash{'feature_cvterm'} = \@sub_featurecvterms;
1489         }
1490
1491         undef(my @ssfeatrel);
1492         if ($feat->has_tag('locus_tag')) {
1493                 ($genename)= $feat->each_tag_value('locus_tag');
1494         } elsif ($feat->has_tag('gene')) {
1495                 ($genename)= $feat->each_tag_value('gene');
1496         }
1497
1498         foreach my $sf ($feat->get_SeqFeatures()) {
1499                 #print $sf->primary_tag, "\n";
1500                 my $rref = $self->_subfeat2featrelhash($genename, $sftype, $sf, \%srcf, $tag_cv, $isanalysis);
1501                 if (defined $rref) {
1502                         push(@ssfeatrel, $rref);
1503                 }
1504         }
1505
1506         if (@ssfeatrel) {
1507                 $sfhash{'feature_relationship'} = \@ssfeatrel;
1508         }
1509
1510         #subj-obj relationship type
1511         undef(my $reltypename);
1512         $reltypename = return_reltypename($sftype);
1513
1514         my %fr = (
1515                 "subject_id"    => \%sfhash,
1516                 "type_id"               => { 'name' => $reltypename,
1517                                              'cv_id' => { 'name' => $cv_name{'relationship'} }},
1518                 );
1519
1520         if ($seqtype eq 'mRNA' && $sftype eq 'gene') {
1521                 return \%sfhash;
1522         } else {
1523                 return \%fr;
1524         }
1525
1526 }
1527
#build a feature uniquename of the form <genename>-<feature-type>-<span>,
#e.g. foo-mRNA-10..1000
#arguments: fallback gene name, then the seq feature object
#a /locus_tag value on the feature (or, failing that, a /gene value)
#takes precedence over the gene name passed in
sub _genFeatUniqueName {
    my ($self, $genename, $feat) = @_;

    my $ftype = $feat->primary_tag;
    my $span  = join("\.\.", $feat->start, $feat->end);

    #prefer the feature's own identifying tag over the caller's gene name
    foreach my $tag ('locus_tag', 'gene') {
        next unless $feat->has_tag($tag);
        ($genename) = $feat->each_tag_value($tag);
        last;
    }

    return $genename . '-' . $ftype . '-' . $span;
}
1548
1549 #create uniquename for pubs with no medline id and no FBrf#
1550 #use "<authors>, <year>, <type>" as the uniquename (same as miniref)
1551 #<authors> is <sole-author-surname>    if one author,
1552 #  or <first-author-surname> and <second-author-surname>   if two,
1553 #  or <first-author-surname> et al.   if more
1554 #sub _CreatePubUname {
1555 #       my $self = shift;
1556 #       my $pub = shift;
1557 #       undef(my $pubuname);
1558 #
1559 #       return $pubuname;
1560 #}
1561
#get authors of a reference
#argument: a reference object providing an authors() method
#returns ref to the array of author hashes, each of the form
#  { author_id => { surname => ..., givennames => ... }, arank => N }
#where arank is the 1-based position of the author in the list
#returns nothing when the authors string is '.', i.e. there are no authors
#(Bio::SeqIO::genbank doesn't pick up 'CONSRTM' line)
sub _getRefAuthors {
    my ($self, $ref) = @_;

    my $author_string = $ref->authors;

    #an undefined authors string yields an empty author list, as before,
    #but without tripping "uninitialized" warnings below
    $author_string = '' unless defined $author_string;

    return if $author_string eq '.';

    #the last author is joined with ' and ', the others with ', '
    my @names;
    my $and_at = index($author_string, ' and ');
    if ($and_at > 0) {
        @names = split(/\, /, substr($author_string, 0, $and_at));
        push @names, substr($author_string, $and_at + length(' and '));
    }
    else {
        @names = split(/\, /, $author_string);
    }

    my @ranked_authors;
    my $rank = 0;
    foreach my $name (@names) {
        $rank++;

        #parse the author lastname and givennames
        my ($surname, $givennames);
        if (index($name, ',') > 0) {            #genbank format, last,f.m.
            ($surname, $givennames) = split(/\,/, $name);
        }
        elsif (index($name, ' ') > 0) {         #embl format, last f.m.
            ($surname, $givennames) = split(/ /, $name);
        }
        elsif (index($name, ',') < 0 && index($name, ' ') < 0) {
            #single-token name: keep it as the surname instead of
            #emitting an author with neither surname nor givennames
            $surname = $name;
        }

        push @ranked_authors, {
            author_id => { 'surname' => $surname, 'givennames' => $givennames },
            arank     => $rank,
        };
    }

    return \@ranked_authors;
}
1611
#extract submission year from the citation of the submitted reference
#genbank format for the submitted citation: JOURNAL   Submitted (DD-MON-YYYY) submitter address
#argument: the citation string
#returns the four-digit year, or nothing (after a warning) when the citation
#is not a "Submitted" citation or carries no (DD-MON-YYYY) date
sub _getSubmitYear {
    my ($self, $citation) = @_;

    if (!defined $citation || $citation !~ /Submitted/) {
        $self->warn("not citation for a submitted reference. cannot extract submission year.");
        return;
    }

    #capture the year directly; relying on $MATCH after an unchecked match
    #would hand back the text of the previous successful match ("Submitted")
    #whenever the date pattern is absent
    if ($citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-(\d{4})\)/) {
        return $1;
    }

    $self->warn("submitted citation has no (DD-MON-YYYY) date. cannot extract submission year.");
    return;
}
1630
#extract the submitter address from the citation of a submitted reference
#genbank format for the submitted citation: Submitted (DD-MON-YYYY) submitter address
#argument: a reference object whose location() returns the citation string
#returns a hash ref { author_id => { surname => <address> } } where the
#address is everything after the date, minus one leading whitespace
#character and cut to at most 100 characters; returns nothing when the
#citation is not a "Submitted" citation or has no (DD-MON-YYYY) date
sub _getSubmitAddr {
    my ($self, $ref) = @_;

    my $citation = $ref->location;
    if (!defined $citation || $citation !~ /Submitted/) {
        $self->warn("not citation for a submitted reference. cannot extract submitter address.");
        return;
    }

    #capture the text after the date explicitly; $POSTMATCH after an
    #unchecked match would still hold the remainder of the /Submitted/
    #match above whenever the date pattern is absent
    if ($citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-\d{4}\)(.*)\z/s) {
        my $address = $1;
        $address =~ s/^\s//;
        my %author = (
            'author_id' => { 'surname' => substr($address, 0, 100) },
        );
        return \%author;
    }

    return;
}
1654
1655
1656 =head2 suppress_residues
1657
1658 =over
1659
1660 =item Usage
1661
1662   $obj->suppress_residues()        #get existing value
1663   $obj->suppress_residues($newval) #set new value
1664
1665 =item Function
1666
1667 Keep track of the flag to suppress printing of residues in the chadoxml file.
1668 The default is to allow all residues to go into the file.
1669
1670 =item Returns
1671
1672 value of suppress_residues (a scalar)
1673
1674 =item Arguments
1675
1676 new value of suppress_residues (to set)
1677
1678 =back
1679
1680 =cut
1681
sub suppress_residues {
    my $self = shift;

    # Plain "my ... = shift": the former "my $x = shift if defined(@_)"
    # combined a conditional "my" (value may leak between calls) with
    # defined(@array), which modern Perl rejects at compile time.
    my $suppress_residues = shift;

    # Setter: only a defined argument replaces the stored value
    if (defined $suppress_residues) {
        $self->{'suppress_residues'} = $suppress_residues;
    }

    # Getter (also the return value after setting)
    return $self->{'suppress_residues'};
}
1688
1689 =head2 allow_residues
1690
1691 =over
1692
1693 =item Usage
1694
1695   $obj->allow_residues()        #get existing value
1696   $obj->allow_residues($feature_type) #set new value
1697
1698 =item Function
1699
1700 Track the allow_residues type.  This can be used in conjunction with the
1701 suppress_residues flag to only allow residues from a specific feature type
1702 to be printed in the xml file, for example, only printing chromosome
1703 residues.  When suppress_residues is set to true, then only chromosome
1704 features would go into the xml file.  If suppress_residues is not
1705 set, this function has no effect (since the default is to put all residues
1706 in the xml file).
1707
1708 =item Returns
1709
1710 value of allow_residues (a string that corresponds to a feature type)
1711
1712 =item Arguments
1713
1714 new value of allow_residues (to set)
1715
1716 =back
1717
1718 =cut
1719
sub allow_residues {
    my $self = shift;

    # Plain "my ... = shift": the former "my $x = shift if defined(@_)"
    # combined a conditional "my" (value may leak between calls) with
    # defined(@array), which modern Perl rejects at compile time.
    my $allow_residues = shift;

    # Setter: only a defined argument replaces the stored feature type
    if (defined $allow_residues) {
        $self->{'allow_residues'} = $allow_residues;
    }

    # Getter (also the return value after setting)
    return $self->{'allow_residues'};
}
1726
1727 =head2 return_ftype_hash
1728
1729 =over
1730
1731 =item Usage
1732
1733   $obj->return_ftype_hash()
1734
1735 =item Function
1736
1737 A simple hash where returning it has been factored out of the main
1738 code to allow subclasses to override it.
1739
1740 =item Returns
1741
1742 A hash that indicates what the name of the SO term is and what
1743 the name of the Sequence Ontology is in the cv table.
1744
1745 =item Arguments
1746
1747 The string that represents the SO term.
1748
1749 =back
1750
1751 =cut
1752
sub return_ftype_hash {
    my ($self, $ftype) = @_;

    # The cv name for sequence terms comes from the module-wide %cv_name table
    my %cv_id      = ( "name" => $cv_name{'sequence'} );
    my %ftype_hash = (
        "name"  => $ftype,
        "cv_id" => \%cv_id,
    );

    # Returned as a hash (flattened to key/value pairs in list context)
    return %ftype_hash;
}
1760
1761 =head2 return_reltypename
1762
1763 =over
1764
1765 =item Usage
1766
1767   $obj->return_reltypename()
1768
1769 =item Function
1770
1771 Return the appropriate relationship type name depending on the
1772 feature type (typically part_of, but derives_from for polypeptide).
1773
1774 =item Returns
1775
1776 A relationship type name.
1777
1778 =item Arguments
1779
1780 A SO type name.
1781
1782 =back
1783
1784 =cut
1785
sub return_reltypename {
    # Accept both the documented method form, $obj->return_reltypename($type),
    # and the plain function form, return_reltypename($type), which this
    # module itself uses. Unconditionally shifting off an invocant turned the
    # function form's only argument into $self and left the type undefined,
    # so every feature was reported as 'part_of'.
    my @args = @_;
    shift @args if @args > 1 || ref $args[0];
    my $sftype = shift @args;

    # polypeptides derive from their transcript; everything else is a part
    if (defined $sftype && ($sftype eq 'protein' || $sftype eq 'polypeptide')) {
        return 'derives_from';
    }

    return 'part_of';
}
1799
1800 =head2 next_seq
1801
1802 =over
1803
1804 =item Usage
1805
1806   $obj->next_seq()
1807
1808 =item Function
1809
1810 Not implemented--this is a write-only adapter.
1811
1812 =item Returns
1813
1814 =item Arguments
1815
1816 =back
1817
1818 =cut
1819
sub next_seq {
    my ($self) = @_;

    # Reading is unsupported; any arguments are ignored and the call
    # always ends in throw()
    return $self->throw('next_seq is not implemented; this is a write-only adapter.');
}
1826
1827
1828 =head2 _create_writer
1829
1830 =over
1831
1832 =item Usage
1833
1834   $obj->_create_writer()
1835
1836 =item Function
1837
1838 Creates XML::Writer object and writes start tag
1839
1840 =item Returns
1841
1842 Nothing, though the writer persists as part of the chadoxml object
1843
1844 =item Arguments
1845
1846 None
1847
1848 =back
1849
1850 =cut
1851
sub _create_writer {
    my $self = shift;

    # Direct class-method call instead of the indirect-object form
    # "new XML::Writer(...)", whose parsing depends on what is in scope
    my $writer = XML::Writer->new(OUTPUT      => $self->_fh,
                                  DATA_MODE   => 1,
                                  DATA_INDENT => 3);

    # keep the writer on the object so later calls can reuse it
    $self->{'writer'} = $writer;

    #print header
    $writer->xmlDecl("UTF-8");
    $writer->comment("created by Peili Zhang, Flybase, Harvard University\n".
                     "and Scott Cain, GMOD, Cold Spring Harbor Laboratory");

    #start chadoxml; the matching end tag is written by close_chadoxml
    $writer->startTag('chado');

    return;
}
1869
1870 =head2 close_chadoxml
1871
1872 =over
1873
1874 =item Usage
1875
1876   $obj->close_chadoxml()
1877
1878 =item Function
1879
1880 Writes the closing xml tag
1881
1882 =item Returns
1883
1884 Nothing
1885
1886 =item Arguments
1887
1888 None
1889
1890 =back
1891
1892 =cut
1893
sub close_chadoxml {
    my ($self) = @_;

    # Close the root <chado> element on the writer stored on this object
    my $writer = $self->{'writer'};
    $writer->endTag('chado');

    return;
}
1900
1901 =head2 handle_unreserved_tags
1902
1903 =over
1904
1905 =item Usage
1906
1907   $obj->handle_unreserved_tags()
1908
1909 =item Function
1910
1911 Converts tag value pairs to xml-ready hashrefs
1912
1913 =item Returns
1914
1915 The array containing the hashrefs
1916
1917 =item Arguments
1918
1919 In order: the Seq or SeqFeature object, the key, and the hasharray
1920
1921 =back
1922
1923 =cut
1924
sub handle_unreserved_tags {
    my ($self, $seq, $key, @arr) = @_;

    # One property hashref per value of the attribute, typed by the
    # attribute name within the feature_property cv from %cv_name
    foreach my $value ($seq->attributes($key)) {
        my %type_id = (
            'name'  => $key,
            'cv_id' => { 'name' => $cv_name{'feature_property'} },
        );
        push @arr, { "type_id" => \%type_id, "value" => $value };
    }

    # The incoming entries come first, followed by the new ones
    return @arr;
}
1944
1945 =head2 handle_Alias_tag
1946
1947 =over
1948
1949 =item Usage
1950
1951   $obj->handle_Alias_tag()
1952
1953 =item Function
1954
1955 Convert Alias values to synonym hash refs
1956
1957 =item Returns
1958
1959 An array of synonym hash tags
1960
1961 =item Arguments
1962
1963 The seq or seqFeature object and the synonym hash array
1964
1965 =back
1966
1967 =cut
1968
sub handle_Alias_tag {
    my ($self, $seq, @arr) = @_;

    foreach my $Alias ($seq->attributes('Alias')) {
        # every alias becomes an 'exact' synonym; name and sgml form are equal
        my %synonym = (
            "type_id"      => { 'name'  => 'exact',
                                'cv_id' => { 'name' => 'synonym_type' } },
            "name"         => $Alias,
            "synonym_sgml" => $Alias,
        );

        # placeholder publication: every identifying field is the string 'null'
        my %null_pub = (
            'uniquename' => 'null',
            'type_id'    => { 'name'  => 'null',
                              'cv_id' => { 'name' => 'null' } },
        );

        push @arr, { 'synonym_id' => \%synonym, 'pub_id' => \%null_pub };
    }

    # The incoming entries come first, followed by the new synonyms
    return @arr;
}
1995
1996 =head2 handle_Ontology_tag
1997
1998 =over
1999
2000 =item Usage
2001
2002   $obj->handle_Ontology_tag ()
2003
2004 =item Function
2005
2006 Convert Ontology_term values to ontology term hash refs
2007
2008 =item Returns
2009
2010 An array of ontology term hash refs
2011
2012 =item Arguments
2013
2014 The seq or seqFeature object and the ontology term array
2015
2016 =back
2017
2018 =cut
2019
sub handle_Ontology_tag  {
    my ($self, $seq, @arr) = @_;

    foreach my $term ($seq->attributes('Ontology_term')) {
        # terms are expected in the form DB:accession
        my ($db, $acc) = $term =~ /(\S+):(\S+)/;

        # A term without a DB:accession pair used to be pushed as
        # { cvterm_id => undef }; skip it with a warning instead
        unless (defined $acc) {
            $self->warn("skipping Ontology_term '$term': expected the form DB:accession");
            next;
        }

        my $hashref = {
            'cvterm_id' => {
                'dbxref_id' => {
                    'db_id'     => { 'name' => $db },
                    'accession' => $acc,
                },
            },
        };
        push(@arr, {cvterm_id => $hashref});
    }

    # The incoming entries come first, followed by the new cvterm entries
    return @arr;
}
2045
2046 =head2 handle_dbxref
2047
2048 =over
2049
2050 =item Usage
2051
2052   $obj->handle_dbxref()
2053
2054 =item Function
2055
2056 Convert Dbxref values to dbxref hashref
2057
2058 =item Returns
2059
2060 An array of dbxref hashrefs
2061
2062 =item Arguments
2063
2064 A seq or seqFeature object and the dbxref array
2065
2066 =back
2067
2068 =cut
2069
sub handle_dbxref {
    my ($self, $seq, $tag, @arr) = @_;

    foreach my $term ($seq->attributes($tag)) {
        my $hashref;

        # values are expected in the form DB:accession[.version]
        if (my ($db, $acc) = $term =~ /(\S+):(\S+)/) {
            # version defaults to 1 unless the accession carries a .suffix
            my $version = 1;
            if (my ($base, $suffix) = $acc =~ /(\S+)\.(\S+)/) {
                ($acc, $version) = ($base, $suffix);
            }

            my %dbxref = (
                'db_id'     => { 'name' => $db },
                'accession' => $acc,
                'version'   => $version,
            );
            $hashref = { 'dbxref_id' => \%dbxref };
        }
        else {
            $self->throw("I don't know how to handle a dbxref like $term");
        }

        # NOTE(review): the pushed entry wraps $hashref in a second
        # 'dbxref_id' level, unlike the inline db_xref handling in
        # _subfeat2featrelhash; confirm the writer expects this nesting.
        push(@arr, {'dbxref_id' => $hashref});
    }

    # The incoming entries come first, followed by the new dbxref entries
    return @arr;
}
2102
2103 =head2 handle_source
2104
2105 =over
2106
2107 =item Usage
2108
2109   $obj->handle_source()
2110
2111 =item Function
2112
2113 =item Returns
2114
2115 =item Arguments
2116
2117 =back
2118
2119 =cut
2120
sub handle_source {
    my ($self, $seq, @arr) = @_;

    my $source = $seq->source();

    # A false source (undef, '' or 0) adds nothing to the list
    if ($source) {
        my %dbxref = (
            'db_id'     => {'name' => 'GFF_source'},
            'accession' => $source,
        );

        # NOTE(review): the entry nests 'dbxref_id' twice, unlike the inline
        # db_xref handling in _subfeat2featrelhash; confirm the writer
        # expects this nesting.
        push(@arr, {'dbxref_id' => { 'dbxref_id' => \%dbxref }});
    }

    # The incoming entries come first, followed by the source dbxref if any
    return @arr;
}
2139
2140 =head2 _srcf_hash
2141
2142 =over
2143
2144 =item Usage
2145
2146   $obj->_srcf_hash()
2147
2148 =item Function
2149
2150 Creates the srcfeature hash for use in featureloc hashes
2151
2152 =item Returns
2153
2154 The srcfeature hash
2155
2156 =item Arguments
2157
2158 The srcfeature name, the srcfeature type and a reference to the
2159 organism hash.
2160
2161 =back
2162
2163 =cut
2164
sub _srcf_hash {
    my ($self, $srcf, $stype, $orgref) = @_;

    # The type is looked up in the sequence cv named in %cv_name
    my %type_id = (
        'name'  => $stype,
        'cv_id' => {'name' => $cv_name{'sequence'} },
    );

    my %hash = (
        'uniquename'  => $srcf,
        'organism_id' => $orgref,
        'type_id'     => \%type_id,
    );

    # Returned as a hash (flattened to key/value pairs in list context)
    return %hash;
}
2180
2181
2182 1;