branch-1-6/Bio/SeqIO/chadoxml.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::SeqIO::chadoxml
   4 #
   5 # Peili Zhang   <peili@morgan.harvard.edu>
   6 #
   7 # You may distribute this module under the same terms as perl itself
   8
   9 # POD documentation - main docs before the code
  10
  11 =head1 NAME
  12
  13 Bio::SeqIO::chadoxml - chadoxml sequence output stream
  14
  15 =head1 SYNOPSIS
  16
  17 It is probably best not to use this object directly, but
  18 rather go through the SeqIO handler system:
  19
  20     $writer = Bio::SeqIO->new(-file => ">chado.xml",
  21                               -format => 'chadoxml');
  22
  23     # assume you already have Sequence or SeqFeature objects
  24     $writer->write_seq($seq_obj);
  25
  26     #after writing all seqs
  27     $writer->close_chadoxml();
  28
  29
  30
  31 =head1 DESCRIPTION
  32
  33 This object can transform Bio::Seq objects to chadoxml flat
  34 file databases (for chadoxml DTD, see
  35 http://gmod.cvs.sourceforge.net/gmod/schema/chado/dat/chado.dtd).
  36
  37 This is currently a write-only module.
  38
  39     $seqio = Bio::SeqIO->new(-file => '>outfile.xml',
   40                              -format => 'chadoxml',
  41                              -suppress_residues => 1,
  42                              -allow_residues => 'chromosome',
  43                              );
  44
  45     # we have a Bio::Seq object $seq which is a gene located on
  46     # chromosome arm 'X', to be written out to chadoxml
  47     # before converting to chadoxml, $seq object B<must> be transformed
  48     # so that all the coordinates in $seq are against the source
  49     # feature to be passed into Bio::SeqIO::chadoxml->write_seq()
  50     # -- chromosome arm X in the example below.
  51
  52     $seqio->write_seq(-seq=>$seq,
  53                       -genus   => 'Homo',
  54                       -species => 'sapiens',
  55                       -seq_so_type=>'gene',
  56                       -src_feature=>'X',
  57                       -src_feat_type=>'chromosome_arm',
  58                         -nounflatten=>1,
  59                       -is_analysis=>'true',
  60                       -data_source=>'GenBank');
  61
  62 The chadoxml output of Bio::SeqIO::chadoxml-E<gt>write_seq() method can be
  63 passed to the loader utility in XORT package
  64 (http://gmod.cvs.sourceforge.net/gmod/schema/XMLTools/XORT/)
  65 to be loaded into chado.
  66
  67 This object is currently implemented to work with sequence and
  68 annotation data from whole genome projects deposited in GenBank. It
  69 may not be able to handle all different types of data from all
  70 different sources.
  71
  72 In converting a Bio::Seq object into chadoxml, a top-level feature is
  73 created to represent the object and all sequence features inside the
  74 Bio::Seq object are treated as subfeatures of the top-level
  75 feature. The Bio::SeqIO::chadoxml object calls
  76 Bio::SeqFeature::Tools::Unflattener to unflatten the flat feature list
  77 contained in the subject Bio::Seq object, to build gene model
  78 containment hierarchy conforming to chado central dogma model: gene
  79 --E<gt> mRNA --E<gt> exons and protein.
  80
   81 Destination of data in the subject Bio::Seq object $seq is as follows:
  82
  83     *$seq->display_id:  name of the top-level feature;
  84
  85     *$seq->accession_number: if defined, uniquename and
  86                  feature_dbxref of the top-level
   87                  feature; if not defined,
  88                  $seq->display_id is used as the
  89                  uniquename of the top-level feature;
  90
  91     *$seq->molecule: transformed to SO type, used as the feature
   92             type of the top-level feature; if the -seq_so_type
   93             argument is supplied, the supplied SO type is used
  94             as the feature type of the top-level feature;
  95
  96     *$seq->species: organism of the top-level feature;
  97
  98     *$seq->seq: residues of the top-level feature;
  99
 100     *$seq->is_circular, $seq->division: feature_cvterm;
 101
 102     *$seq->keywords, $seq->desc, comments: featureprop;
 103
 104     *references: pub and feature_pub;
 105         medline/pubmed ids: pub_dbxref;
 106         comments: pubprop;
 107
 108     *feature "source" span: featureloc for top-level feature;
 109
 110     *feature "source" db_xref: feature_dbxref for top-level feature;
 111
 112     *feature "source" other tags: featureprop for top-level feature;
 113
 114     *subfeature 'symbol' or 'label' tag: feature uniquename, if
 115                      none of these is present, the chadoxml object
 116                      generates feature uniquenames as:
 117                      <gene>-<feature_type>-<span>
 118                      (e.g. foo-mRNA--1000..3000);
 119
 120     *gene model: feature_relationship built based on the
 121                      containment hierarchy;
 122
 123     *feature span: featureloc;
 124
 125     *feature accession numbers: feature_dbxref;
 126
 127     *feature tags (except db_xref, symbol and gene): featureprop;
 128
 129 Things to watch out for:
 130
 131     *chado schema change: this version works with the chado
 132                                version tagged chado_1_01 in GMOD CVS.
 133
 134     *feature uniquenames: especially important if using XORT
 135                               loader to do incremental load into
 136                               chado. may need pre-processing of the
 137                               source data to put the correct
 138                               uniquenames in place.
 139
 140     *pub uniquenames: chadoxml->write_seq() has the FlyBase policy
 141                           on pub uniquenames hard-coded, it assigns
 142                           pub uniquenames in the following way: for
 143                           journals and books, use ISBN number; for
 144                           published papers, use MEDLINE ID; for
 145                           everything else, use FlyBase unique
 146                           identifier FBrf#. need to modify the code to
 147                           implement your policy. look for the comments
 148                           in the code.
 149
 150     *for pubs possibly existing in chado but with no knowledge of
  151          its uniquename: put "op" as "match", then need to run the
 152                         output chadoxml through a special filter that
 153                         talks to chado database and tries to find the
 154                         pub by matching with the provided information
 155                         instead of looking up by the unique key. after
 156                         matching, the filter also resets the "match"
 157                         operation to either "force" (default), or
 158                         "lookup", or "insert", or "update". the
 159                         "match" operation is for a special FlyBase use
 160                         case. please modify to work according to your
 161                         rules.
 162
 163     *chado initialization for loading:
 164
 165         cv & cvterm: in the output chadoxml, all cv's and
 166                              cvterm's are lookup only. Therefore,
 167                              before using XORT loader to load the
 168                              output into chado, chado must be
 169                              pre-loaded with all necessary CVs and
 170                              CVterms, including "SO" , "property
 171                              type", "relationship type", "pub type",
 172                              "pubprop type", "pub relationship type",
 173                              "sequence topology", "GenBank feature
 174                              qualifier", "GenBank division". A pub by
 175                              the uniquename 'nullpub' of type 'null
 176                              pub' needs to be inserted.
 177
 178 =head1 FEEDBACK
 179
 180 =head2 Mailing Lists
 181
 182 User feedback is an integral part of the evolution of this and other
 183 Bioperl modules. Send your comments and suggestions preferably to one
 184 of the Bioperl mailing lists.  Your participation is much appreciated.
 185
 186   bioperl-l@bioperl.org                  - General discussion
 187   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 188
 189 =head2 Support
 190
 191 Please direct usage questions or support issues to the mailing list:
 192
 193 I<bioperl-l@bioperl.org>
 194
 195 rather than to the module maintainer directly. Many experienced and
  196 responsive experts will be able to look at the problem and quickly
 197 address it. Please include a thorough description of the problem
 198 with code and data examples if at all possible.
 199
 200 =head2 Reporting Bugs
 201
 202 Report bugs to the Bioperl bug tracking system to help us keep track
  203 of the bugs and their resolution.
 204 Bug reports can be submitted via the web:
 205
 206   http://bugzilla.bioperl.org
 207
 208 =head1 AUTHOR - Peili Zhang
 209
 210 Email peili@morgan.harvard.edu
 211
 212 =head1 APPENDIX
 213
 214 The rest of the documentation details each of the object
 215 methods. Internal methods are usually preceded with a _
 216
 217 =cut
 218
 219 # Let the code begin...
 220
 221 package Bio::SeqIO::chadoxml;
 222 use strict;
 223 use English;
 224
 225 use Carp;
 226 use Data::Dumper;
 227 use XML::Writer;
 228 use IO::File;
 229 use IO::Handle;
 230 use Bio::Seq;
 231 use Bio::Seq::RichSeq;
 232 use Bio::SeqIO::FTHelper;
 233 use Bio::Species;
 234 use Bio::Seq::SeqFactory;
 235 use Bio::Factory::SequenceStreamI;
 236 use Bio::SeqFeature::Generic;
 237 use Bio::Annotation::Collection;
 238 use Bio::Annotation::Comment;
 239 use Bio::Annotation::Reference;
 240 use Bio::Annotation::DBLink;
 241 use Bio::SeqFeature::Tools::Unflattener;
 242
 243 #global variables
 244 undef(my %finaldatahash); #data from Bio::Seq object stored in a hash
 245 undef(my %datahash); #data from Bio::Seq object stored in a hash
 246
 247 my $chadotables = 'feature featureprop feature_relationship featureloc feature_cvterm cvterm cv feature_pub pub pub_dbxref pub_author author pub_relationship pubprop feature_dbxref dbxref db synonym feature_synonym';
 248
 249 my %fkey = (
 250     "cvterm.cv_id"          => "cv",
 251         "cvterm.dbxref_id"              => "dbxref",
 252     "dbxref.db_id"          => "db",
 253     "feature.type_id"       => "cvterm",
 254     "feature.organism_id"       => "organism",
 255     "feature.dbxref_id"         => "dbxref",
 256     "featureprop.type_id"       => "cvterm",
 257     "feature_pub.pub_id"        => "pub",
 258     "feature_cvterm.cvterm_id"  => "cvterm",
 259     "feature_cvterm.pub_id"     => "pub",
 260         "feature_cvterm.feature_id"     => "feature",
 261     "feature_dbxref.dbxref_id"  => "dbxref",
 262     "feature_relationship.object_id"    => "feature",
 263     "feature_relationship.subject_id"   => "feature",
 264     "feature_relationship.type_id"  => "cvterm",
 265     "featureloc.srcfeature_id"  => "feature",
 266     "pub.type_id"           => "cvterm",
 267     "pub_dbxref.dbxref_id"      => "dbxref",
 268     "pub_author.author_id"      => "author",
 269     "pub_relationship.obj_pub_id"   => "pub",
 270     "pub_relationship.subj_pub_id"  => "pub",
 271     "pub_relationship.type_id"  => "cvterm",
 272     "pubprop.type_id"       => "cvterm",
 273         "feature_synonym.feature_id"    => "feature",
 274         "feature_synonym.synonym_id"    => "synonym",
 275         "feature_synonym.pub_id"        => "pub",
 276         "synonym.type_id"               => "cvterm",
 277 );
 278
 279 my %cv_name = (
 280         'relationship'                  => 'relationship',
 281         'sequence'                      => 'sequence',
 282         'feature_property'              => 'feature_property',
 283 );
 284
 285 my %feattype_args2so = (
 286     "aberr"             => "aberration_junction",
 287 #   "conflict"          => "sequence_difference",
 288 #   "polyA_signal"          => "polyA_signal_sequence",
 289     "variation"         => "sequence_variant",
 290     "mutation1"         => "point_mutation",        #for single-base mutation
 291     "mutation2"         => "sequence_variant",      #for multi-base mutation
 292     "rescue"            => "rescue_fragment",
 293 #   "rfrag"             => "restriction_fragment",
 294     "protein_bind"          => "protein_binding_site",
 295     "misc_feature"          => "region",
 296 #   "prim_transcript"       => "primary_transcript",
 297     "CDS"               => "polypeptide",
 298     "reg_element"           => "regulatory_region",
 299     "seq_variant"           => "sequence_variant",
 300     "mat_peptide"           => "mature_peptide",
 301     "sig_peptide"           => "signal_peptide",
 302 );
 303
 304 undef(my %organism);
 305
 306 use base qw(Bio::SeqIO);
 307
 308 sub _initialize {
 309
 310     my($self,%args) = @_;
 311
 312     $self->SUPER::_initialize(%args);
 313     unless( defined $self->sequence_factory ) {
 314         $self->sequence_factory(Bio::Seq::SeqFactory->new
 315                                 (-verbose => $self->verbose(),
 316                                  -type => 'Bio::Seq::RichSeq'));
 317     }
 318     #optional arguments that can be passed in
 319     $self->suppress_residues($args{'-suppress_residues'})
 320         if defined $args{'-suppress_residues'};
 321
 322     $self->allow_residues($args{'-allow_residues'})
 323         if defined $args{'-allow_residues'};
 324     return;
 325 }
 326
 327 =head2 write_seq
 328
 329  Title   : write_seq
 330  Usage   : $stream->write_seq(-seq=>$seq, -seq_so_type=>$seqSOtype,
 331                   -src_feature=>$srcfeature,
 332                   -src_feat_type=>$srcfeattype,
 333                   -nounflatten=>0 or 1,
 334                   -is_analysis=>'true' or 'false',
 335                   -data_source=>$datasource)
 336  Function: writes the $seq object (must be seq) into chadoxml.
 337  Returns : 1 for success and 0 for error
 338  Args     : A Bio::Seq object $seq, optional $seqSOtype, $srcfeature,
 339             $srcfeattype, $nounflatten, $is_analysis and $data_source.
 340
 341 When $srcfeature (a string, the uniquename of the source feature) is given, the
 342 location and strand information of the top-level feature against the source
 343 feature will be derived from the sequence feature called 'source' of the $seq
  344 object, a featureloc record is generated for the top-level feature on
  345 $srcfeature. When $srcfeature is given, $srcfeattype must also be present. All
 346 feature coordinates in $seq should be against $srcfeature. $seqSOtype is the
 347 optional SO term to use as the type of the top-level feature. For example, a
 348 GenBank data file for a Drosophila melanogaster genome scaffold has the molecule
 349 type of "DNA", when converting to chadoxml, a $seqSOtype argument of
 350 "golden_path_region" can be supplied to save the scaffold as a feature of type
  351 "golden_path_region" in chadoxml, instead of "DNA". A feature with primary tag
  352 of 'source' must be present in the sequence feature list of $seq, to describe the
 353 whole sequence record.
 354
 355 In the current implementation:
 356
 357 =over 3
 358
 359 =item *
 360
 361 non-mRNA records
 362
 363 A top-level feature of type $seq-E<gt>alphabet is generated for the whole GenBank
 364 record, features listed are unflattened for DNA records to build gene model
 365 feature graph, and for the other types of records all features in $seq are
 366 treated as subfeatures of the top-level feature.
 367
 368 =item *
 369
 370 mRNA records
 371
 372 If a 'gene' feature is present, it B<must> have a /symbol or /label tag to
 373 contain the uniquename of the gene. a top-level feature of type 'gene' is
 374 generated. the mRNA is written as a subfeature of the top-level gene feature,
 375 and the other sequence features listed in $seq are treated as subfeatures of the
 376 mRNA feature.
 377
 378 =back
 379
 380 =cut
 381
 382 sub write_seq {
 383     my $usage = <<EOUSAGE;
 384 Bio::SeqIO::chadoxml->write_seq()
 385 Usage   : \$stream->write_seq(-seq=>\$seq,
 386                   -seq_so_type=>\$SOtype,
 387                   -src_feature=>\$srcfeature,
 388                   -src_feat_type=>\$srcfeattype,
 389                   -nounflatten=>0 or 1,
 390                               -is_analysis=>'true' or 'false',
 391                               -data_source=>\$datasource)
 392 Args    : \$seq     : a Bio::Seq object
 393       \$SOtype  : the SO term to use as the feature type of
 394                       the \$seq record, optional
 395       \$srcfeature  : unique name of the source feature, a string
 396               containing at least one alphabetical letter
 397               (a-z, A-Z), optional
 398       \$srcfeattype : feature type of \$srcfeature. one of SO terms.
 399               optional
 400       when \$srcfeature is given, \$srcfeattype becomes mandatory,
 401       \$datasource  : source of the sequence annotation data,
 402               e.g. 'GenBank' or 'GFF'.
 403 EOUSAGE
 404
 405     my ($self,@args) = @_;
 406
 407     my ($seq, $seq_so_type, $srcfeature, $srcfeattype, $nounflatten, $isanalysis, $datasource, $genus, $species) =
 408        $self->_rearrange([qw(SEQ
 409                  SEQ_SO_TYPE
 410                  SRC_FEATURE
 411                  SRC_FEAT_TYPE
 412                  NOUNFLATTEN
 413                  IS_ANALYSIS
 414                  DATA_SOURCE
 415                                  GENUS
 416                                  SPECIES
 417                  )],
 418                   @args);
 419     #print "$seq_so_type, $srcfeature, $srcfeattype\n";
 420
 421     if( !defined $seq ) {
 422         $self->throw("Attempting to write with no seq!");
 423     }
 424
 425     if( ! ref $seq || ! $seq->isa('Bio::Seq::RichSeqI') ) {
 426        ## FIXME $self->warn(" $seq is not a RichSeqI compliant module. Attempting to dump, but may fail!");
 427     }
 428
 429         # try to get the srcfeature from the seqFeature object
 430         # for this to work, the user has to pass in the srcfeature type
 431         if (!$srcfeature) {
 432             if ($seq->can('seq_id')) {
 433                 $srcfeature=$seq->seq_id if ($seq->seq_id ne $seq->display_name);
 434             }
 435         }
 436
 437     #$srcfeature, when provided, should contain at least one alphabetical letter
 438     if (defined $srcfeature)
 439     {
 440         if ($srcfeature =~ /[a-zA-Z]/)
 441         {
 442         chomp($srcfeature);
 443         } else {
 444         $self->throw( $usage );
 445         }
 446
 447         #check for mandatory $srcfeattype
 448         if (! defined $srcfeattype)
 449         {
 450         $self->throw( $usage );
 451         #$srcfeattype must be a string of non-whitespace characters
 452         } else {
 453         if ($srcfeattype =~ /\S+/) {
 454             chomp($srcfeattype);
 455         } else {
 456             $self->throw( $usage );
 457         }
 458         }
 459     }
 460
 461     # variables local to write_seq()
 462         my $div = undef;
 463     my $hkey = undef;
 464     undef(my @top_featureprops);
 465         undef(my @featuresyns);
 466         undef(my @top_featurecvterms);
 467     my $name = $seq->display_id if $seq->can('display_id');
 468         $name = $seq->display_name  if $seq->can('display_name');
 469     undef(my @feature_cvterms);
 470     undef(my %sthash);
 471     undef(my %dvhash);
 472     undef(my %h1);
 473     undef(my %h2);
 474     my $temp = undef;
 475     my $ann = undef;
 476     undef(my @references);
 477     undef(my @feature_pubs);
 478     my $ref = undef;
 479     my $location = undef;
 480     my $fbrf = undef;
 481     my $journal = undef;
 482     my $issue = undef;
 483     my $volume = undef;
 484     my $volumeissue = undef;
 485     my $pages = undef;
 486     my $year = undef;
 487     my $pubtype = undef;
 488 #   my $miniref= undef;
 489     my $uniquename = undef;
 490     my $refhash = undef;
 491     my $feat = undef;
 492     my $tag = undef;
 493     my $tag_cv = undef;
 494     my $ftype = undef;
 495     my $subfeatcnt = undef;
 496     undef(my @top_featrels);
 497     undef (my %srcfhash);
 498
 499     local($^W) = 0; # supressing warnings about uninitialized fields.
 500
 501         if (!$name && $seq->can('attributes') ) {
 502             ($name) = $seq->attributes('Alias');
 503         }
 504
 505     if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') {
 506         $uniquename = $seq->accession_number;
 507     } elsif ($seq->can('accession') && defined $seq->accession && $seq->accession ne 'unknown') {
 508         $uniquename = $seq->accession;
 509     } elsif ($seq->can('attributes')) {
 510                 ($uniquename) = $seq->attributes('load_id');
 511         } else {
 512         $uniquename = $name;
 513     }
 514         my $len = $seq->length();
 515     if ($len == 0) {
 516         $len = undef;
 517     }
 518
 519     undef(my $gb_type);
 520     if (!$seq->can('molecule') || ! defined ($gb_type = $seq->molecule()) ) {
 521         $gb_type = $seq->can('alphabet') ? $seq->alphabet : 'DNA';
 522     }
 523     $gb_type = 'DNA' if $ftype eq 'dna';
 524     $gb_type = 'RNA' if $ftype eq 'rna';
 525
 526     if(length $seq_so_type > 0) {
 527         if (defined $seq_so_type) {
 528             $ftype = $seq_so_type;
 529         }
 530             elsif ($seq->type) {
 531                     $ftype = ($seq->type =~ /(.*):/)
 532                              ? $1
 533                              : $seq->type;
 534             }
 535         else {
 536             $ftype = $gb_type;
 537         }
 538     }
 539     else {
 540         $ftype = $gb_type;
 541     }
 542
 543     my %ftype_hash = $self->return_ftype_hash($ftype);
 544
 545         if ($species) {
 546             %organism = ("genus"=>$genus, "species" => $species);
 547         }
 548         else {
 549         my $spec = $seq->species();
 550         if (!defined $spec) {
 551         $self->throw("$seq does not know what organism it is from, which is required by chado. cannot proceed!\n");
 552         } else {
 553         %organism = ("genus"=>$spec->genus(), "species" => $spec->species());
 554         }
 555         }
 556
 557         my $residues;
 558         if (!$self->suppress_residues ||
 559             ($self->suppress_residues && $self->allow_residues eq $ftype)) {
 560             $residues = $seq->seq->isa('Bio::PrimarySeq')
 561                         ? $seq->seq->seq
 562                         : $seq->seq;
 563         }
 564         else {
 565             $residues = '';
 566         }
 567
 568     #set is_analysis flag for gene model features
 569     undef(my $isanal);
 570     if ($ftype eq 'gene' || $ftype eq 'mRNA' || $ftype eq 'exon' || $ftype eq 'protein' || $ftype eq 'polypeptide') {
 571         $isanal = $isanalysis;
 572         $isanal = 'false' if !defined $isanal;
 573     }
 574
 575     %datahash = (
 576         "name"      => $name,
 577         "uniquename"    => $uniquename,
 578         "seqlen"    => $len,
 579         "residues"  => $residues,
 580         "type_id"   => \%ftype_hash,
 581         "organism_id"   => \%organism,
 582         "is_analysis"   => $isanal || 'false',
 583         );
 584
 585         if (defined $srcfeature) {
 586                 %srcfhash = $self->_srcf_hash($srcfeature,
 587                                               $srcfeattype,
 588                                               \%organism);
 589
 590                 my ($phase,$strand);
 591                 if ($seq->can('phase')) {
 592                     $phase = $seq->phase;
 593                 }
 594
 595                 if ($seq->can('strand')) {
 596                     $strand = $seq->strand;
 597                 }
 598                 my %fl = (
 599                                 "srcfeature_id" => \%srcfhash,
 600                                 "fmin"          => $seq->start - 1,
 601                                 "fmax"          => $seq->end,
 602                                 "strand"        => $strand,
 603                                 "phase"         => $phase,
 604                                 );
 605
 606                 $datahash{'featureloc'} = \%fl;
 607
 608         }
 609
 610
 611     #if $srcfeature is not given, use the Bio::Seq object itself as the srcfeature for featureloc's
 612     if (!defined $srcfeature) {
 613         $srcfeature = $uniquename;
 614         $srcfeattype = $ftype;
 615     }
 616
 617     #default data source is 'GenBank'
 618     if (!defined $datasource) {
 619         $datasource = 'GenBank';
 620     }
 621
 622     if ($datasource =~ /GenBank/i) {
 623         #sequence topology as feature_cvterm
 624         if ($seq->can('is_circular') && $seq->is_circular) {
 625             %sthash = (
 626                 "cvterm_id" => {'name' => 'circular',
 627                             'cv_id' => {
 628                             'name' => 'sequence topology',
 629                             },
 630                         },
 631                    "pub_id" => {'uniquename' => 'nullpub',
 632                             'type_id' => {
 633                             'name' => 'null pub',
 634                             'cv_id' => {
 635                                 'name'=> 'pub type',
 636                             },
 637                             },
 638                         },
 639                 );
 640         } else {
 641             %sthash = (
 642                 "cvterm_id" => { 'name' => 'linear',
 643                              'cv_id' => {
 644                              'name' => 'sequence topology',
 645                              }
 646                          },
 647                 "pub_id"    => {'uniquename' => 'nullpub',
 648                             'type_id' => {
 649                             'name' => 'null pub',
 650                             'cv_id' => {
 651                                 'name'=> 'pub type',
 652                             },
 653                             },
 654                         },
 655                    );
 656         }
 657         push(@feature_cvterms, \%sthash);
 658
 659         #division as feature_cvterm
 660             if ($seq->can('division') && defined $seq->division()) {
 661                 $div = $seq->division();
 662             %dvhash = (
 663                 "cvterm_id" => {'name' => $div,
 664                             'cv_id' => {
 665                             'name' => 'GenBank division'}},
 666                 "pub_id"    => {'uniquename' => 'nullpub',
 667                             'type_id' => {
 668                             'name' => 'null pub',
 669                             'cv_id' => {
 670                                 'name'=> 'pub type'},
 671                                 }},
 672                 );
 673             push(@feature_cvterms, \%dvhash);
 674         }
 675
 676         $datahash{'feature_cvterm'} = \@feature_cvterms;
 677     } # closes if GenBank
 678
 679     #featureprop's
 680     #DEFINITION
 681     if ($seq->can('desc') && defined $seq->desc()) {
 682         $temp = $seq->desc();
 683
 684         my %prophash = (
 685             "type_id"   => {'name' => 'description',
 686                         'cv_id' => {
 687                         'name' =>
 688                                                  $cv_name{'feature_property'}
 689                                                        },
 690                                            },
 691             "value"     => $temp,
 692             );
 693
 694         push(@top_featureprops, \%prophash);
 695         }
 696
 697     #KEYWORDS
 698     if ($seq->can('keywords')) {
 699         $temp = $seq->keywords();
 700
 701         if (defined $temp && $temp ne '.' && $temp ne '') {
 702         my %prophash = (
 703                 "type_id"   => {'name' => 'keywords',
 704                             'cv_id' => {
 705                                                   'name' =>
 706                                                    $cv_name{'feature_property'}
 707                                                            }
 708                         },
 709                 "value"     => $temp,
 710                             );
 711
 712         push(@top_featureprops, \%prophash);
 713         }
 714         }
 715
 716     #COMMENT
 717     if ($seq->can('annotation')) {
 718         $ann = $seq->annotation();
 719         foreach my $comment ($ann->get_Annotations('comment')) {
 720             $temp = $comment->as_text();
 721             #print "fcomment: $temp\n";
 722             my %prophash = (
 723                 "type_id"   => {'name' => 'comment',
 724                             'cv_id' => {
 725                                                   'name' =>
 726                                                    $cv_name{'feature_property'}
 727                                                            }
 728                                                },
 729                 "value"     => $temp,
 730                 );
 731
 732             push(@top_featureprops, \%prophash);
 733         }
 734     }
 735
 736         my @top_dbxrefs = ();
 737         #feature object from Bio::DB::SeqFeature::Store
 738         if ($seq->can('attributes')) {
 739                 my %attributes = $seq->attributes;
 740                 for my $key (keys %attributes) {
 741                     next if ($key eq 'parent_id');
 742                     next if ($key eq 'load_id');
 743
 744                     if ($key eq 'Alias') {
 745                         @featuresyns = $self->handle_Alias_tag($seq,@featuresyns);
 746                     }
 747
 748                     ###FIXME deal with Dbxref, Ontology_term,source,
 749                     elsif ($key eq 'Ontology_term') {
 750                         @top_featurecvterms = $self->handle_Ontology_tag($seq,@top_featurecvterms);
 751                     }
 752
 753                     elsif ($key eq 'dbxref' or $key eq 'Dbxref') {
 754                         @top_dbxrefs = $self->handle_dbxref($seq, $key, @top_dbxrefs);
 755                     }
 756
 757                     elsif ($key =~ /^[a-z]/) {
 758                         @top_featureprops
 759                              = $self->handle_unreserved_tags($seq,$key,@top_featureprops);
 760                     }
 761                 }
 762         }
 763         $datahash{'feature_synonym'} = \@featuresyns;
 764
 765         if ($seq->can('source')) {
 766                 @top_dbxrefs = $self->handle_source($seq,@top_dbxrefs);
 767         }
 768
 769     #accession and version as feature_dbxref
 770     if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') {
 771         my $db = $self->_guess_acc_db($seq, $seq->accession_number);
 772         my %acchash = (
 773                "db_id"  => {'name' => $db},
 774                "accession"  => $seq->accession_number,
 775                "version"    => $seq->seq_version,
 776                );
 777         my %fdbx = ('dbxref_id' => \%acchash);
 778         push(@top_dbxrefs, \%fdbx);
 779     }
 780
 781     if( $seq->isa('Bio::Seq::RichSeqI') && defined $seq->get_secondary_accessions() ) {
 782         my @secacc = $seq->get_secondary_accessions();
 783         my $acc;
 784         foreach $acc (@secacc) {
 785             my %acchash = (
 786                 "db_id"         => {'name' => 'GB'},
 787                 "accession" => $acc,
 788                 );
 789             my %fdbx = ('dbxref_id' => \%acchash);
 790             push(@top_dbxrefs, \%fdbx);
 791         }
 792     }
 793
 794     #GI number
 795     if( $seq->isa('Bio::Seq::RichSeqI') && defined ($seq->pid)) {
 796         my $id = $seq->pid;
 797         #print "reftype: ", ref($id), "\n";
 798
 799         #if (ref($id) eq 'HASH') {
 800         my %acchash = (
 801             "db_id"     => {'name' => 'GI'},
 802             "accession" => $id,
 803             );
 804         my %fdbx = ('dbxref_id' => \%acchash);
 805         push (@top_dbxrefs, \%fdbx);
 806     }
 807
 808     #REFERENCES as feature_pub
 809     if (defined $ann) {
 810         #get the references
 811         @references = $ann->get_Annotations('reference');
 812         foreach $ref (@references) {
 813         undef(my %pubhash);
 814         $refhash = $ref->hash_tree();
 815         $location = $ref->location || $refhash->{'location'};
 816         #print "location: $location\n";
 817
 818         #get FBrf#, special for FlyBase SEAN loading
 819         if (index($location, ' ==') >= 0) {
 820             $location =~ /\s==/;
 821                 #print "match: $MATCH\n";
 822                 #print "prematch: $PREMATCH\n";
 823                 #print "postmatch: $POSTMATCH\n";
 824             $fbrf = $PREMATCH;
 825             $location = $POSTMATCH;
 826             $location =~ s/^\s//;
 827         }
 828
 829         #print "location: $location\n";
 830         #unpublished reference
 831         if ($location =~ /Unpublished/) {
 832             $pubtype = 'unpublished';
 833             %pubhash = (
 834                 "title"     => $ref->title || $refhash->{'title'},
 835                 #"miniref"  => substr($location, 0, 255),
 836                 #"uniquename"   => $fbrf,
 837                 "type_id"   => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}}
 838                 );
 839         }
 840         #submitted
 841         elsif ($location =~ /Submitted/) {
 842             $pubtype = 'submitted';
 843
 844             %pubhash = (
 845                 "title"     => $ref->title || $refhash->{'title'},
 846                 #"miniref"  => substr($location, 0, 255),
 847                 #"uniquename"   => $fbrf,
 848                 "type_id"   => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}}
 849                 );
 850
 851             undef(my $pyear);
 852             $pyear = $self->_getSubmitYear($location);
 853             if (defined $pyear) {
 854             $pubhash{'pyear'} = $pyear;
 855             }
 856         }
 857
 858         #published journal paper
 859         elsif ($location =~ /\D+\s\d+\s\((\d+|\d+-\d+)\),\s(\d+-\d+|\d+--\d+)\s\(\d\d\d\d\)$/) {
 860             $pubtype = 'paper';
 861
 862                 #parse location to get journal, volume, issue, pages & year
 863             $location =~ /\(\d\d\d\d\)$/;
 864
 865             $year = $MATCH;
 866             my $stuff = $PREMATCH;
 867             $year =~ s/\(//; #remove the leading parenthesis
 868             $year =~ s/\)//; #remove the trailing parenthesis
 869
 870             $stuff =~ /,\s(\d+-\d+|\d+--\d+)\s$/;
 871
 872             $pages = $MATCH;
 873             $stuff = $PREMATCH;
 874             $pages =~ s/^, //; #remove the leading comma and space
 875             $pages =~ s/ $//; #remove the last space
 876
 877             $stuff =~ /\s\d+\s\((\d+|\d+-\d+)\)$/;
 878
 879             $volumeissue = $MATCH;
 880             $journal = $PREMATCH;
 881             $volumeissue =~ s/^ //; #remove the leading space
 882             $volumeissue =~ /\((\d+|\d+-\d+)\)$/;
 883             $issue = $MATCH;
 884             $volume = $PREMATCH;
 885             $issue =~ s/^\(//; #remove the leading parentheses
 886             $issue =~ s/\)$//; #remove the last parentheses
 887             $volume =~ s/^\s//; #remove the leading space
 888             $volume =~ s/\s$//; #remove the last space
 889
 890             %pubhash = (
 891                 "title"     => $ref->title || $refhash->{'title'},
 892                 "volume"    => $volume,
 893                 "issue"     => $issue,
 894                 "pyear"     => $year,
 895                 "pages"     => $pages,
 896                 #"miniref"  => substr($location, 0, 255),
 897                 #"miniref"  => ' ',
 898                 #"uniquename"   => $fbrf,
 899                 "type_id"   => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}},
 900                 "pub_relationship" => {
 901                     'obj_pub_id' => {
 902                     'uniquename' => $journal,
 903                     'title' => $journal,
 904                     #'miniref' => substr($journal, 0, 255),
 905                     'type_id' =>{'name' => 'journal',
 906                              'cv_id' =>
 907                              {'name' => 'pub type'
 908                               },
 909                          },
 910                              #'pubprop' =>{'value'=> $journal,
 911                              #       'type_id'=>{'name' => 'abbreviation', 'cv_id' => {'name' => 'pubprop type'}},
 912                              #      },
 913                          },
 914                        'type_id' => {
 915                            'name' => 'published_in',
 916                            'cv_id' => {
 917                            'name' => 'pub relationship type'},
 918                        },
 919                 },
 920                 );
 921         }
 922
 923         #other references
 924         else {
 925             $pubtype = 'other';
 926             %pubhash = (
 927                 "title"     => $ref->title || $refhash->{'title'},
 928                 #"miniref"  => $fbrf,
 929                 "type_id"   => {
 930                     'name' => $pubtype,
 931                     'cv_id' => {'name' =>'pub type'}
 932                 }
 933                 );
 934         }
 935
 936         #pub_author
 937         my $autref = $self->_getRefAuthors($ref);
 938         if (defined $autref) {
 939             $pubhash{'pub_author'} = $autref;
 940         }
 941         # if no author and is type 'submitted' and has submitter address, use the first 100 characters of submitter address as the author lastname.
 942         else {
 943             if ($pubtype eq 'submitted') {
 944             my $autref = $self->_getSubmitAddr($ref);
 945             if (defined $autref) {
 946                 $pubhash{'pub_author'} = $autref;
 947             }
 948             }
 949         }
 950
 951         #$ref->comment as pubprop
 952         #print "ref comment: ", $ref->comment, "\n";
 953         #print "ref comment: ", $refhash->{'comment'}, "\n";
 954         if (defined $ref->comment || defined $refhash->{'comment'}) {
 955             my $comnt = $ref->comment || $refhash->{'comment'};
 956                 #print "remark: ", $comnt, "\n";
 957             $pubhash{'pubprop'} = {
 958             "type_id"       => {'name' => 'comment', 'cv_id' => {'name' => 'pubprop type'}},
 959             "value"     => $comnt,
 960             };
 961         }
 962
 963         #pub_dbxref
 964         undef(my @pub_dbxrefs);
 965         if (defined $fbrf) {
 966             push(@pub_dbxrefs, {dbxref_id => {accession => $fbrf, db_id => {'name' => 'FlyBase'}}});
 967         }
 968         if (defined ($temp = $ref->medline)) {
 969             push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'MEDLINE'}}});
 970                 #use medline # as the pub's uniquename
 971             $pubhash{'uniquename'} = $temp;
 972         }
 973         if (defined ($temp = $ref->pubmed)) {
 974             push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'PUBMED'}}});
 975         }
 976         $pubhash{'pub_dbxref'} = \@pub_dbxrefs;
 977
 978         #if the pub uniquename is not defined or blank, put its FBrf# as its uniquename
 979         #this is unique to FlyBase
 980         #USERS OF THIS MODULE: PLEASE MODIFY HERE TO IMPLEMENT YOUR POLICY
 981         # ON PUB UNIQUENAME!!!
 982         if (!defined $pubhash{'uniquename'} || $pubhash{'uniquename'} eq '') {
 983             if (defined $fbrf) {
 984             $pubhash{'uniquename'} = $fbrf;
 985             }
 986                 #else {
 987                 #   $pubhash{'uniquename'} = $self->_CreatePubUname($ref);
 988                 #}
 989         }
 990
 991         #add to collection of references
 992         #if the pub covers the entire sequence of the top-level feature, add it to feature_pubs
 993         if (($ref->start == 1 && $ref->end == $len) || (!defined $ref->start && !defined $ref->end)) {
 994             push(@feature_pubs, {"pub_id" => \%pubhash});
 995         }
 996         #the pub is about a sub-sequence of the top-level feature
 997         #create a feature for the sub-sequence and add pub as its feature_pub
 998         #featureloc of this sub-sequence is against the top-level feature, in interbase coordinates.
 999         else {
1000             my %parf = (
1001                 'uniquename'    => $uniquename . ':' . $ref->start . "\.\." . $ref->end,
1002                 'organism_id'   =>\%organism,
1003                 'type_id'   =>{'name' =>'region', 'cv_id' => {'name' => $cv_name{'sequence'} }},
1004                 );
1005             my %parfsrcf = (
1006                     'uniquename'    => $uniquename,
1007                     'organism_id'   =>\%organism,
1008                     );
1009             my %parfloc = (
1010                    'srcfeature_id'  => \%parfsrcf,
1011                    'fmin'       => $ref->start - 1,
1012                    'fmax'       => $ref->end,
1013                    );
1014             $parf{'featureloc'} = \%parfloc;
1015             $parf{'feature_pub'} = {'pub_id' => \%pubhash};
1016             my %ffr = (
1017                    'subject_id' => \%parf,
1018                    'type_id'        => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'}}},
1019                    );
1020             push(@top_featrels, \%ffr);
1021         }
1022         }
1023         $datahash{'feature_pub'} = \@feature_pubs;
1024     }
1025
1026     ##construct srcfeature hash for use in featureloc
1027     if (defined $srcfeature) {
1028                 %srcfhash = $self->_srcf_hash($srcfeature,
1029                                               $srcfeattype,
1030                                               \%organism);
1031     #   my %fr = (
1032     #       "object_id" => \%srcfhash,
1033     #       "type_id"   => { 'name' => 'partof', 'cv_id' => { 'name' => 'relationship type'}},
1034     #       );
1035
1036     #   push (@top_featrels, \%fr);
1037     }
1038
1039     #unflatten the seq features in $seq if $seq is a gene or a DNA sequence
1040     if (($gb_type eq 'gene' || $gb_type eq 'DNA') &&
1041         !$nounflatten) {
1042         my $u = Bio::SeqFeature::Tools::Unflattener->new;
1043         $u->unflatten_seq(-seq=>$seq, -use_magic=>1);
1044     }
1045
1046     my @top_sfs = $seq->get_SeqFeatures;
1047     #print $#top_sfs, "\n";
1048
1049     #SUBFEATURES
1050
1051     if ($datasource =~ /GenBank/i) {
1052         $tag_cv = 'GenBank feature qualifier';
1053     } elsif ($datasource =~ /GFF/i) {
1054         $tag_cv = 'feature_property';
1055     } else {
1056         $tag_cv = $cv_name{'feature_property'};
1057     }
1058
1059     my $si = 0;
1060     foreach $feat (@top_sfs) {
1061         #$feat = $top_sfs[$si];
1062         #print "si: $si\n";
1063         my $prim_tag = $feat->primary_tag;
1064         #print $prim_tag, "\n";
1065
1066         # get all qualifiers of the 'source' feature, load these as top_featureprops of the top level feature
1067         if ($prim_tag eq 'source') {
1068             foreach $tag ($feat->all_tags()) {
1069                 #db_xref
1070                 if ($tag eq 'db_xref'
1071                                  or $tag eq 'Dbxref'
1072                                  or $tag eq 'dbxref')   {
1073                     my @t1 = $feat->each_tag_value($tag);
1074                     foreach $temp (@t1) {
1075                        $temp =~ /([^:]*?):(.*)/;
1076                                            my $db = $1;
1077                                            my $xref = $2;
1078                                            #PRE/POST very inefficent
1079                        #my $db = $PREMATCH;
1080                        #my $xref = $POSTMATCH;
1081                        my %acchash = (
1082                         "db_id"     => {'name' => $db},
1083                         "accession" => $xref,
1084                         );
1085                        my %fdbx = ('dbxref_id' => \%acchash);
1086                        push (@top_dbxrefs, \%fdbx);
1087                     }
1088                                 #Ontology_term
1089                                 } elsif ($tag eq 'Ontology_term') {
1090                                         my @t1 = $feat->each_tag_value($tag);
1091                                         foreach $temp (@t1) {
1092                                             ###FIXME
1093                                         }
1094                 #other tags as featureprop
1095                 } elsif ($tag ne 'gene') {
1096                     my %prophash = undef;
1097                     %prophash = (
1098                                     "type_id"       => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}},
1099                         "value"     => join(' ',$feat->each_tag_value($tag)),
1100                         );
1101                     push(@top_featureprops, \%prophash);
1102                 }
1103             }
1104
1105                         if ($feat->can('source')) {
1106                             my $source = $feat->source();
1107                             @top_dbxrefs = $self->handle_source($feat, @top_dbxrefs);
1108                         }
1109
1110             #featureloc for the top-level feature
1111             my $fmin = undef;
1112             my $fmax = undef;
1113             my $strand = undef;
1114                         my $phase = undef;
1115             my %fl = undef;
1116
1117             $fmin = $feat->start - 1;
1118             $fmax = $feat->end;
1119             $strand = $feat->strand;
1120
1121                         if ($feat->can('phase')) {
1122                             $phase = $feat->phase;
1123                         }
1124
1125             %fl = (
1126                 "srcfeature_id" => \%srcfhash,
1127                 "fmin"      => $fmin,
1128                 "fmax"      => $fmax,
1129                 "strand"    => $strand,
1130                                 "phase"         => $phase,
1131                 );
1132
1133             $datahash{'featureloc'} = \%fl;
1134
1135             #delete 'source' feature from @top_sfs
1136             splice(@top_sfs, $si, 1);
1137         }
1138         $si ++;
1139     #close loop over top_sfs
1140     }
1141
1142     #the top-level features other than 'source'
1143     foreach $feat (@top_sfs) {
1144         #print $feat->primary_tag, "\n";
1145
1146         my $r = $self->_subfeat2featrelhash($name, $ftype, $feat, \%srcfhash, $tag_cv, $isanalysis);
1147
1148         if (!($ftype eq 'mRNA' && $feat->primary_tag eq 'gene')) {
1149             my %fr = %$r;
1150             push(@top_featrels, \%fr);
1151         } else {
1152             %finaldatahash = %$r;
1153         }
1154     }
1155
1156     if (@top_dbxrefs) {
1157         $datahash{'feature_dbxref'} = \@top_dbxrefs;
1158     }
1159
1160     if (@top_featureprops) {
1161         $datahash{'featureprop'} = \@top_featureprops;
1162     }
1163
1164     if (@top_featrels) {
1165         $datahash{'feature_relationship'} = \@top_featrels;
1166     }
1167
1168         if (@top_featurecvterms) {
1169                 $datahash{'feature_cvterm'} = \@top_featurecvterms;
1170         }
1171
1172     if ($ftype eq 'mRNA' && %finaldatahash) {
1173         $finaldatahash{'feature_relationship'} = {
1174                         'subject_id'    => \%datahash,
1175                         'type_id'   => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'} }},
1176                              };
1177     } else {
1178         %finaldatahash = %datahash;
1179     }
1180
1181     my $mainTag = 'feature';
1182     $self->_hash2xml(undef, $mainTag, \%finaldatahash);
1183
1184     return 1;
1185 }
1186
# Serialise one chado table record (a hash) as XML through the writer
# object held in $self->{'writer'}; the writer is created on demand via
# $self->_create_writer().
#
# Arguments (positional):
#   $isMatch - true when an enclosing record was emitted with op="match";
#              every tag written for this record then carries op="match" too
#   $ult     - name of the table element to emit (e.g. 'feature', 'pub')
#   $ref     - hash reference holding the record. Keys that are table names
#              listed in $chadotables are nested records (a hash ref, or an
#              array ref of hash refs); every other key is a column
#   $root    - optional, accepted for backward compatibility; it does not
#              influence the output
#
# Columns are written first, nested tables second. A column registered in
# %fkey under "<table>.<column>" is a foreign key: its value is itself a
# record hash and is emitted recursively using the table name found in %fkey.
#
# Returns whatever the writer's endTag() call returns.
sub _hash2xml {
    my $self    = shift;
    my $isMatch = shift;
    my $ult     = shift;
    my $ref     = shift;
    # "my $x = shift if (...)" has undefined behaviour in Perl, so the
    # optional argument is unpacked with an explicit conditional instead.
    my $root    = @_ ? shift : undef;
    my %mh      = %$ref;

    if (!defined $self->{'writer'}) {
        $root = 1;
        $self->_create_writer();
    }
    my $writer = $self->{'writer'};

    # Exact table-name lookup. Substring searches on $chadotables would
    # misclassify keys that are merely a prefix/suffix of a table name and
    # would lose a key that matches at offset 0.
    my %is_table = map { $_ => 1 } split ' ', $chadotables;

    # Read the pub type without autovivifying $mh{'type_id'}, which would
    # otherwise be emitted later as a bogus empty column.
    my $type      = $mh{'type_id'};
    my $type_name = (ref($type) eq 'HASH') ? $type->{'name'} : undef;

    # opening tag
    # pub record of type 'journal': form the 'ref' attribute for special pub
    # lookup. Requires that the journal name itself is also stored as a
    # pubprop record of type 'abbreviation' for the journal.
    if ($ult eq 'pub' && defined $type_name && $type_name eq 'journal') {
        $writer->startTag($ult, 'ref' => $mh{'title'} . ':journal:abbreviation');
    }
    # special pub match if the pub uniquename is not known; all the sub tags
    # must then carry op="match" as well
    elsif ($ult eq 'pub' && !defined $mh{'uniquename'}) {
        $writer->startTag($ult, 'op' => 'match');
        $isMatch = 1;
    }
    # cvterm or cv: lookup only
    elsif ($ult eq 'cvterm' || $ult eq 'cv') {
        $writer->startTag($ult, 'op' => 'lookup');
    }
    # nested inside a matched record: match too
    elsif ($isMatch) {
        $writer->startTag($ult, 'op' => 'match');
    }
    else {
        $writer->startTag($ult);
    }

    # Keys are visited in sorted order so that the generated XML is
    # deterministic (plain hash order is arbitrary).
    my @keys = sort keys %mh;

    # first pass: all the table columns
    foreach my $key (@keys) {
        next if $is_table{$key};

        my @op = $isMatch ? ('op' => 'match') : ();
        $writer->startTag($key, @op);

        my $fk_table = $fkey{ $ult . '.' . $key };
        if (defined $fk_table) {
            # the column is a foreign key: emit the referenced record
            $self->_hash2xml($isMatch, $fk_table, $mh{$key}, 0);
        } else {
            # an undefined value is written as an empty element
            my $text = $mh{$key};
            $writer->characters(defined $text ? $text : '');
        }
        $writer->endTag($key);
    }

    # second pass: all the nested tables
    foreach my $key (@keys) {
        next unless $is_table{$key};

        my $nested = $mh{$key};
        if (ref($nested) eq 'HASH') {
            $self->_hash2xml($isMatch, $key, $nested, 0);
        } elsif (ref($nested) eq 'ARRAY') {
            foreach my $record (@$nested) {
                $self->_hash2xml($isMatch, $key, $record, 0);
            }
        }
    }

    # closing tag
    return $writer->endTag($ult);
}
1302
# Guess the name of the database an accession belongs to, from its format.
#
# Arguments (positional):
#   $seq - the sequence object; its molecule() method, when available, is
#          consulted for the protein-only formats
#   $acc - the accession string
#
# Returns one of "RefSeq", "GB", "PIR", "PRF", "LocusID", "FlyBase" or
# "unknown". The patterns are tried in that order and the first hit wins.
sub _guess_acc_db {
    my $self = shift;
    my $seq  = shift;
    my $acc  = shift;

    return "unknown" unless defined $acc;

    # RefSeq: NM_/NP_/NT_/NC_ and XM_/XP_/XR_ followed by at least 6 digits
    return "RefSeq" if $acc =~ /^(?:N[MPTC]|X[MPR])_\d{6}/;

    # GenBank: one or two letters followed by 5-6 digits
    # NOTE(review): this also captures identifiers such as CG12345 before
    # the FlyBase test below is reached - confirm that is intended.
    return "GB" if $acc =~ /^[a-zA-Z]{1,2}\d{5,6}/;

    # The caller in write_seq only verifies can('accession_number'), so the
    # object is not guaranteed to implement molecule(); never call it blindly
    # and tolerate an undefined value.
    my $molecule   = (ref($seq) && $seq->can('molecule')) ? $seq->molecule() : undef;
    my $is_protein = defined $molecule && $molecule eq 'protein';

    # PIR: one letter followed by 5 digits (character class was [a-zA-z],
    # which also admitted the punctuation between 'Z' and 'a').
    # NOTE(review): every accession matching this pattern already matched
    # the GB pattern above, so as ordered this branch cannot fire - confirm
    # whether protein records should be tested before GB.
    return "PIR" if $is_protein && $acc =~ /^[a-zA-Z]\d{5}/;

    # PRF: 6-7 digits followed by a letter
    return "PRF" if $is_protein && $acc =~ /^\d{6,7}[a-zA-Z]/;

    # purely numeric identifiers
    return "LocusID" if $acc =~ /\d+/ && $acc !~ /[a-zA-Z]/;

    # FlyBase CG numbers and FBxx identifiers
    return "FlyBase" if $acc =~ /^CG\d+/ || $acc =~ /^FB[a-z][a-z]\d+/;

    return "unknown";
}
1327
# Convert a sequence feature (and, recursively, its sub-features) into the
# nested hash structure that _hash2xml() serialises.
#
# Arguments (positional):
#   $genename   - gene name used when a uniquename has to be generated;
#                 for the sub-features it is replaced by this feature's
#                 /locus_tag or /gene value when present
#   $seqtype    - type of the enclosing record/feature (e.g. 'mRNA')
#   $feat       - the feature object to convert
#   $r          - hash ref describing the source feature, used as
#                 featureloc.srcfeature_id
#   $tag_cv     - cv name assigned to featureprop types
#   $isanalysis - is_analysis value applied to gene model features
#
# Returns a hash reference: normally a feature_relationship hash whose
# 'subject_id' is the feature hash; for the gene feature of an mRNA record
# the bare feature hash is returned instead (and it carries no featureloc).
sub _subfeat2featrelhash {
    my $self       = shift;
    my $genename   = shift;
    my $seqtype    = shift;
    my $feat       = shift;
    my $r          = shift;
    my %srcf       = %$r;     #srcfeature hash for featureloc.srcfeature_id
    my $tag_cv     = shift;
    my $isanalysis = shift;

    my $prim_tag = $feat->primary_tag;

    #subfeature uniquename: /symbol, else /label, else generated as
    #<genename>-<feature-type>-<span>
    my $sfunique;
    if ($feat->has_tag('symbol')) {
        ($sfunique) = $feat->each_tag_value("symbol");
    } elsif ($feat->has_tag('label')) {
        ($sfunique) = $feat->each_tag_value("label");
    } else {
        $sfunique = $self->_genFeatUniqueName($genename, $feat);
    }

    #subfeature name
    my $sfname;
    if ($feat->has_tag('Name')) {
        ($sfname) = $feat->each_tag_value("Name");
    }

    #feature type translation
    my $sftype = defined $feattype_args2so{$prim_tag}
               ? $feattype_args2so{$prim_tag}
               : $prim_tag;

    #mutations are typed by whether they span a single position or a range
    if ($prim_tag eq 'mutation') {
        $sftype = ($feat->start == $feat->end)
                ? $feattype_args2so{'mutation1'}
                : $feattype_args2so{'mutation2'};
    }

    #set is_analysis flag for gene model features only
    my $isanal;
    if ($sftype eq 'gene' || $sftype eq 'mRNA' || $sftype eq 'exon' || $sftype eq 'protein' || $sftype eq 'polypeptide') {
        $isanal = $isanalysis;
    }

    my %sfhash = (
        "name"        => $sfname,
        "uniquename"  => $sfunique,
        "organism_id" => \%organism,
        "type_id"     => { 'name' => $sftype, 'cv_id' => { 'name' => $cv_name{'sequence'} }},
        "is_analysis" => $isanal || 'false',
        );

    #featureloc for subfeatures, in interbase coordinates
    my $sfmin    = $feat->start - 1;
    my $sfmax    = $feat->end;
    my $sfstrand = $feat->strand();
    my $sfphase;
    if ($feat->can('phase')) {
        $sfphase = $feat->phase;
    }

    #the gene feature in an mRNA record cannot use its coordinates: omit featureloc
    my $is_gene_of_mrna = ($seqtype eq 'mRNA' && $sftype eq 'gene');

    if (!$is_gene_of_mrna) {
        my $is_sfmin_partial;
        my $is_sfmax_partial;
        if ($feat->location->isa('Bio::Location::FuzzyLocationI')) {
            if ($feat->location->start_pos_type() ne 'EXACT') {
                $is_sfmin_partial = 'true';
            }
            if ($feat->location->end_pos_type() ne 'EXACT') {
                $is_sfmax_partial = 'true';
            }
        }

        my %sfl = (
            "srcfeature_id"   => \%srcf,
            "fmin"            => $sfmin,
            "is_fmin_partial" => $is_sfmin_partial || 'false',
            "fmax"            => $sfmax,
            "is_fmax_partial" => $is_sfmax_partial || 'false',
            "strand"          => $sfstrand,
            "phase"           => $sfphase,
            );

        $sfhash{'featureloc'} = \%sfl;
    }

    #subfeature tags
    my @sfdbxrefs;          #subfeature dbxrefs
    my @sub_featureprops;   #subfeature props
    my @sub_featuresyns;    #subfeature synonyms
    my @sub_featurecvterms; #subfeature cvterms

    foreach my $tag ($feat->all_tags()) {
        #feature_dbxref for features
        if ($tag eq 'db_xref' or $tag eq 'dbxref' or $tag eq 'Dbxref') {
            foreach my $dbxref ($feat->each_tag_value($tag)) {
                # Split on the first colon using captures. Reading
                # $PREMATCH/$POSTMATCH after an unchecked match would pick
                # up stale text from an earlier match when the value has
                # no colon.
                my ($db, $xref);
                if (defined $dbxref) {
                    ($db, $xref) = $dbxref =~ /\A([^:]*):(.*)\z/s;
                }
                if (!defined $db) {
                    my $shown = defined $dbxref ? $dbxref : '';
                    $self->warn("skipping $tag value '$shown': expected the form <db>:<accession>");
                    next;
                }
                my %acchash = (
                    "db_id"     => {'name' => $db},
                    "accession" => $xref,
                    );
                my %sfdbx = ('dbxref_id' => \%acchash);
                push(@sfdbxrefs, \%sfdbx);
            }
        #Alias tags
        } elsif ($tag eq 'Alias') {
            @sub_featuresyns = $self->handle_Alias_tag($feat, @sub_featuresyns);
        #Ontology_term tags
        } elsif ($tag eq 'Ontology_term') {
            @sub_featurecvterms = $self->handle_Ontology_tag($feat, @sub_featurecvterms);
        #featureprop for features, excluding GFF Name & Parent tags
        } elsif ($tag ne 'gene' && $tag ne 'symbol' && $tag ne 'Name' && $tag ne 'Parent') {
            next if ($tag eq 'parent_id');
            next if ($tag eq 'load_id');
            foreach my $val ($feat->each_tag_value($tag)) {
                my %prophash = (
                    "type_id" => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}},
                    "value"   => $val,
                    );
                push(@sub_featureprops, \%prophash);
            }
        }
    }

    if ($feat->can('source')) {
        @sfdbxrefs = $self->handle_source($feat, @sfdbxrefs);
    }

    #attach only the collections that are not empty
    if (@sub_featureprops) {
        $sfhash{'featureprop'} = \@sub_featureprops;
    }
    if (@sfdbxrefs) {
        $sfhash{'feature_dbxref'} = \@sfdbxrefs;
    }
    if (@sub_featuresyns) {
        $sfhash{'feature_synonym'} = \@sub_featuresyns;
    }
    if (@sub_featurecvterms) {
        $sfhash{'feature_cvterm'} = \@sub_featurecvterms;
    }

    #gene name handed down to the sub-features
    if ($feat->has_tag('locus_tag')) {
        ($genename) = $feat->each_tag_value('locus_tag');
    } elsif ($feat->has_tag('gene')) {
        ($genename) = $feat->each_tag_value('gene');
    }

    #recurse into the sub-features
    my @ssfeatrel;
    foreach my $sf ($feat->get_SeqFeatures()) {
        my $rref = $self->_subfeat2featrelhash($genename, $sftype, $sf, \%srcf, $tag_cv, $isanalysis);
        if (defined $rref) {
            push(@ssfeatrel, $rref);
        }
    }

    if (@ssfeatrel) {
        $sfhash{'feature_relationship'} = \@ssfeatrel;
    }

    #subj-obj relationship type
    my $reltypename = return_reltypename($sftype);

    my %fr = (
        "subject_id" => \%sfhash,
        "type_id"    => { 'name'  => $reltypename,
                          'cv_id' => { 'name' => $cv_name{'relationship'} }},
        );

    if ($is_gene_of_mrna) {
        return \%sfhash;
    } else {
        return \%fr;
    }
}
1531
# Build a uniquename for a feature in the form
# <genename>-<feature-type>-<start>..<end>   (e.g. foo-mRNA-10..1000)
#
# Arguments (positional):
#   $genename - fallback gene name
#   $feat     - the feature; its primary tag, start and end supply the
#               remaining parts of the name
#
# The first value of the feature's /locus_tag tag - or, failing that, of its
# /gene tag - takes precedence over the $genename argument.
#
# Returns the uniquename string.
sub _genFeatUniqueName {
    my ($self, $genename, $feat) = @_;

    my $ftype = $feat->primary_tag;
    my $start = $feat->start;
    my $end   = $feat->end;

    # the first tag present wins; later candidates are not consulted
    foreach my $tag ('locus_tag', 'gene') {
        if ($feat->has_tag($tag)) {
            ($genename) = $feat->each_tag_value($tag);
            last;
        }
    }

    return "${genename}-${ftype}-${start}..${end}";
}
1552
1553 #create uniquename for pubs with no medline id and no FBrf#
1554 #use "<authors>, <year>, <type>" as the uniquename (same as miniref)
1555 #<authors> is <sole-author-surname>    if one author,
1556 #  or <first-author-surname> and <second-author-surname>   if two,
1557 #  or <first-author-surname> et al.   if more
1558 #sub _CreatePubUname {
1559 #   my $self = shift;
1560 #   my $pub = shift;
1561 #   undef(my $pubuname);
1562 #
1563 #   return $pubuname;
1564 #}
1565
# Get the authors of a reference.
#
# Arguments (positional):
#   $ref - reference object providing an authors() method that returns the
#          author list as a single string, e.g.
#          "Smith,J.M., Jones,A.B. and Doe,C."  (GenBank style) or
#          "Smith J.M., Jones A.B."             (EMBL style)
#
# Returns a reference to an array of hashes
#   { author_id => { surname => ..., givennames => ... }, arank => N }
# with arank counting from 1, or nothing (undef in scalar context) when the
# reference has no authors. Returning nothing rather than an empty list lets
# the caller fall back to the submitter address.
sub _getRefAuthors {
    my $self = shift;
    my $ref  = shift;

    my $author_string = $ref->authors;

    #no authors ('.' is what is seen when e.g. only a consortium line exists)
    return if !defined $author_string;
    return if $author_string !~ /\S/ || $author_string eq '.';

    #split the list: "A, B and C" or "A, B, C"
    my @names;
    if (index($author_string, ' and ') > 0) {
        my ($leading, $last_author) = $author_string =~ /\A(.*?) and (.*)\z/s;
        @names = split(/, /, $leading);
        push(@names, $last_author);
    } else {
        @names = split(/, /, $author_string);
    }

    my @aut;
    my $rank = 0;
    foreach my $name (@names) {
        next unless defined $name && $name =~ /\S/;
        $rank++;

        #parse the author lastname and givennames
        my ($last, $given);
        if (index($name, ',') > 0) {        #genbank format, last,f.m.
            ($last, $given) = split(/,/, $name);
        } elsif (index($name, ' ') > 0) {   #embl format, last f.m.
            ($last, $given) = split(/ /, $name);
        } else {
            #a single token: keep it as the surname instead of dropping it
            $last = $name;
        }

        my %au = (
            'surname'    => $last,
            'givennames' => $given,
            );
        push(@aut, {author_id => \%au, arank => $rank});
    }

    return unless @aut;
    return \@aut;
}
1615
# Extract the submission year from the citation of a submitted reference.
# genbank format for the submitted citation:
#   JOURNAL   Submitted (DD-MON-YYYY) submitter address
#
# Args    : $citation - the citation string of the reference
# Returns : the four-digit submission year, or nothing (after a warning)
#           when the citation is undefined, is not a "Submitted" citation,
#           or carries no well-formed (DD-MON-YYYY) date.
sub _getSubmitYear {
    my $self     = shift;
    my $citation = shift;

    if (!defined $citation || $citation !~ /Submitted/) {
        $self->warn("not citation for a submitted reference. cannot extract submission year.");
        return;
    }

    #capture the year and test that the match succeeded: after a failed
    #match the global match variables still hold the previous successful
    #match ("Submitted"), which must never be returned as the year.
    if ($citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-(\d{4})\)/) {
        return $1;
    }

    $self->warn("no (DD-MON-YYYY) submission date found in citation. cannot extract submission year.");
    return;
}
1634
# Extract the submitter address from a submitted reference, i.e. the text
# that follows "Submitted (DD-MON-YYYY)" in the reference location.
#
# Args    : $ref - a reference object providing a location() method
# Returns : ref to a hash { author_id => { surname => <address> } } with the
#           address cut to at most 100 characters, or nothing when the
#           location is not a well-formed "Submitted (DD-MON-YYYY)" citation
#           (a warning is issued) or when no address follows the date.
sub _getSubmitAddr {
    my $self = shift;
    my $ref  = shift;

    my $citation = $ref->location;
    if (!defined $citation || $citation !~ /Submitted/) {
        $self->warn("not citation for a submitted reference. cannot extract submitter address.");
        return;
    }

    #capture what follows the date and test the match itself; a failed match
    #would otherwise leave stale text in the global match variables.
    #leading whitespace before the address is dropped by the \s*.
    if ($citation !~ /Submitted \(\d\d-[a-zA-Z]{3}-\d{4}\)\s*(.*)\z/s) {
        $self->warn("no (DD-MON-YYYY) submission date found in citation. cannot extract submitter address.");
        return;
    }
    my $address = $1;

    #nothing after the date: there is no address to report
    return if $address eq '';

    my %author = (
        'author_id' => { 'surname' => substr($address, 0, 100) },
    );
    return \%author;
}
1658
1659 =head2 suppress_residues
1660
1661  Title    : suppress_residues
1662  Usage    : $obj->suppress_residues()        #get existing value
1663             $obj->suppress_residues($newval) #set new value
1664  Function : Keep track of the flag to suppress printing of residues in the
1665             chadoxml file. The default is to allow all residues to go into the
1666             file.
1667  Returns  : value of suppress_residues (a scalar)
1668  Args     : new value of suppress_residues (to set)
1669
1670 =cut
1671
sub suppress_residues {
    my ($self, $suppress_residues) = @_;

    # A defined argument sets the flag; a call without argument (or with
    # undef) only reads it.  Unpacking @_ directly avoids both
    # defined(@array), which newer perls reject at compile time, and the
    # "my $x = ... if ..." construct, whose behaviour is undefined.
    $self->{'suppress_residues'} = $suppress_residues
        if defined($suppress_residues);

    return $self->{'suppress_residues'};
}
1678
1679 =head2 allow_residues
1680
1681  Title    : allow_residues
1682  Usage    : $obj->allow_residues()        #get existing value
1683             $obj->allow_residues($feature_type) #set new value
1684  Function : Track the allow_residues type.  This can be used in conjunction
1685             with the suppress_residues flag to only allow residues from a
1686             specific feature type to be printed in the xml file, for example,
1687             only printing chromosome residues. When suppress_residues is set to
1688             true, then only chromosome features would go into the xml
1689             file. If suppress_residues is not set, this function has no effect
1690             (since the default is to put all residues in the xml file).
1691  Returns  : value of allow_residues (string that corresponds to a feature type)
1692  Args     : new value of allow_residues (to set)
1693  Status   :
1694
1695 =cut
1696
sub allow_residues {
    my ($self, $allow_residues) = @_;

    # A defined argument sets the feature type; a call without argument
    # (or with undef) only reads it.  Unpacking @_ directly avoids both
    # defined(@array), which newer perls reject at compile time, and the
    # "my $x = ... if ..." construct, whose behaviour is undefined.
    $self->{'allow_residues'} = $allow_residues
        if defined($allow_residues);

    return $self->{'allow_residues'};
}
1703
1704 =head2 return_ftype_hash
1705
1706  Title    : return_ftype_hash
1707  Usage    : $obj->return_ftype_hash()
1708  Function : A simple hash where returning it has been factored out of the main
1709             code to allow subclasses to override it.
1710  Returns  : A hash that indicates what the name of the SO term is and what
1711             the name of the Sequence Ontology is in the cv table.
1712  Args     : The string that represents the SO term.
1713  Status   :
1714
1715 =cut
1716
sub return_ftype_hash {
    my ($self, $ftype) = @_;

    # the cv the term belongs to: whatever name is configured for 'sequence'
    my $so_cv = { "name" => $cv_name{'sequence'} };

    # term name plus its cv, handed back as a plain key/value list
    my %so_term = (
        "name"  => $ftype,
        "cv_id" => $so_cv,
    );

    return %so_term;
}
1724
1725 =head2 return_reltypename
1726
1727  Title    : return_reltypename
1728  Usage    : $obj->return_reltypename
1729  Function : Return the appropriate relationship type name depending on the
1730             feature type (typically part_of, but derives_from for polypeptide).
1731  Returns  : A relationship type name.
1732  Args     : A SO type name.
1733  Status   :
1734
1735 =cut
1736
sub return_reltypename {
    my ($self, $sftype) = @_;

    # 'protein' and 'polypeptide' features derive from their parent;
    # every other feature type is simply a part of it.
    my $is_peptide = ($sftype eq 'protein' || $sftype eq 'polypeptide');

    return $is_peptide ? 'derives_from' : 'part_of';
}
1750
1751 =head2 next_seq
1752
1753  Title    : next_seq
1754  Usage    : $obj->next_seq
1755  Function :
1756  Returns  :
1757  Args     :
1758  Status   : Not implemented (write only adaptor)
1759
1760 =cut
1761
sub next_seq {
    my $self = shift;
    my %argv = @_;    # arguments are accepted but never used

    # reading is not supported by this module: always raise via throw()
    return $self->throw('next_seq is not implemented; this is a write-only adapter.');
}
1768
1769 =head2 _create_writer
1770
1771  Title    : _create_writer
1772  Usage    : $obj->_create_writer
1773  Function : Creates XML::Writer object and writes start tag
1774  Returns  : Nothing, though the writer persists as part of the chadoxml object
1775  Args     : None
1776  Status   :
1777
1778 =cut
1779
sub _create_writer {
    my $self = shift;

    # Direct method call instead of the indirect-object form
    # "new XML::Writer(...)", which perl parses by heuristics.
    # The writer prints to this object's filehandle, in data mode with an
    # indent of 3.
    my $writer = XML::Writer->new(OUTPUT      => $self->_fh,
                                  DATA_MODE   => 1,
                                  DATA_INDENT => 3);

    # keep the writer on the object so later calls can reuse it
    $self->{'writer'} = $writer;

    #print header
    $writer->xmlDecl("UTF-8");
    $writer->comment("created by Peili Zhang, Flybase, Harvard University\n".
                     "and Scott Cain, GMOD, Cold Spring Harbor Laboratory");

    #start chadoxml; the matching end tag is written by close_chadoxml
    $writer->startTag('chado');

    return;
}
1797
1798 =head2 close_chadoxml
1799
1800  Title    : close_chadoxml
1801  Usage    : $obj->close_chadoxml
1802  Function : Writes the closing xml tag
1803  Returns  : None
1804  Args     : None
1805  Status   :
1806
1807 =cut
1808
sub close_chadoxml {
    my ($self) = @_;

    # close the root 'chado' element on the writer stored on this object
    my $writer = $self->{'writer'};
    $writer->endTag('chado');

    return;
}
1815
1816 =head2 handle_unreserved_tags
1817
1818  Title    : handle_unreserved_tags
1819  Usage    : $obj->handle_unreserved_tags
1820  Function : Converts tag value pairs to xml-ready hashrefs
1821  Returns  : The array containing the hashrefs
1822  Args     : In order: the Seq or SeqFeature object, the key, and the hasharray
1823  Status   :
1824
1825 =cut
1826
sub handle_unreserved_tags {
    my ($self, $seq, $key, @arr) = @_;

    # one property hashref per value stored under $key, appended after the
    # entries that were passed in
    push @arr, map {
        +{
            "type_id" => {
                'name'  => $key,
                'cv_id' => { 'name' => $cv_name{'feature_property'} },
            },
            "value" => $_,
        }
    } $seq->attributes($key);

    return @arr;
}
1846
1847 =head2 handle_Alias_tag
1848
1849  Title    : handle_Alias_tag
1850  Usage    : $obj->handle_Alias_tag
1851  Function : Convert Alias values to synonym hash refs
1852  Returns  : An array of synonym hash tags
1853  Args     : The seq or seqFeature object and the synonym hash array
1854  Status   :
1855
1856 =cut
1857
sub handle_Alias_tag {
    my ($self, $seq, @arr) = @_;

    foreach my $alias ($seq->attributes('Alias')) {
        # the alias is recorded as an 'exact' synonym; name and sgml form
        # are the same string
        my $synonym = {
            "type_id"      => { 'name'  => 'exact',
                                'cv_id' => { 'name' => 'synonym_type' } },
            "name"         => $alias,
            "synonym_sgml" => $alias,
        };

        # every synonym is attached to the 'null' publication
        my $null_pub = {
            'uniquename' => 'null',
            'type_id'    => { 'name'  => 'null',
                              'cv_id' => { 'name' => 'null' } },
        };

        push @arr, { 'synonym_id' => $synonym, 'pub_id' => $null_pub };
    }

    return @arr;
}
1884
1885 =head2 handle_Ontology_tag
1886
1887  Title    : handle_Ontology_tag
1888  Usage    : $obj->handle_Ontology_tag
1889  Function : Convert Ontology_term values to ontology term hash refs
1890  Returns  : An array of ontology term hash refs
1891  Args     : The seq or seqFeature object and the ontology term array
1892  Status   :
1893
1894 =cut
1895
sub handle_Ontology_tag  {
    my $self = shift;
    my $seq  = shift;
    my @arr  = @_;

    # Each Ontology_term value is expected in the form DB:accession and is
    # turned into a cvterm hashref that points at the matching dbxref.
    my @terms = $seq->attributes('Ontology_term');
    for my $term (@terms) {
        my $hashref;
        if ($term =~ /(\S+):(\S+)/) {
            my $db  = $1;
            my $acc = $2;
            $hashref = {
                    'cvterm_id' => {
                        'dbxref_id' => {
                           'db_id' => { 'name' => $db },
                           'accession' => $acc
                                      },
                                   },
                       };
        }
        else {
            # same policy as handle_dbxref: a value without DB:accession
            # cannot be mapped, so refuse it rather than emit an entry
            # whose cvterm_id is undefined.
            $self->throw("I don't know how to handle an Ontology_term like $term");
        }
        push(@arr, {cvterm_id => $hashref});
    }

    return @arr;
}
1921
1922 =head2 handle_dbxref
1923
1924  Title    : handle_dbxref
1925  Usage    : $obj->handle_dbxref
1926  Function : Convert Dbxref values to dbxref hashref
1927  Returns  : An array of dbxref hashrefs
1928  Args     : A seq or seqFeature object and the dbxref array
1929  Status   :
1930
1931 =cut
1932
sub handle_dbxref {
    my ($self, $seq, $tag, @arr) = @_;

    foreach my $term ($seq->attributes($tag)) {
        my $xref;

        # values look like DB:accession, optionally DB:accession.version
        if (my ($db, $acc) = $term =~ /(\S+):(\S+)/) {
            # version is 1 unless the accession carries a ".version" suffix
            my $version = 1;
            if (my ($base, $suffix) = $acc =~ /(\S+)\.(\S+)/) {
                ($acc, $version) = ($base, $suffix);
            }

            $xref = {
                'dbxref_id' => {
                    'db_id'     => { 'name' => $db },
                    'accession' => $acc,
                    'version'   => $version,
                },
            };
        }
        else {
            $self->throw("I don't know how to handle a dbxref like $term");
        }

        push @arr, { 'dbxref_id' => $xref };
    }

    return @arr;
}
1965
1966 =head2 handle_source
1967
1968  Title    : handle_source
1969  Usage    : $obj->handle_source
1970  Function :
1971  Returns  :
1972  Args     :
1973  Status   :
1974
1975 =cut
1976
sub handle_source {
    my ($self, $seq, @arr) = @_;

    my $source = $seq->source();

    # a true source value becomes a dbxref in the 'GFF_source' db; a false
    # one (undef, '' or 0) leaves the incoming list as it is
    if ($source) {
        my $gff_source = {
            'dbxref_id' => {
                'db_id'     => { 'name' => 'GFF_source' },
                'accession' => $source,
            },
        };
        push @arr, { 'dbxref_id' => $gff_source };
    }

    return @arr;
}
1995
1996 =head2 _srcf_hash
1997
1998  Title    : _srcf_hash
1999  Usage    : $obj->_srcf_hash
2000  Function : Creates the srcfeature hash for use in featureloc hashes
2001  Returns  : The srcfeature hash
2002  Args     : The srcfeature name, the srcfeature type and a reference to the
2003             organism hash.
2004  Status   :
2005
2006 =cut
2007
sub _srcf_hash {
    my ($self, $srcf, $stype, $orgref) = @_;

    # type of the source feature: its name plus the cv configured for
    # 'sequence'
    my $type = {
        'name'  => $stype,
        'cv_id' => { 'name' => $cv_name{'sequence'} },
    };

    # the source feature itself, handed back as a plain key/value list
    my %srcfeature = (
        'uniquename'  => $srcf,
        'organism_id' => $orgref,
        'type_id'     => $type,
    );

    return %srcfeature;
}
2023
2024
2025 1;