Bio/SeqIO/chadoxml.pm

   1 #
   2 # BioPerl module for Bio::SeqIO::chadoxml
   3 #
   4 # Peili Zhang   <peili@morgan.harvard.edu>
   5 #
   6 # You may distribute this module under the same terms as perl itself
   7
   8 # POD documentation - main docs before the code
   9
  10 =head1 NAME
  11
  12 Bio::SeqIO::chadoxml - chadoxml sequence output stream
  13
  14 =head1 SYNOPSIS
  15
  16 It is probably best not to use this object directly, but
  17 rather go through the SeqIO handler system:
  18
  19     $writer = Bio::SeqIO->new(-file => ">chado.xml",
  20                               -format => 'chadoxml');
  21
  22     # assume you already have Sequence or SeqFeature objects
  23     $writer->write_seq($seq_obj);
  24
  25     #after writing all seqs
  26     $writer->close_chadoxml();
  27
  28
  29
  30 =head1 DESCRIPTION
  31
  32 This object can transform Bio::Seq objects to chadoxml flat
  33 file databases (for chadoxml DTD, see
  34 http://gmod.cvs.sourceforge.net/gmod/schema/chado/dat/chado.dtd).
  35
  36 This is currently a write-only module.
  37
  38     $seqio = Bio::SeqIO->new(-file => '>outfile.xml',
  39                              -format => 'chadoxml',
  40                              -suppress_residues => 1,
  41                              -allow_residues => 'chromosome',
  42                              );
  43
  44     # we have a Bio::Seq object $seq which is a gene located on
  45     # chromosome arm 'X', to be written out to chadoxml
  46     # before converting to chadoxml, $seq object B<must> be transformed
  47     # so that all the coordinates in $seq are against the source
  48     # feature to be passed into Bio::SeqIO::chadoxml->write_seq()
  49     # -- chromosome arm X in the example below.
  50
  51     $seqio->write_seq(-seq=>$seq,
  52                       -genus   => 'Homo',
  53                       -species => 'sapiens',
  54                       -seq_so_type=>'gene',
  55                       -src_feature=>'X',
  56                       -src_feat_type=>'chromosome_arm',
  57                         -nounflatten=>1,
  58                       -is_analysis=>'true',
  59                       -data_source=>'GenBank');
  60
  61 The chadoxml output of Bio::SeqIO::chadoxml-E<gt>write_seq() method can be
  62 passed to the loader utility in XORT package
  63 (http://gmod.cvs.sourceforge.net/gmod/schema/XMLTools/XORT/)
  64 to be loaded into chado.
  65
  66 This object is currently implemented to work with sequence and
  67 annotation data from whole genome projects deposited in GenBank. It
  68 may not be able to handle all different types of data from all
  69 different sources.
  70
  71 In converting a Bio::Seq object into chadoxml, a top-level feature is
  72 created to represent the object and all sequence features inside the
  73 Bio::Seq object are treated as subfeatures of the top-level
  74 feature. The Bio::SeqIO::chadoxml object calls
  75 Bio::SeqFeature::Tools::Unflattener to unflatten the flat feature list
  76 contained in the subject Bio::Seq object, to build gene model
  77 containment hierarchy conforming to chado central dogma model: gene
  78 --E<gt> mRNA --E<gt> exons and protein.
  79
  80 Destination of data in the subject Bio::Seq object $seq is as follows:
  81
  82     *$seq->display_id:  name of the top-level feature;
  83
  84     *$seq->accession_number: if defined, uniquename and
  85                  feature_dbxref of the top-level
  86                  feature; if not defined,
  87                  $seq->display_id is used as the
  88                  uniquename of the top-level feature;
  89
  90     *$seq->molecule: transformed to SO type, used as the feature
  91             type of the top-level feature; if -seq_so_type
  92             argument is supplied, use the supplied SO type
  93             as the feature type of the top-level feature;
  94
  95     *$seq->species: organism of the top-level feature;
  96
  97     *$seq->seq: residues of the top-level feature;
  98
  99     *$seq->is_circular, $seq->division: feature_cvterm;
 100
 101     *$seq->keywords, $seq->desc, comments: featureprop;
 102
 103     *references: pub and feature_pub;
 104         medline/pubmed ids: pub_dbxref;
 105         comments: pubprop;
 106
 107     *feature "source" span: featureloc for top-level feature;
 108
 109     *feature "source" db_xref: feature_dbxref for top-level feature;
 110
 111     *feature "source" other tags: featureprop for top-level feature;
 112
 113     *subfeature 'symbol' or 'label' tag: feature uniquename, if
 114                      none of these is present, the chadoxml object
 115                      generates feature uniquenames as:
 116                      <gene>-<feature_type>-<span>
 117                      (e.g. foo-mRNA--1000..3000);
 118
 119     *gene model: feature_relationship built based on the
 120                      containment hierarchy;
 121
 122     *feature span: featureloc;
 123
 124     *feature accession numbers: feature_dbxref;
 125
 126     *feature tags (except db_xref, symbol and gene): featureprop;
 127
 128 Things to watch out for:
 129
 130     *chado schema change: this version works with the chado
 131                                version tagged chado_1_01 in GMOD CVS.
 132
 133     *feature uniquenames: especially important if using XORT
 134                               loader to do incremental load into
 135                               chado. may need pre-processing of the
 136                               source data to put the correct
 137                               uniquenames in place.
 138
 139     *pub uniquenames: chadoxml->write_seq() has the FlyBase policy
 140                           on pub uniquenames hard-coded, it assigns
 141                           pub uniquenames in the following way: for
 142                           journals and books, use ISBN number; for
 143                           published papers, use MEDLINE ID; for
 144                           everything else, use FlyBase unique
 145                           identifier FBrf#. need to modify the code to
 146                           implement your policy. look for the comments
 147                           in the code.
 148
 149     *for pubs possibly existing in chado but with no knowledge of
 150          its uniquename: put "op" as "match", then need to run the
 151                         output chadoxml through a special filter that
 152                         talks to chado database and tries to find the
 153                         pub by matching with the provided information
 154                         instead of looking up by the unique key. after
 155                         matching, the filter also resets the "match"
 156                         operation to either "force" (default), or
 157                         "lookup", or "insert", or "update". the
 158                         "match" operation is for a special FlyBase use
 159                         case. please modify to work according to your
 160                         rules.
 161
 162     *chado initialization for loading:
 163
 164         cv & cvterm: in the output chadoxml, all cv's and
 165                              cvterm's are lookup only. Therefore,
 166                              before using XORT loader to load the
 167                              output into chado, chado must be
 168                              pre-loaded with all necessary CVs and
 169                              CVterms, including "SO" , "property
 170                              type", "relationship type", "pub type",
 171                              "pubprop type", "pub relationship type",
 172                              "sequence topology", "GenBank feature
 173                              qualifier", "GenBank division". A pub by
 174                              the uniquename 'nullpub' of type 'null
 175                              pub' needs to be inserted.
 176
 177 =head1 FEEDBACK
 178
 179 =head2 Mailing Lists
 180
 181 User feedback is an integral part of the evolution of this and other
 182 Bioperl modules. Send your comments and suggestions preferably to one
 183 of the Bioperl mailing lists.  Your participation is much appreciated.
 184
 185   bioperl-l@bioperl.org                  - General discussion
 186   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 187
 188 =head2 Support
 189
 190 Please direct usage questions or support issues to the mailing list:
 191
 192 I<bioperl-l@bioperl.org>
 193
 194 rather than to the module maintainer directly. Many experienced and
 195 responsive experts will be able to look at the problem and quickly
 196 address it. Please include a thorough description of the problem
 197 with code and data examples if at all possible.
 198
 199 =head2 Reporting Bugs
 200
 201 Report bugs to the Bioperl bug tracking system to help us keep track
 202 of the bugs and their resolution.
 203 Bug reports can be submitted via the web:
 204
 205   https://github.com/bioperl/bioperl-live/issues
 206
 207 =head1 AUTHOR - Peili Zhang
 208
 209 Email peili@morgan.harvard.edu
 210
 211 =head1 APPENDIX
 212
 213 The rest of the documentation details each of the object
 214 methods. Internal methods are usually preceded with a _
 215
 216 =cut
 217
 218 # Let the code begin...
 219
 220 package Bio::SeqIO::chadoxml;
 221 use strict;
 222 use English;
 223
 224 use Carp;
 225 use Data::Dumper;
 226 use XML::Writer;
 227 use IO::File;
 228 use IO::Handle;
 229 use Bio::Seq;
 230 use Bio::Seq::RichSeq;
 231 use Bio::SeqIO::FTHelper;
 232 use Bio::Species;
 233 use Bio::Seq::SeqFactory;
 234 use Bio::Factory::SequenceStreamI;
 235 use Bio::SeqFeature::Generic;
 236 use Bio::Annotation::Collection;
 237 use Bio::Annotation::Comment;
 238 use Bio::Annotation::Reference;
 239 use Bio::Annotation::DBLink;
 240 use Bio::SeqFeature::Tools::Unflattener;
 241
 242 # File-scoped lexicals: state and lookup tables visible to every sub in this file.
 243 undef(my %finaldatahash); # data from Bio::Seq object stored in a hash. NOTE(review): not referenced in the code reviewed here - confirm it is still used
 244 undef(my %datahash); # record for the top-level feature; write_seq() assigns name, uniquename, seqlen, residues, type_id, organism_id and is_analysis, then adds featureloc, feature_cvterm and feature_synonym
 245 # Space-separated chado table names. NOTE(review): the consumer is not in the code reviewed here - presumably used to recognise table elements when emitting XML; confirm.
 246 my $chadotables = 'feature featureprop feature_relationship featureloc feature_cvterm cvterm cv feature_pub pub pub_dbxref pub_author author pub_relationship pubprop feature_dbxref dbxref db synonym feature_synonym';
 247 # Keys are "table.column" strings, values are table names. NOTE(review): looks like foreign-key column => referenced table; the consumer is not in the code reviewed here - confirm.
 248 my %fkey = (
 249     "cvterm.cv_id"          => "cv",
 250         "cvterm.dbxref_id"              => "dbxref",
 251     "dbxref.db_id"          => "db",
 252     "feature.type_id"       => "cvterm",
 253     "feature.organism_id"       => "organism",
 254     "feature.dbxref_id"         => "dbxref",
 255     "featureprop.type_id"       => "cvterm",
 256     "feature_pub.pub_id"        => "pub",
 257     "feature_cvterm.cvterm_id"  => "cvterm",
 258     "feature_cvterm.pub_id"     => "pub",
 259         "feature_cvterm.feature_id"     => "feature",
 260     "feature_dbxref.dbxref_id"  => "dbxref",
 261     "feature_relationship.object_id"    => "feature",
 262     "feature_relationship.subject_id"   => "feature",
 263     "feature_relationship.type_id"  => "cvterm",
 264     "featureloc.srcfeature_id"  => "feature",
 265     "pub.type_id"           => "cvterm",
 266     "pub_dbxref.dbxref_id"      => "dbxref",
 267     "pub_author.author_id"      => "author",
 268     "pub_relationship.obj_pub_id"   => "pub",
 269     "pub_relationship.subj_pub_id"  => "pub",
 270     "pub_relationship.type_id"  => "cvterm",
 271     "pubprop.type_id"       => "cvterm",
 272         "feature_synonym.feature_id"    => "feature",
 273         "feature_synonym.synonym_id"    => "synonym",
 274         "feature_synonym.pub_id"        => "pub",
 275         "synonym.type_id"               => "cvterm",
 276 );
 277 # cv names to reference. write_seq() reads $cv_name{'feature_property'} as the cv name of the description, keywords and comment featureprops.
 278 my %cv_name = (
 279         'relationship'                  => 'relationship',
 280         'sequence'                      => 'sequence',
 281         'feature_property'              => 'feature_property',
 282 );
 283 # Feature type string => replacement term. NOTE(review): the name and values suggest a mapping to SO terms; the consumer is not in the code reviewed here - confirm. Entries commented out below are disabled.
 284 my %feattype_args2so = (
 285     "aberr"             => "aberration_junction",
 286 #   "conflict"          => "sequence_difference",
 287 #   "polyA_signal"          => "polyA_signal_sequence",
 288     "variation"         => "sequence_variant",
 289     "mutation1"         => "point_mutation",        #for single-base mutation
 290     "mutation2"         => "sequence_variant",      #for multi-base mutation
 291     "rescue"            => "rescue_fragment",
 292 #   "rfrag"             => "restriction_fragment",
 293     "protein_bind"          => "protein_binding_site",
 294     "misc_feature"          => "region",
 295 #   "prim_transcript"       => "primary_transcript",
 296     "CDS"               => "polypeptide",
 297     "reg_element"           => "regulatory_region",
 298     "seq_variant"           => "sequence_variant",
 299     "mat_peptide"           => "mature_peptide",
 300     "sig_peptide"           => "signal_peptide",
 301 );
 302 # Genus and species of the sequence being written; write_seq() reassigns it on each call and stores a reference to it in %datahash as organism_id.
 303 undef(my %organism);
 304
 305 use base qw(Bio::SeqIO);
 306
 # Instance set-up for a chadoxml stream.
 #
 # Receives the object followed by the named constructor arguments, passes
 # those arguments on to the parent class' _initialize(), makes sure a
 # sequence factory is in place, and records the optional
 # -suppress_residues / -allow_residues settings.
 #
 # Returns nothing (bare return).
 sub _initialize {
     my ($self, %args) = @_;

     $self->SUPER::_initialize(%args);

     # When sequence_factory() reports undef, install a factory configured
     # with -type 'Bio::Seq::RichSeq' and this object's verbosity.
     if ( !defined $self->sequence_factory ) {
         $self->sequence_factory(
             Bio::Seq::SeqFactory->new(
                 -verbose => $self->verbose(),
                 -type    => 'Bio::Seq::RichSeq',
             )
         );
     }

     # Optional constructor arguments: each one is handed to the accessor of
     # the same name, and only when the caller supplied a defined value.
     for my $option (qw(suppress_residues allow_residues)) {
         my $value = $args{"-$option"};
         $self->$option($value) if defined $value;
     }

     return;
 }
 325
 326 =head2 write_seq
 327
 328  Title   : write_seq
 329  Usage   : $stream->write_seq(-seq=>$seq, -seq_so_type=>$seqSOtype,
 330                   -src_feature=>$srcfeature,
 331                   -src_feat_type=>$srcfeattype,
 332                   -nounflatten=>0 or 1,
 333                   -is_analysis=>'true' or 'false',
 334                   -data_source=>$datasource)
 335  Function: writes the $seq object (must be seq) into chadoxml.
 336  Returns : 1 for success and 0 for error
 337  Args     : A Bio::Seq object $seq, optional $seqSOtype, $srcfeature,
 338             $srcfeattype, $nounflatten, $is_analysis and $data_source.
 339
 340 When $srcfeature (a string, the uniquename of the source feature) is given, the
 341 location and strand information of the top-level feature against the source
 342 feature will be derived from the sequence feature called 'source' of the $seq
 343 object, a featureloc record is generated for the top-level feature on
 344 $srcfeature. When $srcfeature is given, $srcfeattype must also be present. All
 345 feature coordinates in $seq should be against $srcfeature. $seqSOtype is the
 346 optional SO term to use as the type of the top-level feature. For example, a
 347 GenBank data file for a Drosophila melanogaster genome scaffold has the molecule
 348 type of "DNA", when converting to chadoxml, a $seqSOtype argument of
 349 "golden_path_region" can be supplied to save the scaffold as a feature of type
 350 "golden_path_region" in chadoxml, instead of "DNA". A feature with primary tag
 351 of 'source' must be present in the sequence feature list of $seq, to describe the
 352 whole sequence record.
 353
 354 In the current implementation:
 355
 356 =over 3
 357
 358 =item *
 359
 360 non-mRNA records
 361
 362 A top-level feature of type $seq-E<gt>alphabet is generated for the whole GenBank
 363 record, features listed are unflattened for DNA records to build gene model
 364 feature graph, and for the other types of records all features in $seq are
 365 treated as subfeatures of the top-level feature.
 366
 367 =item *
 368
 369 mRNA records
 370
 371 If a 'gene' feature is present, it B<must> have a /symbol or /label tag to
 372 contain the uniquename of the gene. A top-level feature of type 'gene' is
 373 generated. The mRNA is written as a subfeature of the top-level gene feature,
 374 and the other sequence features listed in $seq are treated as subfeatures of the
 375 mRNA feature.
 376
 377 =back
 378
 379 =cut
 380
 381 sub write_seq {
 382     my $usage = <<EOUSAGE;
 383 Bio::SeqIO::chadoxml->write_seq()
 384 Usage   : \$stream->write_seq(-seq=>\$seq,
 385                   -seq_so_type=>\$SOtype,
 386                   -src_feature=>\$srcfeature,
 387                   -src_feat_type=>\$srcfeattype,
 388                   -nounflatten=>0 or 1,
 389                               -is_analysis=>'true' or 'false',
 390                               -data_source=>\$datasource)
 391 Args    : \$seq     : a Bio::Seq object
 392       \$SOtype  : the SO term to use as the feature type of
 393                       the \$seq record, optional
 394       \$srcfeature  : unique name of the source feature, a string
 395               containing at least one alphabetical letter
 396               (a-z, A-Z), optional
 397       \$srcfeattype : feature type of \$srcfeature. one of SO terms.
 398               optional
 399       when \$srcfeature is given, \$srcfeattype becomes mandatory,
 400       \$datasource  : source of the sequence annotation data,
 401               e.g. 'GenBank' or 'GFF'.
 402 EOUSAGE
 403
 404     my ($self,@args) = @_;
 405
 406     my ($seq, $seq_so_type, $srcfeature, $srcfeattype, $nounflatten, $isanalysis, $datasource, $genus, $species) =
 407        $self->_rearrange([qw(SEQ
 408                  SEQ_SO_TYPE
 409                  SRC_FEATURE
 410                  SRC_FEAT_TYPE
 411                  NOUNFLATTEN
 412                  IS_ANALYSIS
 413                  DATA_SOURCE
 414                                  GENUS
 415                                  SPECIES
 416                  )],
 417                   @args);
 418     #print "$seq_so_type, $srcfeature, $srcfeattype\n";
 419
 420     if( !defined $seq ) {
 421         $self->throw("Attempting to write with no seq!");
 422     }
 423
 424     if( ! ref $seq || ! $seq->isa('Bio::Seq::RichSeqI') ) {
 425        ## FIXME $self->warn(" $seq is not a RichSeqI compliant module. Attempting to dump, but may fail!");
 426     }
 427
 428         # try to get the srcfeature from the seqFeature object
 429         # for this to work, the user has to pass in the srcfeature type
 430         if (!$srcfeature) {
 431             if ($seq->can('seq_id')) {
 432                 $srcfeature=$seq->seq_id if ($seq->seq_id ne $seq->display_name);
 433             }
 434         }
 435
 436     #$srcfeature, when provided, should contain at least one alphabetical letter
 437     if (defined $srcfeature)
 438     {
 439         if ($srcfeature =~ /[a-zA-Z]/)
 440         {
 441         chomp($srcfeature);
 442         } else {
 443         $self->throw( $usage );
 444         }
 445
 446         #check for mandatory $srcfeattype
 447         if (! defined $srcfeattype)
 448         {
 449         $self->throw( $usage );
 450         #$srcfeattype must be a string of non-whitespace characters
 451         } else {
 452         if ($srcfeattype =~ /\S+/) {
 453             chomp($srcfeattype);
 454         } else {
 455             $self->throw( $usage );
 456         }
 457         }
 458     }
 459
 460     # variables local to write_seq()
 461         my $div = undef;
 462     my $hkey = undef;
 463     undef(my @top_featureprops);
 464         undef(my @featuresyns);
 465         undef(my @top_featurecvterms);
 466     my $name = $seq->display_id if $seq->can('display_id');
 467         $name = $seq->display_name  if $seq->can('display_name');
 468     undef(my @feature_cvterms);
 469     undef(my %sthash);
 470     undef(my %dvhash);
 471     undef(my %h1);
 472     undef(my %h2);
 473     my $temp = undef;
 474     my $ann = undef;
 475     undef(my @references);
 476     undef(my @feature_pubs);
 477     my $ref = undef;
 478     my $location = undef;
 479     my $fbrf = undef;
 480     my $journal = undef;
 481     my $issue = undef;
 482     my $volume = undef;
 483     my $volumeissue = undef;
 484     my $pages = undef;
 485     my $year = undef;
 486     my $pubtype = undef;
 487 #   my $miniref= undef;
 488     my $uniquename = undef;
 489     my $refhash = undef;
 490     my $feat = undef;
 491     my $tag = undef;
 492     my $tag_cv = undef;
 493     my $ftype = undef;
 494     my $subfeatcnt = undef;
 495     undef(my @top_featrels);
 496     undef (my %srcfhash);
 497
 498     local($^W) = 0; # supressing warnings about uninitialized fields.
 499
 500         if (!$name && $seq->can('attributes') ) {
 501             ($name) = $seq->attributes('Alias');
 502         }
 503
 504     if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') {
 505         $uniquename = $seq->accession_number;
 506     } elsif ($seq->can('accession') && defined $seq->accession && $seq->accession ne 'unknown') {
 507         $uniquename = $seq->accession;
 508     } elsif ($seq->can('attributes')) {
 509                 ($uniquename) = $seq->attributes('load_id');
 510         } else {
 511         $uniquename = $name;
 512     }
 513         my $len = $seq->length();
 514     if ($len == 0) {
 515         $len = undef;
 516     }
 517
 518     undef(my $gb_type);
 519     if (!$seq->can('molecule') || ! defined ($gb_type = $seq->molecule()) ) {
 520         $gb_type = $seq->can('alphabet') ? $seq->alphabet : 'DNA';
 521     }
 522     $gb_type = 'DNA' if $ftype eq 'dna';
 523     $gb_type = 'RNA' if $ftype eq 'rna';
 524
 525     if(length $seq_so_type > 0) {
 526         if (defined $seq_so_type) {
 527             $ftype = $seq_so_type;
 528         }
 529             elsif ($seq->type) {
 530                     $ftype = ($seq->type =~ /(.*):/)
 531                              ? $1
 532                              : $seq->type;
 533             }
 534         else {
 535             $ftype = $gb_type;
 536         }
 537     }
 538     else {
 539         $ftype = $gb_type;
 540     }
 541
 542     my %ftype_hash = $self->return_ftype_hash($ftype);
 543
 544         if ($species) {
 545             %organism = ("genus"=>$genus, "species" => $species);
 546         }
 547         else {
 548         my $spec = $seq->species();
 549         if (!defined $spec) {
 550         $self->throw("$seq does not know what organism it is from, which is required by chado. cannot proceed!\n");
 551         } else {
 552         %organism = ("genus"=>$spec->genus(), "species" => $spec->species());
 553         }
 554         }
 555
 556         my $residues;
 557         if (!$self->suppress_residues ||
 558             ($self->suppress_residues && $self->allow_residues eq $ftype)) {
 559             $residues = $seq->seq->isa('Bio::PrimarySeq')
 560                         ? $seq->seq->seq
 561                         : $seq->seq;
 562         }
 563         else {
 564             $residues = '';
 565         }
 566
 567     #set is_analysis flag for gene model features
 568     undef(my $isanal);
 569     if ($ftype eq 'gene' || $ftype eq 'mRNA' || $ftype eq 'exon' || $ftype eq 'protein' || $ftype eq 'polypeptide') {
 570         $isanal = $isanalysis;
 571         $isanal = 'false' if !defined $isanal;
 572     }
 573
 574     %datahash = (
 575         "name"      => $name,
 576         "uniquename"    => $uniquename,
 577         "seqlen"    => $len,
 578         "residues"  => $residues,
 579         "type_id"   => \%ftype_hash,
 580         "organism_id"   => \%organism,
 581         "is_analysis"   => $isanal || 'false',
 582         );
 583
 584         if (defined $srcfeature) {
 585                 %srcfhash = $self->_srcf_hash($srcfeature,
 586                                               $srcfeattype,
 587                                               \%organism);
 588
 589                 my ($phase,$strand);
 590                 if ($seq->can('phase')) {
 591                     $phase = $seq->phase;
 592                 }
 593
 594                 if ($seq->can('strand')) {
 595                     $strand = $seq->strand;
 596                 }
 597                 my %fl = (
 598                                 "srcfeature_id" => \%srcfhash,
 599                                 "fmin"          => $seq->start - 1,
 600                                 "fmax"          => $seq->end,
 601                                 "strand"        => $strand,
 602                                 "phase"         => $phase,
 603                                 );
 604
 605                 $datahash{'featureloc'} = \%fl;
 606
 607         }
 608
 609
 610     #if $srcfeature is not given, use the Bio::Seq object itself as the srcfeature for featureloc's
 611     if (!defined $srcfeature) {
 612         $srcfeature = $uniquename;
 613         $srcfeattype = $ftype;
 614     }
 615
 616     #default data source is 'GenBank'
 617     if (!defined $datasource) {
 618         $datasource = 'GenBank';
 619     }
 620
 621     if ($datasource =~ /GenBank/i) {
 622         #sequence topology as feature_cvterm
 623         if ($seq->can('is_circular') && $seq->is_circular) {
 624             %sthash = (
 625                 "cvterm_id" => {'name' => 'circular',
 626                             'cv_id' => {
 627                             'name' => 'sequence topology',
 628                             },
 629                         },
 630                    "pub_id" => {'uniquename' => 'nullpub',
 631                             'type_id' => {
 632                             'name' => 'null pub',
 633                             'cv_id' => {
 634                                 'name'=> 'pub type',
 635                             },
 636                             },
 637                         },
 638                 );
 639         } else {
 640             %sthash = (
 641                 "cvterm_id" => { 'name' => 'linear',
 642                              'cv_id' => {
 643                              'name' => 'sequence topology',
 644                              }
 645                          },
 646                 "pub_id"    => {'uniquename' => 'nullpub',
 647                             'type_id' => {
 648                             'name' => 'null pub',
 649                             'cv_id' => {
 650                                 'name'=> 'pub type',
 651                             },
 652                             },
 653                         },
 654                    );
 655         }
 656         push(@feature_cvterms, \%sthash);
 657
 658         #division as feature_cvterm
 659             if ($seq->can('division') && defined $seq->division()) {
 660                 $div = $seq->division();
 661             %dvhash = (
 662                 "cvterm_id" => {'name' => $div,
 663                             'cv_id' => {
 664                             'name' => 'GenBank division'}},
 665                 "pub_id"    => {'uniquename' => 'nullpub',
 666                             'type_id' => {
 667                             'name' => 'null pub',
 668                             'cv_id' => {
 669                                 'name'=> 'pub type'},
 670                                 }},
 671                 );
 672             push(@feature_cvterms, \%dvhash);
 673         }
 674
 675         $datahash{'feature_cvterm'} = \@feature_cvterms;
 676     } # closes if GenBank
 677
 678     #featureprop's
 679     #DEFINITION
 680     if ($seq->can('desc') && defined $seq->desc()) {
 681         $temp = $seq->desc();
 682
 683         my %prophash = (
 684             "type_id"   => {'name' => 'description',
 685                         'cv_id' => {
 686                         'name' =>
 687                                                  $cv_name{'feature_property'}
 688                                                        },
 689                                            },
 690             "value"     => $temp,
 691             );
 692
 693         push(@top_featureprops, \%prophash);
 694         }
 695
 696     #KEYWORDS
 697     if ($seq->can('keywords')) {
 698         $temp = $seq->keywords();
 699
 700         if (defined $temp && $temp ne '.' && $temp ne '') {
 701         my %prophash = (
 702                 "type_id"   => {'name' => 'keywords',
 703                             'cv_id' => {
 704                                                   'name' =>
 705                                                    $cv_name{'feature_property'}
 706                                                            }
 707                         },
 708                 "value"     => $temp,
 709                             );
 710
 711         push(@top_featureprops, \%prophash);
 712         }
 713         }
 714
 715     #COMMENT
 716     if ($seq->can('annotation')) {
 717         $ann = $seq->annotation();
 718         foreach my $comment ($ann->get_Annotations('comment')) {
 719             $temp = $comment->as_text();
 720             #print "fcomment: $temp\n";
 721             my %prophash = (
 722                 "type_id"   => {'name' => 'comment',
 723                             'cv_id' => {
 724                                                   'name' =>
 725                                                    $cv_name{'feature_property'}
 726                                                            }
 727                                                },
 728                 "value"     => $temp,
 729                 );
 730
 731             push(@top_featureprops, \%prophash);
 732         }
 733     }
 734
 735         my @top_dbxrefs = ();
 736         #feature object from Bio::DB::SeqFeature::Store
 737         if ($seq->can('attributes')) {
 738                 my %attributes = $seq->attributes;
 739                 for my $key (keys %attributes) {
 740                     next if ($key eq 'parent_id');
 741                     next if ($key eq 'load_id');
 742
 743                     if ($key eq 'Alias') {
 744                         @featuresyns = $self->handle_Alias_tag($seq,@featuresyns);
 745                     }
 746
 747                     ###FIXME deal with Dbxref, Ontology_term,source,
 748                     elsif ($key eq 'Ontology_term') {
 749                         @top_featurecvterms = $self->handle_Ontology_tag($seq,@top_featurecvterms);
 750                     }
 751
 752                     elsif ($key eq 'dbxref' or $key eq 'Dbxref') {
 753                         @top_dbxrefs = $self->handle_dbxref($seq, $key, @top_dbxrefs);
 754                     }
 755
 756                     elsif ($key =~ /^[a-z]/) {
 757                         @top_featureprops
 758                              = $self->handle_unreserved_tags($seq,$key,@top_featureprops);
 759                     }
 760                 }
 761         }
 762         $datahash{'feature_synonym'} = \@featuresyns;
 763
 764         if ($seq->can('source')) {
 765                 @top_dbxrefs = $self->handle_source($seq,@top_dbxrefs);
 766         }
 767
 768     #accession and version as feature_dbxref
 769     if ($seq->can('accession_number') && defined $seq->accession_number && $seq->accession_number ne 'unknown') {
 770         my $db = $self->_guess_acc_db($seq, $seq->accession_number);
 771         my %acchash = (
 772                "db_id"  => {'name' => $db},
 773                "accession"  => $seq->accession_number,
 774                "version"    => $seq->seq_version,
 775                );
 776         my %fdbx = ('dbxref_id' => \%acchash);
 777         push(@top_dbxrefs, \%fdbx);
 778     }
 779
 780     if( $seq->isa('Bio::Seq::RichSeqI') && defined $seq->get_secondary_accessions() ) {
 781         my @secacc = $seq->get_secondary_accessions();
 782         my $acc;
 783         foreach $acc (@secacc) {
 784             my %acchash = (
 785                 "db_id"         => {'name' => 'GB'},
 786                 "accession" => $acc,
 787                 );
 788             my %fdbx = ('dbxref_id' => \%acchash);
 789             push(@top_dbxrefs, \%fdbx);
 790         }
 791     }
 792
 793     #GI number
 794     if( $seq->isa('Bio::Seq::RichSeqI') && defined ($seq->pid)) {
 795         my $id = $seq->pid;
 796         #print "reftype: ", ref($id), "\n";
 797
 798         #if (ref($id) eq 'HASH') {
 799         my %acchash = (
 800             "db_id"     => {'name' => 'GI'},
 801             "accession" => $id,
 802             );
 803         my %fdbx = ('dbxref_id' => \%acchash);
 804         push (@top_dbxrefs, \%fdbx);
 805     }
 806
 807     #REFERENCES as feature_pub
 808     if (defined $ann) {
 809         #get the references
 810         @references = $ann->get_Annotations('reference');
 811         foreach $ref (@references) {
 812         undef(my %pubhash);
 813         $refhash = $ref->hash_tree();
 814         $location = $ref->location || $refhash->{'location'};
 815         #print "location: $location\n";
 816
 817         #get FBrf#, special for FlyBase SEAN loading
 818         if (index($location, ' ==') >= 0) {
 819             $location =~ /\s==/;
 820                 #print "match: $MATCH\n";
 821                 #print "prematch: $PREMATCH\n";
 822                 #print "postmatch: $POSTMATCH\n";
 823             $fbrf = $PREMATCH;
 824             $location = $POSTMATCH;
 825             $location =~ s/^\s//;
 826         }
 827
 828         #print "location: $location\n";
 829         #unpublished reference
 830         if ($location =~ /Unpublished/) {
 831             $pubtype = 'unpublished';
 832             %pubhash = (
 833                 "title"     => $ref->title || $refhash->{'title'},
 834                 #"miniref"  => substr($location, 0, 255),
 835                 #"uniquename"   => $fbrf,
 836                 "type_id"   => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}}
 837                 );
 838         }
 839         #submitted
 840         elsif ($location =~ /Submitted/) {
 841             $pubtype = 'submitted';
 842
 843             %pubhash = (
 844                 "title"     => $ref->title || $refhash->{'title'},
 845                 #"miniref"  => substr($location, 0, 255),
 846                 #"uniquename"   => $fbrf,
 847                 "type_id"   => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}}
 848                 );
 849
 850             undef(my $pyear);
 851             $pyear = $self->_getSubmitYear($location);
 852             if (defined $pyear) {
 853             $pubhash{'pyear'} = $pyear;
 854             }
 855         }
 856
 857         #published journal paper
 858         elsif ($location =~ /\D+\s\d+\s\((\d+|\d+-\d+)\),\s(\d+-\d+|\d+--\d+)\s\(\d\d\d\d\)$/) {
 859             $pubtype = 'paper';
 860
 861                 #parse location to get journal, volume, issue, pages & year
 862             $location =~ /\(\d\d\d\d\)$/;
 863
 864             $year = $MATCH;
 865             my $stuff = $PREMATCH;
 866             $year =~ s/\(//; #remove the leading parenthesis
 867             $year =~ s/\)//; #remove the trailing parenthesis
 868
 869             $stuff =~ /,\s(\d+-\d+|\d+--\d+)\s$/;
 870
 871             $pages = $MATCH;
 872             $stuff = $PREMATCH;
 873             $pages =~ s/^, //; #remove the leading comma and space
 874             $pages =~ s/ $//; #remove the last space
 875
 876             $stuff =~ /\s\d+\s\((\d+|\d+-\d+)\)$/;
 877
 878             $volumeissue = $MATCH;
 879             $journal = $PREMATCH;
 880             $volumeissue =~ s/^ //; #remove the leading space
 881             $volumeissue =~ /\((\d+|\d+-\d+)\)$/;
 882             $issue = $MATCH;
 883             $volume = $PREMATCH;
 884             $issue =~ s/^\(//; #remove the leading parentheses
 885             $issue =~ s/\)$//; #remove the last parentheses
 886             $volume =~ s/^\s//; #remove the leading space
 887             $volume =~ s/\s$//; #remove the last space
 888
 889             %pubhash = (
 890                 "title"     => $ref->title || $refhash->{'title'},
 891                 "volume"    => $volume,
 892                 "issue"     => $issue,
 893                 "pyear"     => $year,
 894                 "pages"     => $pages,
 895                 #"miniref"  => substr($location, 0, 255),
 896                 #"miniref"  => ' ',
 897                 #"uniquename"   => $fbrf,
 898                 "type_id"   => {'name' => $pubtype, 'cv_id' => {'name' =>'pub type'}},
 899                 "pub_relationship" => {
 900                     'obj_pub_id' => {
 901                     'uniquename' => $journal,
 902                     'title' => $journal,
 903                     #'miniref' => substr($journal, 0, 255),
 904                     'type_id' =>{'name' => 'journal',
 905                              'cv_id' =>
 906                              {'name' => 'pub type'
 907                               },
 908                          },
 909                              #'pubprop' =>{'value'=> $journal,
 910                              #       'type_id'=>{'name' => 'abbreviation', 'cv_id' => {'name' => 'pubprop type'}},
 911                              #      },
 912                          },
 913                        'type_id' => {
 914                            'name' => 'published_in',
 915                            'cv_id' => {
 916                            'name' => 'pub relationship type'},
 917                        },
 918                 },
 919                 );
 920         }
 921
 922         #other references
 923         else {
 924             $pubtype = 'other';
 925             %pubhash = (
 926                 "title"     => $ref->title || $refhash->{'title'},
 927                 #"miniref"  => $fbrf,
 928                 "type_id"   => {
 929                     'name' => $pubtype,
 930                     'cv_id' => {'name' =>'pub type'}
 931                 }
 932                 );
 933         }
 934
 935         #pub_author
 936         my $autref = $self->_getRefAuthors($ref);
 937         if (defined $autref) {
 938             $pubhash{'pub_author'} = $autref;
 939         }
 940         # if no author and is type 'submitted' and has submitter address, use the first 100 characters of submitter address as the author lastname.
 941         else {
 942             if ($pubtype eq 'submitted') {
 943             my $autref = $self->_getSubmitAddr($ref);
 944             if (defined $autref) {
 945                 $pubhash{'pub_author'} = $autref;
 946             }
 947             }
 948         }
 949
 950         #$ref->comment as pubprop
 951         #print "ref comment: ", $ref->comment, "\n";
 952         #print "ref comment: ", $refhash->{'comment'}, "\n";
 953         if (defined $ref->comment || defined $refhash->{'comment'}) {
 954             my $comnt = $ref->comment || $refhash->{'comment'};
 955                 #print "remark: ", $comnt, "\n";
 956             $pubhash{'pubprop'} = {
 957             "type_id"       => {'name' => 'comment', 'cv_id' => {'name' => 'pubprop type'}},
 958             "value"     => $comnt,
 959             };
 960         }
 961
 962         #pub_dbxref
 963         undef(my @pub_dbxrefs);
 964         if (defined $fbrf) {
 965             push(@pub_dbxrefs, {dbxref_id => {accession => $fbrf, db_id => {'name' => 'FlyBase'}}});
 966         }
 967         if (defined ($temp = $ref->medline)) {
 968             push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'MEDLINE'}}});
 969                 #use medline # as the pub's uniquename
 970             $pubhash{'uniquename'} = $temp;
 971         }
 972         if (defined ($temp = $ref->pubmed)) {
 973             push(@pub_dbxrefs, {dbxref_id => {accession => $temp, db_id => {'name' => 'PUBMED'}}});
 974         }
 975         $pubhash{'pub_dbxref'} = \@pub_dbxrefs;
 976
 977         #if the pub uniquename is not defined or blank, put its FBrf# as its uniquename
 978         #this is unique to FlyBase
 979         #USERS OF THIS MODULE: PLEASE MODIFY HERE TO IMPLEMENT YOUR POLICY
 980         # ON PUB UNIQUENAME!!!
 981         if (!defined $pubhash{'uniquename'} || $pubhash{'uniquename'} eq '') {
 982             if (defined $fbrf) {
 983             $pubhash{'uniquename'} = $fbrf;
 984             }
 985                 #else {
 986                 #   $pubhash{'uniquename'} = $self->_CreatePubUname($ref);
 987                 #}
 988         }
 989
 990         #add to collection of references
 991         #if the pub covers the entire sequence of the top-level feature, add it to feature_pubs
 992         if (($ref->start == 1 && $ref->end == $len) || (!defined $ref->start && !defined $ref->end)) {
 993             push(@feature_pubs, {"pub_id" => \%pubhash});
 994         }
 995         #the pub is about a sub-sequence of the top-level feature
 996         #create a feature for the sub-sequence and add pub as its feature_pub
 997         #featureloc of this sub-sequence is against the top-level feature, in interbase coordinates.
 998         else {
 999             my %parf = (
1000                 'uniquename'    => $uniquename . ':' . $ref->start . "\.\." . $ref->end,
1001                 'organism_id'   =>\%organism,
1002                 'type_id'   =>{'name' =>'region', 'cv_id' => {'name' => $cv_name{'sequence'} }},
1003                 );
1004             my %parfsrcf = (
1005                     'uniquename'    => $uniquename,
1006                     'organism_id'   =>\%organism,
1007                     );
1008             my %parfloc = (
1009                    'srcfeature_id'  => \%parfsrcf,
1010                    'fmin'       => $ref->start - 1,
1011                    'fmax'       => $ref->end,
1012                    );
1013             $parf{'featureloc'} = \%parfloc;
1014             $parf{'feature_pub'} = {'pub_id' => \%pubhash};
1015             my %ffr = (
1016                    'subject_id' => \%parf,
1017                    'type_id'        => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'}}},
1018                    );
1019             push(@top_featrels, \%ffr);
1020         }
1021         }
1022         $datahash{'feature_pub'} = \@feature_pubs;
1023     }
1024
1025     ##construct srcfeature hash for use in featureloc
1026     if (defined $srcfeature) {
1027                 %srcfhash = $self->_srcf_hash($srcfeature,
1028                                               $srcfeattype,
1029                                               \%organism);
1030     #   my %fr = (
1031     #       "object_id" => \%srcfhash,
1032     #       "type_id"   => { 'name' => 'partof', 'cv_id' => { 'name' => 'relationship type'}},
1033     #       );
1034
1035     #   push (@top_featrels, \%fr);
1036     }
1037
1038     #unflatten the seq features in $seq if $seq is a gene or a DNA sequence
1039     if (($gb_type eq 'gene' || $gb_type eq 'DNA') &&
1040         !$nounflatten) {
1041         my $u = Bio::SeqFeature::Tools::Unflattener->new;
1042         $u->unflatten_seq(-seq=>$seq, -use_magic=>1);
1043     }
1044
1045     my @top_sfs = $seq->get_SeqFeatures;
1046     #print $#top_sfs, "\n";
1047
1048     #SUBFEATURES
1049
1050     if ($datasource =~ /GenBank/i) {
1051         $tag_cv = 'GenBank feature qualifier';
1052     } elsif ($datasource =~ /GFF/i) {
1053         $tag_cv = 'feature_property';
1054     } else {
1055         $tag_cv = $cv_name{'feature_property'};
1056     }
1057
1058     my $si = 0;
1059     foreach $feat (@top_sfs) {
1060         #$feat = $top_sfs[$si];
1061         #print "si: $si\n";
1062         my $prim_tag = $feat->primary_tag;
1063         #print $prim_tag, "\n";
1064
1065         # get all qualifiers of the 'source' feature, load these as top_featureprops of the top level feature
1066         if ($prim_tag eq 'source') {
1067             foreach $tag ($feat->all_tags()) {
1068                 #db_xref
1069                 if ($tag eq 'db_xref'
1070                                  or $tag eq 'Dbxref'
1071                                  or $tag eq 'dbxref')   {
1072                     my @t1 = $feat->each_tag_value($tag);
1073                     foreach $temp (@t1) {
1074                        $temp =~ /([^:]*?):(.*)/;
1075                                            my $db = $1;
1076                                            my $xref = $2;
1077                                            #PRE/POST very inefficent
1078                        #my $db = $PREMATCH;
1079                        #my $xref = $POSTMATCH;
1080                        my %acchash = (
1081                         "db_id"     => {'name' => $db},
1082                         "accession" => $xref,
1083                         );
1084                        my %fdbx = ('dbxref_id' => \%acchash);
1085                        push (@top_dbxrefs, \%fdbx);
1086                     }
1087                                 #Ontology_term
1088                                 } elsif ($tag eq 'Ontology_term') {
1089                                         my @t1 = $feat->each_tag_value($tag);
1090                                         foreach $temp (@t1) {
1091                                             ###FIXME
1092                                         }
1093                 #other tags as featureprop
1094                 } elsif ($tag ne 'gene') {
1095                     my %prophash = undef;
1096                     %prophash = (
1097                                     "type_id"       => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}},
1098                         "value"     => join(' ',$feat->each_tag_value($tag)),
1099                         );
1100                     push(@top_featureprops, \%prophash);
1101                 }
1102             }
1103
1104                         if ($feat->can('source')) {
1105                             my $source = $feat->source();
1106                             @top_dbxrefs = $self->handle_source($feat, @top_dbxrefs);
1107                         }
1108
1109             #featureloc for the top-level feature
1110             my $fmin = undef;
1111             my $fmax = undef;
1112             my $strand = undef;
1113                         my $phase = undef;
1114             my %fl = undef;
1115
1116             $fmin = $feat->start - 1;
1117             $fmax = $feat->end;
1118             $strand = $feat->strand;
1119
1120                         if ($feat->can('phase')) {
1121                             $phase = $feat->phase;
1122                         }
1123
1124             %fl = (
1125                 "srcfeature_id" => \%srcfhash,
1126                 "fmin"      => $fmin,
1127                 "fmax"      => $fmax,
1128                 "strand"    => $strand,
1129                                 "phase"         => $phase,
1130                 );
1131
1132             $datahash{'featureloc'} = \%fl;
1133
1134             #delete 'source' feature from @top_sfs
1135             splice(@top_sfs, $si, 1);
1136         }
1137         $si ++;
1138     #close loop over top_sfs
1139     }
1140
1141     #the top-level features other than 'source'
1142     foreach $feat (@top_sfs) {
1143         #print $feat->primary_tag, "\n";
1144
1145         my $r = $self->_subfeat2featrelhash($name, $ftype, $feat, \%srcfhash, $tag_cv, $isanalysis);
1146
1147         if (!($ftype eq 'mRNA' && $feat->primary_tag eq 'gene')) {
1148             my %fr = %$r;
1149             push(@top_featrels, \%fr);
1150         } else {
1151             %finaldatahash = %$r;
1152         }
1153     }
1154
1155     if (@top_dbxrefs) {
1156         $datahash{'feature_dbxref'} = \@top_dbxrefs;
1157     }
1158
1159     if (@top_featureprops) {
1160         $datahash{'featureprop'} = \@top_featureprops;
1161     }
1162
1163     if (@top_featrels) {
1164         $datahash{'feature_relationship'} = \@top_featrels;
1165     }
1166
1167         if (@top_featurecvterms) {
1168                 $datahash{'feature_cvterm'} = \@top_featurecvterms;
1169         }
1170
1171     if ($ftype eq 'mRNA' && %finaldatahash) {
1172         $finaldatahash{'feature_relationship'} = {
1173                         'subject_id'    => \%datahash,
1174                         'type_id'   => { 'name' => 'partof', 'cv_id' => { 'name' => $cv_name{'relationship'} }},
1175                              };
1176     } else {
1177         %finaldatahash = %datahash;
1178     }
1179
1180     my $mainTag = 'feature';
1181     $self->_hash2xml(undef, $mainTag, \%finaldatahash);
1182
1183     return 1;
1184 }
1185
# Recursively write one hash (a chado table record) as XML through the
# writer object stored in $self->{'writer'}.
#
# Args    : $isMatch - true when an enclosing element was opened with
#                      op="match"; every element written below it then
#                      carries op="match" as well
#           $ult     - name of the element to write for this hash
#           $ref     - hash ref; keys that name a chado table (according to
#                      $chadotables) are written as nested elements, all
#                      other keys are written as column elements
#           a fourth positional argument is accepted for backward
#           compatibility and ignored
# Returns : whatever the writer's endTag() call returns
sub _hash2xml {
    my ($self, $isMatch, $ult, $ref) = @_;
    my %mh = %$ref;

    #create the writer on first use
    if (!defined $self->{'writer'}) {
        $self->_create_writer();
    }
    my $writer = $self->{'writer'};

    #start opening tag
    #if pub record of type 'journal', form the 'ref' attribute for special pub lookup
    #requires that the journal name itself is also stored as a pubprop record for the journal
    #with value equal to the journal name and type of 'abbreviation'.
    if ($ult eq 'pub' && $mh{'type_id'}->{'name'} eq 'journal') {
        $writer->startTag($ult, 'ref' => $mh{'title'} . ':journal:abbreviation');
    }
    #special pub match if pub uniquename not known
    elsif ($ult eq 'pub' && !defined $mh{'uniquename'}) {
        $writer->startTag($ult, 'op' => 'match');
        #set the match flag, all the sub tags should also have "op"="match"
        $isMatch = 1;
    }
    #if cvterm or cv, lookup only
    elsif (($ult eq 'cvterm') || ($ult eq 'cv')) {
        $writer->startTag($ult, 'op' => 'lookup');
    }
    #if nested tables of match table, match too
    elsif ($isMatch) {
        $writer->startTag($ult, 'op' => 'match');
    }
    else {
        $writer->startTag($ult);
    }

    #A key is a nested table when it occurs in $chadotables either preceded
    #or followed by a space. Both loops below use this single predicate so
    #that every key is handled by exactly one of them (a hit at offset 0
    #counts as a table too).
    my $is_table = sub {
        my ($name) = @_;
        return index($chadotables, ' ' . $name) >= 0
            || index($chadotables, $name . ' ') >= 0;
    };
    my @keys = keys %mh;

    #first loop to produce xml for all the table columns
    foreach my $key (grep { !$is_table->($_) } @keys) {
        if ($isMatch) {
            $writer->startTag($key, 'op' => 'match');
        } else {
            $writer->startTag($key);
        }

        #the column is a foreign key: its value is a hash describing the
        #referenced record, written as a nested element
        my $fk_table = $fkey{$ult . '.' . $key};
        if (defined $fk_table) {
            $self->_hash2xml($isMatch, $fk_table, $mh{$key});
        } else {
            $writer->characters($mh{$key});
        }
        $writer->endTag($key);
    }

    #second loop to produce xml for all the nested tables
    foreach my $key (grep { $is_table->($_) } @keys) {
        my $nested = $mh{$key};
        if (ref($nested) =~ /HASH/) {
            $self->_hash2xml($isMatch, $key, $nested);
        } elsif (ref($nested) =~ /ARRAY/) {
            #one element per record in the array
            foreach my $row (@$nested) {
                $self->_hash2xml($isMatch, $key, $row);
            }
        }
    }

    #end tag
    $writer->endTag($ult);
}
1301
# Guess the name of the database an accession belongs to from its format.
#
# Args    : $seq - sequence object; its molecule() method, when available,
#                  is consulted for the protein-only patterns
#           $acc - accession string
# Returns : one of "RefSeq", "GB", "PIR", "PRF", "LocusID", "FlyBase" or
#           "unknown"
sub _guess_acc_db {
    my ($self, $seq, $acc) = @_;

    return "unknown" unless defined $acc;

    #Not every sequence object passed in offers molecule(), and it may
    #return undef; treat both cases as "not a protein" instead of dying
    #or warning. Evaluated only when a protein-only pattern is reached.
    my $is_protein = sub {
        return 0 unless $seq->can('molecule');
        my $molecule = $seq->molecule();
        return (defined $molecule && $molecule eq 'protein') ? 1 : 0;
    };

    #NOTE(review): every accession matching the PIR pattern also matches the
    #GB pattern tested before it, and CG followed by five or more digits is
    #reported as GB rather than FlyBase. The original precedence is kept;
    #confirm whether it is intended.
    if ($acc =~ /^(?:NM|NP|NT|NC|XM|XP|XR)_\d{6}/) {
        return "RefSeq";
    } elsif ($acc =~ /^[a-zA-Z]{1,2}\d{5,6}/) {
        return "GB";
    } elsif ($acc =~ /^[a-zA-Z]\d{5}/ && $is_protein->()) {
        return "PIR";
    } elsif ($acc =~ /^\d{6,7}[a-zA-Z]/ && $is_protein->()) {
        return "PRF";
    } elsif ($acc =~ /\d+/ && $acc !~ /[a-zA-Z]/) {
        return "LocusID";
    } elsif ($acc =~ /^CG\d+/ || $acc =~ /^FB[a-z][a-z]\d+/) {
        return "FlyBase";
    } else {
        return "unknown";
    }
}
1326
# Convert a sequence feature, together with all of its nested sub-features,
# into the nested hash structure that _hash2xml writes out.
#
# Args    : $genename   - gene name used when a uniquename has to be generated
#                         for the feature; replaced by the feature's own
#                         locus_tag or gene tag before recursing
#           $seqtype    - type of the enclosing feature/record
#           $feat       - the feature to convert
#           $r          - hash ref for featureloc.srcfeature_id
#           $tag_cv     - cv name used for featureprop types
#           $isanalysis - is_analysis value applied to gene model features
# Returns : hash ref. For a gene feature inside an mRNA record this is the
#           feature hash itself; otherwise it is a feature_relationship hash
#           (subject_id, type_id) wrapping the feature hash.
sub _subfeat2featrelhash {
    my ($self, $genename, $seqtype, $feat, $r, $tag_cv, $isanalysis) = @_;
    my %srcf = %$r;     #srcfeature hash for featureloc.srcfeature_id

    my $prim_tag = $feat->primary_tag;

    #subfeature uniquename: /symbol, else /label, else generated as
    #<genename>-<feature-type>-<span>
    my $sfunique;
    if ($feat->has_tag('symbol')) {
        ($sfunique) = $feat->each_tag_value('symbol');
    } elsif ($feat->has_tag('label')) {
        ($sfunique) = $feat->each_tag_value('label');
    } else {
        $sfunique = $self->_genFeatUniqueName($genename, $feat);
    }

    #subfeature name
    my $sfname;
    if ($feat->has_tag('Name')) {
        ($sfname) = $feat->each_tag_value('Name');
    }

    #feature type translation
    my $sftype = defined $feattype_args2so{$prim_tag}
               ? $feattype_args2so{$prim_tag}
               : $prim_tag;

    #mutations are typed by whether they span a single position or a range
    if ($prim_tag eq 'mutation') {
        $sftype = ($feat->start == $feat->end)
                ? $feattype_args2so{'mutation1'}
                : $feattype_args2so{'mutation2'};
    }

    #set is_analysis flag for gene model features
    my $isanal;
    if ($sftype eq 'gene' || $sftype eq 'mRNA' || $sftype eq 'exon'
        || $sftype eq 'protein' || $sftype eq 'polypeptide') {
        $isanal = $isanalysis;
    }

    my %sfhash = (
        "name"        => $sfname,
        "uniquename"  => $sfunique,
        "organism_id" => \%organism,
        "type_id"     => { 'name' => $sftype, 'cv_id' => { 'name' => $cv_name{'sequence'} }},
        "is_analysis" => $isanal || 'false',
        );

    #featureloc for subfeatures; fmin is start - 1
    my $sfmin    = $feat->start - 1;
    my $sfmax    = $feat->end;
    my $sfstrand = $feat->strand();
    my $sfphase;
    if ($feat->can('phase')) {
        $sfphase = $feat->phase;
    }

    #if the gene feature in an mRNA record, cannot use its coordinates, omit featureloc
    if (!($seqtype eq 'mRNA' && $sftype eq 'gene')) {
        my $is_sfmin_partial;
        my $is_sfmax_partial;
        if ($feat->location->isa('Bio::Location::FuzzyLocationI')) {
            if ($feat->location->start_pos_type() ne 'EXACT') {
                $is_sfmin_partial = 'true';
            }
            if ($feat->location->end_pos_type() ne 'EXACT') {
                $is_sfmax_partial = 'true';
            }
        }

        $sfhash{'featureloc'} = {
            "srcfeature_id"   => \%srcf,
            "fmin"            => $sfmin,
            "is_fmin_partial" => $is_sfmin_partial || 'false',
            "fmax"            => $sfmax,
            "is_fmax_partial" => $is_sfmax_partial || 'false',
            "strand"          => $sfstrand,
            "phase"           => $sfphase,
        };
    }

    #subfeature tags
    my @sfdbxrefs;          #subfeature dbxrefs
    my @sub_featureprops;   #subfeature props
    my @sub_featuresyns;    #subfeature synonyms
    my @sub_featurecvterms; #subfeature cvterms
    foreach my $tag ($feat->all_tags()) {
        #feature_dbxref for features
        if ($tag eq 'db_xref' or $tag eq 'dbxref' or $tag eq 'Dbxref') {
            foreach my $value ($feat->each_tag_value($tag)) {
                #split at the first colon into database name and accession;
                #a value without a colon is skipped so that no match
                #variables left over from an earlier regex are used
                if ($value !~ /\A([^:]*):(.*)\z/s) {
                    $self->warn("dbxref value '$value' is not of the form <db>:<accession>; skipping it.");
                    next;
                }
                my ($db, $xref) = ($1, $2);
                push(@sfdbxrefs, {
                    'dbxref_id' => {
                        "db_id"     => {'name' => $db},
                        "accession" => $xref,
                    },
                });
            }
        #Alias tags
        } elsif ($tag eq 'Alias') {
            @sub_featuresyns = $self->handle_Alias_tag($feat, @sub_featuresyns);
        } elsif ($tag eq 'Ontology_term') {
            @sub_featurecvterms = $self->handle_Ontology_tag($feat, @sub_featurecvterms);
        #featureprop for features, excluding GFF Name & Parent tags
        } elsif ($tag ne 'gene' && $tag ne 'symbol' && $tag ne 'Name' && $tag ne 'Parent') {
            next if ($tag eq 'parent_id');
            next if ($tag eq 'load_id');
            #one featureprop per tag value
            foreach my $val ($feat->each_tag_value($tag)) {
                push(@sub_featureprops, {
                    "type_id" => {'name' => $tag, 'cv_id' => {'name' => $tag_cv}},
                    "value"   => $val,
                });
            }
        }
    }

    if ($feat->can('source')) {
        @sfdbxrefs = $self->handle_source($feat, @sfdbxrefs);
    }

    #only attach the collections that actually have content
    if (@sub_featureprops) {
        $sfhash{'featureprop'} = \@sub_featureprops;
    }
    if (@sfdbxrefs) {
        $sfhash{'feature_dbxref'} = \@sfdbxrefs;
    }
    if (@sub_featuresyns) {
        $sfhash{'feature_synonym'} = \@sub_featuresyns;
    }
    if (@sub_featurecvterms) {
        $sfhash{'feature_cvterm'} = \@sub_featurecvterms;
    }

    #the feature's own locus_tag or gene tag, when present, becomes the gene
    #name handed down to its sub-features
    if ($feat->has_tag('locus_tag')) {
        ($genename) = $feat->each_tag_value('locus_tag');
    } elsif ($feat->has_tag('gene')) {
        ($genename) = $feat->each_tag_value('gene');
    }

    #recurse into the sub-features
    my @ssfeatrel;
    foreach my $sf ($feat->get_SeqFeatures()) {
        my $rref = $self->_subfeat2featrelhash($genename, $sftype, $sf, \%srcf, $tag_cv, $isanalysis);
        if (defined $rref) {
            push(@ssfeatrel, $rref);
        }
    }

    if (@ssfeatrel) {
        $sfhash{'feature_relationship'} = \@ssfeatrel;
    }

    #subj-obj relationship type
    my $reltypename = return_reltypename($sftype);

    my %fr = (
        "subject_id" => \%sfhash,
        "type_id"    => { 'name'  => $reltypename,
                          'cv_id' => { 'name' => $cv_name{'relationship'} }},
        );

    if ($seqtype eq 'mRNA' && $sftype eq 'gene') {
        return \%sfhash;
    } else {
        return \%fr;
    }
}
1530
#build the uniquename of a feature as <genename>-<feature-type>-<span>,
#e.g. foo-mRNA-10..1000
#the feature's own locus_tag tag, or failing that its gene tag, takes
#precedence over the gene name passed in
sub _genFeatUniqueName {
    my ($self, $genename, $feat) = @_;

    my $type = $feat->primary_tag;
    my $from = $feat->start;
    my $to   = $feat->end;

    #first tag present on the feature wins
    foreach my $tag (qw(locus_tag gene)) {
        next unless $feat->has_tag($tag);
        ($genename) = $feat->each_tag_value($tag);
        last;
    }

    return $genename . '-' . $type . '-' . $from . "\.\." . $to;
}
1551
1552 #create uniquename for pubs with no medline id and no FBrf#
1553 #use "<authors>, <year>, <type>" as the uniquename (same as miniref)
1554 #<authors> is <sole-author-surname>    if one author,
1555 #  or <first-author-surname> and <second-author-surname>   if two,
1556 #  or <first-author-surname> et al.   if more
1557 #sub _CreatePubUname {
1558 #   my $self = shift;
1559 #   my $pub = shift;
1560 #   undef(my $pubuname);
1561 #
1562 #   return $pubuname;
1563 #}
1564
#get authors of a reference
#Args    : $ref - reference object; its authors() string is parsed
#Returns : ref to an array of {author_id => {surname, givennames}, arank}
#          hashes, one per author in citation order, or nothing when the
#          reference has no authors (undefined, empty or '.'), so that the
#          caller can fall back to another source for the author
sub _getRefAuthors {
    my ($self, $ref) = @_;

    my $authors_line = $ref->authors;

    #no authors, Bio::SeqIO::genbank doesn't pick up 'CONSRTM' line.
    return if !defined $authors_line
           || $authors_line eq ''
           || $authors_line eq '.';

    #the last author is separated by ' and ', the others by ', '
    my @names;
    my $and_at = index($authors_line, ' and ');
    if ($and_at > 0) {
        @names = split(/\, /, substr($authors_line, 0, $and_at));
        push(@names, substr($authors_line, $and_at + length(' and ')));
    } else {
        @names = split(/\, /, $authors_line);
    }

    my @aut;
    my $rank = 0;
    foreach my $name (@names) {
        $rank++;
        #parse the author lastname and givennames
        my ($last, $given);
        if (index($name, ',') > 0) {        #genbank format, last,f.m.
            ($last, $given) = split(/\,/, $name);
        } elsif (index($name, ' ') > 0) {   #embl format, last f.m.
            ($last, $given) = split(/ /, $name);
        } else {
            #a single-token name: keep it as the surname rather than
            #producing an author with no name at all
            $last = $name;
        }
        push(@aut, {
            author_id => { 'surname' => $last, 'givennames' => $given },
            arank     => $rank,
        });
    }

    return \@aut;
}
1614
#extract submission year from the citation of the submitted reference
#genbank format for the submitted citation: JOURNAL   Submitted (DD-MON-YYYY) submitter address
#Args    : the citation string
#Returns : the four-digit submission year; nothing (after a warning) when
#          the citation is not a "Submitted" citation or its date cannot
#          be parsed
sub _getSubmitYear {
    my $self = shift;
    my $citation = shift;

    if (!defined $citation || $citation !~ /Submitted/) {
        $self->warn("not citation for a submitted reference. cannot extract submission year.");
        return;
    }

    #capture the year directly from a checked match; reading $MATCH after
    #an unchecked match would return the text of an earlier successful
    #match ("Submitted") whenever the date is malformed
    if ($citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-(\d{4})\)/) {
        return $1;
    }

    $self->warn("cannot extract submission year from citation '$citation'.");
    return;
}
1633
#extract the submitter address from the citation of a submitted reference
#genbank format for the submitted citation: JOURNAL   Submitted (DD-MON-YYYY) submitter address
#Args    : a reference object whose location() method returns the citation
#Returns : ref to a hash {author_id => {surname => <address>}} with the
#          address truncated to 100 characters; nothing when the citation
#          is not a "Submitted" citation or carries no address
sub _getSubmitAddr {
    my $self = shift;
    my $ref = shift;

    my $citation = $ref->location;
    if (!defined $citation || $citation !~ /Submitted/) {
        $self->warn("not citation for a submitted reference. cannot extract submitter address.");
        return;
    }

    #the address is whatever follows the date; captured explicitly instead
    #of being read from $POSTMATCH after an unchecked match
    my ($addr) = $citation =~ /Submitted \(\d\d-[a-zA-Z]{3}-\d{4}\)(.*)/s;

    #no parsable date: use everything after "Submitted", which is what the
    #stale $POSTMATCH used to provide in this case
    ($addr) = $citation =~ /Submitted(.*)/s unless defined $addr;

    $addr =~ s/^\s//;

    #nothing but whitespace left: there is no address to report
    return if $addr !~ /\S/;

    my %author = (
        'author_id' => { 'surname' => substr($addr, 0, 100) },
    );
    return \%author;
}
1657
1658 =head2 suppress_residues
1659
1660  Title    : suppress_residues
1661  Usage    : $obj->suppress_residues()        #get existing value
1662             $obj->suppress_residues($newval) #set new value
1663  Function : Keep track of the flag to suppress printing of residues in the
1664             chadoxml file. The default is to allow all residues to go into the
1665             file.
1666  Returns  : value of suppress_residues (a scalar)
1667  Args     : new value of suppress_residues (to set)
1668
1669 =cut
1670
sub suppress_residues {
    # Unpack unconditionally: "my $x = shift if @_" has undefined behaviour
    # (see perlsyn), because the declaration is skipped at run time when
    # the accessor is called as a getter.
    my ($self, $suppress_residues) = @_;

    # Only a defined argument updates the flag, so an undef argument acts
    # as a plain read, exactly like calling with no argument.
    $self->{'suppress_residues'} = $suppress_residues
        if defined $suppress_residues;

    return $self->{'suppress_residues'};
}
1677
1678 =head2 allow_residues
1679
1680  Title    : allow_residues
1681  Usage    : $obj->allow_residues()        #get existing value
1682             $obj->allow_residues($feature_type) #set new value
1683  Function : Track the allow_residues type.  This can be used in conjunction
1684             with the suppress_residues flag to only allow residues from a
1685             specific feature type to be printed in the xml file, for example,
1686             only printing chromosome residues. When suppress_residues is set to
1687             true, then only chromosome features would go into the xml
1688             file. If suppress_residues is not set, this function has no effect
1689             (since the default is to put all residues in the xml file).
1690  Returns  : value of allow_residues (string that corresponds to a feature type)
1691  Args     : new value of allow_residues (to set)
1692  Status   :
1693
1694 =cut
1695
sub allow_residues {
    # Unpack unconditionally: "my $x = shift if @_" has undefined behaviour
    # (see perlsyn), because the declaration is skipped at run time when
    # the accessor is called as a getter.
    my ($self, $allow_residues) = @_;

    # Only a defined argument replaces the stored feature type, so an undef
    # argument acts as a plain read.
    $self->{'allow_residues'} = $allow_residues
        if defined $allow_residues;

    return $self->{'allow_residues'};
}
1702
1703 =head2 return_ftype_hash
1704
1705  Title    : return_ftype_hash
1706  Usage    : $obj->return_ftype_hash()
1707  Function : A simple hash where returning it has been factored out of the main
1708             code to allow subclasses to override it.
1709  Returns  : A hash that indicates what the name of the SO term is and what
1710             the name of the Sequence Ontology is in the cv table.
1711  Args     : The string that represents the SO term.
1712  Status   :
1713
1714 =cut
1715
sub return_ftype_hash {
    my ($self, $ftype) = @_;

    # Pair the SO term name with the cv name registered under 'sequence'
    # in %cv_name.
    my %so_type;
    $so_type{"name"}  = $ftype;
    $so_type{"cv_id"} = { "name" => $cv_name{'sequence'} };

    return %so_type;
}
1723
1724 =head2 return_reltypename
1725
1726  Title    : return_reltypename
1727  Usage    : $obj->return_reltypename
1728  Function : Return the appropriate relationship type name depending on the
1729             feature type (typically part_of, but derives_from for polypeptide).
1730  Returns  : A relationship type name.
1731  Args     : A SO type name.
1732  Status   :
1733
1734 =cut
1735
sub return_reltypename {
    my ($self, $sftype) = @_;

    # Protein-level features derive from their parent; everything else is
    # a part of it.
    return 'derives_from'
        if $sftype eq 'protein' || $sftype eq 'polypeptide';

    return 'part_of';
}
1749
1750 =head2 next_seq
1751
1752  Title    : next_seq
1753  Usage    : $obj->next_seq
1754  Function :
1755  Returns  :
1756  Args     :
1757  Status   : Not implemented (write only adaptor)
1758
1759 =cut
1760
sub next_seq {
    my $self = shift;
    my %argv = @_;    # arguments are accepted but never consulted

    # Reading is unsupported: always raise through the object's throw().
    $self->throw('next_seq is not implemented; this is a write-only adapter.');
}
1767
1768 =head2 _create_writer
1769
1770  Title    : _create_writer
1771  Usage    : $obj->_create_writer
1772  Function : Creates XML::Writer object and writes start tag
1773  Returns  : Nothing, though the writer persists as part of the chadoxml object
1774  Args     : None
1775  Status   :
1776
1777 =cut
1778
sub _create_writer {
    my ($self) = @_;

    # Build the writer on the object's filehandle and store it on the
    # object straight away, before any output is produced.
    my $writer = $self->{'writer'} = XML::Writer->new(
        OUTPUT      => $self->_fh,
        DATA_MODE   => 1,
        DATA_INDENT => 3,
    );

    # Header: XML declaration followed by the credits comment.
    $writer->xmlDecl("UTF-8");
    $writer->comment(
        "created by Peili Zhang, Flybase, Harvard University\n"
      . "and Scott Cain, GMOD, Cold Spring Harbor Laboratory"
    );

    # Open the root element; close_chadoxml() writes the matching end tag.
    $writer->startTag('chado');

    return;
}
1796
1797 =head2 close_chadoxml
1798
1799  Title    : close_chadoxml
1800  Usage    : $obj->close_chadoxml
1801  Function : Writes the closing xml tag
1802  Returns  : None
1803  Args     : None
1804  Status   :
1805
1806 =cut
1807
sub close_chadoxml {
    my ($self) = @_;

    # Emit the end tag of the root 'chado' element on the stored writer.
    my $writer = $self->{'writer'};
    $writer->endTag('chado');

    return;
}
1814
1815 =head2 handle_unreserved_tags
1816
1817  Title    : handle_unreserved_tags
1818  Usage    : $obj->handle_unreserved_tags
1819  Function : Converts tag value pairs to xml-ready hashrefs
1820  Returns  : The array containing the hashrefs
1821  Args     : In order: the Seq or SeqFeature object, the key, and the hasharray
1822  Status   :
1823
1824 =cut
1825
sub handle_unreserved_tags {
    my ($self, $seq, $key, @props) = @_;

    # Each value stored under $key becomes one property hashref whose type
    # is the tag name within the cv registered as 'feature_property'.
    push @props, map {
        +{
            "type_id" => {
                'name'  => $key,
                'cv_id' => { 'name' => $cv_name{'feature_property'} },
            },
            "value" => $_,
        }
    } $seq->attributes($key);

    return @props;
}
1845
1846 =head2 handle_Alias_tag
1847
1848  Title    : handle_Alias_tag
1849  Usage    : $obj->handle_Alias_tag
1850  Function : Convert Alias values to synonym hash refs
1851  Returns  : An array of synonym hash tags
1852  Args     : The seq or seqFeature object and the synonym hash array
1853  Status   :
1854
1855 =cut
1856
sub handle_Alias_tag {
    my ($self, $seq, @synonyms) = @_;

    # Each Alias value becomes an 'exact' synonym (name and sgml form are
    # the same string) paired with the 'null' publication.
    push @synonyms, map {
        +{
            'synonym_id' => {
                "type_id" => {
                    'name'  => 'exact',
                    'cv_id' => { 'name' => 'synonym_type' },
                },
                "name"         => $_,
                "synonym_sgml" => $_,
            },
            'pub_id' => {
                'uniquename' => 'null',
                'type_id'    => {
                    'name'  => 'null',
                    'cv_id' => { 'name' => 'null' },
                },
            },
        }
    } $seq->attributes('Alias');

    return @synonyms;
}
1883
1884 =head2 handle_Ontology_tag
1885
1886  Title    : handle_Ontology_tag
1887  Usage    : $obj->handle_Ontology_tag
1888  Function : Convert Ontology_term values to ontology term hash refs
1889  Returns  : An array of ontology term hash refs
1890  Args     : The seq or seqFeature object and the ontology term array
1891  Status   :
1892
1893 =cut
1894
sub handle_Ontology_tag  {
    my ($self, $seq, @arr) = @_;

    for my $term ($seq->attributes('Ontology_term')) {
        # A term must look like DB:accession. Anything else used to be
        # pushed as {cvterm_id => undef}; warn and skip it instead so no
        # empty term reaches the output.
        unless ($term =~ /(\S+):(\S+)/) {
            $self->warn("I don't know how to handle an Ontology_term like $term; skipping it");
            next;
        }
        my ($db, $acc) = ($1, $2);

        my $hashref = {
            'cvterm_id' => {
                'dbxref_id' => {
                    'db_id'     => { 'name' => $db },
                    'accession' => $acc,
                },
            },
        };
        push(@arr, { cvterm_id => $hashref });
    }

    return @arr;
}
1920
1921 =head2 handle_dbxref
1922
1923  Title    : handle_dbxref
1924  Usage    : $obj->handle_dbxref
1925  Function : Convert Dbxref values to dbxref hashref
1926  Returns  : An array of dbxref hashrefs
1927  Args     : A seq or seqFeature object, the tag name, and the dbxref array
1928  Status   :
1929
1930 =cut
1931
sub handle_dbxref {
    my ($self, $seq, $tag, @arr) = @_;

    for my $term ($seq->attributes($tag)) {
        # Every value must look like DB:accession.
        $self->throw("I don't know how to handle a dbxref like $term")
            unless $term =~ /(\S+):(\S+)/;
        my ($db, $acc) = ($1, $2);

        # A dot in the accession splits off the version; it defaults to 1.
        my $version = 1;
        ($acc, $version) = ($1, $2) if $acc =~ /(\S+)\.(\S+)/;

        my %dbxref = (
            'db_id'     => { 'name' => $db },
            'accession' => $acc,
            'version'   => $version,
        );
        push @arr, { 'dbxref_id' => { 'dbxref_id' => \%dbxref } };
    }

    return @arr;
}
1964
1965 =head2 handle_source
1966
1967  Title    : handle_source
1968  Usage    : $obj->handle_source
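1969  Function : Convert the source of a seq or seqFeature object to a GFF_source dbxref hashref
1970  Returns  : The dbxref array, with the source dbxref hashref appended when a source is set
1971  Args     : A seq or seqFeature object and the dbxref array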
1972  Status   :
1973
1974 =cut
1975
sub handle_source {
    my ($self, $seq, @arr) = @_;

    my $source = $seq->source();

    # A true source value is recorded as an accession in the 'GFF_source'
    # db; otherwise the incoming list is handed back untouched.
    if ($source) {
        my %dbxref = (
            'db_id'     => { 'name' => 'GFF_source' },
            'accession' => $source,
        );
        push @arr, { 'dbxref_id' => { 'dbxref_id' => \%dbxref } };
    }

    return @arr;
}
1994
1995 =head2 _srcf_hash
1996
1997  Title    : _srcf_hash
1998  Usage    : $obj->_srcf_hash
1999  Function : Creates the srcfeature hash for use in featureloc hashes
2000  Returns  : The srcfeature hash
2001  Args     : The srcfeature name, the srcfeature type and a reference to the
2002             organism hash.
2003  Status   :
2004
2005 =cut
2006
sub _srcf_hash {
    my ($self, $srcf, $stype, $orgref) = @_;

    # The source feature type is looked up in the cv registered under
    # 'sequence' in %cv_name.
    my %type_id = (
        'name'  => $stype,
        'cv_id' => { 'name' => $cv_name{'sequence'} },
    );

    my %srcfeature = (
        'uniquename'  => $srcf,
        'organism_id' => $orgref,
        'type_id'     => \%type_id,
    );

    return %srcfeature;
}
2022
2023
2024 1;