scripts/Bio-DB-GFF/bp_genbank2gff3.pl

   1 #!/usr/bin/perl
   2
   3
   4
   5 =pod
   6
   7 =head1 NAME
   8
   9 bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3
  10
  11 =head1 SYNOPSIS
  12
  13   bp_genbank2gff3.pl [options] filename(s)
  14
  15   # process a directory containing GenBank flatfiles
  16   perl bp_genbank2gff3.pl --dir path_to_files --zip
  17
  18   # process a single file, ignore explicit exons and introns
  19   perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz
  20
  21   # process a list of files
  22   perl bp_genbank2gff3.pl *gbk.gz
  23
  24   # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader
  25   curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \
  26   | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \
  27   | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata
  28
  29     Options:
  30         --noinfer  -r  don't infer exon/mRNA subfeatures
  31         --conf     -i  path to the curation configuration file that contains user preferences
  32                        for Genbank entries (must be YAML format)
  33                        (if --manual is passed without --ini, user will be prompted to
  34                         create the file if any manual input is saved)
  35         --sofile  -l  path to to the so.obo file to use for feature type mapping
  36                        (--sofile live will download the latest online revision)
  37         --manual   -m  when trying to guess the proper SO term, if more than
  38                        one option matches the primary tag, the converter will
  39                        wait for user input to choose the correct one
  40                        (only works with --sofile)
  41         --dir      -d  path to a list of genbank flatfiles
  42         --outdir   -o  location to write GFF files (can be 'stdout' or '-' for pipe)
  43         --zip      -z  compress GFF3 output files with gzip
  44         --summary  -s  print a summary of the features in each contig
  45         --filter   -x  genbank feature type(s) to ignore
  46         --split    -y  split output to separate GFF and fasta files for
  47                        each genbank record
  48         --nolump   -n  separate file for each reference sequence
  49                        (default is to lump all records together into one
  50                        output file for each input file)
  51         --ethresh  -e  error threshold for unflattener
  52                        set this high (>2) to ignore all unflattener errors
  53         --[no]CDS  -c  Keep CDS-exons, or convert to alternate gene-RNA-protein-exon
  54                        model. --CDS is default. Use --CDS to keep default GFF gene model,
  55                        use --noCDS to convert to g-r-p-e.
  56         --format   -f  Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work
  57                        (GenBank is default)
  58         --GFF_VERSION  3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available
  59         --quiet        don't talk about what is being processed
  60         --typesource   SO sequence type for source (e.g. chromosome; region; contig)
  61         --help     -h  display this message
  62
  63
  64 =head1 DESCRIPTION
  65
  66 This script uses Bio::SeqFeature::Tools::Unflattener and
  67 Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene
  68 containment hierarchies mapped for optimal display in gbrowse.
  69
  70 The input files are assumed to be gzipped GenBank flatfiles for refseq
  71 contigs.  The files may contain multiple GenBank records.  Either a
  72 single file or an entire directory can be processed.  By default, the
  73 DNA sequence is embedded in the GFF but it can be saved into separate
  74 fasta file with the --split(-y) option.
  75
  76 If an input file contains multiple records, the default behaviour is
  77 to dump all GFF and sequence to a file of the same name (with .gff
  78 appended).  Using the 'nolump' option will create a separate file for
  79 each genbank record.  Using the 'split' option will create separate
  80 GFF and Fasta files for each genbank record.
  81
  82
  83 =head2 Notes
  84
  85 =head3 'split' and 'nolump' produce many files
  86
  87 In cases where the input files contain many GenBank records (for
  88 example, the chromosome files for the mouse genome build), a very
  89 large number of output files will be produced if the 'split' or
  90 'nolump' options are selected.  If you do have lists of files E<gt> 6000,
  91 use the --long_list option in bp_bulk_load_gff.pl or
  92 bp_fast_load_gff.pl to load the gff and/ or fasta files.
  93
  94 =head3 Designed for RefSeq
  95
  96 This script is designed for RefSeq genomic sequence entries.  It may
  97 work for third party annotations but this has not been tested.
  98 But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl
  99 if you don't mind some gene model unflattener errors (dgg).
 100
 101 =head3 G-R-P-E Gene Model
 102
 103 Don Gilbert worked this over with needs to produce GFF3 suited to
 104 loading to GMOD Chado databases.  Most of the changes I believe are
 105 suited for general use.  One main chado-specific addition is the
 106   --[no]cds2protein  flag
 107
 108 My favorite GFF is to set the above as ON by default (disable with --nocds2prot)
 109 For general use it probably should be OFF, enabled with --cds2prot.
 110
 111 This writes GFF with an alternate, but useful Gene model,
 112 instead of the consensus model for GFF3
 113
 114   [ gene > mRNA> (exon,CDS,UTR) ]
 115
 116 This alternate is
 117
 118   gene > mRNA > polypeptide > exon
 119
 120 means the only feature with dna bases is the exon.  The others
 121 specify only location ranges on a genome.  Exon of course is a child
 122 of mRNA and protein/peptide.
 123
 124 The protein/polypeptide feature is an important one, having all the
 125 annotations of the GenBank CDS feature, protein ID, translation, GO
 126 terms, Dbxrefs to other proteins.
 127
 128 UTRs, introns, CDS-exons are all inferred from the primary exon bases
 129 inside/outside appropriate higher feature ranges.   Other special gene
 130 model features remain the same.
 131
 132 Several other improvements and bugfixes, minor but useful are included
 133
 134   * IO pipes now work:
 135     curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ...
 136
 137   * GenBank main record fields are added to source feature, e.g. organism, date,
 138     and the sourcetype, commonly chromosome for  genomes, is used.
 139
 140   * Gene Model handling for ncRNA, pseudogenes are added.
 141
 142   * GFF header is cleaner, more informative.
 143     --GFF_VERSION flag allows choice of v2 as well as default v3
 144
 145   * GFF ##FASTA inclusion is improved, and
 146     CDS translation sequence is moved to FASTA records.
 147
 148   * FT -> GFF attribute mapping is improved.
 149
 150   * --format choice of SeqIO input formats (GenBank default).
 151     Uniprot/Swissprot and EMBL work and produce useful GFF.
 152
 153   * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions
 154       and more flexible usage.
 155
 156 =head1 TODO
 157
 158 =head2 Are these additions desired?
 159
 160  * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY
 161  * handle Entrezgene, other non-sequence SeqIO structures (really should change
 162     those parsers to produce consistent annotation tags).
 163
 164 =head2 Related bugfixes/tests
 165
 166 These items from Bioperl mail were tested (sample data generating
 167 errors), and found corrected:
 168
 169  From: Ed Green <green <at> eva.mpg.de>
 170  Subject: genbank2gff3.pl on new human RefSeq
 171  Date: 2006-03-13 21:22:26 GMT
 172    -- unspecified errors (sample data works now).
 173
 174  From: Eric Just <e-just <at> northwestern.edu>
 175  Subject: genbank2gff3.pl
 176  Date: 2007-01-26 17:08:49 GMT
 177    -- bug fixed in genbank2gff3 for multi-record handling
 178
 179 This error is for a /trans_splice gene that is hard to handle, and
 180 unflattner/genbank2 doesn't
 181
 182  From: Chad Matsalla <chad <at> dieselwurks.com>
 183  Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?
 184  Date: 2005-07-15 19:51:48 GMT
 185
 186 =head1 AUTHOR
 187
 188 Sheldon McKay (mckays@cshl.edu)
 189
 190 Copyright (c) 2004 Cold Spring Harbor Laboratory.
 191
 192 =head2 AUTHOR of hacks for GFF2Chado loading
 193
 194 Don Gilbert (gilbertd@indiana.edu)
 195
 196
 197 =cut
 198
 199 use strict;
 200 use warnings;
 201
 202 use lib "$ENV{HOME}/bioperl-live";
 203 # chad put this here to enable situations when this script is tested
 204 # against bioperl compiled into blib along with other programs using blib
 205 BEGIN {
 206         unshift(@INC,'blib/lib');
 207 };
 208 use Pod::Usage;
 209 use Bio::Root::RootI;
 210 use Bio::SeqIO;
 211 use File::Spec;
 212 use Bio::SeqFeature::Tools::Unflattener;
 213 use Bio::SeqFeature::Tools::TypeMapper;
 214 use Bio::SeqFeature::Tools::IDHandler;
 215 use Bio::Location::SplitLocationI;
 216 use Bio::Location::Simple;
 217 use Bio::Tools::GFF;
 218 use Getopt::Long;
 219 use List::Util qw(first);
 220 use Bio::OntologyIO;
 221 use YAML qw(Dump LoadFile DumpFile);
 222 use File::Basename;
 223
 224 use vars qw/$split @filter $zip $outdir $help $ethresh
 225             $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT
 226             $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE
 227             $file @files $dir $summary $nolump
 228             $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION
 229             $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;
 230
 231 use constant SO_URL =>
 232     'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';
 233 use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)];
 234 use constant ALPHABET_TO_NUMBER => {
 235     a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8,
 236     j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16,
 237     r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24,
 238     z => 25,
 239     };
 240 use constant ALPHABET_DIVISOR => 26;
 241 use constant GM_NEW_TOPLEVEL => 2;
 242 use constant GM_NEW_PART => 1;
 243 use constant GM_DUP_PART => 0;
 244 use constant GM_NOT_PART => -1;
 245
 246 # Options cycle in multiples of 2 because of left side/right side pairing.
 247 # You can make this number odd, but displayed matches will still round up
 248 use constant OPTION_CYCLE => 6;
 249
 250 $GFF_VERSION = 3; # allow v2 ...
 251 $verbose = 1; # right default? -nov to turn off
 252
 253 # dgg: change the gene model to  Gene/mRNA/Polypeptide/exons...
 254 my $CDSkeep= 1; # default should be ON (prior behavior), see gene_features()
 255 my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep;
 256   # protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support
 257
 258 my $FORMAT="GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID **
 259 my $SOURCEID= $FORMAT; # "UniProt" "GenBank"  "EMBL" should work
 260    # other Bio::SeqIO formats may work.  TEST: EntrezGene < problematic tags; InterPro  KEGG
 261
 262
 263 my %TAG_MAP = (
 264   db_xref => 'Dbxref',
 265   name => 'Name',
 266   note => 'Note', # also pull GO: ids into Ontology_term
 267   synonym => 'Alias',
 268   symbol => 'Alias',  # is symbol still used?
 269   # protein_id => 'Dbxref', also seen Dbxref tags: EC_number
 270   # translation: handled in gene_features
 271 );
 272
 273
 274 $| = 1;
 275 my $quiet= !$verbose;
 276 my $ok= GetOptions( 'd|dir|input:s'   => \$dir,
 277             'z|zip'     => \$zip,
 278             'h|help'    => \$help,
 279             's|summary' => \$summary,
 280             'r|noinfer' => \$noinfer,
 281             'i|conf=s' => \$CONF,
 282             'sofile=s' => \$SO_FILE,
 283             'm|manual' => \$MANUAL,
 284             'o|outdir|output:s'=> \$outdir,
 285             'x|filter:s'=> \@filter,
 286             'y|split'   => \$split,
 287             "ethresh|e=s"=>\$ethresh,
 288             'c|CDS!'    => \$CDSkeep,
 289             'f|format=s' => \$FORMAT,
 290             'typesource=s' => \$source_type,
 291             'GFF_VERSION=s' => \$GFF_VERSION,
 292             'quiet!'    => \$quiet, # swap quiet to verbose
 293             'DEBUG!'    => \$DEBUG,
 294             'n|nolump'  => \$nolump);
 295
 296 my $lump = 1 unless $nolump || $split;
 297 $verbose= !$quiet;
 298
 299 # look for help request
 300 pod2usage(2) if $help || !$ok;
 301
 302 # keep SOURCEID as-is and change FORMAT for SeqIO types;
 303 # note SeqIO uses file.suffix to guess type; not useful here
 304 $SOURCEID= $FORMAT;
 305 $FORMAT  = "swiss" if $FORMAT =~/UniProt|trembl/;
 306 $verbose =1 if($DEBUG);
 307
 308 # initialize handlers
 309 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1);
 310 $unflattener->error_threshold($ethresh) if $ethresh;
 311 $unflattener->verbose(1) if($DEBUG);
 312 # $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?
 313 # ensembl parsing is still problematic, forget this
 314
 315 my $tm  = Bio::SeqFeature::Tools::TypeMapper->new;
 316 my $idh = Bio::SeqFeature::Tools::IDHandler->new;
 317
 318 # dgg
 319 $source_type ||= "region"; # should really parse from FT.source contents below
 320
 321 #my $FTSOmap = $tm->FT_SO_map();
 322 my $FTSOmap;
 323 my $FTSOsynonyms;
 324
 325 if (defined($SO_FILE) && $SO_FILE eq 'live') {
 326     print "\nDownloading the latest SO file from ".SO_URL."\n\n";
 327     use LWP::UserAgent;
 328     my $ua = LWP::UserAgent->new(timeout => 30);
 329     my $request = HTTP::Request->new(GET => SO_URL);
 330     my $response = $ua->request($request);
 331
 332
 333     if ($response->status_line =~ /200/) {
 334         use File::Temp qw/ tempfile /;
 335         my ($fh, $fn) = tempfile();
 336         print $fh $response->content;
 337         $SO_FILE = $fn;
 338     } else {
 339         print "Couldn't download SO file online...skipping validation.\n"
 340             . "HTTP Status was " . $response->status_line . "\n"
 341             and undef $SO_FILE
 342     }
 343 }
 344
 345 if ($SO_FILE) {
 346
 347
 348     my (%terms, %syn);
 349
 350     my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );
 351     $ONTOLOGY = $parser->next_ontology();
 352
 353     for ($ONTOLOGY->get_all_terms) {
 354         my $feat = $_;
 355
 356         $terms{$feat->name} = $feat->name;
 357         #$terms{$feat->name} = $feat;
 358
 359         my @syn = $_->each_synonym;
 360
 361         push @{$syn{$_}}, $feat->name for @syn;
 362         #push @{$syn{$_}}, $feat for @syn;
 363     }
 364
 365     $FTSOmap = \%terms;
 366     $FTSOsynonyms = \%syn;
 367
 368     my %hardTerms = %{ $tm->FT_SO_map() };
 369     map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms;
 370
 371 } else {
 372     my %terms = %{ $tm->FT_SO_map() };
 373     while (my ($k,$v) = each %terms) {
 374         $FTSOmap->{$k} = ref($v) ? shift @$v : $v;
 375     }
 376 }
 377
 378 $TYPE_MAP = $FTSOmap;
 379 $SYN_MAP = $FTSOsynonyms;
 380
 381
 382 # #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")
 383
 384 # stringify filter list if applicable
 385 my $filter = join ' ', @filter  if @filter;
 386
 387 # determine input files
 388 my $stdin=0; # dgg: let dir == stdin == '-' for pipe use
 389 if ($dir && ($dir eq '-' || $dir eq 'stdin')) {
 390   $stdin=1;  $dir=''; @files=('stdin');
 391
 392 } elsif ( $dir ) {
 393     if ( -d $dir ) {
 394         opendir DIR, $dir or die "could not open $dir for reading: $!";
 395         @files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR;
 396         closedir DIR;
 397     }
 398     else {
 399         die "$dir is not a directory\n";
 400     }
 401 }
 402 else {
 403     @files = @ARGV;
 404     $dir = '';
 405 }
 406
 407 # we should have some files by now
 408 pod2usage(2) unless @files;
 409
 410
 411 my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use
 412 if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) {
 413   warn("std. output chosen: cannot split\n") if($split);
 414   warn("std. output chosen: cannot zip\n") if($zip);
 415   warn("std. output chosen: cannot nolump\n") if($nolump);
 416   $stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip
 417
 418 } elsif ( $outdir && !-e $outdir ) {
 419     mkdir($outdir) or die "could not create directory $outdir: $!\n";
 420 }
 421 elsif ( !$outdir ) {
 422     $outdir = $dir || '.';
 423 }
 424
 425 for my $file ( @files ) {
 426     # dgg ; allow 'stdin' / '-' input ?
 427     chomp $file;
 428     die "$! $file" unless($stdin || -e $file);
 429     print "# Input: $file\n" if($verbose);
 430
 431     my ($lump_fh, $lumpfa_fh, $outfile, $outfa);
 432     if ($stdout) {
 433       $lump_fh= *STDOUT; $lump="stdout$$";
 434       $outfa= "stdout$$.fa";  # this is a temp file ... see below
 435       open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
 436
 437     } elsif ( $lump ) {
 438         my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);
 439         $lump = File::Spec->catfile($outdir, $fileonly.'.gff');
 440         ($outfa = $lump) =~ s/\.gff/\.fa/;
 441         open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n";
 442         open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
 443
 444     }
 445
 446     # open input file, unzip if req'd
 447     if ($stdin) {
 448        *FH = *STDIN;
 449     } elsif ( $file =~ /\.gz/ ) {
 450         open FH, "gunzip -c $file |";
 451     }
 452     else {
 453         open FH, '<', $file;
 454     }
 455
 456     my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);
 457     my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );
 458
 459     while ( my $seq = $in->next_seq() ) {
 460         my $seq_name = $seq->accession_number;
 461         my $end = $seq->length;
 462         my @to_print;
 463
 464         # arrange disposition of GFF output
 465         $outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff');
 466         my $out;
 467
 468         if ( $lump ) {
 469             $outfile = $lump;
 470             $out = $lump_fh;
 471         }
 472         else {
 473             $outfile = File::Spec->catfile($outdir, $seq_name.'.gff');
 474             open $out, ">$outfile";
 475         }
 476
 477         # filter out unwanted features
 478         my $source_feat= undef;
 479         my @source= filter($seq); $source_feat= $source[0];
 480
 481         ($source_type,$source_feat)=
 482           getSourceInfo( $seq, $source_type, $source_feat ) ;
 483           # always; here we build main prot $source_feat; # if @source;
 484
 485         # abort if there are no features
 486         warn "$seq_name has no features, skipping\n" and next
 487             if !$seq->all_SeqFeatures;
 488
 489
 490         $FTSOmap->{'source'} = $source_type;
 491         ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features
 492
 493         # construct a GFF header
 494         # add: get source_type from attributes of source feature? chromosome=X tag
 495         # also combine 1st ft line here with source ft from $seq ..
 496         my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat);
 497         print $out $header;
 498         print "# working on $info\n" if($verbose);
 499
 500         # unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..
 501         unflatten_seq($seq);
 502
 503         # Note that we use our own get_all_SeqFeatures function
 504         # to rescue cloned exons
 505
 506         @GFF_LINE_FEAT = ();
 507         for my $feature ( get_all_SeqFeatures($seq) ) {
 508
 509             my $method = $feature->primary_tag;
 510             next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type);
 511
 512             $feature->seq_id($seq->id) unless($feature->seq_id);
 513             $feature->source_tag($SOURCEID);
 514
 515             # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;
 516             ## also, pull any GO:000 ids from /note tag and put into Ontology_term
 517             maptags2gff($feature);
 518
 519             # current gene name.  The unflattened gene features should be in order so any
 520             # exons, CDSs, etc that follow will belong to this gene
 521             my $gene_name;
 522             if ( $method eq 'gene' || $method eq 'pseudogene' ) {
 523               @to_print= print_held($out, $gffio, \@to_print);
 524               $gene_id = $gene_name= gene_name($feature);
 525             } else {
 526               $gene_name= gene_name($feature);
 527             }
 528
 529             #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx
 530             # be converted to or added as  Name=xxx (if not ID= or as well)
 531             ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags
 532             convert_to_name($feature);
 533
 534             ## dgg: extended to protein|polypeptide
 535             ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes
 536             ## in yeast have that genbank tag; why?
 537             ## these include pseudogene ...
 538
 539             ## Note we also have mapped types to SO, so these RNA's are now transcripts:
 540             # pseudomRNA => "pseudogenic_transcript",
 541             # pseudotranscript" => "pseudogenic_transcript",
 542             # misc_RNA=>'processed_transcript',
 543
 544             warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;
 545
 546             if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/
 547                || ( $gene_id && $gene_name eq $gene_id ) ) {
 548
 549                 my $action = gene_features($feature, $gene_id, $gene_name);  # -1, 0, 1, 2 result
 550                 if ($action == GM_DUP_PART) {
 551                   # ignore, this is dupl. exon with new parent ...
 552
 553                 } elsif ($action == GM_NOT_PART) {
 554                   add_generic_id( $feature, $gene_name, "nocount");
 555                   my $gff = $gffio->gff_string($feature);
 556                   push @GFF_LINE_FEAT, $feature;
 557                   #print $out "$gff\n" if $gff;
 558
 559                 } elsif ($action > 0) {
 560                  # hold off print because exon etc. may get 2nd, 3rd parents
 561                   @to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);
 562                   push(@to_print, $feature);
 563                 }
 564
 565             }
 566
 567             # otherwise handle as generic feats with IDHandler labels
 568             else {
 569                 add_generic_id( $feature, $gene_name, "");
 570                 my $gff= $gffio->gff_string($feature);
 571                 push @GFF_LINE_FEAT, $feature;
 572                 #print $out "$gff\n" if $gff;
 573             }
 574         }
 575
 576         # don't like doing this after others; do after each new gene id?
 577         @to_print= print_held($out, $gffio, \@to_print);
 578
 579         gff_validate(@GFF_LINE_FEAT);
 580
 581         for my $feature (@GFF_LINE_FEAT) {
 582             my $gff= $gffio->gff_string($feature);
 583             print $out "$gff\n" if $gff;
 584         }
 585
 586         # deal with the corresponding DNA
 587         my ($fa_out,$fa_outfile);
 588         my $dna = $seq->seq;
 589         if($dna || %proteinfa) {
 590           $method{'RESIDUES'} += length($dna);
 591           $dna    =~ s/(\S{60})/$1\n/g;
 592           $dna   .= "\n";
 593
 594           if ($split) {
 595               $fa_outfile = $outfile;
 596               $fa_outfile =~ s/gff$/fa/;
 597               open $fa_out, ">$fa_outfile" or die $!;
 598               print $fa_out ">$seq_name\n$dna" if $dna;
 599               foreach my $aid (sort keys %proteinfa) {
 600                 my $aa= delete $proteinfa{$aid};
 601                 $method{'RESIDUES(tr)'} += length($aa);
 602                 $aa =~ s/(\S{60})/$1\n/g;
 603                 print $fa_out ">$aid\n$aa\n";
 604                 }
 605
 606           }
 607           else {
 608           ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out
 609           ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk
 610           ## maybe write this to temp .fa then cat to end of lumped gff $out
 611               print $lumpfa_fh ">$seq_name\n$dna" if $dna;
 612               foreach my $aid (sort keys %proteinfa) {
 613                 my $aa= delete $proteinfa{$aid};
 614                 $method{'RESIDUES(tr)'} += length($aa);
 615                 $aa =~ s/(\S{60})/$1\n/g;
 616                 print $lumpfa_fh ">$aid\n$aa\n";
 617                 }
 618           }
 619
 620         %proteinfa=();
 621         }
 622
 623         if ( $zip && !$lump ) {
 624             system "gzip -f $outfile";
 625             system "gzip -f $fa_outfile" if($fa_outfile);
 626             $outfile .= '.gz';
 627             $fa_outfile .= '.gz' if $split;
 628         }
 629
 630         # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA
 631         print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);
 632         print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump);
 633
 634         # dgg: moved to after all inputs; here it prints cumulative sum for each record
 635         #if ( $summary ) {
 636         #      print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
 637         #
 638         #      for ( keys %method ) {
 639         #          print "# $_  $method{$_}\n";
 640         #      }
 641         #      print "# \n";
 642         #  }
 643
 644     }
 645
 646     print "# GFF3 saved to $outfile\n" if( $verbose && $lump);
 647     if ( $summary ) {
 648         print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
 649         for ( keys %method ) {
 650             print "# $_  $method{$_}\n";
 651         }
 652         print "# \n";
 653     }
 654
 655     ## FIXME for piped output w/ split FA files ...
 656     close($lumpfa_fh) if $lumpfa_fh;
 657     if (!$split && $outfa && $lump_fh) {
 658         print $lump_fh "##FASTA\n"; # GFF3 spec
 659         open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!";
 660         while( <$lumpfa_fh>) {
 661             print $lump_fh $_;
 662         } # is $lump_fh still open?
 663         close($lumpfa_fh);
 664         unlink($outfa);
 665     }
 666
 667
 668     if ( $zip && $lump ) {
 669         system "gzip -f $lump";
 670     }
 671
 672     close FH;
 673 }
 674
 675
 676
 677
 678
 679 sub typeorder {
 680   return 1 if ($_[0] =~ /gene/);
 681   return 2 if ($_[0] =~ /RNA|transcript/);
 682   return 3 if ($_[0] =~ /protein|peptide/);
 683   return 4 if ($_[0] =~ /exon|CDS/);
 684   return 3; # default before exon (smallest part)
 685 }
 686
 687 sub sort_by_feattype {
 688   my($at,$bt)= ($a->primary_tag, $b->primary_tag);
 689   return (typeorder($at) <=> typeorder($bt))
 690     or ($at cmp $bt);
 691     ## or ($a->name() cmp $b->name());
 692 }
 693
 694 sub print_held {
 695   my($out,$gffio,$to_print)= @_;
 696   return unless(@$to_print);
 697   @$to_print = sort sort_by_feattype  @$to_print; # put exons after mRNA, otherwise chado loader chokes
 698   while ( my $feature = shift @$to_print) {
 699     my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode
 700     push @GFF_LINE_FEAT, $feature;
 701     #print $out "$gff\n";
 702     }
 703   return (); # @to_print
 704 }
 705
 706 sub maptags2gff {
 707   my $f = shift;
 708   ## should copy/move locus_tag to Alias, if not ID/Name/Alias already
 709   # but see below /gene /locus_tag usage
 710   foreach my $tag (keys %TAG_MAP) {
 711     if ($f->has_tag($tag)) {
 712       my $newtag= $TAG_MAP{$tag};
 713       my @v= $f->get_tag_values($tag);
 714       $f->remove_tag($tag);
 715       $f->add_tag_value($newtag,@v);
 716
 717       ## also, pull any GO:000 ids from /note tag and put into Ontology_term
 718       ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]'
 719       if ($tag eq 'note') {
 720         map { s/\[goid (\d+)/\[goid GO:$1/g; } @v;
 721         my @go= map { m/(GO:\d+)/g } @v;
 722         $f->add_tag_value('Ontology_term',@go) if(@go);
 723         }
 724
 725       }
 726     }
 727 }
 728
 729
 730 sub getSourceInfo {
 731   my ($seq, $source_type, $sf) = @_;
 732
 733   my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i);
 734   my $is_gene = ($SOURCEID =~/entrezgene/i);
 735   my $is_rich = (ref($seq) =~ /RichSeq/);
 736   my $seq_name= $seq->accession_number();
 737
 738   unless($sf) { # make one
 739     $source_type=  $is_swiss ? $PROTEIN_TYPE
 740         : $is_gene ? "eneg" # "gene"  # "region" #
 741         : $is_rich ? $seq->molecule : $source_type;
 742     $sf = Bio::SeqFeature::Generic->direct_new();
 743
 744     my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1);
 745     my $loc= $seq->can('location') ? $seq->location()
 746             :  new Bio::Location::Simple( -start => $start, -end => $len);
 747     $sf->location( $loc );
 748     $sf->primary_tag($source_type);
 749     $sf->source_tag($SOURCEID);
 750     $sf->seq_id( $seq_name);
 751     #? $sf->display_name($seq->id()); ## Name or Alias ?
 752     $sf->add_tag_value( Alias => $seq->id()); # unless id == accession
 753     $seq->add_SeqFeature($sf);
 754     ## $source_feat= $sf;
 755   }
 756
 757   if ($sf->has_tag("chromosome")) {
 758     $source_type= "chromosome";
 759     my ($chrname) = $sf->get_tag_values("chromosome");
 760     ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead
 761     ## e.g. Mouse chr 19 has two IDs in NCBI genbank now
 762     $sf->add_tag_value( Alias => $chrname );
 763   }
 764
 765   # pull GB Comment, Description for source ft ...
 766   # add reference - can be long, not plain string...
 767   warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG;
 768   # GenBank   fields: keyword,comment,reference,date_changed
 769   # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM
 770
 771   # is this just for main $seq object or for all seqfeatures ?
 772   my %AnnotTagMap= (
 773       'gene_name' => 'Alias',
 774       'ALIAS_SYMBOL' => 'Alias',  # Entrezgene
 775       'LOCUS_SYNONYM' => 'Alias', #?
 776       'symbol' => 'Alias',
 777       'synonym' => 'Alias',
 778       'dblink' => 'Dbxref',
 779       'product' => 'product',
 780       'Reference' => 'reference',
 781       'OntologyTerm' => 'Ontology_term',
 782       'comment'  => 'Note',
 783       'comment1' => 'Note',
 784       # various map-type locations
 785       # gene accession tag is named per source db !??
 786       # 'Index terms' => keywords ??
 787       );
 788
 789
 790   my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() );
 791   my ($date)= $seq->annotation->get_Annotations("dates")
 792       || $seq->annotation->get_Annotations("update-date")
 793       || $is_rich ? $seq->get_dates() : ();
 794   my ($comment)= $seq->annotation->get_Annotations("comment");
 795   my ($species)= $seq->annotation->get_Annotations("species");
 796   if (!$species
 797        && $seq->can('species')
 798        && defined $seq->species()
 799        && $seq->species()->can('binomial') ) {
 800     $species= $seq->species()->binomial();
 801   }
 802
 803   # update source feature with main GB fields
 804   $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID');
 805   $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note'));
 806   $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism'));
 807   $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1'));
 808   $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date'));
 809
 810   $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene;
 811
 812   foreach my $atag (sort keys %AnnotTagMap) {
 813       my $gtag= $AnnotTagMap{$atag}; next unless($gtag);
 814       my @anno = map{
 815           if (ref $_ && $_->can('get_all_values')) {
 816               split( /[,;] */, join ";", $_->get_all_values)
 817           }
 818           elsif (ref $_ && $_->can('display_text')) {
 819               split( /[,;] */, $_->display_text)
 820           }
 821           elsif (ref $_ && $_->can('value')) {
 822               split( /[,;] */, $_->value)
 823           }
 824           else {
 825              ();
 826           }
 827       } $seq->annotation->get_Annotations($atag);
 828       foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); }
 829   }
 830
 831   #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name');
 832   #$sf->add_tag_value( Alias => $_ ) foreach(@genes);
 833   #
 834   #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all
 835   #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink);
 836
 837   return (wantarray)? ($source_type,$sf) : $source_type; #?
 838 }
 839
 840
 841 sub gene_features {
 842     my ($f, $gene_id, $genelinkID) = @_;
 843     local $_ = $f->primary_tag;
 844     $method{$_}++;
 845
 846     if ( /gene/ ) {
 847         $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!?
 848         $tnum = $rnum= 0; $ncrna_id= $rna_id  = '';
 849         return GM_NEW_TOPLEVEL;
 850
 851     } elsif ( /mRNA/ ) {
 852         return GM_NOT_PART unless $gene_id;
 853         return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
 854         ($rna_id  = $gene_id ) =~ s/gene/mRNA/;
 855         $rna_id   .= '.t0' . ++$tnum;
 856         $f->add_tag_value( ID => $rna_id );
 857         $f->add_tag_value( Parent => $gene_id );
 858
 859     } elsif ( /RNA|transcript/) {
 860         ## misc_RNA here; missing exons ... flattener problem?
 861         #  all of {t,nc,sn}RNA can have gene models now
 862         ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag
 863         ## CDS needs to use mRNA, not misc_RNA, rna_id ...
 864         ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level
 865
 866         if($gene_id) {
 867           return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
 868           ($ncrna_id = $gene_id) =~ s/gene/ncRNA/;
 869           $ncrna_id .= '.r0' . ++$rnum;
 870           $f->add_tag_value( Parent => $gene_id );
 871           $f->add_tag_value( ID => $ncrna_id );
 872         } else {
 873           unless ($f->has_tag('ID')) {
 874             if($genelinkID) {
 875               $f->add_tag_value( ID => $genelinkID  ) ;
 876             } else {
 877               $idh->generate_unique_persistent_id($f);
 878             }
 879           }
 880          ($ncrna_id)= $f->get_tag_values('ID');
 881          return GM_NEW_TOPLEVEL;
 882           # this feat now acts as gene-top-level; need to print @to_print to flush prior exons?
 883         }
 884
 885     } elsif ( /exon/ ) { # can belong to any kind of RNA
 886         return GM_NOT_PART unless ($rna_id||$ncrna_id);
 887         return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
 888         ## we are getting duplicate Parents here, which chokes chado loader, with reason...
 889         ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice
 890         ## check all Parents
 891         for my $expar ($rna_id, $ncrna_id) {
 892           next unless($expar);
 893           if ( $exonpar{$expar} and $f->has_tag('Parent') ) {
 894             my @vals = $f->get_tag_values('Parent');
 895             next if (grep {$expar eq $_} @vals);
 896             }
 897           $exonpar{$expar}++;
 898           $f->add_tag_value( Parent => $expar);
 899           # last; #? could be both
 900         }
 901         # now we can skip cloned exons
 902         # dgg note: multiple parents get added and printed for each unique exon
 903         return GM_DUP_PART if ++$seen{$f} > 1;
 904
 905     } elsif ( /CDS|protein|polypeptide/ ) {
 906         return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ??
 907         return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #??
 908         (my $pro_id = $rna_id) =~ s/\.t/\.p/;
 909
 910         if( ! $CDSkeep && /CDS/) {
 911           $f->primary_tag($PROTEIN_TYPE);
 912
 913           ## duplicate problem is Location ..
 914           if ($f->location->isa("Bio::Location::SplitLocationI")) {
 915             # my($b,$e)=($f->start, $f->end); # is this all we need?
 916             my($b,$e)=(-1,0);
 917             foreach my $l ($f->location->each_Location) {
 918                $b = $l->start if($b<0 || $b > $l->start);
 919                $e = $l->end if($e < $l->end);
 920                }
 921             $f->location( Bio::Location::Simple->new(
 922                 -start => $b, -end => $e, -strand => $f->strand) );
 923             }
 924
 925             $f->add_tag_value( Derives_from => $rna_id );
 926           }
 927           else {
 928             $f->add_tag_value( Parent => $rna_id );
 929           }
 930
 931         $f->add_tag_value( ID => $pro_id );
 932
 933         move_translation_fasta($f, $pro_id);
 934         #if( $f->has_tag('translation')) {
 935         #   my ($aa) = $f->get_tag_values("translation");
 936         #    $proteinfa{$pro_id}= $aa;
 937         #    $f->remove_tag("translation");
 938         #    $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
 939         #}
 940     } elsif ( /region/ ) {
 941         $f->primary_tag('gene_component_region');
 942         $f->add_tag_value( Parent => $gene_id );
 943     } else {
 944         return GM_NOT_PART unless $gene_id;
 945         $f->add_tag_value( Parent => $gene_id );
 946     }
 947
 948     ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;
 949
 950     return GM_NEW_PART;
 951 }
 952
 953 ## was generic_features >  add_generic_id
 954 sub add_generic_id {
 955     my ($f, $ft_name, $flags) = @_;
 956     my $method = $f->primary_tag;
 957     $method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above
 958
 959     if ($f->has_tag('ID')) {
 960
 961     }
 962     elsif ( $f->has_tag($method) ) {
 963         my ($name) = $f->get_tag_values($method);
 964         $f->add_tag_value( ID => "$method:$name" );
 965     }
 966     elsif($ft_name) { # is this unique ?
 967         $f->add_tag_value( ID => $ft_name );
 968     }
 969     else {
 970         $idh->generate_unique_persistent_id($f);
 971     }
 972
 973     move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] )
 974       if($method =~ /CDS/);
 975
 976     # return $io->gff_string($f);
 977 }
 978
 979 sub move_translation_fasta {
 980     my ($f, $ft_id) = @_;
 981     if( $ft_id && $f->has_tag('translation') ) {
 982       my ($aa) = $f->get_tag_values("translation");
 983       if($aa && $aa !~ /^length/) {
 984         $proteinfa{$ft_id}= $aa;
 985         $f->remove_tag("translation");
 986         $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
 987         }
 988     }
 989 }
 990
 991 sub gff_header {
 992     my ($name, $end, $source_type, $source_feat) = @_;
 993     $source_type ||= "region";
 994
 995     my $info = "$source_type:$name";
 996     my $head = "##gff-version $GFF_VERSION\n".
 997                "##sequence-region $name 1 $end\n".
 998                "# conversion-by bp_genbank2gff3.pl\n";
 999     if ($source_feat) {
1000         ## dgg: these header comment fields are not useful when have multi-records, diff organisms
1001         for my $key (qw(organism Note date)) {
1002             my $value;
1003             if ($source_feat->has_tag($key)) {
1004                 ($value) = $source_feat->get_tag_values($key);
1005             }
1006             if ($value) {
1007                 $head .= "# $key $value\n";
1008                 $info .= ", $value";
1009             }
1010         }
1011         $head = "" if $didheader;
1012     } else {
1013         $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";
1014     }
1015     $didheader++;
1016     return (wantarray) ? ($head,$info) : $head;
1017 }
1018
1019 sub unflatten_seq {
1020     my $seq = shift;
1021
1022     ## print "# working on $source_type:", $seq->accession, "\n";
1023     my $uh_oh = "Possible gene unflattening error with" .  $seq->accession_number .
1024                 ": consult STDERR\n";
1025
1026     eval {
1027         $unflattener->unflatten_seq( -seq => $seq,
1028                                      -noinfer => $noinfer,
1029                                      -use_magic => 1 );
1030     };
1031
1032     # deal with unflattening errors
1033     if ( $@ ) {
1034         warn $seq->accession_number . " Unflattening error:\n";
1035         warn "Details: $@\n";
1036         print "# ".$uh_oh;
1037     }
1038
1039     return 0 if !$seq || !$seq->all_SeqFeatures;
1040
1041     # map feature types to the sequence ontology
1042     ## $tm->map_types_to_SO( -seq => $seq );
1043     #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg
1044
1045     map_types(
1046         $tm,
1047         -seq => $seq,
1048         -type_map  => $FTSOmap,
1049         -syn_map  => $FTSOsynonyms,
1050         -undefined => "region"
1051     ); #nml
1052
1053 }
1054
1055 sub filter {
1056     my $seq = shift;
1057     ## return unless $filter;
1058     my @feats;
1059     my @sources; # dgg; pick source features here; only 1 always?
1060     if ($filter) {
1061       for my $f ( $seq->remove_SeqFeatures ) {
1062         my $m = $f->primary_tag;
1063         push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1064         push @feats, $f unless $filter =~ /$m/i;
1065       }
1066       $seq->add_SeqFeature($_) foreach @feats;
1067     } else {
1068       for my $f ( $seq->get_SeqFeatures ){
1069         my $m = $f->primary_tag;
1070         push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1071         }
1072     }
1073
1074     return @sources;
1075 }
1076
1077
1078 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
1079 # changed to filter out cloned features.  We have to implement the old
1080 # method.  These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
1081 sub get_all_SeqFeatures  {
1082     my $seq = shift;
1083     my @flatarr;
1084
1085     foreach my $feat ( $seq->get_SeqFeatures ){
1086         push(@flatarr,$feat);
1087         _add_flattened_SeqFeatures(\@flatarr,$feat);
1088     }
1089     return @flatarr;
1090 }
1091
1092 sub gene_name {
1093     my $g = shift;
1094     my $gene_id = ''; # zero it;
1095
1096     if ($g->has_tag('locus_tag')) {
1097         ($gene_id) = $g->get_tag_values('locus_tag');
1098     }
1099
1100     elsif ($g->has_tag('gene')) {
1101         ($gene_id) = $g->get_tag_values('gene');
1102     }
1103     elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene
1104         ($gene_id) = $g->get_tag_values('ID');
1105     }
1106
1107     ## See Unflattener comment:
1108     # on rare occasions, records will have no /gene or /locus_tag
1109     # but it WILL have /product tags. These serve the same purpose
1110     # for grouping. For an example, see AY763288 (also in t/data)
1111     # eg. product=tRNA-Asp ;  product=similar to crooked neck protein
1112     elsif ($g->has_tag('product')) {
1113         my ($name)= $g->get_tag_values('product');
1114         ($gene_id) = $name unless($name =~ / /); # a description not name
1115     }
1116
1117     ## dgg; also handle transposon=xxxx ID/name
1118     # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746
1119     elsif ($g->has_tag('transposon')) {
1120         my ($name)= $g->get_tag_values('transposon');
1121         ($gene_id) = $name unless($name =~ / /); # a description not name
1122     }
1123
1124     return $gene_id;
1125 }
1126
1127 # same list as gene_name .. change tag to generic Name
1128 sub convert_to_name {
1129     my $g = shift;
1130     my $gene_id = ''; # zero it;
1131
1132     if ($g->has_tag('gene')) {
1133         ($gene_id) = $g->get_tag_values('gene');
1134         $g->remove_tag('gene');
1135         $g->add_tag_value('Name', $gene_id);
1136     }
1137     elsif ($g->has_tag('locus_tag')) {
1138         ($gene_id) = $g->get_tag_values('locus_tag');
1139         $g->remove_tag('locus_tag');
1140         $g->add_tag_value('Name', $gene_id);
1141     }
1142
1143     elsif ($g->has_tag('product')) {
1144         my ($name)= $g->get_tag_values('product');
1145         ($gene_id) = $name unless($name =~ / /); # a description not name
1146         ## $g->remove_tag('product');
1147         $g->add_tag_value('Name', $gene_id);
1148     }
1149
1150     elsif ($g->has_tag('transposon')) {
1151         my ($name)= $g->get_tag_values('transposon');
1152         ($gene_id) = $name unless($name =~ / /); # a description not name
1153         ## $g->remove_tag('transposon');
1154         $g->add_tag_value('Name', $gene_id);
1155     }
1156     elsif ($g->has_tag('ID')) {
1157         my ($name)= $g->get_tag_values('ID');
1158         $g->add_tag_value('Name', $name);
1159     }
1160     return $gene_id;
1161 }
1162
1163
1164 sub _add_flattened_SeqFeatures  {
1165     my ($arrayref,$feat) = @_;
1166     my @subs = ();
1167
1168     if ($feat->isa("Bio::FeatureHolderI")) {
1169         @subs = $feat->get_SeqFeatures;
1170     }
1171     elsif ($feat->isa("Bio::SeqFeatureI")) {
1172         @subs = $feat->sub_SeqFeature;
1173     }
1174     else {
1175         warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
1176             "Don't know how to flatten.";
1177     }
1178
1179     for my $sub (@subs) {
1180         push(@$arrayref,$sub);
1181         _add_flattened_SeqFeatures($arrayref,$sub);
1182     }
1183
1184 }
1185
1186 sub map_types {
1187
1188     my ($self, @args) = @_;
1189
1190     my($sf, $seq, $type_map, $syn_map, $undefmap) =
1191         $self->_rearrange([qw(FEATURE
1192                     SEQ
1193                     TYPE_MAP
1194                     SYN_MAP
1195                     UNDEFINED
1196                     )],
1197                 @args);
1198
1199     if (!$sf && !$seq) {
1200         $self->throw("you need to pass in either -feature or -seq");
1201     }
1202
1203     my @sfs = ($sf);
1204     if ($seq) {
1205         $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
1206         @sfs = $seq->get_all_SeqFeatures;
1207     }
1208     $type_map = $type_map || $self->typemap; # dgg: was type_map;
1209     foreach my $feat (@sfs) {
1210
1211         $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");
1212         $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");
1213
1214         my $primary_tag = $feat->primary_tag;
1215
1216         #if ($primary_tag =~ /^pseudo(.*)$/) {
1217         #    $primary_tag = $1;
1218         #    $feat->primary_tag($primary_tag);
1219         #}
1220
1221         my $mtype = $type_map->{$primary_tag};
1222         if ($mtype) {
1223             if (ref($mtype)) {
1224                 if (ref($mtype) eq 'ARRAY') {
1225                     my $soID;
1226                     ($mtype, $soID) = @$mtype;
1227
1228                     if ($soID && ref($ONTOLOGY)) {
1229                         my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);
1230                         $mtype = $term->name if $term;
1231                     }
1232                     # if SO ID is undefined AND we have an ontology to search, we want to delete
1233                     # the feature type hash entry in order to force a fuzzy search
1234                     elsif (! defined $soID && ref($ONTOLOGY)) {
1235                         undef $mtype;
1236                         delete $type_map->{$primary_tag};
1237                     }
1238                     elsif ($undefmap && $mtype eq 'undefined') { # dgg
1239                         $mtype= $undefmap;
1240                     }
1241
1242                     $type_map->{$primary_tag} = $mtype if $mtype;
1243                 }
1244                 elsif (ref($mtype) eq 'CODE') {
1245                     $mtype = $mtype->($feat);
1246                 }
1247                 else {
1248                     $self->throw('must be scalar or CODE ref');
1249                 }
1250             }
1251             elsif ($undefmap && $mtype eq 'undefined') { # dgg
1252                 $mtype= $undefmap;
1253             }
1254             $feat->primary_tag($mtype);
1255         }
1256
1257         if ($CONF) {
1258             conf_read();
1259             my %perfect_matches;
1260             while (my ($p_tag,$rules) = each %$YAML) {
1261                 RULE:
1262                 for my $rule (@$rules) {
1263                     for my $tags (@$rule) {
1264                         while (my ($tag,$values) = each %$tags) {
1265                             for my $value (@$values) {
1266                                 if ($feat->has_tag($tag)) {
1267                                     for ($feat->get_tag_values($tag)) {
1268                                         next RULE unless $_ =~ /\Q$value\E/;
1269                                     }
1270                                 } elsif ($tag eq 'primary_tag') {
1271                                     next RULE unless $value eq
1272                                         $feat->primary_tag;
1273                                 } elsif ($tag eq 'location') {
1274                                     next RULE unless $value eq
1275                                         $feat->start.'..'.$feat->end;
1276                                 } else { next RULE }
1277                             }
1278                         }
1279                     }
1280                     $perfect_matches{$p_tag}++;
1281                 }
1282             }
1283             if (scalar(keys %perfect_matches) == 1) {
1284                 $mtype = $_ for keys %perfect_matches;
1285             } elsif (scalar(keys %perfect_matches) > 1) {
1286                 warn "There are conflicting rules in the config file for the" .
1287                      " following types: ";
1288                 warn "\t$_\n" for keys %perfect_matches;
1289                 warn "Until conflict resolution is built into the converter," .
1290                      " you will have to manually edit the config file to remove the" .
1291                      " conflict. Sorry :(. Skipping user preference for this entry";
1292                 sleep(2);
1293             }
1294         }
1295
1296         if ( ! $mtype  && $syn_map) {
1297             if ($feat->has_tag('note')) {
1298
1299                 my @all_matches;
1300
1301                 my @note = $feat->each_tag_value('note');
1302
1303                 for my $k (keys %$syn_map) {
1304
1305                     if ($k =~ /"(.+)"/) {
1306
1307                         my $syn = $1;
1308
1309                         for my $note (@note) {
1310
1311                             # look through the notes to see if the description
1312                             # is an exact match for synonyms
1313                             if ( $syn eq $note ) {
1314
1315                                 my @map = @{$syn_map->{$k}};
1316
1317
1318                                 my $best_guess = $map[0];
1319
1320                                 unshift @{$all_matches[-1]}, [$best_guess];
1321
1322                                 $mtype = $MANUAL
1323                                     ? manual_curation($feat, $best_guess, \@all_matches)
1324                                     : $best_guess;
1325
1326                                 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1327                                     . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1328                                     . '#' x 78 . "\n\n";
1329
1330                             } else {
1331                                 # check both primary tag and and note against
1332                                 # SO synonyms for best matching description
1333
1334                                 SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);
1335                             }
1336
1337                         }
1338                     }
1339                 }
1340
1341                 #unless ($mtype) {
1342                 for my $note (@note) {
1343                     for my $name (values %$type_map) {
1344                     # check primary tag against SO names for best matching
1345                     # descriptions //NML also need to check against
1346                     # definition && camel case split terms
1347
1348                         SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);
1349                     }
1350                 }
1351                 #}
1352
1353                 if (scalar(@all_matches) && !$mtype) {
1354
1355                     my $top_matches = first { defined $_ } @{$all_matches[-1]};
1356
1357                     my $best_guess = $top_matches->[0];
1358
1359
1360
1361                     # if guess has quotes, it is a synonym term. we need to
1362                     # look up the corresponding name term
1363                     # otherwise, guess is a name, so we can use it directly
1364                     if ($best_guess =~ /"(.+)"/) {
1365
1366                         $best_guess = $syn_map->{$best_guess}->[0];
1367
1368                     }
1369
1370                     @RETURN = @all_matches;
1371                     $mtype = $MANUAL
1372                         ? manual_curation($feat, $best_guess, \@all_matches)
1373                         : $best_guess;
1374
1375                     print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1376                         . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1377                         . '#' x 78 . "\n\n";
1378
1379                 }
1380             }
1381             $mtype ||= $undefmap;
1382             $feat->primary_tag($mtype);
1383         }
1384     }
1385
1386
1387 }
1388
1389 sub SO_fuzzy_match {
1390
1391     my $candidate = shift;
1392     my $primary_tag = shift;
1393     my $note = shift;
1394     my $SO_terms = shift;
1395     my $best_matches_ref = shift;
1396     my $modifier = shift;
1397
1398     $modifier ||= '';
1399
1400         my @feat_terms;
1401
1402     for ( split(" |_", $primary_tag) ) {
1403         #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1404         my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1405         push @feat_terms, @camelCase;
1406     }
1407
1408     for ( split(" |_", $note) ) {
1409         #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1410         #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1411         (my $word = $_) =~ s/[;:.,]//g;
1412         push @feat_terms, $word;
1413     }
1414
1415
1416     my @SO_terms = split(" |_", $SO_terms);
1417
1418     # fuzzy match works on a simple point system. When 2 words match,
1419     # the $plus counter adds one. When they don't, the $minus counter adds
1420     # one. This is used to sort similar matches together. Better matches
1421     # are found at the end of the array, near the top.
1422
1423     # NML: can we improve best match by using synonym tags
1424     # EXACT,RELATED,NARROW,BROAD?
1425
1426     my ($plus, $minus) = (0, 0);
1427     my %feat_terms;
1428     my %SO_terms;
1429
1430     #unique terms
1431     map {$feat_terms{$_} = 1} @feat_terms;
1432     map {$SO_terms{$_} = 1} @SO_terms;
1433
1434     for my $st (keys %SO_terms) {
1435         for my $ft (keys %feat_terms) {
1436
1437             ($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++;
1438
1439         }
1440     }
1441
1442     push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;
1443
1444 }
1445
1446 sub manual_curation {
1447
1448     my ($feat, $default_opt,  $all_matches) = @_;
1449
1450     my @all_matches = @$all_matches;
1451
1452     # convert all SO synonyms into names and filter
1453     # all matches into unique term list because
1454     # synonyms can map to multiple duplicate names
1455
1456     my (@unique_SO_terms, %seen);
1457     for (reverse @all_matches) {
1458         for (@$_) {
1459             for (@$_) {
1460                 #my @names;
1461                 if ($_ =~ /"(.+)"/) {
1462                     for (@{$SYN_MAP->{$_}}) {
1463                         push @unique_SO_terms, $_ unless $seen{$_};
1464                         $seen{$_}++;
1465                     }
1466                 } else {
1467                     push @unique_SO_terms, $_ unless $seen{$_};
1468                     $seen{$_}++;
1469                 }
1470             }
1471         }
1472     }
1473
1474     my $s = scalar(@unique_SO_terms);
1475
1476     my $choice = 0;
1477
1478     my $more =
1479         "[a]uto   : automatic input (selects best guess for remaining entries)\r" .
1480         "[f]ind   : search for other SO terms matching your query (e.g. f gene)\r" .
1481         "[i]nput  : add a specific term\r" .
1482         "[r]eset  : reset to the beginning of matches\r" .
1483         "[s]kip   : skip this entry (selects best guess for this entry)\r"
1484     ;
1485
1486     $more .=
1487         "[n]ext   : view the next ".OPTION_CYCLE." terms\r" .
1488         "[p]rev   : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE);
1489
1490     my $msg = #"\n\n" . '-' x 156 . "\n"
1491          "The converter found $s possible matches for the following GenBank entry: ";
1492
1493     my $directions =
1494         "Type a number to select the SO term that best matches"
1495         . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";
1496
1497
1498     # lookup filtered list to pull out definitions
1499     my @options = map {
1500         my $term = $_;
1501         my %term;
1502         for (['name', 'name'], ['def', 'definition'], ['synonym',
1503                 'each_synonym']) {
1504             my ($label, $method) = @$_;
1505             $term{$label} = \@{[$term->$method]};
1506         }
1507         [++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ];
1508     }  map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms;
1509
1510
1511     my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
1512             $default_opt, @options);
1513
1514     if ($option eq 'skip') { return $default_opt
1515     } elsif ($option eq 'auto') {
1516         $MANUAL = 0;
1517         return $default_opt;
1518     } else { return $option }
1519
1520 }
1521
1522 sub options_cycle {
1523
1524     my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;
1525
1526     #NML: really should only call GenBank_entry once. Will need to change
1527     #method to return array & shift off header
1528     my $entry = GenBank_entry($feat, "\r");
1529
1530     my $total = scalar(@opt);
1531
1532     ($start,$stop) = (0, OPTION_CYCLE)
1533         if ( ($start < 0) && ($stop > 0) );
1534
1535     ($start,$stop) = (0, OPTION_CYCLE)
1536         if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);
1537
1538     ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
1539     ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;
1540
1541     $stop = $total if $stop > $total;
1542
1543     my $dir_copy = $directions;
1544     my $msg_copy = $msg;
1545     my $format = "format STDOUT = \n" .
1546         '-' x 156 . "\n" .
1547         '^' . '<' x 77 .  '| Available Commands:' . "\n" .
1548         '$msg_copy' . "\n" .
1549         '-' x 156 . "\n" .
1550         ' ' x 78 . "|\n" .
1551         '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
1552         '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
1553         (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
1554         ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000  . ".\n";
1555
1556     {
1557         # eval throws redefined warning that breaks formatting.
1558         # Turning off warnings just for the eval to fix this.
1559         no warnings 'redefine';
1560         eval $format;
1561     }
1562
1563     write;
1564
1565     print '-' x 156 . "\n" .
1566         'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
1567         " - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
1568         "\n" . '-' x 156 . "\n";
1569
1570     for  (my $i = $start; $i < $stop; $i+=2) {
1571
1572         my ($left, $right) = @opt[$i,$i+1];
1573
1574         my ($nL, $nmL, $descL, $termL, @synL) = @$left;
1575
1576         #odd numbered lists can cause fatal undefined errors, so check
1577         #to make sure we have data
1578
1579         my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);
1580
1581
1582         my $format = "format STDOUT = \n";
1583
1584         $format .=
1585             ' ' x 78 . "|\n" .
1586
1587             '@>>: name: ^' . '<' x 64 . '~' . ' |' .
1588                 ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) .  "\n" .
1589             '$nL,' . ' ' x 7 . '$nmL,' .
1590                 ( ref($right) ? (' ' x 63 . '$nR,' .  ' ' x 7 .  "\$nmR,") : '' ) . "\n" .
1591
1592             ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
1593                 (ref($right) ? ('           ^' . '<' x 61 .  '...~') : '') . "\n" .
1594             ' ' x 11 . '$nmL,' .
1595                 (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
1596             #' ' x 78 . '|' . "\n" .
1597
1598
1599             '     def:  ^' . '<' x 65 . ' |' .
1600                 (ref($right) ? ('     def:  ^' . '<' x 64 . '~') : '') . "\n" .
1601             ' ' x 11 . '$descL,' .
1602                 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1603
1604
1605             ('           ^' . '<' x 65 . ' |' .
1606                 (ref($right) ? ('           ^' . '<' x 64 . '~') : '') . "\n" .
1607             ' ' x 11 . '$descL,' .
1608                 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .
1609
1610
1611             '           ^' . '<' x 61 . '...~ |' .
1612                 (ref($right) ? ('           ^' . '<' x 61 . '...~') : '') . "\n" .
1613             ' ' x 11 . '$descL,' .
1614                 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1615
1616             ".\n";
1617
1618         {
1619             # eval throws redefined warning that breaks formatting.
1620             # Turning off warnings just for the eval to fix this.
1621             no warnings 'redefine';
1622             eval $format;
1623         }
1624         write;
1625
1626     }
1627     print '-' x 156 . "\nenter a command:";
1628
1629     while (<STDIN>) {
1630
1631         (my $input = $_) =~ s/\s+$//;
1632
1633         if ($input =~ /^\d+$/) {
1634             if ( $input && defined $opt[$input-1] ) {
1635                 return $opt[$input-1]->[1]
1636             } else {
1637                 print "\nThat number is not an option. Please enter a valid number.\n:";
1638             }
1639         } elsif ($input =~ /^n/i | $input =~ /next/i ) {
1640             return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
1641                     $feat, $directions, $best_guess, @opt)
1642         } elsif ($input =~ /^p/i | $input =~ /prev/i ) {
1643             return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
1644                     $feat, $directions, $best_guess, @opt)
1645         } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
1646         } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
1647         } elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
1648             return manual_curation($feat, $best_guess, \@RETURN );
1649         } elsif ( $input =~ /^f/i || $input =~ /find/i ) {
1650
1651             my ($query, @query_results);
1652
1653             if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
1654             } else {
1655
1656                 #do a SO search
1657                 print "Type your search query\n:";
1658                 while (<STDIN>) { chomp($query = $_); last }
1659             }
1660
1661             for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
1662                 SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
1663             }
1664
1665             return manual_curation($feat, $best_guess, \@query_results);
1666
1667         } elsif ( $input =~ /^i/i || $input =~ /input/i ) {
1668
1669             #NML fast input for later
1670             #my $query;
1671             #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };
1672
1673             #manual input
1674             print "Type the term you want to use\n:";
1675             while (<STDIN>) {
1676                 chomp(my $input = $_);
1677
1678                 if (! $TYPE_MAP->{$input}) {
1679
1680                     print "\"$input\" doesn't appear to be a valid SO term. Are ".
1681                         "you sure you want to use it? (y or n)\n:";
1682
1683                     while (<STDIN>) {
1684
1685                         chomp(my $choice = $_);
1686
1687                         if ($choice eq 'y') {
1688                             print
1689                                 "\nWould you like to save your preference for " .
1690                                 "future use (so you don't have to redo manual " .
1691                                 "curation for this feature everytime you run " .
1692                                 "the converter)? (y or n)\n";
1693
1694                             #NML: all these while loops are a mess. Really should condense it.
1695                             while (<STDIN>) {
1696
1697                                 chomp(my $choice = $_);
1698
1699                                 if ($choice eq 'y') {
1700                                     curation_save($feat, $input);
1701                                     return $input;
1702                                 } elsif ($choice eq 'n') {
1703                                     return $input
1704                                 } else {
1705                                     print "\nDidn't recognize that command. Please " .
1706                                         "type y or n.\n:"
1707                                 }
1708                             }
1709
1710
1711                         } elsif ($choice eq 'n') {
1712                             return options_cycle($start, $stop, $msg, $feat,
1713                                     $directions, $best_guess, @opt)
1714                         } else {
1715                             print "\nDidn't recognize that command. Please " .
1716                                 "type y or n.\n:"
1717                         }
1718                     }
1719
1720                 } else {
1721                     print
1722                         "\nWould you like to save your preference for " .
1723                         "future use (so you don't have to redo manual " .
1724                         "curation for this feature everytime you run  " .
1725                         "the converter)? (y or n)\n";
1726
1727                     #NML: all these while loops are a mess. Really should condense it.
1728                     while (<STDIN>) {
1729
1730                         chomp(my $choice = $_);
1731
1732                         if ($choice eq 'y') {
1733                             curation_save($feat, $input);
1734                             return $input;
1735                         } elsif ($choice eq 'n') {
1736                             return $input
1737                         } else {
1738                             print "\nDidn't recognize that command. Please " .
1739                                 "type y or n.\n:"
1740                         }
1741                     }
1742
1743                 }
1744
1745             }
1746         } else {
1747             print "\nDidn't recognize that command. Please re-enter your choice.\n:"
1748         }
1749     }
1750
1751 }
1752
1753 sub GenBank_entry {
1754     my ($f, $delimiter, $num) = @_;
1755
1756     $delimiter ||= "\n";
1757
1758
1759     my $entry  =
1760
1761         ($num ? ' [1] ' : ' ' x 5) . $f->primary_tag
1762         . ($num
1763             ? ' ' x (12 - length $f->primary_tag ) . ' [2] '
1764             : ' ' x (15 - length $f->primary_tag)
1765           )
1766         . $f->start.'..'.$f->end
1767
1768         . "$delimiter";
1769
1770     if ($num) {
1771         words_tag($f, \$entry);
1772     } else {
1773         for my $tag ($f->all_tags) {
1774             for my $val ( $f->each_tag_value($tag) ) {
1775                 $entry .= ' ' x 20;
1776                 #$entry .= "/$tag=\"$val\"$delimiter";
1777                 $entry .= $val eq '_no_value'
1778                     ? "/$tag$delimiter"
1779                     : "/$tag=\"$val\"$delimiter";
1780             }
1781         }
1782
1783     }
1784
1785     return $entry;
1786 }
1787
1788
1789 sub gff_validate {
1790     warn "Validating GFF file\n" if $DEBUG;
1791     my @feat = @_;
1792
1793     my (%parent2child, %all_ids, %descendants, %reserved);
1794
1795     for my $f (@feat) {
1796         for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) {
1797             map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0])
1798                 if $f->has_tag($$aTags[0]);
1799         }
1800     }
1801
1802     if ($SO_FILE) {
1803         while (my ($parentID, $aChildren) = each %parent2child) {
1804             parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved);
1805         }
1806     }
1807
1808     id_validate(\%all_ids, \%reserved);
1809 }
1810
1811 sub parent_validate {
1812     my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;
1813
1814     my $aParents = $hAll->{$parentID};
1815
1816     map {
1817         my $child = $_;
1818         $child->add_tag_value( validation_error =>
1819         "feature tried to add Parent tag, but no Parent found with ID $parentID"
1820         );
1821         my %parents;
1822         map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1823         delete $parents{$parentID};
1824         my @parents = keys %parents;
1825
1826         $child->remove_tag('Parent');
1827
1828         unless ($child->has_tag('ID')) {
1829             my $id = gene_name($child);
1830             $child->add_tag_value('ID', $id);
1831             push @{$hAll->{$id}}, $child
1832         }
1833
1834         $child->add_tag_value('Parent', @parents) if @parents;
1835
1836     } @$aChildren and return unless scalar(@$aParents);
1837
1838     my $par = join(',', map { $_->primary_tag } @$aParents);
1839     warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;
1840
1841     #NML manual curation needs to happen here
1842
1843
1844     my %parentsToRemove;
1845
1846     CHILD:
1847     for my $child (@$aChildren) {
1848         my $childType  = $child->primary_tag;
1849
1850         warn "WORKING ON $childType at ".$child->start.' to '.$child->end
1851             if $DEBUG;
1852
1853         for (my $i = 0; $i < scalar(@$aParents); $i++) {
1854             my $parent = $aParents->[$i];
1855             my $parentType = $parent->primary_tag;
1856
1857             warn "CHECKING $childType against $parentType" if $DEBUG;
1858
1859             #cache descendants so we don't have to do repeat searches
1860             unless ($hDescendants->{$parentType}) {
1861
1862                 for my $term ($ONTOLOGY->find_terms(
1863                         -name => $parentType
1864                     ) ) {
1865
1866                     map {
1867                         $hDescendants->{$parentType}{$_->name}++
1868                     } $ONTOLOGY->get_descendant_terms($term);
1869
1870                 }
1871
1872                 # NML: hopefully temporary fix.
1873                 # SO doesn't consider exon/CDS to be a child of mRNA
1874                 # even though common knowledge dictates that they are
1875                 # This cheat fixes the false positives for now
1876                 if ($parentType eq 'mRNA') {
1877                     $hDescendants->{$parentType}{'exon'} = 1;
1878                     $hDescendants->{$parentType}{'CDS'} = 1;
1879                 }
1880
1881             }
1882
1883             warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
1884                 " be a child of $parentType?" if $DEBUG;
1885             if ($hDescendants->{$parentType}{$childType}) {
1886                 warn "\tYES, $childType can be a child of $parentType" if $DEBUG;
1887
1888                 #NML need to deal with multiple children matched to multiple different
1889                 #parents. This model only assumes the first parent id that matches a child will
1890                 #be the reserved feature.
1891
1892                 $hReserved->{$parentID}{$parent}{'parent'} = $parent;
1893                 push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;
1894
1895                 #mark parent for later removal from all IDs
1896                 #so we don't accidentally change any parents
1897
1898                 $parentsToRemove{$i}++;
1899
1900                 next CHILD;
1901             }
1902         }
1903
1904
1905
1906         #NML shouldn't have to check this; somehow child can lose Parent
1907         #it's happening W3110
1908         #need to track this down
1909         if ( $child->has_tag('Parent') ) {
1910
1911             warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
1912                 if $DEBUG;
1913
1914             my %parents;
1915
1916             map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1917
1918             delete $parents{$parentID};
1919             my @parents = keys %parents;
1920
1921             warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
1922                 ' to ' . $child->end . " cannot be a child of ID $parentID"
1923                 if $DEBUG;
1924
1925             $child->add_tag_value( validation_error =>
1926                     "feature cannot be a child of $parentID");
1927
1928             $child->remove_tag('Parent');
1929
1930             unless ($child->has_tag('ID')) {
1931                 my $id = gene_name($child);
1932                 $child->add_tag_value('ID', $id);
1933                 push @{$hAll->{$id}}, $child
1934             }
1935
1936             $child->add_tag_value('Parent', @parents) if @parents;
1937         }
1938
1939     }
1940
1941     #delete $aParents->[$_] for keys %parentsToRemove;
1942     splice(@$aParents, $_, 1) for keys %parentsToRemove;
1943 }
1944
1945 sub id_validate {
1946     my ($hAll, $hReserved) = @_;
1947
1948
1949     for my $id (keys %$hAll) {
1950
1951         #since 1 feature can have this id,
1952         #let's just shift it off and uniquify
1953         #the rest (unless it's reserved)
1954
1955         shift @{$hAll->{$id}} unless $hReserved->{$id};
1956         for my $feat (@{$hAll->{$id}}) {
1957             id_uniquify(0, $id, $feat, $hAll);
1958         }
1959     }
1960
1961     for my $parentID (keys %$hReserved) {
1962
1963         my @keys = keys %{$hReserved->{$parentID}};
1964
1965         shift @keys;
1966
1967         for my $k (@keys) {
1968             my $parent = $hReserved->{$parentID}{$k}{'parent'};
1969             my $aChildren= $hReserved->{$parentID}{$k}{'children'};
1970
1971             my $value = id_uniquify(0, $parentID, $parent, $hAll);
1972             for my $child (@$aChildren) {
1973
1974                 my %parents;
1975                 map { $parents{$_}++ } $child->get_tag_values('Parent');
1976                 $child->remove_tag('Parent');
1977                 delete $parents{$parentID};
1978                 $parents{$value}++;
1979                 my @parents = keys %parents;
1980                 $child->add_tag_value('Parent', @parents);
1981             }
1982
1983         }
1984     }
1985 }
1986
1987 sub id_uniquify {
1988     my ($count, $value, $feat, $hAll) = @_;
1989
1990     warn "UNIQUIFYING $value" if $DEBUG;
1991
1992     if (! $count) {
1993         $feat->add_tag_value(Alias => $value);
1994         $value .= ('.' . $feat->primary_tag)
1995     } elsif ($count == 1) {
1996         $value .= ".$count"
1997     } else {
1998         chop $value;
1999         $value .= $count
2000     }
2001     $count++;
2002
2003     warn "ENDED UP WITH $value" if $DEBUG;
2004     if ( $hAll->{$value} ) {
2005         warn "$value IS ALREADY TAKEN" if $DEBUG;
2006         id_uniquify($count, $value, $feat, $hAll);
2007     } else {
2008         #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;
2009         $feat->remove_tag('ID');
2010         $feat->add_tag_value('ID', $value);
2011         push @{$hAll->{$value}}, $value;
2012     }
2013
2014     $value;
2015 }
2016
2017 sub conf_read {
2018
2019     print "\nCannot read $CONF. Change file permissions and retry, " .
2020         "or enter another file\n" and conf_locate() unless -r $CONF;
2021
2022     print "\nCannot write $CONF. Change file permissions and retry, " .
2023         "or enter another file\n" and conf_locate() unless -w $CONF;
2024
2025     $YAML = LoadFile($CONF);
2026
2027 }
2028
2029 sub conf_create {
2030
2031     my ($path, $input) = @_;
2032
2033     print "Cannot write to $path. Change directory permissions and retry " .
2034         "or enter another save path\n" and conf_locate() unless -w $path;
2035
2036     $CONF = $input;
2037
2038     open(FH, '>', $CONF);
2039     close(FH);
2040     conf_read();
2041
2042
2043 }
2044
2045 sub conf_write { DumpFile($CONF, $YAML) }
2046
2047 sub conf_locate {
2048
2049     print "\nEnter the location of a previously saved config, or a new " .
2050         "path and file name to create a new config (this step is " .
2051         "necessary to save any preferences)";
2052
2053     print "\n\nenter a command:";
2054
2055     while (<STDIN>) {
2056         chomp(my $input = $_);
2057         my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);
2058
2059         if (-e $input && (! -d $input)) {
2060
2061             print "\nReading $input...\n";
2062             $CONF = $input;
2063
2064             conf_read();
2065             last;
2066
2067         } elsif (! -d $input && $fn.$suffix) {
2068
2069             print "Creating $input...\n";
2070             conf_create($path, $input);
2071             last;
2072
2073         } elsif (-e $input && -d $input) {
2074             print "You only entered a directory. " .
2075                 "Please enter BOTH a directory and filename\n";
2076         } else {
2077             print "$input does not appear to be a valid path. Please enter a " .
2078                 "valid directory and filename\n";
2079         }
2080         print "\nenter a command:";
2081     }
2082 }
2083
2084 sub curation_save {
2085
2086     my ($feat, $input) = @_;
2087
2088     #my $error = "Enter the location of a previously saved config, or a new " .
2089     #    "path and file name to create a new config (this step is " .
2090     #    "necessary to save any preferences)\n";
2091
2092     if (!$CONF) {
2093         print "\n\n";
2094         conf_locate();
2095     } elsif (! -e $CONF) {
2096         print "\n\nThe config file you have chosen doesn't exist.\n";
2097         conf_locate();
2098     } else { conf_read() }
2099
2100     my $entry = GenBank_entry($feat, "\r", 1);
2101
2102     my $msg = "Term entered: $input";
2103     my $directions = "Please select any/all tags that provide evidence for the term you
2104 have entered. You may enter multiple tags by separating them by commas/dashes
2105 (e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have
2106 the option of either selecting the entire note as evidence, or specific
2107 keywords. If a tag has multiple keywords, they will be tagged alphabetically
2108 for selection. To select a specific keyword in a tag field, you must enter the
2109 tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
2110 selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have
2111 to be to match your curation. To match the GenBank entry exactly as it
2112 appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your
2113 preference, you will no longer be prompted to choose a feature type for any
2114 matching entries until you edit the curation.ini file.";
2115     my $msg_copy = $msg;
2116     my $dir_copy = $directions;
2117
2118     my $format = "format STDOUT = \n" .
2119         '-' x 156 . "\n" .
2120         '^' . '<' x 77 .  '| Directions:' . "\n" .
2121         '$msg_copy' . "\n" .
2122         '-' x 156 . "\n" .
2123         ' ' x 78 . "|\n" .
2124         '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
2125         '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
2126         (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
2127         ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20  . ".\n";
2128
2129     {
2130         # eval throws redefined warning that breaks formatting.
2131         # Turning off warnings just for the eval to fix this.
2132         no warnings 'redefine';
2133         eval $format;
2134     }
2135
2136     write;
2137     print '-' x 156 . "\nenter a command:";
2138
2139     my @tags = words_tag($feat);
2140
2141     my $final = {};
2142     my $choices;
2143     while (<STDIN>) {
2144
2145         chomp(my $choice = $_);
2146
2147         if (scalar(keys %$final) && $choice =~ /^y/i) { last
2148
2149         } elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input)
2150
2151         } elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n";
2152
2153         } elsif ($choice eq 'all') {
2154
2155             $choice = '';
2156             for (my $i=1; $i < scalar(@tags); $i++) {
2157                 $choice .= "$i,";
2158             }
2159             chop $choice;
2160         }
2161         #print "CHOICE [$choice]";
2162
2163         my @selections;
2164         for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
2165             if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) {
2166
2167                 for ($1..$2) { push @selections, $_ }
2168
2169                 my $dangling_alphas = $3;
2170                 alpha_expand($dangling_alphas, \@selections);
2171
2172             } else {
2173                 alpha_expand($_, \@selections);
2174             }
2175         }
2176
2177         foreach my $numbers (@selections) {
2178
2179             my @c = split(/(?=[\w])/, $numbers);
2180             s/\W+//g foreach @c;
2181             my $num;
2182
2183             {
2184                 $^W = 0;
2185                 $num = 0 + shift @c;
2186             }
2187
2188             my $tag = $tags[$num];
2189             if (ref $tag && scalar(@c)) {
2190                 my $no_value;
2191                 foreach (@c) {
2192                     if (defined $tag->{$_}) {
2193                         $choices .= "${num}[$_] ";
2194                         my ($t,$v) = @{$tag->{$_}};
2195                         push @{${$final->{$input}}[0]{$t}}, $v;
2196
2197                     } else { $no_value++ }
2198                     }
2199
2200                 if ($no_value) {
2201                     _selection_add($tag,$final,$input,\$choices,$num);
2202                     #my ($t,$v) = @{$tag->{'all'}};
2203                     #unless (defined ${$final->{$input}}[0]{$t}) {
2204                         #$choices .= "$num, ";
2205                         #push @{${$final->{$input}}[0]{$t}}, $v
2206                     #}
2207                 }
2208
2209                 $choices = substr($choices, 0, -2);
2210                 $choices .= ', ';
2211
2212             } elsif (ref $tag) {
2213                 _selection_add($tag,$final,$input,\$choices,$num);
2214                 #my ($t,$v) = @{$tag->{'all'}};
2215                 #unless (defined ${$final->{$input}}[0]{$t}) {
2216                     #$choices .= "$num, ";
2217                     #push @{${$final->{$input}}[0]{$t}}, $v
2218                 #}
2219             }
2220         }
2221         $choices = substr($choices, 0, -2) if $choices;
2222         if ($final) {
2223             print "\nYou have chosen the following tags:\n$choices\n";
2224             print "This will be written to the config file as:\n";
2225             print Dump $final;
2226             print "\nIs this correct? (y or n)\n";
2227         } else { print "\nInvalid selection. Please try again\n" }
2228     }
2229     push @{$YAML->{$input}}, $final->{$input};
2230     conf_write();
2231 }
2232
2233 #  words_tag() splits each tag value string into multiple words so that the
2234 #  user can select the parts he/she wants to use for curation
2235 #  it can tag 702 (a - zz) separate words; this should be enough
2236
2237 sub words_tag {
2238
2239     my ($feat, $entry) = @_;
2240
2241     my @tags;
2242
2243     @tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]});
2244     my $i = 3;
2245     foreach my $tag ($feat->all_tags) {
2246         foreach my $value ($feat->each_tag_value($tag)) {
2247
2248             my ($string, $tagged_string);
2249
2250             my @words = split(/(?=\w+?)/, $value);
2251
2252             my $pos = 0;
2253
2254
2255             foreach my $word (@words) {
2256
2257                 (my $sanitized_word = $word) =~ s/\W+?//g;
2258                 $string .= $word;
2259
2260                 my $lead = int($pos/ALPHABET_DIVISOR);
2261                 my $lag = $pos % ALPHABET_DIVISOR;
2262
2263                 my $a =  $lead ? ${(ALPHABET)}[$lead-1] : '';
2264                 $a .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2265
2266                 $tagged_string .= " ($a) $word";
2267
2268                 $tags[$i]{$a} = [$tag, $sanitized_word];
2269                 $pos++;
2270             }
2271
2272             $value = $tagged_string if scalar(@words) > 1;
2273
2274             $$entry .= "[$i] /$tag=\"$value\"\r";
2275
2276             $tags[$i]{'all'} = [$tag, $string];
2277         }
2278         $i++;
2279     }
2280
2281     return @tags;
2282
2283 }
2284
2285 sub alpha_expand {
2286
2287     my ($dangling_alphas, $selections) = @_;
2288
2289     if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {
2290
2291         my $digit = $1;
2292         push @$selections, $digit if $digit;
2293
2294         my $start = $2;
2295         my $stop = $3;
2296
2297         my @starts = split('', $start);
2298         my @stops = split('', $stop);
2299
2300         my ($final_start, $final_stop);
2301
2302         for ([\$final_start, \@starts], [\$final_stop, \@stops]) {
2303
2304             my ($final, $splits) = @$_;
2305
2306             my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]};
2307             my $rem;
2308
2309
2310             if ($$splits[1]) {
2311                 $rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]};
2312                 $int++
2313             } else {
2314                 $rem = $int;
2315                 $int = 0;
2316             }
2317
2318
2319             $$final = $int * ALPHABET_DIVISOR;
2320             $$final += $rem;
2321
2322         }
2323
2324         my $last_number = pop @$selections;
2325         for my $pos ($final_start..$final_stop) {
2326             my $lead = int($pos/ALPHABET_DIVISOR);
2327             my $lag = $pos % ALPHABET_DIVISOR;
2328             my $alpha =  $lead ? ${(ALPHABET)}[$lead-1] : '';
2329             $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2330             push @$selections, $last_number.$alpha;
2331         }
2332
2333     } elsif (defined($dangling_alphas)) {
2334         if ($dangling_alphas =~ /^\d/) {
2335             push @$selections, $dangling_alphas;
2336         } elsif ($dangling_alphas =~ /^\D/) {
2337             #print "$dangling_alphas ".Dumper @$selections;
2338             my $last_number = pop @$selections;
2339             $last_number ||= '';
2340             push @$selections, $last_number.$dangling_alphas;
2341             #$$selections[-1] .= $dangling_alphas;
2342         }
2343     }
2344
2345 }
2346
2347 sub _selection_add {
2348
2349     my ($tag, $final, $input, $choices, $num) = @_;
2350     my ($t,$v) = @{$tag->{'all'}};
2351     unless (defined ${$final->{$input}}[0]{$t}) {
2352         $$choices .= "$num, ";
2353         push @{${$final->{$input}}[0]{$t}}, $v
2354     }
2355
2356 }