bin/bp_genbank2gff3

   1 #!/usr/bin/perl
   2
   3
   4
   5 =pod
   6
   7 =head1 NAME
   8
   9 bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3
  10
  11 =head1 SYNOPSIS
  12
  13   bp_genbank2gff3.pl [options] filename(s)
  14
  15   # process a directory containing GenBank flatfiles
  16   perl bp_genbank2gff3.pl --dir path_to_files --zip
  17
  18   # process a single file, ignore explicit exons and introns
  19   perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz
  20
  21   # process a list of files
  22   perl bp_genbank2gff3.pl *gbk.gz
  23
  24   # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader
  25   curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \
  26   | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \
  27   | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata
  28
  29     Options:
  30         --noinfer  -r  don't infer exon/mRNA subfeatures
  31         --conf     -i  path to the curation configuration file that contains user preferences
  32                        for Genbank entries (must be YAML format)
  33                        (if --manual is passed without --ini, user will be prompted to
  34                         create the file if any manual input is saved)
  35         --sofile  -l  path to to the so.obo file to use for feature type mapping
  36                        (--sofile live will download the latest online revision)
  37         --manual   -m  when trying to guess the proper SO term, if more than
  38                        one option matches the primary tag, the converter will
  39                        wait for user input to choose the correct one
  40                        (only works with --sofile)
  41         --dir      -d  path to a list of genbank flatfiles
  42         --outdir   -o  location to write GFF files (can be 'stdout' or '-' for pipe)
  43         --zip      -z  compress GFF3 output files with gzip
  44         --summary  -s  print a summary of the features in each contig
  45         --filter   -x  genbank feature type(s) to ignore
  46         --split    -y  split output to separate GFF and fasta files for
  47                        each genbank record
  48         --nolump   -n  separate file for each reference sequence
  49                        (default is to lump all records together into one
  50                        output file for each input file)
  51         --ethresh  -e  error threshold for unflattener
  52                        set this high (>2) to ignore all unflattener errors
  53         --[no]CDS  -c  Keep CDS-exons, or convert to alternate gene-RNA-protein-exon
  54                        model. --CDS is default. Use --CDS to keep default GFF gene model,
  55                        use --noCDS to convert to g-r-p-e.
  56         --format   -f  Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work
  57                        (GenBank is default)
  58         --GFF_VERSION  3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available
  59         --quiet        don't talk about what is being processed
  60         --typesource   SO sequence type for source (e.g. chromosome; region; contig)
  61         --help     -h  display this message
  62
  63
  64 =head1 DESCRIPTION
  65
  66 This script uses Bio::SeqFeature::Tools::Unflattener and
  67 Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene
  68 containment hierarchies mapped for optimal display in gbrowse.
  69
  70 The input files are assumed to be gzipped GenBank flatfiles for refseq
  71 contigs.  The files may contain multiple GenBank records.  Either a
  72 single file or an entire directory can be processed.  By default, the
  73 DNA sequence is embedded in the GFF but it can be saved into separate
  74 fasta file with the --split(-y) option.
  75
  76 If an input file contains multiple records, the default behaviour is
  77 to dump all GFF and sequence to a file of the same name (with .gff
  78 appended).  Using the 'nolump' option will create a separate file for
  79 each genbank record.  Using the 'split' option will create separate
  80 GFF and Fasta files for each genbank record.
  81
  82
  83 =head2 Notes
  84
  85 =head3 'split' and 'nolump' produce many files
  86
  87 In cases where the input files contain many GenBank records (for
  88 example, the chromosome files for the mouse genome build), a very
  89 large number of output files will be produced if the 'split' or
  90 'nolump' options are selected.  If you do have lists of files E<gt> 6000,
  91 use the --long_list option in bp_bulk_load_gff.pl or
  92 bp_fast_load_gff.pl to load the gff and/ or fasta files.
  93
  94 =head3 Designed for RefSeq
  95
  96 This script is designed for RefSeq genomic sequence entries.  It may
  97 work for third party annotations but this has not been tested.
  98 But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl
  99 if you don't mind some gene model unflattener errors (dgg).
 100
 101 =head3 G-R-P-E Gene Model
 102
 103 Don Gilbert worked this over with needs to produce GFF3 suited to
 104 loading to GMOD Chado databases.  Most of the changes I believe are
 105 suited for general use.  One main chado-specific addition is the
 106   --[no]cds2protein  flag
 107
 108 My favorite GFF is to set the above as ON by default (disable with --nocds2prot)
 109 For general use it probably should be OFF, enabled with --cds2prot.
 110
 111 This writes GFF with an alternate, but useful Gene model,
 112 instead of the consensus model for GFF3
 113
 114   [ gene > mRNA> (exon,CDS,UTR) ]
 115
 116 This alternate is
 117
 118   gene > mRNA > polypeptide > exon
 119
 120 means the only feature with dna bases is the exon.  The others
 121 specify only location ranges on a genome.  Exon of course is a child
 122 of mRNA and protein/peptide.
 123
 124 The protein/polypeptide feature is an important one, having all the
 125 annotations of the GenBank CDS feature, protein ID, translation, GO
 126 terms, Dbxrefs to other proteins.
 127
 128 UTRs, introns, CDS-exons are all inferred from the primary exon bases
 129 inside/outside appropriate higher feature ranges.   Other special gene
 130 model features remain the same.
 131
 132 Several other improvements and bugfixes, minor but useful are included
 133
 134   * IO pipes now work:
 135     curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ...
 136
 137   * GenBank main record fields are added to source feature, e.g. organism, date,
 138     and the sourcetype, commonly chromosome for  genomes, is used.
 139
 140   * Gene Model handling for ncRNA, pseudogenes are added.
 141
 142   * GFF header is cleaner, more informative.
 143     --GFF_VERSION flag allows choice of v2 as well as default v3
 144
 145   * GFF ##FASTA inclusion is improved, and
 146     CDS translation sequence is moved to FASTA records.
 147
 148   * FT -> GFF attribute mapping is improved.
 149
 150   * --format choice of SeqIO input formats (GenBank default).
 151     Uniprot/Swissprot and EMBL work and produce useful GFF.
 152
 153   * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions
 154       and more flexible usage.
 155
 156 =head1 TODO
 157
 158 =head2 Are these additions desired?
 159
 160  * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY
 161  * handle Entrezgene, other non-sequence SeqIO structures (really should change
 162     those parsers to produce consistent annotation tags).
 163
 164 =head2 Related bugfixes/tests
 165
 166 These items from Bioperl mail were tested (sample data generating
 167 errors), and found corrected:
 168
 169  From: Ed Green <green <at> eva.mpg.de>
 170  Subject: genbank2gff3.pl on new human RefSeq
 171  Date: 2006-03-13 21:22:26 GMT
 172    -- unspecified errors (sample data works now).
 173
 174  From: Eric Just <e-just <at> northwestern.edu>
 175  Subject: genbank2gff3.pl
 176  Date: 2007-01-26 17:08:49 GMT
 177    -- bug fixed in genbank2gff3 for multi-record handling
 178
 179 This error is for a /trans_splice gene that is hard to handle, and
 180 unflattner/genbank2 doesn't
 181
 182  From: Chad Matsalla <chad <at> dieselwurks.com>
 183  Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?
 184  Date: 2005-07-15 19:51:48 GMT
 185
 186 =head1 AUTHOR
 187
 188 Sheldon McKay (mckays@cshl.edu)
 189
 190 Copyright (c) 2004 Cold Spring Harbor Laboratory.
 191
 192 =head2 AUTHOR of hacks for GFF2Chado loading
 193
 194 Don Gilbert (gilbertd@indiana.edu)
 195
 196
 197 =cut
 198
 199 use strict;
 200 use warnings;
 201
 202 use Pod::Usage;
 203 use Bio::Root::RootI;
 204 use Bio::SeqIO;
 205 use File::Spec;
 206 use Bio::SeqFeature::Tools::Unflattener;
 207 use Bio::SeqFeature::Tools::TypeMapper;
 208 use Bio::SeqFeature::Tools::IDHandler;
 209 use Bio::Location::SplitLocationI;
 210 use Bio::Location::Simple;
 211 use Bio::Tools::GFF;
 212 use Getopt::Long;
 213 use List::Util qw(first);
 214 use Bio::OntologyIO;
 215 use YAML qw(Dump LoadFile DumpFile);
 216 use File::Basename;
 217
 218 use vars qw/$split @filter $zip $outdir $help $ethresh
 219             $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT
 220             $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE
 221             $file @files $dir $summary $nolump
 222             $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION
 223             $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;
 224
 225 use constant SO_URL =>
 226     'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';
 227 use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)];
 228 use constant ALPHABET_TO_NUMBER => {
 229     a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8,
 230     j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16,
 231     r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24,
 232     z => 25,
 233     };
 234 use constant ALPHABET_DIVISOR => 26;
 235 use constant GM_NEW_TOPLEVEL => 2;
 236 use constant GM_NEW_PART => 1;
 237 use constant GM_DUP_PART => 0;
 238 use constant GM_NOT_PART => -1;
 239
 240 # Options cycle in multiples of 2 because of left side/right side pairing.
 241 # You can make this number odd, but displayed matches will still round up
 242 use constant OPTION_CYCLE => 6;
 243
 244 $GFF_VERSION = 3; # allow v2 ...
 245 $verbose = 1; # right default? -nov to turn off
 246
 247 # dgg: change the gene model to  Gene/mRNA/Polypeptide/exons...
 248 my $CDSkeep= 1; # default should be ON (prior behavior), see gene_features()
 249 my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep;
 250   # protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support
 251
 252 my $FORMAT="GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID **
 253 my $SOURCEID= $FORMAT; # "UniProt" "GenBank"  "EMBL" should work
 254    # other Bio::SeqIO formats may work.  TEST: EntrezGene < problematic tags; InterPro  KEGG
 255
 256
 257 my %TAG_MAP = (
 258   db_xref => 'Dbxref',
 259   name => 'Name',
 260   note => 'Note', # also pull GO: ids into Ontology_term
 261   synonym => 'Alias',
 262   symbol => 'Alias',  # is symbol still used?
 263   # protein_id => 'Dbxref', also seen Dbxref tags: EC_number
 264   # translation: handled in gene_features
 265 );
 266
 267
 268 $| = 1;
 269 my $quiet= !$verbose;
 270 my $ok= GetOptions( 'd|dir|input:s'   => \$dir,
 271             'z|zip'     => \$zip,
 272             'h|help'    => \$help,
 273             's|summary' => \$summary,
 274             'r|noinfer' => \$noinfer,
 275             'i|conf=s' => \$CONF,
 276             'sofile=s' => \$SO_FILE,
 277             'm|manual' => \$MANUAL,
 278             'o|outdir|output:s'=> \$outdir,
 279             'x|filter:s'=> \@filter,
 280             'y|split'   => \$split,
 281             "ethresh|e=s"=>\$ethresh,
 282             'c|CDS!'    => \$CDSkeep,
 283             'f|format=s' => \$FORMAT,
 284             'typesource=s' => \$source_type,
 285             'GFF_VERSION=s' => \$GFF_VERSION,
 286             'quiet!'    => \$quiet, # swap quiet to verbose
 287             'DEBUG!'    => \$DEBUG,
 288             'n|nolump'  => \$nolump);
 289
 290 my $lump = 1 unless $nolump || $split;
 291 $verbose= !$quiet;
 292
 293 # look for help request
 294 pod2usage(2) if $help || !$ok;
 295
 296 # keep SOURCEID as-is and change FORMAT for SeqIO types;
 297 # note SeqIO uses file.suffix to guess type; not useful here
 298 $SOURCEID= $FORMAT;
 299 $FORMAT  = "swiss" if $FORMAT =~/UniProt|trembl/;
 300 $verbose =1 if($DEBUG);
 301
 302 # initialize handlers
 303 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1);
 304 $unflattener->error_threshold($ethresh) if $ethresh;
 305 $unflattener->verbose(1) if($DEBUG);
 306 # $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?
 307 # ensembl parsing is still problematic, forget this
 308
 309 my $tm  = Bio::SeqFeature::Tools::TypeMapper->new;
 310 my $idh = Bio::SeqFeature::Tools::IDHandler->new;
 311
 312 # dgg
 313 $source_type ||= "region"; # should really parse from FT.source contents below
 314
 315 #my $FTSOmap = $tm->FT_SO_map();
 316 my $FTSOmap;
 317 my $FTSOsynonyms;
 318
 319 if (defined($SO_FILE) && $SO_FILE eq 'live') {
 320     print "\nDownloading the latest SO file from ".SO_URL."\n\n";
 321     use LWP::UserAgent;
 322     my $ua = LWP::UserAgent->new(timeout => 30);
 323     my $request = HTTP::Request->new(GET => SO_URL);
 324     my $response = $ua->request($request);
 325
 326
 327     if ($response->status_line =~ /200/) {
 328         use File::Temp qw/ tempfile /;
 329         my ($fh, $fn) = tempfile();
 330         print $fh $response->content;
 331         $SO_FILE = $fn;
 332     } else {
 333         print "Couldn't download SO file online...skipping validation.\n"
 334             . "HTTP Status was " . $response->status_line . "\n"
 335             and undef $SO_FILE
 336     }
 337 }
 338
 339 if ($SO_FILE) {
 340
 341
 342     my (%terms, %syn);
 343
 344     my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );
 345     $ONTOLOGY = $parser->next_ontology();
 346
 347     for ($ONTOLOGY->get_all_terms) {
 348         my $feat = $_;
 349
 350         $terms{$feat->name} = $feat->name;
 351         #$terms{$feat->name} = $feat;
 352
 353         my @syn = $_->each_synonym;
 354
 355         push @{$syn{$_}}, $feat->name for @syn;
 356         #push @{$syn{$_}}, $feat for @syn;
 357     }
 358
 359     $FTSOmap = \%terms;
 360     $FTSOsynonyms = \%syn;
 361
 362     my %hardTerms = %{ $tm->FT_SO_map() };
 363     map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms;
 364
 365 } else {
 366     my %terms = %{ $tm->FT_SO_map() };
 367     while (my ($k,$v) = each %terms) {
 368         $FTSOmap->{$k} = ref($v) ? shift @$v : $v;
 369     }
 370 }
 371
 372 $TYPE_MAP = $FTSOmap;
 373 $SYN_MAP = $FTSOsynonyms;
 374
 375
 376 # #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")
 377
 378 # stringify filter list if applicable
 379 my $filter = join ' ', @filter  if @filter;
 380
 381 # determine input files
 382 my $stdin=0; # dgg: let dir == stdin == '-' for pipe use
 383 if ($dir && ($dir eq '-' || $dir eq 'stdin')) {
 384   $stdin=1;  $dir=''; @files=('stdin');
 385
 386 } elsif ( $dir ) {
 387     if ( -d $dir ) {
 388         opendir DIR, $dir or die "could not open $dir for reading: $!";
 389         @files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR;
 390         closedir DIR;
 391     }
 392     else {
 393         die "$dir is not a directory\n";
 394     }
 395 }
 396 else {
 397     @files = @ARGV;
 398     $dir = '';
 399 }
 400
 401 # we should have some files by now
 402 pod2usage(2) unless @files;
 403
 404
 405 my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use
 406 if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) {
 407   warn("std. output chosen: cannot split\n") if($split);
 408   warn("std. output chosen: cannot zip\n") if($zip);
 409   warn("std. output chosen: cannot nolump\n") if($nolump);
 410   $stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip
 411
 412 } elsif ( $outdir && !-e $outdir ) {
 413     mkdir($outdir) or die "could not create directory $outdir: $!\n";
 414 }
 415 elsif ( !$outdir ) {
 416     $outdir = $dir || '.';
 417 }
 418
 419 for my $file ( @files ) {
 420     # dgg ; allow 'stdin' / '-' input ?
 421     chomp $file;
 422     die "$! $file" unless($stdin || -e $file);
 423     print "# Input: $file\n" if($verbose);
 424
 425     my ($lump_fh, $lumpfa_fh, $outfile, $outfa);
 426     if ($stdout) {
 427       $lump_fh= *STDOUT; $lump="stdout$$";
 428       $outfa= "stdout$$.fa";  # this is a temp file ... see below
 429       open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
 430
 431     } elsif ( $lump ) {
 432         my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);
 433         $lump = File::Spec->catfile($outdir, $fileonly.'.gff');
 434         ($outfa = $lump) =~ s/\.gff/\.fa/;
 435         open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n";
 436         open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
 437
 438     }
 439
 440     # open input file, unzip if req'd
 441     if ($stdin) {
 442        *FH = *STDIN;
 443     } elsif ( $file =~ /\.gz/ ) {
 444         open FH, "gunzip -c $file |";
 445     }
 446     else {
 447         open FH, '<', $file;
 448     }
 449
 450     my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);
 451     my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );
 452
 453     while ( my $seq = $in->next_seq() ) {
 454         my $seq_name = $seq->accession_number;
 455         my $end = $seq->length;
 456         my @to_print;
 457
 458         # arrange disposition of GFF output
 459         $outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff');
 460         my $out;
 461
 462         if ( $lump ) {
 463             $outfile = $lump;
 464             $out = $lump_fh;
 465         }
 466         else {
 467             $outfile = File::Spec->catfile($outdir, $seq_name.'.gff');
 468             open $out, ">$outfile";
 469         }
 470
 471         # filter out unwanted features
 472         my $source_feat= undef;
 473         my @source= filter($seq); $source_feat= $source[0];
 474
 475         ($source_type,$source_feat)=
 476           getSourceInfo( $seq, $source_type, $source_feat ) ;
 477           # always; here we build main prot $source_feat; # if @source;
 478
 479         # abort if there are no features
 480         warn "$seq_name has no features, skipping\n" and next
 481             if !$seq->all_SeqFeatures;
 482
 483
 484         $FTSOmap->{'source'} = $source_type;
 485         ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features
 486
 487         # construct a GFF header
 488         # add: get source_type from attributes of source feature? chromosome=X tag
 489         # also combine 1st ft line here with source ft from $seq ..
 490         my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat);
 491         print $out $header;
 492         print "# working on $info\n" if($verbose);
 493
 494         # unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..
 495         unflatten_seq($seq);
 496
 497         # Note that we use our own get_all_SeqFeatures function
 498         # to rescue cloned exons
 499
 500         @GFF_LINE_FEAT = ();
 501         for my $feature ( get_all_SeqFeatures($seq) ) {
 502
 503             my $method = $feature->primary_tag;
 504             next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type);
 505
 506             $feature->seq_id($seq->id) unless($feature->seq_id);
 507             $feature->source_tag($SOURCEID);
 508
 509             # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;
 510             ## also, pull any GO:000 ids from /note tag and put into Ontology_term
 511             maptags2gff($feature);
 512
 513             # current gene name.  The unflattened gene features should be in order so any
 514             # exons, CDSs, etc that follow will belong to this gene
 515             my $gene_name;
 516             if ( $method eq 'gene' || $method eq 'pseudogene' ) {
 517               @to_print= print_held($out, $gffio, \@to_print);
 518               $gene_id = $gene_name= gene_name($feature);
 519             } else {
 520               $gene_name= gene_name($feature);
 521             }
 522
 523             #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx
 524             # be converted to or added as  Name=xxx (if not ID= or as well)
 525             ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags
 526             convert_to_name($feature);
 527
 528             ## dgg: extended to protein|polypeptide
 529             ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes
 530             ## in yeast have that genbank tag; why?
 531             ## these include pseudogene ...
 532
 533             ## Note we also have mapped types to SO, so these RNA's are now transcripts:
 534             # pseudomRNA => "pseudogenic_transcript",
 535             # pseudotranscript" => "pseudogenic_transcript",
 536             # misc_RNA=>'processed_transcript',
 537
 538             warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;
 539
 540             if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/
 541                || ( $gene_id && $gene_name eq $gene_id ) ) {
 542
 543                 my $action = gene_features($feature, $gene_id, $gene_name);  # -1, 0, 1, 2 result
 544                 if ($action == GM_DUP_PART) {
 545                   # ignore, this is dupl. exon with new parent ...
 546
 547                 } elsif ($action == GM_NOT_PART) {
 548                   add_generic_id( $feature, $gene_name, "nocount");
 549                   my $gff = $gffio->gff_string($feature);
 550                   push @GFF_LINE_FEAT, $feature;
 551                   #print $out "$gff\n" if $gff;
 552
 553                 } elsif ($action > 0) {
 554                  # hold off print because exon etc. may get 2nd, 3rd parents
 555                   @to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);
 556                   push(@to_print, $feature);
 557                 }
 558
 559             }
 560
 561             # otherwise handle as generic feats with IDHandler labels
 562             else {
 563                 add_generic_id( $feature, $gene_name, "");
 564                 my $gff= $gffio->gff_string($feature);
 565                 push @GFF_LINE_FEAT, $feature;
 566                 #print $out "$gff\n" if $gff;
 567             }
 568         }
 569
 570         # don't like doing this after others; do after each new gene id?
 571         @to_print= print_held($out, $gffio, \@to_print);
 572
 573         gff_validate(@GFF_LINE_FEAT);
 574
 575         for my $feature (@GFF_LINE_FEAT) {
 576             my $gff= $gffio->gff_string($feature);
 577             print $out "$gff\n" if $gff;
 578         }
 579
 580         # deal with the corresponding DNA
 581         my ($fa_out,$fa_outfile);
 582         my $dna = $seq->seq;
 583         if($dna || %proteinfa) {
 584           $method{'RESIDUES'} += length($dna);
 585           $dna    =~ s/(\S{60})/$1\n/g;
 586           $dna   .= "\n";
 587
 588           if ($split) {
 589               $fa_outfile = $outfile;
 590               $fa_outfile =~ s/gff$/fa/;
 591               open $fa_out, ">$fa_outfile" or die $!;
 592               print $fa_out ">$seq_name\n$dna" if $dna;
 593               foreach my $aid (sort keys %proteinfa) {
 594                 my $aa= delete $proteinfa{$aid};
 595                 $method{'RESIDUES(tr)'} += length($aa);
 596                 $aa =~ s/(\S{60})/$1\n/g;
 597                 print $fa_out ">$aid\n$aa\n";
 598                 }
 599
 600           }
 601           else {
 602           ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out
 603           ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk
 604           ## maybe write this to temp .fa then cat to end of lumped gff $out
 605               print $lumpfa_fh ">$seq_name\n$dna" if $dna;
 606               foreach my $aid (sort keys %proteinfa) {
 607                 my $aa= delete $proteinfa{$aid};
 608                 $method{'RESIDUES(tr)'} += length($aa);
 609                 $aa =~ s/(\S{60})/$1\n/g;
 610                 print $lumpfa_fh ">$aid\n$aa\n";
 611                 }
 612           }
 613
 614         %proteinfa=();
 615         }
 616
 617         if ( $zip && !$lump ) {
 618             system "gzip -f $outfile";
 619             system "gzip -f $fa_outfile" if($fa_outfile);
 620             $outfile .= '.gz';
 621             $fa_outfile .= '.gz' if $split;
 622         }
 623
 624         # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA
 625         print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);
 626         print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump);
 627
 628         # dgg: moved to after all inputs; here it prints cumulative sum for each record
 629         #if ( $summary ) {
 630         #      print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
 631         #
 632         #      for ( keys %method ) {
 633         #          print "# $_  $method{$_}\n";
 634         #      }
 635         #      print "# \n";
 636         #  }
 637
 638     }
 639
 640     print "# GFF3 saved to $outfile\n" if( $verbose && $lump);
 641     if ( $summary ) {
 642         print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
 643         for ( keys %method ) {
 644             print "# $_  $method{$_}\n";
 645         }
 646         print "# \n";
 647     }
 648
 649     ## FIXME for piped output w/ split FA files ...
 650     close($lumpfa_fh) if $lumpfa_fh;
 651     if (!$split && $outfa && $lump_fh) {
 652         print $lump_fh "##FASTA\n"; # GFF3 spec
 653         open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!";
 654         while( <$lumpfa_fh>) {
 655             print $lump_fh $_;
 656         } # is $lump_fh still open?
 657         close($lumpfa_fh);
 658         unlink($outfa);
 659     }
 660
 661
 662     if ( $zip && $lump ) {
 663         system "gzip -f $lump";
 664     }
 665
 666     close FH;
 667 }
 668
 669
 670
 671
 672
 673 sub typeorder {
 674   return 1 if ($_[0] =~ /gene/);
 675   return 2 if ($_[0] =~ /RNA|transcript/);
 676   return 3 if ($_[0] =~ /protein|peptide/);
 677   return 4 if ($_[0] =~ /exon|CDS/);
 678   return 3; # default before exon (smallest part)
 679 }
 680
 681 sub sort_by_feattype {
 682   my($at,$bt)= ($a->primary_tag, $b->primary_tag);
 683   return (typeorder($at) <=> typeorder($bt))
 684     or ($at cmp $bt);
 685     ## or ($a->name() cmp $b->name());
 686 }
 687
 688 sub print_held {
 689   my($out,$gffio,$to_print)= @_;
 690   return unless(@$to_print);
 691   @$to_print = sort sort_by_feattype  @$to_print; # put exons after mRNA, otherwise chado loader chokes
 692   while ( my $feature = shift @$to_print) {
 693     my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode
 694     push @GFF_LINE_FEAT, $feature;
 695     #print $out "$gff\n";
 696     }
 697   return (); # @to_print
 698 }
 699
 700 sub maptags2gff {
 701   my $f = shift;
 702   ## should copy/move locus_tag to Alias, if not ID/Name/Alias already
 703   # but see below /gene /locus_tag usage
 704   foreach my $tag (keys %TAG_MAP) {
 705     if ($f->has_tag($tag)) {
 706       my $newtag= $TAG_MAP{$tag};
 707       my @v= $f->get_tag_values($tag);
 708       $f->remove_tag($tag);
 709       $f->add_tag_value($newtag,@v);
 710
 711       ## also, pull any GO:000 ids from /note tag and put into Ontology_term
 712       ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]'
 713       if ($tag eq 'note') {
 714         map { s/\[goid (\d+)/\[goid GO:$1/g; } @v;
 715         my @go= map { m/(GO:\d+)/g } @v;
 716         $f->add_tag_value('Ontology_term',@go) if(@go);
 717         }
 718
 719       }
 720     }
 721 }
 722
 723
 724 sub getSourceInfo {
 725   my ($seq, $source_type, $sf) = @_;
 726
 727   my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i);
 728   my $is_gene = ($SOURCEID =~/entrezgene/i);
 729   my $is_rich = (ref($seq) =~ /RichSeq/);
 730   my $seq_name= $seq->accession_number();
 731
 732   unless($sf) { # make one
 733     $source_type=  $is_swiss ? $PROTEIN_TYPE
 734         : $is_gene ? "eneg" # "gene"  # "region" #
 735         : $is_rich ? $seq->molecule : $source_type;
 736     $sf = Bio::SeqFeature::Generic->direct_new();
 737
 738     my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1);
 739     my $loc= $seq->can('location') ? $seq->location()
 740             :  new Bio::Location::Simple( -start => $start, -end => $len);
 741     $sf->location( $loc );
 742     $sf->primary_tag($source_type);
 743     $sf->source_tag($SOURCEID);
 744     $sf->seq_id( $seq_name);
 745     #? $sf->display_name($seq->id()); ## Name or Alias ?
 746     $sf->add_tag_value( Alias => $seq->id()); # unless id == accession
 747     $seq->add_SeqFeature($sf);
 748     ## $source_feat= $sf;
 749   }
 750
 751   if ($sf->has_tag("chromosome")) {
 752     $source_type= "chromosome";
 753     my ($chrname) = $sf->get_tag_values("chromosome");
 754     ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead
 755     ## e.g. Mouse chr 19 has two IDs in NCBI genbank now
 756     $sf->add_tag_value( Alias => $chrname );
 757   }
 758
 759   # pull GB Comment, Description for source ft ...
 760   # add reference - can be long, not plain string...
 761   warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG;
 762   # GenBank   fields: keyword,comment,reference,date_changed
 763   # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM
 764
 765   # is this just for main $seq object or for all seqfeatures ?
 766   my %AnnotTagMap= (
 767       'gene_name' => 'Alias',
 768       'ALIAS_SYMBOL' => 'Alias',  # Entrezgene
 769       'LOCUS_SYNONYM' => 'Alias', #?
 770       'symbol' => 'Alias',
 771       'synonym' => 'Alias',
 772       'dblink' => 'Dbxref',
 773       'product' => 'product',
 774       'Reference' => 'reference',
 775       'OntologyTerm' => 'Ontology_term',
 776       'comment'  => 'Note',
 777       'comment1' => 'Note',
 778       # various map-type locations
 779       # gene accession tag is named per source db !??
 780       # 'Index terms' => keywords ??
 781       );
 782
 783
 784   my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() );
 785   my ($date)= $seq->annotation->get_Annotations("dates")
 786       || $seq->annotation->get_Annotations("update-date")
 787       || $is_rich ? $seq->get_dates() : ();
 788   my ($comment)= $seq->annotation->get_Annotations("comment");
 789   my ($species)= $seq->annotation->get_Annotations("species");
 790   if (!$species
 791        && $seq->can('species')
 792        && defined $seq->species()
 793        && $seq->species()->can('binomial') ) {
 794     $species= $seq->species()->binomial();
 795   }
 796
 797   # update source feature with main GB fields
 798   $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID');
 799   $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note'));
 800   $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism'));
 801   $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1'));
 802   $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date'));
 803
 804   $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene;
 805
 806   foreach my $atag (sort keys %AnnotTagMap) {
 807       my $gtag= $AnnotTagMap{$atag}; next unless($gtag);
 808       my @anno = map{
 809           if (ref $_ && $_->can('get_all_values')) {
 810               split( /[,;] */, join ";", $_->get_all_values)
 811           }
 812           elsif (ref $_ && $_->can('display_text')) {
 813               split( /[,;] */, $_->display_text)
 814           }
 815           elsif (ref $_ && $_->can('value')) {
 816               split( /[,;] */, $_->value)
 817           }
 818           else {
 819              ();
 820           }
 821       } $seq->annotation->get_Annotations($atag);
 822       foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); }
 823   }
 824
 825   #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name');
 826   #$sf->add_tag_value( Alias => $_ ) foreach(@genes);
 827   #
 828   #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all
 829   #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink);
 830
 831   return (wantarray)? ($source_type,$sf) : $source_type; #?
 832 }
 833
 834
 835 sub gene_features {
 836     my ($f, $gene_id, $genelinkID) = @_;
 837     local $_ = $f->primary_tag;
 838     $method{$_}++;
 839
 840     if ( /gene/ ) {
 841         $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!?
 842         $tnum = $rnum= 0; $ncrna_id= $rna_id  = '';
 843         return GM_NEW_TOPLEVEL;
 844
 845     } elsif ( /mRNA/ ) {
 846         return GM_NOT_PART unless $gene_id;
 847         return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
 848         ($rna_id  = $gene_id ) =~ s/gene/mRNA/;
 849         $rna_id   .= '.t0' . ++$tnum;
 850         $f->add_tag_value( ID => $rna_id );
 851         $f->add_tag_value( Parent => $gene_id );
 852
 853     } elsif ( /RNA|transcript/) {
 854         ## misc_RNA here; missing exons ... flattener problem?
 855         #  all of {t,nc,sn}RNA can have gene models now
 856         ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag
 857         ## CDS needs to use mRNA, not misc_RNA, rna_id ...
 858         ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level
 859
 860         if($gene_id) {
 861           return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
 862           ($ncrna_id = $gene_id) =~ s/gene/ncRNA/;
 863           $ncrna_id .= '.r0' . ++$rnum;
 864           $f->add_tag_value( Parent => $gene_id );
 865           $f->add_tag_value( ID => $ncrna_id );
 866         } else {
 867           unless ($f->has_tag('ID')) {
 868             if($genelinkID) {
 869               $f->add_tag_value( ID => $genelinkID  ) ;
 870             } else {
 871               $idh->generate_unique_persistent_id($f);
 872             }
 873           }
 874          ($ncrna_id)= $f->get_tag_values('ID');
 875          return GM_NEW_TOPLEVEL;
 876           # this feat now acts as gene-top-level; need to print @to_print to flush prior exons?
 877         }
 878
 879     } elsif ( /exon/ ) { # can belong to any kind of RNA
 880         return GM_NOT_PART unless ($rna_id||$ncrna_id);
 881         return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
 882         ## we are getting duplicate Parents here, which chokes chado loader, with reason...
 883         ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice
 884         ## check all Parents
 885         for my $expar ($rna_id, $ncrna_id) {
 886           next unless($expar);
 887           if ( $exonpar{$expar} and $f->has_tag('Parent') ) {
 888             my @vals = $f->get_tag_values('Parent');
 889             next if (grep {$expar eq $_} @vals);
 890             }
 891           $exonpar{$expar}++;
 892           $f->add_tag_value( Parent => $expar);
 893           # last; #? could be both
 894         }
 895         # now we can skip cloned exons
 896         # dgg note: multiple parents get added and printed for each unique exon
 897         return GM_DUP_PART if ++$seen{$f} > 1;
 898
 899     } elsif ( /CDS|protein|polypeptide/ ) {
 900         return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ??
 901         return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #??
 902         (my $pro_id = $rna_id) =~ s/\.t/\.p/;
 903
 904         if( ! $CDSkeep && /CDS/) {
 905           $f->primary_tag($PROTEIN_TYPE);
 906
 907           ## duplicate problem is Location ..
 908           if ($f->location->isa("Bio::Location::SplitLocationI")) {
 909             # my($b,$e)=($f->start, $f->end); # is this all we need?
 910             my($b,$e)=(-1,0);
 911             foreach my $l ($f->location->each_Location) {
 912                $b = $l->start if($b<0 || $b > $l->start);
 913                $e = $l->end if($e < $l->end);
 914                }
 915             $f->location( Bio::Location::Simple->new(
 916                 -start => $b, -end => $e, -strand => $f->strand) );
 917             }
 918
 919             $f->add_tag_value( Derives_from => $rna_id );
 920           }
 921           else {
 922             $f->add_tag_value( Parent => $rna_id );
 923           }
 924
 925         $f->add_tag_value( ID => $pro_id );
 926
 927         move_translation_fasta($f, $pro_id);
 928         #if( $f->has_tag('translation')) {
 929         #   my ($aa) = $f->get_tag_values("translation");
 930         #    $proteinfa{$pro_id}= $aa;
 931         #    $f->remove_tag("translation");
 932         #    $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
 933         #}
 934     } elsif ( /region/ ) {
 935         $f->primary_tag('gene_component_region');
 936         $f->add_tag_value( Parent => $gene_id );
 937     } else {
 938         return GM_NOT_PART unless $gene_id;
 939         $f->add_tag_value( Parent => $gene_id );
 940     }
 941
 942     ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;
 943
 944     return GM_NEW_PART;
 945 }
 946
 947 ## was generic_features >  add_generic_id
 948 sub add_generic_id {
 949     my ($f, $ft_name, $flags) = @_;
 950     my $method = $f->primary_tag;
 951     $method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above
 952
 953     if ($f->has_tag('ID')) {
 954
 955     }
 956     elsif ( $f->has_tag($method) ) {
 957         my ($name) = $f->get_tag_values($method);
 958         $f->add_tag_value( ID => "$method:$name" );
 959     }
 960     elsif($ft_name) { # is this unique ?
 961         $f->add_tag_value( ID => $ft_name );
 962     }
 963     else {
 964         $idh->generate_unique_persistent_id($f);
 965     }
 966
 967     move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] )
 968       if($method =~ /CDS/);
 969
 970     # return $io->gff_string($f);
 971 }
 972
 973 sub move_translation_fasta {
 974     my ($f, $ft_id) = @_;
 975     if( $ft_id && $f->has_tag('translation') ) {
 976       my ($aa) = $f->get_tag_values("translation");
 977       if($aa && $aa !~ /^length/) {
 978         $proteinfa{$ft_id}= $aa;
 979         $f->remove_tag("translation");
 980         $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
 981         }
 982     }
 983 }
 984
 985 sub gff_header {
 986     my ($name, $end, $source_type, $source_feat) = @_;
 987     $source_type ||= "region";
 988
 989     my $info = "$source_type:$name";
 990     my $head = "##gff-version $GFF_VERSION\n".
 991                "##sequence-region $name 1 $end\n".
 992                "# conversion-by bp_genbank2gff3.pl\n";
 993     if ($source_feat) {
 994         ## dgg: these header comment fields are not useful when have multi-records, diff organisms
 995         for my $key (qw(organism Note date)) {
 996             my $value;
 997             if ($source_feat->has_tag($key)) {
 998                 ($value) = $source_feat->get_tag_values($key);
 999             }
1000             if ($value) {
1001                 $head .= "# $key $value\n";
1002                 $info .= ", $value";
1003             }
1004         }
1005         $head = "" if $didheader;
1006     } else {
1007         $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";
1008     }
1009     $didheader++;
1010     return (wantarray) ? ($head,$info) : $head;
1011 }
1012
1013 sub unflatten_seq {
1014     my $seq = shift;
1015
1016     ## print "# working on $source_type:", $seq->accession, "\n";
1017     my $uh_oh = "Possible gene unflattening error with" .  $seq->accession_number .
1018                 ": consult STDERR\n";
1019
1020     eval {
1021         $unflattener->unflatten_seq( -seq => $seq,
1022                                      -noinfer => $noinfer,
1023                                      -use_magic => 1 );
1024     };
1025
1026     # deal with unflattening errors
1027     if ( $@ ) {
1028         warn $seq->accession_number . " Unflattening error:\n";
1029         warn "Details: $@\n";
1030         print "# ".$uh_oh;
1031     }
1032
1033     return 0 if !$seq || !$seq->all_SeqFeatures;
1034
1035     # map feature types to the sequence ontology
1036     ## $tm->map_types_to_SO( -seq => $seq );
1037     #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg
1038
1039     map_types(
1040         $tm,
1041         -seq => $seq,
1042         -type_map  => $FTSOmap,
1043         -syn_map  => $FTSOsynonyms,
1044         -undefined => "region"
1045     ); #nml
1046
1047 }
1048
1049 sub filter {
1050     my $seq = shift;
1051     ## return unless $filter;
1052     my @feats;
1053     my @sources; # dgg; pick source features here; only 1 always?
1054     if ($filter) {
1055       for my $f ( $seq->remove_SeqFeatures ) {
1056         my $m = $f->primary_tag;
1057         push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1058         push @feats, $f unless $filter =~ /$m/i;
1059       }
1060       $seq->add_SeqFeature($_) foreach @feats;
1061     } else {
1062       for my $f ( $seq->get_SeqFeatures ){
1063         my $m = $f->primary_tag;
1064         push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1065         }
1066     }
1067
1068     return @sources;
1069 }
1070
1071
1072 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
1073 # changed to filter out cloned features.  We have to implement the old
1074 # method.  These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
1075 sub get_all_SeqFeatures  {
1076     my $seq = shift;
1077     my @flatarr;
1078
1079     foreach my $feat ( $seq->get_SeqFeatures ){
1080         push(@flatarr,$feat);
1081         _add_flattened_SeqFeatures(\@flatarr,$feat);
1082     }
1083     return @flatarr;
1084 }
1085
1086 sub gene_name {
1087     my $g = shift;
1088     my $gene_id = ''; # zero it;
1089
1090     if ($g->has_tag('locus_tag')) {
1091         ($gene_id) = $g->get_tag_values('locus_tag');
1092     }
1093
1094     elsif ($g->has_tag('gene')) {
1095         ($gene_id) = $g->get_tag_values('gene');
1096     }
1097     elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene
1098         ($gene_id) = $g->get_tag_values('ID');
1099     }
1100
1101     ## See Unflattener comment:
1102     # on rare occasions, records will have no /gene or /locus_tag
1103     # but it WILL have /product tags. These serve the same purpose
1104     # for grouping. For an example, see AY763288 (also in t/data)
1105     # eg. product=tRNA-Asp ;  product=similar to crooked neck protein
1106     elsif ($g->has_tag('product')) {
1107         my ($name)= $g->get_tag_values('product');
1108         ($gene_id) = $name unless($name =~ / /); # a description not name
1109     }
1110
1111     ## dgg; also handle transposon=xxxx ID/name
1112     # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746
1113     elsif ($g->has_tag('transposon')) {
1114         my ($name)= $g->get_tag_values('transposon');
1115         ($gene_id) = $name unless($name =~ / /); # a description not name
1116     }
1117
1118     return $gene_id;
1119 }
1120
1121 # same list as gene_name .. change tag to generic Name
1122 sub convert_to_name {
1123     my $g = shift;
1124     my $gene_id = ''; # zero it;
1125
1126     if ($g->has_tag('gene')) {
1127         ($gene_id) = $g->get_tag_values('gene');
1128         $g->remove_tag('gene');
1129         $g->add_tag_value('Name', $gene_id);
1130     }
1131     elsif ($g->has_tag('locus_tag')) {
1132         ($gene_id) = $g->get_tag_values('locus_tag');
1133         $g->remove_tag('locus_tag');
1134         $g->add_tag_value('Name', $gene_id);
1135     }
1136
1137     elsif ($g->has_tag('product')) {
1138         my ($name)= $g->get_tag_values('product');
1139         ($gene_id) = $name unless($name =~ / /); # a description not name
1140         ## $g->remove_tag('product');
1141         $g->add_tag_value('Name', $gene_id);
1142     }
1143
1144     elsif ($g->has_tag('transposon')) {
1145         my ($name)= $g->get_tag_values('transposon');
1146         ($gene_id) = $name unless($name =~ / /); # a description not name
1147         ## $g->remove_tag('transposon');
1148         $g->add_tag_value('Name', $gene_id);
1149     }
1150     elsif ($g->has_tag('ID')) {
1151         my ($name)= $g->get_tag_values('ID');
1152         $g->add_tag_value('Name', $name);
1153     }
1154     return $gene_id;
1155 }
1156
1157
1158 sub _add_flattened_SeqFeatures  {
1159     my ($arrayref,$feat) = @_;
1160     my @subs = ();
1161
1162     if ($feat->isa("Bio::FeatureHolderI")) {
1163         @subs = $feat->get_SeqFeatures;
1164     }
1165     elsif ($feat->isa("Bio::SeqFeatureI")) {
1166         @subs = $feat->sub_SeqFeature;
1167     }
1168     else {
1169         warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
1170             "Don't know how to flatten.";
1171     }
1172
1173     for my $sub (@subs) {
1174         push(@$arrayref,$sub);
1175         _add_flattened_SeqFeatures($arrayref,$sub);
1176     }
1177
1178 }
1179
1180 sub map_types {
1181
1182     my ($self, @args) = @_;
1183
1184     my($sf, $seq, $type_map, $syn_map, $undefmap) =
1185         $self->_rearrange([qw(FEATURE
1186                     SEQ
1187                     TYPE_MAP
1188                     SYN_MAP
1189                     UNDEFINED
1190                     )],
1191                 @args);
1192
1193     if (!$sf && !$seq) {
1194         $self->throw("you need to pass in either -feature or -seq");
1195     }
1196
1197     my @sfs = ($sf);
1198     if ($seq) {
1199         $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
1200         @sfs = $seq->get_all_SeqFeatures;
1201     }
1202     $type_map = $type_map || $self->typemap; # dgg: was type_map;
1203     foreach my $feat (@sfs) {
1204
1205         $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");
1206         $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");
1207
1208         my $primary_tag = $feat->primary_tag;
1209
1210         #if ($primary_tag =~ /^pseudo(.*)$/) {
1211         #    $primary_tag = $1;
1212         #    $feat->primary_tag($primary_tag);
1213         #}
1214
1215         my $mtype = $type_map->{$primary_tag};
1216         if ($mtype) {
1217             if (ref($mtype)) {
1218                 if (ref($mtype) eq 'ARRAY') {
1219                     my $soID;
1220                     ($mtype, $soID) = @$mtype;
1221
1222                     if ($soID && ref($ONTOLOGY)) {
1223                         my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);
1224                         $mtype = $term->name if $term;
1225                     }
1226                     # if SO ID is undefined AND we have an ontology to search, we want to delete
1227                     # the feature type hash entry in order to force a fuzzy search
1228                     elsif (! defined $soID && ref($ONTOLOGY)) {
1229                         undef $mtype;
1230                         delete $type_map->{$primary_tag};
1231                     }
1232                     elsif ($undefmap && $mtype eq 'undefined') { # dgg
1233                         $mtype= $undefmap;
1234                     }
1235
1236                     $type_map->{$primary_tag} = $mtype if $mtype;
1237                 }
1238                 elsif (ref($mtype) eq 'CODE') {
1239                     $mtype = $mtype->($feat);
1240                 }
1241                 else {
1242                     $self->throw('must be scalar or CODE ref');
1243                 }
1244             }
1245             elsif ($undefmap && $mtype eq 'undefined') { # dgg
1246                 $mtype= $undefmap;
1247             }
1248             $feat->primary_tag($mtype);
1249         }
1250
1251         if ($CONF) {
1252             conf_read();
1253             my %perfect_matches;
1254             while (my ($p_tag,$rules) = each %$YAML) {
1255                 RULE:
1256                 for my $rule (@$rules) {
1257                     for my $tags (@$rule) {
1258                         while (my ($tag,$values) = each %$tags) {
1259                             for my $value (@$values) {
1260                                 if ($feat->has_tag($tag)) {
1261                                     for ($feat->get_tag_values($tag)) {
1262                                         next RULE unless $_ =~ /\Q$value\E/;
1263                                     }
1264                                 } elsif ($tag eq 'primary_tag') {
1265                                     next RULE unless $value eq
1266                                         $feat->primary_tag;
1267                                 } elsif ($tag eq 'location') {
1268                                     next RULE unless $value eq
1269                                         $feat->start.'..'.$feat->end;
1270                                 } else { next RULE }
1271                             }
1272                         }
1273                     }
1274                     $perfect_matches{$p_tag}++;
1275                 }
1276             }
1277             if (scalar(keys %perfect_matches) == 1) {
1278                 $mtype = $_ for keys %perfect_matches;
1279             } elsif (scalar(keys %perfect_matches) > 1) {
1280                 warn "There are conflicting rules in the config file for the" .
1281                      " following types: ";
1282                 warn "\t$_\n" for keys %perfect_matches;
1283                 warn "Until conflict resolution is built into the converter," .
1284                      " you will have to manually edit the config file to remove the" .
1285                      " conflict. Sorry :(. Skipping user preference for this entry";
1286                 sleep(2);
1287             }
1288         }
1289
1290         if ( ! $mtype  && $syn_map) {
1291             if ($feat->has_tag('note')) {
1292
1293                 my @all_matches;
1294
1295                 my @note = $feat->each_tag_value('note');
1296
1297                 for my $k (keys %$syn_map) {
1298
1299                     if ($k =~ /"(.+)"/) {
1300
1301                         my $syn = $1;
1302
1303                         for my $note (@note) {
1304
1305                             # look through the notes to see if the description
1306                             # is an exact match for synonyms
1307                             if ( $syn eq $note ) {
1308
1309                                 my @map = @{$syn_map->{$k}};
1310
1311
1312                                 my $best_guess = $map[0];
1313
1314                                 unshift @{$all_matches[-1]}, [$best_guess];
1315
1316                                 $mtype = $MANUAL
1317                                     ? manual_curation($feat, $best_guess, \@all_matches)
1318                                     : $best_guess;
1319
1320                                 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1321                                     . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1322                                     . '#' x 78 . "\n\n";
1323
1324                             } else {
1325                                 # check both primary tag and and note against
1326                                 # SO synonyms for best matching description
1327
1328                                 SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);
1329                             }
1330
1331                         }
1332                     }
1333                 }
1334
1335                 #unless ($mtype) {
1336                 for my $note (@note) {
1337                     for my $name (values %$type_map) {
1338                     # check primary tag against SO names for best matching
1339                     # descriptions //NML also need to check against
1340                     # definition && camel case split terms
1341
1342                         SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);
1343                     }
1344                 }
1345                 #}
1346
1347                 if (scalar(@all_matches) && !$mtype) {
1348
1349                     my $top_matches = first { defined $_ } @{$all_matches[-1]};
1350
1351                     my $best_guess = $top_matches->[0];
1352
1353
1354
1355                     # if guess has quotes, it is a synonym term. we need to
1356                     # look up the corresponding name term
1357                     # otherwise, guess is a name, so we can use it directly
1358                     if ($best_guess =~ /"(.+)"/) {
1359
1360                         $best_guess = $syn_map->{$best_guess}->[0];
1361
1362                     }
1363
1364                     @RETURN = @all_matches;
1365                     $mtype = $MANUAL
1366                         ? manual_curation($feat, $best_guess, \@all_matches)
1367                         : $best_guess;
1368
1369                     print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1370                         . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1371                         . '#' x 78 . "\n\n";
1372
1373                 }
1374             }
1375             $mtype ||= $undefmap;
1376             $feat->primary_tag($mtype);
1377         }
1378     }
1379
1380
1381 }
1382
1383 sub SO_fuzzy_match {
1384
1385     my $candidate = shift;
1386     my $primary_tag = shift;
1387     my $note = shift;
1388     my $SO_terms = shift;
1389     my $best_matches_ref = shift;
1390     my $modifier = shift;
1391
1392     $modifier ||= '';
1393
1394         my @feat_terms;
1395
1396     for ( split(" |_", $primary_tag) ) {
1397         #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1398         my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1399         push @feat_terms, @camelCase;
1400     }
1401
1402     for ( split(" |_", $note) ) {
1403         #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1404         #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1405         (my $word = $_) =~ s/[;:.,]//g;
1406         push @feat_terms, $word;
1407     }
1408
1409
1410     my @SO_terms = split(" |_", $SO_terms);
1411
1412     # fuzzy match works on a simple point system. When 2 words match,
1413     # the $plus counter adds one. When they don't, the $minus counter adds
1414     # one. This is used to sort similar matches together. Better matches
1415     # are found at the end of the array, near the top.
1416
1417     # NML: can we improve best match by using synonym tags
1418     # EXACT,RELATED,NARROW,BROAD?
1419
1420     my ($plus, $minus) = (0, 0);
1421     my %feat_terms;
1422     my %SO_terms;
1423
1424     #unique terms
1425     map {$feat_terms{$_} = 1} @feat_terms;
1426     map {$SO_terms{$_} = 1} @SO_terms;
1427
1428     for my $st (keys %SO_terms) {
1429         for my $ft (keys %feat_terms) {
1430
1431             ($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++;
1432
1433         }
1434     }
1435
1436     push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;
1437
1438 }
1439
1440 sub manual_curation {
1441
1442     my ($feat, $default_opt,  $all_matches) = @_;
1443
1444     my @all_matches = @$all_matches;
1445
1446     # convert all SO synonyms into names and filter
1447     # all matches into unique term list because
1448     # synonyms can map to multiple duplicate names
1449
1450     my (@unique_SO_terms, %seen);
1451     for (reverse @all_matches) {
1452         for (@$_) {
1453             for (@$_) {
1454                 #my @names;
1455                 if ($_ =~ /"(.+)"/) {
1456                     for (@{$SYN_MAP->{$_}}) {
1457                         push @unique_SO_terms, $_ unless $seen{$_};
1458                         $seen{$_}++;
1459                     }
1460                 } else {
1461                     push @unique_SO_terms, $_ unless $seen{$_};
1462                     $seen{$_}++;
1463                 }
1464             }
1465         }
1466     }
1467
1468     my $s = scalar(@unique_SO_terms);
1469
1470     my $choice = 0;
1471
1472     my $more =
1473         "[a]uto   : automatic input (selects best guess for remaining entries)\r" .
1474         "[f]ind   : search for other SO terms matching your query (e.g. f gene)\r" .
1475         "[i]nput  : add a specific term\r" .
1476         "[r]eset  : reset to the beginning of matches\r" .
1477         "[s]kip   : skip this entry (selects best guess for this entry)\r"
1478     ;
1479
1480     $more .=
1481         "[n]ext   : view the next ".OPTION_CYCLE." terms\r" .
1482         "[p]rev   : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE);
1483
1484     my $msg = #"\n\n" . '-' x 156 . "\n"
1485          "The converter found $s possible matches for the following GenBank entry: ";
1486
1487     my $directions =
1488         "Type a number to select the SO term that best matches"
1489         . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";
1490
1491
1492     # lookup filtered list to pull out definitions
1493     my @options = map {
1494         my $term = $_;
1495         my %term;
1496         for (['name', 'name'], ['def', 'definition'], ['synonym',
1497                 'each_synonym']) {
1498             my ($label, $method) = @$_;
1499             $term{$label} = \@{[$term->$method]};
1500         }
1501         [++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ];
1502     }  map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms;
1503
1504
1505     my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
1506             $default_opt, @options);
1507
1508     if ($option eq 'skip') { return $default_opt
1509     } elsif ($option eq 'auto') {
1510         $MANUAL = 0;
1511         return $default_opt;
1512     } else { return $option }
1513
1514 }
1515
1516 sub options_cycle {
1517
1518     my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;
1519
1520     #NML: really should only call GenBank_entry once. Will need to change
1521     #method to return array & shift off header
1522     my $entry = GenBank_entry($feat, "\r");
1523
1524     my $total = scalar(@opt);
1525
1526     ($start,$stop) = (0, OPTION_CYCLE)
1527         if ( ($start < 0) && ($stop > 0) );
1528
1529     ($start,$stop) = (0, OPTION_CYCLE)
1530         if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);
1531
1532     ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
1533     ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;
1534
1535     $stop = $total if $stop > $total;
1536
1537     my $dir_copy = $directions;
1538     my $msg_copy = $msg;
1539     my $format = "format STDOUT = \n" .
1540         '-' x 156 . "\n" .
1541         '^' . '<' x 77 .  '| Available Commands:' . "\n" .
1542         '$msg_copy' . "\n" .
1543         '-' x 156 . "\n" .
1544         ' ' x 78 . "|\n" .
1545         '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
1546         '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
1547         (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
1548         ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000  . ".\n";
1549
1550     {
1551         # eval throws redefined warning that breaks formatting.
1552         # Turning off warnings just for the eval to fix this.
1553         no warnings 'redefine';
1554         eval $format;
1555     }
1556
1557     write;
1558
1559     print '-' x 156 . "\n" .
1560         'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
1561         " - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
1562         "\n" . '-' x 156 . "\n";
1563
1564     for  (my $i = $start; $i < $stop; $i+=2) {
1565
1566         my ($left, $right) = @opt[$i,$i+1];
1567
1568         my ($nL, $nmL, $descL, $termL, @synL) = @$left;
1569
1570         #odd numbered lists can cause fatal undefined errors, so check
1571         #to make sure we have data
1572
1573         my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);
1574
1575
1576         my $format = "format STDOUT = \n";
1577
1578         $format .=
1579             ' ' x 78 . "|\n" .
1580
1581             '@>>: name: ^' . '<' x 64 . '~' . ' |' .
1582                 ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) .  "\n" .
1583             '$nL,' . ' ' x 7 . '$nmL,' .
1584                 ( ref($right) ? (' ' x 63 . '$nR,' .  ' ' x 7 .  "\$nmR,") : '' ) . "\n" .
1585
1586             ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
1587                 (ref($right) ? ('           ^' . '<' x 61 .  '...~') : '') . "\n" .
1588             ' ' x 11 . '$nmL,' .
1589                 (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
1590             #' ' x 78 . '|' . "\n" .
1591
1592
1593             '     def:  ^' . '<' x 65 . ' |' .
1594                 (ref($right) ? ('     def:  ^' . '<' x 64 . '~') : '') . "\n" .
1595             ' ' x 11 . '$descL,' .
1596                 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1597
1598
1599             ('           ^' . '<' x 65 . ' |' .
1600                 (ref($right) ? ('           ^' . '<' x 64 . '~') : '') . "\n" .
1601             ' ' x 11 . '$descL,' .
1602                 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .
1603
1604
1605             '           ^' . '<' x 61 . '...~ |' .
1606                 (ref($right) ? ('           ^' . '<' x 61 . '...~') : '') . "\n" .
1607             ' ' x 11 . '$descL,' .
1608                 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1609
1610             ".\n";
1611
1612         {
1613             # eval throws redefined warning that breaks formatting.
1614             # Turning off warnings just for the eval to fix this.
1615             no warnings 'redefine';
1616             eval $format;
1617         }
1618         write;
1619
1620     }
1621     print '-' x 156 . "\nenter a command:";
1622
1623     while (<STDIN>) {
1624
1625         (my $input = $_) =~ s/\s+$//;
1626
1627         if ($input =~ /^\d+$/) {
1628             if ( $input && defined $opt[$input-1] ) {
1629                 return $opt[$input-1]->[1]
1630             } else {
1631                 print "\nThat number is not an option. Please enter a valid number.\n:";
1632             }
1633         } elsif ($input =~ /^n/i | $input =~ /next/i ) {
1634             return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
1635                     $feat, $directions, $best_guess, @opt)
1636         } elsif ($input =~ /^p/i | $input =~ /prev/i ) {
1637             return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
1638                     $feat, $directions, $best_guess, @opt)
1639         } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
1640         } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
1641         } elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
1642             return manual_curation($feat, $best_guess, \@RETURN );
1643         } elsif ( $input =~ /^f/i || $input =~ /find/i ) {
1644
1645             my ($query, @query_results);
1646
1647             if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
1648             } else {
1649
1650                 #do a SO search
1651                 print "Type your search query\n:";
1652                 while (<STDIN>) { chomp($query = $_); last }
1653             }
1654
1655             for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
1656                 SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
1657             }
1658
1659             return manual_curation($feat, $best_guess, \@query_results);
1660
1661         } elsif ( $input =~ /^i/i || $input =~ /input/i ) {
1662
1663             #NML fast input for later
1664             #my $query;
1665             #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };
1666
1667             #manual input
1668             print "Type the term you want to use\n:";
1669             while (<STDIN>) {
1670                 chomp(my $input = $_);
1671
1672                 if (! $TYPE_MAP->{$input}) {
1673
1674                     print "\"$input\" doesn't appear to be a valid SO term. Are ".
1675                         "you sure you want to use it? (y or n)\n:";
1676
1677                     while (<STDIN>) {
1678
1679                         chomp(my $choice = $_);
1680
1681                         if ($choice eq 'y') {
1682                             print
1683                                 "\nWould you like to save your preference for " .
1684                                 "future use (so you don't have to redo manual " .
1685                                 "curation for this feature everytime you run " .
1686                                 "the converter)? (y or n)\n";
1687
1688                             #NML: all these while loops are a mess. Really should condense it.
1689                             while (<STDIN>) {
1690
1691                                 chomp(my $choice = $_);
1692
1693                                 if ($choice eq 'y') {
1694                                     curation_save($feat, $input);
1695                                     return $input;
1696                                 } elsif ($choice eq 'n') {
1697                                     return $input
1698                                 } else {
1699                                     print "\nDidn't recognize that command. Please " .
1700                                         "type y or n.\n:"
1701                                 }
1702                             }
1703
1704
1705                         } elsif ($choice eq 'n') {
1706                             return options_cycle($start, $stop, $msg, $feat,
1707                                     $directions, $best_guess, @opt)
1708                         } else {
1709                             print "\nDidn't recognize that command. Please " .
1710                                 "type y or n.\n:"
1711                         }
1712                     }
1713
1714                 } else {
1715                     print
1716                         "\nWould you like to save your preference for " .
1717                         "future use (so you don't have to redo manual " .
1718                         "curation for this feature everytime you run  " .
1719                         "the converter)? (y or n)\n";
1720
1721                     #NML: all these while loops are a mess. Really should condense it.
1722                     while (<STDIN>) {
1723
1724                         chomp(my $choice = $_);
1725
1726                         if ($choice eq 'y') {
1727                             curation_save($feat, $input);
1728                             return $input;
1729                         } elsif ($choice eq 'n') {
1730                             return $input
1731                         } else {
1732                             print "\nDidn't recognize that command. Please " .
1733                                 "type y or n.\n:"
1734                         }
1735                     }
1736
1737                 }
1738
1739             }
1740         } else {
1741             print "\nDidn't recognize that command. Please re-enter your choice.\n:"
1742         }
1743     }
1744
1745 }
1746
1747 sub GenBank_entry {
1748     my ($f, $delimiter, $num) = @_;
1749
1750     $delimiter ||= "\n";
1751
1752
1753     my $entry  =
1754
1755         ($num ? ' [1] ' : ' ' x 5) . $f->primary_tag
1756         . ($num
1757             ? ' ' x (12 - length $f->primary_tag ) . ' [2] '
1758             : ' ' x (15 - length $f->primary_tag)
1759           )
1760         . $f->start.'..'.$f->end
1761
1762         . "$delimiter";
1763
1764     if ($num) {
1765         words_tag($f, \$entry);
1766     } else {
1767         for my $tag ($f->all_tags) {
1768             for my $val ( $f->each_tag_value($tag) ) {
1769                 $entry .= ' ' x 20;
1770                 #$entry .= "/$tag=\"$val\"$delimiter";
1771                 $entry .= $val eq '_no_value'
1772                     ? "/$tag$delimiter"
1773                     : "/$tag=\"$val\"$delimiter";
1774             }
1775         }
1776
1777     }
1778
1779     return $entry;
1780 }
1781
1782
1783 sub gff_validate {
1784     warn "Validating GFF file\n" if $DEBUG;
1785     my @feat = @_;
1786
1787     my (%parent2child, %all_ids, %descendants, %reserved);
1788
1789     for my $f (@feat) {
1790         for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) {
1791             map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0])
1792                 if $f->has_tag($$aTags[0]);
1793         }
1794     }
1795
1796     if ($SO_FILE) {
1797         while (my ($parentID, $aChildren) = each %parent2child) {
1798             parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved);
1799         }
1800     }
1801
1802     id_validate(\%all_ids, \%reserved);
1803 }
1804
1805 sub parent_validate {
1806     my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;
1807
1808     my $aParents = $hAll->{$parentID};
1809
1810     map {
1811         my $child = $_;
1812         $child->add_tag_value( validation_error =>
1813         "feature tried to add Parent tag, but no Parent found with ID $parentID"
1814         );
1815         my %parents;
1816         map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1817         delete $parents{$parentID};
1818         my @parents = keys %parents;
1819
1820         $child->remove_tag('Parent');
1821
1822         unless ($child->has_tag('ID')) {
1823             my $id = gene_name($child);
1824             $child->add_tag_value('ID', $id);
1825             push @{$hAll->{$id}}, $child
1826         }
1827
1828         $child->add_tag_value('Parent', @parents) if @parents;
1829
1830     } @$aChildren and return unless scalar(@$aParents);
1831
1832     my $par = join(',', map { $_->primary_tag } @$aParents);
1833     warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;
1834
1835     #NML manual curation needs to happen here
1836
1837
1838     my %parentsToRemove;
1839
1840     CHILD:
1841     for my $child (@$aChildren) {
1842         my $childType  = $child->primary_tag;
1843
1844         warn "WORKING ON $childType at ".$child->start.' to '.$child->end
1845             if $DEBUG;
1846
1847         for (my $i = 0; $i < scalar(@$aParents); $i++) {
1848             my $parent = $aParents->[$i];
1849             my $parentType = $parent->primary_tag;
1850
1851             warn "CHECKING $childType against $parentType" if $DEBUG;
1852
1853             #cache descendants so we don't have to do repeat searches
1854             unless ($hDescendants->{$parentType}) {
1855
1856                 for my $term ($ONTOLOGY->find_terms(
1857                         -name => $parentType
1858                     ) ) {
1859
1860                     map {
1861                         $hDescendants->{$parentType}{$_->name}++
1862                     } $ONTOLOGY->get_descendant_terms($term);
1863
1864                 }
1865
1866                 # NML: hopefully temporary fix.
1867                 # SO doesn't consider exon/CDS to be a child of mRNA
1868                 # even though common knowledge dictates that they are
1869                 # This cheat fixes the false positives for now
1870                 if ($parentType eq 'mRNA') {
1871                     $hDescendants->{$parentType}{'exon'} = 1;
1872                     $hDescendants->{$parentType}{'CDS'} = 1;
1873                 }
1874
1875             }
1876
1877             warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
1878                 " be a child of $parentType?" if $DEBUG;
1879             if ($hDescendants->{$parentType}{$childType}) {
1880                 warn "\tYES, $childType can be a child of $parentType" if $DEBUG;
1881
1882                 #NML need to deal with multiple children matched to multiple different
1883                 #parents. This model only assumes the first parent id that matches a child will
1884                 #be the reserved feature.
1885
1886                 $hReserved->{$parentID}{$parent}{'parent'} = $parent;
1887                 push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;
1888
1889                 #mark parent for later removal from all IDs
1890                 #so we don't accidentally change any parents
1891
1892                 $parentsToRemove{$i}++;
1893
1894                 next CHILD;
1895             }
1896         }
1897
1898
1899
1900         #NML shouldn't have to check this; somehow child can lose Parent
1901         #it's happening W3110
1902         #need to track this down
1903         if ( $child->has_tag('Parent') ) {
1904
1905             warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
1906                 if $DEBUG;
1907
1908             my %parents;
1909
1910             map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1911
1912             delete $parents{$parentID};
1913             my @parents = keys %parents;
1914
1915             warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
1916                 ' to ' . $child->end . " cannot be a child of ID $parentID"
1917                 if $DEBUG;
1918
1919             $child->add_tag_value( validation_error =>
1920                     "feature cannot be a child of $parentID");
1921
1922             $child->remove_tag('Parent');
1923
1924             unless ($child->has_tag('ID')) {
1925                 my $id = gene_name($child);
1926                 $child->add_tag_value('ID', $id);
1927                 push @{$hAll->{$id}}, $child
1928             }
1929
1930             $child->add_tag_value('Parent', @parents) if @parents;
1931         }
1932
1933     }
1934
1935     #delete $aParents->[$_] for keys %parentsToRemove;
1936     splice(@$aParents, $_, 1) for keys %parentsToRemove;
1937 }
1938
1939 sub id_validate {
1940     my ($hAll, $hReserved) = @_;
1941
1942
1943     for my $id (keys %$hAll) {
1944
1945         #since 1 feature can have this id,
1946         #let's just shift it off and uniquify
1947         #the rest (unless it's reserved)
1948
1949         shift @{$hAll->{$id}} unless $hReserved->{$id};
1950         for my $feat (@{$hAll->{$id}}) {
1951             id_uniquify(0, $id, $feat, $hAll);
1952         }
1953     }
1954
1955     for my $parentID (keys %$hReserved) {
1956
1957         my @keys = keys %{$hReserved->{$parentID}};
1958
1959         shift @keys;
1960
1961         for my $k (@keys) {
1962             my $parent = $hReserved->{$parentID}{$k}{'parent'};
1963             my $aChildren= $hReserved->{$parentID}{$k}{'children'};
1964
1965             my $value = id_uniquify(0, $parentID, $parent, $hAll);
1966             for my $child (@$aChildren) {
1967
1968                 my %parents;
1969                 map { $parents{$_}++ } $child->get_tag_values('Parent');
1970                 $child->remove_tag('Parent');
1971                 delete $parents{$parentID};
1972                 $parents{$value}++;
1973                 my @parents = keys %parents;
1974                 $child->add_tag_value('Parent', @parents);
1975             }
1976
1977         }
1978     }
1979 }
1980
1981 sub id_uniquify {
1982     my ($count, $value, $feat, $hAll) = @_;
1983
1984     warn "UNIQUIFYING $value" if $DEBUG;
1985
1986     if (! $count) {
1987         $feat->add_tag_value(Alias => $value);
1988         $value .= ('.' . $feat->primary_tag)
1989     } elsif ($count == 1) {
1990         $value .= ".$count"
1991     } else {
1992         chop $value;
1993         $value .= $count
1994     }
1995     $count++;
1996
1997     warn "ENDED UP WITH $value" if $DEBUG;
1998     if ( $hAll->{$value} ) {
1999         warn "$value IS ALREADY TAKEN" if $DEBUG;
2000         id_uniquify($count, $value, $feat, $hAll);
2001     } else {
2002         #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;
2003         $feat->remove_tag('ID');
2004         $feat->add_tag_value('ID', $value);
2005         push @{$hAll->{$value}}, $value;
2006     }
2007
2008     $value;
2009 }
2010
2011 sub conf_read {
2012
2013     print "\nCannot read $CONF. Change file permissions and retry, " .
2014         "or enter another file\n" and conf_locate() unless -r $CONF;
2015
2016     print "\nCannot write $CONF. Change file permissions and retry, " .
2017         "or enter another file\n" and conf_locate() unless -w $CONF;
2018
2019     $YAML = LoadFile($CONF);
2020
2021 }
2022
2023 sub conf_create {
2024
2025     my ($path, $input) = @_;
2026
2027     print "Cannot write to $path. Change directory permissions and retry " .
2028         "or enter another save path\n" and conf_locate() unless -w $path;
2029
2030     $CONF = $input;
2031
2032     open(FH, '>', $CONF);
2033     close(FH);
2034     conf_read();
2035
2036
2037 }
2038
2039 sub conf_write { DumpFile($CONF, $YAML) }
2040
2041 sub conf_locate {
2042
2043     print "\nEnter the location of a previously saved config, or a new " .
2044         "path and file name to create a new config (this step is " .
2045         "necessary to save any preferences)";
2046
2047     print "\n\nenter a command:";
2048
2049     while (<STDIN>) {
2050         chomp(my $input = $_);
2051         my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);
2052
2053         if (-e $input && (! -d $input)) {
2054
2055             print "\nReading $input...\n";
2056             $CONF = $input;
2057
2058             conf_read();
2059             last;
2060
2061         } elsif (! -d $input && $fn.$suffix) {
2062
2063             print "Creating $input...\n";
2064             conf_create($path, $input);
2065             last;
2066
2067         } elsif (-e $input && -d $input) {
2068             print "You only entered a directory. " .
2069                 "Please enter BOTH a directory and filename\n";
2070         } else {
2071             print "$input does not appear to be a valid path. Please enter a " .
2072                 "valid directory and filename\n";
2073         }
2074         print "\nenter a command:";
2075     }
2076 }
2077
2078 sub curation_save {
2079
2080     my ($feat, $input) = @_;
2081
2082     #my $error = "Enter the location of a previously saved config, or a new " .
2083     #    "path and file name to create a new config (this step is " .
2084     #    "necessary to save any preferences)\n";
2085
2086     if (!$CONF) {
2087         print "\n\n";
2088         conf_locate();
2089     } elsif (! -e $CONF) {
2090         print "\n\nThe config file you have chosen doesn't exist.\n";
2091         conf_locate();
2092     } else { conf_read() }
2093
2094     my $entry = GenBank_entry($feat, "\r", 1);
2095
2096     my $msg = "Term entered: $input";
2097     my $directions = "Please select any/all tags that provide evidence for the term you
2098 have entered. You may enter multiple tags by separating them by commas/dashes
2099 (e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have
2100 the option of either selecting the entire note as evidence, or specific
2101 keywords. If a tag has multiple keywords, they will be tagged alphabetically
2102 for selection. To select a specific keyword in a tag field, you must enter the
2103 tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
2104 selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have
2105 to be to match your curation. To match the GenBank entry exactly as it
2106 appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your
2107 preference, you will no longer be prompted to choose a feature type for any
2108 matching entries until you edit the curation.ini file.";
2109     my $msg_copy = $msg;
2110     my $dir_copy = $directions;
2111
2112     my $format = "format STDOUT = \n" .
2113         '-' x 156 . "\n" .
2114         '^' . '<' x 77 .  '| Directions:' . "\n" .
2115         '$msg_copy' . "\n" .
2116         '-' x 156 . "\n" .
2117         ' ' x 78 . "|\n" .
2118         '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
2119         '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
2120         (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
2121         ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20  . ".\n";
2122
2123     {
2124         # eval throws redefined warning that breaks formatting.
2125         # Turning off warnings just for the eval to fix this.
2126         no warnings 'redefine';
2127         eval $format;
2128     }
2129
2130     write;
2131     print '-' x 156 . "\nenter a command:";
2132
2133     my @tags = words_tag($feat);
2134
2135     my $final = {};
2136     my $choices;
2137     while (<STDIN>) {
2138
2139         chomp(my $choice = $_);
2140
2141         if (scalar(keys %$final) && $choice =~ /^y/i) { last
2142
2143         } elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input)
2144
2145         } elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n";
2146
2147         } elsif ($choice eq 'all') {
2148
2149             $choice = '';
2150             for (my $i=1; $i < scalar(@tags); $i++) {
2151                 $choice .= "$i,";
2152             }
2153             chop $choice;
2154         }
2155         #print "CHOICE [$choice]";
2156
2157         my @selections;
2158         for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
2159             if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) {
2160
2161                 for ($1..$2) { push @selections, $_ }
2162
2163                 my $dangling_alphas = $3;
2164                 alpha_expand($dangling_alphas, \@selections);
2165
2166             } else {
2167                 alpha_expand($_, \@selections);
2168             }
2169         }
2170
2171         foreach my $numbers (@selections) {
2172
2173             my @c = split(/(?=[\w])/, $numbers);
2174             s/\W+//g foreach @c;
2175             my $num;
2176
2177             {
2178                 $^W = 0;
2179                 $num = 0 + shift @c;
2180             }
2181
2182             my $tag = $tags[$num];
2183             if (ref $tag && scalar(@c)) {
2184                 my $no_value;
2185                 foreach (@c) {
2186                     if (defined $tag->{$_}) {
2187                         $choices .= "${num}[$_] ";
2188                         my ($t,$v) = @{$tag->{$_}};
2189                         push @{${$final->{$input}}[0]{$t}}, $v;
2190
2191                     } else { $no_value++ }
2192                     }
2193
2194                 if ($no_value) {
2195                     _selection_add($tag,$final,$input,\$choices,$num);
2196                     #my ($t,$v) = @{$tag->{'all'}};
2197                     #unless (defined ${$final->{$input}}[0]{$t}) {
2198                         #$choices .= "$num, ";
2199                         #push @{${$final->{$input}}[0]{$t}}, $v
2200                     #}
2201                 }
2202
2203                 $choices = substr($choices, 0, -2);
2204                 $choices .= ', ';
2205
2206             } elsif (ref $tag) {
2207                 _selection_add($tag,$final,$input,\$choices,$num);
2208                 #my ($t,$v) = @{$tag->{'all'}};
2209                 #unless (defined ${$final->{$input}}[0]{$t}) {
2210                     #$choices .= "$num, ";
2211                     #push @{${$final->{$input}}[0]{$t}}, $v
2212                 #}
2213             }
2214         }
2215         $choices = substr($choices, 0, -2) if $choices;
2216         if ($final) {
2217             print "\nYou have chosen the following tags:\n$choices\n";
2218             print "This will be written to the config file as:\n";
2219             print Dump $final;
2220             print "\nIs this correct? (y or n)\n";
2221         } else { print "\nInvalid selection. Please try again\n" }
2222     }
2223     push @{$YAML->{$input}}, $final->{$input};
2224     conf_write();
2225 }
2226
2227 #  words_tag() splits each tag value string into multiple words so that the
2228 #  user can select the parts he/she wants to use for curation
2229 #  it can tag 702 (a - zz) separate words; this should be enough
2230
2231 sub words_tag {
2232
2233     my ($feat, $entry) = @_;
2234
2235     my @tags;
2236
2237     @tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]});
2238     my $i = 3;
2239     foreach my $tag ($feat->all_tags) {
2240         foreach my $value ($feat->each_tag_value($tag)) {
2241
2242             my ($string, $tagged_string);
2243
2244             my @words = split(/(?=\w+?)/, $value);
2245
2246             my $pos = 0;
2247
2248
2249             foreach my $word (@words) {
2250
2251                 (my $sanitized_word = $word) =~ s/\W+?//g;
2252                 $string .= $word;
2253
2254                 my $lead = int($pos/ALPHABET_DIVISOR);
2255                 my $lag = $pos % ALPHABET_DIVISOR;
2256
2257                 my $a =  $lead ? ${(ALPHABET)}[$lead-1] : '';
2258                 $a .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2259
2260                 $tagged_string .= " ($a) $word";
2261
2262                 $tags[$i]{$a} = [$tag, $sanitized_word];
2263                 $pos++;
2264             }
2265
2266             $value = $tagged_string if scalar(@words) > 1;
2267
2268             $$entry .= "[$i] /$tag=\"$value\"\r";
2269
2270             $tags[$i]{'all'} = [$tag, $string];
2271         }
2272         $i++;
2273     }
2274
2275     return @tags;
2276
2277 }
2278
2279 sub alpha_expand {
2280
2281     my ($dangling_alphas, $selections) = @_;
2282
2283     if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {
2284
2285         my $digit = $1;
2286         push @$selections, $digit if $digit;
2287
2288         my $start = $2;
2289         my $stop = $3;
2290
2291         my @starts = split('', $start);
2292         my @stops = split('', $stop);
2293
2294         my ($final_start, $final_stop);
2295
2296         for ([\$final_start, \@starts], [\$final_stop, \@stops]) {
2297
2298             my ($final, $splits) = @$_;
2299
2300             my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]};
2301             my $rem;
2302
2303
2304             if ($$splits[1]) {
2305                 $rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]};
2306                 $int++
2307             } else {
2308                 $rem = $int;
2309                 $int = 0;
2310             }
2311
2312
2313             $$final = $int * ALPHABET_DIVISOR;
2314             $$final += $rem;
2315
2316         }
2317
2318         my $last_number = pop @$selections;
2319         for my $pos ($final_start..$final_stop) {
2320             my $lead = int($pos/ALPHABET_DIVISOR);
2321             my $lag = $pos % ALPHABET_DIVISOR;
2322             my $alpha =  $lead ? ${(ALPHABET)}[$lead-1] : '';
2323             $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2324             push @$selections, $last_number.$alpha;
2325         }
2326
2327     } elsif (defined($dangling_alphas)) {
2328         if ($dangling_alphas =~ /^\d/) {
2329             push @$selections, $dangling_alphas;
2330         } elsif ($dangling_alphas =~ /^\D/) {
2331             #print "$dangling_alphas ".Dumper @$selections;
2332             my $last_number = pop @$selections;
2333             $last_number ||= '';
2334             push @$selections, $last_number.$dangling_alphas;
2335             #$$selections[-1] .= $dangling_alphas;
2336         }
2337     }
2338
2339 }
2340
2341 sub _selection_add {
2342
2343     my ($tag, $final, $input, $choices, $num) = @_;
2344     my ($t,$v) = @{$tag->{'all'}};
2345     unless (defined ${$final->{$input}}[0]{$t}) {
2346         $$choices .= "$num, ";
2347         push @{${$final->{$input}}[0]{$t}}, $v
2348     }
2349
2350 }