Apply carsonhh's patch
[bioperl-live.git] / scripts / Bio-DB-GFF / bp_genbank2gff3.pl
blob4acc7a8a2e5483e3a6f7a6ed917959a2609528fb
1 #!/usr/bin/perl
5 =pod
7 =head1 NAME
9 bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3
11 =head1 SYNOPSIS
13 bp_genbank2gff3.pl [options] filename(s)
15 # process a directory containing GenBank flatfiles
16 perl bp_genbank2gff3.pl --dir path_to_files --zip
18 # process a single file, ignore explicit exons and introns
19 perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz
21 # process a list of files
22 perl bp_genbank2gff3.pl *gbk.gz
24 # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader
25 curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \
26 | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \
27 | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata
29 Options:
30 --noinfer -r don't infer exon/mRNA subfeatures
31 --conf -i path to the curation configuration file that contains user preferences
32 for Genbank entries (must be YAML format)
33 (if --manual is passed without --ini, user will be prompted to
34 create the file if any manual input is saved)
35 --sofile -l path to to the so.obo file to use for feature type mapping
36 (--sofile live will download the latest online revision)
37 --manual -m when trying to guess the proper SO term, if more than
38 one option matches the primary tag, the converter will
39 wait for user input to choose the correct one
40 (only works with --sofile)
41 --dir -d path to a list of genbank flatfiles
42 --outdir -o location to write GFF files (can be 'stdout' or '-' for pipe)
43 --zip -z compress GFF3 output files with gzip
44 --summary -s print a summary of the features in each contig
45 --filter -x genbank feature type(s) to ignore
46 --split -y split output to separate GFF and fasta files for
47 each genbank record
48 --nolump -n separate file for each reference sequence
49 (default is to lump all records together into one
50 output file for each input file)
51 --ethresh -e error threshold for unflattener
52 set this high (>2) to ignore all unflattener errors
53 --[no]CDS -c Keep CDS-exons, or convert to alternate gene-RNA-protein-exon
54 model. --CDS is default. Use --CDS to keep default GFF gene model,
55 use --noCDS to convert to g-r-p-e.
56 --format -f Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work
57 (GenBank is default)
58 --GFF_VERSION 3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available
59 --quiet don't talk about what is being processed
60 --typesource SO sequence type for source (e.g. chromosome; region; contig)
61 --help -h display this message
64 =head1 DESCRIPTION
66 This script uses Bio::SeqFeature::Tools::Unflattener and
67 Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene
68 containment hierarchies mapped for optimal display in gbrowse.
70 The input files are assumed to be gzipped GenBank flatfiles for refseq
71 contigs. The files may contain multiple GenBank records. Either a
72 single file or an entire directory can be processed. By default, the
73 DNA sequence is embedded in the GFF but it can be saved into separate
74 fasta file with the --split(-y) option.
76 If an input file contains multiple records, the default behaviour is
77 to dump all GFF and sequence to a file of the same name (with .gff
78 appended). Using the 'nolump' option will create a separate file for
79 each genbank record. Using the 'split' option will create separate
80 GFF and Fasta files for each genbank record.
83 =head2 Notes
85 =head3 'split' and 'nolump' produce many files
87 In cases where the input files contain many GenBank records (for
88 example, the chromosome files for the mouse genome build), a very
89 large number of output files will be produced if the 'split' or
90 'nolump' options are selected. If you do have lists of files E<gt> 6000,
91 use the --long_list option in bp_bulk_load_gff.pl or
92 bp_fast_load_gff.pl to load the gff and/ or fasta files.
94 =head3 Designed for RefSeq
96 This script is designed for RefSeq genomic sequence entries. It may
97 work for third party annotations but this has not been tested.
98 But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl
99 if you don't mind some gene model unflattener errors (dgg).
101 =head3 G-R-P-E Gene Model
103 Don Gilbert worked this over with needs to produce GFF3 suited to
104 loading to GMOD Chado databases. Most of the changes I believe are
105 suited for general use. One main chado-specific addition is the
106 --[no]cds2protein flag
108 My favorite GFF is to set the above as ON by default (disable with --nocds2prot)
109 For general use it probably should be OFF, enabled with --cds2prot.
111 This writes GFF with an alternate, but useful Gene model,
112 instead of the consensus model for GFF3
114 [ gene > mRNA> (exon,CDS,UTR) ]
116 This alternate is
118 gene > mRNA > polypeptide > exon
120 means the only feature with dna bases is the exon. The others
121 specify only location ranges on a genome. Exon of course is a child
122 of mRNA and protein/peptide.
124 The protein/polypeptide feature is an important one, having all the
125 annotations of the GenBank CDS feature, protein ID, translation, GO
126 terms, Dbxrefs to other proteins.
128 UTRs, introns, CDS-exons are all inferred from the primary exon bases
129 inside/outside appropriate higher feature ranges. Other special gene
130 model features remain the same.
132 Several other improvements and bugfixes, minor but useful are included
134 * IO pipes now work:
135 curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ...
137 * GenBank main record fields are added to source feature, e.g. organism, date,
138 and the sourcetype, commonly chromosome for genomes, is used.
140 * Gene Model handling for ncRNA, pseudogenes are added.
142 * GFF header is cleaner, more informative.
143 --GFF_VERSION flag allows choice of v2 as well as default v3
145 * GFF ##FASTA inclusion is improved, and
146 CDS translation sequence is moved to FASTA records.
148 * FT -> GFF attribute mapping is improved.
150 * --format choice of SeqIO input formats (GenBank default).
151 Uniprot/Swissprot and EMBL work and produce useful GFF.
153 * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions
154 and more flexible usage.
156 =head1 TODO
158 =head2 Are these additions desired?
160 * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY
161 * handle Entrezgene, other non-sequence SeqIO structures (really should change
162 those parsers to produce consistent annotation tags).
164 =head2 Related bugfixes/tests
166 These items from Bioperl mail were tested (sample data generating
167 errors), and found corrected:
169 From: Ed Green <green <at> eva.mpg.de>
170 Subject: genbank2gff3.pl on new human RefSeq
171 Date: 2006-03-13 21:22:26 GMT
172 -- unspecified errors (sample data works now).
174 From: Eric Just <e-just <at> northwestern.edu>
175 Subject: genbank2gff3.pl
176 Date: 2007-01-26 17:08:49 GMT
177 -- bug fixed in genbank2gff3 for multi-record handling
179 This error is for a /trans_splice gene that is hard to handle, and
180 unflattner/genbank2 doesn't
182 From: Chad Matsalla <chad <at> dieselwurks.com>
183 Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?
184 Date: 2005-07-15 19:51:48 GMT
186 =head1 AUTHOR
188 Sheldon McKay (mckays@cshl.edu)
190 Copyright (c) 2004 Cold Spring Harbor Laboratory.
192 =head2 AUTHOR of hacks for GFF2Chado loading
194 Don Gilbert (gilbertd@indiana.edu)
197 =cut
199 use strict;
200 use warnings;
202 use lib "$ENV{HOME}/bioperl-live";
203 # chad put this here to enable situations when this script is tested
204 # against bioperl compiled into blib along with other programs using blib
205 BEGIN {
206 unshift(@INC,'blib/lib');
208 use Pod::Usage;
209 use Bio::Root::RootI;
210 use Bio::SeqIO;
211 use File::Spec;
212 use Bio::SeqFeature::Tools::Unflattener;
213 use Bio::SeqFeature::Tools::TypeMapper;
214 use Bio::SeqFeature::Tools::IDHandler;
215 use Bio::Location::SplitLocationI;
216 use Bio::Location::Simple;
217 use Bio::Tools::GFF;
218 use Getopt::Long;
219 use List::Util qw(first);
220 use Bio::OntologyIO;
221 use YAML qw(Dump LoadFile DumpFile);
222 use File::Basename;
224 use vars qw/$split @filter $zip $outdir $help $ethresh
225 $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT
226 $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE
227 $file @files $dir $summary $nolump
228 $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION
229 $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;
231 use constant SO_URL =>
232 'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';
233 use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)];
234 use constant ALPHABET_TO_NUMBER => {
235 a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8,
236 j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16,
237 r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24,
238 z => 25,
240 use constant ALPHABET_DIVISOR => 26;
241 use constant GM_NEW_TOPLEVEL => 2;
242 use constant GM_NEW_PART => 1;
243 use constant GM_DUP_PART => 0;
244 use constant GM_NOT_PART => -1;
246 # Options cycle in multiples of 2 because of left side/right side pairing.
247 # You can make this number odd, but displayed matches will still round up
248 use constant OPTION_CYCLE => 6;
250 $GFF_VERSION = 3; # allow v2 ...
251 $verbose = 1; # right default? -nov to turn off
253 # dgg: change the gene model to Gene/mRNA/Polypeptide/exons...
254 my $CDSkeep= 1; # default should be ON (prior behavior), see gene_features()
255 my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep;
256 # protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support
258 my $FORMAT="GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID **
259 my $SOURCEID= $FORMAT; # "UniProt" "GenBank" "EMBL" should work
260 # other Bio::SeqIO formats may work. TEST: EntrezGene < problematic tags; InterPro KEGG
263 my %TAG_MAP = (
264 db_xref => 'Dbxref',
265 name => 'Name',
266 note => 'Note', # also pull GO: ids into Ontology_term
267 synonym => 'Alias',
268 symbol => 'Alias', # is symbol still used?
269 # protein_id => 'Dbxref', also seen Dbxref tags: EC_number
270 # translation: handled in gene_features
274 $| = 1;
275 my $quiet= !$verbose;
276 my $ok= GetOptions( 'd|dir|input:s' => \$dir,
277 'z|zip' => \$zip,
278 'h|help' => \$help,
279 's|summary' => \$summary,
280 'r|noinfer' => \$noinfer,
281 'i|conf=s' => \$CONF,
282 'sofile=s' => \$SO_FILE,
283 'm|manual' => \$MANUAL,
284 'o|outdir|output:s'=> \$outdir,
285 'x|filter:s'=> \@filter,
286 'y|split' => \$split,
287 "ethresh|e=s"=>\$ethresh,
288 'c|CDS!' => \$CDSkeep,
289 'f|format=s' => \$FORMAT,
290 'typesource=s' => \$source_type,
291 'GFF_VERSION=s' => \$GFF_VERSION,
292 'quiet!' => \$quiet, # swap quiet to verbose
293 'DEBUG!' => \$DEBUG,
294 'n|nolump' => \$nolump);
296 my $lump = 1 unless $nolump || $split;
297 $verbose= !$quiet;
299 # look for help request
300 pod2usage(2) if $help || !$ok;
302 # keep SOURCEID as-is and change FORMAT for SeqIO types;
303 # note SeqIO uses file.suffix to guess type; not useful here
304 $SOURCEID= $FORMAT;
305 $FORMAT = "swiss" if $FORMAT =~/UniProt|trembl/;
306 $verbose =1 if($DEBUG);
308 # initialize handlers
309 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1);
310 $unflattener->error_threshold($ethresh) if $ethresh;
311 $unflattener->verbose(1) if($DEBUG);
312 # $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?
313 # ensembl parsing is still problematic, forget this
315 my $tm = Bio::SeqFeature::Tools::TypeMapper->new;
316 my $idh = Bio::SeqFeature::Tools::IDHandler->new;
318 # dgg
319 $source_type ||= "region"; # should really parse from FT.source contents below
321 #my $FTSOmap = $tm->FT_SO_map();
322 my $FTSOmap;
323 my $FTSOsynonyms;
325 if (defined($SO_FILE) && $SO_FILE eq 'live') {
326 print "\nDownloading the latest SO file from ".SO_URL."\n\n";
327 use LWP::UserAgent;
328 my $ua = LWP::UserAgent->new(timeout => 30);
329 my $request = HTTP::Request->new(GET => SO_URL);
330 my $response = $ua->request($request);
333 if ($response->status_line =~ /200/) {
334 use File::Temp qw/ tempfile /;
335 my ($fh, $fn) = tempfile();
336 print $fh $response->content;
337 $SO_FILE = $fn;
338 } else {
339 print "Couldn't download SO file online...skipping validation.\n"
340 . "HTTP Status was " . $response->status_line . "\n"
341 and undef $SO_FILE
345 if ($SO_FILE) {
348 my (%terms, %syn);
350 my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );
351 $ONTOLOGY = $parser->next_ontology();
353 for ($ONTOLOGY->get_all_terms) {
354 my $feat = $_;
356 $terms{$feat->name} = $feat->name;
357 #$terms{$feat->name} = $feat;
359 my @syn = $_->each_synonym;
361 push @{$syn{$_}}, $feat->name for @syn;
362 #push @{$syn{$_}}, $feat for @syn;
365 $FTSOmap = \%terms;
366 $FTSOsynonyms = \%syn;
368 my %hardTerms = %{ $tm->FT_SO_map() };
369 map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms;
371 } else {
372 my %terms = %{ $tm->FT_SO_map() };
373 while (my ($k,$v) = each %terms) {
374 $FTSOmap->{$k} = ref($v) ? shift @$v : $v;
378 $TYPE_MAP = $FTSOmap;
379 $SYN_MAP = $FTSOsynonyms;
382 # #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")
384 # stringify filter list if applicable
385 my $filter = join ' ', @filter if @filter;
387 # determine input files
388 my $stdin=0; # dgg: let dir == stdin == '-' for pipe use
389 if ($dir && ($dir eq '-' || $dir eq 'stdin')) {
390 $stdin=1; $dir=''; @files=('stdin');
392 } elsif ( $dir ) {
393 if ( -d $dir ) {
394 opendir DIR, $dir or die "could not open $dir for reading: $!";
395 @files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR;
396 closedir DIR;
398 else {
399 die "$dir is not a directory\n";
402 else {
403 @files = @ARGV;
404 $dir = '';
407 # we should have some files by now
408 pod2usage(2) unless @files;
411 my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use
412 if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) {
413 warn("std. output chosen: cannot split\n") if($split);
414 warn("std. output chosen: cannot zip\n") if($zip);
415 warn("std. output chosen: cannot nolump\n") if($nolump);
416 $stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip
418 } elsif ( $outdir && !-e $outdir ) {
419 mkdir($outdir) or die "could not create directory $outdir: $!\n";
421 elsif ( !$outdir ) {
422 $outdir = $dir || '.';
425 for my $file ( @files ) {
426 # dgg ; allow 'stdin' / '-' input ?
427 chomp $file;
428 die "$! $file" unless($stdin || -e $file);
429 print "# Input: $file\n" if($verbose);
431 my ($lump_fh, $lumpfa_fh, $outfile, $outfa);
432 if ($stdout) {
433 $lump_fh= *STDOUT; $lump="stdout$$";
434 $outfa= "stdout$$.fa"; # this is a temp file ... see below
435 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
437 } elsif ( $lump ) {
438 my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);
439 $lump = File::Spec->catfile($outdir, $fileonly.'.gff');
440 ($outfa = $lump) =~ s/\.gff/\.fa/;
441 open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n";
442 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
446 # open input file, unzip if req'd
447 if ($stdin) {
448 *FH = *STDIN;
449 } elsif ( $file =~ /\.gz/ ) {
450 open FH, "gunzip -c $file |";
452 else {
453 open FH, '<', $file;
456 my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);
457 my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );
459 while ( my $seq = $in->next_seq() ) {
460 my $seq_name = $seq->accession_number;
461 my $end = $seq->length;
462 my @to_print;
464 # arrange disposition of GFF output
465 $outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff');
466 my $out;
468 if ( $lump ) {
469 $outfile = $lump;
470 $out = $lump_fh;
472 else {
473 $outfile = File::Spec->catfile($outdir, $seq_name.'.gff');
474 open $out, ">$outfile";
477 # filter out unwanted features
478 my $source_feat= undef;
479 my @source= filter($seq); $source_feat= $source[0];
481 ($source_type,$source_feat)=
482 getSourceInfo( $seq, $source_type, $source_feat ) ;
483 # always; here we build main prot $source_feat; # if @source;
485 # abort if there are no features
486 warn "$seq_name has no features, skipping\n" and next
487 if !$seq->all_SeqFeatures;
490 $FTSOmap->{'source'} = $source_type;
491 ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features
493 # construct a GFF header
494 # add: get source_type from attributes of source feature? chromosome=X tag
495 # also combine 1st ft line here with source ft from $seq ..
496 my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat);
497 print $out $header;
498 print "# working on $info\n" if($verbose);
500 # unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..
501 unflatten_seq($seq);
503 # Note that we use our own get_all_SeqFeatures function
504 # to rescue cloned exons
506 @GFF_LINE_FEAT = ();
507 for my $feature ( get_all_SeqFeatures($seq) ) {
509 my $method = $feature->primary_tag;
510 next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type);
512 $feature->seq_id($seq->id) unless($feature->seq_id);
513 $feature->source_tag($SOURCEID);
515 # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;
516 ## also, pull any GO:000 ids from /note tag and put into Ontology_term
517 maptags2gff($feature);
519 # current gene name. The unflattened gene features should be in order so any
520 # exons, CDSs, etc that follow will belong to this gene
521 my $gene_name;
522 if ( $method eq 'gene' || $method eq 'pseudogene' ) {
523 @to_print= print_held($out, $gffio, \@to_print);
524 $gene_id = $gene_name= gene_name($feature);
525 } else {
526 $gene_name= gene_name($feature);
529 #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx
530 # be converted to or added as Name=xxx (if not ID= or as well)
531 ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags
532 convert_to_name($feature);
534 ## dgg: extended to protein|polypeptide
535 ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes
536 ## in yeast have that genbank tag; why?
537 ## these include pseudogene ...
539 ## Note we also have mapped types to SO, so these RNA's are now transcripts:
540 # pseudomRNA => "pseudogenic_transcript",
541 # pseudotranscript" => "pseudogenic_transcript",
542 # misc_RNA=>'processed_transcript',
544 warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;
546 if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/
547 || ( $gene_id && $gene_name eq $gene_id ) ) {
549 my $action = gene_features($feature, $gene_id, $gene_name); # -1, 0, 1, 2 result
550 if ($action == GM_DUP_PART) {
551 # ignore, this is dupl. exon with new parent ...
553 } elsif ($action == GM_NOT_PART) {
554 add_generic_id( $feature, $gene_name, "nocount");
555 my $gff = $gffio->gff_string($feature);
556 push @GFF_LINE_FEAT, $feature;
557 #print $out "$gff\n" if $gff;
559 } elsif ($action > 0) {
560 # hold off print because exon etc. may get 2nd, 3rd parents
561 @to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);
562 push(@to_print, $feature);
567 # otherwise handle as generic feats with IDHandler labels
568 else {
569 add_generic_id( $feature, $gene_name, "");
570 my $gff= $gffio->gff_string($feature);
571 push @GFF_LINE_FEAT, $feature;
572 #print $out "$gff\n" if $gff;
576 # don't like doing this after others; do after each new gene id?
577 @to_print= print_held($out, $gffio, \@to_print);
579 gff_validate(@GFF_LINE_FEAT);
581 for my $feature (@GFF_LINE_FEAT) {
582 my $gff= $gffio->gff_string($feature);
583 print $out "$gff\n" if $gff;
586 # deal with the corresponding DNA
587 my ($fa_out,$fa_outfile);
588 my $dna = $seq->seq;
589 if($dna || %proteinfa) {
590 $method{'RESIDUES'} += length($dna);
591 $dna =~ s/(\S{60})/$1\n/g;
592 $dna .= "\n";
594 if ($split) {
595 $fa_outfile = $outfile;
596 $fa_outfile =~ s/gff$/fa/;
597 open $fa_out, ">$fa_outfile" or die $!;
598 print $fa_out ">$seq_name\n$dna" if $dna;
599 foreach my $aid (sort keys %proteinfa) {
600 my $aa= delete $proteinfa{$aid};
601 $method{'RESIDUES(tr)'} += length($aa);
602 $aa =~ s/(\S{60})/$1\n/g;
603 print $fa_out ">$aid\n$aa\n";
607 else {
608 ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out
609 ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk
610 ## maybe write this to temp .fa then cat to end of lumped gff $out
611 print $lumpfa_fh ">$seq_name\n$dna" if $dna;
612 foreach my $aid (sort keys %proteinfa) {
613 my $aa= delete $proteinfa{$aid};
614 $method{'RESIDUES(tr)'} += length($aa);
615 $aa =~ s/(\S{60})/$1\n/g;
616 print $lumpfa_fh ">$aid\n$aa\n";
620 %proteinfa=();
623 if ( $zip && !$lump ) {
624 system "gzip -f $outfile";
625 system "gzip -f $fa_outfile" if($fa_outfile);
626 $outfile .= '.gz';
627 $fa_outfile .= '.gz' if $split;
630 # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA
631 print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);
632 print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump);
634 # dgg: moved to after all inputs; here it prints cumulative sum for each record
635 #if ( $summary ) {
636 # print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
638 # for ( keys %method ) {
639 # print "# $_ $method{$_}\n";
641 # print "# \n";
642 # }
646 print "# GFF3 saved to $outfile\n" if( $verbose && $lump);
647 if ( $summary ) {
648 print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
649 for ( keys %method ) {
650 print "# $_ $method{$_}\n";
652 print "# \n";
655 ## FIXME for piped output w/ split FA files ...
656 close($lumpfa_fh) if $lumpfa_fh;
657 if (!$split && $outfa && $lump_fh) {
658 print $lump_fh "##FASTA\n"; # GFF3 spec
659 open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!";
660 while( <$lumpfa_fh>) {
661 print $lump_fh $_;
662 } # is $lump_fh still open?
663 close($lumpfa_fh);
664 unlink($outfa);
668 if ( $zip && $lump ) {
669 system "gzip -f $lump";
672 close FH;
679 sub typeorder {
680 return 1 if ($_[0] =~ /gene/);
681 return 2 if ($_[0] =~ /RNA|transcript/);
682 return 3 if ($_[0] =~ /protein|peptide/);
683 return 4 if ($_[0] =~ /exon|CDS/);
684 return 3; # default before exon (smallest part)
687 sub sort_by_feattype {
688 my($at,$bt)= ($a->primary_tag, $b->primary_tag);
689 return (typeorder($at) <=> typeorder($bt))
690 or ($at cmp $bt);
691 ## or ($a->name() cmp $b->name());
694 sub print_held {
695 my($out,$gffio,$to_print)= @_;
696 return unless(@$to_print);
697 @$to_print = sort sort_by_feattype @$to_print; # put exons after mRNA, otherwise chado loader chokes
698 while ( my $feature = shift @$to_print) {
699 my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode
700 push @GFF_LINE_FEAT, $feature;
701 #print $out "$gff\n";
703 return (); # @to_print
706 sub maptags2gff {
707 my $f = shift;
708 ## should copy/move locus_tag to Alias, if not ID/Name/Alias already
709 # but see below /gene /locus_tag usage
710 foreach my $tag (keys %TAG_MAP) {
711 if ($f->has_tag($tag)) {
712 my $newtag= $TAG_MAP{$tag};
713 my @v= $f->get_tag_values($tag);
714 $f->remove_tag($tag);
715 $f->add_tag_value($newtag,@v);
717 ## also, pull any GO:000 ids from /note tag and put into Ontology_term
718 ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]'
719 if ($tag eq 'note') {
720 map { s/\[goid (\d+)/\[goid GO:$1/g; } @v;
721 my @go= map { m/(GO:\d+)/g } @v;
722 $f->add_tag_value('Ontology_term',@go) if(@go);
730 sub getSourceInfo {
731 my ($seq, $source_type, $sf) = @_;
733 my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i);
734 my $is_gene = ($SOURCEID =~/entrezgene/i);
735 my $is_rich = (ref($seq) =~ /RichSeq/);
736 my $seq_name= $seq->accession_number();
738 unless($sf) { # make one
739 $source_type= $is_swiss ? $PROTEIN_TYPE
740 : $is_gene ? "eneg" # "gene" # "region" #
741 : $is_rich ? $seq->molecule : $source_type;
742 $sf = Bio::SeqFeature::Generic->direct_new();
744 my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1);
745 my $loc= $seq->can('location') ? $seq->location()
746 : new Bio::Location::Simple( -start => $start, -end => $len);
747 $sf->location( $loc );
748 $sf->primary_tag($source_type);
749 $sf->source_tag($SOURCEID);
750 $sf->seq_id( $seq_name);
751 #? $sf->display_name($seq->id()); ## Name or Alias ?
752 $sf->add_tag_value( Alias => $seq->id()); # unless id == accession
753 $seq->add_SeqFeature($sf);
754 ## $source_feat= $sf;
757 if ($sf->has_tag("chromosome")) {
758 $source_type= "chromosome";
759 my ($chrname) = $sf->get_tag_values("chromosome");
760 ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead
761 ## e.g. Mouse chr 19 has two IDs in NCBI genbank now
762 $sf->add_tag_value( Alias => $chrname );
765 # pull GB Comment, Description for source ft ...
766 # add reference - can be long, not plain string...
767 warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG;
768 # GenBank fields: keyword,comment,reference,date_changed
769 # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM
771 # is this just for main $seq object or for all seqfeatures ?
772 my %AnnotTagMap= (
773 'gene_name' => 'Alias',
774 'ALIAS_SYMBOL' => 'Alias', # Entrezgene
775 'LOCUS_SYNONYM' => 'Alias', #?
776 'symbol' => 'Alias',
777 'synonym' => 'Alias',
778 'dblink' => 'Dbxref',
779 'product' => 'product',
780 'Reference' => 'reference',
781 'OntologyTerm' => 'Ontology_term',
782 'comment' => 'Note',
783 'comment1' => 'Note',
784 # various map-type locations
785 # gene accession tag is named per source db !??
786 # 'Index terms' => keywords ??
790 my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() );
791 my ($date)= $seq->annotation->get_Annotations("dates")
792 || $seq->annotation->get_Annotations("update-date")
793 || $is_rich ? $seq->get_dates() : ();
794 my ($comment)= $seq->annotation->get_Annotations("comment");
795 my ($species)= $seq->annotation->get_Annotations("species");
796 if (!$species
797 && $seq->can('species')
798 && defined $seq->species()
799 && $seq->species()->can('binomial') ) {
800 $species= $seq->species()->binomial();
803 # update source feature with main GB fields
804 $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID');
805 $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note'));
806 $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism'));
807 $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1'));
808 $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date'));
810 $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene;
812 foreach my $atag (sort keys %AnnotTagMap) {
813 my $gtag= $AnnotTagMap{$atag}; next unless($gtag);
814 my @anno = map{
815 if (ref $_ && $_->can('get_all_values')) {
816 split( /[,;] */, join ";", $_->get_all_values)
818 elsif (ref $_ && $_->can('display_text')) {
819 split( /[,;] */, $_->display_text)
821 elsif (ref $_ && $_->can('value')) {
822 split( /[,;] */, $_->value)
824 else {
827 } $seq->annotation->get_Annotations($atag);
828 foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); }
831 #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name');
832 #$sf->add_tag_value( Alias => $_ ) foreach(@genes);
834 #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all
835 #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink);
837 return (wantarray)? ($source_type,$sf) : $source_type; #?
841 sub gene_features {
842 my ($f, $gene_id, $genelinkID) = @_;
843 local $_ = $f->primary_tag;
844 $method{$_}++;
846 if ( /gene/ ) {
847 $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!?
848 $tnum = $rnum= 0; $ncrna_id= $rna_id = '';
849 return GM_NEW_TOPLEVEL;
851 } elsif ( /mRNA/ ) {
852 return GM_NOT_PART unless $gene_id;
853 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
854 ($rna_id = $gene_id ) =~ s/gene/mRNA/;
855 $rna_id .= '.t0' . ++$tnum;
856 $f->add_tag_value( ID => $rna_id );
857 $f->add_tag_value( Parent => $gene_id );
859 } elsif ( /RNA|transcript/) {
860 ## misc_RNA here; missing exons ... flattener problem?
861 # all of {t,nc,sn}RNA can have gene models now
862 ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag
863 ## CDS needs to use mRNA, not misc_RNA, rna_id ...
864 ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level
866 if($gene_id) {
867 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
868 ($ncrna_id = $gene_id) =~ s/gene/ncRNA/;
869 $ncrna_id .= '.r0' . ++$rnum;
870 $f->add_tag_value( Parent => $gene_id );
871 $f->add_tag_value( ID => $ncrna_id );
872 } else {
873 unless ($f->has_tag('ID')) {
874 if($genelinkID) {
875 $f->add_tag_value( ID => $genelinkID ) ;
876 } else {
877 $idh->generate_unique_persistent_id($f);
880 ($ncrna_id)= $f->get_tag_values('ID');
881 return GM_NEW_TOPLEVEL;
882 # this feat now acts as gene-top-level; need to print @to_print to flush prior exons?
885 } elsif ( /exon/ ) { # can belong to any kind of RNA
886 return GM_NOT_PART unless ($rna_id||$ncrna_id);
887 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
888 ## we are getting duplicate Parents here, which chokes chado loader, with reason...
889 ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice
890 ## check all Parents
891 for my $expar ($rna_id, $ncrna_id) {
892 next unless($expar);
893 if ( $exonpar{$expar} and $f->has_tag('Parent') ) {
894 my @vals = $f->get_tag_values('Parent');
895 next if (grep {$expar eq $_} @vals);
897 $exonpar{$expar}++;
898 $f->add_tag_value( Parent => $expar);
899 # last; #? could be both
901 # now we can skip cloned exons
902 # dgg note: multiple parents get added and printed for each unique exon
903 return GM_DUP_PART if ++$seen{$f} > 1;
905 } elsif ( /CDS|protein|polypeptide/ ) {
906 return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ??
907 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #??
908 (my $pro_id = $rna_id) =~ s/\.t/\.p/;
910 if( ! $CDSkeep && /CDS/) {
911 $f->primary_tag($PROTEIN_TYPE);
913 ## duplicate problem is Location ..
914 if ($f->location->isa("Bio::Location::SplitLocationI")) {
915 # my($b,$e)=($f->start, $f->end); # is this all we need?
916 my($b,$e)=(-1,0);
917 foreach my $l ($f->location->each_Location) {
918 $b = $l->start if($b<0 || $b > $l->start);
919 $e = $l->end if($e < $l->end);
921 $f->location( Bio::Location::Simple->new(
922 -start => $b, -end => $e, -strand => $f->strand) );
925 $f->add_tag_value( Derives_from => $rna_id );
927 else {
928 $f->add_tag_value( Parent => $rna_id );
931 $f->add_tag_value( ID => $pro_id );
933 move_translation_fasta($f, $pro_id);
934 #if( $f->has_tag('translation')) {
935 # my ($aa) = $f->get_tag_values("translation");
936 # $proteinfa{$pro_id}= $aa;
937 # $f->remove_tag("translation");
938 # $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
940 } elsif ( /region/ ) {
941 $f->primary_tag('gene_component_region');
942 $f->add_tag_value( Parent => $gene_id );
943 } else {
944 return GM_NOT_PART unless $gene_id;
945 $f->add_tag_value( Parent => $gene_id );
948 ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;
950 return GM_NEW_PART;
953 ## was generic_features > add_generic_id
954 sub add_generic_id {
955 my ($f, $ft_name, $flags) = @_;
956 my $method = $f->primary_tag;
957 $method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above
959 if ($f->has_tag('ID')) {
962 elsif ( $f->has_tag($method) ) {
963 my ($name) = $f->get_tag_values($method);
964 $f->add_tag_value( ID => "$method:$name" );
966 elsif($ft_name) { # is this unique ?
967 $f->add_tag_value( ID => $ft_name );
969 else {
970 $idh->generate_unique_persistent_id($f);
973 move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] )
974 if($method =~ /CDS/);
976 # return $io->gff_string($f);
979 sub move_translation_fasta {
980 my ($f, $ft_id) = @_;
981 if( $ft_id && $f->has_tag('translation') ) {
982 my ($aa) = $f->get_tag_values("translation");
983 if($aa && $aa !~ /^length/) {
984 $proteinfa{$ft_id}= $aa;
985 $f->remove_tag("translation");
986 $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
991 sub gff_header {
992 my ($name, $end, $source_type, $source_feat) = @_;
993 $source_type ||= "region";
995 my $info = "$source_type:$name";
996 my $head = "##gff-version $GFF_VERSION\n".
997 "##sequence-region $name 1 $end\n".
998 "# conversion-by bp_genbank2gff3.pl\n";
999 if ($source_feat) {
1000 ## dgg: these header comment fields are not useful when have multi-records, diff organisms
1001 for my $key (qw(organism Note date)) {
1002 my $value;
1003 if ($source_feat->has_tag($key)) {
1004 ($value) = $source_feat->get_tag_values($key);
1006 if ($value) {
1007 $head .= "# $key $value\n";
1008 $info .= ", $value";
1011 $head = "" if $didheader;
1012 } else {
1013 $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";
1015 $didheader++;
1016 return (wantarray) ? ($head,$info) : $head;
1019 sub unflatten_seq {
1020 my $seq = shift;
1022 ## print "# working on $source_type:", $seq->accession, "\n";
1023 my $uh_oh = "Possible gene unflattening error with" . $seq->accession_number .
1024 ": consult STDERR\n";
1026 eval {
1027 $unflattener->unflatten_seq( -seq => $seq,
1028 -noinfer => $noinfer,
1029 -use_magic => 1 );
1032 # deal with unflattening errors
1033 if ( $@ ) {
1034 warn $seq->accession_number . " Unflattening error:\n";
1035 warn "Details: $@\n";
1036 print "# ".$uh_oh;
1039 return 0 if !$seq || !$seq->all_SeqFeatures;
1041 # map feature types to the sequence ontology
1042 ## $tm->map_types_to_SO( -seq => $seq );
1043 #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg
1045 map_types(
1046 $tm,
1047 -seq => $seq,
1048 -type_map => $FTSOmap,
1049 -syn_map => $FTSOsynonyms,
1050 -undefined => "region"
1051 ); #nml
1055 sub filter {
1056 my $seq = shift;
1057 ## return unless $filter;
1058 my @feats;
1059 my @sources; # dgg; pick source features here; only 1 always?
1060 if ($filter) {
1061 for my $f ( $seq->remove_SeqFeatures ) {
1062 my $m = $f->primary_tag;
1063 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1064 push @feats, $f unless $filter =~ /$m/i;
1066 $seq->add_SeqFeature($_) foreach @feats;
1067 } else {
1068 for my $f ( $seq->get_SeqFeatures ){
1069 my $m = $f->primary_tag;
1070 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1074 return @sources;
1078 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
1079 # changed to filter out cloned features. We have to implement the old
1080 # method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
1081 sub get_all_SeqFeatures {
1082 my $seq = shift;
1083 my @flatarr;
1085 foreach my $feat ( $seq->get_SeqFeatures ){
1086 push(@flatarr,$feat);
1087 _add_flattened_SeqFeatures(\@flatarr,$feat);
1089 return @flatarr;
1092 sub gene_name {
1093 my $g = shift;
1094 my $gene_id = ''; # zero it;
1096 if ($g->has_tag('locus_tag')) {
1097 ($gene_id) = $g->get_tag_values('locus_tag');
1100 elsif ($g->has_tag('gene')) {
1101 ($gene_id) = $g->get_tag_values('gene');
1103 elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene
1104 ($gene_id) = $g->get_tag_values('ID');
1107 ## See Unflattener comment:
1108 # on rare occasions, records will have no /gene or /locus_tag
1109 # but it WILL have /product tags. These serve the same purpose
1110 # for grouping. For an example, see AY763288 (also in t/data)
1111 # eg. product=tRNA-Asp ; product=similar to crooked neck protein
1112 elsif ($g->has_tag('product')) {
1113 my ($name)= $g->get_tag_values('product');
1114 ($gene_id) = $name unless($name =~ / /); # a description not name
1117 ## dgg; also handle transposon=xxxx ID/name
1118 # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746
1119 elsif ($g->has_tag('transposon')) {
1120 my ($name)= $g->get_tag_values('transposon');
1121 ($gene_id) = $name unless($name =~ / /); # a description not name
1124 return $gene_id;
1127 # same list as gene_name .. change tag to generic Name
1128 sub convert_to_name {
1129 my $g = shift;
1130 my $gene_id = ''; # zero it;
1132 if ($g->has_tag('gene')) {
1133 ($gene_id) = $g->get_tag_values('gene');
1134 $g->remove_tag('gene');
1135 $g->add_tag_value('Name', $gene_id);
1137 elsif ($g->has_tag('locus_tag')) {
1138 ($gene_id) = $g->get_tag_values('locus_tag');
1139 $g->remove_tag('locus_tag');
1140 $g->add_tag_value('Name', $gene_id);
1143 elsif ($g->has_tag('product')) {
1144 my ($name)= $g->get_tag_values('product');
1145 ($gene_id) = $name unless($name =~ / /); # a description not name
1146 ## $g->remove_tag('product');
1147 $g->add_tag_value('Name', $gene_id);
1150 elsif ($g->has_tag('transposon')) {
1151 my ($name)= $g->get_tag_values('transposon');
1152 ($gene_id) = $name unless($name =~ / /); # a description not name
1153 ## $g->remove_tag('transposon');
1154 $g->add_tag_value('Name', $gene_id);
1156 elsif ($g->has_tag('ID')) {
1157 my ($name)= $g->get_tag_values('ID');
1158 $g->add_tag_value('Name', $name);
1160 return $gene_id;
1164 sub _add_flattened_SeqFeatures {
1165 my ($arrayref,$feat) = @_;
1166 my @subs = ();
1168 if ($feat->isa("Bio::FeatureHolderI")) {
1169 @subs = $feat->get_SeqFeatures;
1171 elsif ($feat->isa("Bio::SeqFeatureI")) {
1172 @subs = $feat->sub_SeqFeature;
1174 else {
1175 warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
1176 "Don't know how to flatten.";
1179 for my $sub (@subs) {
1180 push(@$arrayref,$sub);
1181 _add_flattened_SeqFeatures($arrayref,$sub);
1186 sub map_types {
1188 my ($self, @args) = @_;
1190 my($sf, $seq, $type_map, $syn_map, $undefmap) =
1191 $self->_rearrange([qw(FEATURE
1193 TYPE_MAP
1194 SYN_MAP
1195 UNDEFINED
1197 @args);
1199 if (!$sf && !$seq) {
1200 $self->throw("you need to pass in either -feature or -seq");
1203 my @sfs = ($sf);
1204 if ($seq) {
1205 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
1206 @sfs = $seq->get_all_SeqFeatures;
1208 $type_map = $type_map || $self->typemap; # dgg: was type_map;
1209 foreach my $feat (@sfs) {
1211 $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");
1212 $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");
1214 my $primary_tag = $feat->primary_tag;
1216 #if ($primary_tag =~ /^pseudo(.*)$/) {
1217 # $primary_tag = $1;
1218 # $feat->primary_tag($primary_tag);
1221 my $mtype = $type_map->{$primary_tag};
1222 if ($mtype) {
1223 if (ref($mtype)) {
1224 if (ref($mtype) eq 'ARRAY') {
1225 my $soID;
1226 ($mtype, $soID) = @$mtype;
1228 if ($soID && ref($ONTOLOGY)) {
1229 my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);
1230 $mtype = $term->name if $term;
1232 # if SO ID is undefined AND we have an ontology to search, we want to delete
1233 # the feature type hash entry in order to force a fuzzy search
1234 elsif (! defined $soID && ref($ONTOLOGY)) {
1235 undef $mtype;
1236 delete $type_map->{$primary_tag};
1238 elsif ($undefmap && $mtype eq 'undefined') { # dgg
1239 $mtype= $undefmap;
1242 $type_map->{$primary_tag} = $mtype if $mtype;
1244 elsif (ref($mtype) eq 'CODE') {
1245 $mtype = $mtype->($feat);
1247 else {
1248 $self->throw('must be scalar or CODE ref');
1251 elsif ($undefmap && $mtype eq 'undefined') { # dgg
1252 $mtype= $undefmap;
1254 $feat->primary_tag($mtype);
1257 if ($CONF) {
1258 conf_read();
1259 my %perfect_matches;
1260 while (my ($p_tag,$rules) = each %$YAML) {
1261 RULE:
1262 for my $rule (@$rules) {
1263 for my $tags (@$rule) {
1264 while (my ($tag,$values) = each %$tags) {
1265 for my $value (@$values) {
1266 if ($feat->has_tag($tag)) {
1267 for ($feat->get_tag_values($tag)) {
1268 next RULE unless $_ =~ /\Q$value\E/;
1270 } elsif ($tag eq 'primary_tag') {
1271 next RULE unless $value eq
1272 $feat->primary_tag;
1273 } elsif ($tag eq 'location') {
1274 next RULE unless $value eq
1275 $feat->start.'..'.$feat->end;
1276 } else { next RULE }
1280 $perfect_matches{$p_tag}++;
1283 if (scalar(keys %perfect_matches) == 1) {
1284 $mtype = $_ for keys %perfect_matches;
1285 } elsif (scalar(keys %perfect_matches) > 1) {
1286 warn "There are conflicting rules in the config file for the" .
1287 " following types: ";
1288 warn "\t$_\n" for keys %perfect_matches;
1289 warn "Until conflict resolution is built into the converter," .
1290 " you will have to manually edit the config file to remove the" .
1291 " conflict. Sorry :(. Skipping user preference for this entry";
1292 sleep(2);
1296 if ( ! $mtype && $syn_map) {
1297 if ($feat->has_tag('note')) {
1299 my @all_matches;
1301 my @note = $feat->each_tag_value('note');
1303 for my $k (keys %$syn_map) {
1305 if ($k =~ /"(.+)"/) {
1307 my $syn = $1;
1309 for my $note (@note) {
1311 # look through the notes to see if the description
1312 # is an exact match for synonyms
1313 if ( $syn eq $note ) {
1315 my @map = @{$syn_map->{$k}};
1318 my $best_guess = $map[0];
1320 unshift @{$all_matches[-1]}, [$best_guess];
1322 $mtype = $MANUAL
1323 ? manual_curation($feat, $best_guess, \@all_matches)
1324 : $best_guess;
1326 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1327 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1328 . '#' x 78 . "\n\n";
1330 } else {
1331 # check both primary tag and and note against
1332 # SO synonyms for best matching description
1334 SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);
1341 #unless ($mtype) {
1342 for my $note (@note) {
1343 for my $name (values %$type_map) {
1344 # check primary tag against SO names for best matching
1345 # descriptions //NML also need to check against
1346 # definition && camel case split terms
1348 SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);
1353 if (scalar(@all_matches) && !$mtype) {
1355 my $top_matches = first { defined $_ } @{$all_matches[-1]};
1357 my $best_guess = $top_matches->[0];
1361 # if guess has quotes, it is a synonym term. we need to
1362 # look up the corresponding name term
1363 # otherwise, guess is a name, so we can use it directly
1364 if ($best_guess =~ /"(.+)"/) {
1366 $best_guess = $syn_map->{$best_guess}->[0];
1370 @RETURN = @all_matches;
1371 $mtype = $MANUAL
1372 ? manual_curation($feat, $best_guess, \@all_matches)
1373 : $best_guess;
1375 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1376 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1377 . '#' x 78 . "\n\n";
1381 $mtype ||= $undefmap;
1382 $feat->primary_tag($mtype);
1389 sub SO_fuzzy_match {
1391 my $candidate = shift;
1392 my $primary_tag = shift;
1393 my $note = shift;
1394 my $SO_terms = shift;
1395 my $best_matches_ref = shift;
1396 my $modifier = shift;
1398 $modifier ||= '';
1400 my @feat_terms;
1402 for ( split(" |_", $primary_tag) ) {
1403 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1404 my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1405 push @feat_terms, @camelCase;
1408 for ( split(" |_", $note) ) {
1409 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1410 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1411 (my $word = $_) =~ s/[;:.,]//g;
1412 push @feat_terms, $word;
1416 my @SO_terms = split(" |_", $SO_terms);
1418 # fuzzy match works on a simple point system. When 2 words match,
1419 # the $plus counter adds one. When they don't, the $minus counter adds
1420 # one. This is used to sort similar matches together. Better matches
1421 # are found at the end of the array, near the top.
1423 # NML: can we improve best match by using synonym tags
1424 # EXACT,RELATED,NARROW,BROAD?
1426 my ($plus, $minus) = (0, 0);
1427 my %feat_terms;
1428 my %SO_terms;
1430 #unique terms
1431 map {$feat_terms{$_} = 1} @feat_terms;
1432 map {$SO_terms{$_} = 1} @SO_terms;
1434 for my $st (keys %SO_terms) {
1435 for my $ft (keys %feat_terms) {
1437 ($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++;
1442 push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;
1446 sub manual_curation {
1448 my ($feat, $default_opt, $all_matches) = @_;
1450 my @all_matches = @$all_matches;
1452 # convert all SO synonyms into names and filter
1453 # all matches into unique term list because
1454 # synonyms can map to multiple duplicate names
1456 my (@unique_SO_terms, %seen);
1457 for (reverse @all_matches) {
1458 for (@$_) {
1459 for (@$_) {
1460 #my @names;
1461 if ($_ =~ /"(.+)"/) {
1462 for (@{$SYN_MAP->{$_}}) {
1463 push @unique_SO_terms, $_ unless $seen{$_};
1464 $seen{$_}++;
1466 } else {
1467 push @unique_SO_terms, $_ unless $seen{$_};
1468 $seen{$_}++;
1474 my $s = scalar(@unique_SO_terms);
1476 my $choice = 0;
1478 my $more =
1479 "[a]uto : automatic input (selects best guess for remaining entries)\r" .
1480 "[f]ind : search for other SO terms matching your query (e.g. f gene)\r" .
1481 "[i]nput : add a specific term\r" .
1482 "[r]eset : reset to the beginning of matches\r" .
1483 "[s]kip : skip this entry (selects best guess for this entry)\r"
1486 $more .=
1487 "[n]ext : view the next ".OPTION_CYCLE." terms\r" .
1488 "[p]rev : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE);
1490 my $msg = #"\n\n" . '-' x 156 . "\n"
1491 "The converter found $s possible matches for the following GenBank entry: ";
1493 my $directions =
1494 "Type a number to select the SO term that best matches"
1495 . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";
1498 # lookup filtered list to pull out definitions
1499 my @options = map {
1500 my $term = $_;
1501 my %term;
1502 for (['name', 'name'], ['def', 'definition'], ['synonym',
1503 'each_synonym']) {
1504 my ($label, $method) = @$_;
1505 $term{$label} = \@{[$term->$method]};
1507 [++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ];
1508 } map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms;
1511 my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
1512 $default_opt, @options);
1514 if ($option eq 'skip') { return $default_opt
1515 } elsif ($option eq 'auto') {
1516 $MANUAL = 0;
1517 return $default_opt;
1518 } else { return $option }
1522 sub options_cycle {
1524 my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;
1526 #NML: really should only call GenBank_entry once. Will need to change
1527 #method to return array & shift off header
1528 my $entry = GenBank_entry($feat, "\r");
1530 my $total = scalar(@opt);
1532 ($start,$stop) = (0, OPTION_CYCLE)
1533 if ( ($start < 0) && ($stop > 0) );
1535 ($start,$stop) = (0, OPTION_CYCLE)
1536 if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);
1538 ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
1539 ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;
1541 $stop = $total if $stop > $total;
1543 my $dir_copy = $directions;
1544 my $msg_copy = $msg;
1545 my $format = "format STDOUT = \n" .
1546 '-' x 156 . "\n" .
1547 '^' . '<' x 77 . '| Available Commands:' . "\n" .
1548 '$msg_copy' . "\n" .
1549 '-' x 156 . "\n" .
1550 ' ' x 78 . "|\n" .
1551 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
1552 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
1553 (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
1554 ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n";
1557 # eval throws redefined warning that breaks formatting.
1558 # Turning off warnings just for the eval to fix this.
1559 no warnings 'redefine';
1560 eval $format;
1563 write;
1565 print '-' x 156 . "\n" .
1566 'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
1567 " - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
1568 "\n" . '-' x 156 . "\n";
1570 for (my $i = $start; $i < $stop; $i+=2) {
1572 my ($left, $right) = @opt[$i,$i+1];
1574 my ($nL, $nmL, $descL, $termL, @synL) = @$left;
1576 #odd numbered lists can cause fatal undefined errors, so check
1577 #to make sure we have data
1579 my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);
1582 my $format = "format STDOUT = \n";
1584 $format .=
1585 ' ' x 78 . "|\n" .
1587 '@>>: name: ^' . '<' x 64 . '~' . ' |' .
1588 ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" .
1589 '$nL,' . ' ' x 7 . '$nmL,' .
1590 ( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" .
1592 ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
1593 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
1594 ' ' x 11 . '$nmL,' .
1595 (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
1596 #' ' x 78 . '|' . "\n" .
1599 ' def: ^' . '<' x 65 . ' |' .
1600 (ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" .
1601 ' ' x 11 . '$descL,' .
1602 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1605 (' ^' . '<' x 65 . ' |' .
1606 (ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" .
1607 ' ' x 11 . '$descL,' .
1608 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .
1611 ' ^' . '<' x 61 . '...~ |' .
1612 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
1613 ' ' x 11 . '$descL,' .
1614 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1616 ".\n";
1619 # eval throws redefined warning that breaks formatting.
1620 # Turning off warnings just for the eval to fix this.
1621 no warnings 'redefine';
1622 eval $format;
1624 write;
1627 print '-' x 156 . "\nenter a command:";
1629 while (<STDIN>) {
1631 (my $input = $_) =~ s/\s+$//;
1633 if ($input =~ /^\d+$/) {
1634 if ( $input && defined $opt[$input-1] ) {
1635 return $opt[$input-1]->[1]
1636 } else {
1637 print "\nThat number is not an option. Please enter a valid number.\n:";
1639 } elsif ($input =~ /^n/i | $input =~ /next/i ) {
1640 return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
1641 $feat, $directions, $best_guess, @opt)
1642 } elsif ($input =~ /^p/i | $input =~ /prev/i ) {
1643 return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
1644 $feat, $directions, $best_guess, @opt)
1645 } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
1646 } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
1647 } elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
1648 return manual_curation($feat, $best_guess, \@RETURN );
1649 } elsif ( $input =~ /^f/i || $input =~ /find/i ) {
1651 my ($query, @query_results);
1653 if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
1654 } else {
1656 #do a SO search
1657 print "Type your search query\n:";
1658 while (<STDIN>) { chomp($query = $_); last }
1661 for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
1662 SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
1665 return manual_curation($feat, $best_guess, \@query_results);
1667 } elsif ( $input =~ /^i/i || $input =~ /input/i ) {
1669 #NML fast input for later
1670 #my $query;
1671 #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };
1673 #manual input
1674 print "Type the term you want to use\n:";
1675 while (<STDIN>) {
1676 chomp(my $input = $_);
1678 if (! $TYPE_MAP->{$input}) {
1680 print "\"$input\" doesn't appear to be a valid SO term. Are ".
1681 "you sure you want to use it? (y or n)\n:";
1683 while (<STDIN>) {
1685 chomp(my $choice = $_);
1687 if ($choice eq 'y') {
1688 print
1689 "\nWould you like to save your preference for " .
1690 "future use (so you don't have to redo manual " .
1691 "curation for this feature everytime you run " .
1692 "the converter)? (y or n)\n";
1694 #NML: all these while loops are a mess. Really should condense it.
1695 while (<STDIN>) {
1697 chomp(my $choice = $_);
1699 if ($choice eq 'y') {
1700 curation_save($feat, $input);
1701 return $input;
1702 } elsif ($choice eq 'n') {
1703 return $input
1704 } else {
1705 print "\nDidn't recognize that command. Please " .
1706 "type y or n.\n:"
1711 } elsif ($choice eq 'n') {
1712 return options_cycle($start, $stop, $msg, $feat,
1713 $directions, $best_guess, @opt)
1714 } else {
1715 print "\nDidn't recognize that command. Please " .
1716 "type y or n.\n:"
1720 } else {
1721 print
1722 "\nWould you like to save your preference for " .
1723 "future use (so you don't have to redo manual " .
1724 "curation for this feature everytime you run " .
1725 "the converter)? (y or n)\n";
1727 #NML: all these while loops are a mess. Really should condense it.
1728 while (<STDIN>) {
1730 chomp(my $choice = $_);
1732 if ($choice eq 'y') {
1733 curation_save($feat, $input);
1734 return $input;
1735 } elsif ($choice eq 'n') {
1736 return $input
1737 } else {
1738 print "\nDidn't recognize that command. Please " .
1739 "type y or n.\n:"
1746 } else {
1747 print "\nDidn't recognize that command. Please re-enter your choice.\n:"
1753 sub GenBank_entry {
1754 my ($f, $delimiter, $num) = @_;
1756 $delimiter ||= "\n";
1759 my $entry =
1761 ($num ? ' [1] ' : ' ' x 5) . $f->primary_tag
1762 . ($num
1763 ? ' ' x (12 - length $f->primary_tag ) . ' [2] '
1764 : ' ' x (15 - length $f->primary_tag)
1766 . $f->start.'..'.$f->end
1768 . "$delimiter";
1770 if ($num) {
1771 words_tag($f, \$entry);
1772 } else {
1773 for my $tag ($f->all_tags) {
1774 for my $val ( $f->each_tag_value($tag) ) {
1775 $entry .= ' ' x 20;
1776 #$entry .= "/$tag=\"$val\"$delimiter";
1777 $entry .= $val eq '_no_value'
1778 ? "/$tag$delimiter"
1779 : "/$tag=\"$val\"$delimiter";
1785 return $entry;
1789 sub gff_validate {
1790 warn "Validating GFF file\n" if $DEBUG;
1791 my @feat = @_;
1793 my (%parent2child, %all_ids, %descendants, %reserved);
1795 for my $f (@feat) {
1796 for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) {
1797 map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0])
1798 if $f->has_tag($$aTags[0]);
1802 if ($SO_FILE) {
1803 while (my ($parentID, $aChildren) = each %parent2child) {
1804 parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved);
1808 id_validate(\%all_ids, \%reserved);
1811 sub parent_validate {
1812 my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;
1814 my $aParents = $hAll->{$parentID};
1816 map {
1817 my $child = $_;
1818 $child->add_tag_value( validation_error =>
1819 "feature tried to add Parent tag, but no Parent found with ID $parentID"
1821 my %parents;
1822 map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1823 delete $parents{$parentID};
1824 my @parents = keys %parents;
1826 $child->remove_tag('Parent');
1828 unless ($child->has_tag('ID')) {
1829 my $id = gene_name($child);
1830 $child->add_tag_value('ID', $id);
1831 push @{$hAll->{$id}}, $child
1834 $child->add_tag_value('Parent', @parents) if @parents;
1836 } @$aChildren and return unless scalar(@$aParents);
1838 my $par = join(',', map { $_->primary_tag } @$aParents);
1839 warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;
1841 #NML manual curation needs to happen here
1844 my %parentsToRemove;
1846 CHILD:
1847 for my $child (@$aChildren) {
1848 my $childType = $child->primary_tag;
1850 warn "WORKING ON $childType at ".$child->start.' to '.$child->end
1851 if $DEBUG;
1853 for (my $i = 0; $i < scalar(@$aParents); $i++) {
1854 my $parent = $aParents->[$i];
1855 my $parentType = $parent->primary_tag;
1857 warn "CHECKING $childType against $parentType" if $DEBUG;
1859 #cache descendants so we don't have to do repeat searches
1860 unless ($hDescendants->{$parentType}) {
1862 for my $term ($ONTOLOGY->find_terms(
1863 -name => $parentType
1864 ) ) {
1866 map {
1867 $hDescendants->{$parentType}{$_->name}++
1868 } $ONTOLOGY->get_descendant_terms($term);
1872 # NML: hopefully temporary fix.
1873 # SO doesn't consider exon/CDS to be a child of mRNA
1874 # even though common knowledge dictates that they are
1875 # This cheat fixes the false positives for now
1876 if ($parentType eq 'mRNA') {
1877 $hDescendants->{$parentType}{'exon'} = 1;
1878 $hDescendants->{$parentType}{'CDS'} = 1;
1883 warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
1884 " be a child of $parentType?" if $DEBUG;
1885 if ($hDescendants->{$parentType}{$childType}) {
1886 warn "\tYES, $childType can be a child of $parentType" if $DEBUG;
1888 #NML need to deal with multiple children matched to multiple different
1889 #parents. This model only assumes the first parent id that matches a child will
1890 #be the reserved feature.
1892 $hReserved->{$parentID}{$parent}{'parent'} = $parent;
1893 push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;
1895 #mark parent for later removal from all IDs
1896 #so we don't accidentally change any parents
1898 $parentsToRemove{$i}++;
1900 next CHILD;
1906 #NML shouldn't have to check this; somehow child can lose Parent
1907 #it's happening W3110
1908 #need to track this down
1909 if ( $child->has_tag('Parent') ) {
1911 warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
1912 if $DEBUG;
1914 my %parents;
1916 map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1918 delete $parents{$parentID};
1919 my @parents = keys %parents;
1921 warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
1922 ' to ' . $child->end . " cannot be a child of ID $parentID"
1923 if $DEBUG;
1925 $child->add_tag_value( validation_error =>
1926 "feature cannot be a child of $parentID");
1928 $child->remove_tag('Parent');
1930 unless ($child->has_tag('ID')) {
1931 my $id = gene_name($child);
1932 $child->add_tag_value('ID', $id);
1933 push @{$hAll->{$id}}, $child
1936 $child->add_tag_value('Parent', @parents) if @parents;
1941 #delete $aParents->[$_] for keys %parentsToRemove;
1942 splice(@$aParents, $_, 1) for keys %parentsToRemove;
1945 sub id_validate {
1946 my ($hAll, $hReserved) = @_;
1949 for my $id (keys %$hAll) {
1951 #since 1 feature can have this id,
1952 #let's just shift it off and uniquify
1953 #the rest (unless it's reserved)
1955 shift @{$hAll->{$id}} unless $hReserved->{$id};
1956 for my $feat (@{$hAll->{$id}}) {
1957 id_uniquify(0, $id, $feat, $hAll);
1961 for my $parentID (keys %$hReserved) {
1963 my @keys = keys %{$hReserved->{$parentID}};
1965 shift @keys;
1967 for my $k (@keys) {
1968 my $parent = $hReserved->{$parentID}{$k}{'parent'};
1969 my $aChildren= $hReserved->{$parentID}{$k}{'children'};
1971 my $value = id_uniquify(0, $parentID, $parent, $hAll);
1972 for my $child (@$aChildren) {
1974 my %parents;
1975 map { $parents{$_}++ } $child->get_tag_values('Parent');
1976 $child->remove_tag('Parent');
1977 delete $parents{$parentID};
1978 $parents{$value}++;
1979 my @parents = keys %parents;
1980 $child->add_tag_value('Parent', @parents);
1987 sub id_uniquify {
1988 my ($count, $value, $feat, $hAll) = @_;
1990 warn "UNIQUIFYING $value" if $DEBUG;
1992 if (! $count) {
1993 $feat->add_tag_value(Alias => $value);
1994 $value .= ('.' . $feat->primary_tag)
1995 } elsif ($count == 1) {
1996 $value .= ".$count"
1997 } else {
1998 chop $value;
1999 $value .= $count
2001 $count++;
2003 warn "ENDED UP WITH $value" if $DEBUG;
2004 if ( $hAll->{$value} ) {
2005 warn "$value IS ALREADY TAKEN" if $DEBUG;
2006 id_uniquify($count, $value, $feat, $hAll);
2007 } else {
2008 #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;
2009 $feat->remove_tag('ID');
2010 $feat->add_tag_value('ID', $value);
2011 push @{$hAll->{$value}}, $value;
2014 $value;
2017 sub conf_read {
2019 print "\nCannot read $CONF. Change file permissions and retry, " .
2020 "or enter another file\n" and conf_locate() unless -r $CONF;
2022 print "\nCannot write $CONF. Change file permissions and retry, " .
2023 "or enter another file\n" and conf_locate() unless -w $CONF;
2025 $YAML = LoadFile($CONF);
2029 sub conf_create {
2031 my ($path, $input) = @_;
2033 print "Cannot write to $path. Change directory permissions and retry " .
2034 "or enter another save path\n" and conf_locate() unless -w $path;
2036 $CONF = $input;
2038 open(FH, '>', $CONF);
2039 close(FH);
2040 conf_read();
2045 sub conf_write { DumpFile($CONF, $YAML) }
2047 sub conf_locate {
2049 print "\nEnter the location of a previously saved config, or a new " .
2050 "path and file name to create a new config (this step is " .
2051 "necessary to save any preferences)";
2053 print "\n\nenter a command:";
2055 while (<STDIN>) {
2056 chomp(my $input = $_);
2057 my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);
2059 if (-e $input && (! -d $input)) {
2061 print "\nReading $input...\n";
2062 $CONF = $input;
2064 conf_read();
2065 last;
2067 } elsif (! -d $input && $fn.$suffix) {
2069 print "Creating $input...\n";
2070 conf_create($path, $input);
2071 last;
2073 } elsif (-e $input && -d $input) {
2074 print "You only entered a directory. " .
2075 "Please enter BOTH a directory and filename\n";
2076 } else {
2077 print "$input does not appear to be a valid path. Please enter a " .
2078 "valid directory and filename\n";
2080 print "\nenter a command:";
2084 sub curation_save {
2086 my ($feat, $input) = @_;
2088 #my $error = "Enter the location of a previously saved config, or a new " .
2089 # "path and file name to create a new config (this step is " .
2090 # "necessary to save any preferences)\n";
2092 if (!$CONF) {
2093 print "\n\n";
2094 conf_locate();
2095 } elsif (! -e $CONF) {
2096 print "\n\nThe config file you have chosen doesn't exist.\n";
2097 conf_locate();
2098 } else { conf_read() }
2100 my $entry = GenBank_entry($feat, "\r", 1);
2102 my $msg = "Term entered: $input";
2103 my $directions = "Please select any/all tags that provide evidence for the term you
2104 have entered. You may enter multiple tags by separating them by commas/dashes
2105 (e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have
2106 the option of either selecting the entire note as evidence, or specific
2107 keywords. If a tag has multiple keywords, they will be tagged alphabetically
2108 for selection. To select a specific keyword in a tag field, you must enter the
2109 tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
2110 selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have
2111 to be to match your curation. To match the GenBank entry exactly as it
2112 appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your
2113 preference, you will no longer be prompted to choose a feature type for any
2114 matching entries until you edit the curation.ini file.";
2115 my $msg_copy = $msg;
2116 my $dir_copy = $directions;
2118 my $format = "format STDOUT = \n" .
2119 '-' x 156 . "\n" .
2120 '^' . '<' x 77 . '| Directions:' . "\n" .
2121 '$msg_copy' . "\n" .
2122 '-' x 156 . "\n" .
2123 ' ' x 78 . "|\n" .
2124 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
2125 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
2126 (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
2127 ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n";
2130 # eval throws redefined warning that breaks formatting.
2131 # Turning off warnings just for the eval to fix this.
2132 no warnings 'redefine';
2133 eval $format;
2136 write;
2137 print '-' x 156 . "\nenter a command:";
2139 my @tags = words_tag($feat);
2141 my $final = {};
2142 my $choices;
2143 while (<STDIN>) {
2145 chomp(my $choice = $_);
2147 if (scalar(keys %$final) && $choice =~ /^y/i) { last
2149 } elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input)
2151 } elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n";
2153 } elsif ($choice eq 'all') {
2155 $choice = '';
2156 for (my $i=1; $i < scalar(@tags); $i++) {
2157 $choice .= "$i,";
2159 chop $choice;
2161 #print "CHOICE [$choice]";
2163 my @selections;
2164 for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
2165 if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) {
2167 for ($1..$2) { push @selections, $_ }
2169 my $dangling_alphas = $3;
2170 alpha_expand($dangling_alphas, \@selections);
2172 } else {
2173 alpha_expand($_, \@selections);
2177 foreach my $numbers (@selections) {
2179 my @c = split(/(?=[\w])/, $numbers);
2180 s/\W+//g foreach @c;
2181 my $num;
2184 $^W = 0;
2185 $num = 0 + shift @c;
2188 my $tag = $tags[$num];
2189 if (ref $tag && scalar(@c)) {
2190 my $no_value;
2191 foreach (@c) {
2192 if (defined $tag->{$_}) {
2193 $choices .= "${num}[$_] ";
2194 my ($t,$v) = @{$tag->{$_}};
2195 push @{${$final->{$input}}[0]{$t}}, $v;
2197 } else { $no_value++ }
2200 if ($no_value) {
2201 _selection_add($tag,$final,$input,\$choices,$num);
2202 #my ($t,$v) = @{$tag->{'all'}};
2203 #unless (defined ${$final->{$input}}[0]{$t}) {
2204 #$choices .= "$num, ";
2205 #push @{${$final->{$input}}[0]{$t}}, $v
2209 $choices = substr($choices, 0, -2);
2210 $choices .= ', ';
2212 } elsif (ref $tag) {
2213 _selection_add($tag,$final,$input,\$choices,$num);
2214 #my ($t,$v) = @{$tag->{'all'}};
2215 #unless (defined ${$final->{$input}}[0]{$t}) {
2216 #$choices .= "$num, ";
2217 #push @{${$final->{$input}}[0]{$t}}, $v
2221 $choices = substr($choices, 0, -2) if $choices;
2222 if ($final) {
2223 print "\nYou have chosen the following tags:\n$choices\n";
2224 print "This will be written to the config file as:\n";
2225 print Dump $final;
2226 print "\nIs this correct? (y or n)\n";
2227 } else { print "\nInvalid selection. Please try again\n" }
2229 push @{$YAML->{$input}}, $final->{$input};
2230 conf_write();
2233 # words_tag() splits each tag value string into multiple words so that the
2234 # user can select the parts he/she wants to use for curation
2235 # it can tag 702 (a - zz) separate words; this should be enough
2237 sub words_tag {
2239 my ($feat, $entry) = @_;
2241 my @tags;
2243 @tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]});
2244 my $i = 3;
2245 foreach my $tag ($feat->all_tags) {
2246 foreach my $value ($feat->each_tag_value($tag)) {
2248 my ($string, $tagged_string);
2250 my @words = split(/(?=\w+?)/, $value);
2252 my $pos = 0;
2255 foreach my $word (@words) {
2257 (my $sanitized_word = $word) =~ s/\W+?//g;
2258 $string .= $word;
2260 my $lead = int($pos/ALPHABET_DIVISOR);
2261 my $lag = $pos % ALPHABET_DIVISOR;
2263 my $a = $lead ? ${(ALPHABET)}[$lead-1] : '';
2264 $a .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2266 $tagged_string .= " ($a) $word";
2268 $tags[$i]{$a} = [$tag, $sanitized_word];
2269 $pos++;
2272 $value = $tagged_string if scalar(@words) > 1;
2274 $$entry .= "[$i] /$tag=\"$value\"\r";
2276 $tags[$i]{'all'} = [$tag, $string];
2278 $i++;
2281 return @tags;
2285 sub alpha_expand {
2287 my ($dangling_alphas, $selections) = @_;
2289 if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {
2291 my $digit = $1;
2292 push @$selections, $digit if $digit;
2294 my $start = $2;
2295 my $stop = $3;
2297 my @starts = split('', $start);
2298 my @stops = split('', $stop);
2300 my ($final_start, $final_stop);
2302 for ([\$final_start, \@starts], [\$final_stop, \@stops]) {
2304 my ($final, $splits) = @$_;
2306 my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]};
2307 my $rem;
2310 if ($$splits[1]) {
2311 $rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]};
2312 $int++
2313 } else {
2314 $rem = $int;
2315 $int = 0;
2319 $$final = $int * ALPHABET_DIVISOR;
2320 $$final += $rem;
2324 my $last_number = pop @$selections;
2325 for my $pos ($final_start..$final_stop) {
2326 my $lead = int($pos/ALPHABET_DIVISOR);
2327 my $lag = $pos % ALPHABET_DIVISOR;
2328 my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : '';
2329 $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2330 push @$selections, $last_number.$alpha;
2333 } elsif (defined($dangling_alphas)) {
2334 if ($dangling_alphas =~ /^\d/) {
2335 push @$selections, $dangling_alphas;
2336 } elsif ($dangling_alphas =~ /^\D/) {
2337 #print "$dangling_alphas ".Dumper @$selections;
2338 my $last_number = pop @$selections;
2339 $last_number ||= '';
2340 push @$selections, $last_number.$dangling_alphas;
2341 #$$selections[-1] .= $dangling_alphas;
2347 sub _selection_add {
2349 my ($tag, $final, $input, $choices, $num) = @_;
2350 my ($t,$v) = @{$tag->{'all'}};
2351 unless (defined ${$final->{$input}}[0]{$t}) {
2352 $$choices .= "$num, ";
2353 push @{${$final->{$input}}[0]{$t}}, $v