12 bp_genbank2gff.pl - Load a Bio::DB::GFF database from GENBANK files.
16 % bp_genbank2gff.pl -d genbank -f localfile.gb
17 % bp_genbank2gff.pl -d genbank --accession AP003256
18 % bp_genbank2gff.pl --accession AP003256 --stdout
22 This script loads a Bio::DB::GFF database with the features contained
23 in either a local genbank file or an accession that is fetched from
24 genbank. Various command-line options allow you to control which
25 database to load and whether to allow an existing database to be
28 The database must already have been created and the current user must
29 have appropriate INSERT and UPDATE privileges. The --create option
30 will initialize a new database with the appropriate schema, deleting
31 any tables that were already there.
33 =head1 COMMAND-LINE OPTIONS
35 Command-line options can be abbreviated to single-letter options.
36 e.g. -d instead of --dsn.
38 --create Force creation and initialization of database
39 --dsn <dsn> Data source (default dbi:mysql:test)
40 --user <user> Username for mysql authentication
41 --pass <password> Password for mysql authentication
42 --proxy <proxy> Proxy server to use for remote access
43 --stdout direct output to STDOUT
44 --adaptor <adaptor> adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)
--viral the genome you are loading is viral (changes tag
46 --source <source> source field for features ['genbank']
47 EITHER --file Arguments that follow are Genbank/EMBL file names
48 OR --gb_folder What follows is a folder full of gb files to process
OR --accession Arguments that follow are genbank accession numbers
50 OR --acc_file Accession numbers (not gi!) in a file (one per line, no punc.)
51 OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one
57 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
61 Scott Cain, cain@cshl.org
63 Copyright (c) 2003 Cold Spring Harbor Laboratory
65 This library is free software; you can redistribute it and/or modify
66 it under the same terms as Perl itself. See DISCLAIMER.txt for
67 disclaimers of warranty.
71 package Bio
::DB
::GFF
::Adaptor
::biofetch_to_stdout
;
73 use Bio
::DB
::GFF
::Util
::Rearrange
;
74 use Bio
::DB
::GFF
::Adaptor
::biofetch
;
76 @ISA = 'Bio::DB::GFF::Adaptor::biofetch';
# Takes the adaptor object and a hashref of feature fields ($options) and
# builds the list of GFF3 attributes for that feature.
79 my ($self,$options) = @_;
80 # synthesize GFF3-compatible line
# When the feature carries a group name (gname), derive Parent= and ID= from it.
82 if (my $id = $options->{gname
}) {
# For mRNA/transcript/exon/gene features, strip a trailing
# ".<one character><digits>" suffix from the parent name.
84 $parent =~ s/\..\d+$// if $options->{method
} =~ /^(mRNA|transcript|exon|gene)$/;
# Only the feature types listed here receive a Parent= attribute.
85 push @attributes,"Parent=".escape
($parent) if $options->{method
} =~ /^(variation|exon|CDS|transcript|mRNA|coding)$/;
# Every feature type except exon and CDS receives an ID= attribute.
86 push @attributes,"ID=".escape
($id) unless $options->{method
} =~ /^(exon|CDS)$/;
# A Target= attribute is added only when tstart holds a true value.
88 if (my $tstart = $options->{tstart
}) {
89 my $tstop = $options->{tstop
};
90 my $target = escape
($options->{gname
});
91 push @attributes,"Target=$target+$tstart+$tstop";
# Extra attributes arrive as [tag, value] pairs; group the escaped values
# under their escaped tag.
94 if (my $attributes = $options->{attributes
}) {
95 for my $a (@
$attributes) {
96 my ($tag,$value) = @
$a;
97 push @
{$a{escape
($tag)}},escape
($value);
# One "tag=v1,v2,..." attribute per tag, values joined with commas.
100 push @attributes,"$a=".join(',',@
{$a{$a}});
# GFF writes "." for an empty column.  Score and phase are tested with
# defined/length instead of plain truth so that a legitimate value of 0
# (for example the phase of a CDS) is kept rather than replaced by ".".
$options->{score}  = "." unless defined $options->{score} && length $options->{score};
# Strand keeps the original truthiness test: any false value becomes ".".
$options->{strand} = "." unless $options->{strand};
$options->{phase}  = "." unless defined $options->{phase} && length $options->{phase};
# The last GFF column is the attribute list joined with ';'.
106 my $last_column = join ';',@attributes;
# A feature whose method is 'origin' additionally emits a
# ##sequence-region directive built from its gname, start and stop.
107 if ($options->{method
} eq 'origin') {
108 print "##sequence-region $options->{gname} $options->{start} $options->{stop}\n";
# Print the eight fixed columns plus the attribute column, tab-separated,
# to the default output handle.
110 print join("\t",@
{$options}{qw(ref source method start stop score strand phase)},$last_column),"\n";
# Prints a sequence as a FASTA-style record on the default output handle.
113 sub load_sequence_string
{
# Insert a newline after every run of up to 60 characters.
117 $seq =~ s/(.{1,60})/$1\n/g;
# Header line ">accession", then the sequence lower-cased by \L.
118 print ">$acc\n\L$seq\U\n";
# Emit the GFF3 version directive on the default output handle.
123 print "##gff-version 3\n";
134 Usage: $0 [options] [<gff file 1> <gff file 2>] ...
135 Load a Bio::DB::GFF database from GFF files.
138 --create Force creation and initialization of database
139 --dsn <dsn> Data source (default dbi:mysql:test)
140 --user <user> Username for mysql authentication
141 --pass <password> Password for mysql authentication
142 --proxy <proxy> Proxy server to use for remote access
143 --stdout direct output to STDOUT
144 --adaptor <adaptor> adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)
145 --viral the genome you are loading is viral (changes tag
147 --source <source> source field for features ['genbank']
148 EITHER --file Arguments that follow are Genbank/EMBL file names
149 OR --gb_folder What follows is a folder full of gb files to process
150 OR --accession Arguments that follow are genbank accession numbers
152 OR --acc_file Accession numbers (not gi!) in a file (one per line,
154 OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one
158 This script loads a Bio::DB::GFF database with the features contained
159 in a either a local genbank file or an accession that is fetched from
160 genbank. Various command-line options allow you to control which
161 database to load and whether to allow an existing database to be
167 my ($DSN,$ADAPTOR,$CREATE,$USER,$VIRAL,$PASSWORD,$gbFOLDER,
168 $FASTA,$ACC,$accFILE, $accPIPE, $FILE,$PROXY,$STDOUT,$SOURCE);
174 'password:s' => \
$PASSWORD,
175 'adaptor:s' => \
$ADAPTOR,
176 'accession' => \
$ACC,
179 'acc_file' => \
$accFILE,
180 'acc_pipe' => \
$accPIPE,
181 'source:s' => \
$SOURCE,
182 'gb_folder=s' => \
$gbFOLDER,
183 'proxy:s' => \
$PROXY,
184 'stdout' => \
$STDOUT,
185 'create' => \
$CREATE) or die $USAGE;
# At a minimum we need to have a place to write to: a DSN or STDOUT.
die $USAGE unless ($DSN || $STDOUT);

# some local defaults
$DSN     ||= 'dbi:mysql:test';
$ADAPTOR ||= $STDOUT ? 'memory' : 'dbi::mysql';

# Ensure that biofetch inherits from the "right" adaptor.
# This is a horrible hack and should be fixed.
# The string eval ends in "1" so that a failed load is detected here, with a
# clear message, instead of surfacing later as a method-resolution error.
eval "use Bio::DB::GFF::Adaptor::${ADAPTOR}; 1"
    or die "Can't load adaptor 'Bio::DB::GFF::Adaptor::${ADAPTOR}': $@";
local @Bio::DB::GFF::Adaptor::biofetch::ISA = "Bio::DB::GFF::Adaptor::${ADAPTOR}";

# When writing to STDOUT, use the printing adaptor and pass no DSN.
my $biofetch = $STDOUT ? 'biofetch_to_stdout' : 'biofetch';
my @dsn      = $STDOUT ? () : (-dsn => $DSN);

# Pass on only the connection options that were actually supplied.
push @auth, (-user  => $USER)     if defined $USER;
push @auth, (-pass  => $PASSWORD) if defined $PASSWORD;
push @auth, (-proxy => $PROXY)    if defined $PROXY;
207 my %preferred_tags = (
216 $preferred_tags{'product'} = 90 if $VIRAL; # added this to the default list for viral genomes
217 # since most functions come from post-translational processing, so the default labels are c**p!
219 my $db = Bio
::DB
::GFF
->new(-adaptor
=>$biofetch,
222 -preferred_tags
=> \
%preferred_tags,
223 -source
=> $SOURCE || 'Genbank')
224 or die "Can't open database: ",Bio
::DB
::GFF
->error,"\n";
# --file and --accession both need at least one remaining command-line argument.
230 die "you must specify either an accession to retrieve from\nembl or a local file containing data in embl format\n" if (($FILE || $ACC) && !scalar(@ARGV));
# Fetch the accession held in $_: names starting with NC_ are requested as
# 'refseq', everything else as 'embl'.  Progress is reported through status().
234 status
(loading
=> $_);
235 my $result = $db->load_from_embl(/^NC_/?
'refseq':'embl' => $_);
236 status
(done
=> $result);
# Load the local file named in $_ and report the outcome.
243 status
('loading' => $_);
244 my $result = $db->load_from_file($_);
245 status
(done
=> $result);
# The next argument names the file that lists the accession numbers.
my $filename = shift;
# Test defined/length rather than truth so a file literally named "0" is
# accepted; the message names the real option, --acc_file.
die "you must supply a filename after the --acc_file command line flag\n"
    unless defined $filename && length $filename;
# The path must exist and must not be a directory.
die "file $filename does not exist\n" unless (-e $filename && !(-d $filename));
open my $IN, '<', $filename or die "Could not read file '$filename' for reading accession numbers: $!\n";
255 while (my $line = <$IN>){
257 status
(loading
=> $line);
# Test the accession just read into $line; a bare /^NC_/ would match against
# $_, which this loop does not assign.  NC_ accessions are fetched as
# 'refseq', all others as 'embl'.
my $result = $db->load_from_embl(($line =~ /^NC_/ ? 'refseq' : 'embl') => $line);
259 status
(done
=> $result);
# The folder must exist and be a directory.
die "folder $dir does not exist\n" unless (-e $dir && -d $dir);
# Use low-precedence "or": with "||" the die was bound to the directory-name
# string instead of to opendir, so a failed opendir went unreported.
opendir(DIR, $dir) or die "can't open directory $dir for reading: $!\n";
# Read all directory entries at once.
my @files = readdir DIR;
# Visit every entry read from the folder.
270 foreach my $file(@files){
# Entries that do not exist or that are directories are reported on STDERR.
271 if (!(-e
"$gbFOLDER/$file") || (-d
"$gbFOLDER/$file")){
272 print STDERR
" $gbFOLDER/$file is not a filename! Skipping...\n";
# Load the entry as a file and report the outcome on STDERR.
275 my $result = $db->load_from_file("$gbFOLDER/$file");
276 print STDERR
$result ?
"ok\n" : "failed\n";
# Read every line of standard input at once; each line is one accession.
279 my @accessions = <STDIN
>;
# Fetch each accession: names starting with NC_ are requested as 'refseq',
# everything else as 'embl'.
281 foreach (@accessions){
282 status
(loading
=> $_);
283 my $result = $db->load_from_embl(/^NC_/?
'refseq':'embl' => $_);
284 status
(done
=> $result);
# Load the local file named in $_ and report the outcome.
293 status
(loading
=> $_);
294 my $result = $db->load_from_file($_);
295 status
(done
=> $result);
# Abort when $done is false.  NOTE(review): $done is presumably set by each
# input branch; the assignments are not visible here - confirm.
298 $done || die "\n\nno source of data provided\n\n";
# Progress reporter.  Takes a state ('loading' or 'done') and a second value:
# the item name for 'loading', the load result for 'done'.  Writes to STDERR.
303 my ($state,$msg) = @_;
# 'loading' starts a progress line without a trailing newline.
305 if ($state eq 'loading') {
306 print STDERR
"Loading $msg...";
# 'done' finishes that line with "ok" or "failed", chosen by the truth of $msg.
307 } elsif ($state eq 'done') {
308 print STDERR
$msg ?
"ok\n" : "failed\n";