A test to ensure Bio::PrimarySeqI->trunc() doesn't use clone() for a Bio::Seq::RichSe...
[bioperl-live.git] / scripts / Bio-DB-GFF / bp_genbank2gff.pl
blob2e1144630320033b6c9cc2cda94b16d554022680
1 #!/usr/bin/perl
3 use lib '.';
5 use strict;
6 use warnings;
7 use Bio::DB::GFF;
8 use Getopt::Long;
10 =head1 NAME
12 bp_genbank2gff.pl - Load a Bio::DB::GFF database from GENBANK files.
14 =head1 SYNOPSIS
16 % bp_genbank2gff.pl -d genbank -f localfile.gb
17 % bp_genbank2gff.pl -d genbank --accession AP003256
18 % bp_genbank2gff.pl --accession AP003256 --stdout
20 =head1 DESCRIPTION
22 This script loads a Bio::DB::GFF database with the features contained
23 in a either a local genbank file or an accession that is fetched from
24 genbank. Various command-line options allow you to control which
25 database to load and whether to allow an existing database to be
26 overwritten.
28 The database must already have been created and the current user must
29 have appropriate INSERT and UPDATE privileges. The --create option
30 will initialize a new database with the appropriate schema, deleting
31 any tables that were already there.
33 =head1 COMMAND-LINE OPTIONS
35 Command-line options can be abbreviated to single-letter options.
36 e.g. -d instead of --database.
38 --create Force creation and initialization of database
39 --dsn <dsn> Data source (default dbi:mysql:test)
40 --user <user> Username for mysql authentication
41 --pass <password> Password for mysql authentication
42 --proxy <proxy> Proxy server to use for remote access
43 --stdout direct output to STDOUT
44 --adaptor <adaptor> adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle) --viral the genome you are loading is viral (changes tag
45 choices)
46 --source <source> source field for features ['genbank']
47 EITHER --file Arguments that follow are Genbank/EMBL file names
48 OR --gb_folder What follows is a folder full of gb files to process OR --accession Arguments that follow are genbank accession numbers
49 (not gi!)
50 OR --acc_file Accession numbers (not gi!) in a file (one per line, no punc.)
51 OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one
52 per line)
55 =head1 SEE ALSO
57 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
59 =head1 AUTHOR
61 Scott Cain, cain@cshl.org
63 Copyright (c) 2003 Cold Spring Harbor Laboratory
65 This library is free software; you can redistribute it and/or modify
66 it under the same terms as Perl itself. See DISCLAIMER.txt for
67 disclaimers of warranty.
69 =cut
71 package Bio::DB::GFF::Adaptor::biofetch_to_stdout;
72 use CGI 'escape';
73 use Bio::DB::GFF::Util::Rearrange;
74 use Bio::DB::GFF::Adaptor::biofetch;
75 use vars '@ISA';
76 @ISA = 'Bio::DB::GFF::Adaptor::biofetch';
78 sub load_gff_line {
79 my ($self,$options) = @_;
80 # synthesize GFF3-compatible line
81 my @attributes;
82 if (my $id = $options->{gname}) {
83 my $parent = $id;
84 $parent =~ s/\..\d+$// if $options->{method} =~ /^(mRNA|transcript|exon|gene)$/;
85 push @attributes,"Parent=".escape($parent) if $options->{method} =~ /^(variation|exon|CDS|transcript|mRNA|coding)$/;
86 push @attributes,"ID=".escape($id) unless $options->{method} =~ /^(exon|CDS)$/;
88 if (my $tstart = $options->{tstart}) {
89 my $tstop = $options->{tstop};
90 my $target = escape($options->{gname});
91 push @attributes,"Target=$target+$tstart+$tstop";
93 my %a;
94 if (my $attributes = $options->{attributes}) {
95 for my $a (@$attributes) {
96 my ($tag,$value) = @$a;
97 push @{$a{escape($tag)}},escape($value);
99 for my $a (keys %a) {
100 push @attributes,"$a=".join(',',@{$a{$a}});
103 ${$options}{'score'} = "." unless ${$options}{'score'};
104 ${$options}{'strand'} = "." unless ${$options}{'strand'};
105 ${$options}{'phase'} = "." unless ${$options}{'phase'};
106 my $last_column = join ';',@attributes;
107 if ($options->{method} eq 'origin') {
108 print "##sequence-region $options->{gname} $options->{start} $options->{stop}\n";
110 print join("\t",@{$options}{qw(ref source method start stop score strand phase)},$last_column),"\n";
113 sub load_sequence_string {
114 my $self = shift;
115 my ($acc,$seq) = @_;
116 return unless $seq;
117 $seq =~ s/(.{1,60})/$1\n/g;
118 print ">$acc\n\L$seq\U\n";
121 sub setup_load {
122 my $self = shift;
123 print "##gff-version 3\n";
126 sub finish_load { }
130 package main;
132 my $USAGE = <<USAGE;
134 Usage: $0 [options] [<gff file 1> <gff file 2>] ...
135 Load a Bio::DB::GFF database from GFF files.
137 Options:
138 --create Force creation and initialization of database
139 --dsn <dsn> Data source (default dbi:mysql:test)
140 --user <user> Username for mysql authentication
141 --pass <password> Password for mysql authentication
142 --proxy <proxy> Proxy server to use for remote access
143 --stdout direct output to STDOUT
144 --adaptor <adaptor> adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)
145 --viral the genome you are loading is viral (changes tag
146 choices)
147 --source <source> source field for features ['genbank']
148 EITHER --file Arguments that follow are Genbank/EMBL file names
149 OR --gb_folder What follows is a folder full of gb files to process
150 OR --accession Arguments that follow are genbank accession numbers
151 (not gi!)
152 OR --acc_file Accession numbers (not gi!) in a file (one per line,
153 no punc.)
154 OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one
155 per line)
158 This script loads a Bio::DB::GFF database with the features contained
159 in a either a local genbank file or an accession that is fetched from
160 genbank. Various command-line options allow you to control which
161 database to load and whether to allow an existing database to be
162 overwritten.
164 USAGE
167 my ($DSN,$ADAPTOR,$CREATE,$USER,$VIRAL,$PASSWORD,$gbFOLDER,
168 $FASTA,$ACC,$accFILE, $accPIPE, $FILE,$PROXY,$STDOUT,$SOURCE);
171 GetOptions (
172 'dsn:s' => \$DSN,
173 'user:s' => \$USER,
174 'password:s' => \$PASSWORD,
175 'adaptor:s' => \$ADAPTOR,
176 'accession' => \$ACC,
177 'file' => \$FILE,
178 'viral' => \$VIRAL,
179 'acc_file' => \$accFILE,
180 'acc_pipe' => \$accPIPE,
181 'source:s' => \$SOURCE,
182 'gb_folder=s' => \$gbFOLDER,
183 'proxy:s' => \$PROXY,
184 'stdout' => \$STDOUT,
185 'create' => \$CREATE) or die $USAGE;
188 die $USAGE unless ($DSN || $STDOUT); # at a minimum we need to have a place to write to!
190 # some local defaults
191 $DSN ||= 'dbi:mysql:test';
192 $ADAPTOR ||= $STDOUT ? 'memory' : 'dbi::mysql';
194 # Ensure that biofetch inherits from the "right" adaptor.
195 # This is a horrible hack and should be fixed.
196 eval "use Bio::DB::GFF::Adaptor::${ADAPTOR}";
197 local @Bio::DB::GFF::Adaptor::biofetch::ISA = "Bio::DB::GFF::Adaptor::${ADAPTOR}";
199 my $biofetch = $STDOUT ? 'biofetch_to_stdout' : 'biofetch';
200 my @dsn = $STDOUT ? () : (-dsn => $DSN);
202 my @auth;
203 push @auth,(-user=>$USER) if defined $USER;
204 push @auth,(-pass=>$PASSWORD) if defined $PASSWORD;
205 push @auth,(-proxy=>$PROXY) if defined $PROXY;
207 my %preferred_tags = (
208 strain => 10,
209 organism => 20,
210 protein_id => 40,
211 locus_tag => 50,
212 locus => 60,
213 gene => 70,
214 standard_name => 80,
216 $preferred_tags{'product'} = 90 if $VIRAL; # added this to the default list for viral genomes
217 # since most functions come from post-translational processing, so the default labels are c**p!
219 my $db = Bio::DB::GFF->new(-adaptor=>$biofetch,
220 @dsn,
221 @auth,
222 -preferred_tags => \%preferred_tags,
223 -source=> $SOURCE || 'Genbank')
224 or die "Can't open database: ",Bio::DB::GFF->error,"\n";
226 if ($CREATE) {
227 $db->initialize(1);
230 die "you must specify either an accession to retrieve from\nembl or a local file containing data in embl format\n" if (($FILE || $ACC) && !scalar(@ARGV));
232 if ($ACC) {
233 while ($_ = shift) {
234 status(loading => $_);
235 my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
236 status(done => $result);
238 exit 1;
241 elsif ($FILE) {
242 while ($_ = shift) {
243 status('loading' => $_);
244 my $result = $db->load_from_file($_);
245 status (done => $result);
247 exit 1;
250 elsif ($accFILE){
251 my $filename = shift;
252 die "you must supply a filename after the --accFILE command line flag\n" unless $filename;
253 die "file $filename does not exist\n" unless (-e $filename && !(-d $filename));
254 open my $IN, '<', $filename or die "Could not read file '$filename' for reading accession numbers: $!\n";
255 while (my $line = <$IN>){
256 chomp $line;
257 status(loading => $line);
258 my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $line);
259 status(done => $result);
261 close $IN;
262 exit 1;
265 elsif ($gbFOLDER){
266 my $dir = $gbFOLDER;
267 die "folder $dir does not exist\n" unless (-e $dir && -d $dir);
268 opendir DIR, "$dir" || die "can't open directory $dir for reading: $!\n";
269 my @files = readdir DIR;
270 foreach my $file(@files){
271 if (!(-e "$gbFOLDER/$file") || (-d "$gbFOLDER/$file")){
272 print STDERR " $gbFOLDER/$file is not a filename! Skipping...\n";
273 next
275 my $result = $db->load_from_file("$gbFOLDER/$file");
276 print STDERR $result ? "ok\n" : "failed\n";
278 } elsif ($accPIPE){
279 my @accessions = <STDIN>;
280 chomp @accessions;
281 foreach (@accessions){
282 status(loading => $_);
283 my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
284 status(done => $result);
286 exit 1;
289 else {
290 my $done;
291 while ($_ = shift) {
292 $done = 1;
293 status(loading => $_);
294 my $result = $db->load_from_file($_);
295 status(done => $result);
298 $done || die "\n\nno source of data provided\n\n";
299 exit 1;
302 sub status {
303 my ($state,$msg) = @_;
304 return if $STDOUT;
305 if ($state eq 'loading') {
306 print STDERR "Loading $msg...";
307 } elsif ($state eq 'done') {
308 print STDERR $msg ? "ok\n" : "failed\n";