scripts/Bio-DB-GFF/bp_genbank2gff.pl

   1 #!/usr/bin/perl
   2
   3 use lib '.';
   4
   5 use strict;
   6 use warnings;
   7 use Bio::DB::GFF;
   8 use Getopt::Long;
   9
  10 =head1 NAME
  11
  12 bp_genbank2gff.pl - Load a Bio::DB::GFF database from GENBANK files.
  13
  14 =head1 SYNOPSIS
  15
  16   % bp_genbank2gff.pl -d genbank -f localfile.gb
  17   % bp_genbank2gff.pl -d genbank --accession AP003256
  18   % bp_genbank2gff.pl --accession AP003256 --stdout
  19
  20 =head1 DESCRIPTION
  21
  22 This script loads a Bio::DB::GFF database with the features contained
  23 in a either a local genbank file or an accession that is fetched from
  24 genbank.  Various command-line options allow you to control which
  25 database to load and whether to allow an existing database to be
  26 overwritten.
  27
  28 The database must already have been created and the current user must
  29 have appropriate INSERT and UPDATE privileges.  The --create option
  30 will initialize a new database with the appropriate schema, deleting
  31 any tables that were already there.
  32
  33 =head1 COMMAND-LINE OPTIONS
  34
  35 Command-line options can be abbreviated to single-letter options.
  36 e.g. -d instead of --database.
  37
  38    --create                 Force creation and initialization of database
  39    --dsn       <dsn>        Data source (default dbi:mysql:test)
  40    --user      <user>       Username for mysql authentication
  41    --pass      <password>   Password for mysql authentication
  42    --proxy     <proxy>      Proxy server to use for remote access
  43    --stdout                 direct output to STDOUT
  44    --adaptor   <adaptor>    adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)   --viral                  the genome you are loading is viral (changes tag
  45                                  choices)
  46    --source    <source>     source field for features ['genbank']
  47     EITHER --file           Arguments that follow are Genbank/EMBL file names
  48     OR --gb_folder          What follows is a folder full of gb files to process    OR --accession          Arguments that follow are genbank accession numbers
  49                                  (not gi!)
  50     OR --acc_file           Accession numbers (not gi!) in a file (one per line,                                 no punc.)
  51     OR --acc_pipe           Accession numbers (not gi!) from a STDIN pipe (one
  52                                  per line)
  53
  54
  55 =head1 SEE ALSO
  56
  57 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
  58
  59 =head1 AUTHOR
  60
  61 Scott Cain, cain@cshl.org
  62
  63 Copyright (c) 2003 Cold Spring Harbor Laboratory
  64
  65 This library is free software; you can redistribute it and/or modify
  66 it under the same terms as Perl itself.  See DISCLAIMER.txt for
  67 disclaimers of warranty.
  68
  69 =cut
  70
  71 package Bio::DB::GFF::Adaptor::biofetch_to_stdout;
  72 use CGI 'escape';
  73 use Bio::DB::GFF::Util::Rearrange;
  74 use Bio::DB::GFF::Adaptor::biofetch;
  75 use vars '@ISA';
  76 @ISA = 'Bio::DB::GFF::Adaptor::biofetch';
  77
  78 sub load_gff_line {
  79   my ($self,$options) = @_;
  80   # synthesize GFF3-compatible line
  81   my @attributes;
  82   if (my $id = $options->{gname}) {
  83     my $parent = $id;
  84     $parent    =~ s/\..\d+$//                  if     $options->{method} =~ /^(mRNA|transcript|exon|gene)$/;
  85     push @attributes,"Parent=".escape($parent) if     $options->{method} =~ /^(variation|exon|CDS|transcript|mRNA|coding)$/;
  86     push @attributes,"ID=".escape($id)         unless $options->{method} =~ /^(exon|CDS)$/;
  87   }
  88   if (my $tstart = $options->{tstart}) {
  89     my $tstop    = $options->{tstop};
  90     my $target   = escape($options->{gname});
  91     push @attributes,"Target=$target+$tstart+$tstop";
  92   }
  93   my %a;
  94   if (my $attributes = $options->{attributes}) {
  95     for my $a (@$attributes) {
  96       my ($tag,$value) = @$a;
  97       push @{$a{escape($tag)}},escape($value);
  98     }
  99     for my $a (keys %a) {
 100        push @attributes,"$a=".join(',',@{$a{$a}});
 101     }
 102   }
 103   ${$options}{'score'} = "." unless ${$options}{'score'};
 104   ${$options}{'strand'} = "." unless ${$options}{'strand'};
 105   ${$options}{'phase'} = "." unless ${$options}{'phase'};
 106   my $last_column = join ';',@attributes;
 107   if ($options->{method} eq 'origin') {
 108      print "##sequence-region $options->{gname} $options->{start} $options->{stop}\n";
 109   }
 110   print join("\t",@{$options}{qw(ref source method start stop score strand phase)},$last_column),"\n";
 111 }
 112
 113 sub load_sequence_string {
 114   my $self = shift;
 115   my ($acc,$seq)  = @_;
 116   return unless $seq;
 117   $seq =~ s/(.{1,60})/$1\n/g;
 118   print ">$acc\n\L$seq\U\n";
 119 }
 120
 121 sub setup_load {
 122    my $self = shift;
 123    print "##gff-version 3\n";
 124 }
 125
 126 sub finish_load { }
 127
 128 1;
 129
 130 package main;
 131
 132 my $USAGE = <<USAGE;
 133
 134 Usage: $0 [options] [<gff file 1> <gff file 2>] ...
 135 Load a Bio::DB::GFF database from GFF files.
 136
 137  Options:
 138    --create                 Force creation and initialization of database
 139    --dsn       <dsn>        Data source (default dbi:mysql:test)
 140    --user      <user>       Username for mysql authentication
 141    --pass      <password>   Password for mysql authentication
 142    --proxy     <proxy>      Proxy server to use for remote access
 143    --stdout                 direct output to STDOUT
 144    --adaptor   <adaptor>    adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)
 145    --viral                  the genome you are loading is viral (changes tag
 146                                  choices)
 147    --source    <source>     source field for features ['genbank']
 148     EITHER --file           Arguments that follow are Genbank/EMBL file names
 149     OR --gb_folder          What follows is a folder full of gb files to process
 150     OR --accession          Arguments that follow are genbank accession numbers
 151                                  (not gi!)
 152     OR --acc_file           Accession numbers (not gi!) in a file (one per line,
 153                                  no punc.)
 154     OR --acc_pipe           Accession numbers (not gi!) from a STDIN pipe (one
 155                                  per line)
 156
 157
 158 This script loads a Bio::DB::GFF database with the features contained
 159 in a either a local genbank file or an accession that is fetched from
 160 genbank.  Various command-line options allow you to control which
 161 database to load and whether to allow an existing database to be
 162 overwritten.
 163
 164 USAGE
 165 ;
 166
 167 my ($DSN,$ADAPTOR,$CREATE,$USER,$VIRAL,$PASSWORD,$gbFOLDER,
 168     $FASTA,$ACC,$accFILE, $accPIPE, $FILE,$PROXY,$STDOUT,$SOURCE);
 169
 170
 171 GetOptions (
 172             'dsn:s'       => \$DSN,
 173             'user:s'      => \$USER,
 174             'password:s'  => \$PASSWORD,
 175             'adaptor:s'   => \$ADAPTOR,
 176             'accession'   => \$ACC,
 177             'file'        => \$FILE,
 178             'viral'       => \$VIRAL,
 179             'acc_file'    => \$accFILE,
 180             'acc_pipe'    => \$accPIPE,
 181             'source:s'    => \$SOURCE,
 182             'gb_folder=s' => \$gbFOLDER,
 183             'proxy:s'     => \$PROXY,
 184             'stdout'      => \$STDOUT,
 185             'create'      => \$CREATE) or die $USAGE;
 186
 187
 188 die $USAGE unless ($DSN || $STDOUT);  # at a minimum we need to have a place to write to!
 189
 190 # some local defaults
 191 $DSN     ||= 'dbi:mysql:test';
 192 $ADAPTOR ||= $STDOUT ? 'memory' : 'dbi::mysql';
 193
 194 # Ensure that biofetch inherits from the "right" adaptor.
 195 # This is a horrible hack and should be fixed.
 196 eval "use Bio::DB::GFF::Adaptor::${ADAPTOR}";
 197 local @Bio::DB::GFF::Adaptor::biofetch::ISA = "Bio::DB::GFF::Adaptor::${ADAPTOR}";
 198
 199 my $biofetch = $STDOUT ? 'biofetch_to_stdout' : 'biofetch';
 200 my @dsn      = $STDOUT ? () : (-dsn => $DSN);
 201
 202 my @auth;
 203 push @auth,(-user=>$USER)     if defined $USER;
 204 push @auth,(-pass=>$PASSWORD) if defined $PASSWORD;
 205 push @auth,(-proxy=>$PROXY)   if defined $PROXY;
 206
 207 my %preferred_tags = (
 208                       strain        => 10,
 209                       organism      => 20,
 210                       protein_id    => 40,
 211                       locus_tag     => 50,
 212                       locus         => 60,
 213                       gene          => 70,
 214                       standard_name => 80,
 215                      );
 216 $preferred_tags{'product'} = 90 if $VIRAL; # added this to the default list for viral genomes
 217        # since most functions come from post-translational processing, so the default labels are c**p!
 218
 219 my $db = Bio::DB::GFF->new(-adaptor=>$biofetch,
 220                            @dsn,
 221                            @auth,
 222                            -preferred_tags => \%preferred_tags,
 223                            -source=> $SOURCE || 'Genbank')
 224   or die "Can't open database: ",Bio::DB::GFF->error,"\n";
 225
 226 if ($CREATE) {
 227   $db->initialize(1);
 228 }
 229
 230 die "you must specify either an accession to retrieve from\nembl or a local file containing data in embl format\n" if (($FILE || $ACC) && !scalar(@ARGV));
 231
 232 if ($ACC) {
 233   while ($_ = shift) {
 234     status(loading => $_);
 235     my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
 236     status(done    => $result);
 237   }
 238   exit 1;
 239 }
 240
 241 elsif ($FILE) {
 242   while ($_ = shift) {
 243     status('loading' => $_);
 244     my $result = $db->load_from_file($_);
 245     status (done => $result);
 246   }
 247   exit 1;
 248 }
 249
 250 elsif ($accFILE){
 251     my $filename = shift;
 252     die "you must supply a filename after the --accFILE command line flag\n" unless $filename;
 253     die "file $filename does not exist\n" unless (-e $filename && !(-d $filename));
 254     open my $IN, '<', $filename or die "Could not read file '$filename' for reading accession numbers: $!\n";
 255     while (my $line = <$IN>){
 256         chomp $line;
 257         status(loading => $line);
 258         my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $line);
 259         status(done => $result);
 260     }
 261     close $IN;
 262     exit 1;
 263 }
 264
 265 elsif ($gbFOLDER){
 266     my $dir = $gbFOLDER;
 267     die "folder $dir does not exist\n" unless (-e $dir && -d $dir);
 268     opendir DIR, "$dir" || die "can't open directory $dir for reading: $!\n";
 269     my @files = readdir DIR;
 270     foreach my $file(@files){
 271         if (!(-e "$gbFOLDER/$file") || (-d "$gbFOLDER/$file")){
 272             print STDERR " $gbFOLDER/$file is not a filename!  Skipping...\n";
 273             next
 274         }
 275         my $result = $db->load_from_file("$gbFOLDER/$file");
 276         print STDERR $result ? "ok\n" : "failed\n";
 277     }
 278 } elsif ($accPIPE){
 279     my @accessions = <STDIN>;
 280     chomp @accessions;
 281     foreach (@accessions){
 282       status(loading => $_);
 283       my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
 284       status(done => $result);
 285     }
 286     exit 1;
 287 }
 288
 289 else {
 290   my $done;
 291   while ($_ = shift) {
 292     $done = 1;
 293     status(loading => $_);
 294     my $result = $db->load_from_file($_);
 295     status(done => $result);
 296   }
 297
 298   $done || die "\n\nno source of data provided\n\n";
 299   exit 1;
 300 }
 301
 302 sub status {
 303   my ($state,$msg) = @_;
 304   return if $STDOUT;
 305   if ($state eq 'loading') {
 306     print STDERR "Loading $msg...";
 307   } elsif ($state eq 'done') {
 308     print STDERR $msg ? "ok\n" : "failed\n";
 309   }
 310 }