scripts/Bio-DB-GFF/bp_load_gff.pl

   1 #!/usr/bin/perl
   2
   3 use strict;
   4 use warnings;
   5 use lib '../blib/lib';
   6 use Bio::DB::GFF;
   7 use Getopt::Long;
   8
   9 =head1 NAME
  10
  11 bp_load_gff.pl - Load a Bio::DB::GFF database from GFF files.
  12
  13 =head1 SYNOPSIS
  14
  15   % bp_load_gff.pl -d testdb -u user -p pw
  16      --dsn 'dbi:mysql:database=dmel_r5_1;host=myhost;port=myport'
  17         dna1.fa dna2.fa features1.gff features2.gff ...
  18
  19 =head1 DESCRIPTION
  20
  21 This script loads a Bio::DB::GFF database with the features contained
  22 in a list of GFF files and/or FASTA sequence files.  You must use the
  23 exact variant of GFF described in L<Bio::DB::GFF>.  Various
  24 command-line options allow you to control which database to load and
  25 whether to allow an existing database to be overwritten.
  26
  27 This script uses the Bio::DB::GFF interface, and so works with all
  28 database adaptors currently supported by that module (MySQL, Oracle,
  29 PostgreSQL soon).  However, it is slow.  For faster loading, see the
  30 MySQL-specific L<bp_bulk_load_gff.pl> and L<bp_fast_load_gff.pl> scripts.
  31
  32 =head2 NOTES
  33
  34 If the filename is given as "-" then the input is taken from standard
  35 input. Compressed files (.gz, .Z, .bz2) are automatically
  36 uncompressed.
  37
  38 FASTA format files are distinguished from GFF files by their filename
  39 extensions.  Files ending in .fa, .fasta, .fast, .seq, .dna and their
  40 uppercase variants are treated as FASTA files.  Everything else is
  41 treated as a GFF file.  If you wish to load FASTA files from STDIN,
  42 then use the -f command-line switch with an argument of '-', as in
  43
  44     gunzip -c my_data.fa.gz | bp_load_gff.pl -d test -f -
  45
  46 On the first load of a database, you will see a number of "unknown
  47 table" errors.  This is normal.
  48
  49 About maxfeature: the default value is 1,000,000,000 bases.  If you
  50 have features that are close to or greater than 1 Gb in length, then
  51 the value of maxfeature should be increased to 10,000,000,000, or
  52 another power of 10.
  53
  54 =head1 COMMAND-LINE OPTIONS
  55
  56 Command-line options can be abbreviated to single-letter options.
  57 e.g. -d instead of --dsn.
  58
  59    --dsn     <dsn>       Data source (default dbi:mysql:test)
  60    --adaptor <adaptor>   Schema adaptor (default dbi::mysqlopt)
  61    --user    <user>      Username for mysql authentication
  62    --pass    <password>  Password for mysql authentication
  63    --fasta   <path>      Fasta file or directory containing fasta files for the DNA
  64    --create              Force creation and initialization of database
  65    --maxfeature          Set the value of the maximum feature size (default 1 Gb; must be a power of 10)
  66    --group               A list of one or more tag names (comma or space separated)
  67                           to be used for grouping in the 9th column.
  68    --upgrade             Upgrade existing database to current schema
  69    --gff3_munge          Activate GFF3 name munging (see Bio::DB::GFF)
  70    --quiet               No progress reports
  71    --summary             Generate summary statistics for drawing coverage histograms.
  72                            This can be run on a previously loaded database or during
  73                            the load.
  74
  75 =head1 SEE ALSO
  76
  77 L<Bio::DB::GFF>, L<bp_bulk_load_gff.pl>, L<bp_fast_load_gff.pl>
  78
  79 =head1 AUTHOR
  80
  81 Lincoln Stein, lstein@cshl.org
  82
  83 Copyright (c) 2002 Cold Spring Harbor Laboratory
  84
  85 This library is free software; you can redistribute it and/or modify
  86 it under the same terms as Perl itself.  See DISCLAIMER.txt for
  87 disclaimers of warranty.
  88
  89 =cut
  90
  91 my ($DSN,$ADAPTOR,$CREATE,$USER,$PASSWORD,$FASTA,$UPGRADE,$MAX_BIN,$GROUP_TAG,$MUNGE,$QUIET,$SUMMARY_STATS);
  92
  93 GetOptions ('dsn:s'                  => \$DSN,
  94             'adaptor:s'              => \$ADAPTOR,
  95             'u|user:s'               => \$USER,
  96             'p|password:s'           => \$PASSWORD,
  97             'fasta:s'                => \$FASTA,
  98             'upgrade'                => \$UPGRADE,
  99             'maxbin|maxfeature:s'    => \$MAX_BIN,
 100             'group:s'                => \$GROUP_TAG,
 101             'gff3_munge'             => \$MUNGE,
 102             'quiet'                  => \$QUIET,
 103             'summary'                => \$SUMMARY_STATS,
 104             'create'                 => \$CREATE) or (system('pod2text',$0), exit -1);
 105
 106 # some local defaults
 107 $DSN     ||= 'dbi:mysql:test';
 108 $ADAPTOR ||= 'dbi::mysqlopt';
 109 $MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes
 110
 111 my @args;
 112 push @args,(-user=>$USER)     if defined $USER;
 113 push @args,(-pass=>$PASSWORD) if defined $PASSWORD;
 114 push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
 115 push @args,(-create=>1)       if $CREATE;
 116 push @args,(-write=>1);
 117
 118 my $db = Bio::DB::GFF->new(-adaptor=>$ADAPTOR,-dsn => $DSN,@args)
 119   or die "Can't open database: ",Bio::DB::GFF->error,"\n";
 120
 121 $db->gff3_name_munging(1) if $MUNGE;
 122
 123 if ($CREATE) {
 124     $SUMMARY_STATS++;
 125     $MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) :
 126                $db->initialize(1);
 127 } elsif ($UPGRADE) {
 128   warn qq(expect to see several "table already exists" messages\n);
 129   $db->initialize(0);
 130   my $dbi = $db->dbh;  # get the raw database handle
 131   my ($count) = $dbi->selectrow_array('SELECT COUNT(*) FROM fnote');
 132   if (defined($count) && $count > 0) {
 133     warn qq(fnote table detected.  Translating into fattribute table.  This may take a while.\n);
 134     $dbi->do("INSERT INTO fattribute VALUES (1,'Note')") or die "failed: ",$dbi->errstr;
 135     $dbi->do("INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) SELECT fnote.fid,1,fnote FROM fnote") or die "failed: ",$dbi->errstr;
 136     warn qq(Schema successfully upgraded.  You might want to drop the fnote table when you're sure everything's working.\n);
 137   }
 138 }
 139
 140 my (@gff,@fasta);
 141 foreach (@ARGV) {
 142   if (/\.(fa|fasta|dna|seq|fast)$/i) {
 143     push @fasta,$_;
 144   } else {
 145     push @gff,$_;
 146   }
 147 }
 148
 149 for my $file (@gff) {
 150   warn "$file: loading...\n";
 151   my $loaded = $db->load_gff($file,!$QUIET);
 152   warn "$file: $loaded records loaded\n";
 153 }
 154
 155 unshift @fasta,$FASTA if defined $FASTA;
 156
 157 for my $file (@fasta) {
 158   warn "Loading fasta ",(-d $file?"directory":"file"), " $file\n";
 159   my $loaded = $db->load_fasta($file,!$QUIET);
 160   warn "$file: $loaded records loaded\n";
 161 }
 162
 163 if ($SUMMARY_STATS) {
 164     warn "Building summary statistics for coverage histograms...\n";
 165     $db->build_summary_statistics;
 166 }