bin/bp_bulk_load_gff

   1 #!/usr/bin/perl
   2
   3 use strict;
   4 use warnings;
   5 # use lib './blib/lib';
   6 use DBI;
   7 use IO::File;
   8 use File::Spec;
   9 use Getopt::Long;
  10 use Bio::DB::GFF;
  11 use Bio::DB::GFF::Util::Binning 'bin';
  12
  13 use constant MYSQL => 'mysql';
  14 use constant FDATA      => 'fdata';
  15 use constant FTYPE      => 'ftype';
  16 use constant FGROUP     => 'fgroup';
  17 use constant FDNA       => 'fdna';
  18 use constant FATTRIBUTE => 'fattribute';
  19 use constant FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature';
  20
  21 =head1 NAME
  22
  23 bp_bulk_load_gff.pl - Bulk-load a Bio::DB::GFF database from GFF files.
  24
  25 =head1 SYNOPSIS
  26
  27   % bp_bulk_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...
  28
  29 =head1 DESCRIPTION
  30
  31 This script loads a Bio::DB::GFF database with the features contained
  32 in a list of GFF files and/or FASTA sequence files.  You must use the
  33 exact variant of GFF described in L<Bio::DB::GFF>.  Various
  34 command-line options allow you to control which database to load and
  35 whether to allow an existing database to be overwritten.
  36
  37 This script differs from bp_load_gff.pl in that it is hard-coded to use
  38 MySQL and cannot perform incremental loads.  See L<bp_load_gff.pl> for an
  39 incremental loader that works with all databases supported by
  40 Bio::DB::GFF, and L<bp_fast_load_gff.pl> for a MySQL loader that supports
  41 fast incremental loads.
  42
  43 =head2 NOTES
  44
  45 If the filename is given as "-" then the input is taken from standard
  46 input. Compressed files (.gz, .Z, .bz2) are automatically
  47 uncompressed.
  48
  49 FASTA format files are distinguished from GFF files by their filename
  50 extensions.  Files ending in .fa, .fasta, .fast, .seq, .dna and their
  51 uppercase variants are treated as FASTA files.  Everything else is
  52 treated as a GFF file.  If you wish to load -fasta files from STDIN,
  53 then use the -f command-line swith with an argument of '-', as in
  54
  55     gunzip my_data.fa.gz | bp_fast_load_gff.pl -d test -f -
  56
  57 The nature of the bulk load requires that the database be on the local
  58 machine and that the indicated user have the "file" privilege to load
  59 the tables and have enough room in /usr/tmp (or whatever is specified
  60 by the \$TMPDIR environment variable), to hold the tables transiently.
  61
  62 Local data may now be uploaded to a remote server via the --local option
  63 with the database host specified in the dsn, e.g. dbi:mysql:test:db_host
  64
  65 The adaptor used is dbi::mysqlopt.  There is currently no way to
  66 change this.
  67
  68 About maxfeature: the default value is 100,000,000 bases.  If you have
  69 features that are close to or greater that 100Mb in length, then the
  70 value of maxfeature should be increased to 1,000,000,000. This value
  71 must be a power of 10.
  72
  73 Note that Windows users must use the --create option.
  74
  75 If the list of GFF or fasta files exceeds the kernel limit for the
  76 maximum number of command-line arguments, use the
  77 --long_list /path/to/files option.
  78
  79
  80 =head1 COMMAND-LINE OPTIONS
  81
  82 Command-line options can be abbreviated to single-letter options.
  83 e.g. -d instead of --database.
  84
  85    --database <dsn>      Database name (default dbi:mysql:test)
  86    --adaptor             Adaptor name (default mysql)
  87    --create              Reinitialize/create data tables without asking
  88    --user                Username to log in as
  89    --fasta               File or directory containing fasta files to load
  90    --long_list           Directory containing a very large number of
  91                          GFF and/or FASTA files
  92    --password            Password to use for authentication
  93                            (Does not work with Postgres, password must be
  94                            supplied interactively or be left empty for
  95                            ident authentication)
  96    --maxbin              Set the value of the maximum bin size
  97    --local               Flag to indicate that the data source is local
  98    --maxfeature          Set the value of the maximum feature size (power of 10)
  99    --group               A list of one or more tag names (comma or space separated)
 100                          to be used for grouping in the 9th column.
 101    --gff3_munge          Activate GFF3 name munging (see Bio::DB::GFF)
 102    --summary             Generate summary statistics for drawing coverage histograms.
 103                            This can be run on a previously loaded database or during
 104                            the load.
 105    --Temporary           Location of a writable scratch directory
 106
 107 =head1 SEE ALSO
 108
 109 L<Bio::DB::GFF>, L<fast_load_gff.pl>, L<load_gff.pl>
 110
 111 =head1 AUTHOR
 112
 113 Lincoln Stein, lstein@cshl.org
 114
 115 Copyright (c) 2002 Cold Spring Harbor Laboratory
 116
 117 This library is free software; you can redistribute it and/or modify
 118 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 119 disclaimers of warranty.
 120
 121 =cut
 122
 123 package Bio::DB::GFF::Adaptor::fauxmysql;
 124
 125 use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
 126 use vars '@ISA';
 127 @ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';
 128
 129 sub insert_sequence {
 130   my $self = shift;
 131   my ($id,$offset,$seq) = @_;
 132   print join("\t",$id,$offset,$seq),"\n";
 133 };
 134
 135 package Bio::DB::GFF::Adaptor::fauxmysqlcmap;
 136
 137 use Bio::DB::GFF::Adaptor::dbi::mysqlcmap;
 138 use vars '@ISA';
 139 @ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlcmap';
 140
 141 sub insert_sequence {
 142   my $self = shift;
 143   my ($id,$offset,$seq) = @_;
 144   print join("\t",$id,$offset,$seq),"\n";
 145 };
 146
 147 package Bio::DB::GFF::Adaptor::fauxpg;
 148
 149 use Bio::DB::GFF::Adaptor::dbi::pg;
 150 use vars '@ISA';
 151 @ISA = 'Bio::DB::GFF::Adaptor::dbi::pg';
 152
 153 #these two subs are to separate the table creation from the
 154 #index creation
 155 sub do_initialize {
 156   my $self = shift;
 157   my $erase = shift;
 158   $self->drop_all if $erase;
 159
 160   my $dbh = $self->features_db;
 161   my $schema = $self->schema;
 162   foreach my $table_name ($self->tables) {
 163     my $create_table_stmt = $schema->{$table_name}{table} ;
 164     $dbh->do($create_table_stmt) ||  warn $dbh->errstr;
 165   #  $self->create_other_schema_objects(\%{$schema->{$table_name}});
 166   }
 167   1;
 168 }
 169
 170 sub _create_indexes_etc {
 171   my $self = shift;
 172
 173   my $dbh = $self->features_db;
 174   my $schema = $self->schema;
 175   foreach my $table_name ($self->tables) {
 176     $self->create_other_schema_objects(\%{$schema->{$table_name}});
 177   }
 178 }
 179
 180 sub insert_sequence {
 181   my $self = shift;
 182   my ($id,$offset,$seq) = @_;
 183   print "$id\t$offset\t$seq\n";
 184 }
 185
 186 package main;
 187
 188 eval "use Time::HiRes"; undef $@;
 189 my $timer = defined &Time::HiRes::time;
 190
 191 my $bWINDOWS = 0;    # Boolean: is this a MSWindows operating system?
 192 if ($^O =~ /MSWin32/i) {
 193     $bWINDOWS = 1;
 194 }
 195
 196 my ($DSN,$ADAPTOR,$FORCE,$USER,$PASSWORD,$FASTA,$LOCAL,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR);
 197
 198 GetOptions ('database:s'    => \$DSN,
 199             'adaptor:s'     => \$ADAPTOR,
 200             'create'        => \$FORCE,
 201             'user:s'        => \$USER,
 202             'password:s'    => \$PASSWORD,
 203             'fasta:s'       => \$FASTA,
 204             'local'         => \$LOCAL,
 205             'maxbin|maxfeature:s'    => \$MAX_BIN,
 206             'group:s'       => \$GROUP_TAG,
 207             'long_list:s'   => \$LONG_LIST,
 208             'gff3_munge'    => \$MUNGE,
 209             'Temporary:s'   => \$TMPDIR,
 210            ) or (system('pod2text', $0), exit -1);
 211
 212 # If called as pg_bulk_load_gff.pl behave as that did.
 213 if ($0 =~/pg_bulk_load_gff.pl/){
 214     $ADAPTOR ||= 'Pg';
 215     $DSN     ||= 'test';
 216 }
 217 $DSN     ||= 'dbi:mysql:test';
 218 $MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes
 219
 220
 221 if ($bWINDOWS && not $FORCE) {
 222   die "Note that Windows users must use the --create option.\n";
 223 }
 224
 225 unless ($FORCE) {
 226   die "This will delete all existing data in database $DSN.  If you want to do this, rerun with the --create option.\n"
 227     if $bWINDOWS;
 228   open (TTY,"/dev/tty") or die "/dev/tty: $!\n";  #TTY use removed for win compatability
 229   print STDERR "This operation will delete all existing data in database $DSN.  Continue? ";
 230   my $f = <TTY>;
 231   die "Aborted\n" unless $f =~ /^[yY]/;
 232   close TTY;
 233 }
 234
 235 # postgres DBD::Pg allows 'database', but also 'dbname', and 'db':
 236 # and it must be Pg (not pg)
 237 $DSN=~s/pg:database=/Pg:/i;
 238 $DSN=~s/pg:dbname=/Pg:/i;
 239 $DSN=~s/pg:db=/Pg:/i;
 240
 241 # leave these lines for mysql
 242 $DSN=~s/database=//i;
 243 $DSN=~s/;host=/:/i; #cater for dsn in the form of "dbi:mysql:database=$dbname;host=$host"
 244
 245
 246 my($DBI,$DBD,$DBNAME,$HOST)=split /:/,$DSN;
 247 $DBNAME=$DSN unless $DSN=~/:/;
 248 $ADAPTOR ||= $DBD;
 249 $ADAPTOR ||= 'mysql';
 250
 251 if ($DBD eq 'Pg') {
 252         # rebuild DSN, DBD::Pg requires full dbname=<name> format
 253         $DSN = "dbi:Pg:dbname=$DBNAME";
 254         if ($HOST) { $DSN .= ";host=$HOST"; }
 255 }
 256
 257 my ($use_mysql,$use_mysqlcmap,$use_pg) = (0,0,0);
 258 if ( $ADAPTOR eq 'mysqlcmap' ) {
 259   $use_mysqlcmap = 1;
 260 }
 261 elsif ( $ADAPTOR =~ /^mysql/ ) {
 262   $use_mysql = 1;
 263 }
 264 elsif ( $ADAPTOR eq "Pg" ) {
 265   $use_pg = 1;
 266 }
 267 else{
 268     die "$ADAPTOR is not an acceptable database adaptor.";
 269 }
 270
 271
 272 my (@auth,$AUTH);
 273 if (defined $USER) {
 274   push @auth,(-user=>$USER);
 275   if ( $use_mysql or $use_mysqlcmap ) {
 276     $AUTH .= " -u$USER";
 277   }
 278   elsif ( $use_pg ) {
 279     $AUTH .= " -U $USER ";
 280   }
 281 }
 282 if (defined $PASSWORD) {
 283   push @auth,(-pass=>$PASSWORD);
 284   if ( $use_mysql or $use_mysqlcmap ) {
 285     $AUTH .= " -p$PASSWORD";
 286   }
 287 #  elsif ( $use_pg ) {
 288 #    $AUTH .= " -W $PASSWORD ";
 289 #  }
 290 }
 291
 292 if (defined $HOST) {
 293   $AUTH .= " -h$HOST";
 294 }
 295 if (defined $DBNAME) {
 296   if ( $use_mysql or $use_mysqlcmap ) {
 297     $AUTH .= " -D$DBNAME ";
 298   }
 299 }
 300 if (defined $LOCAL) {
 301   $LOCAL='local';
 302   $AUTH.=' --local-infile=1';
 303 }else {
 304   $LOCAL='';
 305 }
 306
 307 my $faux_adaptor;
 308 if ( $use_mysqlcmap ) {
 309   $faux_adaptor = "fauxmysqlcmap";
 310 }
 311 elsif ( $use_mysql ) {
 312   $faux_adaptor = "fauxmysql";
 313 }
 314 elsif ( $use_pg ) {
 315   $faux_adaptor = "fauxpg";
 316 }
 317
 318 my $db = Bio::DB::GFF->new(-adaptor=>$faux_adaptor,-dsn => $DSN,@auth)
 319   or die "Can't open database: ",Bio::DB::GFF->error,"\n";
 320
 321 $db->gff3_name_munging(1) if $MUNGE;
 322
 323 $MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1);
 324 $MAX_BIN ||= $db->meta('max_bin') || 100_000_000;
 325
 326 # deal with really long lists of files
 327 if ($LONG_LIST) {
 328   -d $LONG_LIST or die "The --long_list argument must be a directory\n";
 329   opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
 330   @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR;
 331   closedir GFFDIR;
 332
 333   if (defined $FASTA && -d $FASTA) {
 334     opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!";
 335     push @ARGV, map { "$FASTA\/$_" } readdir FASTA;
 336     closedir FASTA;
 337   }
 338   elsif (defined $FASTA && -f $FASTA) {
 339     push @ARGV, $FASTA;
 340   }
 341 }
 342
 343 foreach (@ARGV) {
 344   $_ = "gunzip -c $_ |" if /\.gz$/;
 345   $_ = "uncompress -c $_ |" if /\.Z$/;
 346   $_ = "bunzip2 -c $_ |" if /\.bz2$/;
 347 }
 348
 349 my (@gff,@fasta);
 350 foreach (@ARGV) {
 351   if (/\.(fa|fasta|dna|seq|fast)(?:$|\.)/i) {
 352     push @fasta,$_;
 353   } else {
 354     push @gff,$_;
 355   }
 356 }
 357 @ARGV = @gff;
 358 push @fasta,$FASTA if defined $FASTA;
 359
 360 # drop everything that was there before
 361 my %FH;
 362 my $tmpdir = File::Spec->tmpdir() || '/tmp';
 363 $tmpdir =~ s!\\!\\\\!g if $bWINDOWS; #eliminates backslash mis-interpretation
 364 -d $tmpdir or die <<END;
 365 I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
 366 Please select a directory and indicate its location by setting the TMP environment variable, or
 367 by using the --Temporary switch.
 368 END
 369 my @fasta_files_to_be_unlinked;
 370 my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
 371 foreach (@files) {
 372   $FH{$_} = IO::File->new(">$tmpdir/$_.$$") or die $_,": $!";
 373   $FH{$_}->autoflush;
 374 }
 375
 376 if ( $use_pg ) {
 377   $FH{FDATA()                }->print("COPY fdata (fid, fref, fstart, fstop, fbin, ftypeid, fscore, fstrand, fphase, gid, ftarget_start, ftarget_stop) FROM stdin;\n");
 378   $FH{FTYPE()                }->print("COPY ftype (ftypeid, fmethod, fsource) FROM stdin;\n");
 379   $FH{FGROUP()               }->print("COPY fgroup (gid, gclass, gname) FROM stdin;\n");
 380   $FH{FATTRIBUTE()           }->print("COPY fattribute (fattribute_id, fattribute_name) FROM stdin;\n");
 381   $FH{FATTRIBUTE_TO_FEATURE()}->print("COPY fattribute_to_feature (fid, fattribute_id, fattribute_value) FROM stdin;\n");
 382 }
 383 my $FID     = 1;
 384 my $GID     = 1;
 385 my $FTYPEID = 1;
 386 my $ATTRIBUTEID = 1;
 387 my %GROUPID     = ();
 388 my %FTYPEID     = ();
 389 my %ATTRIBUTEID = ();
 390 my %DONE        = ();
 391 my $FEATURES    = 0;
 392
 393 my %tmpfiles; # keep track of temporary fasta files
 394 my $count;
 395 my $fasta_sequence_id;
 396 my $gff3;
 397 my $current_file; #used to reset GFF3 flag in mix of GFF and GFF3 files
 398
 399 $db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG;
 400
 401 my $last  = Time::HiRes::time() if $timer;
 402 my $start = $last;
 403
 404   # avoid hanging on standalone --fasta load
 405 if (!@ARGV) {
 406     $FH{NULL} = IO::File->new(">$tmpdir/null");
 407     push @ARGV, "$tmpdir/null";
 408 }
 409
 410 my ($cmap_db);
 411 if ($use_mysqlcmap){
 412   my $options = {
 413                  AutoCommit       => 1,
 414                  FetchHashKeyName => 'NAME_lc',
 415                  LongReadLen      => 3000,
 416                  LongTruncOk      => 1,
 417                  RaiseError       => 1,
 418                 };
 419
 420   $cmap_db = DBI->connect( $DSN, $USER, $PASSWORD, $options );
 421 }
 422 # Only load CMap::Utils if using cmap
 423 unless (!$use_mysqlcmap or
 424         eval {
 425           require Bio::GMOD::CMap::Utils;
 426           Bio::GMOD::CMap::Utils->import('next_number');
 427           1;
 428         }
 429        ) {
 430   print STDERR "Error loading Bio::GMOD::CMap::Utils\n";
 431 }
 432
 433
 434 while (<>) {
 435
 436   $current_file ||= $ARGV;
 437
 438   # reset GFF3 flag if new filehandle
 439   unless($current_file eq $ARGV){
 440     undef $gff3;
 441     $current_file = $ARGV;
 442   }
 443
 444   chomp;
 445   my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
 446
 447   # close sequence filehandle if required
 448   if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) {
 449     $FH{FASTA}->close;
 450     delete $FH{FASTA};
 451   }
 452
 453   # print to fasta file if the handle is open
 454   if ( defined $FH{FASTA} ) {
 455     $FH{FASTA}->print("$_\n");
 456     next;
 457   }
 458
 459   elsif (/^>(\S+)/) {  # uh oh, sequence coming
 460     $FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
 461     $FH{FASTA}->print("$_\n");
 462     print STDERR "Preparing embedded sequence $1\n";
 463     push @fasta, "$tmpdir/$1\.fa";
 464     push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
 465     $tmpfiles{"$tmpdir/$1\.fa"}++;
 466     next;
 467   }
 468
 469   elsif (/^\#\#\s*gff-version\s+(\d+)/) {
 470     $gff3 = ($1 >= 3);
 471     $db->print_gff3_warning() if $gff3;
 472     next;
 473   }
 474
 475   elsif (/^\#\#\s*group-tags\s+(.+)/) {
 476     $db->preferred_groups(split(/\s+/,$1));
 477     next;
 478   }
 479
 480   elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
 481     ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
 482         ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
 483   }
 484
 485   elsif (/^\#/) {
 486     next;
 487   }
 488
 489   else {
 490     ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
 491   }
 492   if ( not defined( $ref ) or length ($ref) == 0) {
 493     warn "\$ref is null.  source = $source, method = $method, group = $group\n";
 494     next;
 495   }
 496   $FEATURES++;
 497   my $size = $stop-$start+1;
 498   warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $size > $MAX_BIN;
 499
 500   $source = '\N' unless defined $source;
 501   $score  = '\N' if $score  eq '.';
 502   $strand = '\N' if $strand eq '.';
 503   $phase  = '\N' if $phase  eq '.';
 504
 505   my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
 506
 507   # GFF2/3 transition
 508   $group_class = [$group_class] unless ref $group_class;
 509   $group_name  = [$group_name]  unless ref $group_name;
 510
 511   for (my $i=0; $i < @$group_name; $i++) {
 512     $group_class->[$i]  ||= '\N';
 513     $group_name->[$i]   ||= '\N';
 514     $target_start ||= '\N';
 515     $target_stop  ||= '\N';
 516     $method       ||= '\N';
 517     $source       ||= '\N';
 518
 519     my $fid     = $FID++;
 520     my $gid     = $GROUPID{lc join('',$group_class->[$i],$group_name->[$i])}  ||= $GID++;
 521     my $ftypeid = $FTYPEID{lc join('',$source,$method)}                       ||= $FTYPEID++;
 522
 523     my $bin = bin($start,$stop,$db->min_bin);
 524     $FH{ FDATA()  }->print(    join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n"   );
 525     if ($use_mysqlcmap){
 526       my $feature_id    = next_number(
 527                                       db         => $cmap_db,
 528                                       table_name => 'cmap_feature',
 529                                       id_field   => 'feature_id',
 530                                      )
 531         or die 'No feature id';
 532       my $direction = $strand eq '-' ? -1:1;
 533       $FH{ FGROUP() }->print(
 534                              join("\t",$feature_id,$feature_id,'NULL',0, $group_name->[$i],0,0,'NULL',1,$direction, $group_class->[$i],)
 535                              ,"\n"
 536                             ) unless $DONE{"G$gid"}++;
 537     }
 538     else {
 539       $FH{ FGROUP() }->print(    join("\t",$gid,$group_class->[$i],$group_name->[$i]),"\n") unless $DONE{"G$gid"}++;
 540     }
 541     $FH{ FTYPE()  }->print(    join("\t",$ftypeid,$method,$source),"\n"                   ) unless $DONE{"T$ftypeid"}++;
 542
 543     foreach (@$attributes) {
 544       my ($key,$value) = @$_;
 545       my $attributeid = $ATTRIBUTEID{$key}   ||= $ATTRIBUTEID++;
 546       $FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n"                       ) unless $DONE{"A$attributeid"}++;
 547       $FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
 548     }
 549
 550     if ( $fid % 1000 == 0) {
 551       my $now    = Time::HiRes::time() if $timer;
 552       my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
 553       $last = $now;
 554       print STDERR "$fid features parsed$elapsed...";
 555       print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
 556     }
 557   }
 558 }
 559
 560 $FH{FASTA}->close if exists $FH{FASTA};
 561
 562 for my $file (@fasta) {
 563   warn "Preparing DNA file $file....\n";
 564   if ($use_pg){
 565     $FH{FDNA() }->print("COPY fdna (fref, foffset, fdna) FROM stdin;\n");
 566   }
 567   my $old = select($FH{FDNA()});
 568   $db->load_fasta($file) or warn "Couldn't load fasta file $file: $!";
 569   if ($use_pg){
 570     $FH{FDNA() }->print("\\.\n\n");
 571   }
 572   warn "done...\n";
 573   select $old;
 574   unlink $file if $tmpfiles{$file};
 575 }
 576
 577 if ($use_pg) {
 578   $FH{FDATA()                }->print("\\.\n\n");
 579   $FH{FTYPE()                }->print("\\.\n\n");
 580   $FH{FGROUP()               }->print("\\.\n\n");
 581   $FH{FATTRIBUTE()           }->print("\\.\n\n");
 582   $FH{FATTRIBUTE_TO_FEATURE()}->print("\\.\n\n");
 583 }
 584
 585
 586 $_->close foreach values %FH;
 587 printf STDERR "Total parse time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
 588 warn "Loading feature data and analyzing tables.  You may see RDBMS messages here...\n";
 589
 590 if ($use_pg){
 591   warn "Loading feature data.  You may see Postgres comments...\n";
 592
 593   foreach (@files) {
 594     my $file = "$tmpdir/$_.$$";
 595
 596     $AUTH ? system("psql $AUTH -f $file $DBNAME")
 597           : system('psql','-f', $file, $DBNAME);
 598
 599     unlink $file;
 600   }
 601
 602   warn "Updating sequences ...\n";
 603   $db->update_sequences();
 604
 605   warn "Creating indexes ...\n";
 606   $db->_create_indexes_etc();
 607
 608   warn "done...\n";
 609
 610 }
 611
 612 elsif( $use_mysql or $use_mysqlcmap ) {
 613   $start = time();
 614
 615   my $success = 1;
 616   my $TERMINATEDBY = $bWINDOWS ? q( LINES TERMINATED BY '\r\n') : '';
 617   for my $f (@files) {
 618     my $table = function_to_table($f,$ADAPTOR);
 619     my $sql = join ('; ',
 620                     "lock tables $table write",
 621                     "delete from $table",
 622                     "load data $LOCAL infile '$tmpdir/$f.$$' replace into table $table $TERMINATEDBY",
 623                     "unlock tables");
 624     my $command = MYSQL . qq[$AUTH -s -e "$sql"];
 625     $command =~ s/\n/ /g;
 626     $success &&= system($command) == 0;
 627     unlink "$tmpdir/$f.$$";
 628   }
 629   printf STDERR "Total load time %5.2fs\n",(time() - $start) if $timer;
 630   print STDERR "done...\n";
 631
 632   print STDERR "Analyzing/optimizing tables. You will see database messages...\n";
 633   $start = time();
 634   my $sql = '';
 635   for my $f (@files) {
 636     my $table = function_to_table($f,$ADAPTOR);
 637     $sql       .= "analyze table $table;";
 638   }
 639   my $command = MYSQL . qq[$AUTH -N -s -e "$sql"];
 640   $success &&= system($command) == 0;
 641   printf STDERR "Optimization time time %5.2fs\n",(time() - $start);
 642
 643   if ($success) {
 644     print "$FEATURES features successfully loaded\n";
 645   } else {
 646     print "FAILURE: Please see standard error for details\n";
 647     exit -1;
 648   }
 649 }
 650
 651 foreach (@fasta_files_to_be_unlinked) {
 652   unlink "$tmpdir/$_.$$";
 653 }
 654
 655 warn "Building summary statistics for coverage histograms...\n";
 656 my (@args,$AUTH);
 657 if (defined $USER) {
 658   push @args,(-user=>$USER);
 659   $AUTH .= " -u$USER";
 660 }
 661 if (defined $PASSWORD) {
 662   push @args,(-pass=>$PASSWORD);
 663   $AUTH .= " -p$PASSWORD";
 664 }
 665 push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
 666 my $db = Bio::DB::GFF->new(-adaptor=>"dbi::$ADAPTOR",-dsn => $DSN,@args)
 667   or die "Can't open database: ",Bio::DB::GFF->error,"\n";
 668 $db->build_summary_statistics;
 669
 670 exit 0;
 671
 672 sub function_to_table {
 673     my $function = shift;
 674     my $adaptor  = shift;
 675
 676     if ($function eq 'fdata'){
 677         return 'fdata';
 678     }
 679     elsif ($function eq 'ftype'){
 680         return 'ftype';
 681     }
 682     elsif ($function eq 'fgroup'){
 683         return 'cmap_feature' if ($adaptor eq 'mysqlcmap');
 684         return 'fgroup';
 685     }
 686     elsif ($function eq 'fdna'){
 687         return 'fdna';
 688     }
 689     elsif ($function eq 'fattribute'){
 690         return 'fattribute';
 691     }
 692     elsif ($function eq 'fattribute_to_feature'){
 693         return 'fattribute_to_feature';
 694     }
 695     return '';
 696 }
 697
 698 __END__