5 # use lib './blib/lib';
9 use Bio
::DB
::GFF
::Util
::Binning
'bin';
10 use Bio
::DB
::GFF
::Adaptor
::dbi
::mysqlopt
;
# Names of the mysql client binary and of the Bio::DB::GFF tables that
# receive bulk-loaded data, declared as a single group of compile-time
# constants.
use constant {
    MYSQL                 => 'mysql',
    FDATA                 => 'fdata',
    FTYPE                 => 'ftype',
    FGROUP                => 'fgroup',
    FDNA                  => 'fdna',
    FATTRIBUTE            => 'fattribute',
    FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature',
};
21 my $DO_FAST = eval "use POSIX 'WNOHANG'; 1;";
25 bp_fast_load_gff.pl - Fast-load a Bio::DB::GFF database from GFF files.
29 % bp_fast_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...
33 This script loads a Bio::DB::GFF database with the features contained
34 in a list of GFF files and/or FASTA sequence files. You must use the
35 exact variant of GFF described in L<Bio::DB::GFF>. Various
36 command-line options allow you to control which database to load and
37 whether to allow an existing database to be overwritten.
39 This script is similar to bp_load_gff.pl, but is much faster. However,
40 it is hard-coded to use MySQL and probably only works on Unix
41 platforms due to its reliance on pipes. See L<bp_load_gff.pl> for an
42 incremental loader that works with all databases supported by
43 Bio::DB::GFF, and L<bp_bulk_load_gff.pl> for a fast MySQL loader that
44 supports all platforms.
48 If the filename is given as "-" then the input is taken from
49 standard input. Compressed files (.gz, .Z, .bz2) are automatically
52 FASTA format files are distinguished from GFF files by their filename
53 extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their
54 uppercase variants are treated as FASTA files. Everything else is
55 treated as a GFF file. If you wish to load -fasta files from STDIN,
56 then use the -f command-line switch with an argument of '-', as in
58 gunzip -c my_data.fa.gz | bp_fast_load_gff.pl -d test -f -
60 The nature of the load requires that the database be on the local
61 machine and that the indicated user have the "file" privilege to load
62 the tables and have enough room in /usr/tmp (or whatever is specified
63 by the \$TMPDIR environment variable), to hold the tables transiently.
64 If your MySQL is version 3.22.6 and was compiled using the "load local
65 file" option, then you may be able to load remote databases with local
66 data using the --local option.
68 About maxfeature: the default value is 100,000,000 bases. If you have
69 features that are close to or greater than 100Mb in length, then the
70 value of maxfeature should be increased to 1,000,000,000. This value
71 must be a power of 10.
73 If the list of GFF or fasta files exceeds the kernel limit for the
74 maximum number of command-line arguments, use the
75 --long_list /path/to/files option.
77 The adaptor used is dbi::mysqlopt. There is currently no way to
80 =head1 COMMAND-LINE OPTIONS
82 Command-line options can be abbreviated to single-letter options.
83 e.g. -d instead of --database.
85 --database <dsn> Mysql database name
86 --create Reinitialize/create data tables without asking
87 --local Try to load a remote database using local data.
88 --user Username to log in as
89 --fasta File or directory containing fasta files to load
90 --password Password to use for authentication
91 --long_list Directory containing a very large number of
92 GFF and/or FASTA files
93 --maxfeature Set the value of the maximum feature size (default 100Mb; must be a power of 10)
94 --group A list of one or more tag names (comma or space separated)
95 to be used for grouping in the 9th column.
96 --gff3_munge Activate GFF3 name munging (see Bio::DB::GFF)
97 --summary Generate summary statistics for drawing coverage histograms.
98 This can be run on a previously loaded database or during
100 --Temporary Location of a writable scratch directory
104 L<Bio::DB::GFF>, L<bp_bulk_load_gff.pl>, L<bp_load_gff.pl>
108 Lincoln Stein, lstein@cshl.org
110 Copyright (c) 2002 Cold Spring Harbor Laboratory
112 This library is free software; you can redistribute it and/or modify
113 it under the same terms as Perl itself. See DISCLAIMER.txt for
114 disclaimers of warranty.
# "faux" adaptor: a thin subclass of the standard dbi::mysqlopt adaptor
# whose purpose is to let sequence insertion be intercepted so DNA can
# be written to a bulk-load file instead of being INSERTed row by row.
package Bio::DB::GFF::Adaptor::faux;

use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
# Declare @ISA so inheritance works even under "use strict 'vars'".
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';
# insert_sequence: overrides the adaptor's per-row DNA INSERT and writes
# one tab-delimited record to the currently-selected filehandle (the
# fdna load file is select()ed around load_fasta later in this script).
# NOTE(review): "\n" is an element of the join list, so every record
# ends with an extra tab before the newline -- confirm the fdna loader
# tolerates the trailing empty column.
# NOTE(review): a method normally shifts $self first; that line appears
# to be missing from this excerpt -- verify against the full source.
124 sub insert_sequence
{
126 my ($id,$offset,$seq) = @_;
127 print join "\t",$id,$offset,$seq,"\n";
# Optional high-resolution timing: Time::HiRes may not be installed, so
# probe for it and remember whether Time::HiRes::time is available.
132 eval "use Time::HiRes"; undef $@
;
133 my $timer = defined &Time
::HiRes
::time;
# Script-wide state, mostly filled in by GetOptions below.
135 my ($DSN,$CREATE,$USER,$PASSWORD,$FASTA,$FAILED,$LOCAL,%PID,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR,$SUMMARY_STATS);
# Reap finished loader children without blocking; count non-zero exit
# statuses so overall success/failure can be reported at the end.
139 while ((my $child = waitpid(-1,&WNOHANG
)) > 0) {
140 delete $PID{$child} or next;
141 $FAILED++ if $?
!= 0;
# Remove scratch files before exiting on INT/TERM.
146 $SIG{INT
} = $SIG{TERM
} = sub {cleanup
(); exit -1};
# Parse command-line options; on failure render the POD as plain text
# and exit non-zero.  (The --user and --local options are handled on
# lines not visible in this excerpt.)
148 GetOptions
('database:s' => \
$DSN,
149 'create' => \
$CREATE,
152 'password:s' => \
$PASSWORD,
153 'fasta:s' => \
$FASTA,
154 'group:s' => \
$GROUP_TAG,
155 'long_list:s' => \
$LONG_LIST,
156 'maxbin|maxfeature:s' => \
$MAX_BIN,
157 'gff3_munge' => \
$MUNGE,
158 'summary' => \
$SUMMARY_STATS,
159 'Temporary:s' => \
$TMPDIR,
160 ) or (system('pod2text',$0), exit -1);
163 $MAX_BIN ||= 1_000_000_000
; # to accommodate human-sized chromosomes
# Assemble Bio::DB::GFF->new() arguments from the supplied options.
167 push @args,(-user
=>$USER);
170 if (defined $PASSWORD) {
171 push @args,(-pass
=>$PASSWORD);
172 $AUTH .= " -p$PASSWORD";
174 push @args,(-preferred_groups
=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
# Open the database through the local "faux" adaptor defined above so
# DNA writes are diverted to the bulk-load file rather than DBI.
176 my $db = Bio
::DB
::GFF
->new(-adaptor
=>'faux',-dsn
=> $DSN,@args)
177 or die "Can't open database: ",Bio
::DB
::GFF
->error,"\n";
179 $db->gff3_name_munging(1) if $MUNGE;
# (Re)initialize the schema, honoring --maxfeature when given.
183 $MAX_BIN ?
$db->initialize(-erase
=>1,-MAX_BIN
=>$MAX_BIN) : $db->initialize(1);
# Fall back to the database's stored max_bin, then to the 100 Mb default.
186 $MAX_BIN ||= $db->meta('max_bin') || 100_000_000
;
188 # deal with really long lists of files
# --long_list names a directory whose entire contents become @ARGV.
190 -d
$LONG_LIST or die "The --long_list argument must be a directory\n";
191 opendir GFFDIR
,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
192 @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR
;
# NOTE(review): readdir also returns "." and ".." -- confirm that
# downstream handling skips them.
# A --fasta directory is treated the same way: every file it contains
# is queued for loading.
195 if (defined $FASTA && -d
$FASTA) {
196 opendir FASTA
,$FASTA or die "Could not open $FASTA for reading: $!";
197 push @ARGV, map { "$FASTA\/$_" } readdir FASTA
;
# Transparently decompress .gz/.Z/.bz2 inputs by rewriting the filename
# into a decompression pipe for the magic <> operator.
203 $_ = "gunzip -c $_ |" if /\.gz$/;
204 $_ = "uncompress -c $_ |" if /\.Z$/;
205 $_ = "bunzip2 -c $_ |" if /\.bz2$/;
# Classify inputs by extension: FASTA-looking names are loaded as
# sequence, everything else is treated as GFF.
209 if (/\.(fa|fasta|dna|seq|fast)(?:\.|$)/i) {
216 push @fasta,$FASTA if defined $FASTA;
218 # initialize state variables
225 my %ATTRIBUTEID = ();
# When loading into an existing database (no --create), preload the
# current group/type/attribute IDs so new rows continue the sequences.
229 load_tables
($db->dbh) unless $CREATE;
# MySQL >= 4.0 can disable indexes during the bulk load.
230 my ($major,$minor,$sub) = split /\./,$db->dbh->get_info(18); # SQL_DBMS_VER
231 my $can_disable_indexes = ($major >= 4 and $minor >= 0);
233 # open up pipes to the database
# Scratch directory for the per-table load files: --Temporary beats
# $TMPDIR/$TMP, which beat the File::Spec default.
236 my $tmpdir = $TMPDIR || $ENV{TMPDIR
} || $ENV{TMP
} || File
::Spec
->tmpdir();
237 -d
$tmpdir or die <<END;
238 I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
239 Please select a directory and indicate its location by setting the TMP environment variable, or
240 by using the --Temporary switch.
243 my @fasta_files_to_be_unlinked;
# One load file (or FIFO) per table.
244 my @files = (FDATA
,FTYPE
,FGROUP
,FDNA
,FATTRIBUTE
,FATTRIBUTE_TO_FEATURE
);
# Per-table scratch file named after the table and our PID.  When
# mkfifo succeeds the file is a named pipe that a mysql client can
# consume concurrently ("fast" mode); otherwise plain files are used.
246 my $file = "$tmpdir/$_.$$";
247 print STDERR
"creating load file $file...";
248 $DO_FAST &&= (system("mkfifo $file") == 0); # for system(), 0 = success
# Build the per-table mysql command: optionally clear the table
# (--create), LOAD DATA [LOCAL] INFILE, then ANALYZE to refresh stats.
250 my $delete = $CREATE ?
"delete from $_" : '';
251 my $local = $LOCAL ?
'local' : '';
252 my $analyze = "analyze table $_";
257 -e "lock tables $_ write; $delete; load data $local infile '$file' replace into table $_; unlock tables; $analyze"
261 $command =~ s/\n/ /g;
262 $COMMAND{$_} = $command;
# In fast mode, fork one mysql client per FIFO now; the children block
# on the pipes until the parent starts writing.
265 if (my $pid = fork) {
267 print STDERR
"pausing for 0.5 sec..." if $DO_FAST;
268 select(undef,undef,undef,0.50); # work around a race condition
270 } else { # THIS IS IN CHILD PROCESS
271 die "Couldn't fork: $!" unless defined $pid;
# NOTE(review): "|| die" binds to $command, not to exec's return value,
# so a failed exec falls through silently; "or die" is the correct form.
272 exec $command || die "Couldn't exec: $!";
# Parent keeps a write handle on every load file.
276 print STDERR
"opening load file for writing...";
277 $FH{$_} = IO
::File
->new($file,'>') or die $_,": $!";
282 print STDERR
"Fast loading enabled\n" if $DO_FAST;
# Parse/load bookkeeping and wall-clock timing checkpoints.
284 my ($count,$gff3,$last,$start,$beginning,$current_file);
286 $last = Time
::HiRes
::time() if $timer;
287 $beginning = $start = $last;
289 # avoid hanging on standalone --fasta load
291 $FH{NULL
} = IO
::File
->new(">$tmpdir/null");
292 push @ARGV, "$tmpdir/null";
# Detect when <> has moved to the next input file so per-file state
# (such as the GFF3 flag) can be reset.
297 # reset GFF3 flag if new filehandle
298 $current_file ||= $ARGV;
299 unless ($current_file eq $ARGV) {
301 $current_file = $ARGV;
305 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
307 # close sequence filehandle if required
308 if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA
}) {
# While an embedded-FASTA handle is open, every line belongs to the
# sequence scratch file.
313 # print to fasta file if the handle is open
314 if ( defined $FH{FASTA
} ) {
315 $FH{FASTA
}->print("$_\n");
# A ">" header inside a GFF stream starts an embedded FASTA section:
# divert it to its own scratch file and queue that file for loading.
319 elsif (/^>(\S+)/) { # uh oh, sequence coming
320 $FH{FASTA
} = IO
::File
->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
321 $FH{FASTA
}->print("$_\n");
322 push @fasta, "$tmpdir/$1\.fa";
323 push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
324 print STDERR
"Processing embedded sequence $1\n";
# ##group-tags directive overrides the grouping tags for this load.
328 elsif (/^\#\#\s*group-tags\s+(.+)/) {
329 $db->preferred_groups(split(/\s+/,$1));
# ##gff-version directive: remember whether this stream is GFF3.
333 elsif (/^\#\#\s*gff-version\s+(\d+)/) {
335 $db->print_gff3_warning() if $gff3;
# A ##sequence-region header becomes a reference "Component" feature.
339 elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
340 ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
341 ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ?
"ID=Sequence:$1": qq(Sequence
"$1"));
# Ordinary GFF line: nine tab-separated columns.
349 ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
351 next unless defined $ref;
354 warn "Feature $group is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $stop-$start+1 > $MAX_BIN;
# Normalize "." / missing columns to MySQL's \N (NULL) marker.
356 $source = '\N' unless defined $source;
357 $score = '\N' if $score eq '.';
358 $strand = '\N' if $strand eq '.';
359 $phase = '\N' if $phase eq '.';
# Column 9 may yield several (class,name) group pairs plus attributes.
361 my ($gclass,$gname,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
363 $gclass = [$gclass] unless ref $gclass;
364 $gname = [$gname] unless ref $gname;
# Emit one fdata row per group pair, creating fgroup/ftype rows the
# first time each distinct group or type is seen.
366 for (my $i=0; $i < @
$gname; $i++) {
367 my $group_class = $gclass->[$i];
368 my $group_name = $gname->[$i];
# NOTE(review): "||=" also replaces a legitimate value of "0" with \N.
369 $group_class ||= '\N';
370 $group_name ||= '\N';
371 $target_start ||= '\N';
372 $target_stop ||= '\N';
# Allocate (or reuse) numeric IDs for the group and the feature type.
377 my $gid = $GROUPID{lc join($;,$group_class,$group_name)} ||= $GID++;
378 my $ftypeid = $FTYPEID{lc join($;,$source,$method)} ||= $FTYPEID++;
# Compute the bin used by Bio::DB::GFF for range queries.
380 my $bin = bin
($start,$stop,$db->min_bin);
381 $FH{ FDATA
() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
382 $FH{ FGROUP
() }->print( join("\t",$gid,$group_class,$group_name),"\n" ) unless $DONE{"fgroup$;$gid"}++;
383 $FH{ FTYPE
() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"ftype$;$ftypeid"}++;
# Attributes feed fattribute (names) and fattribute_to_feature (links).
385 foreach (@
$attributes) {
386 my ($key,$value) = @
$_;
387 my $attributeid = $ATTRIBUTEID{lc $key} ||= $ATTRIBUTEID++;
388 $FH{ FATTRIBUTE
() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"fattribute$;$attributeid"}++;
389 $FH{ FATTRIBUTE_TO_FEATURE
() }->print( join("\t",$fid,$attributeid,$value),"\n");
# Progress report every 1000 features.
392 if ( $FEATURES % 1000 == 0) {
# NOTE(review): "my $now = ... if $timer" leaves $now undefined when
# $timer is false and relies on deprecated conditional-my behavior.
393 my $now = Time
::HiRes
::time() if $timer;
394 my $elapsed = $timer ?
sprintf(" in %5.2fs",$now - $last) : '';
396 print STDERR
"$fid features parsed$elapsed...";
397 print STDERR
-t STDOUT
&& !$ENV{EMACS
} ?
"\r" : "\n";
# GFF parsing done: close any open embedded-FASTA handle, report time.
402 $FH{FASTA
}->close if exists $FH{FASTA
};
404 printf STDERR
"Feature load time %5.2fs\n",(Time
::HiRes
::time() - $start) if $timer;
# Load DNA: select() redirects the faux adaptor's insert_sequence
# output (plain print) into the fdna load file while load_fasta runs.
407 for my $fasta (@fasta) {
408 warn "Loading fasta ",(-d
$fasta?
"directory":"file"), " $fasta\n";
409 my $old = select($FH{FDNA
()});
410 my $loaded = $db->load_fasta($fasta);
411 warn "$fasta: $loaded records loaded\n";
415 printf STDERR
"Fasta load time %5.2fs\n",(Time
::HiRes
::time() - $start) if $timer;
420 warn "Indexing and analyzing tables. This may take some time (you may see database messages during the process)...\n";
# Closing the write handles lets the forked mysql clients (fast mode)
# see EOF on their FIFOs.
423 $_->close foreach values %FH;
426 warn "Loading feature data and analyzing tables. You may see database messages here...\n";
# Slow path: run each table's LOAD DATA command now, serially.
427 $success &&= system($COMMAND{$_}) == 0 foreach @files;
# Overall success also requires that no forked loader exited non-zero.
434 $success &&= !$FAILED;
438 printf STDERR
"Total parse & load time %5.2fs\n",(Time
::HiRes
::time() - $beginning) if $timer;
441 print "SUCCESS: $FEATURES features successfully loaded\n";
444 print "FAILURE: Please see standard error for details\n";
448 if ($SUMMARY_STATS) {
449 warn "Building summary statistics for coverage histograms...\n";
450 $db->build_summary_statistics;
# Remove the per-table scratch files.
456 foreach (@files,@fasta_files_to_be_unlinked) {
457 unlink "$tmpdir/$_.$$";
# NOTE(review): entries of @fasta_files_to_be_unlinked are already full
# paths (see "$tmpdir/$1.fa" above), so "$tmpdir/$_.$$" cannot match
# them -- those temporaries appear to be left behind.
461 # load copies of some of the tables into memory
# load_tables($dbh): seed the in-memory ID caches (%GROUPID, %FTYPEID,
# %ATTRIBUTEID, %DONE) and the next-ID counters from the existing
# database so an incremental (no --create) load continues the
# established ID sequences.
464 print STDERR
"loading normalized group, type and attribute information...";
465 $FID = 1 + get_max_id
($dbh,'fdata','fid');
466 $GID = 1 + get_max_id
($dbh,'fgroup','gid');
467 $FTYPEID = 1 + get_max_id
($dbh,'ftype','ftypeid');
468 $ATTRIBUTEID = 1 + get_max_id
($dbh,'fattribute','fattribute_id');
469 get_ids
($dbh,\
%DONE,\
%GROUPID,'fgroup','gid','gclass','gname');
470 get_ids
($dbh,\
%DONE,\
%FTYPEID,'ftype','ftypeid','fsource','fmethod');
471 get_ids
($dbh,\
%DONE,\
%ATTRIBUTEID,'fattribute','fattribute_id','fattribute_name');
# get_max_id fragment: returns max($id) from $table.
# NOTE(review): callers pass $dbh as the first argument but only
# ($table,$id) are unpacked here -- a "shift" of $dbh is presumably on
# a line not shown in this excerpt; verify against the full source.
477 my ($table,$id) = @_;
478 my $sql = "select max($id) from $table";
479 my $result = $dbh->selectcol_arrayref($sql) or die $dbh->errstr;
# get_ids fragment: populate $idhash with lc(join $;, @cols) => id and
# mark each ($table,$id) pair as already seen in $done.
485 my ($done,$idhash,$table,$id,@columns) = @_;
486 my $columns = join ',',$id,@columns;
487 my $sql = "select $columns from $table";
488 my $sth = $dbh->prepare($sql) or die $dbh->errstr;
489 $sth->execute or die $dbh->errstr;
490 while (my($id,@cols) = $sth->fetchrow_array) {
491 my $key = lc join $;,@cols;
492 $idhash->{$key} = $id;
493 $done->{$table,$id}++;