Apply carsonhh's patch
[bioperl-live.git] / scripts / Bio-DB-GFF / bp_fast_load_gff.pl
blob5445d912395b83734d10a085a01282b9dd8c52eb
1 #!/usr/bin/perl
3 use strict;
4 use warnings;
5 # use lib './blib/lib';
6 use DBI;
7 use IO::File;
8 use Getopt::Long;
9 use Bio::DB::GFF::Util::Binning 'bin';
10 use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
12 use constant MYSQL => 'mysql';
14 use constant FDATA => 'fdata';
15 use constant FTYPE => 'ftype';
16 use constant FGROUP => 'fgroup';
17 use constant FDNA => 'fdna';
18 use constant FATTRIBUTE => 'fattribute';
19 use constant FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature';
# Parallel ("fast") loading needs POSIX::WNOHANG for non-blocking child
# reaping; if POSIX is unavailable $DO_FAST stays false and the script
# falls back to serial table loads.
21 my $DO_FAST = eval "use POSIX 'WNOHANG'; 1;";
23 =head1 NAME
25 bp_fast_load_gff.pl - Fast-load a Bio::DB::GFF database from GFF files.
27 =head1 SYNOPSIS
29 % bp_fast_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...
31 =head1 DESCRIPTION
33 This script loads a Bio::DB::GFF database with the features contained
34 in a list of GFF files and/or FASTA sequence files. You must use the
35 exact variant of GFF described in L<Bio::DB::GFF>. Various
36 command-line options allow you to control which database to load and
37 whether to allow an existing database to be overwritten.
39 This script is similar to load_gff.pl, but is much faster. However,
40 it is hard-coded to use MySQL and probably only works on Unix
41 platforms due to its reliance on pipes. See L<bp_load_gff.pl> for an
42 incremental loader that works with all databases supported by
43 Bio::DB::GFF, and L<bp_bulk_load_gff.pl> for a fast MySQL loader that
44 supports all platforms.
46 =head2 NOTES
48 If the filename is given as "-" then the input is taken from
49 standard input. Compressed files (.gz, .Z, .bz2) are automatically
50 uncompressed.
52 FASTA format files are distinguished from GFF files by their filename
53 extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their
54 uppercase variants are treated as FASTA files. Everything else is
55 treated as a GFF file. If you wish to load -fasta files from STDIN,
56 then use the -f command-line switch with an argument of '-', as in
58 gunzip -c my_data.fa.gz | bp_fast_load_gff.pl -d test -f -
60 The nature of the load requires that the database be on the local
61 machine and that the indicated user have the "file" privilege to load
62 the tables and have enough room in /usr/tmp (or whatever is specified
63 by the \$TMPDIR environment variable), to hold the tables transiently.
64 If your MySQL is version 3.22.6 and was compiled using the "load local
65 file" option, then you may be able to load remote databases with local
66 data using the --local option.
68 About maxfeature: the default value is 100,000,000 bases. If you have
69 features that are close to or greater than 100Mb in length, then the
70 value of maxfeature should be increased to 1,000,000,000. This value
71 must be a power of 10.
73 If the list of GFF or fasta files exceeds the kernel limit for the
74 maximum number of command-line arguments, use the
75 --long_list /path/to/files option.
77 The adaptor used is dbi::mysqlopt. There is currently no way to
78 change this.
80 =head1 COMMAND-LINE OPTIONS
82 Command-line options can be abbreviated to single-letter options.
83 e.g. -d instead of --database.
85 --database <dsn> Mysql database name
86 --create Reinitialize/create data tables without asking
87 --local Try to load a remote database using local data.
88 --user Username to log in as
89 --fasta File or directory containing fasta files to load
90 --password Password to use for authentication
91 --long_list Directory containing a very large number of
92 GFF and/or FASTA files
93 --maxfeature Set the value of the maximum feature size (default 100Mb; must be a power of 10)
94 --group A list of one or more tag names (comma or space separated)
95 to be used for grouping in the 9th column.
96 --gff3_munge Activate GFF3 name munging (see Bio::DB::GFF)
97 --summary Generate summary statistics for drawing coverage histograms.
98 This can be run on a previously loaded database or during
99 the load.
100 --Temporary Location of a writable scratch directory
102 =head1 SEE ALSO
104 L<Bio::DB::GFF>, L<bp_bulk_load_gff.pl>, L<bp_load_gff.pl>
106 =head1 AUTHOR
108 Lincoln Stein, lstein@cshl.org
110 Copyright (c) 2002 Cold Spring Harbor Laboratory
112 This library is free software; you can redistribute it and/or modify
113 it under the same terms as Perl itself. See DISCLAIMER.txt for
114 disclaimers of warranty.
116 =cut
package Bio::DB::GFF::Adaptor::faux;

# A thin subclass of the mysqlopt adaptor whose only purpose during a fast
# load is to redirect DNA segments to the currently selected filehandle
# (the fdna load pipe selected in the main loader) instead of issuing
# INSERT statements through DBI.
use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';

# Emit one fdna row -- fref, foffset, fdna -- tab-separated, newline
# terminated, in the format expected by MySQL's LOAD DATA INFILE.
sub insert_sequence {
    my $self = shift;
    my ($id, $offset, $seq) = @_;
    # BUG FIX: the newline must be appended *after* the join.  The original
    # included "\n" as a fourth join element, producing "...seq\t\n" -- a
    # spurious empty fourth field in a three-column table.
    print join("\t", $id, $offset, $seq), "\n";
}
package main;

# Use sub-second timing if Time::HiRes is available; otherwise $timer is
# false and the script falls back to whole-second time() reports.
# Block eval + require replaces the original string eval ("use Time::HiRes"),
# avoiding run-time compilation of code from a string.
eval { require Time::HiRes; };
undef $@;
my $timer = defined &Time::HiRes::time;
# Global load state: connection/auth settings, child-process bookkeeping
# (%PID maps loader-child pid -> table name), and user-tunable knobs
# collected by GetOptions below.
135 my ($DSN,$CREATE,$USER,$PASSWORD,$FASTA,$FAILED,$LOCAL,%PID,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR,$SUMMARY_STATS);
# Reap finished mysql loader children without blocking; a child exiting
# with non-zero status marks the whole load as failed.
137 if ($DO_FAST) {
138 $SIG{CHLD} = sub {
139 while ((my $child = waitpid(-1,&WNOHANG)) > 0) {
140 delete $PID{$child} or next;
141 $FAILED++ if $? != 0;
# On interrupt/termination, remove the temporary load files before exiting.
146 $SIG{INT} = $SIG{TERM} = sub {cleanup(); exit -1};
# Parse command-line options; on failure render the POD as plain text
# (usage message) and exit with an error status.
148 GetOptions ('database:s' => \$DSN,
149 'create' => \$CREATE,
150 'user:s' => \$USER,
151 'local' => \$LOCAL,
152 'password:s' => \$PASSWORD,
153 'fasta:s' => \$FASTA,
154 'group:s' => \$GROUP_TAG,
155 'long_list:s' => \$LONG_LIST,
156 'maxbin|maxfeature:s' => \$MAX_BIN,
157 'gff3_munge' => \$MUNGE,
158 'summary' => \$SUMMARY_STATS,
159 'Temporary:s' => \$TMPDIR,
160 ) or (system('pod2text',$0), exit -1);
162 $DSN ||= 'test';
163 $MAX_BIN ||= 1_000_000_000; # to accommodate human-sized chromosomes
# Build the adaptor argument list and a matching mysql(1) auth string used
# by the command-line loader invocations constructed later.
165 my (@args,$AUTH);
166 if (defined $USER) {
167 push @args,(-user=>$USER);
168 $AUTH .= " -u$USER";
170 if (defined $PASSWORD) {
171 push @args,(-pass=>$PASSWORD);
# NOTE(review): passing the password on the mysql command line exposes it
# to other local users via the process table.
172 $AUTH .= " -p$PASSWORD";
174 push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
# Open the database through the 'faux' adaptor defined above so DNA goes
# to the fdna load pipe rather than through DBI inserts.
176 my $db = Bio::DB::GFF->new(-adaptor=>'faux',-dsn => $DSN,@args)
177 or die "Can't open database: ",Bio::DB::GFF->error,"\n";
179 $db->gff3_name_munging(1) if $MUNGE;
# --create: wipe and reinitialize the schema; implies building summary stats.
181 if ($CREATE) {
182 $SUMMARY_STATS++;
183 $MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1);
186 $MAX_BIN ||= $db->meta('max_bin') || 100_000_000;
188 # deal with really long lists of files
# --long_list: take the file list from a directory instead of @ARGV to
# sidestep the kernel limit on command-line length.
189 if ($LONG_LIST) {
190 -d $LONG_LIST or die "The --long_list argument must be a directory\n";
191 opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
192 @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR;
193 closedir GFFDIR;
# A directory given via --fasta contributes all its files as well.
195 if (defined $FASTA && -d $FASTA) {
196 opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!";
197 push @ARGV, map { "$FASTA\/$_" } readdir FASTA;
198 closedir FASTA;
# Rewrite compressed filenames into decompression pipe commands so the
# magic <> operator transparently reads them.
202 foreach (@ARGV) {
203 $_ = "gunzip -c $_ |" if /\.gz$/;
204 $_ = "uncompress -c $_ |" if /\.Z$/;
205 $_ = "bunzip2 -c $_ |" if /\.bz2$/;
# Split the argument list into FASTA files (by extension) and GFF files;
# only the GFF files stay in @ARGV for the main parse loop.
207 my(@fasta,@gff);
208 foreach (@ARGV) {
209 if (/\.(fa|fasta|dna|seq|fast)(?:\.|$)/i) {
210 push @fasta,$_;
211 } else {
212 push @gff,$_;
215 @ARGV = @gff;
216 push @fasta,$FASTA if defined $FASTA;
218 # initialize state variables
# Next free primary-key value for each table, plus name->id caches so
# repeated groups/types/attributes reuse a single database id.
219 my $FID = 1;
220 my $GID = 1;
221 my $FTYPEID = 1;
222 my $ATTRIBUTEID = 1;
223 my %GROUPID = ();
224 my %FTYPEID = ();
225 my %ATTRIBUTEID = ();
226 my %DONE = ();
227 my $FEATURES = 0;
# For an incremental load (no --create), prime the ID caches from the
# existing database contents.
229 load_tables($db->dbh) unless $CREATE;
# MySQL >= 4.0 can disable indexes during bulk load (SQL_DBMS_VER = info 18).
230 my ($major,$minor,$sub) = split /\./,$db->dbh->get_info(18); # SQL_DBMS_VER
231 my $can_disable_indexes = ($major >= 4 and $minor >= 0);
233 # open up pipes to the database
234 my (%FH,%COMMAND);
235 my $MYSQL = MYSQL;
# Scratch directory for the per-table load files / FIFOs.
# NOTE(review): File::Spec is not explicitly use'd in this file; it is
# presumably loaded by DBI or Bio::DB::GFF -- confirm.
236 my $tmpdir = $TMPDIR || $ENV{TMPDIR} || $ENV{TMP} || File::Spec->tmpdir();
237 -d $tmpdir or die <<END;
238 I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
239 Please select a directory and indicate its location by setting the TMP environment variable, or
240 by using the --Temporary switch.
243 my @fasta_files_to_be_unlinked;
244 my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
# For each table: create a load file (a FIFO in fast mode), build the
# mysql(1) command that will bulk-load it, and in fast mode fork a child
# running that command concurrently while the parent writes into the FIFO.
245 foreach (@files) {
246 my $file = "$tmpdir/$_.$$";
247 print STDERR "creating load file $file...";
# If mkfifo fails (e.g. non-Unix filesystem), degrade to serial loading.
248 $DO_FAST &&= (system("mkfifo $file") == 0); # for system(), 0 = success
249 print STDERR "ok\n";
250 my $delete = $CREATE ? "delete from $_" : '';
251 my $local = $LOCAL ? 'local' : '';
252 my $analyze = "analyze table $_";
# The heredoc below is the shell command handed to exec/system; newlines
# are flattened to spaces afterwards.
253 my $command =<<END;
254 $MYSQL $AUTH
257 -e "lock tables $_ write; $delete; load data $local infile '$file' replace into table $_; unlock tables; $analyze"
258 $DSN
261 $command =~ s/\n/ /g;
262 $COMMAND{$_} = $command;
264 if ($DO_FAST) {
265 if (my $pid = fork) {
# Parent: remember which table this child is loading.
266 $PID{$pid} = $_;
267 print STDERR "pausing for 0.5 sec..." if $DO_FAST;
268 select(undef,undef,undef,0.50); # work around a race condition
269 print STDERR "ok\n";
270 } else { # THIS IS IN CHILD PROCESS
271 die "Couldn't fork: $!" unless defined $pid;
272 exec $command || die "Couldn't exec: $!";
273 exit 0;
# Parent (or serial mode): open the load file/FIFO for writing.
276 print STDERR "opening load file for writing...";
277 $FH{$_} = IO::File->new($file,'>') or die $_,": $!";
278 print STDERR "ok\n";
279 $FH{$_}->autoflush;
282 print STDERR "Fast loading enabled\n" if $DO_FAST;
# Parse-loop state: GFF3 flag, timing checkpoints, current input filename.
284 my ($count,$gff3,$last,$start,$beginning,$current_file);
286 $last = Time::HiRes::time() if $timer;
287 $beginning = $start = $last;
289 # avoid hanging on standalone --fasta load
# The magic <> operator reads STDIN when @ARGV is empty; feed it an empty
# scratch file instead so a pure --fasta invocation does not block.
290 if (!@ARGV) {
291 $FH{NULL} = IO::File->new(">$tmpdir/null");
292 push @ARGV, "$tmpdir/null";
# Main parse loop: reads GFF lines (and embedded FASTA) from all input
# files via <>, normalizes each feature, and streams tab-separated rows
# into the per-table load filehandles.
295 while (<>) {
297 # reset GFF3 flag if new filehandle
298 $current_file ||= $ARGV;
299 unless ($current_file eq $ARGV) {
300 undef $gff3;
301 $current_file = $ARGV;
304 chomp;
305 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
307 # close sequence filehandle if required
# NOTE(review): the `\s+` alternative matches *any* line containing
# whitespace, which is broader than the other anchored alternatives --
# possibly `^\s+` was intended; confirm against upstream.
308 if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) {
309 $FH{FASTA}->close;
310 delete $FH{FASTA};
313 # print to fasta file if the handle is open
314 if ( defined $FH{FASTA} ) {
315 $FH{FASTA}->print("$_\n");
316 next;
# A ">id" header starts an embedded FASTA section: spool it to a scratch
# file to be bulk-loaded after the GFF pass (and unlinked at cleanup).
319 elsif (/^>(\S+)/) { # uh oh, sequence coming
320 $FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
321 $FH{FASTA}->print("$_\n");
322 push @fasta, "$tmpdir/$1\.fa";
323 push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
324 print STDERR "Processing embedded sequence $1\n";
325 next;
# "##group-tags" directive: set the preferred grouping tags dynamically.
328 elsif (/^\#\#\s*group-tags\s+(.+)/) {
329 $db->preferred_groups(split(/\s+/,$1));
330 next;
# "##gff-version" directive: switch group-column parsing to GFF3 rules.
333 elsif (/^\#\#\s*gff-version\s+(\d+)/) {
334 $gff3 = ($1 >= 3);
335 $db->print_gff3_warning() if $gff3;
336 next;
# "##sequence-region" directive: synthesize a reference Component feature.
339 elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
340 ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
341 ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
# Any other comment line is ignored.
344 elsif (/^\#/) {
345 next;
# Ordinary feature line: nine tab-separated GFF columns.
348 else {
349 ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
351 next unless defined $ref;
352 $FEATURES++;
354 warn "Feature $group is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $stop-$start+1 > $MAX_BIN;
# '\N' is the LOAD DATA INFILE spelling of SQL NULL.
356 $source = '\N' unless defined $source;
357 $score = '\N' if $score eq '.';
358 $strand = '\N' if $strand eq '.';
359 $phase = '\N' if $phase eq '.';
# Decompose the 9th column into group class/name, target range, and
# free-form attributes (GFF2 or GFF3 syntax depending on $gff3).
361 my ($gclass,$gname,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
362 # GFF2/3 transition
363 $gclass = [$gclass] unless ref $gclass;
364 $gname = [$gname] unless ref $gname;
# One fdata row per (class,name) pair; groups/types/attributes are written
# only the first time their id is allocated (%DONE de-duplicates).
366 for (my $i=0; $i < @$gname; $i++) {
367 my $group_class = $gclass->[$i];
368 my $group_name = $gname->[$i];
369 $group_class ||= '\N';
370 $group_name ||= '\N';
371 $target_start ||= '\N';
372 $target_stop ||= '\N';
373 $method ||= '\N';
374 $source ||= '\N';
376 my $fid = $FID++;
377 my $gid = $GROUPID{lc join($;,$group_class,$group_name)} ||= $GID++;
378 my $ftypeid = $FTYPEID{lc join($;,$source,$method)} ||= $FTYPEID++;
# Spatial bin used by Bio::DB::GFF for range queries.
380 my $bin = bin($start,$stop,$db->min_bin);
381 $FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
382 $FH{ FGROUP() }->print( join("\t",$gid,$group_class,$group_name),"\n" ) unless $DONE{"fgroup$;$gid"}++;
383 $FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"ftype$;$ftypeid"}++;
385 foreach (@$attributes) {
386 my ($key,$value) = @$_;
387 my $attributeid = $ATTRIBUTEID{lc $key} ||= $ATTRIBUTEID++;
388 $FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"fattribute$;$attributeid"}++;
389 $FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
# Progress report every 1000 features.
392 if ( $FEATURES % 1000 == 0) {
# NOTE(review): "my $x = EXPR if COND" is a documented Perl trap (the
# lexical may retain a stale value when COND is false) -- here it is
# guarded because $elapsed only uses $now when $timer is true, but the
# construct is worth confirming/avoiding upstream.
393 my $now = Time::HiRes::time() if $timer;
394 my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
395 $last = $now;
396 print STDERR "$fid features parsed$elapsed...";
# Carriage-return (overwrite in place) on a terminal, newline otherwise.
397 print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
# Close any still-open embedded-FASTA scratch file.
402 $FH{FASTA}->close if exists $FH{FASTA};
404 printf STDERR "Feature load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
405 $start = time();
# Load each FASTA file/directory; insert_sequence() in the faux adaptor
# prints to the currently selected handle, so select the fdna load pipe
# for the duration of each load.
407 for my $fasta (@fasta) {
408 warn "Loading fasta ",(-d $fasta?"directory":"file"), " $fasta\n";
409 my $old = select($FH{FDNA()});
410 my $loaded = $db->load_fasta($fasta);
411 warn "$fasta: $loaded records loaded\n";
412 select $old;
415 printf STDERR "Fasta load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
416 $start = time();
418 my $success = 1;
419 if ($DO_FAST) {
420 warn "Indexing and analyzing tables. This may take some time (you may see database messages during the process)...\n";
# Closing the FIFOs signals EOF to the mysql loader children.
423 $_->close foreach values %FH;
# Serial mode: run each table's mysql load command now, in sequence.
425 if (!$DO_FAST) {
426 warn "Loading feature data and analyzing tables. You may see database messages here...\n";
427 $success &&= system($COMMAND{$_}) == 0 foreach @files;
430 # wait for children
# The CHLD handler removes reaped pids from %PID; sleep is interrupted by
# the signal, so this loop ends once every loader child has exited.
431 while (%PID) {
432 sleep;
434 $success &&= !$FAILED;
436 cleanup();
438 printf STDERR "Total parse & load time %5.2fs\n",(Time::HiRes::time() - $beginning) if $timer;
440 if ($success) {
441 print "SUCCESS: $FEATURES features successfully loaded\n";
442 exit 0;
443 } else {
444 print "FAILURE: Please see standard error for details\n";
445 exit -1;
# FIXME(review): both branches above call exit, so this summary-statistics
# block appears unreachable and --summary would silently do nothing.
# Confirm against upstream; it likely belongs before the success check.
448 if ($SUMMARY_STATS) {
449 warn "Building summary statistics for coverage histograms...\n";
450 $db->build_summary_statistics;
453 exit 0;
# Remove the per-process scratch files created during the load: the table
# load files/FIFOs (named "$tmpdir/<table>.$$") and any FASTA scratch
# files extracted from embedded sequence.
sub cleanup {
    unlink map { "$tmpdir/$_.$$" } @files;
    # BUG FIX: @fasta_files_to_be_unlinked already holds full paths
    # ("$tmpdir/<id>.fa"); the old code re-prefixed "$tmpdir/" and appended
    # ".$$", so the names never matched and the scratch FASTA files were
    # left behind.
    unlink @fasta_files_to_be_unlinked;
}
461 # load copies of some of the tables into memory
# Prime the in-memory ID counters and name->id caches from an existing
# database, so an incremental (non --create) load continues numbering
# where the previous load left off instead of colliding with stored rows.
sub load_tables {
    my $dbh = shift;
    print STDERR "loading normalized group, type and attribute information...";

    # Next free primary key for each table = current maximum + 1.
    $FID         = 1 + get_max_id($dbh, 'fdata',      'fid');
    $GID         = 1 + get_max_id($dbh, 'fgroup',     'gid');
    $FTYPEID     = 1 + get_max_id($dbh, 'ftype',      'ftypeid');
    $ATTRIBUTEID = 1 + get_max_id($dbh, 'fattribute', 'fattribute_id');

    # Cache existing name->id mappings so repeated groups, types and
    # attributes reuse their stored ids rather than allocating new ones.
    get_ids($dbh, \%DONE, \%GROUPID,     'fgroup',     'gid',           'gclass', 'gname');
    get_ids($dbh, \%DONE, \%FTYPEID,     'ftype',      'ftypeid',       'fsource', 'fmethod');
    get_ids($dbh, \%DONE, \%ATTRIBUTEID, 'fattribute', 'fattribute_id', 'fattribute_name');

    print STDERR "ok\n";
}
# Return the current maximum value of primary-key column $id in $table,
# or 0 when the table is empty.
#
# Arguments: $dbh (DBI database handle), $table, $id (column name).
# Dies with the DBI error string if the query cannot be run.
sub get_max_id {
    my $dbh = shift;
    my ($table, $id) = @_;
    my $sql    = "select max($id) from $table";
    my $result = $dbh->selectcol_arrayref($sql) or die $dbh->errstr;
    # BUG FIX: MAX() is NULL on an empty table, so the original returned
    # undef and callers' "1 + get_max_id(...)" raised uninitialized-value
    # warnings.  Returning 0 is numerically identical and warning-free.
    return defined $result->[0] ? $result->[0] : 0;
}
# Populate %$idhash with the name->id mapping for every existing row of
# $table, and mark each ($table, id) pair as seen in %$done.  Cache keys
# are the lowercased name columns joined with $; (the multidimensional
# hash separator), matching how the main parse loop builds its keys.
#
# Arguments: $dbh, then \%done, \%idhash, $table, $id (id column name)
# and the remaining name column names.
sub get_ids {
    my $dbh = shift;
    my ($done, $idhash, $table, $id, @columns) = @_;

    my $sql = sprintf 'select %s from %s', join(',', $id, @columns), $table;
    my $sth = $dbh->prepare($sql) or die $dbh->errstr;
    $sth->execute or die $dbh->errstr;

    while (my ($row_id, @names) = $sth->fetchrow_array) {
        $idhash->{ lc join($;, @names) } = $row_id;
        $done->{$table, $row_id}++;
    }
}
497 __END__