5 # use lib './blib/lib';
9 use Bio
::DB
::GFF
::Util
::Binning
'bin';
10 use Bio
::DB
::GFF
::Adaptor
::dbi
::mysqlopt
;
# Names of the mysql client binary and of the Bio::DB::GFF tables that
# receive bulk-loaded data, declared as a single group of compile-time
# constants.
use constant {
    MYSQL                 => 'mysql',
    FDATA                 => 'fdata',
    FTYPE                 => 'ftype',
    FGROUP                => 'fgroup',
    FDNA                  => 'fdna',
    FATTRIBUTE            => 'fattribute',
    FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature',
};
21 my $DO_FAST = eval "use POSIX 'WNOHANG'; 1;";
25 bp_fast_load_gff.pl - Fast-load a Bio::DB::GFF database from GFF files.
29 % bp_fast_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...
33 This script loads a Bio::DB::GFF database with the features contained
34 in a list of GFF files and/or FASTA sequence files. You must use the
35 exact variant of GFF described in L<Bio::DB::GFF>. Various
36 command-line options allow you to control which database to load and
37 whether to allow an existing database to be overwritten.
39 This script is similar to bp_load_gff.pl, but is much faster. However,
40 it is hard-coded to use MySQL and probably only works on Unix
41 platforms due to its reliance on pipes. See L<bp_load_gff.pl> for an
42 incremental loader that works with all databases supported by
43 Bio::DB::GFF, and L<bp_bulk_load_gff.pl> for a fast MySQL loader that
44 supports all platforms.
48 If the filename is given as "-" then the input is taken from
49 standard input. Compressed files (.gz, .Z, .bz2) are automatically
52 FASTA format files are distinguished from GFF files by their filename
53 extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their
54 uppercase variants are treated as FASTA files. Everything else is
55 treated as a GFF file. If you wish to load -fasta files from STDIN,
56 then use the -f command-line switch with an argument of '-', as in
58 gunzip -c my_data.fa.gz | bp_fast_load_gff.pl -d test -f -
60 The nature of the load requires that the database be on the local
61 machine and that the indicated user have the "file" privilege to load
62 the tables and have enough room in /usr/tmp (or whatever is specified
63 by the \$TMPDIR environment variable), to hold the tables transiently.
64 If your MySQL is version 3.22.6 and was compiled using the "load local
65 file" option, then you may be able to load remote databases with local
66 data using the --local option.
68 About maxfeature: the default value is 100,000,000 bases. If you have
69 features that are close to or greater than 100Mb in length, then the
70 value of maxfeature should be increased to 1,000,000,000. This value
71 must be a power of 10.
73 If the list of GFF or fasta files exceeds the kernel limit for the
74 maximum number of command-line arguments, use the
75 --long_list /path/to/files option.
77 The adaptor used is dbi::mysqlopt. There is currently no way to
80 =head1 COMMAND-LINE OPTIONS
82 Command-line options can be abbreviated to single-letter options.
83 e.g. -d instead of --database.
85 --database <dsn> Mysql database name
86 --create Reinitialize/create data tables without asking
87 --local Try to load a remote database using local data.
88 --user Username to log in as
89 --fasta File or directory containing fasta files to load
90 --password Password to use for authentication
91 --long_list Directory containing a very large number of
92 GFF and/or FASTA files
93 --maxfeature Set the value of the maximum feature size (default 100Mb; must be a power of 10)
94 --group A list of one or more tag names (comma or space separated)
95 to be used for grouping in the 9th column.
96 --gff3_munge Activate GFF3 name munging (see Bio::DB::GFF)
97 --summary Generate summary statistics for drawing coverage histograms.
98 This can be run on a previously loaded database or during
100 --Temporary Location of a writable scratch directory
104 L<Bio::DB::GFF>, L<bp_bulk_load_gff.pl>, L<bp_load_gff.pl>
108 Lincoln Stein, lstein@cshl.org
110 Copyright (c) 2002 Cold Spring Harbor Laboratory
112 This library is free software; you can redistribute it and/or modify
113 it under the same terms as Perl itself. See DISCLAIMER.txt for
114 disclaimers of warranty.
# "faux" adaptor: a thin subclass of the standard dbi::mysqlopt adaptor
# whose purpose is to let sequence insertion be intercepted so DNA can
# be written to a bulk-load file instead of being INSERTed row by row.
package Bio::DB::GFF::Adaptor::faux;

use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
# Declare @ISA so inheritance works even under "use strict 'vars'".
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';
# insert_sequence: overrides the adaptor's per-row DNA INSERT and writes
# one tab-delimited record to the currently-selected filehandle (the
# fdna load file is select()ed around load_fasta later in this script).
# NOTE(review): "\n" is an element of the join list, so every record
# ends with an extra tab before the newline -- confirm the fdna loader
# tolerates the trailing empty column.
# NOTE(review): a method normally shifts $self first; that line appears
# to be missing from this excerpt -- verify against the full source.
124 sub insert_sequence
{
126 my ($id,$offset,$seq) = @_;
127 print join "\t",$id,$offset,$seq,"\n";
# Optional high-resolution timing: Time::HiRes may not be installed, so
# probe for it and remember whether Time::HiRes::time is available.
132 eval "use Time::HiRes"; undef $@
;
133 my $timer = defined &Time
::HiRes
::time;
# Script-wide state, mostly filled in by GetOptions below.
135 my ($DSN,$CREATE,$USER,$PASSWORD,$FASTA,$FAILED,$LOCAL,%PID,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR,$SUMMARY_STATS);
# Reap finished loader children without blocking; count non-zero exit
# statuses so overall success/failure can be reported at the end.
139 while ((my $child = waitpid(-1,&WNOHANG
)) > 0) {
140 delete $PID{$child} or next;
141 $FAILED++ if $?
!= 0;
# Remove scratch files before exiting on INT/TERM.
146 $SIG{INT
} = $SIG{TERM
} = sub {cleanup
(); exit -1};
# Parse command-line options; on failure render the POD as plain text
# and exit non-zero.  (The --user and --local options are handled on
# lines not visible in this excerpt.)
148 GetOptions
('database:s' => \
$DSN,
149 'create' => \
$CREATE,
152 'password:s' => \
$PASSWORD,
153 'fasta:s' => \
$FASTA,
154 'group:s' => \
$GROUP_TAG,
155 'long_list:s' => \
$LONG_LIST,
156 'maxbin|maxfeature:s' => \
$MAX_BIN,
157 'gff3_munge' => \
$MUNGE,
158 'summary' => \
$SUMMARY_STATS,
159 'Temporary:s' => \
$TMPDIR,
160 ) or (system('pod2text',$0), exit -1);
163 $MAX_BIN ||= 1_000_000_000
; # to accommodate human-sized chromosomes
# Assemble Bio::DB::GFF->new() arguments from the supplied options.
167 push @args,(-user
=>$USER);
170 if (defined $PASSWORD) {
171 push @args,(-pass
=>$PASSWORD);
172 $AUTH .= " -p$PASSWORD";
174 push @args,(-preferred_groups
=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
# Open the database through the local "faux" adaptor defined above so
# DNA writes are diverted to the bulk-load file rather than DBI.
176 my $db = Bio
::DB
::GFF
->new(-adaptor
=>'faux',-dsn
=> $DSN,@args)
177 or die "Can't open database: ",Bio
::DB
::GFF
->error,"\n";
179 $db->gff3_name_munging(1) if $MUNGE;
# (Re)initialize the schema, honoring --maxfeature when given.
183 $MAX_BIN ?
$db->initialize(-erase
=>1,-MAX_BIN
=>$MAX_BIN) : $db->initialize(1);
# Fall back to the database's stored max_bin, then to the 100 Mb default.
186 $MAX_BIN ||= $db->meta('max_bin') || 100_000_000
;
188 # deal with really long lists of files
# --long_list names a directory whose entire contents become @ARGV.
190 -d
$LONG_LIST or die "The --long_list argument must be a directory\n";
191 opendir GFFDIR
,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
192 @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR
;
# NOTE(review): readdir also returns "." and ".." -- confirm that
# downstream handling skips them.
# A --fasta directory is treated the same way: every file it contains
# is queued for loading.
195 if (defined $FASTA && -d
$FASTA) {
196 opendir FASTA
,$FASTA or die "Could not open $FASTA for reading: $!";
197 push @ARGV, map { "$FASTA\/$_" } readdir FASTA
;
# Transparently decompress .gz/.Z/.bz2 inputs by rewriting the filename
# into a decompression pipe for the magic <> operator.
203 $_ = "gunzip -c $_ |" if /\.gz$/;
204 $_ = "uncompress -c $_ |" if /\.Z$/;
205 $_ = "bunzip2 -c $_ |" if /\.bz2$/;
# Classify inputs by extension: FASTA-looking names are loaded as
# sequence, everything else is treated as GFF.
209 if (/\.(fa|fasta|dna|seq|fast)(?:\.|$)/i) {
216 push @fasta,$FASTA if defined $FASTA;
218 # initialize state variables
225 my %ATTRIBUTEID = ();
# When loading into an existing database (no --create), preload the
# current group/type/attribute IDs so new rows continue the sequences.
229 load_tables
($db->dbh) unless $CREATE;
# MySQL >= 4.0 can disable indexes during the bulk load.
230 my ($major,$minor,$sub) = split /\./,$db->dbh->get_info(18); # SQL_DBMS_VER
231 my $can_disable_indexes = ($major >= 4 and $minor >= 0);
233 # open up pipes to the database
# Scratch directory for the per-table load files: --Temporary beats
# $TMPDIR/$TMP, which beat the File::Spec default.
236 my $tmpdir = $TMPDIR || $ENV{TMPDIR
} || $ENV{TMP
} || File
::Spec
->tmpdir();
237 -d
$tmpdir or die <<END;
238 I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
239 Please select a directory and indicate its location by setting the TMP environment variable, or
240 by using the --Temporary switch.
243 my @fasta_files_to_be_unlinked;
# One load file (or FIFO) per table.
244 my @files = (FDATA
,FTYPE
,FGROUP
,FDNA
,FATTRIBUTE
,FATTRIBUTE_TO_FEATURE
);
# Per-table scratch file named after the table and our PID.  When
# mkfifo succeeds the file is a named pipe that a mysql client can
# consume concurrently ("fast" mode); otherwise plain files are used.
246 my $file = "$tmpdir/$_.$$";
247 print STDERR
"creating load file $file...";
248 $DO_FAST &&= (system("mkfifo $file") == 0); # for system(), 0 = success
# Build the per-table mysql command: optionally clear the table
# (--create), LOAD DATA [LOCAL] INFILE, then ANALYZE to refresh stats.
250 my $delete = $CREATE ?
"delete from $_" : '';
251 my $local = $LOCAL ?
'local' : '';
252 my $analyze = "analyze table $_";
257 -e "lock tables $_ write; $delete; load data $local infile '$file' replace into table $_; unlock tables; $analyze"
261 $command =~ s/\n/ /g;
262 $COMMAND{$_} = $command;
# In fast mode, fork one mysql client per FIFO now; the children block
# on the pipes until the parent starts writing.
265 if (my $pid = fork) {
267 print STDERR
"pausing for 0.5 sec..." if $DO_FAST;
268 select(undef,undef,undef,0.50); # work around a race condition
270 } else { # THIS IS IN CHILD PROCESS
271 die "Couldn't fork: $!" unless defined $pid;
# NOTE(review): "|| die" binds to $command, not to exec's return value,
# so a failed exec falls through silently; "or die" is the correct form.
272 exec $command || die "Couldn't exec: $!";
# Parent keeps a write handle on every load file.
276 print STDERR
"opening load file for writing...";
277 $FH{$_} = IO
::File
->new($file,'>') or die $_,": $!";
282 print STDERR
"Fast loading enabled\n" if $DO_FAST;
# Parse/load bookkeeping and wall-clock timing checkpoints.
284 my ($count,$gff3,$last,$start,$beginning,$current_file);
286 $last = Time
::HiRes
::time() if $timer;
287 $beginning = $start = $last;
289 # avoid hanging on standalone --fasta load
291 $FH{NULL
} = IO
::File
->new(">$tmpdir/null");
292 push @ARGV, "$tmpdir/null";
# Detect when <> has moved to the next input file so per-file state
# (such as the GFF3 flag) can be reset.
297 # reset GFF3 flag if new filehandle
298 $current_file ||= $ARGV;
299 unless ($current_file eq $ARGV) {
301 $current_file = $ARGV;
305 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
307 # close sequence filehandle if required
308 if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA
}) {
# While an embedded-FASTA handle is open, every line belongs to the
# sequence scratch file.
313 # print to fasta file if the handle is open
314 if ( defined $FH{FASTA
} ) {
315 $FH{FASTA
}->print("$_\n");
# A ">" header inside a GFF stream starts an embedded FASTA section:
# divert it to its own scratch file and queue that file for loading.
319 elsif (/^>(\S+)/) { # uh oh, sequence coming
320 $FH{FASTA
} = IO
::File
->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
321 $FH{FASTA
}->print("$_\n");
322 push @fasta, "$tmpdir/$1\.fa";
323 push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
324 print STDERR
"Processing embedded sequence $1\n";
# ##group-tags directive overrides the grouping tags for this load.
328 elsif (/^\#\#\s*group-tags\s+(.+)/) {
329 $db->preferred_groups(split(/\s+/,$1));
# ##gff-version directive: remember whether this stream is GFF3.
333 elsif (/^\#\#\s*gff-version\s+(\d+)/) {
335 $db->print_gff3_warning() if $gff3;
# A ##sequence-region header becomes a reference "Component" feature.
339 elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
340 ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
341 ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ?
"ID=Sequence:$1": qq(Sequence
"$1"));
# Ordinary GFF line: nine tab-separated columns.
349 ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
351 next unless defined $ref;
354 warn "Feature $group is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $stop-$start+1 > $MAX_BIN;
# Normalize "." / missing columns to MySQL's \N (NULL) marker.
356 $source = '\N' unless defined $source;
357 $score = '\N' if $score eq '.';
358 $strand = '\N' if $strand eq '.';
359 $phase = '\N' if $phase eq '.';
# Column 9 may yield several (class,name) group pairs plus attributes.
361 my ($gclass,$gname,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
363 $gclass = [$gclass] unless ref $gclass;
364 $gname = [$gname] unless ref $gname;
# Emit one fdata row per group pair, creating fgroup/ftype rows the
# first time each distinct group or type is seen.
366 for (my $i=0; $i < @
$gname; $i++) {
367 my $group_class = $gclass->[$i];
368 my $group_name = $gname->[$i];
# NOTE(review): "||=" also replaces a legitimate value of "0" with \N.
369 $group_class ||= '\N';
370 $group_name ||= '\N';
371 $target_start ||= '\N';
372 $target_stop ||= '\N';
# Allocate (or reuse) numeric IDs for the group and the feature type.
377 my $gid = $GROUPID{lc join($;,$group_class,$group_name)} ||= $GID++;
378 my $ftypeid = $FTYPEID{lc join($;,$source,$method)} ||= $FTYPEID++;
# Compute the bin used by Bio::DB::GFF for range queries.
380 my $bin = bin
($start,$stop,$db->min_bin);
381 $FH{ FDATA
() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
382 $FH{ FGROUP
() }->print( join("\t",$gid,$group_class,$group_name),"\n" ) unless $DONE{"fgroup$;$gid"}++;
383 $FH{ FTYPE
() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"ftype$;$ftypeid"}++;
# Attributes feed fattribute (names) and fattribute_to_feature (links).
385 foreach (@
$attributes) {
386 my ($key,$value) = @
$_;
387 my $attributeid = $ATTRIBUTEID{lc $key} ||= $ATTRIBUTEID++;
388 $FH{ FATTRIBUTE
() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"fattribute$;$attributeid"}++;
389 $FH{ FATTRIBUTE_TO_FEATURE
() }->print( join("\t",$fid,$attributeid,$value),"\n");
# Progress report every 1000 features.
392 if ( $FEATURES % 1000 == 0) {
# NOTE(review): "my $now = ... if $timer" leaves $now undefined when
# $timer is false and relies on deprecated conditional-my behavior.
393 my $now = Time
::HiRes
::time() if $timer;
394 my $elapsed = $timer ?
sprintf(" in %5.2fs",$now - $last) : '';
396 print STDERR
"$fid features parsed$elapsed...";
397 print STDERR
-t STDOUT
&& !$ENV{EMACS
} ?
"\r" : "\n";
# GFF parsing done: close any open embedded-FASTA handle, report time.
402 $FH{FASTA
}->close if exists $FH{FASTA
};
404 printf STDERR
"Feature load time %5.2fs\n",(Time
::HiRes
::time() - $start) if $timer;
# Load DNA: select() redirects the faux adaptor's insert_sequence
# output (plain print) into the fdna load file while load_fasta runs.
407 for my $fasta (@fasta) {
408 warn "Loading fasta ",(-d
$fasta?
"directory":"file"), " $fasta\n";
409 my $old = select($FH{FDNA
()});
410 my $loaded = $db->load_fasta($fasta);
411 warn "$fasta: $loaded records loaded\n";
415 printf STDERR
"Fasta load time %5.2fs\n",(Time
::HiRes
::time() - $start) if $timer;
420 warn "Indexing and analyzing tables. This may take some time (you may see database messages during the process)...\n";
# Closing the write handles lets the forked mysql clients (fast mode)
# see EOF on their FIFOs.
423 $_->close foreach values %FH;
426 warn "Loading feature data and analyzing tables. You may see database messages here...\n";
# Slow path: run each table's LOAD DATA command now, serially.
427 $success &&= system($COMMAND{$_}) == 0 foreach @files;
# Overall success also requires that no forked loader exited non-zero.
434 $success &&= !$FAILED;
438 printf STDERR
"Total parse & load time %5.2fs\n",(Time
::HiRes
::time() - $beginning) if $timer;
441 print "SUCCESS: $FEATURES features successfully loaded\n";
444 print "FAILURE: Please see standard error for details\n";
448 if ($SUMMARY_STATS) {
449 warn "Building summary statistics for coverage histograms...\n";
450 $db->build_summary_statistics;
# Remove the per-table scratch files.
456 foreach (@files,@fasta_files_to_be_unlinked) {
457 unlink "$tmpdir/$_.$$";
# NOTE(review): entries of @fasta_files_to_be_unlinked are already full
# paths (see "$tmpdir/$1.fa" above), so "$tmpdir/$_.$$" cannot match
# them -- those temporaries appear to be left behind.
461 # load copies of some of the tables into memory
# load_tables($dbh): seed the in-memory ID caches (%GROUPID, %FTYPEID,
# %ATTRIBUTEID, %DONE) and the next-ID counters from the existing
# database so an incremental (no --create) load continues the
# established ID sequences.
464 print STDERR
"loading normalized group, type and attribute information...";
465 $FID = 1 + get_max_id
($dbh,'fdata','fid');
466 $GID = 1 + get_max_id
($dbh,'fgroup','gid');
467 $FTYPEID = 1 + get_max_id
($dbh,'ftype','ftypeid');
468 $ATTRIBUTEID = 1 + get_max_id
($dbh,'fattribute','fattribute_id');
469 get_ids
($dbh,\
%DONE,\
%GROUPID,'fgroup','gid','gclass','gname');
470 get_ids
($dbh,\
%DONE,\
%FTYPEID,'ftype','ftypeid','fsource','fmethod');
471 get_ids
($dbh,\
%DONE,\
%ATTRIBUTEID,'fattribute','fattribute_id','fattribute_name');
# get_max_id fragment: returns max($id) from $table.
# NOTE(review): callers pass $dbh as the first argument but only
# ($table,$id) are unpacked here -- a "shift" of $dbh is presumably on
# a line not shown in this excerpt; verify against the full source.
477 my ($table,$id) = @_;
478 my $sql = "select max($id) from $table";
479 my $result = $dbh->selectcol_arrayref($sql) or die $dbh->errstr;
# get_ids fragment: populate $idhash with lc(join $;, @cols) => id and
# mark each ($table,$id) pair as already seen in $done.
485 my ($done,$idhash,$table,$id,@columns) = @_;
486 my $columns = join ',',$id,@columns;
487 my $sql = "select $columns from $table";
488 my $sth = $dbh->prepare($sql) or die $dbh->errstr;
489 $sth->execute or die $dbh->errstr;
490 while (my($id,@cols) = $sth->fetchrow_array) {
491 my $key = lc join $;,@cols;
492 $idhash->{$key} = $id;
493 $done->{$table,$id}++;