Add tests for memory leaks and weaken for Issue #81
[bioperl-live.git] / scripts / Bio-DB-SeqFeature-Store / bp_seqfeature_load.pl
blobf3a32e23bbaf7bacd4f592e12cc740f9f5939bf0
1 #!/usr/bin/perl
3 use strict;
4 use warnings;
6 ## Used to output the 'usage' message
7 use Pod::Usage;
9 ## Used to parse command line options
10 use Getopt::Long;
12 ## Used to create temporary files, if necessary
13 use File::Spec;
15 ## BioPerl!
16 use Bio::DB::SeqFeature::Store;
17 use Bio::DB::SeqFeature::Store::GFF3Loader;
## The available options. Note, these defaults are 'hard coded' into
## the USAGE POD, so if you change one of the defaults (you shouldn't),
## you should update the USAGE.

my $DSN              = 'dbi:mysql:test';         # -d: DBI data source
my $SFCLASS          = 'Bio::DB::SeqFeature';    # -s: class of feature to create
my $ADAPTOR          = 'DBI::mysql';             # -a: storage adaptor class
my $NAMESPACE;                                   # -n: table prefix (undef by default)
my $VERBOSE          = 1;                        # -v: progress reporting on by default
my $FAST             = 0;                        # -f: fast loading off by default
my $TMP              = File::Spec->tmpdir();     # -T: temporary directory
my $IGNORE_SEQREGION = 0;                        # -i: honour ##sequence-region by default
my $CREATE           = 0;                        # -c: do not (re)initialize by default
my $USER             = '';                       # -u: database user
my $PASS             = '';                       # -p: database password
my $COMPRESS         = 0;                        # -z: table compression off by default
my $INDEX_SUB        = 1;                        # -S: subfeature indexing on by default
my $NOALIAS_TARGET   = 0;                        # --noalias-target
my $SUMMARY_STATS    = 0;                        # --summary
my $NOSUMMARY_STATS  = 0;                        # -N: explicit opt-out of summary stats

## Two flags based on http://stackoverflow.com/questions/1232116
## how-to-create-pod-and-use-pod2usage-in-perl
my $opt_help;
my $opt_man;
## Parse the command line into the option variables declared earlier
## in the script. If GetOptions reports a parse failure, print a short
## hint and exit with status 2.
GetOptions(
    'd|dsn=s'                 => \$DSN,
    's|seqfeature=s'          => \$SFCLASS,
    'n|namespace=s'           => \$NAMESPACE,
    'a|adaptor=s'             => \$ADAPTOR,
    'v|verbose!'              => \$VERBOSE,
    'f|fast'                  => \$FAST,
    'T|temporary-directory=s' => \$TMP,
    'i|ignore-seqregion'      => \$IGNORE_SEQREGION,
    'c|create'                => \$CREATE,
    'u|user=s'                => \$USER,
    'p|password=s'            => \$PASS,
    'z|zip'                   => \$COMPRESS,
    'S|subfeatures!'          => \$INDEX_SUB,

    ## Any good single letter choices here?
    'noalias-target'          => \$NOALIAS_TARGET,
    'summary'                 => \$SUMMARY_STATS,
    'N|nosummary'             => \$NOSUMMARY_STATS,

    ## I miss '--help' when it isn't there!
    'h|help!'                 => \$opt_help,
    'm|man!'                  => \$opt_man,
)
    or pod2usage(
        -message =>
            "\nTry 'bp_seqfeature_load.pl --help' for more information\n",
        -verbose => 0,
        -exitval => 2,
    );

## Should we output usage information?
pod2usage( -verbose => 1 ) if $opt_help;
pod2usage( -verbose => 2 ) if $opt_man;

## Did we get any files to process? Whatever is left in @ARGV after
## option parsing is treated as the list of input files.
@ARGV
    or pod2usage(
        -message =>
            "\nYou need to pass some GFF or fasta files to load\n",
        -verbose => 0,
        -exitval => 2,
    );
## POD

=head1 NAME

bp_seqfeature_load.pl - Load GFF into a SeqFeature database

=head1 DESCRIPTION

Pass any number of GFF or fasta format files (or GFF with embedded
fasta) to load the features and sequences into a SeqFeature
database. The database (and adaptor) to use is specified on the
command line. Use the --create flag to create a new SeqFeature
database.

=head1 SYNOPSIS

  bp_seqfeature_load.pl [options] gff_or_fasta_file1 [gff_or_fasta_file2 [...]]

Try 'bp_seqfeature_load.pl --help' or '--man' for more information.

=head1 OPTIONS

=over 4

=item -d, --dsn

DBI data source (default dbi:mysql:test).

=item -n, --namespace

The table prefix to use (default undef). Allows several independent
sequence feature databases to be stored in a single database.

=item -s, --seqfeature

The type of SeqFeature to create... RTSC (default Bio::DB::SeqFeature)

=item -a, --adaptor

The storage adaptor (class) to use (default DBI::mysql).

=item -v, --verbose

Turn on verbose progress reporting (default true). Use --noverbose to
switch this off.

=item -f, --fast

Activate fast loading (default 0). Only available for some adaptors.

=item -T, --temporary-directory

Specify temporary directory for fast loading (default
File::Spec->tmpdir()).

=item -i, --ignore-seqregion

If true, then ignore ##sequence-region directives in the GFF3 file
(default, create a feature for each region).

=item -c, --create

Create the database and reinitialize it (default false). Note, this
will erase previous database contents, if any.

=item -u, --user

User to connect to database as.

=item -p, --password

Password to use to connect to database.

=item -z, --zip

Compress database tables to save space (default false).

=item -S, --subfeatures

Turn on indexing of subfeatures (default true). Use --nosubfeatures to
switch this off.

=item --summary

Generate summary statistics for coverage graphs (default false). This
can be run on a previously loaded database or during the load. It will
default to true if --create is used.

=item -N, --nosummary

Do not generate summary statistics, to save some space and load time
(default if --create is not specified; use this option to explicitly
turn off summary statistics when --create is specified).

=item --noalias-target

Don't create an Alias attribute whose value is the target_id in a
Target attribute (if the feature contains a Target attribute, the
default is to create an Alias attribute whose value is the target_id
in the Target attribute).

=back

Please see http://www.sequenceontology.org/gff3.shtml for information
about the GFF3 format. BioPerl extends the format slightly by adding a
##index-subfeatures directive. Set this to a true value if you wish
the database to be able to retrieve a feature's individual parts (such
as the exons of a transcript) independently of the top level feature:

  ##index-subfeatures 1

It is also possible to control the indexing of subfeatures on a
case-by-case basis by adding "index=1" or "index=0" to the feature's
attribute list. This should only be used for subfeatures.

Subfeature indexing is true by default. Set to false (0) to save lots
of database space and improve performance. You may use --nosubfeatures
to force this.

=cut
## Fast loading needs a writable temporary directory, so check it up
## front rather than failing part-way through a load.
if ($FAST) {
    -d $TMP && -w $TMP
        or die "Fast loading is requested, but I cannot write into the directory $TMP";

    ## For MySQL adaptors, add mysql_local_infile=1 to the DSN unless
    ## the caller has already put that setting in the DSN themselves.
    $DSN .= ";mysql_local_infile=1"
        if $ADAPTOR =~ /mysql/i && $DSN !~ /mysql_local_infile/;
}

## Open (and optionally create) the feature store in write mode.
my $store = Bio::DB::SeqFeature::Store->new(
    -dsn       => $DSN,
    -namespace => $NAMESPACE,
    -adaptor   => $ADAPTOR,
    -tmpdir    => $TMP,
    -user      => $USER,
    -pass      => $PASS,
    -write     => 1,
    -create    => $CREATE,
    -compress  => $COMPRESS,
)
    or die "Couldn't create connection to the database";

## --create wipes any existing contents and turns summary statistics
## on by default (--nosummary below can still switch them off).
$store->init_database('erase') if $CREATE;
$SUMMARY_STATS++ if $CREATE;    # this is a good thing

my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(
    -store             => $store,
    -sf_class          => $SFCLASS,
    -verbose           => $VERBOSE,
    -tmpdir            => $TMP,
    -fast              => $FAST,
    -ignore_seqregion  => $IGNORE_SEQREGION,
    -index_subfeatures => $INDEX_SUB,
    -noalias_target    => $NOALIAS_TARGET,
    ## An explicit --nosummary always wins over --summary / --create.
    -summary_stats     => $NOSUMMARY_STATS ? 0 : $SUMMARY_STATS,
)
    or die "Couldn't create GFF3 loader";

# on signals, give objects a chance to call their DESTROY methods
$SIG{TERM} = $SIG{INT} = sub { undef $loader; undef $store; die "Aborted..."; };

## Every remaining command-line argument is a file to load.
$loader->load(@ARGV);

exit 0;