t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / DB / SeqFeature / Store / Loader.pm
bloba47cd971b7cf2d3c347e9b5f0c6d7f1751f8c6ed
1 package Bio::DB::SeqFeature::Store::Loader;
4 =head1 NAME
Bio::DB::SeqFeature::Store::Loader -- Base class for the GFF3, GFF, and feature-file loaders
8 =head1 SYNOPSIS
10 # non-instantiable base class
12 =head1 DESCRIPTION
14 This is the base class for Bio::DB::SeqFeature::Loader::GFF3Loader,
15 Bio::DB::SeqFeature::Loader::GFFLoader, and
16 Bio::DB::SeqFeature::FeatureFileLoader. Please see the manual pages
17 for these modules.
19 =cut
22 # load utility - incrementally load the store based on GFF3 file
24 # two modes:
25 # slow mode -- features can occur in any order in the GFF3 file
26 # fast mode -- all features with same ID must be contiguous in GFF3 file
28 use strict;
29 use Carp 'croak';
30 use IO::File;
31 use Bio::DB::GFF::Util::Rearrange;
32 use Bio::DB::SeqFeature::Store;
33 use File::Spec;
34 use File::Temp 'tempdir';
35 use base 'Bio::Root::Root';
37 use constant DEFAULT_SEQ_CHUNK_SIZE => 2000;
39 =head2 new
41 Title : new
42 Usage : $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(@options)
43 Function: create a new parser
44 Returns : a Bio::DB::SeqFeature::Store::GFF3Loader gff3 parser and loader
45 Args : several - see below
46 Status : public
48 This method creates a new GFF3 loader and establishes its connection
49 with a Bio::DB::SeqFeature::Store database. Arguments are -name=E<gt>$value
50 pairs as described in this table:
52 Name Value
53 ---- -----
55 -store A writable Bio::DB::SeqFeature::Store database handle.
57 -seqfeature_class The name of the type of Bio::SeqFeatureI object to create
58 and store in the database (Bio::DB::SeqFeature by default)
60 -sf_class A shorter alias for -seqfeature_class
62 -verbose Send progress information to standard error.
64 -fast If true, activate fast loading (see below)
66 -chunk_size Set the storage chunk size for nucleotide/protein sequences
67 (default 2000 bytes)
69 -tmp Indicate a temporary directory to use when loading non-normalized
70 features.
72 -map_coords A code ref that will transform a list of ($ref,[$start1,$end1]...)
73 coordinates into a list of ($newref,[$newstart1,$newend1]...)
75 -index_subfeatures Indicate true if subfeatures should be indexed. Default is true.
77 -summary_stats Rebuild summary stats at the end of loading (not incremental,
78 so takes a long time)
80 When you call new(), a connection to a Bio::DB::SeqFeature::Store
81 database should already have been established and the database
82 initialized (if appropriate).
84 Some combinations of Bio::SeqFeatures and Bio::DB::SeqFeature::Store
85 databases support a fast loading mode. Currently the only reliable
86 implementation of fast loading is the combination of DBI::mysql with
87 Bio::DB::SeqFeature. The other important restriction on fast loading
88 is the requirement that a feature that contains subfeatures must occur
89 in the GFF3 file before any of its subfeatures. Otherwise the
90 subfeatures that occurred before the parent feature will not be
91 attached to the parent correctly. This restriction does not apply to
92 normal (slow) loading.
94 If you use an unnormalized feature class, such as
95 Bio::SeqFeature::Generic, then the loader needs to create a temporary
96 database in which to cache features until all their parts and subparts
have been seen. This temporary database uses the "berkeleydb" adaptor. The
98 -tmp option specifies the directory in which that database will be
99 created. If not present, it defaults to the system default tmp
100 directory specified by File::Spec-E<gt>tmpdir().
102 The -chunk_size option allows you to tune the representation of
103 DNA/Protein sequence in the Store database. By default, sequences are
104 split into 2000 base/residue chunks and then reassembled as
105 needed. This avoids the problem of pulling a whole chromosome into
106 memory in order to fetch a short subsequence from somewhere in the
107 middle. Depending on your usage patterns, you may wish to tune this
108 parameter using a chunk size that is larger or smaller than the
109 default.
111 =cut
sub new {
  my $self = shift;
  my ($store,$seqfeature_class,$tmpdir,$verbose,$fast,
      $seq_chunk_size,$coordinate_mapper,$index_subfeatures,
      $summary_stats,$no_close_fasta) =
    rearrange(['STORE',
	       ['SF_CLASS','SEQFEATURE_CLASS'],
	       ['TMP','TMPDIR'],
	       'VERBOSE',
	       'FAST',
	       'CHUNK_SIZE',
	       'MAP_COORDS',
	       'INDEX_SUBFEATURES',
	       'SUMMARY_STATS',
	       'NO_CLOSE_FASTA',
	      ],@_);

  $seqfeature_class ||= $self->default_seqfeature_class;
  # NOTE(review): string eval of a caller-supplied class name; assumed to come
  # from trusted configuration rather than end-user input.
  eval "require $seqfeature_class" unless $seqfeature_class->can('new');
  $self->throw($@) if $@;

  # "normalized" classes keep subfeatures in the main database;
  # "in_table" classes record parent/child links in a relational table.
  # Both capabilities are interrogated from the SeqFeature class itself.
  my $normalized = $seqfeature_class->can('subfeatures_are_normalized')
    && $seqfeature_class->subfeatures_are_normalized;

  my $in_table = $seqfeature_class->can('subfeatures_are_stored_in_a_table')
    && $seqfeature_class->subfeatures_are_stored_in_a_table;

  # Fast (bulk) loading only works for normalized, table-backed features;
  # silently fall back to the slow path otherwise, with a warning.
  if ($fast) {
    my $canfast = $normalized && $in_table;
    warn <<END unless $canfast;
Only features that support the Bio::DB::SeqFeature::NormalizedTableFeature interface
can be loaded using the -fast method. Reverting to slower feature-by-feature method.
END
    $fast &&= $canfast;
  }

  # try to bring in highres time() function
  eval "require Time::HiRes";

  $tmpdir ||= File::Spec->tmpdir();

  my ($tmp_store,$temp_load);
  unless ($normalized) {
    # Unnormalized features must be cached in a temporary berkeleydb store
    # until their whole object tree has been seen; the directory is removed
    # automatically on exit (CLEANUP => 1).
    $temp_load = tempdir(
      'BioDBSeqFeature_XXXXXXX',
      DIR     => $tmpdir,
      CLEANUP => 1,
    );

    # (a redundant trailing "unless $normalized" was removed here -- this
    # statement already sits inside "unless ($normalized)")
    $tmp_store = Bio::DB::SeqFeature::Store->new(-adaptor   => 'berkeleydb',
						 -temporary => 1,
						 -dsn       => $temp_load,
						 -cache     => 1,
						 -write     => 1);
  }

  # index subfeatures by default unless the caller explicitly said otherwise
  $index_subfeatures = 1 unless defined $index_subfeatures;

  return bless {
    store                  => $store,
    tmp_store              => $tmp_store,
    seqfeature_class       => $seqfeature_class,
    fast                   => $fast,
    seq_chunk_size         => $seq_chunk_size || DEFAULT_SEQ_CHUNK_SIZE,
    verbose                => $verbose,
    load_data              => {},
    tmpdir                 => $tmpdir,
    temp_load              => $temp_load,
    subfeatures_normalized => $normalized,
    subfeatures_in_table   => $in_table,
    coordinate_mapper      => $coordinate_mapper,
    index_subfeatures      => $index_subfeatures,
    summary_stats          => $summary_stats,
    no_close_fasta         => $no_close_fasta,
  }, ref($self) || $self;
}
# Get/set the coordinate-remapping code ref; returns the previous value.
sub coordinate_mapper {
  my $self     = shift;
  my $previous = $self->{coordinate_mapper};
  $self->{coordinate_mapper} = shift if @_;
  return $previous;
}
# Get/set whether subfeatures should be indexed; returns the previous value.
sub index_subfeatures {
  my $self     = shift;
  my $previous = $self->{index_subfeatures};
  $self->{index_subfeatures} = shift if @_;
  return $previous;
}
# Get/set whether summary statistics are rebuilt after loading;
# returns the previous value.
sub summary_stats {
  my $self     = shift;
  my $previous = $self->{summary_stats};
  $self->{summary_stats} = shift if @_;
  return $previous;
}
215 =head2 load
217 Title : load
218 Usage : $count = $loader->load(@ARGV)
219 Function: load the indicated files or filehandles
220 Returns : number of feature lines loaded
221 Args : list of files or filehandles
222 Status : public
224 Once the loader is created, invoke its load() method with a list of
225 GFF3 or FASTA file paths or previously-opened filehandles in order to
226 load them into the database. Compressed files ending with .gz, .Z and
227 .bz2 are automatically recognized and uncompressed on the fly. Paths
228 beginning with http: or ftp: are treated as URLs and opened using the
229 LWP GET program (which must be on your path).
231 FASTA files are recognized by their initial "E<gt>" character. Do not feed
232 the loader a file that is neither GFF3 nor FASTA; I don't know what
233 will happen, but it will probably not be what you expect.
235 =cut
# Load each file or filehandle in turn and return the total number of
# feature lines loaded.  Progress messages go to STDERR when -verbose.
sub load {
  my $self  = shift;
  my $start = $self->time();
  my $total = 0;

  for my $source (@_) {
    $self->msg("loading $source...\n");
    my $fh = $self->open_fh($source)
      or $self->throw("Couldn't open $source: $!");
    $total += $self->load_fh($fh);
    # elapsed time is cumulative since load() began
    $self->msg(sprintf "load time: %5.2fs\n", $self->time() - $start);
  }

  # optionally rebuild the coverage-summary statistics (slow, not incremental)
  if ($self->summary_stats) {
    $self->msg("Building summary statistics for coverage graphs...");
    my $stats_start = $self->time();
    $self->build_summary;
    $self->msg(sprintf "coverage graph build time: %5.2fs\n",
	       $self->time() - $stats_start);
  }

  $self->msg(sprintf "total load time: %5.2fs\n", $self->time() - $start);
  return $total;
}
259 =head2 accessors
261 The following read-only accessors return values passed or created during new():
263 store() the long-term Bio::DB::SeqFeature::Store object
265 tmp_store() the temporary Bio::DB::SeqFeature::Store object used
266 during loading
268 sfclass() the Bio::SeqFeatureI class
270 fast() whether fast loading is active
272 seq_chunk_size() the sequence chunk size
274 verbose() verbose progress messages
276 =cut
# Read-only accessors for values captured at construction time.
sub store          { my $self = shift; return $self->{store} }           # main database handle
sub tmp_store      { my $self = shift; return $self->{tmp_store} }       # temporary cache database
sub sfclass        { my $self = shift; return $self->{seqfeature_class} } # SeqFeatureI class name
sub fast           { my $self = shift; return $self->{fast} }            # bulk-loading mode flag
sub seq_chunk_size { my $self = shift; return $self->{seq_chunk_size} }  # DNA/protein chunk size
sub verbose        { my $self = shift; return $self->{verbose} }         # progress-message flag
285 =head2 Internal Methods
287 The following methods are used internally and may be overridden by
288 subclasses.
290 =over 4
292 =item default_seqfeature_class
294 $class = $loader->default_seqfeature_class
296 Return the default SeqFeatureI class (Bio::DB::SeqFeature).
298 =cut
# The SeqFeatureI class used when the caller does not supply -sf_class.
sub default_seqfeature_class {
  my $self = shift;
  return 'Bio::DB::SeqFeature';
}
305 =item subfeatures_normalized
307 $flag = $loader->subfeatures_normalized([$new_flag])
309 Get or set a flag that indicates that the subfeatures are
310 normalized. This is deduced from the SeqFeature class information.
312 =cut
# Get/set the "subfeatures are normalized" flag (deduced from the
# SeqFeature class in new()); returns the previous value.
sub subfeatures_normalized {
  my $self     = shift;
  my $previous = $self->{subfeatures_normalized};
  $self->{subfeatures_normalized} = shift if @_;
  return $previous;
}
321 =item subfeatures_in_table
323 $flag = $loader->subfeatures_in_table([$new_flag])
325 Get or set a flag that indicates that feature/subfeature relationships
326 are stored in a table. This is deduced from the SeqFeature class and
327 Store information.
329 =cut
# Get/set the "parent/child links live in a table" flag (deduced from the
# SeqFeature class and Store in new()); returns the previous value.
sub subfeatures_in_table {
  my $self     = shift;
  my $previous = $self->{subfeatures_in_table};
  $self->{subfeatures_in_table} = shift if @_;
  return $previous;
}
338 =item load_fh
340 $count = $loader->load_fh($filehandle)
342 Load the GFF3 data at the other end of the filehandle and return true
343 if successful. Internally, load_fh() invokes:
345 start_load();
346 do_load($filehandle);
347 finish_load();
349 =cut
# Load one already-open filehandle: start_load() / do_load() / finish_load().
# Returns the number of feature lines loaded.
sub load_fh {
  my ($self, $fh) = @_;
  $self->start_load();
  my $loaded = $self->do_load($fh);
  $self->finish_load();
  return $loaded;
}
361 =item start_load, finish_load
363 These methods are called at the start and end of a filehandle load.
365 =cut
# Called at the start of a filehandle load: reset per-file parse state and,
# in fast mode, switch the store into bulk-update mode.
sub start_load {
  my $self = shift;
  $self->create_load_data;
  $self->store->start_bulk_update() if $self->fast;
}
# (Re)initialize the per-file bookkeeping hash used while parsing.
sub create_load_data {
  my $self = shift;
  my $ld = $self->{load_data} ||= {};
  $ld->{CurrentFeature} = undef;  # feature currently being assembled
  $ld->{CurrentID}      = undef;  # its load-file (local) ID
  $ld->{IndexIt}        = {};     # local ID -> should it be indexed?
  $ld->{Local2GlobalID} = {};     # local ID -> database primary id
  $ld->{count}          = 0;      # number of feature lines loaded
  $ld->{mode}           = undef;  # current parse mode (e.g. fasta vs features)
  $ld->{start_time}     = 0;
}
# Drop all per-file bookkeeping (also discards the loaded-ID maps).
sub delete_load_data {
  my $self = shift;
  delete $self->{load_data};
}
# Called at the end of a filehandle load: flush any half-built feature or
# sequence, finish the bulk update in fast mode, and commit if supported.
sub finish_load {
  my $self = shift;

  $self->store_current_feature();    # fast loading leaves one feature pending
  $self->start_or_finish_sequence(); # flush a half-loaded FASTA sequence

  if ($self->fast) {
    $self->{load_data}{start_time} = $self->time();
    $self->store->finish_bulk_update;
  }
  $self->msg(sprintf "%5.2fs\n", $self->time() - $self->{load_data}{start_time});

  # commit() is optional on some adaptors; ignore failures deliberately
  eval { $self->store->commit };

  # don't delete load data so that caller can ask for the loaded IDs
  # $self->delete_load_data;
}
406 =item build_summary
408 $loader->build_summary
410 Call this to rebuild the summary coverage statistics. This is done automatically
411 if new() was passed a true value for -summary_stats at create time.
413 =cut
# Rebuild the coverage-summary statistics in the underlying store.
# Invoked automatically from load() when -summary_stats was true.
sub build_summary {
  my $self = shift;
  $self->store->build_summary_statistics;
}
420 =item do_load
422 $count = $loader->do_load($fh)
424 This is called by load_fh() to load the GFF3 file's filehandle and
425 return the number of lines loaded.
427 =cut
# Feed every line of the filehandle through load_line() and return the
# number of feature lines loaded.
sub do_load {
  my ($self, $fh) = @_;

  $self->{load_data}{start_time}     = $self->time();
  $self->{load_data}{millenium_time} = $self->{load_data}{start_time};

  $self->load_line($_) while <$fh>;

  # trailing spaces erase any shorter progress line left by \r updates
  $self->msg(sprintf "%d features loaded in %5.2fs%s\r",
	     $self->{load_data}{count},
	     $self->time() - $self->{load_data}{start_time},
	     ' ' x 80);
  return $self->{load_data}{count};
}
444 =item load_line
446 $loader->load_line($data);
448 Load a line of a GFF3 file. You must bracket this with calls to
449 start_load() and finish_load()!
451 $loader->start_load();
452 $loader->load_line($_) while <FH>;
453 $loader->finish_load();
455 =cut
# Base-class stub: subclasses override this to parse one line of input.
sub load_line {
  my $self = shift;
  my $line = shift;
  # intentionally a no-op here
}
464 =item handle_feature
466 $loader->handle_feature($data_line)
468 This method is called to process a single data line. It manipulates
information stored in a data structure called $self-E<gt>{load_data}.
471 =cut
# Base-class stub: subclasses override this to process one feature line,
# updating the state in $self->{load_data}.
sub handle_feature {
  my $self = shift;
  my $line = shift;
  # intentionally a no-op here
}
479 =item handle_meta
481 $loader->handle_meta($data_line)
483 This method is called to process a single data line. It manipulates
information stored in a data structure called $self-E<gt>{load_data}.
486 =cut
# Base-class stub: subclasses override this to process one metadata
# (directive) line, updating the state in $self->{load_data}.
sub handle_meta {
  my $self = shift;
  my $line = shift;
  # intentionally a no-op here
}
# Get/set the "should this local ID be indexed?" flag in load_data.
sub _indexit {
  my ($self, $id, @new_value) = @_;
  $id ||= '';  # avoid uninit warnings
  my $index = $self->{load_data}{IndexIt};
  $index->{$id} = $new_value[0] if @new_value;
  return $index->{$id};
}
# Get/set the mapping from a load-file (local) ID to the database primary id.
sub _local2global {
  my ($self, $id, @new_value) = @_;
  $id ||= '';  # avoid uninit warnings
  my $map = $self->{load_data}{Local2GlobalID};
  $map->{$id} = $new_value[0] if @new_value;
  return $map->{$id};
}
512 =item store_current_feature
514 $loader->store_current_feature()
516 This method is called to store the currently active feature in the
517 database. It uses a data structure stored in $self-E<gt>{load_data}.
519 =cut
# Persist the feature currently being assembled, then clear the "current"
# slots.  Destination depends on indexing and normalization:
#   1. indexed feature            -> main database, searchable
#   2. normalized but not indexed -> main database via store_noindex(); it is
#      reachable through its top-level parent but not by range/attribute search
#   3. otherwise                  -> temporary database, to be integrated into
#      its object tree and copied to the main database at build time
sub store_current_feature {
  my $self = shift;

  my $ld = $self->{load_data};
  defined $ld->{CurrentFeature} or return;   # nothing pending

  my $feature    = $ld->{CurrentFeature};
  my $normalized = $self->subfeatures_normalized;
  my $indexed    = $self->_indexit($ld->{CurrentID});

  if ($indexed) {
    $self->store->store($feature);
  }
  elsif ($normalized) {
    $self->store->store_noindex($feature);
  }
  else {
    $self->tmp_store->store_noindex($feature);
  }

  my $primary_id = $feature->primary_id;     # assigned by store()
  $self->_local2global($ld->{CurrentID} => $primary_id);
  $self->_indexit($ld->{CurrentID} => 0) if $normalized;  # no need to remember this

  undef $ld->{CurrentID};
  undef $ld->{CurrentFeature};
}
563 =item parse_attributes
565 ($reserved,$unreserved) = $loader->parse_attributes($attribute_line)
567 This method parses the information contained in the $attribute_line
568 into two hashrefs, one containing the values of reserved attribute
569 tags (e.g. ID) and the other containing the values of unreserved ones.
571 =cut
573 sub parse_attributes {
574 my $self = shift;
575 my $att = shift;
576 # do nothing
579 =item start_or_finish_sequence
581 $loader->start_or_finish_sequence('Chr9')
583 This method is called at the beginning and end of a fasta section.
585 =cut
# Called at the beginning and end of a FASTA section.  Flushes any sequence
# accumulated so far into the store, then (if a new seqid is given) starts
# accumulating the next sequence.
sub start_or_finish_sequence {
  my $self  = shift;
  my $seqid = shift;

  # finish the previous sequence, if one is in progress
  if (my $pending = $self->{fasta_load}) {
    if (defined $pending->{seqid}) {
      $self->store->insert_sequence($pending->{seqid},
				    $pending->{sequence},
				    $pending->{offset});
      delete $self->{fasta_load};
    }
  }

  # begin a new one
  if (defined $seqid) {
    $self->{fasta_load} = { seqid    => $seqid,
			    offset   => 0,
			    sequence => '' };
  }
}
604 =item load_sequence
606 $loader->load_sequence('gatttcccaaa')
608 This method is called to load some amount of sequence after
609 start_or_finish_sequence() is first called.
611 =cut
# Append sequence residues to the in-progress FASTA section, writing
# full chunks of seq_chunk_size to the store as they accumulate.
# No-op unless start_or_finish_sequence() opened a section first.
sub load_sequence {
  my ($self, $residues) = @_;

  my $pending = $self->{fasta_load} or return;
  my $chunk_size = $self->seq_chunk_size;

  $pending->{sequence} .= $residues;
  while (length $pending->{sequence} >= $chunk_size) {
    my $chunk = substr($pending->{sequence}, 0, $chunk_size);
    $self->store->insert_sequence($pending->{seqid}, $chunk, $pending->{offset});
    $pending->{offset} += length $chunk;
    substr($pending->{sequence}, 0, $chunk_size) = '';  # drop the written chunk
  }
}
627 =item open_fh
629 my $io_file = $loader->open_fh($filehandle_or_path)
631 This method opens up the indicated file or pipe, using some
632 intelligence to recognized compressed files and URLs and doing the
633 right thing.
635 =cut
# Turn a path, URL, or already-open filehandle into a readable filehandle.
# Compressed files (.gz/.Z/.bz2) are piped through the matching decompressor;
# http:/ftp: URLs go through LWP's GET program.
# NOTE(review): paths are interpolated into shell pipes unescaped -- do not
# pass untrusted filenames here.
sub open_fh {
  my ($self, $source) = @_;

  no strict 'refs';
  return $source if defined fileno($source);   # already an open filehandle
  return IO::File->new("gunzip -c $source |")     if $source =~ /\.gz$/;
  return IO::File->new("uncompress -c $source |") if $source =~ /\.Z$/;
  return IO::File->new("bunzip2 -c $source |")    if $source =~ /\.bz2$/;
  return IO::File->new("GET $source |")           if $source =~ /^(http|ftp):/;
  return $source if ref $source && $source->isa('IO::String');
  return IO::File->new($source);               # plain file path
}
# Emit a progress message to STDERR, but only when -verbose was requested.
sub msg {
  my ($self, @message) = @_;
  return unless $self->verbose;
  print STDERR @message;
}
660 =item loaded_ids
662 my $ids = $loader->loaded_ids;
663 my $id_cnt = @$ids;
665 After performing a load, this returns an array ref containing all the
666 feature primary ids that were created during the load.
668 =cut
# Return an array ref of every database primary id created during the load.
# Empty list if no load has been performed.
sub loaded_ids {
  my $self = shift;
  # Declare first, then assign conditionally: "my @ids = EXPR if COND" is
  # the deprecated "my EXPR if" construct with undefined (sticky) semantics.
  my @ids;
  @ids = values %{ $self->{load_data}{Local2GlobalID} }
    if $self->{load_data};
  return \@ids;
}
677 =item local_ids
679 my $ids = $self->local_ids;
680 my $id_cnt = @$ids;
682 After performing a load, this returns an array ref containing all the
683 load file IDs that were contained within the file just loaded.
685 =cut
# Return an array ref of every load-file (local) ID seen in the file just
# loaded.  Empty list if no load has been performed.
sub local_ids {
  my $self = shift;
  # Declare first, then assign conditionally: "my @ids = EXPR if COND" is
  # the deprecated "my EXPR if" construct with undefined (sticky) semantics.
  my @ids;
  @ids = keys %{ $self->{load_data}{Local2GlobalID} }
    if $self->{load_data};
  return \@ids;
}
694 =item time
696 my $time = $loader->time
698 This method returns the current time in seconds, using Time::HiRes if available.
700 =cut
# Current time in (possibly fractional) seconds.  Uses Time::HiRes when it
# was loaded (new() attempts to require it), else the one-second builtin.
# NB: defining "sub time" does not override CORE::time for ordinary calls,
# so CORE::time() below is spelled out purely for clarity.
sub time {
  return Time::HiRes::time() if Time::HiRes->can('time');
  return CORE::time();
}
707 =item unescape
709 my $unescaped = GFF3Loader::unescape($escaped)
711 This is an internal utility. It is the same as CGI::Util::unescape,
712 but doesn't change pluses into spaces and ignores unicode escapes.
714 =cut
# Decode %XX hex escapes in a string.  Unlike CGI::Util::unescape this
# leaves '+' untouched and ignores unicode (%uXXXX) escapes.
sub unescape {
  my ($self, $escaped) = @_;
  $escaped =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  return $escaped;
}
# Destructor: close filehandles so temporary files can be deleted, and
# remove the temporary load directory if one was created.
sub DESTROY {
  my $self  = shift;
  my $store = $self->store;

  if (   $store->isa('Bio::DB::SeqFeature::Store::memory')
      or $store->isa('Bio::DB::SeqFeature::Store::berkeleydb3')) {
    $store->private_fasta_file->close;

    # close every cached FASTA handle plus the offsets index, unless the
    # caller asked us to leave the FASTA database alone (-no_close_fasta)
    if ($store->{fasta_db} && !$self->{no_close_fasta}) {
      while (my ($file, $fh) = each %{ $store->{fasta_db}->{fhcache} }) {
	$fh->close;
      }
      $store->{fasta_db}->_close_index($store->{fasta_db}->{offsets});
    }
  }
  elsif ($store->isa('Bio::DB::SeqFeature::Store::DBI::SQLite')) {
    # tear down all DBI driver handles so SQLite releases its file locks
    if (%DBI::installed_drh) {
      DBI->disconnect_all;
      %DBI::installed_drh = ();
    }
    undef $store->{dbh};
  }

  if (my $tmp = $self->{temp_load}) {
    unlink $tmp;
  }
}
753 __END__
755 =back
757 =head1 BUGS
759 This is an early version, so there are certainly some bugs. Please
760 use the BioPerl bug tracking system to report bugs.
762 =head1 SEE ALSO
764 L<bioperl>,
765 L<Bio::DB::SeqFeature::Store>,
766 L<Bio::DB::SeqFeature::Segment>,
767 L<Bio::DB::SeqFeature::NormalizedFeature>,
768 L<Bio::DB::SeqFeature::Store::GFF3Loader>,
769 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
770 L<Bio::DB::SeqFeature::Store::berkeleydb>
772 =head1 AUTHOR
774 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
776 Copyright (c) 2006 Cold Spring Harbor Laboratory.
778 This library is free software; you can redistribute it and/or modify
779 it under the same terms as Perl itself.
781 =cut