1 package Bio
::DB
::SeqFeature
::Store
::Loader
;
6 Bio::DB::SeqFeature::Store::Loader -- Loader
10 # non-instantiable base class
14 This is the base class for Bio::DB::SeqFeature::Loader::GFF3Loader,
15 Bio::DB::SeqFeature::Loader::GFFLoader, and
16 Bio::DB::SeqFeature::FeatureFileLoader. Please see the manual pages
22 # load utility - incrementally load the store based on GFF3 file
25 # slow mode -- features can occur in any order in the GFF3 file
26 # fast mode -- all features with same ID must be contiguous in GFF3 file
31 use Bio
::DB
::GFF
::Util
::Rearrange
;
32 use Bio
::DB
::SeqFeature
::Store
;
34 use File
::Temp
'tempdir';
35 use base
'Bio::Root::Root';
37 use constant DEFAULT_SEQ_CHUNK_SIZE
=> 2000;
42 Usage : $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(@options)
43 Function: create a new parser
44 Returns : a Bio::DB::SeqFeature::Store::GFF3Loader gff3 parser and loader
45 Args : several - see below
48 This method creates a new GFF3 loader and establishes its connection
49 with a Bio::DB::SeqFeature::Store database. Arguments are -name=E<gt>$value
50 pairs as described in this table:
55 -store A writable Bio::DB::SeqFeature::Store database handle.
57 -seqfeature_class The name of the type of Bio::SeqFeatureI object to create
58 and store in the database (Bio::DB::SeqFeature by default)
60 -sf_class A shorter alias for -seqfeature_class
62 -verbose Send progress information to standard error.
64 -fast If true, activate fast loading (see below)
66 -chunk_size Set the storage chunk size for nucleotide/protein sequences
69 -tmp Indicate a temporary directory to use when loading non-normalized
72 -map_coords A code ref that will transform a list of ($ref,[$start1,$end1]...)
73 coordinates into a list of ($newref,[$newstart1,$newend1]...)
75 -index_subfeatures Indicate true if subfeatures should be indexed. Default is true.
77 -summary_stats Rebuild summary stats at the end of loading (not incremental,
80 When you call new(), a connection to a Bio::DB::SeqFeature::Store
81 database should already have been established and the database
82 initialized (if appropriate).
84 Some combinations of Bio::SeqFeatures and Bio::DB::SeqFeature::Store
85 databases support a fast loading mode. Currently the only reliable
86 implementation of fast loading is the combination of DBI::mysql with
87 Bio::DB::SeqFeature. The other important restriction on fast loading
88 is the requirement that a feature that contains subfeatures must occur
89 in the GFF3 file before any of its subfeatures. Otherwise the
90 subfeatures that occurred before the parent feature will not be
91 attached to the parent correctly. This restriction does not apply to
92 normal (slow) loading.
94 If you use an unnormalized feature class, such as
95 Bio::SeqFeature::Generic, then the loader needs to create a temporary
96 database in which to cache features until all their parts and subparts
97 have been seen. This temporary databases uses the "berkeleydb" adaptor. The
98 -tmp option specifies the directory in which that database will be
99 created. If not present, it defaults to the system default tmp
100 directory specified by File::Spec-E<gt>tmpdir().
102 The -chunk_size option allows you to tune the representation of
103 DNA/Protein sequence in the Store database. By default, sequences are
104 split into 2000 base/residue chunks and then reassembled as
105 needed. This avoids the problem of pulling a whole chromosome into
106 memory in order to fetch a short subsequence from somewhere in the
107 middle. Depending on your usage patterns, you may wish to tune this
108 parameter using a chunk size that is larger or smaller than the
115 my ($store,$seqfeature_class,$tmpdir,$verbose,$fast,
116 $seq_chunk_size,$coordinate_mapper,$index_subfeatures,$summary_stats,$no_close_fasta) =
118 ['SF_CLASS','SEQFEATURE_CLASS'],
130 $seqfeature_class ||= $self->default_seqfeature_class;
131 eval "require $seqfeature_class" unless $seqfeature_class->can('new');
132 $self->throw($@
) if $@
;
134 my $normalized = $seqfeature_class->can('subfeatures_are_normalized')
135 && $seqfeature_class->subfeatures_are_normalized;
137 my $in_table = $seqfeature_class->can('subfeatures_are_stored_in_a_table')
138 && $seqfeature_class->subfeatures_are_stored_in_a_table;
141 my $canfast = $normalized && $in_table;
142 warn <<END unless $canfast;
143 Only features that support the Bio::DB::SeqFeature::NormalizedTableFeature interface
144 can be loaded using the -fast method. Reverting to slower feature-by-feature method.
149 # try to bring in highres time() function
150 eval "require Time::HiRes";
152 $tmpdir ||= File
::Spec
->tmpdir();
154 my ($tmp_store,$temp_load);
155 unless ($normalized) {
157 # remember the temporary directory in order to delete it on exit
158 $temp_load = tempdir
(
159 'BioDBSeqFeature_XXXXXXX',
164 $tmp_store = Bio
::DB
::SeqFeature
::Store
->new(-adaptor
=> 'berkeleydb',
172 $index_subfeatures = 1 unless defined $index_subfeatures;
176 tmp_store
=> $tmp_store,
177 seqfeature_class
=> $seqfeature_class,
179 seq_chunk_size
=> $seq_chunk_size || DEFAULT_SEQ_CHUNK_SIZE
,
183 temp_load
=> $temp_load,
184 subfeatures_normalized
=> $normalized,
185 subfeatures_in_table
=> $in_table,
186 coordinate_mapper
=> $coordinate_mapper,
187 index_subfeatures
=> $index_subfeatures,
188 summary_stats
=> $summary_stats,
189 no_close_fasta
=> $no_close_fasta,
190 },ref($self) || $self;
193 sub coordinate_mapper
{
195 my $d = $self->{coordinate_mapper
};
196 $self->{coordinate_mapper
} = shift if @_;
200 sub index_subfeatures
{
202 my $d = $self->{index_subfeatures
};
203 $self->{index_subfeatures
} = shift if @_;
210 my $d = $self->{summary_stats
};
211 $self->{summary_stats
} = shift if @_;
218 Usage : $count = $loader->load(@ARGV)
219 Function: load the indicated files or filehandles
220 Returns : number of feature lines loaded
221 Args : list of files or filehandles
224 Once the loader is created, invoke its load() method with a list of
225 GFF3 or FASTA file paths or previously-opened filehandles in order to
226 load them into the database. Compressed files ending with .gz, .Z and
227 .bz2 are automatically recognized and uncompressed on the fly. Paths
228 beginning with http: or ftp: are treated as URLs and opened using the
229 LWP GET program (which must be on your path).
231 FASTA files are recognized by their initial "E<gt>" character. Do not feed
232 the loader a file that is neither GFF3 nor FASTA; I don't know what
233 will happen, but it will probably not be what you expect.
239 my $start = $self->time();
242 for my $file_or_fh (@_) {
243 $self->msg("loading $file_or_fh...\n");
244 my $fh = $self->open_fh($file_or_fh) or $self->throw("Couldn't open $file_or_fh: $!");
245 $count += $self->load_fh($fh);
246 $self->msg(sprintf "load time: %5.2fs\n",$self->time()-$start);
249 if ($self->summary_stats) {
250 $self->msg("Building summary statistics for coverage graphs...");
251 my $start = $self->time();
252 $self->build_summary;
253 $self->msg(sprintf "coverage graph build time: %5.2fs\n",$self->time()-$start);
255 $self->msg(sprintf "total load time: %5.2fs\n",$self->time()-$start);
261 The following read-only accessors return values passed or created during new():
263 store() the long-term Bio::DB::SeqFeature::Store object
265 tmp_store() the temporary Bio::DB::SeqFeature::Store object used
268 sfclass() the Bio::SeqFeatureI class
270 fast() whether fast loading is active
272 seq_chunk_size() the sequence chunk size
274 verbose() verbose progress messages
278 sub store
{ shift->{store
} }
279 sub tmp_store
{ shift->{tmp_store
} }
280 sub sfclass
{ shift->{seqfeature_class
} }
281 sub fast
{ shift->{fast
} }
282 sub seq_chunk_size
{ shift->{seq_chunk_size
} }
283 sub verbose
{ shift->{verbose
} }
285 =head2 Internal Methods
287 The following methods are used internally and may be overridden by
292 =item default_seqfeature_class
294 $class = $loader->default_seqfeature_class
296 Return the default SeqFeatureI class (Bio::DB::SeqFeature).
300 sub default_seqfeature_class
{
302 return 'Bio::DB::SeqFeature';
305 =item subfeatures_normalized
307 $flag = $loader->subfeatures_normalized([$new_flag])
309 Get or set a flag that indicates that the subfeatures are
310 normalized. This is deduced from the SeqFeature class information.
314 sub subfeatures_normalized
{
316 my $d = $self->{subfeatures_normalized
};
317 $self->{subfeatures_normalized
} = shift if @_;
321 =item subfeatures_in_table
323 $flag = $loader->subfeatures_in_table([$new_flag])
325 Get or set a flag that indicates that feature/subfeature relationships
326 are stored in a table. This is deduced from the SeqFeature class and
331 sub subfeatures_in_table
{
333 my $d = $self->{subfeatures_in_table
};
334 $self->{subfeatures_in_table
} = shift if @_;
340 $count = $loader->load_fh($filehandle)
342 Load the GFF3 data at the other end of the filehandle and return true
343 if successful. Internally, load_fh() invokes:
346 do_load($filehandle);
355 my $count = $self->do_load($fh);
356 $self->finish_load();
361 =item start_load, finish_load
363 These methods are called at the start and end of a filehandle load.
369 $self->create_load_data;
370 $self->store->start_bulk_update() if $self->fast;
373 sub create_load_data
{
375 $self->{load_data
}{CurrentFeature
} = undef;
376 $self->{load_data
}{CurrentID
} = undef;
377 $self->{load_data
}{IndexIt
} = {};
378 $self->{load_data
}{Local2GlobalID
} = {};
379 $self->{load_data
}{count
} = 0;
380 $self->{load_data
}{mode
} = undef;
381 $self->{load_data
}{start_time
} = 0;
384 sub delete_load_data
{
386 delete $self->{load_data
};
392 $self->store_current_feature(); # during fast loading, we will have a feature left at the very end
393 $self->start_or_finish_sequence(); # finish any half-loaded sequences
396 $self->{load_data
}{start_time
} = $self->time();
397 $self->store->finish_bulk_update;
399 $self->msg(sprintf "%5.2fs\n",$self->time()-$self->{load_data
}{start_time
});
400 eval {$self->store->commit};
402 # don't delete load data so that caller can ask for the loaded IDs
403 # $self->delete_load_data;
408 $loader->build_summary
410 Call this to rebuild the summary coverage statistics. This is done automatically
411 if new() was passed a true value for -summary_stats at create time.
417 $self->store->build_summary_statistics;
422 $count = $loader->do_load($fh)
424 This is called by load_fh() to load the GFF3 file's filehandle and
425 return the number of lines loaded.
433 $self->{load_data
}{start_time
} = $self->time();
434 $self->{load_data
}->{millenium_time
} = $self->{load_data
}{start_time
};
435 $self->load_line($_) while <$fh>;
436 $self->msg(sprintf "%d features loaded in %5.2fs%s\r",
437 $self->{load_data
}->{count
},
438 $self->time()-$self->{load_data
}{start_time
},
441 $self->{load_data
}{count
};
446 $loader->load_line($data);
448 Load a line of a GFF3 file. You must bracket this with calls to
449 start_load() and finish_load()!
451 $loader->start_load();
452 $loader->load_line($_) while <FH>;
453 $loader->finish_load();
466 $loader->handle_feature($data_line)
468 This method is called to process a single data line. It manipulates
469 information stored a data structure called $self-E<gt>{load_data}.
481 $loader->handle_meta($data_line)
483 This method is called to process a single data line. It manipulates
484 information stored a data structure called $self-E<gt>{load_data}.
497 $id ||= ''; # avoid uninit warnings
498 my $indexhash = $self->{load_data
}{IndexIt
};
499 $indexhash->{$id} = shift if @_;
500 return $indexhash->{$id};
506 $id ||= ''; # avoid uninit warnings
507 my $indexhash = $self->{load_data
}{Local2GlobalID
};
508 $indexhash->{$id} = shift if @_;
509 return $indexhash->{$id};
512 =item store_current_feature
514 $loader->store_current_feature()
516 This method is called to store the currently active feature in the
517 database. It uses a data structure stored in $self-E<gt>{load_data}.
521 sub store_current_feature
{
524 my $ld = $self->{load_data
};
525 defined $ld->{CurrentFeature
} or return;
526 my $f = $ld->{CurrentFeature
};
528 my $normalized = $self->subfeatures_normalized;
529 my $indexed = $self->_indexit($ld->{CurrentID
});
531 # logic is as follows:
532 # 1. If the feature is an indexed feature, then we store it into the main database
533 # so that it can be searched. It doesn't matter whether it is a top-level feature
535 # 2. If the feature class is normalized, but not indexed, then we store it into the
536 # main database using the "no_index" method. This will make it accessible to
537 # queries on the top level parent, but it won't come up by itself in range or
538 # attribute searches.
539 # 3. Otherwise, this is an unindexed subfeature; we store it in the temporary database
540 # until the object build step, at which point it gets integrated into its object tree
541 # and copied into the main database.
544 $self->store->store($f);
547 elsif ($normalized) {
548 $self->store->store_noindex($f)
552 $self->tmp_store->store_noindex($f)
555 my $id = $f->primary_id; # assigned by store()
557 $self->_local2global($ld->{CurrentID
} => $id);
558 $self->_indexit($ld->{CurrentID
} => 0)if $normalized; # no need to remember this
559 undef $ld->{CurrentID
};
560 undef $ld->{CurrentFeature
};
563 =item parse_attributes
565 ($reserved,$unreserved) = $loader->parse_attributes($attribute_line)
567 This method parses the information contained in the $attribute_line
568 into two hashrefs, one containing the values of reserved attribute
569 tags (e.g. ID) and the other containing the values of unreserved ones.
573 sub parse_attributes
{
579 =item start_or_finish_sequence
581 $loader->start_or_finish_sequence('Chr9')
583 This method is called at the beginning and end of a fasta section.
587 # this gets called at the beginning and end of a fasta section
588 sub start_or_finish_sequence
{
591 if (my $sl = $self->{fasta_load
}) {
592 if (defined $sl->{seqid
}) {
593 $self->store->insert_sequence($sl->{seqid
},$sl->{sequence
},$sl->{offset
});
594 delete $self->{fasta_load
};
597 if (defined $seqid) {
598 $self->{fasta_load
} = {seqid
=> $seqid,
606 $loader->load_sequence('gatttcccaaa')
608 This method is called to load some amount of sequence after
609 start_or_finish_sequence() is first called.
616 my $sl = $self->{fasta_load
} or return;
617 my $cs = $self->seq_chunk_size;
618 $sl->{sequence
} .= $seq;
619 while (length $sl->{sequence
} >= $cs) {
620 my $chunk = substr($sl->{sequence
},0,$cs);
621 $self->store->insert_sequence($sl->{seqid
},$chunk,$sl->{offset
});
622 $sl->{offset
} += length $chunk;
623 substr($sl->{sequence
},0,$cs) = '';
629 my $io_file = $loader->open_fh($filehandle_or_path)
631 This method opens up the indicated file or pipe, using some
632 intelligence to recognized compressed files and URLs and doing the
644 return $thing if defined fileno($thing);
645 return IO
::File
->new("gunzip -c $thing |") if $thing =~ /\.gz$/;
646 return IO
::File
->new("uncompress -c $thing |") if $thing =~ /\.Z$/;
647 return IO
::File
->new("bunzip2 -c $thing |") if $thing =~ /\.bz2$/;
648 return IO
::File
->new("GET $thing |") if $thing =~ /^(http|ftp):/;
649 return $thing if ref $thing && $thing->isa('IO::String');
650 return IO
::File
->new($thing);
656 return unless $self->verbose;
662 my $ids = $loader->loaded_ids;
665 After performing a load, this returns an array ref containing all the
666 feature primary ids that were created during the load.
672 my @ids = values %{$self->{load_data
}{Local2GlobalID
}}
673 if $self->{load_data
};
679 my $ids = $self->local_ids;
682 After performing a load, this returns an array ref containing all the
683 load file IDs that were contained within the file just loaded.
689 my @ids = keys %{$self->{load_data
}{Local2GlobalID
}}
690 if $self->{load_data
};
696 my $time = $loader->time
698 This method returns the current time in seconds, using Time::HiRes if available.
703 return Time
::HiRes
::time() if Time
::HiRes
->can('time');
709 my $unescaped = GFF3Loader::unescape($escaped)
711 This is an internal utility. It is the same as CGI::Util::unescape,
712 but doesn't change pluses into spaces and ignores unicode escapes.
718 my $todecode = shift;
719 $todecode =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
725 # Close filehandles, so temporal files can be properly deleted
726 my $store = $self->store;
727 if ( $store->isa('Bio::DB::SeqFeature::Store::memory')
728 or $store->isa('Bio::DB::SeqFeature::Store::berkeleydb3')
730 $store->private_fasta_file->close;
732 if ($store->{fasta_db
} && !$self->{no_close_fasta
}) {
733 while (my ($file, $fh) = each %{ $store->{fasta_db
}->{fhcache
} }) {
736 $store->{fasta_db
}->_close_index($store->{fasta_db
}->{offsets
});
739 elsif ($store->isa('Bio::DB::SeqFeature::Store::DBI::SQLite')) {
740 if (%DBI::installed_drh
) {
742 %DBI::installed_drh
= ();
747 if (my $ld = $self->{temp_load
}) {
759 This is an early version, so there are certainly some bugs. Please
760 use the BioPerl bug tracking system to report bugs.
765 L<Bio::DB::SeqFeature::Store>,
766 L<Bio::DB::SeqFeature::Segment>,
767 L<Bio::DB::SeqFeature::NormalizedFeature>,
768 L<Bio::DB::SeqFeature::Store::GFF3Loader>,
769 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
770 L<Bio::DB::SeqFeature::Store::berkeleydb>
774 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
776 Copyright (c) 2006 Cold Spring Harbor Laboratory.
778 This library is free software; you can redistribute it and/or modify
779 it under the same terms as Perl itself.