tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / SeqFeature / Store / GFF3Loader.pm
blobec01d69951c5e46c3231e12c535e4aa8384f25cb
1 package Bio::DB::SeqFeature::Store::GFF3Loader;
3 # $Id$
5 =head1 NAME
7 Bio::DB::SeqFeature::Store::GFF3Loader -- GFF3 file loader for Bio::DB::SeqFeature::Store
9 =head1 SYNOPSIS
11 use Bio::DB::SeqFeature::Store;
12 use Bio::DB::SeqFeature::Store::GFF3Loader;
14 # Open the sequence database
15 my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'DBI::mysql',
16 -dsn => 'dbi:mysql:test',
17 -write => 1 );
19 my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store => $db,
20 -verbose => 1,
21 -fast => 1);
23 $loader->load('./my_genome.gff3');
26 =head1 DESCRIPTION
28 The Bio::DB::SeqFeature::Store::GFF3Loader object parses GFF3-format
29 sequence annotation files and loads Bio::DB::SeqFeature::Store
30 databases. For certain combinations of SeqFeature classes and
31 SeqFeature::Store databases it features a "fast load" mode which will
32 greatly accelerate the loading of GFF3 databases by a factor of 5-10.
34 The GFF3 file format has been extended very slightly to accommodate
35 Bio::DB::SeqFeature::Store. First, the loader recognizes a new
36 directive:
38 # #index-subfeatures [0|1]
40 Note that you can place a space between the two #'s in order to
41 prevent GFF3 validators from complaining.
43 If this is true, then subfeatures are indexed (the default) so that
44 they can be retrieved with a query. See L<Bio::DB::SeqFeature::Store>
45 for an explanation of this. If false, then subfeatures can only be
46 accessed through their parent feature.
48 Second, the loader recognizes a new attribute tag called index, which
49 if present, controls indexing of the current feature. Example:
51 ctg123 . TF_binding_site 1000 1012 . + . ID=tfbs00001;index=1
53 You can use this to turn indexing on and off, overriding the default
54 for a particular feature.
56 Note that the loader keeps a record -- in memory -- of each feature
57 that it has processed. If you find the loader running out of memory on
58 particularly large GFF3 files, please split the input file into
59 smaller pieces and do the load in steps.
61 =cut
64 # load utility - incrementally load the store based on GFF3 file
66 # two modes:
67 # slow mode -- features can occur in any order in the GFF3 file
68 # fast mode -- all features with same ID must be contiguous in GFF3 file
70 use strict;
71 use Carp 'croak';
72 use Bio::DB::GFF::Util::Rearrange;
73 use Bio::DB::SeqFeature::Store::LoadHelper;
75 use base 'Bio::DB::SeqFeature::Store::Loader';
# Attribute tags that the loader interprets itself. parse_attributes()
# files values for these tags in the "reserved" hash rather than the
# "unreserved" one.
my %Special_attributes = map { $_ => 1 } qw(
    Gap    Target
    Parent Name
    Alias  ID
    index  Index
);

# Lookup table used by handle_feature() to turn the strand column (or an
# already-numeric strand) into -1, 0 or +1.
my %Strandedness = (
    '+'     => 1,
    '-'     => -1,
    '.'     => 0,
    ''      => 0,
    '0'     => 0,
    '1'     => 1,
    '-1'    => -1,
    'undef' => 0,    # literal string key: a bareword left of => is auto-quoted
);
95 =head2 new
97 Title : new
98 Usage : $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(@options)
99 Function: create a new parser
100 Returns : a Bio::DB::SeqFeature::Store::GFF3Loader gff3 parser and loader
101 Args : several - see below
102 Status : public
104 This method creates a new GFF3 loader and establishes its connection
105 with a Bio::DB::SeqFeature::Store database. Arguments are -name=E<gt>$value
106 pairs as described in this table:
108 Name Value
109 ---- -----
111 -store A writeable Bio::DB::SeqFeature::Store database handle.
113 -seqfeature_class The name of the type of Bio::SeqFeatureI object to create
114 and store in the database (Bio::DB::SeqFeature by default)
116 -sf_class A shorter alias for -seqfeature_class
118 -verbose Send progress information to standard error.
120 -fast If true, activate fast loading (see below)
122 -chunk_size Set the storage chunk size for nucleotide/protein sequences
123 (default 2000 bytes)
125 -tmp Indicate a temporary directory to use when loading non-normalized
126 features.
128 -ignore_seqregion Ignore ##sequence-region directives. The default is to create a
129 feature corresponding to the directive.
131 When you call new(), a connection to a Bio::DB::SeqFeature::Store
132 database should already have been established and the database
133 initialized (if appropriate).
135 Some combinations of Bio::SeqFeatures and Bio::DB::SeqFeature::Store
136 databases support a fast loading mode. Currently the only reliable
137 implementation of fast loading is the combination of DBI::mysql with
138 Bio::DB::SeqFeature. The other important restriction on fast loading
139 is the requirement that a feature that contains subfeatures must occur
140 in the GFF3 file before any of its subfeatures. Otherwise the
141 subfeatures that occurred before the parent feature will not be
142 attached to the parent correctly. This restriction does not apply to
143 normal (slow) loading.
145 If you use an unnormalized feature class, such as
146 Bio::SeqFeature::Generic, then the loader needs to create a temporary
147 database in which to cache features until all their parts and subparts
148 have been seen. This temporary database uses the "berkeleydb" adaptor. The
149 -tmp option specifies the directory in which that database will be
150 created. If not present, it defaults to the system default tmp
151 directory specified by File::Spec-E<gt>tmpdir().
153 The -chunk_size option allows you to tune the representation of
154 DNA/Protein sequence in the Store database. By default, sequences are
155 split into 2000 base/residue chunks and then reassembled as
156 needed. This avoids the problem of pulling a whole chromosome into
157 memory in order to fetch a short subsequence from somewhere in the
158 middle. Depending on your usage patterns, you may wish to tune this
159 parameter using a chunk size that is larger or smaller than the
160 default.
162 =cut
# Constructor. Delegates to the parent class constructor with the full
# option list, then extracts the loader-specific -ignore_seqregion option
# and records it through the ignore_seqregion() accessor.
#
# Args    : the -name => $value options described in the POD above
# Returns : the new loader object
sub new {
    my ($class, @options) = @_;

    my $self = $class->SUPER::new(@options);

    my ($skip_regions) = rearrange(['IGNORE_SEQREGION'], @options);
    $self->ignore_seqregion($skip_regions);

    return $self;
}
172 =head2 ignore_seqregion
174 $ignore_it = $loader->ignore_seqregion([$new_flag])
176 Get or set the ignore_seqregion flag, which if true, will cause
177 GFF3 ##sequence-region directives to be ignored. The default behavior
178 is to create a feature corresponding to the region.
180 =cut
# Get or set the ignore_seqregion flag.
#
# Args    : optional new flag value
# Returns : the value the flag held *before* this call, so a call without
#           arguments acts as a plain getter.
sub ignore_seqregion {
    my $self     = shift;
    my $previous = $self->{ignore_seqregion};
    $self->{ignore_seqregion} = shift if @_;
    return $previous;
}
189 =head2 load
191 Title : load
192 Usage : $count = $loader->load(@ARGV)
193 Function: load the indicated files or filehandles
194 Returns : number of feature lines loaded
195 Args : list of files or filehandles
196 Status : public
198 Once the loader is created, invoke its load() method with a list of
199 GFF3 or FASTA file paths or previously-opened filehandles in order to
200 load them into the database. Compressed files ending with .gz, .Z and
201 .bz2 are automatically recognized and uncompressed on the fly. Paths
202 beginning with http: or ftp: are treated as URLs and opened using the
203 LWP GET program (which must be on your path).
205 FASTA files are recognized by their initial "E<gt>" character. Do not feed
206 the loader a file that is neither GFF3 nor FASTA; I don't know what
207 will happen, but it will probably not be what you expect.
209 =cut
211 # sub load { } inherited
213 =head2 accessors
215 The following read-only accessors return values passed or created during new():
217 store() the long-term Bio::DB::SeqFeature::Store object
219 tmp_store() the temporary Bio::DB::SeqFeature::Store object used
220 during loading
222 sfclass() the Bio::SeqFeatureI class
224 fast() whether fast loading is active
226 seq_chunk_size() the sequence chunk size
228 verbose() verbose progress messages
230 =cut
232 # sub store inherited
233 # sub tmp_store inherited
234 # sub sfclass inherited
235 # sub fast inherited
236 # sub seq_chunk_size inherited
237 # sub verbose inherited
239 =head2 Internal Methods
241 The following methods are used internally and may be overridden by
242 subclasses.
244 =over 4
246 =item default_seqfeature_class
248 $class = $loader->default_seqfeature_class
250 Return the default SeqFeatureI class (Bio::DB::SeqFeature).
252 =cut
254 # sub default_seqfeature_class { } inherited
256 =item subfeatures_normalized
258 $flag = $loader->subfeatures_normalized([$new_flag])
260 Get or set a flag that indicates that the subfeatures are
261 normalized. This is deduced from the SeqFeature class information.
263 =cut
265 # sub subfeatures_normalized { } inherited
267 =item subfeatures_in_table
269 $flag = $loader->subfeatures_in_table([$new_flag])
271 Get or set a flag that indicates that feature/subfeature relationships
272 are stored in a table. This is deduced from the SeqFeature class and
273 Store information.
275 =cut
277 # sub subfeatures_in_table { } inherited
279 =item load_fh
281 $count = $loader->load_fh($filehandle)
283 Load the GFF3 data at the other end of the filehandle and return true
284 if successful. Internally, load_fh() invokes:
286 start_load();
287 do_load($filehandle);
288 finish_load();
290 =cut
292 # sub load_fh { } inherited
294 =item start_load, finish_load
296 These methods are called at the start and end of a filehandle load.
298 =cut
# Overrides the parent class method. After letting the parent class set up
# its own load state, seeds the GFF3-specific entries of
# $self->{load_data}: the starting temporary ID, the subfeature indexing
# policy, the parser mode and a fresh LoadHelper.
#
# Returns : the newly created LoadHelper (value of the final assignment)
sub create_load_data {    # overridden
    my ($self) = @_;
    $self->SUPER::create_load_data;

    $self->{load_data}{TemporaryID} = "GFFLoad0000000";

    my $index_policy = $self->index_subfeatures();
    @{ $self->{load_data} }{qw(IndexSubfeatures mode)} = ($index_policy, 'gff');

    my $helper = Bio::DB::SeqFeature::Store::LoadHelper->new($self->{tmpdir});
    return $self->{load_data}{Helper} = $helper;
}
# Overrides the parent class method. Flushes pending state, builds the
# parent/child object tree, completes the bulk update when fast loading is
# active, and finally attempts a commit on the store.
#
# Returns : whatever the eval'd commit returns (undef if commit died)
sub finish_load {    # overridden
    my ($self) = @_;

    # During fast loading a feature is still pending at the very end, and a
    # FASTA section may be half-loaded.
    $self->store_current_feature();
    $self->start_or_finish_sequence();

    # Announce a step, run it, then report the elapsed time.
    my $timed_step = sub {
        my ($label, $work) = @_;
        $self->msg($label);
        my $began = $self->time();
        $work->();
        $self->msg(sprintf "%5.2fs\n", $self->time() - $began);
    };

    $timed_step->("Building object tree...", sub { $self->build_object_tree });

    if ($self->fast) {
        $timed_step->(
            "Loading bulk data into database...",
            sub { $self->store->finish_bulk_update },
        );
    }

    # The load data is deliberately kept so that the caller can still ask
    # for the loaded IDs afterwards.
    # $self->delete_load_data;
    return eval { $self->store->commit };
}
333 =item do_load
335 $count = $loader->do_load($fh)
337 This is called by load_fh() to load the GFF3 file's filehandle and
338 return the number of lines loaded.
340 =cut
342 # sub do_load { } inherited
344 =item load_line
346 $loader->load_line($data);
348 Load a line of a GFF3 file. You must bracket this with calls to
349 start_load() and finish_load()!
351 $loader->start_load();
352 $loader->load_line($_) while <FH>;
353 $loader->finish_load();
355 =cut
# Overrides the parent class method. Processes one line of input and
# dispatches it according to its shape and the current parser mode
# ($self->{load_data}{mode}, either 'gff' or 'fasta'):
#
#   "##..." / "# #..."  meta directive   -> handle_meta()
#   "#..."              comment          -> ignored
#   ">name"             FASTA header     -> start_or_finish_sequence()
#   anything else       sequence data or a feature line, depending on mode
#
# Args   : the raw line (a trailing newline is chomped)
# Throws : via $self->throw if the line cannot be classified
sub load_line {    # overridden
    my ($self, $line) = @_;

    chomp($line);
    my $load_data = $self->{load_data};
    $load_data->{line}++;

    return unless $line =~ /^\S/;    # blank line

    # A tab means tab-delimited GFF, so switch to gff mode. The test must be
    # made against $line: matching against $_ only worked when the caller
    # happened to leave the current line in $_.
    $load_data->{mode} = 'gff' if $line =~ /\t/;

    if ($line =~ /^\#\s?\#\s*(.+)/) {    ## meta instruction
        $load_data->{mode} = 'gff';
        $self->handle_meta($1);
    }
    elsif ($line =~ /^\#/) {
        $load_data->{mode} = 'gff';      # just to be safe
        return;                          # comment
    }
    elsif ($line =~ /^>\s*(\S+)/) {      # FASTA lines are coming
        $load_data->{mode} = 'fasta';
        $self->start_or_finish_sequence($1);
    }
    elsif ($load_data->{mode} eq 'fasta') {
        $self->load_sequence($line);
    }
    elsif ($load_data->{mode} eq 'gff') {
        $self->handle_feature($line);

        # Progress report every 1000 feature lines.
        if (++$load_data->{count} % 1000 == 0) {
            my $now = $self->time();
            my $nl  = -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
            local $^W = 0;    # kill uninit variable warning
            $self->msg(sprintf("%d features loaded in %5.2fs (%5.2fs/1000 features)...%s$nl",
                               $load_data->{count}, $now - $load_data->{start_time},
                               $now - $load_data->{millenium_time},
                               ' ' x 80
                       ));
            $load_data->{millenium_time} = $now;
        }
    }
    else {
        $self->throw("I don't know what to do with this line:\n$line");
    }
}
407 =item handle_meta
409 $loader->handle_meta($meta_directive)
411 This method is called to handle meta-directives such as
412 ##sequence-region. The method will receive the directive with the
413 initial ## stripped off.
415 =cut
# Handles a meta directive whose leading "##" has already been stripped.
#
#   "#"                (i.e. "###") flushes the pending feature/sequence and
#                      forwards to the store's handle_resolution_meta, if any
#   sequence-region    stores a 'region' feature for the declared span,
#                      unless ignore_seqregion() is true
#   index-subfeatures  updates the subfeature indexing policy
#   anything else      forwarded to the store's handle_unrecognized_meta,
#                      if the store implements it
#
# Args : the directive text
sub handle_meta {
    my $self        = shift;
    my $instruction = shift;

    if ($instruction =~ /^#$/) {
        $self->store_current_feature();       # during fast loading, we will have a feature left at the very end
        $self->start_or_finish_sequence();    # finish any half-loaded sequences
        if ($self->store->can('handle_resolution_meta')) {
            $self->store->handle_resolution_meta($instruction);
        }
        return;
    }

    if ($instruction =~ /sequence-region\s+(.+)\s+(-?\d+)\s+(-?\d+)/i
        && !$self->ignore_seqregion()) {
        # _remap() returns an empty list when the coordinate mapper cannot
        # place the region; skip the directive then, exactly as
        # handle_feature() skips unmappable features, instead of storing a
        # feature with undefined coordinates.
        my ($ref, $start, $end, $strand) = $self->_remap($1, $2, $3, +1)
            or return;
        my $feature = $self->sfclass->new(-name        => $ref,
                                          -seq_id      => $ref,
                                          -start       => $start,
                                          -end         => $end,
                                          -strand      => $strand,
                                          -primary_tag => 'region');
        $self->store->store($feature);
        return;
    }

    if ($instruction =~ /index-subfeatures\s+(\S+)/i) {
        $self->{load_data}{IndexSubfeatures} = $1;
        $self->store->index_subfeatures($1);
        return;
    }

    if ($self->store->can('handle_unrecognized_meta')) {
        $self->store->handle_unrecognized_meta($instruction);
        return;
    }
}
455 =item handle_feature
457 $loader->handle_feature($gff3_line)
459 This method is called to process a single GFF3 line. It manipulates
460 information stored in a data structure called $self-E<gt>{load_data}.
462 =cut
# Overrides the parent class method. Parses one GFF3 feature line and either
# starts a new "current" feature or, when the line repeats the ID of a
# feature already seen at a different location, adds it to that feature as
# an extra segment.
#
# State is kept in $self->{load_data}: CurrentID / CurrentFeature hold the
# feature being assembled, TemporaryID supplies IDs for lines without an ID
# attribute, and Helper records indexing, top-level and parent/child facts.
#
# Args   : the GFF3 line
# Throws : via invalid_gff() when the line is malformed
sub handle_feature {    # overridden
    my $self     = shift;
    my $gff_line = shift;
    my $ld       = $self->{load_data};

    my $allow_whitespace = $self->allow_whitespace;
    $gff_line =~ s/\s+/\t/g if $allow_whitespace;

    # "." marks an empty column and is turned into undef
    my @columns = map { $_ eq '.' ? undef : $_ } split /\t/, $gff_line;

    $self->invalid_gff($gff_line) if @columns < 4;
    $self->invalid_gff($gff_line) if @columns > 9 && $allow_whitespace;

    {
        local $^W = 0;
        if (@columns > 9) {    # oops, split too much due to whitespace
            $columns[8] = join(' ', @columns[8 .. $#columns]);
        }
    }

    my ($refname, $source, $method, $start, $end, $score, $strand, $phase, $attributes) = @columns;

    $self->invalid_gff($gff_line) unless defined $refname;
    $self->invalid_gff($gff_line) unless $start eq '.' || $start =~ /^[\d.-]+$/;
    $self->invalid_gff($gff_line) unless $end   eq '.' || $end   =~ /^[\d.-]+$/;
    $self->invalid_gff($gff_line) unless defined $method;

    $strand = $Strandedness{$strand || 0};
    my ($reserved, $unreserved) = $attributes ? $self->parse_attributes($attributes) : ();

    my $name = ($reserved->{Name} && $reserved->{Name}[0]);

    # NOTE: reading $reserved->{ID}[0] autovivifies $reserved->{ID}, so the
    # "exists $reserved->{ID}" test further down is true even for lines
    # that carry no ID attribute (load_id then becomes an empty list).
    my $feature_id = defined $reserved->{ID}[0] ? $reserved->{ID}[0] : $ld->{TemporaryID}++;

    # Plain conditional expression: "my @x = ... if COND" has undefined
    # behaviour when COND is false.
    my @parent_ids = defined $reserved->{Parent} ? @{ $reserved->{Parent} } : ();

    my $index_it = $ld->{IndexSubfeatures};
    if (exists $reserved->{Index} || exists $reserved->{index}) {
        $index_it = $reserved->{Index}[0] || $reserved->{index}[0];
    }

    # Everything in the unreserved hash becomes an attribute, so we copy
    # some attributes over
    $unreserved->{Note}    = $reserved->{Note}   if exists $reserved->{Note};
    $unreserved->{Alias}   = $reserved->{Alias}  if exists $reserved->{Alias};
    $unreserved->{Target}  = $reserved->{Target} if exists $reserved->{Target};
    $unreserved->{Gap}     = $reserved->{Gap}    if exists $reserved->{Gap};
    $unreserved->{load_id} = $reserved->{ID}     if exists $reserved->{ID};

    # mec@stowers-institute.org, wondering why not all attributes are
    # carried forward, adds ID tag in particular service of
    # round-tripping ID, which, though present in database as load_id
    # attribute, was getting lost as itself
    # $unreserved->{ID}= $reserved->{ID} if exists $reserved->{ID};

    # TEMPORARY HACKS TO SIMPLIFY DEBUGGING
    $feature_id = '' unless defined $feature_id;
    $name       = '' unless defined $name;    # prevent uninit variable warnings
    $unreserved->{parent_id} = \@parent_ids if @parent_ids;

    # POSSIBLY A PERMANENT HACK -- TARGETS BECOME ALIASES
    # THIS IS TO ALLOW FOR TARGET-BASED LOOKUPS
    if (exists $reserved->{Target}) {
        my %aliases = map { $_ => 1 } @{ $unreserved->{Alias} };
        for my $t (@{ $reserved->{Target} }) {
            (my $tc = $t) =~ s/\s+.*$//;    # get rid of coordinates
            $name ||= $tc;
            push @{ $unreserved->{Alias} }, $tc unless $name eq $tc || $aliases{$tc};
        }
    }

    # an empty list from _remap() means the feature cannot be mapped: skip it
    ($refname, $start, $end, $strand) = $self->_remap($refname, $start, $end, $strand) or return;

    my @args = (-display_name => $name,
                -seq_id       => $refname,
                -start        => $start,
                -end          => $end,
                -strand       => $strand || 0,
                -score        => $score,
                -phase        => $phase,
                -primary_tag  => $method || 'feature',
                -source       => $source,
                -tag          => $unreserved,
                -attributes   => $unreserved,
               );

    # Here's where we handle feature lines that have the same ID (multiple locations, not
    # parent/child relationships)

    my $old_feat;

    # Current feature is the same as the previous feature, which hasn't yet been loaded
    if (defined $ld->{CurrentID} && $ld->{CurrentID} eq $feature_id) {
        $old_feat = $ld->{CurrentFeature};
    }

    # Current feature is the same as a feature that was loaded earlier
    elsif (defined $ld->{Helper}->local2global($feature_id)) {
        # the warning quotes the line being processed rather than whatever
        # happens to be in $_
        $old_feat = $self->fetch($feature_id)
            or $self->warn(<<END);
ID=$feature_id has been used more than once, but it cannot be found in the database.
This can happen if you have specified fast loading, but features sharing the same ID
are not contiguous in the GFF file. This will be loaded as a separate feature.
Line $.: "$gff_line"
END
    }

    # contiguous feature, so add a segment
    warn $old_feat if defined $old_feat and !ref $old_feat;
    if (defined $old_feat) {
        # set this to 1 to disable split-location behavior
        if (0 && @parent_ids) {                    # If multiple features are held together by the same ID
            $feature_id = $ld->{TemporaryID}++;    # AND they have a Parent attribute, this causes an undesirable
        }                                          # additional layer of aggregation. Changing the ID fixes this.
        elsif (
            $old_feat->seq_id ne $refname ||
            $old_feat->start  != $start   ||
            $old_feat->end    != $end              # make sure endpoints are distinct
          )
        {
            $self->add_segment($old_feat, $self->sfclass->new(@args));
            return;
        }
    }

    # we get here if this is a new feature
    # first of all, store the current feature if it is there
    $self->store_current_feature() if defined $ld->{CurrentID};

    # now create the new feature
    # (index top-level features only if policy asks us to)
    my $feature = $self->sfclass->new(@args);
    $feature->object_store($self->store) if $feature->can('object_store');    # for lazy table features
    $ld->{CurrentFeature} = $feature;
    $ld->{CurrentID}      = $feature_id;

    my $top_level = !@parent_ids;
    $index_it ||= $top_level;

    my $helper = $ld->{Helper};
    $helper->indexit($feature_id => 1) if $index_it;
    $helper->toplevel($feature_id => 1) if !$self->{fast}
        && $top_level;    # need to track top level features

    # remember parentage
    for my $parent (@parent_ids) {
        $helper->add_children($parent => $feature_id);
    }
}
# Reports a malformed GFF line by throwing an exception that names the
# current input line number (taken from $self->{load_data}{line}) and
# quotes the offending line.
sub invalid_gff {
    my ($self, $line) = @_;
    my $line_number = $self->{load_data}{line};
    $self->throw("invalid GFF line at line $line_number.\n" . $line);
}
625 =item allow_whitespace
627 $allow_it = $loader->allow_whitespace([$newvalue]);
629 Get or set the allow_whitespace flag. If true, then GFF3 files are
630 allowed to be delimited with whitespace in addition to tabs.
632 =cut
# Get or set the allow_whitespace flag.
#
# Args    : optional new flag value
# Returns : the value the flag held *before* this call, so a call without
#           arguments acts as a plain getter.
sub allow_whitespace {
    my $self     = shift;
    my $previous = $self->{allow_whitespace};
    $self->{allow_whitespace} = shift if @_;
    return $previous;
}
641 =item store_current_feature
643 $loader->store_current_feature()
645 This method is called to store the currently active feature in the
646 database. It uses a data structure stored in $self-E<gt>{load_data}.
648 =cut
650 # sub store_current_feature { } inherited
652 =item build_object_tree
654 $loader->build_object_tree()
656 This method gathers together features and subfeatures and builds the graph that connects them.
658 =cut
661 # put objects together
# Connects features with their subfeatures, choosing the strategy from
# subfeatures_in_table(): relationships kept in a table, or kept inside
# the feature objects themselves.
sub build_object_tree {
    my ($self) = @_;
    return $self->build_object_tree_in_tables if $self->subfeatures_in_table;
    return $self->build_object_tree_in_features;
}
668 =item build_object_tree_in_tables
670 $loader->build_object_tree_in_tables()
672 This method gathers together features and subfeatures and builds the
673 graph that connects them, assuming that parent/child relationships
674 will be stored in a database table.
676 =cut
# Walks every parent/children family recorded by the load helper, turns
# the load IDs into primary IDs and registers the relationship with the
# store.
#
# Throws : if a parent load ID has no primary ID
sub build_object_tree_in_tables {
    my ($self) = @_;
    my $database = $self->store;
    my $id_map   = $self->{load_data}{Helper};

    while (my ($parent_load_id, $child_load_ids) = $id_map->each_family()) {

        my $parent_primary = $id_map->local2global($parent_load_id);
        unless (defined $parent_primary) {
            die $self->throw("$parent_load_id doesn't have a primary id");
        }

        my @child_primaries;
        push @child_primaries, $id_map->local2global($_) for @$child_load_ids;

        # this updates the table that keeps track of parent/child relationships,
        # but does not update the parent object -- so (start,end) had better be right!!!
        $database->add_SeqFeature($parent_primary, @child_primaries);
    }
}
698 =item build_object_tree_in_features
700 $loader->build_object_tree_in_features()
702 This method gathers together features and subfeatures and builds the
703 graph that connects them, assuming that parent/child relationships are
704 stored in the seqfeature objects themselves.
706 =cut
# For every top-level load ID reported by the load helper: fetches the
# feature, attaches its children via attach_children(), and stores the
# result. Features that are not marked as indexed have their primary ID
# cleared first.
#
# Throws : if a top-level feature cannot be fetched
sub build_object_tree_in_features {
    my ($self) = @_;
    my $database   = $self->store;
    my $scratch    = $self->tmp_store;                 # value unused here; call kept as in the original
    my $state      = $self->{load_data};
    my $normalized = $self->subfeatures_normalized;    # likewise unused here
    my $id_map     = $state->{Helper};

    while (my $top_id = $id_map->each_toplevel) {
        my $feature = $self->fetch($top_id);
        unless ($feature) {
            $self->throw("$top_id (id="
                    . $id_map->local2global($top_id)
                    . " should have a database entry, but doesn't");
        }

        $self->attach_children($database, $state, $top_id, $feature);

        # Indexed objects are updated, not created anew
        unless ($id_map->indexit($top_id)) {
            $feature->primary_id(undef);
        }
        $database->store($feature);
    }
}
730 =item attach_children
732 $loader->attach_children($store,$load_data,$load_id,$feature)
734 This recursively adds children to features and their subfeatures. It
735 is called when subfeatures are directly contained within other
736 features, rather than stored in a relational table.
738 =cut
# Recursively attaches subfeatures to $feature.
#
# Args : $store   - the feature store (only passed along to recursive calls)
#        $ld      - the load_data hash; its Helper supplies the child IDs
#        $load_id - load ID of $feature
#        $feature - the feature object that receives the children
#
# Throws : if a child load ID cannot be fetched
sub attach_children {
    my $self = shift;
    my ($store, $ld, $load_id, $feature) = @_;

    # Ask for the children of *this* feature. The helper used to be queried
    # without any ID, which left $load_id unused and made the recursion
    # pointless.
    # NOTE(review): assumes the helper's children() takes the parent load ID
    # and returns an array ref (or false) -- confirm against LoadHelper.
    my $children = $ld->{Helper}->children($load_id) or return;

    for my $child_id (@$children) {
        my $child = $self->fetch($child_id)
            or $self->throw("$child_id should have a database entry, but doesn't");
        $self->attach_children($store, $ld, $child_id, $child);    # recursive call
        $feature->add_SeqFeature($child);
    }
}
753 =item fetch
755 my $feature = $loader->fetch($load_id)
757 Given a load ID (from the ID= attribute) this method returns the
758 feature from the temporary database or the permanent one, depending on
759 where it is stored.
761 =cut
# Looks up a feature by its load ID. The load ID is translated to a primary
# ID through the load helper; the feature is then read from the permanent
# store when subfeatures are normalized or the ID is marked as indexed, and
# from the temporary store otherwise.
sub fetch {
    my ($self, $load_id) = @_;
    my $id_map     = $self->{load_data}{Helper};
    my $primary_id = $id_map->local2global($load_id);

    if ($self->subfeatures_normalized || $id_map->indexit($load_id)) {
        return $self->store->fetch($primary_id);
    }
    return $self->tmp_store->fetch($primary_id);
}
776 =item add_segment
778 $loader->add_segment($parent,$child)
780 This method is used to add a split location to the parent.
782 =cut
# Adds $child to $parent as an additional segment (split location).
#
# If $parent implements add_segment(), that method is used. A parent that
# has no segments yet is first given a copy of itself as its initial
# segment. Otherwise $parent is treated as a conventional feature and its
# location is extended into (or replaced by) a Bio::Location::Split.
#
# Args : $parent - the feature to extend
#        $child  - the feature describing the new segment
sub add_segment {
    my $self = shift;
    my ($parent, $child) = @_;

    if ($parent->can('add_segment')) {    # probably a lazy table feature
        # The method name probed in the second test used to carry a trailing
        # space, so denormalized_segments() could never be selected.
        my $segment_count =
              $parent->can('denormalized_segment_count') ? $parent->denormalized_segment_count
            : $parent->can('denormalized_segments')      ? $parent->denormalized_segments
            : $parent->can('segments')                   ? $parent->segments
            :                                              0;

        unless ($segment_count) {    # convert into a segmented object
            my $segment;
            if ($parent->can('clone')) {
                $segment = $parent->clone;
            }
            else {
                my %clone = %$parent;
                $segment = bless \%clone, ref $parent;
            }
            delete $segment->{segments};
            eval { $segment->object_store(undef) };    # not every class has object_store
            $segment->primary_id(undef);

            # this updates the object and expands its start and end positions without writing
            # the segments into the database as individual objects
            $parent->add_segment($segment);
        }
        $parent->add_segment($child);
        1;    # for debugging
    }

    # a conventional Bio::SeqFeature::Generic object - create a split location
    else {
        my $current_location = $parent->location;
        if ($current_location->can('add_sub_Location')) {
            $current_location->add_sub_Location($child->location);
        }
        else {
            # plain require, so that a load failure is reported directly
            require Bio::Location::Split
                unless Bio::Location::Split->can('add_sub_Location');
            my $new_location = Bio::Location::Split->new();
            $new_location->add_sub_Location($current_location);
            $new_location->add_sub_Location($child->location);
            $parent->location($new_location);
        }
    }
}
828 =item parse_attributes
830 ($reserved,$unreserved) = $loader->parse_attributes($attribute_line)
832 This method parses the information contained in the $attribute_line
833 into two hashrefs, one containing the values of reserved attribute
834 tags (e.g. ID) and the other containing the values of unreserved ones.
836 =cut
# Parses the attribute column of a feature line.
#
# Args    : the attribute string ("tag=v1,v2;tag2=v3")
# Returns : two hash refs, (\%reserved, \%unreserved), each mapping a tag to
#           an array ref of unescaped values. Tags listed in
#           %Special_attributes go into the first hash, all others into the
#           second. A string without any "=" is handed to the GFF2 loader's
#           parse_attributes() instead.
sub parse_attributes {
    my $self = shift;
    my $att  = shift;

    unless ($att =~ /=/) {    # ouch! must be a GFF line
        require Bio::DB::SeqFeature::Store::GFF2Loader
            unless Bio::DB::SeqFeature::Store::GFF2Loader->can('parse_attributes');
        return $self->Bio::DB::SeqFeature::Store::GFF2Loader::parse_attributes($att);
    }

    my (%reserved, %unreserved);

    for my $pair (split /;/, $att) {
        next unless length $pair;    # stray ";;" -- nothing to parse

        # Split on the first "=" only, so that a value which itself contains
        # an (unescaped) "=" is kept whole rather than truncated.
        my ($raw_tag, $raw_value) = split /=/, $pair, 2;
        my $tag = $self->unescape($raw_tag);

        unless (defined $raw_value && length $raw_value) {
            warn "$tag does not have a value at GFF3 file line $.\n";
            next;
        }

        # split the value list before unescaping, so escaped commas survive
        my @values = map { scalar $self->unescape($_) } split /,/, $raw_value;

        if ($Special_attributes{$tag}) {    # reserved attribute
            push @{ $reserved{$tag} }, @values;
        }
        else {
            push @{ $unreserved{$tag} }, @values;
        }
    }

    return (\%reserved, \%unreserved);
}
871 =item start_or_finish_sequence
873 $loader->start_or_finish_sequence('Chr9')
875 This method is called at the beginning and end of a fasta section.
877 =cut
879 # sub start_or_finish_sequence { } inherited
881 =item load_sequence
883 $loader->load_sequence('gatttcccaaa')
885 This method is called to load some amount of sequence after
886 start_or_finish_sequence() is first called.
888 =cut
890 # sub load_sequence { } inherited
892 =item open_fh
894 my $io_file = $loader->open_fh($filehandle_or_path)
896 This method opens up the indicated file or pipe, using some
897 intelligence to recognize compressed files and URLs and doing the
898 right thing.
900 =cut
902 # sub open_fh { } inherited
904 # sub msg { } inherited
906 =item time
908 my $time = $loader->time
910 This method returns the current time in seconds, using Time::HiRes if available.
912 =cut
914 # sub time { } inherited
916 =item unescape
918 my $unescaped = GFF3Loader::unescape($escaped)
920 This is an internal utility. It is the same as CGI::Util::unescape,
921 but doesn't change pluses into spaces and ignores unicode escapes.
923 =cut
925 # sub unescape { } inherited
# Passes a (reference, start, end, strand) tuple through the loader's
# coordinate mapper, if one is set.
#
# Returns : the tuple unchanged when no mapper is configured; an empty list
#           when the mapper yields no first coordinate; otherwise the mapped
#           reference, the mapped coordinates in ascending order, and the
#           strand (negated if the coordinates had to be swapped).
sub _remap {
    my ($self, @location) = @_;
    my ($ref, $start, $end, $strand) = @location;

    my $mapper = $self->coordinate_mapper;
    unless ($mapper) {
        return ($ref, $start, $end, $strand);
    }

    my ($mapped_ref, $span) = $mapper->($ref, [$start, $end]);
    return if !defined $span->[0];

    # mapped onto the opposite strand: put the coordinates back in order
    if ($span->[0] > $span->[1]) {
        @$span = reverse @$span;
        $strand = $strand * -1;
    }

    return ($mapped_ref, @$span, $strand);
}
# Override: forwards to the load helper's indexit() with the same arguments.
sub _indexit {    # override
    my $self   = shift;
    my $helper = $self->{load_data}{Helper};
    return $helper->indexit(@_);
}
# Override: forwards to the load helper's local2global() with the same
# arguments.
sub _local2global {    # override
    my $self   = shift;
    my $helper = $self->{load_data}{Helper};
    return $helper->local2global(@_);
}
952 =item local_ids
954 my $ids = $self->local_ids;
955 my $id_cnt = @$ids;
957 After performing a load, this returns an array ref containing all the
958 load file IDs that were contained within the file just loaded.
960 =cut
# Override: forwards to the load helper's local_ids() with the same
# arguments and returns its result.
sub local_ids {    # override
    my $self   = shift;
    my $helper = $self->{load_data}{Helper};
    return $helper->local_ids(@_);
}
967 =item loaded_ids
969 my $ids = $loader->loaded_ids;
970 my $id_cnt = @$ids;
972 After performing a load, this returns an array ref containing all the
973 feature primary ids that were created during the load.
975 =cut
# Override: forwards to the load helper's loaded_ids() with the same
# arguments and returns its result.
sub loaded_ids {    # override
    my $self   = shift;
    my $helper = $self->{load_data}{Helper};
    return $helper->loaded_ids(@_);
}
984 __END__
986 =back
988 =head1 BUGS
990 This is an early version, so there are certainly some bugs. Please
991 use the BioPerl bug tracking system to report bugs.
993 =head1 SEE ALSO
995 L<Bio::DB::SeqFeature::Store>,
996 L<Bio::DB::SeqFeature::Segment>,
997 L<Bio::DB::SeqFeature::NormalizedFeature>,
998 L<Bio::DB::SeqFeature::Store::GFF2Loader>,
999 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
1000 L<Bio::DB::SeqFeature::Store::berkeleydb>
1002 =head1 AUTHOR
1004 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
1006 Copyright (c) 2006 Cold Spring Harbor Laboratory.
1008 This library is free software; you can redistribute it and/or modify
1009 it under the same terms as Perl itself.
1011 =cut