sync w/ main trunk
[bioperl-live.git] / Bio / DB / SeqFeature / Store / GFF3Loader.pm
blob835091eb2dcd8de3a618661be027899ba20450c6
1 package Bio::DB::SeqFeature::Store::GFF3Loader;
3 # $Id$
5 =head1 NAME
7 Bio::DB::SeqFeature::Store::GFF3Loader -- GFF3 file loader for Bio::DB::SeqFeature::Store
9 =head1 SYNOPSIS
11 use Bio::DB::SeqFeature::Store;
13 # Open the sequence database
14 my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'DBI::mysql',
15 -dsn => 'dbi:mysql:test',
16 -write => 1 );
18 my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store => $db,
19 -verbose => 1,
20 -fast => 1);
22 $loader->load('./my_genome.gff3');
25 =head1 DESCRIPTION
27 The Bio::DB::SeqFeature::Store::GFF3Loader object parses GFF3-format
28 sequence annotation files and loads Bio::DB::SeqFeature::Store
29 databases. For certain combinations of SeqFeature classes and
30 SeqFeature::Store databases it features a "fast load" mode which will
31 greatly accelerate the loading of GFF3 databases by a factor of 5-10.
33 The GFF3 file format has been extended very slightly to accommodate
34 Bio::DB::SeqFeature::Store. First, the loader recognizes a new
35 directive:
37 # #index-subfeatures [0|1]
39 Note that you can place a space between the two #'s in order to
40 prevent GFF3 validators from complaining.
42 If this is true, then subfeatures are indexed (the default) so that
43 they can be retrieved with a query. See L<Bio::DB::SeqFeature::Store>
44 for an explanation of this. If false, then subfeatures can only be
45 accessed through their parent feature.
47 Second, the loader recognizes a new attribute tag called index, which
48 if present, controls indexing of the current feature. Example:
50 ctg123 . TF_binding_site 1000 1012 . + . ID=tfbs00001;index=1
52 You can use this to turn indexing on and off, overriding the default
53 for a particular feature.
55 Note that the loader keeps a record -- in memory -- of each feature
56 that it has processed. If you find the loader running out of memory on
57 particularly large GFF3 files, please split the input file into
58 smaller pieces and do the load in steps.
60 =cut
63 # load utility - incrementally load the store based on GFF3 file
65 # two modes:
66 # slow mode -- features can occur in any order in the GFF3 file
67 # fast mode -- all features with same ID must be contiguous in GFF3 file
69 use strict;
70 use Carp 'croak';
71 use Bio::DB::GFF::Util::Rearrange;
72 use Bio::DB::SeqFeature::Store::LoadHelper;
74 use base 'Bio::DB::SeqFeature::Store::Loader';
# Attribute tags that parse_attributes() files under the "reserved" hash;
# every other tag found in column 9 ends up in the "unreserved" hash.
my %Special_attributes = map { $_ => 1 } qw(
    Gap    Target
    Parent Name
    Alias  ID
    index  Index
);
# Translation table from the GFF strand column to a numeric strand.
# Note that 'undef' is the literal five-character string, not the undefined
# value; handle_feature() looks strands up with "$strand || 0".
my %Strandedness = (
    # forward
    '+'     => 1,
    '1'     => 1,
    # reverse
    '-'     => -1,
    '-1'    => -1,
    # unstranded / unknown
    '.'     => 0,
    ''      => 0,
    '0'     => 0,
    'undef' => 0,
);
94 =head2 new
96 Title : new
97 Usage : $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(@options)
98 Function: create a new parser
99 Returns : a Bio::DB::SeqFeature::Store::GFF3Loader gff3 parser and loader
100 Args : several - see below
101 Status : public
103 This method creates a new GFF3 loader and establishes its connection
104 with a Bio::DB::SeqFeature::Store database. Arguments are -name=E<gt>$value
105 pairs as described in this table:
107 Name Value
108 ---- -----
110 -store A writeable Bio::DB::SeqFeature::Store database handle.
112 -seqfeature_class The name of the type of Bio::SeqFeatureI object to create
113 and store in the database (Bio::DB::SeqFeature by default)
115 -sf_class A shorter alias for -seqfeature_class
117 -verbose Send progress information to standard error.
119 -fast If true, activate fast loading (see below)
121 -chunk_size Set the storage chunk size for nucleotide/protein sequences
122 (default 2000 bytes)
124 -tmp Indicate a temporary directory to use when loading non-normalized
125 features.
127 -ignore_seqregion Ignore ##sequence-region directives. The default is to create a
128 feature corresponding to the directive.
130 When you call new(), a connection to a Bio::DB::SeqFeature::Store
131 database should already have been established and the database
132 initialized (if appropriate).
134 Some combinations of Bio::SeqFeatures and Bio::DB::SeqFeature::Store
135 databases support a fast loading mode. Currently the only reliable
136 implementation of fast loading is the combination of DBI::mysql with
137 Bio::DB::SeqFeature. The other important restriction on fast loading
138 is the requirement that a feature that contains subfeatures must occur
139 in the GFF3 file before any of its subfeatures. Otherwise the
140 subfeatures that occurred before the parent feature will not be
141 attached to the parent correctly. This restriction does not apply to
142 normal (slow) loading.
144 If you use an unnormalized feature class, such as
145 Bio::SeqFeature::Generic, then the loader needs to create a temporary
146 database in which to cache features until all their parts and subparts
147 have been seen. This temporary database uses the "berkeleydb" adaptor. The
148 -tmp option specifies the directory in which that database will be
149 created. If not present, it defaults to the system default tmp
150 directory specified by File::Spec-E<gt>tmpdir().
152 The -chunk_size option allows you to tune the representation of
153 DNA/Protein sequence in the Store database. By default, sequences are
154 split into 2000 base/residue chunks and then reassembled as
155 needed. This avoids the problem of pulling a whole chromosome into
156 memory in order to fetch a short subsequence from somewhere in the
157 middle. Depending on your usage patterns, you may wish to tune this
158 parameter using a chunk size that is larger or smaller than the
159 default.
161 =cut
# Constructor.  All options are handed to the parent loader's constructor;
# the only option consumed here is -ignore_seqregion, which is stored through
# the ignore_seqregion() accessor.  Returns the new loader object.
sub new {
    my ($class, @options) = @_;

    my $loader = $class->SUPER::new(@options);

    my ($skip_regions) = rearrange(['IGNORE_SEQREGION'], @options);
    $loader->ignore_seqregion($skip_regions);

    return $loader;
}
171 =head2 ignore_seqregion
173 $ignore_it = $loader->ignore_seqregion([$new_flag])
175 Get or set the ignore_seqregion flag, which if true, will cause
176 GFF3 ##sequence-region directives to be ignored. The default behavior
177 is to create a feature corresponding to the region.
179 =cut
# Get or set the ignore_seqregion flag.
#
# With no argument this is a plain getter.  With an argument the flag is
# replaced and the value it held *before* the call is returned, so callers
# such as handle_meta() can rely on the no-argument form to read the flag.
sub ignore_seqregion {
    my ($self, @new_flag) = @_;

    my $previous = $self->{ignore_seqregion};
    $self->{ignore_seqregion} = $new_flag[0] if @new_flag;

    return $previous;
}
188 =head2 load
190 Title : load
191 Usage : $count = $loader->load(@ARGV)
192 Function: load the indicated files or filehandles
193 Returns : number of feature lines loaded
194 Args : list of files or filehandles
195 Status : public
197 Once the loader is created, invoke its load() method with a list of
198 GFF3 or FASTA file paths or previously-opened filehandles in order to
199 load them into the database. Compressed files ending with .gz, .Z and
200 .bz2 are automatically recognized and uncompressed on the fly. Paths
201 beginning with http: or ftp: are treated as URLs and opened using the
202 LWP GET program (which must be on your path).
204 FASTA files are recognized by their initial "E<gt>" character. Do not feed
205 the loader a file that is neither GFF3 nor FASTA; I don't know what
206 will happen, but it will probably not be what you expect.
208 =cut
210 # sub load { } inherited
212 =head2 accessors
214 The following read-only accessors return values passed or created during new():
216 store() the long-term Bio::DB::SeqFeature::Store object
218 tmp_store() the temporary Bio::DB::SeqFeature::Store object used
219 during loading
221 sfclass() the Bio::SeqFeatureI class
223 fast() whether fast loading is active
225 seq_chunk_size() the sequence chunk size
227 verbose() verbose progress messages
229 =cut
231 # sub store inherited
232 # sub tmp_store inherited
233 # sub sfclass inherited
234 # sub fast inherited
235 # sub seq_chunk_size inherited
236 # sub verbose inherited
238 =head2 Internal Methods
240 The following methods are used internally and may be overridden by
241 subclasses.
243 =over 4
245 =item default_seqfeature_class
247 $class = $loader->default_seqfeature_class
249 Return the default SeqFeatureI class (Bio::DB::SeqFeature).
251 =cut
253 # sub default_seqfeature_class { } inherited
255 =item subfeatures_normalized
257 $flag = $loader->subfeatures_normalized([$new_flag])
259 Get or set a flag that indicates that the subfeatures are
260 normalized. This is deduced from the SeqFeature class information.
262 =cut
264 # sub subfeatures_normalized { } inherited
266 =item subfeatures_in_table
268 $flag = $loader->subfeatures_in_table([$new_flag])
270 Get or set a flag that indicates that feature/subfeature relationships
271 are stored in a table. This is deduced from the SeqFeature class and
272 Store information.
274 =cut
276 # sub subfeatures_in_table { } inherited
278 =item load_fh
280 $count = $loader->load_fh($filehandle)
282 Load the GFF3 data at the other end of the filehandle and return true
283 if successful. Internally, load_fh() invokes:
285 start_load();
286 do_load($filehandle);
287 finish_load();
289 =cut
291 # sub load_fh { } inherited
293 =item start_load, finish_load
295 These methods are called at the start and end of a filehandle load.
297 =cut
# Initialise the per-load state kept in $self->{load_data}.  The parent
# class sets up its own fields first; this override then adds the GFF3
# specific ones: the seed for generated feature IDs, the subfeature indexing
# policy, the initial parser mode and the LoadHelper that tracks IDs and
# parent/child relationships.
sub create_load_data {    #overridden
    my $self = shift;
    $self->SUPER::create_load_data;

    my $ld = $self->{load_data} ||= {};
    $ld->{TemporaryID}      = "GFFLoad0000000";
    $ld->{IndexSubfeatures} = $self->index_subfeatures();
    $ld->{mode}             = 'gff';

    my $tmpdir = $self->{tmpdir};
    $ld->{Helper} = Bio::DB::SeqFeature::Store::LoadHelper->new($tmpdir);
}
# Called once at the end of a filehandle load: flushes pending state, links
# parents to children, finishes a bulk update when fast loading is active and
# finally attempts a commit.
sub finish_load {    #overridden
    my $self = shift;

    # during fast loading, we will have a feature left at the very end
    $self->store_current_feature();
    # finish any half-loaded sequences
    $self->start_or_finish_sequence();

    # prints the time elapsed since $since in the "%5.2fs" progress format
    my $report_elapsed = sub {
        my $since = shift;
        $self->msg(sprintf "%5.2fs\n", $self->time() - $since);
    };

    $self->msg("Building object tree...");
    my $began = $self->time();
    $self->build_object_tree;
    $report_elapsed->($began);

    if ($self->fast) {
        $self->msg("Loading bulk data into database...");
        $began = $self->time();
        $self->store->finish_bulk_update;
        $report_elapsed->($began);
    }

    # The load data is deliberately NOT deleted here so that the caller can
    # still ask for the loaded IDs afterwards.
    # $self->delete_load_data;

    # a failing commit is tolerated
    eval { $self->store->commit };
}
332 =item do_load
334 $count = $loader->do_load($fh)
336 This is called by load_fh() to load the GFF3 file's filehandle and
337 return the number of lines loaded.
339 =cut
341 # sub do_load { } inherited
343 =item load_line
345 $loader->load_line($data);
347 Load a line of a GFF3 file. You must bracket this with calls to
348 start_load() and finish_load()!
350 $loader->start_load();
351 $loader->load_line($_) while <FH>;
352 $loader->finish_load();
354 =cut
# Process one line of input.  The line is classified as blank, meta
# directive ("##..."), comment ("#..."), FASTA header (">id"), FASTA sequence
# or GFF feature, and dispatched accordingly.  $self->{load_data}{mode}
# remembers whether we are currently inside a FASTA or a GFF section.
#
# Argument: the raw line (a trailing newline is removed here).
# Throws if the line cannot be classified.
sub load_line {    #overridden
    my $self = shift;
    my $line = shift;

    chomp($line);
    my $load_data = $self->{load_data};
    $load_data->{line}++;

    return unless $line =~ /^\S/;    # blank line

    # If it has a tab in it, switch to gff mode.  The test must look at the
    # line we were handed, not at whatever $_ happens to hold in the caller.
    $load_data->{mode} = 'gff' if $line =~ /\t/;

    if ($line =~ /^\#\s?\#\s*(.+)/) {    ## meta instruction
        $load_data->{mode} = 'gff';
        $self->handle_meta($1);
    }

    elsif ($line =~ /^\#/) {
        $load_data->{mode} = 'gff';      # just to be safe
        return;                          # comment
    }

    elsif ($line =~ /^>\s*(\S+)/) {      # FASTA lines are coming
        $load_data->{mode} = 'fasta';
        $self->start_or_finish_sequence($1);
    }

    elsif ($load_data->{mode} eq 'fasta') {
        $self->load_sequence($line);
    }

    elsif ($load_data->{mode} eq 'gff') {
        $self->handle_feature($line);

        # progress report every 1000 features
        if (++$load_data->{count} % 1000 == 0) {
            my $now = $self->time();
            my $nl  = -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
            local $^W = 0;               # kill uninit variable warning
            $self->msg(sprintf("%d features loaded in %5.2fs (%5.2fs/1000 features)...%s$nl",
                               $load_data->{count},
                               $now - $load_data->{start_time},
                               $now - $load_data->{millenium_time},
                               ' ' x 80
                              ));
            $load_data->{millenium_time} = $now;
        }
    }

    else {
        $self->throw("I don't know what to do with this line:\n$line");
    }
}
406 =item handle_meta
408 $loader->handle_meta($meta_directive)
410 This method is called to handle meta-directives such as
411 ##sequence-region. The method will receive the directive with the
412 initial ## stripped off.
414 =cut
# Handle a meta directive.  $instruction is the directive text with the
# leading "##" already stripped by load_line().
#
#   "#"  (i.e. a "###" line)   flush the current feature and sequence, then
#                              let the store see the directive if it wants to
#   sequence-region R S E      store a 'region' feature for R:S..E unless
#                              ignore_seqregion() is set
#   index-subfeatures FLAG     change the subfeature indexing policy
#   anything else              offered to the store's
#                              handle_unrecognized_meta(), if it has one
sub handle_meta {
    my $self        = shift;
    my $instruction = shift;

    if ( $instruction =~ /^#$/ ) {
        $self->store_current_feature();       # during fast loading, we will have a feature left at the very end
        $self->start_or_finish_sequence();    # finish any half-loaded sequences
        if ( $self->store->can('handle_resolution_meta') ) {
            $self->store->handle_resolution_meta($instruction);
        }
        return;
    }

    if ($instruction =~ /sequence-region\s+(.+)\s+(-?\d+)\s+(-?\d+)/i
        && !$self->ignore_seqregion()) {
        # _remap() returns an empty list when the coordinate mapper cannot
        # place the region; skip it, exactly as handle_feature() does,
        # rather than storing a feature with undefined coordinates.
        my ($ref,$start,$end,$strand) = $self->_remap($1,$2,$3,+1)
            or return;
        my $feature = $self->sfclass->new(-name        => $ref,
                                          -seq_id      => $ref,
                                          -start       => $start,
                                          -end         => $end,
                                          -strand      => $strand,
                                          -primary_tag => 'region');
        $self->store->store($feature);
        return;
    }

    if ($instruction =~ /index-subfeatures\s+(\S+)/i) {
        $self->{load_data}{IndexSubfeatures} = $1;
        $self->store->index_subfeatures($1);
        return;
    }

    if ( $self->store->can('handle_unrecognized_meta') ) {
        $self->store->handle_unrecognized_meta($instruction);
        return;
    }
}
454 =item handle_feature
456 $loader->handle_feature($gff3_line)
458 This method is called to process a single GFF3 line. It manipulates
459 information stored in a data structure called $self-E<gt>{load_data}.
461 =cut
# Process a single GFF3 feature line.
#
# The line is split into its nine columns, validated, and its attribute
# column parsed into reserved and unreserved tags.  The resulting feature is
# then either
#   * added as an extra segment to an existing feature that carries the same
#     ID (the feature currently being built, or one loaded earlier), or
#   * made the new "current" feature, after the previous current feature has
#     been stored.
# Indexing policy, top-level status and parent/child links are recorded in
# the LoadHelper so that build_object_tree() can connect everything later.
#
# Argument: the GFF line (already chomped).
# Throws (via invalid_gff) on malformed lines; returns early when the
# coordinate mapper cannot place the feature.
sub handle_feature {    #overridden
    my $self     = shift;
    my $gff_line = shift;
    my $ld       = $self->{load_data};

    my $allow_whitespace = $self->allow_whitespace;
    $gff_line =~ s/\s+/\t/g if $allow_whitespace;

    # a column holding just "." means "no value"
    my @columns = map { $_ eq '.' ? undef : $_ } split /\t/, $gff_line;

    $self->invalid_gff($gff_line) if @columns < 4;
    $self->invalid_gff($gff_line) if @columns > 9 && $allow_whitespace;

    {
        local $^W = 0;
        if (@columns > 9) {    #oops, split too much due to whitespace
            $columns[8] = join(' ', @columns[8 .. $#columns]);
        }
    }

    my ($refname,$source,$method,$start,$end,$score,$strand,$phase,$attributes) = @columns;

    $self->invalid_gff($gff_line) unless defined $refname;
    $self->invalid_gff($gff_line) unless $start eq '.' || $start =~ /^[\d.-]+$/;
    $self->invalid_gff($gff_line) unless $end   eq '.' || $end   =~ /^[\d.-]+$/;
    $self->invalid_gff($gff_line) unless defined $method;

    $strand = $Strandedness{ $strand || 0 };
    my ($reserved,$unreserved) = $attributes ? $self->parse_attributes($attributes) : ();

    my $name = ($reserved->{Name} && $reserved->{Name}[0]);

    # features without an ID= attribute get a generated load ID
    my $feature_id = defined $reserved->{ID}[0] ? $reserved->{ID}[0] : $ld->{TemporaryID}++;

    # Declared unconditionally: "my ... if COND" leaves the variable in an
    # unspecified state when COND is false.
    my @parent_ids = defined $reserved->{Parent} ? @{ $reserved->{Parent} } : ();

    # a per-feature index=/Index= attribute overrides the file-wide policy
    my $index_it = $ld->{IndexSubfeatures};
    if (exists $reserved->{Index} || exists $reserved->{index}) {
        $index_it = $reserved->{Index}[0] || $reserved->{index}[0];
    }

    # Everything in the unreserved hash becomes an attribute, so we copy
    # some attributes over
    $unreserved->{Note}    = $reserved->{Note}   if exists $reserved->{Note};
    $unreserved->{Alias}   = $reserved->{Alias}  if exists $reserved->{Alias};
    $unreserved->{Target}  = $reserved->{Target} if exists $reserved->{Target};
    $unreserved->{Gap}     = $reserved->{Gap}    if exists $reserved->{Gap};
    $unreserved->{load_id} = $reserved->{ID}     if exists $reserved->{ID};

    # mec@stowers-institute.org, wondering why not all attributes are
    # carried forward, adds ID tag in particular service of
    # round-tripping ID, which, though present in database as load_id
    # attribute, was getting lost as itself
    # $unreserved->{ID}= $reserved->{ID}   if exists $reserved->{ID};

    # TEMPORARY HACKS TO SIMPLIFY DEBUGGING
    $feature_id = '' unless defined $feature_id;
    $name       = '' unless defined $name;    # prevent uninit variable warnings
    $unreserved->{parent_id} = \@parent_ids if @parent_ids;

    # POSSIBLY A PERMANENT HACK -- TARGETS BECOME ALIASES
    # THIS IS TO ALLOW FOR TARGET-BASED LOOKUPS
    if (exists $reserved->{Target}) {
        my %aliases = map { $_ => 1 } @{ $unreserved->{Alias} };
        for my $t (@{ $reserved->{Target} }) {
            (my $tc = $t) =~ s/\s+.*$//;      # get rid of coordinates
            $name ||= $tc;
            push @{ $unreserved->{Alias} }, $tc unless $name eq $tc || $aliases{$tc};
        }
    }

    # an empty list from _remap() means the feature could not be mapped
    ($refname,$start,$end,$strand) = $self->_remap($refname,$start,$end,$strand) or return;

    my @args = (-display_name => $name,
                -seq_id       => $refname,
                -start        => $start,
                -end          => $end,
                -strand       => $strand || 0,
                -score        => $score,
                -phase        => $phase,
                -primary_tag  => $method || 'feature',
                -source       => $source,
                -tag          => $unreserved,
                -attributes   => $unreserved,
               );

    # Here's where we handle feature lines that have the same ID (multiple locations, not
    # parent/child relationships)

    my $old_feat;

    # Current feature is the same as the previous feature, which hasn't yet been loaded
    if (defined $ld->{CurrentID} && $ld->{CurrentID} eq $feature_id) {
        $old_feat = $ld->{CurrentFeature};
    }

    # Current feature is the same as a feature that was loaded earlier
    elsif (defined $ld->{Helper}->local2global($feature_id)) {
        $old_feat = $self->fetch($feature_id)
            or $self->warn(<<END);
ID=$feature_id has been used more than once, but it cannot be found in the database.
This can happen if you have specified fast loading, but features sharing the same ID
are not contiguous in the GFF file. This will be loaded as a separate feature.
Line $ld->{line}: "$gff_line"
END
    }

    # contiguous feature, so add a segment
    warn $old_feat if defined $old_feat and !ref $old_feat;
    if (defined $old_feat) {
        # set this to 1 to disable split-location behavior
        if (0 && @parent_ids) {                    # If multiple features are held together by the same ID
            $feature_id = $ld->{TemporaryID}++;    # AND they have a Parent attribute, this causes an undesirable
        }                                          # additional layer of aggregation. Changing the ID fixes this.
        elsif (
            $old_feat->seq_id ne $refname ||
            $old_feat->start  != $start   ||
            $old_feat->end    != $end              # make sure endpoints are distinct
           )
        {
            $self->add_segment($old_feat, $self->sfclass->new(@args));
            return;
        }
    }

    # we get here if this is a new feature
    # first of all, store the current feature if it is there
    $self->store_current_feature() if defined $ld->{CurrentID};

    # now create the new feature
    # (index top-level features only if policy asks us to)
    my $feature = $self->sfclass->new(@args);
    $feature->object_store($self->store) if $feature->can('object_store');    # for lazy table features
    $ld->{CurrentFeature} = $feature;
    $ld->{CurrentID}      = $feature_id;

    my $top_level = !@parent_ids;
    $index_it ||= $top_level;

    my $helper = $ld->{Helper};
    $helper->indexit($feature_id => 1)  if $index_it;
    $helper->toplevel($feature_id => 1) if !$self->{fast}
        && $top_level;    # need to track top level features

    # remember parentage
    for my $parent (@parent_ids) {
        $helper->add_children($parent => $feature_id);
    }
}
# Raise an exception for a malformed GFF line.  The message carries the
# running line counter from $self->{load_data}{line} followed by the
# offending line itself.
sub invalid_gff {
    my ($self, $bad_line) = @_;

    my $line_no = $self->{load_data}{line};
    $self->throw("invalid GFF line at line $line_no.\n" . $bad_line);
}
624 =item allow_whitespace
626 $allow_it = $loader->allow_whitespace([$newvalue]);
628 Get or set the allow_whitespace flag. If true, then GFF3 files are
629 allowed to be delimited with whitespace in addition to tabs.
631 =cut
# Get or set the allow_whitespace flag.
#
# With no argument this is a plain getter (handle_feature() uses it that
# way).  With an argument the flag is replaced and the value it held before
# the call is returned.
sub allow_whitespace {
    my ($self, @new_flag) = @_;

    my $previous = $self->{allow_whitespace};
    $self->{allow_whitespace} = $new_flag[0] if @new_flag;

    return $previous;
}
640 =item store_current_feature
642 $loader->store_current_feature()
644 This method is called to store the currently active feature in the
645 database. It uses a data structure stored in $self-E<gt>{load_data}.
647 =cut
649 # sub store_current_feature { } inherited
651 =item build_object_tree
653 $loader->build_object_tree()
655 This method gathers together features and subfeatures and builds the graph that connects them.
657 =cut
660 # put objects together
# Connect parents and children, choosing the strategy according to whether
# relationships are kept in a database table or inside the feature objects.
sub build_object_tree {
    my $self = shift;

    return $self->build_object_tree_in_tables if $self->subfeatures_in_table;
    return $self->build_object_tree_in_features;
}
667 =item build_object_tree_in_tables
669 $loader->build_object_tree_in_tables()
671 This method gathers together features and subfeatures and builds the
672 graph that connects them, assuming that parent/child relationships
673 will be stored in a database table.
675 =cut
# Walk every (parent load ID, children) family recorded by the LoadHelper,
# translate the load IDs into primary IDs and register the relationship with
# the store.  Throws if a parent has no primary ID.
sub build_object_tree_in_tables {
    my $self   = shift;
    my $store  = $self->store;
    my $helper = $self->{load_data}{Helper};

    while (my ($parent_load_id, $child_load_ids) = $helper->each_family()) {

        my $parent_id = $helper->local2global($parent_load_id);
        unless (defined $parent_id) {
            die $self->throw("$parent_load_id doesn't have a primary id");
        }

        my @child_ids;
        for my $child_load_id (@$child_load_ids) {
            push @child_ids, $helper->local2global($child_load_id);
        }

        # this updates the table that keeps track of parent/child relationships,
        # but does not update the parent object -- so (start,end) had better be right!!!
        $store->add_SeqFeature($parent_id, @child_ids);
    }
}
698 =item build_object_tree_in_features
700 $loader->build_object_tree_in_features()
702 This method gathers together features and subfeatures and builds the
703 graph that connects them, assuming that parent/child relationships are
704 stored in the seqfeature objects themselves.
706 =cut
# For every top-level load ID known to the LoadHelper: fetch the feature,
# attach its children recursively and write it back to the store.  Features
# that are not marked for indexing have their primary ID cleared before
# being stored.  Throws if a top-level feature cannot be fetched.
sub build_object_tree_in_features {
    my $self   = shift;
    my $store  = $self->store;
    my $ld     = $self->{load_data};
    my $helper = $ld->{Helper};

    # The results of these two calls are not used below; the calls themselves
    # are kept in case the accessors do work on first use.
    my $tmp        = $self->tmp_store;
    my $normalized = $self->subfeatures_normalized;

    while (my $top_id = $helper->each_toplevel) {
        my $feature = $self->fetch($top_id);
        unless ($feature) {
            $self->throw("$top_id (id="
                         . $helper->local2global($top_id)
                         . " should have a database entry, but doesn't");
        }

        $self->attach_children($store, $ld, $top_id, $feature);

        # Indexed objects are updated, not created anew
        unless ($helper->indexit($top_id)) {
            $feature->primary_id(undef);
        }
        $store->store($feature);
    }
}
730 =item attach_children
732 $loader->attach_children($store,$load_data,$load_id,$feature)
734 This recursively adds children to features and their subfeatures. It
735 is called when subfeatures are directly contained within other
736 features, rather than stored in a relational table.
738 =cut
# Recursively attach subfeatures to $feature.
#
# Arguments: the store, the load_data hash, the load ID of $feature, and the
# feature object itself.  The children recorded for $load_id are fetched,
# get their own children attached first, and are then added to $feature.
# Throws if a recorded child cannot be fetched.
sub attach_children {
    my $self = shift;
    my ($store, $ld, $load_id, $feature) = @_;

    # Ask the helper for the children of *this* feature.  handle_feature()
    # records them per parent with add_children($parent => $child), so the
    # lookup has to be given the parent's load ID.
    # NOTE(review): assumes LoadHelper::children() takes the parent load ID
    # as its argument -- confirm against LoadHelper.
    my $children = $ld->{Helper}->children($load_id) or return;

    for my $child_id (@$children) {
        my $child = $self->fetch($child_id)
            or $self->throw("$child_id should have a database entry, but doesn't");
        $self->attach_children($store, $ld, $child_id, $child);    # recursive call
        $feature->add_SeqFeature($child);
    }
}
753 =item fetch
755 my $feature = $loader->fetch($load_id)
757 Given a load ID (from the ID= attribute) this method returns the
758 feature from the temporary database or the permanent one, depending on
759 where it is stored.
761 =cut
# Fetch a feature by its load ID.  The load ID is translated to a primary ID
# through the LoadHelper; the feature is then read from the permanent store
# when subfeatures are normalized or the feature is marked for indexing, and
# from the temporary store otherwise.
sub fetch {
    my ($self, $load_id) = @_;

    my $helper     = $self->{load_data}{Helper};
    my $primary_id = $helper->local2global($load_id);

    my $in_main_store = $self->subfeatures_normalized || $helper->indexit($load_id);
    my $source        = $in_main_store ? $self->store : $self->tmp_store;

    return $source->fetch($primary_id);
}
776 =item add_segment
778 $loader->add_segment($parent,$child)
780 This method is used to add a split location to the parent.
782 =cut
# Add $child to $parent as an additional location (split feature).
#
# If $parent implements add_segment() it is treated as a segmented feature:
# when it has no segments yet, a copy of the parent (without segments, object
# store or primary ID) becomes its first segment, and then $child is added.
# Otherwise the parent's location is extended into, or replaced by, a split
# location containing the child's location.
sub add_segment {
    my $self = shift;
    my ($parent, $child) = @_;

    if ($parent->can('add_segment')) {    # probably a lazy table feature
        # Use the first segment accessor the parent offers.  (The method
        # name passed to can() must match exactly -- no stray whitespace.)
        my $segment_count = $parent->can('denormalized_segment_count') ? $parent->denormalized_segment_count
                          : $parent->can('denormalized_segments')      ? $parent->denormalized_segments
                          : $parent->can('segments')                   ? $parent->segments
                          :                                              0;

        unless ($segment_count) {         # convert into a segmented object
            my $segment;
            if ($parent->can('clone')) {
                $segment = $parent->clone;
            }
            else {
                my %clone = %$parent;
                $segment = bless \%clone, ref $parent;
            }
            delete $segment->{segments};
            eval { $segment->object_store(undef) };    # not every class has object_store()
            $segment->primary_id(undef);

            # this updates the object and expands its start and end positions without writing
            # the segments into the database as individual objects
            $parent->add_segment($segment);
        }
        $parent->add_segment($child);
        1;    # for debugging
    }

    # a conventional Bio::SeqFeature::Generic object - create a split location
    else {
        my $current_location = $parent->location;
        if ($current_location->can('add_sub_Location')) {
            $current_location->add_sub_Location($child->location);
        }
        else {
            # load the split-location class on demand
            require Bio::Location::Split
                unless Bio::Location::Split->can('add_sub_Location');
            my $new_location = Bio::Location::Split->new();
            $new_location->add_sub_Location($current_location);
            $new_location->add_sub_Location($child->location);
            $parent->location($new_location);
        }
    }
}
828 =item parse_attributes
830 ($reserved,$unreserved) = $loader->parse_attributes($attribute_line)
832 This method parses the information contained in the $attribute_line
833 into two hashrefs, one containing the values of reserved attribute
834 tags (e.g. ID) and the other containing the values of unreserved ones.
836 =cut
# Parse the attribute column of a GFF3 line.
#
# Returns two hashrefs, (\%reserved, \%unreserved), each mapping a tag to an
# arrayref of unescaped values.  Tags listed in %Special_attributes go into
# the reserved hash, everything else into the unreserved one.  A column that
# contains no "=" at all is assumed to be GFF2 and is handed to the GFF2
# loader's parser instead.  Tags without a value are reported with a warning
# and skipped.
sub parse_attributes {
    my $self = shift;
    my $att  = shift;

    unless ($att =~ /=/) {    # ouch! must be a GFF line
        require Bio::DB::SeqFeature::Store::GFF2Loader
            unless Bio::DB::SeqFeature::Store::GFF2Loader->can('parse_attributes');
        return $self->Bio::DB::SeqFeature::Store::GFF2Loader::parse_attributes($att);
    }

    my (%reserved, %unreserved);
    for my $pair (split ';', $att) {
        # Split on the FIRST "=" only, so that a value which itself contains
        # an unescaped "=" is kept whole instead of being cut short.
        my ($name, $value) = split /=/, $pair, 2;
        my $tag = $self->unescape($name);

        # "Tag" and "Tag=" both count as "no value"
        unless (defined $value && length $value) {
            warn "$tag does not have a value at GFF3 file line $.\n";
            next;
        }

        my @values = map { $self->unescape($_) } split ',', $value;
        if ($Special_attributes{$tag}) {    # reserved attribute
            push @{ $reserved{$tag} }, @values;
        }
        else {
            push @{ $unreserved{$tag} }, @values;
        }
    }
    return (\%reserved, \%unreserved);
}
871 =item start_or_finish_sequence
873 $loader->start_or_finish_sequence('Chr9')
875 This method is called at the beginning and end of a fasta section.
877 =cut
879 # sub start_or_finish_sequence { } inherited
881 =item load_sequence
883 $loader->load_sequence('gatttcccaaa')
885 This method is called to load some amount of sequence after
886 start_or_finish_sequence() is first called.
888 =cut
890 # sub load_sequence { } inherited
892 =item open_fh
894 my $io_file = $loader->open_fh($filehandle_or_path)
896 This method opens up the indicated file or pipe, using some
897 intelligence to recognize compressed files and URLs and doing the
898 right thing.
900 =cut
902 # sub open_fh { } inherited
904 # sub msg { } inherited
906 =item time
908 my $time = $loader->time
910 This method returns the current time in seconds, using Time::HiRes if available.
912 =cut
914 # sub time { } inherited
916 =item unescape
918 my $unescaped = GFF3Loader::unescape($escaped)
920 This is an internal utility. It is the same as CGI::Util::unescape,
921 but doesn't change pluses into spaces and ignores unicode escapes.
923 =cut
925 # sub unescape { } inherited
# Pass (ref, start, end, strand) through the loader's coordinate mapper.
#
# Without a mapper the arguments come back untouched.  With one, the mapper
# is called as $mapper->($ref, [$start, $end]) and is expected to return
# ($newref, [$newstart, $newend]); an undefined first coordinate yields an
# empty list ("cannot be mapped").  If the mapped coordinates come back in
# descending order they are swapped and the strand is flipped.
sub _remap {
    my ($self, $ref, $start, $end, $strand) = @_;

    my $mapper = $self->coordinate_mapper
        or return ($ref, $start, $end, $strand);

    my ($mapped_ref, $span) = $mapper->($ref, [ $start, $end ]);
    return unless defined $span->[0];

    if ($span->[0] > $span->[1]) {
        @$span = reverse @$span;
        $strand *= -1;
    }

    return ($mapped_ref, @$span, $strand);
}
# Forward to the LoadHelper's indexit(), passing all arguments through.
sub _indexit {    # override
    my $self   = shift;
    my $helper = $self->{load_data}{Helper};
    return $helper->indexit(@_);
}
# Forward to the LoadHelper's local2global(), passing all arguments through.
sub _local2global {    # override
    my $self   = shift;
    my $helper = $self->{load_data}{Helper};
    return $helper->local2global(@_);
}
952 =item local_ids
954 my $ids = $self->local_ids;
955 my $id_cnt = @$ids;
957 After performing a load, this returns an array ref containing all the
958 load file IDs that were contained within the file just loaded.
960 =cut
# Return whatever the LoadHelper's local_ids() returns for the load just
# performed; all arguments are passed through.
sub local_ids {    # override
    my $self   = shift;
    my $helper = $self->{load_data}{Helper};
    return $helper->local_ids(@_);
}
967 =item loaded_ids
969 my $ids = $loader->loaded_ids;
970 my $id_cnt = @$ids;
972 After performing a load, this returns an array ref containing all the
973 feature primary ids that were created during the load.
975 =cut
# Return whatever the LoadHelper's loaded_ids() returns for the load just
# performed; all arguments are passed through.
sub loaded_ids {    # override
    my $self   = shift;
    my $helper = $self->{load_data}{Helper};
    return $helper->loaded_ids(@_);
}
984 __END__
986 =back
988 =head1 BUGS
990 This is an early version, so there are certainly some bugs. Please
991 use the BioPerl bug tracking system to report bugs.
993 =head1 SEE ALSO
995 L<bioperl>,
996 L<Bio::DB::SeqFeature::Store>,
997 L<Bio::DB::SeqFeature::Segment>,
998 L<Bio::DB::SeqFeature::NormalizedFeature>,
999 L<Bio::DB::SeqFeature::Store::GFF2Loader>,
1000 L<Bio::DB::SeqFeature::Store::DBI::mysql>,
1001 L<Bio::DB::SeqFeature::Store::berkeleydb>
1003 =head1 AUTHOR
1005 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
1007 Copyright (c) 2006 Cold Spring Harbor Laboratory.
1009 This library is free software; you can redistribute it and/or modify
1010 it under the same terms as Perl itself.
1012 =cut