sync with main trunk completely (a few tests failing)
[bioperl-live.git] / Bio / DB / SeqFeature / Store / DBI / SQLite.pm
blob8c555fe854570d035edfaf7886d4ddd111e108a8
1 package Bio::DB::SeqFeature::Store::DBI::SQLite;
3 #$Id$
5 =head1 NAME
7 Bio::DB::SeqFeature::Store::DBI::SQLite -- SQLite implementation of Bio::DB::SeqFeature::Store
9 =head1 SYNOPSIS
11 use Bio::DB::SeqFeature::Store;
13 # Open the sequence database
14 my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::SQLite',
15 -dsn => '/path/to/database.db');
17 # get a feature from somewhere
18 my $feature = Bio::SeqFeature::Generic->new(...);
20 # store it
21 $db->store($feature) or die "Couldn't store!";
23 # primary ID of the feature is changed to indicate its primary ID
24 # in the database...
25 my $id = $feature->primary_id;
27 # get the feature back out
28 my $f = $db->fetch($id);
30 # change the feature and update it
31 $f->start(100);
32 $db->update($f) or die "Couldn't update!";
34 # searching...
35 # ...by id
36 my @features = $db->fetch_many(@list_of_ids);
38 # ...by name
39 @features = $db->get_features_by_name('ZK909');
41 # ...by alias
42 @features = $db->get_features_by_alias('sma-3');
44 # ...by type
45 @features = $db->get_features_by_name('gene');
47 # ...by location
48 @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);
50 # ...by attribute
51 @features = $db->get_features_by_attribute({description => 'protein kinase'})
53 # ...by the GFF "Note" field
54 @result_list = $db->search_notes('kinase');
56 # ...by arbitrary combinations of selectors
57 @features = $db->features(-name => $name,
58 -type => $types,
59 -seq_id => $seqid,
60 -start => $start,
61 -end => $end,
62 -attributes => $attributes);
64 # ...using an iterator
65 my $iterator = $db->get_seq_stream(-name => $name,
66 -type => $types,
67 -seq_id => $seqid,
68 -start => $start,
69 -end => $end,
70 -attributes => $attributes);
72 while (my $feature = $iterator->next_seq) {
73 # do something with the feature
76 # ...limiting the search to a particular region
77 my $segment = $db->segment('Chr1',5000=>6000);
78 my @features = $segment->features(-type=>['mRNA','match']);
80 # getting & storing sequence information
81 # Warning: this returns a string, and not a PrimarySeq object
82 $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
83 my $sequence = $db->fetch_sequence('Chr1',5000=>6000);
85 # what feature types are defined in the database?
86 my @types = $db->types;
88 # create a new feature in the database
89 my $feature = $db->new_feature(-primary_tag => 'mRNA',
90 -seq_id => 'chr3',
91 -start => 10000,
92 -end => 11000);
94 =head1 DESCRIPTION
96 Bio::DB::SeqFeature::Store::SQLite is the SQLite adaptor for
97 Bio::DB::SeqFeature::Store. You will not create it directly, but
98 instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.
100 See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
102 =head2 Using the SQLite adaptor
104 To establish a connection to the database, call
105 Bio::DB::SeqFeature::Store-E<gt>new(-adaptor=E<gt>'DBI::SQLite',@more_args). The
106 additional arguments are as follows:
108 Argument name Description
109 ------------- -----------
111 -dsn The path to the SQLite database file.
113 -namespace A prefix to attach to each table. This allows you
114 to have several virtual databases in the same
115 physical database.
117 -temp Boolean flag. If true, a temporary database
118 will be created and destroyed as soon as
119 the Store object goes out of scope. (synonym -temporary)
121 -autoindex Boolean flag. If true, features in the database will be
122 reindexed every time they change. This is the default.
125 -tmpdir Directory in which to place temporary files during "fast" loading.
126 Defaults to File::Spec->tmpdir(). (synonyms -dump_dir, -dumpdir, -tmp)
128 -dbi_options A hashref to pass to DBI->connect's 4th argument, the "attributes."
129 (synonyms -options, -dbi_attr)
131 -write Pass true to open database for writing or updating.
133 If successful, a new instance of
134 Bio::DB::SeqFeature::Store::DBI::SQLite will be returned.
136 In addition to the standard methods supported by all well-behaved
137 Bio::DB::SeqFeature::Store databases, several following
138 adaptor-specific methods are provided. These are described in the next
139 sections.
141 =cut
143 use strict;
145 use base 'Bio::DB::SeqFeature::Store::DBI::mysql';
146 use Bio::DB::SeqFeature::Store::DBI::Iterator;
147 use DBI qw(:sql_types);
148 use Memoize;
149 use Cwd 'abs_path';
150 use Bio::DB::GFF::Util::Rearrange 'rearrange';
151 use Bio::SeqFeature::Lite;
152 use File::Spec;
153 use constant DEBUG=>0;
155 # Using same limits as MySQL adaptor so I don't have to make something up.
156 use constant MAX_INT => 2_147_483_647;
157 use constant MIN_INT => -2_147_483_648;
158 use constant MAX_BIN => 1_000_000_000; # size of largest feature = 1 Gb
159 use constant MIN_BIN => 1000; # smallest bin we'll make - on a 100 Mb chromosome, there'll be 100,000 of these
160 use constant DEBUG_NONSPATIAL=>0;
162 my (@BINS,%BINS);
164 @BINS = map {2**$_} (17, 20, 23, 26, 29); # TO DO: experiment with different bin sizes
165 my $start=0;
166 for my $b (sort {$b<=>$a} @BINS) {
167 $BINS{$b} = $start;
168 $start += $BINS[-1]/$b;
173 # my %BINS = (
174 # 2**11 => 37449,
175 # 2**14 => 4681,
176 # 2**17 => 585,
177 # 2**20 => 73,
178 # 2**23 => 9,
179 # 2**26 => 1,
180 # 2**29 => 0
181 # );
182 # my @BINS = sort {$a<=>$b} keys %BINS;
184 sub calculate_bin {
185 my $self = shift;
186 my ($start,$end) = @_;
188 my $len = $end - $start;
189 for my $bin (@BINS) {
190 next if $len > $bin;
191 # possibly fits here
192 my $binstart = int $start/$bin;
193 my $binend = int $end/$bin;
194 return $binstart+$BINS{$bin} if $binstart == $binend;
197 die "unreasonable coordinates ",$start+1,"..$end";
200 sub search_bins {
201 my $self = shift;
202 my ($start,$end) = @_;
203 my @results;
205 for my $bin (@BINS) {
206 my $binstart = int $start/$bin;
207 my $binend = int $end/$bin;
208 push @results,$binstart+$BINS{$bin}..$binend+$BINS{$bin};
210 return @results;
215 # object initialization
217 sub init {
218 my $self = shift;
219 my ($dsn,
220 $is_temporary,
221 $autoindex,
222 $namespace,
223 $dump_dir,
224 $user,
225 $pass,
226 $dbi_options,
227 $writeable,
228 $create,
229 ) = rearrange(['DSN',
230 ['TEMP','TEMPORARY'],
231 'AUTOINDEX',
232 'NAMESPACE',
233 ['DUMP_DIR','DUMPDIR','TMP','TMPDIR'],
234 'USER',
235 ['PASS','PASSWD','PASSWORD'],
236 ['OPTIONS','DBI_OPTIONS','DBI_ATTR'],
237 ['WRITE','WRITEABLE'],
238 'CREATE',
239 ],@_);
240 $dbi_options ||= {};
241 $writeable = 1 if $is_temporary or $dump_dir;
243 $dsn or $self->throw("Usage: ".__PACKAGE__."->init(-dsn => \$dbh || \$dsn)");
245 my $dbh;
246 if (ref $dsn) {
247 $dbh = $dsn;
248 } else {
249 $dsn = "dbi:SQLite:$dsn" unless $dsn =~ /^dbi:/;
250 $dbh = DBI->connect($dsn,$user,$pass,$dbi_options) or $self->throw($DBI::errstr);
251 $dbh->do("PRAGMA synchronous = OFF;"); # makes writes much faster
252 $dbh->do("PRAGMA temp_store = MEMORY;"); # less disk I/O; some speedup
253 $dbh->do("PRAGMA cache_size = 20000;"); # less disk I/O; some speedup
255 $self->{dbh} = $dbh;
256 $self->{is_temp} = $is_temporary;
257 $self->{namespace} = $namespace;
258 $self->{writeable} = $writeable;
260 $self->default_settings;
261 $self->autoindex($autoindex) if defined $autoindex;
262 $self->dumpdir($dump_dir) if $dump_dir;
263 if ($self->is_temp) {
264 $self->init_tmp_database();
265 } elsif ($create) {
266 $self->init_database('erase');
270 sub table_definitions {
271 my $self = shift;
272 my $defs =
274 feature => <<END,
276 id integer primary key autoincrement,
277 typeid integer not null,
278 strand integer default 0,
279 "indexed" integer default 1,
280 object blob not null
284 locationlist => <<END,
286 id integer primary key autoincrement,
287 seqname text not null
289 create index index_locationlist on locationlist (seqname);
292 typelist => <<END,
294 id integer primary key autoincrement,
295 tag text not null
297 create index index_typelist on typelist (tag);
299 name => <<END,
301 id integer not null,
302 name text not null,
303 display_name integer default 0
305 create index index_name_id on name(id);
306 create index index_name_name on name(name);
309 attribute => <<END,
311 id integer not null,
312 attribute_id integer not null,
313 attribute_value text
315 create index index_attribute_id on attribute(attribute_id);
316 create index index_attribute_value on attribute(attribute_value);
319 attributelist => <<END,
321 id integer primary key autoincrement,
322 tag text not null
324 create index index_attributelist_id on attributelist(id);
325 create index index_attributelist_tag on attributelist(tag);
327 parent2child => <<END,
329 id integer not null,
330 child integer not null
332 create index index_parent2child_id_child on parent2child(id,child);
335 meta => <<END,
337 name text primary key,
338 value text not null
341 sequence => <<END,
343 id integer not null,
344 offset integer not null,
345 sequence blob,
346 primary key(id,offset)
351 unless ($self->_has_spatial_index) {
352 $defs->{feature_location} = <<END;
354 id int(10) primary key,
355 seqid int(10),
356 bin int,
357 start int,
358 end int
360 create index index_feature_location on feature_location(seqid,bin,start,end);
365 return $defs;
368 sub _init_database {
369 my $self = shift;
371 # must do this first before calling table_definitions
372 $self->_create_spatial_index;
373 $self->SUPER::_init_database(@_);
376 sub init_tmp_database {
377 my $self = shift;
378 my $erase = shift;
379 $self->_create_spatial_index;
380 $self->SUPER::init_tmp_database(@_);
383 sub _create_spatial_index{
384 my $self = shift;
385 my $dbh = $self->dbh;
386 local $dbh->{PrintError} = 0;
387 $dbh->do("DROP TABLE IF EXISTS feature_index"); # spatial index
388 if (DEBUG_NONSPATIAL) {
389 warn "DELIBERATELY BREAKING RTREE FUNCTIONALITY";
390 $dbh->do("CREATE VIRTUAL TABLE feature_index USING BTREE(id,seqid,bin,start,end)");
391 } else {
392 $dbh->do("CREATE VIRTUAL TABLE feature_index USING RTREE(id,seqid,bin,start,end)");
396 sub _has_spatial_index {
397 my $self = shift;
398 return $self->{'_has_spatial_index'} if exists $self->{'_has_spatial_index'};
399 my $dbh = $self->dbh;
400 my ($count) = $dbh->selectrow_array("select count(*) from sqlite_master where name='feature_index'");
401 return $self->{'_has_spatial_index'} = $count;
404 sub _finish_bulk_update {
405 my $self = shift;
406 my $dbh = $self->dbh;
407 my $dir = $self->{dumpdir} || '.';
409 $dbh->begin_work; # making this a transaction greatly improves performance
411 for my $table ('feature', $self->index_tables) {
412 my $fh = $self->dump_filehandle($table);
413 my $path = $self->dump_path($table);
414 $fh->close;
415 open($fh, $path);
416 my $qualified_table = $self->_qualify($table);
418 my $sth;
419 if ($table =~ /feature$/) {
420 $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?,?,?)");
422 while (<$fh>) {
423 chomp();
424 my ($id,$typeid,$strand,$indexed,$obj) = split(/\t/);
425 $sth->bind_param(1, $id);
426 $sth->bind_param(2, $typeid);
427 $sth->bind_param(3, $strand);
428 $sth->bind_param(4, $indexed);
429 $sth->bind_param(5, pack('H*',$obj), {TYPE => SQL_BLOB});
430 $sth->execute();
432 } else {
433 my $feature_index = $self->_feature_index_table;
434 if ($table =~ /parent2child$/) {
435 $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?)");
436 } elsif ($table =~ /$feature_index$/) {
437 $sth = $dbh->prepare(
438 $self->_has_spatial_index ?"REPLACE INTO $qualified_table VALUES (?,?,?,?,?)"
439 :"REPLACE INTO $qualified_table VALUES (?,?,?,?,?)"
441 } else { # attribute or name
442 $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?)");
445 while (<$fh>) {
446 chomp();
447 $sth->execute(split(/\t/));
450 $fh->close();
451 unlink $path;
453 $dbh->commit; # commit the transaction
454 delete $self->{bulk_update_in_progress};
455 delete $self->{filehandles};
458 sub index_tables {
459 my $self = shift;
460 my @t = $self->SUPER::index_tables;
461 return (@t,$self->_feature_index_table);
465 # get primary sequence between start and end
467 sub _fetch_sequence {
468 my $self = shift;
469 my ($seqid,$start,$end) = @_;
471 # backward compatibility to the old days when I liked reverse complementing
472 # dna by specifying $start > $end
473 my $reversed;
474 if (defined $start && defined $end && $start > $end) {
475 $reversed++;
476 ($start,$end) = ($end,$start);
478 $start-- if defined $start;
479 $end-- if defined $end;
481 my $offset1 = $self->_offset_boundary($seqid,$start || 'left');
482 my $offset2 = $self->_offset_boundary($seqid,$end || 'right');
483 my $sequence_table = $self->_sequence_table;
484 my $locationlist_table = $self->_locationlist_table;
486 # CROSS JOIN gives a hint to the SQLite query optimizer -- mucho speedup!
487 my $sth = $self->_prepare(<<END);
488 SELECT sequence,offset
489 FROM $locationlist_table as ll CROSS JOIN $sequence_table as s
490 WHERE ll.id=s.id
491 AND ll.seqname= ?
492 AND offset >= ?
493 AND offset <= ?
494 ORDER BY offset
497 my $seq = '';
498 $sth->execute($seqid,$offset1,$offset2) or $self->throw($sth->errstr);
500 while (my($frag,$offset) = $sth->fetchrow_array) {
501 substr($frag,0,$start-$offset) = '' if defined $start && $start > $offset;
502 $seq .= $frag;
504 substr($seq,$end-$start+1) = '' if defined $end && $end-$start+1 < length($seq);
505 if ($reversed) {
506 $seq = reverse $seq;
507 $seq =~ tr/gatcGATC/ctagCTAG/;
509 $sth->finish;
510 $seq;
513 sub _offset_boundary {
514 my $self = shift;
515 my ($seqid,$position) = @_;
517 my $sequence_table = $self->_sequence_table;
518 my $locationlist_table = $self->_locationlist_table;
520 my $sql;
521 # use "CROSS JOIN" to give a hint to the SQLite query optimizer.
522 $sql = $position eq 'left' ? "SELECT min(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
523 :$position eq 'right' ? "SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
524 :"SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=? AND offset<=?";
525 my $sth = $self->_prepare($sql);
526 my @args = $position =~ /^-?\d+$/ ? ($seqid,$position) : ($seqid);
527 $sth->execute(@args) or $self->throw($sth->errstr);
528 my $boundary = $sth->fetchall_arrayref->[0][0];
529 $sth->finish;
530 return $boundary;
534 # Efficiently fetch a series of IDs from the database
535 # Can pass an array or an array ref
537 sub _fetch_many {
538 my $self = shift;
539 @_ or $self->throw('usage: fetch_many($id1,$id2,$id3...)');
540 my $ids = join ',',map {ref($_) ? @$_ : $_} @_ or return;
541 my $features = $self->_feature_table;
543 my $sth = $self->_prepare(<<END);
544 SELECT id,object FROM $features WHERE id IN ($ids)
546 $sth->execute() or $self->throw($sth->errstr);
547 return $self->_sth2objs($sth);
550 sub _features {
551 my $self = shift;
552 my ($seq_id,$start,$end,$strand,
553 $name,$class,$allow_aliases,
554 $types,
555 $attributes,
556 $range_type,
557 $fromtable,
558 $iterator,
559 $sources
560 ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
561 'NAME','CLASS','ALIASES',
562 ['TYPES','TYPE','PRIMARY_TAG'],
563 ['ATTRIBUTES','ATTRIBUTE'],
564 'RANGE_TYPE',
565 'FROM_TABLE',
566 'ITERATOR',
567 ['SOURCE','SOURCES']
568 ],@_);
570 my (@from,@where,@args,@group);
571 $range_type ||= 'overlaps';
573 my $feature_table = $self->_feature_table;
574 @from = "$feature_table as f";
576 if (defined $name) {
577 # hacky backward compatibility workaround
578 undef $class if $class && $class eq 'Sequence';
579 $name = "$class:$name" if defined $class && length $class > 0;
580 # last argument is the join field
581 my ($from,$where,$group,@a) = $self->_name_sql($name,$allow_aliases,'f.id');
582 push @from,$from if $from;
583 push @where,$where if $where;
584 push @group,$group if $group;
585 push @args,@a;
588 if (defined $seq_id) {
589 # last argument is the name of the features table
590 my ($from,$where,$group,@a) = $self->_location_sql($seq_id,$start,$end,$range_type,$strand,'f');
591 push @from,$from if $from;
592 push @where,$where if $where;
593 push @group,$group if $group;
594 push @args,@a;
597 if (defined($sources)) {
598 my @sources = ref($sources) eq 'ARRAY' ? @{$sources} : ($sources);
599 if (defined($types)) {
600 my @types = ref($types) eq 'ARRAY' ? @{$types} : ($types);
601 my @final_types;
602 foreach my $type (@types) {
603 # *** not sure what to do if user supplies both -source and -type
604 # where the type includes a source!
605 if ($type =~ /:/) {
606 push(@final_types, $type);
608 else {
609 foreach my $source (@sources) {
610 push(@final_types, $type.':'.$source);
614 $types = \@final_types;
616 else {
617 $types = [map { ':'.$_ } @sources];
620 if (defined($types)) {
621 # last argument is the name of the features table
622 my ($from,$where,$group,@a) = $self->_types_sql($types,'f');
623 push @from,$from if $from;
624 push @where,$where if $where;
625 push @group,$group if $group;
626 push @args,@a;
629 if (defined $attributes) {
630 # last argument is the join field
631 my ($from,$where,$group,@a) = $self->_attributes_sql($attributes,'f.id');
632 push @from,$from if $from;
633 push @where,$where if $where;
634 push @group,$group if $group;
635 push @args,@a;
638 if (defined $fromtable) {
639 # last argument is the join field
640 my ($from,$where,$group,@a) = $self->_from_table_sql($fromtable,'f.id');
641 push @from,$from if $from;
642 push @where,$where if $where;
643 push @group,$group if $group;
644 push @args,@a;
647 # if no other criteria are specified, then
648 # only fetch indexed (i.e. top level objects)
649 @where = '"indexed"=1' unless @where;
651 my $from = join ', ',@from;
652 my $where = join ' AND ',map {"($_)"} @where;
653 my $group = join ', ',@group;
654 $group = "GROUP BY $group" if @group;
656 my $query = <<END;
657 SELECT f.id,f.object
658 FROM $from
659 WHERE $where
660 $group
663 $self->_print_query($query,@args) if DEBUG || $self->debug;
665 my $sth = $self->_prepare($query);
666 $sth->execute(@args) or $self->throw($sth->errstr);
667 return $iterator ? Bio::DB::SeqFeature::Store::DBI::Iterator->new($sth,$self) : $self->_sth2objs($sth);
670 sub _make_attribute_group {
671 my $self = shift;
672 my ($table_name,$attributes) = @_;
673 my $key_count = keys %$attributes or return;
674 my $count = $key_count-1;
675 return "f.id HAVING count(f.id)>$count";
678 sub _location_sql {
679 my $self = shift;
680 my ($seq_id,$start,$end,$range_type,$strand,$location) = @_;
682 # the additional join on the location_list table badly impacts performance
683 # so we build a copy of the table in memory
684 my $seqid = $self->_locationid($seq_id) || 0; # zero is an invalid primary ID, so will return empty
686 my $feature_index = $self->_feature_index_table;
687 my $from = "$feature_index as fi";
689 my ($bin_where,@bin_args);
690 if (defined $start && defined $end && !$self->_has_spatial_index) {
691 my @bins = $self->search_bins($start,$end);
692 $bin_where = ' AND bin in ('.join(',',@bins).')';
695 $start = MIN_INT unless defined $start;
696 $end = MAX_INT unless defined $end;
698 my ($range,@range_args);
699 if ($range_type eq 'overlaps') {
700 $range = "fi.end>=? AND fi.start<=?".$bin_where;
701 @range_args = ($start,$end,@bin_args);
702 } elsif ($range_type eq 'contains') {
703 $range = "fi.start>=? AND fi.end<=?".$bin_where;
704 @range_args = ($start,$end,@bin_args);
705 } elsif ($range_type eq 'contained_in') {
706 $range = "fi.start<=? AND fi.end>=?";
707 @range_args = ($start,$end);
708 } else {
709 $self->throw("range_type must be one of 'overlaps', 'contains' or 'contained_in'");
712 if (defined $strand) {
713 $range .= " AND strand=?";
714 push @range_args,$strand;
717 my $where = <<END;
718 fi.seqid=?
719 AND $location.id=fi.id
720 AND $range
723 my $group = '';
725 my @args = ($seqid,@range_args);
726 return ($from,$where,$group,@args);
729 sub _feature_index_table {
730 my $self = shift;
731 return $self->_has_spatial_index ? $self->_qualify('feature_index')
732 : $self->_qualify('feature_location') }
734 # Do a case-insensitive search a la the PostgreSQL adaptor
735 sub _name_sql {
736 my $self = shift;
737 my ($name,$allow_aliases,$join) = @_;
738 my $name_table = $self->_name_table;
740 my $from = "$name_table as n";
741 my ($match,$string) = $self->_match_sql($name);
743 my $where = "n.id=$join AND lower(n.name) $match";
744 $where .= " AND n.display_name>0" unless $allow_aliases;
745 return ($from,$where,'',$string);
748 sub _search_attributes {
749 my $self = shift;
750 my ($search_string,$attribute_names,$limit) = @_;
751 my @words = map {quotemeta($_)} split /\s+/,$search_string;
753 my $name_table = $self->_name_table;
754 my $attribute_table = $self->_attribute_table;
755 my $attributelist_table = $self->_attributelist_table;
756 my $type_table = $self->_type_table;
757 my $typelist_table = $self->_typelist_table;
759 my @tags = @$attribute_names;
760 my $tag_sql = join ' OR ',("al.tag=?") x @tags;
762 my $perl_regexp = join '|',@words;
764 my @wild_card_words = map { "%$_%" } @words;
765 my $sql_regexp = join ' OR ',("a.attribute_value LIKE ?") x @words;
766 # CROSS JOIN disables SQLite's table reordering optimization
767 my $sql = <<END;
768 SELECT name,attribute_value,tl.tag,n.id
769 FROM $attributelist_table AS al
770 CROSS JOIN $attribute_table AS a ON al.id = a.attribute_id
771 CROSS JOIN $name_table AS n ON n.id = a.id
772 CROSS JOIN $type_table AS t ON t.id = n.id
773 CROSS JOIN $typelist_table AS tl ON tl.id = t.typeid
774 WHERE ($tag_sql)
775 AND ($sql_regexp)
776 AND n.display_name=1
778 $sql .= "LIMIT $limit" if defined $limit;
779 $self->_print_query($sql,@tags,@words) if DEBUG || $self->debug;
780 my $sth = $self->_prepare($sql);
781 $sth->execute(@tags,@wild_card_words) or $self->throw($sth->errstr);
783 my @results;
784 while (my($name,$value,$type,$id) = $sth->fetchrow_array) {
785 my (@hits) = $value =~ /$perl_regexp/ig;
786 my @words_in_row = split /\b/,$value;
787 my $score = int(@hits*100/@words/@words_in_row);
788 push @results,[$name,$value,$score,$type,$id];
790 $sth->finish;
791 @results = sort {$b->[2]<=>$a->[2]} @results;
792 return @results;
795 sub _match_sql {
796 my $self = shift;
797 my $name = shift;
799 my ($match,$string);
800 if ($name =~ /(?:^|[^\\])[*?]/) {
801 $name =~ s/(^|[^\\])([%_])/$1\\$2/g;
802 $name =~ s/(^|[^\\])\*/$1%/g;
803 $name =~ s/(^|[^\\])\?/$1_/g;
804 $match = "LIKE ?";
805 $string = $name;
806 } else {
807 $match = "= lower(?)";
808 $string = lc($name);
810 return ($match,$string);
813 sub _attributes_sql {
814 my $self = shift;
815 my ($attributes,$join) = @_;
817 my ($wf,@bind_args) = $self->_make_attribute_where('a','al',$attributes);
818 my ($group_by,@group_args)= $self->_make_attribute_group('a',$attributes);
820 my $attribute_table = $self->_attribute_table;
821 my $attributelist_table = $self->_attributelist_table;
823 my $from = "$attribute_table AS a INDEXED BY index_attribute_id, $attributelist_table AS al";
825 my $where = <<END;
826 a.id=$join
827 AND a.attribute_id=al.id
828 AND ($wf)
831 my $group = $group_by;
833 my @args = (@bind_args,@group_args);
834 return ($from,$where,$group,@args);
837 # overridden because of case-sensitivity of matches
838 sub _types_sql {
839 my $self = shift;
840 my ($types,$type_table) = @_;
841 my ($primary_tag,$source_tag);
843 my @types = ref $types eq 'ARRAY' ? @$types : $types;
845 my $typelist = $self->_typelist_table;
846 my $from = "$typelist AS tl";
848 my (@matches,@args);
850 for my $type (@types) {
852 if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
853 $primary_tag = $type->method;
854 $source_tag = $type->source;
855 } else {
856 ($primary_tag,$source_tag) = split ':',$type,2;
859 if (defined $source_tag) {
860 push @matches,"lower(tl.tag)=lower(?)";
861 push @args,"$primary_tag:$source_tag";
862 } else {
863 push @matches,"tl.tag LIKE ?";
864 push @args,"$primary_tag:%";
867 my $matches = join ' OR ',@matches;
869 my $where = <<END;
870 tl.id=$type_table.typeid
871 AND ($matches)
874 return ($from,$where,'',@args);
877 sub optimize {
878 my $self = shift;
879 $self->dbh->do("ANALYZE $_") foreach $self->index_tables;
883 # Replace Bio::SeqFeatureI into database.
885 sub replace {
886 my $self = shift;
887 my $object = shift;
888 my $index_flag = shift || undef;
890 # ?? shouldn't need to do this
891 # $self->_load_class($object);
892 my $id = $object->primary_id;
893 my $features = $self->_feature_table;
895 my $sth = $self->_prepare(<<END);
896 REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES (?,?,?,?,?)
899 my ($seqid,$start,$end,$strand,$bin) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x5;
901 my $primary_tag = $object->primary_tag;
902 my $source_tag = $object->source_tag || '';
903 $primary_tag .= ":$source_tag";
904 my $typeid = $self->_typeid($primary_tag,1);
906 my $frozen = $self->no_blobs() ? 0 : $self->freeze($object);
908 $sth->bind_param(1, $id);
909 $sth->bind_param(2, $frozen, {TYPE => SQL_BLOB});
910 $sth->bind_param(3, $index_flag||0);
911 $sth->bind_param(4, $strand);
912 $sth->bind_param(5, $typeid);
914 $sth->execute() or $self->throw($sth->errstr);
916 my $dbh = $self->dbh;
917 $object->primary_id($dbh->func('last_insert_rowid')) unless defined $id;
919 $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
922 # doesn't work with this schema, since we have to update name and attribute
923 # tables which need object ids, which we can only know by replacing feats in
924 # the feature table one by one
925 sub bulk_replace {
926 my $self = shift;
927 my $index_flag = shift || undef;
928 my @objects = @_;
930 my $features = $self->_feature_table;
932 my @insert_values;
933 foreach my $object (@objects) {
934 my $id = $object->primary_id;
935 my (undef,undef,undef,$strand) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x4;
936 my $primary_tag = $object->primary_tag;
937 my $source_tag = $object->source_tag || '';
938 $primary_tag .= ":$source_tag";
939 my $typeid = $self->_typeid($primary_tag,1);
941 push(@insert_values, ($id,0,$index_flag||0,$strand,$typeid));
944 my @value_blocks;
945 for (1..@objects) {
946 push(@value_blocks, '(?,?,?,?,?)');
948 my $value_blocks = join(',', @value_blocks);
949 my $sql = qq{REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES $value_blocks};
951 my $sth = $self->_prepare($sql);
952 $sth->execute(@insert_values) or $self->throw($sth->errstr);
955 sub _get_location_and_bin {
956 my $self = shift;
957 my $obj = shift;
958 my $seqid = $self->_locationid($obj->seq_id);
959 my $start = $obj->start;
960 my $end = $obj->end;
961 my $strand = $obj->strand;
962 return ($seqid,$start,$end,$strand,$self->calculate_bin($start,$end));
966 # Insert one Bio::SeqFeatureI into database. primary_id must be undef
968 sub insert {
969 my $self = shift;
970 my $object = shift;
971 my $index_flag = shift || 0;
973 $self->_load_class($object);
974 defined $object->primary_id and $self->throw("$object already has a primary id");
976 my $features = $self->_feature_table;
977 my $sth = $self->_prepare(<<END);
978 INSERT INTO $features (id,object,"indexed") VALUES (?,?,?)
980 $sth->execute(undef,$self->freeze($object),$index_flag) or $self->throw($sth->errstr);
981 my $dbh = $self->dbh;
982 $object->primary_id($dbh->func('last_insert_rowid'));
983 $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
986 =head2 types
988 Title : types
989 Usage : @type_list = $db->types
990 Function: Get all the types in the database
991 Returns : array of Bio::DB::GFF::Typename objects
992 Args : none
993 Status : public
995 =cut
997 sub _genericid {
998 my $self = shift;
999 my ($table,$namefield,$name,$add_if_missing) = @_;
1000 my $qualified_table = $self->_qualify($table);
1001 my $sth = $self->_prepare(<<END);
1002 SELECT id FROM $qualified_table WHERE $namefield=?
1004 $sth->execute($name) or die $sth->errstr;
1005 my ($id) = $sth->fetchrow_array;
1006 $sth->finish;
1007 return $id if defined $id;
1008 return unless $add_if_missing;
1010 $sth = $self->_prepare(<<END);
1011 INSERT INTO $qualified_table ($namefield) VALUES (?)
1013 $sth->execute($name) or die $sth->errstr;
1014 my $dbh = $self->dbh;
1015 return $dbh->func('last_insert_rowid');
1018 # special-purpose store for bulk loading - write to a file rather than to the db
1020 sub _dump_store {
1021 my $self = shift;
1022 my $indexed = shift;
1024 my $count = 0;
1025 my $store_fh = $self->dump_filehandle('feature');
1026 my $dbh = $self->dbh;
1028 my $autoindex = $self->autoindex;
1030 for my $obj (@_) {
1031 my $id = $self->next_id;
1032 my ($seqid,$start,$end,$strand) = $indexed ? $self->_get_location_and_bin($obj) : (undef)x4;
1033 my $primary_tag = $obj->primary_tag;
1034 my $source_tag = $obj->source_tag || '';
1035 $primary_tag .= ":$source_tag";
1036 my $typeid = $self->_typeid($primary_tag,1);
1038 # Encode BLOB in hex so we can more easily import it into SQLite
1039 print $store_fh
1040 join("\t",$id,$typeid,$strand,$indexed,
1041 unpack('H*', $self->freeze($obj))),"\n";
1042 $obj->primary_id($id);
1043 $self->_update_indexes($obj) if $indexed && $autoindex;
1044 $count++;
1047 # remember whether we are have ever stored a non-indexed feature
1048 unless ($indexed or $self->{indexed_flag}++) {
1049 $self->subfeatures_are_indexed(0);
1051 $count;
1054 sub _dump_update_name_index {
1055 my $self = shift;
1056 my ($obj,$id) = @_;
1057 my $fh = $self->dump_filehandle('name');
1058 my $dbh = $self->dbh;
1059 my ($names,$aliases) = $self->feature_names($obj);
1060 # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
1061 print $fh join("\t",$id,$_,1),"\n" foreach @$names;
1062 print $fh join("\t",$id,$_,0),"\n" foreach @$aliases;
1065 sub _dump_update_attribute_index {
1066 my $self = shift;
1067 my ($obj,$id) = @_;
1068 my $fh = $self->dump_filehandle('attribute');
1069 my $dbh = $self->dbh;
1070 for my $tag ($obj->all_tags) {
1071 my $tagid = $self->_attributeid($tag);
1072 for my $value ($obj->each_tag_value($tag)) {
1073 # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
1074 print $fh join("\t",$id,$tagid,$value),"\n";
1079 sub _update_indexes {
1080 my $self = shift;
1081 my $obj = shift;
1082 defined (my $id = $obj->primary_id) or return;
1083 $self->SUPER::_update_indexes($obj);
1085 if ($self->{bulk_update_in_progress}) {
1086 $self->_dump_update_location_index($obj,$id);
1087 } else {
1088 $self->_update_location_index($obj,$id);
1092 sub _update_location_index {
1093 my $self = shift;
1094 my ($obj,$id) = @_;
1095 my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);
1097 my $table = $self->_feature_index_table;
1098 $self->_delete_index($table,$id);
1100 my ($sql,@args);
1102 if ($self->_has_spatial_index) {
1103 $sql = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
1104 @args = ($id,$seqid,$bin,$start,$end);
1105 } else {
1106 $sql = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
1107 @args = ($id,$seqid,$bin,$start,$end);
1110 my $sth = $self->_prepare($sql);
1111 $sth->execute(@args);
1112 $sth->finish;
1115 sub _dump_update_location_index {
1116 my $self = shift;
1117 my ($obj,$id) = @_;
1118 my $table = $self->_feature_index_table;
1119 my $fh = $self->dump_filehandle($table);
1120 my $dbh = $self->dbh;
1121 my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);
1122 my @args = $self->_has_spatial_index ? ($id,$seqid,$bin,$start,$end)
1123 : ($id,$seqid,$bin,$start,$end);
1124 print $fh join("\t",@args),"\n";
1130 =head1 AUTHOR
1132 Nathan Weeks <Nathan.Weeks@ars.usda.gov>
1134 Copyright (c) 2009 Nathan Weeks
1136 This library is free software; you can redistribute it and/or modify
1137 it under the same terms as Perl itself.
1139 =cut