package Bio::DB::SeqFeature::Store::DBI::SQLite;

#$Id$

=head1 NAME

Bio::DB::SeqFeature::Store::DBI::SQLite -- SQLite implementation of Bio::DB::SeqFeature::Store

=head1 SYNOPSIS

  use Bio::DB::SeqFeature::Store;

  # Open the sequence database
  my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::SQLite',
                                           -dsn     => '/path/to/database.db');

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

  # primary ID of the feature is changed to indicate its primary ID
  # in the database...
  my $id = $feature->primary_id;

  # get the feature back out
  my $f = $db->fetch($id);

  # change the feature and update it
  $f->start(100);
  $db->update($f) or die "Couldn't update!";

  # searching...
  # ...by id
  my @features = $db->fetch_many(@list_of_ids);

  # ...by name
  @features = $db->get_features_by_name('ZK909');

  # ...by alias
  @features = $db->get_features_by_alias('sma-3');
  # ...by type
  @features = $db->get_features_by_type('gene');
  # ...by location
  @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);

  # ...by attribute
  @features = $db->get_features_by_attribute({description => 'protein kinase'})

  # ...by the GFF "Note" field
  @result_list = $db->search_notes('kinase');

  # ...by arbitrary combinations of selectors
  @features = $db->features(-name       => $name,
                            -type       => $types,
                            -seq_id     => $seqid,
                            -start      => $start,
                            -end        => $end,
                            -attributes => $attributes);
  # ...using an iterator
  my $iterator = $db->get_seq_stream(-name       => $name,
                                     -type       => $types,
                                     -seq_id     => $seqid,
                                     -start      => $start,
                                     -end        => $end,
                                     -attributes => $attributes);

  while (my $feature = $iterator->next_seq) {
    # do something with the feature
  }
  # ...limiting the search to a particular region
  my $segment  = $db->segment('Chr1',5000=>6000);
  my @features = $segment->features(-type=>['mRNA','match']);

  # getting & storing sequence information
  # Warning: this returns a string, and not a PrimarySeq object
  $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  my $sequence = $db->fetch_sequence('Chr1',5000=>6000);

  # what feature types are defined in the database?
  my @types = $db->types;

  # create a new feature in the database
  my $feature = $db->new_feature(-primary_tag => 'mRNA',
                                 -seq_id      => 'chr3',
                                 -start       => 10000,
                                 -end         => 11000);
=head1 DESCRIPTION

Bio::DB::SeqFeature::Store::DBI::SQLite is the SQLite adaptor for
Bio::DB::SeqFeature::Store. You will not create it directly, but
instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.

See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
=head2 Using the SQLite adaptor

To establish a connection to the database, call
Bio::DB::SeqFeature::Store-E<gt>new(-adaptor=E<gt>'DBI::SQLite',@more_args). The
additional arguments are as follows:

  Argument name       Description
  -------------       -----------

  -dsn                The path to the SQLite database file.

  -namespace          A prefix to attach to each table. This allows you
                      to have several virtual databases in the same
                      physical database.

  -temp               Boolean flag. If true, a temporary database
                      will be created and destroyed as soon as
                      the Store object goes out of scope. (synonym -temporary)

  -autoindex          Boolean flag. If true, features in the database will be
                      reindexed every time they change. This is the default.

  -tmpdir             Directory in which to place temporary files during "fast" loading.
                      Defaults to File::Spec->tmpdir(). (synonyms -dump_dir, -dumpdir, -tmp)

  -dbi_options        A hashref to pass to DBI->connect's 4th argument, the "attributes."
                      (synonyms -options, -dbi_attr)

  -write              Pass true to open database for writing or updating.

If successful, a new instance of
Bio::DB::SeqFeature::Store::DBI::SQLite will be returned.
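For example, to open an existing database file for both reading and
writing (the path below is illustrative):

  my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::SQLite',
                                           -dsn     => '/path/to/features.db',
                                           -write   => 1);
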
In addition to the standard methods supported by all well-behaved
Bio::DB::SeqFeature::Store databases, the following adaptor-specific
methods are provided. These are described in the next sections.

=cut
use strict;

use base 'Bio::DB::SeqFeature::Store::DBI::mysql';
use Bio::DB::SeqFeature::Store::DBI::Iterator;
use DBI qw(:sql_types);
use Memoize;
use Cwd qw(abs_path getcwd);
use Bio::DB::GFF::Util::Rearrange 'rearrange';
use Bio::SeqFeature::Lite;
use File::Spec;
use constant DEBUG => 0;
use constant EXPERIMENTAL_COVERAGE => 1;

# Using same limits as MySQL adaptor so I don't have to make something up.
use constant MAX_INT =>  2_147_483_647;
use constant MIN_INT => -2_147_483_648;
use constant SUMMARY_BIN_SIZE => 1000;  # we checkpoint coverage this often, about 20 meg overhead per feature type on hg
use constant USE_SPATIAL => 0;
# The binning scheme places each feature into a bin.
# Bins are variably sized as powers of two. For example,
# there are 585 bins of size 2**17 (131072 bases)
my (@BINS,%BINS);
@BINS = map {2**$_} (17, 20, 23, 26, 29); # TO DO: experiment with different bin sizes
my $start = 0;
for my $b (sort {$b<=>$a} @BINS) {
  $BINS{$b} = $start;
  $start   += $BINS[-1]/$b;
}
# my %BINS = (
#             2**11 => 37449,
#             2**14 => 4681,
#             2**17 => 585,
#             2**20 => 73,
#             2**23 => 9,
#             2**26 => 1,
#             2**29 => 0
#            );
# my @BINS = sort {$a<=>$b} keys %BINS;
sub calculate_bin {
  my $self = shift;
  my ($start,$end) = @_;

  my $len = $end - $start;
  for my $bin (@BINS) {
    next if $len > $bin;
    # possibly fits here
    my $binstart = int $start/$bin;
    my $binend   = int $end/$bin;
    return $binstart+$BINS{$bin} if $binstart == $binend;
  }

  die "unreasonable coordinates ",$start+1,"..$end";
}
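
# Worked example (using the bin offsets tabulated above): a feature spanning
# 1_000..5_000 is 4_000 bp long, so the smallest bin size (2**17 = 131_072)
# is tried first; int(1_000/131_072) == int(5_000/131_072) == 0, and
# calculate_bin() returns 0 + 585 = 585.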
sub search_bins {
  my $self = shift;
  my ($start,$end) = @_;
  my @results;

  for my $bin (@BINS) {
    my $binstart = int $start/$bin;
    my $binend   = int $end/$bin;
    push @results,$binstart+$BINS{$bin}..$binend+$BINS{$bin};
  }
  return @results;
}

# object initialization
sub init {
  my $self = shift;
  my ($dsn,
      $is_temporary,
      $autoindex,
      $namespace,
      $dump_dir,
      $user,
      $pass,
      $dbi_options,
      $writeable,
      $create,
     ) = rearrange(['DSN',
                    ['TEMP','TEMPORARY'],
                    'AUTOINDEX',
                    'NAMESPACE',
                    ['DUMP_DIR','DUMPDIR','TMP','TMPDIR'],
                    'USER',
                    ['PASS','PASSWD','PASSWORD'],
                    ['OPTIONS','DBI_OPTIONS','DBI_ATTR'],
                    ['WRITE','WRITEABLE'],
                    'CREATE',
                   ],@_);
  $dbi_options ||= {};
  $writeable = 1 if $is_temporary or $dump_dir;

  $dsn or $self->throw("Usage: ".__PACKAGE__."->init(-dsn => \$dbh || \$dsn)");

  my $dbh;
  if (ref $dsn) {
    $dbh = $dsn;
  } else {
    $dsn = "dbi:SQLite:$dsn" unless $dsn =~ /^dbi:/;
    $dbh = DBI->connect($dsn,$user,$pass,$dbi_options) or $self->throw($DBI::errstr);
    $dbh->do("PRAGMA synchronous = OFF;");   # makes writes much faster
    $dbh->do("PRAGMA temp_store = MEMORY;"); # less disk I/O; some speedup
    $dbh->do("PRAGMA cache_size = 20000;");  # less disk I/O; some speedup
    # Keep track of database file location
    my $cwd = getcwd;
    my ($db_file) = ($dsn =~ m/(?:db(?:name)?|database)=(.+)$/);
    $self->{dbh_file} = "$cwd/$db_file";
  }
  $self->{dbh}       = $dbh;
  $self->{is_temp}   = $is_temporary;
  $self->{namespace} = $namespace;
  $self->{writeable} = $writeable;

  $self->default_settings;
  $self->autoindex($autoindex) if defined $autoindex;
  $self->dumpdir($dump_dir)    if $dump_dir;
  if ($self->is_temp) {
    $self->init_tmp_database();
  } elsif ($create) {
    $self->init_database('erase');
  }
}
sub table_definitions {
  my $self = shift;
  my $defs =
    {
     feature => <<END,
(
  id        integer primary key autoincrement,
  typeid    integer not null,
  strand    integer default 0,
  "indexed" integer default 1,
  object    blob not null
)
END

     locationlist => <<END,
(
  id      integer primary key autoincrement,
  seqname text    not null
);
create index index_locationlist on locationlist (seqname);
END

     typelist => <<END,
(
  id  integer primary key autoincrement,
  tag text    not null
);
create index index_typelist on typelist (tag);
END

     name => <<END,
(
  id           integer not null,
  name         text    not null,
  display_name integer default 0
);
create index index_name_id on name(id);
create index index_name_name on name(name);
END

     attribute => <<END,
(
  id              integer not null,
  attribute_id    integer not null,
  attribute_value text
);
create index index_attribute_id on attribute(attribute_id);
create index index_attribute_value on attribute(attribute_value);
END

     attributelist => <<END,
(
  id  integer primary key autoincrement,
  tag text    not null
);
create index index_attributelist_id on attributelist(id);
create index index_attributelist_tag on attributelist(tag);
END

     parent2child => <<END,
(
  id    integer not null,
  child integer not null
);
create unique index index_parent2child_id_child on parent2child(id,child);
END

     meta => <<END,
(
  name  text primary key,
  value text not null
)
END

     sequence => <<END,
(
  id       integer not null,
  offset   integer not null,
  sequence blob,
  primary key(id,offset)
)
END
    };

  unless ($self->_has_spatial_index) {
    $defs->{feature_location} = <<END;
(
  id    int(10) primary key,
  seqid int(10),
  bin   int,
  start int,
  end   int
);
create index index_feature_location on feature_location(seqid,bin,start,end);
END
  }

  if (EXPERIMENTAL_COVERAGE) {
    $defs->{interval_stats} = <<END;
(
  typeid    integer not null,
  seqid     integer not null,
  bin       integer not null,
  cum_count integer not null,
  unique(typeid,seqid,bin)
)
END
  }

  return $defs;
}
sub _init_database {
  my $self = shift;

  # must do this first before calling table_definitions
  $self->_create_spatial_index;
  $self->SUPER::_init_database(@_);
}

sub init_tmp_database {
  my $self  = shift;
  my $erase = shift;
  $self->_create_spatial_index;
  $self->SUPER::init_tmp_database(@_);
}

sub _create_spatial_index{
  my $self = shift;
  my $dbh  = $self->dbh;
  local $dbh->{PrintError} = 0;
  $dbh->do("DROP TABLE IF EXISTS feature_index"); # spatial index
  if (USE_SPATIAL) {
    $dbh->do("CREATE VIRTUAL TABLE feature_index USING RTREE(id,seqid,bin,start,end)");
  }
}

sub _has_spatial_index {
  my $self = shift;
  return $self->{'_has_spatial_index'} if exists $self->{'_has_spatial_index'};
  my $dbh  = $self->dbh;
  my ($count) = $dbh->selectrow_array("select count(*) from sqlite_master where name='feature_index'");
  return $self->{'_has_spatial_index'} = $count;
}
sub _finish_bulk_update {
  my $self = shift;
  my $dbh  = $self->dbh;
  my $dir  = $self->{dumpdir} || '.';

  $self->begin_work; # making this a transaction greatly improves performance

  for my $table ('feature', $self->index_tables) {
    my $fh   = $self->dump_filehandle($table);
    my $path = $self->dump_path($table);
    $fh->close;

    open $fh, '<', $path or $self->throw("Could not read file '$path': $!");
    my $qualified_table = $self->_qualify($table);

    my $sth;
    if ($table =~ /feature$/) {
      $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?,?,?)");

      while (<$fh>) {
        chomp();
        my ($id,$typeid,$strand,$indexed,$obj) = split(/\t/);
        $sth->bind_param(1, $id);
        $sth->bind_param(2, $typeid);
        $sth->bind_param(3, $strand);
        $sth->bind_param(4, $indexed);
        $sth->bind_param(5, pack('H*',$obj), {TYPE => SQL_BLOB});
        $sth->execute();
      }
    } else {
      my $feature_index = $self->_feature_index_table;
      if ($table =~ /parent2child$/) {
        $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?)");
      } elsif ($table =~ /$feature_index$/) {
        $sth = $dbh->prepare(
          $self->_has_spatial_index ? "REPLACE INTO $qualified_table VALUES (?,?,?,?,?)"
                                    : "REPLACE INTO $qualified_table (id,seqid,bin,start,end) VALUES (?,?,?,?,?)"
        );
      } else { # attribute or name
        $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?)");
      }

      while (<$fh>) {
        chomp();
        $sth->execute(split(/\t/));
      }
    }
    $fh->close();
    unlink $path;
  }
  $self->commit; # commit the transaction
  delete $self->{bulk_update_in_progress};
  delete $self->{filehandles};
}
sub index_tables {
  my $self = shift;
  my @t    = $self->SUPER::index_tables;
  return (@t,$self->_feature_index_table);
}

sub _enable_keys  { }  # nullop
sub _disable_keys { }  # nullop
sub _fetch_indexed_features_sql {
  my $self           = shift;
  my $location_table = $self->_qualify('feature_location');
  my $feature_table  = $self->_qualify('feature');
  return <<END;
SELECT typeid,seqid,start-1,end
  FROM $location_table as l,$feature_table as f
 WHERE l.id=f.id AND f.\"indexed\"=1
  ORDER BY typeid,seqid,start
END
}
# get primary sequence between start and end
sub _fetch_sequence {
  my $self = shift;
  my ($seqid,$start,$end) = @_;

  # backward compatibility to the old days when I liked reverse complementing
  # dna by specifying $start > $end
  my $reversed;
  if (defined $start && defined $end && $start > $end) {
    $reversed++;
    ($start,$end) = ($end,$start);
  }
  $start-- if defined $start;
  $end--   if defined $end;

  my $offset1 = $self->_offset_boundary($seqid,$start || 'left');
  my $offset2 = $self->_offset_boundary($seqid,$end   || 'right');
  my $sequence_table     = $self->_sequence_table;
  my $locationlist_table = $self->_locationlist_table;

  # CROSS JOIN gives a hint to the SQLite query optimizer -- mucho speedup!
  my $sth = $self->_prepare(<<END);
SELECT sequence,offset
   FROM $locationlist_table as ll CROSS JOIN $sequence_table as s
   WHERE ll.id=s.id
     AND ll.seqname= ?
     AND offset >= ?
     AND offset <= ?
   ORDER BY offset
END

  my $seq = '';
  $sth->execute($seqid,$offset1,$offset2) or $self->throw($sth->errstr);

  while (my($frag,$offset) = $sth->fetchrow_array) {
    substr($frag,0,$start-$offset) = '' if defined $start && $start > $offset;
    $seq .= $frag;
  }
  substr($seq,$end-$start+1) = '' if defined $end && $end-$start+1 < length($seq);
  if ($reversed) {
    $seq = reverse $seq;
    $seq =~ tr/gatcGATC/ctagCTAG/;
  }
  $sth->finish;
  $seq;
}
sub _offset_boundary {
  my $self = shift;
  my ($seqid,$position) = @_;

  my $sequence_table     = $self->_sequence_table;
  my $locationlist_table = $self->_locationlist_table;

  my $sql;
  # use "CROSS JOIN" to give a hint to the SQLite query optimizer.
  $sql = $position eq 'left'  ? "SELECT min(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
        :$position eq 'right' ? "SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
                              : "SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=? AND offset<=?";
  my $sth = $self->_prepare($sql);
  my @args = $position =~ /^-?\d+$/ ? ($seqid,$position) : ($seqid);
  $sth->execute(@args) or $self->throw($sth->errstr);
  my $boundary = $sth->fetchall_arrayref->[0][0];
  $sth->finish;
  return $boundary;
}
# Efficiently fetch a series of IDs from the database
# Can pass an array or an array ref
sub _fetch_many {
  my $self = shift;
  @_ or $self->throw('usage: fetch_many($id1,$id2,$id3...)');
  my $ids = join ',',map {ref($_) ? @$_ : $_} @_ or return;
  my $features = $self->_feature_table;

  my $sth = $self->_prepare(<<END);
SELECT id,object FROM $features WHERE id IN ($ids)
END
  $sth->execute() or $self->throw($sth->errstr);
  return $self->_sth2objs($sth);
}
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $fromtable,
      $iterator,
      $sources
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                    'NAME','CLASS','ALIASES',
                    ['TYPES','TYPE','PRIMARY_TAG'],
                    ['ATTRIBUTES','ATTRIBUTE'],
                    'RANGE_TYPE',
                    'FROM_TABLE',
                    'ITERATOR',
                    ['SOURCE','SOURCES']
                   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my $feature_table = $self->_feature_table;
  @from = "$feature_table as f";

  if (defined $name) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    # last argument is the join field
    my ($from,$where,$group,@a) = $self->_name_sql($name,$allow_aliases,'f.id');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  if (defined $seq_id) {
    # last argument is the name of the features table
    my ($from,$where,$group,@a) = $self->_location_sql($seq_id,$start,$end,$range_type,$strand,'f');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  if (defined($sources)) {
    my @sources = ref($sources) eq 'ARRAY' ? @{$sources} : ($sources);
    if (defined($types)) {
      my @types = ref($types) eq 'ARRAY' ? @{$types} : ($types);
      my @final_types;
      foreach my $type (@types) {
        # *** not sure what to do if user supplies both -source and -type
        # where the type includes a source!
        if ($type =~ /:/) {
          push(@final_types, $type);
        }
        else {
          foreach my $source (@sources) {
            push(@final_types, $type.':'.$source);
          }
        }
      }
      $types = \@final_types;
    }
    else {
      $types = [map { ':'.$_ } @sources];
    }
  }
  if (defined($types)) {
    # last argument is the name of the features table
    my ($from,$where,$group,@a) = $self->_types_sql($types,'f');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  if (defined $attributes) {
    # last argument is the join field
    my ($from,$where,$group,@a) = $self->_attributes_sql($attributes,'f.id');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  if (defined $fromtable) {
    # last argument is the join field
    my ($from,$where,$group,@a) = $self->_from_table_sql($fromtable,'f.id');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  # if no other criteria are specified, then
  # only fetch indexed (i.e. top level objects)
  @where = '"indexed"=1' unless @where;

  my $from  = join ', ',@from;
  my $where = join ' AND ',map {"($_)"} @where;
  my $group = join ', ',@group;
  $group    = "GROUP BY $group" if @group;

  my $query = <<END;
SELECT f.id,f.object
  FROM $from
  WHERE $where
  $group
END

  $self->_print_query($query,@args) if DEBUG || $self->debug;

  my $sth = $self->_prepare($query);
  $sth->execute(@args) or $self->throw($sth->errstr);
  return $iterator ? Bio::DB::SeqFeature::Store::DBI::Iterator->new($sth,$self) : $self->_sth2objs($sth);
}
sub _make_attribute_group {
  my $self                     = shift;
  my ($table_name,$attributes) = @_;
  my $key_count = keys %$attributes or return;
  my $count     = $key_count-1;
  return "f.id HAVING count(f.id)>$count";
}
sub _location_sql {
  my $self = shift;
  my ($seq_id,$start,$end,$range_type,$strand,$location) = @_;

  # the additional join on the location_list table badly impacts performance
  # so we build a copy of the table in memory
  my $seqid = $self->_locationid_nocreate($seq_id) || 0; # zero is an invalid primary ID, so will return empty

  my $feature_index = $self->_feature_index_table;
  my $from = "$feature_index as fi";

  my ($bin_where,@bin_args);
  if (defined $start && defined $end && !$self->_has_spatial_index) {
    my @bins   = $self->search_bins($start,$end);
    $bin_where = ' AND bin in ('.join(',',@bins).')';
  }

  $start = MIN_INT unless defined $start;
  $end   = MAX_INT unless defined $end;

  my ($range,@range_args);
  if ($range_type eq 'overlaps') {
    $range = "fi.end>=? AND fi.start<=?".$bin_where;
    @range_args = ($start,$end,@bin_args);
  } elsif ($range_type eq 'contains') {
    $range = "fi.start>=? AND fi.end<=?".$bin_where;
    @range_args = ($start,$end,@bin_args);
  } elsif ($range_type eq 'contained_in') {
    $range = "fi.start<=? AND fi.end>=?";
    @range_args = ($start,$end);
  } else {
    $self->throw("range_type must be one of 'overlaps', 'contains' or 'contained_in'");
  }

  if (defined $strand) {
    $range .= " AND strand=?";
    push @range_args,$strand;
  }

  my $where = <<END;
   fi.seqid=?
   AND   $location.id=fi.id
   AND   $range
END

  my $group = '';

  my @args = ($seqid,@range_args);
  return ($from,$where,$group,@args);
}
sub _feature_index_table {
  my $self = shift;
  return $self->_has_spatial_index ? $self->_qualify('feature_index')
                                   : $self->_qualify('feature_location') }
# Do a case-insensitive search a la the PostgreSQL adaptor
sub _name_sql {
  my $self = shift;
  my ($name,$allow_aliases,$join) = @_;
  my $name_table = $self->_name_table;

  my $from = "$name_table as n";
  my ($match,$string) = $self->_match_sql($name);

  my $where = "n.id=$join AND lower(n.name) $match";
  $where   .= " AND n.display_name>0" unless $allow_aliases;
  return ($from,$where,'',$string);
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_names,$limit) = @_;
  my @words = map {quotemeta($_)} split /\s+/,$search_string;

  my $name_table          = $self->_name_table;
  my $attribute_table     = $self->_attribute_table;
  my $attributelist_table = $self->_attributelist_table;
  my $type_table          = $self->_type_table;
  my $typelist_table      = $self->_typelist_table;

  my @tags    = @$attribute_names;
  my $tag_sql = join ' OR ',("al.tag=?") x @tags;

  my $perl_regexp = join '|',@words;

  my @wild_card_words = map { "%$_%" } @words;
  my $sql_regexp = join ' OR ',("a.attribute_value LIKE ?") x @words;
  # CROSS JOIN disables SQLite's table reordering optimization
  my $sql = <<END;
SELECT name,attribute_value,tl.tag,n.id
  FROM $attributelist_table   AS al
  CROSS JOIN $attribute_table AS a  ON al.id = a.attribute_id
  CROSS JOIN $name_table      AS n  ON n.id  = a.id
  CROSS JOIN $type_table      AS t  ON t.id  = n.id
  CROSS JOIN $typelist_table  AS tl ON tl.id = t.typeid
  WHERE ($tag_sql)
  AND ($sql_regexp)
  AND n.display_name=1
END
  $sql .= "LIMIT $limit" if defined $limit;
  $self->_print_query($sql,@tags,@words) if DEBUG || $self->debug;
  my $sth = $self->_prepare($sql);
  $sth->execute(@tags,@wild_card_words) or $self->throw($sth->errstr);

  my @results;
  while (my($name,$value,$type,$id) = $sth->fetchrow_array) {
    my (@hits) = $value =~ /$perl_regexp/ig;
    my @words_in_row = split /\b/,$value;
    my $score = int(@hits*100/@words/@words_in_row);
    push @results,[$name,$value,$score,$type,$id];
  }
  $sth->finish;
  @results = sort {$b->[2]<=>$a->[2]} @results;
  return @results;
}
sub _match_sql {
  my $self = shift;
  my $name = shift;

  my ($match,$string);
  if ($name =~ /(?:^|[^\\])[*?]/) {
    $name =~ s/(^|[^\\])([%_])/$1\\$2/g;
    $name =~ s/(^|[^\\])\*/$1%/g;
    $name =~ s/(^|[^\\])\?/$1_/g;
    $match  = "LIKE ?";
    $string = $name;
  } else {
    $match  = "= lower(?)";
    $string = lc($name);
  }
  return ($match,$string);
}
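
# Illustrative example: a query name such as 'unc-13*' is converted to the
# SQL pattern 'unc-13%' and matched with "LIKE ?", while a plain name such
# as 'unc-13' is lower-cased and matched with "= lower(?)".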
sub _attributes_sql {
  my $self = shift;
  my ($attributes,$join) = @_;

  my ($wf,@bind_args)        = $self->_make_attribute_where('a','al',$attributes);
  my ($group_by,@group_args) = $self->_make_attribute_group('a',$attributes);

  my $attribute_table     = $self->_attribute_table;
  my $attributelist_table = $self->_attributelist_table;

  my $from = "$attribute_table AS a INDEXED BY index_attribute_id, $attributelist_table AS al";

  my $where = <<END;
  a.id=$join
  AND   a.attribute_id=al.id
  AND ($wf)
END

  my $group = $group_by;

  my @args = (@bind_args,@group_args);
  return ($from,$where,$group,@args);
}
# overridden because of case-sensitivity of matches
sub _types_sql {
  my $self = shift;
  my ($types,$type_table) = @_;
  my ($primary_tag,$source_tag);

  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $typelist = $self->_typelist_table;
  my $from = "$typelist AS tl";

  my (@matches,@args);

  for my $type (@types) {

    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }

    if (length $source_tag) {
      push @matches,"lower(tl.tag)=lower(?)";
      push @args,"$primary_tag:$source_tag";
    } else {
      push @matches,"tl.tag LIKE ?";
      push @args,"$primary_tag:%";
    }
  }
  my $matches = join ' OR ',@matches;

  my $where = <<END;
  tl.id=$type_table.typeid
   AND   ($matches)
END

  return ($from,$where,'',@args);
}
sub optimize {
  my $self = shift;
  $self->dbh->do("ANALYZE $_") foreach $self->index_tables;
}
# Replace Bio::SeqFeatureI into database.
sub replace {
  my $self       = shift;
  my $object     = shift;
  my $index_flag = shift || undef;

  # ?? shouldn't need to do this
  # $self->_load_class($object);
  my $id       = $object->primary_id;
  my $features = $self->_feature_table;

  my $sth = $self->_prepare(<<END);
REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES (?,?,?,?,?)
END

  my ($seqid,$start,$end,$strand,$bin) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x6;

  my $primary_tag = $object->primary_tag;
  my $source_tag  = $object->source_tag || '';
  $primary_tag   .= ":$source_tag";
  my $typeid      = $self->_typeid($primary_tag,1);

  my $frozen = $self->no_blobs() ? 0 : $self->freeze($object);

  $sth->bind_param(1, $id);
  $sth->bind_param(2, $frozen, {TYPE => SQL_BLOB});
  $sth->bind_param(3, $index_flag||0);
  $sth->bind_param(4, $strand);
  $sth->bind_param(5, $typeid);

  $sth->execute() or $self->throw($sth->errstr);

  my $dbh = $self->dbh;
  $object->primary_id($dbh->func('last_insert_rowid')) unless defined $id;

  $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
}
# doesn't work with this schema, since we have to update name and attribute
# tables which need object ids, which we can only know by replacing feats in
# the feature table one by one
sub bulk_replace {
  my $self       = shift;
  my $index_flag = shift || undef;
  my @objects    = @_;

  my $features = $self->_feature_table;

  my @insert_values;
  foreach my $object (@objects) {
    my $id = $object->primary_id;
    my (undef,undef,undef,$strand) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x4;
    my $primary_tag = $object->primary_tag;
    my $source_tag  = $object->source_tag || '';
    $primary_tag   .= ":$source_tag";
    my $typeid      = $self->_typeid($primary_tag,1);

    push(@insert_values, ($id,0,$index_flag||0,$strand,$typeid));
  }

  my @value_blocks;
  for (1..@objects) {
    push(@value_blocks, '(?,?,?,?,?)');
  }
  my $value_blocks = join(',', @value_blocks);
  my $sql = qq{REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES $value_blocks};

  my $sth = $self->_prepare($sql);
  $sth->execute(@insert_values) or $self->throw($sth->errstr);
}
sub _get_location_and_bin {
  my $self = shift;
  my $obj  = shift;
  my $seqid  = $self->_locationid($obj->seq_id||'');
  my $start  = $obj->start;
  my $end    = $obj->end;
  my $strand = $obj->strand;
  return ($seqid,$start,$end,$strand,$self->calculate_bin($start,$end));
}
# Insert one Bio::SeqFeatureI into database. primary_id must be undef
sub insert {
  my $self       = shift;
  my $object     = shift;
  my $index_flag = shift || 0;

  $self->_load_class($object);
  defined $object->primary_id and $self->throw("$object already has a primary id");

  my $features = $self->_feature_table;
  my $sth = $self->_prepare(<<END);
INSERT INTO $features (id,object,"indexed") VALUES (?,?,?)
END
  $sth->execute(undef,$self->freeze($object),$index_flag) or $self->throw($sth->errstr);
  my $dbh = $self->dbh;
  $object->primary_id($dbh->func('last_insert_rowid'));
  $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
}
=head2 toplevel_types

 Title   : toplevel_types
 Usage   : @type_list = $db->toplevel_types
 Function: Get the toplevel types in the database
 Returns : array of Bio::DB::GFF::Typename objects
 Args    : none
 Status  : public

This is similar to types() but only returns the types of
INDEXED (toplevel) features.

=cut
sub toplevel_types {
  my $self = shift;
  eval "require Bio::DB::GFF::Typename"
    unless Bio::DB::GFF::Typename->can('new');
  my $typelist_table = $self->_typelist_table;
  my $feature_table  = $self->_feature_table;
  my $sql = <<END;
SELECT distinct(tag) from $typelist_table as tl,$feature_table as f
 WHERE tl.id=f.typeid
   AND f."indexed"=1
END
  $self->_print_query($sql) if DEBUG || $self->debug;
  my $sth = $self->_prepare($sql);
  $sth->execute() or $self->throw($sth->errstr);

  my @results;
  while (my($tag) = $sth->fetchrow_array) {
    push @results,Bio::DB::GFF::Typename->new($tag);
  }
  $sth->finish;
  return @results;
}
sub _genericid {
  my $self = shift;
  my ($table,$namefield,$name,$add_if_missing) = @_;
  my $qualified_table = $self->_qualify($table);
  my $sth = $self->_prepare(<<END);
SELECT id FROM $qualified_table WHERE lower($namefield)=lower(?)
END
  $sth->execute($name) or die $sth->errstr;
  my ($id) = $sth->fetchrow_array;
  $sth->finish;
  return $id if defined $id;
  return unless $add_if_missing;

  $sth = $self->_prepare(<<END);
INSERT INTO $qualified_table ($namefield) VALUES (?)
END
  $sth->execute($name) or die $sth->errstr;
  my $dbh = $self->dbh;
  return $dbh->func('last_insert_rowid');
}
# special-purpose store for bulk loading - write to a file rather than to the db
sub _dump_store {
  my $self    = shift;
  my $indexed = shift;

  my $count    = 0;
  my $store_fh = $self->dump_filehandle('feature');
  my $dbh      = $self->dbh;

  my $autoindex = $self->autoindex;

  for my $obj (@_) {
    my $id = $self->next_id;
    my ($seqid,$start,$end,$strand) = $indexed ? $self->_get_location_and_bin($obj) : (undef)x4;
    my $primary_tag = $obj->primary_tag;
    my $source_tag  = $obj->source_tag || '';
    $primary_tag   .= ":$source_tag";
    my $typeid      = $self->_typeid($primary_tag,1);

    # Encode BLOB in hex so we can more easily import it into SQLite
    print $store_fh
      join("\t",$id,$typeid,$strand,$indexed,
           unpack('H*', $self->freeze($obj))),"\n";
    $obj->primary_id($id);
    $self->_update_indexes($obj) if $indexed && $autoindex;
    $count++;
  }

  # remember whether we have ever stored a non-indexed feature
  unless ($indexed or $self->{indexed_flag}++) {
    $self->subfeatures_are_indexed(0);
  }
  $count;
}
sub _dump_update_name_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my $fh  = $self->dump_filehandle('name');
  my $dbh = $self->dbh;
  my ($names,$aliases) = $self->feature_names($obj);
  # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
  print $fh join("\t",$id,lc($_),1),"\n" foreach @$names;
  print $fh join("\t",$id,lc($_),0),"\n" foreach @$aliases;
}

sub _update_name_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my $name       = $self->_name_table;
  my $primary_id = $obj->primary_id;

  $self->_delete_index($name,$id);
  my ($names,$aliases) = $self->feature_names($obj);

  my $sth = $self->_prepare("INSERT INTO $name (id,name,display_name) VALUES (?,?,?)");

  $sth->execute($id,lc $_,1) or $self->throw($sth->errstr) foreach @$names;
  $sth->execute($id,lc $_,0) or $self->throw($sth->errstr) foreach @$aliases;
  $sth->finish;
}
sub _dump_update_attribute_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my $fh  = $self->dump_filehandle('attribute');
  my $dbh = $self->dbh;
  for my $tag ($obj->all_tags) {
    my $tagid = $self->_attributeid($tag);
    for my $value ($obj->each_tag_value($tag)) {
      # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
      print $fh join("\t",$id,$tagid,$value),"\n";
    }
  }
}

sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id = $obj->primary_id) or return;
  $self->SUPER::_update_indexes($obj);

  if ($self->{bulk_update_in_progress}) {
    $self->_dump_update_location_index($obj,$id);
  } else {
    $self->_update_location_index($obj,$id);
  }
}
sub _update_location_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);

  my $table = $self->_feature_index_table;
  $self->_delete_index($table,$id);

  my ($sql,@args);

  if ($self->_has_spatial_index) {
    $sql  = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
    @args = ($id,$seqid,$bin,$start,$end);
  } else {
    $sql  = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
    @args = ($id,$seqid,$bin,$start,$end);
  }

  my $sth = $self->_prepare($sql);
  $sth->execute(@args);
  $sth->finish;
}

sub _dump_update_location_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my $table = $self->_feature_index_table;
  my $fh    = $self->dump_filehandle($table);
  my $dbh   = $self->dbh;
  my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);
  my @args = $self->_has_spatial_index ? ($id,$seqid,$bin,$start,$end)
                                       : ($id,$seqid,$bin,$start,$end);
  print $fh join("\t",@args),"\n";
}
sub DESTROY {
  my $self = shift;
  # Remove filehandles, so temporary files can be properly deleted
  if (%DBI::installed_drh) {
    DBI->disconnect_all;
    %DBI::installed_drh = ();
  }
  undef $self->{dbh};
}

1;
=head1 AUTHOR

Nathan Weeks - Nathan.Weeks@ars.usda.gov

Copyright (c) 2009 Nathan Weeks

Modified 2010 to support cumulative statistics by Lincoln Stein
<lincoln.stein@gmail.com>.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. See the Bioperl license for
more details.

=cut