package Bio::DB::SeqFeature::Store::DBI::SQLite;

#$Id$

=head1 NAME

Bio::DB::SeqFeature::Store::DBI::SQLite -- SQLite implementation of Bio::DB::SeqFeature::Store

=head1 SYNOPSIS

  use Bio::DB::SeqFeature::Store;

  # Open the sequence database
  my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::SQLite',
                                           -dsn     => '/path/to/database.db');

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

  # primary ID of the feature is changed to indicate its primary ID
  # in the database...
  my $id = $feature->primary_id;

  # get the feature back out
  my $f = $db->fetch($id);

  # change the feature and update it
  $f->start(100);
  $db->update($f) or die "Couldn't update!";

  # searching...
  # ...by id
  my @features = $db->fetch_many(@list_of_ids);

  # ...by name
  @features = $db->get_features_by_name('ZK909');

  # ...by alias
  @features = $db->get_features_by_alias('sma-3');
  # ...by type
  @features = $db->get_features_by_type('gene');
  # ...by location
  @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);

  # ...by attribute
  @features = $db->get_features_by_attribute({description => 'protein kinase'})

  # ...by the GFF "Note" field
  @result_list = $db->search_notes('kinase');

  # ...by arbitrary combinations of selectors
  @features = $db->features(-name       => $name,
                            -type       => $types,
                            -seq_id     => $seqid,
                            -start      => $start,
                            -end        => $end,
                            -attributes => $attributes);
  # ...using an iterator
  my $iterator = $db->get_seq_stream(-name       => $name,
                                     -type       => $types,
                                     -seq_id     => $seqid,
                                     -start      => $start,
                                     -end        => $end,
                                     -attributes => $attributes);

  while (my $feature = $iterator->next_seq) {
    # do something with the feature
  }
  # ...limiting the search to a particular region
  my $segment  = $db->segment('Chr1',5000=>6000);
  my @features = $segment->features(-type=>['mRNA','match']);

  # getting & storing sequence information
  # Warning: this returns a string, and not a PrimarySeq object
  $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  my $sequence = $db->fetch_sequence('Chr1',5000=>6000);

  # what feature types are defined in the database?
  my @types = $db->types;

  # create a new feature in the database
  my $feature = $db->new_feature(-primary_tag => 'mRNA',
                                 -seq_id      => 'chr3',
                                 -start       => 10000,
                                 -end         => 11000);
=head1 DESCRIPTION

Bio::DB::SeqFeature::Store::DBI::SQLite is the SQLite adaptor for
Bio::DB::SeqFeature::Store. You will not create it directly, but
instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.

See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
=head2 Using the SQLite adaptor

To establish a connection to the database, call
Bio::DB::SeqFeature::Store-E<gt>new(-adaptor=E<gt>'DBI::SQLite',@more_args). The
additional arguments are as follows:

  Argument name       Description
  -------------       -----------

  -dsn                The path to the SQLite database file.

  -namespace          A prefix to attach to each table. This allows you
                      to have several virtual databases in the same
                      physical database.

  -temp               Boolean flag. If true, a temporary database
                      will be created and destroyed as soon as
                      the Store object goes out of scope. (synonym -temporary)

  -autoindex          Boolean flag. If true, features in the database will be
                      reindexed every time they change. This is the default.

  -tmpdir             Directory in which to place temporary files during "fast" loading.
                      Defaults to File::Spec->tmpdir(). (synonyms -dump_dir, -dumpdir, -tmp)

  -dbi_options        A hashref to pass to DBI->connect's 4th argument, the "attributes."
                      (synonyms -options, -dbi_attr)

  -write              Pass true to open database for writing or updating.

If successful, a new instance of
Bio::DB::SeqFeature::Store::DBI::SQLite will be returned.
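For example, to open an existing database file for both reading and
writing (the path below is illustrative):

  my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::SQLite',
                                           -dsn     => '/path/to/features.db',
                                           -write   => 1);
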
In addition to the standard methods supported by all well-behaved
Bio::DB::SeqFeature::Store databases, the following adaptor-specific
methods are provided. These are described in the next sections.

=cut
use strict;

use base 'Bio::DB::SeqFeature::Store::DBI::mysql';
use Bio::DB::SeqFeature::Store::DBI::Iterator;
use DBI qw(:sql_types);
use Memoize;
use Cwd qw(abs_path getcwd);
use Bio::DB::GFF::Util::Rearrange 'rearrange';
use Bio::SeqFeature::Lite;
use File::Spec;
use constant DEBUG => 0;
use constant EXPERIMENTAL_COVERAGE => 1;

# Using same limits as MySQL adaptor so I don't have to make something up.
use constant MAX_INT =>  2_147_483_647;
use constant MIN_INT => -2_147_483_648;
use constant SUMMARY_BIN_SIZE => 1000;  # we checkpoint coverage this often, about 20 meg overhead per feature type on hg
use constant USE_SPATIAL => 0;
# The binning scheme places each feature into a bin.
# Bins are variably sized as powers of two. For example,
# there are 585 bins of size 2**17 (131072 bases)
my (@BINS,%BINS);
@BINS = map {2**$_} (17, 20, 23, 26, 29); # TO DO: experiment with different bin sizes
my $start = 0;
for my $b (sort {$b<=>$a} @BINS) {
  $BINS{$b} = $start;
  $start   += $BINS[-1]/$b;
}
# my %BINS = (
#             2**11 => 37449,
#             2**14 => 4681,
#             2**17 => 585,
#             2**20 => 73,
#             2**23 => 9,
#             2**26 => 1,
#             2**29 => 0
#            );
# my @BINS = sort {$a<=>$b} keys %BINS;
sub calculate_bin {
  my $self = shift;
  my ($start,$end) = @_;

  my $len = $end - $start;
  for my $bin (@BINS) {
    next if $len > $bin;
    # possibly fits here
    my $binstart = int $start/$bin;
    my $binend   = int $end/$bin;
    return $binstart+$BINS{$bin} if $binstart == $binend;
  }

  die "unreasonable coordinates ",$start+1,"..$end";
}
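
# Worked example (using the bin offsets tabulated above): a feature spanning
# 1_000..5_000 is 4_000 bp long, so the smallest bin size (2**17 = 131_072)
# is tried first; int(1_000/131_072) == int(5_000/131_072) == 0, and
# calculate_bin() returns 0 + 585 = 585.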
sub search_bins {
  my $self = shift;
  my ($start,$end) = @_;
  my @results;

  for my $bin (@BINS) {
    my $binstart = int $start/$bin;
    my $binend   = int $end/$bin;
    push @results,$binstart+$BINS{$bin}..$binend+$BINS{$bin};
  }
  return @results;
}

# object initialization
sub init {
  my $self = shift;
  my ($dsn,
      $is_temporary,
      $autoindex,
      $namespace,
      $dump_dir,
      $user,
      $pass,
      $dbi_options,
      $writeable,
      $create,
     ) = rearrange(['DSN',
                    ['TEMP','TEMPORARY'],
                    'AUTOINDEX',
                    'NAMESPACE',
                    ['DUMP_DIR','DUMPDIR','TMP','TMPDIR'],
                    'USER',
                    ['PASS','PASSWD','PASSWORD'],
                    ['OPTIONS','DBI_OPTIONS','DBI_ATTR'],
                    ['WRITE','WRITEABLE'],
                    'CREATE',
                   ],@_);
  $dbi_options ||= {};
  $writeable = 1 if $is_temporary or $dump_dir;

  $dsn or $self->throw("Usage: ".__PACKAGE__."->init(-dsn => \$dbh || \$dsn)");

  my $dbh;
  if (ref $dsn) {
    $dbh = $dsn;
  } else {
    $dsn = "dbi:SQLite:$dsn" unless $dsn =~ /^dbi:/;
    $dbh = DBI->connect($dsn,$user,$pass,$dbi_options) or $self->throw($DBI::errstr);
    $dbh->do("PRAGMA synchronous = OFF;");   # makes writes much faster
    $dbh->do("PRAGMA temp_store = MEMORY;"); # less disk I/O; some speedup
    $dbh->do("PRAGMA cache_size = 20000;");  # less disk I/O; some speedup
    # Keep track of database file location
    my $cwd = getcwd;
    my ($db_file) = ($dsn =~ m/(?:db(?:name)?|database)=(.+)$/);
    $self->{dbh_file} = "$cwd/$db_file";
  }
  $self->{dbh}       = $dbh;
  $self->{is_temp}   = $is_temporary;
  $self->{namespace} = $namespace;
  $self->{writeable} = $writeable;

  $self->default_settings;
  $self->autoindex($autoindex) if defined $autoindex;
  $self->dumpdir($dump_dir)    if $dump_dir;
  if ($self->is_temp) {
    $self->init_tmp_database();
  } elsif ($create) {
    $self->init_database('erase');
  }
}
sub table_definitions {
  my $self = shift;
  my $defs =
    {
     feature => <<END,
(
  id        integer primary key autoincrement,
  typeid    integer not null,
  strand    integer default 0,
  "indexed" integer default 1,
  object    blob not null
)
END

     locationlist => <<END,
(
  id      integer primary key autoincrement,
  seqname text    not null
);
create index index_locationlist on locationlist (seqname);
END

     typelist => <<END,
(
  id  integer primary key autoincrement,
  tag text    not null
);
create index index_typelist on typelist (tag);
END

     name => <<END,
(
  id           integer not null,
  name         text    not null,
  display_name integer default 0
);
create index index_name_id on name(id);
create index index_name_name on name(name);
END

     attribute => <<END,
(
  id              integer not null,
  attribute_id    integer not null,
  attribute_value text
);
create index index_attribute_id on attribute(attribute_id);
create index index_attribute_value on attribute(attribute_value);
END

     attributelist => <<END,
(
  id  integer primary key autoincrement,
  tag text    not null
);
create index index_attributelist_id on attributelist(id);
create index index_attributelist_tag on attributelist(tag);
END

     parent2child => <<END,
(
  id    integer not null,
  child integer not null
);
create unique index index_parent2child_id_child on parent2child(id,child);
END

     meta => <<END,
(
  name  text primary key,
  value text not null
)
END

     sequence => <<END,
(
  id       integer not null,
  offset   integer not null,
  sequence blob,
  primary key(id,offset)
)
END
    };

  unless ($self->_has_spatial_index) {
    $defs->{feature_location} = <<END;
(
  id    int(10) primary key,
  seqid int(10),
  bin   int,
  start int,
  end   int
);
create index index_feature_location on feature_location(seqid,bin,start,end);
END
  }

  if (EXPERIMENTAL_COVERAGE) {
    $defs->{interval_stats} = <<END;
(
  typeid    integer not null,
  seqid     integer not null,
  bin       integer not null,
  cum_count integer not null,
  unique(typeid,seqid,bin)
)
END
  }

  return $defs;
}
sub _init_database {
  my $self = shift;

  # must do this first before calling table_definitions
  $self->_create_spatial_index;
  $self->SUPER::_init_database(@_);
}

sub init_tmp_database {
  my $self  = shift;
  my $erase = shift;
  $self->_create_spatial_index;
  $self->SUPER::init_tmp_database(@_);
}

sub _create_spatial_index{
  my $self = shift;
  my $dbh  = $self->dbh;
  local $dbh->{PrintError} = 0;
  $dbh->do("DROP TABLE IF EXISTS feature_index"); # spatial index
  if (USE_SPATIAL) {
    $dbh->do("CREATE VIRTUAL TABLE feature_index USING RTREE(id,seqid,bin,start,end)");
  }
}

sub _has_spatial_index {
  my $self = shift;
  return $self->{'_has_spatial_index'} if exists $self->{'_has_spatial_index'};
  my $dbh  = $self->dbh;
  my ($count) = $dbh->selectrow_array("select count(*) from sqlite_master where name='feature_index'");
  return $self->{'_has_spatial_index'} = $count;
}
sub _finish_bulk_update {
  my $self = shift;
  my $dbh  = $self->dbh;
  my $dir  = $self->{dumpdir} || '.';

  $self->begin_work; # making this a transaction greatly improves performance

  for my $table ('feature', $self->index_tables) {
    my $fh   = $self->dump_filehandle($table);
    my $path = $self->dump_path($table);
    $fh->close;

    open $fh, '<', $path or $self->throw("Could not read file '$path': $!");
    my $qualified_table = $self->_qualify($table);

    my $sth;
    if ($table =~ /feature$/) {
      $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?,?,?)");

      while (<$fh>) {
        chomp();
        my ($id,$typeid,$strand,$indexed,$obj) = split(/\t/);
        $sth->bind_param(1, $id);
        $sth->bind_param(2, $typeid);
        $sth->bind_param(3, $strand);
        $sth->bind_param(4, $indexed);
        $sth->bind_param(5, pack('H*',$obj), {TYPE => SQL_BLOB});
        $sth->execute();
      }
    } else {
      my $feature_index = $self->_feature_index_table;
      if ($table =~ /parent2child$/) {
        $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?)");
      } elsif ($table =~ /$feature_index$/) {
        $sth = $dbh->prepare(
          $self->_has_spatial_index ? "REPLACE INTO $qualified_table VALUES (?,?,?,?,?)"
                                    : "REPLACE INTO $qualified_table (id,seqid,bin,start,end) VALUES (?,?,?,?,?)"
        );
      } else { # attribute or name
        $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?)");
      }

      while (<$fh>) {
        chomp();
        $sth->execute(split(/\t/));
      }
    }
    $fh->close();
    unlink $path;
  }
  $self->commit; # commit the transaction
  delete $self->{bulk_update_in_progress};
  delete $self->{filehandles};
}
sub index_tables {
  my $self = shift;
  my @t    = $self->SUPER::index_tables;
  return (@t,$self->_feature_index_table);
}

sub _enable_keys  { }  # nullop
sub _disable_keys { }  # nullop
sub _fetch_indexed_features_sql {
  my $self           = shift;
  my $location_table = $self->_qualify('feature_location');
  my $feature_table  = $self->_qualify('feature');
  return <<END;
SELECT typeid,seqid,start-1,end
  FROM $location_table as l,$feature_table as f
 WHERE l.id=f.id AND f.\"indexed\"=1
  ORDER BY typeid,seqid,start
END
}
# get primary sequence between start and end
sub _fetch_sequence {
  my $self = shift;
  my ($seqid,$start,$end) = @_;

  # backward compatibility to the old days when I liked reverse complementing
  # dna by specifying $start > $end
  my $reversed;
  if (defined $start && defined $end && $start > $end) {
    $reversed++;
    ($start,$end) = ($end,$start);
  }
  $start-- if defined $start;
  $end--   if defined $end;

  my $offset1 = $self->_offset_boundary($seqid,$start || 'left');
  my $offset2 = $self->_offset_boundary($seqid,$end   || 'right');
  my $sequence_table     = $self->_sequence_table;
  my $locationlist_table = $self->_locationlist_table;

  # CROSS JOIN gives a hint to the SQLite query optimizer -- mucho speedup!
  my $sth = $self->_prepare(<<END);
SELECT sequence,offset
   FROM $locationlist_table as ll CROSS JOIN $sequence_table as s
   WHERE ll.id=s.id
     AND ll.seqname= ?
     AND offset >= ?
     AND offset <= ?
   ORDER BY offset
END

  my $seq = '';
  $sth->execute($seqid,$offset1,$offset2) or $self->throw($sth->errstr);

  while (my($frag,$offset) = $sth->fetchrow_array) {
    substr($frag,0,$start-$offset) = '' if defined $start && $start > $offset;
    $seq .= $frag;
  }
  substr($seq,$end-$start+1) = '' if defined $end && $end-$start+1 < length($seq);
  if ($reversed) {
    $seq = reverse $seq;
    $seq =~ tr/gatcGATC/ctagCTAG/;
  }
  $sth->finish;
  $seq;
}
sub _offset_boundary {
  my $self = shift;
  my ($seqid,$position) = @_;

  my $sequence_table     = $self->_sequence_table;
  my $locationlist_table = $self->_locationlist_table;

  my $sql;
  # use "CROSS JOIN" to give a hint to the SQLite query optimizer.
  $sql = $position eq 'left'  ? "SELECT min(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
        :$position eq 'right' ? "SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
                              : "SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=? AND offset<=?";
  my $sth = $self->_prepare($sql);
  my @args = $position =~ /^-?\d+$/ ? ($seqid,$position) : ($seqid);
  $sth->execute(@args) or $self->throw($sth->errstr);
  my $boundary = $sth->fetchall_arrayref->[0][0];
  $sth->finish;
  return $boundary;
}
# Efficiently fetch a series of IDs from the database
# Can pass an array or an array ref
sub _fetch_many {
  my $self = shift;
  @_ or $self->throw('usage: fetch_many($id1,$id2,$id3...)');
  my $ids = join ',',map {ref($_) ? @$_ : $_} @_ or return;
  my $features = $self->_feature_table;

  my $sth = $self->_prepare(<<END);
SELECT id,object FROM $features WHERE id IN ($ids)
END
  $sth->execute() or $self->throw($sth->errstr);
  return $self->_sth2objs($sth);
}
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $fromtable,
      $iterator,
      $sources
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                    'NAME','CLASS','ALIASES',
                    ['TYPES','TYPE','PRIMARY_TAG'],
                    ['ATTRIBUTES','ATTRIBUTE'],
                    'RANGE_TYPE',
                    'FROM_TABLE',
                    'ITERATOR',
                    ['SOURCE','SOURCES']
                   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my $feature_table = $self->_feature_table;
  @from = "$feature_table as f";

  if (defined $name) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    # last argument is the join field
    my ($from,$where,$group,@a) = $self->_name_sql($name,$allow_aliases,'f.id');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  if (defined $seq_id) {
    # last argument is the name of the features table
    my ($from,$where,$group,@a) = $self->_location_sql($seq_id,$start,$end,$range_type,$strand,'f');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  if (defined($sources)) {
    my @sources = ref($sources) eq 'ARRAY' ? @{$sources} : ($sources);
    if (defined($types)) {
      my @types = ref($types) eq 'ARRAY' ? @{$types} : ($types);
      my @final_types;
      foreach my $type (@types) {
        # *** not sure what to do if user supplies both -source and -type
        # where the type includes a source!
        if ($type =~ /:/) {
          push(@final_types, $type);
        }
        else {
          foreach my $source (@sources) {
            push(@final_types, $type.':'.$source);
          }
        }
      }
      $types = \@final_types;
    }
    else {
      $types = [map { ':'.$_ } @sources];
    }
  }
  if (defined($types)) {
    # last argument is the name of the features table
    my ($from,$where,$group,@a) = $self->_types_sql($types,'f');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  if (defined $attributes) {
    # last argument is the join field
    my ($from,$where,$group,@a) = $self->_attributes_sql($attributes,'f.id');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  if (defined $fromtable) {
    # last argument is the join field
    my ($from,$where,$group,@a) = $self->_from_table_sql($fromtable,'f.id');
    push @from,$from   if $from;
    push @where,$where if $where;
    push @group,$group if $group;
    push @args,@a;
  }

  # if no other criteria are specified, then
  # only fetch indexed (i.e. top level objects)
  @where = '"indexed"=1' unless @where;

  my $from  = join ', ',@from;
  my $where = join ' AND ',map {"($_)"} @where;
  my $group = join ', ',@group;
  $group    = "GROUP BY $group" if @group;

  my $query = <<END;
SELECT f.id,f.object
  FROM $from
  WHERE $where
  $group
END

  $self->_print_query($query,@args) if DEBUG || $self->debug;

  my $sth = $self->_prepare($query);
  $sth->execute(@args) or $self->throw($sth->errstr);
  return $iterator ? Bio::DB::SeqFeature::Store::DBI::Iterator->new($sth,$self) : $self->_sth2objs($sth);
}
sub _make_attribute_group {
  my $self                     = shift;
  my ($table_name,$attributes) = @_;
  my $key_count = keys %$attributes or return;
  my $count     = $key_count-1;
  return "f.id HAVING count(f.id)>$count";
}
sub _location_sql {
  my $self = shift;
  my ($seq_id,$start,$end,$range_type,$strand,$location) = @_;

  # the additional join on the location_list table badly impacts performance
  # so we build a copy of the table in memory
  my $seqid = $self->_locationid_nocreate($seq_id) || 0; # zero is an invalid primary ID, so will return empty

  my $feature_index = $self->_feature_index_table;
  my $from = "$feature_index as fi";

  my ($bin_where,@bin_args);
  if (defined $start && defined $end && !$self->_has_spatial_index) {
    my @bins   = $self->search_bins($start,$end);
    $bin_where = ' AND bin in ('.join(',',@bins).')';
  }

  $start = MIN_INT unless defined $start;
  $end   = MAX_INT unless defined $end;

  my ($range,@range_args);
  if ($range_type eq 'overlaps') {
    $range = "fi.end>=? AND fi.start<=?".$bin_where;
    @range_args = ($start,$end,@bin_args);
  } elsif ($range_type eq 'contains') {
    $range = "fi.start>=? AND fi.end<=?".$bin_where;
    @range_args = ($start,$end,@bin_args);
  } elsif ($range_type eq 'contained_in') {
    $range = "fi.start<=? AND fi.end>=?";
    @range_args = ($start,$end);
  } else {
    $self->throw("range_type must be one of 'overlaps', 'contains' or 'contained_in'");
  }

  if (defined $strand) {
    $range .= " AND strand=?";
    push @range_args,$strand;
  }

  my $where = <<END;
   fi.seqid=?
   AND   $location.id=fi.id
   AND   $range
END

  my $group = '';

  my @args = ($seqid,@range_args);
  return ($from,$where,$group,@args);
}
sub _feature_index_table {
  my $self = shift;
  return $self->_has_spatial_index ? $self->_qualify('feature_index')
                                   : $self->_qualify('feature_location') }
# Do a case-insensitive search a la the PostgreSQL adaptor
sub _name_sql {
  my $self = shift;
  my ($name,$allow_aliases,$join) = @_;
  my $name_table = $self->_name_table;

  my $from = "$name_table as n";
  my ($match,$string) = $self->_match_sql($name);

  my $where = "n.id=$join AND lower(n.name) $match";
  $where   .= " AND n.display_name>0" unless $allow_aliases;
  return ($from,$where,'',$string);
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_names,$limit) = @_;
  my @words = map {quotemeta($_)} split /\s+/,$search_string;

  my $name_table          = $self->_name_table;
  my $attribute_table     = $self->_attribute_table;
  my $attributelist_table = $self->_attributelist_table;
  my $type_table          = $self->_type_table;
  my $typelist_table      = $self->_typelist_table;

  my @tags    = @$attribute_names;
  my $tag_sql = join ' OR ',("al.tag=?") x @tags;

  my $perl_regexp = join '|',@words;

  my @wild_card_words = map { "%$_%" } @words;
  my $sql_regexp = join ' OR ',("a.attribute_value LIKE ?") x @words;
  # CROSS JOIN disables SQLite's table reordering optimization
  my $sql = <<END;
SELECT name,attribute_value,tl.tag,n.id
  FROM $attributelist_table   AS al
  CROSS JOIN $attribute_table AS a  ON al.id = a.attribute_id
  CROSS JOIN $name_table      AS n  ON n.id  = a.id
  CROSS JOIN $type_table      AS t  ON t.id  = n.id
  CROSS JOIN $typelist_table  AS tl ON tl.id = t.typeid
  WHERE ($tag_sql)
  AND ($sql_regexp)
  AND n.display_name=1
END
  $sql .= "LIMIT $limit" if defined $limit;
  $self->_print_query($sql,@tags,@words) if DEBUG || $self->debug;
  my $sth = $self->_prepare($sql);
  $sth->execute(@tags,@wild_card_words) or $self->throw($sth->errstr);

  my @results;
  while (my($name,$value,$type,$id) = $sth->fetchrow_array) {
    my (@hits) = $value =~ /$perl_regexp/ig;
    my @words_in_row = split /\b/,$value;
    my $score = int(@hits*100/@words/@words_in_row);
    push @results,[$name,$value,$score,$type,$id];
  }
  $sth->finish;
  @results = sort {$b->[2]<=>$a->[2]} @results;
  return @results;
}
sub _match_sql {
  my $self = shift;
  my $name = shift;

  my ($match,$string);
  if ($name =~ /(?:^|[^\\])[*?]/) {
    $name =~ s/(^|[^\\])([%_])/$1\\$2/g;
    $name =~ s/(^|[^\\])\*/$1%/g;
    $name =~ s/(^|[^\\])\?/$1_/g;
    $match  = "LIKE ?";
    $string = $name;
  } else {
    $match  = "= lower(?)";
    $string = lc($name);
  }
  return ($match,$string);
}
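
# Illustrative example: a query name such as 'unc-13*' is converted to the
# SQL pattern 'unc-13%' and matched with "LIKE ?", while a plain name such
# as 'unc-13' is lower-cased and matched with "= lower(?)".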
sub _attributes_sql {
  my $self = shift;
  my ($attributes,$join) = @_;

  my ($wf,@bind_args)        = $self->_make_attribute_where('a','al',$attributes);
  my ($group_by,@group_args) = $self->_make_attribute_group('a',$attributes);

  my $attribute_table     = $self->_attribute_table;
  my $attributelist_table = $self->_attributelist_table;

  my $from = "$attribute_table AS a INDEXED BY index_attribute_id, $attributelist_table AS al";

  my $where = <<END;
  a.id=$join
  AND   a.attribute_id=al.id
  AND ($wf)
END

  my $group = $group_by;

  my @args = (@bind_args,@group_args);
  return ($from,$where,$group,@args);
}
# overridden because of case-sensitivity of matches
sub _types_sql {
  my $self = shift;
  my ($types,$type_table) = @_;
  my ($primary_tag,$source_tag);

  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $typelist = $self->_typelist_table;
  my $from = "$typelist AS tl";

  my (@matches,@args);

  for my $type (@types) {

    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }

    if (length $source_tag) {
      push @matches,"lower(tl.tag)=lower(?)";
      push @args,"$primary_tag:$source_tag";
    } else {
      push @matches,"tl.tag LIKE ?";
      push @args,"$primary_tag:%";
    }
  }
  my $matches = join ' OR ',@matches;

  my $where = <<END;
  tl.id=$type_table.typeid
   AND   ($matches)
END

  return ($from,$where,'',@args);
}
sub optimize {
  my $self = shift;
  $self->dbh->do("ANALYZE $_") foreach $self->index_tables;
}
# Replace Bio::SeqFeatureI into database.
sub replace {
  my $self       = shift;
  my $object     = shift;
  my $index_flag = shift || undef;

  # ?? shouldn't need to do this
  # $self->_load_class($object);
  my $id       = $object->primary_id;
  my $features = $self->_feature_table;

  my $sth = $self->_prepare(<<END);
REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES (?,?,?,?,?)
END

  my ($seqid,$start,$end,$strand,$bin) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x6;

  my $primary_tag = $object->primary_tag;
  my $source_tag  = $object->source_tag || '';
  $primary_tag   .= ":$source_tag";
  my $typeid      = $self->_typeid($primary_tag,1);

  my $frozen = $self->no_blobs() ? 0 : $self->freeze($object);

  $sth->bind_param(1, $id);
  $sth->bind_param(2, $frozen, {TYPE => SQL_BLOB});
  $sth->bind_param(3, $index_flag||0);
  $sth->bind_param(4, $strand);
  $sth->bind_param(5, $typeid);

  $sth->execute() or $self->throw($sth->errstr);

  my $dbh = $self->dbh;
  $object->primary_id($dbh->func('last_insert_rowid')) unless defined $id;

  $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
}
# doesn't work with this schema, since we have to update name and attribute
# tables which need object ids, which we can only know by replacing feats in
# the feature table one by one
sub bulk_replace {
  my $self       = shift;
  my $index_flag = shift || undef;
  my @objects    = @_;

  my $features = $self->_feature_table;

  my @insert_values;
  foreach my $object (@objects) {
    my $id = $object->primary_id;
    my (undef,undef,undef,$strand) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x4;
    my $primary_tag = $object->primary_tag;
    my $source_tag  = $object->source_tag || '';
    $primary_tag   .= ":$source_tag";
    my $typeid      = $self->_typeid($primary_tag,1);

    push(@insert_values, ($id,0,$index_flag||0,$strand,$typeid));
  }

  my @value_blocks;
  for (1..@objects) {
    push(@value_blocks, '(?,?,?,?,?)');
  }
  my $value_blocks = join(',', @value_blocks);
  my $sql = qq{REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES $value_blocks};

  my $sth = $self->_prepare($sql);
  $sth->execute(@insert_values) or $self->throw($sth->errstr);
}
sub _get_location_and_bin {
  my $self = shift;
  my $obj  = shift;
  my $seqid  = $self->_locationid($obj->seq_id||'');
  my $start  = $obj->start;
  my $end    = $obj->end;
  my $strand = $obj->strand;
  return ($seqid,$start,$end,$strand,$self->calculate_bin($start,$end));
}
# Insert one Bio::SeqFeatureI into database. primary_id must be undef
sub insert {
  my $self       = shift;
  my $object     = shift;
  my $index_flag = shift || 0;

  $self->_load_class($object);
  defined $object->primary_id and $self->throw("$object already has a primary id");

  my $features = $self->_feature_table;
  my $sth = $self->_prepare(<<END);
INSERT INTO $features (id,object,"indexed") VALUES (?,?,?)
END
  $sth->execute(undef,$self->freeze($object),$index_flag) or $self->throw($sth->errstr);
  my $dbh = $self->dbh;
  $object->primary_id($dbh->func('last_insert_rowid'));
  $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
}
=head2 toplevel_types

 Title   : toplevel_types
 Usage   : @type_list = $db->toplevel_types
 Function: Get the toplevel types in the database
 Returns : array of Bio::DB::GFF::Typename objects
 Args    : none
 Status  : public

This is similar to types() but only returns the types of
INDEXED (toplevel) features.

=cut
sub toplevel_types {
  my $self = shift;
  eval "require Bio::DB::GFF::Typename"
    unless Bio::DB::GFF::Typename->can('new');
  my $typelist_table = $self->_typelist_table;
  my $feature_table  = $self->_feature_table;
  my $sql = <<END;
SELECT distinct(tag) from $typelist_table as tl,$feature_table as f
 WHERE tl.id=f.typeid
   AND f."indexed"=1
END
  $self->_print_query($sql) if DEBUG || $self->debug;
  my $sth = $self->_prepare($sql);
  $sth->execute() or $self->throw($sth->errstr);

  my @results;
  while (my($tag) = $sth->fetchrow_array) {
    push @results,Bio::DB::GFF::Typename->new($tag);
  }
  $sth->finish;
  return @results;
}
sub _genericid {
  my $self = shift;
  my ($table,$namefield,$name,$add_if_missing) = @_;
  my $qualified_table = $self->_qualify($table);
  my $sth = $self->_prepare(<<END);
SELECT id FROM $qualified_table WHERE lower($namefield)=lower(?)
END
  $sth->execute($name) or die $sth->errstr;
  my ($id) = $sth->fetchrow_array;
  $sth->finish;
  return $id if defined $id;
  return unless $add_if_missing;

  $sth = $self->_prepare(<<END);
INSERT INTO $qualified_table ($namefield) VALUES (?)
END
  $sth->execute($name) or die $sth->errstr;
  my $dbh = $self->dbh;
  return $dbh->func('last_insert_rowid');
}
# special-purpose store for bulk loading - write to a file rather than to the db
sub _dump_store {
  my $self    = shift;
  my $indexed = shift;

  my $count    = 0;
  my $store_fh = $self->dump_filehandle('feature');
  my $dbh      = $self->dbh;

  my $autoindex = $self->autoindex;

  for my $obj (@_) {
    my $id = $self->next_id;
    my ($seqid,$start,$end,$strand) = $indexed ? $self->_get_location_and_bin($obj) : (undef)x4;
    my $primary_tag = $obj->primary_tag;
    my $source_tag  = $obj->source_tag || '';
    $primary_tag   .= ":$source_tag";
    my $typeid      = $self->_typeid($primary_tag,1);

    # Encode BLOB in hex so we can more easily import it into SQLite
    print $store_fh
      join("\t",$id,$typeid,$strand,$indexed,
           unpack('H*', $self->freeze($obj))),"\n";
    $obj->primary_id($id);
    $self->_update_indexes($obj) if $indexed && $autoindex;
    $count++;
  }

  # remember whether we have ever stored a non-indexed feature
  unless ($indexed or $self->{indexed_flag}++) {
    $self->subfeatures_are_indexed(0);
  }
  $count;
}
sub _dump_update_name_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my $fh  = $self->dump_filehandle('name');
  my $dbh = $self->dbh;
  my ($names,$aliases) = $self->feature_names($obj);
  # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
  print $fh join("\t",$id,lc($_),1),"\n" foreach @$names;
  print $fh join("\t",$id,lc($_),0),"\n" foreach @$aliases;
}

sub _update_name_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my $name       = $self->_name_table;
  my $primary_id = $obj->primary_id;

  $self->_delete_index($name,$id);
  my ($names,$aliases) = $self->feature_names($obj);

  my $sth = $self->_prepare("INSERT INTO $name (id,name,display_name) VALUES (?,?,?)");

  $sth->execute($id,lc $_,1) or $self->throw($sth->errstr) foreach @$names;
  $sth->execute($id,lc $_,0) or $self->throw($sth->errstr) foreach @$aliases;
  $sth->finish;
}
sub _dump_update_attribute_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my $fh  = $self->dump_filehandle('attribute');
  my $dbh = $self->dbh;
  for my $tag ($obj->all_tags) {
    my $tagid = $self->_attributeid($tag);
    for my $value ($obj->each_tag_value($tag)) {
      # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
      print $fh join("\t",$id,$tagid,$value),"\n";
    }
  }
}

sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id = $obj->primary_id) or return;
  $self->SUPER::_update_indexes($obj);

  if ($self->{bulk_update_in_progress}) {
    $self->_dump_update_location_index($obj,$id);
  } else {
    $self->_update_location_index($obj,$id);
  }
}
sub _update_location_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);

  my $table = $self->_feature_index_table;
  $self->_delete_index($table,$id);

  my ($sql,@args);

  if ($self->_has_spatial_index) {
    $sql  = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
    @args = ($id,$seqid,$bin,$start,$end);
  } else {
    $sql  = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
    @args = ($id,$seqid,$bin,$start,$end);
  }

  my $sth = $self->_prepare($sql);
  $sth->execute(@args);
  $sth->finish;
}

sub _dump_update_location_index {
  my $self = shift;
  my ($obj,$id) = @_;
  my $table = $self->_feature_index_table;
  my $fh    = $self->dump_filehandle($table);
  my $dbh   = $self->dbh;
  my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);
  my @args = $self->_has_spatial_index ? ($id,$seqid,$bin,$start,$end)
                                       : ($id,$seqid,$bin,$start,$end);
  print $fh join("\t",@args),"\n";
}
sub DESTROY {
  my $self = shift;
  # Remove filehandles, so temporary files can be properly deleted
  if (%DBI::installed_drh) {
    DBI->disconnect_all;
    %DBI::installed_drh = ();
  }
  undef $self->{dbh};
}

1;
=head1 AUTHOR

Nathan Weeks - Nathan.Weeks@ars.usda.gov

Copyright (c) 2009 Nathan Weeks

Modified 2010 to support cumulative statistics by Lincoln Stein
<lincoln.stein@gmail.com>.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. See the Bioperl license for
more details.

=cut