Bio/DB/SeqFeature/Store/DBI/SQLite.pm

   1 package Bio::DB::SeqFeature::Store::DBI::SQLite;
   2
   3 #$Id$
   4
   5 =head1 NAME
   6
   7 Bio::DB::SeqFeature::Store::DBI::SQLite -- SQLite implementation of Bio::DB::SeqFeature::Store
   8
   9 =head1 SYNOPSIS
  10
  11   use Bio::DB::SeqFeature::Store;
  12
  13   # Open the sequence database
  14   my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::SQLite',
  15                                            -dsn     => '/path/to/database.db');
  16
  17   # get a feature from somewhere
  18   my $feature = Bio::SeqFeature::Generic->new(...);
  19
  20   # store it
  21   $db->store($feature) or die "Couldn't store!";
  22
  23   # primary ID of the feature is changed to indicate its primary ID
  24   # in the database...
  25   my $id = $feature->primary_id;
  26
  27   # get the feature back out
  28   my $f  = $db->fetch($id);
  29
  30   # change the feature and update it
  31   $f->start(100);
  32   $db->update($f) or die "Couldn't update!";
  33
  34   # searching...
  35   # ...by id
  36   my @features = $db->fetch_many(@list_of_ids);
  37
  38   # ...by name
  39   @features = $db->get_features_by_name('ZK909');
  40
  41   # ...by alias
  42   @features = $db->get_features_by_alias('sma-3');
  43
  44   # ...by type
  45   @features = $db->get_features_by_name('gene');
  46
  47   # ...by location
  48   @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);
  49
  50   # ...by attribute
  51   @features = $db->get_features_by_attribute({description => 'protein kinase'})
  52
  53   # ...by the GFF "Note" field
  54   @result_list = $db->search_notes('kinase');
  55
  56   # ...by arbitrary combinations of selectors
  57   @features = $db->features(-name => $name,
  58                             -type => $types,
  59                             -seq_id => $seqid,
  60                             -start  => $start,
  61                             -end    => $end,
  62                             -attributes => $attributes);
  63
  64   # ...using an iterator
  65   my $iterator = $db->get_seq_stream(-name => $name,
  66                                      -type => $types,
  67                                      -seq_id => $seqid,
  68                                      -start  => $start,
  69                                      -end    => $end,
  70                                      -attributes => $attributes);
  71
  72   while (my $feature = $iterator->next_seq) {
  73     # do something with the feature
  74   }
  75
  76   # ...limiting the search to a particular region
  77   my $segment  = $db->segment('Chr1',5000=>6000);
  78   my @features = $segment->features(-type=>['mRNA','match']);
  79
  80   # getting & storing sequence information
  81   # Warning: this returns a string, and not a PrimarySeq object
  82   $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  83   my $sequence = $db->fetch_sequence('Chr1',5000=>6000);
  84
  85   # what feature types are defined in the database?
  86   my @types    = $db->types;
  87
  88   # create a new feature in the database
  89   my $feature = $db->new_feature(-primary_tag => 'mRNA',
  90                                  -seq_id      => 'chr3',
  91                                  -start      => 10000,
  92                                  -end        => 11000);
  93
  94 =head1 DESCRIPTION
  95
  96 Bio::DB::SeqFeature::Store::SQLite is the SQLite adaptor for
  97 Bio::DB::SeqFeature::Store. You will not create it directly, but
  98 instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.
  99
 100 See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
 101
 102 =head2 Using the SQLite adaptor
 103
 104 To establish a connection to the database, call
 105 Bio::DB::SeqFeature::Store-E<gt>new(-adaptor=E<gt>'DBI::SQLite',@more_args). The
 106 additional arguments are as follows:
 107
 108   Argument name       Description
 109   -------------       -----------
 110
 111  -dsn              The path to the SQLite database file.
 112
 113  -namespace        A prefix to attach to each table. This allows you
 114                    to have several virtual databases in the same
 115                    physical database.
 116
 117  -temp             Boolean flag. If true, a temporary database
 118                    will be created and destroyed as soon as
 119                    the Store object goes out of scope. (synonym -temporary)
 120
 121  -autoindex        Boolean flag. If true, features in the database will be
 122                    reindexed every time they change. This is the default.
 123
 124
 125  -tmpdir           Directory in which to place temporary files during "fast" loading.
 126                    Defaults to File::Spec->tmpdir(). (synonyms -dump_dir, -dumpdir, -tmp)
 127
 128  -dbi_options      A hashref to pass to DBI->connect's 4th argument, the "attributes."
 129                    (synonyms -options, -dbi_attr)
 130
 131  -write            Pass true to open database for writing or updating.
 132
 133 If successful, a new instance of
 134 Bio::DB::SeqFeature::Store::DBI::SQLite will be returned.
 135
 136 In addition to the standard methods supported by all well-behaved
 137 Bio::DB::SeqFeature::Store databases, several following
 138 adaptor-specific methods are provided. These are described in the next
 139 sections.
 140
 141 =cut
 142
 143 use strict;
 144
 145 use base 'Bio::DB::SeqFeature::Store::DBI::mysql';
 146 use Bio::DB::SeqFeature::Store::DBI::Iterator;
 147 use DBI qw(:sql_types);
 148 use Memoize;
 149 use Cwd 'abs_path';
 150 use Bio::DB::GFF::Util::Rearrange 'rearrange';
 151 use Bio::SeqFeature::Lite;
 152 use File::Spec;
 153 use constant DEBUG=>0;
 154
 155 # Using same limits as MySQL adaptor so I don't have to make something up.
 156 use constant MAX_INT =>  2_147_483_647;
 157 use constant MIN_INT => -2_147_483_648;
 158 use constant MAX_BIN =>  1_000_000_000;  # size of largest feature = 1 Gb
 159 use constant MIN_BIN =>  1000;           # smallest bin we'll make - on a 100 Mb chromosome, there'll be 100,000 of these
 160 use constant DEBUG_NONSPATIAL=>0;
 161
 162 my (@BINS,%BINS);
 163 {
 164     @BINS = map {2**$_} (17, 20, 23, 26, 29);  # TO DO: experiment with different bin sizes
 165     my $start=0;
 166     for my $b (sort {$b<=>$a} @BINS) {
 167         $BINS{$b} = $start;
 168         $start   += $BINS[-1]/$b;
 169     }
 170 }
 171
 172
 173 # my %BINS = (
 174 #     2**11 => 37449,
 175 #     2**14 =>  4681,
 176 #     2**17 =>   585,
 177 #     2**20 =>    73,
 178 #     2**23 =>     9,
 179 #     2**26 =>     1,
 180 #     2**29 =>     0
 181 # );
 182 # my @BINS = sort {$a<=>$b} keys %BINS;
 183
 184 sub calculate_bin {
 185     my $self = shift;
 186     my ($start,$end) = @_;
 187
 188     my $len = $end - $start;
 189     for my $bin (@BINS) {
 190         next if $len > $bin;
 191         # possibly fits here
 192         my $binstart = int $start/$bin;
 193         my $binend   = int $end/$bin;
 194         return $binstart+$BINS{$bin} if $binstart == $binend;
 195     }
 196
 197     die "unreasonable coordinates ",$start+1,"..$end";
 198 }
 199
 200 sub search_bins {
 201     my $self = shift;
 202     my ($start,$end) = @_;
 203     my @results;
 204
 205     for my $bin (@BINS) {
 206         my $binstart = int $start/$bin;
 207         my $binend   = int $end/$bin;
 208         push @results,$binstart+$BINS{$bin}..$binend+$BINS{$bin};
 209     }
 210     return @results;
 211 }
 212
 213
 214 ###
 215 # object initialization
 216 #
 217 sub init {
 218   my $self          = shift;
 219   my ($dsn,
 220       $is_temporary,
 221       $autoindex,
 222       $namespace,
 223       $dump_dir,
 224       $user,
 225       $pass,
 226       $dbi_options,
 227       $writeable,
 228       $create,
 229      ) = rearrange(['DSN',
 230                     ['TEMP','TEMPORARY'],
 231                     'AUTOINDEX',
 232                     'NAMESPACE',
 233                     ['DUMP_DIR','DUMPDIR','TMP','TMPDIR'],
 234                     'USER',
 235                     ['PASS','PASSWD','PASSWORD'],
 236                     ['OPTIONS','DBI_OPTIONS','DBI_ATTR'],
 237                     ['WRITE','WRITEABLE'],
 238                     'CREATE',
 239                    ],@_);
 240   $dbi_options  ||= {};
 241   $writeable    = 1 if $is_temporary or $dump_dir;
 242
 243   $dsn or $self->throw("Usage: ".__PACKAGE__."->init(-dsn => \$dbh || \$dsn)");
 244
 245   my $dbh;
 246   if (ref $dsn) {
 247     $dbh = $dsn;
 248   } else {
 249     $dsn = "dbi:SQLite:$dsn" unless $dsn =~ /^dbi:/;
 250     $dbh = DBI->connect($dsn,$user,$pass,$dbi_options) or $self->throw($DBI::errstr);
 251     $dbh->do("PRAGMA synchronous = OFF;"); # makes writes much faster
 252     $dbh->do("PRAGMA temp_store = MEMORY;"); # less disk I/O; some speedup
 253     $dbh->do("PRAGMA cache_size = 20000;"); # less disk I/O; some speedup
 254   }
 255   $self->{dbh}       = $dbh;
 256   $self->{is_temp}   = $is_temporary;
 257   $self->{namespace} = $namespace;
 258   $self->{writeable} = $writeable;
 259
 260   $self->default_settings;
 261   $self->autoindex($autoindex)                   if defined $autoindex;
 262   $self->dumpdir($dump_dir)                      if $dump_dir;
 263   if ($self->is_temp) {
 264     $self->init_tmp_database();
 265   } elsif ($create) {
 266     $self->init_database('erase');
 267   }
 268 }
 269
 270 sub table_definitions {
 271   my $self = shift;
 272   my $defs =
 273       {
 274           feature => <<END,
 275 (
 276   id        integer primary key autoincrement,
 277   typeid    integer not null,
 278   strand    integer default 0,
 279   "indexed" integer default 1,
 280   object    blob not null
 281 )
 282 END
 283
 284           locationlist => <<END,
 285 (
 286   id      integer primary key autoincrement,
 287   seqname text    not null
 288 );
 289 create index index_locationlist on locationlist (seqname);
 290 END
 291
 292           typelist => <<END,
 293 (
 294   id  integer primary key autoincrement,
 295   tag text    not null
 296 );
 297 create index index_typelist on typelist (tag);
 298 END
 299           name => <<END,
 300 (
 301   id           integer not null,
 302   name         text    not null,
 303   display_name integer default 0
 304 );
 305 create index index_name_id on name(id);
 306 create index index_name_name on name(name);
 307 END
 308
 309           attribute => <<END,
 310 (
 311   id              integer not null,
 312   attribute_id    integer not null,
 313   attribute_value text
 314 );
 315 create index index_attribute_id    on attribute(attribute_id);
 316 create index index_attribute_value on attribute(attribute_value);
 317 END
 318
 319           attributelist => <<END,
 320 (
 321   id  integer primary key autoincrement,
 322   tag text    not null
 323 );
 324 create index index_attributelist_id  on attributelist(id);
 325 create index index_attributelist_tag on attributelist(tag);
 326 END
 327           parent2child => <<END,
 328 (
 329   id    integer not null,
 330   child integer not null
 331 );
 332 create index index_parent2child_id_child on parent2child(id,child);
 333 END
 334
 335           meta => <<END,
 336 (
 337   name  text primary key,
 338   value text not null
 339 )
 340 END
 341           sequence => <<END,
 342 (
 343   id       integer not null,
 344   offset   integer not null,
 345   sequence blob,
 346   primary key(id,offset)
 347 )
 348 END
 349          };
 350
 351   unless ($self->_has_spatial_index) {
 352     $defs->{feature_location} = <<END;
 353 (
 354   id       int(10) primary key,
 355   seqid    int(10),
 356   bin      int,
 357   start    int,
 358   end      int
 359 );
 360 create index index_feature_location on feature_location(seqid,bin,start,end);
 361 END
 362
 363   }
 364
 365   return $defs;
 366 }
 367
 368 sub _init_database {
 369     my $self  = shift;
 370
 371     # must do this first before calling table_definitions
 372     $self->_create_spatial_index;
 373     $self->SUPER::_init_database(@_);
 374 }
 375
 376 sub init_tmp_database {
 377     my $self = shift;
 378     my $erase = shift;
 379     $self->_create_spatial_index;
 380     $self->SUPER::init_tmp_database(@_);
 381 }
 382
 383 sub _create_spatial_index{
 384     my $self = shift;
 385     my $dbh   = $self->dbh;
 386     local $dbh->{PrintError} = 0;
 387     $dbh->do("DROP TABLE IF EXISTS feature_index"); # spatial index
 388     if (DEBUG_NONSPATIAL) {
 389         warn "DELIBERATELY BREAKING RTREE FUNCTIONALITY";
 390         $dbh->do("CREATE VIRTUAL TABLE feature_index USING BTREE(id,seqid,bin,start,end)");
 391     } else {
 392         $dbh->do("CREATE VIRTUAL TABLE feature_index USING RTREE(id,seqid,bin,start,end)");
 393     }
 394 }
 395
 396 sub _has_spatial_index {
 397     my $self = shift;
 398     return $self->{'_has_spatial_index'} if exists $self->{'_has_spatial_index'};
 399     my $dbh  = $self->dbh;
 400     my ($count) = $dbh->selectrow_array("select count(*) from sqlite_master where name='feature_index'");
 401     return $self->{'_has_spatial_index'} = $count;
 402 }
 403
 404 sub _finish_bulk_update {
 405   my $self = shift;
 406   my $dbh  = $self->dbh;
 407   my $dir = $self->{dumpdir} || '.';
 408
 409   $dbh->begin_work; # making this a transaction greatly improves performance
 410
 411   for my $table ('feature', $self->index_tables) {
 412     my $fh = $self->dump_filehandle($table);
 413     my $path = $self->dump_path($table);
 414     $fh->close;
 415     open($fh, $path);
 416     my $qualified_table = $self->_qualify($table);
 417
 418     my $sth;
 419     if ($table =~ /feature$/) {
 420       $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?,?,?)");
 421
 422       while (<$fh>) {
 423         chomp();
 424         my ($id,$typeid,$strand,$indexed,$obj) = split(/\t/);
 425         $sth->bind_param(1, $id);
 426         $sth->bind_param(2, $typeid);
 427         $sth->bind_param(3, $strand);
 428         $sth->bind_param(4, $indexed);
 429         $sth->bind_param(5, pack('H*',$obj), {TYPE => SQL_BLOB});
 430         $sth->execute();
 431       }
 432     } else {
 433         my $feature_index = $self->_feature_index_table;
 434         if ($table =~ /parent2child$/) {
 435             $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?)");
 436         } elsif ($table =~ /$feature_index$/) {
 437             $sth = $dbh->prepare(
 438                 $self->_has_spatial_index ?"REPLACE INTO $qualified_table VALUES (?,?,?,?,?)"
 439                                           :"REPLACE INTO $qualified_table VALUES (?,?,?,?,?)"
 440                 );
 441         } else { # attribute or name
 442             $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?)");
 443         }
 444
 445         while (<$fh>) {
 446             chomp();
 447             $sth->execute(split(/\t/));
 448         }
 449     }
 450     $fh->close();
 451     unlink $path;
 452   }
 453   $dbh->commit; # commit the transaction
 454   delete $self->{bulk_update_in_progress};
 455   delete $self->{filehandles};
 456 }
 457
 458 sub index_tables {
 459     my $self = shift;
 460     my @t    = $self->SUPER::index_tables;
 461     return (@t,$self->_feature_index_table);
 462 }
 463
 464 ###
 465 # get primary sequence between start and end
 466 #
 467 sub _fetch_sequence {
 468   my $self = shift;
 469   my ($seqid,$start,$end) = @_;
 470
 471   # backward compatibility to the old days when I liked reverse complementing
 472   # dna by specifying $start > $end
 473   my $reversed;
 474   if (defined $start && defined $end && $start > $end) {
 475     $reversed++;
 476     ($start,$end) = ($end,$start);
 477   }
 478   $start-- if defined $start;
 479   $end--   if defined $end;
 480
 481   my $offset1 = $self->_offset_boundary($seqid,$start || 'left');
 482   my $offset2 = $self->_offset_boundary($seqid,$end   || 'right');
 483   my $sequence_table = $self->_sequence_table;
 484   my $locationlist_table = $self->_locationlist_table;
 485
 486   # CROSS JOIN gives a hint to the SQLite query optimizer -- mucho speedup!
 487   my $sth     = $self->_prepare(<<END);
 488 SELECT sequence,offset
 489    FROM $locationlist_table as ll CROSS JOIN $sequence_table as s
 490    WHERE ll.id=s.id
 491      AND ll.seqname= ?
 492      AND offset >= ?
 493      AND offset <= ?
 494    ORDER BY offset
 495 END
 496
 497   my $seq = '';
 498   $sth->execute($seqid,$offset1,$offset2) or $self->throw($sth->errstr);
 499
 500   while (my($frag,$offset) = $sth->fetchrow_array) {
 501     substr($frag,0,$start-$offset) = '' if defined $start && $start > $offset;
 502     $seq .= $frag;
 503   }
 504   substr($seq,$end-$start+1) = '' if defined $end && $end-$start+1 < length($seq);
 505   if ($reversed) {
 506     $seq = reverse $seq;
 507     $seq =~ tr/gatcGATC/ctagCTAG/;
 508   }
 509   $sth->finish;
 510   $seq;
 511 }
 512
 513 sub _offset_boundary {
 514   my $self = shift;
 515   my ($seqid,$position) = @_;
 516
 517   my $sequence_table     = $self->_sequence_table;
 518   my $locationlist_table = $self->_locationlist_table;
 519
 520   my $sql;
 521   # use "CROSS JOIN" to give a hint to the SQLite query optimizer.
 522   $sql =  $position eq 'left'  ? "SELECT min(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
 523          :$position eq 'right' ? "SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
 524          :"SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=? AND offset<=?";
 525   my $sth = $self->_prepare($sql);
 526   my @args = $position =~ /^-?\d+$/ ? ($seqid,$position) : ($seqid);
 527   $sth->execute(@args) or $self->throw($sth->errstr);
 528   my $boundary = $sth->fetchall_arrayref->[0][0];
 529   $sth->finish;
 530   return $boundary;
 531 }
 532
 533 ###
 534 # Efficiently fetch a series of IDs from the database
 535 # Can pass an array or an array ref
 536 #
 537 sub _fetch_many {
 538   my $self       = shift;
 539   @_ or $self->throw('usage: fetch_many($id1,$id2,$id3...)');
 540   my $ids = join ',',map {ref($_) ? @$_ : $_} @_ or return;
 541   my $features = $self->_feature_table;
 542
 543   my $sth = $self->_prepare(<<END);
 544 SELECT id,object FROM $features WHERE id IN ($ids)
 545 END
 546   $sth->execute() or $self->throw($sth->errstr);
 547   return $self->_sth2objs($sth);
 548 }
 549
 550 sub _features {
 551   my $self = shift;
 552   my ($seq_id,$start,$end,$strand,
 553       $name,$class,$allow_aliases,
 554       $types,
 555       $attributes,
 556       $range_type,
 557       $fromtable,
 558       $iterator,
 559       $sources
 560      ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
 561                     'NAME','CLASS','ALIASES',
 562                     ['TYPES','TYPE','PRIMARY_TAG'],
 563                     ['ATTRIBUTES','ATTRIBUTE'],
 564                     'RANGE_TYPE',
 565                     'FROM_TABLE',
 566                     'ITERATOR',
 567                     ['SOURCE','SOURCES']
 568                    ],@_);
 569
 570   my (@from,@where,@args,@group);
 571   $range_type ||= 'overlaps';
 572
 573   my $feature_table         = $self->_feature_table;
 574   @from = "$feature_table as f";
 575
 576   if (defined $name) {
 577     # hacky backward compatibility workaround
 578     undef $class if $class && $class eq 'Sequence';
 579     $name = "$class:$name" if defined $class && length $class > 0;
 580     # last argument is the join field
 581     my ($from,$where,$group,@a) = $self->_name_sql($name,$allow_aliases,'f.id');
 582     push @from,$from   if $from;
 583     push @where,$where if $where;
 584     push @group,$group if $group;
 585     push @args,@a;
 586   }
 587
 588   if (defined $seq_id) {
 589     # last argument is the name of the features table
 590     my ($from,$where,$group,@a) = $self->_location_sql($seq_id,$start,$end,$range_type,$strand,'f');
 591     push @from,$from   if $from;
 592     push @where,$where if $where;
 593     push @group,$group if $group;
 594     push @args,@a;
 595   }
 596
 597   if (defined($sources)) {
 598     my @sources = ref($sources) eq 'ARRAY' ? @{$sources} : ($sources);
 599     if (defined($types)) {
 600         my @types = ref($types) eq 'ARRAY' ? @{$types} : ($types);
 601         my @final_types;
 602         foreach my $type (@types) {
 603             # *** not sure what to do if user supplies both -source and -type
 604             #     where the type includes a source!
 605             if ($type =~ /:/) {
 606                 push(@final_types, $type);
 607             }
 608             else {
 609                 foreach my $source (@sources) {
 610                     push(@final_types, $type.':'.$source);
 611                 }
 612             }
 613         }
 614         $types = \@final_types;
 615     }
 616     else {
 617         $types = [map { ':'.$_ } @sources];
 618     }
 619   }
 620   if (defined($types)) {
 621     # last argument is the name of the features table
 622     my ($from,$where,$group,@a) = $self->_types_sql($types,'f');
 623     push @from,$from   if $from;
 624     push @where,$where if $where;
 625     push @group,$group if $group;
 626     push @args,@a;
 627   }
 628
 629   if (defined $attributes) {
 630     # last argument is the join field
 631     my ($from,$where,$group,@a) = $self->_attributes_sql($attributes,'f.id');
 632     push @from,$from    if $from;
 633     push @where,$where  if $where;
 634     push @group,$group  if $group;
 635     push @args,@a;
 636   }
 637
 638   if (defined $fromtable) {
 639     # last argument is the join field
 640     my ($from,$where,$group,@a) = $self->_from_table_sql($fromtable,'f.id');
 641     push @from,$from    if $from;
 642     push @where,$where  if $where;
 643     push @group,$group  if $group;
 644     push @args,@a;
 645   }
 646
 647   # if no other criteria are specified, then
 648   # only fetch indexed (i.e. top level objects)
 649   @where = '"indexed"=1' unless @where;
 650
 651   my $from  = join ', ',@from;
 652   my $where = join ' AND ',map {"($_)"} @where;
 653   my $group = join ', ',@group;
 654   $group    = "GROUP BY $group" if @group;
 655
 656   my $query = <<END;
 657 SELECT f.id,f.object
 658   FROM $from
 659   WHERE $where
 660   $group
 661 END
 662
 663   $self->_print_query($query,@args) if DEBUG || $self->debug;
 664
 665   my $sth = $self->_prepare($query);
 666   $sth->execute(@args) or $self->throw($sth->errstr);
 667   return $iterator ? Bio::DB::SeqFeature::Store::DBI::Iterator->new($sth,$self) : $self->_sth2objs($sth);
 668 }
 669
 670 sub _make_attribute_group {
 671   my $self                     = shift;
 672   my ($table_name,$attributes) = @_;
 673   my $key_count = keys %$attributes or return;
 674   my $count = $key_count-1;
 675   return "f.id HAVING count(f.id)>$count";
 676 }
 677
 678 sub _location_sql {
 679   my $self = shift;
 680   my ($seq_id,$start,$end,$range_type,$strand,$location) = @_;
 681
 682   # the additional join on the location_list table badly impacts performance
 683   # so we build a copy of the table in memory
 684   my $seqid = $self->_locationid($seq_id) || 0; # zero is an invalid primary ID, so will return empty
 685
 686   my $feature_index = $self->_feature_index_table;
 687   my $from  = "$feature_index as fi";
 688
 689   my ($bin_where,@bin_args);
 690   if (defined $start && defined $end && !$self->_has_spatial_index) {
 691       my @bins   = $self->search_bins($start,$end);
 692       $bin_where = ' AND bin in ('.join(',',@bins).')';
 693   }
 694
 695   $start = MIN_INT unless defined $start;
 696   $end   = MAX_INT unless defined $end;
 697
 698   my ($range,@range_args);
 699   if ($range_type eq 'overlaps') {
 700       $range = "fi.end>=? AND fi.start<=?".$bin_where;
 701       @range_args = ($start,$end,@bin_args);
 702   } elsif ($range_type eq 'contains') {
 703       $range = "fi.start>=? AND fi.end<=?".$bin_where;
 704       @range_args = ($start,$end,@bin_args);
 705   } elsif ($range_type eq 'contained_in') {
 706       $range = "fi.start<=? AND fi.end>=?";
 707       @range_args = ($start,$end);
 708   } else {
 709       $self->throw("range_type must be one of 'overlaps', 'contains' or 'contained_in'");
 710   }
 711
 712   if (defined $strand) {
 713       $range .= " AND strand=?";
 714       push @range_args,$strand;
 715   }
 716
 717   my $where = <<END;
 718   fi.seqid=?
 719       AND   $location.id=fi.id
 720       AND   $range
 721 END
 722 ;
 723   my $group = '';
 724
 725   my @args  = ($seqid,@range_args);
 726   return ($from,$where,$group,@args);
 727 }
 728
 729 sub _feature_index_table          {
 730     my $self = shift;
 731     return $self->_has_spatial_index ? $self->_qualify('feature_index')
 732                                      : $self->_qualify('feature_location') }
 733
 734 # Do a case-insensitive search a la the PostgreSQL adaptor
 735 sub _name_sql {
 736   my $self = shift;
 737   my ($name,$allow_aliases,$join) = @_;
 738   my $name_table   = $self->_name_table;
 739
 740   my $from  = "$name_table as n";
 741   my ($match,$string) = $self->_match_sql($name);
 742
 743   my $where = "n.id=$join AND lower(n.name) $match";
 744   $where   .= " AND n.display_name>0" unless $allow_aliases;
 745   return ($from,$where,'',$string);
 746 }
 747
 748 sub _search_attributes {
 749   my $self = shift;
 750   my ($search_string,$attribute_names,$limit) = @_;
 751   my @words               = map {quotemeta($_)} split /\s+/,$search_string;
 752
 753   my $name_table          = $self->_name_table;
 754   my $attribute_table     = $self->_attribute_table;
 755   my $attributelist_table = $self->_attributelist_table;
 756   my $type_table          = $self->_type_table;
 757   my $typelist_table      = $self->_typelist_table;
 758
 759   my @tags    = @$attribute_names;
 760   my $tag_sql = join ' OR ',("al.tag=?") x @tags;
 761
 762   my $perl_regexp = join '|',@words;
 763
 764   my @wild_card_words = map { "%$_%" } @words;
 765   my $sql_regexp = join ' OR ',("a.attribute_value LIKE ?")  x @words;
 766   # CROSS JOIN disables SQLite's table reordering optimization
 767   my $sql = <<END;
 768 SELECT name,attribute_value,tl.tag,n.id
 769   FROM $attributelist_table        AS al
 770        CROSS JOIN $attribute_table AS a  ON al.id = a.attribute_id
 771        CROSS JOIN $name_table      AS n  ON n.id = a.id
 772        CROSS JOIN $type_table      AS t  ON t.id = n.id
 773        CROSS JOIN $typelist_table  AS tl ON tl.id = t.typeid
 774   WHERE ($tag_sql)
 775     AND ($sql_regexp)
 776     AND n.display_name=1
 777 END
 778   $sql .= "LIMIT $limit" if defined $limit;
 779   $self->_print_query($sql,@tags,@words) if DEBUG || $self->debug;
 780   my $sth = $self->_prepare($sql);
 781   $sth->execute(@tags,@wild_card_words) or $self->throw($sth->errstr);
 782
 783   my @results;
 784   while (my($name,$value,$type,$id) = $sth->fetchrow_array) {
 785     my (@hits) = $value =~ /$perl_regexp/ig;
 786     my @words_in_row = split /\b/,$value;
 787     my $score  = int(@hits*100/@words/@words_in_row);
 788     push @results,[$name,$value,$score,$type,$id];
 789   }
 790   $sth->finish;
 791   @results = sort {$b->[2]<=>$a->[2]} @results;
 792   return @results;
 793 }
 794
 795 sub _match_sql {
 796   my $self = shift;
 797   my $name = shift;
 798
 799   my ($match,$string);
 800   if ($name =~ /(?:^|[^\\])[*?]/) {
 801     $name =~ s/(^|[^\\])([%_])/$1\\$2/g;
 802     $name =~ s/(^|[^\\])\*/$1%/g;
 803     $name =~ s/(^|[^\\])\?/$1_/g;
 804     $match = "LIKE ?";
 805     $string  = $name;
 806   } else {
 807     $match = "= lower(?)";
 808     $string  = lc($name);
 809   }
 810   return ($match,$string);
 811 }
 812
 813 sub _attributes_sql {
 814   my $self = shift;
 815   my ($attributes,$join) = @_;
 816
 817   my ($wf,@bind_args)       = $self->_make_attribute_where('a','al',$attributes);
 818   my ($group_by,@group_args)= $self->_make_attribute_group('a',$attributes);
 819
 820   my $attribute_table       = $self->_attribute_table;
 821   my $attributelist_table   = $self->_attributelist_table;
 822
 823   my $from = "$attribute_table AS a INDEXED BY index_attribute_id, $attributelist_table AS al";
 824
 825   my $where = <<END;
 826   a.id=$join
 827   AND   a.attribute_id=al.id
 828   AND ($wf)
 829 END
 830
 831   my $group = $group_by;
 832
 833   my @args  = (@bind_args,@group_args);
 834   return ($from,$where,$group,@args);
 835 }
 836
 837 # overridden because of case-sensitivity of matches
 838 sub _types_sql {
 839   my $self  = shift;
 840   my ($types,$type_table) = @_;
 841   my ($primary_tag,$source_tag);
 842
 843   my @types = ref $types eq 'ARRAY' ?  @$types : $types;
 844
 845   my $typelist      = $self->_typelist_table;
 846   my $from = "$typelist AS tl";
 847
 848   my (@matches,@args);
 849
 850   for my $type (@types) {
 851
 852     if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
 853       $primary_tag = $type->method;
 854       $source_tag  = $type->source;
 855     } else {
 856       ($primary_tag,$source_tag) = split ':',$type,2;
 857     }
 858
 859     if (defined $source_tag) {
 860       push @matches,"lower(tl.tag)=lower(?)";
 861       push @args,"$primary_tag:$source_tag";
 862     } else {
 863       push @matches,"tl.tag LIKE ?";
 864       push @args,"$primary_tag:%";
 865     }
 866   }
 867   my $matches = join ' OR ',@matches;
 868
 869   my $where = <<END;
 870    tl.id=$type_table.typeid
 871    AND   ($matches)
 872 END
 873
 874   return ($from,$where,'',@args);
 875 }
 876
 877 sub optimize {
 878   my $self = shift;
 879   $self->dbh->do("ANALYZE $_") foreach $self->index_tables;
 880 }
 881
 882 ###
 883 # Replace Bio::SeqFeatureI into database.
 884 #
 885 sub replace {
 886   my $self       = shift;
 887   my $object     = shift;
 888   my $index_flag = shift || undef;
 889
 890   # ?? shouldn't need to do this
 891   # $self->_load_class($object);
 892   my $id = $object->primary_id;
 893   my $features = $self->_feature_table;
 894
 895   my $sth = $self->_prepare(<<END);
 896 REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES (?,?,?,?,?)
 897 END
 898
 899   my ($seqid,$start,$end,$strand,$bin) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x5;
 900
 901   my $primary_tag = $object->primary_tag;
 902   my $source_tag  = $object->source_tag || '';
 903   $primary_tag    .= ":$source_tag";
 904   my $typeid   = $self->_typeid($primary_tag,1);
 905
 906   my $frozen = $self->no_blobs() ? 0 : $self->freeze($object);
 907
 908   $sth->bind_param(1, $id);
 909   $sth->bind_param(2, $frozen, {TYPE => SQL_BLOB});
 910   $sth->bind_param(3, $index_flag||0);
 911   $sth->bind_param(4, $strand);
 912   $sth->bind_param(5, $typeid);
 913
 914   $sth->execute() or $self->throw($sth->errstr);
 915
 916   my $dbh = $self->dbh;
 917   $object->primary_id($dbh->func('last_insert_rowid')) unless defined $id;
 918
 919   $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
 920 }
 921
 922 # doesn't work with this schema, since we have to update name and attribute
 923 # tables which need object ids, which we can only know by replacing feats in
 924 # the feature table one by one
 925 sub bulk_replace {
 926     my $self       = shift;
 927     my $index_flag = shift || undef;
 928     my @objects    = @_;
 929
 930     my $features = $self->_feature_table;
 931
 932     my @insert_values;
 933     foreach my $object (@objects) {
 934         my $id = $object->primary_id;
 935         my (undef,undef,undef,$strand) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x4;
 936         my $primary_tag = $object->primary_tag;
 937         my $source_tag  = $object->source_tag || '';
 938         $primary_tag    .= ":$source_tag";
 939         my $typeid   = $self->_typeid($primary_tag,1);
 940
 941         push(@insert_values, ($id,0,$index_flag||0,$strand,$typeid));
 942     }
 943
 944     my @value_blocks;
 945     for (1..@objects) {
 946         push(@value_blocks, '(?,?,?,?,?)');
 947     }
 948     my $value_blocks = join(',', @value_blocks);
 949     my $sql = qq{REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES $value_blocks};
 950
 951     my $sth = $self->_prepare($sql);
 952     $sth->execute(@insert_values) or $self->throw($sth->errstr);
 953 }
 954
 955 sub _get_location_and_bin {
 956     my $self = shift;
 957     my $obj  = shift;
 958     my $seqid   = $self->_locationid($obj->seq_id);
 959     my $start   = $obj->start;
 960     my $end     = $obj->end;
 961     my $strand  = $obj->strand;
 962     return ($seqid,$start,$end,$strand,$self->calculate_bin($start,$end));
 963 }
 964
 965 ###
 966 # Insert one Bio::SeqFeatureI into database. primary_id must be undef
 967 #
 968 sub insert {
 969   my $self = shift;
 970   my $object = shift;
 971   my $index_flag = shift || 0;
 972
 973   $self->_load_class($object);
 974   defined $object->primary_id and $self->throw("$object already has a primary id");
 975
 976   my $features = $self->_feature_table;
 977   my $sth = $self->_prepare(<<END);
 978 INSERT INTO $features (id,object,"indexed") VALUES (?,?,?)
 979 END
 980   $sth->execute(undef,$self->freeze($object),$index_flag) or $self->throw($sth->errstr);
 981   my $dbh = $self->dbh;
 982   $object->primary_id($dbh->func('last_insert_rowid'));
 983   $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
 984 }
 985
 986 =head2 types
 987
 988  Title   : types
 989  Usage   : @type_list = $db->types
 990  Function: Get all the types in the database
 991  Returns : array of Bio::DB::GFF::Typename objects
 992  Args    : none
 993  Status  : public
 994
 995 =cut
 996
 997 sub _genericid {
 998   my $self = shift;
 999   my ($table,$namefield,$name,$add_if_missing) = @_;
1000   my $qualified_table = $self->_qualify($table);
1001   my $sth = $self->_prepare(<<END);
1002 SELECT id FROM $qualified_table WHERE $namefield=?
1003 END
1004   $sth->execute($name) or die $sth->errstr;
1005   my ($id) = $sth->fetchrow_array;
1006   $sth->finish;
1007   return $id if defined $id;
1008   return     unless $add_if_missing;
1009
1010   $sth = $self->_prepare(<<END);
1011 INSERT INTO $qualified_table ($namefield) VALUES (?)
1012 END
1013   $sth->execute($name) or die $sth->errstr;
1014   my $dbh = $self->dbh;
1015   return $dbh->func('last_insert_rowid');
1016 }
1017 ###
1018 # special-purpose store for bulk loading - write to a file rather than to the db
1019 #
1020 sub _dump_store {
1021   my $self    = shift;
1022   my $indexed = shift;
1023
1024   my $count = 0;
1025   my $store_fh = $self->dump_filehandle('feature');
1026   my $dbh      = $self->dbh;
1027
1028   my $autoindex = $self->autoindex;
1029
1030   for my $obj (@_) {
1031     my $id       = $self->next_id;
1032     my ($seqid,$start,$end,$strand) = $indexed ? $self->_get_location_and_bin($obj) : (undef)x4;
1033     my $primary_tag = $obj->primary_tag;
1034     my $source_tag  = $obj->source_tag || '';
1035     $primary_tag    .= ":$source_tag";
1036     my $typeid   = $self->_typeid($primary_tag,1);
1037
1038     # Encode BLOB in hex so we can more easily import it into SQLite
1039     print $store_fh
1040     join("\t",$id,$typeid,$strand,$indexed,
1041          unpack('H*', $self->freeze($obj))),"\n";
1042     $obj->primary_id($id);
1043     $self->_update_indexes($obj) if $indexed && $autoindex;
1044     $count++;
1045   }
1046
1047   # remember whether we are have ever stored a non-indexed feature
1048   unless ($indexed or $self->{indexed_flag}++) {
1049     $self->subfeatures_are_indexed(0);
1050   }
1051   $count;
1052 }
1053
1054 sub _dump_update_name_index {
1055   my $self = shift;
1056   my ($obj,$id) = @_;
1057   my $fh      = $self->dump_filehandle('name');
1058   my $dbh     = $self->dbh;
1059   my ($names,$aliases) = $self->feature_names($obj);
1060   # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
1061   print $fh join("\t",$id,$_,1),"\n" foreach @$names;
1062   print $fh join("\t",$id,$_,0),"\n" foreach @$aliases;
1063 }
1064
1065 sub _dump_update_attribute_index {
1066   my $self = shift;
1067   my ($obj,$id) = @_;
1068   my $fh        = $self->dump_filehandle('attribute');
1069   my $dbh       = $self->dbh;
1070   for my $tag ($obj->all_tags) {
1071     my $tagid = $self->_attributeid($tag);
1072     for my $value ($obj->each_tag_value($tag)) {
1073       # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
1074       print $fh join("\t",$id,$tagid,$value),"\n";
1075     }
1076   }
1077 }
1078
1079 sub _update_indexes {
1080     my $self = shift;
1081     my $obj  = shift;
1082     defined (my $id   = $obj->primary_id) or return;
1083     $self->SUPER::_update_indexes($obj);
1084
1085     if ($self->{bulk_update_in_progress}) {
1086         $self->_dump_update_location_index($obj,$id);
1087     } else {
1088         $self->_update_location_index($obj,$id);
1089     }
1090 }
1091
1092 sub _update_location_index {
1093     my $self = shift;
1094     my ($obj,$id) = @_;
1095     my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);
1096
1097     my $table = $self->_feature_index_table;
1098     $self->_delete_index($table,$id);
1099
1100     my ($sql,@args);
1101
1102     if ($self->_has_spatial_index) {
1103         $sql    = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
1104         @args   = ($id,$seqid,$bin,$start,$end);
1105     } else {
1106         $sql    = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
1107         @args   = ($id,$seqid,$bin,$start,$end);
1108     }
1109
1110     my $sth  = $self->_prepare($sql);
1111     $sth->execute(@args);
1112     $sth->finish;
1113 }
1114
1115 sub _dump_update_location_index {
1116     my $self = shift;
1117     my ($obj,$id) = @_;
1118     my $table   = $self->_feature_index_table;
1119     my $fh      = $self->dump_filehandle($table);
1120     my $dbh     = $self->dbh;
1121     my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);
1122     my @args = $self->_has_spatial_index ? ($id,$seqid,$bin,$start,$end)
1123                                          : ($id,$seqid,$bin,$start,$end);
1124     print $fh join("\t",@args),"\n";
1125 }
1126
1127
1128 1;
1129
1130 =head1 AUTHOR
1131
1132 Nathan Weeks <Nathan.Weeks@ars.usda.gov>
1133
1134 Copyright (c) 2009 Nathan Weeks
1135
1136 This library is free software; you can redistribute it and/or modify
1137 it under the same terms as Perl itself.
1138
1139 =cut
1140