Bio/DB/SeqFeature/Store/DBI/mysql.pm

   1 package Bio::DB::SeqFeature::Store::DBI::mysql;
   2
   3 =head1 NAME
   4
   5 Bio::DB::SeqFeature::Store::DBI::mysql -- Mysql implementation of Bio::DB::SeqFeature::Store
   6
   7 =head1 SYNOPSIS
   8
   9   use Bio::DB::SeqFeature::Store;
  10
  11   # Open the sequence database
  12   my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::mysql',
  13                                           -dsn     => 'dbi:mysql:test');
  14
  15   # get a feature from somewhere
  16   my $feature = Bio::SeqFeature::Generic->new(...);
  17
  18   # store it
  19   $db->store($feature) or die "Couldn't store!";
  20
  21   # primary ID of the feature is changed to indicate its primary ID
  22   # in the database...
  23   my $id = $feature->primary_id;
  24
  25   # get the feature back out
  26   my $f  = $db->fetch($id);
  27
  28   # change the feature and update it
  29   $f->start(100);
  30   $f->update($f) or die "Couldn't update!";
  31
  32   # searching...
  33   # ...by id
  34   my @features = $db->fetch_many(@list_of_ids);
  35
  36   # ...by name
  37   @features = $db->get_features_by_name('ZK909');
  38
  39   # ...by alias
  40   @features = $db->get_features_by_alias('sma-3');
  41
  42   # ...by type
  43   @features = $db->get_features_by_name('gene');
  44
  45   # ...by location
  46   @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);
  47
  48   # ...by attribute
  49   @features = $db->get_features_by_attribute({description => 'protein kinase'})
  50
  51   # ...by the GFF "Note" field
  52   @result_list = $db->search_notes('kinase');
  53
  54   # ...by arbitrary combinations of selectors
  55   @features = $db->features(-name => $name,
  56                             -type => $types,
  57                             -seq_id => $seqid,
  58                             -start  => $start,
  59                             -end    => $end,
  60                             -attributes => $attributes);
  61
  62   # ...using an iterator
  63   my $iterator = $db->get_seq_stream(-name => $name,
  64                                      -type => $types,
  65                                      -seq_id => $seqid,
  66                                      -start  => $start,
  67                                      -end    => $end,
  68                                      -attributes => $attributes);
  69
  70   while (my $feature = $iterator->next_seq) {
  71     # do something with the feature
  72   }
  73
  74   # ...limiting the search to a particular region
  75   my $segment  = $db->segment('Chr1',5000=>6000);
  76   my @features = $segment->features(-type=>['mRNA','match']);
  77
  78   # getting & storing sequence information
  79   # Warning: this returns a string, and not a PrimarySeq object
  80   $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  81   my $sequence = $db->fetch_sequence('Chr1',5000=>6000);
  82
  83   # what feature types are defined in the database?
  84   my @types    = $db->types;
  85
  86   # create a new feature in the database
  87   my $feature = $db->new_feature(-primary_tag => 'mRNA',
  88                                  -seq_id      => 'chr3',
  89                                  -start      => 10000,
  90                                  -end        => 11000);
  91
  92 =head1 DESCRIPTION
  93
  94 Bio::DB::SeqFeature::Store::mysql is the Mysql adaptor for
  95 Bio::DB::SeqFeature::Store. You will not create it directly, but
  96 instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.
  97
  98 See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
  99
 100 =head2 Using the Mysql adaptor
 101
 102 Before you can use the adaptor, you must use the mysqladmin tool to
 103 create a database and establish a user account with write
 104 permission. In order to use "fast" loading, the user account must have
 105 "file" privileges.
 106
 107 To establish a connection to the database, call
 108 Bio::DB::SeqFeature::Store-E<gt>new(-adaptor=E<gt>'DBI::mysql',@more_args). The
 109 additional arguments are as follows:
 110
 111   Argument name       Description
 112   -------------       -----------
 113
 114  -dsn              The database name. You can abbreviate
 115                    "dbi:mysql:foo" as "foo" if you wish.
 116
 117  -user             Username for authentication.
 118
 119  -pass             Password for authentication.
 120
 121  -namespace        A prefix to attach to each table. This allows you
 122                    to have several virtual databases in the same
 123                    physical database.
 124
 125  -temp             Boolean flag. If true, a temporary database
 126                    will be created and destroyed as soon as
 127                    the Store object goes out of scope. (synonym -temporary)
 128
 129  -autoindex        Boolean flag. If true, features in the database will be
 130                    reindexed every time they change. This is the default.
 131
 132
 133  -tmpdir           Directory in which to place temporary files during "fast" loading.
 134                    Defaults to File::Spec->tmpdir(). (synonyms -dump_dir, -dumpdir, -tmp)
 135
 136  -dbi_options      A hashref to pass to DBI->connect's 4th argument, the "attributes."
 137                    (synonyms -options, -dbi_attr)
 138
 139  -write            Pass true to open database for writing or updating.
 140
 141 If successful, a new instance of
 142 Bio::DB::SeqFeature::Store::DBI::mysql will be returned.
 143
 144 In addition to the standard methods supported by all well-behaved
 145 Bio::DB::SeqFeature::Store databases, several following
 146 adaptor-specific methods are provided. These are described in the next
 147 sections.
 148
 149 =cut
 150
 151 use strict;
 152
 153 use base 'Bio::DB::SeqFeature::Store';
 154 use Bio::DB::SeqFeature::Store::DBI::Iterator;
 155 use DBI;
 156 use Memoize;
 157 use Cwd 'abs_path';
 158 use Bio::DB::GFF::Util::Rearrange 'rearrange';
 159 use Bio::SeqFeature::Lite;
 160 use File::Spec;
 161 use Carp 'carp','cluck','croak';
 162 use constant DEBUG=>0;
 163
 164 # from the MySQL documentation...
 165 # WARNING: if your sequence uses coordinates greater than 2 GB, you are out of luck!
 166 use constant MAX_INT =>  2_147_483_647;
 167 use constant MIN_INT => -2_147_483_648;
 168 use constant MAX_BIN =>  1_000_000_000;  # size of largest feature = 1 Gb
 169 use constant MIN_BIN =>  1000;           # smallest bin we'll make - on a 100 Mb chromosome, there'll be 100,000 of these
 170 use constant SUMMARY_BIN_SIZE => 1000;
 171
 172 # tier 0 == 1000 bp bins
 173 # tier 1 == 10,000 bp bins
 174 # etc.
 175
 176 memoize('_typeid');
 177 memoize('_locationid');
 178 memoize('_attributeid');
 179 memoize('dump_path');
 180
 181 ###
 182 # object initialization
 183 #
 184 sub init {
 185   my $self          = shift;
 186   my ($dsn,
 187       $is_temporary,
 188       $autoindex,
 189       $namespace,
 190       $dump_dir,
 191       $user,
 192       $pass,
 193       $dbi_options,
 194       $writeable,
 195       $create,
 196      ) = rearrange(['DSN',
 197                     ['TEMP','TEMPORARY'],
 198                     'AUTOINDEX',
 199                     'NAMESPACE',
 200                     ['DUMP_DIR','DUMPDIR','TMP','TMPDIR'],
 201                     'USER',
 202                     ['PASS','PASSWD','PASSWORD'],
 203                     ['OPTIONS','DBI_OPTIONS','DBI_ATTR'],
 204                     ['WRITE','WRITEABLE'],
 205                     'CREATE',
 206                    ],@_);
 207   $dbi_options  ||= {};
 208   $writeable    = 1 if $is_temporary or $dump_dir;
 209
 210   $dsn or $self->throw("Usage: ".__PACKAGE__."->init(-dsn => \$dbh || \$dsn)");
 211
 212   my $dbh;
 213   if (ref $dsn) {
 214     $dbh = $dsn;
 215   } else {
 216     $dsn = "dbi:mysql:$dsn" unless $dsn =~ /^dbi:/;
 217     $dbh = DBI->connect($dsn,$user,$pass,$dbi_options) or $self->throw($DBI::errstr);
 218     $dbh->{mysql_auto_reconnect} = 1;
 219   }
 220   $self->{dbh}       = $dbh;
 221   $self->{is_temp}   = $is_temporary;
 222   $self->{namespace} = $namespace;
 223   $self->{writeable} = $writeable;
 224
 225   $self->default_settings;
 226   $self->autoindex($autoindex)                   if defined $autoindex;
 227   $self->dumpdir($dump_dir)                      if $dump_dir;
 228   if ($self->is_temp) {
 229     $self->init_tmp_database();
 230   } elsif ($create) {
 231     $self->init_database('erase');
 232   }
 233 }
 234
 235 sub writeable { shift->{writeable} }
 236
 237 sub can_store_parentage { 1 }
 238
 239 sub table_definitions {
 240   my $self = shift;
 241   return {
 242           feature => <<END,
 243 (
 244   id       int(10) auto_increment primary key,
 245   typeid   int(10)      not null,
 246   seqid    int(10),
 247   start    int,
 248   end      int,
 249   strand   tinyint      default 0,
 250   tier     tinyint,
 251   bin      int,
 252   indexed  tinyint default 1,
 253   object     MEDIUMBLOB not null,
 254   index(seqid,tier,bin,typeid),
 255   index(typeid)
 256 )
 257 END
 258
 259           locationlist => <<END,
 260 (
 261   id         int(10)       auto_increment primary key,
 262   seqname    varchar(255)   not null,
 263   index(seqname)
 264 )
 265 END
 266
 267           typelist => <<END,
 268 (
 269   id       int(10) auto_increment primary key,
 270   tag      varchar(255)  not null,
 271   index(tag)
 272 )
 273 END
 274           name => <<END,
 275 (
 276   id           int(10)       not null,
 277   name         varchar(255)  not null,
 278   display_name tinyint       default 0,
 279   index(id),
 280   index(name)
 281 )
 282 END
 283
 284           attribute => <<END,
 285 (
 286   id               int(10)       not null,
 287   attribute_id     int(10)   not null,
 288   attribute_value  text,
 289   index(id),
 290   index(attribute_id,attribute_value(10))
 291 )
 292 END
 293
 294           attributelist => <<END,
 295 (
 296   id       int(10) auto_increment primary key,
 297   tag      varchar(255)  not null,
 298   index(tag)
 299 )
 300 END
 301           parent2child => <<END,
 302 (
 303   id               int(10)       not null,
 304   child            int(10)       not null,
 305   unique index(id,child)
 306 )
 307 END
 308
 309           meta => <<END,
 310 (
 311   name      varchar(128) primary key,
 312   value     varchar(128) not null
 313 )
 314 END
 315           sequence => <<END,
 316 (
 317   id       int(10) not null,
 318   offset   int(10) unsigned not null,
 319   sequence longblob,
 320   primary key(id,offset)
 321 )
 322 END
 323           interval_stats => <<END,
 324 (
 325    typeid            integer not null,
 326    seqid             integer not null,
 327    bin               integer not null,
 328    cum_count         integer not null,
 329    primary key(typeid,seqid,bin)
 330 )
 331 END
 332          };
 333 }
 334
 335 ###
 336 # default settings -- will create and populate meta table if needed
 337 #
 338 sub default_settings {
 339   my $self = shift;
 340   $self->maybe_create_meta();
 341   $self->SUPER::default_settings;
 342   $self->autoindex(1);
 343   $self->dumpdir(File::Spec->tmpdir);
 344 }
 345
 346
 347 ###
 348 # retrieve database handle
 349 #
 350 sub dbh {
 351   my $self = shift;
 352   my $d    = $self->{dbh};
 353   $self->{dbh} = shift if @_;
 354   $d;
 355 }
 356
 357 sub clone {
 358     my $self = shift;
 359     $self->{dbh}{InactiveDestroy} = 1;
 360     $self->{dbh} = $self->{dbh}->clone
 361         unless $self->is_temp;
 362 }
 363
 364 ###
 365 # get/set directory for bulk load tables
 366 #
 367 sub dumpdir {
 368   my $self = shift;
 369   my $d = $self->{dumpdir};
 370   $self->{dumpdir} = abs_path(shift) if @_;
 371   $d;
 372 }
 373
 374 ###
 375 # table namespace (multiple dbs in one mysql db)
 376 #
 377 sub namespace {
 378   my $self = shift;
 379   my $d    = $self->{namespace};
 380   $self->{namespace} = shift if @_;
 381   $d;
 382 }
 383
 384 ###
 385 # Required for Pg not mysql
 386 #
 387 sub remove_namespace {
 388   return;
 389 }
 390
 391 ###
 392 # find a path that corresponds to a dump table
 393 #
 394 sub dump_path {
 395   my $self = shift;
 396   my $table = $self->_qualify(shift);
 397   return "$self->{dumpdir}/$table.$$";
 398 }
 399
 400 ###
 401 # make a filehandle (writeable) that corresponds to a dump table
 402 #
 403 sub dump_filehandle {
 404   my $self = shift;
 405   my $table = shift;
 406   eval "require IO::File" unless IO::File->can('new');
 407   my $path  = $self->dump_path($table);
 408   my $fh = $self->{filehandles}{$path} ||= IO::File->new(">$path");
 409   $fh;
 410 }
 411
 412 ###
 413 # find the next ID for a feature (used only during bulk loading)
 414 #
 415 sub next_id {
 416   my $self = shift;
 417   $self->{max_id} ||= $self->max_id;
 418   return ++$self->{max_id};
 419 }
 420
 421 ###
 422 # find the maximum ID for a feature (used only during bulk loading)
 423 #
 424 sub max_id {
 425   my $self = shift;
 426   my $features = $self->_feature_table;
 427   my $sth  = $self->_prepare("SELECT max(id) from $features");
 428   $sth->execute or $self->throw($sth->errstr);
 429   my ($id) = $sth->fetchrow_array;
 430   $id;
 431 }
 432
 433 ###
 434 # wipe database clean and reinstall schema
 435 #
 436 sub _init_database {
 437   my $self = shift;
 438   my $erase = shift;
 439
 440   my $dbh    = $self->dbh;
 441   my $tables = $self->table_definitions;
 442
 443   for my $t (keys %$tables) {
 444     next if $t eq 'meta';      # don't get rid of meta data!
 445     my $table = $self->_qualify($t);
 446     $dbh->do("DROP table IF EXISTS $table") if $erase;
 447     my $query = "CREATE TABLE IF NOT EXISTS $table $tables->{$t}";
 448     $self->_create_table($dbh,$query);
 449   }
 450   $self->subfeatures_are_indexed(1) if $erase;
 451   1;
 452 }
 453
 454 sub init_tmp_database {
 455   my $self = shift;
 456
 457   my $dbh    = $self->dbh;
 458   my $tables = $self->table_definitions;
 459
 460   for my $t (keys %$tables) {
 461     next if $t eq 'meta';  # done earlier
 462     my $table = $self->_qualify($t);
 463     my $query = "CREATE TEMPORARY TABLE $table $tables->{$t}";
 464     $self->_create_table($dbh,$query);
 465   }
 466   1;
 467 }
 468
 469 sub _create_table {
 470     my $self         = shift;
 471     my ($dbh,$query) = @_;
 472     for my $q (split ';',$query) {
 473         chomp($q);
 474         next unless $q =~ /\S/;
 475         $dbh->do("$q;\n") or $self->throw($dbh->errstr);
 476     }
 477 }
 478
 479 sub maybe_create_meta {
 480   my $self = shift;
 481   return unless $self->writeable;
 482   my $meta   = $self->_meta_table;
 483   my $tables = $self->table_definitions;
 484   my $temporary = $self->is_temp ? 'TEMPORARY' : '';
 485   $self->dbh->do("CREATE $temporary TABLE IF NOT EXISTS $meta $tables->{meta}");
 486 }
 487
 488 ###
 489 # use temporary tables
 490 #
 491 sub is_temp {
 492   shift->{is_temp};
 493 }
 494
 495 sub attributes {
 496     my $self = shift;
 497     my $dbh  = $self->dbh;
 498     my $attributelist_table = $self->_attributelist_table;
 499
 500     my $a    = $dbh->selectcol_arrayref("SELECT tag FROM $attributelist_table")
 501        or $self->throw($dbh->errstr);
 502     return @$a;
 503 }
 504
 505 sub _store {
 506   my $self    = shift;
 507
 508   # special case for bulk updates
 509   return $self->_dump_store(@_) if $self->{bulk_update_in_progress};
 510
 511   my $indexed = shift;
 512   my $count = 0;
 513
 514   my $autoindex = $self->autoindex;
 515
 516   my $dbh = $self->dbh;
 517   local $dbh->{RaiseError} = 1;
 518   $self->begin_work;
 519   eval {
 520     for my $obj (@_) {
 521       $self->replace($obj,$indexed);
 522       $self->_update_indexes($obj) if $indexed && $autoindex;
 523       $count++;
 524     }
 525   };
 526
 527   if ($@) {
 528     warn "Transaction aborted because $@";
 529     $self->rollback;
 530   }
 531   else {
 532     $self->commit;
 533   }
 534
 535   # remember whether we are have ever stored a non-indexed feature
 536   unless ($indexed or $self->{indexed_flag}++) {
 537     $self->subfeatures_are_indexed(0);
 538   }
 539   $count;
 540 }
 541
 542 # we memoize this in order to avoid making zillions of calls
 543 sub autoindex {
 544   my $self = shift;
 545
 546   # special case for bulk update -- need to build the indexes
 547   # at the same time we build the main feature table
 548   return 1 if $self->{bulk_update_in_progress};
 549   my $d = $self->setting('autoindex');
 550   $self->setting(autoindex=>shift) if @_;
 551   $d;
 552 }
 553
 554 sub _start_bulk_update {
 555   my $self = shift;
 556   my $dbh  = $self->dbh;
 557   $self->begin_work;
 558   $self->{bulk_update_in_progress}++;
 559 }
 560
 561 sub _finish_bulk_update {
 562   my $self = shift;
 563   my $dbh  = $self->dbh;
 564   my $dir = $self->{dumpdir} || '.';
 565   for my $table ($self->_feature_table,$self->index_tables) {
 566     my $fh = $self->dump_filehandle($table);
 567     my $path = $self->dump_path($table);
 568     $fh->close;
 569     #print STDERR "$path\n";
 570
 571     $dbh->do("LOAD DATA LOCAL INFILE '$path' REPLACE INTO TABLE $table FIELDS OPTIONALLY ENCLOSED BY '\\''")
 572       or $self->throw($dbh->errstr);
 573     unlink $path;
 574   }
 575   delete $self->{bulk_update_in_progress};
 576   delete $self->{   filehandles};
 577   $self->commit;
 578 }
 579
 580
 581 ###
 582 # Add a subparts to a feature. Both feature and all subparts must already be in database.
 583 #
 584 sub _add_SeqFeature {
 585   my $self     = shift;
 586
 587   # special purpose method for case when we are doing a bulk update
 588   return $self->_dump_add_SeqFeature(@_) if $self->{bulk_update_in_progress};
 589
 590   my $parent   = shift;
 591   my @children = @_;
 592
 593   my $dbh = $self->dbh;
 594   local $dbh->{RaiseError} = 1;
 595
 596   my $parent2child = $self->_parent2child_table();
 597   my $count = 0;
 598
 599   my $sth = $self->_prepare(<<END);
 600 REPLACE INTO $parent2child (id,child) VALUES (?,?)
 601 END
 602
 603   my $parent_id = (ref $parent ? $parent->primary_id : $parent)
 604     or $self->throw("$parent should have a primary_id");
 605
 606   $self->begin_work or $self->throw($dbh->errstr);
 607   eval {
 608     for my $child (@children) {
 609       my $child_id = ref $child ? $child->primary_id : $child;
 610       defined $child_id or die "no primary ID known for $child";
 611       $sth->execute($parent_id,$child_id);
 612       $count++;
 613     }
 614   };
 615
 616   if ($@) {
 617     warn "Transaction aborted because $@";
 618     $self->rollback;
 619   }
 620   else {
 621     $self->commit;
 622   }
 623   $sth->finish;
 624   $count;
 625 }
 626
 627 sub _fetch_SeqFeatures {
 628   my $self   = shift;
 629   my $parent = shift;
 630   my @types  = @_;
 631
 632   my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
 633   my $features = $self->_feature_table;
 634   my $parent2child   = $self->_parent2child_table();
 635
 636   my @from  = ("$features as f","$parent2child as c");
 637   my @where = ('f.id=c.child','c.id=?');
 638   my @args  = $parent_id;
 639
 640   if (@types) {
 641     my ($from,$where,undef,@a) = $self->_types_sql(\@types,'f');
 642     push @from,$from   if $from;
 643     push @where,$where if $where;
 644     push @args,@a;
 645   }
 646
 647   my $from  = join ', ',@from;
 648   my $where = join ' AND ',@where;
 649
 650   my $query = <<END;
 651 SELECT f.id,f.object
 652   FROM $from
 653   WHERE $where
 654 END
 655
 656   $self->_print_query($query,@args) if DEBUG || $self->debug;
 657
 658   my $sth = $self->_prepare($query) or $self->throw($self->dbh->errstr);
 659
 660   $sth->execute(@args) or $self->throw($sth->errstr);
 661   return $self->_sth2objs($sth);
 662 }
 663
 664 ###
 665 # get primary sequence between start and end
 666 #
 667 sub _fetch_sequence {
 668   my $self = shift;
 669   my ($seqid,$start,$end) = @_;
 670
 671   # backward compatibility to the old days when I liked reverse complementing
 672   # dna by specifying $start > $end
 673   my $reversed;
 674   if (defined $start && defined $end && $start > $end) {
 675     $reversed++;
 676     ($start,$end) = ($end,$start);
 677   }
 678   $start-- if defined $start;
 679   $end--   if defined $end;
 680
 681   my $id      = $self->_locationid($seqid);
 682   my $offset1 = $self->_offset_boundary($id,$start || 'left');
 683   my $offset2 = $self->_offset_boundary($id,$end   || 'right');
 684   my $sequence_table = $self->_sequence_table;
 685
 686   my $sql = <<END;
 687 SELECT sequence,offset
 688    FROM $sequence_table as s
 689    WHERE s.id=?
 690      AND s.offset >= ?
 691      AND s.offset <= ?
 692    ORDER BY s.offset
 693 END
 694
 695   my $sth     = $self->_prepare($sql);
 696   my $seq = '';
 697   $self->_print_query($sql,$id,$offset1,$offset2) if DEBUG || $self->debug;
 698   $sth->execute($id,$offset1,$offset2) or $self->throw($sth->errstr);
 699
 700   while (my($frag,$offset) = $sth->fetchrow_array) {
 701     substr($frag,0,$start-$offset) = '' if defined $start && $start > $offset;
 702     $seq .= $frag;
 703   }
 704   substr($seq,$end-$start+1) = '' if defined $end && $end-$start+1 < length($seq);
 705   if ($reversed) {
 706     $seq = reverse $seq;
 707     $seq =~ tr/gatcGATC/ctagCTAG/;
 708   }
 709   $sth->finish;
 710   $seq;
 711 }
 712
 713 sub _offset_boundary {
 714   my $self = shift;
 715   my ($seqid,$position) = @_;
 716
 717   my $sequence_table     = $self->_sequence_table;
 718   my $locationlist_table = $self->_locationlist_table;
 719
 720   my $sql;
 721   $sql =  $position eq 'left'  ? "SELECT min(offset) FROM $sequence_table as s WHERE s.id=?"
 722          :$position eq 'right' ? "SELECT max(offset) FROM $sequence_table as s WHERE s.id=?"
 723          :"SELECT max(offset) FROM $sequence_table as s WHERE s.id=? AND offset<=?";
 724   my $sth = $self->_prepare($sql);
 725   my @args = $position =~ /^-?\d+$/ ? ($seqid,$position) : ($seqid);
 726   $self->_print_query($sql,@args) if DEBUG || $self->debug;
 727   $sth->execute(@args) or $self->throw($sth->errstr);
 728   my $boundary = $sth->fetchall_arrayref->[0][0];
 729   $sth->finish;
 730   return $boundary;
 731 }
 732
 733
 734 ###
 735 # add namespace to tablename
 736 #
 737 sub _qualify {
 738   my $self = shift;
 739   my $table_name = shift;
 740   my $namespace = $self->namespace;
 741   return $table_name if (!defined $namespace ||
 742                          # is namespace already present in table name?
 743                          index($table_name, $namespace) == 0);
 744   return "${namespace}_${table_name}";
 745 }
 746
 747 ###
 748 # Fetch a Bio::SeqFeatureI from database using its primary_id
 749 #
 750 sub _fetch {
 751   my $self       = shift;
 752   @_ or $self->throw("usage: fetch(\$primary_id)");
 753   my $primary_id = shift;
 754   my $features = $self->_feature_table;
 755   my $sth = $self->_prepare(<<END);
 756 SELECT id,object FROM $features WHERE id=?
 757 END
 758   $sth->execute($primary_id) or $self->throw($sth->errstr);
 759   my $obj = $self->_sth2obj($sth);
 760   $sth->finish;
 761   $obj;
 762 }
 763
 764 ###
 765 # Efficiently fetch a series of IDs from the database
 766 # Can pass an array or an array ref
 767 #
 768 sub _fetch_many {
 769   my $self       = shift;
 770   @_ or $self->throw('usage: fetch_many($id1,$id2,$id3...)');
 771   my $ids = join ',',map {ref($_) ? @$_ : $_} @_ or return;
 772   my $features = $self->_feature_table;
 773
 774   my $sth = $self->_prepare(<<END);
 775 SELECT id,object FROM $features WHERE id IN ($ids)
 776 END
 777   $sth->execute() or $self->throw($sth->errstr);
 778   return $self->_sth2objs($sth);
 779 }
 780
 781 sub _features {
 782   my $self = shift;
 783   my ($seq_id,$start,$end,$strand,
 784       $name,$class,$allow_aliases,
 785       $types,
 786       $attributes,
 787       $range_type,
 788       $fromtable,
 789       $iterator,
 790       $sources,
 791       ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
 792                      'NAME','CLASS','ALIASES',
 793                      ['TYPES','TYPE','PRIMARY_TAG'],
 794                      ['ATTRIBUTES','ATTRIBUTE'],
 795                      'RANGE_TYPE',
 796                      'FROM_TABLE',
 797                      'ITERATOR',
 798                      ['SOURCE','SOURCES'],
 799                     ],@_);
 800
 801   my (@from,@where,@args,@group);
 802   $range_type ||= 'overlaps';
 803
 804   my $features         = $self->_feature_table;
 805   @from = "$features as f";
 806
 807   if (defined $name) {
 808     # hacky backward compatibility workaround
 809     undef $class if $class && $class eq 'Sequence';
 810     $name = "$class:$name" if defined $class && length $class > 0;
 811     # last argument is the join field
 812     my ($from,$where,$group,@a) = $self->_name_sql($name,$allow_aliases,'f.id');
 813     push @from,$from   if $from;
 814     push @where,$where if $where;
 815     push @group,$group if $group;
 816     push @args,@a;
 817   }
 818
 819   if (defined $seq_id) {
 820     # last argument is the name of the features table
 821     my ($from,$where,$group,@a) = $self->_location_sql($seq_id,$start,$end,$range_type,$strand,'f');
 822     push @from,$from   if $from;
 823     push @where,$where if $where;
 824     push @group,$group if $group;
 825     push @args,@a;
 826   }
 827
 828   if (defined($sources)) {
 829     my @sources = ref($sources) eq 'ARRAY' ? @{$sources} : ($sources);
 830     if (defined($types)) {
 831         my @types = ref($types) eq 'ARRAY' ? @{$types} : ($types);
 832         my @final_types;
 833         foreach my $type (@types) {
 834             # *** not sure what to do if user supplies both -source and -type
 835             #     where the type includes a source!
 836             if ($type =~ /:/) {
 837                 push(@final_types, $type);
 838             }
 839             else {
 840                 foreach my $source (@sources) {
 841                     push(@final_types, $type.':'.$source);
 842                 }
 843             }
 844         }
 845         $types = \@final_types;
 846     }
 847     else {
 848         $types = [map { ':'.$_ } @sources];
 849     }
 850   }
 851
 852   if (defined($types)) {
 853     # last argument is the name of the features table
 854     my ($from,$where,$group,@a) = $self->_types_sql($types,'f');
 855     push @from,$from   if $from;
 856     push @where,$where if $where;
 857     push @group,$group if $group;
 858     push @args,@a;
 859   }
 860
 861   if (defined $attributes) {
 862     # last argument is the join field
 863     my ($from,$where,$group,@a) = $self->_attributes_sql($attributes,'f.id');
 864     push @from,$from    if $from;
 865     push @where,$where  if $where;
 866     push @group,$group  if $group;
 867     push @args,@a;
 868   }
 869
 870   if (defined $fromtable) {
 871     # last argument is the join field
 872     my ($from,$where,$group,@a) = $self->_from_table_sql($fromtable,'f.id');
 873     push @from,$from    if $from;
 874     push @where,$where  if $where;
 875     push @group,$group  if $group;
 876     push @args,@a;
 877   }
 878
 879   # if no other criteria are specified, then
 880   # only fetch indexed (i.e. top level objects)
 881   @where = 'indexed=1' unless @where;
 882
 883   my $from  = join ', ',@from;
 884   my $where = join ' AND ',map {"($_)"} @where;
 885   my $group = join ', ',@group;
 886   $group    = "GROUP BY $group" if @group;
 887
 888   my $query = <<END;
 889 SELECT f.id,f.object,f.typeid,f.seqid,f.start,f.end,f.strand
 890   FROM $from
 891   WHERE $where
 892   $group
 893 END
 894 ;
 895   $self->_print_query($query,@args) if DEBUG || $self->debug;
 896
 897   my $sth = $self->_prepare($query) or $self->throw($self->dbh->errstr);
 898   $sth->execute(@args) or $self->throw($sth->errstr);
 899   return $iterator ? Bio::DB::SeqFeature::Store::DBI::Iterator->new($sth,$self) : $self->_sth2objs($sth);
 900 }
 901
 902 sub _aggregate_bins {
 903     my $self = shift;
 904     my $sth  = shift;
 905     my (%types,$binsize,$binstart);
 906     while (my ($type,$seqname,$bin,$count,$bins,$start,$end) = $sth->fetchrow_array) {
 907         $binsize                ||= ($end-$start+1)/$bins;
 908         $binstart               ||= int($start/$binsize);
 909         $types{$type}{seqname}  ||= $seqname;
 910         $types{$type}{min}      ||= $start;
 911         $types{$type}{max}      ||= $end;
 912         $types{$type}{bins}     ||= [(0) x $bins];
 913         $types{$type}{bins}[$bin-$binstart] = $count;
 914         $types{$type}{count} += $count;
 915     }
 916     my @results;
 917     for my $type (keys %types) {
 918         my $min  = $types{$type}{min};
 919         my $max  = $types{$type}{max};
 920         my $seqid= $types{$type}{seqname};
 921         my $f = Bio::SeqFeature::Lite->new(-seq_id => $seqid,
 922                                            -start  => $min,
 923                                            -end    => $max,
 924                                            -type   => "$type:bins",
 925                                            -score  => $types{$type}{count},
 926                                            -attributes => {coverage => join ',',@{$types{$type}{bins}}});
 927         push @results,$f;
 928     }
 929     return @results;
 930 }
 931
 932 sub _name_sql {
 933   my $self = shift;
 934   my ($name,$allow_aliases,$join) = @_;
 935   my $name_table   = $self->_name_table;
 936
 937   my $from  = "$name_table as n";
 938   my ($match,$string) = $self->_match_sql($name);
 939
 940   my $where = "n.id=$join AND n.name $match";
 941   $where   .= " AND n.display_name>0" unless $allow_aliases;
 942   return ($from,$where,'',$string);
 943 }
 944
 945 sub _search_attributes {
 946   my $self = shift;
 947   my ($search_string,$attribute_names,$limit) = @_;
 948   my @words               = map {quotemeta($_)} split /\s+/,$search_string;
 949
 950   my $name_table          = $self->_name_table;
 951   my $attribute_table     = $self->_attribute_table;
 952   my $attributelist_table = $self->_attributelist_table;
 953   my $type_table          = $self->_type_table;
 954   my $typelist_table      = $self->_typelist_table;
 955
 956   my @tags    = @$attribute_names;
 957   my $tag_sql = join ' OR ',("al.tag=?") x @tags;
 958
 959   my $perl_regexp = join '|',@words;
 960
 961   my $sql_regexp = join ' OR ',("a.attribute_value REGEXP ?")  x @words;
 962   my $sql = <<END;
 963 SELECT name,attribute_value,tl.tag,n.id
 964   FROM $name_table as n,$attribute_table as a,$attributelist_table as al,$type_table as t,$typelist_table as tl
 965   WHERE n.id=a.id
 966     AND al.id=a.attribute_id
 967     AND n.id=t.id
 968     AND t.typeid=tl.id
 969     AND n.display_name=1
 970     AND ($tag_sql)
 971     AND ($sql_regexp)
 972 END
 973   $sql .= "LIMIT $limit" if defined $limit;
 974   $self->_print_query($sql,@tags,@words) if DEBUG || $self->debug;
 975   my $sth = $self->_prepare($sql);
 976   $sth->execute(@tags,@words) or $self->throw($sth->errstr);
 977
 978   my @results;
 979   while (my($name,$value,$type,$id) = $sth->fetchrow_array) {
 980     my (@hits) = $value =~ /$perl_regexp/ig;
 981     my @words_in_row = split /\b/,$value;
 982     my $score   = int(@hits * 10);
 983     push @results,[$name,$value,$score,$type,$id];
 984   }
 985   $sth->finish;
 986   @results = sort {$b->[2]<=>$a->[2]} @results;
 987   return @results;
 988 }
 989
 990 sub _match_sql {
 991   my $self = shift;
 992   my $name = shift;
 993
 994   my ($match,$string);
 995   if ($name =~ /(?:^|[^\\])[*?]/) {
 996     $name =~ s/(^|[^\\])([%_])/$1\\$2/g;
 997     $name =~ s/(^|[^\\])\*/$1%/g;
 998     $name =~ s/(^|[^\\])\?/$1_/g;
 999     $match = "LIKE ?";
1000     $string  = $name;
1001   } else {
1002     $match = "= ?";
1003     $string  = $name;
1004   }
1005   return ($match,$string);
1006 }
1007
1008 sub _from_table_sql {
1009   my $self = shift;
1010   my ($from_table,$join) = @_;
1011   my $from  = "$from_table as ft";
1012   my $where = "ft.id=$join";
1013   return ($from,$where,'');
1014 }
1015
1016 sub _attributes_sql {
1017   my $self = shift;
1018   my ($attributes,$join) = @_;
1019
1020   my ($wf,@bind_args)       = $self->_make_attribute_where('a','al',$attributes);
1021   my ($group_by,@group_args)= $self->_make_attribute_group('a',$attributes);
1022
1023   my $attribute_table       = $self->_attribute_table;
1024   my $attributelist_table   = $self->_attributelist_table;
1025
1026   my $from = "$attribute_table as a use index(attribute_id), $attributelist_table as al";
1027
1028   my $where = <<END;
1029   a.id=$join
1030   AND   a.attribute_id=al.id
1031   AND ($wf)
1032 END
1033
1034   my $group = $group_by;
1035
1036   my @args  = (@bind_args,@group_args);
1037   return ($from,$where,$group,@args);
1038 }
1039
1040 sub subfeature_types_are_indexed     { 1 }
1041 sub subfeature_locations_are_indexed { 1 }
1042
1043 sub _types_sql {
1044   my $self  = shift;
1045   my ($types,$type_table) = @_;
1046   my ($primary_tag,$source_tag);
1047
1048   my @types = ref $types eq 'ARRAY' ?  @$types : $types;
1049
1050   my $typelist      = $self->_typelist_table;
1051   my $from = "$typelist AS tl";
1052
1053   my (@matches,@args);
1054
1055   for my $type (@types) {
1056
1057     if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
1058       $primary_tag = $type->method;
1059       $source_tag  = $type->source;
1060     } else {
1061       ($primary_tag,$source_tag) = split ':',$type,2;
1062     }
1063
1064     if (defined $source_tag && length $source_tag) {
1065       if (defined $primary_tag && length($primary_tag)) {
1066         push @matches,"tl.tag=?";
1067         push @args,"$primary_tag:$source_tag";
1068       }
1069       else {
1070         push @matches,"tl.tag LIKE ?";
1071         push @args,"%:$source_tag";
1072       }
1073     } else {
1074       push @matches,"tl.tag LIKE ?";
1075       push @args,"$primary_tag:%";
1076     }
1077   }
1078   my $matches = join ' OR ',@matches;
1079
1080   my $where = <<END;
1081    tl.id=$type_table.typeid
1082    AND   ($matches)
1083 END
1084
1085   return ($from,$where,'',@args);
1086 }
1087
1088 sub _location_sql {
1089   my $self = shift;
1090   my ($seq_id,$start,$end,$range_type,$strand,$location) = @_;
1091
1092   # the additional join on the location_list table badly impacts performance
1093   # so we build a copy of the table in memory
1094   my $seqid = $self->_locationid_nocreate($seq_id) || 0; # zero is an invalid primary ID, so will return empty
1095
1096   $start = MIN_INT unless defined $start;
1097   $end   = MAX_INT unless defined $end;
1098
1099   my ($bin_where,@bin_args) = $self->bin_where($start,$end,$location);
1100
1101   my ($range,@range_args);
1102   if ($range_type eq 'overlaps') {
1103     $range = "$location.end>=? AND $location.start<=? AND ($bin_where)";
1104     @range_args = ($start,$end,@bin_args);
1105   } elsif ($range_type eq 'contains') {
1106     $range = "$location.start>=? AND $location.end<=? AND ($bin_where)";
1107     @range_args = ($start,$end,@bin_args);
1108   } elsif ($range_type eq 'contained_in') {
1109     $range = "$location.start<=? AND $location.end>=?";
1110     @range_args = ($start,$end);
1111   } else {
1112     $self->throw("range_type must be one of 'overlaps', 'contains' or 'contained_in'");
1113   }
1114
1115   if (defined $strand) {
1116     $range .= " AND strand=?";
1117     push @range_args,$strand;
1118   }
1119
1120   my $where = <<END;
1121    $location.seqid=?
1122    AND   $range
1123 END
1124
1125   my $from  = '';
1126   my $group = '';
1127
1128   my @args  = ($seqid,@range_args);
1129   return ($from,$where,$group,@args);
1130 }
1131
1132 ###
1133 # force reindexing
1134 #
1135 sub reindex {
1136   my $self = shift;
1137   my $from_update_table = shift;  # if present, will take ids from "update_table"
1138
1139   my $dbh  = $self->dbh;
1140   my $count = 0;
1141   my $now;
1142
1143   # try to bring in highres time() function
1144   eval "require Time::HiRes";
1145
1146   my $last_time = $self->time();
1147
1148   # tell _delete_index() not to bother removing the index rows corresponding
1149   # to each individual feature
1150   local $self->{reindexing} = 1;
1151
1152   $self->begin_work;
1153   eval {
1154     my $update = $from_update_table;
1155     for my $table ($self->index_tables) {
1156       my $query = $from_update_table ? "DELETE $table FROM $table,$update WHERE $table.id=$update.id"
1157                                      : "DELETE FROM $table";
1158       $dbh->do($query);
1159       $self->_disable_keys($dbh,$table);
1160     }
1161     my $iterator = $self->get_seq_stream(-from_table=>$from_update_table ? $update : undef);
1162     while (my $f = $iterator->next_seq) {
1163       if (++$count %1000 == 0) {
1164         $now = $self->time();
1165         my $elapsed = sprintf(" in %5.2fs",$now - $last_time);
1166         $last_time = $now;
1167         print STDERR "$count features indexed$elapsed...",' 'x60;
1168         print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
1169       }
1170       $self->_update_indexes($f);
1171     }
1172   };
1173   for my $table ($self->index_tables) {
1174       $self->_enable_keys($dbh,$table);
1175   }
1176   if (@_) {
1177     warn "Couldn't complete transaction: $@";
1178     $self->rollback;
1179     return;
1180   } else {
1181     $self->commit;
1182     return 1;
1183   }
1184 }
1185
1186 sub optimize {
1187   my $self = shift;
1188   $self->dbh->do("ANALYZE TABLE $_") foreach $self->index_tables;
1189 }
1190
1191 sub all_tables {
1192   my $self = shift;
1193   my @index_tables = $self->index_tables;
1194   my $features = $self->_feature_table;
1195   return ($features,@index_tables);
1196 }
1197
1198 sub index_tables {
1199   my $self = shift;
1200   return map {$self->_qualify($_)} qw(name attribute parent2child)
1201 }
1202
1203 sub _firstid {
1204   my $self = shift;
1205   my $features = $self->_feature_table;
1206   my $query = <<END;
1207 SELECT min(id) FROM $features
1208 END
1209   my $sth=$self->_prepare($query);
1210   $sth->execute();
1211   my ($first) = $sth->fetchrow_array;
1212   $sth->finish;
1213   $first;
1214 }
1215
1216 sub _nextid {
1217   my $self = shift;
1218   my $lastkey = shift;
1219   my $features = $self->_feature_table;
1220   my $query = <<END;
1221 SELECT min(id) FROM $features WHERE id>?
1222 END
1223   my $sth=$self->_prepare($query);
1224   $sth->execute($lastkey);
1225   my ($next) = $sth->fetchrow_array;
1226   $sth->finish;
1227   $next;
1228 }
1229
1230 sub _existsid {
1231   my $self = shift;
1232   my $key  = shift;
1233   my $features = $self->_feature_table;
1234   my $query = <<END;
1235 SELECT count(*) FROM $features WHERE id=?
1236 END
1237   my $sth=$self->_prepare($query);
1238   $sth->execute($key);
1239   my ($count) = $sth->fetchrow_array;
1240   $sth->finish;
1241   $count > 0;
1242 }
1243
1244 sub _deleteid {
1245   my $self = shift;
1246   my $key  = shift;
1247   my $dbh = $self->dbh;
1248   my $parent2child = $self->_parent2child_table;
1249   my $query = "SELECT child FROM $parent2child WHERE id=?";
1250   my $sth=$self->_prepare($query);
1251   $sth->execute($key);
1252   my $success = 0;
1253   while (my ($cid) = $sth->fetchrow_array) {
1254     # Backcheck looking for multiple parents, delete only if one is present. I'm
1255     # sure there is a nice way to left join the parent2child table onto itself
1256     # to get this in one query above, just haven't worked it out yet...
1257     my $sth2 = $self->_prepare("SELECT count(id) FROM $parent2child WHERE child=?");
1258     $sth2->execute($cid);
1259     my ($count) = $sth2->fetchrow_array;
1260     if ($count == 1) {
1261         $self->_deleteid($cid) || warn "An error occurred while removing subfeature id=$cid. Perhaps it was previously deleted?\n";
1262     }
1263   }
1264   for my $table ($self->all_tables) {
1265     $success += $dbh->do("DELETE FROM $table WHERE id=$key") || 0;
1266   }
1267   return $success;
1268 }
1269
1270 sub _clearall {
1271   my $self = shift;
1272   my $dbh = $self->dbh;
1273   for my $table ($self->all_tables) {
1274     $dbh->do("DELETE FROM $table");
1275   }
1276 }
1277
1278 sub _featurecount {
1279   my $self = shift;
1280   my $dbh = $self->dbh;
1281   my $features = $self->_feature_table;
1282   my $query = <<END;
1283 SELECT count(*) FROM $features
1284 END
1285   my $sth=$self->_prepare($query);
1286   $sth->execute();
1287   my ($count) = $sth->fetchrow_array;
1288   $sth->finish;
1289   $count;
1290 }
1291
1292 sub _seq_ids {
1293   my $self = shift;
1294   my $dbh = $self->dbh;
1295   my $location = $self->_locationlist_table;
1296   my $sth = $self->_prepare("SELECT DISTINCT seqname FROM $location");
1297   $sth->execute() or $self->throw($sth->errstr);
1298   my @result;
1299   while (my ($id) = $sth->fetchrow_array) {
1300     push @result,$id;
1301   }
1302   return @result;
1303 }
1304
1305 sub setting {
1306   my $self = shift;
1307   my ($variable_name,$value) = @_;
1308   my $meta  = $self->_meta_table;
1309
1310   if (defined $value && $self->writeable) {
1311     my $query = <<END;
1312 REPLACE INTO $meta (name,value) VALUES (?,?)
1313 END
1314     my $sth = $self->_prepare($query);
1315     $sth->execute($variable_name,$value) or $self->throw($sth->errstr);
1316     $sth->finish;
1317     $self->{settings_cache}{$variable_name} = $value;
1318   }
1319   else {
1320     return $self->{settings_cache}{$variable_name} if exists $self->{settings_cache}{$variable_name};
1321     my $query = <<END;
1322 SELECT value FROM $meta as m WHERE m.name=?
1323 END
1324     my $sth = $self->_prepare($query);
1325     $sth->execute($variable_name) or $self->throw($sth->errstr);
1326     my ($value) = $sth->fetchrow_array;
1327     $sth->finish;
1328     return $self->{settings_cache}{$variable_name} = $value;
1329   }
1330 }
1331
1332 ###
1333 # Replace Bio::SeqFeatureI into database.
1334 #
1335 sub replace {
1336   my $self       = shift;
1337   my $object     = shift;
1338   my $index_flag = shift || undef;
1339
1340   # ?? shouldn't need to do this
1341   # $self->_load_class($object);
1342   my $id = $object->primary_id;
1343   my $features = $self->_feature_table;
1344
1345   my $sth = $self->_prepare(<<END);
1346 REPLACE INTO $features (id,object,indexed,seqid,start,end,strand,tier,bin,typeid) VALUES (?,?,?,?,?,?,?,?,?,?)
1347 END
1348
1349   my @location = $index_flag ? $self->_get_location_and_bin($object) : (undef)x6;
1350
1351   my $primary_tag = $object->primary_tag;
1352   my $source_tag  = $object->source_tag || '';
1353   $primary_tag    .= ":$source_tag";
1354   my $typeid   = $self->_typeid($primary_tag,1);
1355
1356   my $frozen = $self->no_blobs() ? 0 : $self->freeze($object);
1357
1358   $sth->execute($id,$frozen,$index_flag||0,@location,$typeid) or $self->throw($sth->errstr);
1359
1360   my $dbh = $self->dbh;
1361   $object->primary_id($dbh->{mysql_insertid}) unless defined $id;
1362
1363   $self->flag_for_indexing($dbh->{mysql_insertid}) if $self->{bulk_update_in_progress};
1364 }
1365
1366 # doesn't work with this schema, since we have to update name and attribute
1367 # tables which need object ids, which we can only know by replacing feats in
1368 # the feature table one by one
1369 sub bulk_replace {
1370     my $self       = shift;
1371     my $index_flag = shift || undef;
1372     my @objects    = @_;
1373
1374     my $features = $self->_feature_table;
1375
1376     my @insert_values;
1377     foreach my $object (@objects) {
1378         my $id = $object->primary_id;
1379         my @location = $index_flag ? $self->_get_location_and_bin($object) : (undef)x6;
1380         my $primary_tag = $object->primary_tag;
1381         my $source_tag  = $object->source_tag || '';
1382         $primary_tag    .= ":$source_tag";
1383         my $typeid   = $self->_typeid($primary_tag,1);
1384
1385         push(@insert_values, ($id,0,$index_flag||0,@location,$typeid));
1386     }
1387
1388     my @value_blocks;
1389     for (1..@objects) {
1390         push(@value_blocks, '(?,?,?,?,?,?,?,?,?,?)');
1391     }
1392     my $value_blocks = join(',', @value_blocks);
1393     my $sql = qq{REPLACE INTO $features (id,object,indexed,seqid,start,end,strand,tier,bin,typeid) VALUES $value_blocks};
1394
1395     my $sth = $self->_prepare($sql);
1396     $sth->execute(@insert_values) or $self->throw($sth->errstr);
1397 }
1398
1399 ###
1400 # Insert one Bio::SeqFeatureI into database. primary_id must be undef
1401 #
1402 sub insert {
1403   my $self = shift;
1404   my $object = shift;
1405   my $index_flag = shift || 0;
1406
1407   $self->_load_class($object);
1408   defined $object->primary_id and $self->throw("$object already has a primary id");
1409
1410   my $features = $self->_feature_table;
1411   my $sth = $self->_prepare(<<END);
1412 INSERT INTO $features (id,object,indexed) VALUES (?,?,?)
1413 END
1414   $sth->execute(undef,$self->freeze($object),$index_flag) or $self->throw($sth->errstr);
1415   my $dbh = $self->dbh;
1416   $object->primary_id($dbh->{mysql_insertid});
1417   $self->flag_for_indexing($dbh->{mysql_insertid}) if $self->{bulk_update_in_progress};
1418 }
1419
1420 =head2 types
1421
1422  Title   : types
1423  Usage   : @type_list = $db->types
1424  Function: Get all the types in the database
1425  Returns : array of Bio::DB::GFF::Typename objects
1426  Args    : none
1427  Status  : public
1428
1429 =cut
1430
1431 sub types {
1432     my $self = shift;
1433     eval "require Bio::DB::GFF::Typename"
1434         unless Bio::DB::GFF::Typename->can('new');
1435     my $typelist      = $self->_typelist_table;
1436     my $sql = <<END;
1437 SELECT tag from $typelist
1438 END
1439 ;
1440     $self->_print_query($sql) if DEBUG || $self->debug;
1441     my $sth = $self->_prepare($sql);
1442     $sth->execute() or $self->throw($sth->errstr);
1443
1444     my @results;
1445     while (my($tag) = $sth->fetchrow_array) {
1446         push @results,Bio::DB::GFF::Typename->new($tag);
1447     }
1448     $sth->finish;
1449     return @results;
1450 }
1451
1452 =head2 toplevel_types
1453
1454  Title   : toplevel_types
1455  Usage   : @type_list = $db->toplevel_types
1456  Function: Get the toplevel types in the database
1457  Returns : array of Bio::DB::GFF::Typename objects
1458  Args    : none
1459  Status  : public
1460
1461 This is similar to types() but only returns the types of
1462 INDEXED (toplevel) features.
1463
1464 =cut
1465
1466 sub toplevel_types {
1467     my $self = shift;
1468     eval "require Bio::DB::GFF::Typename"
1469         unless Bio::DB::GFF::Typename->can('new');
1470     my $typelist      = $self->_typelist_table;
1471     my $features       = $self->_feature_table;
1472     my $sql = <<END;
1473 SELECT distinct(tag) from $typelist as tl,$features as f
1474  WHERE tl.id=f.typeid
1475    AND f.indexed=1
1476 END
1477 ;
1478     $self->_print_query($sql) if DEBUG || $self->debug;
1479     my $sth = $self->_prepare($sql);
1480     $sth->execute() or $self->throw($sth->errstr);
1481
1482     my @results;
1483     while (my($tag) = $sth->fetchrow_array) {
1484         push @results,Bio::DB::GFF::Typename->new($tag);
1485     }
1486     $sth->finish;
1487     return @results;
1488 }
1489
1490 ###
1491 # Insert a bit of DNA or protein into the database
1492 #
1493 sub _insert_sequence {
1494   my $self = shift;
1495   my ($seqid,$seq,$offset) = @_;
1496   my $id = $self->_locationid($seqid);
1497   my $sequence = $self->_sequence_table;
1498   my $sth = $self->_prepare(<<END);
1499 REPLACE INTO $sequence (id,offset,sequence) VALUES (?,?,?)
1500 END
1501   $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr);
1502 }
1503
1504 ###
1505 # This subroutine flags the given primary ID for later reindexing
1506 #
1507 sub flag_for_indexing {
1508   my $self = shift;
1509   my $id   = shift;
1510   my $needs_updating = $self->_update_table;
1511   my $sth = $self->_prepare("REPLACE INTO $needs_updating VALUES (?)");
1512   $sth->execute($id) or $self->throw($self->dbh->errstr);
1513 }
1514
1515 ###
1516 # Update indexes for given object
1517 #
1518 sub _update_indexes {
1519   my $self = shift;
1520   my $obj  = shift;
1521   defined (my $id   = $obj->primary_id) or return;
1522
1523   if ($self->{bulk_update_in_progress}) {
1524     $self->_dump_update_name_index($obj,$id);
1525     $self->_dump_update_attribute_index($obj,$id);
1526   } else {
1527     $self->_update_name_index($obj,$id);
1528     $self->_update_attribute_index($obj,$id);
1529   }
1530 }
1531
1532 sub _update_name_index {
1533   my $self = shift;
1534   my ($obj,$id) = @_;
1535   my $name = $self->_name_table;
1536   my $primary_id = $obj->primary_id;
1537
1538   $self->_delete_index($name,$id);
1539   my ($names,$aliases) = $self->feature_names($obj);
1540
1541   my $sth = $self->_prepare("INSERT INTO $name (id,name,display_name) VALUES (?,?,?)");
1542
1543   $sth->execute($id,$_,1) or $self->throw($sth->errstr)   foreach @$names;
1544   $sth->execute($id,$_,0) or $self->throw($sth->errstr) foreach @$aliases;
1545   $sth->finish;
1546 }
1547
1548 sub _update_attribute_index {
1549   my $self = shift;
1550   my ($obj,$id) = @_;
1551   my $attribute = $self->_attribute_table;
1552   $self->_delete_index($attribute,$id);
1553
1554   my $sth = $self->_prepare("INSERT INTO $attribute (id,attribute_id,attribute_value) VALUES (?,?,?)");
1555   for my $tag ($obj->get_all_tags) {
1556     my $tagid = $self->_attributeid($tag);
1557     for my $value ($obj->get_tag_values($tag)) {
1558       $sth->execute($id,$tagid,$value) or $self->throw($sth->errstr);
1559     }
1560   }
1561   $sth->finish;
1562 }
1563
1564 sub _genericid {
1565   my $self = shift;
1566   my ($table,$namefield,$name,$add_if_missing) = @_;
1567   my $qualified_table = $self->_qualify($table);
1568   my $sth = $self->_prepare(<<END);
1569 SELECT id FROM $qualified_table WHERE $namefield=?
1570 END
1571   $sth->execute($name) or die $sth->errstr;
1572   my ($id) = $sth->fetchrow_array;
1573   $sth->finish;
1574   return $id if defined $id;
1575   return     unless $add_if_missing;
1576
1577   $sth = $self->_prepare(<<END);
1578 INSERT INTO $qualified_table ($namefield) VALUES (?)
1579 END
1580   $sth->execute($name) or die $sth->errstr;
1581   my $dbh = $self->dbh;
1582   return $dbh->{mysql_insertid};
1583 }
1584
1585 sub _typeid {
1586   shift->_genericid('typelist','tag',shift,1);
1587 }
1588 sub _locationid {
1589   shift->_genericid('locationlist','seqname',shift,1);
1590 }
1591 sub _locationid_nocreate {
1592   shift->_genericid('locationlist','seqname',shift,0);
1593 }
1594 sub _attributeid {
1595   shift->_genericid('attributelist','tag',shift,1);
1596 }
1597
1598 sub _get_location_and_bin {
1599   my $self = shift;
1600   my $feature = shift;
1601   my $seqid   = $self->_locationid($feature->seq_id||'');
1602   my $start   = $feature->start;
1603   my $end     = $feature->end;
1604   my $strand  = $feature->strand || 0;
1605   my ($tier,$bin) = $self->get_bin($start,$end);
1606   return ($seqid,$start,$end,$strand,$tier,$bin);
1607 }
1608
1609 sub get_bin {
1610   my $self = shift;
1611   my ($start,$end) = @_;
1612   my $binsize = MIN_BIN;
1613   my ($bin_start,$bin_end,$tier);
1614   $tier = 0;
1615   while (1) {
1616     $bin_start = int $start/$binsize;
1617     $bin_end   = int $end/$binsize;
1618     last if $bin_start == $bin_end;
1619     $binsize *= 10;
1620     $tier++;
1621   }
1622   return ($tier,$bin_start);
1623 }
1624
1625 sub bin_where {
1626   my $self = shift;
1627   my ($start,$end,$f) = @_;
1628   my (@bins,@args);
1629
1630   my $tier         = 0;
1631   my $binsize      = MIN_BIN;
1632   while ($binsize <= MAX_BIN) {
1633     my $bin_start = int($start/$binsize);
1634     my $bin_end   = int($end/$binsize);
1635     push @bins,"($f.tier=? AND $f.bin between ? AND ?)";
1636     push @args,($tier,$bin_start,$bin_end);
1637     $binsize *= 10;
1638     $tier++;
1639   }
1640   my $query = join ("\n\t OR ",@bins);
1641   return wantarray ? ($query,@args) : substitute($query,@args);
1642 }
1643
1644 sub _delete_index {
1645   my $self = shift;
1646   my ($table_name,$id) = @_;
1647   return if $self->{reindexing};
1648   my $sth = $self->_prepare("DELETE FROM $table_name WHERE id=?") or $self->throw($self->dbh->errstr);
1649   $sth->execute($id);
1650 }
1651
1652 # given a statement handler that is expected to return rows of (id,object)
1653 # unthaw each object and return a list of 'em
1654 sub _sth2objs {
1655   my $self = shift;
1656   my $sth  = shift;
1657   my @result;
1658   while (my ($id,$o,$typeid,$seqid,$start,$end,$strand) = $sth->fetchrow_array) {
1659     my $obj;
1660     if ($o eq '0') {
1661         # rebuild a new feat object from the data stored in the db
1662         $obj = $self->_rebuild_obj($id,$typeid,$seqid,$start,$end,$strand);
1663     }
1664     else {
1665         $obj = $self->thaw($o,$id);
1666     }
1667
1668     push @result,$obj;
1669   }
1670   $sth->finish;
1671   return @result;
1672 }
1673
1674 # given a statement handler that is expected to return rows of (id,object)
1675 # unthaw each object and return a list of 'em
1676 sub _sth2obj {
1677   my $self = shift;
1678   my $sth  = shift;
1679   my ($id,$o,$typeid,$seqid,$start,$end,$strand) = $sth->fetchrow_array;
1680   return unless defined $o;
1681   my $obj;
1682   if ($o eq '0') {  # I don't understand why an object ever needs to be rebuilt!
1683     # rebuild a new feat object from the data stored in the db
1684     $obj = $self->_rebuild_obj($id,$typeid,$seqid,$start,$end,$strand);
1685   }
1686   else {
1687     $obj = $self->thaw($o,$id);
1688   }
1689
1690   $obj;
1691 }
1692
1693 sub _rebuild_obj {
1694     my ($self, $id, $typeid, $db_seqid, $start, $end, $strand) = @_;
1695     my ($type, $source, $seqid);
1696
1697     # convert typeid to type and source
1698     if (exists $self->{_type_cache}->{$typeid}) {
1699         ($type, $source) = @{$self->{_type_cache}->{$typeid}};
1700     }
1701     else {
1702         my $sql = qq{ SELECT `tag` FROM typelist WHERE `id` = ? };
1703         my $sth = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1704         $sth->execute($typeid);
1705         my $result;
1706         $sth->bind_columns(\$result);
1707         while ($sth->fetch()) {
1708             # there should be only one row returned, but we ensure to get all rows
1709         }
1710
1711         ($type, $source) = split(':', $result);
1712         $self->{_type_cache}->{$typeid} = [$type, $source];
1713     }
1714
1715     # convert the db seqid to the sequence name
1716     if (exists $self->{_seqid_cache}->{$db_seqid}) {
1717         $seqid = $self->{_seqid_cache}->{$db_seqid};
1718     }
1719     else {
1720         my $sql = qq{ SELECT `seqname` FROM locationlist WHERE `id` = ? };
1721         my $sth = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1722         $sth->execute($db_seqid);
1723         $sth->bind_columns(\$seqid);
1724         while ($sth->fetch()) {
1725             # there should be only one row returned, but we ensure to get all rows
1726         }
1727
1728         $self->{_seqid_cache}->{$db_seqid} = $seqid;
1729     }
1730
1731     # get the names from name table?
1732
1733     # get the attributes and store those in obj
1734     my $sql = qq{ SELECT attribute_id,attribute_value FROM attribute WHERE `id` = ? };
1735     my $sth = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1736     $sth->execute($id);
1737     my ($attribute_id, $attribute_value);
1738     $sth->bind_columns(\($attribute_id, $attribute_value));
1739     my %attribs;
1740     while ($sth->fetch()) {
1741         # convert the attribute_id to its real name
1742         my $attribute;
1743         if (exists $self->{_attribute_cache}->{$attribute_id}) {
1744             $attribute = $self->{_attribute_cache}->{$attribute_id};
1745         }
1746         else {
1747             my $sql = qq{ SELECT `tag` FROM attributelist WHERE `id` = ? };
1748             my $sth2 = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1749             $sth2->execute($attribute_id);
1750             $sth2->bind_columns(\$attribute);
1751             while ($sth2->fetch()) {
1752                 # there should be only one row returned, but we ensure to get all rows
1753             }
1754
1755             $self->{_attribute_cache}->{$attribute_id} = $attribute;
1756         }
1757
1758         if ($source && $attribute eq 'source' && $attribute_value eq $source) {
1759             next;
1760         }
1761
1762         $attribs{$attribute} = $attribute_value;
1763     }
1764
1765     # if we weren't called with all the params, pull those out of the database too
1766     if ( not ( grep { defined($_) } ( $typeid, $db_seqid, $start, $end, $strand ))) {
1767       my $sql = qq{ SELECT start,end,tag,strand,seqname
1768                     FROM feature,feature_location,typelist,locationlist
1769                     WHERE feature.id=feature_location.id AND feature.typeid=typelist.id
1770                     AND seqid=locationlist.id AND feature.id = ? };
1771       my $sth = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1772       $sth->execute($id);
1773       my ($feature_start, $feature_end, $feature_type, $feature_strand,$feature_seqname);
1774       $sth->bind_columns(\($feature_start, $feature_end, $feature_type, $feature_strand, $feature_seqname));
1775       while ($sth->fetch()) {
1776         # there should be only one row returned, but we call like this to
1777         # ensure we get all rows
1778       }
1779       $start  ||= $feature_start;
1780       $end    ||= $feature_end;
1781       $strand ||= $feature_strand;
1782       $seqid  ||= $feature_seqname;
1783
1784       my( $feature_typename , $feature_typesource ) = split /:/ , $feature_type;
1785       $type   ||= $feature_typename;
1786       $source ||= $feature_typesource;
1787     }
1788
1789     my $obj = Bio::SeqFeature::Lite->new(-primary_id => $id,
1790                                          $type ? (-type => $type) : (),
1791                                          $source ? (-source => $source) : (),
1792                                          $seqid ? (-seq_id => $seqid) : (),
1793                                          defined $start ? (-start => $start) : (),
1794                                          defined $end ? (-end => $end) : (),
1795                                          defined $strand ? (-strand => $strand) : (),
1796                                          keys %attribs ? (-attributes => \%attribs) : ());
1797
1798     return $obj;
1799 }
1800
1801 sub _prepare {
1802   my $self = shift;
1803   my $query = shift;
1804   my $dbh   = $self->dbh;
1805   my $sth   = $dbh->prepare_cached($query, {}, 3) or $self->throw($dbh->errstr);
1806   $sth;
1807 }
1808
1809
1810 ####################################################################################################
1811 # SQL Fragment generators
1812 ####################################################################################################
1813
1814 sub _attribute_table      {  shift->_qualify('attribute')      }
1815 sub _attributelist_table  {  shift->_qualify('attributelist')  }
1816 sub _feature_table        {  shift->_qualify('feature')        }
1817 sub _interval_stats_table {  shift->_qualify('interval_stats') }
1818 sub _location_table       {  shift->_qualify('location')       }
1819 sub _locationlist_table   {  shift->_qualify('locationlist')   }
1820 sub _meta_table           {  shift->_qualify('meta')           }
1821 sub _name_table           {  shift->_qualify('name')           }
1822 sub _parent2child_table   {  shift->_qualify('parent2child')   }
1823 sub _sequence_table       {  shift->_qualify('sequence')       }
1824 sub _type_table           {  shift->_qualify('feature')        }
1825 sub _typelist_table       {  shift->_qualify('typelist')       }
1826 sub _update_table         {  shift->_qualify('update_table')   }
1827
1828 sub _make_attribute_where {
1829   my $self                     = shift;
1830   my ($attributetable,$attributenametable,$attributes) = @_;
1831   my @args;
1832   my @sql;
1833   my $dbh = $self->dbh;
1834   foreach (keys %$attributes) {
1835     my @match_values;
1836     my @values = ref($attributes->{$_}) && ref($attributes->{$_}) eq 'ARRAY' ? @{$attributes->{$_}} : $attributes->{$_};
1837     foreach (@values) {  # convert * into % for wildcard matches
1838       s/\*/%/g;
1839     }
1840     my $match  = join ' OR ',map {
1841       /%/ ? "$attributetable.attribute_value LIKE ?"
1842           : "$attributetable.attribute_value=?"
1843     } @values;
1844     push @sql,"($attributenametable.tag=? AND ($match))";
1845     push @args,($_,@values);
1846   }
1847   return (join(' OR ',@sql),@args);
1848 }
1849
1850 sub _make_attribute_group {
1851   my $self                     = shift;
1852   my ($table_name,$attributes) = @_;
1853   my $key_count = keys %$attributes or return;
1854   return "f.id,f.object,f.typeid,f.seqid,f.start,f.end,f.strand HAVING count(f.id)>?",$key_count-1;
1855 }
1856
1857 sub _print_query {
1858   my $self = shift;
1859   my ($query,@args) = @_;
1860   while ($query =~ /\?/) {
1861     my $arg = $self->dbh->quote(shift @args);
1862     $query =~ s/\?/$arg/;
1863   }
1864   warn $query,"\n";
1865 }
1866
1867 ###
1868 # special-purpose store for bulk loading - write to a file rather than to the db
1869 #
1870 sub _dump_store {
1871   my $self    = shift;
1872   my $indexed = shift;
1873
1874   my $count = 0;
1875   my $store_fh = $self->dump_filehandle('feature');
1876   my $dbh      = $self->dbh;
1877
1878   my $autoindex = $self->autoindex;
1879
1880   for my $obj (@_) {
1881     my $id       = $self->next_id;
1882     my ($seqid,$start,$end,$strand,$tier,$bin) = $indexed ? $self->_get_location_and_bin($obj) : (undef)x6;
1883     my $primary_tag = $obj->primary_tag;
1884     my $source_tag  = $obj->source_tag || '';
1885     $primary_tag    .= ":$source_tag";
1886     my $typeid   = $self->_typeid($primary_tag,1);
1887
1888     print $store_fh join("\t",$id,$typeid,$seqid,$start,$end,$strand,$tier,$bin,$indexed,$dbh->quote($self->freeze($obj))),"\n";
1889     $obj->primary_id($id);
1890     $self->_update_indexes($obj) if $indexed && $autoindex;
1891     $count++;
1892   }
1893
1894   # remember whether we are have ever stored a non-indexed feature
1895   unless ($indexed or $self->{indexed_flag}++) {
1896     $self->subfeatures_are_indexed(0);
1897   }
1898   $count;
1899 }
1900
1901 sub _dump_add_SeqFeature {
1902   my $self     = shift;
1903   my $parent   = shift;
1904   my @children = @_;
1905
1906   my $dbh = $self->dbh;
1907   my $fh = $self->dump_filehandle('parent2child');
1908   my $parent_id = (ref $parent ? $parent->primary_id : $parent)
1909     or $self->throw("$parent should have a primary_id");
1910   my $count = 0;
1911
1912   for my $child_id (@children) {
1913     print $fh join("\t",$parent_id,$child_id),"\n";
1914     $count++;
1915   }
1916   $count;
1917 }
1918
1919 sub _dump_update_name_index {
1920   my $self = shift;
1921   my ($obj,$id) = @_;
1922   my $fh      = $self->dump_filehandle('name');
1923   my $dbh     = $self->dbh;
1924   my ($names,$aliases) = $self->feature_names($obj);
1925   print $fh join("\t",$id,$dbh->quote($_),1),"\n" foreach @$names;
1926   print $fh join("\t",$id,$dbh->quote($_),0),"\n" foreach @$aliases;
1927 }
1928
1929 sub _dump_update_attribute_index {
1930   my $self = shift;
1931   my ($obj,$id) = @_;
1932   my $fh        = $self->dump_filehandle('attribute');
1933   my $dbh       = $self->dbh;
1934   for my $tag ($obj->all_tags) {
1935     my $tagid = $self->_attributeid($tag);
1936     for my $value ($obj->each_tag_value($tag)) {
1937       print $fh join("\t",$id,$tagid,$dbh->quote($value)),"\n";
1938     }
1939   }
1940 }
1941
1942 sub coverage_array {
1943     my $self = shift;
1944     my ($seq_name,$start,$end,$types,$bins) =
1945         rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
1946                    ['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_);
1947
1948     $bins  ||= 1000;
1949     $start ||= 1;
1950     unless ($end) {
1951         my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name");
1952         $end        = $segment->end;
1953     }
1954
1955     my $binsize = ($end-$start+1)/$bins;
1956     my $seqid   = $self->_locationid_nocreate($seq_name) || 0;
1957
1958     return [] unless $seqid;
1959
1960     # where each bin starts
1961     my @his_bin_array = map {$start + $binsize * $_}       (0..$bins-1);
1962     my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array;
1963
1964     my $interval_stats    = $self->_interval_stats_table;
1965
1966     my ($sth,@a);
1967     if ($types) {
1968         # pick up the type ids
1969         my ($from,$where,$group);
1970         ($from,$where,$group,@a) = $self->_types_sql($types,'b');
1971         $where =~ s/.+AND//s;
1972         $sth = $self->_prepare(<<END);
1973 SELECT id,tag FROM $from
1974 WHERE  $where
1975 END
1976 ;
1977     } else {
1978         $sth = $self->_prepare(<<END);
1979 SELECT id,tag FROM typelist
1980 END
1981     }
1982     my (@t,$report_tag);
1983     $sth->execute(@a);
1984     while (my ($t,$tag) = $sth->fetchrow_array) {
1985         $report_tag ||= $tag;
1986         push @t,$t;
1987     }
1988
1989     my %bins;
1990     my $sql = <<END;
1991 SELECT bin,cum_count
1992   FROM $interval_stats
1993   WHERE typeid=?
1994     AND seqid=? AND bin >= ?
1995   LIMIT 1
1996 END
1997 ;
1998     $sth = $self->_prepare($sql);
1999
2000     eval {
2001         for my $typeid (@t) {
2002
2003             for (my $i=0;$i<@sum_bin_array;$i++) {
2004
2005                 my @args = ($typeid,$seqid,$sum_bin_array[$i]);
2006                 $self->_print_query($sql,@args) if $self->debug;
2007
2008                 $sth->execute(@args) or $self->throw($sth->errstr);
2009                 my ($bin,$cum_count) = $sth->fetchrow_array;
2010                 push @{$bins{$typeid}},[$bin,$cum_count];
2011             }
2012         }
2013     };
2014     return unless %bins;
2015
2016     my @merged_bins;
2017     my $firstbin = int(($start-1)/$binsize);
2018     for my $type (keys %bins) {
2019         my $arry       = $bins{$type};
2020         my $last_count = $arry->[0][1];
2021         my $last_bin   = -1;
2022         my $i          = 0;
2023         my $delta;
2024         for my $b (@$arry) {
2025             my ($bin,$count) = @$b;
2026             $delta               = $count - $last_count if $bin > $last_bin;
2027             $merged_bins[$i++]  += $delta;
2028             $last_count          = $count;
2029             $last_bin            = $bin;
2030         }
2031     }
2032
2033     return wantarray ? (\@merged_bins,$report_tag) : \@merged_bins;
2034 }
2035
2036 sub build_summary_statistics {
2037     my $self   = shift;
2038     my $interval_stats = $self->_interval_stats_table;
2039     my $dbh    = $self->dbh;
2040     $self->begin_work;
2041
2042     my $sbs = SUMMARY_BIN_SIZE;
2043
2044     my $result = eval {
2045       $self->_add_interval_stats_table;
2046       $self->_disable_keys($dbh,$interval_stats);
2047       $dbh->do("DELETE FROM $interval_stats");
2048
2049       my $insert = $dbh->prepare(<<END) or $self->throw($dbh->errstr);
2050 INSERT INTO $interval_stats
2051            (typeid,seqid,bin,cum_count)
2052     VALUES (?,?,?,?)
2053 END
2054
2055         my $sql    = $self->_fetch_indexed_features_sql;
2056         my $select = $dbh->prepare($sql) or $self->throw($dbh->errstr);
2057
2058         my $current_bin = -1;
2059         my ($current_type,$current_seqid,$count);
2060         my $cum_count = 0;
2061         my (%residuals,$last_bin);
2062
2063         my $le = -t \*STDERR ? "\r" : "\n";
2064
2065         print STDERR "\n";
2066         $select->execute;
2067
2068         while (my($typeid,$seqid,$start,$end) = $select->fetchrow_array) {
2069             print STDERR $count," features processed$le" if ++$count % 1000 == 0;
2070
2071             my $bin = int($start/$sbs);
2072             $current_type  ||= $typeid;
2073             $current_seqid ||= $seqid;
2074
2075             # because the input is sorted by start, no more features will contribute to the
2076             # current bin so we can dispose of it
2077             if ($bin != $current_bin) {
2078                 if ($seqid != $current_seqid or $typeid != $current_type) {
2079                     # load all bins left over
2080                     $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
2081                     %residuals = () ;
2082                     $cum_count = 0;
2083                 } else {
2084                     # load all up to current one
2085                     $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid,$current_bin);
2086                 }
2087             }
2088
2089             $last_bin = $current_bin;
2090             ($current_seqid,$current_type,$current_bin) = ($seqid,$typeid,$bin);
2091
2092             # summarize across entire spanned region
2093             my $last_bin = int(($end-1)/$sbs);
2094             for (my $b=$bin;$b<=$last_bin;$b++) {
2095                 $residuals{$b}++;
2096             }
2097         }
2098         # handle tail case
2099         # load all bins left over
2100         $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
2101         $self->_enable_keys($dbh,$interval_stats);
2102         1;
2103     };
2104
2105     if ($result) { $self->commit } else { warn "Can't build summary statistics: $@"; $self->rollback };
2106     print STDERR "\n";
2107 }
2108
2109 sub _load_bins {
2110     my $self = shift;
2111     my ($insert,$residuals,$cum_count,$type,$seqid,$stop_after) = @_;
2112     for my $b (sort {$a<=>$b} keys %$residuals) {
2113         last if defined $stop_after and $b > $stop_after;
2114         $$cum_count += $residuals->{$b};
2115         my @args    = ($type,$seqid,$b,$$cum_count);
2116         $insert->execute(@args);
2117         delete $residuals->{$b}; # no longer needed
2118     }
2119 }
2120
2121 sub _add_interval_stats_table {
2122     my $self = shift;
2123     my $tables          = $self->table_definitions;
2124     my $interval_stats  = $self->_interval_stats_table;
2125     $self->dbh->do("CREATE TABLE IF NOT EXISTS $interval_stats $tables->{interval_stats}");
2126 }
2127
2128 sub _fetch_indexed_features_sql {
2129     my $self     = shift;
2130     my $features = $self->_feature_table;
2131     return <<END;
2132 SELECT typeid,seqid,start-1,end
2133   FROM $features as f
2134  WHERE f.indexed=1
2135   ORDER BY typeid,seqid,start
2136 END
2137 }
2138
2139 sub _disable_keys {
2140     my $self = shift;
2141     my ($dbh,$table) = @_;
2142     $dbh->do("ALTER TABLE $table DISABLE KEYS");
2143 }
2144 sub _enable_keys {
2145     my $self = shift;
2146     my ($dbh,$table) = @_;
2147     $dbh->do("ALTER TABLE $table ENABLE KEYS");
2148 }
2149
2150 sub time {
2151   return Time::HiRes::time() if Time::HiRes->can('time');
2152   return time();
2153 }
2154
2155 sub DESTROY {
2156   my $self = shift;
2157   if ($self->{bulk_update_in_progress}) {  # be sure to remove temp files
2158     for my $table ($self->_feature_table,$self->index_tables) {
2159       my $path = $self->dump_path($table);
2160       unlink $path;
2161     }
2162   }
2163 }
2164
2165 sub begin_work {
2166     my $self = shift;
2167     return if $self->{_in_transaction}++;
2168     my $dbh  = $self->dbh;
2169     return unless $dbh->{AutoCommit};
2170     $dbh->begin_work;
2171 }
2172
2173 sub commit {
2174     my $self = shift;
2175     return unless $self->{_in_transaction};
2176     delete $self->{_in_transaction};
2177     $self->dbh->commit;
2178 }
2179
2180 sub rollback {
2181     my $self = shift;
2182     return unless $self->{_in_transaction};
2183     delete $self->{_in_transaction};
2184     $self->dbh->rollback;
2185 }
2186
2187
2188
2189
2190 1;