t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / DB / SeqFeature / Store / DBI / mysql.pm
bloba92aff0ea17559bd0acbc028d584da8a4b22e2b8
1 package Bio::DB::SeqFeature::Store::DBI::mysql;
3 =head1 NAME
5 Bio::DB::SeqFeature::Store::DBI::mysql -- Mysql implementation of Bio::DB::SeqFeature::Store
7 =head1 SYNOPSIS
9 use Bio::DB::SeqFeature::Store;
11 # Open the sequence database
12 my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::mysql',
13 -dsn => 'dbi:mysql:test');
15 # get a feature from somewhere
16 my $feature = Bio::SeqFeature::Generic->new(...);
18 # store it
19 $db->store($feature) or die "Couldn't store!";
21 # primary ID of the feature is changed to indicate its primary ID
22 # in the database...
23 my $id = $feature->primary_id;
25 # get the feature back out
26 my $f = $db->fetch($id);
28 # change the feature and update it
29 $f->start(100);
30 $f->update($f) or die "Couldn't update!";
32 # searching...
33 # ...by id
34 my @features = $db->fetch_many(@list_of_ids);
36 # ...by name
37 @features = $db->get_features_by_name('ZK909');
39 # ...by alias
40 @features = $db->get_features_by_alias('sma-3');
42 # ...by type
43 @features = $db->get_features_by_name('gene');
45 # ...by location
46 @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);
48 # ...by attribute
49 @features = $db->get_features_by_attribute({description => 'protein kinase'})
51 # ...by the GFF "Note" field
52 @result_list = $db->search_notes('kinase');
54 # ...by arbitrary combinations of selectors
55 @features = $db->features(-name => $name,
56 -type => $types,
57 -seq_id => $seqid,
58 -start => $start,
59 -end => $end,
60 -attributes => $attributes);
62 # ...using an iterator
63 my $iterator = $db->get_seq_stream(-name => $name,
64 -type => $types,
65 -seq_id => $seqid,
66 -start => $start,
67 -end => $end,
68 -attributes => $attributes);
70 while (my $feature = $iterator->next_seq) {
71 # do something with the feature
74 # ...limiting the search to a particular region
75 my $segment = $db->segment('Chr1',5000=>6000);
76 my @features = $segment->features(-type=>['mRNA','match']);
78 # getting & storing sequence information
79 # Warning: this returns a string, and not a PrimarySeq object
80 $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
81 my $sequence = $db->fetch_sequence('Chr1',5000=>6000);
83 # what feature types are defined in the database?
84 my @types = $db->types;
86 # create a new feature in the database
87 my $feature = $db->new_feature(-primary_tag => 'mRNA',
88 -seq_id => 'chr3',
89 -start => 10000,
90 -end => 11000);
92 =head1 DESCRIPTION
94 Bio::DB::SeqFeature::Store::mysql is the Mysql adaptor for
95 Bio::DB::SeqFeature::Store. You will not create it directly, but
96 instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.
98 See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
100 =head2 Using the Mysql adaptor
102 Before you can use the adaptor, you must use the mysqladmin tool to
103 create a database and establish a user account with write
104 permission. In order to use "fast" loading, the user account must have
105 "file" privileges.
107 To establish a connection to the database, call
108 Bio::DB::SeqFeature::Store-E<gt>new(-adaptor=E<gt>'DBI::mysql',@more_args). The
109 additional arguments are as follows:
111 Argument name Description
112 ------------- -----------
114 -dsn The database name. You can abbreviate
115 "dbi:mysql:foo" as "foo" if you wish.
117 -user Username for authentication.
119 -pass Password for authentication.
121 -namespace A prefix to attach to each table. This allows you
122 to have several virtual databases in the same
123 physical database.
125 -temp Boolean flag. If true, a temporary database
126 will be created and destroyed as soon as
127 the Store object goes out of scope. (synonym -temporary)
129 -autoindex Boolean flag. If true, features in the database will be
130 reindexed every time they change. This is the default.
133 -tmpdir Directory in which to place temporary files during "fast" loading.
134 Defaults to File::Spec->tmpdir(). (synonyms -dump_dir, -dumpdir, -tmp)
136 -dbi_options A hashref to pass to DBI->connect's 4th argument, the "attributes."
137 (synonyms -options, -dbi_attr)
139 -write Pass true to open database for writing or updating.
141 If successful, a new instance of
142 Bio::DB::SeqFeature::Store::DBI::mysql will be returned.
144 In addition to the standard methods supported by all well-behaved
145 Bio::DB::SeqFeature::Store databases, several following
146 adaptor-specific methods are provided. These are described in the next
147 sections.
149 =cut
151 use strict;
153 use base 'Bio::DB::SeqFeature::Store';
154 use Bio::DB::SeqFeature::Store::DBI::Iterator;
155 use DBI;
156 use Memoize;
157 use Cwd 'abs_path';
158 use Bio::DB::GFF::Util::Rearrange 'rearrange';
159 use Bio::SeqFeature::Lite;
160 use File::Spec;
161 use Carp 'carp','cluck','croak';
162 use constant DEBUG=>0;
164 # from the MySQL documentation...
165 # WARNING: if your sequence uses coordinates greater than 2 GB, you are out of luck!
166 use constant MAX_INT => 2_147_483_647;
167 use constant MIN_INT => -2_147_483_648;
168 use constant MAX_BIN => 1_000_000_000; # size of largest feature = 1 Gb
169 use constant MIN_BIN => 1000; # smallest bin we'll make - on a 100 Mb chromosome, there'll be 100,000 of these
170 use constant SUMMARY_BIN_SIZE => 1000;
172 # tier 0 == 1000 bp bins
173 # tier 1 == 10,000 bp bins
174 # etc.
176 memoize('_typeid');
177 memoize('_locationid');
178 memoize('_attributeid');
179 memoize('dump_path');
182 # object initialization
184 sub init {
185 my $self = shift;
186 my ($dsn,
187 $is_temporary,
188 $autoindex,
189 $namespace,
190 $dump_dir,
191 $user,
192 $pass,
193 $dbi_options,
194 $writeable,
195 $create,
196 ) = rearrange(['DSN',
197 ['TEMP','TEMPORARY'],
198 'AUTOINDEX',
199 'NAMESPACE',
200 ['DUMP_DIR','DUMPDIR','TMP','TMPDIR'],
201 'USER',
202 ['PASS','PASSWD','PASSWORD'],
203 ['OPTIONS','DBI_OPTIONS','DBI_ATTR'],
204 ['WRITE','WRITEABLE'],
205 'CREATE',
206 ],@_);
207 $dbi_options ||= {};
208 $writeable = 1 if $is_temporary or $dump_dir;
210 $dsn or $self->throw("Usage: ".__PACKAGE__."->init(-dsn => \$dbh || \$dsn)");
212 my $dbh;
213 if (ref $dsn) {
214 $dbh = $dsn;
215 } else {
216 $dsn = "dbi:mysql:$dsn" unless $dsn =~ /^dbi:/;
217 $dbh = DBI->connect($dsn,$user,$pass,$dbi_options) or $self->throw($DBI::errstr);
218 $dbh->{mysql_auto_reconnect} = 1;
220 $self->{dbh} = $dbh;
221 $self->{is_temp} = $is_temporary;
222 $self->{namespace} = $namespace;
223 $self->{writeable} = $writeable;
225 $self->default_settings;
226 $self->autoindex($autoindex) if defined $autoindex;
227 $self->dumpdir($dump_dir) if $dump_dir;
228 if ($self->is_temp) {
229 $self->init_tmp_database();
230 } elsif ($create) {
231 $self->init_database('erase');
235 sub writeable { shift->{writeable} }
237 sub can_store_parentage { 1 }
239 sub table_definitions {
240 my $self = shift;
241 return {
242 feature => <<END,
244 id int(10) auto_increment primary key,
245 typeid int(10) not null,
246 seqid int(10),
247 start int,
248 end int,
249 strand tinyint default 0,
250 tier tinyint,
251 bin int,
252 indexed tinyint default 1,
253 object MEDIUMBLOB not null,
254 index(seqid,tier,bin,typeid),
255 index(typeid)
259 locationlist => <<END,
261 id int(10) auto_increment primary key,
262 seqname varchar(255) not null,
263 index(seqname)
267 typelist => <<END,
269 id int(10) auto_increment primary key,
270 tag varchar(255) not null,
271 index(tag)
274 name => <<END,
276 id int(10) not null,
277 name varchar(255) not null,
278 display_name tinyint default 0,
279 index(id),
280 index(name)
284 attribute => <<END,
286 id int(10) not null,
287 attribute_id int(10) not null,
288 attribute_value text,
289 index(id),
290 index(attribute_id,attribute_value(10))
294 attributelist => <<END,
296 id int(10) auto_increment primary key,
297 tag varchar(255) not null,
298 index(tag)
301 parent2child => <<END,
303 id int(10) not null,
304 child int(10) not null,
305 unique index(id,child)
309 meta => <<END,
311 name varchar(128) primary key,
312 value varchar(128) not null
315 sequence => <<END,
317 id int(10) not null,
318 offset int(10) unsigned not null,
319 sequence longblob,
320 primary key(id,offset)
323 interval_stats => <<END,
325 typeid integer not null,
326 seqid integer not null,
327 bin integer not null,
328 cum_count integer not null,
329 primary key(typeid,seqid,bin)
336 # default settings -- will create and populate meta table if needed
338 sub default_settings {
339 my $self = shift;
340 $self->maybe_create_meta();
341 $self->SUPER::default_settings;
342 $self->autoindex(1);
343 $self->dumpdir(File::Spec->tmpdir);
348 # retrieve database handle
350 sub dbh {
351 my $self = shift;
352 my $d = $self->{dbh};
353 $self->{dbh} = shift if @_;
357 sub clone {
358 my $self = shift;
359 $self->{dbh}{InactiveDestroy} = 1;
360 $self->{dbh} = $self->{dbh}->clone
361 unless $self->is_temp;
365 # get/set directory for bulk load tables
367 sub dumpdir {
368 my $self = shift;
369 my $d = $self->{dumpdir};
370 $self->{dumpdir} = abs_path(shift) if @_;
375 # table namespace (multiple dbs in one mysql db)
377 sub namespace {
378 my $self = shift;
379 my $d = $self->{namespace};
380 $self->{namespace} = shift if @_;
385 # Required for Pg not mysql
387 sub remove_namespace {
388 return;
392 # find a path that corresponds to a dump table
394 sub dump_path {
395 my $self = shift;
396 my $table = $self->_qualify(shift);
397 return "$self->{dumpdir}/$table.$$";
401 # make a filehandle (writeable) that corresponds to a dump table
403 sub dump_filehandle {
404 my $self = shift;
405 my $table = shift;
406 eval "require IO::File" unless IO::File->can('new');
407 my $path = $self->dump_path($table);
408 my $fh = $self->{filehandles}{$path} ||= IO::File->new(">$path");
409 $fh;
413 # find the next ID for a feature (used only during bulk loading)
415 sub next_id {
416 my $self = shift;
417 $self->{max_id} ||= $self->max_id;
418 return ++$self->{max_id};
422 # find the maximum ID for a feature (used only during bulk loading)
424 sub max_id {
425 my $self = shift;
426 my $features = $self->_feature_table;
427 my $sth = $self->_prepare("SELECT max(id) from $features");
428 $sth->execute or $self->throw($sth->errstr);
429 my ($id) = $sth->fetchrow_array;
430 $id;
434 # wipe database clean and reinstall schema
436 sub _init_database {
437 my $self = shift;
438 my $erase = shift;
440 my $dbh = $self->dbh;
441 my $tables = $self->table_definitions;
443 for my $t (keys %$tables) {
444 next if $t eq 'meta'; # don't get rid of meta data!
445 my $table = $self->_qualify($t);
446 $dbh->do("DROP table IF EXISTS $table") if $erase;
447 my $query = "CREATE TABLE IF NOT EXISTS $table $tables->{$t}";
448 $self->_create_table($dbh,$query);
450 $self->subfeatures_are_indexed(1) if $erase;
454 sub init_tmp_database {
455 my $self = shift;
457 my $dbh = $self->dbh;
458 my $tables = $self->table_definitions;
460 for my $t (keys %$tables) {
461 next if $t eq 'meta'; # done earlier
462 my $table = $self->_qualify($t);
463 my $query = "CREATE TEMPORARY TABLE $table $tables->{$t}";
464 $self->_create_table($dbh,$query);
469 sub _create_table {
470 my $self = shift;
471 my ($dbh,$query) = @_;
472 for my $q (split ';',$query) {
473 chomp($q);
474 next unless $q =~ /\S/;
475 $dbh->do("$q;\n") or $self->throw($dbh->errstr);
479 sub maybe_create_meta {
480 my $self = shift;
481 return unless $self->writeable;
482 my $meta = $self->_meta_table;
483 my $tables = $self->table_definitions;
484 my $temporary = $self->is_temp ? 'TEMPORARY' : '';
485 $self->dbh->do("CREATE $temporary TABLE IF NOT EXISTS $meta $tables->{meta}");
489 # use temporary tables
491 sub is_temp {
492 shift->{is_temp};
495 sub attributes {
496 my $self = shift;
497 my $dbh = $self->dbh;
498 my $attributelist_table = $self->_attributelist_table;
500 my $a = $dbh->selectcol_arrayref("SELECT tag FROM $attributelist_table")
501 or $self->throw($dbh->errstr);
502 return @$a;
505 sub _store {
506 my $self = shift;
508 # special case for bulk updates
509 return $self->_dump_store(@_) if $self->{bulk_update_in_progress};
511 my $indexed = shift;
512 my $count = 0;
514 my $autoindex = $self->autoindex;
516 my $dbh = $self->dbh;
517 local $dbh->{RaiseError} = 1;
518 $self->begin_work;
519 eval {
520 for my $obj (@_) {
521 $self->replace($obj,$indexed);
522 $self->_update_indexes($obj) if $indexed && $autoindex;
523 $count++;
527 if ($@) {
528 warn "Transaction aborted because $@";
529 $self->rollback;
531 else {
532 $self->commit;
535 # remember whether we are have ever stored a non-indexed feature
536 unless ($indexed or $self->{indexed_flag}++) {
537 $self->subfeatures_are_indexed(0);
539 $count;
542 # we memoize this in order to avoid making zillions of calls
543 sub autoindex {
544 my $self = shift;
546 # special case for bulk update -- need to build the indexes
547 # at the same time we build the main feature table
548 return 1 if $self->{bulk_update_in_progress};
549 my $d = $self->setting('autoindex');
550 $self->setting(autoindex=>shift) if @_;
554 sub _start_bulk_update {
555 my $self = shift;
556 my $dbh = $self->dbh;
557 $self->begin_work;
558 $self->{bulk_update_in_progress}++;
561 sub _finish_bulk_update {
562 my $self = shift;
563 my $dbh = $self->dbh;
564 my $dir = $self->{dumpdir} || '.';
565 for my $table ($self->_feature_table,$self->index_tables) {
566 my $fh = $self->dump_filehandle($table);
567 my $path = $self->dump_path($table);
568 $fh->close;
569 #print STDERR "$path\n";
571 $dbh->do("LOAD DATA LOCAL INFILE '$path' REPLACE INTO TABLE $table FIELDS OPTIONALLY ENCLOSED BY '\\''")
572 or $self->throw($dbh->errstr);
573 unlink $path;
575 delete $self->{bulk_update_in_progress};
576 delete $self->{ filehandles};
577 $self->commit;
582 # Add a subparts to a feature. Both feature and all subparts must already be in database.
584 sub _add_SeqFeature {
585 my $self = shift;
587 # special purpose method for case when we are doing a bulk update
588 return $self->_dump_add_SeqFeature(@_) if $self->{bulk_update_in_progress};
590 my $parent = shift;
591 my @children = @_;
593 my $dbh = $self->dbh;
594 local $dbh->{RaiseError} = 1;
596 my $parent2child = $self->_parent2child_table();
597 my $count = 0;
599 my $sth = $self->_prepare(<<END);
600 REPLACE INTO $parent2child (id,child) VALUES (?,?)
603 my $parent_id = (ref $parent ? $parent->primary_id : $parent)
604 or $self->throw("$parent should have a primary_id");
606 $self->begin_work or $self->throw($dbh->errstr);
607 eval {
608 for my $child (@children) {
609 my $child_id = ref $child ? $child->primary_id : $child;
610 defined $child_id or die "no primary ID known for $child";
611 $sth->execute($parent_id,$child_id);
612 $count++;
616 if ($@) {
617 warn "Transaction aborted because $@";
618 $self->rollback;
620 else {
621 $self->commit;
623 $sth->finish;
624 $count;
627 sub _fetch_SeqFeatures {
628 my $self = shift;
629 my $parent = shift;
630 my @types = @_;
632 my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
633 my $features = $self->_feature_table;
634 my $parent2child = $self->_parent2child_table();
636 my @from = ("$features as f","$parent2child as c");
637 my @where = ('f.id=c.child','c.id=?');
638 my @args = $parent_id;
640 if (@types) {
641 my ($from,$where,undef,@a) = $self->_types_sql(\@types,'f');
642 push @from,$from if $from;
643 push @where,$where if $where;
644 push @args,@a;
647 my $from = join ', ',@from;
648 my $where = join ' AND ',@where;
650 my $query = <<END;
651 SELECT f.id,f.object
652 FROM $from
653 WHERE $where
656 $self->_print_query($query,@args) if DEBUG || $self->debug;
658 my $sth = $self->_prepare($query) or $self->throw($self->dbh->errstr);
660 $sth->execute(@args) or $self->throw($sth->errstr);
661 return $self->_sth2objs($sth);
665 # get primary sequence between start and end
667 sub _fetch_sequence {
668 my $self = shift;
669 my ($seqid,$start,$end) = @_;
671 # backward compatibility to the old days when I liked reverse complementing
672 # dna by specifying $start > $end
673 my $reversed;
674 if (defined $start && defined $end && $start > $end) {
675 $reversed++;
676 ($start,$end) = ($end,$start);
678 $start-- if defined $start;
679 $end-- if defined $end;
681 my $id = $self->_locationid($seqid);
682 my $offset1 = $self->_offset_boundary($id,$start || 'left');
683 my $offset2 = $self->_offset_boundary($id,$end || 'right');
684 my $sequence_table = $self->_sequence_table;
686 my $sql = <<END;
687 SELECT sequence,offset
688 FROM $sequence_table as s
689 WHERE s.id=?
690 AND s.offset >= ?
691 AND s.offset <= ?
692 ORDER BY s.offset
695 my $sth = $self->_prepare($sql);
696 my $seq = '';
697 $self->_print_query($sql,$id,$offset1,$offset2) if DEBUG || $self->debug;
698 $sth->execute($id,$offset1,$offset2) or $self->throw($sth->errstr);
700 while (my($frag,$offset) = $sth->fetchrow_array) {
701 substr($frag,0,$start-$offset) = '' if defined $start && $start > $offset;
702 $seq .= $frag;
704 substr($seq,$end-$start+1) = '' if defined $end && $end-$start+1 < length($seq);
705 if ($reversed) {
706 $seq = reverse $seq;
707 $seq =~ tr/gatcGATC/ctagCTAG/;
709 $sth->finish;
710 $seq;
713 sub _offset_boundary {
714 my $self = shift;
715 my ($seqid,$position) = @_;
717 my $sequence_table = $self->_sequence_table;
718 my $locationlist_table = $self->_locationlist_table;
720 my $sql;
721 $sql = $position eq 'left' ? "SELECT min(offset) FROM $sequence_table as s WHERE s.id=?"
722 :$position eq 'right' ? "SELECT max(offset) FROM $sequence_table as s WHERE s.id=?"
723 :"SELECT max(offset) FROM $sequence_table as s WHERE s.id=? AND offset<=?";
724 my $sth = $self->_prepare($sql);
725 my @args = $position =~ /^-?\d+$/ ? ($seqid,$position) : ($seqid);
726 $self->_print_query($sql,@args) if DEBUG || $self->debug;
727 $sth->execute(@args) or $self->throw($sth->errstr);
728 my $boundary = $sth->fetchall_arrayref->[0][0];
729 $sth->finish;
730 return $boundary;
735 # add namespace to tablename
737 sub _qualify {
738 my $self = shift;
739 my $table_name = shift;
740 my $namespace = $self->namespace;
741 return $table_name if (!defined $namespace ||
742 # is namespace already present in table name?
743 index($table_name, $namespace) == 0);
744 return "${namespace}_${table_name}";
748 # Fetch a Bio::SeqFeatureI from database using its primary_id
750 sub _fetch {
751 my $self = shift;
752 @_ or $self->throw("usage: fetch(\$primary_id)");
753 my $primary_id = shift;
754 my $features = $self->_feature_table;
755 my $sth = $self->_prepare(<<END);
756 SELECT id,object FROM $features WHERE id=?
758 $sth->execute($primary_id) or $self->throw($sth->errstr);
759 my $obj = $self->_sth2obj($sth);
760 $sth->finish;
761 $obj;
765 # Efficiently fetch a series of IDs from the database
766 # Can pass an array or an array ref
768 sub _fetch_many {
769 my $self = shift;
770 @_ or $self->throw('usage: fetch_many($id1,$id2,$id3...)');
771 my $ids = join ',',map {ref($_) ? @$_ : $_} @_ or return;
772 my $features = $self->_feature_table;
774 my $sth = $self->_prepare(<<END);
775 SELECT id,object FROM $features WHERE id IN ($ids)
777 $sth->execute() or $self->throw($sth->errstr);
778 return $self->_sth2objs($sth);
781 sub _features {
782 my $self = shift;
783 my ($seq_id,$start,$end,$strand,
784 $name,$class,$allow_aliases,
785 $types,
786 $attributes,
787 $range_type,
788 $fromtable,
789 $iterator,
790 $sources,
791 ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
792 'NAME','CLASS','ALIASES',
793 ['TYPES','TYPE','PRIMARY_TAG'],
794 ['ATTRIBUTES','ATTRIBUTE'],
795 'RANGE_TYPE',
796 'FROM_TABLE',
797 'ITERATOR',
798 ['SOURCE','SOURCES'],
799 ],@_);
801 my (@from,@where,@args,@group);
802 $range_type ||= 'overlaps';
804 my $features = $self->_feature_table;
805 @from = "$features as f";
807 if (defined $name) {
808 # hacky backward compatibility workaround
809 undef $class if $class && $class eq 'Sequence';
810 $name = "$class:$name" if defined $class && length $class > 0;
811 # last argument is the join field
812 my ($from,$where,$group,@a) = $self->_name_sql($name,$allow_aliases,'f.id');
813 push @from,$from if $from;
814 push @where,$where if $where;
815 push @group,$group if $group;
816 push @args,@a;
819 if (defined $seq_id) {
820 # last argument is the name of the features table
821 my ($from,$where,$group,@a) = $self->_location_sql($seq_id,$start,$end,$range_type,$strand,'f');
822 push @from,$from if $from;
823 push @where,$where if $where;
824 push @group,$group if $group;
825 push @args,@a;
828 if (defined($sources)) {
829 my @sources = ref($sources) eq 'ARRAY' ? @{$sources} : ($sources);
830 if (defined($types)) {
831 my @types = ref($types) eq 'ARRAY' ? @{$types} : ($types);
832 my @final_types;
833 foreach my $type (@types) {
834 # *** not sure what to do if user supplies both -source and -type
835 # where the type includes a source!
836 if ($type =~ /:/) {
837 push(@final_types, $type);
839 else {
840 foreach my $source (@sources) {
841 push(@final_types, $type.':'.$source);
845 $types = \@final_types;
847 else {
848 $types = [map { ':'.$_ } @sources];
852 if (defined($types)) {
853 # last argument is the name of the features table
854 my ($from,$where,$group,@a) = $self->_types_sql($types,'f');
855 push @from,$from if $from;
856 push @where,$where if $where;
857 push @group,$group if $group;
858 push @args,@a;
861 if (defined $attributes) {
862 # last argument is the join field
863 my ($from,$where,$group,@a) = $self->_attributes_sql($attributes,'f.id');
864 push @from,$from if $from;
865 push @where,$where if $where;
866 push @group,$group if $group;
867 push @args,@a;
870 if (defined $fromtable) {
871 # last argument is the join field
872 my ($from,$where,$group,@a) = $self->_from_table_sql($fromtable,'f.id');
873 push @from,$from if $from;
874 push @where,$where if $where;
875 push @group,$group if $group;
876 push @args,@a;
879 # if no other criteria are specified, then
880 # only fetch indexed (i.e. top level objects)
881 @where = 'indexed=1' unless @where;
883 my $from = join ', ',@from;
884 my $where = join ' AND ',map {"($_)"} @where;
885 my $group = join ', ',@group;
886 $group = "GROUP BY $group" if @group;
888 my $query = <<END;
889 SELECT f.id,f.object,f.typeid,f.seqid,f.start,f.end,f.strand
890 FROM $from
891 WHERE $where
892 $group
895 $self->_print_query($query,@args) if DEBUG || $self->debug;
897 my $sth = $self->_prepare($query) or $self->throw($self->dbh->errstr);
898 $sth->execute(@args) or $self->throw($sth->errstr);
899 return $iterator ? Bio::DB::SeqFeature::Store::DBI::Iterator->new($sth,$self) : $self->_sth2objs($sth);
902 sub _aggregate_bins {
903 my $self = shift;
904 my $sth = shift;
905 my (%types,$binsize,$binstart);
906 while (my ($type,$seqname,$bin,$count,$bins,$start,$end) = $sth->fetchrow_array) {
907 $binsize ||= ($end-$start+1)/$bins;
908 $binstart ||= int($start/$binsize);
909 $types{$type}{seqname} ||= $seqname;
910 $types{$type}{min} ||= $start;
911 $types{$type}{max} ||= $end;
912 $types{$type}{bins} ||= [(0) x $bins];
913 $types{$type}{bins}[$bin-$binstart] = $count;
914 $types{$type}{count} += $count;
916 my @results;
917 for my $type (keys %types) {
918 my $min = $types{$type}{min};
919 my $max = $types{$type}{max};
920 my $seqid= $types{$type}{seqname};
921 my $f = Bio::SeqFeature::Lite->new(-seq_id => $seqid,
922 -start => $min,
923 -end => $max,
924 -type => "$type:bins",
925 -score => $types{$type}{count},
926 -attributes => {coverage => join ',',@{$types{$type}{bins}}});
927 push @results,$f;
929 return @results;
932 sub _name_sql {
933 my $self = shift;
934 my ($name,$allow_aliases,$join) = @_;
935 my $name_table = $self->_name_table;
937 my $from = "$name_table as n";
938 my ($match,$string) = $self->_match_sql($name);
940 my $where = "n.id=$join AND n.name $match";
941 $where .= " AND n.display_name>0" unless $allow_aliases;
942 return ($from,$where,'',$string);
945 sub _search_attributes {
946 my $self = shift;
947 my ($search_string,$attribute_names,$limit) = @_;
948 my @words = map {quotemeta($_)} split /\s+/,$search_string;
950 my $name_table = $self->_name_table;
951 my $attribute_table = $self->_attribute_table;
952 my $attributelist_table = $self->_attributelist_table;
953 my $type_table = $self->_type_table;
954 my $typelist_table = $self->_typelist_table;
956 my @tags = @$attribute_names;
957 my $tag_sql = join ' OR ',("al.tag=?") x @tags;
959 my $perl_regexp = join '|',@words;
961 my $sql_regexp = join ' OR ',("a.attribute_value REGEXP ?") x @words;
962 my $sql = <<END;
963 SELECT name,attribute_value,tl.tag,n.id
964 FROM $name_table as n,$attribute_table as a,$attributelist_table as al,$type_table as t,$typelist_table as tl
965 WHERE n.id=a.id
966 AND al.id=a.attribute_id
967 AND n.id=t.id
968 AND t.typeid=tl.id
969 AND n.display_name=1
970 AND ($tag_sql)
971 AND ($sql_regexp)
973 $sql .= "LIMIT $limit" if defined $limit;
974 $self->_print_query($sql,@tags,@words) if DEBUG || $self->debug;
975 my $sth = $self->_prepare($sql);
976 $sth->execute(@tags,@words) or $self->throw($sth->errstr);
978 my @results;
979 while (my($name,$value,$type,$id) = $sth->fetchrow_array) {
980 my (@hits) = $value =~ /$perl_regexp/ig;
981 my @words_in_row = split /\b/,$value;
982 my $score = int(@hits * 10);
983 push @results,[$name,$value,$score,$type,$id];
985 $sth->finish;
986 @results = sort {$b->[2]<=>$a->[2]} @results;
987 return @results;
990 sub _match_sql {
991 my $self = shift;
992 my $name = shift;
994 my ($match,$string);
995 if ($name =~ /(?:^|[^\\])[*?]/) {
996 $name =~ s/(^|[^\\])([%_])/$1\\$2/g;
997 $name =~ s/(^|[^\\])\*/$1%/g;
998 $name =~ s/(^|[^\\])\?/$1_/g;
999 $match = "LIKE ?";
1000 $string = $name;
1001 } else {
1002 $match = "= ?";
1003 $string = $name;
1005 return ($match,$string);
1008 sub _from_table_sql {
1009 my $self = shift;
1010 my ($from_table,$join) = @_;
1011 my $from = "$from_table as ft";
1012 my $where = "ft.id=$join";
1013 return ($from,$where,'');
1016 sub _attributes_sql {
1017 my $self = shift;
1018 my ($attributes,$join) = @_;
1020 my ($wf,@bind_args) = $self->_make_attribute_where('a','al',$attributes);
1021 my ($group_by,@group_args)= $self->_make_attribute_group('a',$attributes);
1023 my $attribute_table = $self->_attribute_table;
1024 my $attributelist_table = $self->_attributelist_table;
1026 my $from = "$attribute_table as a use index(attribute_id), $attributelist_table as al";
1028 my $where = <<END;
1029 a.id=$join
1030 AND a.attribute_id=al.id
1031 AND ($wf)
1034 my $group = $group_by;
1036 my @args = (@bind_args,@group_args);
1037 return ($from,$where,$group,@args);
1040 sub subfeature_types_are_indexed { 1 }
1041 sub subfeature_locations_are_indexed { 1 }
1043 sub _types_sql {
1044 my $self = shift;
1045 my ($types,$type_table) = @_;
1046 my ($primary_tag,$source_tag);
1048 my @types = ref $types eq 'ARRAY' ? @$types : $types;
1050 my $typelist = $self->_typelist_table;
1051 my $from = "$typelist AS tl";
1053 my (@matches,@args);
1055 for my $type (@types) {
1057 if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
1058 $primary_tag = $type->method;
1059 $source_tag = $type->source;
1060 } else {
1061 ($primary_tag,$source_tag) = split ':',$type,2;
1064 if (defined $source_tag && length $source_tag) {
1065 if (defined $primary_tag && length($primary_tag)) {
1066 push @matches,"tl.tag=?";
1067 push @args,"$primary_tag:$source_tag";
1069 else {
1070 push @matches,"tl.tag LIKE ?";
1071 push @args,"%:$source_tag";
1073 } else {
1074 push @matches,"tl.tag LIKE ?";
1075 push @args,"$primary_tag:%";
1078 my $matches = join ' OR ',@matches;
1080 my $where = <<END;
1081 tl.id=$type_table.typeid
1082 AND ($matches)
1085 return ($from,$where,'',@args);
1088 sub _location_sql {
1089 my $self = shift;
1090 my ($seq_id,$start,$end,$range_type,$strand,$location) = @_;
1092 # the additional join on the location_list table badly impacts performance
1093 # so we build a copy of the table in memory
1094 my $seqid = $self->_locationid_nocreate($seq_id) || 0; # zero is an invalid primary ID, so will return empty
1096 $start = MIN_INT unless defined $start;
1097 $end = MAX_INT unless defined $end;
1099 my ($bin_where,@bin_args) = $self->bin_where($start,$end,$location);
1101 my ($range,@range_args);
1102 if ($range_type eq 'overlaps') {
1103 $range = "$location.end>=? AND $location.start<=? AND ($bin_where)";
1104 @range_args = ($start,$end,@bin_args);
1105 } elsif ($range_type eq 'contains') {
1106 $range = "$location.start>=? AND $location.end<=? AND ($bin_where)";
1107 @range_args = ($start,$end,@bin_args);
1108 } elsif ($range_type eq 'contained_in') {
1109 $range = "$location.start<=? AND $location.end>=?";
1110 @range_args = ($start,$end);
1111 } else {
1112 $self->throw("range_type must be one of 'overlaps', 'contains' or 'contained_in'");
1115 if (defined $strand) {
1116 $range .= " AND strand=?";
1117 push @range_args,$strand;
1120 my $where = <<END;
1121 $location.seqid=?
1122 AND $range
1125 my $from = '';
1126 my $group = '';
1128 my @args = ($seqid,@range_args);
1129 return ($from,$where,$group,@args);
1133 # force reindexing
1135 sub reindex {
1136 my $self = shift;
1137 my $from_update_table = shift; # if present, will take ids from "update_table"
1139 my $dbh = $self->dbh;
1140 my $count = 0;
1141 my $now;
1143 # try to bring in highres time() function
1144 eval "require Time::HiRes";
1146 my $last_time = $self->time();
1148 # tell _delete_index() not to bother removing the index rows corresponding
1149 # to each individual feature
1150 local $self->{reindexing} = 1;
1152 $self->begin_work;
1153 eval {
1154 my $update = $from_update_table;
1155 for my $table ($self->index_tables) {
1156 my $query = $from_update_table ? "DELETE $table FROM $table,$update WHERE $table.id=$update.id"
1157 : "DELETE FROM $table";
1158 $dbh->do($query);
1159 $self->_disable_keys($dbh,$table);
1161 my $iterator = $self->get_seq_stream(-from_table=>$from_update_table ? $update : undef);
1162 while (my $f = $iterator->next_seq) {
1163 if (++$count %1000 == 0) {
1164 $now = $self->time();
1165 my $elapsed = sprintf(" in %5.2fs",$now - $last_time);
1166 $last_time = $now;
1167 print STDERR "$count features indexed$elapsed...",' 'x60;
1168 print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
1170 $self->_update_indexes($f);
1173 for my $table ($self->index_tables) {
1174 $self->_enable_keys($dbh,$table);
1176 if (@_) {
1177 warn "Couldn't complete transaction: $@";
1178 $self->rollback;
1179 return;
1180 } else {
1181 $self->commit;
1182 return 1;
1186 sub optimize {
1187 my $self = shift;
1188 $self->dbh->do("ANALYZE TABLE $_") foreach $self->index_tables;
1191 sub all_tables {
1192 my $self = shift;
1193 my @index_tables = $self->index_tables;
1194 my $features = $self->_feature_table;
1195 return ($features,@index_tables);
1198 sub index_tables {
1199 my $self = shift;
1200 return map {$self->_qualify($_)} qw(name attribute parent2child)
1203 sub _firstid {
1204 my $self = shift;
1205 my $features = $self->_feature_table;
1206 my $query = <<END;
1207 SELECT min(id) FROM $features
1209 my $sth=$self->_prepare($query);
1210 $sth->execute();
1211 my ($first) = $sth->fetchrow_array;
1212 $sth->finish;
1213 $first;
1216 sub _nextid {
1217 my $self = shift;
1218 my $lastkey = shift;
1219 my $features = $self->_feature_table;
1220 my $query = <<END;
1221 SELECT min(id) FROM $features WHERE id>?
1223 my $sth=$self->_prepare($query);
1224 $sth->execute($lastkey);
1225 my ($next) = $sth->fetchrow_array;
1226 $sth->finish;
1227 $next;
1230 sub _existsid {
1231 my $self = shift;
1232 my $key = shift;
1233 my $features = $self->_feature_table;
1234 my $query = <<END;
1235 SELECT count(*) FROM $features WHERE id=?
1237 my $sth=$self->_prepare($query);
1238 $sth->execute($key);
1239 my ($count) = $sth->fetchrow_array;
1240 $sth->finish;
1241 $count > 0;
1244 sub _deleteid {
1245 my $self = shift;
1246 my $key = shift;
1247 my $dbh = $self->dbh;
1248 my $parent2child = $self->_parent2child_table;
1249 my $query = "SELECT child FROM $parent2child WHERE id=?";
1250 my $sth=$self->_prepare($query);
1251 $sth->execute($key);
1252 my $success = 0;
1253 while (my ($cid) = $sth->fetchrow_array) {
1254 # Backcheck looking for multiple parents, delete only if one is present. I'm
1255 # sure there is a nice way to left join the parent2child table onto itself
1256 # to get this in one query above, just haven't worked it out yet...
1257 my $sth2 = $self->_prepare("SELECT count(id) FROM $parent2child WHERE child=?");
1258 $sth2->execute($cid);
1259 my ($count) = $sth2->fetchrow_array;
1260 if ($count == 1) {
1261 $self->_deleteid($cid) || warn "An error occurred while removing subfeature id=$cid. Perhaps it was previously deleted?\n";
1264 for my $table ($self->all_tables) {
1265 $success += $dbh->do("DELETE FROM $table WHERE id=$key") || 0;
1267 return $success;
1270 sub _clearall {
1271 my $self = shift;
1272 my $dbh = $self->dbh;
1273 for my $table ($self->all_tables) {
1274 $dbh->do("DELETE FROM $table");
1278 sub _featurecount {
1279 my $self = shift;
1280 my $dbh = $self->dbh;
1281 my $features = $self->_feature_table;
1282 my $query = <<END;
1283 SELECT count(*) FROM $features
1285 my $sth=$self->_prepare($query);
1286 $sth->execute();
1287 my ($count) = $sth->fetchrow_array;
1288 $sth->finish;
1289 $count;
1292 sub _seq_ids {
1293 my $self = shift;
1294 my $dbh = $self->dbh;
1295 my $location = $self->_locationlist_table;
1296 my $sth = $self->_prepare("SELECT DISTINCT seqname FROM $location");
1297 $sth->execute() or $self->throw($sth->errstr);
1298 my @result;
1299 while (my ($id) = $sth->fetchrow_array) {
1300 push @result,$id;
1302 return @result;
1305 sub setting {
1306 my $self = shift;
1307 my ($variable_name,$value) = @_;
1308 my $meta = $self->_meta_table;
1310 if (defined $value && $self->writeable) {
1311 my $query = <<END;
1312 REPLACE INTO $meta (name,value) VALUES (?,?)
1314 my $sth = $self->_prepare($query);
1315 $sth->execute($variable_name,$value) or $self->throw($sth->errstr);
1316 $sth->finish;
1317 $self->{settings_cache}{$variable_name} = $value;
1319 else {
1320 return $self->{settings_cache}{$variable_name} if exists $self->{settings_cache}{$variable_name};
1321 my $query = <<END;
1322 SELECT value FROM $meta as m WHERE m.name=?
1324 my $sth = $self->_prepare($query);
1325 $sth->execute($variable_name) or $self->throw($sth->errstr);
1326 my ($value) = $sth->fetchrow_array;
1327 $sth->finish;
1328 return $self->{settings_cache}{$variable_name} = $value;
1333 # Replace Bio::SeqFeatureI into database.
1335 sub replace {
1336 my $self = shift;
1337 my $object = shift;
1338 my $index_flag = shift || undef;
1340 # ?? shouldn't need to do this
1341 # $self->_load_class($object);
1342 my $id = $object->primary_id;
1343 my $features = $self->_feature_table;
1345 my $sth = $self->_prepare(<<END);
1346 REPLACE INTO $features (id,object,indexed,seqid,start,end,strand,tier,bin,typeid) VALUES (?,?,?,?,?,?,?,?,?,?)
1349 my @location = $index_flag ? $self->_get_location_and_bin($object) : (undef)x6;
1351 my $primary_tag = $object->primary_tag;
1352 my $source_tag = $object->source_tag || '';
1353 $primary_tag .= ":$source_tag";
1354 my $typeid = $self->_typeid($primary_tag,1);
1356 my $frozen = $self->no_blobs() ? 0 : $self->freeze($object);
1358 $sth->execute($id,$frozen,$index_flag||0,@location,$typeid) or $self->throw($sth->errstr);
1360 my $dbh = $self->dbh;
1361 $object->primary_id($dbh->{mysql_insertid}) unless defined $id;
1363 $self->flag_for_indexing($dbh->{mysql_insertid}) if $self->{bulk_update_in_progress};
1366 # doesn't work with this schema, since we have to update name and attribute
1367 # tables which need object ids, which we can only know by replacing feats in
1368 # the feature table one by one
1369 sub bulk_replace {
1370 my $self = shift;
1371 my $index_flag = shift || undef;
1372 my @objects = @_;
1374 my $features = $self->_feature_table;
1376 my @insert_values;
1377 foreach my $object (@objects) {
1378 my $id = $object->primary_id;
1379 my @location = $index_flag ? $self->_get_location_and_bin($object) : (undef)x6;
1380 my $primary_tag = $object->primary_tag;
1381 my $source_tag = $object->source_tag || '';
1382 $primary_tag .= ":$source_tag";
1383 my $typeid = $self->_typeid($primary_tag,1);
1385 push(@insert_values, ($id,0,$index_flag||0,@location,$typeid));
1388 my @value_blocks;
1389 for (1..@objects) {
1390 push(@value_blocks, '(?,?,?,?,?,?,?,?,?,?)');
1392 my $value_blocks = join(',', @value_blocks);
1393 my $sql = qq{REPLACE INTO $features (id,object,indexed,seqid,start,end,strand,tier,bin,typeid) VALUES $value_blocks};
1395 my $sth = $self->_prepare($sql);
1396 $sth->execute(@insert_values) or $self->throw($sth->errstr);
1400 # Insert one Bio::SeqFeatureI into database. primary_id must be undef
1402 sub insert {
1403 my $self = shift;
1404 my $object = shift;
1405 my $index_flag = shift || 0;
1407 $self->_load_class($object);
1408 defined $object->primary_id and $self->throw("$object already has a primary id");
1410 my $features = $self->_feature_table;
1411 my $sth = $self->_prepare(<<END);
1412 INSERT INTO $features (id,object,indexed) VALUES (?,?,?)
1414 $sth->execute(undef,$self->freeze($object),$index_flag) or $self->throw($sth->errstr);
1415 my $dbh = $self->dbh;
1416 $object->primary_id($dbh->{mysql_insertid});
1417 $self->flag_for_indexing($dbh->{mysql_insertid}) if $self->{bulk_update_in_progress};
1420 =head2 types
1422 Title : types
1423 Usage : @type_list = $db->types
1424 Function: Get all the types in the database
1425 Returns : array of Bio::DB::GFF::Typename objects
1426 Args : none
1427 Status : public
1429 =cut
1431 sub types {
1432 my $self = shift;
1433 eval "require Bio::DB::GFF::Typename"
1434 unless Bio::DB::GFF::Typename->can('new');
1435 my $typelist = $self->_typelist_table;
1436 my $sql = <<END;
1437 SELECT tag from $typelist
1440 $self->_print_query($sql) if DEBUG || $self->debug;
1441 my $sth = $self->_prepare($sql);
1442 $sth->execute() or $self->throw($sth->errstr);
1444 my @results;
1445 while (my($tag) = $sth->fetchrow_array) {
1446 push @results,Bio::DB::GFF::Typename->new($tag);
1448 $sth->finish;
1449 return @results;
1452 =head2 toplevel_types
1454 Title : toplevel_types
1455 Usage : @type_list = $db->toplevel_types
1456 Function: Get the toplevel types in the database
1457 Returns : array of Bio::DB::GFF::Typename objects
1458 Args : none
1459 Status : public
1461 This is similar to types() but only returns the types of
1462 INDEXED (toplevel) features.
1464 =cut
1466 sub toplevel_types {
1467 my $self = shift;
1468 eval "require Bio::DB::GFF::Typename"
1469 unless Bio::DB::GFF::Typename->can('new');
1470 my $typelist = $self->_typelist_table;
1471 my $features = $self->_feature_table;
1472 my $sql = <<END;
1473 SELECT distinct(tag) from $typelist as tl,$features as f
1474 WHERE tl.id=f.typeid
1475 AND f.indexed=1
1478 $self->_print_query($sql) if DEBUG || $self->debug;
1479 my $sth = $self->_prepare($sql);
1480 $sth->execute() or $self->throw($sth->errstr);
1482 my @results;
1483 while (my($tag) = $sth->fetchrow_array) {
1484 push @results,Bio::DB::GFF::Typename->new($tag);
1486 $sth->finish;
1487 return @results;
1491 # Insert a bit of DNA or protein into the database
1493 sub _insert_sequence {
1494 my $self = shift;
1495 my ($seqid,$seq,$offset) = @_;
1496 my $id = $self->_locationid($seqid);
1497 my $sequence = $self->_sequence_table;
1498 my $sth = $self->_prepare(<<END);
1499 REPLACE INTO $sequence (id,offset,sequence) VALUES (?,?,?)
1501 $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr);
1505 # This subroutine flags the given primary ID for later reindexing
1507 sub flag_for_indexing {
1508 my $self = shift;
1509 my $id = shift;
1510 my $needs_updating = $self->_update_table;
1511 my $sth = $self->_prepare("REPLACE INTO $needs_updating VALUES (?)");
1512 $sth->execute($id) or $self->throw($self->dbh->errstr);
1516 # Update indexes for given object
1518 sub _update_indexes {
1519 my $self = shift;
1520 my $obj = shift;
1521 defined (my $id = $obj->primary_id) or return;
1523 if ($self->{bulk_update_in_progress}) {
1524 $self->_dump_update_name_index($obj,$id);
1525 $self->_dump_update_attribute_index($obj,$id);
1526 } else {
1527 $self->_update_name_index($obj,$id);
1528 $self->_update_attribute_index($obj,$id);
1532 sub _update_name_index {
1533 my $self = shift;
1534 my ($obj,$id) = @_;
1535 my $name = $self->_name_table;
1536 my $primary_id = $obj->primary_id;
1538 $self->_delete_index($name,$id);
1539 my ($names,$aliases) = $self->feature_names($obj);
1541 my $sth = $self->_prepare("INSERT INTO $name (id,name,display_name) VALUES (?,?,?)");
1543 $sth->execute($id,$_,1) or $self->throw($sth->errstr) foreach @$names;
1544 $sth->execute($id,$_,0) or $self->throw($sth->errstr) foreach @$aliases;
1545 $sth->finish;
1548 sub _update_attribute_index {
1549 my $self = shift;
1550 my ($obj,$id) = @_;
1551 my $attribute = $self->_attribute_table;
1552 $self->_delete_index($attribute,$id);
1554 my $sth = $self->_prepare("INSERT INTO $attribute (id,attribute_id,attribute_value) VALUES (?,?,?)");
1555 for my $tag ($obj->get_all_tags) {
1556 my $tagid = $self->_attributeid($tag);
1557 for my $value ($obj->get_tag_values($tag)) {
1558 $sth->execute($id,$tagid,$value) or $self->throw($sth->errstr);
1561 $sth->finish;
1564 sub _genericid {
1565 my $self = shift;
1566 my ($table,$namefield,$name,$add_if_missing) = @_;
1567 my $qualified_table = $self->_qualify($table);
1568 my $sth = $self->_prepare(<<END);
1569 SELECT id FROM $qualified_table WHERE $namefield=?
1571 $sth->execute($name) or die $sth->errstr;
1572 my ($id) = $sth->fetchrow_array;
1573 $sth->finish;
1574 return $id if defined $id;
1575 return unless $add_if_missing;
1577 $sth = $self->_prepare(<<END);
1578 INSERT INTO $qualified_table ($namefield) VALUES (?)
1580 $sth->execute($name) or die $sth->errstr;
1581 my $dbh = $self->dbh;
1582 return $dbh->{mysql_insertid};
1585 sub _typeid {
1586 shift->_genericid('typelist','tag',shift,1);
1588 sub _locationid {
1589 shift->_genericid('locationlist','seqname',shift,1);
1591 sub _locationid_nocreate {
1592 shift->_genericid('locationlist','seqname',shift,0);
1594 sub _attributeid {
1595 shift->_genericid('attributelist','tag',shift,1);
1598 sub _get_location_and_bin {
1599 my $self = shift;
1600 my $feature = shift;
1601 my $seqid = $self->_locationid($feature->seq_id||'');
1602 my $start = $feature->start;
1603 my $end = $feature->end;
1604 my $strand = $feature->strand || 0;
1605 my ($tier,$bin) = $self->get_bin($start,$end);
1606 return ($seqid,$start,$end,$strand,$tier,$bin);
1609 sub get_bin {
1610 my $self = shift;
1611 my ($start,$end) = @_;
1612 my $binsize = MIN_BIN;
1613 my ($bin_start,$bin_end,$tier);
1614 $tier = 0;
1615 while (1) {
1616 $bin_start = int $start/$binsize;
1617 $bin_end = int $end/$binsize;
1618 last if $bin_start == $bin_end;
1619 $binsize *= 10;
1620 $tier++;
1622 return ($tier,$bin_start);
1625 sub bin_where {
1626 my $self = shift;
1627 my ($start,$end,$f) = @_;
1628 my (@bins,@args);
1630 my $tier = 0;
1631 my $binsize = MIN_BIN;
1632 while ($binsize <= MAX_BIN) {
1633 my $bin_start = int($start/$binsize);
1634 my $bin_end = int($end/$binsize);
1635 push @bins,"($f.tier=? AND $f.bin between ? AND ?)";
1636 push @args,($tier,$bin_start,$bin_end);
1637 $binsize *= 10;
1638 $tier++;
1640 my $query = join ("\n\t OR ",@bins);
1641 return wantarray ? ($query,@args) : substitute($query,@args);
1644 sub _delete_index {
1645 my $self = shift;
1646 my ($table_name,$id) = @_;
1647 return if $self->{reindexing};
1648 my $sth = $self->_prepare("DELETE FROM $table_name WHERE id=?") or $self->throw($self->dbh->errstr);
1649 $sth->execute($id);
1652 # given a statement handler that is expected to return rows of (id,object)
1653 # unthaw each object and return a list of 'em
1654 sub _sth2objs {
1655 my $self = shift;
1656 my $sth = shift;
1657 my @result;
1658 while (my ($id,$o,$typeid,$seqid,$start,$end,$strand) = $sth->fetchrow_array) {
1659 my $obj;
1660 if ($o eq '0') {
1661 # rebuild a new feat object from the data stored in the db
1662 $obj = $self->_rebuild_obj($id,$typeid,$seqid,$start,$end,$strand);
1664 else {
1665 $obj = $self->thaw($o,$id);
1668 push @result,$obj;
1670 $sth->finish;
1671 return @result;
1674 # given a statement handler that is expected to return rows of (id,object)
1675 # unthaw each object and return a list of 'em
1676 sub _sth2obj {
1677 my $self = shift;
1678 my $sth = shift;
1679 my ($id,$o,$typeid,$seqid,$start,$end,$strand) = $sth->fetchrow_array;
1680 return unless defined $o;
1681 my $obj;
1682 if ($o eq '0') { # I don't understand why an object ever needs to be rebuilt!
1683 # rebuild a new feat object from the data stored in the db
1684 $obj = $self->_rebuild_obj($id,$typeid,$seqid,$start,$end,$strand);
1686 else {
1687 $obj = $self->thaw($o,$id);
1690 $obj;
1693 sub _rebuild_obj {
1694 my ($self, $id, $typeid, $db_seqid, $start, $end, $strand) = @_;
1695 my ($type, $source, $seqid);
1697 # convert typeid to type and source
1698 if (exists $self->{_type_cache}->{$typeid}) {
1699 ($type, $source) = @{$self->{_type_cache}->{$typeid}};
1701 else {
1702 my $sql = qq{ SELECT `tag` FROM typelist WHERE `id` = ? };
1703 my $sth = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1704 $sth->execute($typeid);
1705 my $result;
1706 $sth->bind_columns(\$result);
1707 while ($sth->fetch()) {
1708 # there should be only one row returned, but we ensure to get all rows
1711 ($type, $source) = split(':', $result);
1712 $self->{_type_cache}->{$typeid} = [$type, $source];
1715 # convert the db seqid to the sequence name
1716 if (exists $self->{_seqid_cache}->{$db_seqid}) {
1717 $seqid = $self->{_seqid_cache}->{$db_seqid};
1719 else {
1720 my $sql = qq{ SELECT `seqname` FROM locationlist WHERE `id` = ? };
1721 my $sth = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1722 $sth->execute($db_seqid);
1723 $sth->bind_columns(\$seqid);
1724 while ($sth->fetch()) {
1725 # there should be only one row returned, but we ensure to get all rows
1728 $self->{_seqid_cache}->{$db_seqid} = $seqid;
1731 # get the names from name table?
1733 # get the attributes and store those in obj
1734 my $sql = qq{ SELECT attribute_id,attribute_value FROM attribute WHERE `id` = ? };
1735 my $sth = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1736 $sth->execute($id);
1737 my ($attribute_id, $attribute_value);
1738 $sth->bind_columns(\($attribute_id, $attribute_value));
1739 my %attribs;
1740 while ($sth->fetch()) {
1741 # convert the attribute_id to its real name
1742 my $attribute;
1743 if (exists $self->{_attribute_cache}->{$attribute_id}) {
1744 $attribute = $self->{_attribute_cache}->{$attribute_id};
1746 else {
1747 my $sql = qq{ SELECT `tag` FROM attributelist WHERE `id` = ? };
1748 my $sth2 = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1749 $sth2->execute($attribute_id);
1750 $sth2->bind_columns(\$attribute);
1751 while ($sth2->fetch()) {
1752 # there should be only one row returned, but we ensure to get all rows
1755 $self->{_attribute_cache}->{$attribute_id} = $attribute;
1758 if ($source && $attribute eq 'source' && $attribute_value eq $source) {
1759 next;
1762 $attribs{$attribute} = $attribute_value;
1765 # if we weren't called with all the params, pull those out of the database too
1766 if ( not ( grep { defined($_) } ( $typeid, $db_seqid, $start, $end, $strand ))) {
1767 my $sql = qq{ SELECT start,end,tag,strand,seqname
1768 FROM feature,feature_location,typelist,locationlist
1769 WHERE feature.id=feature_location.id AND feature.typeid=typelist.id
1770 AND seqid=locationlist.id AND feature.id = ? };
1771 my $sth = $self->_prepare($sql) or $self->throw($self->dbh->errstr);
1772 $sth->execute($id);
1773 my ($feature_start, $feature_end, $feature_type, $feature_strand,$feature_seqname);
1774 $sth->bind_columns(\($feature_start, $feature_end, $feature_type, $feature_strand, $feature_seqname));
1775 while ($sth->fetch()) {
1776 # there should be only one row returned, but we call like this to
1777 # ensure we get all rows
1779 $start ||= $feature_start;
1780 $end ||= $feature_end;
1781 $strand ||= $feature_strand;
1782 $seqid ||= $feature_seqname;
1784 my( $feature_typename , $feature_typesource ) = split /:/ , $feature_type;
1785 $type ||= $feature_typename;
1786 $source ||= $feature_typesource;
1789 my $obj = Bio::SeqFeature::Lite->new(-primary_id => $id,
1790 $type ? (-type => $type) : (),
1791 $source ? (-source => $source) : (),
1792 $seqid ? (-seq_id => $seqid) : (),
1793 defined $start ? (-start => $start) : (),
1794 defined $end ? (-end => $end) : (),
1795 defined $strand ? (-strand => $strand) : (),
1796 keys %attribs ? (-attributes => \%attribs) : ());
1798 return $obj;
1801 sub _prepare {
1802 my $self = shift;
1803 my $query = shift;
1804 my $dbh = $self->dbh;
1805 my $sth = $dbh->prepare_cached($query, {}, 3) or $self->throw($dbh->errstr);
1806 $sth;
1810 ####################################################################################################
1811 # SQL Fragment generators
1812 ####################################################################################################
1814 sub _attribute_table { shift->_qualify('attribute') }
1815 sub _attributelist_table { shift->_qualify('attributelist') }
1816 sub _feature_table { shift->_qualify('feature') }
1817 sub _interval_stats_table { shift->_qualify('interval_stats') }
1818 sub _location_table { shift->_qualify('location') }
1819 sub _locationlist_table { shift->_qualify('locationlist') }
1820 sub _meta_table { shift->_qualify('meta') }
1821 sub _name_table { shift->_qualify('name') }
1822 sub _parent2child_table { shift->_qualify('parent2child') }
1823 sub _sequence_table { shift->_qualify('sequence') }
1824 sub _type_table { shift->_qualify('feature') }
1825 sub _typelist_table { shift->_qualify('typelist') }
1826 sub _update_table { shift->_qualify('update_table') }
1828 sub _make_attribute_where {
1829 my $self = shift;
1830 my ($attributetable,$attributenametable,$attributes) = @_;
1831 my @args;
1832 my @sql;
1833 my $dbh = $self->dbh;
1834 foreach (keys %$attributes) {
1835 my @match_values;
1836 my @values = ref($attributes->{$_}) && ref($attributes->{$_}) eq 'ARRAY' ? @{$attributes->{$_}} : $attributes->{$_};
1837 foreach (@values) { # convert * into % for wildcard matches
1838 s/\*/%/g;
1840 my $match = join ' OR ',map {
1841 /%/ ? "$attributetable.attribute_value LIKE ?"
1842 : "$attributetable.attribute_value=?"
1843 } @values;
1844 push @sql,"($attributenametable.tag=? AND ($match))";
1845 push @args,($_,@values);
1847 return (join(' OR ',@sql),@args);
1850 sub _make_attribute_group {
1851 my $self = shift;
1852 my ($table_name,$attributes) = @_;
1853 my $key_count = keys %$attributes or return;
1854 return "f.id,f.object,f.typeid,f.seqid,f.start,f.end,f.strand HAVING count(f.id)>?",$key_count-1;
1857 sub _print_query {
1858 my $self = shift;
1859 my ($query,@args) = @_;
1860 while ($query =~ /\?/) {
1861 my $arg = $self->dbh->quote(shift @args);
1862 $query =~ s/\?/$arg/;
1864 warn $query,"\n";
1868 # special-purpose store for bulk loading - write to a file rather than to the db
1870 sub _dump_store {
1871 my $self = shift;
1872 my $indexed = shift;
1874 my $count = 0;
1875 my $store_fh = $self->dump_filehandle('feature');
1876 my $dbh = $self->dbh;
1878 my $autoindex = $self->autoindex;
1880 for my $obj (@_) {
1881 my $id = $self->next_id;
1882 my ($seqid,$start,$end,$strand,$tier,$bin) = $indexed ? $self->_get_location_and_bin($obj) : (undef)x6;
1883 my $primary_tag = $obj->primary_tag;
1884 my $source_tag = $obj->source_tag || '';
1885 $primary_tag .= ":$source_tag";
1886 my $typeid = $self->_typeid($primary_tag,1);
1888 print $store_fh join("\t",$id,$typeid,$seqid,$start,$end,$strand,$tier,$bin,$indexed,$dbh->quote($self->freeze($obj))),"\n";
1889 $obj->primary_id($id);
1890 $self->_update_indexes($obj) if $indexed && $autoindex;
1891 $count++;
1894 # remember whether we are have ever stored a non-indexed feature
1895 unless ($indexed or $self->{indexed_flag}++) {
1896 $self->subfeatures_are_indexed(0);
1898 $count;
1901 sub _dump_add_SeqFeature {
1902 my $self = shift;
1903 my $parent = shift;
1904 my @children = @_;
1906 my $dbh = $self->dbh;
1907 my $fh = $self->dump_filehandle('parent2child');
1908 my $parent_id = (ref $parent ? $parent->primary_id : $parent)
1909 or $self->throw("$parent should have a primary_id");
1910 my $count = 0;
1912 for my $child_id (@children) {
1913 print $fh join("\t",$parent_id,$child_id),"\n";
1914 $count++;
1916 $count;
1919 sub _dump_update_name_index {
1920 my $self = shift;
1921 my ($obj,$id) = @_;
1922 my $fh = $self->dump_filehandle('name');
1923 my $dbh = $self->dbh;
1924 my ($names,$aliases) = $self->feature_names($obj);
1925 print $fh join("\t",$id,$dbh->quote($_),1),"\n" foreach @$names;
1926 print $fh join("\t",$id,$dbh->quote($_),0),"\n" foreach @$aliases;
1929 sub _dump_update_attribute_index {
1930 my $self = shift;
1931 my ($obj,$id) = @_;
1932 my $fh = $self->dump_filehandle('attribute');
1933 my $dbh = $self->dbh;
1934 for my $tag ($obj->all_tags) {
1935 my $tagid = $self->_attributeid($tag);
1936 for my $value ($obj->each_tag_value($tag)) {
1937 print $fh join("\t",$id,$tagid,$dbh->quote($value)),"\n";
1942 sub coverage_array {
1943 my $self = shift;
1944 my ($seq_name,$start,$end,$types,$bins) =
1945 rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
1946 ['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_);
1948 $bins ||= 1000;
1949 $start ||= 1;
1950 unless ($end) {
1951 my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name");
1952 $end = $segment->end;
1955 my $binsize = ($end-$start+1)/$bins;
1956 my $seqid = $self->_locationid_nocreate($seq_name) || 0;
1958 return [] unless $seqid;
1960 # where each bin starts
1961 my @his_bin_array = map {$start + $binsize * $_} (0..$bins-1);
1962 my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array;
1964 my $interval_stats = $self->_interval_stats_table;
1966 my ($sth,@a);
1967 if ($types) {
1968 # pick up the type ids
1969 my ($from,$where,$group);
1970 ($from,$where,$group,@a) = $self->_types_sql($types,'b');
1971 $where =~ s/.+AND//s;
1972 $sth = $self->_prepare(<<END);
1973 SELECT id,tag FROM $from
1974 WHERE $where
1977 } else {
1978 $sth = $self->_prepare(<<END);
1979 SELECT id,tag FROM typelist
1982 my (@t,$report_tag);
1983 $sth->execute(@a);
1984 while (my ($t,$tag) = $sth->fetchrow_array) {
1985 $report_tag ||= $tag;
1986 push @t,$t;
1989 my %bins;
1990 my $sql = <<END;
1991 SELECT bin,cum_count
1992 FROM $interval_stats
1993 WHERE typeid=?
1994 AND seqid=? AND bin >= ?
1995 LIMIT 1
1998 $sth = $self->_prepare($sql);
2000 eval {
2001 for my $typeid (@t) {
2003 for (my $i=0;$i<@sum_bin_array;$i++) {
2005 my @args = ($typeid,$seqid,$sum_bin_array[$i]);
2006 $self->_print_query($sql,@args) if $self->debug;
2008 $sth->execute(@args) or $self->throw($sth->errstr);
2009 my ($bin,$cum_count) = $sth->fetchrow_array;
2010 push @{$bins{$typeid}},[$bin,$cum_count];
2014 return unless %bins;
2016 my @merged_bins;
2017 my $firstbin = int(($start-1)/$binsize);
2018 for my $type (keys %bins) {
2019 my $arry = $bins{$type};
2020 my $last_count = $arry->[0][1];
2021 my $last_bin = -1;
2022 my $i = 0;
2023 my $delta;
2024 for my $b (@$arry) {
2025 my ($bin,$count) = @$b;
2026 $delta = $count - $last_count if $bin > $last_bin;
2027 $merged_bins[$i++] += $delta;
2028 $last_count = $count;
2029 $last_bin = $bin;
2033 return wantarray ? (\@merged_bins,$report_tag) : \@merged_bins;
2036 sub build_summary_statistics {
2037 my $self = shift;
2038 my $interval_stats = $self->_interval_stats_table;
2039 my $dbh = $self->dbh;
2040 $self->begin_work;
2042 my $sbs = SUMMARY_BIN_SIZE;
2044 my $result = eval {
2045 $self->_add_interval_stats_table;
2046 $self->_disable_keys($dbh,$interval_stats);
2047 $dbh->do("DELETE FROM $interval_stats");
2049 my $insert = $dbh->prepare(<<END) or $self->throw($dbh->errstr);
2050 INSERT INTO $interval_stats
2051 (typeid,seqid,bin,cum_count)
2052 VALUES (?,?,?,?)
2055 my $sql = $self->_fetch_indexed_features_sql;
2056 my $select = $dbh->prepare($sql) or $self->throw($dbh->errstr);
2058 my $current_bin = -1;
2059 my ($current_type,$current_seqid,$count);
2060 my $cum_count = 0;
2061 my (%residuals,$last_bin);
2063 my $le = -t \*STDERR ? "\r" : "\n";
2065 print STDERR "\n";
2066 $select->execute;
2068 while (my($typeid,$seqid,$start,$end) = $select->fetchrow_array) {
2069 print STDERR $count," features processed$le" if ++$count % 1000 == 0;
2071 my $bin = int($start/$sbs);
2072 $current_type ||= $typeid;
2073 $current_seqid ||= $seqid;
2075 # because the input is sorted by start, no more features will contribute to the
2076 # current bin so we can dispose of it
2077 if ($bin != $current_bin) {
2078 if ($seqid != $current_seqid or $typeid != $current_type) {
2079 # load all bins left over
2080 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
2081 %residuals = () ;
2082 $cum_count = 0;
2083 } else {
2084 # load all up to current one
2085 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid,$current_bin);
2089 $last_bin = $current_bin;
2090 ($current_seqid,$current_type,$current_bin) = ($seqid,$typeid,$bin);
2092 # summarize across entire spanned region
2093 my $last_bin = int(($end-1)/$sbs);
2094 for (my $b=$bin;$b<=$last_bin;$b++) {
2095 $residuals{$b}++;
2098 # handle tail case
2099 # load all bins left over
2100 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
2101 $self->_enable_keys($dbh,$interval_stats);
2105 if ($result) { $self->commit } else { warn "Can't build summary statistics: $@"; $self->rollback };
2106 print STDERR "\n";
2109 sub _load_bins {
2110 my $self = shift;
2111 my ($insert,$residuals,$cum_count,$type,$seqid,$stop_after) = @_;
2112 for my $b (sort {$a<=>$b} keys %$residuals) {
2113 last if defined $stop_after and $b > $stop_after;
2114 $$cum_count += $residuals->{$b};
2115 my @args = ($type,$seqid,$b,$$cum_count);
2116 $insert->execute(@args);
2117 delete $residuals->{$b}; # no longer needed
2121 sub _add_interval_stats_table {
2122 my $self = shift;
2123 my $tables = $self->table_definitions;
2124 my $interval_stats = $self->_interval_stats_table;
2125 $self->dbh->do("CREATE TABLE IF NOT EXISTS $interval_stats $tables->{interval_stats}");
2128 sub _fetch_indexed_features_sql {
2129 my $self = shift;
2130 my $features = $self->_feature_table;
2131 return <<END;
2132 SELECT typeid,seqid,start-1,end
2133 FROM $features as f
2134 WHERE f.indexed=1
2135 ORDER BY typeid,seqid,start
2139 sub _disable_keys {
2140 my $self = shift;
2141 my ($dbh,$table) = @_;
2142 $dbh->do("ALTER TABLE $table DISABLE KEYS");
2144 sub _enable_keys {
2145 my $self = shift;
2146 my ($dbh,$table) = @_;
2147 $dbh->do("ALTER TABLE $table ENABLE KEYS");
2150 sub time {
2151 return Time::HiRes::time() if Time::HiRes->can('time');
2152 return time();
2155 sub DESTROY {
2156 my $self = shift;
2157 if ($self->{bulk_update_in_progress}) { # be sure to remove temp files
2158 for my $table ($self->_feature_table,$self->index_tables) {
2159 my $path = $self->dump_path($table);
2160 unlink $path;
2165 sub begin_work {
2166 my $self = shift;
2167 return if $self->{_in_transaction}++;
2168 my $dbh = $self->dbh;
2169 return unless $dbh->{AutoCommit};
2170 $dbh->begin_work;
2173 sub commit {
2174 my $self = shift;
2175 return unless $self->{_in_transaction};
2176 delete $self->{_in_transaction};
2177 $self->dbh->commit;
2180 sub rollback {
2181 my $self = shift;
2182 return unless $self->{_in_transaction};
2183 delete $self->{_in_transaction};
2184 $self->dbh->rollback;