Bio::DB::SeqFeature::* move namespace into its own distribution.
[bioperl-live.git] / lib / Bio / DB / GFF / Adaptor / dbi / pg.pm
blobd1784409f28c322f34494b507f5d3d75ba08b119
1 package Bio::DB::GFF::Adaptor::dbi::pg;
3 =head1 NAME
5 Bio::DB::GFF::Adaptor::dbi::pg -- Database adaptor for a specific postgres schema
7 =head1 NOTES
9 SQL commands that need to be executed before this adaptor will work:
11 CREATE DATABASE <dbname>;
13 Also, select permission needs to be granted for each table in the
14 database to the owner of the httpd process (usually 'nobody', but
15 for some RedHat systems it is 'apache') if this adaptor is to be used
16 with the Generic Genome Browser (gbrowse):
18 CREATE USER nobody;
19 GRANT SELECT ON TABLE fmeta TO nobody;
20 GRANT SELECT ON TABLE fgroup TO nobody;
21 GRANT SELECT ON TABLE fdata TO nobody;
22 GRANT SELECT ON TABLE fattribute_to_feature TO nobody;
23 GRANT SELECT ON TABLE fdna TO nobody;
24 GRANT SELECT ON TABLE fattribute TO nobody;
25 GRANT SELECT ON TABLE ftype TO nobody;
27 =head2 Optimizing the database
29 PostgreSQL generally requires some tuning before you get very good
30 performance for large databases. For general information on tuning
31 a PostgreSQL server, see http://www.varlena.com/GeneralBits/Tidbits/perf.html
32 Of particular importance is executing VACUUM FULL ANALYZE whenever
33 you change the database.
35 Additionally, for a GFF database, there are a few items you can tune.
36 For each automatic class in your GBrowse conf file, there will be one
37 or two searches done when searching for a feature. If there are lots
38 of features, these search can take several seconds. To speed these searches,
39 do two things:
41 =over
43 =item 1
45 Set 'enable_seqscan = false' in your postgresql.conf file (and restart
46 your server).
48 =item 2
50 Create 'partial' indexes for each automatic class, doing this for the
51 example class 'Allele':
53 CREATE INDEX partial_allele_gclass ON
54 fgroup (lower('gname')) WHERE gclass='Allele';
56 And be sure to run VACUUM FULL ANALYZE after creating the indexes.
58 =back
60 =cut
62 # a simple postgres adaptor
63 use strict;
64 use Bio::DB::GFF::Util::Binning;
65 use Bio::DB::GFF::Util::Rearrange; # for rearrange()
66 use base qw(Bio::DB::GFF::Adaptor::dbi);
68 use constant MAX_SEGMENT => 100_000_000; # the largest a segment can get
69 use constant DEFAULT_CHUNK => 2000;
71 use constant GETSEQCOORDS =><<END;
72 SELECT fref,
73 COALESCE(gclass,'Sequence'),
74 min(fstart),
75 max(fstop),
76 fstrand,
77 gname
78 FROM fdata,fgroup
79 WHERE lower(fgroup.gname) = lower(?)
80 AND fgroup.gclass=?
81 AND fgroup.gid=fdata.gid
82 GROUP BY fref,fstrand,gclass,gname
83 END
86 use constant GETALIASCOORDS =><<END;
87 SELECT fref,
88 COALESCE(gclass,'Sequence'),
89 min(fstart),
90 max(fstop),
91 fstrand,
92 gname
93 FROM fdata,fgroup,fattribute,fattribute_to_feature
94 WHERE lower(fattribute_to_feature.fattribute_value)=lower(?)
95 AND fgroup.gclass=?
96 AND fgroup.gid=fdata.gid
97 AND fattribute.fattribute_name='Alias'
98 AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id
99 AND fattribute_to_feature.fid=fdata.fid
100 GROUP BY fref,fstrand,gclass,gname
104 use constant GETALIASLIKE =><<END;
105 SELECT fref,
106 COALESCE(gclass,'Sequence'),
107 min(fstart),
108 max(fstop),
109 fstrand,
110 gname
111 FROM fdata,fgroup,fattribute,fattribute_to_feature
112 WHERE lower(fattribute_to_feature.fattribute_value) LIKE lower(?)
113 AND fgroup.gclass=?
114 AND fgroup.gid=fdata.gid
115 AND fattribute.fattribute_name='Alias'
116 AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id
117 AND fattribute_to_feature.fid=fdata.fid
118 GROUP BY fref,fstrand,gname
123 use constant GETFORCEDSEQCOORDS =><<END;
124 SELECT fref,
125 COALESCE(gclass,'Sequence'),
126 min(fstart),
127 max(fstop),
128 fstrand
129 FROM fdata,fgroup
130 WHERE lower(fgroup.gname) = lower(?)
131 AND fgroup.gclass=?
132 AND lower(fdata.fref) = lower(?)
133 AND fgroup.gid=fdata.gid
134 GROUP BY fref,fstrand,gclass
138 use constant FULLTEXTWILDCARD => <<END;
139 SELECT distinct gclass,gname,fattribute_value
140 FROM fgroup,fattribute_to_feature,fdata
141 WHERE fgroup.gid=fdata.gid
142 AND fdata.fid=fattribute_to_feature.fid
143 AND lower(fattribute_to_feature.fattribute_value) LIKE lower(?)
147 ########################
148 # moved from mysqlopt.pm
149 ########################
151 # this is the largest that any reference sequence can be (100 megabases)
152 use constant MAX_BIN => 100_000_000;
154 # this is the smallest bin (1 K)
155 use constant MIN_BIN => 1000;
157 # size of range over which it is faster to force mysql to use the range for indexing
158 use constant STRAIGHT_JOIN_LIMIT => 200_000;
160 ##############################################################################
162 =head1 DESCRIPTION
164 This adaptor implements a specific postgres database schema that is
165 compatible with Bio::DB::GFF. It inherits from
166 Bio::DB::GFF::Adaptor::dbi, which itself inherits from Bio::DB::GFF.
168 The schema uses several tables:
170 =over 4
172 =item fdata
174 This is the feature data table. Its columns are:
176 fid feature ID (integer)
177 fref reference sequence name (string)
178 fstart start position relative to reference (integer)
179 fstop stop position relative to reference (integer)
180 ftypeid feature type ID (integer)
181 fscore feature score (float); may be null
182 fstrand strand; one of "+" or "-"; may be null
183 fphase phase; one of 0, 1 or 2; may be null
184 gid group ID (integer)
185 ftarget_start for similarity features, the target start position (integer)
186 ftarget_stop for similarity features, the target stop position (integer)
188 Note that it would be desirable to normalize the reference sequence
189 name, since there are usually many features that share the same
190 reference feature. However, in the current schema, query performance
191 suffers dramatically when this additional join is added.
193 =item fgroup
195 This is the group table. There is one row for each group. Columns:
197 gid the group ID (integer)
198 gclass the class of the group (string)
199 gname the name of the group (string)
201 The group table serves multiple purposes. As you might expect, it is
202 used to cluster features that logically belong together, such as the
203 multiple exons of the same transcript. It is also used to assign a
204 name and class to a singleton feature. Finally, the group table is
205 used to identify the target of a similarity hit. This is consistent
206 with the way in which the group field is used in the GFF version 2
207 format.
209 The fgroup.gid field joins with the fdata.gid field.
211 Examples:
213 sql> select * from fgroup where gname='sjj_2L52.1';
214 +-------+-------------+------------+
215 | gid | gclass | gname |
216 +-------+-------------+------------+
217 | 69736 | PCR_product | sjj_2L52.1 |
218 +-------+-------------+------------+
219 1 row in set (0.70 sec)
221 sql> select fref,fstart,fstop from fdata,fgroup
222 where gclass='PCR_product' and gname = 'sjj_2L52.1'
223 and fdata.gid=fgroup.gid;
224 +---------------+--------+-------+
225 | fref | fstart | fstop |
226 +---------------+--------+-------+
227 | CHROMOSOME_II | 1586 | 2355 |
228 +---------------+--------+-------+
229 1 row in set (0.03 sec)
231 =item ftype
233 This table contains the feature types, one per row. Columns are:
235 ftypeid the feature type ID (integer)
236 fmethod the feature type method name (string)
237 fsource the feature type source name (string)
239 The ftype.ftypeid field joins with the fdata.ftypeid field. Example:
241 sql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype
242 where gclass='PCR_product'
243 and gname = 'sjj_2L52.1'
244 and fdata.gid=fgroup.gid
245 and fdata.ftypeid=ftype.ftypeid;
246 +---------------+--------+-------+-------------+-----------+
247 | fref | fstart | fstop | fmethod | fsource |
248 +---------------+--------+-------+-------------+-----------+
249 | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs |
250 +---------------+--------+-------+-------------+-----------+
251 1 row in set (0.08 sec)
253 =item fdna
255 This table holds the raw DNA of the reference sequences. It has three
256 columns:
258 fref reference sequence name (string)
259 foffset offset of this sequence
260 fdna the DNA sequence (longblob)
262 To overcome problems loading large blobs, DNA is automatically
263 fragmented into multiple segments when loading, and the position of
264 each segment is stored in foffset. The fragment size is controlled by
265 the -clump_size argument during initialization.
267 =item fattribute_to_feature
269 This table holds "attributes", which are tag/value pairs stuffed into
270 the GFF line. The first tag/value pair is treated as the group, and
271 anything else is treated as an attribute (weird, huh?).
273 CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B"
274 CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B"
276 The columns of this table are:
278 fid feature ID (integer)
279 fattribute_id ID of the attribute (integer)
280 fattribute_value text of the attribute (text)
282 The fdata.fid column joins with fattribute_to_feature.fid.
284 =item fattribute
286 This table holds the normalized names of the attributes. Fields are:
288 fattribute_id ID of the attribute (integer)
289 fattribute_name Name of the attribute (varchar)
291 =back
293 =head2 Data Loading Methods
295 In addition to implementing the abstract SQL-generating methods of
296 Bio::DB::GFF::Adaptor::dbi, this module also implements the data
297 loading functionality of Bio::DB::GFF.
299 =cut
302 =head2 new
304 Title : new
305 Usage : $db = Bio::DB::GFF->new(@args)
306 Function: create a new adaptor
307 Returns : a Bio::DB::GFF object
308 Args : see below
309 Status : Public
311 The new constructor is identical to the "dbi" adaptor's new() method,
312 except that the prefix "dbi:pg" is added to the database DSN identifier
313 automatically if it is not there already.
315 Argument Description
316 -------- -----------
318 -dsn the DBI data source, e.g. 'dbi:Pg:dbname=:ens0040' or "ens0040"
320 -user username for authentication
322 -pass the password for authentication
324 =cut
328 sub new {
329 my $class = shift;
330 my ($dsn,$other) = rearrange([
331 [qw(FEATUREDB DB DSN)],
332 ],@_);
333 $dsn = "dbi:Pg:dbname=$dsn" if !ref($dsn) && $dsn !~ /^(dbi|DBI):/;
334 my $self = $class->SUPER::new(-dsn=>$dsn,%$other);
335 $self;
338 =head2 schema
340 Title : schema
341 Usage : $schema = $db->schema
342 Function: return the CREATE script for the schema
343 Returns : a list of CREATE statemetns
344 Args : none
345 Status : protected
347 This method returns a list containing the various CREATE statements
348 needed to initialize the database tables.
350 =cut
352 sub schema {
353 my %schema = (
354 fdata =>{
355 table=> q{
356 CREATE TABLE "fdata" (
357 "fid" serial NOT NULL,
358 "fref" character varying(100) DEFAULT '' NOT NULL,
359 "fstart" integer DEFAULT '0' NOT NULL,
360 "fstop" integer DEFAULT '0' NOT NULL,
361 "fbin" double precision DEFAULT '0.000000' NOT NULL,
362 "ftypeid" integer DEFAULT '0' NOT NULL,
363 "fscore" double precision DEFAULT NULL,
364 "fstrand" character varying(3) DEFAULT NULL,
365 "fphase" character varying(3) DEFAULT NULL,
366 "gid" integer DEFAULT '0' NOT NULL,
367 "ftarget_start" integer DEFAULT NULL,
368 "ftarget_stop" integer DEFAULT NULL,
369 CONSTRAINT chk_fdata_fstrand CHECK (fstrand IN ('+','-')),
370 CONSTRAINT chk_fdata_fphase CHECK (fphase IN ('0','1','2')),
371 CONSTRAINT pk_fdata PRIMARY KEY (fid)
373 }, # fdata table
375 #CONSTRAINT fref_fdata UNIQUE (fref, fbin, fstart, fstop, ftypeid, gid)
376 # fdata_fref_idx => q{ CREATE UNIQUE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid)},
378 index=>{
379 fdata_fref_idx => q{
380 CREATE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid)
383 fdata_ftypeid_idx => q{
384 CREATE INDEX fdata_ftypeid_idx ON fdata (ftypeid)
387 fdata_gid_idx => q{
388 CREATE INDEX fdata_gid_idx ON fdata (gid)
390 }, # fdata indexes
392 }, # fdata
396 fgroup => {
397 table => q{
398 CREATE TABLE "fgroup" (
399 "gid" serial NOT NULL,
400 "gclass" character varying(100) DEFAULT NULL,
401 "gname" character varying(100) DEFAULT NULL,
402 CONSTRAINT pk_fgroup PRIMARY KEY (gid)
404 }, # fgroup table
406 index => {
407 fgroup_gclass_idx => q{
408 CREATE UNIQUE INDEX fgroup_gclass_idx ON fgroup (gclass,gname)
410 fgroup_gname_idx => q{
411 CREATE INDEX fgroup_gname_idx ON fgroup(gname)
413 fgroup_lower_gname_idx => q{
414 CREATE INDEX fgroup_lower_gname_idx ON fgroup (lower(gname))
416 }, # fgroup indexes
418 }, # fgroup
420 ftype => {
421 table => q{
422 CREATE TABLE "ftype" (
423 "ftypeid" serial NOT NULL,
424 "fmethod" character varying(100) DEFAULT '' NOT NULL,
425 "fsource" character varying(100) DEFAULT NULL,
426 CONSTRAINT pk_ftype PRIMARY KEY (ftypeid),
427 CONSTRAINT ftype_ftype UNIQUE (fmethod, fsource)
429 }, # ftype table
431 index => {
432 ftype_fmethod_idx => q{
433 CREATE INDEX ftype_fmethod_idx ON ftype (fmethod)
436 ftype_fsource_idx => q{
437 CREATE INDEX ftype_fsource_idx ON ftype (fsource)
440 ftype_ftype_idx => q{
441 CREATE UNIQUE INDEX ftype_ftype_idx ON ftype (fmethod,fsource)
443 }, # ftype indexes
445 }, # ftype
448 fdna => {
449 table => q{
450 CREATE TABLE "fdna" (
451 "fref" character varying(100) DEFAULT '' NOT NULL,
452 "foffset" integer DEFAULT '0' NOT NULL,
453 "fdna" bytea,
454 CONSTRAINT pk_fdna PRIMARY KEY (fref, foffset)
456 } #fdna table
457 }, #fdna
459 fmeta => {
460 table => q{
461 CREATE TABLE "fmeta" (
462 "fname" character varying(255) DEFAULT '' NOT NULL,
463 "fvalue" character varying(255) DEFAULT '' NOT NULL,
464 CONSTRAINT pk_fmeta PRIMARY KEY (fname)
466 } # fmeta table
467 }, # fmeta
470 fattribute => {
471 table => q{
472 CREATE TABLE "fattribute" (
473 "fattribute_id" serial NOT NULL,
474 "fattribute_name" character varying(255) DEFAULT '' NOT NULL,
475 CONSTRAINT pk_fattribute PRIMARY KEY (fattribute_id)
477 }, # fattribute table
479 }, # fattribute
481 fattribute_to_feature => {
482 table => q{
483 CREATE TABLE "fattribute_to_feature" (
484 "fid" integer DEFAULT '0' NOT NULL,
485 "fattribute_id" integer DEFAULT '0' NOT NULL,
486 "fattribute_value" text
488 }, # fattribute_to_feature table
490 index => {
491 fattribute_to_feature_fid => q{
492 CREATE INDEX fattribute_to_feature_fid ON fattribute_to_feature (fid,fattribute_id)
494 fattribute_txt_idx => q{
495 CREATE INDEX fattribute_txt_idx ON fattribute_to_feature (fattribute_value)
497 fattribute_lower_idx => q{
498 CREATE INDEX fattribute_lower_idx ON fattribute_to_feature (lower(fattribute_value))
500 } # fattribute_to_feature indexes
501 }, # fattribute_to_feature
503 finterval_stats => {
504 table=> q{
505 CREATE TABLE "finterval_stats" (
506 "ftypeid" integer DEFAULT '0' NOT NULL,
507 "fref" character varying(100) DEFAULT '' NOT NULL,
508 "fbin" integer DEFAULT '0' NOT NULL,
509 "fcum_count" integer DEFAULT '0' NOT NULL,
510 CONSTRAINT pk_finterval_stats PRIMARY KEY (ftypeid,fref,fbin)
512 } # finterval_stats table
513 },# finterval_stats
519 return \%schema;
523 =head2 setup_load
525 Title : setup_load
526 Usage : $db->setup_load
527 Function: called before load_gff_line()
528 Returns : void
529 Args : none
530 Status : protected
532 This method performs schema-specific initialization prior to loading a
533 set of GFF records. It prepares a set of DBI statement handlers to be
534 used in loading the data.
536 =cut
538 sub setup_load {
539 my $self = shift;
540 my $schema = $self->schema;
542 my $dbh = $self->features_db;
544 if ($self->lock_on_load) {
545 my @tables = map { "$_ WRITE"} $self->tables;
546 my $tables = join ', ',@tables;
547 $dbh->do("LOCK TABLES $tables");
550 my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?');
551 my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)');
552 my $insertid_type = $dbh->prepare_delayed("SELECT currval('ftype_ftypeid_seq')");
554 my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE lower(gname)=lower(?) AND gclass=?');
555 my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)');
556 my $insertid_group = $dbh->prepare_delayed("SELECT currval('fgroup_gid_seq')");
558 my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?');
559 my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)');
560 my $insertid_attribute = $dbh->prepare_delayed("SELECT currval('fattribute_fattribute_id_seq')");
562 my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)');
564 my $insert_data = $dbh->prepare_delayed(<<END);
565 INSERT INTO fdata (fref,fstart,fstop,fbin,ftypeid,fscore,
566 fstrand,fphase,gid,ftarget_start,ftarget_stop)
567 VALUES(?,?,?,?,?,?,?,?,?,?,?)
570 my $delete_existing_data = $dbh->prepare_delayed('DELETE FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND fbin=? AND ftypeid=? AND GID=?');
571 my $insertid_data = $dbh->prepare_delayed("SELECT currval('fdata_fid_seq')");
573 $self->{load_stuff}{sth}{lookup_ftype} = $lookup_type;
574 $self->{load_stuff}{sth}{insert_ftype} = $insert_type;
575 $self->{load_stuff}{sth}{insertid_ftype} = $insertid_type;
576 $self->{load_stuff}{sth}{lookup_fgroup} = $lookup_group;
577 $self->{load_stuff}{sth}{insert_fgroup} = $insert_group;
578 $self->{load_stuff}{sth}{insertid_fgroup} = $insertid_group;
579 $self->{load_stuff}{sth}{insertid_fdata} = $insertid_data;
580 $self->{load_stuff}{sth}{insert_fdata} = $insert_data;
581 $self->{load_stuff}{sth}{delete_existing_fdata} = $delete_existing_data;
582 $self->{load_stuff}{sth}{lookup_fattribute} = $lookup_attribute;
583 $self->{load_stuff}{sth}{insert_fattribute} = $insert_attribute;
584 $self->{load_stuff}{sth}{insertid_fattribute} = $insertid_attribute;
585 $self->{load_stuff}{sth}{insert_fattribute_value} = $insert_attribute_value;
586 $self->{load_stuff}{types} = {};
587 $self->{load_stuff}{groups} = {};
588 $self->{load_stuff}{counter} = 0;
591 =head2 load_gff_line
593 Title : load_gff_line
594 Usage : $db->load_gff_line($fields)
595 Function: called to load one parsed line of GFF
596 Returns : true if successfully inserted
597 Args : hashref containing GFF fields
598 Status : protected
600 This method is called once per line of the GFF and passed a series of
601 parsed data items that are stored into the hashref $fields. The keys are:
603 ref reference sequence
604 source annotation source
605 method annotation method
606 start annotation start
607 stop annotation stop
608 score annotation score (may be undef)
609 strand annotation strand (may be undef)
610 phase annotation phase (may be undef)
611 group_class class of annotation's group (may be undef)
612 group_name ID of annotation's group (may be undef)
613 target_start start of target of a similarity hit
614 target_stop stop of target of a similarity hit
615 attributes array reference of attributes, each of which is a [tag=>value] array ref
617 =cut
619 sub load_gff_line {
620 my $self = shift;
621 my $gff = shift;
623 if (defined $gff->{phase}){
624 chomp($gff->{phase});
625 undef($gff->{phase}) if $gff->{phase} eq '.';
628 if (defined $gff->{strand} && $gff->{strand} eq '.'){undef($gff->{strand})};
629 if (defined $gff->{score} && $gff->{score} eq '.'){undef($gff->{score})};
631 my $s = $self->{load_stuff};
632 my $dbh = $self->features_db;
633 local $dbh->{PrintError} = 0;
635 defined(my $typeid = $self->get_table_id('ftype', $gff->{method} => $gff->{source})) or return;
636 defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname} => $gff->{gclass})) or return;
638 my $bin = bin($gff->{start},$gff->{stop},$self->min_bin);
639 my $result = $s->{sth}{insert_fdata}->execute($gff->{ref},
640 $gff->{start},$gff->{stop},$bin,
641 $typeid,
642 $gff->{score},$gff->{strand},$gff->{phase},
643 $groupid,
644 $gff->{tstart},$gff->{tstop});
646 warn $dbh->errstr,"\n" and print "ref=",$gff->{ref}," start=",$gff->{start}," stop=",$gff->{stop}," bin=",$bin," typeid=",$typeid," groupid=",$groupid,"\n"
647 and return unless $result;
649 my $fid = $self->insertid($s->{sth},'fdata')
650 || $self->get_feature_id($gff->{ref},$gff->{start},$gff->{stop},$typeid,$groupid);
652 # insert attributes
653 foreach (@{$gff->{attributes}}) {
654 defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return;
655 $s->{sth}{insert_fattribute_value}->execute($fid,$attribute_id,$_->[1]);
658 if ( (++$s->{counter} % 1000) == 0) {
659 print STDERR "$s->{counter} records loaded...";
660 print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
663 $fid;
667 sub insertid {
668 my $self = shift;
669 my $sth = shift ;
670 my $table = shift;
672 my $insert_id;
673 if ($sth->{"insertid_$table"}->execute()){
674 $insert_id = ($sth->{"insertid_$table"}->fetchrow_array)[0];
676 else{
677 warn "No CURRVAL for SEQUENCE of table $table ",$sth->errstr,"\n";
678 return;
680 return $insert_id;
684 =head2 get_table_id
686 Title : get_table_id
687 Usage : $integer = $db->get_table_id($table,@ids)
688 Function: get the ID of a group or type
689 Returns : an integer ID or undef
690 Args : none
691 Status : private
693 This internal method is called by load_gff_line to look up the integer
694 ID of an existing feature type or group. The arguments are the name
695 of the table, and two string identifiers. For feature types, the
696 identifiers are the method and source. For groups, the identifiers
697 are group name and class.
699 This method requires that a statement handler named I<lookup_$table>,
700 have been created previously by setup_load(). It is here to overcome
701 deficiencies in mysql's INSERT syntax.
703 =cut
706 # get the object ID from a named table
707 sub get_table_id {
708 my $self = shift;
709 my $table = shift;
710 my @ids = @_;
712 # irritating warning for null id
713 my $id_key;
715 local $^W=0;
716 $id_key = join ':',@ids;
719 my $s = $self->{load_stuff};
720 my $sth = $s->{sth};
721 my $dbh = $self->features_db;
723 unless (defined($s->{$table}{$id_key})) {
724 $sth->{"lookup_$table"}->execute(@ids);
725 my @result = $sth->{"lookup_$table"}->fetchrow_array;
726 if (@result > 0) {
727 $s->{$table}{$id_key} = $result[0];
728 } else {
729 $sth->{"insert_$table"}->execute(@ids)
730 && ($s->{$table}{$id_key} = $self->insertid($sth,$table));
731 #&& ($s->{$table}{$id_key} = $self->insertid($sth->{"insertid_$table"}));
732 #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid);
736 my $id = $s->{$table}{$id_key};
737 unless (defined $id) {
738 warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n";
739 return;
741 $id;
745 #sub insertid {
746 # my $self = shift;
747 # my $insertid_sth = shift ;
748 # my $insert_id;
749 # if ($insertid_sth->execute){
750 # $insert_id = ($insertid_sth->fetchrow_array)[0];
752 # else{
753 # warn "No CURRVAL for SEQUENCE ",$insertid_sth->errstr,"\n";
754 # return;
756 # return $insert_id;
759 sub insert_sequence {
760 my $self = shift;
761 my($id,$offset,$seq) = @_;
762 my $sth = $self->{_insert_sequence}
763 ||= $self->dbh->prepare_delayed('insert into fdna values (?,?,?)');
764 $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr);
767 =head2 range_query
769 Title : range_query
770 Usage : $db->range_query($range_type,$refseq,$refclass,$start,$stop,$types,$order_by_group,$attributes,$binsize)
771 Function: create statement handle for range/overlap queries
772 Returns : a DBI statement handle
773 Args : see below
774 Status : Protected
776 This method constructs the statement handle for this module's central
777 query: given a range and/or a list of feature types, fetch their GFF
778 records. It overrides a method in dbi.pm so that the overlaps query
779 can write SQL optimized for Postgres. Specifically, instead of writing
780 the bin related section as a set of ORs, each bin piece is place in
781 a separate select and then they are UNIONed together. This subroutine
782 requires several replacements for other subroutines in dbi.pm. In this
783 module, they are named the same as those in dbi.pm but prefixed with
784 "pg_".
786 The positional arguments are as follows:
788 Argument Description
790 $isrange A flag indicating that this is a range.
791 query. Otherwise an overlap query is
792 assumed.
794 $refseq The reference sequence name (undef if no range).
796 $refclass The reference sequence class (undef if no range).
798 $start The start of the range (undef if none).
800 $stop The stop of the range (undef if none).
802 $types Array ref containing zero or feature types in the
803 format [method,source].
805 $order_by_group A flag indicating that statement handler should group
806 the features by group id (handy for iterative fetches)
808 $attributes A hash containing select attributes.
810 $binsize A bin size for generating tables of feature density.
812 =cut
814 sub range_query {
815 my $self = shift;
816 my($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$bin) = @_;
818 my $dbh = $self->features_db;
820 # my @bin_parts = split /\n\s+OR/, $self->bin_query($start,$stop);
821 # warn "bin_part: @bin_parts\n";
823 my %a = (refseq=>$refseq,class=>$class,start=>$start,stop=>$stop,types=>$types,attributes=>$attributes,bin_width=>$bin);
824 my ($query, @args, $order_by);
826 if ($rangetype ne 'overlaps') {
828 my $select = $self->make_features_select_part(\%a);
829 my $from = $self->make_features_from_part($sparse,\%a);
830 my $join = $self->make_features_join_part(\%a);
831 my $where;
832 ($where,@args) = $self->make_features_by_range_where_part($rangetype,\%a);
833 my ($group_by,@more_args) = $self->make_features_group_by_part(\%a);
834 $order_by = $self->make_features_order_by_part(\%a) if $order_by_group;
836 $query = "SELECT $select FROM $from WHERE $join";
837 $query .= " AND $where" if $where;
839 if ($group_by) {
840 $query .= " GROUP BY $group_by";
841 push @args,@more_args;
844 } else { # most common case: overlaps query
846 my @bin_parts = split /\s*OR/, $self->bin_query($start,$stop);
847 my $select = $self->make_features_select_part(\%a);
848 my $from = $self->make_features_from_part($sparse,\%a);
849 my $join = $self->make_features_join_part(\%a);
850 my $where;
851 ($where,@args) = $self->pg_make_features_by_range_where_part($rangetype,\%a);
852 my ($group_by,@more_args)= $self->make_features_group_by_part(\%a);
853 $order_by = $self->pg_make_features_order_by_part(\%a) if $order_by_group;
855 my @temp_args;
856 my @query_pieces;
857 foreach my $bin (@bin_parts) {
858 my $temp_query = "SELECT $select FROM $from WHERE $join AND $where AND $bin\n";
859 push @temp_args, @args;
861 if ($group_by) {
862 $temp_query .= " GROUP BY $group_by";
863 push @temp_args,@more_args;
866 push @query_pieces, $temp_query;
869 @args = @temp_args;
870 $query = join("UNION\n", @query_pieces);
874 $query .= " ORDER BY $order_by" if $order_by;
876 $self->dbh->do('set enable_seqscan=off');
877 my $sth = $self->dbh->do_query($query,@args);
878 $sth;
881 sub pg_make_features_by_range_where_part {
882 my $self = shift;
883 my ($rangetype,$options) = @_;
885 return unless $rangetype eq 'overlaps';
887 $options ||= {};
888 my ($refseq,$class,$start,$stop,$types,$attributes) =
889 @{$options}{qw(refseq class start stop types attributes)};
891 my (@query,@args);
893 if ($refseq) {
894 my ($q,@a) = $self->refseq_query($refseq,$class);
895 push @query,$q;
896 push @args,@a;
899 if (defined $start or defined $stop) {
900 $start = 0 unless defined($start);
901 $stop = MAX_SEGMENT unless defined($stop);
903 my ($range_query,@range_args) = $self->pg_overlap_query($start,$stop);
905 push @query,$range_query;
906 push @args,@range_args;
909 if (defined $types && @$types) {
910 my ($type_query,@type_args) = $self->types_query($types);
911 push @query,$type_query;
912 push @args,@type_args;
915 if ($attributes) {
916 my ($attribute_query,@attribute_args) = $self->make_features_by_attribute_where_part($attributes);
917 push @query,"($attribute_query)";
918 push @args,@attribute_args;
921 my $query = join "AND",@query;
922 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
925 sub pg_overlap_query {
926 my $self = shift;
927 my ($start,$stop) = @_;
929 my ($iq,@iargs) = $self->overlap_query_nobin($start,$stop);
930 my $query = "\n$iq\n";
931 my @args = @iargs;
933 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
936 sub pg_make_features_order_by_part {
937 my $self = shift;
938 my $options = shift || {};
939 return "gname";
942 =head2 search_notes
944 This PostgreSQL adaptor does not implement the search notes method
945 because it can be very slow (although the code for the method is
946 contained in this method but commented out).
947 There is, however, a PostgreSQL adaptor that does implement it in
948 a more efficient way: L<Bio::DB::GFF::Adaptor::dbi::pg_fts>,
949 which inherits from this adaptor and uses the optional PostgreSQL
950 module TSearch2 for full text indexing. See that adaptor's
951 documentation for more information.
953 See also L<Bio::DB::GFF>
955 Title : search_notes
956 Usage : @search_results = $db->search_notes("full text search string",$limit)
957 Function: Search the notes for a text string, using mysql full-text search
958 Returns : array of results
959 Args : full text search string, and an optional row limit
960 Status : public
962 This is a replacement for the mysql-specific method. Given a search string, it
963 performs a ILIKE search of the notes table and returns an array of results.
964 Each row of the returned array is a arrayref containing the following fields:
966 column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment()
967 column 2 The text of the note
968 column 3 A relevance score.
970 Note that for large databases this can be very slow and may result in
971 time out or 500-cgi errors. If this is happening on a regular basis,
972 you should look into using L<Bio::DB::GFF::Adaptor::dbi::pg_fts> which
973 implements the TSearch2 full text indexing scheme.
975 =cut
977 sub search_notes{
978 # my $self = shift;
979 # my ($search_string,$limit) = @_;
981 # $search_string =~ tr/*/%/s;
982 # $search_string = '%'.$search_string unless $search_string =~ /^\%/;
983 # $search_string = $search_string.'%' unless $search_string =~ /\%$/;
984 # warn "search_string:$search_string";
985 # my $query = FULLTEXTWILDCARD;
986 # $query .= " limit $limit" if defined $limit;
987 # my $sth = $self->dbh->do_query($query,$search_string);
989 # my @results;
990 # while (my ($class,$name,$note) = $sth->fetchrow_array) {
992 # next unless $class && $name; # sorry, ignore NULL objects
993 # my $featname = Bio::DB::GFF::Featname->new($class=>$name);
995 # push @results,[$featname,$note,0]; #gbrowse expects a score, but
996 # #pg doesn't give one, thus the 0
998 # warn @results;
1000 # return @results;
1004 =head2 make_meta_set_query
1006 Title : make_meta_set_query
1007 Usage : $sql = $db->make_meta_set_query
1008 Function: return SQL fragment for setting a meta parameter
1009 Returns : SQL fragment
1010 Args : none
1011 Status : public
1013 By default this does nothing; meta parameters are not stored or
1014 retrieved.
1016 =cut
1018 sub make_meta_set_query {
1019 return 'INSERT INTO fmeta VALUES (?,?)';
1022 sub make_classes_query {
1023 my $self = shift;
1024 return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT gclass IS NULL';
1028 sub chunk_size {
1029 my $self = shift;
1030 $self->meta('chunk_size') || DEFAULT_CHUNK;
1033 sub getseqcoords_query {
1034 my $self = shift;
1035 return GETSEQCOORDS ;
1038 sub getaliascoords_query{
1039 my $self = shift;
1040 return GETALIASCOORDS ;
1044 sub getforcedseqcoords_query{
1045 my $self = shift;
1046 return GETFORCEDSEQCOORDS ;
1050 sub getaliaslike_query{
1051 my $self = shift;
1052 return GETALIASLIKE ;
1056 sub make_features_select_part {
1057 my $self = shift;
1058 my $options = shift || {};
1059 my $s;
1060 if (my $b = $options->{bin_width}) {
1062 $s = <<END;
1063 fref,
1064 1+$b*floor(fstart/$b) as fstart,
1065 $b*(1+floor(fstart/$b)) as fstop,
1066 CASE WHEN fsource IS NULL THEN fmethod
1067 ELSE fmethod||':'||fsource,
1068 'bin',
1069 count(*) as fscore,
1070 '.','.','bin',
1071 CASE WHEN fsource IS NULL THEN fref||':'||fmethod
1072 ELSE fref||':'||fmethod||':'||fsource,
1073 NULL,NULL,NULL,NULL
1076 } else {
1077 $s = <<END;
1078 fref,fstart,fstop,fsource,fmethod,fscore,fstrand,fphase,gclass,fgroup.gname,ftarget_start,ftarget_stop,fdata.fid,fdata.gid
1082 $s .= ",count(fdata.fid)" if $options->{attributes} && keys %{$options->{attributes}}>1;
1086 sub make_features_from_part_bkup {
1087 my $self = shift;
1088 my $sparse = shift;
1089 my $options = shift || {};
1090 #my $index = $sparse ? ' USE INDEX(ftypeid)': '';
1091 my $index = '';
1092 return $options->{attributes} ? "fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n"
1093 : "fdata${index},ftype,fgroup\n";
1097 ####################################
1098 # moved from mysqlopt.pm
1099 ###################################
1100 # meta values
1101 sub default_meta_values {
1102 my $self = shift;
1103 my @values = $self->SUPER::default_meta_values;
1104 return (
1105 @values,
1106 max_bin => MAX_BIN,
1107 min_bin => MIN_BIN,
1108 straight_join_limit => STRAIGHT_JOIN_LIMIT,
1112 sub min_bin {
1113 my $self = shift;
1114 return $self->meta('min_bin') || MIN_BIN;
1116 sub max_bin {
1117 my $self = shift;
1118 return $self->meta('max_bin') || MAX_BIN;
1120 sub straight_join_limit {
1121 my $self = shift;
1122 return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT;
1126 sub _feature_by_name {
1127 my $self = shift;
1128 my ($class,$name,$location,$callback) = @_;
1129 $callback || $self->throw('must provide a callback argument');
1131 my @bin_parts = split /\s*OR/, $self->bin_query($location->[1],$location->[2]) if $location;
1132 my $select = $self->make_features_select_part;
1133 my $from = $self->make_features_from_part(undef,{sparse_groups=>1});
1134 my ($where,@args) = $self->make_features_by_name_where_part($class,$name);
1135 my $join = $self->make_features_join_part;
1136 my $range = $self->pg_make_features_by_range_where_part('overlaps',
1137 {refseq=>$location->[0],
1138 class =>'',
1139 start=>$location->[1],
1140 stop =>$location->[2]}) if $location;
1142 my @temp_args;
1143 my @query_pieces;
1144 my $query;
1145 if (@bin_parts) {
1146 foreach my $bin (@bin_parts) {
1147 my $temp_query = "SELECT $select FROM $from WHERE $join AND $where AND $range AND $bin\n";
1148 push @temp_args, @args;
1149 push @query_pieces, $temp_query;
1152 @args = @temp_args;
1153 $query = join("UNION\n", @query_pieces);
1155 } else {
1156 $query = "SELECT $select FROM $from WHERE $where AND $join";
1159 my $sth = $self->dbh->do_query($query,@args);
1161 my $count = 0;
1162 while (my @row = $sth->fetchrow_array) {
1163 $callback->(@row);
1164 $count++;
1166 $sth->finish;
1167 return $count;
1170 sub update_sequences {
1171 my $self = shift;
1172 my $dbh = $self->features_db;
1174 $dbh->do("SELECT setval('public.fdata_fid_seq', max(fid)+1) FROM fdata");
1175 $dbh->do("SELECT setval('public.fattribute_fattribute_id_seq', max(fattribute_id)+1) FROM fattribute");
1176 $dbh->do("SELECT setval('public.fgroup_gid_seq', max(gid)+1) FROM fgroup");
1177 $dbh->do("SELECT setval('public.ftype_ftypeid_seq', max(ftypeid)+1) FROM ftype");
1182 =head2 make_features_by_name_where_part
1184 Title : make_features_by_name_where_part
1185 Usage : $db->make_features_by_name_where_part
1186 Function: Overrides a function in Bio::DB::GFF::Adaptor::dbi to insure
1187 that searches will be case insensitive. It creates the SQL
1188 fragment needed to select a feature by its group name & class
1189 Returns : a SQL fragment and bind arguments
1190 Args : see below
1191 Status : Protected
1193 =cut
1195 sub make_features_by_name_where_part {
1196 my $self = shift;
1197 my ($class,$name) = @_;
1199 if ($name !~ /\*/) {
1200 #allows utilization of an index on lower(gname)
1201 return ("fgroup.gclass=? AND lower(fgroup.gname) = lower(?)",$class,$name);
1203 else {
1204 $name =~ tr/*/%/;
1205 return ("fgroup.gclass=? AND lower(fgroup.gname) LIKE lower(?)",$class,$name);
1210 # Methods from dbi.pm that need to be overridden to make
1211 # searching for fref case insensitive
1214 sub get_dna {
1215 my $self = shift;
1216 my ($ref,$start,$stop,$class) = @_;
1218 my ($offset_start,$offset_stop);
1220 my $has_start = defined $start;
1221 my $has_stop = defined $stop;
1223 my $reversed;
1224 if ($has_start && $has_stop && $start > $stop) {
1225 $reversed++;
1226 ($start,$stop) = ($stop,$start);
1229 # turn start and stop into 0-based offsets
1230 my $cs = $self->dna_chunk_size;
1231 $start -= 1; $stop -= 1;
1232 $offset_start = int($start/$cs)*$cs;
1233 $offset_stop = int($stop/$cs)*$cs;
1235 my $sth;
1236 # special case, get it all
1237 if (!($has_start || $has_stop)) {
1238 $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) order by foffset',$ref);
1241 elsif (!$has_stop) {
1242 $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) and foffset>=? order by foffset',
1243 $ref,$offset_start);
1246 else { # both start and stop defined
1247 $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) and foffset>=? and foffset<=? order by foffset',
1248 $ref,$offset_start,$offset_stop);
1251 my $dna = '';
1252 while (my($frag,$offset) = $sth->fetchrow_array) {
1253 substr($frag,0,$start-$offset) = '' if $has_start && $start > $offset;
1254 $dna .= $frag;
1256 substr($dna,$stop-$start+1) = '' if $has_stop && $stop-$start+1 < length($dna);
1257 if ($reversed) {
1258 $dna = reverse $dna;
1259 $dna =~ tr/gatcGATC/ctagCTAG/;
1262 $sth->finish;
1263 $dna;
1267 sub refseq_query {
1268 my $self = shift;
1269 my ($refseq,$refclass) = @_;
1270 my $query = "lower(fdata.fref)=lower(?)";
1271 return wantarray ? ($query,$refseq) : $self->dbh->dbi_quote($query,$refseq);
1274 sub make_types_where_part {
1275 my $self = shift;
1276 my ($srcseq,$start,$stop,$want_count,$typelist) = @_;
1277 my (@query,@args);
1278 if (defined($srcseq)) {
1279 push @query,'lower(fdata.fref)=lower(?)';
1280 push @args,$srcseq;
1281 if (defined $start or defined $stop) {
1282 $start = 1 unless defined $start;
1283 $stop = MAX_SEGMENT unless defined $stop;
1284 my ($q,@a) = $self->overlap_query($start,$stop);
1285 push @query,"($q)";
1286 push @args,@a;
1289 if (defined $typelist && @$typelist) {
1290 my ($q,@a) = $self->types_query($typelist);
1291 push @query,($q);
1292 push @args,@a;
1294 my $query = @query ? join(' AND ',@query) : '1=1';
1295 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
1298 sub get_feature_id {
1299 my $self = shift;
1300 my ($ref,$start,$stop,$typeid,$groupid) = @_;
1301 my $s = $self->{load_stuff};
1302 unless ($s->{get_feature_id}) {
1303 my $dbh = $self->features_db;
1304 $s->{get_feature_id} =
1305 $dbh->prepare_delayed('SELECT fid FROM fdata WHERE lower(fref)=lower(?) AND fstart=? AND fstop=? AND ftypeid=? AND gid=?');
1307 my $sth = $s->{get_feature_id} or return;
1308 $sth->execute($ref,$start,$stop,$typeid,$groupid) or return;
1309 my ($fid) = $sth->fetchrow_array;
1310 return $fid;
1313 sub _delete {
1314 my $self = shift;
1315 my $delete_spec = shift;
1316 my $ranges = $delete_spec->{segments} || [];
1317 my $types = $delete_spec->{types} || [];
1318 my $force = $delete_spec->{force};
1319 my $range_type = $delete_spec->{range_type};
1320 my $dbh = $self->features_db;
1322 my $query = 'delete from fdata';
1323 my @where;
1325 my @range_part;
1326 for my $segment (@$ranges) {
1327 my $ref = $dbh->quote($segment->abs_ref);
1328 my $start = $segment->abs_start;
1329 my $stop = $segment->abs_stop;
1330 my $range = $range_type eq 'overlaps' ? $self->overlap_query($start,$stop)
1331 : $range_type eq 'contains' ? $self->contains_query($start,$stop)
1332 : $range_type eq 'contained_in' ? $self->contained_in_query($start,$stop)
1333 : $self->throw("Invalid range type '$range_type'");
1334 push @range_part,"(lower(fref)=lower($ref) AND $range)";
1336 push @where,'('. join(' OR ',@range_part).')' if @range_part;
1338 # get all the types
1339 if (@$types) {
1340 my $types_where = $self->types_query($types);
1341 my $types_query = "select ftypeid from ftype where $types_where";
1342 my $result = $dbh->selectall_arrayref($types_query);
1343 my @typeids = map {$_->[0]} @$result;
1344 my $typelist = join ',',map{$dbh->quote($_)} @typeids;
1345 $typelist ||= "0"; # don't cause DBI to die with invalid SQL when
1346 # unknown feature types were requested.
1347 push @where,"(ftypeid in ($typelist))";
1349 $self->throw("This operation would delete all feature data and -force not specified")
1350 unless @where || $force;
1351 $query .= " where ".join(' and ',@where) if @where;
1352 warn "$query\n" if $self->debug;
1353 my $result = $dbh->do($query);
1354 defined $result or $self->throw($dbh->errstr);
1355 $result;
1358 sub make_abscoord_query {
1359 my $self = shift;
1360 my ($name,$class,$refseq) = @_;
1361 #my $query = GETSEQCOORDS;
1362 my $query = $self->getseqcoords_query();
1363 my $getforcedseqcoords = $self->getforcedseqcoords_query() ;
1364 if ($name =~ /\*/) {
1365 $name =~ s/%/\\%/g;
1366 $name =~ s/_/\\_/g;
1367 $name =~ tr/*/%/;
1368 $query =~ s/gname\) = lower/gname) LIKE lower/;
1370 defined $refseq
1371 ? $self->dbh->do_query($getforcedseqcoords,$name,$class,$refseq)
1372 : $self->dbh->do_query($query,$name,$class);
1375 sub make_aliasabscoord_query {
1376 my $self = shift;
1377 my ($name,$class) = @_;
1378 #my $query = GETALIASCOORDS;
1379 my $query = $self->getaliascoords_query();
1380 if ($name =~ /\*/) {
1381 $name =~ s/%/\\%/g;
1382 $name =~ s/_/\\_/g;
1383 $name =~ tr/*/%/;
1384 $query =~ s/gname\) = lower/gname) LIKE lower/;
1386 $self->dbh->do_query($query,$name,$class);