1 package Bio
::DB
::GFF
::Adaptor
::dbi
::pg
;
5 Bio::DB::GFF::Adaptor::dbi::pg -- Database adaptor for a specific postgres schema
9 SQL commands that need to be executed before this adaptor will work:
11 CREATE DATABASE <dbname>;
13 Also, select permission needs to be granted for each table in the
14 database to the owner of the httpd process (usually 'nobody', but
15 for some RedHat systems it is 'apache') if this adaptor is to be used
16 with the Generic Genome Browser (gbrowse):
19 GRANT SELECT ON TABLE fmeta TO nobody;
20 GRANT SELECT ON TABLE fgroup TO nobody;
21 GRANT SELECT ON TABLE fdata TO nobody;
22 GRANT SELECT ON TABLE fattribute_to_feature TO nobody;
23 GRANT SELECT ON TABLE fdna TO nobody;
24 GRANT SELECT ON TABLE fattribute TO nobody;
25 GRANT SELECT ON TABLE ftype TO nobody;
27 =head2 Optimizing the database
29 PostgreSQL generally requires some tuning before you get very good
30 performance for large databases. For general information on tuning
31 a PostgreSQL server, see http://www.varlena.com/GeneralBits/Tidbits/perf.html
32 Of particular importance is executing VACUUM FULL ANALYZE whenever
33 you change the database.
35 Additionally, for a GFF database, there are a few items you can tune.
36 For each automatic class in your GBrowse conf file, there will be one
37 or two searches done when searching for a feature. If there are lots
38 of features, these search can take several seconds. To speed these searches,
45 Set 'enable_seqscan = false' in your postgresql.conf file (and restart
50 Create 'partial' indexes for each automatic class, doing this for the
51 example class 'Allele':
53 CREATE INDEX partial_allele_gclass ON
54 fgroup (lower('gname')) WHERE gclass='Allele';
56 And be sure to run VACUUM FULL ANALYZE after creating the indexes.
62 # a simple postgres adaptor
64 use Bio
::DB
::GFF
::Util
::Binning
;
65 use Bio
::DB
::GFF
::Util
::Rearrange
; # for rearrange()
66 use base
qw(Bio::DB::GFF::Adaptor::dbi);
68 use constant MAX_SEGMENT
=> 100_000_000
; # the largest a segment can get
69 use constant DEFAULT_CHUNK
=> 2000;
71 use constant GETSEQCOORDS
=><<END;
73 COALESCE(gclass,'Sequence'),
79 WHERE lower(fgroup.gname) = lower(?)
81 AND fgroup.gid=fdata.gid
82 GROUP BY fref,fstrand,gclass,gname
86 use constant GETALIASCOORDS
=><<END;
88 COALESCE(gclass,'Sequence'),
93 FROM fdata,fgroup,fattribute,fattribute_to_feature
94 WHERE lower(fattribute_to_feature.fattribute_value)=lower(?)
96 AND fgroup.gid=fdata.gid
97 AND fattribute.fattribute_name='Alias'
98 AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id
99 AND fattribute_to_feature.fid=fdata.fid
100 GROUP BY fref,fstrand,gclass,gname
104 use constant GETALIASLIKE
=><<END;
106 COALESCE(gclass,'Sequence'),
111 FROM fdata,fgroup,fattribute,fattribute_to_feature
112 WHERE lower(fattribute_to_feature.fattribute_value) LIKE lower(?)
114 AND fgroup.gid=fdata.gid
115 AND fattribute.fattribute_name='Alias'
116 AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id
117 AND fattribute_to_feature.fid=fdata.fid
118 GROUP BY fref,fstrand,gname
123 use constant GETFORCEDSEQCOORDS
=><<END;
125 COALESCE(gclass,'Sequence'),
130 WHERE lower(fgroup.gname) = lower(?)
132 AND lower(fdata.fref) = lower(?)
133 AND fgroup.gid=fdata.gid
134 GROUP BY fref,fstrand,gclass
138 use constant FULLTEXTWILDCARD
=> <<END;
139 SELECT distinct gclass,gname,fattribute_value
140 FROM fgroup,fattribute_to_feature,fdata
141 WHERE fgroup.gid=fdata.gid
142 AND fdata.fid=fattribute_to_feature.fid
143 AND lower(fattribute_to_feature.fattribute_value) LIKE lower(?)
147 ########################
148 # moved from mysqlopt.pm
149 ########################
151 # this is the largest that any reference sequence can be (100 megabases)
152 use constant MAX_BIN
=> 100_000_000
;
154 # this is the smallest bin (1 K)
155 use constant MIN_BIN
=> 1000;
157 # size of range over which it is faster to force mysql to use the range for indexing
158 use constant STRAIGHT_JOIN_LIMIT
=> 200_000
;
160 ##############################################################################
164 This adaptor implements a specific postgres database schema that is
165 compatible with Bio::DB::GFF. It inherits from
166 Bio::DB::GFF::Adaptor::dbi, which itself inherits from Bio::DB::GFF.
168 The schema uses several tables:
174 This is the feature data table. Its columns are:
176 fid feature ID (integer)
177 fref reference sequence name (string)
178 fstart start position relative to reference (integer)
179 fstop stop position relative to reference (integer)
180 ftypeid feature type ID (integer)
181 fscore feature score (float); may be null
182 fstrand strand; one of "+" or "-"; may be null
183 fphase phase; one of 0, 1 or 2; may be null
184 gid group ID (integer)
185 ftarget_start for similarity features, the target start position (integer)
186 ftarget_stop for similarity features, the target stop position (integer)
188 Note that it would be desirable to normalize the reference sequence
189 name, since there are usually many features that share the same
190 reference feature. However, in the current schema, query performance
191 suffers dramatically when this additional join is added.
195 This is the group table. There is one row for each group. Columns:
197 gid the group ID (integer)
198 gclass the class of the group (string)
199 gname the name of the group (string)
201 The group table serves multiple purposes. As you might expect, it is
202 used to cluster features that logically belong together, such as the
203 multiple exons of the same transcript. It is also used to assign a
204 name and class to a singleton feature. Finally, the group table is
205 used to identify the target of a similarity hit. This is consistent
206 with the way in which the group field is used in the GFF version 2
209 The fgroup.gid field joins with the fdata.gid field.
213 sql> select * from fgroup where gname='sjj_2L52.1';
214 +-------+-------------+------------+
215 | gid | gclass | gname |
216 +-------+-------------+------------+
217 | 69736 | PCR_product | sjj_2L52.1 |
218 +-------+-------------+------------+
219 1 row in set (0.70 sec)
221 sql> select fref,fstart,fstop from fdata,fgroup
222 where gclass='PCR_product' and gname = 'sjj_2L52.1'
223 and fdata.gid=fgroup.gid;
224 +---------------+--------+-------+
225 | fref | fstart | fstop |
226 +---------------+--------+-------+
227 | CHROMOSOME_II | 1586 | 2355 |
228 +---------------+--------+-------+
229 1 row in set (0.03 sec)
233 This table contains the feature types, one per row. Columns are:
235 ftypeid the feature type ID (integer)
236 fmethod the feature type method name (string)
237 fsource the feature type source name (string)
239 The ftype.ftypeid field joins with the fdata.ftypeid field. Example:
241 sql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype
242 where gclass='PCR_product'
243 and gname = 'sjj_2L52.1'
244 and fdata.gid=fgroup.gid
245 and fdata.ftypeid=ftype.ftypeid;
246 +---------------+--------+-------+-------------+-----------+
247 | fref | fstart | fstop | fmethod | fsource |
248 +---------------+--------+-------+-------------+-----------+
249 | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs |
250 +---------------+--------+-------+-------------+-----------+
251 1 row in set (0.08 sec)
255 This table holds the raw DNA of the reference sequences. It has three
258 fref reference sequence name (string)
259 foffset offset of this sequence
260 fdna the DNA sequence (longblob)
262 To overcome problems loading large blobs, DNA is automatically
263 fragmented into multiple segments when loading, and the position of
264 each segment is stored in foffset. The fragment size is controlled by
265 the -clump_size argument during initialization.
267 =item fattribute_to_feature
269 This table holds "attributes", which are tag/value pairs stuffed into
270 the GFF line. The first tag/value pair is treated as the group, and
271 anything else is treated as an attribute (weird, huh?).
273 CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B"
274 CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B"
276 The columns of this table are:
278 fid feature ID (integer)
279 fattribute_id ID of the attribute (integer)
280 fattribute_value text of the attribute (text)
282 The fdata.fid column joins with fattribute_to_feature.fid.
286 This table holds the normalized names of the attributes. Fields are:
288 fattribute_id ID of the attribute (integer)
289 fattribute_name Name of the attribute (varchar)
293 =head2 Data Loading Methods
295 In addition to implementing the abstract SQL-generating methods of
296 Bio::DB::GFF::Adaptor::dbi, this module also implements the data
297 loading functionality of Bio::DB::GFF.
305 Usage : $db = Bio::DB::GFF->new(@args)
306 Function: create a new adaptor
307 Returns : a Bio::DB::GFF object
311 The new constructor is identical to the "dbi" adaptor's new() method,
312 except that the prefix "dbi:pg" is added to the database DSN identifier
313 automatically if it is not there already.
318 -dsn the DBI data source, e.g. 'dbi:Pg:dbname=:ens0040' or "ens0040"
320 -user username for authentication
322 -pass the password for authentication
330 my ($dsn,$other) = rearrange
([
331 [qw(FEATUREDB DB DSN)],
333 $dsn = "dbi:Pg:dbname=$dsn" if !ref($dsn) && $dsn !~ /^(dbi|DBI):/;
334 my $self = $class->SUPER::new
(-dsn
=>$dsn,%$other);
341 Usage : $schema = $db->schema
342 Function: return the CREATE script for the schema
343 Returns : a list of CREATE statemetns
347 This method returns a list containing the various CREATE statements
348 needed to initialize the database tables.
356 CREATE TABLE "fdata" (
357 "fid" serial NOT NULL,
358 "fref" character varying(100) DEFAULT '' NOT NULL,
359 "fstart" integer DEFAULT '0' NOT NULL,
360 "fstop" integer DEFAULT '0' NOT NULL,
361 "fbin" double precision DEFAULT '0.000000' NOT NULL,
362 "ftypeid" integer DEFAULT '0' NOT NULL,
363 "fscore" double precision DEFAULT NULL,
364 "fstrand" character varying(3) DEFAULT NULL,
365 "fphase" character varying(3) DEFAULT NULL,
366 "gid" integer DEFAULT '0' NOT NULL,
367 "ftarget_start" integer DEFAULT NULL,
368 "ftarget_stop" integer DEFAULT NULL,
369 CONSTRAINT chk_fdata_fstrand CHECK (fstrand IN ('+','-')),
370 CONSTRAINT chk_fdata_fphase CHECK (fphase IN ('0','1','2')),
371 CONSTRAINT pk_fdata PRIMARY KEY (fid)
375 #CONSTRAINT fref_fdata UNIQUE (fref, fbin, fstart, fstop, ftypeid, gid)
376 # fdata_fref_idx => q{ CREATE UNIQUE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid)},
380 CREATE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid)
383 fdata_ftypeid_idx
=> q{
384 CREATE INDEX fdata_ftypeid_idx ON fdata (ftypeid)
388 CREATE INDEX fdata_gid_idx ON fdata (gid)
398 CREATE TABLE "fgroup" (
399 "gid" serial NOT NULL,
400 "gclass" character varying(100) DEFAULT NULL,
401 "gname" character varying(100) DEFAULT NULL,
402 CONSTRAINT pk_fgroup PRIMARY KEY (gid)
407 fgroup_gclass_idx
=> q{
408 CREATE UNIQUE INDEX fgroup_gclass_idx ON fgroup (gclass,gname)
410 fgroup_gname_idx
=> q{
411 CREATE INDEX fgroup_gname_idx ON fgroup(gname)
413 fgroup_lower_gname_idx
=> q{
414 CREATE INDEX fgroup_lower_gname_idx ON fgroup (lower(gname))
422 CREATE TABLE "ftype" (
423 "ftypeid" serial NOT NULL,
424 "fmethod" character varying(100) DEFAULT '' NOT NULL,
425 "fsource" character varying(100) DEFAULT NULL,
426 CONSTRAINT pk_ftype PRIMARY KEY (ftypeid),
427 CONSTRAINT ftype_ftype UNIQUE (fmethod, fsource)
432 ftype_fmethod_idx
=> q{
433 CREATE INDEX ftype_fmethod_idx ON ftype (fmethod)
436 ftype_fsource_idx
=> q{
437 CREATE INDEX ftype_fsource_idx ON ftype (fsource)
440 ftype_ftype_idx
=> q{
441 CREATE UNIQUE INDEX ftype_ftype_idx ON ftype (fmethod,fsource)
450 CREATE TABLE "fdna" (
451 "fref" character varying(100) DEFAULT '' NOT NULL,
452 "foffset" integer DEFAULT '0' NOT NULL,
454 CONSTRAINT pk_fdna PRIMARY KEY (fref, foffset)
461 CREATE TABLE "fmeta" (
462 "fname" character varying(255) DEFAULT '' NOT NULL,
463 "fvalue" character varying(255) DEFAULT '' NOT NULL,
464 CONSTRAINT pk_fmeta PRIMARY KEY (fname)
472 CREATE TABLE "fattribute" (
473 "fattribute_id" serial NOT NULL,
474 "fattribute_name" character varying(255) DEFAULT '' NOT NULL,
475 CONSTRAINT pk_fattribute PRIMARY KEY (fattribute_id)
477 }, # fattribute table
481 fattribute_to_feature
=> {
483 CREATE TABLE "fattribute_to_feature" (
484 "fid" integer DEFAULT '0' NOT NULL,
485 "fattribute_id" integer DEFAULT '0' NOT NULL,
486 "fattribute_value" text
488 }, # fattribute_to_feature table
491 fattribute_to_feature_fid
=> q{
492 CREATE INDEX fattribute_to_feature_fid ON fattribute_to_feature (fid,fattribute_id)
494 fattribute_txt_idx
=> q{
495 CREATE INDEX fattribute_txt_idx ON fattribute_to_feature (fattribute_value)
497 fattribute_lower_idx
=> q{
498 CREATE INDEX fattribute_lower_idx ON fattribute_to_feature (lower(fattribute_value))
500 } # fattribute_to_feature indexes
501 }, # fattribute_to_feature
505 CREATE TABLE "finterval_stats" (
506 "ftypeid" integer DEFAULT '0' NOT NULL,
507 "fref" character varying(100) DEFAULT '' NOT NULL,
508 "fbin" integer DEFAULT '0' NOT NULL,
509 "fcum_count" integer DEFAULT '0' NOT NULL,
510 CONSTRAINT pk_finterval_stats PRIMARY KEY (ftypeid,fref,fbin)
512 } # finterval_stats table
526 Usage : $db->setup_load
527 Function: called before load_gff_line()
532 This method performs schema-specific initialization prior to loading a
533 set of GFF records. It prepares a set of DBI statement handlers to be
534 used in loading the data.
540 my $schema = $self->schema;
542 my $dbh = $self->features_db;
544 if ($self->lock_on_load) {
545 my @tables = map { "$_ WRITE"} $self->tables;
546 my $tables = join ', ',@tables;
547 $dbh->do("LOCK TABLES $tables");
550 my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?');
551 my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)');
552 my $insertid_type = $dbh->prepare_delayed("SELECT currval('ftype_ftypeid_seq')");
554 my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE lower(gname)=lower(?) AND gclass=?');
555 my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)');
556 my $insertid_group = $dbh->prepare_delayed("SELECT currval('fgroup_gid_seq')");
558 my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?');
559 my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)');
560 my $insertid_attribute = $dbh->prepare_delayed("SELECT currval('fattribute_fattribute_id_seq')");
562 my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)');
564 my $insert_data = $dbh->prepare_delayed(<<END);
565 INSERT INTO fdata (fref,fstart,fstop,fbin,ftypeid,fscore,
566 fstrand,fphase,gid,ftarget_start,ftarget_stop)
567 VALUES(?,?,?,?,?,?,?,?,?,?,?)
570 my $delete_existing_data = $dbh->prepare_delayed('DELETE FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND fbin=? AND ftypeid=? AND GID=?');
571 my $insertid_data = $dbh->prepare_delayed("SELECT currval('fdata_fid_seq')");
573 $self->{load_stuff
}{sth
}{lookup_ftype
} = $lookup_type;
574 $self->{load_stuff
}{sth
}{insert_ftype
} = $insert_type;
575 $self->{load_stuff
}{sth
}{insertid_ftype
} = $insertid_type;
576 $self->{load_stuff
}{sth
}{lookup_fgroup
} = $lookup_group;
577 $self->{load_stuff
}{sth
}{insert_fgroup
} = $insert_group;
578 $self->{load_stuff
}{sth
}{insertid_fgroup
} = $insertid_group;
579 $self->{load_stuff
}{sth
}{insertid_fdata
} = $insertid_data;
580 $self->{load_stuff
}{sth
}{insert_fdata
} = $insert_data;
581 $self->{load_stuff
}{sth
}{delete_existing_fdata
} = $delete_existing_data;
582 $self->{load_stuff
}{sth
}{lookup_fattribute
} = $lookup_attribute;
583 $self->{load_stuff
}{sth
}{insert_fattribute
} = $insert_attribute;
584 $self->{load_stuff
}{sth
}{insertid_fattribute
} = $insertid_attribute;
585 $self->{load_stuff
}{sth
}{insert_fattribute_value
} = $insert_attribute_value;
586 $self->{load_stuff
}{types
} = {};
587 $self->{load_stuff
}{groups
} = {};
588 $self->{load_stuff
}{counter
} = 0;
593 Title : load_gff_line
594 Usage : $db->load_gff_line($fields)
595 Function: called to load one parsed line of GFF
596 Returns : true if successfully inserted
597 Args : hashref containing GFF fields
600 This method is called once per line of the GFF and passed a series of
601 parsed data items that are stored into the hashref $fields. The keys are:
603 ref reference sequence
604 source annotation source
605 method annotation method
606 start annotation start
608 score annotation score (may be undef)
609 strand annotation strand (may be undef)
610 phase annotation phase (may be undef)
611 group_class class of annotation's group (may be undef)
612 group_name ID of annotation's group (may be undef)
613 target_start start of target of a similarity hit
614 target_stop stop of target of a similarity hit
615 attributes array reference of attributes, each of which is a [tag=>value] array ref
623 if (defined $gff->{phase
}){
624 chomp($gff->{phase
});
625 undef($gff->{phase
}) if $gff->{phase
} eq '.';
628 if (defined $gff->{strand
} && $gff->{strand
} eq '.'){undef($gff->{strand
})};
629 if (defined $gff->{score
} && $gff->{score
} eq '.'){undef($gff->{score
})};
631 my $s = $self->{load_stuff
};
632 my $dbh = $self->features_db;
633 local $dbh->{PrintError
} = 0;
635 defined(my $typeid = $self->get_table_id('ftype', $gff->{method
} => $gff->{source
})) or return;
636 defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname
} => $gff->{gclass
})) or return;
638 my $bin = bin
($gff->{start
},$gff->{stop
},$self->min_bin);
639 my $result = $s->{sth
}{insert_fdata
}->execute($gff->{ref},
640 $gff->{start
},$gff->{stop
},$bin,
642 $gff->{score
},$gff->{strand
},$gff->{phase
},
644 $gff->{tstart
},$gff->{tstop
});
646 warn $dbh->errstr,"\n" and print "ref=",$gff->{ref}," start=",$gff->{start
}," stop=",$gff->{stop
}," bin=",$bin," typeid=",$typeid," groupid=",$groupid,"\n"
647 and return unless $result;
649 my $fid = $self->insertid($s->{sth
},'fdata')
650 || $self->get_feature_id($gff->{ref},$gff->{start
},$gff->{stop
},$typeid,$groupid);
653 foreach (@
{$gff->{attributes
}}) {
654 defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return;
655 $s->{sth
}{insert_fattribute_value
}->execute($fid,$attribute_id,$_->[1]);
658 if ( (++$s->{counter
} % 1000) == 0) {
659 print STDERR
"$s->{counter} records loaded...";
660 print STDERR
-t STDOUT
&& !$ENV{EMACS
} ?
"\r" : "\n";
673 if ($sth->{"insertid_$table"}->execute()){
674 $insert_id = ($sth->{"insertid_$table"}->fetchrow_array)[0];
677 warn "No CURRVAL for SEQUENCE of table $table ",$sth->errstr,"\n";
687 Usage : $integer = $db->get_table_id($table,@ids)
688 Function: get the ID of a group or type
689 Returns : an integer ID or undef
693 This internal method is called by load_gff_line to look up the integer
694 ID of an existing feature type or group. The arguments are the name
695 of the table, and two string identifiers. For feature types, the
696 identifiers are the method and source. For groups, the identifiers
697 are group name and class.
699 This method requires that a statement handler named I<lookup_$table>,
700 have been created previously by setup_load(). It is here to overcome
701 deficiencies in mysql's INSERT syntax.
706 # get the object ID from a named table
712 # irritating warning for null id
716 $id_key = join ':',@ids;
719 my $s = $self->{load_stuff
};
721 my $dbh = $self->features_db;
723 unless (defined($s->{$table}{$id_key})) {
724 $sth->{"lookup_$table"}->execute(@ids);
725 my @result = $sth->{"lookup_$table"}->fetchrow_array;
727 $s->{$table}{$id_key} = $result[0];
729 $sth->{"insert_$table"}->execute(@ids)
730 && ($s->{$table}{$id_key} = $self->insertid($sth,$table));
731 #&& ($s->{$table}{$id_key} = $self->insertid($sth->{"insertid_$table"}));
732 #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid);
736 my $id = $s->{$table}{$id_key};
737 unless (defined $id) {
738 warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n";
747 # my $insertid_sth = shift ;
749 # if ($insertid_sth->execute){
750 # $insert_id = ($insertid_sth->fetchrow_array)[0];
753 # warn "No CURRVAL for SEQUENCE ",$insertid_sth->errstr,"\n";
759 sub insert_sequence
{
761 my($id,$offset,$seq) = @_;
762 my $sth = $self->{_insert_sequence
}
763 ||= $self->dbh->prepare_delayed('insert into fdna values (?,?,?)');
764 $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr);
770 Usage : $db->range_query($range_type,$refseq,$refclass,$start,$stop,$types,$order_by_group,$attributes,$binsize)
771 Function: create statement handle for range/overlap queries
772 Returns : a DBI statement handle
776 This method constructs the statement handle for this module's central
777 query: given a range and/or a list of feature types, fetch their GFF
778 records. It overrides a method in dbi.pm so that the overlaps query
779 can write SQL optimized for Postgres. Specifically, instead of writing
780 the bin related section as a set of ORs, each bin piece is place in
781 a separate select and then they are UNIONed together. This subroutine
782 requires several replacements for other subroutines in dbi.pm. In this
783 module, they are named the same as those in dbi.pm but prefixed with
786 The positional arguments are as follows:
790 $isrange A flag indicating that this is a range.
791 query. Otherwise an overlap query is
794 $refseq The reference sequence name (undef if no range).
796 $refclass The reference sequence class (undef if no range).
798 $start The start of the range (undef if none).
800 $stop The stop of the range (undef if none).
802 $types Array ref containing zero or feature types in the
803 format [method,source].
805 $order_by_group A flag indicating that statement handler should group
806 the features by group id (handy for iterative fetches)
808 $attributes A hash containing select attributes.
810 $binsize A bin size for generating tables of feature density.
816 my($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$bin) = @_;
818 my $dbh = $self->features_db;
820 # my @bin_parts = split /\n\s+OR/, $self->bin_query($start,$stop);
821 # warn "bin_part: @bin_parts\n";
823 my %a = (refseq
=>$refseq,class=>$class,start
=>$start,stop
=>$stop,types
=>$types,attributes
=>$attributes,bin_width
=>$bin);
824 my ($query, @args, $order_by);
826 if ($rangetype ne 'overlaps') {
828 my $select = $self->make_features_select_part(\
%a);
829 my $from = $self->make_features_from_part($sparse,\
%a);
830 my $join = $self->make_features_join_part(\
%a);
832 ($where,@args) = $self->make_features_by_range_where_part($rangetype,\
%a);
833 my ($group_by,@more_args) = $self->make_features_group_by_part(\
%a);
834 $order_by = $self->make_features_order_by_part(\
%a) if $order_by_group;
836 $query = "SELECT $select FROM $from WHERE $join";
837 $query .= " AND $where" if $where;
840 $query .= " GROUP BY $group_by";
841 push @args,@more_args;
844 } else { # most common case: overlaps query
846 my @bin_parts = split /\s*OR/, $self->bin_query($start,$stop);
847 my $select = $self->make_features_select_part(\
%a);
848 my $from = $self->make_features_from_part($sparse,\
%a);
849 my $join = $self->make_features_join_part(\
%a);
851 ($where,@args) = $self->pg_make_features_by_range_where_part($rangetype,\
%a);
852 my ($group_by,@more_args)= $self->make_features_group_by_part(\
%a);
853 $order_by = $self->pg_make_features_order_by_part(\
%a) if $order_by_group;
857 foreach my $bin (@bin_parts) {
858 my $temp_query = "SELECT $select FROM $from WHERE $join AND $where AND $bin\n";
859 push @temp_args, @args;
862 $temp_query .= " GROUP BY $group_by";
863 push @temp_args,@more_args;
866 push @query_pieces, $temp_query;
870 $query = join("UNION\n", @query_pieces);
874 $query .= " ORDER BY $order_by" if $order_by;
876 $self->dbh->do('set enable_seqscan=off');
877 my $sth = $self->dbh->do_query($query,@args);
881 sub pg_make_features_by_range_where_part
{
883 my ($rangetype,$options) = @_;
885 return unless $rangetype eq 'overlaps';
888 my ($refseq,$class,$start,$stop,$types,$attributes) =
889 @
{$options}{qw(refseq class start stop types attributes)};
894 my ($q,@a) = $self->refseq_query($refseq,$class);
899 if (defined $start or defined $stop) {
900 $start = 0 unless defined($start);
901 $stop = MAX_SEGMENT
unless defined($stop);
903 my ($range_query,@range_args) = $self->pg_overlap_query($start,$stop);
905 push @query,$range_query;
906 push @args,@range_args;
909 if (defined $types && @
$types) {
910 my ($type_query,@type_args) = $self->types_query($types);
911 push @query,$type_query;
912 push @args,@type_args;
916 my ($attribute_query,@attribute_args) = $self->make_features_by_attribute_where_part($attributes);
917 push @query,"($attribute_query)";
918 push @args,@attribute_args;
921 my $query = join "AND",@query;
922 return wantarray ?
($query,@args) : $self->dbh->dbi_quote($query,@args);
925 sub pg_overlap_query
{
927 my ($start,$stop) = @_;
929 my ($iq,@iargs) = $self->overlap_query_nobin($start,$stop);
930 my $query = "\n$iq\n";
933 return wantarray ?
($query,@args) : $self->dbh->dbi_quote($query,@args);
936 sub pg_make_features_order_by_part
{
938 my $options = shift || {};
944 This PostgreSQL adaptor does not implement the search notes method
945 because it can be very slow (although the code for the method is
946 contained in this method but commented out).
947 There is, however, a PostgreSQL adaptor that does implement it in
948 a more efficient way: L<Bio::DB::GFF::Adaptor::dbi::pg_fts>,
949 which inherits from this adaptor and uses the optional PostgreSQL
950 module TSearch2 for full text indexing. See that adaptor's
951 documentation for more information.
953 See also L<Bio::DB::GFF>
956 Usage : @search_results = $db->search_notes("full text search string",$limit)
957 Function: Search the notes for a text string, using mysql full-text search
958 Returns : array of results
959 Args : full text search string, and an optional row limit
962 This is a replacement for the mysql-specific method. Given a search string, it
963 performs a ILIKE search of the notes table and returns an array of results.
964 Each row of the returned array is a arrayref containing the following fields:
966 column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment()
967 column 2 The text of the note
968 column 3 A relevance score.
970 Note that for large databases this can be very slow and may result in
971 time out or 500-cgi errors. If this is happening on a regular basis,
972 you should look into using L<Bio::DB::GFF::Adaptor::dbi::pg_fts> which
973 implements the TSearch2 full text indexing scheme.
979 # my ($search_string,$limit) = @_;
981 # $search_string =~ tr/*/%/s;
982 # $search_string = '%'.$search_string unless $search_string =~ /^\%/;
983 # $search_string = $search_string.'%' unless $search_string =~ /\%$/;
984 # warn "search_string:$search_string";
985 # my $query = FULLTEXTWILDCARD;
986 # $query .= " limit $limit" if defined $limit;
987 # my $sth = $self->dbh->do_query($query,$search_string);
990 # while (my ($class,$name,$note) = $sth->fetchrow_array) {
992 # next unless $class && $name; # sorry, ignore NULL objects
993 # my $featname = Bio::DB::GFF::Featname->new($class=>$name);
995 # push @results,[$featname,$note,0]; #gbrowse expects a score, but
996 # #pg doesn't give one, thus the 0
1004 =head2 make_meta_set_query
1006 Title : make_meta_set_query
1007 Usage : $sql = $db->make_meta_set_query
1008 Function: return SQL fragment for setting a meta parameter
1009 Returns : SQL fragment
1013 By default this does nothing; meta parameters are not stored or
1018 sub make_meta_set_query
{
1019 return 'INSERT INTO fmeta VALUES (?,?)';
1022 sub make_classes_query
{
1024 return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT gclass IS NULL';
1030 $self->meta('chunk_size') || DEFAULT_CHUNK
;
1033 sub getseqcoords_query
{
1035 return GETSEQCOORDS
;
1038 sub getaliascoords_query
{
1040 return GETALIASCOORDS
;
1044 sub getforcedseqcoords_query
{
1046 return GETFORCEDSEQCOORDS
;
1050 sub getaliaslike_query
{
1052 return GETALIASLIKE
;
1056 sub make_features_select_part
{
1058 my $options = shift || {};
1060 if (my $b = $options->{bin_width
}) {
1064 1+$b*floor(fstart/$b) as fstart,
1065 $b*(1+floor(fstart/$b)) as fstop,
1066 CASE WHEN fsource IS NULL THEN fmethod
1067 ELSE fmethod||':'||fsource,
1071 CASE WHEN fsource IS NULL THEN fref||':'||fmethod
1072 ELSE fref||':'||fmethod||':'||fsource,
1078 fref,fstart,fstop,fsource,fmethod,fscore,fstrand,fphase,gclass,fgroup.gname,ftarget_start,ftarget_stop,fdata.fid,fdata.gid
1082 $s .= ",count(fdata.fid)" if $options->{attributes
} && keys %{$options->{attributes
}}>1;
1086 sub make_features_from_part_bkup
{
1089 my $options = shift || {};
1090 #my $index = $sparse ? ' USE INDEX(ftypeid)': '';
1092 return $options->{attributes
} ?
"fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n"
1093 : "fdata${index},ftype,fgroup\n";
1097 ####################################
1098 # moved from mysqlopt.pm
1099 ###################################
1101 sub default_meta_values
{
1103 my @values = $self->SUPER::default_meta_values
;
1108 straight_join_limit
=> STRAIGHT_JOIN_LIMIT
,
1114 return $self->meta('min_bin') || MIN_BIN
;
1118 return $self->meta('max_bin') || MAX_BIN
;
1120 sub straight_join_limit
{
1122 return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT
;
1126 sub _feature_by_name
{
1128 my ($class,$name,$location,$callback) = @_;
1129 $callback || $self->throw('must provide a callback argument');
1131 my @bin_parts = split /\s*OR/, $self->bin_query($location->[1],$location->[2]) if $location;
1132 my $select = $self->make_features_select_part;
1133 my $from = $self->make_features_from_part(undef,{sparse_groups
=>1});
1134 my ($where,@args) = $self->make_features_by_name_where_part($class,$name);
1135 my $join = $self->make_features_join_part;
1136 my $range = $self->pg_make_features_by_range_where_part('overlaps',
1137 {refseq
=>$location->[0],
1139 start
=>$location->[1],
1140 stop
=>$location->[2]}) if $location;
1146 foreach my $bin (@bin_parts) {
1147 my $temp_query = "SELECT $select FROM $from WHERE $join AND $where AND $range AND $bin\n";
1148 push @temp_args, @args;
1149 push @query_pieces, $temp_query;
1153 $query = join("UNION\n", @query_pieces);
1156 $query = "SELECT $select FROM $from WHERE $where AND $join";
1159 my $sth = $self->dbh->do_query($query,@args);
1162 while (my @row = $sth->fetchrow_array) {
1170 sub update_sequences
{
1172 my $dbh = $self->features_db;
1174 $dbh->do("SELECT setval('public.fdata_fid_seq', max(fid)+1) FROM fdata");
1175 $dbh->do("SELECT setval('public.fattribute_fattribute_id_seq', max(fattribute_id)+1) FROM fattribute");
1176 $dbh->do("SELECT setval('public.fgroup_gid_seq', max(gid)+1) FROM fgroup");
1177 $dbh->do("SELECT setval('public.ftype_ftypeid_seq', max(ftypeid)+1) FROM ftype");
1182 =head2 make_features_by_name_where_part
1184 Title : make_features_by_name_where_part
1185 Usage : $db->make_features_by_name_where_part
1186 Function: Overrides a function in Bio::DB::GFF::Adaptor::dbi to insure
1187 that searches will be case insensitive. It creates the SQL
1188 fragment needed to select a feature by its group name & class
1189 Returns : a SQL fragment and bind arguments
1195 sub make_features_by_name_where_part
{
1197 my ($class,$name) = @_;
1199 if ($name !~ /\*/) {
1200 #allows utilization of an index on lower(gname)
1201 return ("fgroup.gclass=? AND lower(fgroup.gname) = lower(?)",$class,$name);
1205 return ("fgroup.gclass=? AND lower(fgroup.gname) LIKE lower(?)",$class,$name);
1210 # Methods from dbi.pm that need to be overridden to make
1211 # searching for fref case insensitive
1216 my ($ref,$start,$stop,$class) = @_;
1218 my ($offset_start,$offset_stop);
1220 my $has_start = defined $start;
1221 my $has_stop = defined $stop;
1224 if ($has_start && $has_stop && $start > $stop) {
1226 ($start,$stop) = ($stop,$start);
1229 # turn start and stop into 0-based offsets
1230 my $cs = $self->dna_chunk_size;
1231 $start -= 1; $stop -= 1;
1232 $offset_start = int($start/$cs)*$cs;
1233 $offset_stop = int($stop/$cs)*$cs;
1236 # special case, get it all
1237 if (!($has_start || $has_stop)) {
1238 $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) order by foffset',$ref);
1241 elsif (!$has_stop) {
1242 $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) and foffset>=? order by foffset',
1243 $ref,$offset_start);
1246 else { # both start and stop defined
1247 $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) and foffset>=? and foffset<=? order by foffset',
1248 $ref,$offset_start,$offset_stop);
1252 while (my($frag,$offset) = $sth->fetchrow_array) {
1253 substr($frag,0,$start-$offset) = '' if $has_start && $start > $offset;
1256 substr($dna,$stop-$start+1) = '' if $has_stop && $stop-$start+1 < length($dna);
1258 $dna = reverse $dna;
1259 $dna =~ tr/gatcGATC/ctagCTAG/;
1269 my ($refseq,$refclass) = @_;
1270 my $query = "lower(fdata.fref)=lower(?)";
1271 return wantarray ?
($query,$refseq) : $self->dbh->dbi_quote($query,$refseq);
1274 sub make_types_where_part
{
1276 my ($srcseq,$start,$stop,$want_count,$typelist) = @_;
1278 if (defined($srcseq)) {
1279 push @query,'lower(fdata.fref)=lower(?)';
1281 if (defined $start or defined $stop) {
1282 $start = 1 unless defined $start;
1283 $stop = MAX_SEGMENT
unless defined $stop;
1284 my ($q,@a) = $self->overlap_query($start,$stop);
1289 if (defined $typelist && @
$typelist) {
1290 my ($q,@a) = $self->types_query($typelist);
1294 my $query = @query ?
join(' AND ',@query) : '1=1';
1295 return wantarray ?
($query,@args) : $self->dbh->dbi_quote($query,@args);
1298 sub get_feature_id
{
1300 my ($ref,$start,$stop,$typeid,$groupid) = @_;
1301 my $s = $self->{load_stuff
};
1302 unless ($s->{get_feature_id
}) {
1303 my $dbh = $self->features_db;
1304 $s->{get_feature_id
} =
1305 $dbh->prepare_delayed('SELECT fid FROM fdata WHERE lower(fref)=lower(?) AND fstart=? AND fstop=? AND ftypeid=? AND gid=?');
1307 my $sth = $s->{get_feature_id
} or return;
1308 $sth->execute($ref,$start,$stop,$typeid,$groupid) or return;
1309 my ($fid) = $sth->fetchrow_array;
1315 my $delete_spec = shift;
1316 my $ranges = $delete_spec->{segments
} || [];
1317 my $types = $delete_spec->{types
} || [];
1318 my $force = $delete_spec->{force
};
1319 my $range_type = $delete_spec->{range_type
};
1320 my $dbh = $self->features_db;
1322 my $query = 'delete from fdata';
1326 for my $segment (@
$ranges) {
1327 my $ref = $dbh->quote($segment->abs_ref);
1328 my $start = $segment->abs_start;
1329 my $stop = $segment->abs_stop;
1330 my $range = $range_type eq 'overlaps' ?
$self->overlap_query($start,$stop)
1331 : $range_type eq 'contains' ?
$self->contains_query($start,$stop)
1332 : $range_type eq 'contained_in' ?
$self->contained_in_query($start,$stop)
1333 : $self->throw("Invalid range type '$range_type'");
1334 push @range_part,"(lower(fref)=lower($ref) AND $range)";
1336 push @where,'('. join(' OR ',@range_part).')' if @range_part;
1340 my $types_where = $self->types_query($types);
1341 my $types_query = "select ftypeid from ftype where $types_where";
1342 my $result = $dbh->selectall_arrayref($types_query);
1343 my @typeids = map {$_->[0]} @
$result;
1344 my $typelist = join ',',map{$dbh->quote($_)} @typeids;
1345 $typelist ||= "0"; # don't cause DBI to die with invalid SQL when
1346 # unknown feature types were requested.
1347 push @where,"(ftypeid in ($typelist))";
1349 $self->throw("This operation would delete all feature data and -force not specified")
1350 unless @where || $force;
1351 $query .= " where ".join(' and ',@where) if @where;
1352 warn "$query\n" if $self->debug;
1353 my $result = $dbh->do($query);
1354 defined $result or $self->throw($dbh->errstr);
1358 sub make_abscoord_query
{
1360 my ($name,$class,$refseq) = @_;
1361 #my $query = GETSEQCOORDS;
1362 my $query = $self->getseqcoords_query();
1363 my $getforcedseqcoords = $self->getforcedseqcoords_query() ;
1364 if ($name =~ /\*/) {
1368 $query =~ s/gname\) = lower/gname) LIKE lower/;
1371 ?
$self->dbh->do_query($getforcedseqcoords,$name,$class,$refseq)
1372 : $self->dbh->do_query($query,$name,$class);
1375 sub make_aliasabscoord_query
{
1377 my ($name,$class) = @_;
1378 #my $query = GETALIASCOORDS;
1379 my $query = $self->getaliascoords_query();
1380 if ($name =~ /\*/) {
1384 $query =~ s/gname\) = lower/gname) LIKE lower/;
1386 $self->dbh->do_query($query,$name,$class);