3 # The author disclaims copyright to this source code. In place of
4 # a legal notice, here is a blessing:
6 # May you do good and not evil.
7 # May you find forgiveness for yourself and forgive others.
8 # May you share freely, never taking more than you give.
10 #***********************************************************************
11 # Tests for the whereInterstageHeuristic() routine in the query planner.
14 set testdir [file dirname $argv0]
15 source $testdir/tester.tcl
18 # The following is a simplified and "sanitized" version of the original
19 # real-world query that brought the problem to light.
21 # The issue is a slow query. The answer is correct, but it was taking too
22 # much time, because it was doing a full table scan rather than an indexed
25 # The problem was that the query planner was overestimating the number of
26 # output rows. The estimated number of output rows is accurate if the
27 # DSNAME parameter is "ds-one". In that case, a large fraction of the rows
28 # in "violation" end up being output. The query planner correctly deduces
29 # that it is faster to do a full table scan of the large "violation" table
30 # to avoid the after-query sort that implements the ORDER BY clause. However,
31 # if the DSNAME is "ds-two", then only a few rows (about 6) are generated,
32 # and it is much much faster to do an indexed lookup of "violation" followed
33 # by a sort operation to implement ORDER BY
35 # The problem, of course, is that the query planner has no way of knowing
36 # in advance how many rows will be generated. The query planner tries to
37 # estimate a worst case, which is a large number of output rows, and it picks
38 # the best plan for that case. However, the plan choosen is very inefficient
39 # when the number of output rows is small.
41 # The whereInterstageHeuristic() routine in the query planner attempts to
42 # correct this by adjusting the query plan such that it avoids the very bad
43 # query plan for a small number of rows, at the expense of a slightly less
44 # efficient plan for a large number of rows. The large number of rows case
45 # is perhaps 5% slower with the revised plan, but the small number of
46 # rows case is around 100 times faster. That seems like a good tradeoff.
49 CREATE TABLE datasource(dsid INT, name TEXT);
50 INSERT INTO datasource VALUES(1,'ds-one'),(2,'ds-two'),(3,'ds-three');
51 CREATE INDEX ds1 ON datasource(name, dsid);
53 CREATE TABLE rule(rid INT, team_id INT, dsid INT);
54 WITH RECURSIVE c(n) AS (VALUES(1) UNION ALL SELECT n+1 FROM c WHERE n<9)
55 INSERT INTO rule(rid,team_id,dsid) SELECT n, 1, 1 FROM c;
56 WITH RECURSIVE c(n) AS (VALUES(10) UNION ALL SELECT n+1 FROM c WHERE n<24)
57 INSERT INTO rule(rid,team_id,dsid) SELECT n, 2, 2 FROM c;
58 CREATE INDEX rule2 ON rule(dsid, rid);
60 CREATE TABLE violation(vid INT, rid INT, vx BLOB);
61 /*** Uncomment to insert actual data
62 WITH src(rid, cnt) AS (VALUES(1,3586),(2,1343),(3,6505),(5,76230),
63 (6,740),(7,287794),(8,457),(12,1),
64 (14,1),(16,1),(17,1),(18,1),(19,1))
65 INSERT INTO violation(vid, rid, vx)
66 SELECT rid*1000000+value, rid, randomblob(15)
67 FROM src, generate_series(1,cnt);
69 CREATE INDEX v1 ON violation(rid, vid);
70 CREATE INDEX v2 ON violation(vid);
72 DELETE FROM sqlite_stat1;
73 DROP TABLE IF EXISTS sqlite_stat4;
74 INSERT INTO sqlite_stat1 VALUES
75 ('violation','v2','376661 1'),
76 ('violation','v1','376661 28974 1'),
77 ('rule','rule2','24 12 1'),
78 ('datasource','ds1','3 1 1');
79 ANALYZE sqlite_schema;
81 set DSNAME ds-two ;# Only a few rows. Change to "ds-one" for many rows.
83 SELECT count(*), length(group_concat(vx)) FROM (
85 FROM datasource DS, rule R, violation V
93 |--CO-ROUTINE (subquery-xxxxxx)
94 | |--SEARCH DS USING COVERING INDEX ds1 (name=?)
95 | |--SEARCH R USING COVERING INDEX rule2 (dsid=?)
96 | |--SEARCH V USING INDEX v1 (rid=?)
97 | `--USE TEMP B-TREE FOR ORDER BY
98 `--SCAN (subquery-xxxxxx)
100 # ^^^^---- We want to see three SEARCH terms. No SCAN terms.
101 # The ORDER BY is implemented by a separate sorter pass.