test/whereN.test

   1 # 2024-04-02
   2 #
   3 # The author disclaims copyright to this source code.  In place of
   4 # a legal notice, here is a blessing:
   5 #
   6 #    May you do good and not evil.
   7 #    May you find forgiveness for yourself and forgive others.
   8 #    May you share freely, never taking more than you give.
   9 #
  10 #***********************************************************************
  11 # Tests for the whereInterstageHeuristic() routine in the query planner.
  12 #
  13
  14 set testdir [file dirname $argv0]
  15 source $testdir/tester.tcl
  16 set testprefix whereN
  17
  18 # The following is a simplified and "sanitized" version of the original
  19 # real-world query that brought the problem to light.
  20 #
  21 # The issue is a slow query.  The answer is correct, but it was taking too
  22 # much time, because it was doing a full table scan rather than an indexed
  23 # lookup.
  24 #
  25 # The problem was that the query planner was overestimating the number of
  26 # output rows.  The estimated number of output rows is accurate if the
  27 # DSNAME parameter is "ds-one".  In that case, a large fraction of the rows
  28 # in "violation" end up being output.  The query planner correctly deduces
  29 # that it is faster to do a full table scan of the large "violation" table
  30 # to avoid the after-query sort that implements the ORDER BY clause. However,
  31 # if the DSNAME is "ds-two", then only a few rows (about 6) are generated,
  32 # and it is much much faster to do an indexed lookup of "violation" followed
  33 # by a sort operation to implement ORDER BY
  34 #
  35 # The problem, of course, is that the query planner has no way of knowing
  36 # in advance how many rows will be generated.  The query planner tries to
  37 # estimate a worst case, which is a large number of output rows, and it picks
  38 # the best plan for that case.  However, the plan choosen is very inefficient
  39 # when the number of output rows is small.
  40 #
  41 # The whereInterstageHeuristic() routine in the query planner attempts to
  42 # correct this by adjusting the query plan such that it avoids the very bad
  43 # query plan for a small number of rows, at the expense of a slightly less
  44 # efficient plan for a large number of rows.  The large number of rows case
  45 # is perhaps 5% slower with the revised plan, but the small number of
  46 # rows case is around 100 times faster.  That seems like a good tradeoff.
  47 #
  48 do_execsql_test 1.0 {
  49   CREATE TABLE datasource(dsid INT, name TEXT);
  50   INSERT INTO datasource VALUES(1,'ds-one'),(2,'ds-two'),(3,'ds-three');
  51   CREATE INDEX ds1 ON datasource(name, dsid);
  52
  53   CREATE TABLE rule(rid INT, team_id INT, dsid INT);
  54   WITH RECURSIVE c(n) AS (VALUES(1) UNION ALL SELECT n+1 FROM c WHERE n<9)
  55     INSERT INTO rule(rid,team_id,dsid) SELECT n, 1, 1 FROM c;
  56   WITH RECURSIVE c(n) AS (VALUES(10) UNION ALL SELECT n+1 FROM c WHERE n<24)
  57     INSERT INTO rule(rid,team_id,dsid) SELECT n, 2, 2 FROM c;
  58   CREATE INDEX rule2 ON rule(dsid, rid);
  59
  60   CREATE TABLE violation(vid INT, rid INT, vx BLOB);
  61   /***  Uncomment to insert actual data
  62   WITH src(rid, cnt) AS (VALUES(1,3586),(2,1343),(3,6505),(5,76230),
  63                                (6,740),(7,287794),(8,457),(12,1),
  64                                (14,1),(16,1),(17,1),(18,1),(19,1))
  65     INSERT INTO violation(vid, rid, vx)
  66       SELECT rid*1000000+value, rid, randomblob(15)
  67         FROM src, generate_series(1,cnt);
  68   ***/
  69   CREATE INDEX v1 ON violation(rid, vid);
  70   CREATE INDEX v2 ON violation(vid);
  71   ANALYZE;
  72   DELETE FROM sqlite_stat1;
  73   DROP TABLE IF EXISTS sqlite_stat4;
  74   INSERT INTO sqlite_stat1 VALUES
  75     ('violation','v2','376661 1'),
  76     ('violation','v1','376661 28974 1'),
  77     ('rule','rule2','24 12 1'),
  78     ('datasource','ds1','3 1 1');
  79   ANALYZE sqlite_schema;
  80 }
  81 set DSNAME ds-two   ;#  Only a few rows.  Change to "ds-one" for many rows.
  82 do_eqp_test 1.1 {
  83   SELECT count(*), length(group_concat(vx)) FROM (
  84     SELECT V.*
  85       FROM datasource DS, rule R, violation V
  86      WHERE V.rid=R.rid
  87        AND R.dsid=DS.dsid
  88        AND DS.name=$DSNAME
  89      ORDER BY V.vid desc
  90   );
  91 } {
  92   QUERY PLAN
  93   |--CO-ROUTINE (subquery-xxxxxx)
  94   |  |--SEARCH DS USING COVERING INDEX ds1 (name=?)
  95   |  |--SEARCH R USING COVERING INDEX rule2 (dsid=?)
  96   |  |--SEARCH V USING INDEX v1 (rid=?)
  97   |  `--USE TEMP B-TREE FOR ORDER BY
  98   `--SCAN (subquery-xxxxxx)
  99 }
 100 #       ^^^^---- We want to see three SEARCH terms.  No SCAN terms.
 101 #                The ORDER BY is implemented by a separate sorter pass.
 102
 103 finish_test