From 9b853b827e8c27d5fbc89692b66f00ba32fcd8f5 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Mon, 8 Jun 2020 20:39:13 +0200
Subject: [PATCH 4/7] Index skip scan

Implementation of the basic Index Skip Scan (see Loose Index Scan in the
wiki [1]) infrastructure on top of IndexOnlyScan and IndexScan. Introduces
a new index AM function, amskip, to allow advancing past duplicate key
values in a scan.

This innocent-looking description hides some trickiness at the boundary
between the AM-specific and the common parts of the implementation, mostly
because different information (e.g. visibility) is available at each
level. As a consequence, the common parts may have to apply skipping more
than once.

The original patch and design were proposed by Thomas Munro [2], then
revived and improved by Dmitry Dolgov and Jesper Pedersen.

[1] https://wiki.postgresql.org/wiki/Loose_indexscan
[2] https://www.postgresql.org/message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com

Author: Jesper Pedersen, Dmitry Dolgov
Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan
---
 contrib/bloom/blutils.c | 1 +
 doc/src/sgml/indexam.sgml | 1 +
 src/backend/access/brin/brin.c | 1 +
 src/backend/access/gin/ginutil.c | 1 +
 src/backend/access/gist/gist.c | 1 +
 src/backend/access/hash/hash.c | 1 +
 src/backend/access/index/indexam.c | 18 ++++
 src/backend/access/spgist/spgutils.c | 1 +
 src/backend/commands/explain.c | 25 +++++
 src/backend/executor/nodeIndexonlyscan.c | 97 ++++++++++++++++++-
 src/backend/executor/nodeIndexscan.c | 56 ++++++++++-
 src/backend/nodes/copyfuncs.c | 2 +
 src/backend/nodes/outfuncs.c | 2 +
 src/backend/nodes/readfuncs.c | 2 +
 src/backend/optimizer/path/costsize.c | 1 +
 src/backend/optimizer/path/indxpath.c | 96 ++++++++++++++++++
 src/backend/optimizer/plan/createplan.c | 20 ++--
 src/backend/optimizer/util/pathnode.c | 64 ++++++++++++
 src/backend/optimizer/util/plancat.c | 1 +
 src/backend/utils/misc/guc.c | 9 ++
 src/backend/utils/misc/postgresql.conf.sample | 1 +
 src/include/access/amapi.h | 8 ++
 src/include/access/genam.h | 2 +
 src/include/access/sdir.h | 7 ++
 src/include/nodes/execnodes.h | 6 ++
 src/include/nodes/pathnodes.h | 4 +
 src/include/nodes/plannodes.h | 4 +
 src/include/optimizer/cost.h | 1 +
 src/include/optimizer/pathnode.h | 4 +
 src/test/regress/expected/sysviews.out | 3 +-
 30 files changed, 429 insertions(+), 11 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index d3bf8665df..4eec21b99e 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -134,6 +134,7 @@ blhandler(PG_FUNCTION_ARGS)
 amroutine->ambulkdelete = blbulkdelete;
 amroutine->amvacuumcleanup = blvacuumcleanup;
 amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
 amroutine->amcostestimate = blcostestimate;
 amroutine->amoptions = bloptions;
 amroutine->amproperty = NULL;
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index af87f172a7..8c552c22bd 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -150,6 +150,7 @@ typedef struct IndexAmRoutine
 amendscan_function amendscan;
 ammarkpos_function ammarkpos; /* can be NULL */
 amrestrpos_function amrestrpos; /* can be NULL */
+ amskip_function amskip; /* can be NULL */
 /* interface functions to support parallel index scans */
 amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 7db3ae5ee0..e3e09efa31 100644
--- 
a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -115,6 +115,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = brinbulkdelete; amroutine->amvacuumcleanup = brinvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = brincostestimate; amroutine->amoptions = brinoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index a400f1fedb..7e83a8ee58 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -66,6 +66,7 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = ginbulkdelete; amroutine->amvacuumcleanup = ginvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = gincostestimate; amroutine->amoptions = ginoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 79fe6eb8d6..6fe9f8f5a9 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -87,6 +87,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = gistbulkdelete; amroutine->amvacuumcleanup = gistvacuumcleanup; amroutine->amcanreturn = gistcanreturn; + amroutine->amskip = NULL; amroutine->amcostestimate = gistcostestimate; amroutine->amoptions = gistoptions; amroutine->amproperty = gistproperty; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 3ec6d528e7..85d44ee58d 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -84,6 +84,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = hashcostestimate; amroutine->amoptions = hashoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 6b9750c244..c7b70997ff 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -33,6 +33,7 @@ * index_can_return - does index support index-only scans? * index_getprocid - get a support procedure OID * index_getprocinfo - get a support procedure's lookup info + * index_skip - advance past duplicate key values in a scan * * NOTES * This file contains the index_ routines which used @@ -736,6 +737,23 @@ index_can_return(Relation indexRelation, int attno) return indexRelation->rd_indam->amcanreturn(indexRelation, attno); } +/* ---------------- + * index_skip + * + * Skip past all tuples where the first 'prefix' columns have the + * same value as the last tuple returned in the current scan. 
+ * ---------------- + */ +bool +index_skip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, bool scanstart, int prefix) +{ + SCAN_CHECKS; + + return scan->indexRelation->rd_indam->amskip(scan, direction, + indexdir, scanstart, prefix); +} + /* ---------------- * index_getprocid * diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 0efe05e552..0491ea172a 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -69,6 +69,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = spgbulkdelete; amroutine->amvacuumcleanup = spgvacuumcleanup; amroutine->amcanreturn = spgcanreturn; + amroutine->amskip = NULL; amroutine->amcostestimate = spgcostestimate; amroutine->amoptions = spgoptions; amroutine->amproperty = spgproperty; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index a283e4d45c..3c6258761a 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -148,6 +148,7 @@ static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es); static void ExplainIndentText(ExplainState *es); static void ExplainJSONLineEnding(ExplainState *es); static void ExplainYAMLLineStarting(ExplainState *es); +static void ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es); static void escape_yaml(StringInfo buf, const char *str); @@ -1096,6 +1097,22 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) return planstate_tree_walker(planstate, ExplainPreScanNode, rels_used); } +/* + * ExplainIndexSkipScanKeys - + * Append information about index skip scan to es->str. + * + * Can be used to print the skip prefix size. + */ +static void +ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es) +{ + if (skipPrefixSize > 0) + { + if (es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyInteger("Distinct Prefix", NULL, skipPrefixSize, es); + } +} + /* * ExplainNode - * Appends a description of a plan tree to es->str @@ -1433,6 +1450,8 @@ ExplainNode(PlanState *planstate, List *ancestors, { IndexScan *indexscan = (IndexScan *) plan; + ExplainIndexSkipScanKeys(indexscan->indexskipprefixsize, es); + ExplainIndexScanDetails(indexscan->indexid, indexscan->indexorderdir, es); @@ -1443,6 +1462,8 @@ ExplainNode(PlanState *planstate, List *ancestors, { IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + ExplainIndexSkipScanKeys(indexonlyscan->indexskipprefixsize, es); + ExplainIndexScanDetails(indexonlyscan->indexid, indexonlyscan->indexorderdir, es); @@ -1703,6 +1724,8 @@ ExplainNode(PlanState *planstate, List *ancestors, switch (nodeTag(plan)) { case T_IndexScan: + if (((IndexScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyBool("Skip scan", true, es); show_scan_qual(((IndexScan *) plan)->indexqualorig, "Index Cond", planstate, ancestors, es); if (((IndexScan *) plan)->indexqualorig) @@ -1716,6 +1739,8 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate, es); break; case T_IndexOnlyScan: + if (((IndexOnlyScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyBool("Skip scan", true, es); show_scan_qual(((IndexOnlyScan *) plan)->indexqual, "Index Cond", planstate, ancestors, es); if (((IndexOnlyScan *) plan)->indexqual) diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 5617ac29e7..c4e4b087a7 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -41,6 +41,7 @@ #include "miscadmin.h" #include 
"storage/bufmgr.h" #include "storage/predicate.h" +#include "storage/itemptr.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -62,9 +63,26 @@ IndexOnlyNext(IndexOnlyScanState *node) EState *estate; ExprContext *econtext; ScanDirection direction; + ScanDirection readDirection; IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; + ItemPointerData startTid; + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan; + + /* + * Tells if the current position was reached via skipping. In this case + * there is no nead for the index_getnext_tid + */ + bool skipped = false; + + /* + * Index only scan must be aware that in case of skipping we can return to + * the starting point due to visibility checks. In this situation we need + * to jump further, and number of skipping attempts tell us how far do we + * need to do so. + */ + int skipAttempts = 0; /* * extract necessary information from index scan node @@ -72,7 +90,7 @@ IndexOnlyNext(IndexOnlyScanState *node) estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(indexonlyscan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -114,16 +132,87 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); } + else + { + ItemPointerCopy(&scandesc->xs_heaptid, &startTid); + } + + /* + * Check if we need to skip to the next key prefix, because we've been + * asked to implement DISTINCT. + * + * When fetching a cursor in the direction opposite to a general scan + * direction, the result must be what normal fetching should have + * returned, but in reversed order. In other words, return the last or + * first scanned tuple in a DISTINCT set, depending on a cursor direction. + * Due to that we skip also when the first tuple wasn't emitted yet, but + * the directions are opposite. + */ + if (node->ioss_SkipPrefixSize > 0 && + (node->ioss_FirstTupleEmitted || + ScanDirectionsAreOpposite(direction, indexonlyscan->indexorderdir))) + { + if (!index_skip(scandesc, direction, indexonlyscan->indexorderdir, + !node->ioss_FirstTupleEmitted, node->ioss_SkipPrefixSize)) + { + /* + * Reached end of index. At this point currPos is invalidated, and + * we need to reset ioss_FirstTupleEmitted, since otherwise after + * going backwards, reaching the end of index, and going forward + * again we apply skip again. It would be incorrect and lead to an + * extra skipped item. + */ + node->ioss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + else + { + skipAttempts = 1; + skipped = true; + tid = &scandesc->xs_heaptid; + } + } + + readDirection = skipped ? indexonlyscan->indexorderdir : direction; /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (skipped || (tid = index_getnext_tid(scandesc, readDirection)) != NULL) { bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + /* + * While doing index only skip scan with advancing and reading in + * different directions we can return to the same position where we + * started after visibility check. Recognize such situations and skip + * more. 
+ */ + if ((readDirection != direction) && + ItemPointerIsValid(&startTid) && ItemPointerEquals(&startTid, tid)) + { + int i; + skipAttempts += 1; + + for (i = 0; i < skipAttempts; i++) + { + if (!index_skip(scandesc, direction, + indexonlyscan->indexorderdir, + !node->ioss_FirstTupleEmitted, + node->ioss_SkipPrefixSize)) + { + node->ioss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + } + + tid = &scandesc->xs_heaptid; + } + + skipped = false; + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -250,6 +339,8 @@ IndexOnlyNext(IndexOnlyScanState *node) ItemPointerGetBlockNumber(tid), estate->es_snapshot); + node->ioss_FirstTupleEmitted = true; + return slot; } @@ -504,6 +595,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan; + indexstate->ioss_SkipPrefixSize = node->indexskipprefixsize; + indexstate->ioss_FirstTupleEmitted = false; /* * Miscellaneous initialization diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index d0a96a38e0..449aaec3ac 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -85,6 +85,13 @@ IndexNext(IndexScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; + IndexScan *indexscan = (IndexScan *) node->ss.ps.plan; + + /* + * tells if the current position was reached via skipping. In this case + * there is no nead for the index_getnext_tid + */ + bool skipped = false; /* * extract necessary information from index scan node @@ -92,7 +99,7 @@ IndexNext(IndexScanState *node) estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(indexscan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -117,6 +124,12 @@ IndexNext(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Index skip scan assumes xs_want_itup, so set it to true */ + if (indexscan->indexskipprefixsize > 0) + node->iss_ScanDesc->xs_want_itup = true; + else + node->iss_ScanDesc->xs_want_itup = false; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -127,12 +140,48 @@ IndexNext(IndexScanState *node) node->iss_OrderByKeys, node->iss_NumOrderByKeys); } + /* + * Check if we need to skip to the next key prefix, because we've been + * asked to implement DISTINCT. + * + * When fetching a cursor in the direction opposite to a general scan + * direction, the result must be what normal fetching should have + * returned, but in reversed order. In other words, return the last or + * first scanned tuple in a DISTINCT set, depending on a cursor direction. + * Due to that we skip also when the first tuple wasn't emitted yet, but + * the directions are opposite. + */ + if (node->iss_SkipPrefixSize > 0 && + (node->iss_FirstTupleEmitted || + ScanDirectionsAreOpposite(direction, indexscan->indexorderdir))) + { + if (!index_skip(scandesc, direction, indexscan->indexorderdir, + !node->iss_FirstTupleEmitted, node->iss_SkipPrefixSize)) + { + /* + * Reached end of index. 
At this point currPos is invalidated, and + * we need to reset iss_FirstTupleEmitted, since otherwise after + * going backwards, reaching the end of index, and going forward + * again we apply skip again. It would be incorrect and lead to an + * extra skipped item. + */ + node->iss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + else + { + skipped = true; + index_fetch_heap(scandesc, slot); + } + } + /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while (skipped || index_getnext_slot(scandesc, direction, slot)) { CHECK_FOR_INTERRUPTS(); + skipped = false; /* * If the index was lossy, we have to recheck the index quals using @@ -149,6 +198,7 @@ IndexNext(IndexScanState *node) } } + node->iss_FirstTupleEmitted = true; return slot; } @@ -910,6 +960,8 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecIndexScan; + indexstate->iss_SkipPrefixSize = node->indexskipprefixsize; + indexstate->iss_FirstTupleEmitted = false; /* * Miscellaneous initialization diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index efe7c3ae10..236bb1435c 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -493,6 +493,7 @@ _copyIndexScan(const IndexScan *from) COPY_NODE_FIELD(indexorderbyorig); COPY_NODE_FIELD(indexorderbyops); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexskipprefixsize); return newnode; } @@ -518,6 +519,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from) COPY_NODE_FIELD(indexorderby); COPY_NODE_FIELD(indextlist); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexskipprefixsize); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index c3a9632992..88254446ce 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -562,6 +562,7 @@ _outIndexScan(StringInfo str, const IndexScan *node) WRITE_NODE_FIELD(indexorderbyorig); WRITE_NODE_FIELD(indexorderbyops); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_INT_FIELD(indexskipprefixsize); } static void @@ -576,6 +577,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node) WRITE_NODE_FIELD(indexorderby); WRITE_NODE_FIELD(indextlist); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_INT_FIELD(indexskipprefixsize); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 3a18571d0c..9766fc11ed 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -1829,6 +1829,7 @@ _readIndexScan(void) READ_NODE_FIELD(indexorderbyorig); READ_NODE_FIELD(indexorderbyops); READ_ENUM_FIELD(indexorderdir, ScanDirection); + READ_INT_FIELD(indexskipprefixsize); READ_DONE(); } @@ -1848,6 +1849,7 @@ _readIndexOnlyScan(void) READ_NODE_FIELD(indexorderby); READ_NODE_FIELD(indextlist); READ_ENUM_FIELD(indexorderdir, ScanDirection); + READ_INT_FIELD(indexskipprefixsize); READ_DONE(); } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 945aa93374..fe6ef62e8f 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -125,6 +125,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; bool enable_indexonlyscan = true; +bool enable_indexskipscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort 
= true; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 2a50272da6..340bbfcbb3 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -790,6 +790,16 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { IndexPath *ipath = (IndexPath *) lfirst(lc); + /* + * To prevent unique paths from index skip scans being potentially used + * when not needed scan keep them in a separate pathlist. + */ + if (ipath->indexskipprefix != 0) + { + add_unique_path(rel, (Path *) ipath); + continue; + } + if (index->amhasgettuple) add_path(rel, (Path *) ipath); @@ -878,6 +888,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; + bool not_empty_qual = false; + bool can_skip; int indexcol; /* @@ -1027,6 +1039,60 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, index_only_scan = (scantype != ST_BITMAPSCAN && check_index_only(rel, index)); + /* Check if an index skip scan is possible. */ + can_skip = enable_indexskipscan & index->amcanskip; + + /* + * Skip scan is not supported when there are qual conditions, which are not + * covered by index. The reason for that is that those conditions are + * evaluated later, already after skipping was applied. + * + * TODO: This implementation is too restrictive, and doesn't allow e.g. + * index expressions. For that we need to examine index_clauses too. + */ + if (can_skip && root->parse->jointree != NULL) + { + ListCell *lc; + + foreach(lc, (List *)root->parse->jointree->quals) + { + Node *expr, *qual = (Node *) lfirst(lc); + Var *var; + bool found = false; + + if (!is_opclause(qual)) + { + not_empty_qual = true; + break; + } + + expr = get_leftop(qual); + + if (!IsA(expr, Var)) + { + not_empty_qual = true; + break; + } + + var = (Var *) expr; + + for (int i = 0; i < index->ncolumns; i++) + { + if (index->indexkeys[i] == var->varattno) + { + found = true; + break; + } + } + + if (!found) + { + not_empty_qual = true; + break; + } + } + } + /* * 4. Generate an indexscan path if there are relevant restriction clauses * in the current clauses, OR the index ordering is potentially useful for @@ -1050,6 +1116,21 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, false); result = lappend(result, ipath); + /* Consider index skip scan as well */ + if (root->query_uniquekeys != NULL && can_skip && !not_empty_qual) + { + ListCell *lc; + + foreach(lc, root->query_uniquekeys) + { + UniqueKey *ukey = lfirst_node(UniqueKey, lc); + result = lappend(result, + create_skipscan_unique_path(root, index, + (Path *) ipath, + ukey->exprs)); + } + } + /* * If appropriate, consider parallel index scan. We don't allow * parallel index scan for bitmap index scans. 
@@ -1105,6 +1186,21 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, false); result = lappend(result, ipath); + /* Consider index skip scan as well */ + if (root->query_uniquekeys != NULL && can_skip && !not_empty_qual) + { + ListCell *lc; + + foreach(lc, root->query_uniquekeys) + { + UniqueKey *ukey = lfirst_node(UniqueKey, lc); + result = lappend(result, + create_skipscan_unique_path(root, index, + (Path *) ipath, + ukey->exprs)); + } + } + /* If appropriate, consider parallel index scan */ if (index->amcanparallel && rel->consider_parallel && outer_relids == NULL && diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index eb9543f6ad..5b591036fb 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -177,12 +177,14 @@ static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, List *indexorderbyops, - ScanDirection indexscandir); + ScanDirection indexscandir, + int skipprefix); static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexorderby, List *indextlist, - ScanDirection indexscandir); + ScanDirection indexscandir, + int skipprefix); static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig); @@ -3005,7 +3007,8 @@ create_indexscan_plan(PlannerInfo *root, fixed_indexquals, fixed_indexorderbys, best_path->indexinfo->indextlist, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexskipprefix); else scan_plan = (Scan *) make_indexscan(tlist, qpqual, @@ -3016,7 +3019,8 @@ create_indexscan_plan(PlannerInfo *root, fixed_indexorderbys, indexorderbys, indexorderbyops, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexskipprefix); copy_generic_path_info(&scan_plan->plan, &best_path->path); @@ -5283,7 +5287,8 @@ make_indexscan(List *qptlist, List *indexorderby, List *indexorderbyorig, List *indexorderbyops, - ScanDirection indexscandir) + ScanDirection indexscandir, + int skipPrefixSize) { IndexScan *node = makeNode(IndexScan); Plan *plan = &node->scan.plan; @@ -5300,6 +5305,7 @@ make_indexscan(List *qptlist, node->indexorderbyorig = indexorderbyorig; node->indexorderbyops = indexorderbyops; node->indexorderdir = indexscandir; + node->indexskipprefixsize = skipPrefixSize; return node; } @@ -5312,7 +5318,8 @@ make_indexonlyscan(List *qptlist, List *indexqual, List *indexorderby, List *indextlist, - ScanDirection indexscandir) + ScanDirection indexscandir, + int skipPrefixSize) { IndexOnlyScan *node = makeNode(IndexOnlyScan); Plan *plan = &node->scan.plan; @@ -5327,6 +5334,7 @@ make_indexonlyscan(List *qptlist, node->indexorderby = indexorderby; node->indextlist = indextlist; node->indexorderdir = indexscandir; + node->indexskipprefixsize = skipPrefixSize; return node; } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 60656bdca5..cac2ca0b0f 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2971,6 +2971,70 @@ create_upper_unique_path(PlannerInfo *root, return pathnode; } +/* + * create_skipscan_unique_path + * Creates a pathnode the same as an existing IndexPath except based on + * skipping duplicate values. This may or may not be cheaper than using + * create_upper_unique_path. 
+ * + * The input path must be an IndexPath for an index that supports amskip. + */ +IndexPath * +create_skipscan_unique_path(PlannerInfo *root, IndexOptInfo *index, + Path *basepath, List *unique_exprs) +{ + IndexPath *pathnode = makeNode(IndexPath); + int numDistinctRows; + int distinctPrefixKeys; + ListCell *lc; + List *exprs = NIL; + + + distinctPrefixKeys = list_length(unique_exprs); + + Assert(IsA(basepath, IndexPath)); + + /* We don't want to modify basepath, so make a copy. */ + memcpy(pathnode, basepath, sizeof(IndexPath)); + + /* + * Normally we can think about distinctPrefixKeys as just + * a number of distinct keys. But if lets say we have a + * distinct key a, and the index contains b, a in exactly + * this order. In such situation we need to use position + * of a in the index as distinctPrefixKeys, otherwise skip + * will happen only by the first column. + */ + foreach(lc, unique_exprs) + { + Expr *unique_expr = (Expr *) lfirst(lc); + Var *var = (Var *) unique_expr; + + exprs = lappend(exprs, unique_expr); + + for (int i = 0; i < index->ncolumns; i++) + { + if (index->indexkeys[i] == var->varattno) + { + distinctPrefixKeys = Max(i + 1, distinctPrefixKeys); + break; + } + } + } + + Assert(distinctPrefixKeys > 0); + pathnode->indexskipprefix = distinctPrefixKeys; + + numDistinctRows = estimate_num_groups(root, exprs, + pathnode->path.rows, + NULL); + + pathnode->path.total_cost = pathnode->path.startup_cost * numDistinctRows; + pathnode->path.rows = numDistinctRows; + + return pathnode; +} + /* * create_agg_path * Creates a pathnode that represents performing aggregation/grouping diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 0b2f9d398a..6da71e5828 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -272,6 +272,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amoptionalkey = amroutine->amoptionalkey; info->amsearcharray = amroutine->amsearcharray; info->amsearchnulls = amroutine->amsearchnulls; + info->amcanskip = (amroutine->amskip != NULL); info->amcanparallel = amroutine->amcanparallel; info->amhasgettuple = (amroutine->amgettuple != NULL); info->amhasgetbitmap = amroutine->amgetbitmap != NULL && diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 031ca0327f..56b2cecdfd 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -952,6 +952,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-skip-scan plans."), + NULL + }, + &enable_indexskipscan, + true, + NULL, NULL, NULL + }, { {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of bitmap-scan plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e430e33c7b..95def9aa34 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -355,6 +355,7 @@ #enable_hashjoin = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexskipscan = on #enable_material = on #enable_mergejoin = on #enable_nestloop = on diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 4325faa460..85cca4d520 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -130,6 +130,13 @@ typedef void (*amrescan_function) 
(IndexScanDesc scan, typedef bool (*amgettuple_function) (IndexScanDesc scan, ScanDirection direction); +/* skip past duplicates in a given prefix */ +typedef bool (*amskip_function) (IndexScanDesc scan, + ScanDirection dir, + ScanDirection indexdir, + bool start, + int prefix); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -231,6 +238,7 @@ typedef struct IndexAmRoutine amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amskip_function amskip; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 931257bd81..95ce4a5d80 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -180,6 +180,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); extern bool index_can_return(Relation indexRelation, int attno); +extern bool index_skip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, bool start, int prefix); extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum); extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum, diff --git a/src/include/access/sdir.h b/src/include/access/sdir.h index 23feb90986..094a127464 100644 --- a/src/include/access/sdir.h +++ b/src/include/access/sdir.h @@ -55,4 +55,11 @@ typedef enum ScanDirection #define ScanDirectionIsForward(direction) \ ((bool) ((direction) == ForwardScanDirection)) +/* + * ScanDirectionsAreOpposite + * True iff scan directions are backward/forward or forward/backward. + */ +#define ScanDirectionsAreOpposite(dirA, dirB) \ + ((bool) (dirA != NoMovementScanDirection && dirA == -dirB)) + #endif /* SDIR_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0187989fd1..cf1e729bec 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1429,6 +1429,8 @@ typedef struct IndexScanState ExprContext *iss_RuntimeContext; Relation iss_RelationDesc; struct IndexScanDescData *iss_ScanDesc; + int iss_SkipPrefixSize; + bool iss_FirstTupleEmitted; /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; @@ -1458,6 +1460,8 @@ typedef struct IndexScanState * TableSlot slot for holding tuples fetched from the table * VMBuffer buffer in use for visibility map testing, if any * PscanLen size of parallel index-only scan descriptor + * SkipPrefixSize number of keys for skip-based DISTINCT + * FirstTupleEmitted has the first tuple been emitted * ---------------- */ typedef struct IndexOnlyScanState @@ -1476,6 +1480,8 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; TupleTableSlot *ioss_TableSlot; Buffer ioss_VMBuffer; + int ioss_SkipPrefixSize; + bool ioss_FirstTupleEmitted; Size ioss_PscanLen; } IndexOnlyScanState; diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index a5c406bd4e..7287594582 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1229,6 +1229,9 @@ typedef struct Path * we need not recompute them when considering using the same index in a * bitmap index/heap scan (see BitmapHeapPath). 
The costs of the IndexPath * itself represent the costs of an IndexScan or IndexOnlyScan plan type. + * + * 'indexskipprefix' represents the number of columns to consider for skip + * scans. *---------- */ typedef struct IndexPath @@ -1241,6 +1244,7 @@ typedef struct IndexPath ScanDirection indexscandir; Cost indextotalcost; Selectivity indexselectivity; + int indexskipprefix; } IndexPath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 83e01074ed..efafdf9361 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -409,6 +409,8 @@ typedef struct IndexScan List *indexorderbyorig; /* the same in original form */ List *indexorderbyops; /* OIDs of sort ops for ORDER BY exprs */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int indexskipprefixsize; /* the size of the prefix for distinct + * scans */ } IndexScan; /* ---------------- @@ -436,6 +438,8 @@ typedef struct IndexOnlyScan List *indexorderby; /* list of index ORDER BY exprs */ List *indextlist; /* TargetEntry list describing index's cols */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int indexskipprefixsize; /* the size of the prefix for distinct + * scans */ } IndexOnlyScan; /* ---------------- diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 613db8eab6..c0f176eaaa 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -50,6 +50,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; extern PGDLLIMPORT bool enable_indexonlyscan; +extern PGDLLIMPORT bool enable_indexskipscan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 6796ad8cb7..705deaf7bd 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -207,6 +207,10 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root, Path *subpath, int numCols, double numGroups); +extern IndexPath *create_skipscan_unique_path(PlannerInfo *root, + IndexOptInfo *index, + Path *subpath, + List *unique_exprs); extern AggPath *create_agg_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 06c4c3e476..4da9594925 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -79,6 +79,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + enable_indexskipscan | on enable_material | on enable_mergejoin | on enable_nestloop | on @@ -90,7 +91,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(18 rows) +(20 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail -- 2.25.0
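A minimal usage sketch, assuming the rest of the patch series is applied on
top of this infrastructure patch (in particular a B-tree implementation of
amskip; in this patch alone every in-core AM still sets amskip = NULL).
Table, index and column names are made up for illustration:

    SET enable_indexskipscan = on;  -- the default; shown only for clarity

    CREATE TABLE skip_test (a int, b int);
    CREATE INDEX ON skip_test (a, b);
    INSERT INTO skip_test
        SELECT i % 10, i FROM generate_series(1, 100000) i;
    ANALYZE skip_test;

    -- With only a few distinct values of "a", the planner may now choose an
    -- Index Only Scan whose EXPLAIN output includes "Skip scan: true".
    EXPLAIN SELECT DISTINCT a FROM skip_test;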
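The executor changes in nodeIndexonlyscan.c and nodeIndexscan.c deal with
cursors that are fetched against the overall scan direction. A sketch of
that case, reusing the hypothetical skip_test table from the previous
example: fetching backwards must return the same DISTINCT values in
reverse order, neither losing a group nor returning duplicates.

    BEGIN;
    DECLARE c SCROLL CURSOR FOR SELECT DISTINCT a FROM skip_test;
    FETCH FORWARD 3 FROM c;   -- e.g. a = 0, 1, 2
    FETCH BACKWARD 2 FROM c;  -- must return a = 1, then a = 0
    COMMIT;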
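The check added to build_index_paths() refuses to build a skip-scan path
when the query has quals on columns that the index does not cover, since
such quals are evaluated only after the skipping has already happened. A
hypothetical example of a query excluded by that check:

    CREATE TABLE qual_test (a int, b int, c int);
    CREATE INDEX ON qual_test (a, b);
    -- "c" is not an index column, so no skip-scan path is generated here:
    SELECT DISTINCT a FROM qual_test WHERE c > 0;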
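The distinctPrefixKeys logic in create_skipscan_unique_path() is easier to
see with a concrete, again hypothetical, schema. If the DISTINCT key is
"a" but "a" is the second column of the index, the skip prefix must be the
position of "a" in the index (2) rather than the number of distinct keys
(1); otherwise skipping would only advance over distinct values of "b".

    CREATE TABLE prefix_test (a int, b int);
    CREATE INDEX ON prefix_test (b, a);
    SELECT DISTINCT a FROM prefix_test;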