From 68c39a3e9b12fefe1071bad3858005cdaab93374 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Mon, 8 Jun 2020 20:39:13 +0200 Subject: [PATCH v37 4/6] Index skip scan Implementation of basic Index Skip Scan (see Loose Index Scan in the wiki [1]) infrastructure on top of IndexOnlyScan and IndexScan. Introduces a new index am function amskip to allow advance past duplicate key values in a scan. This innocently looking description could be a bit tricky on the edge between am specific and common parts of the implementation, mostly due to different information available at each level, e.g. visibility. This means the common parts should apply skipping multiple times if necessary. Original patch and design were proposed by Thomas Munro [2], revived and improved by Dmitry Dolgov and Jesper Pedersen. [1] https://wiki.postgresql.org/wiki/Loose_indexscan [2] /message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com Author: Jesper Pedersen, Dmitry Dolgov Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan --- contrib/bloom/blutils.c | 1 + doc/src/sgml/indexam.sgml | 1 + src/backend/access/brin/brin.c | 1 + src/backend/access/gin/ginutil.c | 1 + src/backend/access/gist/gist.c | 1 + src/backend/access/hash/hash.c | 1 + src/backend/access/index/indexam.c | 18 ++ src/backend/access/spgist/spgutils.c | 1 + src/backend/commands/explain.c | 25 +++ src/backend/executor/nodeIndexonlyscan.c | 97 ++++++++- src/backend/executor/nodeIndexscan.c | 56 +++++- src/backend/nodes/copyfuncs.c | 2 + src/backend/nodes/outfuncs.c | 2 + src/backend/nodes/readfuncs.c | 2 + src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/path/indxpath.c | 187 +++++++++++++++++- src/backend/optimizer/path/pathkeys.c | 54 ++++- src/backend/optimizer/plan/createplan.c | 20 +- src/backend/optimizer/util/pathnode.c | 37 ++++ src/backend/optimizer/util/plancat.c | 1 + src/backend/utils/misc/guc.c | 9 
+ src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/amapi.h | 7 + src/include/access/genam.h | 2 + src/include/access/sdir.h | 7 + src/include/nodes/execnodes.h | 6 + src/include/nodes/pathnodes.h | 4 + src/include/nodes/plannodes.h | 4 + src/include/optimizer/cost.h | 1 + src/include/optimizer/pathnode.h | 4 + src/include/optimizer/paths.h | 5 +- src/test/regress/expected/sysviews.out | 3 +- 32 files changed, 545 insertions(+), 17 deletions(-) diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 26b9927c3a..9f41527bdb 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -134,6 +134,7 @@ blhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = blbulkdelete; amroutine->amvacuumcleanup = blvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = blcostestimate; amroutine->amoptions = bloptions; amroutine->amproperty = NULL; diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 649020b7da..1988192fbe 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -151,6 +151,7 @@ typedef struct IndexAmRoutine amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amskip_function amskip; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 1f72562c60..622316ce40 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -115,6 +115,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = brinbulkdelete; amroutine->amvacuumcleanup = brinvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = brincostestimate; amroutine->amoptions = brinoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/gin/ginutil.c 
b/src/backend/access/gin/ginutil.c index ef9b56fd36..a32301e19e 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -66,6 +66,7 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = ginbulkdelete; amroutine->amvacuumcleanup = ginvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = gincostestimate; amroutine->amoptions = ginoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 25b42e38f2..e909409187 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -87,6 +87,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = gistbulkdelete; amroutine->amvacuumcleanup = gistvacuumcleanup; amroutine->amcanreturn = gistcanreturn; + amroutine->amskip = NULL; amroutine->amcostestimate = gistcostestimate; amroutine->amoptions = gistoptions; amroutine->amproperty = gistproperty; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 7c9ccf446c..d81a3de74b 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -84,6 +84,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; amroutine->amcostestimate = hashcostestimate; amroutine->amoptions = hashoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 3fb8688f8f..8e0f1b0140 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -33,6 +33,7 @@ * index_can_return - does index support index-only scans? 
* index_getprocid - get a support procedure OID * index_getprocinfo - get a support procedure's lookup info + * index_skip - advance past duplicate key values in a scan * * NOTES * This file contains the index_ routines which used @@ -737,6 +738,23 @@ index_can_return(Relation indexRelation, int attno) return indexRelation->rd_indam->amcanreturn(indexRelation, attno); } +/* ---------------- + * index_skip + * + * Skip past all tuples where the first 'prefix' columns have the + * same value as the last tuple returned in the current scan. + * ---------------- + */ +bool +index_skip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, bool scanstart, int prefix) +{ + SCAN_CHECKS; + + return scan->indexRelation->rd_indam->amskip(scan, direction, + indexdir, prefix); +} + /* ---------------- * index_getprocid * diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 64d3ba8288..70a610a030 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -69,6 +69,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = spgbulkdelete; amroutine->amvacuumcleanup = spgvacuumcleanup; amroutine->amcanreturn = spgcanreturn; + amroutine->amskip = NULL; amroutine->amcostestimate = spgcostestimate; amroutine->amoptions = spgoptions; amroutine->amproperty = spgproperty; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 43f9b01e83..aa960d9ffb 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -149,6 +149,7 @@ static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es); static void ExplainIndentText(ExplainState *es); static void ExplainJSONLineEnding(ExplainState *es); static void ExplainYAMLLineStarting(ExplainState *es); +static void ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es); static void escape_yaml(StringInfo buf, const char *str); @@ -1081,6 +1082,22 @@ ExplainPreScanNode(PlanState 
*planstate, Bitmapset **rels_used) return planstate_tree_walker(planstate, ExplainPreScanNode, rels_used); } +/* + * ExplainIndexSkipScanKeys - + * Append information about index skip scan to es->str. + * + * Can be used to print the skip prefix size. + */ +static void +ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es) +{ + if (skipPrefixSize > 0) + { + if (es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyInteger("Distinct Prefix", NULL, skipPrefixSize, es); + } +} + /* * ExplainNode - * Appends a description of a plan tree to es->str @@ -1418,6 +1435,8 @@ ExplainNode(PlanState *planstate, List *ancestors, { IndexScan *indexscan = (IndexScan *) plan; + ExplainIndexSkipScanKeys(indexscan->indexskipprefixsize, es); + ExplainIndexScanDetails(indexscan->indexid, indexscan->indexorderdir, es); @@ -1428,6 +1447,8 @@ ExplainNode(PlanState *planstate, List *ancestors, { IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + ExplainIndexSkipScanKeys(indexonlyscan->indexskipprefixsize, es); + ExplainIndexScanDetails(indexonlyscan->indexid, indexonlyscan->indexorderdir, es); @@ -1688,6 +1709,8 @@ ExplainNode(PlanState *planstate, List *ancestors, switch (nodeTag(plan)) { case T_IndexScan: + if (((IndexScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyBool("Skip scan", true, es); show_scan_qual(((IndexScan *) plan)->indexqualorig, "Index Cond", planstate, ancestors, es); if (((IndexScan *) plan)->indexqualorig) @@ -1701,6 +1724,8 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate, es); break; case T_IndexOnlyScan: + if (((IndexOnlyScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyBool("Skip scan", true, es); show_scan_qual(((IndexOnlyScan *) plan)->indexqual, "Index Cond", planstate, ancestors, es); if (((IndexOnlyScan *) plan)->indexqual) diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 5617ac29e7..670bf1a15e 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ 
b/src/backend/executor/nodeIndexonlyscan.c @@ -41,6 +41,7 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/predicate.h" +#include "storage/itemptr.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -62,9 +63,26 @@ IndexOnlyNext(IndexOnlyScanState *node) EState *estate; ExprContext *econtext; ScanDirection direction; + ScanDirection readDirection; IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; + ItemPointerData startTid; + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan; + + /* + * Tells if the current position was reached via skipping. In this case + * there is no need for the index_getnext_tid + */ + bool skipped = false; + + /* + * Index only scan must be aware that in case of skipping we can return to + * the starting point due to visibility checks. In this situation we need + * to jump further, and number of skipping attempts tell us how far do we + * need to do so. + */ + int skipAttempts = 0; /* * extract necessary information from index scan node @@ -72,7 +90,7 @@ IndexOnlyNext(IndexOnlyScanState *node) estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(indexonlyscan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -114,16 +132,87 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); } + else + { + ItemPointerCopy(&scandesc->xs_heaptid, &startTid); + } + + /* + * Check if we need to skip to the next key prefix, because we've been + * asked to implement DISTINCT. + * + * When fetching a cursor in the direction opposite to a general scan + * direction, the result must be what normal fetching should have + * returned, but in reversed order. 
In other words, return the last or + * first scanned tuple in a DISTINCT set, depending on a cursor direction. + * Due to that we skip also when the first tuple wasn't emitted yet, but + * the directions are opposite. + */ + if (node->ioss_SkipPrefixSize > 0 && + (node->ioss_FirstTupleEmitted || + ScanDirectionsAreOpposite(direction, indexonlyscan->indexorderdir))) + { + if (!index_skip(scandesc, direction, indexonlyscan->indexorderdir, + !node->ioss_FirstTupleEmitted, node->ioss_SkipPrefixSize)) + { + /* + * Reached end of index. At this point currPos is invalidated, and + * we need to reset ioss_FirstTupleEmitted, since otherwise after + * going backwards, reaching the end of index, and going forward + * again we apply skip again. It would be incorrect and lead to an + * extra skipped item. + */ + node->ioss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + else + { + skipAttempts = 1; + skipped = true; + tid = &scandesc->xs_heaptid; + } + } + + readDirection = skipped ? indexonlyscan->indexorderdir : direction; /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (skipped || (tid = index_getnext_tid(scandesc, readDirection)) != NULL) { bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + /* + * If we already emitted first tuple, while doing index only skip scan + * with advancing and reading in different directions we can return to + * the same position where we started after visibility check. Recognize + * such situations and skip more. 
+ */ + if ((readDirection != direction) && node->ioss_FirstTupleEmitted && + ItemPointerIsValid(&startTid) && ItemPointerEquals(&startTid, tid)) + { + int i; + skipAttempts += 1; + + for (i = 0; i < skipAttempts; i++) + { + if (!index_skip(scandesc, direction, + indexonlyscan->indexorderdir, + !node->ioss_FirstTupleEmitted, + node->ioss_SkipPrefixSize)) + { + node->ioss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + } + + tid = &scandesc->xs_heaptid; + } + + skipped = false; + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -250,6 +339,8 @@ IndexOnlyNext(IndexOnlyScanState *node) ItemPointerGetBlockNumber(tid), estate->es_snapshot); + node->ioss_FirstTupleEmitted = true; + return slot; } @@ -504,6 +595,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan; + indexstate->ioss_SkipPrefixSize = node->indexskipprefixsize; + indexstate->ioss_FirstTupleEmitted = false; /* * Miscellaneous initialization diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index d0a96a38e0..449aaec3ac 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -85,6 +85,13 @@ IndexNext(IndexScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; + IndexScan *indexscan = (IndexScan *) node->ss.ps.plan; + + /* + * tells if the current position was reached via skipping. 
In this case + * there is no nead for the index_getnext_tid + */ + bool skipped = false; /* * extract necessary information from index scan node @@ -92,7 +99,7 @@ IndexNext(IndexScanState *node) estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(indexscan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -117,6 +124,12 @@ IndexNext(IndexScanState *node) node->iss_ScanDesc = scandesc; + /* Index skip scan assumes xs_want_itup, so set it to true */ + if (indexscan->indexskipprefixsize > 0) + node->iss_ScanDesc->xs_want_itup = true; + else + node->iss_ScanDesc->xs_want_itup = false; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -127,12 +140,48 @@ IndexNext(IndexScanState *node) node->iss_OrderByKeys, node->iss_NumOrderByKeys); } + /* + * Check if we need to skip to the next key prefix, because we've been + * asked to implement DISTINCT. + * + * When fetching a cursor in the direction opposite to a general scan + * direction, the result must be what normal fetching should have + * returned, but in reversed order. In other words, return the last or + * first scanned tuple in a DISTINCT set, depending on a cursor direction. + * Due to that we skip also when the first tuple wasn't emitted yet, but + * the directions are opposite. + */ + if (node->iss_SkipPrefixSize > 0 && + (node->iss_FirstTupleEmitted || + ScanDirectionsAreOpposite(direction, indexscan->indexorderdir))) + { + if (!index_skip(scandesc, direction, indexscan->indexorderdir, + !node->iss_FirstTupleEmitted, node->iss_SkipPrefixSize)) + { + /* + * Reached end of index. 
At this point currPos is invalidated, and + * we need to reset iss_FirstTupleEmitted, since otherwise after + * going backwards, reaching the end of index, and going forward + * again we apply skip again. It would be incorrect and lead to an + * extra skipped item. + */ + node->iss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + else + { + skipped = true; + index_fetch_heap(scandesc, slot); + } + } + /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while (skipped || index_getnext_slot(scandesc, direction, slot)) { CHECK_FOR_INTERRUPTS(); + skipped = false; /* * If the index was lossy, we have to recheck the index quals using @@ -149,6 +198,7 @@ IndexNext(IndexScanState *node) } } + node->iss_FirstTupleEmitted = true; return slot; } @@ -910,6 +960,8 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecIndexScan; + indexstate->iss_SkipPrefixSize = node->indexskipprefixsize; + indexstate->iss_FirstTupleEmitted = false; /* * Miscellaneous initialization diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 34392f5553..5bd1e1ae5d 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -490,6 +490,7 @@ _copyIndexScan(const IndexScan *from) COPY_NODE_FIELD(indexorderbyorig); COPY_NODE_FIELD(indexorderbyops); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexskipprefixsize); return newnode; } @@ -515,6 +516,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from) COPY_NODE_FIELD(indexorderby); COPY_NODE_FIELD(indextlist); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexskipprefixsize); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 53cf4fcfa1..282a00089d 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -559,6 +559,7 
@@ _outIndexScan(StringInfo str, const IndexScan *node) WRITE_NODE_FIELD(indexorderbyorig); WRITE_NODE_FIELD(indexorderbyops); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_INT_FIELD(indexskipprefixsize); } static void @@ -573,6 +574,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node) WRITE_NODE_FIELD(indexorderby); WRITE_NODE_FIELD(indextlist); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_INT_FIELD(indexskipprefixsize); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 7b9e8c3292..58fb93aada 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -1826,6 +1826,7 @@ _readIndexScan(void) READ_NODE_FIELD(indexorderbyorig); READ_NODE_FIELD(indexorderbyops); READ_ENUM_FIELD(indexorderdir, ScanDirection); + READ_INT_FIELD(indexskipprefixsize); READ_DONE(); } @@ -1845,6 +1846,7 @@ _readIndexOnlyScan(void) READ_NODE_FIELD(indexorderby); READ_NODE_FIELD(indextlist); READ_ENUM_FIELD(indexorderdir, ScanDirection); + READ_INT_FIELD(indexskipprefixsize); READ_DONE(); } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 733f7ea543..cf31c42552 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -132,6 +132,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; bool enable_indexonlyscan = true; +bool enable_indexskipscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index bcb1bc6097..27c945413b 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -780,6 +780,16 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { IndexPath *ipath = (IndexPath *) lfirst(lc); + /* + * To prevent unique paths from index skip scans being potentially used + * when not needed 
keep them in a separate pathlist. + */ + if (ipath->indexskipprefix != 0) + { + add_unique_path(rel, (Path *) ipath); + continue; + } + if (index->amhasgettuple) add_path(rel, (Path *) ipath); @@ -862,12 +872,15 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, double loop_count; List *orderbyclauses; List *orderbyclausecols; - List *index_pathkeys; + List *index_pathkeys = NIL; List *useful_pathkeys; + List *index_pathkeys_pos = NIL; bool found_lower_saop_clause; bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; + bool not_empty_qual = false; + bool can_skip; int indexcol; /* @@ -985,7 +998,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, if (index_is_ordered && pathkeys_possibly_useful) { index_pathkeys = build_index_pathkeys(root, index, - ForwardScanDirection); + ForwardScanDirection, + &index_pathkeys_pos); useful_pathkeys = truncate_useless_pathkeys(root, rel, index_pathkeys); orderbyclauses = NIL; @@ -1017,6 +1031,120 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, index_only_scan = (scantype != ST_BITMAPSCAN && check_index_only(rel, index)); + /* Check if an index skip scan is possible. */ + can_skip = enable_indexskipscan && index->amcanskip; + + if (can_skip) + { + /* + * Skip scan is not supported when there are qual conditions, which are not + * covered by index. The reason for that is that those conditions are + * evaluated later, already after skipping was applied. + * + * TODO: This implementation is too restrictive, and doesn't allow e.g. + * index expressions. For that we need to examine index_clauses too. 
+ */ + if (root->parse->jointree != NULL) + { + ListCell *lc; + + foreach(lc, (List *)root->parse->jointree->quals) + { + Node *expr, *qual = (Node *) lfirst(lc); + Var *var; + bool found = false; + + if (!is_opclause(qual)) + { + not_empty_qual = true; + break; + } + + expr = get_leftop(qual); + + if (!IsA(expr, Var)) + { + not_empty_qual = true; + break; + } + + var = (Var *) expr; + + for (int i = 0; i < index->ncolumns; i++) + { + if (index->indexkeys[i] == var->varattno) + { + found = true; + break; + } + } + + if (!found) + { + not_empty_qual = true; + break; + } + } + } + + /* + * For an index scan verify that index fully covers distinct + * expressions, otherwise there is not enough information for skipping + */ + if (!index_only_scan && root->query_uniquekeys != NULL) + { + ListCell *lc; + + foreach(lc, root->query_uniquekeys) + { + UniqueKey *uniqueKey = (UniqueKey *) lfirst(lc); + ListCell *lc1; + + foreach(lc1, uniqueKey->exprs) + { + Expr *expr = (Expr *) lfirst(lc1); + bool found = false; + + if (!IsA(expr, Var)) + { + ListCell *lc2; + + foreach(lc2, index->indexprs) + { + if(equal(lfirst(lc1), lfirst(lc2))) + { + found = true; + break; + } + } + } + else + { + Var *var = (Var *) expr; + + for (int i = 0; i < index->ncolumns; i++) + { + if (index->indexkeys[i] == var->varattno) + { + found = true; + break; + } + } + } + + if (!found) + { + can_skip = false; + break; + } + } + + if (!can_skip) + break; + } + } + } + /* * 4. 
Generate an indexscan path if there are relevant restriction clauses * in the current clauses, OR the index ordering is potentially useful for @@ -1040,6 +1168,32 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, false); result = lappend(result, ipath); + /* Consider index skip scan as well */ + if (root->query_uniquekeys != NULL && can_skip && !not_empty_qual) + { + int numusefulkeys = list_length(useful_pathkeys); + int numsortkeys = list_length(root->query_pathkeys); + + if (numusefulkeys == numsortkeys) + { + int prefix; + if (list_length(root->distinct_pathkeys) > 0) + prefix = find_index_prefix_for_pathkey(index_pathkeys, + index_pathkeys_pos, + llast_node(PathKey, + root->distinct_pathkeys)); + else + /* all distinct keys are constant and optimized away. + * skipping with 1 is sufficient as all are constant anyway + */ + prefix = 1; + + result = lappend(result, + create_skipscan_unique_path(root, index, + (Path *) ipath, prefix)); + } + } + /* * If appropriate, consider parallel index scan. We don't allow * parallel index scan for bitmap index scans. 
@@ -1078,7 +1232,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, if (index_is_ordered && pathkeys_possibly_useful) { index_pathkeys = build_index_pathkeys(root, index, - BackwardScanDirection); + BackwardScanDirection, + &index_pathkeys_pos); useful_pathkeys = truncate_useless_pathkeys(root, rel, index_pathkeys); if (useful_pathkeys != NIL) @@ -1095,6 +1250,32 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, false); result = lappend(result, ipath); + /* Consider index skip scan as well */ + if (root->query_uniquekeys != NULL && can_skip && !not_empty_qual) + { + int numusefulkeys = list_length(useful_pathkeys); + int numsortkeys = list_length(root->query_pathkeys); + + if (numusefulkeys == numsortkeys) + { + int prefix; + if (list_length(root->distinct_pathkeys) > 0) + prefix = find_index_prefix_for_pathkey(index_pathkeys, + index_pathkeys_pos, + llast_node(PathKey, + root->distinct_pathkeys)); + else + /* all distinct keys are constant and optimized away. + * skipping with 1 is sufficient as all are constant anyway + */ + prefix = 1; + + result = lappend(result, + create_skipscan_unique_path(root, index, + (Path *) ipath, prefix)); + } + } + /* If appropriate, consider parallel index scan */ if (index->amcanparallel && rel->consider_parallel && outer_relids == NULL && diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 97e1c2de83..61c0ea98e7 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -522,6 +522,47 @@ get_cheapest_parallel_safe_total_inner(List *paths) * NEW PATHKEY FORMATION ****************************************************************************/ +/* + * Find the prefix size for a specific path key in an index. For example, an + * index with (a,b,c) finding path key b will return prefix 2. 
Optionally + * pathkeys_positions can be provided, to specify at which position in the + * original pathkey list this particular key could be found (this is helpful + * when we deal with redundant pathkeys). + * + * Returns 0 when not found. + */ +int +find_index_prefix_for_pathkey(List *index_pathkeys, + List *pathkeys_positions, + PathKey *target_pathkey) +{ + ListCell *lc; + int i; + + i = 0; + foreach(lc, index_pathkeys) + { + PathKey *cpathkey = (PathKey *) lfirst(lc); + + if (cpathkey == target_pathkey) + { + /* + * Prefix expected to start from 1, increment positions since + * they're 0 based. + */ + if (pathkeys_positions != NIL && + pathkeys_positions->length > i) + return list_nth_int(pathkeys_positions, i) + 1; + else + return i + 1; + } + + i++; + } + + return 0; +} + /* * build_index_pathkeys * Build a pathkeys list that describes the ordering induced by an index @@ -534,7 +575,9 @@ get_cheapest_parallel_safe_total_inner(List *paths) * We iterate only key columns of covering indexes, since non-key columns * don't influence index ordering. The result is canonical, meaning that * redundant pathkeys are removed; it may therefore have fewer entries than - * there are key columns in the index. + * there are key columns in the index. Since by removing redundant pathkeys the + * information about original position is lost, return it via positions + * argument. * * Another reason for stopping early is that we may be able to tell that * an index column's sort order is uninteresting for this query. 
However, @@ -545,7 +588,8 @@ get_cheapest_parallel_safe_total_inner(List *paths) List * build_index_pathkeys(PlannerInfo *root, IndexOptInfo *index, - ScanDirection scandir) + ScanDirection scandir, + List **positions) { List *retval = NIL; ListCell *lc; @@ -554,6 +598,8 @@ build_index_pathkeys(PlannerInfo *root, if (index->sortopfamily == NULL) return NIL; /* non-orderable index */ + *positions = NIL; + i = 0; foreach(lc, index->indextlist) { @@ -607,7 +653,11 @@ build_index_pathkeys(PlannerInfo *root, * for this query. Add it to list, unless it's redundant. */ if (!pathkey_is_redundant(cpathkey, retval)) + { retval = lappend(retval, cpathkey); + *positions = lappend_int(*positions, + foreach_current_index(lc)); + } } else { diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 94280a730c..d62e96c24c 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -177,12 +177,14 @@ static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, List *indexorderbyops, - ScanDirection indexscandir); + ScanDirection indexscandir, + int skipprefix); static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexorderby, List *indextlist, - ScanDirection indexscandir); + ScanDirection indexscandir, + int skipprefix); static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig); @@ -2987,7 +2989,8 @@ create_indexscan_plan(PlannerInfo *root, fixed_indexquals, fixed_indexorderbys, best_path->indexinfo->indextlist, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexskipprefix); else scan_plan = (Scan *) make_indexscan(tlist, qpqual, @@ -2998,7 +3001,8 @@ create_indexscan_plan(PlannerInfo *root, fixed_indexorderbys, indexorderbys, 
indexorderbyops, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexskipprefix); copy_generic_path_info(&scan_plan->plan, &best_path->path); @@ -5265,7 +5269,8 @@ make_indexscan(List *qptlist, List *indexorderby, List *indexorderbyorig, List *indexorderbyops, - ScanDirection indexscandir) + ScanDirection indexscandir, + int skipPrefixSize) { IndexScan *node = makeNode(IndexScan); Plan *plan = &node->scan.plan; @@ -5282,6 +5287,7 @@ make_indexscan(List *qptlist, node->indexorderbyorig = indexorderbyorig; node->indexorderbyops = indexorderbyops; node->indexorderdir = indexscandir; + node->indexskipprefixsize = skipPrefixSize; return node; } @@ -5294,7 +5300,8 @@ make_indexonlyscan(List *qptlist, List *indexqual, List *indexorderby, List *indextlist, - ScanDirection indexscandir) + ScanDirection indexscandir, + int skipPrefixSize) { IndexOnlyScan *node = makeNode(IndexOnlyScan); Plan *plan = &node->scan.plan; @@ -5309,6 +5316,7 @@ make_indexonlyscan(List *qptlist, node->indexorderby = indexorderby; node->indextlist = indextlist; node->indexorderdir = indexscandir; + node->indexskipprefixsize = skipPrefixSize; return node; } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index abcfe4d9d7..0e682920f6 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3004,6 +3004,43 @@ create_upper_unique_path(PlannerInfo *root, return pathnode; } +/* + * create_skipscan_unique_path + * Creates a pathnode the same as an existing IndexPath except based on + * skipping duplicate values. This may or may not be cheaper than using + * create_upper_unique_path. + * + * The input path must be an IndexPath for an index that supports amskip. 
+ */ +IndexPath * +create_skipscan_unique_path(PlannerInfo *root, IndexOptInfo *index, + Path *basepath, int prefix) +{ + IndexPath *pathnode = makeNode(IndexPath); + int numDistinctRows; + UniqueKey *ukey; + + Assert(IsA(basepath, IndexPath)); + + /* We don't want to modify basepath, so make a copy. */ + memcpy(pathnode, basepath, sizeof(IndexPath)); + + ukey = linitial_node(UniqueKey, root->query_uniquekeys); + + Assert(prefix > 0); + pathnode->indexskipprefix = prefix; + pathnode->path.uniquekeys = root->query_uniquekeys; + + numDistinctRows = estimate_num_groups(root, ukey->exprs, + pathnode->path.rows, + NULL); + + pathnode->path.total_cost = pathnode->path.startup_cost * numDistinctRows; + pathnode->path.rows = numDistinctRows; + + return pathnode; +} + /* * create_agg_path * Creates a pathnode that represents performing aggregation/grouping diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index bdd4876a84..6fd82ea6fd 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -272,6 +272,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amoptionalkey = amroutine->amoptionalkey; info->amsearcharray = amroutine->amsearcharray; info->amsearchnulls = amroutine->amsearchnulls; + info->amcanskip = (amroutine->amskip != NULL); info->amcanparallel = amroutine->amcanparallel; info->amhasgettuple = (amroutine->amgettuple != NULL); info->amhasgetbitmap = amroutine->amgetbitmap != NULL && diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a62d64eaa4..005de5fb56 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -957,6 +957,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-skip-scan plans."), + NULL + }, + &enable_indexskipscan, + true, + NULL, NULL, NULL + }, 
{ {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of bitmap-scan plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 9cb571f7cc..d5d7b7834d 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -359,6 +359,7 @@ #enable_hashjoin = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexskipscan = on #enable_material = on #enable_mergejoin = on #enable_nestloop = on diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 85b4766016..9f0e81ec94 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -172,6 +172,12 @@ typedef void (*amrescan_function) (IndexScanDesc scan, typedef bool (*amgettuple_function) (IndexScanDesc scan, ScanDirection direction); +/* skip past duplicates in a given prefix */ +typedef bool (*amskip_function) (IndexScanDesc scan, + ScanDirection dir, + ScanDirection indexdir, + int prefix); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -274,6 +280,7 @@ typedef struct IndexAmRoutine amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amskip_function amskip; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 68d90f5141..f3c5668dd0 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -180,6 +180,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); extern bool index_can_return(Relation indexRelation, int attno); +extern bool index_skip(IndexScanDesc scan, ScanDirection 
direction, + ScanDirection indexdir, bool start, int prefix); extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum); extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum, diff --git a/src/include/access/sdir.h b/src/include/access/sdir.h index 23feb90986..094a127464 100644 --- a/src/include/access/sdir.h +++ b/src/include/access/sdir.h @@ -55,4 +55,11 @@ typedef enum ScanDirection #define ScanDirectionIsForward(direction) \ ((bool) ((direction) == ForwardScanDirection)) +/* + * ScanDirectionsAreOpposite + * True iff scan directions are backward/forward or forward/backward. + */ +#define ScanDirectionsAreOpposite(dirA, dirB) \ + ((bool) ((dirA) != NoMovementScanDirection && (dirA) == -(dirB))) + #endif /* SDIR_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 6c0a7d68d6..0b0464c0b1 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1425,6 +1425,8 @@ typedef struct IndexScanState ExprContext *iss_RuntimeContext; Relation iss_RelationDesc; struct IndexScanDescData *iss_ScanDesc; + int iss_SkipPrefixSize; + bool iss_FirstTupleEmitted; /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; @@ -1454,6 +1456,8 @@ typedef struct IndexScanState * TableSlot slot for holding tuples fetched from the table * VMBuffer buffer in use for visibility map testing, if any * PscanLen size of parallel index-only scan descriptor + * SkipPrefixSize number of keys for skip-based DISTINCT + * FirstTupleEmitted has the first tuple been emitted * ---------------- */ typedef struct IndexOnlyScanState @@ -1472,6 +1476,8 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; TupleTableSlot *ioss_TableSlot; Buffer ioss_VMBuffer; + int ioss_SkipPrefixSize; + bool ioss_FirstTupleEmitted; Size ioss_PscanLen; } IndexOnlyScanState; diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 4a4ae3e926..bbab87411f
100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1228,6 +1228,9 @@ typedef struct Path * we need not recompute them when considering using the same index in a * bitmap index/heap scan (see BitmapHeapPath). The costs of the IndexPath * itself represent the costs of an IndexScan or IndexOnlyScan plan type. + * + * 'indexskipprefix' represents the number of columns to consider for skip + * scans. *---------- */ typedef struct IndexPath @@ -1240,6 +1243,7 @@ typedef struct IndexPath ScanDirection indexscandir; Cost indextotalcost; Selectivity indexselectivity; + int indexskipprefix; } IndexPath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 7e6b10f86b..aed2ceacd0 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -401,6 +401,8 @@ typedef struct IndexScan List *indexorderbyorig; /* the same in original form */ List *indexorderbyops; /* OIDs of sort ops for ORDER BY exprs */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int indexskipprefixsize; /* the size of the prefix for distinct + * scans */ } IndexScan; /* ---------------- @@ -428,6 +430,8 @@ typedef struct IndexOnlyScan List *indexorderby; /* list of index ORDER BY exprs */ List *indextlist; /* TargetEntry list describing index's cols */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int indexskipprefixsize; /* the size of the prefix for distinct + * scans */ } IndexOnlyScan; /* ---------------- diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 6141654e47..67192a9188 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -50,6 +50,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; extern PGDLLIMPORT bool enable_indexonlyscan; +extern PGDLLIMPORT bool enable_indexskipscan; extern PGDLLIMPORT bool enable_bitmapscan; extern 
PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 6796ad8cb7..8ec1780a56 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -207,6 +207,10 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root, Path *subpath, int numCols, double numGroups); +extern IndexPath *create_skipscan_unique_path(PlannerInfo *root, + IndexOptInfo *index, + Path *subpath, + int prefix); extern AggPath *create_agg_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 0cb8030e33..6ebcb0e0a4 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -198,8 +198,11 @@ extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths, Relids required_outer, double fraction); extern Path *get_cheapest_parallel_safe_total_inner(List *paths); +extern int find_index_prefix_for_pathkey(List *index_pathkeys, + List *pathkey_positions, + PathKey *target_pathkey); extern List *build_index_pathkeys(PlannerInfo *root, IndexOptInfo *index, - ScanDirection scandir); + ScanDirection scandir, List **positions); extern List *build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel, ScanDirection scandir, bool *partialkeys); extern List *build_expression_pathkey(PlannerInfo *root, Expr *expr, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 81bdacf59d..5cbc2c49cf 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -95,6 +95,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + enable_indexskipscan | on enable_material | on enable_mergejoin | on enable_nestloop | on @@ -106,7 +107,7 @@ select name, setting from pg_settings where name like 'enable%'; 
enable_seqscan | on enable_sort | on enable_tidscan | on -(18 rows) +(19 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail -- 2.21.0