diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index 0818196..02da20c 100644
*** a/doc/src/sgml/indices.sgml
--- b/doc/src/sgml/indices.sgml
***************
*** 1309,1314 **** SELECT target FROM tests WHERE subject = 'some-subject' AND success;
--- 1309,1321 ----
      such cases and allow index-only scans to be generated, but older versions
      will not.
+ 
+ 
+     In addition, in some cases the index-only scan technique may be used internally to optimize queries that include OFFSET.
+     That is applicable to queries executed using an index scan, not using GROUP BY,
+     and where all the data values required by a WHERE predicate are available in the index.
+     The index type must support index-only scans as well.
+ 
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 14b0b89..9a2d214 100644
*** a/src/backend/executor/execParallel.c
--- b/src/backend/executor/execParallel.c
***************
*** 1303,1310 **** ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
  	pwcxt.seg = seg;
  	ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt);
  
! 	/* Pass down any tuple bound */
! 	ExecSetTupleBound(fpes->tuples_needed, queryDesc->planstate);
  
  	/*
  	 * Run the plan.  If we specified a tuple bound, be careful not to demand
--- 1303,1310 ----
  	pwcxt.seg = seg;
  	ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt);
  
! 	/* Pass down any tuple bound.  Offset cannot be optimized due to parallel execution. */
! 	ExecSetTupleBound(fpes->tuples_needed, 0, queryDesc->planstate);
  
  	/*
  	 * Run the plan.  If we specified a tuple bound, be careful not to demand
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 43a27a9..e6504e3 100644
*** a/src/backend/executor/execProcnode.c
--- b/src/backend/executor/execProcnode.c
***************
*** 783,793 **** ExecShutdownNode(PlanState *node)
  /*
   * ExecSetTupleBound
   *
!  *		Set a tuple bound for a planstate node.  This lets child plan nodes
   *		optimize based on the knowledge that the maximum number of tuples that
!  *		their parent will demand is limited.  The tuple bound for a node may
!  *		only be changed between scans (i.e., after node initialization or just
!  *		before an ExecReScan call).
   *
   * Any negative tuples_needed value means "no limit", which should be the
   * default assumption when this is not called at all for a particular node.
--- 783,794 ----
  /*
   * ExecSetTupleBound
   *
!  *		Set a tuple bound for a planstate node.  This lets child plan nodes
   *		optimize based on the knowledge that the maximum number of tuples that
!  *		their parent will demand is limited.  Also tuples_to_skip may be used by
!  *		child nodes to optimize retrieval of tuples which are immediately skipped
!  *		by the parent (nodeLimit).  The tuple bound for a node may only be changed
!  *		between scans (i.e., after node initialization or just before an ExecReScan call).
   *
   * Any negative tuples_needed value means "no limit", which should be the
   * default assumption when this is not called at all for a particular node.
***************
*** 797,803 **** ExecShutdownNode(PlanState *node)
   * only unchanging conditions are tested here.
   */
  void
! ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
  {
  	/*
  	 * Since this function recurses, in principle we should check stack depth
--- 798,804 ----
   * only unchanging conditions are tested here.
   */
  void
! ExecSetTupleBound(int64 tuples_needed, int64 tuples_to_skip, PlanState *child_node)
  {
  	/*
  	 * Since this function recurses, in principle we should check stack depth
***************
*** 839,845 **** ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
  		int			i;
  
  		for (i = 0; i < maState->ms_nplans; i++)
! 			ExecSetTupleBound(tuples_needed, maState->mergeplans[i]);
  	}
  	else if (IsA(child_node, ResultState))
  	{
--- 840,846 ----
  		int			i;
  
  		for (i = 0; i < maState->ms_nplans; i++)
! 			ExecSetTupleBound(tuples_needed, 0, maState->mergeplans[i]);
  	}
  	else if (IsA(child_node, ResultState))
  	{
***************
*** 853,859 **** ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
  		 * rows will be demanded from the Result child anyway.
  		 */
  		if (outerPlanState(child_node))
! 			ExecSetTupleBound(tuples_needed, outerPlanState(child_node));
  	}
  	else if (IsA(child_node, SubqueryScanState))
  	{
--- 854,860 ----
  		 * rows will be demanded from the Result child anyway.
  		 */
  		if (outerPlanState(child_node))
! 			ExecSetTupleBound(tuples_needed, 0, outerPlanState(child_node));
  	}
  	else if (IsA(child_node, SubqueryScanState))
  	{
***************
*** 864,870 **** ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
  		SubqueryScanState *subqueryState = (SubqueryScanState *) child_node;
  
  		if (subqueryState->ss.ps.qual == NULL)
! 			ExecSetTupleBound(tuples_needed, subqueryState->subplan);
  	}
  	else if (IsA(child_node, GatherState))
  	{
--- 865,871 ----
  		SubqueryScanState *subqueryState = (SubqueryScanState *) child_node;
  
  		if (subqueryState->ss.ps.qual == NULL)
! 			ExecSetTupleBound(tuples_needed, tuples_to_skip, subqueryState->subplan);
  	}
  	else if (IsA(child_node, GatherState))
  	{
***************
*** 881,887 **** ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
  		gstate->tuples_needed = tuples_needed;
  
  		/* Also pass down the bound to our own copy of the child plan */
! 		ExecSetTupleBound(tuples_needed, outerPlanState(child_node));
  	}
  	else if (IsA(child_node, GatherMergeState))
  	{
--- 882,888 ----
  		gstate->tuples_needed = tuples_needed;
  
  		/* Also pass down the bound to our own copy of the child plan */
! 		ExecSetTupleBound(tuples_needed, 0, outerPlanState(child_node));
  	}
  	else if (IsA(child_node, GatherMergeState))
  	{
***************
*** 890,896 **** ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
  
  		gstate->tuples_needed = tuples_needed;
  
! 		ExecSetTupleBound(tuples_needed, outerPlanState(child_node));
  	}
  
  	/*
--- 891,905 ----
  
  		gstate->tuples_needed = tuples_needed;
  
! 		ExecSetTupleBound(tuples_needed, 0, outerPlanState(child_node));
! 	}
! 	else if (IsA(child_node, IndexScanState))
! 	{
! 		IndexScanState* isState = (IndexScanState *) child_node;
! 
! 		/* A simple IndexScan can use the index-only optimization while skipping the offset. */
! 		if (!isState->ss.ps.qual && !isState->ss.ps.ps_ProjInfo && isState->iss_NumOrderByKeys == 0)
! 			isState->iss_SkipTuples = isState->iss_SkipTuplesRemaining = tuples_to_skip;
  	}
  
  	/*
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 01c9de8..60bd19d 100644
*** a/src/backend/executor/nodeIndexscan.c
--- b/src/backend/executor/nodeIndexscan.c
***************
*** 31,41 ****
--- 31,43 ----
  
  #include "access/nbtree.h"
  #include "access/relscan.h"
+ #include "access/visibilitymap.h"
  #include "catalog/pg_am.h"
  #include "executor/execdebug.h"
  #include "executor/nodeIndexscan.h"
  #include "lib/pairingheap.h"
  #include "miscadmin.h"
+ #include "storage/predicate.h"
  #include "nodes/nodeFuncs.h"
  #include "optimizer/clauses.h"
  #include "utils/array.h"
***************
*** 86,91 **** IndexNext(IndexScanState *node)
--- 88,94 ----
  	IndexScanDesc scandesc;
  	HeapTuple	tuple;
  	TupleTableSlot *slot;
+ 	ItemPointer tid;
  
  	/*
  	 * extract necessary information from index scan node
***************
*** 118,123 **** IndexNext(IndexScanState *node)
--- 121,135 ----
  
  		node->iss_ScanDesc = scandesc;
  
+ 		if (node->iss_SkipTuples != 0)
+ 		{
+ 			/* The nodeLimit optimization is allowed only for simple scans.  See ExecSetTupleBound for details. */
+ 			Assert((!node->ss.ps.qual && !node->ss.ps.ps_ProjInfo && node->iss_NumOrderByKeys == 0));
+ 
+ 			/* Set it up for index-only scan if we are going to use it for skipped tuples. */
+ 			node->iss_VMBuffer = InvalidBuffer;
+ 		}
+ 
  		/*
  		 * If no run-time keys to calculate or they are ready, go ahead and
  		 * pass the scankeys to the index AM.
***************
*** 128,171 **** IndexNext(IndexScanState *node)
  								 node->iss_OrderByKeys, node->iss_NumOrderByKeys);
  	}
  
! 	/*
! 	 * ok, now that we have what we need, fetch the next tuple.
! 	 */
! 	while ((tuple = index_getnext(scandesc, direction)) != NULL)
  	{
  		CHECK_FOR_INTERRUPTS();
  
  		/*
! 		 * Store the scanned tuple in the scan tuple slot of the scan state.
! 		 * Note: we pass 'false' because tuples returned by amgetnext are
! 		 * pointers onto disk pages and must not be pfree()'d.
! 		 */
! 		ExecStoreTuple(tuple,	/* tuple to store */
! 					   slot,	/* slot to store in */
! 					   scandesc->xs_cbuf,	/* buffer containing tuple */
! 					   false);	/* don't pfree */
! 
! 		/*
! 		 * If the index was lossy, we have to recheck the index quals using
! 		 * the fetched tuple.
! 		 */
! 		if (scandesc->xs_recheck)
  		{
! 			econtext->ecxt_scantuple = slot;
! 			if (!ExecQualAndReset(node->indexqualorig, econtext))
  			{
! 				/* Fails recheck, so drop it and loop back for another */
! 				InstrCountFiltered2(node, 1);
! 				continue;
  			}
  		}
  
  		return slot;
  	}
  
  	/*
  	 * if we get here it means the index scan failed so we are at the end of
! 	 * the scan..
  	 */
  	node->iss_ReachedEnd = true;
  	return ExecClearTuple(slot);
--- 140,244 ----
  								 node->iss_OrderByKeys, node->iss_NumOrderByKeys);
  	}
  
! 	/*
! 	 * Use the visibility buffer while tuples are skipped by the parent nodeLimit
! 	 * in the case of a simple scan.  Refer to nodeIndexonlyscan.c for comments
! 	 * about memory ordering and concurrency.
! 	 */
! 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
  	{
  		CHECK_FOR_INTERRUPTS();
  
  		/*
! 		 * Fetch the next tuple.  Use the visibility map if possible.
! 		 * xs_want_itup is set to false because we do not need any index data.
! 		 */
! 		if (node->iss_SkipTuplesRemaining == 0 ||	/* tuples are not skipped by parent node */
! 			scandesc->xs_recheck ||					/* or heap data is required */
! 			scandesc->xs_continue_hot ||			/* or non-MVCC snapshot */
! 			!VM_ALL_VISIBLE(scandesc->heapRelation,
! 							ItemPointerGetBlockNumber(tid),
! 							&node->iss_VMBuffer)	/* or not all tuples are visible in page */
! 			)
  		{
! 			tuple = index_fetch_heap(scandesc);
! 			if (tuple == NULL)
! 				continue;		/* no visible tuple, try next index entry */
! 
! 			/*
! 			 * Store the scanned tuple in the scan tuple slot of the scan state.
! 			 * Note: we pass 'false' because tuples returned by amgetnext are
! 			 * pointers onto disk pages and must not be pfree()'d.
! 			 */
! 			ExecStoreTuple(tuple,	/* tuple to store */
! 						   slot,	/* slot to store in */
! 						   scandesc->xs_cbuf,	/* buffer containing tuple */
! 						   false);	/* don't pfree */
! 
! 			/*
! 			 * If the index was lossy, we have to recheck the index quals.
! 			 */
! 			if (scandesc->xs_recheck)
  			{
! 				econtext->ecxt_scantuple = slot;
! 				if (!ExecQualAndReset(node->indexqualorig, econtext))
! 				{
! 					/* Fails recheck, so drop it and loop back for another */
! 					InstrCountFiltered2(node, 1);
! 					continue;
! 				}
  			}
+ 
+ 			/*
+ 			 * Note: at this point we are holding a pin on the heap page, as
+ 			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
+ 			 * but it's not clear whether it's a win to do so.  The next index
+ 			 * entry might require a visit to the same heap page.
+ 			 */
+ 		}
+ 		else	/* tuple is skipped by parent node and visible */
+ 		{
+ 			/*
+ 			 * Predicate locks for index-only scans must be acquired at the page
+ 			 * level when the heap is not accessed, since tuple-level predicate
+ 			 * locks need the tuple's xmin value.  If we had to visit the tuple
+ 			 * anyway, then we already have the tuple-level lock and can skip the
+ 			 * page lock.
+ 			 */
+ 			PredicateLockPage(scandesc->heapRelation,
+ 							  ItemPointerGetBlockNumber(tid),
+ 							  estate->es_snapshot);
+ 			/*
+ 			 * We know there is a tuple in the index passing all checks.
+ 			 * The parent nodeLimit will skip it anyway, so just prepare a fake non-empty slot.
+ 			 */
+ 			ExecClearTuple(slot);
+ 			slot->tts_isempty = false;
+ 			slot->tts_nvalid = 0;
  		}
  
+ 		/*
+ 		 * Decrement the counter of remaining skipped tuples.
+ 		 * If the last tuple was skipped, release the buffer.
+ 		 */
+ 		if (node->iss_SkipTuplesRemaining > 0)
+ 			node->iss_SkipTuplesRemaining--;
+ 
+ 		if (node->iss_SkipTuplesRemaining == 0 && node->iss_VMBuffer != InvalidBuffer)
+ 		{
+ 			/*
+ 			 * Once we have to return one more tuple, the regular index scan path will be used,
+ 			 * so we can unpin the VM buffer.
+ 			 */
+ 			ReleaseBuffer(node->iss_VMBuffer);
+ 			node->iss_VMBuffer = InvalidBuffer;
+ 		}
  		return slot;
  	}
  
  	/*
  	 * if we get here it means the index scan failed so we are at the end of
! 	 * the scan.
  	 */
  	node->iss_ReachedEnd = true;
  	return ExecClearTuple(slot);
***************
*** 604,609 **** ExecReScanIndexScan(IndexScanState *node)
--- 677,683 ----
  					 node->iss_ScanKeys, node->iss_NumScanKeys,
  					 node->iss_OrderByKeys, node->iss_NumOrderByKeys);
  	node->iss_ReachedEnd = false;
+ 	node->iss_SkipTuplesRemaining = node->iss_SkipTuples;	/* Reset the counter so skipped tuples are skipped again. */
  
  	ExecScanReScan(&node->ss);
  }
***************
*** 813,818 **** ExecEndIndexScan(IndexScanState *node)
--- 887,899 ----
  	indexScanDesc = node->iss_ScanDesc;
  	relation = node->ss.ss_currentRelation;
  
+ 	/* Release the VM buffer pin, if any. */
+ 	if (node->iss_VMBuffer != InvalidBuffer)
+ 	{
+ 		ReleaseBuffer(node->iss_VMBuffer);
+ 		node->iss_VMBuffer = InvalidBuffer;
+ 	}
+ 
  	/*
  	 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
  	 */
diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c
index 56d98b4..2a896df 100644
*** a/src/backend/executor/nodeLimit.c
--- b/src/backend/executor/nodeLimit.c
***************
*** 298,309 **** recompute_limits(LimitState *node)
  	node->lstate = LIMIT_RESCAN;
  
  	/*
! 	 * Notify child node about limit.  Note: think not to "optimize" by
! 	 * skipping ExecSetTupleBound if compute_tuples_needed returns < 0.  We
! 	 * must update the child node anyway, in case this is a rescan and the
  	 * previous time we got a different result.
  	 */
! 	ExecSetTupleBound(compute_tuples_needed(node), outerPlanState(node));
  }
  
  /*
--- 298,309 ----
  	node->lstate = LIMIT_RESCAN;
  
  	/*
! 	 * Notify child node about limit and offset.  Note: think not to "optimize"
! 	 * by skipping ExecSetTupleBound if compute_tuples_needed < 0 and offset = 0.
! 	 * We must update the child node anyway, in case this is a rescan and the
  	 * previous time we got a different result.
  	 */
! 	ExecSetTupleBound(compute_tuples_needed(node), node->offset, outerPlanState(node));
  }
  
  /*
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 45a077a..58d9447 100644
*** a/src/include/executor/executor.h
--- b/src/include/executor/executor.h
***************
*** 220,226 **** extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function);
  extern Node *MultiExecProcNode(PlanState *node);
  extern void ExecEndNode(PlanState *node);
  extern bool ExecShutdownNode(PlanState *node);
! extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node);
  
  
  /* ----------------------------------------------------------------
--- 220,226 ----
  extern Node *MultiExecProcNode(PlanState *node);
  extern void ExecEndNode(PlanState *node);
  extern bool ExecShutdownNode(PlanState *node);
! extern void ExecSetTupleBound(int64 tuples_needed, int64 tuples_to_skip, PlanState *child_node);
  
  
  /* ----------------------------------------------------------------
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index a953820..875011e 100644
*** a/src/include/nodes/execnodes.h
--- b/src/include/nodes/execnodes.h
***************
*** 1228,1233 **** typedef struct IndexScanState
--- 1228,1236 ----
  	bool	   *iss_OrderByTypByVals;
  	int16	   *iss_OrderByTypLens;
  	Size		iss_PscanLen;
+ 	int64		iss_SkipTuples;				/* tuple offset, see ExecSetTupleBound */
+ 	int64		iss_SkipTuplesRemaining;	/* counter of tuples remaining to skip */
+ 	Buffer		iss_VMBuffer;				/* visibility-map buffer, used when iss_SkipTuples > 0 */
  } IndexScanState;
  
  /* ----------------