From 55eef1f64d3cb189443607f1f05052fe02196cf0 Mon Sep 17 00:00:00 2001 From: Andrey Date: Sun, 25 Aug 2019 14:18:31 +0500 Subject: [PATCH v9 2/2] Implement GiST build using sort support --- doc/src/sgml/gist.sgml | 52 ++++++ src/backend/access/gist/gistbuild.c | 221 ++++++++++++++++++++++++- src/backend/access/gist/gistproc.c | 41 ++++- src/backend/access/gist/gistutil.c | 30 +++- src/backend/access/gist/gistvalidate.c | 6 +- src/backend/utils/sort/tuplesort.c | 37 +++++ src/include/access/gist.h | 3 +- src/include/access/gist_private.h | 3 + src/include/catalog/pg_amproc.dat | 2 + src/include/utils/tuplesort.h | 6 + 10 files changed, 385 insertions(+), 16 deletions(-) diff --git a/doc/src/sgml/gist.sgml b/doc/src/sgml/gist.sgml index 5d970ee9f2..6b1b99ff70 100644 --- a/doc/src/sgml/gist.sgml +++ b/doc/src/sgml/gist.sgml @@ -290,6 +290,8 @@ CREATE INDEX ON my_table USING GIST (my_inet_column inet_ops); compress method is omitted. The optional tenth method options is needed if the operator class provides the user-specified parameters. + compress method is omitted. The sortsupport + method is also optional and is used during fast GiST build. @@ -1096,6 +1098,56 @@ my_compress(PG_FUNCTION_ARGS) + + + sortsupport + + + Returns a comparator, suitable for fast GiST build. + + + + The SQL declaration of the function must look like this: + + +CREATE OR REPLACE FUNCTION my_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + + + The argument is a pointer to a SortSupport struct. + + + + The matching code in the C module could then follow this skeleton: + + +PG_FUNCTION_INFO_V1(my_sortsupport); + +static int +my_fastcmp(Datum x, Datum y, SortSupport ssup) +{ + /* establish order between x and y by computing some sorting value z */ + + int z1 = ComputeSpatialCode(x); + int z2 = ComputeSpatialCode(y); + + return z1 == z2 ? 0 : z1 > z2 ? 1 : -1; +} + +Datum +my_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = my_fastcmp; + PG_RETURN_VOID(); +} + + + + diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 671b5e9186..0df03c8c36 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -28,6 +28,8 @@ #include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/regproc.h" +#include "utils/tuplesort.h" /* Step of index tuples for check whether to switch to buffering build mode */ #define BUFFERING_MODE_SWITCH_CHECK_STEP 256 @@ -53,6 +55,15 @@ typedef enum GIST_BUFFERING_ACTIVE /* in buffering build mode */ } GistBufferingMode; +/* + * Status record for spooling/sorting phase. + */ +typedef struct +{ + Tuplesortstate *sortstate; /* state data for tuplesort.c */ + Relation index; +} GSpool; + /* Working state for gistbuild and its callback */ typedef struct { @@ -74,6 +85,7 @@ typedef struct HTAB *parentMap; GistBufferingMode bufferingMode; + GSpool *spool; } GISTBuildState; /* prototypes for private functions */ @@ -107,6 +119,175 @@ static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent); static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child); +static GSpool *gist_spoolinit(Relation heap, Relation index, SortSupport ssup); +static void gist_spool(GSpool *gspool, ItemPointer self, Datum *values, bool *isnull); +static void gist_indexsortbuild(GISTBuildState *state); +static void gist_indexsortbuild_flush(Relation rel, Page page, BlockNumber blockno, bool isNew); + +/* + * create and initialize a spool structure to sort tuples + */ +static GSpool * +gist_spoolinit(Relation heap, Relation index, SortSupport ssup) +{ + GSpool *gspool = (GSpool *) palloc0(sizeof(GSpool)); + + gspool->index = index; + + /* + * We size the sort area as maintenance_work_mem rather than work_mem to + * speed index creation. This should be OK since a single backend can't + * run multiple index creations in parallel. + */ + gspool->sortstate = tuplesort_begin_index_gist(heap, + index, + ssup, + maintenance_work_mem, + NULL, + false); + + return gspool; +} + +/* + * Flush page contents to actual page at blockno + * We have expected block number, because GiST build relies on that pages + * will be allocated in continous segments. This simplifies allocation + * logic. + */ +static void +gist_indexsortbuild_flush(Relation rel, Page page, BlockNumber blockno, bool isNew) +{ + OffsetNumber i, + maxoff; + Page newpage; + + Buffer buffer = ReadBuffer(rel, isNew ? P_NEW : blockno); + GISTInitBuffer(buffer, GistPageIsLeaf(page) ? F_LEAF : 0); + /* If the page is new - check that it was allocated correctly */ + Assert(BufferGetBlockNumber(buffer) == blockno); + + LockBuffer(buffer, GIST_EXCLUSIVE); + newpage = BufferGetPage(buffer); + + maxoff = PageGetMaxOffsetNumber(page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId itemid = PageGetItemId(page, i); + IndexTuple tuple = (IndexTuple) PageGetItem(page, itemid); + gistfillbuffer(newpage, &tuple, + 1, InvalidOffsetNumber); + } + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); +} + +/* + * given a spool loaded by successive calls to _h_spool, + * create an entire index. + */ +static void +gist_indexsortbuild(GISTBuildState *state) +{ + GSpool *gspool = state->spool; + /* + * Build constructs GiST by levels. Level is always allocated + * sequentially from level_start until level_end. + */ + BlockNumber level_start = GIST_ROOT_BLKNO + 1; + BlockNumber level_end = level_start; + BlockNumber prev_level_start; + IndexTuple itup; + /* + * We keep one page in memory for the special case + * When layer will have only on page - we will place it + * to ROOT_BLOCK_NO + */ + Page page = palloc(BLCKSZ); + gistinitpage(page, F_LEAF); + + tuplesort_performsort(gspool->sortstate); + + /* Create a first layer of leaf pages */ + while ((itup = tuplesort_getindextuple(gspool->sortstate, true)) != NULL) + { + if (PageGetFreeSpace(page) >= IndexTupleSize(itup) + sizeof(ItemIdData) + state->freespace) + { + gistfillbuffer(page, &itup, 1, InvalidOffsetNumber); + } + else + { + gist_indexsortbuild_flush(state->indexrel, page, level_end, true); + level_end++; + gistinitpage(page, F_LEAF); + gistfillbuffer(page, &itup, 1, InvalidOffsetNumber); + } + } + + /* Construct internal levels */ + do + { + /* If previous level had only one page - that page is a root */ + if (level_start == level_end) + { + gist_indexsortbuild_flush(state->indexrel, page, GIST_ROOT_BLKNO, + false); + return; + } + + gist_indexsortbuild_flush(state->indexrel, page, level_end, true); + level_end++; + gistinitpage(page, 0); + + prev_level_start = level_start; + level_start = level_end; + + for (BlockNumber i = prev_level_start; i < level_start; i++) + { + /* For each page on previous level we form one tuple */ + Buffer lower_buffer = ReadBuffer(state->indexrel, i); + Page lower_page; + IndexTuple union_tuple; + IndexTuple *itvec; + MemoryContext oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt); + int vect_len; + LockBuffer(lower_buffer, GIST_SHARE); + lower_page = BufferGetPage(lower_buffer); + + itvec = gistextractpage(lower_page, &vect_len); + union_tuple = gistunion(state->indexrel, itvec, vect_len, + state->giststate); + ItemPointerSetBlockNumber(&(union_tuple->t_tid), i); + + if (PageGetFreeSpace(page) >= IndexTupleSize(union_tuple) + sizeof(ItemIdData) + state->freespace) + { + gistfillbuffer(page, &union_tuple, 1, InvalidOffsetNumber); + } + else + { + gist_indexsortbuild_flush(state->indexrel, page, level_end, true); + level_end++; + gistinitpage(page, 0); + gistfillbuffer(page, &union_tuple, 1, InvalidOffsetNumber); + } + + UnlockReleaseBuffer(lower_buffer); + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(state->giststate->tempCxt); + } + } while (true); +} + +/* + * spool an index entry into the sort file. + */ +static void +gist_spool(GSpool *gspool, ItemPointer self, Datum *values, bool *isnull) +{ + tuplesort_putindextuplevalues(gspool->sortstate, gspool->index, + self, values, isnull); +} + /* * Main entry point to GiST index build. Initially calls insert over and over, * but switches to more efficient buffering build algorithm after a certain @@ -121,11 +302,15 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) Buffer buffer; Page page; MemoryContext oldcxt = CurrentMemoryContext; - int fillfactor; + int fillfactor, i; + Oid SortSupportFnOids[INDEX_MAX_KEYS]; + bool hasallsortsupports = true; + int keyscount = IndexRelationGetNumberOfKeyAttributes(index); buildstate.indexrel = index; buildstate.heaprel = heap; + buildstate.spool = NULL; if (index->rd_options) { /* Get buffering mode from the options string */ @@ -152,6 +337,24 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) /* Calculate target amount of free space to leave on pages */ buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; + for (i = 0; i < keyscount; i++) + { + SortSupportFnOids[i] = index_getprocid(index, i + 1, GIST_SORTSUPPORT_PROC); + if (!OidIsValid(SortSupportFnOids[i])) + { + hasallsortsupports = false; + break; + } + } + if (hasallsortsupports) + { + SortSupport sort = palloc0(sizeof(SortSupportData) * keyscount); + for (i = 0; i < keyscount; i++) + OidFunctionCall1(SortSupportFnOids[i], PointerGetDatum(sort + i)); + buildstate.bufferingMode = GIST_BUFFERING_DISABLED; + buildstate.spool = gist_spoolinit(heap, index, sort); + } + /* * We expect to be called exactly once for any index relation. If that's * not the case, big trouble's what we have. @@ -208,6 +411,12 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) gistFreeBuildBuffers(buildstate.gfbb); } + if (buildstate.spool) + { + gist_indexsortbuild(&buildstate); + tuplesort_end(buildstate.spool->sortstate); + } + /* okay, all heap tuples are indexed */ MemoryContextSwitchTo(oldcxt); MemoryContextDelete(buildstate.giststate->tempCxt); @@ -449,14 +658,20 @@ gistBuildCallback(Relation index, GISTBuildState *buildstate = (GISTBuildState *) state; IndexTuple itup; MemoryContext oldCtx; + Datum compressed_values[INDEX_MAX_KEYS]; oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); /* form an index tuple and point it at the heap tuple */ - itup = gistFormTuple(buildstate->giststate, index, values, isnull, true); + itup = gistCompressValuesAndFormTuple(buildstate->giststate, index, values, isnull, true, compressed_values); itup->t_tid = *tid; - if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE) + if (buildstate->spool) + { + if (tupleIsAlive) + gist_spool(buildstate->spool, (ItemPointer)itup, compressed_values, isnull); + } + else if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE) { /* We have buffers, so use them. */ gistBufferingBuildInsert(buildstate, itup); diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c index a28ca4a0c4..994c797ce0 100644 --- a/src/backend/access/gist/gistproc.c +++ b/src/backend/access/gist/gistproc.c @@ -31,6 +31,11 @@ static bool gist_box_leaf_consistent(BOX *key, BOX *query, StrategyNumber strategy); static bool rtree_internal_consistent(BOX *key, BOX *query, StrategyNumber strategy); +static int64 part_bits32_by2(uint32 x); +static int64 interleave_bits32(uint32 x, uint32 y); +static inline uint64 point_zorder_internal(Point *p); +static int gist_bbox_fastcmp(Datum x, Datum y, SortSupport ssup); + /* Minimum accepted ratio of split */ #define LIMIT_RATIO 0.3 @@ -1542,6 +1547,8 @@ gist_poly_distance(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(distance); } +/* Z-order routines */ +/* Interleave 32 bits with zeroes */ static int64 part_bits32_by2(uint32 x) { @@ -1556,29 +1563,55 @@ part_bits32_by2(uint32 x) return n; } +/* + * Compute Z-order for integers. Also called Morton code. + */ static int64 interleave_bits32(uint32 x, uint32 y) { return part_bits32_by2(x) | (part_bits32_by2(y) << 1); } +/* Compute Z-order for Point */ static inline uint64 point_zorder_internal(Point *p) { + /* + * In this function we need to compute Morton codes for non-integral + * components p->x and p->y. But Morton codes are defined only for + * integral values. + * We expect floats to be in IEEE format, and the sort order of IEEE + * floats is mostly correlated to the binary sort order of the bits + * reinterpreted as an int. It isn't in some special cases, but for this + * use case we don't really care about that, we're just trying to + * encourage locality. + * There is a big jump in integer value (whether signed or + * unsigned) as you cross from positive to negative floats, and then the + * sort order is reversed. This can have negative effect on searches when + * query window touches many quadrants simulatously. In worst case this + * seaches can be x4 more costly. + * We generate a Morton code that interleaves the bits of N integers + * to produce a single integer that preserves locality: things that were + * close in the N dimensional space are close in the resulting integer. + */ union { float f; uint32 i; } a,b; a.f = p->x; b.f = p->y; + if (isnan(a.f)) + a.i = PG_INT32_MAX; + if (isnan(b.f)) + b.i = PG_INT32_MAX; return interleave_bits32(a.i, b.i); } static int -gist_point_fastcmp(Datum x, Datum y, SortSupport ssup) +gist_bbox_fastcmp(Datum x, Datum y, SortSupport ssup) { - Point *p1 = DatumGetPointP(x); - Point *p2 = DatumGetPointP(y); + Point *p1 = &(DatumGetBoxP(x)->low); + Point *p2 = &(DatumGetBoxP(y)->low); uint64 z1 = point_zorder_internal(p1); uint64 z2 = point_zorder_internal(p2); @@ -1590,6 +1623,6 @@ gist_point_sortsupport(PG_FUNCTION_ARGS) { SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); - ssup->comparator = gist_point_fastcmp; + ssup->comparator = gist_bbox_fastcmp; PG_RETURN_VOID(); } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 765329bbcd..d63146ac28 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -173,6 +173,7 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, datum = index_getattr(itvec[j], i + 1, giststate->leafTupdesc, &IsNull); + if (IsNull) continue; @@ -575,6 +576,13 @@ gistFormTuple(GISTSTATE *giststate, Relation r, Datum attdata[], bool isnull[], bool isleaf) { Datum compatt[INDEX_MAX_KEYS]; + return gistCompressValuesAndFormTuple(giststate, r, attdata, isnull, isleaf, compatt); +} + +IndexTuple +gistCompressValuesAndFormTuple(GISTSTATE *giststate, Relation r, + Datum attdata[], bool isnull[], bool isleaf, Datum compatt[]) +{ int i; IndexTuple res; @@ -745,14 +753,10 @@ gistpenalty(GISTSTATE *giststate, int attno, * Initialize a new index page */ void -GISTInitBuffer(Buffer b, uint32 f) +gistinitpage(Page page, uint32 f) { - GISTPageOpaque opaque; - Page page; - Size pageSize; - - pageSize = BufferGetPageSize(b); - page = BufferGetPage(b); + GISTPageOpaque opaque; + Size pageSize = BLCKSZ; PageInit(page, pageSize, sizeof(GISTPageOpaqueData)); opaque = GistPageGetOpaque(page); @@ -763,6 +767,18 @@ GISTInitBuffer(Buffer b, uint32 f) opaque->gist_page_id = GIST_PAGE_ID; } +/* + * Initialize a new index buffer + */ +void +GISTInitBuffer(Buffer b, uint32 f) +{ + Page page; + + page = BufferGetPage(b); + gistinitpage(page, f); +} + /* * Verify that a freshly-read page looks sane. */ diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c index a285736a81..1630b505b0 100644 --- a/src/backend/access/gist/gistvalidate.c +++ b/src/backend/access/gist/gistvalidate.c @@ -143,6 +143,10 @@ gistvalidate(Oid opclassoid) case GIST_OPTIONS_PROC: ok = check_amoptsproc_signature(procform->amproc); break; + case GIST_SORTSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; default: ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -263,7 +267,7 @@ gistvalidate(Oid opclassoid) continue; /* got it */ if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC || i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC || - i == GIST_OPTIONS_PROC) + i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC) continue; /* optional methods */ ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 3c49476483..c5c7e5af05 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -1167,6 +1167,43 @@ tuplesort_begin_index_hash(Relation heapRel, return state; } +Tuplesortstate * +tuplesort_begin_index_gist(Relation heapRel, + Relation indexRel, + SortSupport ssup, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->sortKeys = ssup; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + Tuplesortstate * tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, diff --git a/src/include/access/gist.h b/src/include/access/gist.h index 4994351697..4f6dae9a76 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -37,7 +37,8 @@ #define GIST_DISTANCE_PROC 8 #define GIST_FETCH_PROC 9 #define GIST_OPTIONS_PROC 10 -#define GISTNProcs 10 +#define GIST_SORTSUPPORT_PROC 11 +#define GISTNProcs 11 /* * Page opaque data in a GiST index page. diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 4bfc628000..5cd9dd60bf 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -497,12 +497,15 @@ extern IndexTuple gistgetadjusted(Relation r, GISTSTATE *giststate); extern IndexTuple gistFormTuple(GISTSTATE *giststate, Relation r, Datum *attdata, bool *isnull, bool isleaf); +extern IndexTuple gistCompressValuesAndFormTuple(GISTSTATE *giststate, Relation r, + Datum attdata[], bool isnull[], bool isleaf, Datum compatt[]); extern OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate); extern void GISTInitBuffer(Buffer b, uint32 f); +extern void gistinitpage(Page page, uint32 f); extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, Datum k, Relation r, Page pg, OffsetNumber o, bool l, bool isNull); diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 37b580883f..a8e0c4ff8a 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -480,6 +480,8 @@ amproc => 'gist_point_distance' }, { amprocfamily => 'gist/point_ops', amproclefttype => 'point', amprocrighttype => 'point', amprocnum => '9', amproc => 'gist_point_fetch' }, +{ amprocfamily => 'gist/point_ops', amproclefttype => 'point', + amprocrighttype => 'point', amprocnum => '11', amproc => 'gist_point_sortsupport' }, { amprocfamily => 'gist/box_ops', amproclefttype => 'box', amprocrighttype => 'box', amprocnum => '1', amproc => 'gist_box_consistent' }, { amprocfamily => 'gist/box_ops', amproclefttype => 'box', diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index d992b4875a..2ecf1f0d81 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -25,6 +25,7 @@ #include "executor/tuptable.h" #include "storage/dsm.h" #include "utils/relcache.h" +#include "utils/sortsupport.h" /* @@ -217,6 +218,11 @@ extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, uint32 max_buckets, int workMem, SortCoordinate coordinate, bool randomAccess); +extern Tuplesortstate *tuplesort_begin_index_gist(Relation heapRel, + Relation indexRel, + SortSupport ssup, + int workMem, SortCoordinate coordinate, + bool randomAccess); extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, -- 2.24.2 (Apple Git-127)