diff --git a/contrib/pageinspect/expected/hash.out b/contrib/pageinspect/expected/hash.out index 3ba01f6..c97b279 100644 --- a/contrib/pageinspect/expected/hash.out +++ b/contrib/pageinspect/expected/hash.out @@ -51,13 +51,13 @@ bsize | 8152 bmsize | 4096 bmshift | 15 maxbucket | 3 -highmask | 7 -lowmask | 3 -ovflpoint | 2 +highmask | 3 +lowmask | 1 +ovflpoint | 3 firstfree | 0 nmaps | 1 procid | 450 -spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +spares | {0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} mapp | {5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, diff --git a/contrib/pageinspect/hashfuncs.c b/contrib/pageinspect/hashfuncs.c index 812a03f..3149029 100644 --- a/contrib/pageinspect/hashfuncs.c +++ b/contrib/pageinspect/hashfuncs.c @@ -502,10 +502,11 @@ hash_metapage_info(PG_FUNCTION_ARGS) TupleDesc tupleDesc; HeapTuple tuple; int i, - j; + j, + p; Datum values[16]; bool nulls[16]; - Datum spares[HASH_MAX_SPLITPOINTS]; + Datum spares[HASH_MAX_SPLITPOINTS * HASH_SPLITPOINT_PHASES]; Datum mapp[HASH_MAX_BITMAPS]; if (!superuser()) @@ -541,9 +542,11 @@ hash_metapage_info(PG_FUNCTION_ARGS) values[j++] = ObjectIdGetDatum((Oid) metad->hashm_procid); for (i = 0; i < HASH_MAX_SPLITPOINTS; i++) - spares[i] = Int64GetDatum((int64) metad->hashm_spares[i]); + for (p = 0; p < HASH_SPLITPOINT_PHASES; p++) + spares[(i * HASH_SPLITPOINT_PHASES) + p] = + Int64GetDatum((int64) metad->hashm_spares[i][p]); values[j++] = PointerGetDatum(construct_array(spares, - HASH_MAX_SPLITPOINTS, + HASH_MAX_SPLITPOINTS * HASH_SPLITPOINT_PHASES, INT8OID, 8, FLOAT8PASSBYVAL, 'd')); diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index 9f41bb2..f19066f 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -667,11 +667,11 @@ bmshift | 15 maxbucket | 12512 highmask | 16383 lowmask | 8191 -ovflpoint | 14 +ovflpoint | 50 firstfree | 1204 nmaps | 1 procid | 450 -spares | {0,0,0,0,0,0,1,1,1,1,1,4,59,704,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +spares | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,4,4,4,45,55,58,59,508,567,628,704,1193,1202,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} mapp | {65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README index 1541438..8789805 100644 --- a/src/backend/access/hash/README +++ b/src/backend/access/hash/README @@ -58,35 +58,50 @@ rules to support a variable number of overflow pages while not having to move primary bucket pages around after they are created. Primary bucket pages (henceforth just "bucket pages") are allocated in -power-of-2 groups, called "split points" in the code. 
Buckets 0 and 1
-are created when the index is initialized.  At the first split, buckets 2
-and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
-when bucket 8 is needed, buckets 8-15 are allocated; etc.  All the bucket
-pages of a power-of-2 group appear consecutively in the index.  This
-addressing scheme allows the physical location of a bucket page to be
-computed from the bucket number relatively easily, using only a small
-amount of control information.  We take the log2() of the bucket number
-to determine which split point S the bucket belongs to, and then simply
-add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
-metapage) to compute the physical address.  hashm_spares[S] can be
-interpreted as the total number of overflow pages that have been allocated
-before the bucket pages of splitpoint S.  hashm_spares[0] is always 0,
-so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
-block numbers 1 and 2, just after the meta page.  We always have
+power-of-2 groups, called "split points" in the code.  Each new split point
+doubles the existing number of buckets, and allocating an entire doubling's
+worth of bucket pages at once would be wasteful: it could take a long time
+to consume them all.  To avoid this sudden growth of the index size, we
+break the allocation of buckets at each split point (from now on called a
+"splitpoint group") into 4 equal phases.  If 2^x is the total number of
+buckets to be allocated in a splitpoint group, then we allocate one quarter
+of them, 2^(x - 2) buckets, in each phase of the group.  The next quarter
+is allocated only once the buckets of the previous phase have been
+consumed.  Since fewer than 4 buckets cannot be divided into multiple
+phases, splitpoint group 0 is allocated as {1, 1, 1, 1} = 4 buckets in
+total, where the numbers in curly braces indicate the number of buckets
+allocated within each phase of the group.  Splitpoint group 1 likewise
+allocates {1, 1, 1, 1}, for 8 buckets in total; group 2 allocates
+{2, 2, 2, 2}, for 16 in total; and group 3 allocates {4, 4, 4, 4}, for 32
+in total.  So each splitpoint group doubles the total number of buckets of
+the previous group, but does so incrementally.  The bucket pages allocated
+within one phase of a splitpoint group appear consecutively in the index.
+This addressing scheme allows the physical location of a bucket page to be
+computed from the bucket number relatively easily, using only a small
+amount of control information.  As the function _hash_spareindex shows, for
+a given bucket number we first compute the splitpoint group it belongs to,
+and then the phase within that group to which the bucket belongs.  Adding
+the two gives the global splitpoint phase number S of the bucket, and we
+then simply add "hashm_spares[S] + 1" (where hashm_spares[] is an array
+stored in the metapage) to the bucket number to compute its physical
+address.  hashm_spares[S] can be interpreted as the total number of
+overflow pages that have been allocated before the bucket pages of
+splitpoint phase S.  hashm_spares[0] is always 0, so that buckets 0 and 1
+(which belong to the first and second phases of splitpoint group 0) always
+appear at block numbers 1 and 2, just after the meta page.  We always have
 hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
-former.  The difference between the two represents the number of overflow
-pages appearing between the bucket page groups of splitpoints N and N+1.
-
+former.  The difference between the two represents the number of overflow
+pages appearing between the bucket page groups of splitpoint phases N and
+N+1.
+
 (Note: the above describes what happens when filling an initially minimally
 sized hash index.  In practice, we try to estimate the required index size
-and allocate a suitable number of splitpoints immediately, to avoid
+and allocate a suitable number of splitpoint phases immediately, to avoid
 expensive re-splitting during initial index build.)
 
 When S splitpoints exist altogether, the array entries hashm_spares[0]
 through hashm_spares[S] are valid; hashm_spares[S] records the current
 total number of overflow pages.  New overflow pages are created as needed
 at the end of the index, and recorded by incrementing hashm_spares[S].
-When it is time to create a new splitpoint's worth of bucket pages, we
+When it is time to create a new splitpoint phase's worth of bucket pages, we
 copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
 stored in the hashm_ovflpoint field of the meta page).  This has the effect
 of reserving the correct number of bucket pages at the end of the
@@ -101,7 +116,7 @@ We have to allow the case "greater than" because it's possible that during
 an index extension we crash after allocating filesystem space and before
 updating the metapage.  Note that on filesystems that allow "holes" in
 files, it's entirely likely that pages before the logical EOF are not yet
-allocated: when we allocate a new splitpoint's worth of bucket pages, we
+allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
 physically zero the last such page to force the EOF up, and the first such
 page will be used immediately, but the intervening pages are not written
 until needed.
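To make the phased allocation scheme above concrete, here is a minimal
standalone sketch (not part of the patch): total_buckets() is a local
stand-in mirroring the arithmetic of _hash_get_totalbuckets(), which this
patch introduces below, and the printed table reproduces the
{1,1,1,1} {1,1,1,1} {2,2,2,2} {4,4,4,4} progression from the README.

    #include <stdio.h>

    #define PHASES_PER_GRP 4

    /* cumulative buckets allocated through splitpoint phase sp
     * (stand-in for _hash_get_totalbuckets) */
    static unsigned
    total_buckets(unsigned sp)
    {
        unsigned g = sp / PHASES_PER_GRP;
        unsigned before = (g == 0) ? 0 : (1U << (g + 1));
        unsigned per_phase = (g == 0) ? 1 : (1U << (g - 1));

        return before + ((sp % PHASES_PER_GRP) + 1) * per_phase;
    }

    int
    main(void)
    {
        unsigned sp;

        /* groups 0..3 end with cumulative totals 4, 8, 16, 32 */
        for (sp = 0; sp < 16; sp++)
            printf("phase %2u (group %u): %u buckets in total\n",
                   sp, sp / PHASES_PER_GRP, total_buckets(sp));
        return 0;
    }
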
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 34cc08f..6b162a7 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -548,7 +548,7 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	num_index_tuples = 0;
 
 	/*
-	 * We need a copy of the metapage so that we can use its hashm_spares[]
+	 * We need a copy of the metapage so that we can use its hashm_spares[][]
 	 * values to compute bucket page addresses, but a cached copy should be
 	 * good enough. (If not, we'll detect that further down and refresh the
 	 * cache as necessary.)
@@ -575,7 +575,7 @@ loop_top:
 		bool		split_cleanup = false;
 
 		/* Get address of bucket's start page */
-		bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
+		bucket_blkno = bucket_to_blkno(cachedmetap, cur_bucket);
 
 		blkno = bucket_blkno;
diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c
index de7522e..02f3328 100644
--- a/src/backend/access/hash/hash_xlog.c
+++ b/src/backend/access/hash/hash_xlog.c
@@ -263,7 +263,9 @@ hash_xlog_add_ovfl_page(XLogReaderState *record)
 
 		if (!xlrec->bmpage_found)
 		{
-			metap->hashm_spares[metap->hashm_ovflpoint]++;
+			uint32		split_grp = SP_GRP(metap->hashm_ovflpoint);
+			uint32		split_phase = SP_PHASE(metap->hashm_ovflpoint);
+			metap->hashm_spares[split_grp][split_phase]++;
 
 			if (new_bmpage)
 			{
@@ -271,7 +273,7 @@ hash_xlog_add_ovfl_page(XLogReaderState *record)
 				metap->hashm_mapp[metap->hashm_nmaps] = newmapblk;
 
 				metap->hashm_nmaps++;
-				metap->hashm_spares[metap->hashm_ovflpoint]++;
+				metap->hashm_spares[split_grp][split_phase]++;
 			}
 		}
 
@@ -388,7 +390,8 @@ hash_xlog_split_allocate_page(XLogReaderState *record)
 		ovflpages = (uint32 *) ((char *) data + sizeof(uint32));
 
 		/* update metapage */
-		metap->hashm_spares[ovflpoint] = *ovflpages;
+		metap->hashm_spares[SP_GRP(ovflpoint)][SP_PHASE(ovflpoint)] =
+			*ovflpages;
 		metap->hashm_ovflpoint = ovflpoint;
 	}
 
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index a3cae21..41cba14 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -41,7 +41,8 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
 
 	/* Determine the split number for this page (must be >= 1) */
 	for (i = 1;
-		 i < splitnum && ovflbitnum > metap->hashm_spares[i];
+		 i < splitnum &&
+		 ovflbitnum > metap->hashm_spares[SP_GRP(i)][SP_PHASE(i)];
 		 i++)
 		 /* loop */ ;
 
@@ -49,7 +50,7 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
	/*
	 * Convert to absolute page number by adding the number of bucket pages
	 * that exist before this split point.
	 */
-	return (BlockNumber) ((1 << i) + ovflbitnum);
+	return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
 }
 
 /*
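The hunk above swaps the old power-of-2 assumption (1 << i) for
_hash_get_totalbuckets(i).  The resulting mapping can be sketched in
isolation as follows; this is illustrative only, with spares[] as a toy
flat array of cumulative overflow-page counts per splitpoint phase
(standing in for the real two-dimensional metapage field) and
total_buckets() as the stand-in from the earlier sketch:

    /* stand-in for _hash_get_totalbuckets, as in the earlier sketch */
    static unsigned
    total_buckets(unsigned sp)
    {
        unsigned g = sp / 4;

        return (g ? (1U << (g + 1)) : 0) +
               ((sp % 4) + 1) * (g ? (1U << (g - 1)) : 1);
    }

    static unsigned
    toy_bitno_to_blkno(const unsigned *spares, unsigned splitnum,
                       unsigned ovflbitnum)
    {
        unsigned i;

        /* find the splitpoint phase whose overflow pages hold this bit */
        for (i = 1; i < splitnum && ovflbitnum > spares[i]; i++)
            ;

        /* absolute block = bucket pages before phase i + bit number */
        return total_buckets(i) + ovflbitnum;
    }
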
@@ -67,17 +68,18 @@ _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
 	/* Determine the split number containing this page */
 	for (i = 1; i <= splitnum; i++)
 	{
-		if (ovflblkno <= (BlockNumber) (1 << i))
+		if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
 			break;				/* oops */
-		bitnum = ovflblkno - (1 << i);
+		bitnum = ovflblkno - _hash_get_totalbuckets(i);
 
 		/*
 		 * bitnum has to be greater than number of overflow page added in
 		 * previous split point. The overflow page at this splitnum (i) if any
-		 * should start from ((2 ^ i) + metap->hashm_spares[i - 1] + 1).
+		 * should start from (_hash_get_totalbuckets(i) +
+		 * metap->hashm_spares[SP_GRP(i - 1)][SP_PHASE(i - 1)] + 1).
 		 */
-		if (bitnum > metap->hashm_spares[i - 1] &&
-			bitnum <= metap->hashm_spares[i])
+		if (bitnum > metap->hashm_spares[SP_GRP(i - 1)][SP_PHASE(i - 1)] &&
+			bitnum <= metap->hashm_spares[SP_GRP(i)][SP_PHASE(i)])
 			return bitnum - 1;	/* -1 to convert 1-based to 0-based */
 	}
 
@@ -120,6 +122,8 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
 	BlockNumber blkno;
 	uint32		orig_firstfree;
 	uint32		splitnum;
+	uint32		split_grp,
+				split_phase;
 	uint32	   *freep = NULL;
 	uint32		max_ovflpg;
 	uint32		bit;
@@ -201,7 +205,9 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
 
 	/* want to end search with the last existing overflow page */
 	splitnum = metap->hashm_ovflpoint;
-	max_ovflpg = metap->hashm_spares[splitnum] - 1;
+	split_grp = SP_GRP(splitnum);
+	split_phase = SP_PHASE(splitnum);
+	max_ovflpg = metap->hashm_spares[split_grp][split_phase] - 1;
 	last_page = max_ovflpg >> BMPG_SHIFT(metap);
 	last_bit = max_ovflpg & BMPG_MASK(metap);
 
@@ -273,7 +279,7 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
 	 * marked "in use".  Subsequent pages do not exist yet, but it is
 	 * convenient to pre-mark them as "in use" too.
 	 */
-	bit = metap->hashm_spares[splitnum];
+	bit = metap->hashm_spares[split_grp][split_phase];
 
 	/* metapage already has a write lock */
 	if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
@@ -294,7 +300,8 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
 
 	/* Calculate address of the new overflow page */
 	bit = BufferIsValid(newmapbuf) ?
-		metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
+		metap->hashm_spares[split_grp][split_phase] + 1 :
+		metap->hashm_spares[split_grp][split_phase];
 	blkno = bitno_to_blkno(metap, bit);
 
 	/*
@@ -329,7 +336,7 @@ found:
 	else
 	{
 		/* update the count to indicate new overflow page is added */
-		metap->hashm_spares[splitnum]++;
+		metap->hashm_spares[split_grp][split_phase]++;
 
 		if (BufferIsValid(newmapbuf))
 		{
@@ -339,7 +346,7 @@ found:
 			/* add the new bitmap page to the metapage's list of bitmaps */
 			metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
 			metap->hashm_nmaps++;
-			metap->hashm_spares[splitnum]++;
+			metap->hashm_spares[split_grp][split_phase]++;
 		}
 
 		MarkBufferDirty(metabuf);
 	}
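Throughout these hunks the flat hashm_spares[splitnum] access becomes a
two-step lookup.  A minimal sketch of the indexing convention (the
SP_GRP/SP_PHASE names match the patch; the array size and values here are
illustrative only):

    #include <assert.h>

    #define HASH_SPLITPOINT_PHASES 4
    #define SP_GRP(splitpoint)   ((splitpoint) / HASH_SPLITPOINT_PHASES)
    #define SP_PHASE(splitpoint) ((splitpoint) % HASH_SPLITPOINT_PHASES)

    int
    main(void)
    {
        unsigned spares[32][HASH_SPLITPOINT_PHASES] = {{0}};
        unsigned ovflpoint = 9;     /* flat phase 9 = group 2, phase 1 */

        /* the counter a flat spares[9] would have bumped... */
        spares[SP_GRP(ovflpoint)][SP_PHASE(ovflpoint)]++;

        /* ...is row 2, column 1 of the 2-D array */
        assert(spares[2][1] == 1);
        return 0;
    }
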
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 622cc4b..0d17e90 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -422,7 +422,7 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
 		/* Allow interrupts, in case N is huge */
 		CHECK_FOR_INTERRUPTS();
 
-		blkno = BUCKET_TO_BLKNO(metap, i);
+		blkno = bucket_to_blkno(metap, i);
 		buf = _hash_getnewbuf(rel, blkno, forkNum);
 		_hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
 		MarkBufferDirty(buf);
@@ -502,14 +502,15 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
 	Page		page;
 	double		dnumbuckets;
 	uint32		num_buckets;
-	uint32		log2_num_buckets;
+	uint32		spare_index;
 	uint32		i;
 
 	/*
 	 * Choose the number of initial bucket pages to match the fill factor
-	 * given the estimated number of tuples.  We round up the result to the
-	 * next power of 2, however, and always force at least 2 bucket pages. The
-	 * upper limit is determined by considerations explained in
+	 * given the estimated number of tuples.  We round up the result to the
+	 * total number of buckets which have to be allocated before using the
+	 * corresponding hashm_spares slot; however, we always force at least 2
+	 * bucket pages.  The upper limit is determined by considerations explained in
 	 * _hash_expandtable().
 	 */
 	dnumbuckets = num_tuples / ffactor;
@@ -518,11 +519,10 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
 	else if (dnumbuckets >= (double) 0x40000000)
 		num_buckets = 0x40000000;
 	else
-		num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+		num_buckets = _hash_get_totalbuckets(_hash_spareindex((uint32) dnumbuckets));
 
-	log2_num_buckets = _hash_log2(num_buckets);
-	Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
-	Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+	spare_index = _hash_spareindex(num_buckets);
+	Assert(spare_index < (HASH_MAX_SPLITPOINTS * HASH_SPLITPOINT_PHASES));
 
 	page = BufferGetPage(buf);
 	if (initpage)
@@ -563,18 +563,20 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
 
 	/*
 	 * We initialize the index with N buckets, 0 .. N-1, occupying physical
-	 * blocks 1 to N.  The first freespace bitmap page is in block N+1. Since
-	 * N is a power of 2, we can set the masks this way:
+	 * blocks 1 to N.  The first freespace bitmap page is in block N+1.
 	 */
-	metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
-	metap->hashm_highmask = (num_buckets << 1) - 1;
+	metap->hashm_maxbucket = num_buckets - 1;
+
+	/* set highmask, which should be sufficient to cover num_buckets */
+	metap->hashm_highmask = (1 << (_hash_log2(num_buckets))) - 1;
+	metap->hashm_lowmask = (metap->hashm_highmask >> 1);
 
 	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
 	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
 
 	/* Set up mapping for one spare page after the initial splitpoints */
-	metap->hashm_spares[log2_num_buckets] = 1;
-	metap->hashm_ovflpoint = log2_num_buckets;
+	metap->hashm_spares[SP_GRP(spare_index)][SP_PHASE(spare_index)] = 1;
+	metap->hashm_ovflpoint = spare_index;
 	metap->hashm_firstfree = 0;
 
 	/*
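Because num_buckets is no longer necessarily a power of 2, lowmask can no
longer simply be num_buckets - 1; both masks are now derived from the next
power of 2.  A standalone sketch of the same computation, with ceil_log2()
as a stand-in for _hash_log2() and arbitrary example sizes:

    #include <stdio.h>

    /* stand-in for _hash_log2: smallest i such that (1 << i) >= num */
    static unsigned
    ceil_log2(unsigned num)
    {
        unsigned i = 0;

        while ((1U << i) < num)
            i++;
        return i;
    }

    int
    main(void)
    {
        unsigned sizes[] = {2, 4, 6, 10};
        unsigned j;

        for (j = 0; j < 4; j++)
        {
            unsigned num_buckets = sizes[j];
            unsigned highmask = (1U << ceil_log2(num_buckets)) - 1;
            unsigned lowmask = highmask >> 1;

            /* num_buckets = 4 reproduces the updated hash.out values
             * above: maxbucket 3, highmask 3, lowmask 1 */
            printf("num_buckets %2u: maxbucket %2u highmask %2u lowmask %2u\n",
                   num_buckets, num_buckets - 1, highmask, lowmask);
        }
        return 0;
    }
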
@@ -655,7 +657,7 @@ restart_expand:
 	 * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
 	 * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
 	 * to half that because of overflow looping in _hash_log2() and
-	 * insufficient space in hashm_spares[].  It's moot anyway because an
+	 * insufficient space in hashm_spares[][].  It's moot anyway because an
 	 * index with 2^32 buckets would certainly overflow BlockNumber and hence
 	 * _hash_alloc_buckets() would fail, but if we supported buckets smaller
 	 * than a disk block then this would be an independent constraint.
@@ -682,7 +684,7 @@ restart_expand:
 
 	old_bucket = (new_bucket & metap->hashm_lowmask);
 
-	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
+	start_oblkno = bucket_to_blkno(metap, old_bucket);
 
 	buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
 	if (!buf_oblkno)
@@ -766,32 +768,49 @@ restart_expand:
 	 * There shouldn't be any active scan on new bucket.
 	 *
 	 * Note: it is safe to compute the new bucket's blkno here, even though we
-	 * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
-	 * the current value of hashm_spares[hashm_ovflpoint] correctly shows
-	 * where we are going to put a new splitpoint's worth of buckets.
+	 * may still need to update the bucket_to_blkno mapping.  This is because
+	 * the current value of
+	 * hashm_spares[SP_GRP(hashm_ovflpoint)][SP_PHASE(hashm_ovflpoint)]
+	 * correctly shows where we are going to put a new splitpoint phase's
+	 * worth of buckets.
 	 */
-	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+	start_nblkno = bucket_to_blkno(metap, new_bucket);
 
 	/*
-	 * If the split point is increasing (hashm_maxbucket's log base 2
-	 * increases), we need to allocate a new batch of bucket pages.
+	 * If the split point is increasing, we need to allocate a new batch of
+	 * bucket pages.
 	 */
-	spare_ndx = _hash_log2(new_bucket + 1);
+	spare_ndx = _hash_spareindex(new_bucket + 1);
 	if (spare_ndx > metap->hashm_ovflpoint)
 	{
+		uint32		buckets_toadd = 0;
+		uint32		splitpoint_group = 0;
+
 		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
 
 		/*
-		 * The number of buckets in the new splitpoint is equal to the total
-		 * number already in existence, i.e. new_bucket.  Currently this maps
-		 * one-to-one to blocks required, but someday we may need a more
-		 * complicated calculation here.  We treat allocation of buckets as a
-		 * separate WAL-logged action.  Even if we fail after this operation,
+		 * The number of buckets in the new splitpoint group is equal to the
+		 * total number already in existence, i.e. new_bucket, but we do not
+		 * allocate them all at once.  Each splitpoint group has 4 phases;
+		 * we distribute the buckets equally among them, so we allocate only
+		 * one fourth of the group's buckets at a time, consuming one phase
+		 * after another.  We treat allocation of buckets as a separate
+		 * WAL-logged action.  Even if we fail after this operation, we
 		 * won't leak bucket pages; rather, the next split will consume this
 		 * space. In any case, even without failure we don't use all the space
 		 * in one split operation.
 		 */
-		if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+		splitpoint_group = (spare_ndx >> 2);
+
+		/*
+		 * Each phase in the splitpoint group allocates
+		 * 2^(splitpoint_group - 1) buckets, since the group's buckets are
+		 * divided among 4 phases.  Group 0 is a special case where we
+		 * allocate 1 bucket per phase, as that cannot be reduced any
+		 * further.  See the README for more details.
+		 */
+		buckets_toadd = (splitpoint_group) ? (1 << (splitpoint_group - 1)) : 1;
+		if (!_hash_alloc_buckets(rel, start_nblkno, buckets_toadd))
 		{
 			/* can't split due to BlockNumber overflow */
 			_hash_relbuf(rel, buf_oblkno);
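Tabulating buckets_toadd for successive values of spare_ndx makes the
phase-sized allocation step visible; this is a sketch only, reusing the
patch's variable names in an illustrative loop:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned spare_ndx;

        /* phases 0..7 (groups 0 and 1) add 1 bucket each; phases 8..11
         * (group 2) add 2 each; phases 12..15 (group 3) add 4 each */
        for (spare_ndx = 0; spare_ndx < 16; spare_ndx++)
        {
            unsigned splitpoint_group = spare_ndx >> 2;
            unsigned buckets_toadd =
                splitpoint_group ? (1U << (splitpoint_group - 1)) : 1;

            printf("spare_ndx %2u: allocate %u bucket page(s)\n",
                   spare_ndx, buckets_toadd);
        }
        return 0;
    }
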
@@ -836,14 +855,15 @@ restart_expand:
 	}
 
 	/*
-	 * If the split point is increasing (hashm_maxbucket's log base 2
-	 * increases), we need to adjust the hashm_spares[] array and
-	 * hashm_ovflpoint so that future overflow pages will be created beyond
-	 * this new batch of bucket pages.
+	 * If the split point is increasing, we need to adjust the
+	 * hashm_spares[][] array and hashm_ovflpoint so that future overflow
+	 * pages will be created beyond this new batch of bucket pages.
 	 */
 	if (spare_ndx > metap->hashm_ovflpoint)
 	{
-		metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
+		metap->hashm_spares[SP_GRP(spare_ndx)][SP_PHASE(spare_ndx)] =
+			metap->hashm_spares[SP_GRP(metap->hashm_ovflpoint)]
+			[SP_PHASE(metap->hashm_ovflpoint)];
 		metap->hashm_ovflpoint = spare_ndx;
 		metap_update_splitpoint = true;
 	}
@@ -917,12 +937,15 @@ restart_expand:
 
 		if (metap_update_splitpoint)
 		{
+			uint32		splitpoint_grp = SP_GRP(metap->hashm_ovflpoint);
+			uint32		splitpoint_phase = SP_PHASE(metap->hashm_ovflpoint);
+
 			xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
 			XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
 								sizeof(uint32));
 			XLogRegisterBufData(2,
-								(char *) &metap->hashm_spares[metap->hashm_ovflpoint],
-								sizeof(uint32));
+								(char *) &metap->hashm_spares[splitpoint_grp][splitpoint_phase],
+								sizeof(uint32));
 		}
 
 		XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage);
@@ -1543,7 +1566,7 @@ _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
 									  metap->hashm_highmask,
 									  metap->hashm_lowmask);
 
-	blkno = BUCKET_TO_BLKNO(metap, bucket);
+	blkno = bucket_to_blkno(metap, bucket);
 
 	/* Fetch the primary bucket page for the bucket */
 	buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c
index 0e0f393..8aa8769 100644
--- a/src/backend/access/hash/hashsort.c
+++ b/src/backend/access/hash/hashsort.c
@@ -56,9 +56,8 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
 	 * num_buckets buckets in the index, the appropriate mask can be computed
 	 * as follows.
 	 *
-	 * Note: at present, the passed-in num_buckets is always a power of 2, so
-	 * we could just compute num_buckets - 1.  We prefer not to assume that
-	 * here, though.
+	 * Note: this hash_mask calculation should be kept in sync with the
+	 * similar calculation in _hash_init_metabuffer.
 	 */
 	hspool->hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;
 
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 2e99719..30d6a6a 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -25,6 +25,19 @@
 	old_bucket | (lowmask + 1)
 
 /*
+ * bucket_to_blkno -- given a bucket number, return its block number in the
+ *		index.
+ */
+BlockNumber
+bucket_to_blkno(HashMetaPage metap, Bucket B)
+{
+	uint32		prev_spare_idx = _hash_spareindex(B + 1) - 1;
+
+	return ((BlockNumber) ((B) +
+		((B) ? (metap)->hashm_spares[SP_GRP(prev_spare_idx)]
+		 [SP_PHASE(prev_spare_idx)] : 0)) + 1);
+}
+
+/*
  * _hash_checkqual -- does the index tuple satisfy the scan conditions?
  */
 bool
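bucket_to_blkno() generalizes the retired BUCKET_TO_BLKNO macro: a bucket
page's block number is the bucket number, plus the overflow pages recorded
in the spare slot just before the bucket's own splitpoint phase, plus 1 for
the metapage.  A hedged standalone sketch, with spare_index() and
ceil_log2() as stand-ins for _hash_spareindex() and _hash_log2(), and a toy
flat spares[] array in which one overflow page was added while the overflow
point sat at phase 3:

    #include <stdio.h>

    #define PHASES 4

    static unsigned
    ceil_log2(unsigned num)
    {
        unsigned i = 0;

        while ((1U << i) < num)
            i++;
        return i;
    }

    /* stand-in for _hash_spareindex: flat splitpoint phase of a bucket count */
    static unsigned
    spare_index(unsigned num_bucket)
    {
        unsigned g = (num_bucket <= 4) ? 0 : ceil_log2(num_bucket) - 2;
        unsigned in_grp = g ? (((num_bucket - 1) >> (g - 1)) & (PHASES - 1))
                            : (num_bucket - 1);

        return g * PHASES + in_grp;
    }

    int
    main(void)
    {
        /* cumulative overflow pages before each phase (monotone) */
        unsigned spares[16] = {0, 0, 0, 1, 1, 1, 1, 1,
                               1, 1, 1, 1, 1, 1, 1, 1};
        unsigned B;

        for (B = 0; B < 6; B++)
        {
            unsigned blkno = B + (B ? spares[spare_index(B + 1) - 1] : 0) + 1;

            printf("bucket %u -> block %u\n", B, blkno);
        }
        return 0;
    }

Here bucket 3 lands in block 4, block 5 holds the overflow page, and bucket
4 is pushed out to block 6.
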
@@ -149,6 +162,87 @@ _hash_log2(uint32 num)
 	return i;
 }
 
+#define SPLITPOINT_PHASES_PER_GRP	4
+#define SPLITPOINT_PHASE_MASK		(SPLITPOINT_PHASES_PER_GRP - 1)
+
+#define TOTAL_SPLITPOINT_PHASES_BEFORE_GROUP(sp_g) \
+	((sp_g) > 0 ? ((sp_g) << 2) : 0)
+
+/*
+ * This is just a trick to save a division operation.  In the bit
+ * representation of a 0-based bucket_num, the 2nd and 3rd most significant
+ * bits indicate which allocation phase within its group the bucket belongs
+ * to.  This is because every splitpoint group allocates 2^x buckets, and we
+ * have divided the allocation process into 4 equal phases.  This macro
+ * returns a value from 0 to 3.
+ */
+#define SPLITPOINT_PHASES_WITHIN_GROUP(sp_g, bucket_num) \
+	(((sp_g) > 0) ? (((bucket_num) >> ((sp_g) - 1)) & SPLITPOINT_PHASE_MASK) : \
+	 (bucket_num))
+
+/*
+ * After splitpoint group 0 we have 2^(0 + 2) = 4 buckets; after group 1 we
+ * have 2^(1 + 2) = 8 total buckets.  As the doubling continues, after
+ * splitpoint group "x" we have 2^(x + 2) total buckets, so the total before
+ * group x is 2^(x + 1).  Each phase of allocation within group x adds
+ * 2^(x - 1) buckets, since the group's 2^(x + 1) new buckets are divided
+ * among 4 phases.
+ */
+#define BUCKETS_BEFORE_SP_GRP(sp_g) \
+	((sp_g) == 0 ? 0 : (1 << ((sp_g) + 1)))
+#define BUCKETS_WITHIN_SP_GRP(sp_g, nphase) \
+	((nphase) * ((sp_g) == 0 ? 1 : (1 << ((sp_g) - 1))))
+
+/*
+ * _hash_spareindex -- return the spare index (i.e. the global splitpoint
+ *		phase) of the given bucket
+ */
+uint32
+_hash_spareindex(uint32 num_bucket)
+{
+	uint32		splitpoint_group;
+
+	/*
+	 * The first 4 buckets belong to splitpoint group 0.  Since group 0 has
+	 * 4 = 2^2 buckets, they are doubled in group 1, making the total after
+	 * group 1 8 = 2^3; group 2 then adds another 2^3 buckets, doubling the
+	 * total to 2^4, and so on.  The pattern is that for num_bucket > 4,
+	 * splitpoint group = _hash_log2(num_bucket) - 2.
+	 */
+	if (num_bucket <= 4)
+		splitpoint_group = 0;	/* buckets 1..4 all fall in group 0 */
+	else
+		splitpoint_group = _hash_log2(num_bucket) - 2;
+
+	return TOTAL_SPLITPOINT_PHASES_BEFORE_GROUP(splitpoint_group) +
+		SPLITPOINT_PHASES_WITHIN_GROUP(splitpoint_group,
+									   num_bucket - 1);	/* make it 0-based */
+}
+
+/*
+ * _hash_get_totalbuckets -- return the total number of buckets allocated up
+ *		to and including the given splitpoint phase
+ */
+uint32
+_hash_get_totalbuckets(uint32 splitpoint_phase)
+{
+	uint32		splitpoint_group;
+
+	/*
+	 * Every 4 consecutive phases make one group, and groups are numbered
+	 * from 0.
+	 */
+	splitpoint_group = (splitpoint_phase / SPLITPOINT_PHASES_PER_GRP);
+
+	/*
+	 * total_buckets = total number of buckets before this splitpoint group
+	 * + total buckets within this group up to the given splitpoint_phase.
+	 */
+	return BUCKETS_BEFORE_SP_GRP(splitpoint_group) +
+		BUCKETS_WITHIN_SP_GRP(splitpoint_group,
+							  (splitpoint_phase % SPLITPOINT_PHASES_PER_GRP) + 1);
+}
+
 /*
  * _hash_checkpage -- sanity checks on the format of all hash pages
  *
@@ -383,7 +477,7 @@ _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
 	metap = HashPageGetMeta(BufferGetPage(metabuf));
 
-	blkno = BUCKET_TO_BLKNO(metap, old_bucket);
+	blkno = bucket_to_blkno(metap, old_bucket);
 
 	_hash_relbuf(rel, metabuf);
 
@@ -413,7 +507,7 @@ _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
 	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
 													metap->hashm_lowmask,
 													metap->hashm_maxbucket);
-	blkno = BUCKET_TO_BLKNO(metap, new_bucket);
+	blkno = bucket_to_blkno(metap, new_bucket);
 
 	_hash_relbuf(rel, metabuf);
 
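A useful sanity check on the pair of functions above:
_hash_get_totalbuckets(p) is the cumulative bucket count at the end of
phase p, so feeding that count back into _hash_spareindex() should return
p, and every bucket count n must fit within the phase that
_hash_spareindex(n) reports.  The sketch below verifies both properties
using stand-in reimplementations (not the patch's code):

    #include <assert.h>

    #define PHASES 4

    static unsigned
    ceil_log2(unsigned num)         /* stand-in for _hash_log2 */
    {
        unsigned i = 0;

        while ((1U << i) < num)
            i++;
        return i;
    }

    static unsigned
    spare_index(unsigned num_bucket)    /* stand-in for _hash_spareindex */
    {
        unsigned g = (num_bucket <= 4) ? 0 : ceil_log2(num_bucket) - 2;
        unsigned in_grp = g ? (((num_bucket - 1) >> (g - 1)) & (PHASES - 1))
                            : (num_bucket - 1);

        return g * PHASES + in_grp;
    }

    static unsigned
    total_buckets(unsigned sp)      /* stand-in for _hash_get_totalbuckets */
    {
        unsigned g = sp / PHASES;

        return (g ? (1U << (g + 1)) : 0) +
               ((sp % PHASES) + 1) * (g ? (1U << (g - 1)) : 1);
    }

    int
    main(void)
    {
        unsigned n, sp;

        /* every bucket count n fits within the phase that contains it */
        for (n = 1; n <= 4096; n++)
            assert(total_buckets(spare_index(n)) >= n);

        /* the cumulative count at the end of phase sp maps back to sp */
        for (sp = 0; sp < 40; sp++)
            assert(spare_index(total_buckets(sp)) == sp);
        return 0;
    }
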
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index eb1df57..35c2807 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -35,9 +35,6 @@ typedef uint32 Bucket;
 
 #define InvalidBucket	((Bucket) 0xFFFFFFFF)
 
-#define BUCKET_TO_BLKNO(metap,B) \
-	((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
-
 /*
  * Special space for hash index pages.
  *
@@ -161,13 +158,14 @@ typedef HashScanOpaqueData *HashScanOpaque;
 #define HASH_VERSION	2		/* 2 signifies only hash key value is stored */
 
 /*
- * spares[] holds the number of overflow pages currently allocated at or
- * before a certain splitpoint. For example, if spares[3] = 7 then there are
- * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro).  The
- * value in spares[ovflpoint] increases as overflow pages are added at the
- * end of the index.  Once ovflpoint increases (ie, we have actually allocated
- * the bucket pages belonging to that splitpoint) the number of spares at the
- * prior splitpoint cannot change anymore.
+ * spares[][] holds the number of overflow pages currently allocated at or
+ * before a certain splitpoint phase.  For example, if spares[3][0] = 7 then
+ * there are 7 ovflpages before splitpoint phase 12.  The value in
+ * spares[ovflpoint / HASH_SPLITPOINT_PHASES][ovflpoint % HASH_SPLITPOINT_PHASES]
+ * increases as overflow pages are added at the end of the index.  Once
+ * ovflpoint increases (ie, we have actually allocated the bucket pages
+ * belonging to that splitpoint phase) the number of spares at prior
+ * splitpoint phases cannot change anymore.
 *
 * ovflpages that have been recycled for reuse can be found by looking at
 * bitmaps that are stored within ovflpages dedicated for the purpose.
@@ -181,6 +179,9 @@ typedef HashScanOpaqueData *HashScanOpaque;
 * limit us to 64 GB of overflow space...)
 */
 #define HASH_MAX_SPLITPOINTS		32
+#define HASH_SPLITPOINT_PHASES		4
+#define SP_GRP(splitpoint)			((splitpoint) / HASH_SPLITPOINT_PHASES)
+#define SP_PHASE(splitpoint)		((splitpoint) % HASH_SPLITPOINT_PHASES)
 #define HASH_MAX_BITMAPS			128
 
 typedef struct HashMetaPageData
@@ -201,8 +202,9 @@ typedef struct HashMetaPageData
 	uint32		hashm_firstfree;	/* lowest-number free ovflpage (bit#) */
 	uint32		hashm_nmaps;	/* number of bitmap pages */
 	RegProcedure hashm_procid;	/* hash procedure id from pg_proc */
-	uint32		hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before
-													 * each splitpoint */
+
+	/* spare pages before each splitpoint phase */
+	uint32		hashm_spares[HASH_MAX_SPLITPOINTS][HASH_SPLITPOINT_PHASES];
 	BlockNumber hashm_mapp[HASH_MAX_BITMAPS];	/* blknos of ovfl bitmaps */
 } HashMetaPageData;
 
@@ -283,7 +285,7 @@ typedef HashMetaPageData *HashMetaPage;
 
 /* public routines */
 
-
+extern BlockNumber bucket_to_blkno(HashMetaPage metap, Bucket B);
 extern IndexBuildResult *hashbuild(Relation heap, Relation index,
 		  struct IndexInfo *indexInfo);
 extern void hashbuildempty(Relation index);
@@ -382,6 +384,8 @@ extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
 extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
 					 uint32 highmask, uint32 lowmask);
 extern uint32 _hash_log2(uint32 num);
+extern uint32 _hash_spareindex(uint32 num_bucket);
+extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
 extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
 extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
 extern bool _hash_convert_tuple(Relation index,