diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index 09db5c6..56d4836 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -623,8 +623,8 @@ brin_page_cleanup(Relation idxrel, Buffer buf) */ if (PageIsNew(page)) { - LockRelationForExtension(idxrel, ShareLock); - UnlockRelationForExtension(idxrel, ShareLock); + LockRelationForExtension(idxrel, RELEXT_SHARED); + UnlockRelationForExtension(idxrel); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (PageIsNew(page)) @@ -716,7 +716,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, */ if (!RELATION_IS_LOCAL(irel)) { - LockRelationForExtension(irel, ExclusiveLock); + LockRelationForExtension(irel, RELEXT_EXCLUSIVE); extensionLockHeld = true; } buf = ReadBuffer(irel, P_NEW); @@ -768,7 +768,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, } if (extensionLockHeld) - UnlockRelationForExtension(irel, ExclusiveLock); + UnlockRelationForExtension(irel); ReleaseBuffer(buf); return InvalidBuffer; @@ -778,7 +778,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (extensionLockHeld) - UnlockRelationForExtension(irel, ExclusiveLock); + UnlockRelationForExtension(irel); page = BufferGetPage(buf); diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index 03e53ce..f84be0c 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -570,7 +570,7 @@ revmap_physical_extend(BrinRevmap *revmap) else { if (needLock) - LockRelationForExtension(irel, ExclusiveLock); + LockRelationForExtension(irel, RELEXT_EXCLUSIVE); buf = ReadBuffer(irel, P_NEW); if (BufferGetBlockNumber(buf) != mapBlk) @@ -582,7 +582,7 @@ revmap_physical_extend(BrinRevmap *revmap) * page from under whoever is using it. 
*/ if (needLock) - UnlockRelationForExtension(irel, ExclusiveLock); + UnlockRelationForExtension(irel); LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); return; @@ -591,7 +591,7 @@ revmap_physical_extend(BrinRevmap *revmap) page = BufferGetPage(buf); if (needLock) - UnlockRelationForExtension(irel, ExclusiveLock); + UnlockRelationForExtension(irel); } /* Check that it's a regular block (or an empty page) */ diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index d9c6483..af2679c 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -325,13 +325,13 @@ GinNewBuffer(Relation index) /* Must extend the file */ needLock = !RELATION_IS_LOCAL(index); if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); buffer = ReadBuffer(index, P_NEW); LockBuffer(buffer, GIN_EXCLUSIVE); if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); + UnlockRelationForExtension(index); return buffer; } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 394bc83..b383423 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -716,10 +716,10 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) needLock = !RELATION_IS_LOCAL(index); if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); npages = RelationGetNumberOfBlocks(index); if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); + UnlockRelationForExtension(index); totFreePages = 0; @@ -766,10 +766,10 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) stats->pages_free = totFreePages; if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); stats->num_pages = RelationGetNumberOfBlocks(index); if (needLock) - UnlockRelationForExtension(index, 
ExclusiveLock); + UnlockRelationForExtension(index); return stats; } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index d8d1c0a..d313f70 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -821,13 +821,13 @@ gistNewBuffer(Relation r) needLock = !RELATION_IS_LOCAL(r); if (needLock) - LockRelationForExtension(r, ExclusiveLock); + LockRelationForExtension(r, RELEXT_EXCLUSIVE); buffer = ReadBuffer(r, P_NEW); LockBuffer(buffer, GIST_EXCLUSIVE); if (needLock) - UnlockRelationForExtension(r, ExclusiveLock); + UnlockRelationForExtension(r); return buffer; } diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 77d9d12..ecef5c9 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -59,10 +59,10 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) /* try to find deleted pages */ if (needLock) - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); npages = RelationGetNumberOfBlocks(rel); if (needLock) - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel); totFreePages = 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) @@ -91,10 +91,10 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) /* return statistics */ stats->pages_free = totFreePages; if (needLock) - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); stats->num_pages = RelationGetNumberOfBlocks(rel); if (needLock) - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel); return stats; } diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 13e3bdc..0f81815 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -519,11 +519,11 @@ loop: if (needLock) { if (!use_fsm) - LockRelationForExtension(relation, 
ExclusiveLock); - else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock)) + LockRelationForExtension(relation, RELEXT_EXCLUSIVE); + else if (!ConditionalLockRelationForExtension(relation, RELEXT_EXCLUSIVE)) { /* Couldn't get the lock immediately; wait for it. */ - LockRelationForExtension(relation, ExclusiveLock); + LockRelationForExtension(relation, RELEXT_EXCLUSIVE); /* * Check if some other backend has extended a block for us while @@ -537,7 +537,7 @@ loop: */ if (targetBlock != InvalidBlockNumber) { - UnlockRelationForExtension(relation, ExclusiveLock); + UnlockRelationForExtension(relation); goto loop; } @@ -576,7 +576,7 @@ loop: * against vacuumlazy.c --- see comments therein. */ if (needLock) - UnlockRelationForExtension(relation, ExclusiveLock); + UnlockRelationForExtension(relation); /* * We need to initialize the empty new page. Double-check that it really diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 4c2a13a..0c57021 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -641,7 +641,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) * Note that another backend might have extended or created the relation * by the time we get the lock. 
*/ - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); /* Might have to re-open if a cache flush happened */ RelationOpenSmgr(rel); @@ -679,7 +679,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) /* Update local cache with the up-to-date size */ rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now; - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel); pfree(pg); } diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index c774349..f22202f 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -659,7 +659,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) needLock = !RELATION_IS_LOCAL(rel); if (needLock) - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); buf = ReadBuffer(rel, P_NEW); @@ -673,7 +673,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) * condition against btvacuumscan --- see comments therein. 
*/ if (needLock) - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel); /* Initialize the new page before returning it */ page = BufferGetPage(buf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 399e6a1..c110737 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1058,10 +1058,10 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, { /* Get the current relation length */ if (needLock) - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); num_pages = RelationGetNumberOfBlocks(rel); if (needLock) - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel); /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index bd5301f..e635fe3 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -230,13 +230,13 @@ SpGistNewBuffer(Relation index) /* Must extend the file */ needLock = !RELATION_IS_LOCAL(index); if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); buffer = ReadBuffer(index, P_NEW); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); + UnlockRelationForExtension(index); return buffer; } diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index d7d5e90..991db10 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -824,10 +824,10 @@ spgvacuumscan(spgBulkDeleteState *bds) { /* Get the current relation length */ if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); num_pages = RelationGetNumberOfBlocks(index); if (needLock) - UnlockRelationForExtension(index, 
ExclusiveLock); + UnlockRelationForExtension(index); /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) diff --git a/src/backend/commands/discard.c b/src/backend/commands/discard.c index f0dcd87..216c197 100644 --- a/src/backend/commands/discard.c +++ b/src/backend/commands/discard.c @@ -19,6 +19,7 @@ #include "commands/discard.h" #include "commands/prepare.h" #include "commands/sequence.h" +#include "storage/extension_lock.h" #include "utils/guc.h" #include "utils/portal.h" @@ -71,6 +72,7 @@ DiscardAll(bool isTopLevel) ResetAllOptions(); DropAllPreparedStatements(); Async_UnlistenAll(); + RelExtLockReleaseAll(); LockReleaseAll(USER_LOCKMETHOD, true); ResetPlanCache(); ResetTempTableNamespace(); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 6587db7..6880706 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -860,8 +860,8 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockRelationForExtension(onerel, ExclusiveLock); - UnlockRelationForExtension(onerel, ExclusiveLock); + LockRelationForExtension(onerel, RELEXT_EXCLUSIVE); + UnlockRelationForExtension(onerel); LockBufferForCleanup(buf); if (PageIsNew(page)) { diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 5c256ff..5beba70 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3628,6 +3628,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_SYNC_REP: event_name = "SyncRep"; break; + case WAIT_EVENT_RELATION_EXTENSION: + event_name = "RelationExtension"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 4648473..010f7ca 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -624,7 +624,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) * Note that another backend might have extended or created the relation * by the time we get the lock. */ - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); /* Might have to re-open if a cache flush happened */ RelationOpenSmgr(rel); @@ -652,7 +652,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) /* Update local cache with the up-to-date size */ rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now; - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel); pfree(pg); } diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile index e1b787e..2334a40 100644 --- a/src/backend/storage/lmgr/Makefile +++ b/src/backend/storage/lmgr/Makefile @@ -13,7 +13,7 @@ top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o lwlocknames.o spin.o \ - s_lock.o predicate.o condition_variable.o + s_lock.o predicate.o condition_variable.o extension_lock.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README index 56b0a12..5e0a394 100644 --- a/src/backend/storage/lmgr/README +++ b/src/backend/storage/lmgr/README @@ -29,6 +29,12 @@ process has to wait for an LWLock, it blocks on a SysV semaphore so as to not consume CPU time. Waiting processes will be granted the lock in arrival order. There is no timeout. +* Relation extension locks. The relation extension lock manager is +specialized in relation extension locks. In PostgreSQL 10, the relation +extension lock was moved out of the regular lock manager. It is similar +to regular locks but has no full deadlock detection or group locking. On +conflict, a relation extension lock waits using condition variables. + * Regular locks (a/k/a heavyweight locks). The regular lock manager supports a variety of lock modes with table-driven semantics, and it has full deadlock detection and automatic release at transaction end. @@ -40,9 +46,9 @@ Acquisition of either a spinlock or a lightweight lock causes query cancel and die() interrupts to be held off until all such locks are released. No such restriction exists for regular locks, however. Also note that we can accept query cancel and die() interrupts while waiting -for a regular lock, but we will not accept them while waiting for -spinlocks or LW locks. It is therefore not a good idea to use LW locks -when the wait time might exceed a few seconds. +for a relation extension lock or a regular lock, but we will not accept +them while waiting for spinlocks or LW locks. It is therefore not a good +idea to use LW locks when the wait time might exceed a few seconds. The rest of this README file discusses the regular lock manager in detail. 
diff --git a/src/backend/storage/lmgr/extension_lock.c b/src/backend/storage/lmgr/extension_lock.c new file mode 100644 index 0000000..5b50890 --- /dev/null +++ b/src/backend/storage/lmgr/extension_lock.c @@ -0,0 +1,503 @@ +/*------------------------------------------------------------------------- + * + * extension_lock.c + * Relation extension lock manager + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/lmgr/extension_lock.c + * + * NOTES: + * + * This lock manager is specialized for relation extension locks; it is + * a lightweight and interruptible lock manager. It is similar to the + * heavy-weight lock manager but has no deadlock detection mechanism or + * group locking mechanism. + * + * For lock acquisition we use an atomic compare-and-exchange on the + * state variable. When a process tries to acquire a lock that conflicts + * with an existing lock, it is put to sleep using condition variables + * unless this is a conditional lock attempt. To release the lock, we use + * an atomic decrement, but we don't remove the RELEXTLOCK entry from + * the hash table. All unused entries are reclaimed during acquisition + * once the hash table becomes full. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/extension_lock.h" +#include "utils/rel.h" + +/* + * Compute the hash code associated with a RELEXTLOCK. + * + * To avoid unnecessary recomputations of the hash code, we try to do this + * just once per function, and then pass it around as needed. Aside from + * passing the hashcode to hash_search_with_hash_value(), we can extract + * the lock partition number from the hashcode. 
+ */ +#define RelExtLockTargetTagHashCode(relextlocktargettag) \ + get_hash_value(RelExtLockHash, (const void *) relextlocktargettag) + +/* + * The lockmgr's shared hash tables are partitioned to reduce contention. + * To determine which partition a given relid belongs to, compute the tag's + * hash code with ExtLockTagHashCode(), then apply one of these macros. + * NB: NUM_RELEXTLOCK_PARTITIONS must be a power of 2! + */ +#define RelExtLockHashPartition(hashcode) \ + ((hashcode) % NUM_RELEXTLOCK_PARTITIONS) +#define RelExtLockHashPartitionLock(hashcode) \ + (&MainLWLockArray[RELEXTLOCK_MANAGER_LWLOCK_OFFSET + \ + LockHashPartition(hashcode)].lock) +#define RelExtLockHashPartitionLockByIndex(i) \ + (&MainLWLockArray[RELEXTLOCK_MANAGER_LWLOCK_OFFSET + (i)].lock) + +#define RELEXT_VAL_EXCLUSIVE ((uint32) 1 << 24) +#define RELEXT_VAL_SHARED 1 + +#define RELEXT_LOCK_MASK ((uint32) ((1 << 25) - 1)) + +typedef struct RELEXTLOCK +{ + /* hash key -- must be first */ + Oid relid; + + /* state of exclusive/non-exclusive lock */ + pg_atomic_uint32 state; + pg_atomic_uint32 pin_counts; + + ConditionVariable cv; +} RELEXTLOCK; + +/* + * This structure holds information per-object relation extension + * lock. held_extlocks represents the RelExtLocks we're holding. + * We use this structure to keep track of locked relation extension locks + * for release during error recovery. At most one lock can be held at + * once. Note that sometimes we could try to acquire a lock for the + * additional forks while holding the lock for the main fork; for example, + * adding extra relation blocks for both relation and its free space map. + * But since this lock manager doesn't distinguish between the forks, + * we just increment nLocks in the case. 
+ */ +typedef struct relextlock_handle +{ + RELEXTLOCK *lock; + RelExtLockMode mode; /* lock mode for this table entry */ + int nLocks; +} relextlock_handle; + +static relextlock_handle held_relextlock; +static int num_held_relextlocks = 0; + +static bool RelExtLockAcquire(Oid relid, RelExtLockMode lockmode, bool conditional); +static void RelExtLockRelease(Oid rleid); +static bool RelExtLockAttemptLock(RELEXTLOCK *extlock, RelExtLockMode lockmode); +static bool RelExtLockShrinkLocks(void); + +/* + * Pointers to hash tables containing relation extension lock state + * + * The RelExtLockHash hash table is in shared memory + */ +static HTAB *RelExtLockHash; + +/* + * InitRelExtLock + * Initialize the relation extension lock manager's data structures. + */ +void +InitRelExtLock(long max_table_size) +{ + HASHCTL info; + long init_table_size; + + /* + * Compute init/max size to request for lock hashtables. Note these + * calculations must agree with LockShmemSize! + */ + init_table_size = max_table_size / 2; + + /* + * Allocate hash table for RELEXTLOCK structs. This stores per-relation + * lock. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(Oid); + info.entrysize = sizeof(RELEXTLOCK); + info.num_partitions = NUM_RELEXTLOCK_PARTITIONS; + + RelExtLockHash = ShmemInitHash("RELEXTLOCK Hash", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | HASH_PARTITION); +} + +/* + * LockRelationForExtension + * + * This lock is used to interlock addition of pages to relations. + * We need such locking because bufmgr/smgr definition of P_NEW is not + * race-condition-proof. + * + * We assume the caller is already holding some type of regular lock on + * the relation, so no AcceptInvalidationMessages call is needed here. 
+ */ +void +LockRelationForExtension(Relation relation, RelExtLockMode lockmode) +{ + RelExtLockAcquire(relation->rd_id, lockmode, false); +} + +/* + * ConditionalLockRelationForExtension + * + * As above, but only lock if we can get the lock without blocking. + * Returns TRUE iff the lock was acquired. + */ +bool +ConditionalLockRelationForExtension(Relation relation, RelExtLockMode lockmode) +{ + return RelExtLockAcquire(relation->rd_id, lockmode, true); +} + +/* + * RelationExtensionLockWaiterCount + * + * Count the number of processes waiting for the given relation extension lock. + */ +int +RelationExtensionLockWaiterCount(Relation relation) +{ + LWLock *partitionLock; + RELEXTLOCK *extlock; + Oid relid; + uint32 hashcode; + uint32 pin_counts; + bool found; + + relid = RelationGetRelid(relation); + + hashcode = RelExtLockTargetTagHashCode(&relid); + partitionLock = RelExtLockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_SHARED); + + extlock = (RELEXTLOCK *) hash_search_with_hash_value(RelExtLockHash, + (void *) &relid, + hashcode, + HASH_FIND, &found); + + LWLockRelease(partitionLock); + + /* We assume that we already acquire this lock */ + Assert(found); + + pin_counts = pg_atomic_read_u32(&(extlock->pin_counts)); + + /* Except for me */ + return pin_counts - 1; +} + +/* + * UnlockRelationForExtension + */ +void +UnlockRelationForExtension(Relation relation) +{ + RelExtLockRelease(relation->rd_id); +} + +/* + * RelationExtensionLockReleaseAll - release all currently-held relation extension locks + */ +void +RelExtLockReleaseAll(void) +{ + if (num_held_relextlocks > 0) + { + HOLD_INTERRUPTS(); + RelExtLockRelease(held_relextlock.lock->relid); + } +} + +/* + * Return the number of holding relation extension locks. + */ +int +RelExtLockHoldingLockCount(void) +{ + return num_held_relextlocks; +} + +/* + * Acquire relation extension lock and create RELEXTLOCK hash entry on shared + * hash table. 
If we're trying to acquire the same lock as the one already held, + * we just increment nLocks locally and return without touching the hash table. + */ +static bool +RelExtLockAcquire(Oid relid, RelExtLockMode lockmode, bool conditional) +{ + RELEXTLOCK *extlock = NULL; + LWLock *partitionLock; + uint32 hashcode; + bool found; + bool mustwait; + + hashcode = RelExtLockTargetTagHashCode(&relid); + partitionLock = RelExtLockHashPartitionLock(hashcode); + + /* + * If we already hold the lock, we can just increase the count locally. + * Since we don't support deadlock detection for relation extension + * locks and don't control the order of lock acquisition, it must not + * happen that we try to take a new lock while already holding another. + */ + if (num_held_relextlocks > 0) + { + if (relid == held_relextlock.lock->relid && + lockmode == held_relextlock.mode) + { + held_relextlock.nLocks++; + return true; + } + else + Assert(false); /* cannot happen */ + } + + for (;;) + { + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + if (!extlock) + extlock = (RELEXTLOCK *) hash_search_with_hash_value(RelExtLockHash, + (void * ) &relid, + hashcode, HASH_ENTER_NULL, + &found); + + /* + * Failed to create new hash entry. Try to shrink the hash table and + * retry. 
+ */ + if (!extlock) + { + bool successed; + LWLockRelease(partitionLock); + successed = RelExtLockShrinkLocks(); + + if (!successed) + ereport(ERROR, + (errmsg("out of shared memory"), + errhint("You might need to increase max_locks_per_transaction."))); + + continue; + } + + /* Not found, initialize */ + if (!found) + { + extlock->relid = relid; + pg_atomic_init_u32(&(extlock->state), 0); + pg_atomic_init_u32(&(extlock->pin_counts), 0); + ConditionVariableInit(&(extlock->cv)); + } + + /* Increment pin count */ + pg_atomic_add_fetch_u32(&(extlock->pin_counts), 1); + + mustwait = RelExtLockAttemptLock(extlock, lockmode); + + if (!mustwait) + break; /* got the lock */ + + /* Could not get the lock; return only when doing conditional locking */ + if (mustwait && conditional) + { + pg_atomic_sub_fetch_u32(&(extlock->pin_counts), 1); + LWLockRelease(partitionLock); + return false; + } + + /* Release the partition lock before sleep */ + LWLockRelease(partitionLock); + + /* Sleep until the lock is released */ + ConditionVariableSleep(&(extlock->cv), WAIT_EVENT_RELATION_EXTENSION); + } + + LWLockRelease(partitionLock); + ConditionVariableCancelSleep(); + + Assert(!mustwait); + + /* Remember lock held by this backend */ + held_relextlock.lock = extlock; + held_relextlock.mode = lockmode; + held_relextlock.nLocks = 1; + num_held_relextlocks++; + + /* Always return true if not conditional lock */ + return true; +} + +/* + * RelExtLockRelease + * + * Release a previously acquired relation extension lock. We don't remove + * the hash entry at this time. Once the hash table becomes full, all + * unpinned hash entries will be removed. 
+ */ +static void +RelExtLockRelease(Oid relid) +{ + RELEXTLOCK *extlock; + RelExtLockMode mode; + LWLock *partitionLock; + uint32 hashcode; + uint32 pin_counts; + + /* We should have acquired a lock before releasing */ + Assert(num_held_relextlocks > 0); + + if (relid != held_relextlock.lock->relid) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("relation extension lock for %u is not held", + relid))); + + /* Decrease the lock count locally */ + held_relextlock.nLocks--; + + /* If we are still holding the lock, we're done */ + if (held_relextlock.nLocks > 0) + return; + + hashcode = RelExtLockTargetTagHashCode(&relid); + partitionLock = RelExtLockHashPartitionLock(hashcode); + + /* Keep holding the partition lock until unlocking is done */ + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + extlock = held_relextlock.lock; + mode = held_relextlock.mode; + + if (mode == RELEXT_EXCLUSIVE) + pg_atomic_sub_fetch_u32(&(extlock->state), RELEXT_VAL_EXCLUSIVE); + else + pg_atomic_sub_fetch_u32(&(extlock->state), RELEXT_VAL_SHARED); + + num_held_relextlocks--; + + /* Decrement pin counter */ + pin_counts = pg_atomic_sub_fetch_u32(&(extlock->pin_counts), 1); + + LWLockRelease(partitionLock); + + /* Wake up waiters if there are someone looking at this lock */ + if (pin_counts > 0) + ConditionVariableBroadcast(&(extlock->cv)); +} + +/* + * Internal function that attempts to atomically acquire the relation + * extension lock in the passed in mode. + * + * Returns true if the lock isn't free and we need to wait. 
+ */ +static bool +RelExtLockAttemptLock(RELEXTLOCK *extlock, RelExtLockMode lockmode) +{ + uint32 oldstate; + + oldstate = pg_atomic_read_u32(&extlock->state); + + while (true) + { + uint32 desired_state; + bool lock_free; + + desired_state = oldstate; + + if (lockmode == RELEXT_EXCLUSIVE) + { + lock_free = (oldstate & RELEXT_LOCK_MASK) == 0; + if (lock_free) + desired_state += RELEXT_VAL_EXCLUSIVE; + } + else + { + lock_free = (oldstate & RELEXT_VAL_EXCLUSIVE) == 0; + if (lock_free) + desired_state += RELEXT_VAL_SHARED; + } + + if (pg_atomic_compare_exchange_u32(&extlock->state, + &oldstate, desired_state)) + { + if (lock_free) + return false; + else + return true; + } + } + pg_unreachable(); +} + +/* + * Reclaim all un-pinned RELEXTLOCK entries from the hash table. + */ +static bool +RelExtLockShrinkLocks(void) +{ + HASH_SEQ_STATUS hstat; + RELEXTLOCK *extlock; + List *entries_to_remove = NIL; + ListCell *cell; + int i; + + /* + * To ensure consistency, take all partition locks in exclusive + * mode. 
+ */ + for (i = 0; i < NUM_RELEXTLOCK_PARTITIONS; i++) + LWLockAcquire(RelExtLockHashPartitionLockByIndex(i), LW_EXCLUSIVE); + + /* Collect all un-pinned RELEXTLOCK entries */ + hash_seq_init(&hstat, RelExtLockHash); + while ((extlock = (RELEXTLOCK *) hash_seq_search(&hstat)) != NULL) + { + uint32 pin_count = pg_atomic_read_u32(&(extlock->pin_counts)); + + if (pin_count == 0) + entries_to_remove = lappend(entries_to_remove, extlock); + } + + /* We could not find any entries that we can remove right now */ + if (list_length(entries_to_remove) == 0) + return false; + + /* Remove collected entries from RelExtLockHash has table */ + foreach (cell, entries_to_remove) + { + RELEXTLOCK *el = (RELEXTLOCK *) lfirst(cell); + uint32 hc = RelExtLockTargetTagHashCode(&(el->relid)); + + hash_search_with_hash_value(RelExtLockHash, (void *) &(el->relid), + hc, HASH_REMOVE, NULL); + } + + /* Release all partition locks */ + for (i = 0; i < NUM_RELEXTLOCK_PARTITIONS; i++) + LWLockRelease(RelExtLockHashPartitionLockByIndex(i)); + + return true; +} diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index da5679b..4fbc0c4 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -319,78 +319,6 @@ UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode) } /* - * LockRelationForExtension - * - * This lock tag is used to interlock addition of pages to relations. - * We need such locking because bufmgr/smgr definition of P_NEW is not - * race-condition-proof. - * - * We assume the caller is already holding some type of regular lock on - * the relation, so no AcceptInvalidationMessages call is needed here. 
- */ -void -LockRelationForExtension(Relation relation, LOCKMODE lockmode) -{ - LOCKTAG tag; - - SET_LOCKTAG_RELATION_EXTEND(tag, - relation->rd_lockInfo.lockRelId.dbId, - relation->rd_lockInfo.lockRelId.relId); - - (void) LockAcquire(&tag, lockmode, false, false); -} - -/* - * ConditionalLockRelationForExtension - * - * As above, but only lock if we can get the lock without blocking. - * Returns true iff the lock was acquired. - */ -bool -ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode) -{ - LOCKTAG tag; - - SET_LOCKTAG_RELATION_EXTEND(tag, - relation->rd_lockInfo.lockRelId.dbId, - relation->rd_lockInfo.lockRelId.relId); - - return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); -} - -/* - * RelationExtensionLockWaiterCount - * - * Count the number of processes waiting for the given relation extension lock. - */ -int -RelationExtensionLockWaiterCount(Relation relation) -{ - LOCKTAG tag; - - SET_LOCKTAG_RELATION_EXTEND(tag, - relation->rd_lockInfo.lockRelId.dbId, - relation->rd_lockInfo.lockRelId.relId); - - return LockWaiterCount(&tag); -} - -/* - * UnlockRelationForExtension - */ -void -UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) -{ - LOCKTAG tag; - - SET_LOCKTAG_RELATION_EXTEND(tag, - relation->rd_lockInfo.lockRelId.dbId, - relation->rd_lockInfo.lockRelId.relId); - - LockRelease(&tag, lockmode, false); -} - -/* * LockPage * * Obtain a page-level lock. 
This is currently used by some index access @@ -961,12 +889,6 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field2, tag->locktag_field1); break; - case LOCKTAG_RELATION_EXTEND: - appendStringInfo(buf, - _("extension of relation %u of database %u"), - tag->locktag_field2, - tag->locktag_field1); - break; case LOCKTAG_PAGE: appendStringInfo(buf, _("page %u of relation %u of database %u"), diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 5833086..ad6f057 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -45,6 +45,7 @@ #include "storage/sinvaladt.h" #include "storage/spin.h" #include "storage/standby.h" +#include "storage/lmgr.h" #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/resowner_private.h" @@ -388,6 +389,9 @@ InitLocks(void) max_table_size = NLOCKENTS(); init_table_size = max_table_size / 2; + /* Initialize lock structure for relation extension lock */ + InitRelExtLock(max_table_size); + /* * Allocate hash table for LOCK structs. This stores per-locked-object * information. @@ -717,6 +721,15 @@ LockAcquireExtended(const LOCKTAG *locktag, int status; bool log_lock = false; + /* + * We allow to take a relation extension lock after took a + * heavy-weight lock. However, since we don't have dead lock + * detection mechanism between heavy-weight lock and relation + * extension lock it's not allowed taking an another heavy-weight + * lock while holding a relation extension lock. 
+ */ + Assert(RelExtLockHoldingLockCount() == 0); + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); lockMethodTable = LockMethods[lockmethodid]; @@ -3366,6 +3379,7 @@ LockShmemSize(void) /* lock hash table */ max_table_size = NLOCKENTS(); size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK))); + size = add_size(size, hash_estimate_size(max_table_size, sizeof(LWLock))); /* proclock hash table */ max_table_size *= 2; diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index e5c3e86..b12aba0 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -451,6 +451,13 @@ InitializeLWLocks(void) for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); + /* Initialize relation extension lmgr's LWLocks in main array */ + lock = MainLWLockArray + NUM_INDIVIDUAL_LWLOCKS + + NUM_BUFFER_PARTITIONS + NUM_LOCK_PARTITIONS + + NUM_PREDICATELOCK_PARTITIONS; + for (id = 0; id < NUM_RELEXTLOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_RELEXT_LOCK_MANAGER); + /* Initialize named tranches. 
*/ if (NamedLWLockTrancheRequests > 0) { @@ -508,6 +515,7 @@ RegisterLWLockTranches(void) LWLockRegisterTranche(LWTRANCHE_LOCK_MANAGER, "lock_manager"); LWLockRegisterTranche(LWTRANCHE_PREDICATE_LOCK_MANAGER, "predicate_lock_manager"); + LWLockRegisterTranche(LWTRANCHE_RELEXT_LOCK_MANAGER, "relext_lock_manager"); LWLockRegisterTranche(LWTRANCHE_PARALLEL_QUERY_DSA, "parallel_query_dsa"); LWLockRegisterTranche(LWTRANCHE_SESSION_DSA, diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 5f6727d..f698e9c 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -765,6 +765,8 @@ ProcReleaseLocks(bool isCommit) return; /* If waiting, get off wait queue (should only be needed after error) */ LockErrorCleanup(); + /* Release relation extension locks */ + RelExtLockReleaseAll(); /* Release standard locks, including session-level if aborting */ LockReleaseAll(DEFAULT_LOCKMETHOD, !isCommit); /* Release transaction-level advisory locks */ diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index 9e0a8ab..6d8916c 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -25,7 +25,6 @@ /* This must match enum LockTagType! 
*/ const char *const LockTagTypeNames[] = { "relation", - "extend", "page", "tuple", "transactionid", @@ -234,7 +233,6 @@ pg_lock_status(PG_FUNCTION_ARGS) switch ((LockTagType) instance->locktag.locktag_type) { case LOCKTAG_RELATION: - case LOCKTAG_RELATION_EXTEND: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); nulls[3] = true; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 20f1d27..c004844 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -1153,6 +1153,7 @@ ShutdownPostgres(int code, Datum arg) * User locks are not released by transaction end, so be sure to release * them explicitly. */ + RelExtLockReleaseAll(); LockReleaseAll(USER_LOCKMETHOD, true); } diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 089b7c3..958822f 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -816,7 +816,8 @@ typedef enum WAIT_EVENT_REPLICATION_ORIGIN_DROP, WAIT_EVENT_REPLICATION_SLOT_DROP, WAIT_EVENT_SAFE_SNAPSHOT, - WAIT_EVENT_SYNC_REP + WAIT_EVENT_SYNC_REP, + WAIT_EVENT_RELATION_EXTENSION } WaitEventIPC; /* ---------- diff --git a/src/include/storage/extension_lock.h b/src/include/storage/extension_lock.h new file mode 100644 index 0000000..daa6416 --- /dev/null +++ b/src/include/storage/extension_lock.h @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * extension_lock.h + * Relation extension lock manager + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/extension_lock.h + * + *------------------------------------------------------------------------- + */ + +#ifndef EXTENSION_LOCK_H +#define EXTENSION_LOCK_H + +#ifdef FRONTEND +#error "extension_lock.h may not be included from frontend code" +#endif + +#include 
"storage/proclist_types.h" +#include "storage/s_lock.h" +#include "storage/condition_variable.h" +#include "port/atomics.h" + +typedef enum RelExtLockMode +{ + RELEXT_EXCLUSIVE, + RELEXT_SHARED +} RelExtLockMode; + +/* Lock a relation for extension */ +extern void InitRelExtLock(long max_table_size); +extern void LockRelationForExtension(Relation relation, RelExtLockMode lockmode); +extern void UnlockRelationForExtension(Relation relation); +extern bool ConditionalLockRelationForExtension(Relation relation, RelExtLockMode lockmode); +extern int RelationExtensionLockWaiterCount(Relation relation); +extern void RelExtLockReleaseAll(void); +extern int RelExtLockHoldingLockCount(void); + +#endif /* EXTENSION_LOCK_H */ diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 0b92322..6b357aa 100644 --- a/src/include/storage/lmgr.h +++ b/src/include/storage/lmgr.h @@ -15,6 +15,7 @@ #define LMGR_H #include "lib/stringinfo.h" +#include "storage/extension_lock.h" #include "storage/itemptr.h" #include "storage/lock.h" #include "utils/rel.h" @@ -50,13 +51,6 @@ extern bool LockHasWaitersRelation(Relation relation, LOCKMODE lockmode); extern void LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode); extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode); -/* Lock a relation for extension */ -extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode); -extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode); -extern bool ConditionalLockRelationForExtension(Relation relation, - LOCKMODE lockmode); -extern int RelationExtensionLockWaiterCount(Relation relation); - /* Lock a page (currently only used within indexes) */ extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index 765431e..3be18ea 100644 --- 
a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -138,8 +138,6 @@ typedef uint16 LOCKMETHODID; typedef enum LockTagType { LOCKTAG_RELATION, /* whole relation */ - /* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */ - LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ /* same ID info as RELATION */ LOCKTAG_PAGE, /* one page of a relation */ /* ID info for a page is RELATION info + BlockNumber */ @@ -198,14 +196,6 @@ typedef struct LOCKTAG (locktag).locktag_type = LOCKTAG_RELATION, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) -#define SET_LOCKTAG_RELATION_EXTEND(locktag,dboid,reloid) \ - ((locktag).locktag_field1 = (dboid), \ - (locktag).locktag_field2 = (reloid), \ - (locktag).locktag_field3 = 0, \ - (locktag).locktag_field4 = 0, \ - (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ - (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) - #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ ((locktag).locktag_field1 = (dboid), \ (locktag).locktag_field2 = (reloid), \ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 596fdad..b138aad 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -120,14 +120,21 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; #define LOG2_NUM_PREDICATELOCK_PARTITIONS 4 #define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS) +/* Number of partitions the shared relation extension lock tables are divided into */ +#define LOG2_NUM_RELEXTLOCK_PARTITIONS 4 +#define NUM_RELEXTLOCK_PARTITIONS (1 << LOG2_NUM_RELEXTLOCK_PARTITIONS) + /* Offsets for various chunks of preallocated lwlocks. 
*/ #define BUFFER_MAPPING_LWLOCK_OFFSET NUM_INDIVIDUAL_LWLOCKS #define LOCK_MANAGER_LWLOCK_OFFSET \ (BUFFER_MAPPING_LWLOCK_OFFSET + NUM_BUFFER_PARTITIONS) #define PREDICATELOCK_MANAGER_LWLOCK_OFFSET \ (LOCK_MANAGER_LWLOCK_OFFSET + NUM_LOCK_PARTITIONS) -#define NUM_FIXED_LWLOCKS \ +#define RELEXTLOCK_MANAGER_LWLOCK_OFFSET \ (PREDICATELOCK_MANAGER_LWLOCK_OFFSET + NUM_PREDICATELOCK_PARTITIONS) +#define NUM_FIXED_LWLOCKS \ + (PREDICATELOCK_MANAGER_LWLOCK_OFFSET + NUM_PREDICATELOCK_PARTITIONS + \ + NUM_RELEXTLOCK_PARTITIONS) typedef enum LWLockMode { @@ -151,6 +158,8 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val); extern void LWLockReleaseAll(void); extern bool LWLockHeldByMe(LWLock *lock); extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode); +extern bool LWLockCheckForCleanup(LWLock *lock); +extern int LWLockWaiterCount(LWLock *lock); extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval); extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value); @@ -211,6 +220,7 @@ typedef enum BuiltinTrancheIds LWTRANCHE_BUFFER_MAPPING, LWTRANCHE_LOCK_MANAGER, LWTRANCHE_PREDICATE_LOCK_MANAGER, + LWTRANCHE_RELEXT_LOCK_MANAGER, LWTRANCHE_PARALLEL_QUERY_DSA, LWTRANCHE_SESSION_DSA, LWTRANCHE_SESSION_RECORD_TABLE,