From 2512c6f3d21720e6b67c6251aae99d04d5d80f40 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Tue, 19 May 2020 15:16:02 +0900 Subject: [PATCH v23 3/7] Support atomic commit among multiple foreign servers. Co-authored-by: Masahiko Sawada, Ashutosh Bapat --- src/backend/access/Makefile | 2 +- src/backend/access/fdwxact/Makefile | 17 + src/backend/access/fdwxact/README | 109 + src/backend/access/fdwxact/fdwxact.c | 2754 +++++++++++++++++ src/backend/access/fdwxact/launcher.c | 558 ++++ src/backend/access/fdwxact/resolver.c | 443 +++ src/backend/access/rmgrdesc/Makefile | 1 + src/backend/access/rmgrdesc/fdwxactdesc.c | 58 + src/backend/access/rmgrdesc/xlogdesc.c | 6 +- src/backend/access/transam/rmgr.c | 1 + src/backend/access/transam/twophase.c | 66 + src/backend/access/transam/xact.c | 28 +- src/backend/access/transam/xlog.c | 34 +- src/backend/catalog/system_views.sql | 3 + src/backend/commands/copy.c | 6 + src/backend/commands/foreigncmds.c | 30 + src/backend/executor/execPartition.c | 8 + src/backend/executor/nodeForeignscan.c | 24 + src/backend/executor/nodeModifyTable.c | 6 + src/backend/foreign/foreign.c | 55 + src/backend/postmaster/bgworker.c | 8 + src/backend/postmaster/pgstat.c | 18 + src/backend/postmaster/postmaster.c | 15 +- src/backend/replication/logical/decode.c | 1 + src/backend/storage/ipc/ipci.c | 6 + src/backend/storage/ipc/procarray.c | 46 + src/backend/storage/lmgr/lwlocknames.txt | 3 + src/backend/storage/lmgr/proc.c | 8 + src/backend/tcop/postgres.c | 14 + src/backend/utils/misc/guc.c | 79 + src/backend/utils/misc/postgresql.conf.sample | 16 + src/backend/utils/probes.d | 2 + src/bin/initdb/initdb.c | 1 + src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_resetwal/pg_resetwal.c | 2 + src/bin/pg_waldump/fdwxactdesc.c | 1 + src/bin/pg_waldump/rmgrdesc.c | 1 + src/include/access/fdwxact.h | 164 + src/include/access/fdwxact_launcher.h | 28 + src/include/access/fdwxact_resolver.h | 23 + src/include/access/fdwxact_xlog.h | 54 + 
src/include/access/resolver_internal.h | 63 + src/include/access/rmgrlist.h | 1 + src/include/access/twophase.h | 1 + src/include/access/xact.h | 7 + src/include/access/xlog_internal.h | 1 + src/include/catalog/pg_control.h | 1 + src/include/catalog/pg_proc.dat | 22 + src/include/foreign/fdwapi.h | 12 + src/include/foreign/foreign.h | 1 + src/include/pgstat.h | 6 + src/include/storage/proc.h | 11 + src/include/storage/procarray.h | 5 + src/include/utils/guc_tables.h | 2 + src/test/regress/expected/rules.out | 7 + 55 files changed, 4824 insertions(+), 17 deletions(-) create mode 100644 src/backend/access/fdwxact/Makefile create mode 100644 src/backend/access/fdwxact/README create mode 100644 src/backend/access/fdwxact/fdwxact.c create mode 100644 src/backend/access/fdwxact/launcher.c create mode 100644 src/backend/access/fdwxact/resolver.c create mode 100644 src/backend/access/rmgrdesc/fdwxactdesc.c create mode 120000 src/bin/pg_waldump/fdwxactdesc.c create mode 100644 src/include/access/fdwxact.h create mode 100644 src/include/access/fdwxact_launcher.h create mode 100644 src/include/access/fdwxact_resolver.h create mode 100644 src/include/access/fdwxact_xlog.h create mode 100644 src/include/access/resolver_internal.h diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 0880e0a8bb..49480dd039 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -9,6 +9,6 @@ top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - table tablesample transam + table tablesample transam fdwxact include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/fdwxact/Makefile b/src/backend/access/fdwxact/Makefile new file mode 100644 index 0000000000..0207a66fb4 --- /dev/null +++ b/src/backend/access/fdwxact/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/fdwxact +# +# IDENTIFICATION +# src/backend/access/fdwxact/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/fdwxact +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = fdwxact.o resolver.o launcher.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/fdwxact/README b/src/backend/access/fdwxact/README new file mode 100644 index 0000000000..3cfa06d32f --- /dev/null +++ b/src/backend/access/fdwxact/README @@ -0,0 +1,109 @@ +src/backend/access/fdwxact/README + +Atomic Commit for Distributed Transactions +=========================================== + +The atomic commit feature enables us to commit and rollback either all of +foreign servers or nothing. This ensures that the database data is always left +in a consistent state in term of federated database. + + +Commit Sequence of Global Transactions +--------------------------------------- + +We employee two-phase commit protocol to achieve commit among all foreign +servers atomically. The sequence of distributed transaction commit consists +of the following four steps: + +1. Foreign Server Registration +During executor node initialization, accessed foreign servers are registered +to the list FdwXactParticipants, which is maintained by PostgreSQL's the +global transaction manager (GTM), as a distributed transaction participant. 
+The registered foreign transactions are tracked until the end of the transaction. + +2. Pre-Commit phase (1st phase of two-phase commit) +We record the corresponding WAL indicating that the foreign server is involved +with the current transaction before doing PREPARE on each foreign transaction. +Thus in case we lose connectivity to the foreign server or crash ourselves, +we will remember that we might have a prepared transaction on the foreign +server, and try to resolve it when connectivity is restored or after crash +recovery. + +The two-phase commit is required only if the transaction modified two or more +servers including the local node. In other cases, we can commit them at this +step by calling the CommitForeignTransaction() API and no further operation is needed. + +After that we prepare all foreign transactions by calling the +PrepareForeignTransaction() API. If we fail on any of them we change to +rollback, therefore at this time some participants might be prepared whereas +some are not prepared. The former foreign transactions need to be resolved +manually using pg_resolve_foreign_xact() and the latter end their transactions +in one phase by calling the RollbackForeignTransaction() API. + +3. Commit locally +Once we've prepared all of them, commit the transaction locally. + +4. Post-Commit Phase (2nd phase of two-phase commit) +The steps so far are done by the backend process committing the transaction but +this resolution step (commit or rollback) is done by the foreign transaction +resolver process. The backend process inserts itself into the wait queue, and +then wakes up the resolver process (or requests to launch a new one if necessary). +The resolver process dequeues the waiter and fetches the distributed transaction +information that the backend is waiting for. Once all foreign transactions are +committed or rolled back the resolver process wakes up the waiter. 
+ + +Foreign Data Wrapper Callbacks for Transaction Management +----------------------------------------------------------- + +The core GTM manages the status of individual foreign transactions and calls +transaction management callback functions according to their status. Each of the +callback functions PrepareForeignTransaction, CommitForeignTransaction and +RollbackForeignTransaction is responsible for PREPARE, COMMIT or ROLLBACK of +the transaction on the foreign server, respectively. +FdwXactRslvState->flags could contain FDWXACT_FLAG_ONEPHASE, meaning the FDW can +commit or roll back the foreign transaction in one phase. On failure while +processing a foreign transaction, the FDW needs to raise an error. However, the FDW +must tolerate an ERRCODE_UNDEFINED_OBJECT error while committing or rolling back a +foreign transaction, because there is a race condition in which the coordinator +could crash between the completion of the resolution and the writing of the WAL record +removing the FdwXact entry. + + +Foreign Transactions Status +---------------------------- + +Every foreign transaction will have an FdwXact entry. When preparing a foreign +transaction, an FdwXact entry whose status starts from FDWXACT_STATUS_PREPARING +is created with WAL logging. The status changes to FDWXACT_STATUS_PREPARED +after the foreign transaction is prepared and it changes to +FDWXACT_STATUS_COMMITTING or FDWXACT_STATUS_ABORTING before the foreign +transaction is committed or aborted by the FDW callback functions, respectively. +The FdwXact entry is removed, with WAL logging, once the foreign transaction is +resolved. + +FdwXact entries recovered during recovery are marked as in-doubt if the +corresponding local transaction is not a prepared transaction. The initial +status is FDWXACT_STATUS_PREPARED(*1). Because the foreign transaction was +being processed we cannot know the exact status. So we regard it as PREPARED +for safety. 
+ +The foreign transaction status transition is illustrated by the following graph +describing the FdwXact->status: + + +----------------------------------------------------+ + | PREPARING |----+ + +----------------------------------------------------+ | + | | + v | + +----------------------------------------------------+ | + | PREPARED(*1) | | (*2) + +----------------------------------------------------+ | + | | | + v v | + +--------------------+ +--------------------+ | + | COMMITTING(*1) | | ABORTING(*1) |<---+ + +--------------------+ +--------------------+ + +(*1) Recovered FdwXact entries starts with PREPARED +(*2) Paths when an error occurrs during preparing diff --git a/src/backend/access/fdwxact/fdwxact.c b/src/backend/access/fdwxact/fdwxact.c new file mode 100644 index 0000000000..76b973b473 --- /dev/null +++ b/src/backend/access/fdwxact/fdwxact.c @@ -0,0 +1,2754 @@ +/*------------------------------------------------------------------------- + * + * fdwxact.c + * PostgreSQL global transaction manager for foreign servers. + * + * To achieve commit among all foreign servers atomically, we employee + * two-phase commit protocol, which is a type of atomic commitment + * protocol(ACP). The basic strategy is that we prepare all of the remote + * transactions before committing locally and commit them after committing + * locally. + * + * Two-phase commit protocol is used when the transaction modified two or + * more servers including the local node. If two-phase commit protocol + * is not required all foreign transactions are committed at pre-commit + * phase. + * + * During executor node initialization, they can register the foreign server + * by calling either RegisterFdwXactByRelId() or RegisterFdwXactByServerId() + * to participate it to a group for global commit. The foreign servers are + * registered if FDW has both CommitForeignTransaction API and + * RollbackForeignTransaction API. 
Registered participant servers are + * identified by OIDs of foreign server and user. + * + * During pre-commit of local transaction, we prepare the transaction on + * all foreign servers. And after committing or rolling back locally, + * we notify the resolver process and tell it to commit or rollback those + * transactions. If we ask to commit, we also tell to notify us when + * it's done, so that we can wait interruptibly to finish, and so that + * we're not trying to locally do work that might fail after foreign + * transaction are committed. + * + * The best performing way to manage the waiting backends is to have a + * queue of waiting backends, so that we can avoid searching the through all + * foreign transactions each time we receive a request. We have one queue + * of which elements are ordered by the timestamp when they expect to be + * processed. Before waiting for foreign transactions being resolved the + * backend enqueues with the timestamp when they expects to be processed. + * On failure, it enqueues again with new timestamp (last timestamp + + * foreign_xact_resolution_interval). + * + * If server crash occurs or user canceled waiting the prepared foreign + * transactions are left without a holder. Such foreign transactions are + * resolved automatically by the resolver process. + * + * LOCKING + * + * Whenever a foreign transaction is processed, the corresponding FdwXact + * entry is update. To avoid holding the lock during transaction processing + * which may take an unpredicatable time the in-memory data of foreign + * transaction follows a locking model based on the following linked concepts: + * + * * All FdwXact fields except for status are protected by FdwXactLock. The + * status is protected by its mutex. + * * A process who is going to process foreign transaction needs to set locking_backend + * of the FdwXact entry to lock the entry, which prevents the entry from + * being updated and removed by concurrent processes. 
+ * * FdwXact entries whose local transaction is either being processed + * (fdwxact->owner is not NULL) or prepared (TwoPhaseExists() is true) can be + * processed by neither pg_resolve_foreign_xact(), pg_remove_foreign_xact() nor + * automatic resolution. + * + * RECOVERY + * + * During replay WAL and replication FdwXactCtl also holds information about + * active prepared foreign transaction that haven't been moved to disk yet. + * + * Replay of fdwxact records happens by the following rules: + * + * * At the beginning of recovery, pg_fdwxacts is scanned once, filling FdwXact + * with entries marked with fdwxact->inredo and fdwxact->ondisk. FdwXact file + * data older than the XID horizon of the redo position are discarded. + * * On PREPARE redo, the foreign transaction is added to FdwXactCtl->fdwxacts. + * We set fdwxact->inredo to true for such entries. + * * On Checkpoint we iterate through FdwXactCtl->fdwxacts entries that + * have fdwxact->inredo set and are behind the redo_horizon. We save + * them to disk and then set fdwxact->ondisk to true. + * * On resolution we delete the entry from FdwXactCtl->fdwxacts. If + * fdwxact->ondisk is true, the corresponding entry from the disk is + * additionally deleted. + * * RecoverFdwXacts() and PrescanFdwXacts() have been modified to go through + * fdwxact->inredo entries that have not made it to disk. 
+ * + * These replay rules are borrowed from twophase.c + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/fdwxact/fdwxact.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include + +#include "access/fdwxact.h" +#include "access/fdwxact_resolver.h" +#include "access/fdwxact_launcher.h" +#include "access/fdwxact_xlog.h" +#include "access/resolver_internal.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "foreign/fdwapi.h" +#include "foreign/foreign.h" +#include "funcapi.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "parser/parsetree.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lock.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +/* Foreign twophase commit is enabled and requested by user */ +#define IsForeignTwophaseCommitRequested() \ + (foreign_twophase_commit > FOREIGN_TWOPHASE_COMMIT_DISABLED) + +/* Check the FdwXactParticipant is capable of two-phase commit */ +#define ServerSupportTransactionCallack(fdw_part) \ + (((FdwXactParticipant *)(fdw_part))->commit_foreign_xact_fn != NULL) +#define SeverSupportTwophaseCommit(fdw_part) \ + (((FdwXactParticipant *)(fdw_part))->prepare_foreign_xact_fn != NULL) + +/* + * Structure to bundle the foreign transaction participant. 
This struct + * is created at the beginning of execution for each foreign servers and + * is used until the end of transaction where we cannot look at syscaches. + * Therefore, this is allocated in the TopTransactionContext. + */ +typedef struct FdwXactParticipant +{ + /* + * Pointer to a FdwXact entry in the global array. NULL if the entry is + * not inserted yet but this is registered as a participant. + */ + FdwXact fdwxact; + + /* Foreign server and user mapping info, passed to callback routines */ + ForeignServer *server; + UserMapping *usermapping; + + /* Transaction identifier used for PREPARE */ + char *fdwxact_id; + + /* true if modified the data on the server */ + bool modified; + + /* Callbacks for foreign transaction */ + PrepareForeignTransaction_function prepare_foreign_xact_fn; + CommitForeignTransaction_function commit_foreign_xact_fn; + RollbackForeignTransaction_function rollback_foreign_xact_fn; + GetPrepareId_function get_prepareid_fn; +} FdwXactParticipant; + +/* + * List of foreign transactions involved in the transaction. A member of + * participants may not support transaction callbacks: commit, rollback and + * prepare. If a member of participants doesn't support any transaction + * callbacks, i.g. ServerSupportTransactionCallack() returns false, + * we don't end its transaction. + * + * FdwXactParticipants_tmp is used to update FdwXactParticipants atomically + * when executing COMMIT/ROLLBACK PREPARED command. In COMMIT PREPARED case, + * we don't want to rollback foreign transactions even if an error occurs, + * because the local prepared transaction never turn over rollback in that + * case. However, preparing FdwXactParticipants might be lead an error + * because of calling palloc() inside. So we prepare FdwXactParticipants in + * two phase. In the first phase, PrepareFdwXactParticipants(), we collect + * all foreign transactions associated with the local prepared transactions + * and kept them in FdwXactParticipants_tmp. 
Even if an error occurs during + * that, we don't rollback them. In the second phase, SetFdwXactParticipants(), + * we replace FdwXactParticipants_tmp with FdwXactParticipants and hold them. + * + * FdwXactLocalXid is the local transaction id associated with FdwXactParticipants. + */ +static List *FdwXactParticipants = NIL; +static List *FdwXactParticipants_tmp = NIL; +static TransactionId FdwXactLocalXid = InvalidTransactionId; + +/* + * True is the current transaction needs to be committed together with + * foreign servers. + */ +static bool ForeignTwophaseCommitIsRequired = false; + +/* Directory where the foreign prepared transaction files will reside */ +#define FDWXACTS_DIR "pg_fdwxact" + +/* + * Name of foreign prepared transaction file is 8 bytes database oid, + * xid, foreign server oid and user oid separated by '_'. + * + * Since FdwXact stat file is created per foreign transaction in a + * distributed transaction and the xid of unresolved distributed + * transaction never reused, the name is fairly enough to ensure + * uniqueness. + */ +#define FDWXACT_FILE_NAME_LEN (8 + 1 + 8 + 1 + 8 + 1 + 8) +#define FdwXactFilePath(path, dbid, xid, serverid, userid) \ + snprintf(path, MAXPGPATH, FDWXACTS_DIR "/%08X_%08X_%08X_%08X", \ + dbid, xid, serverid, userid) + +/* Guc parameters */ +int max_prepared_foreign_xacts = 0; +int max_foreign_xact_resolvers = 0; +int foreign_twophase_commit = FOREIGN_TWOPHASE_COMMIT_DISABLED; + +/* Keep track of registering process exit call back. 
*/ +static bool fdwXactExitRegistered = false; + +static void register_fdwxact(Oid serverid, Oid userid, bool modified); +static void FdwXactParticipantEndTransaction(FdwXactParticipant *fdw_part, bool commit); +static bool checkForeignTwophaseCommitRequired(void); +static FdwXact FdwXactInsertFdwXactEntry(TransactionId xid, FdwXactParticipant *fdw_part); +static FdwXact insert_fdwxact(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, Oid umid, char *fdwxact_id); +static void FdwXactPrepareForeignTransactions(bool prepare_all); +static void FdwXactResolveOneFdwXact(FdwXact fdwxact); +static void FdwXactComputeRequiredXmin(void); +static void FdwXactCancelWait(void); +static void FdwXactRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn); +static void FdwXactRedoRemove(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, bool give_warnings); +static void FdwXactQueueInsert(PGPROC *waiter); +static void AtProcExit_FdwXact(int code, Datum arg); +static char *ReadFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, + Oid userid); +static void RemoveFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, bool giveWarning); +static void RecreateFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, void *content, int len); +static void XlogReadFdwXactData(XLogRecPtr lsn, char **buf, int *len); +static char *ProcessFdwXactBuffer(Oid dbid, TransactionId local_xid, + Oid serverid, Oid userid, + XLogRecPtr insert_start_lsn, + bool from_disk); +static TransactionId FdwXactDetermineTransactionFate(TransactionId xid); +static int get_fdwxact(Oid dbid, TransactionId xid, Oid serverid, Oid userid); +static char *get_fdwxact_identifier(FdwXactParticipant *fdw_part, + TransactionId xid); +static void remove_fdwxact(FdwXact fdwxact); +static FdwXactParticipant *create_fdwxact_participant(Oid serverid, Oid userid, + FdwRoutine *routine); + +#ifdef USE_ASSERT_CHECKING +static bool FdwXactQueueIsOrderedByTimestamp(void); +#endif + +/* + 
* Calculates the size of shared memory allocated for maintaining foreign + * prepared transaction entries. + */ +Size +FdwXactShmemSize(void) +{ + Size size; + + /* Size for foreign transaction information array */ + size = offsetof(FdwXactCtlData, fdwxacts); + size = add_size(size, mul_size(max_prepared_foreign_xacts, + sizeof(FdwXact))); + size = MAXALIGN(size); + size = add_size(size, mul_size(max_prepared_foreign_xacts, + sizeof(FdwXactData))); + + return size; +} + +/* + * Initialization of shared memory for maintaining foreign prepared transaction + * entries. The shared memory layout is defined in definition of FdwXactCtlData + * structure. + */ +void +FdwXactShmemInit(void) +{ + bool found; + + FdwXactCtl = ShmemInitStruct("Foreign transactions table", + FdwXactShmemSize(), + &found); + if (!IsUnderPostmaster) + { + FdwXact fdwxacts; + int cnt; + + Assert(!found); + FdwXactCtl->free_fdwxacts = NULL; + FdwXactCtl->num_fdwxacts = 0; + + /* Initialize the linked list of free FDW transactions */ + fdwxacts = (FdwXact) + ((char *) FdwXactCtl + + MAXALIGN(offsetof(FdwXactCtlData, fdwxacts) + + sizeof(FdwXact) * max_prepared_foreign_xacts)); + for (cnt = 0; cnt < max_prepared_foreign_xacts; cnt++) + { + fdwxacts[cnt].status = FDWXACT_STATUS_INVALID; + fdwxacts[cnt].fdwxact_free_next = FdwXactCtl->free_fdwxacts; + FdwXactCtl->free_fdwxacts = &fdwxacts[cnt]; + SpinLockInit(&fdwxacts[cnt].mutex); + } + } + else + { + Assert(FdwXactCtl); + Assert(found); + } +} + +/* + * Remember accessed foreign transaction. Both RegisterFdwXactByRelId and + * RegisterFdwXactByServerId are called by executor during initialization. + */ +void +RegisterFdwXactByRelId(Oid relid, bool modified) +{ + Relation rel; + Oid serverid; + Oid userid; + + rel = relation_open(relid, NoLock); + serverid = GetForeignServerIdByRelId(relid); + userid = rel->rd_rel->relowner ? 
rel->rd_rel->relowner : GetUserId(); + relation_close(rel, NoLock); + + register_fdwxact(serverid, userid, modified); +} + +void +RegisterFdwXactByServerId(Oid serverid, bool modified) +{ + register_fdwxact(serverid, GetUserId(), modified); +} + +/* + * Register given foreign transaction identified by given arguments as + * a participant of the transaction. The foreign transaction identified + * by given server id and user id. + */ +static void +register_fdwxact(Oid serverid, Oid userid, bool modified) +{ + FdwXactParticipant *fdw_part; + MemoryContext old_ctx; + FdwRoutine *routine; + ListCell *lc; + + foreach(lc, FdwXactParticipants) + { + FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lc); + + if (fdw_part->server->serverid == serverid && + fdw_part->usermapping->userid == userid) + { + /* The foreign server is already registered, return */ + fdw_part->modified |= modified; + return; + } + } + + /* on first call, register the exit hook */ + if (!fdwXactExitRegistered) + { + before_shmem_exit(AtProcExit_FdwXact, 0); + fdwXactExitRegistered = true; + } + + routine = GetFdwRoutineByServerId(serverid); + + /* + * Participant's information is also needed at the end of a transaction, + * where system cache are not available. Save it in TopTransactionContext + * so that these can live until the end of transaction. 
+ */ + old_ctx = MemoryContextSwitchTo(TopTransactionContext); + + fdw_part = create_fdwxact_participant(serverid, userid, routine); + fdw_part->modified = modified; + + /* Add to the participants list */ + FdwXactParticipants = lappend(FdwXactParticipants, fdw_part); + + pfree(routine); + + /* Revert back the context */ + MemoryContextSwitchTo(old_ctx); +} + +/* Return palloc'd FdwXactParticipant variable */ +static FdwXactParticipant * +create_fdwxact_participant(Oid serverid, Oid userid, FdwRoutine *routine) +{ + FdwXactParticipant *fdw_part; + ForeignServer *foreign_server; + UserMapping *user_mapping; + + foreign_server = GetForeignServer(serverid); + user_mapping = GetUserMapping(userid, serverid); + + fdw_part = (FdwXactParticipant *) palloc(sizeof(FdwXactParticipant)); + + fdw_part->fdwxact = NULL; + fdw_part->server = foreign_server; + fdw_part->usermapping = user_mapping; + fdw_part->fdwxact_id = NULL; + fdw_part->modified = false; + fdw_part->prepare_foreign_xact_fn = routine->PrepareForeignTransaction; + fdw_part->commit_foreign_xact_fn = routine->CommitForeignTransaction; + fdw_part->rollback_foreign_xact_fn = routine->RollbackForeignTransaction; + fdw_part->get_prepareid_fn = routine->GetPrepareId; + + return fdw_part; +} + +/* + * Prepare all foreign transactions if foreign twophase commit is required. + * When foreign twophase commit is enabled, the behavior depends on the value + * of foreign_twophase_commit; when 'required' we strictly require for all + * foreign servers' FDW to support two-phase commit protocol and ask them to + * prepare foreign transactions, and when 'disabled' we ask all foreign servers + * to commit foreign transaction in one-phase. If we failed to commit any of + * them we change to aborting. + * + * Note that non-modified foreign servers always can be committed without + * preparation. 
+ */ +void +PreCommit_FdwXact(void) +{ + ListCell *lc; + + /* If there are no foreign servers involved, we have no business here */ + if (FdwXactParticipants == NIL) + return; + + /* Set the local transaction id */ + FdwXactLocalXid = GetTopTransactionId(); + + /* + * Check if we need to use foreign twophase commit. Note that we don't + * support foreign twophase commit in single user mode. + */ + if (IsUnderPostmaster && checkForeignTwophaseCommitRequired()) + { + /* + * Prepare foreign transactions on foreign servers that support two-phase + * commit. Note that we keep FdwXactParticipants until the end of the + * transaction. + */ + FdwXactPrepareForeignTransactions(false); + ForeignTwophaseCommitIsRequired = true; + } + else + { + /* + * Commit other foreign transactions and delete the participant entry from + * the list. + */ + foreach(lc, FdwXactParticipants) + { + FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lc); + + Assert(!fdw_part->fdwxact); + + /* Commit the foreign transaction in one-phase */ + if (ServerSupportTransactionCallack(fdw_part)) + FdwXactParticipantEndTransaction(fdw_part, true); + } + + /* + * If we don't need two-phase commit, all participants' transactions should + * be completed at this time. + */ + list_free(FdwXactParticipants); + FdwXactParticipants = NIL; + } +} + +/* + * Return true if the current transaction modifies data on two or more servers + * in FdwXactParticipants and local server itself. + */ +static bool +checkForeignTwophaseCommitRequired(void) +{ + ListCell *lc; + bool need_twophase_commit; + bool have_notwophase = false; + int nserverswritten = 0; + + if (!IsForeignTwophaseCommitRequested()) + return false; + + foreach(lc, FdwXactParticipants) + { + FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lc); + + if (!fdw_part->modified) + continue; + + if (!SeverSupportTwophaseCommit(fdw_part)) + have_notwophase = true; + + nserverswritten++; + } + + /* Did we modify the local non-temporary data? 
*/ + if ((MyXactFlags & XACT_FLAGS_WROTENONTEMPREL) != 0) + nserverswritten++; + + if (nserverswritten <= 1) + return false; + + /* We require for all modified server to support two-phase commit */ + need_twophase_commit = (nserverswritten >= 2); + Assert(foreign_twophase_commit == FOREIGN_TWOPHASE_COMMIT_REQUIRED); + + /* + * If foreign two phase commit is required then all foreign serves must be + * capable of doing two-phase commit + */ + if (need_twophase_commit) + { + /* Parameter check */ + if (max_prepared_foreign_xacts == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign two-phase commit is required but prepared foreign transactions are disabled"), + errhint("Set max_prepared_foreign_transactions to a nonzero value."))); + + if (max_foreign_xact_resolvers == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign two-phase commit is required but prepared foreign transactions are disabled"), + errhint("Set max_foreign_transaction_resolvers to a nonzero value."))); + + if (have_notwophase) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot process a distributed transaction that has operated on a foreign server that does not support two-phase commit protocol"), + errdetail("foreign_twophase_commit is \'required\' but the transaction has some foreign servers which are not capable of two-phase commit"))); + } + + return need_twophase_commit; +} + +/* + * The routine for committing or rolling back the given transaction participant. 
+ */
+static void
+FdwXactParticipantEndTransaction(FdwXactParticipant *fdw_part, bool commit)
+{
+	FdwXactRslvState state;
+
+	Assert(fdw_part->commit_foreign_xact_fn);
+	Assert(fdw_part->rollback_foreign_xact_fn);
+
+	/*
+	 * No FdwXact entry, and therefore no prepared transaction identifier, is
+	 * passed on this path: the FDW is asked to finish the transaction in one
+	 * phase.
+	 */
+	state.xid = FdwXactLocalXid;
+	state.server = fdw_part->server;
+	state.usermapping = fdw_part->usermapping;
+	state.fdwxact_id = NULL;
+	state.flags = FDWXACT_FLAG_ONEPHASE;
+
+	if (commit)
+	{
+		fdw_part->commit_foreign_xact_fn(&state);
+		elog(DEBUG1, "successfully committed the foreign transaction for server %u user %u",
+			 fdw_part->usermapping->serverid,
+			 fdw_part->usermapping->userid);
+	}
+	else
+	{
+		fdw_part->rollback_foreign_xact_fn(&state);
+		elog(DEBUG1, "successfully rolled back the foreign transaction for server %u user %u",
+			 fdw_part->usermapping->serverid,
+			 fdw_part->usermapping->userid);
+	}
+}
+
+/*
+ * Insert FdwXact entries and prepare foreign transactions.  Before inserting
+ * an FdwXact entry we obtain a transaction identifier for it with
+ * get_fdwxact_identifier().  If prepare_all is false, we prepare only
+ * modified foreign transactions.  Participants whose server does not support
+ * two-phase commit are skipped.
+ *
+ * We still can change to rollback here on failure.  If any error occurs, we
+ * rollback non-prepared foreign transactions.
+ */
+static void
+FdwXactPrepareForeignTransactions(bool prepare_all)
+{
+	ListCell   *lc;
+
+	if (FdwXactParticipants == NIL)
+		return;
+
+	/* Loop over the foreign connections */
+	foreach(lc, FdwXactParticipants)
+	{
+		FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lc);
+		FdwXactRslvState state;
+		FdwXact		fdwxact;
+
+		if (!SeverSupportTwophaseCommit(fdw_part))
+			continue;
+
+		if (!prepare_all && !fdw_part->modified)
+			continue;
+
+		/* Get prepared transaction identifier */
+		fdw_part->fdwxact_id = get_fdwxact_identifier(fdw_part, FdwXactLocalXid);
+		Assert(fdw_part->fdwxact_id);
+
+		/*
+		 * Insert the foreign transaction entry with the
+		 * FDWXACT_STATUS_PREPARING status.  Registration persists this
+		 * information to the disk and logs (that way relaying it on standby).
+		 * Thus in case we lose connectivity to the foreign server or crash
+		 * ourselves, we will remember that we might have prepared transaction
+		 * on the foreign server and try to resolve it when connectivity is
+		 * restored or after crash recovery.
+		 *
+		 * If we prepare the transaction on the foreign server before
+		 * persisting the information to the disk and crash in-between these
+		 * two steps, we will lose the prepared transaction on the foreign
+		 * server and will not be able to resolve it after the crash recovery.
+		 * Hence persist first then prepare.
+		 */
+		fdwxact = FdwXactInsertFdwXactEntry(FdwXactLocalXid, fdw_part);
+
+		/*
+		 * Prepare the foreign transaction.
+		 *
+		 * Between FdwXactInsertFdwXactEntry call till this backend hears
+		 * acknowledge from foreign server, the backend may abort the local
+		 * transaction (say, because of a signal).
+		 *
+		 * Every field of the state is filled in before it is handed to the
+		 * FDW.  flags is 0 because this is not a one-phase operation;
+		 * FdwXactResolveOneFdwXact() passes the same value when it resolves a
+		 * prepared transaction.
+		 */
+		state.xid = FdwXactLocalXid;
+		state.server = fdw_part->server;
+		state.usermapping = fdw_part->usermapping;
+		state.fdwxact_id = pstrdup(fdw_part->fdwxact_id);
+		state.flags = 0;
+		fdw_part->prepare_foreign_xact_fn(&state);
+
+		/* succeeded, update status */
+		SpinLockAcquire(&fdwxact->mutex);
+		fdwxact->status = FDWXACT_STATUS_PREPARED;
+		SpinLockRelease(&fdwxact->mutex);
+	}
+}
+
+/*
+ * This function is used to create new foreign transaction entry before an FDW
+ * prepares and commit/rollback.  The function adds the entry to WAL and it will
+ * be persisted to the disk under pg_fdwxact directory when checkpoint.
+ *
+ * The new entry is marked as locked by this backend, is remembered in
+ * fdw_part->fdwxact, and is returned to the caller.
+ */
+static FdwXact
+FdwXactInsertFdwXactEntry(TransactionId xid, FdwXactParticipant *fdw_part)
+{
+	FdwXact		fdwxact;
+	FdwXactOnDiskData *fdwxact_file_data;
+	MemoryContext old_context;
+	int			data_len;
+
+	old_context = MemoryContextSwitchTo(TopTransactionContext);
+
+	/*
+	 * Enter the foreign transaction in the shared memory structure.
+	 */
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+	fdwxact = insert_fdwxact(MyDatabaseId, xid, fdw_part->server->serverid,
+							 fdw_part->usermapping->userid,
+							 fdw_part->usermapping->umid, fdw_part->fdwxact_id);
+	fdwxact->locking_backend = MyBackendId;
+	LWLockRelease(FdwXactLock);
+
+	fdw_part->fdwxact = fdwxact;
+	MemoryContextSwitchTo(old_context);
+
+	/*
+	 * Prepare to write the entry to a file.  Also add xlog entry.  The contents
+	 * of the xlog record are same as what is written to the file.  The record
+	 * is the fixed-size header followed by the NUL-terminated identifier,
+	 * padded to a MAXALIGN boundary.
+	 */
+	data_len = offsetof(FdwXactOnDiskData, fdwxact_id);
+	data_len = data_len + strlen(fdw_part->fdwxact_id) + 1;
+	data_len = MAXALIGN(data_len);
+	fdwxact_file_data = (FdwXactOnDiskData *) palloc0(data_len);
+	fdwxact_file_data->dbid = MyDatabaseId;
+	fdwxact_file_data->local_xid = xid;
+	fdwxact_file_data->serverid = fdw_part->server->serverid;
+	fdwxact_file_data->userid = fdw_part->usermapping->userid;
+	fdwxact_file_data->umid = fdw_part->usermapping->umid;
+	memcpy(fdwxact_file_data->fdwxact_id, fdw_part->fdwxact_id,
+		   strlen(fdw_part->fdwxact_id) + 1);
+
+	/* See note in RecordTransactionCommit */
+	MyProc->delayChkpt = true;
+
+	START_CRIT_SECTION();
+
+	/* Add the entry in the xlog and save LSN for checkpointer */
+	XLogBeginInsert();
+	XLogRegisterData((char *) fdwxact_file_data, data_len);
+	fdwxact->insert_end_lsn = XLogInsert(RM_FDWXACT_ID, XLOG_FDWXACT_INSERT);
+	XLogFlush(fdwxact->insert_end_lsn);
+
+	/* If we crash now, we have prepared: WAL replay will fix things */
+
+	/* Store record's start location to read that later on CheckPoint */
+	fdwxact->insert_start_lsn = ProcLastRecPtr;
+
+	/* The WAL record is flushed, so the entry can be marked valid */
+	fdwxact->valid = true;
+
+	/* Checkpoint can process now */
+	MyProc->delayChkpt = false;
+
+	END_CRIT_SECTION();
+
+	pfree(fdwxact_file_data);
+	return fdwxact;
+}
+
+/*
+ * Insert a new entry for a given foreign transaction identified by transaction
+ * id, foreign server and user mapping, into the
shared memory array. Caller
+ * must hold FdwXactLock in exclusive mode.
+ *
+ * An error is raised if a valid entry for the same database, transaction id,
+ * server and user already exists, or if no free entry is left.  The returned
+ * entry starts out in FDWXACT_STATUS_PREPARING, owned by MyProc, not yet
+ * valid and not locked by any backend.
+ */
+static FdwXact
+insert_fdwxact(Oid dbid, TransactionId xid, Oid serverid, Oid userid,
+			   Oid umid, char *fdwxact_id)
+{
+	FdwXact		fdwxact;
+	int			idx;
+
+	Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE));
+
+	/* Refuse to register the same foreign transaction twice */
+	idx = 0;
+	while (idx < FdwXactCtl->num_fdwxacts)
+	{
+		fdwxact = FdwXactCtl->fdwxacts[idx++];
+
+		if (!fdwxact->valid)
+			continue;
+
+		if (fdwxact->dbid == dbid && fdwxact->local_xid == xid &&
+			fdwxact->serverid == serverid && fdwxact->userid == userid)
+			ereport(ERROR, (errmsg("could not insert a foreign transaction entry"),
+							errdetail("Duplicate entry with transaction id %u, serverid %u, userid %u exists.",
+									  xid, serverid, userid)));
+	}
+
+	/* Pop the head of the free list, failing if the list is empty */
+	fdwxact = FdwXactCtl->free_fdwxacts;
+	if (fdwxact == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("maximum number of foreign transactions reached"),
+				 errhint("Increase max_prepared_foreign_transactions: \"%d\".",
+						 max_prepared_foreign_xacts)));
+	FdwXactCtl->free_fdwxacts = fdwxact->fdwxact_free_next;
+
+	/* Append the entry to the array of active entries */
+	Assert(FdwXactCtl->num_fdwxacts < max_prepared_foreign_xacts);
+	FdwXactCtl->fdwxacts[FdwXactCtl->num_fdwxacts] = fdwxact;
+	FdwXactCtl->num_fdwxacts++;
+
+	/* Identity of the foreign transaction */
+	fdwxact->dbid = dbid;
+	fdwxact->local_xid = xid;
+	fdwxact->serverid = serverid;
+	fdwxact->userid = userid;
+	fdwxact->umid = umid;
+	memcpy(fdwxact->fdwxact_id, fdwxact_id, strlen(fdwxact_id) + 1);
+
+	/* Initial state: being prepared by us, nothing written to WAL or disk */
+	fdwxact->status = FDWXACT_STATUS_PREPARING;
+	fdwxact->owner = MyProc;
+	fdwxact->locking_backend = InvalidBackendId;
+	fdwxact->insert_start_lsn = InvalidXLogRecPtr;
+	fdwxact->insert_end_lsn = InvalidXLogRecPtr;
+	fdwxact->valid = false;
+	fdwxact->ondisk = false;
+	fdwxact->inredo = false;
+
+	return fdwxact;
+}
+
+/*
+ * Remove
the foreign prepared transaction entry from shared memory.
+ * Caller must hold FdwXactLock in exclusive mode.
+ *
+ * The entry is taken out of the active array and put back on the free list.
+ * Unless recovery is in progress, an XLOG_FDWXACT_REMOVE record is also
+ * written and flushed.  An error is raised if the entry is not in the array.
+ */
+static void
+remove_fdwxact(FdwXact fdwxact)
+{
+	int			i;
+
+	Assert(fdwxact != NULL);
+	Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE));
+
+	/* Search the slot where this entry resided */
+	for (i = 0; i < FdwXactCtl->num_fdwxacts; i++)
+	{
+		if (FdwXactCtl->fdwxacts[i] == fdwxact)
+			break;
+	}
+
+	/* We did not find the given entry in the array */
+	if (i >= FdwXactCtl->num_fdwxacts)
+		ereport(ERROR,
+				(errmsg("could not remove a foreign transaction entry"),
+				 errdetail("Failed to find entry for xid %u, foreign server %u, and user %u.",
+						   fdwxact->local_xid, fdwxact->serverid, fdwxact->userid)));
+
+	/* dbid and userid are Oids, i.e. unsigned, hence %u */
+	elog(DEBUG2, "remove fdwxact entry id %s, xid %u db %u user %u",
+		 fdwxact->fdwxact_id, fdwxact->local_xid, fdwxact->dbid,
+		 fdwxact->userid);
+
+	/*
+	 * Remove the entry from active array by moving the last active entry
+	 * into the vacated slot.  Note that this changes the index of that last
+	 * entry.
+	 */
+	FdwXactCtl->num_fdwxacts--;
+	FdwXactCtl->fdwxacts[i] = FdwXactCtl->fdwxacts[FdwXactCtl->num_fdwxacts];
+
+	/* Put it back into free list */
+	fdwxact->fdwxact_free_next = FdwXactCtl->free_fdwxacts;
+	FdwXactCtl->free_fdwxacts = fdwxact;
+
+	/* Reset informations */
+	fdwxact->status = FDWXACT_STATUS_INVALID;
+	fdwxact->owner = NULL;
+	fdwxact->locking_backend = InvalidBackendId;
+	fdwxact->valid = false;
+	fdwxact->ondisk = false;
+	fdwxact->inredo = false;
+
+	if (!RecoveryInProgress())
+	{
+		xl_fdwxact_remove record;
+		XLogRecPtr	recptr;
+
+		/*
+		 * Fill up the log record before releasing the entry.  The identity
+		 * fields read here are not touched by the reset above.
+		 */
+		record.serverid = fdwxact->serverid;
+		record.dbid = fdwxact->dbid;
+		record.xid = fdwxact->local_xid;
+		record.userid = fdwxact->userid;
+
+		/*
+		 * Now writing FdwXact state data to WAL. We have to set delayChkpt
+		 * here, otherwise a checkpoint starting immediately after the WAL
+		 * record is inserted could complete without fsync'ing our state file.
+		 * (This is essentially the same kind of race condition as the
+		 * COMMIT-to-clog-write case that RecordTransactionCommit uses
+		 * delayChkpt for; see notes there.)
+		 */
+		START_CRIT_SECTION();
+
+		MyProc->delayChkpt = true;
+
+		/*
+		 * Log that we are removing the foreign transaction entry and remove
+		 * the file from the disk as well.
+		 */
+		XLogBeginInsert();
+		XLogRegisterData((char *) &record, sizeof(xl_fdwxact_remove));
+		recptr = XLogInsert(RM_FDWXACT_ID, XLOG_FDWXACT_REMOVE);
+		XLogFlush(recptr);
+
+		/* Now we can mark ourselves as out of the commit critical section */
+		MyProc->delayChkpt = false;
+
+		END_CRIT_SECTION();
+	}
+}
+
+/*
+ * Return a null-terminated, palloc'd foreign transaction identifier.  If the
+ * given foreign server's FDW provides the get_prepareid_fn callback we return
+ * the identifier returned from it.  Otherwise we generate an unique identifier
+ * in the form of "fx_<random number>_<xid>_<serverid>_<userid>" whose length
+ * is less than FDWXACT_ID_MAX_LEN.
+ *
+ * Returned string value is used to identify foreign transaction. The
+ * identifier should not be same as any other concurrent prepared transaction
+ * identifier.
+ *
+ * To make the foreign transactionid unique, we should ideally use something
+ * like UUID, which gives unique ids with high probability, but that may be
+ * expensive here and UUID extension which provides the function to generate
+ * UUID is not part of the core code.
+ *
+ * An error is raised if the callback returns NULL or an identifier that is
+ * not shorter than FDWXACT_ID_MAX_LEN.
+ */
+static char *
+get_fdwxact_identifier(FdwXactParticipant *fdw_part, TransactionId xid)
+{
+	char	   *id;
+	int			id_len = 0;
+
+	/*
+	 * If FDW doesn't provide the callback function, generate an unique
+	 * identifier.
+	 */
+	if (!fdw_part->get_prepareid_fn)
+	{
+		char		buf[FDWXACT_ID_MAX_LEN] = {0};
+
+		/* serverid and userid are Oids, i.e. unsigned, hence %u */
+		snprintf(buf, FDWXACT_ID_MAX_LEN, "fx_%ld_%u_%u_%u",
+				 Abs(random()), xid, fdw_part->server->serverid,
+				 fdw_part->usermapping->userid);
+
+		return pstrdup(buf);
+	}
+
+	/* Get an unique identifier from callback function */
+	id = fdw_part->get_prepareid_fn(xid, fdw_part->server->serverid,
+									fdw_part->usermapping->userid,
+									&id_len);
+
+	if (id == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 (errmsg("foreign transaction identifier is not provided"))));
+
+	/*
+	 * Check length of foreign transaction identifier.  The identifier plus
+	 * its terminating NUL must fit in FDWXACT_ID_MAX_LEN bytes (the same
+	 * bound the generated identifier above obeys), so a length equal to
+	 * FDWXACT_ID_MAX_LEN is already too long.  The buffer belongs to the FDW;
+	 * print a bounded prefix rather than writing a terminator into it.
+	 * NOTE(review): this assumes the shared-memory fdwxact_id field is
+	 * FDWXACT_ID_MAX_LEN bytes wide -- confirm against fdwxact.h.
+	 */
+	if (id_len >= FDWXACT_ID_MAX_LEN)
+		ereport(ERROR,
+				(errcode(ERRCODE_NAME_TOO_LONG),
+				 errmsg("foreign transaction identifier \"%.*s\" is too long",
+						FDWXACT_ID_MAX_LEN, id),
+				 errdetail("Foreign transaction identifier must be less than %d characters.",
+						   FDWXACT_ID_MAX_LEN)));
+
+	/* Copy exactly id_len bytes and terminate the copy, not the original */
+	return pnstrdup(id, id_len);
+}
+
+/*
+ * Prepare foreign transactions by PREPARE TRANSACTION command.
+ *
+ * Note that it's possible that the transaction aborts after we prepared some
+ * of participants. In this case we change to rollback and rollback all foreign
+ * transactions.
+ */
+void
+AtPrepare_FdwXact(void)
+{
+	ListCell   *lc;
+
+	if (FdwXactParticipants == NIL)
+		return;
+
+	/* Set the local transaction id */
+	FdwXactLocalXid = GetTopTransactionId();
+
+	/* Check for an invalid condition */
+	if (!IsForeignTwophaseCommitRequested())
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot PREPARE a distributed transaction when foreign_twophase_commit is \'disabled\'")));
+
+	/*
+	 * Check if there is a server that doesn't support two-phase commit. All involved
+	 * servers need to support two-phase commit as we prepare on them regardless of
+	 * modified or not.
+	 */
+	foreach(lc, FdwXactParticipants)
+	{
+		FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lc);
+
+		if (!SeverSupportTwophaseCommit(fdw_part))
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot PREPARE a distributed transaction which has operated on a foreign server not supporting two-phase commit protocol")));
+	}
+
+	/* Prepare transactions on participating foreign servers. */
+	FdwXactPrepareForeignTransactions(true);
+
+	/*
+	 * We keep prepared foreign transaction participants to rollback them in case
+	 * of failure.
+	 */
+}
+
+/*
+ * After PREPARE TRANSACTION, we forget all participants.
+ */
+void
+PostPrepare_FdwXact(void)
+{
+	if (FdwXactParticipants == NIL)
+	{
+		Assert(FdwXactParticipants_tmp == NIL);
+		Assert(!ForeignTwophaseCommitIsRequired);
+		return;
+	}
+
+	ForgetAllFdwXactParticipants();
+}
+
+/*
+ * Collect all foreign transactions associated with the given xid if it's a prepared
+ * transaction. Return true if COMMIT PREPARED or ROLLBACK PREPARED needs to wait for
+ * all foreign transactions to be resolved. The collected foreign transactions are kept
+ * in FdwXactParticipants_tmp. The caller must call SetFdwXactParticipants() later
+ * if this function returns true.
+ */
+bool
+PrepareFdwXactParticipants(TransactionId xid)
+{
+	MemoryContext saved_cxt;
+	int			idx;
+
+	Assert(FdwXactParticipants_tmp == NIL);
+
+	/* Only prepared transactions can have foreign transactions to collect */
+	if (!TwoPhaseExists(xid))
+		return false;
+
+	/* The participant list has to live in TopTransactionContext */
+	saved_cxt = MemoryContextSwitchTo(TopTransactionContext);
+
+	LWLockAcquire(FdwXactLock, LW_SHARED);
+	for (idx = 0; idx < FdwXactCtl->num_fdwxacts; idx++)
+	{
+		FdwXact		entry = FdwXactCtl->fdwxacts[idx];
+
+		if (entry->valid && entry->local_xid == xid)
+		{
+			FdwRoutine *fdwroutine;
+			FdwXactParticipant *participant;
+
+			fdwroutine = GetFdwRoutineByServerId(entry->serverid);
+			participant = create_fdwxact_participant(entry->serverid,
+													 entry->userid,
+													 fdwroutine);
+			participant->modified = true;
+			participant->fdwxact = entry;
+
+			/* Remember it in the temporary participants list */
+			FdwXactParticipants_tmp = lappend(FdwXactParticipants_tmp,
+											  participant);
+		}
+	}
+	LWLockRelease(FdwXactLock);
+
+	MemoryContextSwitchTo(saved_cxt);
+
+	/* No foreign transaction belongs to this xid: nothing to wait for */
+	if (FdwXactParticipants_tmp == NIL)
+		return false;
+
+	/*
+	 * There are prepared foreign transactions, but they cannot be committed
+	 * or rolled back while foreign_twophase_commit is disabled.
+	 */
+	if (!IsForeignTwophaseCommitRequested())
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot process a prepared foreign transaction commit when foreign_twophase_commit is \'disabled\'")));
+
+	return true;
+}
+
+/*
+ * Set the collected foreign transactions to the participants of this transaction,
+ * and hold them. This function must be called after PrepareFdwXactParticipants().
+ */
+void
+SetFdwXactParticipants(TransactionId xid)
+{
+	ListCell   *cell;
+
+	Assert(FdwXactParticipants_tmp != NIL);
+	Assert(FdwXactParticipants == NIL);
+
+	/* Promote the collected list to be this transaction's participants */
+	FdwXactLocalXid = xid;
+	FdwXactParticipants = FdwXactParticipants_tmp;
+	FdwXactParticipants_tmp = NIL;
+
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+	foreach(cell, FdwXactParticipants)
+	{
+		FdwXactParticipant *participant = (FdwXactParticipant *) lfirst(cell);
+
+		/* Every collected entry must be prepared and not held by anyone */
+		Assert(SeverSupportTwophaseCommit(participant));
+		Assert(participant->fdwxact->status == FDWXACT_STATUS_PREPARED);
+		Assert(participant->fdwxact->locking_backend == InvalidBackendId);
+		Assert(!participant->fdwxact->owner);
+
+		/* Take hold of the entry on behalf of this backend */
+		participant->fdwxact->locking_backend = MyBackendId;
+		participant->fdwxact->owner = MyProc;
+	}
+	LWLockRelease(FdwXactLock);
+}
+
+bool
+FdwXactIsForeignTwophaseCommitRequired(void)
+{
+	/* Simply expose the flag kept in ForeignTwophaseCommitIsRequired */
+	return ForeignTwophaseCommitIsRequired;
+}
+
+void
+FdwXactCleanupAtProcExit(void)
+{
+	/* Nothing to do unless this process is still linked into the wait queue */
+	if (SHMQueueIsDetached(&(MyProc->fdwXactLinks)))
+		return;
+
+	LWLockAcquire(FdwXactResolutionLock, LW_EXCLUSIVE);
+	SHMQueueDelete(&(MyProc->fdwXactLinks));
+	LWLockRelease(FdwXactResolutionLock);
+}
+
+/*
+ * Process-exit callback: forget every participant we still remember.
+ */
+static void
+AtProcExit_FdwXact(int code, Datum arg)
+{
+	ForgetAllFdwXactParticipants();
+}
+
+/*
+ * Wait for its all foreign transactions to be resolved.
+ *
+ * Initially backends start in state FDWXACT_NOT_WAITING and then change
+ * that state to FDWXACT_WAITING before adding ourselves to the wait queue.
+ * During FdwXactResolveForeignTransaction a fdwxact resolver changes the
+ * state to FDWXACT_WAIT_COMPLETE once all foreign transactions are resolved.
+ * This backend then resets its state to FDWXACT_NOT_WAITING.
+ * If a resolver fails to resolve the waiting transaction it moves us to
+ * the retry queue.
+ *
+ * This function is inspired by SyncRepWaitForLSN.
+ */
+void
+FdwXactWaitForResolution(TransactionId wait_xid, bool commit)
+{
+	ListCell   *lc;
+	char	   *new_status = NULL;
+	const char *old_status;
+
+	Assert(FdwXactCtl != NULL);
+	Assert(TransactionIdIsValid(wait_xid));
+	Assert(SHMQueueIsDetached(&(MyProc->fdwXactLinks)));
+	Assert(MyProc->fdwXactState == FDWXACT_NOT_WAITING);
+
+	/*
+	 * Quick exit if either atomic commit is not requested or we don't have
+	 * any participants.
+	 */
+	if (!IsForeignTwophaseCommitRequested() || FdwXactParticipants == NIL)
+		return;
+
+	/* Set foreign transaction status */
+	foreach(lc, FdwXactParticipants)
+	{
+		FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lc);
+
+		Assert(fdw_part->fdwxact->locking_backend == MyBackendId);
+		Assert(fdw_part->fdwxact->owner == MyProc);
+
+		SpinLockAcquire(&(fdw_part->fdwxact->mutex));
+		fdw_part->fdwxact->status = commit
+			? FDWXACT_STATUS_COMMITTING
+			: FDWXACT_STATUS_ABORTING;
+		SpinLockRelease(&(fdw_part->fdwxact->mutex));
+	}
+
+	/* Set backend status and enqueue itself to the active queue */
+	LWLockAcquire(FdwXactResolutionLock, LW_EXCLUSIVE);
+	MyProc->fdwXactState = FDWXACT_WAITING;
+	MyProc->fdwXactWaitXid = wait_xid;
+	MyProc->fdwXactNextResolutionTs = GetCurrentTransactionStopTimestamp();
+	FdwXactQueueInsert(MyProc);
+	Assert(FdwXactQueueIsOrderedByTimestamp());
+	LWLockRelease(FdwXactResolutionLock);
+
+	/* Launch a resolver process if not yet, or wake up */
+	FdwXactLaunchOrWakeupResolver();
+
+	/*
+	 * Alter ps display to show waiting for foreign transaction resolution.
+	 */
+	if (update_process_title)
+	{
+		/*
+		 * Room for the suffix: " waiting for resolution " is 24 characters
+		 * and a 32-bit transaction id prints as at most 10 digits, so 64
+		 * bytes comfortably hold the text and its terminating NUL.
+		 */
+		const int	suffix_size = 64;
+		int			len;
+
+		old_status = get_ps_display(&len);
+		new_status = (char *) palloc(len + suffix_size);
+		memcpy(new_status, old_status, len);
+		/* TransactionId is unsigned, hence %u; snprintf bounds the write */
+		snprintf(new_status + len, suffix_size,
+				 " waiting for resolution %u", wait_xid);
+		set_ps_display(new_status);
+		new_status[len] = '\0'; /* truncate off "waiting ..." */
+	}
+
+	/* Wait for all foreign transactions to be resolved */
+	for (;;)
+	{
+		/* Must reset the latch before testing state */
+		ResetLatch(MyLatch);
+
+		/*
+		 * Acquiring the lock is not needed, the latch ensures proper
+		 * barriers. If it looks like we're done, we must really be done,
+		 * because once the resolver changes the state to
+		 * FDWXACT_WAIT_COMPLETE, it will never update it again, so we can't
+		 * be seeing a stale value in that case.
+		 */
+		if (MyProc->fdwXactState == FDWXACT_WAIT_COMPLETE)
+		{
+			ForgetAllFdwXactParticipants();
+			break;
+		}
+
+		/*
+		 * If a wait for foreign transaction resolution is pending, we can
+		 * neither acknowledge the commit nor raise ERROR or FATAL. The
+		 * latter would lead the client to believe that the distributed
+		 * transaction aborted, which is not true: it's already committed
+		 * locally. The former is no good either: the client has requested
+		 * committing a distributed transaction, and is entitled to assume
+		 * that an acknowledged commit is also committed on all foreign
+		 * servers, which might not be true. So in this case we issue a
+		 * WARNING (which some clients may be able to interpret) and shut off
+		 * further output. We do NOT reset ProcDiePending, so that the
+		 * process will die after the commit is cleaned up.
+		 */
+		if (ProcDiePending)
+		{
+			ereport(WARNING,
+					(errcode(ERRCODE_ADMIN_SHUTDOWN),
+					 errmsg("canceling the wait for resolving foreign transaction and terminating connection due to administrator command"),
+					 errdetail("The transaction has already committed locally, but might not have been committed on the foreign server.")));
+			whereToSendOutput = DestNone;
+			FdwXactCancelWait();
+			break;
+		}
+
+		/*
+		 * If a query cancel interrupt arrives we just terminate the wait with
+		 * a suitable warning. The foreign transactions can be orphaned but
+		 * the foreign xact resolver can pick up them and tries to resolve
+		 * them later.
+		 */
+		if (QueryCancelPending)
+		{
+			QueryCancelPending = false;
+			ereport(WARNING,
+					(errmsg("canceling wait for resolving foreign transaction due to user request"),
+					 errdetail("The transaction has already committed locally, but might not have been committed on the foreign server.")));
+			FdwXactCancelWait();
+			break;
+		}
+
+		/*
+		 * If the postmaster dies, we'll probably never get an
+		 * acknowledgement, because all the resolver processes will exit. So
+		 * just bail out.
+		 */
+		if (!PostmasterIsAlive())
+		{
+			ProcDiePending = true;
+			whereToSendOutput = DestNone;
+			FdwXactCancelWait();
+			break;
+		}
+
+		/*
+		 * Wait on latch.  Any condition that should wake us up will set the
+		 * latch, so no need for timeout.
+		 */
+		WaitLatch(MyLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
+				  WAIT_EVENT_FDWXACT_RESOLUTION);
+	}
+
+	pg_read_barrier();
+
+	Assert(SHMQueueIsDetached(&(MyProc->fdwXactLinks)));
+	MyProc->fdwXactState = FDWXACT_NOT_WAITING;
+
+	if (new_status)
+	{
+		/* Restore the ps display; the suffix was truncated off above */
+		set_ps_display(new_status);
+		pfree(new_status);
+	}
+}
+
+/*
+ * Return one backend that connects to my database and is waiting for
+ * resolution.
+ */
+PGPROC *
+FdwXactGetWaiter(TimestampTz now, TimestampTz *nextResolutionTs_p,
+				 TransactionId *waitXid_p)
+{
+	PGPROC	   *cur;
+
+	Assert(LWLockHeldByMe(FdwXactResolutionLock));
+	Assert(FdwXactQueueIsOrderedByTimestamp());
+
+	/* Defaults reported when no waiter of my database is found */
+	*nextResolutionTs_p = -1;
+	*waitXid_p = InvalidTransactionId;
+
+	/* Only the first waiter that belongs to my database is considered */
+	for (cur = (PGPROC *) SHMQueueNext(&(FdwXactRslvCtl->fdwxact_queue),
+									   &(FdwXactRslvCtl->fdwxact_queue),
+									   offsetof(PGPROC, fdwXactLinks));
+		 cur != NULL;
+		 cur = (PGPROC *) SHMQueueNext(&(FdwXactRslvCtl->fdwxact_queue),
+									   &(cur->fdwXactLinks),
+									   offsetof(PGPROC, fdwXactLinks)))
+	{
+		if (cur->databaseId != MyDatabaseId)
+			continue;
+
+		if (cur->fdwXactNextResolutionTs <= now)
+		{
+			/* Its resolution time has come: hand it out */
+			*waitXid_p = cur->fdwXactWaitXid;
+			return cur;
+		}
+
+		/* It is to be processed later: only report when */
+		*nextResolutionTs_p = cur->fdwXactNextResolutionTs;
+		return NULL;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return true if at least one backend connected to the given database is in
+ * the wait queue.  The caller must hold FdwXactResolutionLock.
+ */
+bool
+FdwXactWaiterExists(Oid dbid)
+{
+	PGPROC	   *cur;
+
+	Assert(LWLockHeldByMeInMode(FdwXactResolutionLock, LW_SHARED));
+
+	for (cur = (PGPROC *) SHMQueueNext(&(FdwXactRslvCtl->fdwxact_queue),
+									   &(FdwXactRslvCtl->fdwxact_queue),
+									   offsetof(PGPROC, fdwXactLinks));
+		 cur != NULL;
+		 cur = (PGPROC *) SHMQueueNext(&(FdwXactRslvCtl->fdwxact_queue),
+									   &(cur->fdwXactLinks),
+									   offsetof(PGPROC, fdwXactLinks)))
+	{
+		if (cur->databaseId == dbid)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Insert the waiter to the wait queue in fdwXactNextResolutionTs order.
+ */
+static void
+FdwXactQueueInsert(PGPROC *waiter)
+{
+	SHM_QUEUE  *queue_head = &(FdwXactRslvCtl->fdwxact_queue);
+	SHM_QUEUE  *insert_after = queue_head;
+	PGPROC	   *cur;
+
+	Assert(LWLockHeldByMeInMode(FdwXactResolutionLock, LW_EXCLUSIVE));
+
+	/*
+	 * Walk backwards from the tail until we meet a process whose timestamp is
+	 * strictly earlier than the waiter's; the waiter goes right after it.  If
+	 * there is no such process the waiter goes right after the queue head.
+	 */
+	for (cur = (PGPROC *) SHMQueuePrev(queue_head, queue_head,
+									   offsetof(PGPROC, fdwXactLinks));
+		 cur != NULL;
+		 cur = (PGPROC *) SHMQueuePrev(queue_head, &(cur->fdwXactLinks),
+									   offsetof(PGPROC, fdwXactLinks)))
+	{
+		if (cur->fdwXactNextResolutionTs < waiter->fdwXactNextResolutionTs)
+		{
+			insert_after = &(cur->fdwXactLinks);
+			break;
+		}
+	}
+
+	SHMQueueInsertAfter(insert_after, &(waiter->fdwXactLinks));
+}
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * Assertion helper: check that the wait queue is sorted by
+ * fdwXactNextResolutionTs in non-decreasing order.
+ */
+static bool
+FdwXactQueueIsOrderedByTimestamp(void)
+{
+	TimestampTz prevTs = 0;
+	PGPROC	   *cur;
+
+	for (cur = (PGPROC *) SHMQueueNext(&(FdwXactRslvCtl->fdwxact_queue),
+									   &(FdwXactRslvCtl->fdwxact_queue),
+									   offsetof(PGPROC, fdwXactLinks));
+		 cur != NULL;
+		 cur = (PGPROC *) SHMQueueNext(&(FdwXactRslvCtl->fdwxact_queue),
+									   &(cur->fdwXactLinks),
+									   offsetof(PGPROC, fdwXactLinks)))
+	{
+		if (cur->fdwXactNextResolutionTs < prevTs)
+			return false;
+
+		prevTs = cur->fdwXactNextResolutionTs;
+	}
+
+	return true;
+}
+#endif
+
+/*
+ * Take FdwXactResolutionLock, leave the wait queue if we are still linked
+ * into it, and go back to the not-waiting state.
+ */
+static void
+FdwXactCancelWait(void)
+{
+	bool		still_queued;
+
+	LWLockAcquire(FdwXactResolutionLock, LW_EXCLUSIVE);
+	still_queued = !SHMQueueIsDetached(&(MyProc->fdwXactLinks));
+	if (still_queued)
+		SHMQueueDelete(&(MyProc->fdwXactLinks));
+	MyProc->fdwXactState = FDWXACT_NOT_WAITING;
+	LWLockRelease(FdwXactResolutionLock);
+}
+
+/*
+ * In abort case, this function ends foreign transaction participants and possibly
+ * rollback their prepared foreign transactions.
+ */
+extern void
+AtEOXact_FdwXact(bool is_commit)
+{
+	ListCell   *cell;
+
+	/* On commit there is nothing to end here; just forget the participants */
+	if (is_commit)
+	{
+		ForgetAllFdwXactParticipants();
+		return;
+	}
+
+	foreach(cell, FdwXactParticipants)
+	{
+		FdwXactParticipant *participant = lfirst(cell);
+		FdwXact		entry = participant->fdwxact;
+		int			prev_status;
+
+		if (entry == NULL)
+		{
+			/*
+			 * No FdwXact entry was registered for this participant.  Roll
+			 * the foreign transaction back if its server supports the
+			 * transaction callbacks, then drop the participant from the
+			 * list either way.
+			 */
+			if (ServerSupportTransactionCallack(participant))
+				FdwXactParticipantEndTransaction(participant, false);
+
+			FdwXactParticipants = foreach_delete_current(FdwXactParticipants, cell);
+			continue;
+		}
+
+		/*
+		 * The participant has an FdwXact entry: mark it as aborting,
+		 * remembering what its status was before.
+		 */
+		SpinLockAcquire(&(entry->mutex));
+		prev_status = entry->status;
+		entry->status = FDWXACT_STATUS_ABORTING;
+		SpinLockRelease(&(entry->mutex));
+
+		/*
+		 * If the entry was still being prepared we close the transaction in
+		 * one-phase here.  Since we are not sure whether the preparation has
+		 * completed on the foreign server, the entry is kept so that the
+		 * prepared foreign transaction is rolled back as well; it is the
+		 * FDW's responsibility to tolerate an OBJECT_NOT_FOUND error in that
+		 * case.
+		 */
+		if (prev_status == FDWXACT_STATUS_PREPARING)
+			FdwXactParticipantEndTransaction(participant, false);
+	}
+
+	/*
+	 * Whatever is left in the list is prepared or possibly prepared; wait
+	 * for those foreign transactions to be resolved.
+	 */
+	if (FdwXactParticipants != NIL)
+	{
+		Assert(TransactionIdIsValid(FdwXactLocalXid));
+		FdwXactWaitForResolution(FdwXactLocalXid, false);
+	}
+
+	ForgetAllFdwXactParticipants();
+}
+
+/*
+ * Unlock foreign transaction participants and clear the FdwXactParticipants
+ * list.  If we left foreign transaction, update the oldest xmin of unresolved
+ * transaction so that local transaction id of such unresolved foreign transaction
+ * is not truncated.
+ */
+void
+ForgetAllFdwXactParticipants(void)
+{
+	ListCell   *cell;
+	int			nlefts = 0;
+
+	if (FdwXactParticipants == NIL)
+	{
+		Assert(FdwXactParticipants_tmp == NIL);
+		Assert(!ForeignTwophaseCommitIsRequired);
+		return;
+	}
+
+	foreach(cell, FdwXactParticipants)
+	{
+		FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(cell);
+		FdwXact		fdwxact = fdw_part->fdwxact;
+
+		/*
+		 * Nothing to do if didn't register FdwXact entry yet.  This function
+		 * also runs as a process-exit callback, where a participant may not
+		 * have been prepared, so skip such participants instead of
+		 * dereferencing a NULL entry.
+		 */
+		if (fdwxact == NULL)
+			continue;
+
+		/*
+		 * Unlock the foreign transaction entries.  Note that there is a race
+		 * condition; the FdwXact entries in FdwXactParticipants could be used
+		 * by other backend before we forget in case where the resolver process
+		 * removes the FdwXact entry and other backend reuses it before we
+		 * forget.  So we need to check if the entries are still associated with
+		 * the transaction.  We cannot use locking_backend to check because the
+		 * entry might be already held by the resolver process.
+		 */
+		LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+		if (fdwxact->valid && fdwxact->local_xid == FdwXactLocalXid)
+		{
+			if (fdwxact->locking_backend == MyBackendId)
+				fdwxact->locking_backend = InvalidBackendId;
+
+			fdwxact->owner = NULL;
+			nlefts++;
+		}
+		LWLockRelease(FdwXactLock);
+	}
+
+	/*
+	 * If we left any FdwXact entries, update the oldest local transaction of
+	 * unresolved distributed transaction and take over them to the foreign
+	 * transaction resolver.
+	 */
+	if (nlefts > 0)
+	{
+		/* nlefts is a signed int, hence %d */
+		elog(DEBUG1, "left %d foreign transactions", nlefts);
+		FdwXactComputeRequiredXmin();
+		FdwXactLaunchOrWakeupResolver();
+	}
+
+	/* Reset all per-transaction state kept by this module */
+	list_free(FdwXactParticipants);
+	FdwXactParticipants = NIL;
+	FdwXactParticipants_tmp = NIL;
+	FdwXactLocalXid = InvalidTransactionId;
+	ForeignTwophaseCommitIsRequired = false;
+}
+
+/*
+ * Resolve foreign transactions at the given indexes.
If 'waiter' is not NULL,
+ * we release the waiter after we resolved all of the given foreign transactions.
+ * On failure we re-enqueue the waiting backend after incrementing its next
+ * resolution time by foreign_xact_resolution_retry_interval, and re-throw the
+ * error.
+ *
+ * The caller must hold the given foreign transactions in advance to prevent
+ * concurrent update.
+ */
+void
+FdwXactResolveFdwXacts(int *fdwxact_idxs, int nfdwxacts, PGPROC *waiter)
+{
+	FdwXact    *targets = NULL;
+
+	/*
+	 * Translate the indexes into entry pointers once, up front.
+	 * remove_fdwxact() compacts the active array by moving its last entry
+	 * into the freed slot, and FdwXactLock is released between iterations
+	 * below, so an index taken before the first removal may no longer refer
+	 * to the same entry afterwards.  The entries themselves stay put.
+	 */
+	if (nfdwxacts > 0)
+	{
+		targets = (FdwXact *) palloc(sizeof(FdwXact) * nfdwxacts);
+
+		LWLockAcquire(FdwXactLock, LW_SHARED);
+		for (int i = 0; i < nfdwxacts; i++)
+			targets[i] = FdwXactCtl->fdwxacts[fdwxact_idxs[i]];
+		LWLockRelease(FdwXactLock);
+	}
+
+	for (int i = 0; i < nfdwxacts; i++)
+	{
+		FdwXact		fdwxact = targets[i];
+
+		CHECK_FOR_INTERRUPTS();
+
+		PG_TRY();
+		{
+			FdwXactResolveOneFdwXact(fdwxact);
+		}
+		PG_CATCH();
+		{
+			/*
+			 * Failed to resolve.  If the waiter is still waiting, take it out
+			 * of the queue, push its next resolution time back by the retry
+			 * interval and put it back in timestamp order.
+			 */
+			if (waiter)
+			{
+				LWLockAcquire(FdwXactResolutionLock, LW_EXCLUSIVE);
+				if (waiter->fdwXactState == FDWXACT_WAITING)
+				{
+					SHMQueueDelete(&(waiter->fdwXactLinks));
+					pg_write_barrier();
+					waiter->fdwXactNextResolutionTs =
+						TimestampTzPlusMilliseconds(waiter->fdwXactNextResolutionTs,
+													foreign_xact_resolution_retry_interval);
+					FdwXactQueueInsert(waiter);
+				}
+				LWLockRelease(FdwXactResolutionLock);
+			}
+
+			PG_RE_THROW();
+		}
+		PG_END_TRY();
+
+		/* Resolved: drop the on-disk file, if any, and the shared entry */
+		LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+		if (fdwxact->ondisk)
+			RemoveFdwXactFile(fdwxact->dbid, fdwxact->local_xid, fdwxact->serverid,
+							  fdwxact->userid, true);
+		remove_fdwxact(fdwxact);
+		LWLockRelease(FdwXactLock);
+	}
+
+	if (targets)
+		pfree(targets);
+
+	if (!waiter)
+		return;
+
+	/*
+	 * Remove waiter from shmem queue, if not detached yet. The waiter could
+	 * already be detached if user cancelled to wait before resolution.
+	 */
+	LWLockAcquire(FdwXactResolutionLock, LW_EXCLUSIVE);
+	if (!SHMQueueIsDetached(&(waiter->fdwXactLinks)))
+	{
+		TransactionId wait_xid = waiter->fdwXactWaitXid;
+
+		SHMQueueDelete(&(waiter->fdwXactLinks));
+		pg_write_barrier();
+
+		/* Set state to complete */
+		waiter->fdwXactState = FDWXACT_WAIT_COMPLETE;
+
+		/*
+		 * Wake up the waiter only when we have set state and removed from
+		 * queue
+		 */
+		SetLatch(&(waiter->procLatch));
+
+		elog(DEBUG2, "released the proc with xid %u", wait_xid);
+	}
+	else
+		elog(DEBUG2, "the waiter backend had been already detached");
+
+	LWLockRelease(FdwXactResolutionLock);
+}
+
+/*
+ * Return true if there is at least one prepared foreign transaction
+ * which matches given arguments.  An invalid Oid argument matches any value.
+ */
+bool
+FdwXactExists(Oid dbid, Oid serverid, Oid userid)
+{
+	int			idx;
+
+	LWLockAcquire(FdwXactLock, LW_SHARED);
+	idx = get_fdwxact(dbid, InvalidTransactionId, serverid, userid);
+	LWLockRelease(FdwXactLock);
+
+	return (idx != -1);
+}
+
+/*
+ * Compute the oldest xmin across all unresolved foreign transactions
+ * and store it in the ProcArray.  InvalidTransactionId is stored when there
+ * is no valid entry.
+ *
+ * XXX: we can exclude FdwXact entries whose status is already committing
+ * or aborting.
+ */
+static void
+FdwXactComputeRequiredXmin(void)
+{
+	TransactionId agg_xmin = InvalidTransactionId;
+
+	Assert(FdwXactCtl != NULL);
+
+	LWLockAcquire(FdwXactLock, LW_SHARED);
+
+	for (int i = 0; i < FdwXactCtl->num_fdwxacts; i++)
+	{
+		FdwXact		fdwxact = FdwXactCtl->fdwxacts[i];
+
+		if (!fdwxact->valid)
+			continue;
+
+		Assert(TransactionIdIsValid(fdwxact->local_xid));
+
+		if (!TransactionIdIsValid(agg_xmin) ||
+			TransactionIdPrecedes(fdwxact->local_xid, agg_xmin))
+			agg_xmin = fdwxact->local_xid;
+	}
+
+	LWLockRelease(FdwXactLock);
+
+	ProcArraySetFdwXactUnresolvedXmin(agg_xmin);
+}
+
+
+/*
+ * Determine whether the foreign transaction associated with the given transaction
+ * id should be committed or rolled back according to the result of the local
+ * transaction.
+ */
+static FdwXactStatus
+FdwXactDetermineTransactionFate(TransactionId xid)
+{
+	/*
+	 * If the local transaction is already committed, commit prepared foreign
+	 * transaction.
+	 */
+	if (TransactionIdDidCommit(xid))
+		return FDWXACT_STATUS_COMMITTING;
+
+	/*
+	 * If the local transaction is already aborted, abort prepared foreign
+	 * transactions.
+	 */
+	else if (TransactionIdDidAbort(xid))
+		return FDWXACT_STATUS_ABORTING;
+
+
+	/*
+	 * The local transaction is neither committed nor aborted, and it is not
+	 * in progress either.  This can happen when transaction failed after
+	 * registered this entry but before actual preparing on the foreign
+	 * server.  So let's assume it aborted.
+	 */
+	else if (!TransactionIdIsInProgress(xid))
+		return FDWXACT_STATUS_ABORTING;
+
+	/*
+	 * The local transaction is in progress and foreign transaction is about
+	 * to be committed or aborted.  Raise an error anyway since we cannot
+	 * determine the fate of this foreign transaction according to the local
+	 * transaction whose fate is also not determined.
+	 */
+	else
+		elog(ERROR,
+			 "cannot resolve the foreign transaction associated with in-process transaction");
+
+	pg_unreachable();
+}
+
+/*
+ * Commit or rollback one prepared foreign transaction by calling the FDW's
+ * CommitForeignTransaction or RollbackForeignTransaction routine.  If the
+ * entry's status is not yet committing or aborting, it is first decided from
+ * the fate of the local transaction and stored in the entry.
+ *
+ * This function does not remove the FdwXact entry or its on-disk file; in
+ * this file that is done by FdwXactResolveFdwXacts() after this function
+ * returns successfully.
+ */
+static void
+FdwXactResolveOneFdwXact(FdwXact fdwxact)
+{
+	FdwXactRslvState state;
+	ForeignServer *server;
+	ForeignDataWrapper *fdw;
+	FdwRoutine *routine;
+
+	Assert(fdwxact != NULL);
+	/*
+	 * The FdwXact entry must be held by the process calling this function,
+	 * whether that is a backend or a resolver process.
+	 */
+	Assert(fdwxact->locking_backend == MyBackendId);
+
+	if (fdwxact->status != FDWXACT_STATUS_COMMITTING &&
+		fdwxact->status != FDWXACT_STATUS_ABORTING)
+	{
+		FdwXactStatus new_status;
+
+		new_status = FdwXactDetermineTransactionFate(fdwxact->local_xid);
+		Assert(new_status == FDWXACT_STATUS_COMMITTING ||
+			   new_status == FDWXACT_STATUS_ABORTING);
+
+		/* Update the status */
+		SpinLockAcquire(&fdwxact->mutex);
+		fdwxact->status = new_status;
+		SpinLockRelease(&fdwxact->mutex);
+	}
+
+	server = GetForeignServer(fdwxact->serverid);
+	fdw = GetForeignDataWrapper(server->fdwid);
+	routine = GetFdwRoutine(fdw->fdwhandler);
+
+	/*
+	 * Prepare resolution state to pass to API.  The prepared transaction
+	 * identifier is passed and flags is 0, i.e. FDWXACT_FLAG_ONEPHASE is not
+	 * set.
+	 */
+	state.xid = fdwxact->local_xid;
+	state.server = server;
+	state.usermapping = GetUserMapping(fdwxact->userid, fdwxact->serverid);
+	state.fdwxact_id = fdwxact->fdwxact_id;
+	state.flags = 0;
+
+	if (fdwxact->status == FDWXACT_STATUS_COMMITTING)
+	{
+		routine->CommitForeignTransaction(&state);
+		elog(DEBUG1, "successfully committed the prepared foreign transaction for server %u user %u",
+			 fdwxact->serverid, fdwxact->userid);
+	}
+	else
+	{
+		routine->RollbackForeignTransaction(&state);
+		elog(DEBUG1, "successfully rolled back the prepared foreign transaction for server %u user %u",
+			 fdwxact->serverid, fdwxact->userid);
+	}
+}
+
+/*
+ * Return the index of first found FdwXact entry that matched to given arguments.
+ * Otherwise return -1. The search condition is defined by arguments with valid
+ * values for respective datatypes.
+ */
+static int
+get_fdwxact(Oid dbid, TransactionId xid, Oid serverid, Oid userid)
+{
+	Assert(LWLockHeldByMe(FdwXactLock));
+
+	for (int i = 0; i < FdwXactCtl->num_fdwxacts; i++)
+	{
+		FdwXact		fdwxact = FdwXactCtl->fdwxacts[i];
+
+		if (!fdwxact->valid)
+			continue;
+
+		/* Each argument filters only when it carries a valid value */
+
+		/* dbid */
+		if (OidIsValid(dbid) && fdwxact->dbid != dbid)
+			continue;
+
+		/* xid */
+		if (TransactionIdIsValid(xid) && xid != fdwxact->local_xid)
+			continue;
+
+		/* serverid */
+		if (OidIsValid(serverid) && serverid != fdwxact->serverid)
+			continue;
+
+		/* userid */
+		if (OidIsValid(userid) && fdwxact->userid != userid)
+			continue;
+
+		/* This entry matches the condition */
+		return i;
+	}
+
+	/* no valid entry matched */
+	return -1;
+}
+
+/*
+ * Apply the redo log for a foreign transaction.
+ *
+ * XLOG_FDWXACT_INSERT records add an entry to the shared FdwXact table,
+ * XLOG_FDWXACT_REMOVE records delete the entry (and its state file, if any).
+ * Any other record type raises an ERROR.
+ */
+void
+fdwxact_redo(XLogReaderState *record)
+{
+	char	   *rec = XLogRecGetData(record);
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	if (info == XLOG_FDWXACT_INSERT)
+	{
+		/*
+		 * Add fdwxact entry and set start/end lsn of the WAL record in
+		 * FdwXact entry.
+		 */
+		LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+		FdwXactRedoAdd(rec,
+					   record->ReadRecPtr,
+					   record->EndRecPtr);
+		LWLockRelease(FdwXactLock);
+	}
+	else if (info == XLOG_FDWXACT_REMOVE)
+	{
+		/* named xlrec so it does not shadow the 'record' parameter */
+		xl_fdwxact_remove *xlrec = (xl_fdwxact_remove *) rec;
+
+		/* Delete FdwXact entry and file if exists */
+		LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+		FdwXactRedoRemove(xlrec->dbid, xlrec->xid, xlrec->serverid,
+						  xlrec->userid, false);
+		LWLockRelease(FdwXactLock);
+	}
+	else
+		elog(ERROR, "invalid log type %d in foreign transaction log record", info);
+}
+
+
+/*
+ * Store pointer to the start/end of the WAL record along with the xid in
+ * a fdwxact entry in shared memory FdwXactData structure.
+ *
+ * 'buf' points to a FdwXactOnDiskData image.  'start_lsn' and 'end_lsn'
+ * delimit the WAL record it came from; callers that load the data from a
+ * state file pass InvalidXLogRecPtr for both (see restoreFdwXactData()).
+ * The caller must hold FdwXactLock exclusively, and recovery must be in
+ * progress.
+ */
+static void
+FdwXactRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn)
+{
+	FdwXactOnDiskData *fdwxact_data = (FdwXactOnDiskData *) buf;
+	FdwXact		fdwxact;
+
+	Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE));
+	Assert(RecoveryInProgress());
+
+	/*
+	 * Add this entry into the table of foreign transactions, using the
+	 * identifiers recorded in the on-disk/WAL image.
+	 */
+	fdwxact = insert_fdwxact(fdwxact_data->dbid, fdwxact_data->local_xid,
+							 fdwxact_data->serverid, fdwxact_data->userid,
+							 fdwxact_data->umid, fdwxact_data->fdwxact_id);
+
+	elog(DEBUG2, "added fdwxact entry in shared memory for foreign transaction, db %u xid %u server %u user %u id %s",
+		 fdwxact_data->dbid, fdwxact_data->local_xid,
+		 fdwxact_data->serverid, fdwxact_data->userid,
+		 fdwxact_data->fdwxact_id);
+
+	/*
+	 * Set status as PREPARED, since we do not know the xact status right now.
+	 * It is decided later based on the status of the local transaction that
+	 * prepared this fdwxact entry (see FdwXactResolveOneFdwXact()).
+	 */
+	fdwxact->status = FDWXACT_STATUS_PREPARED;
+	fdwxact->insert_start_lsn = start_lsn;
+	fdwxact->insert_end_lsn = end_lsn;
+	fdwxact->inredo = true;		/* added in redo */
+	/* stays invalid until RecoverFdwXacts() marks it valid */
+	fdwxact->valid = false;
+	/* no WAL location given means the data was read from a state file */
+	fdwxact->ondisk = XLogRecPtrIsInvalid(start_lsn);
+}
+
+/*
+ * Remove the corresponding fdwxact entry from FdwXactCtl. Also remove
+ * FdwXact file if a foreign transaction was saved via an earlier checkpoint.
+ * We might not find the FdwXact entry when crash recovery starts from a
+ * point after the entry was added but before it was removed.
+ *
+ * 'givewarning' is passed through to RemoveFdwXactFile() and controls
+ * whether a missing state file is reported.  The caller must hold
+ * FdwXactLock exclusively, and recovery must be in progress.
+ */
+void
+FdwXactRedoRemove(Oid dbid, TransactionId xid, Oid serverid,
+				  Oid userid, bool givewarning)
+{
+	FdwXact		fdwxact = NULL;
+	int			i;
+
+	Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE));
+	Assert(RecoveryInProgress());
+
+	for (i = 0; i < FdwXactCtl->num_fdwxacts; i++)
+	{
+		fdwxact = FdwXactCtl->fdwxacts[i];
+
+		if (fdwxact->dbid == dbid && fdwxact->local_xid == xid &&
+			fdwxact->serverid == serverid && fdwxact->userid == userid)
+			break;
+	}
+
+	/* Nothing to do if no entry matched */
+	if (i >= FdwXactCtl->num_fdwxacts)
+		return;
+
+	/*
+	 * Log while the entry is still ours.  After remove_fdwxact() the slot
+	 * has been released and must not be dereferenced any more.
+	 */
+	elog(DEBUG2, "removing fdwxact entry from shared memory for foreign transaction, db %u xid %u server %u user %u id %s",
+		 fdwxact->dbid, fdwxact->local_xid, fdwxact->serverid,
+		 fdwxact->userid, fdwxact->fdwxact_id);
+
+	/* Clean up entry and any files we may have left */
+	if (fdwxact->ondisk)
+		RemoveFdwXactFile(fdwxact->dbid, fdwxact->local_xid,
+						  fdwxact->serverid, fdwxact->userid,
+						  givewarning);
+	remove_fdwxact(fdwxact);
+}
+
+/*
+ * We must fsync the foreign transaction state file that is valid or generated
+ * during redo and has an insert LSN <= the checkpoint's redo horizon.
+ * The foreign transaction entries and hence the corresponding files are expected
+ * to be very short-lived. By executing this function at the end, we might have
+ * fewer files to fsync, thus reducing some I/O. This is similar to
+ * CheckPointTwoPhase().
+ *
+ * This is deliberately run as late as possible in the checkpoint sequence,
+ * because FdwXacts ordinarily have short lifespans, and so it is quite
+ * possible that FdwXacts that were valid at checkpoint start will no longer
+ * exist if we wait a little bit. With typical checkpoint settings this
+ * will be about 3 minutes for an online checkpoint, so as a result we
+ * expect that there will be no FdwXacts that need to be copied to disk.
+ *
+ * If a FdwXact remains valid across multiple checkpoints, it will already
+ * be on disk so we don't bother to repeat that write.
+ */
+void
+CheckPointFdwXacts(XLogRecPtr redo_horizon)
+{
+	int			cnt;
+	int			serialized_fdwxacts = 0;
+
+	if (max_prepared_foreign_xacts <= 0)
+		return;					/* nothing to do */
+
+	TRACE_POSTGRESQL_FDWXACT_CHECKPOINT_START();
+
+	/*
+	 * We are expecting there to be zero FdwXact that need to be copied to
+	 * disk, so we perform all I/O while holding FdwXactLock for simplicity.
+	 * This prevents any new foreign xacts from preparing while this occurs,
+	 * which shouldn't be a problem since the presence of long-lived prepared
+	 * foreign xacts indicates the transaction manager isn't active.
+	 *
+	 * It's also possible to move I/O out of the lock, but on every error we
+	 * should check whether somebody committed our transaction in different
+	 * backend. Let's leave this optimisation for future, if somebody will
+	 * spot that this place cause bottleneck.
+	 *
+	 * Note that it isn't possible for there to be a FdwXact with a
+	 * insert_end_lsn set prior to the last checkpoint yet is marked invalid,
+	 * because of the efforts with delayChkpt.
+	 */
+	LWLockAcquire(FdwXactLock, LW_SHARED);
+	for (cnt = 0; cnt < FdwXactCtl->num_fdwxacts; cnt++)
+	{
+		FdwXact		fdwxact = FdwXactCtl->fdwxacts[cnt];
+
+		if ((fdwxact->valid || fdwxact->inredo) &&
+			!fdwxact->ondisk &&
+			fdwxact->insert_end_lsn <= redo_horizon)
+		{
+			char	   *buf;
+			int			len;
+
+			/* Copy the state out of WAL into its own file */
+			XlogReadFdwXactData(fdwxact->insert_start_lsn, &buf, &len);
+			RecreateFdwXactFile(fdwxact->dbid, fdwxact->local_xid,
+								fdwxact->serverid, fdwxact->userid,
+								buf, len);
+			fdwxact->ondisk = true;
+			fdwxact->insert_start_lsn = InvalidXLogRecPtr;
+			fdwxact->insert_end_lsn = InvalidXLogRecPtr;
+			pfree(buf);
+			serialized_fdwxacts++;
+		}
+	}
+
+	LWLockRelease(FdwXactLock);
+
+	/*
+	 * Flush unconditionally the parent directory to make any information
+	 * durable on disk.  FdwXact files could have been removed and those
+	 * removals need to be made persistent as well as any files newly created.
+	 */
+	fsync_fname(FDWXACTS_DIR, true);
+
+	TRACE_POSTGRESQL_FDWXACT_CHECKPOINT_DONE();
+
+	if (log_checkpoints && serialized_fdwxacts > 0)
+		ereport(LOG,
+				(errmsg_plural("%u foreign transaction state file was written "
+							   "for long-running prepared transactions",
+							   "%u foreign transaction state files were written "
+							   "for long-running prepared transactions",
+							   serialized_fdwxacts,
+							   serialized_fdwxacts)));
+}
+
+/*
+ * Reads foreign transaction data from xlog. During checkpoint this data will
+ * be moved to fdwxact files and ReadFdwXactFile should be used instead.
+ *
+ * On success *buf is set to a palloc'd copy of the record's main data, and
+ * *len (if len is not NULL) to its length.  An ERROR is raised when the
+ * record cannot be read or is not an XLOG_FDWXACT_INSERT record.
+ *
+ * Note clearly that this function accesses WAL during normal operation, similarly
+ * to the way WALSender or Logical Decoding would do. It does not run during
+ * crash recovery or standby processing.
+ */
+static void
+XlogReadFdwXactData(XLogRecPtr lsn, char **buf, int *len)
+{
+	XLogRecord *record;
+	XLogReaderState *xlogreader;
+	char	   *errormsg;
+
+	xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
+									XL_ROUTINE(.page_read = &read_local_xlog_page,
+											   .segment_open = &wal_segment_open,
+											   .segment_close = &wal_segment_close),
+									NULL);
+	if (!xlogreader)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory"),
+				 errdetail("Failed while allocating an XLog reading processor.")));
+
+	XLogBeginRead(xlogreader, lsn);
+	record = XLogReadRecord(xlogreader, &errormsg);
+	if (record == NULL)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read foreign transaction state from xlog at %X/%X",
+						(uint32) (lsn >> 32),
+						(uint32) lsn)));
+
+	/* The record at 'lsn' must be the insertion of a foreign transaction */
+	if (XLogRecGetRmid(xlogreader) != RM_FDWXACT_ID ||
+		(XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK) != XLOG_FDWXACT_INSERT)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("expected foreign transaction state data is not present in xlog at %X/%X",
+						(uint32) (lsn >> 32),
+						(uint32) lsn)));
+
+	if (len != NULL)
+		*len = XLogRecGetDataLen(xlogreader);
+
+	/* Copy the data out before the reader (and its buffers) is freed */
+	*buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader));
+	memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader));
+
+	XLogReaderFree(xlogreader);
+}
+
+/*
+ * Recreates a foreign transaction state file. This is used in WAL replay
+ * and during checkpoint creation.
+ *
+ * The file receives 'len' bytes of 'content' followed by a CRC-32C computed
+ * over that content, and is fsync'd before being closed.  Any failure is
+ * reported with ERROR.
+ *
+ * Note: content and len don't include CRC.
+ */
+void
+RecreateFdwXactFile(Oid dbid, TransactionId xid, Oid serverid,
+					Oid userid, void *content, int len)
+{
+	char		path[MAXPGPATH];
+	pg_crc32c	statefile_crc;
+	int			fd;
+
+	/* Recompute CRC */
+	INIT_CRC32C(statefile_crc);
+	COMP_CRC32C(statefile_crc, content, len);
+	FIN_CRC32C(statefile_crc);
+
+	FdwXactFilePath(path, dbid, xid, serverid, userid);
+
+	fd = OpenTransientFile(path, O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY);
+
+	if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not recreate foreign transaction state file \"%s\": %m",
+						path)));
+
+	/* Write content and CRC */
+	pgstat_report_wait_start(WAIT_EVENT_FDWXACT_FILE_WRITE);
+
+	/*
+	 * Clear errno before each write().  A short write does not set errno,
+	 * and the ENOSPC fallback below only works if no stale value from an
+	 * earlier call is left behind.
+	 */
+	errno = 0;
+	if (write(fd, content, len) != len)
+	{
+		/* if write didn't set errno, assume problem is no disk space */
+		if (errno == 0)
+			errno = ENOSPC;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write foreign transaction state file: %m")));
+	}
+	errno = 0;
+	if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c))
+	{
+		/* if write didn't set errno, assume problem is no disk space */
+		if (errno == 0)
+			errno = ENOSPC;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write foreign transaction state file: %m")));
+	}
+	pgstat_report_wait_end();
+
+	/*
+	 * We must fsync the file because the end-of-replay checkpoint will not do
+	 * so, there being no FDWXACT in shared memory yet to tell it to.
+	 */
+	pgstat_report_wait_start(WAIT_EVENT_FDWXACT_FILE_SYNC);
+	if (pg_fsync(fd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync foreign transaction state file: %m")));
+	pgstat_report_wait_end();
+
+	if (CloseTransientFile(fd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close foreign transaction file: %m")));
+}
+
+/*
+ * Given a transaction id, userid and serverid read it either from disk
+ * or read it directly via shmem xlog record pointer using the provided
+ * "insert_start_lsn".
+ *
+ * Returns the palloc'd state data, or NULL if the xid is not older than the
+ * next xid to be assigned.  In that case the state file (fromdisk) or the
+ * shared-memory entry (!fromdisk) is removed with a WARNING.  The caller
+ * must hold FdwXactLock exclusively.
+ */
+static char *
+ProcessFdwXactBuffer(Oid dbid, TransactionId xid, Oid serverid,
+					 Oid userid, XLogRecPtr insert_start_lsn, bool fromdisk)
+{
+	TransactionId origNextXid =
+	XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
+	char	   *buf;
+
+	Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE));
+
+	if (!fromdisk)
+		Assert(!XLogRecPtrIsInvalid(insert_start_lsn));
+
+	/* Reject XID if too new */
+	if (TransactionIdFollowsOrEquals(xid, origNextXid))
+	{
+		if (fromdisk)
+		{
+			ereport(WARNING,
+					(errmsg("removing future fdwxact state file for xid %u, server %u and user %u",
+							xid, serverid, userid)));
+			RemoveFdwXactFile(dbid, xid, serverid, userid, true);
+		}
+		else
+		{
+			ereport(WARNING,
+					(errmsg("removing future fdwxact state from memory for xid %u, server %u and user %u",
+							xid, serverid, userid)));
+			FdwXactRedoRemove(dbid, xid, serverid, userid, true);
+		}
+		return NULL;
+	}
+
+	if (fromdisk)
+	{
+		/* Read and validate file */
+		buf = ReadFdwXactFile(dbid, xid, serverid, userid);
+	}
+	else
+	{
+		/* Read xlog data */
+		XlogReadFdwXactData(insert_start_lsn, &buf, NULL);
+	}
+
+	return buf;
+}
+
+/*
+ * Read and validate the foreign transaction state file.
+ *
+ * If it looks OK (has a plausible size, a valid CRC and the expected
+ * identifiers), return the palloc'd contents of the file, issuing an error
+ * when finding corrupted data.
+ * This state can be reached when doing recovery.
+ */
+static char *
+ReadFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, Oid userid)
+{
+	char		path[MAXPGPATH];
+	int			fd;
+	FdwXactOnDiskData *fdwxact_file_data;
+	struct stat stat;
+	uint32		crc_offset;
+	pg_crc32c	calc_crc;
+	pg_crc32c	file_crc;
+	char	   *buf;
+	int			r;
+
+	FdwXactFilePath(path, dbid, xid, serverid, userid);
+
+	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+	if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open FDW transaction state file \"%s\": %m",
+						path)));
+
+	/*
+	 * Check file length.  We can determine a lower bound pretty easily. We
+	 * set an upper bound to avoid palloc() failure on a corrupt file, though
+	 * we can't guarantee that we won't get an out of memory error anyway,
+	 * even on a valid file.
+	 */
+	if (fstat(fd, &stat))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not stat FDW transaction state file \"%s\": %m",
+						path)));
+
+	/*
+	 * A file outside the bounds is corrupt.  No system call failed here, so
+	 * errno carries no useful information and must not be reported.
+	 */
+	if (stat.st_size < (offsetof(FdwXactOnDiskData, fdwxact_id) +
+						sizeof(pg_crc32c)) ||
+		stat.st_size > MaxAllocSize)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("FDW transaction state file \"%s\" has incorrect size %zu",
+						path, (Size) stat.st_size)));
+
+	/* The CRC is stored in the trailing bytes of the file */
+	crc_offset = stat.st_size - sizeof(pg_crc32c);
+	if (crc_offset != MAXALIGN(crc_offset))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("incorrect alignment of CRC offset for file \"%s\"",
+						path)));
+
+	/*
+	 * Ok, slurp in the file.
+	 */
+	buf = (char *) palloc(stat.st_size);
+
+	pgstat_report_wait_start(WAIT_EVENT_FDWXACT_FILE_READ);
+	r = read(fd, buf, stat.st_size);
+	if (r != stat.st_size)
+	{
+		if (r < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not read file \"%s\": %m", path)));
+		else
+			ereport(ERROR,
+					(errmsg("could not read file \"%s\": read %d of %zu",
+							path, r, (Size) stat.st_size)));
+	}
+	pgstat_report_wait_end();
+
+	if (CloseTransientFile(fd))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", path)));
+
+	/*
+	 * Check the CRC.
+	 */
+	INIT_CRC32C(calc_crc);
+	COMP_CRC32C(calc_crc, buf, crc_offset);
+	FIN_CRC32C(calc_crc);
+
+	file_crc = *((pg_crc32c *) (buf + crc_offset));
+
+	if (!EQ_CRC32C(calc_crc, file_crc))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("calculated CRC checksum does not match value stored in file \"%s\"",
+						path)));
+
+	/* Check that the contents identify the transaction we asked for */
+	fdwxact_file_data = (FdwXactOnDiskData *) buf;
+	if (fdwxact_file_data->dbid != dbid ||
+		fdwxact_file_data->serverid != serverid ||
+		fdwxact_file_data->userid != userid ||
+		fdwxact_file_data->local_xid != xid)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid foreign transaction state file \"%s\"",
+						path)));
+
+	return buf;
+}
+
+/*
+ * Scan the shared memory entries of FdwXact and determine the range of valid
+ * XIDs present.  This is run during database startup, after we have completed
+ * reading WAL.  ShmemVariableCache->nextFullXid has been set to one more than
+ * the highest XID for which evidence exists in WAL.
+ *
+ * On corrupted two-phase files, fail immediately.  Keeping around broken
+ * entries and let replay continue causes harm on the system, and a new
+ * backup should be rolled in.
+ *
+ * Our other responsibility is to update and return the oldest valid XID
+ * among the distributed transactions.  This is needed to synchronize pg_subtrans
+ * startup properly.
+ *
+ * The result starts out as the next XID to be assigned, so that value is
+ * returned when no entry has an older local XID.
+ */
+TransactionId
+PrescanFdwXacts(TransactionId oldestActiveXid)
+{
+	FullTransactionId nextFullXid = ShmemVariableCache->nextFullXid;
+	TransactionId origNextXid = XidFromFullTransactionId(nextFullXid);
+	TransactionId result = origNextXid;
+
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+	for (int i = 0; i < FdwXactCtl->num_fdwxacts; i++)
+	{
+		FdwXact		fdwxact = FdwXactCtl->fdwxacts[i];
+		char	   *buf;
+
+		/* Validate the entry's state data; NULL means it was discarded */
+		buf = ProcessFdwXactBuffer(fdwxact->dbid, fdwxact->local_xid,
+								   fdwxact->serverid, fdwxact->userid,
+								   fdwxact->insert_start_lsn, fdwxact->ondisk);
+
+		if (buf == NULL)
+			continue;
+
+		if (TransactionIdPrecedes(fdwxact->local_xid, result))
+			result = fdwxact->local_xid;
+
+		pfree(buf);
+	}
+	LWLockRelease(FdwXactLock);
+
+	return result;
+}
+
+/*
+ * Scan pg_fdwxact and fill FdwXact depending on the on-disk data.
+ * This is called once at the beginning of recovery, saving any extra
+ * lookups in the future.  FdwXact files that are newer than the
+ * minimum XID horizon are discarded on the way.
+ *
+ * Only directory entries whose name has the expected length and character
+ * set, and that parse into exactly four hexadecimal fields, are considered.
+ */
+void
+restoreFdwXactData(void)
+{
+	DIR		   *cldir;
+	struct dirent *clde;
+
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+	cldir = AllocateDir(FDWXACTS_DIR);
+	while ((clde = ReadDir(cldir, FDWXACTS_DIR)) != NULL)
+	{
+		if (strlen(clde->d_name) == FDWXACT_FILE_NAME_LEN &&
+			strspn(clde->d_name, "0123456789ABCDEF_") == FDWXACT_FILE_NAME_LEN)
+		{
+			TransactionId local_xid;
+			Oid			dbid;
+			Oid			serverid;
+			Oid			userid;
+			char	   *buf;
+
+			/*
+			 * The character-set test above also accepts names whose
+			 * underscores are misplaced.  Skip anything that does not parse
+			 * into all four fields, so the ids are never used
+			 * uninitialized.
+			 */
+			if (sscanf(clde->d_name, "%08x_%08x_%08x_%08x",
+					   &dbid, &local_xid, &serverid, &userid) != 4)
+				continue;
+
+			/* Read fdwxact data from disk */
+			buf = ProcessFdwXactBuffer(dbid, local_xid, serverid, userid,
+									   InvalidXLogRecPtr, true);
+
+			if (buf == NULL)
+				continue;
+
+			/* Add this entry into the table of foreign transactions */
+			FdwXactRedoAdd(buf, InvalidXLogRecPtr, InvalidXLogRecPtr);
+		}
+	}
+
+	LWLockRelease(FdwXactLock);
+	FreeDir(cldir);
+}
+
+/*
+ * Remove the foreign transaction file for given entry.
+ *
+ * A failure to unlink the file is reported as a WARNING, never as an error.
+ *
+ * If giveWarning is false, do not complain about file-not-present;
+ * this is an expected case during WAL replay.
+ */
+static void
+RemoveFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, Oid userid,
+				  bool giveWarning)
+{
+	char		path[MAXPGPATH];
+
+	FdwXactFilePath(path, dbid, xid, serverid, userid);
+	if (unlink(path) < 0 && (errno != ENOENT || giveWarning))
+		ereport(WARNING,
+				(errcode_for_file_access(),
+				 errmsg("could not remove foreign transaction state file \"%s\": %m",
+						path)));
+}
+
+/*
+ * Scan the shared memory entries of FdwXact and validate them.
+ *
+ * This is run at the end of recovery, but before we allow backends to write
+ * WAL.
+ */
+void
+RecoverFdwXacts(void)
+{
+	int			n;
+
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+	for (n = 0; n < FdwXactCtl->num_fdwxacts; n++)
+	{
+		FdwXact		entry = FdwXactCtl->fdwxacts[n];
+		char	   *data;
+
+		/* Re-read the state data; entries that are too new are dropped */
+		data = ProcessFdwXactBuffer(entry->dbid, entry->local_xid,
+									entry->serverid, entry->userid,
+									entry->insert_start_lsn, entry->ondisk);
+
+		if (data != NULL)
+		{
+			ereport(LOG,
+					(errmsg("recovering foreign prepared transaction %u for server %u and user %u from shared memory",
+							entry->local_xid, entry->serverid, entry->userid)));
+
+			/* The entry is now a regular, ownerless, valid one */
+			entry->owner = NULL;
+			entry->inredo = false;
+			entry->valid = true;
+			pfree(data);
+		}
+	}
+	LWLockRelease(FdwXactLock);
+}
+
+/* Built in functions */
+
+/*
+ * Iteration state over a set of foreign transactions, for use by the
+ * built-in functions that display them.
+ */
+typedef struct
+{
+	FdwXact		fdwxacts;
+	int			num_xacts;
+	int			cur_xact;
+} WorkingStatus;
+
+/*
+ * SQL-callable set-returning function listing the valid foreign transaction
+ * entries.  Each row has six columns: local xid, server OID, user OID,
+ * status text, whether the entry has no owner, and the foreign transaction
+ * identifier.  The result is returned as a materialized tuplestore.
+ */
+Datum
+pg_foreign_xacts(PG_FUNCTION_ARGS)
+{
+#define PG_PREPARED_FDWXACTS_COLS	6
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	rowdesc;
+	Tuplestorestate *store;
+	MemoryContext callerctx;
+
+	/* The caller must be able to accept a materialized set */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if ((rsinfo->allowedModes & SFRM_Materialize) == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not allowed in this context")));
+
+	/* Work out the row type we are expected to return */
+	if (get_call_result_type(fcinfo, NULL, &rowdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	/* The tuplestore has to live in the per-query memory context */
+	callerctx = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory);
+
+	store = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = store;
+	rsinfo->setDesc = rowdesc;
+
+	MemoryContextSwitchTo(callerctx);
+
+	LWLockAcquire(FdwXactLock, LW_SHARED);
+	for (int n = 0; n < FdwXactCtl->num_fdwxacts; n++)
+	{
+		FdwXact		entry = FdwXactCtl->fdwxacts[n];
+		FdwXactStatus cur_status;
+		char	   *status_label;
+		Datum		values[PG_PREPARED_FDWXACTS_COLS];
+		bool		nulls[PG_PREPARED_FDWXACTS_COLS];
+
+		if (!entry->valid)
+			continue;
+
+		/* No column is ever NULL */
+		memset(nulls, 0, sizeof(nulls));
+
+		/* The status is read under the entry's spinlock */
+		SpinLockAcquire(&entry->mutex);
+		cur_status = entry->status;
+		SpinLockRelease(&entry->mutex);
+
+		values[0] = TransactionIdGetDatum(entry->local_xid);
+		values[1] = ObjectIdGetDatum(entry->serverid);
+		values[2] = ObjectIdGetDatum(entry->userid);
+
+		/* Translate the status into its display text */
+		switch (cur_status)
+		{
+			case FDWXACT_STATUS_PREPARING:
+				status_label = "preparing";
+				break;
+			case FDWXACT_STATUS_PREPARED:
+				status_label = "prepared";
+				break;
+			case FDWXACT_STATUS_COMMITTING:
+				status_label = "prepared (commit)";
+				break;
+			case FDWXACT_STATUS_ABORTING:
+				status_label = "prepared (abort)";
+				break;
+			default:
+				status_label = "unknown";
+				break;
+		}
+		values[3] = CStringGetTextDatum(status_label);
+		values[4] = BoolGetDatum(entry->owner == NULL);
+		values[5] = CStringGetTextDatum(entry->fdwxact_id);
+
+		tuplestore_putvalues(store, rowdesc, values, nulls);
+	}
+	LWLockRelease(FdwXactLock);
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(store);
+
+	return (Datum) 0;
+}
+
+/*
+ * Built-in SQL function to resolve a prepared foreign transaction.
+ *
+ * Arguments are the local transaction id, the foreign server OID and the
+ * user OID; the entry is looked up in the current database.  Only
+ * superusers may call this.  Returns true on success; every failure is
+ * reported with ERROR.
+ */
+Datum
+pg_resolve_foreign_xact(PG_FUNCTION_ARGS)
+{
+	TransactionId xid = DatumGetTransactionId(PG_GETARG_DATUM(0));
+	Oid			serverid = PG_GETARG_OID(1);
+	Oid			userid = PG_GETARG_OID(2);
+	FdwXact		fdwxact;
+	int			idx;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 (errmsg("must be superuser to resolve foreign transactions"))));
+
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+
+	idx = get_fdwxact(MyDatabaseId, xid, serverid, userid);
+
+	if (idx == -1)
+	{
+		/* not found */
+		LWLockRelease(FdwXactLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("does not exist foreign transaction")));
+	}
+
+	fdwxact = FdwXactCtl->fdwxacts[idx];
+
+	if (fdwxact->locking_backend != InvalidBackendId || fdwxact->owner)
+	{
+		/* the entry is being processed by someone */
+		LWLockRelease(FdwXactLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("foreign transaction with transaction id %u, server %u, and user %u is busy",
+						xid, serverid, userid)));
+	}
+
+	if (TwoPhaseExists(fdwxact->local_xid))
+	{
+		/*
+		 * the entry's local transaction is prepared. Since we cannot know the
+		 * fate of the local transaction, we cannot resolve this foreign
+		 * transaction.
+		 */
+		LWLockRelease(FdwXactLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot resolve foreign transaction entry whose local transaction is prepared"),
+				 errhint("Do COMMIT PREPARED or ROLLBACK PREPARED")));
+	}
+
+	/* Hold the entry */
+	FdwXactCtl->fdwxacts[idx]->locking_backend = MyBackendId;
+
+	/* The resolution itself runs without FdwXactLock held */
+	LWLockRelease(FdwXactLock);
+
+	PG_TRY();
+	{
+		FdwXactResolveFdwXacts(&idx, 1, NULL);
+	}
+	PG_CATCH();
+	{
+		/* Resolution failed: give the entry back before re-throwing */
+		LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+		FdwXactCtl->fdwxacts[idx]->locking_backend = InvalidBackendId;
+		LWLockRelease(FdwXactLock);
+
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	PG_RETURN_BOOL(true);
+}
+
+/*
+ * Built-in function to remove a prepared foreign transaction entry without
+ * resolution. The function gives a way to forget about such a prepared
+ * transaction in cases such as: the foreign server where it is prepared is
+ * no longer available, or the user which prepared this transaction needs to
+ * be dropped.
+ *
+ * Arguments are the local transaction id, the foreign server OID and the
+ * user OID; the entry is looked up in the current database.  Only
+ * superusers may call this.  Returns true on success.
+ */
+Datum
+pg_remove_foreign_xact(PG_FUNCTION_ARGS)
+{
+	TransactionId xid = DatumGetTransactionId(PG_GETARG_DATUM(0));
+	Oid			serverid = PG_GETARG_OID(1);
+	Oid			userid = PG_GETARG_OID(2);
+	FdwXact		fdwxact;
+	int			idx;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 (errmsg("must be superuser to remove foreign transactions"))));
+
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+
+	idx = get_fdwxact(MyDatabaseId, xid, serverid, userid);
+
+	if (idx == -1)
+	{
+		/* not found */
+		LWLockRelease(FdwXactLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("does not exist foreign transaction on server %u",
+						serverid)));
+	}
+
+	fdwxact = FdwXactCtl->fdwxacts[idx];
+
+	if (fdwxact->locking_backend != InvalidBackendId || fdwxact->owner)
+	{
+		/* the entry is being held by someone */
+		LWLockRelease(FdwXactLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("foreign transaction with transaction id %u, server %u, and user %u is busy",
+						xid, serverid, userid)));
+	}
+
+	/* Hold the entry */
+	fdwxact->locking_backend = MyBackendId;
+
+	/* Unlike resolution, removal is done with FdwXactLock still held */
+	PG_TRY();
+	{
+		/* Clean up entry and any files we may have left */
+		if (fdwxact->ondisk)
+			RemoveFdwXactFile(fdwxact->dbid, fdwxact->local_xid,
+							  fdwxact->serverid, fdwxact->userid,
+							  true);
+		remove_fdwxact(fdwxact);
+	}
+	PG_CATCH();
+	{
+		/* Give the entry back only if it is still marked valid */
+		if (fdwxact->valid)
+		{
+			Assert(fdwxact->locking_backend == MyBackendId);
+			fdwxact->locking_backend = InvalidBackendId;
+		}
+		LWLockRelease(FdwXactLock);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	LWLockRelease(FdwXactLock);
+
+	PG_RETURN_BOOL(true);
+}
diff --git a/src/backend/access/fdwxact/launcher.c b/src/backend/access/fdwxact/launcher.c
new file mode 100644
index 0000000000..a1a41404c7
--- /dev/null
+++ 
b/src/backend/access/fdwxact/launcher.c
@@ -0,0 +1,558 @@
+/*-------------------------------------------------------------------------
+ *
+ * launcher.c
+ *
+ * The foreign transaction resolver launcher process starts foreign
+ * transaction resolver processes. The launcher schedules a resolver
+ * process to be started when a request arrives from a backend process.
+ *
+ * Portions Copyright (c) 2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/fdwxact/launcher.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "pgstat.h"
+#include "funcapi.h"
+
+#include "access/fdwxact.h"
+#include "access/fdwxact_launcher.h"
+#include "access/fdwxact_resolver.h"
+#include "access/resolver_internal.h"
+#include "access/twophase.h"
+#include "commands/dbcommands.h"
+#include "nodes/pg_list.h"
+#include "postmaster/bgworker.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "tcop/tcopprot.h"
+#include "utils/builtins.h"
+
+/* max sleep time between cycles (3min) */
+#define DEFAULT_NAPTIME_PER_CYCLE 180000L
+
+static void fdwxact_launcher_onexit(int code, Datum arg);
+static void fdwxact_launcher_sighup(SIGNAL_ARGS);
+static void fdwxact_launch_resolver(Oid dbid);
+static bool fdwxact_relaunch_resolvers(void);
+
+/* Flags set by the signal handlers and consumed by the launcher main loop */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t got_SIGUSR2 = false;
+FdwXactResolver *MyFdwXactResolver = NULL;
+
+/*
+ * Wake up the launcher process to request launching new resolvers
+ * immediately.
+ */
+void
+FdwXactLauncherRequestToLaunch(void)
+{
+	/* Nothing to signal when no launcher is registered */
+	if (FdwXactRslvCtl->launcher_pid == InvalidPid)
+		return;
+
+	kill(FdwXactRslvCtl->launcher_pid, SIGUSR2);
+}
+
+/*
+ * Size of the shared memory area set up by FdwXactRslvShmemInit: the
+ * control structure plus one FdwXactResolver slot per allowed resolver.
+ */
+Size
+FdwXactRslvShmemSize(void)
+{
+	Size		slots_size;
+
+	slots_size = mul_size(max_foreign_xact_resolvers,
+						  sizeof(FdwXactResolver));
+
+	return add_size(SizeOfFdwXactRslvCtlData, slots_size);
+}
+
+/*
+ * Create (or attach to) the shared memory used by the foreign transaction
+ * resolvers, and initialize it when not running under the postmaster.
+ */
+void
+FdwXactRslvShmemInit(void)
+{
+	bool		found;
+	int			n;
+
+	FdwXactRslvCtl = ShmemInitStruct("Foreign transactions resolvers",
+									 FdwXactRslvShmemSize(),
+									 &found);
+
+	/* Processes started by the postmaster only attach */
+	if (IsUnderPostmaster)
+		return;
+
+	/* First time through, so initialize */
+	MemSet(FdwXactRslvCtl, 0, FdwXactRslvShmemSize());
+	SHMQueueInit(&(FdwXactRslvCtl->fdwxact_queue));
+	FdwXactRslvCtl->launcher_pid = InvalidPid;
+
+	/* Every resolver slot starts zeroed, with its spinlock ready */
+	for (n = 0; n < max_foreign_xact_resolvers; n++)
+	{
+		FdwXactResolver *slot = &FdwXactRslvCtl->resolvers[n];
+
+		memset(slot, 0, sizeof(FdwXactResolver));
+		SpinLockInit(&(slot->mutex));
+	}
+}
+
+/*
+ * Exit callback of the launcher: mark the launcher as gone in shared
+ * memory.
+ */
+static void
+fdwxact_launcher_onexit(int code, Datum arg)
+{
+	FdwXactRslvCtl->launcher_pid = InvalidPid;
+}
+
+/* SIGHUP handler: note that the configuration should be reloaded */
+static void
+fdwxact_launcher_sighup(SIGNAL_ARGS)
+{
+	int			saved_errno = errno;
+
+	got_SIGHUP = true;
+	SetLatch(MyLatch);
+
+	errno = saved_errno;
+}
+
+/* SIGUSR2 handler: note that a resolver launch was requested */
+static void
+fdwxact_launcher_sigusr2(SIGNAL_ARGS)
+{
+	int			saved_errno = errno;
+
+	got_SIGUSR2 = true;
+	SetLatch(MyLatch);
+
+	errno = saved_errno;
+}
+
+/*
+ * Main loop for the fdwxact launcher process.
+ *
+ * Registers the launcher in shared memory, installs its signal handlers,
+ * and then loops forever: it relaunches requested resolvers when asked via
+ * SIGUSR2 or when the retry interval has elapsed, sleeps on its latch, and
+ * reloads the configuration after SIGHUP.  'main_arg' is not used.
+ */
+void
+FdwXactLauncherMain(Datum main_arg)
+{
+	TimestampTz last_start_time = 0;
+
+	ereport(DEBUG1,
+			(errmsg("fdwxact resolver launcher started")));
+
+	/* Make sure launcher_pid is reset when this process exits */
+	before_shmem_exit(fdwxact_launcher_onexit, (Datum) 0);
+
+	Assert(FdwXactRslvCtl->launcher_pid == InvalidPid);
+	FdwXactRslvCtl->launcher_pid = MyProcPid;
+	FdwXactRslvCtl->launcher_latch = &MyProc->procLatch;
+
+	pqsignal(SIGHUP, fdwxact_launcher_sighup);
+	pqsignal(SIGUSR2, fdwxact_launcher_sigusr2);
+	pqsignal(SIGTERM, die);
+	BackgroundWorkerUnblockSignals();
+
+	BackgroundWorkerInitializeConnection(NULL, NULL, 0);
+
+	/* Enter main loop */
+	for (;;)
+	{
+		TimestampTz now;
+		long		wait_time = DEFAULT_NAPTIME_PER_CYCLE;
+		int			rc;
+
+		CHECK_FOR_INTERRUPTS();
+		ResetLatch(MyLatch);
+
+		now = GetCurrentTimestamp();
+
+		/*
+		 * Limit the start retry to once a
+		 * foreign_xact_resolution_retry_interval but always starts when the
+		 * backend requested.
+		 */
+		if (got_SIGUSR2 ||
+			TimestampDifferenceExceeds(last_start_time, now,
+									   foreign_xact_resolution_retry_interval))
+		{
+			MemoryContext oldctx;
+			MemoryContext subctx;
+			bool		launched;
+
+			if (got_SIGUSR2)
+				got_SIGUSR2 = false;
+
+			/* Do the work in a throwaway context, deleted below */
+			subctx = AllocSetContextCreate(TopMemoryContext,
+										   "Foreign Transaction Launcher",
+										   ALLOCSET_DEFAULT_SIZES);
+			oldctx = MemoryContextSwitchTo(subctx);
+
+			/*
+			 * Launch foreign transaction resolvers that are requested but not
+			 * running.
+			 */
+			launched = fdwxact_relaunch_resolvers();
+			if (launched)
+			{
+				last_start_time = now;
+				wait_time = foreign_xact_resolution_retry_interval;
+			}
+
+			/* Switch back to original memory context. */
+			MemoryContextSwitchTo(oldctx);
+			/* Clean the temporary memory. */
+			MemoryContextDelete(subctx);
+		}
+		else
+		{
+			/*
+			 * The wait in previous cycle was interrupted in less than
+			 * foreign_xact_resolution_retry_interval since last resolver
+			 * started, this usually means crash of the resolver, so we should
+			 * retry in foreign_xact_resolution_retry_interval again.
+			 */
+			wait_time = foreign_xact_resolution_retry_interval;
+		}
+
+		/* Wait for more work */
+		rc = WaitLatch(MyLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+					   wait_time,
+					   WAIT_EVENT_FDWXACT_LAUNCHER_MAIN);
+
+		if (rc & WL_POSTMASTER_DEATH)
+			proc_exit(1);
+
+		if (rc & WL_LATCH_SET)
+		{
+			ResetLatch(MyLatch);
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		/* Reload the configuration if SIGHUP arrived meanwhile */
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+	}
+
+	/* Not reachable */
+}
+
+/*
+ * Request launcher to launch a new foreign transaction resolver process
+ * or wake up the resolver if it's already running.
+ *
+ * A resolver counts as running for us when its slot is in use and bound to
+ * the current database (MyDatabaseId).
+ */
+void
+FdwXactLaunchOrWakeupResolver(void)
+{
+	volatile FdwXactResolver *resolver;
+	bool		found = false;
+
+	/*
+	 * Looking for a resolver process that is running and working on the same
+	 * database.
+	 */
+	LWLockAcquire(FdwXactResolverLock, LW_SHARED);
+	for (int i = 0; i < max_foreign_xact_resolvers; i++)
+	{
+		resolver = &FdwXactRslvCtl->resolvers[i];
+
+		if (resolver->in_use &&
+			resolver->dbid == MyDatabaseId)
+		{
+			found = true;
+			break;
+		}
+	}
+	LWLockRelease(FdwXactResolverLock);
+
+	if (found)
+	{
+		/* Found the running resolver */
+		elog(DEBUG1,
+			 "found a running foreign transaction resolver process for database %u",
+			 MyDatabaseId);
+
+		/*
+		 * Wakeup the resolver. It's possible that the resolver is starting up
+		 * and hasn't attached to its slot yet. Since the resolver will soon
+		 * find the FdwXact entry we inserted, we don't need to do anything
+		 * in that case.
+		 *
+		 * NOTE(review): the slot is read here after FdwXactResolverLock has
+		 * been released -- confirm that this is safe against a concurrently
+		 * exiting resolver.
+		 */
+		if (resolver->latch)
+			SetLatch(resolver->latch);
+
+		return;
+	}
+
+	/* Otherwise wake up the launcher to launch new resolver */
+	FdwXactLauncherRequestToLaunch();
+}
+
+/*
+ * Launch a foreign transaction resolver process that will connect to given
+ * 'dbid'.
+ *
+ * Claims a free entry of FdwXactRslvCtl->resolvers under
+ * FdwXactResolverLock, stores 'dbid' in it and registers a dynamic
+ * background worker whose main argument is the slot number.  Raises an
+ * ERROR when every slot is in use.  When the worker cannot be registered
+ * the slot is given back and a WARNING is emitted.
+ */
+static void
+fdwxact_launch_resolver(Oid dbid)
+{
+	BackgroundWorker bgw;
+	BackgroundWorkerHandle *bgw_handle;
+	FdwXactResolver *resolver;
+	int			unused_slot = -1;
+
+	LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE);
+
+	/* Find unused resolver slot */
+	for (int i = 0; i < max_foreign_xact_resolvers; i++)
+	{
+		if (!FdwXactRslvCtl->resolvers[i].in_use)
+		{
+			unused_slot = i;
+			break;
+		}
+	}
+
+	/* No unused found */
+	if (unused_slot < 0)
+	{
+		LWLockRelease(FdwXactResolverLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+				 errmsg("out of foreign transaction resolver slots"),
+				 errhint("You might need to increase max_foreign_transaction_resolvers.")));
+	}
+
+	resolver = &FdwXactRslvCtl->resolvers[unused_slot];
+	resolver->in_use = true;
+	resolver->dbid = dbid;
+	LWLockRelease(FdwXactResolverLock);
+
+	/* Register the new dynamic worker */
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+		BGWORKER_BACKEND_DATABASE_CONNECTION;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FdwXactResolverMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN,
+			 "foreign transaction resolver for database %u", resolver->dbid);
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "foreign transaction resolver");
+	bgw.bgw_restart_time = BGW_NEVER_RESTART;
+	bgw.bgw_notify_pid = MyProcPid;
+	bgw.bgw_main_arg = Int32GetDatum(unused_slot);
+
+	if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+	{
+		/*
+		 * Failed to launch, so give the slot back.  This code runs in the
+		 * launcher, which has no resolver slot of its own, so the slot must
+		 * be reached through 'resolver'.  Use the same lock that protected
+		 * the allocation above.
+		 */
+		LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE);
+		resolver->in_use = false;
+		resolver->dbid = InvalidOid;
+		LWLockRelease(FdwXactResolverLock);
+
+		ereport(WARNING,
+				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+				 errmsg("out of background worker slots"),
+				 errhint("You might need to increase max_worker_processes.")));
+	}
+
+	/*
+	 * We don't need to wait until it attaches here because we're going to
+	 * wait until all foreign transactions are resolved.
+	 */
+}
+
+/*
+ * Launch or relaunch foreign transaction resolvers on database that has
+ * at least one FdwXact entry but no resolvers are running on it.
+ *
+ * Returns true if at least one resolver was launched, false otherwise.
+ */
+static bool
+fdwxact_relaunch_resolvers(void)
+{
+	HTAB	   *resolver_dbs;	/* DBs resolver's running on */
+	HTAB	   *fdwxact_dbs;	/* DBs having at least one FdwXact entry */
+	HASHCTL		ctl;
+	HASH_SEQ_STATUS status;
+	Oid		   *entry;
+	bool		launched = false;	/* nothing launched so far */
+
+	memset(&ctl, 0, sizeof(ctl));
+	ctl.keysize = sizeof(Oid);
+	ctl.entrysize = sizeof(Oid);
+	fdwxact_dbs = hash_create("fdwxact dblist",
+							  32, &ctl, HASH_ELEM | HASH_BLOBS);
+
+	/* Collect database oids that has at least one FdwXact entry to resolve */
+	LWLockAcquire(FdwXactLock, LW_SHARED);
+	for (int i = 0; i < FdwXactCtl->num_fdwxacts; i++)
+	{
+		FdwXact		fdwxact = FdwXactCtl->fdwxacts[i];
+
+		if (!fdwxact->valid)
+			continue;
+
+		/*
+		 * A resolver process resolves the foreign transactions that are
+		 * waiting for resolution or are not being processed by anyone.
+		 * But we don't need to launch a resolver for foreign transactions
+		 * whose local transaction is prepared.
+		 */
+		if ((!fdwxact->owner && !TwoPhaseExists(fdwxact->local_xid)) ||
+			(fdwxact->owner && fdwxact->owner->fdwXactState == FDWXACT_WAITING))
+			hash_search(fdwxact_dbs, &(fdwxact->dbid), HASH_ENTER, NULL);
+	}
+	LWLockRelease(FdwXactLock);
+
+	/* There is no FdwXact entry, no need to launch new one */
+	if (hash_get_num_entries(fdwxact_dbs) == 0)
+	{
+		hash_destroy(fdwxact_dbs);
+		return false;
+	}
+
+	resolver_dbs = hash_create("resolver dblist",
+							   32, &ctl, HASH_ELEM | HASH_BLOBS);
+
+	/* Collect database oids on which resolvers are running */
+	LWLockAcquire(FdwXactResolverLock, LW_SHARED);
+	for (int i = 0; i < max_foreign_xact_resolvers; i++)
+	{
+		FdwXactResolver *resolver = &FdwXactRslvCtl->resolvers[i];
+
+		if (!resolver->in_use)
+			continue;
+
+		hash_search(resolver_dbs, &(resolver->dbid), HASH_ENTER, NULL);
+	}
+	LWLockRelease(FdwXactResolverLock);
+
+	/* Find DBs on which no resolvers are running and launch new one on them */
+	hash_seq_init(&status, fdwxact_dbs);
+	while ((entry = (Oid *) hash_seq_search(&status)) != NULL)
+	{
+		bool		found;
+
+		hash_search(resolver_dbs, entry, HASH_FIND, &found);
+
+		if (!found)
+		{
+			/* No resolver is running on this database, launch new one */
+			fdwxact_launch_resolver(*entry);
+			launched = true;
+		}
+	}
+
+	hash_destroy(fdwxact_dbs);
+	hash_destroy(resolver_dbs);
+
+	return launched;
+}
+
+/*
+ * Register a background worker running the foreign transaction launcher.
+ *
+ * Does nothing when max_foreign_xact_resolvers is zero.
+ */
+void
+FdwXactLauncherRegister(void)
+{
+	BackgroundWorker bgw;
+
+	if (max_foreign_xact_resolvers == 0)
+		return;
+
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+		BGWORKER_BACKEND_DATABASE_CONNECTION;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FdwXactLauncherMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN,
+			 "foreign transaction launcher");
+	snprintf(bgw.bgw_type, BGW_MAXLEN,
+			 "foreign transaction launcher");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+/* Return true if the current process is the recorded launcher process */
+bool
+IsFdwXactLauncher(void)
+{
+	return FdwXactRslvCtl->launcher_pid == MyProcPid;
+}
+
+/*
+ * Stop the fdwxact resolver running on the given database.
+ *
+ * Superuser only.  Sends SIGTERM to the resolver that owns the slot for the
+ * database and polls until the slot is no longer in use.  Raises an ERROR
+ * for an invalid database OID or when no slot is in use for the database.
+ * Always returns true otherwise.
+ */
+Datum
+pg_stop_foreign_xact_resolver(PG_FUNCTION_ARGS)
+{
+	Oid			dbid = PG_GETARG_OID(0);
+	FdwXactResolver *resolver = NULL;
+	bool		signaled = false;
+	int			i;
+
+	/* Must be super user */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("permission denied to stop foreign transaction resolver")));
+
+	if (!OidIsValid(dbid))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("invalid database id")));
+
+	LWLockAcquire(FdwXactResolverLock, LW_SHARED);
+
+	/* Find the running resolver process on the given database */
+	for (i = 0; i < max_foreign_xact_resolvers; i++)
+	{
+		resolver = &FdwXactRslvCtl->resolvers[i];
+
+		/* found! */
+		if (resolver->in_use && resolver->dbid == dbid)
+			break;
+	}
+
+	if (i >= max_foreign_xact_resolvers)
+	{
+		LWLockRelease(FdwXactResolverLock);
+		ereport(ERROR,
+				(errmsg("there is no running foreign transaction resolver process on database %u",
+						dbid)));
+	}
+
+	/* Found the resolver, terminate it and wait for it to die */
+	for (;;)
+	{
+		int			rc;
+
+		/* is it gone? */
+		if (!resolver->in_use)
+			break;
+
+		/*
+		 * A slot is marked in use by the launcher before the resolver
+		 * attaches and stores its pid, so the pid can still be 0 or
+		 * InvalidPid here.  Never pass such a value to kill(): a zero or
+		 * negative pid addresses a process group or every process.  Signal
+		 * once, as soon as a real pid shows up.
+		 */
+		if (!signaled && resolver->pid > 0)
+		{
+			kill(resolver->pid, SIGTERM);
+			signaled = true;
+		}
+
+		LWLockRelease(FdwXactResolverLock);
+
+		/* Wait a bit --- we don't expect to have to wait long. */
+		rc = WaitLatch(MyLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+					   10L, WAIT_EVENT_BGWORKER_SHUTDOWN);
+
+		if (rc & WL_LATCH_SET)
+		{
+			ResetLatch(MyLatch);
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		LWLockAcquire(FdwXactResolverLock, LW_SHARED);
+	}
+
+	LWLockRelease(FdwXactResolverLock);
+
+	PG_RETURN_BOOL(true);
+}
diff --git a/src/backend/access/fdwxact/resolver.c b/src/backend/access/fdwxact/resolver.c
new file mode 100644
index 0000000000..3e9ad7a215
--- /dev/null
+++ b/src/backend/access/fdwxact/resolver.c
@@ -0,0 +1,443 @@
+/*-------------------------------------------------------------------------
+ *
+ * resolver.c
+ *
+ * The foreign transaction resolver background worker resolves foreign
+ * transactions that participate in a distributed transaction. A resolver
+ * process is started by the foreign transaction launcher for each database.
+ *
+ * A resolver process continues to resolve foreign transactions on the
+ * database, which the backend process is waiting for resolution.
+ *
+ * Normal termination is by SIGTERM, which instructs the resolver process
+ * to exit(0) at the next convenient moment. Emergency termination is by
+ * SIGQUIT; like any backend. The resolver process also terminates by timeout
+ * only if there are no pending foreign transactions on the database waiting
+ * to be resolved.
+ *
+ * Portions Copyright (c) 2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/fdwxact/resolver.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/fdwxact.h"
+#include "access/fdwxact_resolver.h"
+#include "access/fdwxact_launcher.h"
+#include "access/resolver_internal.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "commands/dbcommands.h"
+#include "funcapi.h"
+#include "libpq/libpq.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "storage/ipc.h"
+#include "tcop/tcopprot.h"
+#include "utils/builtins.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/* max sleep time between cycles (3min) */
+#define DEFAULT_NAPTIME_PER_CYCLE 180000L
+
+/* GUC parameters */
+int			foreign_xact_resolution_retry_interval;
+int			foreign_xact_resolver_timeout = 60 * 1000;
+
+FdwXactRslvCtlData *FdwXactRslvCtl;
+
+static void FXRslvLoop(void);
+static long FXRslvComputeSleepTime(TimestampTz now, TimestampTz targetTime);
+static void FXRslvCheckTimeout(TimestampTz now);
+
+static void fdwxact_resolver_sighup(SIGNAL_ARGS);
+static void fdwxact_resolver_onexit(int code, Datum arg);
+static void fdwxact_resolver_detach(void);
+static void fdwxact_resolver_attach(int slot);
+static void hold_fdwxacts(PGPROC *waiter);
+static void hold_indoubt_fdwxacts(void);
+
+/* Flags set by signal handlers */
+static volatile sig_atomic_t got_SIGHUP = false;
+static TimestampTz last_resolution_time = -1;
+
+/*
+ * held_fdwxacts has indexes of FdwXact which the resolver marked
+ * as in-processing. We clear that flag from those entries on failure.
+ */
+static int *held_fdwxacts = NULL;
+static int	nheld;
+
+static bool processing_online = false;
+
+/*
+ * Set once fdwxact_resolver_detach() has given our slot back.  The detach
+ * routine is reached twice when the resolver stops on timeout: once
+ * explicitly and once from the exit callback.  The second call must not
+ * touch the slot, which may already belong to a newly launched resolver.
+ */
+static bool resolver_detached = false;
+
+/* Set flag to reload configuration at next convenient time */
+static void
+fdwxact_resolver_sighup(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	got_SIGHUP = true;
+
+	SetLatch(MyLatch);
+
+	errno = save_errno;
+}
+
+/*
+ * Detach the resolver and cleanup the resolver info.
+ *
+ * Safe to call more than once; only the first call modifies the slot.
+ */
+static void
+fdwxact_resolver_detach(void)
+{
+	if (resolver_detached)
+		return;
+
+	/* Block concurrent access */
+	LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE);
+
+	MyFdwXactResolver->pid = InvalidPid;
+	MyFdwXactResolver->in_use = false;
+	MyFdwXactResolver->dbid = InvalidOid;
+
+	LWLockRelease(FdwXactResolverLock);
+
+	resolver_detached = true;
+}
+
+/*
+ * Clean up foreign transaction resolver info.
+ *
+ * Exit callback: detaches from the resolver slot, releases every FdwXact
+ * entry recorded in held_fdwxacts, and asks the launcher for a relaunch if
+ * we were in the middle of processing waiters.
+ */
+static void
+fdwxact_resolver_onexit(int code, Datum arg)
+{
+	fdwxact_resolver_detach();
+
+	for (int i = 0; i < nheld; i++)
+	{
+		FdwXact		fdwxact = FdwXactCtl->fdwxacts[held_fdwxacts[i]];
+
+		LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+		fdwxact->locking_backend = InvalidBackendId;
+		LWLockRelease(FdwXactLock);
+	}
+
+	/*
+	 * If the resolver exits during processing online transactions,
+	 * there might be other waiting online transactions. So request to
+	 * re-launch.
+	 */
+	if (processing_online)
+		FdwXactLauncherRequestToLaunch();
+}
+
+/*
+ * Attach to a slot.
+ *
+ * Points MyFdwXactResolver at the given entry of FdwXactRslvCtl->resolvers,
+ * records our pid and latch in it and registers fdwxact_resolver_onexit as
+ * an exit callback.  Raises an ERROR if the slot is not marked in use.
+ */
+static void
+fdwxact_resolver_attach(int slot)
+{
+	/* Block concurrent access */
+	LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE);
+
+	Assert(slot >= 0 && slot < max_foreign_xact_resolvers);
+	MyFdwXactResolver = &FdwXactRslvCtl->resolvers[slot];
+
+	if (!MyFdwXactResolver->in_use)
+	{
+		/* Release the lock before reporting the error */
+		LWLockRelease(FdwXactResolverLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("foreign transaction resolver slot %d is empty, cannot attach",
+						slot)));
+	}
+
+	Assert(OidIsValid(MyFdwXactResolver->dbid));
+
+	MyFdwXactResolver->pid = MyProcPid;
+	MyFdwXactResolver->latch = &MyProc->procLatch;
+
+	before_shmem_exit(fdwxact_resolver_onexit, (Datum) 0);
+
+	LWLockRelease(FdwXactResolverLock);
+}
+
+/*
+ * Foreign transaction resolver entry point.
+ *
+ * 'main_arg' carries the resolver slot number as an int32.
+ */
+void
+FdwXactResolverMain(Datum main_arg)
+{
+	int			slot = DatumGetInt32(main_arg);
+
+	/* Attach to a slot */
+	fdwxact_resolver_attach(slot);
+
+	/* Establish signal handlers */
+	pqsignal(SIGHUP, fdwxact_resolver_sighup);
+	pqsignal(SIGTERM, die);
+	BackgroundWorkerUnblockSignals();
+
+	/* Connect to our database */
+	BackgroundWorkerInitializeConnectionByOid(MyFdwXactResolver->dbid, InvalidOid, 0);
+
+	/* The database name lookup is done inside a transaction */
+	StartTransactionCommand();
+	ereport(LOG,
+			(errmsg("foreign transaction resolver for database \"%s\" has started",
+					get_database_name(MyFdwXactResolver->dbid))));
+	CommitTransactionCommand();
+
+	/* Room for one index per possible FdwXact entry */
+	held_fdwxacts = palloc(sizeof(int) * max_prepared_foreign_xacts);
+	nheld = 0;
+
+	/* Initialize stats to a sane value */
+	last_resolution_time = GetCurrentTimestamp();
+
+	/* Run the main loop */
+	FXRslvLoop();
+
+	proc_exit(0);
+}
+
+/*
+ * Fdwxact resolver main loop
+ *
+ * Each cycle resolves the foreign transactions of every waiter returned by
+ * FdwXactGetWaiter(), then the entries picked up by hold_indoubt_fdwxacts(),
+ * checks the idle timeout and finally sleeps on the process latch.
+ */
+static void
+FXRslvLoop(void)
+{
+	MemoryContext resolver_ctx;
+
+	resolver_ctx = AllocSetContextCreate(TopMemoryContext,
+										 "Foreign Transaction Resolver",
+										 ALLOCSET_DEFAULT_SIZES);
+
+	/* Enter main loop */
+	for (;;)
+	{
+		TransactionId waitXid = InvalidTransactionId;
+		TimestampTz resolutionTs = -1;
+		TimestampTz now;
+		int			rc;
+		long		sleep_time = DEFAULT_NAPTIME_PER_CYCLE;
+
+		ResetLatch(MyLatch);
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Per-cycle allocations go to resolver_ctx, reset at cycle end */
+		MemoryContextSwitchTo(resolver_ctx);
+
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
+		now = GetCurrentTimestamp();
+
+		/*
+		 * Process waiters until either the queue gets empty or the queue has
+		 * only waiters that have a future resolution timestamp.
+		 */
+		processing_online = true;
+		for (;;)
+		{
+			PGPROC	   *waiter;
+
+			CHECK_FOR_INTERRUPTS();
+
+			LWLockAcquire(FdwXactResolutionLock, LW_SHARED);
+
+			waiter = FdwXactGetWaiter(now, &resolutionTs, &waitXid);
+
+			if (!waiter)
+			{
+				/* Not found, break */
+				LWLockRelease(FdwXactResolutionLock);
+				break;
+			}
+
+			/* Hold the waiting foreign transactions */
+			hold_fdwxacts(waiter);
+			Assert(nheld > 0);
+			LWLockRelease(FdwXactResolutionLock);
+
+			/* Resolve the waiting distributed transaction */
+			StartTransactionCommand();
+			FdwXactResolveFdwXacts(held_fdwxacts, nheld, waiter);
+			CommitTransactionCommand();
+
+			/* Feeds the idle timeout computed from last_resolution_time */
+			last_resolution_time = now;
+		}
+		processing_online = false;
+
+		/* Hold indoubt foreign transactions */
+		hold_indoubt_fdwxacts();
+
+		if (nheld > 0)
+		{
+			/* No waiter is associated with these entries */
+			StartTransactionCommand();
+			FdwXactResolveFdwXacts(held_fdwxacts, nheld, NULL);
+			CommitTransactionCommand();
+		}
+
+		/* Exits the process if the resolver has been idle for too long */
+		FXRslvCheckTimeout(now);
+
+		sleep_time = FXRslvComputeSleepTime(now, resolutionTs);
+
+		MemoryContextResetAndDeleteChildren(resolver_ctx);
+		MemoryContextSwitchTo(TopMemoryContext);
+
+		rc = WaitLatch(MyLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+					   sleep_time,
+					   WAIT_EVENT_FDWXACT_RESOLVER_MAIN);
+
+		if (rc & WL_POSTMASTER_DEATH)
+			proc_exit(1);
+	}
+}
+
+/*
+ * Check whether there have been foreign transactions by the backend within
+ * foreign_xact_resolver_timeout and shutdown if not.
+ *
+ * Does nothing when foreign_xact_resolver_timeout is 0 or when the timeout,
+ * counted from last_resolution_time, has not been reached at 'now'.  Once
+ * it has been reached and FdwXactWaiterExists() reports no waiter for our
+ * database, the resolver detaches from its slot and exits the process.
+ */
+static void
+FXRslvCheckTimeout(TimestampTz now)
+{
+	TimestampTz timeout;
+
+	if (foreign_xact_resolver_timeout == 0)
+		return;
+
+	timeout = TimestampTzPlusMilliseconds(last_resolution_time,
+										  foreign_xact_resolver_timeout);
+
+	if (now < timeout)
+		return;
+
+	LWLockAcquire(FdwXactResolutionLock, LW_SHARED);
+	if (!FdwXactWaiterExists(MyDatabaseId))
+	{
+		/* There is no waiting backend */
+		StartTransactionCommand();
+		ereport(LOG,
+				(errmsg("foreign transaction resolver for database \"%s\" will stop because the timeout",
+						get_database_name(MyDatabaseId))));
+		CommitTransactionCommand();
+
+		/*
+		 * Keep holding FdwXactResolutionLock until the slot is detached.
+		 * This prevents a race in which a waiter enqueues itself after the
+		 * FdwXactWaiterExists check.
+		 */
+		fdwxact_resolver_detach();
+		LWLockRelease(FdwXactResolutionLock);
+		proc_exit(0);
+	}
+	else
+		elog(DEBUG2, "resolver reached to the timeout but don't exist as the queue is not empty");
+
+	LWLockRelease(FdwXactResolutionLock);
+}
+
+/*
+ * Compute how long we should sleep until the next cycle, in milliseconds.
+ * We can sleep until the resolver timeout or until the next resolution time
+ * given by nextResolutionTs, whichever comes first, and never longer than
+ * DEFAULT_NAPTIME_PER_CYCLE.  A nextResolutionTs that is not positive is
+ * ignored.
+ */
+static long
+FXRslvComputeSleepTime(TimestampTz now, TimestampTz nextResolutionTs)
+{
+	long		sleeptime = DEFAULT_NAPTIME_PER_CYCLE;
+
+	if (foreign_xact_resolver_timeout > 0)
+	{
+		TimestampTz timeout;
+		long		sec_to_timeout;
+		int			microsec_to_timeout;
+
+		/* Compute relative time until wakeup. */
+		timeout = TimestampTzPlusMilliseconds(last_resolution_time,
+											  foreign_xact_resolver_timeout);
+		TimestampDifference(now, timeout,
+							&sec_to_timeout, &microsec_to_timeout);
+
+		sleeptime = Min(sleeptime,
+						sec_to_timeout * 1000 + microsec_to_timeout / 1000);
+	}
+
+	if (nextResolutionTs > 0)
+	{
+		long		sec_to_timeout;
+		int			microsec_to_timeout;
+
+		TimestampDifference(now, nextResolutionTs,
+							&sec_to_timeout, &microsec_to_timeout);
+
+		sleeptime = Min(sleeptime,
+						sec_to_timeout * 1000 + microsec_to_timeout / 1000);
+	}
+
+	return sleeptime;
+}
+
+/* Return true if the current process has attached to a resolver slot */
+bool
+IsFdwXactResolver(void)
+{
+	return MyFdwXactResolver != NULL;
+}
+
+/*
+ * Take foreign transactions whose local transaction is already finished.
+ *
+ * Rebuilds held_fdwxacts/nheld from scratch with the valid entries of our
+ * database that have no owner, are not locked by any backend and whose
+ * local transaction is not a prepared transaction, and marks each of them
+ * as locked by this backend.
+ */
+static void
+hold_indoubt_fdwxacts(void)
+{
+	nheld = 0;
+
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+	for (int i = 0; i < FdwXactCtl->num_fdwxacts; i++)
+	{
+		FdwXact		fdwxact = FdwXactCtl->fdwxacts[i];
+
+		/* Take entry if not processed by anyone */
+		if (fdwxact->valid && fdwxact->dbid == MyDatabaseId &&
+			fdwxact->locking_backend == InvalidBackendId &&
+			!fdwxact->owner &&
+			!TwoPhaseExists(fdwxact->local_xid))
+		{
+			held_fdwxacts[nheld++] = i;
+
+			/* Take over the entry */
+			fdwxact->locking_backend = MyBackendId;
+		}
+	}
+	LWLockRelease(FdwXactLock);
+}
+
+/*
+ * Take foreign transactions associated with the given waiter's transaction
+ * as in-processing.
+ *
+ * Rebuilds held_fdwxacts/nheld from scratch and marks every collected entry
+ * as locked by this backend.
+ */
+static void
+hold_fdwxacts(PGPROC *waiter)
+{
+	int			idx;
+
+	nheld = 0;
+
+	LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+	for (idx = 0; idx < FdwXactCtl->num_fdwxacts; idx++)
+	{
+		FdwXact		entry = FdwXactCtl->fdwxacts[idx];
+
+		/* Skip unused entries and entries of other local transactions */
+		if (!entry->valid || entry->local_xid != waiter->fdwXactWaitXid)
+			continue;
+
+		Assert(entry->owner->fdwXactState == FDWXACT_WAITING);
+		Assert(entry->dbid == waiter->databaseId);
+
+		/* Remember the entry and take it over */
+		held_fdwxacts[nheld] = idx;
+		nheld++;
+		entry->locking_backend = MyBackendId;
+	}
+	LWLockRelease(FdwXactLock);
+}
diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile
index f88d72fd86..982c1a36cc 100644
--- a/src/backend/access/rmgrdesc/Makefile
+++ b/src/backend/access/rmgrdesc/Makefile
@@ -13,6 +13,7 @@ OBJS = \
 	clogdesc.o \
 	committsdesc.o \
 	dbasedesc.o \
+	fdwxactdesc.o \
 	genericdesc.o \
 	gindesc.o \
 	gistdesc.o \
diff --git a/src/backend/access/rmgrdesc/fdwxactdesc.c b/src/backend/access/rmgrdesc/fdwxactdesc.c
new file mode 100644
index 0000000000..ca761763e5
--- /dev/null
+++ b/src/backend/access/rmgrdesc/fdwxactdesc.c
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * fdwxactdesc.c
+ *	  PostgreSQL global transaction manager for foreign server.
+ *
+ * This module describes the WAL records for foreign transaction manager.
+ *
+ * Portions Copyright (c) 2020, PostgreSQL Global Development Group
+ *
+ * src/backend/access/rmgrdesc/fdwxactdesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/fdwxact_xlog.h"
+
+/*
+ * Append a textual description of a foreign transaction WAL record to 'buf'.
+ *
+ * Insert records are decoded as FdwXactOnDiskData and remove records as
+ * xl_fdwxact_remove.  Nothing is appended for any other info code, so that
+ * an unrecognized record is never decoded with the wrong layout.
+ */
+void
+fdwxact_desc(StringInfo buf, XLogReaderState *record)
+{
+	char	   *rec = XLogRecGetData(record);
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	if (info == XLOG_FDWXACT_INSERT)
+	{
+		FdwXactOnDiskData *fdwxact_insert = (FdwXactOnDiskData *) rec;
+
+		appendStringInfo(buf, "server: %u,", fdwxact_insert->serverid);
+		appendStringInfo(buf, " user: %u,", fdwxact_insert->userid);
+		appendStringInfo(buf, " database: %u,", fdwxact_insert->dbid);
+		appendStringInfo(buf, " local xid: %u,", fdwxact_insert->local_xid);
+		appendStringInfo(buf, " id: %s", fdwxact_insert->fdwxact_id);
+	}
+	else if (info == XLOG_FDWXACT_REMOVE)
+	{
+		xl_fdwxact_remove *fdwxact_remove = (xl_fdwxact_remove *) rec;
+
+		appendStringInfo(buf, "server: %u,", fdwxact_remove->serverid);
+		appendStringInfo(buf, " user: %u,", fdwxact_remove->userid);
+		appendStringInfo(buf, " database: %u,", fdwxact_remove->dbid);
+		appendStringInfo(buf, " local xid: %u", fdwxact_remove->xid);
+	}
+}
+
+/*
+ * Return the name of the record type encoded in 'info', or NULL if the
+ * info code is not one of the foreign transaction record types.
+ */
+const char *
+fdwxact_identify(uint8 info)
+{
+	switch (info & ~XLR_INFO_MASK)
+	{
+		case XLOG_FDWXACT_INSERT:
+			return "NEW FOREIGN TRANSACTION";
+		case XLOG_FDWXACT_REMOVE:
+			return "REMOVE FOREIGN TRANSACTION";
+	}
+	/* Keep compiler happy */
+	return NULL;
+}
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 1cd97852e8..ea045174e0 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -114,7 +114,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 		appendStringInfo(buf, "max_connections=%d max_worker_processes=%d "
 						 "max_wal_senders=%d max_prepared_xacts=%d "
 						 "max_locks_per_xact=%d wal_level=%s "
-						 "wal_log_hints=%s track_commit_timestamp=%s",
+						 "wal_log_hints=%s 
track_commit_timestamp=%s " + "max_prepared_foreign_transactions=%d", xlrec.MaxConnections, xlrec.max_worker_processes, xlrec.max_wal_senders, @@ -122,7 +123,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xlrec.max_locks_per_xact, wal_level_str, xlrec.wal_log_hints ? "on" : "off", - xlrec.track_commit_timestamp ? "on" : "off"); + xlrec.track_commit_timestamp ? "on" : "off", + xlrec.max_prepared_foreign_xacts); } else if (info == XLOG_FPW_CHANGE) { diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 58091f6b52..200cf9d067 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -10,6 +10,7 @@ #include "access/brin_xlog.h" #include "access/clog.h" #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/generic_xlog.h" #include "access/ginxlog.h" #include "access/gistxlog.h" diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 54fb6cc047..6ff79e7b59 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -77,6 +77,7 @@ #include #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/htup_details.h" #include "access/subtrans.h" #include "access/transam.h" @@ -850,6 +851,35 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) return result; } +/* + * TwoPhaseExists + * Return true if there is a prepared transaction specified by XID + */ +bool +TwoPhaseExists(TransactionId xid) +{ + int i; + bool found = false; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; + + if (pgxact->xid == xid) + { + found = true; + break; + } + } + + LWLockRelease(TwoPhaseStateLock); + + return found; +} + /* * TwoPhaseGetDummyBackendId * Get the dummy backend ID for prepared transaction specified by XID @@ -2196,6 
+2226,13 @@ RecordTransactionCommitPrepared(TransactionId xid, XLogRecPtr recptr; TimestampTz committs = GetCurrentTimestamp(); bool replorigin; + bool need_fdwxact_commit; + + /* + * Prepare foreign transactions involving this prepared transaction + * if exist. + */ + need_fdwxact_commit = PrepareFdwXactParticipants(xid); /* * Are we using the replication origins feature? Or, in other words, are @@ -2266,6 +2303,17 @@ RecordTransactionCommitPrepared(TransactionId xid, * in the procarray and continue to hold locks. */ SyncRepWaitForLSN(recptr, true); + + /* + * Wait for foreign transaction prepared as part of this prepared + * transaction to be committed. + */ + if (need_fdwxact_commit) + { + SetFdwXactParticipants(xid); + FdwXactWaitForResolution(xid, true); + ForgetAllFdwXactParticipants(); + } } /* @@ -2285,6 +2333,13 @@ RecordTransactionAbortPrepared(TransactionId xid, const char *gid) { XLogRecPtr recptr; + bool need_fdwxact_commit; + + /* + * Prepare foreign transactions involving this prepared transaction + * if exist. + */ + need_fdwxact_commit = PrepareFdwXactParticipants(xid); /* * Catch the scenario where we aborted partway through @@ -2325,6 +2380,17 @@ RecordTransactionAbortPrepared(TransactionId xid, * in the procarray and continue to hold locks. */ SyncRepWaitForLSN(recptr, false); + + /* + * Wait for foreign transaction prepared as part of this prepared + * transaction to be rolled back. 
+ */ + if (need_fdwxact_commit) + { + SetFdwXactParticipants(xid); + FdwXactWaitForResolution(xid, false); + ForgetAllFdwXactParticipants(); + } } /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index cd30b62d36..348d020249 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -21,6 +21,7 @@ #include #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/multixact.h" #include "access/parallel.h" #include "access/subtrans.h" @@ -1219,6 +1220,7 @@ RecordTransactionCommit(void) SharedInvalidationMessage *invalMessages = NULL; bool RelcacheInitFileInval = false; bool wrote_xlog; + bool need_commit_globally; /* Get data needed for commit record */ nrels = smgrGetPendingDeletes(true, &rels); @@ -1227,6 +1229,7 @@ RecordTransactionCommit(void) nmsgs = xactGetCommittedInvalidationMessages(&invalMessages, &RelcacheInitFileInval); wrote_xlog = (XactLastRecEnd != 0); + need_commit_globally = FdwXactIsForeignTwophaseCommitRequired(); /* * If we haven't been assigned an XID yet, we neither can, nor do we want @@ -1265,12 +1268,13 @@ RecordTransactionCommit(void) } /* - * If we didn't create XLOG entries, we're done here; otherwise we - * should trigger flushing those entries the same as a commit record + * If we didn't create XLOG entries and the transaction does not need + * to be committed using two-phase commit. we're done here; otherwise + * we should trigger flushing those entries the same as a commit record * would. This will primarily happen for HOT pruning and the like; we * want these to be flushed to disk in due time. */ - if (!wrote_xlog) + if (!wrote_xlog && !need_commit_globally) goto cleanup; } else @@ -1428,6 +1432,14 @@ RecordTransactionCommit(void) if (wrote_xlog && markXidCommitted) SyncRepWaitForLSN(XactLastRecEnd, true); + /* + * Wait for prepared foreign transaction to be resolved, if required. 
+ * We only want to wait if we prepared foreign transaction in this + * transaction. + */ + if (need_commit_globally && markXidCommitted) + FdwXactWaitForResolution(xid, true); + /* remember end of last commit record */ XactLastCommitEnd = XactLastRecEnd; @@ -2087,6 +2099,9 @@ CommitTransaction(void) break; } + /* Pre-commit step for foreign transactions */ + PreCommit_FdwXact(); + CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_PRE_COMMIT : XACT_EVENT_PRE_COMMIT); @@ -2254,6 +2269,7 @@ CommitTransaction(void) AtEOXact_PgStat(true, is_parallel_worker); AtEOXact_Snapshot(true, false); AtEOXact_ApplyLauncher(true); + AtEOXact_FdwXact(true); pgstat_report_xact_timestamp(0); CurrentResourceOwner = NULL; @@ -2341,6 +2357,8 @@ PrepareTransaction(void) * the transaction-abort path. */ + AtPrepare_FdwXact(); + /* Shut down the deferred-trigger manager */ AfterTriggerEndXact(true); @@ -2532,6 +2550,9 @@ PrepareTransaction(void) */ PostPrepare_Twophase(); + /* Release held FdwXact entries */ + PostPrepare_FdwXact(); + /* PREPARE acts the same as COMMIT as far as GUC is concerned */ AtEOXact_GUC(true, 1); AtEOXact_SPI(true); @@ -2751,6 +2772,7 @@ AbortTransaction(void) AtEOXact_HashTables(false); AtEOXact_PgStat(false, is_parallel_worker); AtEOXact_ApplyLauncher(false); + AtEOXact_FdwXact(false); pgstat_report_xact_timestamp(0); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 55cac186dc..3449aa524a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -24,6 +24,7 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/heaptoast.h" #include "access/multixact.h" #include "access/rewriteheap.h" @@ -4599,6 +4600,7 @@ InitControlFile(uint64 sysidentifier) ControlFile->max_worker_processes = max_worker_processes; ControlFile->max_wal_senders = max_wal_senders; ControlFile->max_prepared_xacts = max_prepared_xacts; + 
ControlFile->max_prepared_foreign_xacts = max_prepared_foreign_xacts; ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = wal_level; ControlFile->wal_log_hints = wal_log_hints; @@ -6286,6 +6288,9 @@ CheckRequiredParameterValues(void) RecoveryRequiresIntParameter("max_wal_senders", max_wal_senders, ControlFile->max_wal_senders); + RecoveryRequiresIntParameter("max_prepared_foreign_transactions", + max_prepared_foreign_xacts, + ControlFile->max_prepared_foreign_xacts); RecoveryRequiresIntParameter("max_prepared_transactions", max_prepared_xacts, ControlFile->max_prepared_xacts); @@ -6836,14 +6841,15 @@ StartupXLOG(void) restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI); /* - * Before running in recovery, scan pg_twophase and fill in its status to - * be able to work on entries generated by redo. Doing a scan before - * taking any recovery action has the merit to discard any 2PC files that - * are newer than the first record to replay, saving from any conflicts at - * replay. This avoids as well any subsequent scans when doing recovery - * of the on-disk two-phase data. + * Before running in recovery, scan pg_twophase and pg_fdwxacts, and then + * fill in its status to be able to work on entries generated by redo. + * Doing a scan before taking any recovery action has the merit to discard + * any state files that are newer than the first record to replay, saving + * from any conflicts at replay. This avoids as well any subsequent scans + * when doing recovery of the on-disk two-phase or fdwxact data. 
*/ restoreTwoPhaseData(); + restoreFdwXactData(); lastFullPageWrites = checkPoint.fullPageWrites; @@ -7045,7 +7051,10 @@ StartupXLOG(void) InitRecoveryTransactionEnvironment(); if (wasShutdown) + { oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + oldestActiveXID = PrescanFdwXacts(oldestActiveXID); + } else oldestActiveXID = checkPoint.oldestActiveXid; Assert(TransactionIdIsValid(oldestActiveXID)); @@ -7558,6 +7567,7 @@ StartupXLOG(void) * as potential problems are detected before any on-disk change is done. */ oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); + oldestActiveXID = PrescanFdwXacts(oldestActiveXID); /* * Consider whether we need to assign a new timeline ID. @@ -7888,6 +7898,9 @@ StartupXLOG(void) /* Reload shared-memory state for prepared transactions */ RecoverPreparedTransactions(); + /* Load all foreign transaction entries from disk to memory */ + RecoverFdwXacts(); + /* * Shutdown the recovery environment. This must occur after * RecoverPreparedTransactions(), see notes for lock_twophase_recover() @@ -9183,6 +9196,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointReplicationOrigin(); /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); + CheckPointFdwXacts(checkPointRedo); } /* @@ -9712,8 +9726,10 @@ XLogReportParameters(void) max_worker_processes != ControlFile->max_worker_processes || max_wal_senders != ControlFile->max_wal_senders || max_prepared_xacts != ControlFile->max_prepared_xacts || + max_prepared_foreign_xacts != ControlFile->max_prepared_foreign_xacts || max_locks_per_xact != ControlFile->max_locks_per_xact || - track_commit_timestamp != ControlFile->track_commit_timestamp) + track_commit_timestamp != ControlFile->track_commit_timestamp || + max_prepared_foreign_xacts != ControlFile->max_prepared_foreign_xacts) { /* * The change in number of backend slots doesn't need to be WAL-logged @@ -9731,6 +9747,7 @@ XLogReportParameters(void) 
xlrec.max_worker_processes = max_worker_processes; xlrec.max_wal_senders = max_wal_senders; xlrec.max_prepared_xacts = max_prepared_xacts; + xlrec.max_prepared_foreign_xacts = max_prepared_foreign_xacts; xlrec.max_locks_per_xact = max_locks_per_xact; xlrec.wal_level = wal_level; xlrec.wal_log_hints = wal_log_hints; @@ -9749,6 +9766,7 @@ XLogReportParameters(void) ControlFile->max_worker_processes = max_worker_processes; ControlFile->max_wal_senders = max_wal_senders; ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_prepared_foreign_xacts = max_prepared_foreign_xacts; ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = wal_level; ControlFile->wal_log_hints = wal_log_hints; @@ -9956,6 +9974,7 @@ xlog_redo(XLogReaderState *record) RunningTransactionsData running; oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + oldestActiveXID = PrescanFdwXacts(oldestActiveXID); /* * Construct a RunningTransactions snapshot representing a shut @@ -10159,6 +10178,7 @@ xlog_redo(XLogReaderState *record) ControlFile->max_worker_processes = xlrec.max_worker_processes; ControlFile->max_wal_senders = xlrec.max_wal_senders; ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts; + ControlFile->max_prepared_foreign_xacts = xlrec.max_prepared_foreign_xacts; ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact; ControlFile->wal_level = xlrec.wal_level; ControlFile->wal_log_hints = xlrec.wal_log_hints; diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 5314e9348f..d1fded29ab 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -333,6 +333,9 @@ CREATE VIEW pg_prepared_xacts AS CREATE VIEW pg_prepared_statements AS SELECT * FROM pg_prepared_statement() AS P; +CREATE VIEW pg_foreign_xacts AS + SELECT * FROM pg_foreign_xacts() AS F; + CREATE VIEW pg_seclabels AS SELECT l.objoid, l.classoid, l.objsubid, diff --git 
a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 6d53dc463c..a1dea253c2 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2807,8 +2807,14 @@ CopyFrom(CopyState cstate) if (resultRelInfo->ri_FdwRoutine != NULL && resultRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL) + { + /* Remember the transaction modifies data on a foreign server*/ + RegisterFdwXactByRelId(RelationGetRelid(resultRelInfo->ri_RelationDesc), + true); + resultRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, resultRelInfo); + } /* Prepare to catch AFTER triggers. */ AfterTriggerBeginQuery(); diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index c002a61794..fd9be68abe 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -13,6 +13,8 @@ */ #include "postgres.h" +#include "access/fdwxact.h" +#include "access/heapam.h" #include "access/htup_details.h" #include "access/reloptions.h" #include "access/table.h" @@ -1076,6 +1078,18 @@ RemoveForeignServerById(Oid srvId) if (!HeapTupleIsValid(tp)) elog(ERROR, "cache lookup failed for foreign server %u", srvId); + /* + * If there is a foreign prepared transaction with this foreign server, + * dropping it might result in dangling prepared transaction. + */ + if (FdwXactExists(MyDatabaseId, srvId, InvalidOid)) + { + Form_pg_foreign_server srvForm = (Form_pg_foreign_server) GETSTRUCT(tp); + ereport(WARNING, + (errmsg("server \"%s\" has unresolved prepared transactions on it", + NameStr(srvForm->srvname)))); + } + CatalogTupleDelete(rel, &tp->t_self); ReleaseSysCache(tp); @@ -1396,6 +1410,15 @@ RemoveUserMapping(DropUserMappingStmt *stmt) user_mapping_ddl_aclcheck(useId, srv->serverid, srv->servername); + /* + * If there is a foreign prepared transaction with this user mapping, + * dropping it might result in dangling prepared transaction. 
+ */ + if (FdwXactExists(MyDatabaseId, srv->serverid, useId)) + ereport(WARNING, + (errmsg("server \"%s\" has unresolved prepared transaction for user \"%s\"", + srv->servername, MappingUserName(useId)))); + /* * Do the deletion */ @@ -1526,6 +1549,13 @@ ImportForeignSchema(ImportForeignSchemaStmt *stmt) errmsg("foreign-data wrapper \"%s\" does not support IMPORT FOREIGN SCHEMA", fdw->fdwname))); + /* + * Remember the transaction accesses to a foreign server. Normally during + * ImportForeignSchema we don't modify data on foreign servers, so remember it + * as not-modified server. + */ + RegisterFdwXactByServerId(server->serverid, false); + /* Call FDW to get a list of commands */ cmd_list = fdw_routine->ImportForeignSchema(stmt, server->serverid); diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index fb6ce49056..3fa8bfe09f 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/fdwxact.h" #include "access/table.h" #include "access/tableam.h" #include "catalog/partition.h" @@ -939,7 +940,14 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, */ if (partRelInfo->ri_FdwRoutine != NULL && partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL) + { + Relation child = partRelInfo->ri_RelationDesc; + + /* Remember the transaction modifies data on a foreign server*/ + RegisterFdwXactByRelId(RelationGetRelid(child), true); + partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo); + } partRelInfo->ri_PartitionInfo = partrouteinfo; partRelInfo->ri_CopyMultiInsertBuffer = NULL; diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c index 513471ab9b..29f376e48c 100644 --- a/src/backend/executor/nodeForeignscan.c +++ b/src/backend/executor/nodeForeignscan.c @@ -22,6 +22,8 @@ */ #include "postgres.h" +#include "access/fdwxact.h" +#include "access/xact.h" #include "executor/executor.h" #include 
"executor/nodeForeignscan.h" #include "foreign/fdwapi.h" @@ -224,9 +226,31 @@ ExecInitForeignScan(ForeignScan *node, EState *estate, int eflags) * Tell the FDW to initialize the scan. */ if (node->operation != CMD_SELECT) + { + RangeTblEntry *rte; + + rte = exec_rt_fetch(estate->es_result_relation_info->ri_RangeTableIndex, + estate); + + /* Remember the transaction modifies data on a foreign server*/ + RegisterFdwXactByRelId(rte->relid, true); + fdwroutine->BeginDirectModify(scanstate, eflags); + } else + { + RangeTblEntry *rte; + int rtindex = (scanrelid > 0) ? + scanrelid : + bms_next_member(node->fs_relids, -1); + + rte = exec_rt_fetch(rtindex, estate); + + /* Remember the transaction accesses to a foreign server */ + RegisterFdwXactByRelId(rte->relid, false); + fdwroutine->BeginForeignScan(scanstate, eflags); + } return scanstate; } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 1ec07bad07..e5dee94764 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -37,6 +37,7 @@ #include "postgres.h" +#include "access/fdwxact.h" #include "access/heapam.h" #include "access/htup_details.h" #include "access/tableam.h" @@ -47,6 +48,7 @@ #include "executor/executor.h" #include "executor/nodeModifyTable.h" #include "foreign/fdwapi.h" +#include "foreign/foreign.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "rewrite/rewriteHandler.h" @@ -2418,6 +2420,10 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_FdwRoutine->BeginForeignModify != NULL) { List *fdw_private = (List *) list_nth(node->fdwPrivLists, i); + Oid relid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* Remember the transaction modifies data on a foreign server*/ + RegisterFdwXactByRelId(relid, true); resultRelInfo->ri_FdwRoutine->BeginForeignModify(mtstate, resultRelInfo, diff --git a/src/backend/foreign/foreign.c b/src/backend/foreign/foreign.c index 
61e48ca3f8..8f411c0559 100644 --- a/src/backend/foreign/foreign.c +++ b/src/backend/foreign/foreign.c @@ -187,6 +187,49 @@ GetForeignServerByName(const char *srvname, bool missing_ok) return GetForeignServer(serverid); } +/* + * GetUserMappingOid - look up the user mapping by user mapping oid. + * + * If userid of the mapping is invalid, we set it to current userid. + */ +UserMapping * +GetUserMappingByOid(Oid umid) +{ + Datum datum; + HeapTuple tp; + UserMapping *um; + bool isnull; + Form_pg_user_mapping tableform; + + tp = SearchSysCache1(USERMAPPINGOID, + ObjectIdGetDatum(umid)); + + if (!HeapTupleIsValid(tp)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("user mapping not found for %d", umid))); + + tableform = (Form_pg_user_mapping) GETSTRUCT(tp); + um = (UserMapping *) palloc(sizeof(UserMapping)); + um->umid = umid; + um->userid = OidIsValid(tableform->umuser) ? + tableform->umuser : GetUserId(); + um->serverid = tableform->umserver; + + /* Extract the umoptions */ + datum = SysCacheGetAttr(USERMAPPINGUSERSERVER, + tp, + Anum_pg_user_mapping_umoptions, + &isnull); + if (isnull) + um->options = NIL; + else + um->options = untransformRelOptions(datum); + + ReleaseSysCache(tp); + + return um; +} /* * GetUserMapping - look up the user mapping. 
@@ -328,6 +371,18 @@ GetFdwRoutine(Oid fdwhandler) elog(ERROR, "foreign-data wrapper handler function %u did not return an FdwRoutine struct", fdwhandler); + /* Sanity check for transaction management callbacks */ + if ((routine->CommitForeignTransaction && !routine->RollbackForeignTransaction) || + (!routine->CommitForeignTransaction && routine->RollbackForeignTransaction)) + elog(ERROR, + "foreign-data wrapper must support both commit and rollback routines or neither"); + + if (routine->PrepareForeignTransaction && + !routine->CommitForeignTransaction && + !routine->RollbackForeignTransaction) + elog(ERROR, + "foreign-data wrapper that supports prepare routine must support both commit and rollback routines"); + return routine; } diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index beb5e85434..2258424e81 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -12,6 +12,8 @@ #include "postgres.h" +#include "access/fdwxact_launcher.h" +#include "access/fdwxact_resolver.h" #include "access/parallel.h" #include "libpq/pqsignal.h" #include "miscadmin.h" @@ -128,6 +130,12 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "FdwXactResolverMain", FdwXactResolverMain + }, + { + "FdwXactLauncherMain", FdwXactLauncherMain } }; diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index e96134dac8..4c15d7481a 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3663,6 +3663,12 @@ pgstat_get_wait_activity(WaitEventActivity w) case WAIT_EVENT_CHECKPOINTER_MAIN: event_name = "CheckpointerMain"; break; + case WAIT_EVENT_FDWXACT_RESOLVER_MAIN: + event_name = "FdwXactResolverMain"; + break; + case WAIT_EVENT_FDWXACT_LAUNCHER_MAIN: + event_name = "FdwXactLauncherMain"; + break; case WAIT_EVENT_LOGICAL_APPLY_MAIN: event_name = "LogicalApplyMain"; break; @@ -3773,6 +3779,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case 
WAIT_EVENT_EXECUTE_GATHER: event_name = "ExecuteGather"; break; + case WAIT_EVENT_FDWXACT_RESOLUTION: + event_name = "FdwXactResolution"; + break; case WAIT_EVENT_HASH_BATCH_ALLOCATE: event_name = "HashBatchAllocate"; break; @@ -4099,6 +4108,15 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_TWOPHASE_FILE_WRITE: event_name = "TwophaseFileWrite"; break; + case WAIT_EVENT_FDWXACT_FILE_WRITE: + event_name = "FdwXactFileWrite"; + break; + case WAIT_EVENT_FDWXACT_FILE_READ: + event_name = "FdwXactFileRead"; + break; + case WAIT_EVENT_FDWXACT_FILE_SYNC: + event_name = "FdwXactFileSync"; + break; case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ: event_name = "WALSenderTimelineHistoryRead"; break; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index b4d475bb0b..803ac09937 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -93,6 +93,8 @@ #include #endif +#include "access/fdwxact_resolver.h" +#include "access/fdwxact_launcher.h" #include "access/transam.h" #include "access/xlog.h" #include "bootstrap/bootstrap.h" @@ -909,6 +911,10 @@ PostmasterMain(int argc, char *argv[]) ereport(ERROR, (errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"replica\" or \"logical\""))); + if (max_prepared_foreign_xacts > 0 && max_foreign_xact_resolvers == 0) + ereport(ERROR, + (errmsg("preparing foreign transactions (max_prepared_foreign_transactions > 0) requires max_foreign_transaction_resolvers > 0"))); + /* * Other one-time internal sanity checks can go here, if they are fast. * (Put any slow processing further down, after postmaster.pid creation.) @@ -973,12 +979,13 @@ PostmasterMain(int argc, char *argv[]) LocalProcessControlFile(false); /* - * Register the apply launcher. Since it registers a background worker, - * it needs to be called before InitializeMaxBackends(), and it's probably - * a good idea to call it before any modules had chance to take the - * background worker slots. 
+ * Register the apply launcher and foreign transaction launcher. Since + * it registers a background worker, it needs to be called before + * InitializeMaxBackends(), and it's probably a good idea to call it + * before any modules had chance to take the background worker slots. */ ApplyLauncherRegister(); + FdwXactLauncherRegister(); /* * process any libraries that should be preloaded at postmaster start diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index c2e5e3abf8..9d34817f39 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -151,6 +151,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor case RM_COMMIT_TS_ID: case RM_REPLORIGIN_ID: case RM_GENERIC_ID: + case RM_FDWXACT_ID: /* just deal with xid, and done */ ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record), buf.origptr); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 427b0d59cd..55609eed81 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -16,6 +16,8 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/fdwxact.h" +#include "access/fdwxact_launcher.h" #include "access/heapam.h" #include "access/multixact.h" #include "access/nbtree.h" @@ -147,6 +149,8 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); + size = add_size(size, FdwXactShmemSize()); + size = add_size(size, FdwXactRslvShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -263,6 +267,8 @@ CreateSharedMemoryAndSemaphores(void) BTreeShmemInit(); SyncScanShmemInit(); AsyncShmemInit(); + FdwXactShmemInit(); + FdwXactRslvShmemInit(); #ifdef EXEC_BACKEND diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 3c2b369615..56c43cf741 
100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -94,6 +94,8 @@ typedef struct ProcArrayStruct TransactionId replication_slot_xmin; /* oldest catalog xmin of any replication slot */ TransactionId replication_slot_catalog_xmin; + /* local transaction id of oldest unresolved distributed transaction */ + TransactionId fdwxact_unresolved_xmin; /* indexes into allPgXact[], has PROCARRAY_MAXPROCS entries */ int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; @@ -249,6 +251,7 @@ CreateSharedProcArray(void) procArray->lastOverflowedXid = InvalidTransactionId; procArray->replication_slot_xmin = InvalidTransactionId; procArray->replication_slot_catalog_xmin = InvalidTransactionId; + procArray->fdwxact_unresolved_xmin = InvalidTransactionId; } allProcs = ProcGlobal->allProcs; @@ -1311,6 +1314,7 @@ GetOldestXmin(Relation rel, int flags) TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + TransactionId fdwxact_unresolved_xmin = InvalidTransactionId; /* * If we're not computing a relation specific limit, or if a shared @@ -1376,6 +1380,7 @@ GetOldestXmin(Relation rel, int flags) */ replication_slot_xmin = procArray->replication_slot_xmin; replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + fdwxact_unresolved_xmin = procArray->fdwxact_unresolved_xmin; if (RecoveryInProgress()) { @@ -1425,6 +1430,15 @@ GetOldestXmin(Relation rel, int flags) NormalTransactionIdPrecedes(replication_slot_xmin, result)) result = replication_slot_xmin; + /* + * Check whether there are unresolved distributed transaction + * requiring an older xmin. 
+ */ + if (!(flags & PROCARRAY_FDWXACT_XMIN) && + TransactionIdIsValid(fdwxact_unresolved_xmin) && + NormalTransactionIdPrecedes(fdwxact_unresolved_xmin, result)) + result = fdwxact_unresolved_xmin; + /* * After locks have been released and vacuum_defer_cleanup_age has been * applied, check whether we need to back up further to make logical @@ -3125,6 +3139,38 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin, LWLockRelease(ProcArrayLock); } +/* + * ProcArraySetFdwXactUnresolvedXmin + * + * Install a limit on future computations of the xmin horizon, to prevent + * vacuum from removing clog for transactions still needed to resolve + * distributed transactions. Takes ProcArrayLock in exclusive mode. + */ +void +ProcArraySetFdwXactUnresolvedXmin(TransactionId xmin) +{ + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + procArray->fdwxact_unresolved_xmin = xmin; + LWLockRelease(ProcArrayLock); +} + +/* + * ProcArrayGetFdwXactUnresolvedXmin + * + * Return the current unresolved xmin limit, read under ProcArrayLock. + */ +TransactionId +ProcArrayGetFdwXactUnresolvedXmin(void) +{ + TransactionId xmin; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + xmin = procArray->fdwxact_unresolved_xmin; + LWLockRelease(ProcArrayLock); + + return xmin; +} #define XidCacheRemove(i) \ do { \ diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index e6985e8eed..241b099238 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,3 +50,6 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 XactTruncationLock 44 +FdwXactLock 45 +FdwXactResolverLock 46 +FdwXactResolutionLock 47 diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e57fcd2538..470d0da3d1 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -35,6 +35,7 @@ #include #include +#include "access/fdwxact.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" @@ -421,6 +422,10 @@ 
InitProcess(void) MyProc->syncRepState = SYNC_REP_NOT_WAITING; SHMQueueElemInit(&(MyProc->syncRepLinks)); + /* Initialize fields for fdw xact */ + MyProc->fdwXactState = FDWXACT_NOT_WAITING; + SHMQueueElemInit(&(MyProc->fdwXactLinks)); + /* Initialize fields for group XID clearing. */ MyProc->procArrayGroupMember = false; MyProc->procArrayGroupMemberXid = InvalidTransactionId; @@ -822,6 +827,9 @@ ProcKill(int code, Datum arg) /* Make sure we're out of the sync rep lists */ SyncRepCleanupAtProcExit(); + /* Make sure we're out of the fdwxact lists */ + FdwXactCleanupAtProcExit(); + #ifdef USE_ASSERT_CHECKING { int i; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c9424f167c..f6da103fbd 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -36,6 +36,8 @@ #include "rusagestub.h" #endif +#include "access/fdwxact_resolver.h" +#include "access/fdwxact_launcher.h" #include "access/parallel.h" #include "access/printtup.h" #include "access/xact.h" @@ -3054,6 +3056,18 @@ ProcessInterrupts(void) */ proc_exit(1); } + else if (IsFdwXactResolver()) + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("terminating foreign transaction resolver due to administrator command"))); + else if (IsFdwXactLauncher()) + { + /* + * The foreign transaction launcher can be stopped at any time. + * Use exit status 1 so the background worker is restarted. 
+ */ + proc_exit(1); + } else if (RecoveryConflictPending && RecoveryConflictRetryable) { pgstat_report_recovery_conflict(RecoveryConflictReason); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 75fc6f11d6..72fe0a7167 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -27,6 +27,7 @@ #endif #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/gin.h" #include "access/rmgr.h" #include "access/tableam.h" @@ -426,6 +427,24 @@ static const struct config_enum_entry synchronous_commit_options[] = { {NULL, 0, false} }; +/* + * Although only "required" and "disabled" are documented, we accept all + * the likely variants of "on" and "off". + */ +static const struct config_enum_entry foreign_twophase_commit_options[] = { + {"required", FOREIGN_TWOPHASE_COMMIT_REQUIRED, false}, + {"disabled", FOREIGN_TWOPHASE_COMMIT_DISABLED, false}, + {"on", FOREIGN_TWOPHASE_COMMIT_REQUIRED, false}, + {"off", FOREIGN_TWOPHASE_COMMIT_DISABLED, false}, + {"true", FOREIGN_TWOPHASE_COMMIT_REQUIRED, true}, + {"false", FOREIGN_TWOPHASE_COMMIT_DISABLED, true}, + {"yes", FOREIGN_TWOPHASE_COMMIT_REQUIRED, true}, + {"no", FOREIGN_TWOPHASE_COMMIT_DISABLED, true}, + {"1", FOREIGN_TWOPHASE_COMMIT_REQUIRED, true}, + {"0", FOREIGN_TWOPHASE_COMMIT_DISABLED, true}, + {NULL, 0, false} +}; + /* * Although only "on", "off", "try" are documented, we accept all the likely * variants of "on" and "off". 
@@ -754,6 +773,10 @@ const char *const config_group_names[] = gettext_noop("Client Connection Defaults / Other Defaults"), /* LOCK_MANAGEMENT */ gettext_noop("Lock Management"), + /* FOREIGN_TRANSACTION */ + gettext_noop("Foreign Transaction"), + /* FOREIGN_TRANSACTION_RESOLVER */ + gettext_noop("Foreign Transaction / Resolver"), /* COMPAT_OPTIONS */ gettext_noop("Version and Platform Compatibility"), /* COMPAT_OPTIONS_PREVIOUS */ @@ -2452,6 +2475,52 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + /* + * See also CheckRequiredParameterValues() if this parameter changes + */ + { + {"max_prepared_foreign_transactions", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Sets the maximum number of simultaneously prepared transactions on foreign servers."), + NULL + }, + &max_prepared_foreign_xacts, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + {"foreign_transaction_resolver_timeout", PGC_SIGHUP, FOREIGN_TRANSACTION_RESOLVER, + gettext_noop("Sets the maximum time to wait for foreign transaction resolution."), + NULL, + GUC_UNIT_MS + }, + &foreign_xact_resolver_timeout, + 60 * 1000, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + {"max_foreign_transaction_resolvers", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Maximum number of foreign transaction resolution processes."), + NULL + }, + &max_foreign_xact_resolvers, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + {"foreign_transaction_resolution_retry_interval", PGC_SIGHUP, FOREIGN_TRANSACTION_RESOLVER, + gettext_noop("Sets the time to wait before retrying to resolve foreign transaction " + "after a failed attempt."), + NULL, + GUC_UNIT_MS + }, + &foreign_xact_resolution_retry_interval, + 5000, 1, INT_MAX, + NULL, NULL, NULL + }, + #ifdef LOCK_DEBUG { {"trace_lock_oidmin", PGC_SUSET, DEVELOPER_OPTIONS, @@ -4580,6 +4649,16 @@ static struct config_enum ConfigureNamesEnum[] = NULL, assign_synchronous_commit, NULL }, + { + {"foreign_twophase_commit", PGC_USERSET, FOREIGN_TRANSACTION, + gettext_noop("Use 
of foreign twophase commit for the current transaction."), + NULL + }, + &foreign_twophase_commit, + FOREIGN_TWOPHASE_COMMIT_DISABLED, foreign_twophase_commit_options, + NULL, NULL, NULL + }, + { {"archive_mode", PGC_POSTMASTER, WAL_ARCHIVING, gettext_noop("Allows archiving of WAL files using archive_command."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 3a25287a39..5ed8617787 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -125,6 +125,8 @@ #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) +#max_prepared_foreign_transactions = 0 # zero disables the feature + # (change requires restart) # Caution: it is not advisable to set max_prepared_transactions nonzero unless # you actively intend to use prepared transactions. #work_mem = 4MB # min 64kB @@ -344,6 +346,20 @@ #max_sync_workers_per_subscription = 2 # taken from max_logical_replication_workers +#------------------------------------------------------------------------------ +# FOREIGN TRANSACTION +#------------------------------------------------------------------------------ + +#max_foreign_transaction_resolvers = 0 # max number of resolver process + # (change requires restart) +#foreign_transaction_resolver_timeout = 60s # in milliseconds; 0 disables +#foreign_transaction_resolution_retry_interval = 5s # time to wait before + # retrying to resolve + # foreign transactions + # after a failed attempt +#foreign_twophase_commit = disabled # use two-phase commit for distributed transactions: + # disabled or required + #------------------------------------------------------------------------------ # QUERY TUNING #------------------------------------------------------------------------------ diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d index a0b0458108..8701c5f005 100644 --- 
a/src/backend/utils/probes.d +++ b/src/backend/utils/probes.d @@ -81,6 +81,8 @@ provider postgresql { probe multixact__checkpoint__done(bool); probe twophase__checkpoint__start(); probe twophase__checkpoint__done(); + probe fdwxact__checkpoint__start(); + probe fdwxact__checkpoint__done(); probe smgr__md__read__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); probe smgr__md__read__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 786672b1b6..bc0c12b3b8 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -208,6 +208,7 @@ static const char *const subdirs[] = { "pg_snapshots", "pg_subtrans", "pg_twophase", + "pg_fdwxact", "pg_multixact", "pg_multixact/members", "pg_multixact/offsets", diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index e73639df74..3041c39bc0 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -302,6 +302,8 @@ main(int argc, char *argv[]) ControlFile->max_wal_senders); printf(_("max_prepared_xacts setting: %d\n"), ControlFile->max_prepared_xacts); + printf(_("max_prepared_foreign_transactions setting: %d\n"), + ControlFile->max_prepared_foreign_xacts); printf(_("max_locks_per_xact setting: %d\n"), ControlFile->max_locks_per_xact); printf(_("track_commit_timestamp setting: %s\n"), diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 233441837f..b040202043 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -710,6 +710,7 @@ GuessControlValues(void) ControlFile.max_wal_senders = 10; ControlFile.max_worker_processes = 8; ControlFile.max_prepared_xacts = 0; + ControlFile.max_prepared_foreign_xacts = 0; ControlFile.max_locks_per_xact = 64; ControlFile.maxAlign = MAXIMUM_ALIGNOF; @@ -914,6 +915,7 @@ RewriteControlFile(void) ControlFile.max_wal_senders = 10; ControlFile.max_worker_processes 
= 8; ControlFile.max_prepared_xacts = 0; + ControlFile.max_prepared_foreign_xacts = 0; ControlFile.max_locks_per_xact = 64; /* The control file gets flushed here. */ diff --git a/src/bin/pg_waldump/fdwxactdesc.c b/src/bin/pg_waldump/fdwxactdesc.c new file mode 120000 index 0000000000..ce8c21880c --- /dev/null +++ b/src/bin/pg_waldump/fdwxactdesc.c @@ -0,0 +1 @@ +../../../src/backend/access/rmgrdesc/fdwxactdesc.c \ No newline at end of file diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 852d8ca4b1..b616cea347 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -11,6 +11,7 @@ #include "access/brin_xlog.h" #include "access/clog.h" #include "access/commit_ts.h" +#include "access/fdwxact_xlog.h" #include "access/generic_xlog.h" #include "access/ginxlog.h" #include "access/gistxlog.h" diff --git a/src/include/access/fdwxact.h b/src/include/access/fdwxact.h new file mode 100644 index 0000000000..d550ee9b87 --- /dev/null +++ b/src/include/access/fdwxact.h @@ -0,0 +1,164 @@ +/* + * fdwxact.h + * + * PostgreSQL global transaction manager + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/include/access/fdwxact.h + */ +#ifndef FDWXACT_H +#define FDWXACT_H + +#include "access/fdwxact_xlog.h" +#include "access/xlogreader.h" +#include "foreign/foreign.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" +#include "nodes/pg_list.h" +#include "nodes/execnodes.h" +#include "storage/backendid.h" +#include "storage/proc.h" +#include "storage/shmem.h" +#include "utils/guc.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* fdwXactState */ +#define FDWXACT_NOT_WAITING 0 +#define FDWXACT_WAITING 1 +#define FDWXACT_WAIT_COMPLETE 2 + +/* Flag passed to FDW transaction management APIs */ +#define FDWXACT_FLAG_ONEPHASE 0x01 /* transaction can commit/rollback + * without preparation */ + +/* Enum for foreign_twophase_commit parameter */ +typedef enum +{ + 
FOREIGN_TWOPHASE_COMMIT_DISABLED, /* disable foreign twophase commit */ + FOREIGN_TWOPHASE_COMMIT_REQUIRED /* all foreign servers have to support + * twophase commit */ +} ForeignTwophaseCommitLevel; + +/* Enum to track the status of foreign transaction */ +typedef enum +{ + FDWXACT_STATUS_INVALID = 0, + FDWXACT_STATUS_PREPARING, /* foreign transaction is being prepared */ + FDWXACT_STATUS_PREPARED, /* foreign transaction is prepared */ + FDWXACT_STATUS_COMMITTING, /* foreign prepared transaction is committed */ + FDWXACT_STATUS_ABORTING /* foreign prepared transaction is aborted */ +} FdwXactStatus; + +typedef struct FdwXactData *FdwXact; + +/* + * Shared memory state of a single foreign transaction. + */ +typedef struct FdwXactData +{ + FdwXact fdwxact_free_next; /* Next free FdwXact entry */ + + TransactionId local_xid; /* XID of local transaction */ + PGPROC *owner; /* process that executed the distributed tx. */ + + /* Information relevant with foreign transaction */ + Oid dbid; + Oid serverid; + Oid userid; + Oid umid; + + /* Foreign transaction status */ + FdwXactStatus status; + slock_t mutex; /* protect the above field */ + + /* + * Note that we need to keep track of two LSNs for each FdwXact. We keep + * track of the start LSN because this is the address we must use to read + * state data back from WAL when committing a FdwXact. We keep track of + * the end LSN because that is the LSN we need to wait for prior to + * commit. + */ + XLogRecPtr insert_start_lsn; /* XLOG offset of inserting this entry + * start */ + XLogRecPtr insert_end_lsn; /* XLOG offset of inserting this entry end */ + + bool valid; /* has the entry been complete and written to + * file? 
*/ + BackendId locking_backend; /* backend currently working on the fdw xact */ + bool ondisk; /* true if prepare state file is on disk */ + bool inredo; /* true if entry was added via xlog_redo */ + + char fdwxact_id[FDWXACT_ID_MAX_LEN]; /* prepared transaction + * identifier */ +} FdwXactData; + +/* + * Shared memory layout for maintaining foreign prepared transaction entries. + * Adding or removing an FdwXact entry requires holding FdwXactLock in exclusive + * mode, and iterating over fdwxacts requires holding it in shared mode. + */ +typedef struct +{ + /* Head of linked list of free FdwXactData structs */ + FdwXact free_fdwxacts; + + /* Number of valid foreign transaction entries */ + int num_fdwxacts; + + /* Up to max_prepared_foreign_xacts entries in the array */ + FdwXact fdwxacts[FLEXIBLE_ARRAY_MEMBER]; /* Variable length array */ +} FdwXactCtlData; + +/* Shared foreign transaction data. NOTE(review): defined (not extern) in header */ +FdwXactCtlData *FdwXactCtl; + +/* State data for foreign transaction resolution, passed to FDW callbacks */ +typedef struct FdwXactRslvState +{ + TransactionId xid; + + /* Foreign transaction information */ + char *fdwxact_id; + ForeignServer *server; + UserMapping *usermapping; + + int flags; /* OR of FDWXACT_FLAG_xx flags */ +} FdwXactRslvState; + +/* GUC parameters */ +extern int max_prepared_foreign_xacts; +extern int max_foreign_xact_resolvers; +extern int foreign_xact_resolution_retry_interval; +extern int foreign_xact_resolver_timeout; +extern int foreign_twophase_commit; + +/* Function declarations */ +extern Size FdwXactShmemSize(void); +extern void FdwXactShmemInit(void); +extern void RegisterFdwXactByRelId(Oid relid, bool modified); +extern void RegisterFdwXactByServerId(Oid serverid, bool modified); +extern void ForgetAllFdwXactParticipants(void); +extern void FdwXactReleaseWaiter(PGPROC *waiter); +extern void FdwXactWaitForResolution(TransactionId wait_xid, bool commit); +extern void FdwXactResolveFdwXacts(int *fdwxact_idxs, int nfdwxacts, 
PGPROC *waiter); +extern PGPROC *FdwXactGetWaiter(TimestampTz now, TimestampTz *nextResolutionTs_p, + TransactionId *waitXid_p); +extern bool FdwXactWaiterExists(Oid dbid); +extern bool PrepareFdwXactParticipants(TransactionId xid); +extern void SetFdwXactParticipants(TransactionId xid); +extern void ClearFdwXactParticipants(void); +extern void PreCommit_FdwXact(void); +extern void AtEOXact_FdwXact(bool is_commit); +extern void AtPrepare_FdwXact(void); +extern void PostPrepare_FdwXact(void); +extern void FdwXactCleanupAtProcExit(void); +extern void restoreFdwXactData(void); +extern TransactionId PrescanFdwXacts(TransactionId oldestActiveXid); +extern void RecoverFdwXacts(void); +extern bool FdwXactExists(Oid dboid, Oid serverid, Oid userid); +extern void CheckPointFdwXacts(XLogRecPtr redo_horizon); +extern bool FdwXactIsForeignTwophaseCommitRequired(void); + +#endif /* FDWXACT_H */ diff --git a/src/include/access/fdwxact_launcher.h b/src/include/access/fdwxact_launcher.h new file mode 100644 index 0000000000..688b43b8d0 --- /dev/null +++ b/src/include/access/fdwxact_launcher.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * fdwxact_launcher.h + * PostgreSQL foreign transaction launcher definitions + * + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/include/access/fdwxact_launcher.h + * + *------------------------------------------------------------------------- + */ + +#ifndef FDWXACT_LAUNCHER_H +#define FDWXACT_LAUNCHER_H + +#include "access/fdwxact.h" + +extern void FdwXactLauncherRegister(void); +extern void FdwXactLauncherMain(Datum main_arg); +extern void FdwXactLauncherRequestToLaunch(void); +extern void FdwXactLaunchOrWakeupResolver(void); +extern Size FdwXactRslvShmemSize(void); +extern void FdwXactRslvShmemInit(void); +extern bool IsFdwXactLauncher(void); + + +#endif /* FDWXACT_LAUNCHER_H */ diff --git a/src/include/access/fdwxact_resolver.h 
b/src/include/access/fdwxact_resolver.h new file mode 100644 index 0000000000..779848113c --- /dev/null +++ b/src/include/access/fdwxact_resolver.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * fdwxact_resolver.h + * PostgreSQL foreign transaction resolver definitions + * + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/include/access/fdwxact_resolver.h + * + *------------------------------------------------------------------------- + */ +#ifndef FDWXACT_RESOLVER_H +#define FDWXACT_RESOLVER_H + +#include "access/fdwxact.h" + +extern void FdwXactResolverMain(Datum main_arg); +extern bool IsFdwXactResolver(void); + +extern int foreign_xact_resolver_timeout; + +#endif /* FDWXACT_RESOLVER_H */ diff --git a/src/include/access/fdwxact_xlog.h b/src/include/access/fdwxact_xlog.h new file mode 100644 index 0000000000..b4cec76eae --- /dev/null +++ b/src/include/access/fdwxact_xlog.h @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * fdwxact_xlog.h + * Foreign transaction XLOG definitions. 
+ * + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/include/access/fdwxact_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef FDWXACT_XLOG_H +#define FDWXACT_XLOG_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" + +/* Info types for logs related to FDW transactions */ +#define XLOG_FDWXACT_INSERT 0x00 +#define XLOG_FDWXACT_REMOVE 0x10 + +/* Maximum length of the prepared transaction id, borrowed from twophase.c */ +#define FDWXACT_ID_MAX_LEN 200 + +/* + * On-disk file structure, also used for WAL + */ +typedef struct +{ + TransactionId local_xid; + Oid dbid; /* database oid in which to find the foreign server + * and user mapping */ + Oid serverid; /* foreign server where transaction takes + * place */ + Oid userid; /* user who initiated the foreign transaction */ + Oid umid; + char fdwxact_id[FDWXACT_ID_MAX_LEN]; /* foreign txn prepare id */ +} FdwXactOnDiskData; + +typedef struct xl_fdwxact_remove +{ + TransactionId xid; + Oid serverid; + Oid userid; + Oid dbid; + bool force; +} xl_fdwxact_remove; + +extern void fdwxact_redo(XLogReaderState *record); +extern void fdwxact_desc(StringInfo buf, XLogReaderState *record); +extern const char *fdwxact_identify(uint8 info); + +#endif /* FDWXACT_XLOG_H */ diff --git a/src/include/access/resolver_internal.h b/src/include/access/resolver_internal.h new file mode 100644 index 0000000000..c935471936 --- /dev/null +++ b/src/include/access/resolver_internal.h @@ -0,0 +1,63 @@ +/*------------------------------------------------------------------------- + * + * resolver_internal.h + * Internal headers shared by fdwxact resolvers. 
+ * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/include/access/resolver_internal.h + * + *------------------------------------------------------------------------- + */ + +#ifndef RESOLVER_INTERNAL_H +#define RESOLVER_INTERNAL_H + +#include "storage/latch.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/timestamp.h" + +/* + * Each foreign transaction resolver has a FdwXactResolver struct in + * shared memory. This struct is protected by FdwXactResolverLaunchLock. + */ +typedef struct FdwXactResolver +{ + pid_t pid; /* this resolver's PID, or 0 if not active */ + Oid dbid; /* database oid */ + + /* Indicates if this slot is used or free */ + bool in_use; + + /* Protects the shared variables shown above */ + slock_t mutex; + + /* + * Pointer to the resolver's latch. Used by backends to wake up this + * resolver when it has work to do. NULL if the resolver isn't active. + */ + Latch *latch; +} FdwXactResolver; + +/* There is one FdwXactRslvCtlData struct for the whole database cluster */ +typedef struct FdwXactRslvCtlData +{ + /* Foreign transaction resolution queue. 
Protected by FdwXactLock */ + SHM_QUEUE fdwxact_queue; + + /* Supervisor process and latch */ + pid_t launcher_pid; + Latch *launcher_latch; + + FdwXactResolver resolvers[FLEXIBLE_ARRAY_MEMBER]; +} FdwXactRslvCtlData; +#define SizeOfFdwXactRslvCtlData \ + (offsetof(FdwXactRslvCtlData, resolvers) + sizeof(FdwXactResolver)) + +extern FdwXactRslvCtlData *FdwXactRslvCtl; + +extern FdwXactResolver *MyFdwXactResolver; +extern FdwXactRslvCtlData *FdwXactRslvCtl; + +#endif /* RESOLVER_INTERNAL_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 6c15df7e70..986bc73566 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) +PG_RMGR(RM_FDWXACT_ID, "Fdw Transaction", fdwxact_redo, fdwxact_desc, fdwxact_identify, NULL, NULL, NULL) diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 2ca71c3445..bd027a2861 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -36,6 +36,7 @@ extern void PostPrepare_Twophase(void); extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid, bool lock_held); extern BackendId TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held); +extern bool TwoPhaseExists(TransactionId xid); extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, TimestampTz prepared_at, diff --git a/src/include/access/xact.h b/src/include/access/xact.h index c096120c94..7a5d00ddb9 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -108,6 +108,13 @@ extern int MyXactFlags; */ #define 
XACT_FLAGS_WROTENONTEMPREL (1U << 2) +/* + * XACT_FLAGS_FDWNOPREPARE - set when we wrote data on a foreign table whose + * server isn't capable of two-phase + * commit. + */ +#define XACT_FLAGS_FDWNOPREPARE (1U << 3) + /* * start- and end-of-transaction callbacks for dynamically loaded modules */ diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index c8869d5226..da0d442f1b 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -232,6 +232,7 @@ typedef struct xl_parameter_change int max_worker_processes; int max_wal_senders; int max_prepared_xacts; + int max_prepared_foreign_xacts; int max_locks_per_xact; int wal_level; bool wal_log_hints; diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index de5670e538..9884f5f8e7 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -179,6 +179,7 @@ typedef struct ControlFileData int max_worker_processes; int max_wal_senders; int max_prepared_xacts; + int max_prepared_foreign_xacts; int max_locks_per_xact; bool track_commit_timestamp; diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 61f2c2f5b4..90bf2d495b 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5981,6 +5981,24 @@ proargnames => '{type,object_names,object_args,classid,objid,objsubid}', prosrc => 'pg_get_object_address' }, +{ oid => '9706', descr => 'view foreign transactions', + proname => 'pg_foreign_xacts', prorows => '1000', proretset => 't', + provolatile => 'v', prorettype => 'record', proargtypes => '', + proallargtypes => '{xid,oid,oid,text,bool,text}', + proargmodes => '{o,o,o,o,o,o}', + proargnames => '{xid,serverid,userid,state,in_doubt,identifier}', + prosrc => 'pg_foreign_xacts' }, +{ oid => '9707', descr => 'remove foreign transaction without resolution', + proname => 'pg_remove_foreign_xact', provolatile => 'v', prorettype => 'bool', + 
proargtypes => 'xid oid oid', + proargnames => '{xid,serverid,userid}', + prosrc => 'pg_remove_foreign_xact' }, +{ oid => '9708', descr => 'resolve one foreign transaction', + proname => 'pg_resolve_foreign_xact', provolatile => 'v', prorettype => 'bool', + proargtypes => 'xid oid oid', + proargnames => '{xid,serverid,userid}', + prosrc => 'pg_resolve_foreign_xact' }, + { oid => '2079', descr => 'is table visible in search path?', proname => 'pg_table_is_visible', procost => '10', provolatile => 's', prorettype => 'bool', proargtypes => 'oid', prosrc => 'pg_table_is_visible' }, @@ -6099,6 +6117,10 @@ { oid => '2851', descr => 'wal filename, given a wal location', proname => 'pg_walfile_name', prorettype => 'text', proargtypes => 'pg_lsn', prosrc => 'pg_walfile_name' }, +{ oid => '9709', + descr => 'stop a foreign transaction resolver process running on the given database', + proname => 'pg_stop_foreign_xact_resolver', provolatile => 'v', prorettype => 'bool', + proargtypes => 'oid', prosrc => 'pg_stop_foreign_xact_resolver'}, { oid => '3165', descr => 'difference in bytes, given two wal locations', proname => 'pg_wal_lsn_diff', prorettype => 'numeric', diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index 95556dfb15..8d046cc4e4 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -12,6 +12,7 @@ #ifndef FDWAPI_H #define FDWAPI_H +#include "access/fdwxact.h" #include "access/parallel.h" #include "nodes/execnodes.h" #include "nodes/pathnodes.h" @@ -169,6 +170,11 @@ typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root, typedef List *(*ReparameterizeForeignPathByChild_function) (PlannerInfo *root, List *fdw_private, RelOptInfo *child_rel); +typedef void (*PrepareForeignTransaction_function) (FdwXactRslvState *frstate); +typedef void (*CommitForeignTransaction_function) (FdwXactRslvState *frstate); +typedef void (*RollbackForeignTransaction_function) (FdwXactRslvState *frstate); +typedef char 
*(*GetPrepareId_function) (TransactionId xid, Oid serverid, + Oid userid, int *prep_id_len); /* * FdwRoutine is the struct returned by a foreign-data wrapper's handler @@ -236,6 +242,12 @@ typedef struct FdwRoutine /* Support functions for IMPORT FOREIGN SCHEMA */ ImportForeignSchema_function ImportForeignSchema; + /* Support functions for transaction management */ + PrepareForeignTransaction_function PrepareForeignTransaction; + CommitForeignTransaction_function CommitForeignTransaction; + RollbackForeignTransaction_function RollbackForeignTransaction; + GetPrepareId_function GetPrepareId; + /* Support functions for parallelism under Gather node */ IsForeignScanParallelSafe_function IsForeignScanParallelSafe; EstimateDSMForeignScan_function EstimateDSMForeignScan; diff --git a/src/include/foreign/foreign.h b/src/include/foreign/foreign.h index 5e0cf533fb..5596ee591c 100644 --- a/src/include/foreign/foreign.h +++ b/src/include/foreign/foreign.h @@ -69,6 +69,7 @@ extern ForeignServer *GetForeignServerExtended(Oid serverid, bits16 flags); extern ForeignServer *GetForeignServerByName(const char *name, bool missing_ok); extern UserMapping *GetUserMapping(Oid userid, Oid serverid); +extern UserMapping *GetUserMappingByOid(Oid umid); extern ForeignDataWrapper *GetForeignDataWrapper(Oid fdwid); extern ForeignDataWrapper *GetForeignDataWrapperExtended(Oid fdwid, bits16 flags); diff --git a/src/include/pgstat.h b/src/include/pgstat.h index c55dc1481c..2186c1c5d0 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -806,6 +806,8 @@ typedef enum WAIT_EVENT_BGWRITER_HIBERNATE, WAIT_EVENT_BGWRITER_MAIN, WAIT_EVENT_CHECKPOINTER_MAIN, + WAIT_EVENT_FDWXACT_RESOLVER_MAIN, + WAIT_EVENT_FDWXACT_LAUNCHER_MAIN, WAIT_EVENT_LOGICAL_APPLY_MAIN, WAIT_EVENT_LOGICAL_LAUNCHER_MAIN, WAIT_EVENT_PGSTAT_MAIN, @@ -853,6 +855,7 @@ typedef enum WAIT_EVENT_CHECKPOINT_DONE, WAIT_EVENT_CHECKPOINT_START, WAIT_EVENT_EXECUTE_GATHER, + WAIT_EVENT_FDWXACT_RESOLUTION, 
WAIT_EVENT_HASH_BATCH_ALLOCATE, WAIT_EVENT_HASH_BATCH_ELECT, WAIT_EVENT_HASH_BATCH_LOAD, @@ -969,6 +972,9 @@ typedef enum WAIT_EVENT_TWOPHASE_FILE_READ, WAIT_EVENT_TWOPHASE_FILE_SYNC, WAIT_EVENT_TWOPHASE_FILE_WRITE, + WAIT_EVENT_FDWXACT_FILE_READ, + WAIT_EVENT_FDWXACT_FILE_WRITE, + WAIT_EVENT_FDWXACT_FILE_SYNC, WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, WAIT_EVENT_WAL_BOOTSTRAP_SYNC, WAIT_EVENT_WAL_BOOTSTRAP_WRITE, diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index b20e2ad4f6..5bc4c78ace 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -16,6 +16,7 @@ #include "access/clog.h" #include "access/xlogdefs.h" +#include "datatype/timestamp.h" #include "lib/ilist.h" #include "storage/latch.h" #include "storage/lock.h" @@ -161,6 +162,16 @@ struct PGPROC int syncRepState; /* wait state for sync rep */ SHM_QUEUE syncRepLinks; /* list link if process is in syncrep queue */ + /* + * Info to allow us to wait for a foreign transaction to be resolved, if + * needed. + */ + TransactionId fdwXactWaitXid; /* waiting for foreign transaction involved with + * this transaction id to be resolved */ + int fdwXactState; /* wait state for foreign transaction resolution */ + SHM_QUEUE fdwXactLinks; /* list link if process is in queue */ + TimestampTz fdwXactNextResolutionTs; + /* * All PROCLOCK objects for locks held or awaited by this backend are * linked into one of these lists, according to the partition number of diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index a5c7d0c064..0f73b64937 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -36,6 +36,8 @@ #define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, * catalog_xmin */ +#define PROCARRAY_FDWXACT_XMIN 0x40 /* unresolved distributed + transaction xmin */ /* * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching * PGXACT->vacuumFlags. 
Other flags are used for different purposes and @@ -125,4 +127,7 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, TransactionId *catalog_xmin); + +extern void ProcArraySetFdwXactUnresolvedXmin(TransactionId xmin); +extern TransactionId ProcArrayGetFdwXactUnresolvedXmin(void); #endif /* PROCARRAY_H */ diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 454c2df487..f977ca43d4 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -96,6 +96,8 @@ enum config_group CLIENT_CONN_PRELOAD, CLIENT_CONN_OTHER, LOCK_MANAGEMENT, + FOREIGN_TRANSACTION, + FOREIGN_TRANSACTION_RESOLVER, COMPAT_OPTIONS, COMPAT_OPTIONS_PREVIOUS, COMPAT_OPTIONS_CLIENT, diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index b813e32215..d658791549 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1342,6 +1342,13 @@ pg_file_settings| SELECT a.sourcefile, a.applied, a.error FROM pg_show_all_file_settings() a(sourcefile, sourceline, seqno, name, setting, applied, error); +pg_foreign_xacts| SELECT f.xid, + f.serverid, + f.userid, + f.state, + f.in_doubt, + f.identifier + FROM pg_foreign_xacts() f(xid, serverid, userid, state, in_doubt, identifier); pg_group| SELECT pg_authid.rolname AS groname, pg_authid.oid AS grosysid, ARRAY( SELECT pg_auth_members.member -- 2.23.0