From b10f2d435242fab190c64fcd2f1c4b88b37ad460 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Wed, 20 Nov 2019 16:41:13 +0530 Subject: [PATCH v27 09/14] Enable streaming for all subscription TAP tests --- src/test/subscription/t/001_rep_changes.pl | 2 +- src/test/subscription/t/002_types.pl | 2 +- src/test/subscription/t/003_constraints.pl | 2 +- src/test/subscription/t/004_sync.pl | 8 ++++---- src/test/subscription/t/005_encoding.pl | 2 +- src/test/subscription/t/006_rewrite.pl | 2 +- src/test/subscription/t/007_ddl.pl | 2 +- src/test/subscription/t/008_diff_schema.pl | 2 +- src/test/subscription/t/009_matviews.pl | 2 +- src/test/subscription/t/009_stream_simple.pl | 2 +- src/test/subscription/t/010_stream_subxact.pl | 2 +- src/test/subscription/t/010_truncate.pl | 6 +++--- src/test/subscription/t/011_generated.pl | 2 +- src/test/subscription/t/011_stream_ddl.pl | 2 +- src/test/subscription/t/012_collation.pl | 2 +- src/test/subscription/t/012_stream_subxact_abort.pl | 2 +- src/test/subscription/t/013_stream_subxact_ddl_abort.pl | 2 +- src/test/subscription/t/100_bugs.pl | 2 +- 18 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl index 3f8318fc7c..6f7bedc130 100644 ---
a/src/test/subscription/t/001_rep_changes.pl +++ b/src/test/subscription/t/001_rep_changes.pl @@ -78,7 +78,7 @@ $node_publisher->safe_psql('postgres', "ALTER PUBLICATION tap_pub_ins_only ADD TABLE tab_ins"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub, tap_pub_ins_only" + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub, tap_pub_ins_only WITH (streaming = on)" ); $node_publisher->wait_for_catchup('tap_sub'); diff --git a/src/test/subscription/t/002_types.pl b/src/test/subscription/t/002_types.pl index aedcab2fbc..94c71f8ae2 100644 --- a/src/test/subscription/t/002_types.pl +++ b/src/test/subscription/t/002_types.pl @@ -108,7 +108,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR ALL TABLES"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (slot_name = tap_sub_slot)" + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (slot_name = tap_sub_slot, streaming = on)" ); $node_publisher->wait_for_catchup('tap_sub'); diff --git a/src/test/subscription/t/003_constraints.pl b/src/test/subscription/t/003_constraints.pl index 3a590f871a..4ba80869b9 100644 --- a/src/test/subscription/t/003_constraints.pl +++ b/src/test/subscription/t/003_constraints.pl @@ -35,7 +35,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR ALL TABLES;"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (copy_data = false)" + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (copy_data = false, streaming = on)" ); $node_publisher->wait_for_catchup('tap_sub'); diff --git a/src/test/subscription/t/004_sync.pl b/src/test/subscription/t/004_sync.pl index e111ab9181..a6fae9c3f1 100644 --- a/src/test/subscription/t/004_sync.pl +++ b/src/test/subscription/t/004_sync.pl @@ -33,7 +33,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR ALL TABLES"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub" + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (streaming = on)" ); $node_publisher->wait_for_catchup('tap_sub'); @@ -56,7 +56,7 @@ $node_publisher->safe_psql('postgres', # recreate the subscription, it will try to do initial copy $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub" + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (streaming = on)" ); # but it will be stuck on data copy as it will fail on constraint @@ -78,7 +78,7 @@ is($result, qq(20), 'initial data synced for second sub'); # now check another subscription for the same node pair $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub2 CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (copy_data = false)" + "CREATE SUBSCRIPTION tap_sub2 CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (copy_data = false, streaming = on)" ); # wait for it to start @@ -100,7 +100,7 @@ $node_subscriber->safe_psql('postgres', "DELETE FROM tab_rep;"); # recreate the subscription again $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub" + "CREATE SUBSCRIPTION tap_sub 
CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (streaming = on)" ); # and wait for data sync to finish again diff --git a/src/test/subscription/t/005_encoding.pl b/src/test/subscription/t/005_encoding.pl index aec7a17a78..202871a658 100644 --- a/src/test/subscription/t/005_encoding.pl +++ b/src/test/subscription/t/005_encoding.pl @@ -26,7 +26,7 @@ my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres'; $node_publisher->safe_psql('postgres', "CREATE PUBLICATION mypub FOR ALL TABLES;"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION mysub CONNECTION '$publisher_connstr' PUBLICATION mypub;" + "CREATE SUBSCRIPTION mysub CONNECTION '$publisher_connstr' PUBLICATION mypub WITH (streaming = on);" ); $node_publisher->wait_for_catchup('mysub'); diff --git a/src/test/subscription/t/006_rewrite.pl b/src/test/subscription/t/006_rewrite.pl index c6cda10a19..70c86b22ac 100644 --- a/src/test/subscription/t/006_rewrite.pl +++ b/src/test/subscription/t/006_rewrite.pl @@ -22,7 +22,7 @@ my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres'; $node_publisher->safe_psql('postgres', "CREATE PUBLICATION mypub FOR ALL TABLES;"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION mysub CONNECTION '$publisher_connstr' PUBLICATION mypub;" + "CREATE SUBSCRIPTION mysub CONNECTION '$publisher_connstr' PUBLICATION mypub WITH (streaming = on);" ); $node_publisher->wait_for_catchup('mysub'); diff --git a/src/test/subscription/t/007_ddl.pl b/src/test/subscription/t/007_ddl.pl index 7fe6cc6d63..f9c8d1d348 100644 --- a/src/test/subscription/t/007_ddl.pl +++ b/src/test/subscription/t/007_ddl.pl @@ -22,7 +22,7 @@ my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres'; $node_publisher->safe_psql('postgres', "CREATE PUBLICATION mypub FOR ALL TABLES;"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION mysub CONNECTION '$publisher_connstr' PUBLICATION mypub;" + "CREATE SUBSCRIPTION mysub CONNECTION '$publisher_connstr' PUBLICATION mypub WITH (streaming = on);" ); $node_publisher->wait_for_catchup('mysub'); diff --git a/src/test/subscription/t/008_diff_schema.pl b/src/test/subscription/t/008_diff_schema.pl index 963334ed89..cdf9b8e7bb 100644 --- a/src/test/subscription/t/008_diff_schema.pl +++ b/src/test/subscription/t/008_diff_schema.pl @@ -32,7 +32,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR ALL TABLES"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub" + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (streaming = on)" ); $node_publisher->wait_for_catchup('tap_sub'); diff --git a/src/test/subscription/t/009_matviews.pl b/src/test/subscription/t/009_matviews.pl index 7afc7bdba9..21f50c7012 100644 --- a/src/test/subscription/t/009_matviews.pl +++ b/src/test/subscription/t/009_matviews.pl @@ -18,7 +18,7 @@ my $publisher_connstr = $node_publisher->connstr . 
' dbname=postgres'; $node_publisher->safe_psql('postgres', "CREATE PUBLICATION mypub FOR ALL TABLES;"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION mysub CONNECTION '$publisher_connstr' PUBLICATION mypub;" + "CREATE SUBSCRIPTION mysub CONNECTION '$publisher_connstr' PUBLICATION mypub WITH (streaming = on);" ); $node_publisher->safe_psql('postgres', diff --git a/src/test/subscription/t/009_stream_simple.pl b/src/test/subscription/t/009_stream_simple.pl index 2f01133f69..30561d8f96 100644 --- a/src/test/subscription/t/009_stream_simple.pl +++ b/src/test/subscription/t/009_stream_simple.pl @@ -40,7 +40,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE tes my $appname = 'tap_sub'; $node_subscriber->safe_psql('postgres', -"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (streaming = on)" ); wait_for_caught_up($node_publisher, $appname); diff --git a/src/test/subscription/t/010_stream_subxact.pl b/src/test/subscription/t/010_stream_subxact.pl index d2ae38592b..9a6bac6822 100644 --- a/src/test/subscription/t/010_stream_subxact.pl +++ b/src/test/subscription/t/010_stream_subxact.pl @@ -40,7 +40,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE tes my $appname = 'tap_sub'; $node_subscriber->safe_psql('postgres', -"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (streaming = on)" ); wait_for_caught_up($node_publisher, $appname); diff --git a/src/test/subscription/t/010_truncate.pl b/src/test/subscription/t/010_truncate.pl index be2c0bdc35..ed56fbf96c 100644 --- a/src/test/subscription/t/010_truncate.pl +++ b/src/test/subscription/t/010_truncate.pl @@ -52,13 +52,13 @@ $node_publisher->safe_psql('postgres', $node_publisher->safe_psql('postgres', "CREATE PUBLICATION pub3 FOR TABLE tab3, tab4"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub1" + "CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub1 WITH (streaming = on)" ); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION sub2 CONNECTION '$publisher_connstr' PUBLICATION pub2" + "CREATE SUBSCRIPTION sub2 CONNECTION '$publisher_connstr' PUBLICATION pub2 WITH (streaming = on)" ); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION sub3 CONNECTION '$publisher_connstr' PUBLICATION pub3" + "CREATE SUBSCRIPTION sub3 CONNECTION '$publisher_connstr' PUBLICATION pub3 WITH (streaming = on)" ); # Wait for initial sync of all subscriptions diff --git a/src/test/subscription/t/011_generated.pl b/src/test/subscription/t/011_generated.pl index f35d1cba4c..4df1ddef63 100644 --- a/src/test/subscription/t/011_generated.pl +++ b/src/test/subscription/t/011_generated.pl @@ -33,7 +33,7 @@ $node_publisher->safe_psql('postgres', $node_publisher->safe_psql('postgres', "CREATE PUBLICATION pub1 FOR ALL TABLES"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub1" + "CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub1 WITH (streaming = on)" ); # Wait for initial sync of all subscriptions diff --git a/src/test/subscription/t/011_stream_ddl.pl 
b/src/test/subscription/t/011_stream_ddl.pl index 0da39a1a8a..c3caff6149 100644 --- a/src/test/subscription/t/011_stream_ddl.pl +++ b/src/test/subscription/t/011_stream_ddl.pl @@ -40,7 +40,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE tes my $appname = 'tap_sub'; $node_subscriber->safe_psql('postgres', -"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (streaming = on)" ); wait_for_caught_up($node_publisher, $appname); diff --git a/src/test/subscription/t/012_collation.pl b/src/test/subscription/t/012_collation.pl index 4bfcef7c2f..c62eb521e7 100644 --- a/src/test/subscription/t/012_collation.pl +++ b/src/test/subscription/t/012_collation.pl @@ -80,7 +80,7 @@ $node_publisher->safe_psql('postgres', q{CREATE PUBLICATION pub1 FOR ALL TABLES}); $node_subscriber->safe_psql('postgres', - qq{CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub1 WITH (copy_data = false)} + qq{CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub1 WITH (copy_data = false, streaming = on)} ); $node_publisher->wait_for_catchup('sub1'); diff --git a/src/test/subscription/t/012_stream_subxact_abort.pl b/src/test/subscription/t/012_stream_subxact_abort.pl index 402df30f59..2be7542831 100644 --- a/src/test/subscription/t/012_stream_subxact_abort.pl +++ b/src/test/subscription/t/012_stream_subxact_abort.pl @@ -40,7 +40,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE tes my $appname = 'tap_sub'; $node_subscriber->safe_psql('postgres', -"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (streaming = on)" ); wait_for_caught_up($node_publisher, $appname); diff --git a/src/test/subscription/t/013_stream_subxact_ddl_abort.pl b/src/test/subscription/t/013_stream_subxact_ddl_abort.pl index becbdd0578..2da9607a7d 100644 --- a/src/test/subscription/t/013_stream_subxact_ddl_abort.pl +++ b/src/test/subscription/t/013_stream_subxact_ddl_abort.pl @@ -40,7 +40,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE tes my $appname = 'tap_sub'; $node_subscriber->safe_psql('postgres', -"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (streaming = on)" ); wait_for_caught_up($node_publisher, $appname); diff --git a/src/test/subscription/t/100_bugs.pl b/src/test/subscription/t/100_bugs.pl index 366a7a9435..96ffc091b0 100644 --- a/src/test/subscription/t/100_bugs.pl +++ b/src/test/subscription/t/100_bugs.pl @@ -53,7 +53,7 @@ $node_publisher->safe_psql('postgres', "CREATE PUBLICATION pub1 FOR ALL TABLES"); $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub1" + "CREATE SUBSCRIPTION sub1 CONNECTION '$publisher_connstr' PUBLICATION pub1 WITH (streaming = on)" ); $node_publisher->wait_for_catchup('sub1'); -- 2.23.0
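Patch 0009 does nothing beyond appending streaming = on to the WITH clause of existing CREATE SUBSCRIPTION calls. For context, a minimal standalone TAP test in the same style might look like the sketch below. It is illustrative only, not part of the series: the node and table names are made up, PostgresNode/TestLib are the in-tree Perl modules these scripts are built on, and logical_decoding_work_mem is lowered on the assumption (as elsewhere in this series) that a small decoding memory limit is what actually forces a large transaction to be streamed mid-flight.

use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More tests => 1;

# Publisher needs logical WAL; a small logical_decoding_work_mem makes
# a multi-row transaction exceed the limit and stream before commit.
my $node_publisher = get_new_node('publisher');
$node_publisher->init(allows_streaming => 'logical');
$node_publisher->append_conf('postgresql.conf',
	'logical_decoding_work_mem = 64kB');
$node_publisher->start;

my $node_subscriber = get_new_node('subscriber');
$node_subscriber->init;
$node_subscriber->start;

$node_publisher->safe_psql('postgres', "CREATE TABLE tab_demo (a int)");
$node_subscriber->safe_psql('postgres', "CREATE TABLE tab_demo (a int)");
$node_publisher->safe_psql('postgres',
	"CREATE PUBLICATION tap_pub FOR TABLE tab_demo");

my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres';

# The option this patch adds everywhere: stream large in-progress
# transactions to the apply worker instead of spooling until commit.
$node_subscriber->safe_psql('postgres',
	"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr' PUBLICATION tap_pub WITH (streaming = on)"
);

$node_publisher->safe_psql('postgres',
	"INSERT INTO tab_demo SELECT generate_series(1, 5000)");
$node_publisher->wait_for_catchup('tap_sub');

my $result =
  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_demo");
is($result, '5000', 'streamed transaction replicated');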
From df5b9ec348b2f9541cfe4017fd57da485b05fb64 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Thu, 11 Jun 2020 16:42:07 +0530 Subject: [PATCH v27 14/14] Worker tempfile use the shared buffile infrastructure To be merged with 0008; kept separate to make the review easier. --- src/backend/replication/logical/worker.c | 540 +++++++++++------------ 1 file changed, 270 insertions(+), 270 deletions(-) diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index d2d9469999..cdc8e4f9ab 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -56,6 +56,7 @@ #include "catalog/pg_subscription_rel.h" #include "catalog/pg_tablespace.h" #include "commands/tablecmds.h" +#include "commands/tablespace.h" #include "commands/trigger.h" #include "executor/executor.h" #include "executor/execPartition.h" @@ -85,6 +86,7 @@ #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "rewrite/rewriteHandler.h" +#include "storage/buffile.h" #include "storage/bufmgr.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -123,10 +125,26 @@ typedef struct SlotErrCallbackArg int remote_attnum; } SlotErrCallbackArg; +/* + * Stream xid hash entry. Whenever we see an xid we create this entry in the + * xidhash, create the streaming file, and store the fileset handle, so that + * on a subsequent stream for the same xid we can look the entry up in the + * hash and get the fileset handle. The subxact file is only created if there + * is any subxact info under this xid. + */ +typedef struct StreamXidHash +{ + TransactionId xid; /* xid is the hash key and must be first */ + SharedFileSet *stream_fileset; /* shared file set for stream data */ + SharedFileSet *subxact_fileset; /* shared file set for subxact info */ +} StreamXidHash; static MemoryContext ApplyMessageContext = NULL; -static MemoryContext LogicalStreamingContext = NULL; MemoryContext ApplyContext = NULL; +/* Per-stream context for streaming transactions. */ +static MemoryContext LogicalStreamingContext = NULL; + WalReceiverConn *wrconn = NULL; Subscription *MySubscription = NULL; @@ -139,12 +157,23 @@ static XLogRecPtr remote_final_lsn = InvalidXLogRecPtr; bool in_streamed_transaction = false; static TransactionId stream_xid = InvalidTransactionId; -static int stream_fd = -1; + +/* + * Hash table storing the streaming xid information along with the shared + * file sets for the streaming and subxact files. On every stream start we + * need to open the xid's files, and for that we need the shared file set + * handle; storing it in the xid hash makes the lookup faster. + */ +static HTAB *xidhash = NULL; + +/* Buf file handle of the current streaming file. */ +static BufFile *stream_fd = NULL; typedef struct SubXactInfo { - TransactionId xid; /* XID of the subxact */ - off_t offset; /* offset in the file */ + TransactionId xid; /* XID of the subxact */ + int fileno; /* file number in the buffile */ + off_t offset; /* offset in the file */ } SubXactInfo; static uint32 nsubxacts = 0; @@ -171,13 +200,6 @@ static void stream_open_file(Oid subid, TransactionId xid, bool first); static void stream_write_change(char action, StringInfo s); static void stream_close_file(void); -/* - * Array of serialized XIDs.
- */ -static int nxids = 0; -static int maxnxids = 0; -static TransactionId *xids = NULL; - static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); static void store_flush_position(XLogRecPtr remote_lsn); @@ -275,7 +297,7 @@ handle_streamed_transaction(const char action, StringInfo s) if (!in_streamed_transaction) return false; - Assert(stream_fd != -1); + Assert(stream_fd != NULL); Assert(TransactionIdIsValid(stream_xid)); /* @@ -666,31 +688,39 @@ static void apply_handle_stream_start(StringInfo s) { bool first_segment; + HASHCTL hash_ctl; Assert(!in_streamed_transaction); + /* + * Start a transaction on stream start; this transaction will be committed + * on stream stop. We need the transaction for handling the buffile, used + * for serializing the streaming data and subxact info. + */ + ensure_transaction(); + /* notify handle methods we're processing a remote transaction */ in_streamed_transaction = true; /* extract XID of the top-level transaction */ stream_xid = logicalrep_read_stream_start(s, &first_segment); + /* Initialize the xidhash table if we haven't yet */ + if (xidhash == NULL) + { + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(StreamXidHash); + hash_ctl.hcxt = ApplyContext; + xidhash = hash_create("StreamXidHash", 1024, &hash_ctl, + HASH_ELEM | HASH_CONTEXT); + } + /* open the spool file for this transaction */ stream_open_file(MyLogicalRepWorker->subid, stream_xid, first_segment); - /* - * if this is not the first segment, open existing file - * - * XXX Note that the cleanup is performed by stream_open_file. - */ + /* if this is not the first segment, open existing file */ if (!first_segment) - { - MemoryContext oldctx = MemoryContextSwitchTo(LogicalStreamingContext); - - /* Read the subxacts info in per-stream context. */ subxact_info_read(MyLogicalRepWorker->subid, stream_xid); - MemoryContextSwitchTo(oldctx); - } pgstat_report_activity(STATE_RUNNING, NULL); } @@ -710,6 +740,9 @@ apply_handle_stream_stop(StringInfo s) subxact_info_write(MyLogicalRepWorker->subid, stream_xid); stream_close_file(); + /* Commit the per-stream transaction */ + CommitTransactionCommand(); + in_streamed_transaction = false; /* Reset per-stream context */ @@ -736,10 +769,7 @@ apply_handle_stream_abort(StringInfo s) * just delete the files with serialized info. */ if (xid == subxid) - { stream_cleanup_files(MyLogicalRepWorker->subid, xid, false); - return; - } else { /* @@ -761,11 +791,13 @@ int64 i; int64 subidx; - int fd; + BufFile *fd; bool found = false; char path[MAXPGPATH]; + StreamXidHash *ent; subidx = -1; + ensure_transaction(); subxact_info_read(MyLogicalRepWorker->subid, xid); /* XXX optimize the search by bsearch on sorted data */ @@ -787,33 +819,32 @@ { /* Cleanup the subxact info */ cleanup_subxact_info(); + CommitTransactionCommand(); return; } Assert((subidx >= 0) && (subidx < nsubxacts)); + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + &found); + Assert(found); + + /* open the changes file */ changes_filename(path, MyLogicalRepWorker->subid, xid); - fd = OpenTransientFile(path, O_WRONLY | PG_BINARY); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); - } + fd = BufFileOpenShared(ent->stream_fileset, path, O_RDWR); - /* OK, truncate the file at the right offset.
*/ - if (ftruncate(fd, subxacts[subidx].offset)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not truncate file \"%s\": %m", path))); - CloseTransientFile(fd); + /* OK, truncate the file at the right offset */ + BufFileTruncateShared(fd, subxacts[subidx].fileno, subxacts[subidx].offset); + BufFileClose(fd); /* discard the subxacts added later */ nsubxacts = subidx; /* write the updated subxact list */ subxact_info_write(MyLogicalRepWorker->subid, xid); + CommitTransactionCommand(); } } @@ -823,16 +854,16 @@ static void apply_handle_stream_commit(StringInfo s) { - int fd; TransactionId xid; StringInfoData s2; int nchanges; - char path[MAXPGPATH]; char *buffer = NULL; + bool found; LogicalRepCommitData commit_data; - - MemoryContext oldcxt; + StreamXidHash *ent; + MemoryContext oldcxt; + BufFile *fd; Assert(!in_streamed_transaction); xid = logicalrep_read_stream_commit(s, &commit_data); elog(DEBUG1, "received commit for streamed transaction %u", xid); - /* open the spool file for the committed transaction */ - changes_filename(path, MyLogicalRepWorker->subid, xid); - elog(DEBUG1, "replaying changes from file '%s'", path); - fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); - } - ensure_transaction(); - oldcxt = MemoryContextSwitchTo(TopTransactionContext); - buffer = palloc(8192); + ensure_transaction(); + oldcxt = MemoryContextSwitchTo(TopTransactionContext); + + /* open the spool file for the committed transaction */ + changes_filename(path, MyLogicalRepWorker->subid, xid); + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + &found); + Assert(found); + fd = BufFileOpenShared(ent->stream_fileset, path, O_RDONLY); + + buffer = palloc(BLCKSZ); initStringInfo(&s2); MemoryContextSwitchTo(oldcxt); @@ -882,7 +909,7 @@ /* read length of the on-disk record */ pgstat_report_wait_start(WAIT_EVENT_LOGICAL_CHANGES_READ); - nbytes = read(fd, &len, sizeof(len)); + nbytes = BufFileRead(fd, &len, sizeof(len)); pgstat_report_wait_end(); /* have we reached end of the file? */ @@ -894,7 +921,7 @@ { int save_errno = errno; - CloseTransientFile(fd); + BufFileClose(fd); errno = save_errno; ereport(ERROR, (errcode_for_file_access(), @@ -909,11 +936,11 @@ /* and finally read the data into the buffer */ pgstat_report_wait_start(WAIT_EVENT_LOGICAL_CHANGES_READ); - if (read(fd, buffer, len) != len) + if (BufFileRead(fd, buffer, len) != len) { int save_errno = errno; - CloseTransientFile(fd); + BufFileClose(fd); errno = save_errno; ereport(ERROR, (errcode_for_file_access(), @@ -948,11 +975,7 @@ */ send_feedback(InvalidXLogRecPtr, false, false); } - - if (CloseTransientFile(fd) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", path))); + BufFileClose(fd); /* * Update origin state so we can restart streaming from correct * @@ -1946,12 +1969,39 @@ UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply) static void worker_onexit(int code, Datum arg) { - int i; + HASH_SEQ_STATUS status; + StreamXidHash *ent; + char path[MAXPGPATH]; + + /* nothing to clean */ + if (xidhash == NULL) + return; + + /* + * Scan the complete hash and delete the underlying files for the xids. Also delete the memory for the shared file sets.
+ */ + hash_seq_init(&status, xidhash); + while ((ent = (StreamXidHash *) hash_seq_search(&status)) != NULL) + { + changes_filename(path, MyLogicalRepWorker->subid, ent->xid); + BufFileDeleteShared(ent->stream_fileset, path); + pfree(ent->stream_fileset); - elog(LOG, "cleanup files for %d transactions", nxids); + /* + * We might not have created the subxact fileset if there is no + * subtransaction. + */ + if (ent->subxact_fileset) + { + subxact_filename(path, MyLogicalRepWorker->subid, ent->xid); + BufFileDeleteShared(ent->subxact_fileset, path); + pfree(ent->subxact_fileset); + } + } - for (i = nxids-1; i >= 0; i--) - stream_cleanup_files(MyLogicalRepWorker->subid, xids[i], true); + /* Remove the xid hash */ + hash_destroy(xidhash); } /* @@ -2085,7 +2135,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received) /* confirm all writes so far */ send_feedback(last_received, false, false); - if (!in_remote_transaction) + if (!in_remote_transaction && !in_streamed_transaction) { /* * If we didn't get any transactions for a while there might be @@ -2441,33 +2491,63 @@ subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue) static void subxact_info_write(Oid subid, TransactionId xid) { - int fd; - char path[MAXPGPATH]; - Size len; + char path[MAXPGPATH]; + bool found; + Size len; + StreamXidHash *ent; + BufFile *fd; Assert(TransactionIdIsValid(xid)); subxact_filename(path, subid, xid); - fd = OpenTransientFile(path, O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); - if (fd < 0) + /* find the xid entry in the xidhash */ + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + &found); + /* we must have found the entry for its top transaction by this time */ + Assert(found); + + /* + * If there is no subtransaction then there is nothing to do, but if a + * subxact file already exists then delete it. + */ + if (nsubxacts == 0) { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create file \"%s\": %m", - path))); + if (ent->subxact_fileset) + { + cleanup_subxact_info(); + BufFileDeleteShared(ent->subxact_fileset, path); + ent->subxact_fileset = NULL; + } return; } + /* + * Create the subxact file if it is not already created, otherwise open + * the existing file. + */ + if (ent->subxact_fileset == NULL) + { + ent->subxact_fileset = + MemoryContextAlloc(ApplyContext, sizeof(SharedFileSet)); + + SharedFileSetInit(ent->subxact_fileset, NULL); + fd = BufFileCreateShared(ent->subxact_fileset, path); + } + else + fd = BufFileOpenShared(ent->subxact_fileset, path, O_RDWR); + len = sizeof(SubXactInfo) * nsubxacts; pgstat_report_wait_start(WAIT_EVENT_LOGICAL_SUBXACT_WRITE); - if (write(fd, &nsubxacts, sizeof(nsubxacts)) != sizeof(nsubxacts)) + if (BufFileWrite(fd, &nsubxacts, sizeof(nsubxacts)) != sizeof(nsubxacts)) { - int save_errno = errno; + int save_errno = errno; - CloseTransientFile(fd); + BufFileClose(fd); errno = save_errno; ereport(ERROR, (errcode_for_file_access(), @@ -2476,11 +2556,11 @@ subxact_info_write(Oid subid, TransactionId xid) return; } - if ((len > 0) && (write(fd, subxacts, len) != len)) + if ((len > 0) && (BufFileWrite(fd, subxacts, len) != len)) { - int save_errno = errno; + int save_errno = errno; - CloseTransientFile(fd); + BufFileClose(fd); errno = save_errno; ereport(ERROR, (errcode_for_file_access(), @@ -2490,15 +2570,7 @@ } pgstat_report_wait_end(); - - /* - * We don't need to fsync or anything, as we'll recreate the files after a - * crash from scratch. So just close the file.
- */ - if (CloseTransientFile(fd) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", path))); + BufFileClose(fd); /* * But we free the memory allocated for subxact info. There might be one * @@ -2519,35 +2591,40 @@ static void subxact_info_read(Oid subid, TransactionId xid) { - int fd; char path[MAXPGPATH]; + bool found; Size len; + BufFile *fd; + StreamXidHash *ent; + MemoryContext oldctx; Assert(TransactionIdIsValid(xid)); Assert(!subxacts); Assert(nsubxacts == 0); Assert(nsubxacts_max == 0); - subxact_filename(path, subid, xid); + /* Find the stream xid entry in the xidhash */ + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + &found); - fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); + /* If subxact_fileset is not valid, that means we don't have any subxact info */ + if (ent->subxact_fileset == NULL) return; - } + + subxact_filename(path, subid, xid); + + fd = BufFileOpenShared(ent->subxact_fileset, path, O_RDONLY); pgstat_report_wait_start(WAIT_EVENT_LOGICAL_SUBXACT_READ); /* read number of subxact items */ - if (read(fd, &nsubxacts, sizeof(nsubxacts)) != sizeof(nsubxacts)) + if (BufFileRead(fd, &nsubxacts, sizeof(nsubxacts)) != sizeof(nsubxacts)) { int save_errno = errno; - CloseTransientFile(fd); + BufFileClose(fd); errno = save_errno; ereport(ERROR, (errcode_for_file_access(), @@ -2564,21 +2641,22 @@ nsubxacts_max = 1 << my_log2(nsubxacts); /* - * Let the caller decide which memory context it will be allocated. - * Ideally, during stream start it will be allocated in the - * LogicalStreamingContext which will be reset on stream stop, and - * during the stream abort we need this memory only for short term so - * it will be allocated in ApplyMessageContext. + * Allocate subxact information in the logical streaming context. We + * need this information for the whole stream so that we can add + * subtransaction info to it. On stream stop we flush this information + * to the subxact file and reset the logical streaming context. */ + oldctx = MemoryContextSwitchTo(LogicalStreamingContext); subxacts = palloc(nsubxacts_max * sizeof(SubXactInfo)); + MemoryContextSwitchTo(oldctx); pgstat_report_wait_start(WAIT_EVENT_LOGICAL_SUBXACT_READ); - if ((len > 0) && ((read(fd, subxacts, len)) != len)) + if ((len > 0) && ((BufFileRead(fd, subxacts, len)) != len)) { int save_errno = errno; - CloseTransientFile(fd); + BufFileClose(fd); errno = save_errno; ereport(ERROR, (errcode_for_file_access(), @@ -2586,13 +2664,9 @@ path))); return; } - pgstat_report_wait_end(); - if (CloseTransientFile(fd) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", path))); + BufFileClose(fd); } /* @@ -2606,7 +2680,7 @@ subxact_info_add(TransactionId xid) /* We must have a valid top level stream xid and a stream fd. */ Assert(TransactionIdIsValid(stream_xid)); - Assert(stream_fd >= 0); + Assert(stream_fd != NULL); /* * If the XID matches the toplevel transaction, we don't want to add it.
@@ -2658,7 +2732,13 @@ } subxacts[nsubxacts].xid = xid; - subxacts[nsubxacts].offset = lseek(stream_fd, 0, SEEK_END); + + /* + * Get the current offset of the stream file and store it as the offset + * of this subxact. + */ + BufFileTell(stream_fd, &subxacts[nsubxacts].fileno, + &subxacts[nsubxacts].offset); nsubxacts++; } @@ -2667,44 +2747,14 @@ static void subxact_filename(char *path, Oid subid, TransactionId xid) { - char tempdirpath[MAXPGPATH]; - - TempTablespacePath(tempdirpath, DEFAULTTABLESPACE_OID); - - /* - * We might need to create the tablespace's tempfile directory, if no - * one has yet done so. - */ - if ((MakePGDirectory(tempdirpath) < 0) && errno != EEXIST) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create directory \"%s\": %m", - tempdirpath))); - - snprintf(path, MAXPGPATH, "%s/%s%d-%u-%u.subxacts", - tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, subid, xid); + snprintf(path, MAXPGPATH, "%u-%u.subxacts", subid, xid); } /* format filename for file containing serialized changes */ -static void +static inline void changes_filename(char *path, Oid subid, TransactionId xid) { - char tempdirpath[MAXPGPATH]; - - TempTablespacePath(tempdirpath, DEFAULTTABLESPACE_OID); - - /* - * We might need to create the tablespace's tempfile directory, if no - * one has yet done so. - */ - if ((MakePGDirectory(tempdirpath) < 0) && errno != EEXIST) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create directory \"%s\": %m", - tempdirpath))); - - snprintf(path, MAXPGPATH, "%s/%s%d-%u-%u.changes", - tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, subid, xid); + snprintf(path, MAXPGPATH, "%u-%u.changes", subid, xid); } /* @@ -2721,60 +2771,31 @@ changes_filename(char *path, Oid subid, TransactionId xid) static void stream_cleanup_files(Oid subid, TransactionId xid, bool missing_ok) { - int i; char path[MAXPGPATH]; - bool found = false; + StreamXidHash *ent; - subxact_filename(path, subid, xid); + /* Remove the xid entry from the stream xid hash */ + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_REMOVE, + NULL); - if ((unlink(path) < 0) && (errno != ENOENT) && !missing_ok) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); + /* No entry created for this xid so simply return. */ + if (ent == NULL) + return; + /* Delete the change file and release the stream fileset memory */ changes_filename(path, subid, xid); + BufFileDeleteShared(ent->stream_fileset, path); + pfree(ent->stream_fileset); - if ((unlink(path) < 0) && (errno != ENOENT) && !missing_ok) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - - /* - * Cleanup the XID from the array - find the XID in the array and - * remove it by shifting all the remaining elements. The array is - * bound to be fairly small (maximum number of in-progress xacts, - * so max_connections + max_prepared_transactions) so simply loop - * through the array and find index of the XID. Then move the rest - * of the array by one element to the left. - * - * Notice we also call this from stream_open_file for first segment - * of each transaction, to deal with possible left-overs after a - * crash, so it's entirely possible not to find the XID in the - * array here. In that case we don't remove anything. - * - * XXX Perhaps it'd be better to handle this automatically after a - * restart, instead of doing it over and over for each transaction.
- */ - for (i = 0; i < nxids; i++) + /* Delete the subxact file and release the memory, if it exists */ + if (ent->subxact_fileset) { - if (xids[i] == xid) - { - found = true; - break; - } + subxact_filename(path, subid, xid); + BufFileDeleteShared(ent->subxact_fileset, path); + pfree(ent->subxact_fileset); } - - if (!found) - return; - - /* - * Move the last entry from the array to the place. We don't keep - * the streamed transactions sorted or anything - we only expect - * a few of them in progress (max_connections + max_prepared_xacts) - * so linear search is just fine. - */ - xids[i] = xids[nxids-1]; - nxids--; } /* @@ -2793,61 +2814,29 @@ static void stream_open_file(Oid subid, TransactionId xid, bool first_segment) { char path[MAXPGPATH]; - int flags; + bool found; + MemoryContext oldcxt; + StreamXidHash *ent; Assert(in_streamed_transaction); Assert(OidIsValid(subid)); Assert(TransactionIdIsValid(xid)); - Assert(stream_fd == -1); + Assert(stream_fd == NULL); + + /* create or find the xid entry in the xidhash */ + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_ENTER, + &found); + Assert(first_segment || found); + changes_filename(path, subid, xid); + elog(DEBUG1, "opening file '%s' for streamed changes", path); /* - * If this is the first segment for this transaction, try removing - * existing files (if there are any, possibly after a crash). + * Create/open the buffiles under the logical streaming context so that + * we have those files until stream stop. */ - if (first_segment) - { - MemoryContext oldcxt; - - /* XXX make sure there are no previous files for this transaction */ - stream_cleanup_files(subid, xid, true); - - /* Need to allocate this in permanent context */ - oldcxt = MemoryContextSwitchTo(ApplyContext); - - /* - * We need to remember the XIDs we spilled to files, so that we can - * remove them at worker exit (e.g. after DROP SUBSCRIPTION). - * - * The number of XIDs we may need to track is fairly small, because - * we can only stream toplevel xacts (so limited by max_connections - * and max_prepared_transactions), and we only stream the large ones. - * So we simply keep the XIDs in an unsorted array. If the number of - * xacts gets large for some reason (e.g. very high max_connections), - * a more elaborate approach might be better - e.g. sorted array, to - * speed-up the lookups. - */ - if (nxids == maxnxids) /* array of XIDs is full */ - { - if (!xids) - { - maxnxids = 64; - xids = palloc(maxnxids * sizeof(TransactionId)); - } - else - { - maxnxids = 2 * maxnxids; - xids = repalloc(xids, maxnxids * sizeof(TransactionId)); - } - } - - xids[nxids++] = xid; - - MemoryContextSwitchTo(oldcxt); - } - - changes_filename(path, subid, xid); - - elog(DEBUG1, "opening file '%s' for streamed changes", path); + oldcxt = MemoryContextSwitchTo(LogicalStreamingContext); /* * If this is the first streamed segment, the file must not exist, so * make sure we're the ones creating it. Otherwise just open the file * for writing, in append mode. */ if (first_segment) - flags = (O_WRONLY | O_CREAT | O_EXCL | PG_BINARY); - else - flags = (O_WRONLY | O_APPEND | PG_BINARY); + { + /* + * Shared fileset handle must be allocated in the persistent context. + */ + SharedFileSet *fileset = + MemoryContextAlloc(ApplyContext, sizeof(SharedFileSet)); - stream_fd = OpenTransientFile(path, flags); + PrepareTempTablespaces(); + SharedFileSetInit(fileset, NULL); + stream_fd = BufFileCreateShared(fileset, path); - if (stream_fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); + /* Remember the fileset for the next stream of the same transaction */ + ent->xid = xid; + ent->stream_fileset = fileset; + ent->subxact_fileset = NULL; + } + else + { + /* + * Open the file and seek to the end of the file because we always + * append to the changes file. + */ + stream_fd = BufFileOpenShared(ent->stream_fileset, path, O_RDWR); + BufFileSeek(stream_fd, 0, 0, SEEK_END); + } + MemoryContextSwitchTo(oldcxt); } /* @@ -2880,12 +2884,12 @@ stream_close_file(void) { Assert(in_streamed_transaction); Assert(TransactionIdIsValid(stream_xid)); - Assert(stream_fd != -1); + Assert(stream_fd != NULL); - CloseTransientFile(stream_fd); + BufFileClose(stream_fd); stream_xid = InvalidTransactionId; - stream_fd = -1; + stream_fd = NULL; } /* @@ -2907,21 +2911,19 @@ stream_write_change(char action, StringInfo s) Assert(in_streamed_transaction); Assert(TransactionIdIsValid(stream_xid)); - Assert(stream_fd != -1); + Assert(stream_fd != NULL); /* total on-disk size, including the action type character */ len = (s->len - s->cursor) + sizeof(char); - pgstat_report_wait_start(WAIT_EVENT_LOGICAL_CHANGES_WRITE); - /* first write the size */ - if (write(stream_fd, &len, sizeof(len)) != sizeof(len)) + if (BufFileWrite(stream_fd, &len, sizeof(len)) != sizeof(len)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not serialize streamed change to file: %m"))); /* then the action */ - if (write(stream_fd, &action, sizeof(action)) != sizeof(action)) + if (BufFileWrite(stream_fd, &action, sizeof(action)) != sizeof(action)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not serialize streamed change to file: %m"))); /* and finally the remaining part of the buffer (after the XID) */ len = (s->len - s->cursor); - if (write(stream_fd, &s->data[s->cursor], len) != len) + if (BufFileWrite(stream_fd, &s->data[s->cursor], len) != len) ereport(ERROR, (errcode_for_file_access(), errmsg("could not serialize streamed change to file: %m"))); - - pgstat_report_wait_end(); } /* -- 2.23.0
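The patch above leans on a small, fixed lifecycle for each streamed transaction's spool file: create a SharedFileSet-backed BufFile on the first stream segment, reopen and append on later segments, record a (fileno, offset) pair at each subtransaction boundary, and truncate back to a recorded pair on subxact abort. A condensed sketch of that lifecycle follows; it is not part of the patch, the demo_* names are illustrative, and it assumes the extended BufFile API (BufFileOpenShared taking a mode, BufFileSeek accepting SEEK_END, and BufFileTruncateShared) added by patch 0013 of this series.

#include "postgres.h"

#include <fcntl.h>

#include "storage/buffile.h"
#include "storage/sharedfileset.h"

/* First segment: create the per-xid spool file inside a SharedFileSet. */
static BufFile *
demo_spool_create(SharedFileSet *fileset, const char *path)
{
	SharedFileSetInit(fileset, NULL);	/* backend-local use, no DSM segment */
	return BufFileCreateShared(fileset, path);
}

/* Later segments: reopen the same file and position at its end to append. */
static BufFile *
demo_spool_append(SharedFileSet *fileset, const char *path)
{
	BufFile    *fd = BufFileOpenShared(fileset, path, O_RDWR);

	BufFileSeek(fd, 0, 0, SEEK_END);
	return fd;
}

/* Subxact start: remember where the subxact's changes begin... */
static void
demo_subxact_mark(BufFile *fd, int *fileno, off_t *offset)
{
	BufFileTell(fd, fileno, offset);
}

/* ...and on subxact abort, discard everything written after the mark. */
static void
demo_subxact_rollback(BufFile *fd, int fileno, off_t offset)
{
	BufFileTruncateShared(fd, fileno, offset);
}

The (fileno, offset) pair in SubXactInfo, rather than a single lseek-style offset, is what lets the mark address positions past 1GB: a BufFile is a chain of 1GB physical segment files, and fileno selects the segment.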
From ed26baddc1eefbd1135c4124d78ba549b758b9b6 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Thu, 11 Jun 2020 15:25:02 +0530 Subject: [PATCH v27 06/14] Bugfix handling of incomplete toast/spec insert --- src/backend/access/heap/heapam.c | 3 + src/backend/replication/logical/decode.c | 17 +- .../replication/logical/reorderbuffer.c | 337 ++++++++++++++---- src/include/access/heapam_xlog.h | 1 + src/include/replication/reorderbuffer.h | 47 ++- 5 files changed, 329 insertions(+), 76 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 2d77107c4f..3927448f46 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1955,6 +1955,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, { xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; bufflags |= REGBUF_KEEP_DATA; + + if (IsToastRelation(relation)) + xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; }
XLogBeginInsert(); diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 69c1f45ef6..c841687c66 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -727,7 +727,9 @@ DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); } /* @@ -794,7 +796,8 @@ DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } /* @@ -851,7 +854,8 @@ DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } /* @@ -887,7 +891,7 @@ DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) memcpy(change->data.truncate.relids, xlrec->relids, xlrec->nrelids * sizeof(Oid)); ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), - buf->origptr, change); + buf->origptr, change, false); } /* @@ -987,7 +991,7 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = false; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), - buf->origptr, change); + buf->origptr, change, false); /* move to the next xl_multi_insert_tuple entry */ data += datalen; @@ -1025,7 +1029,8 @@ DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 47dc31298d..36958fe2ee 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -179,6 +179,21 @@ typedef struct ReorderBufferDiskChange /* data follows */ } ReorderBufferDiskChange; +#define IsSpecInsert(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \ +) +#define IsSpecConfirm(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) \ +) +#define IsInsertOrUpdate(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INSERT) || \ + ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \ +) + /* * Maximum number of changes kept in memory, per transaction. After that, * changes are spooled to disk. 
@@ -237,7 +252,8 @@ static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, char *change); static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); -static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + bool partial_truncate); static void ReorderBufferCleanupSerializedTXNs(const char *slotname); static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, XLogSegNo segno); @@ -646,14 +662,91 @@ ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, return txn; } +/* + * Handle incomplete tuples during streaming. If streaming is enabled then we + * might need to stream an in-progress transaction, but sometimes we get + * incomplete changes which we cannot stream until the complete change + * arrives, e.g. a toast-table insert without the main-table insert. So this + * function remembers the LSN of the last complete change, and the transaction + * size up to that LSN, so that if we need to stream we stream only up to the + * last complete LSN. + */ +static void +ReorderBufferHandleIncompleteTuple(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, + bool toast_insert, Size total_size) +{ + ReorderBufferTXN *toptxn; + + /* Get the top transaction. */ + if (txn->toptxn != NULL) + toptxn = txn->toptxn; + else + toptxn = txn; + + /* + * If this is the first incomplete change then remember the size of the + * complete changes so far. + */ + if (!(rbtxn_has_incomplete_tuple(toptxn)) && + (toast_insert || IsSpecInsert(change->action))) + toptxn->complete_size = total_size; + + /* + * If this is a toast insert then set the corresponding bit. Basically, + * both update and insert perform the insert into the toast table, and as + * explained in the function header we cannot stream toast changes alone. + * So whenever we get a toast insert we set the flag, and we clear it on + * the next insert or update on the main table. + */ + if (toast_insert) + toptxn->txn_flags |= RBTXN_HAS_TOAST_INSERT; + else if (rbtxn_has_toast_insert(toptxn) && + IsInsertOrUpdate(change->action)) + toptxn->txn_flags &= ~RBTXN_HAS_TOAST_INSERT; + + /* + * Set the spec insert bit whenever we get the speculative insert to + * indicate the partial tuple and clear the same on speculative confirm. + */ + if (IsSpecInsert(change->action)) + toptxn->txn_flags |= RBTXN_HAS_SPEC_INSERT; + else if (IsSpecConfirm(change->action)) + toptxn->txn_flags &= ~RBTXN_HAS_SPEC_INSERT; + + /* + * If we don't have any incomplete change after this change then set this + * LSN as the last complete LSN. + */ + if (!(rbtxn_has_incomplete_tuple(toptxn))) + { + toptxn->last_complete_lsn = change->lsn; + + /* + * If the transaction is serialized and the changes are now complete + * in the top-level transaction then stream the transaction + * immediately. We do not wait for the memory limit again because if + * the transaction was serialized we had already reached the memory + * limit but could not stream it then due to the incomplete tuple, so + * stream it as soon as the tuple is complete.
Also, if we did not stream + * the serialized changes and then got more incomplete changes in this + * transaction, we would have no way to partially truncate the + * serialized changes. + */ + if (rbtxn_is_serialized(txn)) + ReorderBufferStreamTXN(rb, toptxn); + } +} + /* * Queue a change into a transaction so it can be replayed upon commit. */ void ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, - ReorderBufferChange *change) + ReorderBufferChange *change, bool toast_insert) { ReorderBufferTXN *txn; + Size total_size = 0; txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); @@ -665,9 +758,28 @@ ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, txn->nentries++; txn->nentries_mem++; + /* + * Get the total size of the top transaction before updating the size for + * the current change so that, if this change is incomplete, we know the + * size prior to it. That is used to update the size of the complete + * changes in the top transaction for streaming. + */ + if (ReorderBufferCanStream(rb)) + { + if (txn->toptxn != NULL) + total_size = txn->toptxn->total_size; + else + total_size = txn->total_size; + } + /* update memory accounting information */ ReorderBufferChangeMemoryUpdate(rb, change, true); + /* Handle the incomplete tuple if streaming is enabled */ + if (ReorderBufferCanStream(rb)) + ReorderBufferHandleIncompleteTuple(rb, txn, change, toast_insert, + total_size); + /* check the memory limits and evict something if needed */ ReorderBufferCheckMemoryLimit(rb); } @@ -697,7 +809,7 @@ ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, change->data.msg.message = palloc(message_size); memcpy(change->data.msg.message, message, message_size); - ReorderBufferQueueChange(rb, xid, lsn, change); + ReorderBufferQueueChange(rb, xid, lsn, change, false); MemoryContextSwitchTo(oldcontext); } @@ -1407,11 +1519,45 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) /* * Discard changes from a transaction (and subtransactions), after streaming * them. Keep the remaining info - transactions, tuplecids and snapshots. + * If partial_truncate is false we completely truncate the transaction, + * otherwise we truncate up to last_complete_lsn. */ static void -ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + bool partial_truncate) { dlist_mutable_iter iter; + ReorderBufferTXN *toptxn; + + /* Get the top transaction */ + if (txn->toptxn != NULL) + toptxn = txn->toptxn; + else + toptxn = txn; + + /* + * A serialized transaction should never be partially truncated, because + * once it is serialized we stream it as soon as its changes are complete. + */ + Assert(!(rbtxn_is_serialized(txn) && partial_truncate)); + + /* + * Mark the transaction as streamed. + * + * The toplevel transaction, identified by (toptxn==NULL), is marked + * as streamed always, even if it does not contain any changes (that + * is, when all the changes are in subtransactions). + * + * For subtransactions, we only mark them as streamed when there are + * changes in them. + * + * We do it this way because of aborts - we don't want to send aborts + * for XIDs the downstream is not aware of. And of course, it always + * knows about the toplevel xact (we send the XID in all messages), + * but we never stream XIDs of empty subxacts.
+ */ + if ((!txn->toptxn) || (txn->nentries_mem != 0)) + txn->txn_flags |= RBTXN_IS_STREAMED; /* cleanup subtransactions & their changes */ dlist_foreach_modify(iter, &txn->subtxns) { ReorderBufferTXN *subtxn; @@ -1428,7 +1574,7 @@ ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) Assert(rbtxn_is_known_subxact(subtxn)); Assert(subtxn->nsubtxns == 0); - ReorderBufferTruncateTXN(rb, subtxn); + ReorderBufferTruncateTXN(rb, subtxn, partial_truncate); } /* cleanup changes in the toplevel txn */ dlist_foreach_modify(iter, &txn->changes) { ReorderBufferChange *change; change = dlist_container(ReorderBufferChange, node, iter.cur); + /* We have truncated up to the last complete LSN, so stop. */ + if (partial_truncate && (change->lsn > toptxn->last_complete_lsn)) + { + /* The transaction must have incomplete changes. */ + Assert(rbtxn_has_incomplete_tuple(toptxn)); + break; + } + /* remove the change from its containing list */ dlist_delete(&change->node); ReorderBufferReturnChange(rb, change); } - /* - * Mark the transaction as streamed. - * - * The toplevel transaction, identified by (toptxn==NULL), is marked - * as streamed always, even if it does not contain any changes (that - * is, when all the changes are in subtransactions). - * - * For subtransactions, we only mark them as streamed when there are - * changes in them. - * - * We do it this way because of aborts - we don't want to send aborts - * for XIDs the downstream is not aware of. And of course, it always - * knows about the toplevel xact (we send the XID in all messages), - * but we never stream XIDs of empty subxacts. - */ - if ((!txn->toptxn) || (txn->nentries_mem != 0)) - txn->txn_flags |= RBTXN_IS_STREAMED; /* * Destroy the (relfilenode, ctid) hashtable, so that we don't leak * any memory. We could also keep the hash table and update it with * ... */ txn->tuplecid_hash = NULL; } - /* also reset the number of entries in the transaction */ - txn->nentries_mem = 0; - txn->nentries = 0; + /* + * Adjust nentries/nentries_mem based on the changes processed. See + * comments where nprocessed is declared. + */ + if (partial_truncate) + { + txn->nentries -= txn->nprocessed; + txn->nentries_mem -= txn->nprocessed; + } + else + { + txn->nentries = 0; + txn->nentries_mem = 0; + } + txn->nprocessed = 0; + + /* + * If this is a top transaction then we can reset the + * last_complete_lsn and complete_size, because by now we would + * have streamed all the changes up to last_complete_lsn. + */ + if (partial_truncate && (txn->toptxn == NULL)) + { + toptxn->last_complete_lsn = InvalidXLogRecPtr; + toptxn->complete_size = 0; + } + + /* If this txn is serialized then clean the disk space. */ + if (rbtxn_is_serialized(txn)) + { + ReorderBufferRestoreCleanup(rb, txn); + txn->txn_flags &= ~RBTXN_IS_SERIALIZED; + } } /* @@ -1762,7 +1927,7 @@ ReorderBufferHandleConcurrentAbort(ReorderBuffer *rb, ReorderBufferTXN *txn, ReorderBufferChange *specinsert) { /* Discard the changes that we just streamed. */ - ReorderBufferTruncateTXN(rb, txn); + ReorderBufferTruncateTXN(rb, txn, false); /* Stop the stream.
/*
@@ -1762,7 +1927,7 @@ ReorderBufferHandleConcurrentAbort(ReorderBuffer *rb, ReorderBufferTXN *txn,
 							ReorderBufferChange *specinsert)
 {
 	/* Discard the changes that we just streamed. */
-	ReorderBufferTruncateTXN(rb, txn);
+	ReorderBufferTruncateTXN(rb, txn, false);
 
 	/* Stop the stream. */
 	rb->stream_stop(rb, txn, last_lsn);
@@ -1794,6 +1959,8 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
 	volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
 	ReorderBufferChange *volatile specinsert = NULL;
 	volatile bool stream_started = false;
+	volatile bool partial_truncate = false;
+
 	/* build data to be able to lookup the CommandIds of catalog tuples */
 	ReorderBufferBuildTupleCidHash(rb, txn);
@@ -1816,6 +1983,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
 	PG_TRY();
 	{
 		ReorderBufferChange *change;
+		ReorderBufferTXN *curtxn;
 
 		if (using_subtxn)
 			BeginInternalSubTransaction(streaming? "stream" : "replay");
@@ -1852,7 +2020,10 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
 
 			/* Set the xid for concurrent abort check. */
 			if (streaming)
-				SetupCheckXidLive(change->txn->xid);
+			{
+				curtxn = change->txn;
+				SetupCheckXidLive(curtxn->xid);
+			}
 
 			switch (change->action)
 			{
@@ -2116,6 +2287,27 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
 					elog(ERROR, "tuplecid value in changequeue");
 					break;
 			}
+
+			if (streaming)
+			{
+				/*
+				 * Increment the nprocessed count.  See the detailed comment
+				 * for usage of this in ReorderBufferTXN structure.
+				 */
+				curtxn->nprocessed++;
+
+				/*
+				 * If the transaction contains an incomplete tuple and this
+				 * is the last complete change, stop further processing of
+				 * the transaction and set the partial_truncate flag to true.
+				 */
+				if (rbtxn_has_incomplete_tuple(txn) &&
+					prev_lsn == txn->last_complete_lsn)
+				{
+					partial_truncate = true;
+					break;
+				}
+			}
 		}
 
 		/*
		 * Done with current changes, call stream_stop callback for streaming
-		 * transaction, commit callback otherwise. If we have sent
+		 * transaction, commit callback otherwise, but only if we have sent
		 * start/begin.
		 */
		if (stream_started)
@@ -2187,7 +2379,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
 	 */
 	if (streaming)
 	{
-		ReorderBufferTruncateTXN(rb, txn);
+		ReorderBufferTruncateTXN(rb, txn, partial_truncate);
 
 		/* Reset the CheckXidAlive */
 		CheckXidAlive = InvalidTransactionId;
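The stop condition in the loop above can be read as a predicate plus a worked example (hypothetical LSNs; sketch only, not part of the patch):

/*
 * Hypothetical change queue for one transaction:
 *
 *   lsn 100  INSERT (complete)          -> last_complete_lsn = 100
 *   lsn 110  TOAST chunk (incomplete)
 *   lsn 120  TOAST chunk (incomplete)
 *
 * Streaming bumps nprocessed once per change; when prev_lsn reaches 100
 * the loop sets partial_truncate and breaks, so the chunks at 110 and
 * 120 stay queued until some later change completes the tuple.
 */
static bool
must_stop_streaming(XLogRecPtr prev_lsn, ReorderBufferTXN *txn)
{
	return rbtxn_has_incomplete_tuple(txn) &&
		prev_lsn == txn->last_complete_lsn;
}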
@@ -2524,7 +2716,7 @@ ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
 	change->data.snapshot = snap;
 	change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
 
-	ReorderBufferQueueChange(rb, xid, lsn, change);
+	ReorderBufferQueueChange(rb, xid, lsn, change, false);
 }
 
 /*
@@ -2573,7 +2765,7 @@ ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
 	change->data.command_id = cid;
 	change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
 
-	ReorderBufferQueueChange(rb, xid, lsn, change);
+	ReorderBufferQueueChange(rb, xid, lsn, change, false);
 }
 
 /*
@@ -2596,6 +2788,7 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
 {
 	Size		sz;
 	ReorderBufferTXN *txn;
+	ReorderBufferTXN *toptxn = NULL;
 
 	Assert(change->txn);
@@ -2610,8 +2803,13 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
 	txn = change->txn;
 
 	/* if subxact, and streaming supported, use the toplevel instead */
-	if (txn->toptxn && ReorderBufferCanStream(rb))
-		txn = txn->toptxn;
+	if (ReorderBufferCanStream(rb))
+	{
+		if (txn->toptxn)
+			toptxn = txn->toptxn;
+		else
+			toptxn = txn;
+	}
 
 	sz = ReorderBufferChangeSize(change);
@@ -2619,12 +2817,20 @@
 	{
 		txn->size += sz;
 		rb->size += sz;
+
+		/* Update the total size in the top transaction. */
+		if (toptxn)
+			toptxn->total_size += sz;
 	}
 	else
 	{
 		Assert((rb->size >= sz) && (txn->size >= sz));
 		txn->size -= sz;
 		rb->size -= sz;
+
+		/* Update the total size in the top transaction. */
+		if (toptxn)
+			toptxn->total_size -= sz;
 	}
 
 	Assert(txn->size <= rb->size);
@@ -2685,7 +2891,7 @@ ReorderBufferAddInvalidation(ReorderBuffer *rb, TransactionId xid,
 	memcpy(change->data.inval.invalidations, msgs,
 		   sizeof(SharedInvalidationMessage) * nmsgs);
 
-	ReorderBufferQueueChange(rb, xid, lsn, change);
+	ReorderBufferQueueChange(rb, xid, lsn, change, false);
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -2872,18 +3078,28 @@ ReorderBufferLargestTopTXN(ReorderBuffer *rb)
+	Size		largest_size = 0;
+
 	dlist_foreach(iter, &rb->toplevel_by_lsn)
 	{
 		ReorderBufferTXN *txn;
+		Size		size = 0;
 
 		txn = dlist_container(ReorderBufferTXN, node, iter.cur);
 
-		/* if the current transaction is larger, remember it */
-		if ((!largest) || (txn->size > largest->size))
+		/*
+		 * If this transaction has some incomplete changes, consider only
+		 * the size of the changes up to the last complete LSN.
+		 */
+		if (rbtxn_has_incomplete_tuple(txn))
+			size = txn->complete_size;
+		else
+			size = txn->total_size;
+
+		/* If the current transaction is larger, remember it. */
+		if ((largest == NULL || size > largest_size) && size > 0)
+		{
 			largest = txn;
+			largest_size = size;
+		}
 	}
 
-	Assert(largest);
-	Assert(largest->size > 0);
-	Assert(largest->size <= rb->size);
-
 	return largest;
 }
@@ -2921,27 +3137,22 @@ ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
 	 * Pick the largest transaction (or subtransaction) and evict it from
 	 * memory by streaming, if supported.  Otherwise spill to disk.
 	 */
-	if (ReorderBufferCanStream(rb))
+	if (ReorderBufferCanStream(rb) &&
+		(txn = ReorderBufferLargestTopTXN(rb)) != NULL)
 	{
-		/*
-		 * Pick the largest toplevel transaction and evict it from memory by
-		 * streaming the already decoded part.
-		 */
-		txn = ReorderBufferLargestTopTXN(rb);
-
-		/* we know there has to be one, because the size is not zero */
 		Assert(txn && !txn->toptxn);
-		Assert(txn->size > 0);
-		Assert(rb->size >= txn->size);
+		Assert(txn->total_size > 0);
+		Assert(rb->size >= txn->total_size);
 
 		ReorderBufferStreamTXN(rb, txn);
 	}
 	else
 	{
 		/*
-		 * Pick the largest transaction (or subtransaction) and evict it from
-		 * memory by serializing it to disk.
-		 */
+		 * Pick the largest transaction (or subtransaction) and evict it from
+		 * memory by serializing it to disk.
+		 */
 		txn = ReorderBufferLargestTXN(rb);
 
 		/* we know there has to be one, because the size is not zero */
@@ -2950,14 +3161,14 @@ ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
 		Assert(rb->size >= txn->size);
 
 		ReorderBufferSerializeTXN(rb, txn);
-	}
 
-	/*
-	 * After eviction, the transaction should have no entries in memory,
-	 * and should use 0 bytes for changes.
-	 */
-	Assert(txn->size == 0);
-	Assert(txn->nentries_mem == 0);
+		/*
+		 * After eviction, the transaction should have no entries in memory,
+		 * and should use 0 bytes for changes.
+		 */
+		Assert(txn->size == 0);
+		Assert(txn->nentries_mem == 0);
+	}
 }
 
 /* We must be under the memory limit now. */
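The candidate size computed by ReorderBufferLargestTopTXN() can be restated as a helper (sketch only; the patch inlines it):

/* Bytes of a top-level transaction that are currently streamable. */
static Size
streamable_size(ReorderBufferTXN *txn)
{
	/* with an incomplete tail, only the complete prefix may be streamed */
	return rbtxn_has_incomplete_tuple(txn) ? txn->complete_size :
		txn->total_size;
}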
@@ -3356,10 +3567,6 @@ ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 	 */
 	ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
 							command_id, true);
-
-	Assert(dlist_is_empty(&txn->changes));
-	Assert(txn->nentries == 0);
-	Assert(txn->nentries_mem == 0);
 }
 
 /*
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index 95d18cdb12..aa17f7df84 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -67,6 +67,7 @@
 #define XLH_INSERT_LAST_IN_MULTI			(1<<1)
 #define XLH_INSERT_IS_SPECULATIVE			(1<<2)
 #define XLH_INSERT_CONTAINS_NEW_TUPLE		(1<<3)
+#define XLH_INSERT_ON_TOAST_RELATION		(1<<4)
 
 /*
  * xl_heap_update flag values, 8 bits are available.
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index b3e2b3f64b..2d86209f61 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -172,6 +172,8 @@ typedef struct ReorderBufferChange
 #define RBTXN_IS_SUBXACT          0x0002
 #define RBTXN_IS_SERIALIZED       0x0004
 #define RBTXN_IS_STREAMED         0x0008
+#define RBTXN_HAS_TOAST_INSERT    0x0010
+#define RBTXN_HAS_SPEC_INSERT     0x0020
 
 /* Does the transaction have catalog changes? */
 #define rbtxn_has_catalog_changes(txn) \
@@ -191,6 +193,26 @@ typedef struct ReorderBufferChange
 	((txn)->txn_flags & RBTXN_IS_SERIALIZED) != 0 \
 )
 
+/* This transaction's changes include a toast insert, without the main table insert. */
+#define rbtxn_has_toast_insert(txn) \
+( \
+	((txn)->txn_flags & RBTXN_HAS_TOAST_INSERT) != 0 \
+)
+/*
+ * This transaction's changes include a speculative insert, without the
+ * speculative confirm.
+ */
+#define rbtxn_has_spec_insert(txn) \
+( \
+	((txn)->txn_flags & RBTXN_HAS_SPEC_INSERT) != 0 \
+)
+
+/* Check whether this transaction has an incomplete change. */
+#define rbtxn_has_incomplete_tuple(txn) \
+( \
+	rbtxn_has_toast_insert(txn) || rbtxn_has_spec_insert(txn) \
+)
+
 /*
  * Has this transaction been streamed to downstream?
  *
@@ -199,10 +221,6 @@ typedef struct ReorderBufferChange
  * which case we'd have nentries==0 for the toplevel one, which would say
  * nothing about the streaming. So we maintain this flag, but only for the
  * toplevel transaction.)
- *
- * Note: We never do both stream and serialize a transaction (we only spill
- * to disk when streaming is not supported by the plugin), so only one of
- * those two flags may be set at any given time.
 */
 #define rbtxn_is_streamed(txn) \
 ( \
@@ -350,6 +368,23 @@ typedef struct ReorderBufferTXN
 	 * Size of this transaction (changes currently in memory, in bytes).
 	 */
 	Size		size;
+
+	/* Size of top-transaction including sub-transactions. */
+	Size		total_size;
+
+	/* Size of the complete changes. */
+	Size		complete_size;
+
+	/* LSN of the last complete change. */
+	XLogRecPtr	last_complete_lsn;
+
+	/*
+	 * Number of changes processed.  This is used to keep track of changes
+	 * that remain to be streamed.  As of now, this can happen either due to
+	 * toast tuples or speculative insertions, where we need to wait for
+	 * multiple changes before we can send them.
+	 */
+	uint64		nprocessed;
 } ReorderBufferTXN;
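A couple of invariants seem implied by the field comments above; stated as a sketch (these asserts are an assumption, not something the patch adds):

/* Sketch: assumed invariants for the new ReorderBufferTXN fields. */
static void
check_streaming_fields(ReorderBufferTXN *txn)
{
	/* the complete prefix can never exceed the whole transaction */
	Assert(txn->complete_size <= txn->total_size);

	/* processed changes are a subset of the in-memory entries */
	Assert(txn->nprocessed <= txn->nentries_mem);
}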
/* so we can define the callbacks used inside struct ReorderBuffer itself */
@@ -537,7 +572,9 @@
 void		ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *);
 Oid		   *ReorderBufferGetRelids(ReorderBuffer *, int nrelids);
 void		ReorderBufferReturnRelids(ReorderBuffer *, Oid *relids);
-void		ReorderBufferQueueChange(ReorderBuffer *, TransactionId, XLogRecPtr lsn, ReorderBufferChange *);
+void		ReorderBufferQueueChange(ReorderBuffer *, TransactionId,
+									 XLogRecPtr lsn, ReorderBufferChange *,
+									 bool toast_insert);
 void		ReorderBufferQueueMessage(ReorderBuffer *, TransactionId, Snapshot snapshot, XLogRecPtr lsn,
 									  bool transactional, const char *prefix,
 									  Size message_size, const char *message);
-- 
2.23.0

v27/v27-0001-Immediately-WAL-log-subtransaction-and-top-level.patch000644 000765 000024 00000026154 13670577036 026206 0ustar00dilipkumarstaff000000 000000 From 2b84cec6903c50af9264fb0a6efda8356b0f6f2a Mon Sep 17 00:00:00 2001
From: Amit Kapila
Date: Fri, 5 Jun 2020 09:03:16 +0530
Subject: [PATCH v27 01/14] Immediately WAL-log subtransaction and top-level
 XID association.

The logical decoding infrastructure needs to know which top-level
transaction the subxact belongs to, in order to decode all the
changes. Until now that might be delayed until commit, due to the
caching (PGPROC_MAX_CACHED_SUBXIDS), preventing features requiring
incremental decoding.

So we also write the assignment info into WAL immediately, as part
of the next WAL record (to minimize overhead) only when wal_level=logical.
We cannot remove the existing XLOG_XACT_ASSIGNMENT WAL as that is
required for avoiding overflow in the hot standby snapshot.

Author: Tomas Vondra, Dilip Kumar, Amit Kapila
Reviewed-by: Amit Kapila
Tested-by: Neha Sharma and Mahendra Singh Thalor
Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com
---
 src/backend/access/transam/xact.c        | 50 ++++++++++++++++++++++
 src/backend/access/transam/xloginsert.c  | 23 ++++++++++-
 src/backend/access/transam/xlogreader.c  |  5 +++
 src/backend/replication/logical/decode.c | 44 +++++++++++----------
 src/include/access/xact.h                |  3 ++
 src/include/access/xlog.h                |  1 +
 src/include/access/xlogreader.h          |  3 ++
 src/include/access/xlogrecord.h          |  1 +
 8 files changed, 107 insertions(+), 23 deletions(-)
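As a reading aid for the diffs that follow: the writer side (xloginsert.c) attaches the top-level XID as one extra header chunk of SizeOfXLogTransactionId = sizeof(TransactionId) + sizeof(char) = 5 bytes, the reader side (xlogreader.c) parses it into toplevel_xid, and decoding consumes it per record, roughly as below (a sketch that mirrors the decode.c hunk further down):

/* Per-record subxact assignment during decoding (sketch only). */
TransactionId txid = XLogRecGetTopXid(record);	/* InvalidTransactionId if absent */

if (TransactionIdIsValid(txid))
	ReorderBufferAssignChild(ctx->reorder, txid,
							 XLogRecGetXid(record), buf.origptr);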
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index cd30b62d36..04fd5ca870 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -191,6 +191,7 @@ typedef struct TransactionStateData
 	bool		didLogXid;		/* has xid been included in WAL record? */
 	int			parallelModeLevel;	/* Enter/ExitParallelMode counter */
 	bool		chain;			/* start a new block after this one */
+	bool		assigned;		/* assigned to top-level XID */
 	struct TransactionStateData *parent;	/* back link to parent */
 } TransactionStateData;
@@ -223,6 +224,7 @@ typedef struct SerializedTransactionState
 static TransactionStateData TopTransactionStateData = {
 	.state = TRANS_DEFAULT,
 	.blockState = TBLOCK_DEFAULT,
+	.assigned = false,
 };
 
 /*
@@ -5118,6 +5120,7 @@ PushTransaction(void)
 	GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext);
 	s->prevXactReadOnly = XactReadOnly;
 	s->parallelModeLevel = 0;
+	s->assigned = false;
 
 	CurrentTransactionState = s;
@@ -6020,3 +6023,50 @@ xact_redo(XLogReaderState *record)
 	else
 		elog(PANIC, "xact_redo: unknown op code %u", info);
 }
+
+/*
+ * IsSubTransactionAssignmentPending
+ *
+ * This is used to decide whether we need to WAL log the top-level XID for
+ * operation in a subtransaction.  We require that for logical decoding, see
+ * LogicalDecodingProcessRecord.
+ *
+ * This returns true if wal_level >= logical and we are inside a valid
+ * subtransaction, for which the assignment was not yet written to any WAL
+ * record.
+ */
+bool
+IsSubTransactionAssignmentPending(void)
+{
+	/* wal_level has to be logical */
+	if (!XLogLogicalInfoActive())
+		return false;
+
+	/* we need to be in a transaction state */
+	if (!IsTransactionState())
+		return false;
+
+	/* it has to be a subtransaction */
+	if (!IsSubTransaction())
+		return false;
+
+	/* the subtransaction has to have an XID assigned */
+	if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+		return false;
+
+	/* and it should not be already 'assigned' */
+	return !CurrentTransactionState->assigned;
+}
+
+/*
+ * MarkSubTransactionAssigned
+ *
+ * Mark the subtransaction assignment as completed.
+ */
+void
+MarkSubTransactionAssigned(void)
+{
+	Assert(IsSubTransactionAssignmentPending());
+
+	CurrentTransactionState->assigned = true;
+}
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index b21679f09e..c526bb1928 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -89,11 +89,13 @@ static XLogRecData hdr_rdt;
 static char *hdr_scratch = NULL;
 
 #define SizeOfXlogOrigin	(sizeof(RepOriginId) + sizeof(char))
+#define SizeOfXLogTransactionId	(sizeof(TransactionId) + sizeof(char))
 
 #define HEADER_SCRATCH_SIZE \
 	(SizeOfXLogRecord + \
 	 MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \
-	 SizeOfXLogRecordDataHeaderLong + SizeOfXlogOrigin)
+	 SizeOfXLogRecordDataHeaderLong + SizeOfXlogOrigin + \
+	 SizeOfXLogTransactionId)
 
 /*
  * An array of XLogRecData structs, to hold registered data.
@@ -195,6 +197,10 @@ XLogResetInsertion(void) { int i; + /* reset the subxact assignment flag (if needed) */ + if (curinsert_flags & XLOG_INCLUDE_XID) + MarkSubTransactionAssigned(); + for (i = 0; i < max_registered_block_id; i++) registered_buffers[i].in_use = false; @@ -398,7 +404,7 @@ void XLogSetRecordFlags(uint8 flags) { Assert(begininsert_called); - curinsert_flags = flags; + curinsert_flags |= flags; } /* @@ -748,6 +754,19 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, scratch += sizeof(replorigin_session_origin); } + /* followed by toplevel XID, if not already included in previous record */ + if (IsSubTransactionAssignmentPending()) + { + TransactionId xid = GetTopTransactionIdIfAny(); + + /* update the flag (later used by XLogResetInsertion) */ + XLogSetRecordFlags(XLOG_INCLUDE_XID); + + *(scratch++) = (char) XLR_BLOCK_ID_TOPLEVEL_XID; + memcpy(scratch, &xid, sizeof(TransactionId)); + scratch += sizeof(TransactionId); + } + /* followed by main data, if any */ if (mainrdata_len > 0) { diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index cb76be4f46..a757baccfc 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1197,6 +1197,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) state->decoded_record = record; state->record_origin = InvalidRepOriginId; + state->toplevel_xid = InvalidTransactionId; ptr = (char *) record; ptr += SizeOfXLogRecord; @@ -1235,6 +1236,10 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) { COPY_HEADER_FIELD(&state->record_origin, sizeof(RepOriginId)); } + else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) + { + COPY_HEADER_FIELD(&state->toplevel_xid, sizeof(TransactionId)); + } else if (block_id <= XLR_MAX_BLOCK_ID) { /* XLogRecordBlockHeader */ diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index c2e5e3abf8..0c0c371739 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -94,11 +94,27 @@ void LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *record) { XLogRecordBuffer buf; + TransactionId txid; buf.origptr = ctx->reader->ReadRecPtr; buf.endptr = ctx->reader->EndRecPtr; buf.record = record; + txid = XLogRecGetTopXid(record); + + /* + * If the top-level xid is valid, we need to assign the subxact to the + * top-level xact. We need to do this for all records, hence we do it + * before the switch. + */ + if (TransactionIdIsValid(txid)) + { + ReorderBufferAssignChild(ctx->reorder, + txid, + record->decoded_record->xl_xid, + buf.origptr); + } + /* cast so we get a warning when new rmgrs are added */ switch ((RmgrId) XLogRecGetRmid(record)) { @@ -216,13 +232,8 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) /* * If the snapshot isn't yet fully built, we cannot decode anything, so * bail out. - * - * However, it's critical to process XLOG_XACT_ASSIGNMENT records even - * when the snapshot is being built: it is possible to get later records - * that require subxids to be properly assigned. 
*/ - if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT && - info != XLOG_XACT_ASSIGNMENT) + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) return; switch (info) @@ -264,22 +275,13 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) break; } case XLOG_XACT_ASSIGNMENT: - { - xl_xact_assignment *xlrec; - int i; - TransactionId *sub_xid; - xlrec = (xl_xact_assignment *) XLogRecGetData(r); - - sub_xid = &xlrec->xsub[0]; - - for (i = 0; i < xlrec->nsubxacts; i++) - { - ReorderBufferAssignChild(reorder, xlrec->xtop, - *(sub_xid++), buf->origptr); - } - break; - } + /* + * We assign subxact to the toplevel xact while processing each + * record if required. So, we don't need to do anything here. + * See LogicalDecodingProcessRecord. + */ + break; case XLOG_XACT_PREPARE: /* diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 88025b1cc2..22bb96ca2a 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -428,6 +428,9 @@ extern void UnregisterXactCallback(XactCallback callback, void *arg); extern void RegisterSubXactCallback(SubXactCallback callback, void *arg); extern void UnregisterSubXactCallback(SubXactCallback callback, void *arg); +extern bool IsSubTransactionAssignmentPending(void); +extern void MarkSubTransactionAssigned(void); + extern int xactGetCommittedChildren(TransactionId **ptr); extern XLogRecPtr XactLogCommitRecord(TimestampTz commit_time, diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index e917dfe92d..05cc2b696c 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -237,6 +237,7 @@ extern bool XLOG_DEBUG; */ #define XLOG_INCLUDE_ORIGIN 0x01 /* include the replication origin */ #define XLOG_MARK_UNIMPORTANT 0x02 /* record not important for durability */ +#define XLOG_INCLUDE_XID 0x04 /* include XID of top-level xact */ /* Checkpoint statistics */ diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index b0f2a6ed43..b976882229 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -191,6 +191,8 @@ struct XLogReaderState RepOriginId record_origin; + TransactionId toplevel_xid; /* XID of top-level transaction */ + /* information about blocks referenced by the record. 
*/ DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1]; @@ -304,6 +306,7 @@ extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, #define XLogRecGetRmid(decoder) ((decoder)->decoded_record->xl_rmid) #define XLogRecGetXid(decoder) ((decoder)->decoded_record->xl_xid) #define XLogRecGetOrigin(decoder) ((decoder)->record_origin) +#define XLogRecGetTopXid(decoder) ((decoder)->toplevel_xid) #define XLogRecGetData(decoder) ((decoder)->main_data) #define XLogRecGetDataLen(decoder) ((decoder)->main_data_len) #define XLogRecHasAnyBlockRefs(decoder) ((decoder)->max_block_id >= 0) diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index acd9af0194..2f0c8bf589 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -223,5 +223,6 @@ typedef struct XLogRecordDataHeaderLong #define XLR_BLOCK_ID_DATA_SHORT 255 #define XLR_BLOCK_ID_DATA_LONG 254 #define XLR_BLOCK_ID_ORIGIN 253 +#define XLR_BLOCK_ID_TOPLEVEL_XID 252 #endif /* XLOGRECORD_H */ -- 2.23.0 v27/v27-0012-Add-streaming-option-in-pg_dump.patch000644 000765 000024 00000005337 13670411611 023047 0ustar00dilipkumarstaff000000 000000 From 9daf82c29d7d12e813cee550a7786a08ef90667c Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Mon, 27 Apr 2020 15:36:39 +0530 Subject: [PATCH v27 12/14] Add streaming option in pg_dump --- src/bin/pg_dump/pg_dump.c | 9 +++++++-- src/bin/pg_dump/pg_dump.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index dfe43968b8..8ca4a05822 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -4201,6 +4201,7 @@ getSubscriptions(Archive *fout) int i_oid; int i_subname; int i_rolname; + int i_substream; int i_subconninfo; int i_subslotname; int i_subsynccommit; @@ -4235,8 +4236,8 @@ getSubscriptions(Archive *fout) appendPQExpBuffer(query, "SELECT s.tableoid, s.oid, s.subname," "(%s s.subowner) AS rolname, " - " s.subconninfo, s.subslotname, s.subsynccommit, " - " s.subpublications " + " s.substream, s.subconninfo, s.subslotname, " + " s.subsynccommit, s.subpublications " "FROM pg_subscription s " "WHERE s.subdbid = (SELECT oid FROM pg_database" " WHERE datname = current_database())", @@ -4249,6 +4250,7 @@ getSubscriptions(Archive *fout) i_oid = PQfnumber(res, "oid"); i_subname = PQfnumber(res, "subname"); i_rolname = PQfnumber(res, "rolname"); + i_substream = PQfnumber(res, "substream"); i_subconninfo = PQfnumber(res, "subconninfo"); i_subslotname = PQfnumber(res, "subslotname"); i_subsynccommit = PQfnumber(res, "subsynccommit"); @@ -4265,6 +4267,7 @@ getSubscriptions(Archive *fout) AssignDumpId(&subinfo[i].dobj); subinfo[i].dobj.name = pg_strdup(PQgetvalue(res, i, i_subname)); subinfo[i].rolname = pg_strdup(PQgetvalue(res, i, i_rolname)); + subinfo[i].substream = pg_strdup(PQgetvalue(res, i, i_substream)); subinfo[i].subconninfo = pg_strdup(PQgetvalue(res, i, i_subconninfo)); if (PQgetisnull(res, i, i_subslotname)) subinfo[i].subslotname = NULL; @@ -4342,6 +4345,8 @@ dumpSubscription(Archive *fout, SubscriptionInfo *subinfo) else appendPQExpBufferStr(query, "NONE"); + if (strcmp(subinfo->substream, "f") != 0) + appendPQExpBuffer(query, ", streaming = on"); if (strcmp(subinfo->subsynccommit, "off") != 0) appendPQExpBuffer(query, ", synchronous_commit = %s", fmtId(subinfo->subsynccommit)); diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 0c2fcfb3a9..af64270c55 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -623,6 +623,7 @@ 
typedef struct _SubscriptionInfo
 {
 	DumpableObject dobj;
 	char	   *rolname;
+	char	   *substream;
 	char	   *subconninfo;
 	char	   *subslotname;
 	char	   *subsynccommit;
-- 
2.23.0

v27/v27-0013-Change-buffile-interface-required-for-streaming-.patch000644 000765 000024 00000022140 13670411611 026220 0ustar00dilipkumarstaff000000 000000 From f8b755ca0aa9801ab68a0e29a8f1ccad3db1a2c8 Mon Sep 17 00:00:00 2001
From: Dilip Kumar
Date: Thu, 11 Jun 2020 16:40:25 +0530
Subject: [PATCH v27 13/14] Change buffile interface required for streaming
 transaction

Implement BufFileTruncate and support SEEK_END in BufFileSeek.  Also
add an option to provide a mode while opening shared BufFiles, instead
of always opening them in read-only mode.
---
 src/backend/storage/file/buffile.c        | 52 ++++++++++++++++++++---
 src/backend/storage/file/fd.c             | 10 ++---
 src/backend/storage/file/sharedfileset.c  |  7 +--
 src/backend/utils/sort/logtape.c          |  4 +-
 src/backend/utils/sort/sharedtuplestore.c |  2 +-
 src/include/storage/buffile.h             |  4 +-
 src/include/storage/fd.h                  |  2 +-
 src/include/storage/sharedfileset.h       |  3 +-
 8 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 35e8f12e62..184c6d9c3b 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -277,7 +277,7 @@ BufFileCreateShared(SharedFileSet *fileset, const char *name)
 * backends and render it read-only.
 */
BufFile *
-BufFileOpenShared(SharedFileSet *fileset, const char *name)
+BufFileOpenShared(SharedFileSet *fileset, const char *name, int mode)
{
	BufFile    *file;
	char		segment_name[MAXPGPATH];
@@ -301,7 +301,7 @@ BufFileOpenShared(SharedFileSet *fileset, const char *name)
		}
		/* Try to load a segment. */
		SharedSegmentName(segment_name, name, nfiles);
-		files[nfiles] = SharedFileSetOpen(fileset, segment_name);
+		files[nfiles] = SharedFileSetOpen(fileset, segment_name, mode);
		if (files[nfiles] <= 0)
			break;
		++nfiles;
@@ -321,7 +321,7 @@ BufFileOpenShared(SharedFileSet *fileset, const char *name)
	file = makeBufFileCommon(nfiles);
	file->files = files;
-	file->readOnly = true;		/* Can't write to files opened this way */
+	file->readOnly = (mode == O_RDONLY) ? true : false;
	file->fileset = fileset;
	file->name = pstrdup(name);
@@ -670,11 +670,14 @@ BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
			newFile = file->curFile;
			newOffset = (file->curOffset + file->pos) + offset;
			break;
-#ifdef NOT_USED
		case SEEK_END:
-			/* could be implemented, not needed currently */
+			/*
+			 * The file size of the last file gives us the end offset of
+			 * that file.
+			 */
+			newFile = file->numFiles - 1;
+			newOffset = FileSize(file->files[file->numFiles - 1]);
			break;
-#endif
		default:
			elog(ERROR, "invalid whence: %d", whence);
			return EOF;
@@ -843,3 +846,40 @@ BufFileAppend(BufFile *target, BufFile *source)
 
	return startBlock;
}
+
+/*
+ * Truncate the file up to the given fileno and offset.
+ */
+void
+BufFileTruncateShared(BufFile *file, int fileno, off_t offset)
+{
+	int			newFile = file->numFiles;
+	off_t		newOffset;
+	char		segment_name[MAXPGPATH];
+	int			i;
+
+	/* Loop over all the files, down to the fileno to which we want to truncate. */
+	for (i = file->numFiles - 1; i >= fileno; i--)
+	{
+		/*
+		 * All files after fileno can be deleted directly.  The fileno file
+		 * itself can be deleted as well if the offset is 0, unless it is
+		 * the first file.
+		 */
+		if ((i != fileno || offset == 0) && fileno != 0)
+		{
+			SharedSegmentName(segment_name, file->name, i);
+			SharedFileSetDelete(file->fileset, segment_name, true);
+			newFile--;
+			newOffset = MAX_PHYSICAL_FILESIZE;
+		}
+		else
+		{
+			FileTruncate(file->files[i], offset, WAIT_EVENT_BUFFILE_READ);
+			newOffset = offset;
+		}
+	}
+
+	file->numFiles = newFile;
+	file->curOffset = newOffset;
+}
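Tracing BufFileTruncateShared() on a concrete case helps check the boundary conditions (hypothetical three-segment file; sketch only):

/*
 * A shared BufFile with segments 0, 1 and 2:
 *
 *   BufFileTruncateShared(file, 1, 0)
 *     i = 2: segment deleted (i != fileno)
 *     i = 1: segment deleted too (offset == 0 and fileno != 0)
 *     result: numFiles = 1, curOffset = MAX_PHYSICAL_FILESIZE (end of seg 0)
 *
 *   BufFileTruncateShared(file, 1, 100)
 *     i = 2: segment deleted
 *     i = 1: FileTruncate() to 100 bytes
 *     result: numFiles = 2, curOffset = 100
 */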
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 7dc6dd2f15..10591fee18 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1741,18 +1741,18 @@ PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
 /*
 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
 * another backend.  Files opened this way don't count against the
- * temp_file_limit of the caller, are read-only and are automatically closed
- * at the end of the transaction but are not deleted on close.
+ * temp_file_limit of the caller, are read-only if opened in read-only mode,
+ * and are automatically closed at the end of the transaction but are not
+ * deleted on close.
 */
File
-PathNameOpenTemporaryFile(const char *path)
+PathNameOpenTemporaryFile(const char *path, int mode)
{
	File		file;
 
	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
 
-	/* We open the file read-only. */
-	file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
+	file = PathNameOpenFile(path, mode | PG_BINARY);
 
	/* If no such file, then we don't raise an error. */
	if (file <= 0 && errno != ENOENT)
diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c
index f7206c9175..4b39d91320 100644
--- a/src/backend/storage/file/sharedfileset.c
+++ b/src/backend/storage/file/sharedfileset.c
@@ -68,7 +68,8 @@ SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
	}
 
	/* Register our cleanup callback. */
-	on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+	if (seg)
+		on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
}
 
/*
@@ -131,13 +132,13 @@ SharedFileSetCreate(SharedFileSet *fileset, const char *name)
 * another backend.
 */
File
-SharedFileSetOpen(SharedFileSet *fileset, const char *name)
+SharedFileSetOpen(SharedFileSet *fileset, const char *name, int mode)
{
	char		path[MAXPGPATH];
	File		file;
 
	SharedFilePath(path, fileset, name);
-	file = PathNameOpenTemporaryFile(path);
+	file = PathNameOpenTemporaryFile(path, mode);
 
	return file;
}
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 138da0c1b4..6c3114edad 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -78,6 +78,8 @@
 
 #include "postgres.h"
 
+#include <fcntl.h>
+
 #include "storage/buffile.h"
 #include "utils/builtins.h"
 #include "utils/logtape.h"
@@ -544,7 +546,7 @@ ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
		lt = &lts->tapes[i];
 
		pg_itoa(i, filename);
-		file = BufFileOpenShared(fileset, filename);
+		file = BufFileOpenShared(fileset, filename, O_RDONLY);
		filesize = BufFileSize(file);
 
		/*
diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c
index c3ab494a45..efba5dca6e 100644
--- a/src/backend/utils/sort/sharedtuplestore.c
+++ b/src/backend/utils/sort/sharedtuplestore.c
@@ -563,7 +563,7 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
 
			sts_filename(name, accessor, accessor->read_participant);
			accessor->read_file =
-				BufFileOpenShared(accessor->fileset, name);
+				BufFileOpenShared(accessor->fileset, name, O_RDONLY);
		}
 
		/* Seek and load the chunk header. */
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
index 60433f35b4..8b1633415a 100644
--- a/src/include/storage/buffile.h
+++ b/src/include/storage/buffile.h
@@ -48,7 +48,9 @@ extern long BufFileAppend(BufFile *target, BufFile *source);
 
 extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
 extern void BufFileExportShared(BufFile *file);
-extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name);
+extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name,
+								  int mode);
 extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name);
+extern void BufFileTruncateShared(BufFile *file, int fileno, off_t offset);
 
 #endif							/* BUFFILE_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 8cd125d7df..e209f047e8 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -94,7 +94,7 @@ extern mode_t FileGetRawMode(File file);
 /* Operations used for sharing named temporary files */
 extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
-extern File PathNameOpenTemporaryFile(const char *name);
+extern File PathNameOpenTemporaryFile(const char *path, int mode);
 extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure);
 extern void PathNameCreateTemporaryDir(const char *base, const char *name);
 extern void PathNameDeleteTemporaryDir(const char *name);
diff --git a/src/include/storage/sharedfileset.h b/src/include/storage/sharedfileset.h
index 2d6cf077e5..b2f4ba4bd8 100644
--- a/src/include/storage/sharedfileset.h
+++ b/src/include/storage/sharedfileset.h
@@ -37,7 +37,8 @@ typedef struct SharedFileSet
 extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg);
 extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg);
 extern File SharedFileSetCreate(SharedFileSet *fileset, const char *name);
-extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name);
+extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name,
+							  int mode);
extern bool SharedFileSetDelete(SharedFileSet *fileset, const char *name,
								bool error_on_failure);
 extern void SharedFileSetDeleteAll(SharedFileSet *fileset);
-- 
2.23.0

v27/v27-0011-Provide-new-api-to-get-the-streaming-changes.patch000644 000765 000024 00000014524 13670411611 025340 0ustar00dilipkumarstaff000000 000000 From e42ed7aff7f1f7feb70f4f6fb849c0a1376fd1c8 Mon Sep 17 00:00:00 2001
From: Dilip Kumar
Date: Sat, 2 May 2020 11:41:59 +0530
Subject: [PATCH v27 11/14] Provide new api to get the streaming changes

---
 .gitignore                                    |  1 +
 doc/src/sgml/test-decoding.sgml               | 22 ++++++++++++++++++
 src/backend/catalog/system_views.sql          |  8 +++++++
 .../replication/logical/logicalfuncs.c        | 23 +++++++++++++++----
 src/include/catalog/pg_proc.dat               |  9 ++++++++
 5 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index 794e35b73c..6083744c07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,3 +42,4 @@ lib*.pc
 /Debug/
 /Release/
 /tmp_install/
+/build/
diff --git a/doc/src/sgml/test-decoding.sgml b/doc/src/sgml/test-decoding.sgml
index 8356a3d67b..eed6e9d134 100644
--- a/doc/src/sgml/test-decoding.sgml
+++ b/doc/src/sgml/test-decoding.sgml
@@ -39,4 +39,26 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'i
+
+  We can also get the changes of an in-progress transaction; the typical
+  output might be:
+
+
+postgres[33712]=#* SELECT * FROM pg_logical_slot_get_streaming_changes('test_slot', NULL, NULL);
    lsn     | xid |                       data
-----------+-----+--------------------------------------------------
 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503
 0/16B21F8 | 503 | streaming change for TXN 503
 0/16B2300 | 503 | streaming change for TXN 503
 0/16B2408 | 503 | streaming change for TXN 503
 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503
 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503
 0/16BECA8 | 503 | streaming change for TXN 503
 0/16BEDB0 | 503 | streaming change for TXN 503
 0/16BEEB8 | 503 | streaming change for TXN 503
 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503
(10 rows)
+
+
+
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 9f509fbc21..5fe6f28ba2 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1243,6 +1243,14 @@ LANGUAGE INTERNAL
 VOLATILE ROWS 1000 COST 1000
 AS 'pg_logical_slot_get_changes';
 
+CREATE OR REPLACE FUNCTION pg_logical_slot_get_streaming_changes(
+    IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+    OUT lsn pg_lsn, OUT xid xid, OUT data text)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_get_streaming_changes';
+
 CREATE OR REPLACE FUNCTION pg_logical_slot_peek_changes(
     IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
     OUT lsn pg_lsn, OUT xid xid, OUT data text)
diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c
index b99c94e848..70c28ffa91 100644
--- a/src/backend/replication/logical/logicalfuncs.c
+++ b/src/backend/replication/logical/logicalfuncs.c
@@ -108,7 +108,8 @@ check_permissions(void)
 * Helper function for the various SQL callable logical decoding functions.
 */
static Datum
-pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary)
+pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm,
+								 bool binary, bool streaming)
{
	Name		name;
	XLogRecPtr	upto_lsn;
@@ -252,6 +253,9 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin
							NameStr(*name)),
					 errdetail("This slot has never previously reserved WAL, or has been invalidated.")));
 
+		/* If the caller has not asked for streaming changes, disable streaming. */
+		ctx->streaming &= streaming;
+
		MemoryContextSwitchTo(oldcontext);
 
		/*
@@ -362,7 +366,16 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin
Datum
pg_logical_slot_get_changes(PG_FUNCTION_ARGS)
{
-	return pg_logical_slot_get_changes_guts(fcinfo, true, false);
+	return pg_logical_slot_get_changes_guts(fcinfo, true, false, false);
+}
+
+/*
+ * SQL function to get the streaming changes as text, consuming the data.
+ */
+Datum
+pg_logical_slot_get_streaming_changes(PG_FUNCTION_ARGS)
+{
+	return pg_logical_slot_get_changes_guts(fcinfo, true, false, true);
}
 
/*
@@ -371,7 +384,7 @@ pg_logical_slot_get_changes(PG_FUNCTION_ARGS)
Datum
pg_logical_slot_peek_changes(PG_FUNCTION_ARGS)
{
-	return pg_logical_slot_get_changes_guts(fcinfo, false, false);
+	return pg_logical_slot_get_changes_guts(fcinfo, false, false, false);
}
 
/*
@@ -380,7 +393,7 @@ pg_logical_slot_peek_changes(PG_FUNCTION_ARGS)
Datum
pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS)
{
-	return pg_logical_slot_get_changes_guts(fcinfo, true, true);
+	return pg_logical_slot_get_changes_guts(fcinfo, true, true, false);
}
 
/*
@@ -389,7 +402,7 @@ pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS)
Datum
pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS)
{
-	return pg_logical_slot_get_changes_guts(fcinfo, false, true);
+	return pg_logical_slot_get_changes_guts(fcinfo, false, true, false);
}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 7869f721da..875e0bef28 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10115,6 +10115,15 @@
  proargmodes => '{i,i,i,v,o,o,o}',
  proargnames => '{slot_name,upto_lsn,upto_nchanges,options,lsn,xid,data}',
  prosrc => 'pg_logical_slot_get_binary_changes' },
+{ oid => '6150', descr => 'get streaming changes from replication slot',
+  proname => 'pg_logical_slot_get_streaming_changes', procost => '1000',
+  prorows => '1000', provariadic => 'text', proisstrict => 'f',
+  proretset => 't', provolatile => 'v', proparallel => 'u',
+  prorettype => 'record', proargtypes => 'name pg_lsn int4 _text',
+  proallargtypes => '{name,pg_lsn,int4,_text,pg_lsn,xid,text}',
+  proargmodes => '{i,i,i,v,o,o,o}',
+  proargnames => '{slot_name,upto_lsn,upto_nchanges,options,lsn,xid,data}',
+  prosrc => 'pg_logical_slot_get_streaming_changes' },
{ oid => '3784', descr => 'peek at changes from replication slot',
  proname => 'pg_logical_slot_peek_changes', procost => '1000',
  prorows => '1000', provariadic => 'text', proisstrict => 'f',
-- 
2.23.0
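Restating the hunks above, the SQL-callable wrappers map onto the shared guts function like this (sketch; flags in call order):

/* (confirm, binary, streaming) passed by each SQL-callable wrapper. */
struct
{
	const char *fn;
	bool		confirm;
	bool		binary;
	bool		streaming;
}			variants[] =
{
	{"pg_logical_slot_get_changes", true, false, false},
	{"pg_logical_slot_get_streaming_changes", true, false, true},
	{"pg_logical_slot_peek_changes", false, false, false},
	{"pg_logical_slot_get_binary_changes", true, true, false},
	{"pg_logical_slot_peek_binary_changes", false, true, false},
};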
v27/v27-0003-Extend-the-output-plugin-API-with-stream-methods.patch000644 000765 000024 00000106360 13670411611 026174 0ustar00dilipkumarstaff000000 000000 From 92e418399e28686ee274c458f33dff60c2411704 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Thu, 26 Sep 2019 17:26:31 +0200
Subject: [PATCH v27 03/14] Extend the output plugin API with stream methods

This adds seven methods to the output plugin API, adding support for
streaming changes for large transactions.

* stream_message
* stream_change
* stream_truncate
* stream_abort
* stream_commit
* stream_start
* stream_stop

Most of this is a simple extension of the existing methods, with the
semantic difference that the transaction (or subtransaction) is
incomplete and may be aborted later (which is something the regular
API does not really need to deal with).

This also extends the 'test_decoding' plugin, implementing these new
stream methods.

The stream_start/stream_stop methods are used to demarcate a chunk of
changes streamed for a particular toplevel transaction.
---
 contrib/test_decoding/test_decoding.c     | 100 ++++++
 doc/src/sgml/logicaldecoding.sgml         | 213 +++++++++++++
 src/backend/replication/logical/logical.c | 365 ++++++++++++++++++++++
 src/include/replication/logical.h         |   5 +
 src/include/replication/output_plugin.h   |  69 ++++
 src/include/replication/reorderbuffer.h   |  59 ++++
 6 files changed, 811 insertions(+)

diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c
index 93c948856e..64f651fa72 100644
--- a/contrib/test_decoding/test_decoding.c
+++ b/contrib/test_decoding/test_decoding.c
@@ -62,6 +62,28 @@ static void pg_decode_message(LogicalDecodingContext *ctx,
							  ReorderBufferTXN *txn, XLogRecPtr message_lsn,
							  bool transactional, const char *prefix,
							  Size sz, const char *message);
+static void pg_decode_stream_message(LogicalDecodingContext *ctx,
+									 ReorderBufferTXN *txn, XLogRecPtr message_lsn,
+									 bool transactional, const char *prefix,
+									 Size sz, const char *message);
+static void pg_decode_stream_change(LogicalDecodingContext *ctx,
+									ReorderBufferTXN *txn,
+									Relation relation,
+									ReorderBufferChange *change);
+static void pg_decode_stream_truncate(LogicalDecodingContext *ctx,
+									  ReorderBufferTXN *txn,
+									  int nrelations, Relation relations[],
+									  ReorderBufferChange *change);
+static void pg_decode_stream_abort(LogicalDecodingContext *ctx,
+								   ReorderBufferTXN *txn,
+								   XLogRecPtr abort_lsn);
+static void pg_decode_stream_commit(LogicalDecodingContext *ctx,
+									ReorderBufferTXN *txn,
+									XLogRecPtr apply_lsn);
+static void pg_decode_stream_start(LogicalDecodingContext *ctx,
+								   ReorderBufferTXN *txn);
+static void pg_decode_stream_stop(LogicalDecodingContext *ctx,
+								  ReorderBufferTXN *txn);
 
 void
 _PG_init(void)
@@ -83,6 +105,13 @@ _PG_output_plugin_init(OutputPluginCallbacks *cb)
	cb->filter_by_origin_cb = pg_decode_filter;
	cb->shutdown_cb = pg_decode_shutdown;
	cb->message_cb = pg_decode_message;
+	cb->stream_message_cb = pg_decode_stream_message;
+	cb->stream_change_cb = pg_decode_stream_change;
+	cb->stream_truncate_cb = pg_decode_stream_truncate;
+	cb->stream_abort_cb = pg_decode_stream_abort;
+	cb->stream_commit_cb = pg_decode_stream_commit;
+	cb->stream_start_cb = pg_decode_stream_start;
+	cb->stream_stop_cb = pg_decode_stream_stop;
}
 
@@ -540,3 +569,74 @@ pg_decode_message(LogicalDecodingContext *ctx,
	appendBinaryStringInfo(ctx->out, message, sz);
	OutputPluginWrite(ctx, true);
}
+
+static void
+pg_decode_stream_message(LogicalDecodingContext *ctx,
+						 ReorderBufferTXN *txn, XLogRecPtr lsn, bool transactional,
+						 const char *prefix, Size sz, const char *message)
+{
+	OutputPluginPrepareWrite(ctx, true);
+	appendStringInfo(ctx->out, "streaming message: transactional: %d prefix: %s, sz: %zu content:",
+					 transactional, prefix, sz);
+	appendBinaryStringInfo(ctx->out, message, sz);
+	OutputPluginWrite(ctx, true);
+}
+
+static void
+pg_decode_stream_change(LogicalDecodingContext *ctx,
+						ReorderBufferTXN *txn,
+						Relation relation,
+						ReorderBufferChange *change)
+{
+	
OutputPluginPrepareWrite(ctx, true); + appendStringInfo(ctx->out, "streaming change for TXN %u", txn->xid); + OutputPluginWrite(ctx, true); +} + +static void +pg_decode_stream_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + int nrelations, Relation relations[], + ReorderBufferChange *change) +{ + OutputPluginPrepareWrite(ctx, true); + appendStringInfo(ctx->out, "streaming truncate for TXN %u", txn->xid); + OutputPluginWrite(ctx, true); +} + +static void +pg_decode_stream_abort(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr abort_lsn) +{ + OutputPluginPrepareWrite(ctx, true); + appendStringInfo(ctx->out, "aborting streamed (sub)transaction TXN %u", txn->xid); + OutputPluginWrite(ctx, true); +} + +static void +pg_decode_stream_commit(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr apply_lsn) +{ + OutputPluginPrepareWrite(ctx, true); + appendStringInfo(ctx->out, "committing streamed transaction TXN %u", txn->xid); + OutputPluginWrite(ctx, true); +} + +static void +pg_decode_stream_start(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn) +{ + OutputPluginPrepareWrite(ctx, true); + appendStringInfo(ctx->out, "opening a streamed block for transaction TXN %u", txn->xid); + OutputPluginWrite(ctx, true); +} + +static void +pg_decode_stream_stop(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn) +{ + OutputPluginPrepareWrite(ctx, true); + appendStringInfo(ctx->out, "closing a streamed block for transaction TXN %u", txn->xid); + OutputPluginWrite(ctx, true); +} diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml index c89f93cf6b..50cfd6fa47 100644 --- a/doc/src/sgml/logicaldecoding.sgml +++ b/doc/src/sgml/logicaldecoding.sgml @@ -389,6 +389,13 @@ typedef struct OutputPluginCallbacks LogicalDecodeMessageCB message_cb; LogicalDecodeFilterByOriginCB filter_by_origin_cb; LogicalDecodeShutdownCB shutdown_cb; + LogicalDecodeStreamChangeCB stream_change_cb; + LogicalDecodeStreamTruncateCB stream_truncate_cb; + LogicalDecodeStreamMessageCB stream_message_cb; + LogicalDecodeStreamCommitCB stream_commit_cb; + LogicalDecodeStreamAbortCB stream_abort_cb; + LogicalDecodeStreamStartCB stream_start_cb; + LogicalDecodeStreamStopCB stream_stop_cb; } OutputPluginCallbacks; typedef void (*LogicalOutputPluginInit) (struct OutputPluginCallbacks *cb); @@ -401,6 +408,15 @@ typedef void (*LogicalOutputPluginInit) (struct OutputPluginCallbacks *cb); If truncate_cb is not set but a TRUNCATE is to be decoded, the action will be ignored. + + + An output plugin may also define functions to support streaming of large, + in-progress transactions. The stream_change_cb, + stream_commit_cb, stream_abort_cb, + stream_start_cb and stream_stop_cb + are required, while stream_message_cb and + stream_truncate_cb are optional. + @@ -679,6 +695,112 @@ typedef void (*LogicalDecodeMessageCB) (struct LogicalDecodingContext *ctx, + + Stream Start Callback + + The stream_start_cb callback is called when opening + a block of streamed changes from an in-progress transaction. + +typedef void (*LogicalDecodeStreamStartCB) (struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); + + + + + + Stream Stop Callback + + The stream_stop_cb callback is called when closing + a block of streamed changes from an in-progress transaction. 
+
+typedef void (*LogicalDecodeStreamStopCB) (struct LogicalDecodingContext *ctx,
+                                           ReorderBufferTXN *txn);
+
+
+
+   Stream Change Callback
+
+    The stream_change_cb callback is called when sending
+    a change in a block of streamed changes (demarcated by
+    stream_start_cb and stream_stop_cb calls).
+
+typedef void (*LogicalDecodeStreamChangeCB) (struct LogicalDecodingContext *ctx,
+                                             ReorderBufferTXN *txn,
+                                             Relation relation,
+                                             ReorderBufferChange *change);
+
+
+
+
+   Stream Truncate Callback
+
+    The stream_truncate_cb callback is called for a
+    TRUNCATE command in a block of streamed changes
+    (demarcated by stream_start_cb and
+    stream_stop_cb calls).
+
+typedef void (*LogicalDecodeStreamTruncateCB) (struct LogicalDecodingContext *ctx,
+                                               ReorderBufferTXN *txn,
+                                               int nrelations,
+                                               Relation relations[],
+                                               ReorderBufferChange *change);
+
+    The parameters are analogous to the stream_change_cb
+    callback.  However, because TRUNCATE actions on
+    tables connected by foreign keys need to be executed together, this
+    callback receives an array of relations instead of just a single one.
+    See the description of the TRUNCATE statement for
+    details.
+
+
+
+
+   Stream Message Callback
+
+    The stream_message_cb callback is called when sending
+    a generic message in a block of streamed changes (demarcated by
+    stream_start_cb and stream_stop_cb calls).
+
+typedef void (*LogicalDecodeStreamMessageCB) (struct LogicalDecodingContext *ctx,
+                                              ReorderBufferTXN *txn,
+                                              XLogRecPtr message_lsn,
+                                              bool transactional,
+                                              const char *prefix,
+                                              Size message_size,
+                                              const char *message);
+
+
+
+
+   Stream Commit Callback
+
+    The stream_commit_cb callback is called to commit
+    a previously streamed transaction.
+
+typedef void (*LogicalDecodeStreamCommitCB) (struct LogicalDecodingContext *ctx,
+                                             ReorderBufferTXN *txn,
+                                             XLogRecPtr commit_lsn);
+
+
+
+
+   Stream Abort Callback
+
+    The stream_abort_cb callback is called to abort
+    a previously streamed transaction.
+
+typedef void (*LogicalDecodeStreamAbortCB) (struct LogicalDecodingContext *ctx,
+                                            ReorderBufferTXN *txn,
+                                            XLogRecPtr abort_lsn);
+
+
+
@@ -747,4 +869,95 @@ OutputPluginWrite(ctx, true);
+
+
+  Streaming of Large Transactions for Logical Decoding
+
+
+   The basic output plugin callbacks (e.g. begin_cb,
+   change_cb, commit_cb and
+   message_cb) are only invoked when the transaction
+   actually commits.  The changes are still decoded from the transaction
+   log, but are only passed to the output plugin at commit (and discarded
+   if the transaction aborts).
+
+
+
+   This means that while the decoding happens incrementally, and may spill
+   to disk to keep memory usage under control, all the decoded changes have
+   to be transmitted when the transaction finally commits (or more precisely,
+   when the commit is decoded from the transaction log).  Depending on the
+   size of the transaction and network bandwidth, the transfer time may
+   significantly increase the apply lag.
+
+
+
+   To reduce the apply lag caused by large transactions, an output plugin
+   may provide additional callbacks to support incremental streaming of
+   in-progress transactions.  There are multiple required streaming callbacks
+   (stream_change_cb, stream_commit_cb,
+   stream_abort_cb, stream_start_cb
+   and stream_stop_cb) and two optional callbacks
+   (stream_message_cb and stream_truncate_cb).
+
+
+
+   When streaming an in-progress transaction, the changes (and messages) are
+   streamed in blocks demarcated by stream_start_cb
+   and stream_stop_cb callbacks. 
Once all the decoded
+   changes are transmitted, the transaction is committed using the
+   stream_commit_cb callback (or possibly aborted using
+   the stream_abort_cb callback).
+
+
+
+   One example sequence of streaming callback calls for one transaction may
+   look like this:
+
+stream_start_cb(...);   <-- start of first block of changes
+  stream_change_cb(...);
+  stream_change_cb(...);
+  stream_message_cb(...);
+  stream_change_cb(...);
+  ...
+  stream_change_cb(...);
+stream_stop_cb(...);    <-- end of first block of changes
+
+stream_start_cb(...);   <-- start of second block of changes
+  stream_change_cb(...);
+  stream_change_cb(...);
+  stream_change_cb(...);
+  ...
+  stream_message_cb(...);
+  stream_change_cb(...);
+stream_stop_cb(...);    <-- end of second block of changes
+
+stream_commit_cb(...);  <-- commit of the streamed transaction
+
+
+
+
+   The actual sequence of callback calls may be more complicated, of course.
+   There may be blocks for multiple streamed transactions, some of the
+   transactions may get aborted, etc.
+
+
+
+   Similar to the spill-to-disk behavior, streaming is triggered when the
+   total amount of changes decoded from the WAL (for all in-progress
+   transactions) exceeds the limit defined by the
+   logical_decoding_work_mem setting.  At that point the
+   largest toplevel transaction (measured by the amount of memory currently
+   used for decoded changes) is selected and streamed.  However, in some
+   cases we still have to spill to disk even if streaming is enabled,
+   because we may exceed the memory limit before having decoded a complete
+   tuple (e.g. we have decoded a TOAST table insert, but not yet the main
+   table insert).
+
+
+
+   Even when streaming large transactions, the changes are still applied in
+   commit order, preserving the same guarantees as the non-streaming mode.
+
+
+
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index 61902be3b0..26d461effb 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -65,6 +65,23 @@ static void message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
							   XLogRecPtr message_lsn, bool transactional,
							   const char *prefix, Size message_size, const char *message);
 
+/* streaming callbacks */
+static void stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									 Relation relation, ReorderBufferChange *change);
+static void stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									   int nrelations, Relation relations[], ReorderBufferChange *change);
+static void stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									  XLogRecPtr message_lsn, bool transactional,
+									  const char *prefix, Size message_size, const char *message);
+static void stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									XLogRecPtr abort_lsn);
+static void stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									 XLogRecPtr apply_lsn);
+static void stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									XLogRecPtr first_lsn);
+static void stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+								   XLogRecPtr last_lsn);
+
 static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin);
 
 /*
@@ -189,6 +206,39 @@ StartupDecodingContext(List *output_plugin_options,
	ctx->reorder->commit = commit_cb_wrapper;
	ctx->reorder->message = message_cb_wrapper;
 
+	/*
+	 * To support streaming, we require change/commit/abort callbacks. 
The + * message callback is optional, similar to regular output plugins. We + * however enable streaming when at least one of the methods is enabled, + * so that we can easily identify missing methods. + * + * We decide it here, but only check it later in the wrappers. + */ + ctx->streaming = (ctx->callbacks.stream_change_cb != NULL) || + (ctx->callbacks.stream_abort_cb != NULL) || + (ctx->callbacks.stream_message_cb != NULL) || + (ctx->callbacks.stream_truncate_cb != NULL) || + (ctx->callbacks.stream_commit_cb != NULL) || + (ctx->callbacks.stream_start_cb != NULL) || + (ctx->callbacks.stream_stop_cb != NULL); + + /* + * streaming callbacks + * + * stream_message and stream_truncate callbacks are optional, + * so we do not fail with ERROR when missing, but the wrappers + * simply do nothing. We must set the ReorderBuffer callbacks + * to something, otherwise the calls from there will crash (we + * don't want to move the checks there). + */ + ctx->reorder->stream_change = stream_change_cb_wrapper; + ctx->reorder->stream_abort = stream_abort_cb_wrapper; + ctx->reorder->stream_commit = stream_commit_cb_wrapper; + ctx->reorder->stream_start = stream_start_cb_wrapper; + ctx->reorder->stream_stop = stream_stop_cb_wrapper; + ctx->reorder->stream_message = stream_message_cb_wrapper; + ctx->reorder->stream_truncate = stream_truncate_cb_wrapper; + ctx->out = makeStringInfo(); ctx->prepare_write = prepare_write; ctx->write = do_write; @@ -866,6 +916,321 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, error_context_stack = errcallback.previous; } +static void +stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + /* in streaming mode, stream_change_cb is required */ + if (ctx->callbacks.stream_change_cb == NULL) + ereport(ERROR, + (errmsg("Output plugin supports streaming, but has not registered " + "stream_change_cb callback."))); + + ctx->callbacks.stream_change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], + ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. 
*/ + Assert(ctx->streaming); + + /* this callback is optional */ + if (!ctx->callbacks.stream_truncate_cb) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_truncate"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->callbacks.stream_truncate_cb(ctx, txn, nrelations, relations, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* this callback is optional */ + if (ctx->callbacks.stream_message_cb == NULL) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_message"; + state.report_location = message_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn != NULL ? txn->xid : InvalidTransactionId; + ctx->write_location = message_lsn; + + /* do the actual work: call callback */ + ctx->callbacks.stream_message_cb(ctx, txn, message_lsn, transactional, prefix, + message_size, message); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr abort_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_abort"; + state.report_location = abort_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. 
+ */ + ctx->write_location = abort_lsn; + + /* in streaming mode, stream_abort_cb is required */ + if (ctx->callbacks.stream_abort_cb == NULL) + ereport(ERROR, + (errmsg("Output plugin supports streaming, but has not registered " + "stream_abort_cb callback."))); + + ctx->callbacks.stream_abort_cb(ctx, txn, abort_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr apply_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_commit"; + state.report_location = apply_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = apply_lsn; + + /* in streaming mode, stream_commit_cb is required */ + if (ctx->callbacks.stream_commit_cb == NULL) + ereport(ERROR, + (errmsg("Output plugin supports streaming, but has not registered " + "stream_commit_cb callback."))); + + ctx->callbacks.stream_commit_cb(ctx, txn, apply_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr first_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_start"; + state.report_location = first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message.
+ */ + ctx->write_location = first_lsn; + + /* in streaming mode, stream_start_cb is required */ + if (ctx->callbacks.stream_start_cb == NULL) + ereport(ERROR, + (errmsg("Output plugin supports streaming, but has not registered " + "stream_start_cb callback."))); + + ctx->callbacks.stream_start_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr last_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_stop"; + state.report_location = last_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = last_lsn; + + /* in streaming mode, stream_stop_cb is required */ + if (ctx->callbacks.stream_stop_cb == NULL) + ereport(ERROR, + (errmsg("Output plugin supports streaming, but has not registered " + "stream_stop_cb callback."))); + + ctx->callbacks.stream_stop_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + /* * Set the required catalog xmin horizon for historic snapshots in the current * replication slot. diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index c2f2475e5d..deef31825d 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -79,6 +79,11 @@ typedef struct LogicalDecodingContext */ void *output_writer_private; + /* + * Does the output plugin support streaming, and is it enabled? + */ + bool streaming; + /* * State for writing output. */ diff --git a/src/include/replication/output_plugin.h b/src/include/replication/output_plugin.h index 3dd9236c57..0d0a94a648 100644 --- a/src/include/replication/output_plugin.h +++ b/src/include/replication/output_plugin.h @@ -99,6 +99,67 @@ typedef bool (*LogicalDecodeFilterByOriginCB) (struct LogicalDecodingContext *ct */ typedef void (*LogicalDecodeShutdownCB) (struct LogicalDecodingContext *ctx); +/* + * Callback for streaming individual changes from in-progress transactions. + */ +typedef void (*LogicalDecodeStreamChangeCB) (struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + Relation relation, + ReorderBufferChange *change); + +/* + * Callback for streaming truncates from in-progress transactions. + */ +typedef void (*LogicalDecodeStreamTruncateCB) (struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + int nrelations, + Relation relations[], + ReorderBufferChange *change); + +/* + * Callback for streaming generic logical decoding messages from in-progress + * transactions. 
+ */ +typedef void (*LogicalDecodeStreamMessageCB) (struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr message_lsn, + bool transactional, + const char *prefix, + Size message_size, + const char *message); + +/* + * Called to discard changes streamed to remote node from in-progress + * transaction. + */ +typedef void (*LogicalDecodeStreamAbortCB) (struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr abort_lsn); + +/* + * Called to apply changes streamed to remote node from in-progress + * transaction. + */ +typedef void (*LogicalDecodeStreamCommitCB) (struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); + +/* + * Called when starting to stream a block of changes from in-progress + * transaction (may be called repeatedly, if it's streamed in multiple + * chunks). + */ +typedef void (*LogicalDecodeStreamStartCB) (struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); + +/* + * Called when stopping to stream a block of changes from in-progress + * transaction to a remote node (may be called repeatedly, if it's streamed + * in multiple chunks). + */ +typedef void (*LogicalDecodeStreamStopCB) (struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); + /* * Output plugin callbacks */ @@ -112,6 +173,14 @@ typedef struct OutputPluginCallbacks LogicalDecodeMessageCB message_cb; LogicalDecodeFilterByOriginCB filter_by_origin_cb; LogicalDecodeShutdownCB shutdown_cb; + /* streaming of changes */ + LogicalDecodeStreamChangeCB stream_change_cb; + LogicalDecodeStreamTruncateCB stream_truncate_cb; + LogicalDecodeStreamMessageCB stream_message_cb; + LogicalDecodeStreamAbortCB stream_abort_cb; + LogicalDecodeStreamCommitCB stream_commit_cb; + LogicalDecodeStreamStartCB stream_start_cb; + LogicalDecodeStreamStopCB stream_stop_cb; } OutputPluginCallbacks; /* Functions in replication/logical/logical.c */ diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index af35287896..65814af9f5 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -354,6 +354,54 @@ typedef void (*ReorderBufferMessageCB) (ReorderBuffer *rb, const char *prefix, Size sz, const char *message); +/* stream change callback signature */ +typedef void (*ReorderBufferStreamChangeCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn, + Relation relation, + ReorderBufferChange *change); + +/* stream truncate callback signature */ +typedef void (*ReorderBufferStreamTruncateCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn, + int nrelations, + Relation relations[], + ReorderBufferChange *change); + +/* stream message callback signature */ +typedef void (*ReorderBufferStreamMessageCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn, + XLogRecPtr message_lsn, + bool transactional, + const char *prefix, Size sz, + const char *message); + +/* discard streamed transaction callback signature */ +typedef void (*ReorderBufferStreamAbortCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn, + XLogRecPtr abort_lsn); + +/* commit streamed transaction callback signature */ +typedef void (*ReorderBufferStreamCommitCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); + +/* start streaming transaction callback signature */ +typedef void (*ReorderBufferStreamStartCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn, + XLogRecPtr first_lsn); + +/* stop streaming transaction callback signature */ +typedef void (*ReorderBufferStreamStopCB) ( + ReorderBuffer *rb, + 
ReorderBufferTXN *txn, + XLogRecPtr last_lsn); + struct ReorderBuffer { /* @@ -392,6 +440,17 @@ struct ReorderBuffer ReorderBufferCommitCB commit; ReorderBufferMessageCB message; + /* + * Callbacks to be called when streaming a transaction. + */ + ReorderBufferStreamStartCB stream_start; + ReorderBufferStreamStopCB stream_stop; + ReorderBufferStreamChangeCB stream_change; + ReorderBufferStreamTruncateCB stream_truncate; + ReorderBufferStreamMessageCB stream_message; + ReorderBufferStreamAbortCB stream_abort; + ReorderBufferStreamCommitCB stream_commit; + /* * Pointer that will be passed untouched to the callbacks. */ -- 2.23.0 v27/v27-0008-Add-support-for-streaming-to-built-in-replicatio.patch000644 000765 000024 00000263043 13670411611 026301 0ustar00dilipkumarstaff000000 000000 From 6dcecde9d2ebc0e3f4b489248af026d14dd7d6d9 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Thu, 11 Jun 2020 15:34:29 +0530 Subject: [PATCH v27 08/14] Add support for streaming to built-in replication To add support for streaming of in-progress transactions into the built-in logical replication, we need to do three things: * Extend the logical replication protocol, so it can identify in-progress transactions, and allow adding additional bits of information (e.g. XID of subtransactions). * Modify the output plugin (pgoutput) to implement the new stream API callbacks, by leveraging the extended replication protocol. * Modify the replication apply worker, to properly handle streamed in-progress transactions by spilling the data to disk and then replaying them on commit. We however must explicitly disable streaming replication during replication slot creation, even if the plugin supports it. We don't need to replicate the changes accumulated during this phase, and moreover we don't have a replication connection open, so we have nowhere to send the data anyway. --- doc/src/sgml/ref/alter_subscription.sgml | 4 +- doc/src/sgml/ref/create_subscription.sgml | 11 + src/backend/catalog/pg_subscription.c | 1 + src/backend/commands/subscriptioncmds.c | 45 +- src/backend/postmaster/pgstat.c | 12 + .../libpqwalreceiver/libpqwalreceiver.c | 3 + src/backend/replication/logical/proto.c | 140 ++- src/backend/replication/logical/worker.c | 1012 +++++++++++++++++ src/backend/replication/pgoutput/pgoutput.c | 318 +++++- src/backend/replication/slotfuncs.c | 6 + src/backend/replication/walsender.c | 6 + src/include/catalog/pg_subscription.h | 3 + src/include/pgstat.h | 6 +- src/include/replication/logicalproto.h | 42 +- src/include/replication/walreceiver.h | 1 + src/test/subscription/t/009_stream_simple.pl | 86 ++ src/test/subscription/t/010_stream_subxact.pl | 102 ++ src/test/subscription/t/011_stream_ddl.pl | 95 ++ .../t/012_stream_subxact_abort.pl | 82 ++ .../t/013_stream_subxact_ddl_abort.pl | 84 ++ 20 files changed, 2019 insertions(+), 40 deletions(-) create mode 100644 src/test/subscription/t/009_stream_simple.pl create mode 100644 src/test/subscription/t/010_stream_subxact.pl create mode 100644 src/test/subscription/t/011_stream_ddl.pl create mode 100644 src/test/subscription/t/012_stream_subxact_abort.pl create mode 100644 src/test/subscription/t/013_stream_subxact_ddl_abort.pl diff --git a/doc/src/sgml/ref/alter_subscription.sgml b/doc/src/sgml/ref/alter_subscription.sgml index c24ace14d1..d8de56c928 100644 --- a/doc/src/sgml/ref/alter_subscription.sgml +++ b/doc/src/sgml/ref/alter_subscription.sgml @@ -163,8 +163,8 @@ ALTER SUBSCRIPTION name RENAME TO < This clause alters parameters originally set by CREATE SUBSCRIPTION. See there for more - information.
The allowed options are slot_name and - synchronous_commit + information. The allowed options are slot_name, + synchronous_commit and streaming. diff --git a/doc/src/sgml/ref/create_subscription.sgml b/doc/src/sgml/ref/create_subscription.sgml index 5bbc165f70..c25b7c5962 100644 --- a/doc/src/sgml/ref/create_subscription.sgml +++ b/doc/src/sgml/ref/create_subscription.sgml @@ -206,6 +206,17 @@ CREATE SUBSCRIPTION subscription_name + + streaming (boolean) + + + Specifies whether streaming of in-progress transactions should + be enabled for this subscription. By default, all transactions + are fully decoded on the publisher, and only then sent to the + subscriber as a whole. + + + diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index cb15731115..f28482f0f4 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -65,6 +65,7 @@ GetSubscription(Oid subid, bool missing_ok) sub->name = pstrdup(NameStr(subform->subname)); sub->owner = subform->subowner; sub->enabled = subform->subenabled; + sub->stream = subform->substream; /* Get conninfo */ datum = SysCacheGetAttr(SUBSCRIPTIONOID, diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 9ebb026187..9065a1be1b 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -59,7 +59,8 @@ parse_subscription_options(List *options, bool *connect, bool *enabled_given, bool *enabled, bool *create_slot, bool *slot_name_given, char **slot_name, bool *copy_data, char **synchronous_commit, - bool *refresh) + bool *refresh, bool *streaming, + bool *streaming_given) { ListCell *lc; bool connect_given = false; @@ -90,6 +91,8 @@ parse_subscription_options(List *options, bool *connect, bool *enabled_given, *synchronous_commit = NULL; if (refresh) *refresh = true; + if (streaming) + *streaming_given = false; /* Parse options */ foreach(lc, options) @@ -175,6 +178,16 @@ parse_subscription_options(List *options, bool *connect, bool *enabled_given, refresh_given = true; *refresh = defGetBoolean(defel); } + else if (strcmp(defel->defname, "streaming") == 0 && streaming) + { + if (*streaming_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + *streaming_given = true; + *streaming = defGetBoolean(defel); + } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -318,6 +331,8 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) bool enabled_given; bool enabled; bool copy_data; + bool streaming; + bool streaming_given; char *synchronous_commit; char *conninfo; char *slotname; @@ -334,7 +349,7 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) parse_subscription_options(stmt->options, &connect, &enabled_given, &enabled, &create_slot, &slotname_given, &slotname, ©_data, &synchronous_commit, - NULL); + NULL, &streaming, &streaming_given); /* * Since creating a replication slot is not transactional, rolling back @@ -412,6 +427,13 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) values[Anum_pg_subscription_subpublications - 1] = publicationListToArray(publications); + if (streaming_given) + values[Anum_pg_subscription_substream - 1] = + BoolGetDatum(streaming); + else + values[Anum_pg_subscription_substream - 1] = + BoolGetDatum(false); + tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); /* Insert tuple into catalog. 
*/ @@ -669,10 +691,13 @@ AlterSubscription(AlterSubscriptionStmt *stmt) char *slotname; bool slotname_given; char *synchronous_commit; + bool streaming; + bool streaming_given; parse_subscription_options(stmt->options, NULL, NULL, NULL, NULL, &slotname_given, &slotname, - NULL, &synchronous_commit, NULL); + NULL, &synchronous_commit, NULL, + &streaming, &streaming_given); if (slotname_given) { @@ -697,6 +722,13 @@ AlterSubscription(AlterSubscriptionStmt *stmt) replaces[Anum_pg_subscription_subsynccommit - 1] = true; } + if (streaming_given) + { + values[Anum_pg_subscription_substream - 1] = + BoolGetDatum(streaming); + replaces[Anum_pg_subscription_substream - 1] = true; + } + update_tuple = true; break; } @@ -708,7 +740,8 @@ AlterSubscription(AlterSubscriptionStmt *stmt) parse_subscription_options(stmt->options, NULL, &enabled_given, &enabled, NULL, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, + NULL, NULL); Assert(enabled_given); if (!sub->slotname && enabled) @@ -746,7 +779,7 @@ AlterSubscription(AlterSubscriptionStmt *stmt) parse_subscription_options(stmt->options, NULL, NULL, NULL, NULL, NULL, NULL, ©_data, - NULL, &refresh); + NULL, &refresh, NULL, NULL); values[Anum_pg_subscription_subpublications - 1] = publicationListToArray(stmt->publication); @@ -783,7 +816,7 @@ AlterSubscription(AlterSubscriptionStmt *stmt) parse_subscription_options(stmt->options, NULL, NULL, NULL, NULL, NULL, NULL, ©_data, - NULL, NULL); + NULL, NULL, NULL, NULL); AlterSubscription_refresh(sub, copy_data); diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 309378ae54..6713392d4d 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4139,6 +4139,18 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_WAL_WRITE: event_name = "WALWrite"; break; + case WAIT_EVENT_LOGICAL_CHANGES_READ: + event_name = "ReorderLogicalChangesRead"; + break; + case WAIT_EVENT_LOGICAL_CHANGES_WRITE: + event_name = "ReorderLogicalChangesWrite"; + break; + case WAIT_EVENT_LOGICAL_SUBXACT_READ: + event_name = "ReorderLogicalSubxactRead"; + break; + case WAIT_EVENT_LOGICAL_SUBXACT_WRITE: + event_name = "ReorderLogicalSubxactWrite"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index e4fd1f9bb6..5257ab0394 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -408,6 +408,9 @@ libpqrcv_startstreaming(WalReceiverConn *conn, appendStringInfo(&cmd, "proto_version '%u'", options->proto.logical.proto_version); + if (options->proto.logical.streaming) + appendStringInfo(&cmd, ", streaming 'on'"); + pubnames = options->proto.logical.publication_names; pubnames_str = stringlist_to_identifierstr(conn->streamConn, pubnames); if (!pubnames_str) diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c index 3c6d0cd171..83d0642cf3 100644 --- a/src/backend/replication/logical/proto.c +++ b/src/backend/replication/logical/proto.c @@ -139,10 +139,15 @@ logicalrep_read_origin(StringInfo in, XLogRecPtr *origin_lsn) * Write INSERT to the output stream. 
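* * A note on the protocol extension in this patch: for changes that belong to a * streamed (in-progress) transaction, the write functions below prepend the XID * of the (sub)transaction to the message; when the passed xid is * InvalidTransactionId (i.e. we're not streaming), the field is omitted entirely, * as implemented by the TransactionIdIsValid(xid) checks below.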
*/ void -logicalrep_write_insert(StringInfo out, Relation rel, HeapTuple newtuple) +logicalrep_write_insert(StringInfo out, TransactionId xid, + Relation rel, HeapTuple newtuple) { pq_sendbyte(out, 'I'); /* action INSERT */ + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + /* use Oid as relation identifier */ pq_sendint32(out, RelationGetRelid(rel)); @@ -178,8 +183,8 @@ logicalrep_read_insert(StringInfo in, LogicalRepTupleData *newtup) * Write UPDATE to the output stream. */ void -logicalrep_write_update(StringInfo out, Relation rel, HeapTuple oldtuple, - HeapTuple newtuple) +logicalrep_write_update(StringInfo out, TransactionId xid, Relation rel, + HeapTuple oldtuple, HeapTuple newtuple) { pq_sendbyte(out, 'U'); /* action UPDATE */ @@ -187,6 +192,10 @@ logicalrep_write_update(StringInfo out, Relation rel, HeapTuple oldtuple, rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX); + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + /* use Oid as relation identifier */ pq_sendint32(out, RelationGetRelid(rel)); @@ -248,7 +257,8 @@ logicalrep_read_update(StringInfo in, bool *has_oldtuple, * Write DELETE to the output stream. */ void -logicalrep_write_delete(StringInfo out, Relation rel, HeapTuple oldtuple) +logicalrep_write_delete(StringInfo out, TransactionId xid, + Relation rel, HeapTuple oldtuple) { Assert(rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT || rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || @@ -256,6 +266,10 @@ logicalrep_write_delete(StringInfo out, Relation rel, HeapTuple oldtuple) pq_sendbyte(out, 'D'); /* action DELETE */ + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + /* use Oid as relation identifier */ pq_sendint32(out, RelationGetRelid(rel)); @@ -296,6 +310,7 @@ logicalrep_read_delete(StringInfo in, LogicalRepTupleData *oldtup) */ void logicalrep_write_truncate(StringInfo out, + TransactionId xid, int nrelids, Oid relids[], bool cascade, bool restart_seqs) @@ -305,6 +320,10 @@ logicalrep_write_truncate(StringInfo out, pq_sendbyte(out, 'T'); /* action TRUNCATE */ + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + pq_sendint32(out, nrelids); /* encode and send truncate flags */ @@ -347,12 +366,16 @@ logicalrep_read_truncate(StringInfo in, * Write relation description to the output stream. */ void -logicalrep_write_rel(StringInfo out, Relation rel) +logicalrep_write_rel(StringInfo out, TransactionId xid, Relation rel) { char *relname; pq_sendbyte(out, 'R'); /* sending RELATION */ + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + /* use Oid as relation identifier */ pq_sendint32(out, RelationGetRelid(rel)); @@ -397,7 +420,7 @@ logicalrep_read_rel(StringInfo in) * This function will always write base type info. 
*/ void -logicalrep_write_typ(StringInfo out, Oid typoid) +logicalrep_write_typ(StringInfo out, TransactionId xid, Oid typoid) { Oid basetypoid = getBaseType(typoid); HeapTuple tup; @@ -405,6 +428,10 @@ logicalrep_write_typ(StringInfo out, Oid typoid) pq_sendbyte(out, 'Y'); /* sending TYPE */ + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(basetypoid)); if (!HeapTupleIsValid(tup)) elog(ERROR, "cache lookup failed for type %u", basetypoid); @@ -685,3 +712,104 @@ logicalrep_read_namespace(StringInfo in) return nspname; } + +void +logicalrep_write_stream_start(StringInfo out, + TransactionId xid, bool first_segment) +{ + pq_sendbyte(out, 'S'); /* action STREAM START */ + + Assert(TransactionIdIsValid(xid)); + + /* transaction ID (we're starting to stream, so must be valid) */ + pq_sendint32(out, xid); + + /* 1 if this is the first streaming segment for this xid */ + pq_sendbyte(out, first_segment ? 1 : 0); +} + +TransactionId +logicalrep_read_stream_start(StringInfo in, bool *first_segment) +{ + TransactionId xid; + + Assert(first_segment); + + xid = pq_getmsgint(in, 4); + *first_segment = (pq_getmsgbyte(in) == 1); + + return xid; +} + +void +logicalrep_write_stream_stop(StringInfo out) +{ + pq_sendbyte(out, 'E'); /* action STREAM END */ +} + +void +logicalrep_write_stream_commit(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, 'c'); /* action STREAM COMMIT */ + + Assert(TransactionIdIsValid(txn->xid)); + + /* transaction ID */ + pq_sendint32(out, txn->xid); + + /* send the flags field (unused for now) */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, commit_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->commit_time); +} + +TransactionId +logicalrep_read_stream_commit(StringInfo in, LogicalRepCommitData *commit_data) +{ + TransactionId xid; + uint8 flags; + + xid = pq_getmsgint(in, 4); + + /* read flags (unused for now) */ + flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit message", flags); + + /* read fields */ + commit_data->commit_lsn = pq_getmsgint64(in); + commit_data->end_lsn = pq_getmsgint64(in); + commit_data->committime = pq_getmsgint64(in); + + return xid; +} + +void +logicalrep_write_stream_abort(StringInfo out, TransactionId xid, + TransactionId subxid) +{ + pq_sendbyte(out, 'A'); /* action STREAM ABORT */ + + Assert(TransactionIdIsValid(xid) && TransactionIdIsValid(subxid)); + + /* transaction ID */ + pq_sendint32(out, xid); + pq_sendint32(out, subxid); +} + +void +logicalrep_read_stream_abort(StringInfo in, TransactionId *xid, + TransactionId *subxid) +{ + Assert(xid && subxid); + + *xid = pq_getmsgint(in, 4); + *subxid = pq_getmsgint(in, 4); +} diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index a752a1224d..d2d9469999 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -18,11 +18,32 @@ * This module includes server facing code and shares libpqwalreceiver * module with walreceiver for providing the libpq specific functionality. * + * + * STREAMED TRANSACTIONS + * --------------------- + * + * Streamed transactions (large transactions exceeding a memory limit on the + * upstream) are not applied immediately, but instead the data is written + * to files and then applied at once when the final commit arrives. 
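+ * + * Each change is serialized to the file as a simple length-prefixed record: an + * int32 length, a one-byte action code, and the remaining message payload (with + * the subxact XID already stripped); see stream_write_change() below.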
+ * + * Unlike the regular (non-streamed) case, handling streamed transactions + * requires handling aborts of both the toplevel transaction and + * subtransactions. This is achieved by tracking offsets for subtransactions, + * which are then used to truncate the file with serialized changes. + * + * The files are placed in the temporary-file directory by default, and the + * filenames include both the XID of the toplevel transaction and the OID of + * the subscription. This is necessary so that different workers processing a + * remote transaction with the same XID don't interfere. + * *------------------------------------------------------------------------- */ #include "postgres.h" +#include <sys/stat.h> +#include <unistd.h> + #include "access/table.h" #include "access/tableam.h" #include "access/xact.h" @@ -33,6 +54,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_subscription.h" #include "catalog/pg_subscription_rel.h" +#include "catalog/pg_tablespace.h" #include "commands/tablecmds.h" #include "commands/trigger.h" #include "executor/executor.h" @@ -64,6 +86,7 @@ #include "replication/worker_internal.h" #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/proc.h" @@ -71,6 +94,7 @@ #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/catcache.h" +#include "utils/dynahash.h" #include "utils/datum.h" #include "utils/fmgroids.h" #include "utils/guc.h" @@ -100,6 +124,7 @@ typedef struct SlotErrCallbackArg } SlotErrCallbackArg; static MemoryContext ApplyMessageContext = NULL; +static MemoryContext LogicalStreamingContext = NULL; MemoryContext ApplyContext = NULL; WalReceiverConn *wrconn = NULL; @@ -110,12 +135,58 @@ bool MySubscriptionValid = false; bool in_remote_transaction = false; static XLogRecPtr remote_final_lsn = InvalidXLogRecPtr; +/* fields valid only when processing streamed transaction */ +bool in_streamed_transaction = false; + +static TransactionId stream_xid = InvalidTransactionId; +static int stream_fd = -1; + +typedef struct SubXactInfo +{ + TransactionId xid; /* XID of the subxact */ + off_t offset; /* offset in the file */ +} SubXactInfo; + +static uint32 nsubxacts = 0; +static uint32 nsubxacts_max = 0; +static SubXactInfo *subxacts = NULL; +static TransactionId subxact_last = InvalidTransactionId; + +static void subxact_filename(char *path, Oid subid, TransactionId xid); +static void changes_filename(char *path, Oid subid, TransactionId xid); + +/* + * Information about subtransactions of a given toplevel transaction. + */ +static void subxact_info_write(Oid subid, TransactionId xid); +static void subxact_info_read(Oid subid, TransactionId xid); +static void subxact_info_add(TransactionId xid); +static inline void cleanup_subxact_info(void); + +/* + * Serialize and deserialize changes for a toplevel transaction. + */ +static void stream_cleanup_files(Oid subid, TransactionId xid, bool missing_ok); +static void stream_open_file(Oid subid, TransactionId xid, bool first); +static void stream_write_change(char action, StringInfo s); +static void stream_close_file(void); + +/* + * Array of serialized XIDs.
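+ * + * These are the toplevel transactions for which this worker currently has + * changes spilled to files; worker_onexit() walks this array to remove any + * leftover files when the worker exits.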
+ */ +static int nxids = 0; +static int maxnxids = 0; +static TransactionId *xids = NULL; + static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); static void store_flush_position(XLogRecPtr remote_lsn); static void maybe_reread_subscription(void); +/* prototype needed because of stream_commit */ +static void apply_dispatch(StringInfo s); + static void apply_handle_insert_internal(ResultRelInfo *relinfo, EState *estate, TupleTableSlot *remoteslot); static void apply_handle_update_internal(ResultRelInfo *relinfo, @@ -187,6 +258,42 @@ ensure_transaction(void) return true; } +/* + * Handle streamed transactions. + * + * If in streaming mode (receiving a block of streamed transaction), we + * simply redirect it to a file for the proper toplevel transaction. + * + * Returns true for streamed transactions, false otherwise (regular mode). + */ +static bool +handle_streamed_transaction(const char action, StringInfo s) +{ + TransactionId xid; + + /* not in streaming mode */ + if (!in_streamed_transaction) + return false; + + Assert(stream_fd != -1); + Assert(TransactionIdIsValid(stream_xid)); + + /* + * We should have received XID of the subxact as the first part of the + * message, so extract it. + */ + xid = pq_getmsgint(s, 4); + + Assert(TransactionIdIsValid(xid)); + + /* Add the new subxact to the array (unless already there). */ + subxact_info_add(xid); + + /* write the change to the current file */ + stream_write_change(action, s); + + return true; +} /* * Executor state preparation for evaluation of constraint expressions, @@ -552,6 +659,326 @@ apply_handle_origin(StringInfo s) errmsg("ORIGIN message sent out of order"))); } +/* + * Handle STREAM START message. + */ +static void +apply_handle_stream_start(StringInfo s) +{ + bool first_segment; + + Assert(!in_streamed_transaction); + + /* notify handle methods we're processing a remote transaction */ + in_streamed_transaction = true; + + /* extract XID of the top-level transaction */ + stream_xid = logicalrep_read_stream_start(s, &first_segment); + + /* open the spool file for this transaction */ + stream_open_file(MyLogicalRepWorker->subid, stream_xid, first_segment); + + /* + * if this is not the first segment, open existing file + * + * XXX Note that the cleanup is performed by stream_open_file. + */ + if (!first_segment) + { + MemoryContext oldctx = MemoryContextSwitchTo(LogicalStreamingContext); + + /* Read the subxacts info in per-stream context. */ + subxact_info_read(MyLogicalRepWorker->subid, stream_xid); + MemoryContextSwitchTo(oldctx); + } + + pgstat_report_activity(STATE_RUNNING, NULL); +} + +/* + * Handle STREAM STOP message. + */ +static void +apply_handle_stream_stop(StringInfo s) +{ + Assert(in_streamed_transaction); + + /* + * Close the file with serialized changes, and serialize information about + * subxacts for the toplevel transaction. + */ + subxact_info_write(MyLogicalRepWorker->subid, stream_xid); + stream_close_file(); + + in_streamed_transaction = false; + + /* Reset per-stream context */ + MemoryContextReset(LogicalStreamingContext); + + pgstat_report_activity(STATE_IDLE, NULL); +} + +/* + * Handle STREAM abort message. + */ +static void +apply_handle_stream_abort(StringInfo s) +{ + TransactionId xid; + TransactionId subxid; + + Assert(!in_streamed_transaction); + + logicalrep_read_stream_abort(s, &xid, &subxid); + + /* + * If the two XIDs are the same, it's in fact abort of toplevel xact, so + * just delete the files with serialized info. 
+ */ + if (xid == subxid) + { + stream_cleanup_files(MyLogicalRepWorker->subid, xid, false); + return; + } + else + { + /* + * OK, so it's a subxact. We need to read the subxact file for the + * toplevel transaction, determine the offset tracked for the subxact, + * and truncate the file with changes. We also remove the subxacts + * with higher offsets (or rather higher XIDs). + * + * We intentionally scan the array from the tail, because we're likely + * aborting a change for the most recent subtransactions. + * + * XXX Can we rely on the subxact XIDs arriving in sorted order? That + * would allow us to use binary search here. + * + * XXX Or perhaps we can rely on the aborts to arrive in the reverse + * order, i.e. from the inner-most subxact (when nested)? In which + * case we could simply check the last element. + */ + + int64 i; + int64 subidx; + int fd; + bool found = false; + char path[MAXPGPATH]; + + subidx = -1; + subxact_info_read(MyLogicalRepWorker->subid, xid); + + /* XXX optimize the search by bsearch on sorted data */ + for (i = nsubxacts; i > 0; i--) + { + if (subxacts[i - 1].xid == subxid) + { + subidx = (i - 1); + found = true; + break; + } + } + + /* + * If it's an empty sub-transaction then we will not find the subxid + * here so just cleanup the subxact info and return. + */ + if (!found) + { + /* Cleanup the subxact info */ + cleanup_subxact_info(); + return; + } + + Assert((subidx >= 0) && (subidx < nsubxacts)); + + changes_filename(path, MyLogicalRepWorker->subid, xid); + fd = OpenTransientFile(path, O_WRONLY | PG_BINARY); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } + + /* OK, truncate the file at the right offset. */ + if (ftruncate(fd, subxacts[subidx].offset)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\": %m", path))); + CloseTransientFile(fd); + + /* discard the subxacts added later */ + nsubxacts = subidx; + + /* write the updated subxact list */ + subxact_info_write(MyLogicalRepWorker->subid, xid); + } +} + +/* + * Handle STREAM COMMIT message. + */ +static void +apply_handle_stream_commit(StringInfo s) +{ + int fd; + TransactionId xid; + StringInfoData s2; + int nchanges; + + char path[MAXPGPATH]; + char *buffer = NULL; + LogicalRepCommitData commit_data; + + MemoryContext oldcxt; + + Assert(!in_streamed_transaction); + + xid = logicalrep_read_stream_commit(s, &commit_data); + + elog(DEBUG1, "received commit for streamed transaction %u", xid); + + /* open the spool file for the committed transaction */ + changes_filename(path, MyLogicalRepWorker->subid, xid); + + elog(DEBUG1, "replaying changes from file '%s'", path); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } + + ensure_transaction(); + + oldcxt = MemoryContextSwitchTo(TopTransactionContext); + + buffer = palloc(8192); + initStringInfo(&s2); + + MemoryContextSwitchTo(oldcxt); + + /* + * Make sure the handle apply_dispatch methods are aware we're in a remote + * transaction. + */ + in_remote_transaction = true; + pgstat_report_activity(STATE_RUNNING, NULL); + + /* + * Read the entries one by one and pass them through the same logic as in + * apply_dispatch. 
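+ * + * Each iteration reads one length-prefixed record written by + * stream_write_change(), copies it into a StringInfo, and feeds it to + * apply_dispatch() exactly as if it had just arrived from the walsender.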
+ */ + nchanges = 0; + while (true) + { + int nbytes; + int len; + + /* read length of the on-disk record */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_CHANGES_READ); + nbytes = read(fd, &len, sizeof(len)); + pgstat_report_wait_end(); + + /* have we reached end of the file? */ + if (nbytes == 0) + break; + + /* do we have a correct length? */ + if (nbytes != sizeof(len)) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file: %m"))); + return; + } + + Assert(len > 0); + + /* make sure we have sufficiently large buffer */ + buffer = repalloc(buffer, len); + + /* and finally read the data into the buffer */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_CHANGES_READ); + if (read(fd, buffer, len) != len) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file: %m"))); + return; + } + pgstat_report_wait_end(); + + /* copy the buffer to the stringinfo and call apply_dispatch */ + resetStringInfo(&s2); + appendBinaryStringInfo(&s2, buffer, len); + + /* Ensure we are reading the data into our memory context. */ + oldcxt = MemoryContextSwitchTo(ApplyMessageContext); + + apply_dispatch(&s2); + + MemoryContextReset(ApplyMessageContext); + + MemoryContextSwitchTo(oldcxt); + + nchanges++; + + if (nchanges % 1000 == 0) + elog(DEBUG1, "replayed %d changes from file '%s'", + nchanges, path); + + /* + * send feedback to upstream + * + * XXX Probably should send a valid LSN. But which one? + */ + send_feedback(InvalidXLogRecPtr, false, false); + } + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + /* + * Update origin state so we can restart streaming from correct + * position in case of crash. + */ + replorigin_session_origin_lsn = commit_data.end_lsn; + replorigin_session_origin_timestamp = commit_data.committime; + + pfree(buffer); + pfree(s2.data); + + CommitTransactionCommand(); + pgstat_report_stat(false); + + store_flush_position(commit_data.end_lsn); + + elog(DEBUG1, "replayed %d (all) changes from file '%s'", + nchanges, path); + + in_remote_transaction = false; + pgstat_report_activity(STATE_IDLE, NULL); + + /* unlink the files with serialized changes and subxact info */ + stream_cleanup_files(MyLogicalRepWorker->subid, xid, false); +} + /* * Handle RELATION message. 
* @@ -565,6 +992,9 @@ apply_handle_relation(StringInfo s) { LogicalRepRelation *rel; + if (handle_streamed_transaction('R', s)) + return; + rel = logicalrep_read_rel(s); logicalrep_relmap_update(rel); } @@ -580,6 +1010,9 @@ apply_handle_type(StringInfo s) { LogicalRepTyp typ; + if (handle_streamed_transaction('Y', s)) + return; + logicalrep_read_typ(s, &typ); logicalrep_typmap_update(&typ); } @@ -616,6 +1049,9 @@ apply_handle_insert(StringInfo s) TupleTableSlot *remoteslot; MemoryContext oldctx; + if (handle_streamed_transaction('I', s)) + return; + ensure_transaction(); relid = logicalrep_read_insert(s, &newtup); @@ -731,6 +1167,9 @@ apply_handle_update(StringInfo s) RangeTblEntry *target_rte; MemoryContext oldctx; + if (handle_streamed_transaction('U', s)) + return; + ensure_transaction(); relid = logicalrep_read_update(s, &has_oldtup, &oldtup, @@ -873,6 +1312,9 @@ apply_handle_delete(StringInfo s) TupleTableSlot *remoteslot; MemoryContext oldctx; + if (handle_streamed_transaction('D', s)) + return; + ensure_transaction(); relid = logicalrep_read_delete(s, &oldtup); @@ -1243,6 +1685,9 @@ apply_handle_truncate(StringInfo s) List *relids_logged = NIL; ListCell *lc; + if (handle_streamed_transaction('T', s)) + return; + ensure_transaction(); remote_relids = logicalrep_read_truncate(s, &cascade, &restart_seqs); @@ -1381,6 +1826,22 @@ apply_dispatch(StringInfo s) case 'O': apply_handle_origin(s); break; + /* STREAM START */ + case 'S': + apply_handle_stream_start(s); + break; + /* STREAM END */ + case 'E': + apply_handle_stream_stop(s); + break; + /* STREAM ABORT */ + case 'A': + apply_handle_stream_abort(s); + break; + /* STREAM COMMIT */ + case 'c': + apply_handle_stream_commit(s); + break; default: ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -1477,6 +1938,22 @@ UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply) } } +/* + * Cleanup function. + * + * Called on logical replication worker exit. + */ +static void +worker_onexit(int code, Datum arg) +{ + int i; + + elog(LOG, "cleanup files for %d transactions", nxids); + + for (i = nxids - 1; i >= 0; i--) + stream_cleanup_files(MyLogicalRepWorker->subid, xids[i], true); +} + /* * Apply main loop. */ @@ -1493,6 +1970,17 @@ LogicalRepApplyLoop(XLogRecPtr last_received) "ApplyMessageContext", ALLOCSET_DEFAULT_SIZES); + /* + * This memory context is used for per-stream data when streaming mode is + * enabled. This context is reset on each stream stop. + */ + LogicalStreamingContext = AllocSetContextCreate(ApplyContext, + "LogicalStreamingContext", + ALLOCSET_DEFAULT_SIZES); + + /* do cleanup on worker exit (e.g. after DROP SUBSCRIPTION) */ + before_shmem_exit(worker_onexit, (Datum) 0); + /* mark as idle, before starting to loop */ pgstat_report_activity(STATE_IDLE, NULL); @@ -1941,6 +2429,529 @@ subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue) MySubscriptionValid = false; } +/* + * subxact_info_write + * Store information about subxacts for a toplevel transaction. + * + * For each subxact we store the offset of its first change in the main file. + * The file is always over-written as a whole. + * + * XXX We should only store subxacts that were not aborted yet.
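+ * + * On disk, the file is a uint32 count of subxacts followed by that many + * SubXactInfo entries (subxact XID and file offset), which is exactly what + * subxact_info_read() expects back.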
+ */ +static void +subxact_info_write(Oid subid, TransactionId xid) +{ + int fd; + char path[MAXPGPATH]; + Size len; + + Assert(TransactionIdIsValid(xid)); + + subxact_filename(path, subid, xid); + + fd = OpenTransientFile(path, O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + path))); + return; + } + + len = sizeof(SubXactInfo) * nsubxacts; + + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_SUBXACT_WRITE); + + if (write(fd, &nsubxacts, sizeof(nsubxacts)) != sizeof(nsubxacts)) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + path))); + return; + } + + if ((len > 0) && (write(fd, subxacts, len) != len)) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + path))); + return; + } + + pgstat_report_wait_end(); + + /* + * We don't need to fsync or anything, as we'll recreate the files after a + * crash from scratch. So just close the file. + */ + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + /* + * But we free the memory allocated for subxact info. There might be one + * exceptional transaction with many subxacts, and we don't want to keep + * the memory allocated forever. + */ + cleanup_subxact_info(); +} + +/* + * subxact_info_read + * Restore information about subxacts of a streamed transaction. + * + * Read information about subxacts into the global variables. + * + * XXX Add calls to pgstat_report_wait_start/pgstat_report_wait_end. + */ +static void +subxact_info_read(Oid subid, TransactionId xid) +{ + int fd; + char path[MAXPGPATH]; + Size len; + + Assert(TransactionIdIsValid(xid)); + Assert(!subxacts); + Assert(nsubxacts == 0); + Assert(nsubxacts_max == 0); + + subxact_filename(path, subid, xid); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + return; + } + + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_SUBXACT_READ); + + /* read number of subxact items */ + if (read(fd, &nsubxacts, sizeof(nsubxacts)) != sizeof(nsubxacts)) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + return; + } + + pgstat_report_wait_end(); + + len = sizeof(SubXactInfo) * nsubxacts; + + /* we keep the maximum as a power of 2 */ + nsubxacts_max = 1 << my_log2(nsubxacts); + + /* + * Let the caller decide in which memory context it will be allocated. + * Ideally, during stream start it will be allocated in the + * LogicalStreamingContext which will be reset on stream stop, and + * during the stream abort we need this memory only for short term so + * it will be allocated in ApplyMessageContext.
+ */ + subxacts = palloc(nsubxacts_max * sizeof(SubXactInfo)); + + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_SUBXACT_READ); + + if ((len > 0) && ((read(fd, subxacts, len)) != len)) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + return; + } + + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + +/* + * subxact_info_add + * Add information about a subxact (offset in the main file). + */ +static void +subxact_info_add(TransactionId xid) +{ + int64 i; + + /* We must have a valid top level stream xid and a stream fd. */ + Assert(TransactionIdIsValid(stream_xid)); + Assert(stream_fd >= 0); + + /* + * If the XID matches the toplevel transaction, we don't want to add it. + */ + if (stream_xid == xid) + return; + + /* + * In most cases we're checking the same subxact as we've already seen in + * the last call, so make sure to ignore it (this change comes later). + */ + if (subxact_last == xid) + return; + + /* OK, remember we're processing this XID. */ + subxact_last = xid; + + /* + * Check if the transaction is already present in the array of subxacts. We + * intentionally scan the array from the tail, because we're likely adding + * a change for the most recent subtransactions. + * + * XXX Can we rely on the subxact XIDs arriving in sorted order? That + * would allow us to use binary search here. + */ + for (i = nsubxacts; i > 0; i--) + { + /* found, so we're done */ + if (subxacts[i - 1].xid == xid) + return; + } + + /* This is a new subxact, so we need to add it to the array. */ + if (nsubxacts == 0) + { + MemoryContext oldctx; + + nsubxacts_max = 128; + + /* Allocate this in per-stream context */ + oldctx = MemoryContextSwitchTo(LogicalStreamingContext); + subxacts = palloc(nsubxacts_max * sizeof(SubXactInfo)); + MemoryContextSwitchTo(oldctx); + } + else if (nsubxacts == nsubxacts_max) + { + nsubxacts_max *= 2; + subxacts = repalloc(subxacts, nsubxacts_max * sizeof(SubXactInfo)); + } + + subxacts[nsubxacts].xid = xid; + subxacts[nsubxacts].offset = lseek(stream_fd, 0, SEEK_END); + + nsubxacts++; +} + +/* format filename for file containing the info about subxacts */ +static void +subxact_filename(char *path, Oid subid, TransactionId xid) +{ + char tempdirpath[MAXPGPATH]; + + TempTablespacePath(tempdirpath, DEFAULTTABLESPACE_OID); + + /* + * We might need to create the tablespace's tempfile directory, if no + * one has yet done so. + */ + if ((MakePGDirectory(tempdirpath) < 0) && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + tempdirpath))); + + snprintf(path, MAXPGPATH, "%s/%s%d-%u-%u.subxacts", + tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, subid, xid); +} + +/* format filename for file containing serialized changes */ +static void +changes_filename(char *path, Oid subid, TransactionId xid) +{ + char tempdirpath[MAXPGPATH]; + + TempTablespacePath(tempdirpath, DEFAULTTABLESPACE_OID); + + /* + * We might need to create the tablespace's tempfile directory, if no + * one has yet done so.
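+ * + * The resulting path looks like, e.g., + * base/pgsql_tmp/pgsql_tmp12345-16394-508.changes for (made-up) backend + * PID 12345, subscription OID 16394 and toplevel xid 508, per the + * snprintf format below.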
+ */ + if ((MakePGDirectory(tempdirpath) < 0) && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + tempdirpath))); + + snprintf(path, MAXPGPATH, "%s/%s%d-%u-%u.changes", + tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, subid, xid); +} + +/* + * stream_cleanup_files + * Cleanup files for a subscription / toplevel transaction. + * + * Remove files with serialized changes and subxact info for a particular + * toplevel transaction. Each subscription has a separate set of files. + * + * Note: The files may not exist, so handle ENOENT as a non-error. + * + * missing_ok - don't report an error for a missing file if the flag is + * passed as true. + */ +static void +stream_cleanup_files(Oid subid, TransactionId xid, bool missing_ok) +{ + int i; + char path[MAXPGPATH]; + bool found = false; + + subxact_filename(path, subid, xid); + + if ((unlink(path) < 0) && (errno != ENOENT) && !missing_ok) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + + changes_filename(path, subid, xid); + + if ((unlink(path) < 0) && (errno != ENOENT) && !missing_ok) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + + /* + * Cleanup the XID from the array - find the XID in the array and + * remove it. The array is bound to be fairly small (maximum number + * of in-progress xacts, so max_connections + max_prepared_transactions) + * so simply loop through the array and find the index of the XID. + * + * Notice we also call this from stream_open_file for first segment + * of each transaction, to deal with possible left-overs after a + * crash, so it's entirely possible not to find the XID in the + * array here. In that case we don't remove anything. + * + * XXX Perhaps it'd be better to handle this automatically after a + * restart, instead of doing it over and over for each transaction. + */ + for (i = 0; i < nxids; i++) + { + if (xids[i] == xid) + { + found = true; + break; + } + } + + if (!found) + return; + + /* + * Move the last entry of the array into the freed slot. We don't keep + * the streamed transactions sorted or anything - we only expect + * a few of them in progress (max_connections + max_prepared_xacts) + * so linear search is just fine. + */ + xids[i] = xids[nxids - 1]; + nxids--; +} + +/* + * stream_open_file + * Open file we'll use to serialize changes for a toplevel transaction. + * + * Open a file for streamed changes from a toplevel transaction identified + * by stream_xid (global variable). If it's the first chunk of streamed + * changes for this transaction, perform cleanup by removing existing + * files after a possible previous crash. + * + * This can only be called at the beginning of a "streaming" block, i.e. + * between stream_start/stream_stop messages from the upstream. + */ +static void +stream_open_file(Oid subid, TransactionId xid, bool first_segment) +{ + char path[MAXPGPATH]; + int flags; + + Assert(in_streamed_transaction); + Assert(OidIsValid(subid)); + Assert(TransactionIdIsValid(xid)); + Assert(stream_fd == -1); + + /* + * If this is the first segment for this transaction, try removing + * existing files (if there are any, possibly after a crash).
+ */ + if (first_segment) + { + MemoryContext oldcxt; + + /* XXX make sure there are no previous files for this transaction */ + stream_cleanup_files(subid, xid, true); + + /* Need to allocate this in permanent context */ + oldcxt = MemoryContextSwitchTo(ApplyContext); + + /* + * We need to remember the XIDs we spilled to files, so that we can + * remove them at worker exit (e.g. after DROP SUBSCRIPTION). + * + * The number of XIDs we may need to track is fairly small, because + * we can only stream toplevel xacts (so limited by max_connections + * and max_prepared_transactions), and we only stream the large ones. + * So we simply keep the XIDs in an unsorted array. If the number of + * xacts gets large for some reason (e.g. very high max_connections), + * a more elaborate approach might be better - e.g. sorted array, to + * speed up the lookups. + */ + if (nxids == maxnxids) /* array of XIDs is full */ + { + if (!xids) + { + maxnxids = 64; + xids = palloc(maxnxids * sizeof(TransactionId)); + } + else + { + maxnxids = 2 * maxnxids; + xids = repalloc(xids, maxnxids * sizeof(TransactionId)); + } + } + + xids[nxids++] = xid; + + MemoryContextSwitchTo(oldcxt); + } + + changes_filename(path, subid, xid); + + elog(DEBUG1, "opening file '%s' for streamed changes", path); + + /* + * If this is the first streamed segment, the file must not exist, so + * make sure we're the ones creating it. Otherwise just open the file + * for writing, in append mode. + */ + if (first_segment) + flags = (O_WRONLY | O_CREAT | O_EXCL | PG_BINARY); + else + flags = (O_WRONLY | O_APPEND | PG_BINARY); + + stream_fd = OpenTransientFile(path, flags); + + if (stream_fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); +} + +/* + * stream_close_file + * Close the currently open file with streamed changes. + * + * This can only be called at the end of a streaming block, i.e. at stream_stop + * message from the upstream. + */ +static void +stream_close_file(void) +{ + Assert(in_streamed_transaction); + Assert(TransactionIdIsValid(stream_xid)); + Assert(stream_fd != -1); + + CloseTransientFile(stream_fd); + + stream_xid = InvalidTransactionId; + stream_fd = -1; +} + +/* + * stream_write_change + * Serialize a change to a file for the current toplevel transaction. + * + * The change is serialized in a simple format, with length (not including + * the length), action code (identifying the message type) and message + * contents (without the subxact TransactionId value). + * + * XXX The subxact file includes CRC32C of the contents. Maybe we should + * include something like that here too, but doing so will not be as + * straightforward, because we write the file in chunks.
+ */ +static void +stream_write_change(char action, StringInfo s) +{ + int len; + + Assert(in_streamed_transaction); + Assert(TransactionIdIsValid(stream_xid)); + Assert(stream_fd != -1); + + /* total on-disk size, including the action type character */ + len = (s->len - s->cursor) + sizeof(char); + + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_CHANGES_WRITE); + + /* first write the size */ + if (write(stream_fd, &len, sizeof(len)) != sizeof(len)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not serialize streamed change to file: %m"))); + + /* then the action */ + if (write(stream_fd, &action, sizeof(action)) != sizeof(action)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not serialize streamed change to file: %m"))); + + /* and finally the remaining part of the buffer (after the XID) */ + len = (s->len - s->cursor); + + if (write(stream_fd, &s->data[s->cursor], len) != len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not serialize streamed change to file: %m"))); + + pgstat_report_wait_end(); +} + +/* + * Cleanup the memory for subxacts and reset the related variables. + */ +static inline void +cleanup_subxact_info() +{ + if (subxacts) + pfree(subxacts); + + subxacts = NULL; + subxact_last = InvalidTransactionId; + nsubxacts = 0; + nsubxacts_max = 0; +} + /* Logical Replication Apply worker entry point */ void ApplyWorkerMain(Datum main_arg) @@ -2106,6 +3117,7 @@ ApplyWorkerMain(Datum main_arg) options.slotname = myslotname; options.proto.logical.proto_version = LOGICALREP_PROTO_VERSION_NUM; options.proto.logical.publication_names = MySubscription->publications; + options.proto.logical.streaming = MySubscription->stream; /* Start normal logical streaming replication. */ walrcv_startstreaming(wrconn, &options); diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 15379e3118..1509f9b826 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -46,29 +46,57 @@ static void pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferChange *change); static bool pgoutput_origin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id); +static void pgoutput_stream_abort(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr abort_lsn); + +static void pgoutput_stream_commit(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); + +static void pgoutput_stream_start(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); + +static void pgoutput_stream_stop(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); static bool publications_valid; +static bool in_streaming; static List *LoadPublications(List *pubnames); static void publication_invalidation_cb(Datum arg, int cacheid, uint32 hashvalue); -static void send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx); +static void send_relation_and_attrs(Relation relation, TransactionId xid, + LogicalDecodingContext *ctx); /* * Entry in the map used to remember which relation schemas we sent. * + * The schema_sent flag determines if the current schema record was already + * sent to the subscriber (in which case we don't need to send it again). + * + * The schema cache on downstream is however updated only at commit time, + * and with streamed transactions the commit order may be different from + * the order the transactions are sent in. 
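+ * (For example, with hypothetical XIDs: a schema sent only inside streamed
+ * transaction 501 is not yet present in the subscriber's cache while
+ * concurrently streamed transaction 502 is being applied, if 502 happens
+ * to commit first.)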
Also, the (sub) transactions
+ * might get aborted, so we need to send the schema for each (sub) transaction
+ * so that we don't lose the schema information on abort. To handle this,
+ * we maintain a list of xids (streamed_txns) for which we have already sent
+ * the schema.
+ *
  * For partitions, 'pubactions' considers not only the table's own
  * publications, but also those of all of its ancestors.
  */
 typedef struct RelationSyncEntry
 {
 	Oid			relid;			/* relation oid */
-
+	TransactionId xid;			/* transaction that created the record */
 	/*
 	 * Did we send the schema?  If ancestor relid is set, its schema must also
 	 * have been sent for this to be true.
 	 */
 	bool		schema_sent;
+	List	   *streamed_txns;	/* streamed toplevel transactions with this
+								 * schema */
 
 	bool		replicate_valid;
 	PublicationActions pubactions;
@@ -94,11 +122,17 @@ typedef struct RelationSyncEntry
 static HTAB *RelationSyncCache = NULL;
 
 static void init_rel_sync_cache(MemoryContext decoding_context);
+static void cleanup_rel_sync_cache(TransactionId xid, bool is_commit);
 static RelationSyncEntry *get_rel_sync_entry(PGOutputData *data, Oid relid);
 static void rel_sync_cache_relation_cb(Datum arg, Oid relid);
 static void rel_sync_cache_publication_cb(Datum arg, int cacheid,
 										   uint32 hashvalue);
+static void set_schema_sent_in_streamed_txn(RelationSyncEntry *entry,
+											TransactionId xid);
+static bool get_schema_sent_in_streamed_txn(RelationSyncEntry *entry,
+											TransactionId xid);
+
 /*
  * Specify output plugin callbacks
  */
@@ -114,15 +148,24 @@ _PG_output_plugin_init(OutputPluginCallbacks *cb)
 	cb->commit_cb = pgoutput_commit_txn;
 	cb->filter_by_origin_cb = pgoutput_origin_filter;
 	cb->shutdown_cb = pgoutput_shutdown;
+
+	/* transaction streaming */
+	cb->stream_change_cb = pgoutput_change;
+	cb->stream_truncate_cb = pgoutput_truncate;
+	cb->stream_abort_cb = pgoutput_stream_abort;
+	cb->stream_commit_cb = pgoutput_stream_commit;
+	cb->stream_start_cb = pgoutput_stream_start;
+	cb->stream_stop_cb = pgoutput_stream_stop;
 }
 
 static void
 parse_output_parameters(List *options, uint32 *protocol_version,
-						List **publication_names)
+						List **publication_names, bool *enable_streaming)
 {
 	ListCell   *lc;
 	bool		protocol_version_given = false;
 	bool		publication_names_given = false;
+	bool		streaming_given = false;
 
 	foreach(lc, options)
 	{
@@ -168,6 +211,23 @@ parse_output_parameters(List *options, uint32 *protocol_version,
 						(errcode(ERRCODE_INVALID_NAME),
 						 errmsg("invalid publication_names syntax")));
 		}
+		else if (strcmp(defel->defname, "streaming") == 0)
+		{
+			if (streaming_given)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("conflicting or redundant options")));
+			streaming_given = true;
+
+			/* the value must be on/off */
+			if (strcmp(strVal(defel->arg), "on") && strcmp(strVal(defel->arg), "off"))
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid streaming value")));
+
+			/* enable streaming if it's 'on' */
+			*enable_streaming = (strcmp(strVal(defel->arg), "on") == 0);
+		}
 		else
 			elog(ERROR, "unrecognized pgoutput option: %s", defel->defname);
 	}
@@ -180,6 +240,7 @@ static void
 pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
 				 bool is_init)
 {
+	bool		enable_streaming = false;
 	PGOutputData *data = palloc0(sizeof(PGOutputData));
 
 	/* Create our memory context for private allocations.
*/ @@ -202,7 +263,8 @@ pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, /* Parse the params and ERROR if we see any we don't recognize */ parse_output_parameters(ctx->output_plugin_options, &data->protocol_version, - &data->publication_names); + &data->publication_names, + &enable_streaming); /* Check if we support requested protocol */ if (data->protocol_version > LOGICALREP_PROTO_VERSION_NUM) @@ -222,6 +284,27 @@ pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("publication_names parameter missing"))); + /* + * Decide whether to enable streaming. It is disabled by default, in + * which case we just update the flag in decoding context. Otherwise + * we only allow it with sufficient version of the protocol, and when + * the output plugin supports it. + */ + if (!enable_streaming) + ctx->streaming = false; + else if (data->protocol_version < LOGICALREP_PROTO_STREAM_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("requested proto_version=%d does not support streaming, need %d or higher", + data->protocol_version, LOGICALREP_PROTO_STREAM_VERSION_NUM))); + else if (!ctx->streaming) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("streaming requested, but not supported by output plugin"))); + + /* Also remember we're currently not streaming any transaction. */ + in_streaming = false; + /* Init publication state. */ data->publications = NIL; publications_valid = false; @@ -290,9 +373,41 @@ pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, */ static void maybe_send_schema(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, ReorderBufferChange *change, Relation relation, RelationSyncEntry *relentry) { - if (relentry->schema_sent) + bool schema_sent; + TransactionId xid = InvalidTransactionId; + TransactionId topxid = InvalidTransactionId; + + /* + * Remember XID of the (sub)transaction for the change. We don't care if + * it's top-level transaction or not (we have already sent that XID in + * start the current streaming block). + * + * If we're not in a streaming block, just use InvalidTransactionId and + * the write methods will not include it. + */ + if (in_streaming) + xid = change->txn->xid; + + if (change->txn->toptxn) + topxid = change->txn->toptxn->xid; + else + topxid = xid; + + /* + * Do we need to send the schema? We do track streamed transactions + * separately, because those may not be applied later (and the regular + * transactions won't see their effects until then) and in an order + * that we don't know at this point. + */ + if (in_streaming) + schema_sent = get_schema_sent_in_streamed_txn(relentry, topxid); + else + schema_sent = relentry->schema_sent; + + if (schema_sent) return; /* If needed, send the ancestor's schema first. 
*/ @@ -308,19 +423,25 @@ maybe_send_schema(LogicalDecodingContext *ctx, relentry->map = convert_tuples_by_name(CreateTupleDescCopy(indesc), CreateTupleDescCopy(outdesc)); MemoryContextSwitchTo(oldctx); - send_relation_and_attrs(ancestor, ctx); + send_relation_and_attrs(ancestor, xid, ctx); RelationClose(ancestor); } - send_relation_and_attrs(relation, ctx); - relentry->schema_sent = true; + send_relation_and_attrs(relation, xid, ctx); + relentry->xid = change->txn->xid; + + if (in_streaming) + set_schema_sent_in_streamed_txn(relentry, topxid); + else + relentry->schema_sent = true; } /* * Sends a relation */ static void -send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx) +send_relation_and_attrs(Relation relation, TransactionId xid, + LogicalDecodingContext *ctx) { TupleDesc desc = RelationGetDescr(relation); int i; @@ -344,17 +465,19 @@ send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx) continue; OutputPluginPrepareWrite(ctx, false); - logicalrep_write_typ(ctx->out, att->atttypid); + logicalrep_write_typ(ctx->out, xid, att->atttypid); OutputPluginWrite(ctx, false); } OutputPluginPrepareWrite(ctx, false); - logicalrep_write_rel(ctx->out, relation); + logicalrep_write_rel(ctx->out, xid, relation); OutputPluginWrite(ctx, false); } /* * Sends the decoded DML over wire. + * + * XXX May be called both in streaming and non-streaming modes. */ static void pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, @@ -363,6 +486,10 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; MemoryContext old; RelationSyncEntry *relentry; + TransactionId xid = InvalidTransactionId; + + if (in_streaming) + xid = change->txn->xid; if (!is_publishable_relation(relation)) return; @@ -391,7 +518,7 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, /* Avoid leaking memory by using and resetting our own context */ old = MemoryContextSwitchTo(data->context); - maybe_send_schema(ctx, relation, relentry); + maybe_send_schema(ctx, txn, change, relation, relentry); /* Send the data */ switch (change->action) @@ -411,7 +538,7 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, } OutputPluginPrepareWrite(ctx, true); - logicalrep_write_insert(ctx->out, relation, tuple); + logicalrep_write_insert(ctx->out, xid, relation, tuple); OutputPluginWrite(ctx, true); break; } @@ -435,7 +562,7 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, } OutputPluginPrepareWrite(ctx, true); - logicalrep_write_update(ctx->out, relation, oldtuple, newtuple); + logicalrep_write_update(ctx->out, xid, relation, oldtuple, newtuple); OutputPluginWrite(ctx, true); break; } @@ -455,7 +582,7 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, } OutputPluginPrepareWrite(ctx, true); - logicalrep_write_delete(ctx->out, relation, oldtuple); + logicalrep_write_delete(ctx->out, xid, relation, oldtuple); OutputPluginWrite(ctx, true); } else @@ -480,6 +607,10 @@ pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, int i; int nrelids; Oid *relids; + TransactionId xid = InvalidTransactionId; + + if (in_streaming) + xid = change->txn->xid; old = MemoryContextSwitchTo(data->context); @@ -508,13 +639,14 @@ pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, continue; relids[nrelids++] = relid; - maybe_send_schema(ctx, relation, relentry); + maybe_send_schema(ctx, txn, change, relation, relentry); } if (nrelids > 0) { 
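+		/* Send one truncate message covering all affected published relations. */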
 		OutputPluginPrepareWrite(ctx, true);
 		logicalrep_write_truncate(ctx->out,
+								  xid,
 								  nrelids,
 								  relids,
 								  change->data.truncate.cascade,
@@ -587,6 +719,91 @@ publication_invalidation_cb(Datum arg, int cacheid, uint32 hashvalue)
 	rel_sync_cache_publication_cb(arg, cacheid, hashvalue);
 }
 
+/*
+ * Notify downstream to discard the streamed transaction (along with all
+ * its subtransactions, if it's a toplevel transaction).
+ */
+static void
+pgoutput_stream_abort(struct LogicalDecodingContext *ctx,
+					  ReorderBufferTXN *txn,
+					  XLogRecPtr abort_lsn)
+{
+	ReorderBufferTXN *toptxn;
+
+	/*
+	 * The abort should happen outside the streaming block, even for streamed
+	 * transactions. The transaction has to be marked as streamed, though.
+	 */
+	Assert(!in_streaming);
+
+	/* determine the toplevel transaction */
+	toptxn = (txn->toptxn) ? txn->toptxn : txn;
+
+	Assert(rbtxn_is_streamed(toptxn));
+
+	OutputPluginPrepareWrite(ctx, true);
+	logicalrep_write_stream_abort(ctx->out, toptxn->xid, txn->xid);
+	OutputPluginWrite(ctx, true);
+
+	cleanup_rel_sync_cache(toptxn->xid, false);
+}
+
+/*
+ * Notify downstream to apply the streamed transaction (along with all
+ * its subtransactions).
+ */
+static void
+pgoutput_stream_commit(struct LogicalDecodingContext *ctx,
+					   ReorderBufferTXN *txn,
+					   XLogRecPtr commit_lsn)
+{
+	/*
+	 * The commit should happen outside the streaming block, even for streamed
+	 * transactions. The transaction has to be marked as streamed, though.
+	 */
+	Assert(!in_streaming);
+	Assert(rbtxn_is_streamed(txn));
+
+	OutputPluginUpdateProgress(ctx);
+
+	OutputPluginPrepareWrite(ctx, true);
+	logicalrep_write_stream_commit(ctx->out, txn, commit_lsn);
+	OutputPluginWrite(ctx, true);
+
+	cleanup_rel_sync_cache(txn->xid, true);
+}
+
+static void
+pgoutput_stream_start(struct LogicalDecodingContext *ctx,
+					  ReorderBufferTXN *txn)
+{
+	/* we can't nest streaming of transactions */
+	Assert(!in_streaming);
+
+	OutputPluginPrepareWrite(ctx, true);
+	logicalrep_write_stream_start(ctx->out, txn->xid, !rbtxn_is_streamed(txn));
+	OutputPluginWrite(ctx, true);
+
+	/* we're streaming a chunk of a transaction now */
+	in_streaming = true;
+}
+
+static void
+pgoutput_stream_stop(struct LogicalDecodingContext *ctx,
+					 ReorderBufferTXN *txn)
+{
+	/* we should be streaming a transaction */
+	Assert(in_streaming);
+
+	OutputPluginPrepareWrite(ctx, true);
+	logicalrep_write_stream_stop(ctx->out);
+	OutputPluginWrite(ctx, true);
+
+	/* we've stopped streaming a transaction */
+	in_streaming = false;
+}
+
 /*
  * Initialize the relation schema sync cache for a decoding session.
  *
@@ -623,6 +840,38 @@ init_rel_sync_cache(MemoryContext cachectx)
 									  (Datum) 0);
 }
 
+/*
+ * We expect a relatively small number of streamed transactions.
+ */
+static bool
+get_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid)
+{
+	ListCell   *lc;
+
+	foreach (lc, entry->streamed_txns)
+	{
+		if (xid == lfirst_int(lc))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Add the xid to the rel sync entry, recording that we have already sent the
+ * schema of the relation in this streamed transaction.
+ */
+static void
+set_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid)
+{
+	MemoryContext oldctx;
+
+	oldctx = MemoryContextSwitchTo(CacheMemoryContext);
+
+	entry->streamed_txns = lappend_int(entry->streamed_txns, xid);
+
+	MemoryContextSwitchTo(oldctx);
+}
+
 /*
  * Find or create entry in the relation schema cache.
* @@ -753,11 +1002,44 @@ get_rel_sync_entry(PGOutputData *data, Oid relid) } if (!found) + { entry->schema_sent = false; + entry->streamed_txns = NULL; + } return entry; } +/* + * Cleanup list of streamed transactions and update the schema_sent flag. + * + * When a streamed transaction commits or aborts, we need to remove the + * toplevel XID from the schema cache. If the transaction aborted, the + * subscriber will simply throw away the schema records we streamed, so + * we don't need to do anything else. + * + * If the transaction committed, the subscriber will update the relation + * cache - so tweak the schema_sent flag accordingly. + */ +static void +cleanup_rel_sync_cache(TransactionId xid, bool is_commit) +{ + HASH_SEQ_STATUS hash_seq; + RelationSyncEntry *entry; + + Assert(RelationSyncCache != NULL); + + hash_seq_init(&hash_seq, RelationSyncCache); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + if (is_commit) + entry->schema_sent = true; + + /* Remove the xid from the schema sent list. */ + entry->streamed_txns = list_delete_int(entry->streamed_txns, xid); + } +} + /* * Relcache invalidation callback */ @@ -793,7 +1075,11 @@ rel_sync_cache_relation_cb(Datum arg, Oid relid) * Reset schema sent status as the relation definition may have changed. */ if (entry != NULL) + { entry->schema_sent = false; + list_free(entry->streamed_txns); + entry->streamed_txns = NULL; + } } /* diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 1b929a603e..cbc416a274 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -157,6 +157,12 @@ create_logical_replication_slot(char *name, char *plugin, .segment_close = wal_segment_close), NULL, NULL, NULL); + /* + * Make sure streaming is disabled here - we may have the methods, + * but we don't have anywhere to send the data yet. + */ + ctx->streaming = false; + /* * If caller needs us to determine the decoding start point, do so now. * This might take a while. diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index d0c0674848..ffc3d50081 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1016,6 +1016,12 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) WalSndPrepareWrite, WalSndWriteData, WalSndUpdateProgress); + /* + * Make sure streaming is disabled here - we may have the methods, + * but we don't have anywhere to send the data yet. + */ + ctx->streaming = false; + /* * Signal that we don't need the timeout mechanism. We're just * creating the replication slot and don't yet accept feedback diff --git a/src/include/catalog/pg_subscription.h b/src/include/catalog/pg_subscription.h index 0a756d42d8..617b9094d4 100644 --- a/src/include/catalog/pg_subscription.h +++ b/src/include/catalog/pg_subscription.h @@ -48,6 +48,8 @@ CATALOG(pg_subscription,6100,SubscriptionRelationId) BKI_SHARED_RELATION BKI_ROW bool subenabled; /* True if the subscription is enabled (the * worker should be running) */ + bool substream; /* Stream in-progress transactions. */ + #ifdef CATALOG_VARLEN /* variable-length fields start here */ /* Connection string to the publisher */ text subconninfo BKI_FORCE_NOT_NULL; @@ -73,6 +75,7 @@ typedef struct Subscription char *name; /* Name of the subscription */ Oid owner; /* Oid of the subscription owner */ bool enabled; /* Indicates if the subscription is enabled */ + bool stream; /* Allow streaming in-progress transactions. 
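+									 * (kept in sync with the
+									 * pg_subscription.substream column)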
*/ char *conninfo; /* Connection string to the publisher */ char *slotname; /* Name of the replication slot */ char *synccommit; /* Synchronous commit setting for worker */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index c55dc1481c..899d7e2013 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -980,7 +980,11 @@ typedef enum WAIT_EVENT_WAL_READ, WAIT_EVENT_WAL_SYNC, WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN, - WAIT_EVENT_WAL_WRITE + WAIT_EVENT_WAL_WRITE, + WAIT_EVENT_LOGICAL_CHANGES_READ, + WAIT_EVENT_LOGICAL_CHANGES_WRITE, + WAIT_EVENT_LOGICAL_SUBXACT_READ, + WAIT_EVENT_LOGICAL_SUBXACT_WRITE } WaitEventIO; /* ---------- diff --git a/src/include/replication/logicalproto.h b/src/include/replication/logicalproto.h index 4860561be9..89158ed46f 100644 --- a/src/include/replication/logicalproto.h +++ b/src/include/replication/logicalproto.h @@ -23,9 +23,13 @@ * we can support. LOGICALREP_PROTO_MIN_VERSION_NUM is the oldest version we * have backwards compatibility for. The client requests protocol version at * connect time. + * + * LOGICALREP_PROTO_STREAM_VERSION_NUM is the minimum protocol version with + * support for streaming large transactions. */ #define LOGICALREP_PROTO_MIN_VERSION_NUM 1 -#define LOGICALREP_PROTO_VERSION_NUM 1 +#define LOGICALREP_PROTO_STREAM_VERSION_NUM 2 +#define LOGICALREP_PROTO_VERSION_NUM 2 /* Tuple coming via logical replication. */ typedef struct LogicalRepTupleData @@ -86,25 +90,45 @@ extern void logicalrep_read_commit(StringInfo in, extern void logicalrep_write_origin(StringInfo out, const char *origin, XLogRecPtr origin_lsn); extern char *logicalrep_read_origin(StringInfo in, XLogRecPtr *origin_lsn); -extern void logicalrep_write_insert(StringInfo out, Relation rel, - HeapTuple newtuple); +extern void logicalrep_write_insert(StringInfo out, TransactionId xid, + Relation rel, HeapTuple newtuple); extern LogicalRepRelId logicalrep_read_insert(StringInfo in, LogicalRepTupleData *newtup); -extern void logicalrep_write_update(StringInfo out, Relation rel, HeapTuple oldtuple, +extern void logicalrep_write_update(StringInfo out, TransactionId xid, + Relation rel, HeapTuple oldtuple, HeapTuple newtuple); extern LogicalRepRelId logicalrep_read_update(StringInfo in, bool *has_oldtuple, LogicalRepTupleData *oldtup, LogicalRepTupleData *newtup); -extern void logicalrep_write_delete(StringInfo out, Relation rel, - HeapTuple oldtuple); +extern void logicalrep_write_delete(StringInfo out, TransactionId xid, + Relation rel, HeapTuple oldtuple); extern LogicalRepRelId logicalrep_read_delete(StringInfo in, LogicalRepTupleData *oldtup); -extern void logicalrep_write_truncate(StringInfo out, int nrelids, Oid relids[], +extern void logicalrep_write_truncate(StringInfo out, TransactionId xid, + int nrelids, Oid relids[], bool cascade, bool restart_seqs); extern List *logicalrep_read_truncate(StringInfo in, bool *cascade, bool *restart_seqs); -extern void logicalrep_write_rel(StringInfo out, Relation rel); +extern void logicalrep_write_rel(StringInfo out, TransactionId xid, Relation rel); extern LogicalRepRelation *logicalrep_read_rel(StringInfo in); -extern void logicalrep_write_typ(StringInfo out, Oid typoid); +extern void logicalrep_write_typ(StringInfo out, TransactionId xid, Oid typoid); extern void logicalrep_read_typ(StringInfo out, LogicalRepTyp *ltyp); +extern void logicalrep_write_stream_start(StringInfo out, TransactionId xid, + bool first_segment); +extern TransactionId logicalrep_read_stream_start(StringInfo in, + bool *first_segment); + +extern void 
logicalrep_write_stream_stop(StringInfo out); +extern TransactionId logicalrep_read_stream_stop(StringInfo in); + +extern void logicalrep_write_stream_commit(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +extern TransactionId logicalrep_read_stream_commit(StringInfo out, + LogicalRepCommitData *commit_data); + +extern void logicalrep_write_stream_abort(StringInfo out, TransactionId xid, + TransactionId subxid); +extern void logicalrep_read_stream_abort(StringInfo in, TransactionId *xid, + TransactionId *subxid); + #endif /* LOGICAL_PROTO_H */ diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index ac1acbb27e..95132062c6 100644 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -177,6 +177,7 @@ typedef struct { uint32 proto_version; /* Logical protocol version */ List *publication_names; /* String list of publications */ + bool streaming; /* Streaming of large transactions */ } logical; } proto; } WalRcvStreamOptions; diff --git a/src/test/subscription/t/009_stream_simple.pl b/src/test/subscription/t/009_stream_simple.pl new file mode 100644 index 0000000000..2f01133f69 --- /dev/null +++ b/src/test/subscription/t/009_stream_simple.pl @@ -0,0 +1,86 @@ +# Test streaming of simple large transaction +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 3; + +sub wait_for_caught_up +{ + my ($node, $appname) = @_; + + $node->poll_query_until('postgres', +"SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$appname';" + ) or die "Timed out while waiting for subscriber to catch up"; +} + +# Create publisher node +my $node_publisher = get_new_node('publisher'); +$node_publisher->init(allows_streaming => 'logical'); +$node_publisher->append_conf('postgresql.conf', 'logical_decoding_work_mem = 64kB'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = get_new_node('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE test_tab (a int primary key, b varchar)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO test_tab VALUES (1, 'foo'), (2, 'bar')"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', "CREATE TABLE test_tab (a int primary key, b text, c timestamptz DEFAULT now(), d bigint DEFAULT 999)"); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres'; +$node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE test_tab"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +); + +wait_for_caught_up($node_publisher, $appname); + +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c), count(d = 999) FROM test_tab"); +is($result, qq(2|2|2), 'check initial data was copied to subscriber'); + +# Insert, update and delete enough rows to exceed the 64kB limit. 
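+# (For reference: rows 3..5000 are inserted on top of the two pre-existing
+# rows, giving 5000 in total; the DELETE then removes the 1666 rows where a
+# is a multiple of 3, which is where the expected count of 3334 below comes
+# from.)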
+$node_publisher->safe_psql('postgres', q{ +BEGIN; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(3, 5000) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +COMMIT; +}); + +wait_for_caught_up($node_publisher, $appname); + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c), count(d = 999) FROM test_tab"); +is($result, qq(3334|3334|3334), 'check extra columns contain local defaults'); + +# Change the local values of the extra columns on the subscriber, +# update publisher, and check that subscriber retains the expected +# values +$node_subscriber->safe_psql('postgres', "UPDATE test_tab SET c = 'epoch'::timestamptz + 987654321 * interval '1s'"); +$node_publisher->safe_psql('postgres', "UPDATE test_tab SET b = md5(a::text)"); + +wait_for_caught_up($node_publisher, $appname); + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(extract(epoch from c) = 987654321), count(d = 999) FROM test_tab"); +is($result, qq(3334|3334|3334), 'check extra columns contain locally changed data'); + +$node_subscriber->stop; +$node_publisher->stop; diff --git a/src/test/subscription/t/010_stream_subxact.pl b/src/test/subscription/t/010_stream_subxact.pl new file mode 100644 index 0000000000..d2ae38592b --- /dev/null +++ b/src/test/subscription/t/010_stream_subxact.pl @@ -0,0 +1,102 @@ +# Test streaming of large transaction containing large subtransactions +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 3; + +sub wait_for_caught_up +{ + my ($node, $appname) = @_; + + $node->poll_query_until('postgres', +"SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$appname';" + ) or die "Timed out while waiting for subscriber to catch up"; +} + +# Create publisher node +my $node_publisher = get_new_node('publisher'); +$node_publisher->init(allows_streaming => 'logical'); +$node_publisher->append_conf('postgresql.conf', 'logical_decoding_work_mem = 64kB'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = get_new_node('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE test_tab (a int primary key, b varchar)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO test_tab VALUES (1, 'foo'), (2, 'bar')"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', "CREATE TABLE test_tab (a int primary key, b text, c timestamptz DEFAULT now(), d bigint DEFAULT 999)"); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . 
' dbname=postgres'; +$node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE test_tab"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +); + +wait_for_caught_up($node_publisher, $appname); + +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c), count(d = 999) FROM test_tab"); +is($result, qq(2|2|2), 'check initial data was copied to subscriber'); + +# Insert, update and delete enough rowsto exceed 64kB limit. +$node_publisher->safe_psql('postgres', q{ +BEGIN; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series( 3, 500) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +SAVEPOINT s1; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(501, 1000) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +SAVEPOINT s2; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(1001, 1500) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +SAVEPOINT s3; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(1501, 2000) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +SAVEPOINT s4; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(2001, 2500) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +COMMIT; +}); + +wait_for_caught_up($node_publisher, $appname); + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c), count(d = 999) FROM test_tab"); +is($result, qq(1667|1667|1667), 'check extra columns contain local defaults'); + +# Change the local values of the extra columns on the subscriber, +# update publisher, and check that subscriber retains the expected +# values +$node_subscriber->safe_psql('postgres', "UPDATE test_tab SET c = 'epoch'::timestamptz + 987654321 * interval '1s'"); +$node_publisher->safe_psql('postgres', "UPDATE test_tab SET b = md5(a::text)"); + +wait_for_caught_up($node_publisher, $appname); + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(extract(epoch from c) = 987654321), count(d = 999) FROM test_tab"); +is($result, qq(1667|1667|1667), 'check extra columns contain locally changed data'); + +$node_subscriber->stop; +$node_publisher->stop; diff --git a/src/test/subscription/t/011_stream_ddl.pl b/src/test/subscription/t/011_stream_ddl.pl new file mode 100644 index 0000000000..0da39a1a8a --- /dev/null +++ b/src/test/subscription/t/011_stream_ddl.pl @@ -0,0 +1,95 @@ +# Test streaming of large transaction with DDL and subtransactions +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 2; + +sub wait_for_caught_up +{ + my ($node, $appname) = @_; + + $node->poll_query_until('postgres', +"SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$appname';" + ) or die "Timed out while waiting for subscriber to catch up"; +} + +# Create publisher node +my $node_publisher = get_new_node('publisher'); 
+$node_publisher->init(allows_streaming => 'logical'); +$node_publisher->append_conf('postgresql.conf', 'logical_decoding_work_mem = 64kB'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = get_new_node('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE test_tab (a int primary key, b varchar)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO test_tab VALUES (1, 'foo'), (2, 'bar')"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', "CREATE TABLE test_tab (a int primary key, b text, c INT, d INT, e INT)"); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres'; +$node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE test_tab"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +); + +wait_for_caught_up($node_publisher, $appname); + +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c), count(d = 999) FROM test_tab"); +is($result, qq(2|0|0), 'check initial data was copied to subscriber'); + +# a small (non-streamed) transaction with DDL and DML +$node_publisher->safe_psql('postgres', q{ +BEGIN; +INSERT INTO test_tab VALUES (3, md5(3::text)); +ALTER TABLE test_tab ADD COLUMN c INT; +SAVEPOINT s1; +INSERT INTO test_tab VALUES (4, md5(4::text), -4); +COMMIT; +}); + +# large (streamed) transaction with DDL and DML +$node_publisher->safe_psql('postgres', q{ +BEGIN; +INSERT INTO test_tab SELECT i, md5(i::text), -i FROM generate_series(5, 1000) s(i); +ALTER TABLE test_tab ADD COLUMN d INT; +SAVEPOINT s1; +INSERT INTO test_tab SELECT i, md5(i::text), -i, 2*i FROM generate_series(1001, 2000) s(i); +COMMIT; +}); + +# a small (non-streamed) transaction with DDL and DML +$node_publisher->safe_psql('postgres', q{ +BEGIN; +INSERT INTO test_tab VALUES (2001, md5(2001::text), -2001, 2*2001); +ALTER TABLE test_tab ADD COLUMN e INT; +SAVEPOINT s1; +INSERT INTO test_tab VALUES (2002, md5(2002::text), -2002, 2*2002, -3*2002); +COMMIT; +}); + +wait_for_caught_up($node_publisher, $appname); + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c), count(d), count(e) FROM test_tab"); +is($result, qq(2002|1999|1002|1), 'check extra columns contain local defaults'); + +$node_subscriber->stop; +$node_publisher->stop; diff --git a/src/test/subscription/t/012_stream_subxact_abort.pl b/src/test/subscription/t/012_stream_subxact_abort.pl new file mode 100644 index 0000000000..402df30f59 --- /dev/null +++ b/src/test/subscription/t/012_stream_subxact_abort.pl @@ -0,0 +1,82 @@ +# Test streaming of large transaction containing multiple subtransactions and rollbacks +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 2; + +sub wait_for_caught_up +{ + my ($node, $appname) = @_; + + $node->poll_query_until('postgres', +"SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$appname';" + ) or die "Timed out while waiting 
for subscriber to catch up"; +} + +# Create publisher node +my $node_publisher = get_new_node('publisher'); +$node_publisher->init(allows_streaming => 'logical'); +$node_publisher->append_conf('postgresql.conf', 'logical_decoding_work_mem = 64kB'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = get_new_node('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE test_tab (a int primary key, b varchar)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO test_tab VALUES (1, 'foo'), (2, 'bar')"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', "CREATE TABLE test_tab (a int primary key, b text, c INT, d INT, e INT)"); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres'; +$node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE test_tab"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +); + +wait_for_caught_up($node_publisher, $appname); + +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c) FROM test_tab"); +is($result, qq(2|0), 'check initial data was copied to subscriber'); + +# large (streamed) transaction with DDL, DML and ROLLBACKs +$node_publisher->safe_psql('postgres', q{ +BEGIN; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(3,500) s(i); +SAVEPOINT s1; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(501,1000) s(i); +SAVEPOINT s2; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(1001,1500) s(i); +SAVEPOINT s3; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(1501,2000) s(i); +ROLLBACK TO s2; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(2001,2500) s(i); +ROLLBACK TO s1; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(2501,3000) s(i); +COMMIT; +}); + +wait_for_caught_up($node_publisher, $appname); + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c) FROM test_tab"); +is($result, qq(1000|0), 'check extra columns contain local defaults'); + +$node_subscriber->stop; +$node_publisher->stop; diff --git a/src/test/subscription/t/013_stream_subxact_ddl_abort.pl b/src/test/subscription/t/013_stream_subxact_ddl_abort.pl new file mode 100644 index 0000000000..becbdd0578 --- /dev/null +++ b/src/test/subscription/t/013_stream_subxact_ddl_abort.pl @@ -0,0 +1,84 @@ +# Test behavior with streaming transaction exceeding logical_decoding_work_mem +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 2; + +sub wait_for_caught_up +{ + my ($node, $appname) = @_; + + $node->poll_query_until('postgres', +"SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$appname';" + ) or die "Timed out while waiting for subscriber to catch up"; +} + +# Create publisher node +my $node_publisher = get_new_node('publisher'); +$node_publisher->init(allows_streaming => 'logical'); 
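+# A small logical_decoding_work_mem (set below) makes the transaction in this
+# test exceed the decoding memory budget, exercising the streaming machinery.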
+$node_publisher->append_conf('postgresql.conf', 'logical_decoding_work_mem = 64kB'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = get_new_node('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE test_tab (a int primary key, b varchar)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO test_tab VALUES (1, 'foo'), (2, 'bar')"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', "CREATE TABLE test_tab (a int primary key, b text, c INT, d INT, e INT)"); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres'; +$node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE test_tab"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub" +); + +wait_for_caught_up($node_publisher, $appname); + +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c) FROM test_tab"); +is($result, qq(2|0), 'check initial data was copied to subscriber'); + +# large (streamed) transaction with DDL, DML and ROLLBACKs +$node_publisher->safe_psql('postgres', q{ +BEGIN; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(3,500) s(i); +ALTER TABLE test_tab ADD COLUMN c INT; +SAVEPOINT s1; +INSERT INTO test_tab SELECT i, md5(i::text), -i FROM generate_series(501,1000) s(i); +ALTER TABLE test_tab ADD COLUMN d INT; +SAVEPOINT s2; +INSERT INTO test_tab SELECT i, md5(i::text), -i, 2*i FROM generate_series(1001,1500) s(i); +ALTER TABLE test_tab ADD COLUMN e INT; +SAVEPOINT s3; +INSERT INTO test_tab SELECT i, md5(i::text), -i, 2*i, -3*i FROM generate_series(1501,2000) s(i); +ALTER TABLE test_tab DROP COLUMN c; +ROLLBACK TO s1; +INSERT INTO test_tab SELECT i, md5(i::text), i FROM generate_series(501,1000) s(i); +COMMIT; +}); + +wait_for_caught_up($node_publisher, $appname); + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c) FROM test_tab"); +is($result, qq(1000|500), 'check extra columns contain local defaults'); + +$node_subscriber->stop; +$node_publisher->stop; -- 2.23.0 v27/v27-0010-Add-TAP-test-for-streaming-vs.-DDL.patch000644 000765 000024 00000010620 13670411611 022761 0ustar00dilipkumarstaff000000 000000 From 7059fa0607f91afd84ff0452ce437769503e964d Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 26 Sep 2019 19:15:35 +0200 Subject: [PATCH v27 10/14] Add TAP test for streaming vs. DDL --- .../subscription/t/014_stream_through_ddl.pl | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 src/test/subscription/t/014_stream_through_ddl.pl diff --git a/src/test/subscription/t/014_stream_through_ddl.pl b/src/test/subscription/t/014_stream_through_ddl.pl new file mode 100644 index 0000000000..b8d78b1972 --- /dev/null +++ b/src/test/subscription/t/014_stream_through_ddl.pl @@ -0,0 +1,98 @@ +# Test streaming of large transaction with DDL, subtransactions and rollbacks. 
+use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 2; + +sub wait_for_caught_up +{ + my ($node, $appname) = @_; + + $node->poll_query_until('postgres', +"SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$appname';" + ) or die "Timed out while waiting for subscriber to catch up"; +} + +# Create publisher node +my $node_publisher = get_new_node('publisher'); +$node_publisher->init(allows_streaming => 'logical'); +$node_publisher->append_conf('postgresql.conf', 'logical_decoding_work_mem = 64kB'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = get_new_node('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE test_tab (a int primary key, b varchar)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO test_tab VALUES (1, 'foo'), (2, 'bar')"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', "CREATE TABLE test_tab (a int primary key, b text, c INT, d text, e INT)"); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres'; +$node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub FOR TABLE test_tab"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', +"CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (streaming=true)" +); + +wait_for_caught_up($node_publisher, $appname); + +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), count(c), count(d) FROM test_tab"); +is($result, qq(2|0|0), 'check initial data was copied to subscriber'); + + +# large (streamed) transaction with DDL and DML +$node_publisher->safe_psql('postgres', q{ +BEGIN; +SAVEPOINT s1; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(3, 1000) s(i); +SAVEPOINT s2; +ALTER TABLE test_tab ADD COLUMN c INT; +INSERT INTO test_tab SELECT i, md5(i::text), i FROM generate_series(1001, 2000) s(i); +SAVEPOINT s3; +ALTER TABLE test_tab ADD COLUMN d text; +SAVEPOINT s4; +SAVEPOINT s5; +INSERT INTO test_tab SELECT i, md5(i::text), i, md5(i::text) FROM generate_series(2001, 3000) s(i); +ALTER TABLE test_tab ADD COLUMN e INT; +INSERT INTO test_tab SELECT i, md5(i::text), i, md5(i::text), i FROM generate_series(3001, 4000) s(i); +SAVEPOINT s10; +ALTER TABLE test_tab DROP d; +INSERT INTO test_tab SELECT i, md5(i::text), i, i FROM generate_series(4001, 5000) s(i); +ALTER TABLE test_tab ADD COLUMN d text; +ROLLBACK TO SAVEPOINT s10; +RELEASE SAVEPOINT s10; +SAVEPOINT s10; +INSERT INTO test_tab SELECT i, md5(i::text), i, md5(i::text), i FROM generate_series(5001, 6000) s(i); +SAVEPOINT s6; +ALTER TABLE test_tab DROP d; +INSERT INTO test_tab SELECT i, md5(i::text), i, i FROM generate_series(6001, 7000) s(i); +SAVEPOINT s7; +ALTER TABLE test_tab ADD COLUMN d text; +INSERT INTO test_tab (a, b, c, d, e) SELECT i, md5(i::text), i, md5(i::text), i FROM generate_series(7001, 8000) s(i); +COMMIT; +}); + +wait_for_caught_up($node_publisher, $appname); + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*), 
count(a), count(b), count(c), count(d), count(e) FROM test_tab");
+is($result, qq(7000|7000|7000|6000|4000|4000), 'check extra columns contain local defaults');
+
+$node_subscriber->stop;
+$node_publisher->stop;
-- 
2.23.0

v27/v27-0007-Track-statistics-for-streaming.patch000644 000765 000024 00000027713 13670411611 023042 0ustar00dilipkumarstaff000000 000000 From 6e61f1ab42ce80ce3260332e9248371117f7720b Mon Sep 17 00:00:00 2001
From: Dilip Kumar
Date: Thu, 11 Jun 2020 15:26:18 +0530
Subject: [PATCH v27 07/14] Track statistics for streaming

---
 doc/src/sgml/monitoring.sgml                  | 33 +++++++++++++++++++
 src/backend/catalog/system_views.sql          |  5 ++-
 .../replication/logical/reorderbuffer.c      | 12 +++++++
 src/backend/replication/walsender.c           | 32 +++++++++++++++---
 src/include/catalog/pg_proc.dat               |  6 ++--
 src/include/replication/reorderbuffer.h       | 13 +++++---
 src/include/replication/walsender_private.h   |  5 +++
 src/test/regress/expected/rules.out           |  7 ++--
 8 files changed, 98 insertions(+), 15 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 89662cc0a3..45208ad8a1 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -2494,6 +2494,39 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       Amount of decoded transaction data spilled to disk.
+
+
+
+      stream_txns bigint
+
+
+      Number of in-progress transactions streamed to the subscriber after
+      the memory used by logical decoding exceeds logical_decoding_work_mem.
+      Streaming only works with toplevel transactions (subtransactions can't
+      be streamed independently), so the counter is not incremented for
+      subtransactions.
+
+
+
+
+      stream_count bigint
+
+
+      Number of times in-progress transactions were streamed to the
+      subscriber. Transactions may get streamed repeatedly, and this counter
+      gets incremented on every such invocation.
+
+
+
+
+      stream_bytes bigint
+
+
+      Amount of decoded in-progress transaction data streamed to the
+      subscriber.
+
+
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 56420bbc9d..9f509fbc21 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -788,7 +788,10 @@ CREATE VIEW pg_stat_replication AS
             W.reply_time,
             W.spill_txns,
             W.spill_count,
-            W.spill_bytes
+            W.spill_bytes,
+            W.stream_txns,
+            W.stream_count,
+            W.stream_bytes
     FROM pg_stat_get_activity(NULL) AS S
         JOIN pg_stat_get_wal_senders() AS W ON (S.pid = W.pid)
         LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid);
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 36958fe2ee..d76598e105 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -348,6 +348,10 @@ ReorderBufferAllocate(void)
 	buffer->spillTxns = 0;
 	buffer->spillBytes = 0;
 
+	buffer->streamCount = 0;
+	buffer->streamTxns = 0;
+	buffer->streamBytes = 0;
+
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
 	dlist_init(&buffer->toplevel_by_lsn);
@@ -3562,6 +3566,14 @@ ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 		ReorderBufferFreeSnap(rb, txn->snapshot_now);
 	}
 
+	/* Update the stream statistics. */
+	rb->streamCount += 1;
+	rb->streamBytes += (rbtxn_has_incomplete_tuple(txn)) ?
+		txn->complete_size : txn->total_size;
+
+	/* Don't count an already-streamed transaction again. */
+	rb->streamTxns += (rbtxn_is_streamed(txn)) ? 0 : 1;
+
 	/*
 	 * Access the main routine to decode the changes and send to output plugin.
*/ diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index e2477c47e0..d0c0674848 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1349,7 +1349,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, * LogicalDecodingContext 'update_progress' callback. * * Write the current position to the lag tracker (see XLogSendPhysical), - * and update the spill statistics. + * and update the spill/stream statistics. */ static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid) @@ -1370,7 +1370,8 @@ WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId sendTime = now; /* - * Update statistics about transactions that spilled to disk. + * Update statistics about transactions that spilled to disk or streamed to + * subscriber (before being committed). */ UpdateSpillStats(ctx); } @@ -2421,6 +2422,9 @@ InitWalSenderSlot(void) walsnd->spillTxns = 0; walsnd->spillCount = 0; walsnd->spillBytes = 0; + walsnd->streamTxns = 0; + walsnd->streamCount = 0; + walsnd->streamBytes = 0; SpinLockRelease(&walsnd->mutex); /* don't need the lock anymore */ MyWalSnd = (WalSnd *) walsnd; @@ -3256,7 +3260,7 @@ offset_to_interval(TimeOffset offset) Datum pg_stat_get_wal_senders(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_WAL_SENDERS_COLS 15 +#define PG_STAT_GET_WAL_SENDERS_COLS 18 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; Tuplestorestate *tupstore; @@ -3314,6 +3318,9 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) int64 spillCount; int64 spillBytes; bool is_sync_standby; + int64 streamTxns; + int64 streamCount; + int64 streamBytes; Datum values[PG_STAT_GET_WAL_SENDERS_COLS]; bool nulls[PG_STAT_GET_WAL_SENDERS_COLS]; int j; @@ -3339,6 +3346,9 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) spillTxns = walsnd->spillTxns; spillCount = walsnd->spillCount; spillBytes = walsnd->spillBytes; + streamTxns = walsnd->streamTxns; + streamCount = walsnd->streamCount; + streamBytes = walsnd->streamBytes; SpinLockRelease(&walsnd->mutex); /* @@ -3441,6 +3451,11 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) values[12] = Int64GetDatum(spillTxns); values[13] = Int64GetDatum(spillCount); values[14] = Int64GetDatum(spillBytes); + + /* stream over-sized transactions */ + values[15] = Int64GetDatum(streamTxns); + values[16] = Int64GetDatum(streamCount); + values[17] = Int64GetDatum(streamBytes); } tuplestore_putvalues(tupstore, tupdesc, values, nulls); @@ -3683,11 +3698,18 @@ UpdateSpillStats(LogicalDecodingContext *ctx) { ReorderBuffer *rb = ctx->reorder; - elog(DEBUG2, "UpdateSpillStats: updating stats %p %lld %lld %lld", + MyWalSnd->streamTxns = rb->streamTxns; + MyWalSnd->streamCount = rb->streamCount; + MyWalSnd->streamBytes = rb->streamBytes; + + elog(DEBUG2, "UpdateSpillStats: updating stats %p %lld %lld %lld %lld %lld %lld", rb, (long long) rb->spillTxns, (long long) rb->spillCount, - (long long) rb->spillBytes); + (long long) rb->spillBytes, + (long long) rb->streamTxns, + (long long) rb->streamCount, + (long long) rb->streamBytes); SpinLockAcquire(&MyWalSnd->mutex); MyWalSnd->spillTxns = rb->spillTxns; diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 61f2c2f5b4..7869f721da 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5237,9 +5237,9 @@ proname => 'pg_stat_get_wal_senders', prorows => '10', proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 
'r', prorettype => 'record', proargtypes => '',
-  proallargtypes => '{int4,text,pg_lsn,pg_lsn,pg_lsn,pg_lsn,interval,interval,interval,int4,text,timestamptz,int8,int8,int8}',
-  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames => '{pid,state,sent_lsn,write_lsn,flush_lsn,replay_lsn,write_lag,flush_lag,replay_lag,sync_priority,sync_state,reply_time,spill_txns,spill_count,spill_bytes}',
+  proallargtypes => '{int4,text,pg_lsn,pg_lsn,pg_lsn,pg_lsn,interval,interval,interval,int4,text,timestamptz,int8,int8,int8,int8,int8,int8}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{pid,state,sent_lsn,write_lsn,flush_lsn,replay_lsn,write_lag,flush_lag,replay_lag,sync_priority,sync_state,reply_time,spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes}',
   prosrc => 'pg_stat_get_wal_senders' },
 { oid => '3317', descr => 'statistics: information about WAL receiver',
   proname => 'pg_stat_get_wal_receiver', proisstrict => 'f', provolatile => 's',
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 2d86209f61..399f3e49f2 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -549,15 +549,20 @@ struct ReorderBuffer
 	Size		size;
 
 	/*
-	 * Statistics about transactions spilled to disk.
+	 * Statistics about transactions streamed or spilled to disk.
 	 *
-	 * A single transaction may be spilled repeatedly, which is why we keep
-	 * two different counters. For spilling, the transaction counter includes
-	 * both toplevel transactions and subtransactions.
+	 * A single transaction may be streamed/spilled repeatedly, which is
+	 * why we keep two different counters. For spilling, the transaction
+	 * counter includes both toplevel transactions and subtransactions.
+	 * For streaming, it only includes toplevel transactions (we never
+	 * stream individual subtransactions).
	 */
 	int64		spillCount;		/* spill-to-disk invocation counter */
 	int64		spillTxns;		/* number of transactions spilled to disk */
 	int64		spillBytes;		/* amount of data spilled to disk */
+	int64		streamCount;	/* streaming invocation counter */
+	int64		streamTxns;		/* number of transactions streamed to subscriber */
+	int64		streamBytes;	/* amount of data streamed to subscriber */
 };
 
diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h
index 734acec2a4..b997d1710e 100644
--- a/src/include/replication/walsender_private.h
+++ b/src/include/replication/walsender_private.h
@@ -83,6 +83,11 @@ typedef struct WalSnd
 	int64		spillTxns;
 	int64		spillCount;
 	int64		spillBytes;
+
+	/* Statistics for in-progress transactions streamed to subscriber.
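+	 * Like the spill_* counters above, these accumulate over the lifetime
+	 * of the walsender and are exposed through pg_stat_get_wal_senders().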
	 */
+	int64		streamTxns;
+	int64		streamCount;
+	int64		streamBytes;
 } WalSnd;
 
 extern WalSnd *MyWalSnd;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index b813e32215..cf22f8a038 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2005,9 +2005,12 @@ pg_stat_replication| SELECT s.pid,
     w.reply_time,
     w.spill_txns,
     w.spill_count,
-    w.spill_bytes
+    w.spill_bytes,
+    w.stream_txns,
+    w.stream_count,
+    w.stream_bytes
    FROM ((pg_stat_get_activity(NULL::integer) s(datid, pid, usesysid, application_name, state, query, wait_event_type, wait_event, xact_start, query_start, backend_start, state_change, client_addr, client_hostname, client_port, backend_xid, backend_xmin, backend_type, ssl, sslversion, sslcipher, sslbits, sslcompression, ssl_client_dn, ssl_client_serial, ssl_issuer_dn, gss_auth, gss_princ, gss_enc, leader_pid)
-     JOIN pg_stat_get_wal_senders() w(pid, state, sent_lsn, write_lsn, flush_lsn, replay_lsn, write_lag, flush_lag, replay_lag, sync_priority, sync_state, reply_time, spill_txns, spill_count, spill_bytes) ON ((s.pid = w.pid)))
+     JOIN pg_stat_get_wal_senders() w(pid, state, sent_lsn, write_lsn, flush_lsn, replay_lsn, write_lag, flush_lag, replay_lag, sync_priority, sync_state, reply_time, spill_txns, spill_count, spill_bytes, stream_txns, stream_count, stream_bytes) ON ((s.pid = w.pid)))
      LEFT JOIN pg_authid u ON ((s.usesysid = u.oid)));
 pg_stat_slru| SELECT s.name,
     s.blks_zeroed,
-- 
2.23.0

v27/v27-0002-Issue-individual-invalidations-with-wal_level-lo.patch000644 000765 000024 00000041664 13670577036 026456 0ustar00dilipkumarstaff000000 000000 
From 9164264fa41e9fa93a27ebaa71d7d90bcd5885a3 Mon Sep 17 00:00:00 2001
From: Amit Kapila
Date: Sat, 6 Jun 2020 09:54:21 +0530
Subject: [PATCH v27 02/14] Issue individual invalidations with
 wal_level=logical.

When wal_level=logical, write individual invalidations into WAL so that
decoding can use this information.

We still add the invalidations to the cache, and write them to WAL at
commit time in RecordTransactionCommit(). This uses the existing
XLOG_INVALIDATIONS xlog record type, from the RM_STANDBY_ID resource
manager (see LogStandbyInvalidations for details). So existing code
relying on those invalidations (e.g. redo) does not need to be changed.

The individual invalidations are written using a new xlog record type
XLOG_XACT_INVALIDATIONS, from the RM_XACT_ID resource manager. See
LogLogicalInvalidations for details. These new xlog records are ignored
by existing redo procedures, which still rely on the invalidations
written to commit records.

The invalidations are decoded and added as a new ReorderBufferChange
type (REORDER_BUFFER_CHANGE_INVALIDATION), and then executed during
replay, unlike the existing invalidations (which are either decoded as
part of the commit record, or executed immediately during decoding and
not added to the reorderbuffer at all).

LogStandbyInvalidations accumulated all the invalidations in memory and
wrote them out only once, at commit time, which may reduce the
performance impact by amortizing the overhead and deduplicating the
invalidations.
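For quick reference, the two pieces involved (both taken verbatim from
the access/xact.h and inval.c hunks below) are a WAL record that is just
a count plus an array of sinval messages, and a hook at command end that
emits it only when logical decoding is enabled:

    typedef struct xl_xact_invalidations
    {
    	int			nmsgs;		/* number of shared inval msgs */
    	SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER];
    } xl_xact_invalidations;

    /* in CommandEndInvalidationMessages(): */
    if (XLogLogicalInfoActive())
    	LogLogicalInvalidations();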
Author: Dilip Kumar, Tomas Vondra, Amit Kapila Reviewed-by: Amit Kapila Tested-by: Neha Sharma and Mahendra Singh Thalor Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com --- src/backend/access/rmgrdesc/xactdesc.c | 40 +++++++ src/backend/access/transam/xact.c | 7 ++ src/backend/replication/logical/decode.c | 16 +++ .../replication/logical/reorderbuffer.c | 104 +++++++++++++++--- src/backend/utils/cache/inval.c | 49 +++++++++ src/include/access/xact.h | 13 ++- src/include/replication/reorderbuffer.h | 11 ++ 7 files changed, 226 insertions(+), 14 deletions(-) diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 9fce75565f..7ab0d11ea9 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -20,6 +20,9 @@ #include "storage/standbydefs.h" #include "utils/timestamp.h" +static void xact_desc_invalidations(StringInfo buf, + int nmsgs, SharedInvalidationMessage *msgs); + /* * Parse the WAL format of an xact commit and abort records into an easier to * understand format. @@ -396,6 +399,12 @@ xact_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "xtop %u: ", xlrec->xtop); xact_desc_assignment(buf, xlrec); } + else if (info == XLOG_XACT_INVALIDATIONS) + { + xl_xact_invalidations *xlrec = (xl_xact_invalidations *) rec; + + xact_desc_invalidations(buf, xlrec->nmsgs, xlrec->msgs); + } } const char * @@ -423,7 +432,38 @@ xact_identify(uint8 info) case XLOG_XACT_ASSIGNMENT: id = "ASSIGNMENT"; break; + case XLOG_XACT_INVALIDATIONS: + id = "INVALIDATION"; + break; } return id; } + +static void +xact_desc_invalidations(StringInfo buf, + int nmsgs, SharedInvalidationMessage *msgs) +{ + int i; + + appendStringInfoString(buf, "; inval msgs:"); + for (i = 0; i < nmsgs; i++) + { + SharedInvalidationMessage *msg = &msgs[i]; + + if (msg->id >= 0) + appendStringInfo(buf, " catcache %d", msg->id); + else if (msg->id == SHAREDINVALCATALOG_ID) + appendStringInfo(buf, " catalog %u", msg->cat.catId); + else if (msg->id == SHAREDINVALRELCACHE_ID) + appendStringInfo(buf, " relcache %u", msg->rc.relId); + /* not expected, but print something anyway */ + else if (msg->id == SHAREDINVALSMGR_ID) + appendStringInfoString(buf, " smgr"); + /* not expected, but print something anyway */ + else if (msg->id == SHAREDINVALRELMAP_ID) + appendStringInfo(buf, " relmap db %u", msg->rm.dbId); + else + appendStringInfo(buf, " unrecognized id %d", msg->id); + } +} diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 04fd5ca870..72efa3c1b3 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6020,6 +6020,13 @@ xact_redo(XLogReaderState *record) ProcArrayApplyXidAssignment(xlrec->xtop, xlrec->nsubxacts, xlrec->xsub); } + else if (info == XLOG_XACT_INVALIDATIONS) + { + /* + * XXX we do ignore this for now, what matters are invalidations + * written into the commit record. + */ + } else elog(PANIC, "xact_redo: unknown op code %u", info); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 0c0c371739..a1d87450ce 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -282,6 +282,22 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) * See LogicalDecodingProcessRecord. 
*/ break; + case XLOG_XACT_INVALIDATIONS: + { + TransactionId xid; + xl_xact_invalidations *invals; + + xid = XLogRecGetXid(r); + invals = (xl_xact_invalidations *) XLogRecGetData(r); + + Assert(TransactionIdIsValid(xid)); + ReorderBufferAddInvalidation(reorder, xid, buf->origptr, + invals->nmsgs, invals->msgs); + + + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr); + } + break; case XLOG_XACT_PREPARE: /* diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 642a1c767f..cd406ca4d2 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -220,7 +220,7 @@ static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state); static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, ReorderBufferIterTXNState *state); -static void ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs); /* * --------------------------------------- @@ -455,6 +455,11 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change) pfree(change->data.msg.message); change->data.msg.message = NULL; break; + case REORDER_BUFFER_CHANGE_INVALIDATION: + if (change->data.inval.invalidations) + pfree(change->data.inval.invalidations); + change->data.inval.invalidations = NULL; + break; case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: if (change->data.snapshot) { @@ -1814,17 +1819,24 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, TeardownHistoricSnapshot(false); SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); - - /* - * Every time the CommandId is incremented, we could - * see new catalog contents, so execute all - * invalidations. - */ - ReorderBufferExecuteInvalidations(rb, txn); } break; + case REORDER_BUFFER_CHANGE_INVALIDATION: + + /* + * Execute the invalidation messages locally. + * + * XXX Do we need to care about relcacheInitFileInval and + * the other fields added to ReorderBufferChange, or just + * about the message itself? + */ + ReorderBufferExecuteInvalidations( + change->data.inval.ninvalidations, + change->data.inval.invalidations); + break; + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: elog(ERROR, "tuplecid value in changequeue"); break; @@ -1866,7 +1878,8 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, AbortCurrentTransaction(); /* make sure there's no cache pollution */ - ReorderBufferExecuteInvalidations(rb, txn); + ReorderBufferExecuteInvalidations(txn->ninvalidations, + txn->invalidations); if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); @@ -1892,7 +1905,8 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, AbortCurrentTransaction(); /* make sure there's no cache pollution */ - ReorderBufferExecuteInvalidations(rb, txn); + ReorderBufferExecuteInvalidations(txn->ninvalidations, + txn->invalidations); if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); @@ -2202,6 +2216,33 @@ ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, txn->ntuplecids++; } +/* + * Setup the invalidation of the toplevel transaction. 
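+ *
+ * Unlike ReorderBufferAddInvalidations() below, which collects the messages
+ * on the ReorderBufferTXN itself for execution at commit or abort time,
+ * this queues them as a REORDER_BUFFER_CHANGE_INVALIDATION change, so that
+ * they are executed at the right point in the change stream during replay.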
+ */ +void +ReorderBufferAddInvalidation(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, int nmsgs, + SharedInvalidationMessage *msgs) +{ + MemoryContext oldcontext; + ReorderBufferChange *change; + + oldcontext = MemoryContextSwitchTo(rb->context); + + change = ReorderBufferGetChange(rb); + change->action = REORDER_BUFFER_CHANGE_INVALIDATION; + change->data.inval.ninvalidations = nmsgs; + change->data.inval.invalidations = (SharedInvalidationMessage *) + MemoryContextAlloc(rb->context, + sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(change->data.inval.invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + + ReorderBufferQueueChange(rb, xid, lsn, change); + + MemoryContextSwitchTo(oldcontext); +} + /* * Setup the invalidation of the toplevel transaction. * @@ -2234,12 +2275,12 @@ ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, * in the changestream but we don't know which those are. */ static void -ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn) +ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs) { int i; - for (i = 0; i < txn->ninvalidations; i++) - LocalExecuteInvalidationMessage(&txn->invalidations[i]); + for (i = 0; i < nmsgs; i++) + LocalExecuteInvalidationMessage(&msgs[i]); } /* @@ -2593,6 +2634,24 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, change->data.msg.message_size); data += change->data.msg.message_size; + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + char *data; + Size inval_size = sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + + sz += inval_size; + + ReorderBufferSerializeReserve(rb, sz); + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + memcpy(data, change->data.inval.invalidations, inval_size); + data += inval_size; + break; } case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: @@ -2740,6 +2799,11 @@ ReorderBufferChangeSize(ReorderBufferChange *change) break; } + case REORDER_BUFFER_CHANGE_INVALIDATION: + sz += sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: { Snapshot snap; @@ -3006,6 +3070,20 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, change->data.msg.message_size); data += change->data.msg.message_size; + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + Size inval_size = sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + + change->data.inval.invalidations = + MemoryContextAlloc(rb->context, inval_size); + + /* read the message */ + memcpy(change->data.inval.invalidations, data, inval_size); + data += inval_size; + break; } case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 591dd33be6..cba5b6c64b 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -85,6 +85,12 @@ * worth trying to avoid sending such inval traffic in the future, if those * problems can be overcome cheaply. * + * When wal_level=logical, write invalidations into WAL at each command end to + * support the decoding of the in-progress transaction. As of now it was + * enough to log invalidation only at commit because we are only decoding the + * transaction at the commit time. We only need to log the catalog cache and + * relcache invalidation. 
There can not be any active MVCC scan in logical + * decoding so we don't need to log the snapshot invalidation. * * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -104,6 +110,7 @@ #include "catalog/pg_constraint.h" #include "miscadmin.h" #include "storage/sinval.h" +#include "storage/standby.h" #include "storage/smgr.h" #include "utils/catcache.h" #include "utils/inval.h" @@ -210,6 +217,8 @@ static struct RELCACHECALLBACK static int relcache_callback_count = 0; +static void LogLogicalInvalidations(void); + /* ---------------------------------------------------------------- * Invalidation list support functions * @@ -1092,6 +1101,9 @@ CommandEndInvalidationMessages(void) if (transInvalInfo == NULL) return; + if (XLogLogicalInfoActive()) + LogLogicalInvalidations(); + ProcessInvalidationMessages(&transInvalInfo->CurrentCmdInvalidMsgs, LocalExecuteInvalidationMessage); AppendInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs, @@ -1501,3 +1513,40 @@ CallSyscacheCallbacks(int cacheid, uint32 hashvalue) i = ccitem->link - 1; } } + +/* + * Emit WAL for invalidations. + */ +static void +LogLogicalInvalidations() +{ + xl_xact_invalidations xlrec; + SharedInvalidationMessage *invalMessages; + int nmsgs = 0; + + if (transInvalInfo->CurrentCmdInvalidMsgs.cclist) + { + ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs, + MakeSharedInvalidMessagesArray); + invalMessages = SharedInvalidMessagesArray; + nmsgs = numSharedInvalidMessagesArray; + SharedInvalidMessagesArray = NULL; + numSharedInvalidMessagesArray = 0; + } + + if (nmsgs > 0) + { + /* prepare record */ + memset(&xlrec, 0, MinSizeOfXactInvalidations); + xlrec.nmsgs = nmsgs; + + /* perform insertion */ + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactInvalidations); + XLogRegisterData((char *) invalMessages, + nmsgs * sizeof(SharedInvalidationMessage)); + XLogInsert(RM_XACT_ID, XLOG_XACT_INVALIDATIONS); + + pfree(invalMessages); + } +} diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 22bb96ca2a..3f3e137531 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -146,7 +146,7 @@ typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid, #define XLOG_XACT_COMMIT_PREPARED 0x30 #define XLOG_XACT_ABORT_PREPARED 0x40 #define XLOG_XACT_ASSIGNMENT 0x50 -/* free opcode 0x60 */ +#define XLOG_XACT_INVALIDATIONS 0x60 /* free opcode 0x70 */ /* mask for filtering opcodes out of xl_info */ @@ -197,6 +197,17 @@ typedef struct xl_xact_assignment #define MinSizeOfXactAssignment offsetof(xl_xact_assignment, xsub) +/* + * Invalidations logged with wal_level=logical. + */ +typedef struct xl_xact_invalidations +{ + int nmsgs; /* number of shared inval msgs */ + SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]; +} xl_xact_invalidations; + +#define MinSizeOfXactInvalidations offsetof(xl_xact_invalidations, msgs) + /* * Commit and abort records can contain a lot of information. But a large * portion of the records won't need all possible pieces of information. 
So we diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index 626ecf4dc9..af35287896 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -57,6 +57,7 @@ enum ReorderBufferChangeType REORDER_BUFFER_CHANGE_UPDATE, REORDER_BUFFER_CHANGE_DELETE, REORDER_BUFFER_CHANGE_MESSAGE, + REORDER_BUFFER_CHANGE_INVALIDATION, REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT, REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID, REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID, @@ -149,6 +150,14 @@ typedef struct ReorderBufferChange CommandId cmax; CommandId combocid; } tuplecid; + + /* Invalidation. */ + struct + { + uint32 ninvalidations; /* Number of messages */ + SharedInvalidationMessage *invalidations; /* invalidation + * message */ + } inval; } data; /* @@ -459,6 +468,8 @@ void ReorderBufferAddNewCommandId(ReorderBuffer *, TransactionId, XLogRecPtr ls void ReorderBufferAddNewTupleCids(ReorderBuffer *, TransactionId, XLogRecPtr lsn, RelFileNode node, ItemPointerData pt, CommandId cmin, CommandId cmax, CommandId combocid); +void ReorderBufferAddInvalidation(ReorderBuffer *, TransactionId, XLogRecPtr lsn, + int nmsgs, SharedInvalidationMessage *msgs); void ReorderBufferAddInvalidations(ReorderBuffer *, TransactionId, XLogRecPtr lsn, Size nmsgs, SharedInvalidationMessage *msgs); void ReorderBufferImmediateInvalidation(ReorderBuffer *, uint32 ninvalidations, -- 2.23.0 v27/v27-0005-Implement-streaming-mode-in-ReorderBuffer.patch000644 000765 000024 00000113643 13670411611 025030 0ustar00dilipkumarstaff000000 000000 From 97d2a59e201f9cd10c339ff3d72f64531f18d5c4 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Fri, 1 May 2020 19:56:35 +0530 Subject: [PATCH v27 05/14] Implement streaming mode in ReorderBuffer Instead of serializing the transaction to disk after reaching the logical_decoding_work_mem limit in memory, we consume the changes we have in memory and invoke new stream API methods. This happens in ReorderBufferStreamTXN() using about the same logic as in ReorderBufferCommit() logic. However, sometime if we have incomplete toast or speculative insert we spill to the disk because we can not generate the complete tuple and stream. And, as soon as we get the complete tuple we stream the transaction including the serialized changes. We can do this incremental processing thanks to having assignments (associating subxact with toplevel xacts) in WAL right away, and thanks to logging the invalidation messages. It also adds ReorderBufferTXN pointer to two places: * ReorderBufferChange, so that we know which xact it belongs to * ReorderBufferTXN, pointing to toplevel xact (from subxact) The output plugin can use this to decide which changes to discard in case of stream_abort_cb (e.g. when a subxact gets discarded). --- src/backend/access/heap/heapam_visibility.c | 38 +- .../replication/logical/reorderbuffer.c | 764 ++++++++++++++++-- src/include/replication/reorderbuffer.h | 31 + 3 files changed, 757 insertions(+), 76 deletions(-) diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index dba10890aa..160b167adb 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -1571,8 +1571,23 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, htup, buffer, &cmin, &cmax); + /* + * If we haven't resolved the combocid to cmin/cmax, that means + * we have not decoded the combocid yet. 
That means the cmin is + * definitely in the future, and we're not supposed to see the + * tuple yet. + * + * XXX This only applies to decoding of in-progress transactions. + * In regular logical decoding we only execute this code at commit + * time, at which point we should have seen all relevant combocids. + * So we should error out in this case. + * + * XXX For the streaming case, we can track the largest combocid + * assigned, and error out based on this (when unable to resolve + * combocid below that observed maximum value). + */ if (!resolved) - elog(ERROR, "could not resolve cmin/cmax of catalog tuple"); + return false; Assert(cmin != InvalidCommandId); @@ -1642,10 +1657,23 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, htup, buffer, &cmin, &cmax); - if (!resolved) - elog(ERROR, "could not resolve combocid to cmax"); - - Assert(cmax != InvalidCommandId); + /* + * If we haven't resolved the combocid to cmin/cmax, that means + * we have not decoded the combocid yet. That means the cmax is + * definitely in the future, and we're still supposed to see the + * tuple. + * + * XXX This only applies to decoding of in-progress transactions. + * In regular logical decoding we only execute this code at commit + * time, at which point we should have seen all relevant combocids. + * So we should error out in this case. + * + * XXX For the streaming case, we can track the largest combocid + * assigned, and error out based on this (when unable to resolve + * combocid below that observed maximum value). + */ + if (!resolved || cmax == InvalidCommandId) + return true; if (cmax >= snapshot->curcid) return true; /* deleted after scan started */ diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index cd406ca4d2..47dc31298d 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -100,6 +100,7 @@ #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */ #include "storage/bufmgr.h" #include "storage/fd.h" +#include "storage/procarray.h" #include "storage/sinval.h" #include "utils/builtins.h" #include "utils/combocid.h" @@ -236,6 +237,7 @@ static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, char *change); static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); static void ReorderBufferCleanupSerializedTXNs(const char *slotname); static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, XLogSegNo segno); @@ -244,6 +246,15 @@ static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap); static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, ReorderBufferTXN *txn, CommandId cid); +/* + * --------------------------------------- + * Streaming support functions + * --------------------------------------- + */ +static inline bool ReorderBufferCanStream(ReorderBuffer *rb); +static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn); + /* --------------------------------------- * toast reassembly support * --------------------------------------- @@ -371,6 +382,9 @@ ReorderBufferGetTXN(ReorderBuffer *rb) dlist_init(&txn->tuplecids); dlist_init(&txn->subtxns); + /* InvalidCommandId 
is not zero, so set it explicitly */ + txn->command_id = InvalidCommandId; + return txn; } @@ -772,6 +786,38 @@ AssertTXNLsnOrder(ReorderBuffer *rb) #endif } +/* + * AssertChangeLsnOrder + * + * Check ordering of changes in the toplevel transaction. + */ +static void +AssertChangeLsnOrder(ReorderBufferTXN *txn) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_lsn = txn->first_lsn; + + dlist_foreach(iter, &txn->changes) + { + ReorderBufferChange *cur_change; + + cur_change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(cur_change->lsn != InvalidXLogRecPtr); + Assert(txn->first_lsn <= cur_change->lsn); + + if (txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_change->lsn <= txn->end_lsn); + + Assert(prev_lsn <= cur_change->lsn); + + prev_lsn = cur_change->lsn; + } +#endif +} + /* * ReorderBufferGetOldestTXN * Return oldest transaction in reorderbuffer @@ -865,6 +911,9 @@ ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid, subtxn->toplevel_xid = xid; Assert(subtxn->nsubtxns == 0); + /* set the reference to toplevel transaction */ + subtxn->toptxn = txn; + /* add to subtransaction list */ dlist_push_tail(&txn->subtxns, &subtxn->node); txn->nsubtxns++; @@ -1024,6 +1073,9 @@ ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, *iter_state = NULL; + /* Check ordering of changes in the toplevel transaction. */ + AssertChangeLsnOrder(txn); + /* * Calculate the size of our heap: one element for every transaction that * contains changes. (Besides the transactions already in the reorder @@ -1038,6 +1090,9 @@ ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + /* Check ordering of changes in this subtransaction. */ + AssertChangeLsnOrder(cur_txn); + if (cur_txn->nentries > 0) nr_txns++; } @@ -1315,6 +1370,15 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) dlist_delete(&txn->base_snapshot_node); } + /* + * Cleanup the snapshot for the last streamed run. + */ + if (txn->snapshot_now != NULL) + { + Assert(rbtxn_is_streamed(txn)); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + } + /* * Remove TXN from its containing list. * @@ -1340,6 +1404,80 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) ReorderBufferReturnTXN(rb, txn); } +/* + * Discard changes from a transaction (and subtransactions), after streaming + * them. Keep the remaining info - transactions, tuplecids and snapshots. + */ +static void +ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(rbtxn_is_known_subxact(subtxn)); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferTruncateTXN(rb, subtxn); + } + + /* cleanup changes in the toplevel txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* remove the change from it's containing list */ + dlist_delete(&change->node); + + ReorderBufferReturnChange(rb, change); + } + + /* + * Mark the transaction as streamed. 
+ * + * The toplevel transaction, identified by (toptxn==NULL), is marked + * as streamed always, even if it does not contain any changes (that + * is, when all the changes are in subtransactions). + * + * For subtransactions, we only mark them as streamed when there are + * changes in them. + * + * We do it this way because of aborts - we don't want to send aborts + * for XIDs the downstream is not aware of. And of course, it always + * knows about the toplevel xact (we send the XID in all messages), + * but we never stream XIDs of empty subxacts. + */ + if ((!txn->toptxn) || (txn->nentries_mem != 0)) + txn->txn_flags |= RBTXN_IS_STREAMED; + + /* + * Destroy the (relfilenode, ctid) hashtable, so that we don't leak + * any memory. We could also keep the hash table and update it with + * new ctid values, but this seems simpler and good enough for now. + */ + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + /* also reset the number of entries in the transaction */ + txn->nentries_mem = 0; + txn->nentries = 0; +} + /* * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by * HeapTupleSatisfiesHistoricMVCC. @@ -1491,57 +1629,171 @@ ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap) } /* - * Perform the replay of a transaction and its non-aborted subtransactions. - * - * Subtransactions previously have to be processed by - * ReorderBufferCommitChild(), even if previously assigned to the toplevel - * transaction with ReorderBufferAssignChild. - * - * We currently can only decode a transaction's contents when its commit - * record is read because that's the only place where we know about cache - * invalidations. Thus, once a toplevel commit is read, we iterate over the top - * and subtransactions (using a k-way merge) and replay the changes in lsn - * order. + * If the transaction was (partially) streamed, we need to commit it in a + * 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_commit message. */ -void -ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, - XLogRecPtr commit_lsn, XLogRecPtr end_lsn, - TimestampTz commit_time, - RepOriginId origin_id, XLogRecPtr origin_lsn) +static void +ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn) { - ReorderBufferTXN *txn; - volatile Snapshot snapshot_now; - volatile CommandId command_id = FirstCommandId; - bool using_subtxn; - ReorderBufferIterTXNState *volatile iterstate = NULL; + /* we should only call this for previously streamed transactions */ + Assert(rbtxn_is_streamed(txn)); - txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, - false); + ReorderBufferStreamTXN(rb, txn); - /* unknown transaction, nothing to replay */ - if (txn == NULL) - return; + rb->stream_commit(rb, txn, txn->final_lsn); - txn->final_lsn = commit_lsn; - txn->end_lsn = end_lsn; - txn->commit_time = commit_time; - txn->origin_id = origin_id; - txn->origin_lsn = origin_lsn; + ReorderBufferCleanupTXN(rb, txn); +} +/* + * Set xid for concurrent abort check + * + * While streaming an in-progress transaction there is a possibility that the + * (sub)transaction might get aborted concurrently. In such case if the + * (sub)transaction has catalog update then we might decode the tuple using + * wrong catalog version. So for detecting the concurrent abort we set + * CheckXidAlive to the current (sub)transaction's xid for which this change + * belongs to. 
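+ * (CheckXidAlive and the bsysscan flag it works with are declared in
+ * snapmgr.c.)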
And, during catalog scan we can check the status of the xid and + * if it is aborted we will report a specific error so that we can stop + * streaming current transaction and discard the already streamed changes on + * such an error. We might have already streamed some of the changes for the + * aborted (sub)transaction, but that is fine because when we decode the abort + * we will stream abort message to truncate the changes in the subscriber. + */ +static inline void +SetupCheckXidLive(TransactionId xid) +{ /* - * If this transaction has no snapshot, it didn't make any changes to the - * database, so there's nothing to decode. Note that - * ReorderBufferCommitChild will have transferred any snapshots from - * subtransactions if there were any. + * If the input transaction id is already set as a CheckXidAlive then + * nothing to do. */ - if (txn->base_snapshot == NULL) - { - Assert(txn->ninvalidations == 0); - ReorderBufferCleanupTXN(rb, txn); + if (TransactionIdEquals(CheckXidAlive, xid)) return; + + /* + * setup CheckXidAlive if it's not committed yet. We don't check if the xid + * aborted. That will happen during catalog access. Also, reset the + * bsysscan flag. + */ + if (!TransactionIdDidCommit(xid)) + { + CheckXidAlive = xid; + bsysscan = false; } + else + CheckXidAlive = InvalidTransactionId; +} - snapshot_now = txn->base_snapshot; +/* + * Helper function for ReorderBufferProcessTXN for applying change. + */ +static inline void +ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change, + bool streaming) +{ + if (streaming) + rb->stream_change(rb, txn, relation, change); + else + rb->apply_change(rb, txn, relation, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the truncate + */ +static inline void +ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn, + int nrelations, Relation *relations, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_truncate(rb, txn, nrelations, relations, change); + else + rb->apply_truncate(rb, txn, nrelations, relations, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the message + */ +static inline void +ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); + else + rb->message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); +} + +/* + * Function to store the command id and snapshot at the end of the current + * stream so that we can reuse the same while sending the next stream. + */ +static inline void +ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, CommandId command_id) +{ + txn->command_id = command_id; + + /* Avoid copying if it's already copied. */ + if (snapshot_now->copied) + txn->snapshot_now = snapshot_now; + else + txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); +} + +/* + * Helper function for ReorderBufferProcessTXN to handle the concurrent + * abort of the streaming transaction. 
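+ *
+ * Discard the changes streamed so far, emit stream_stop, and remember the
+ * snapshot and command ID so that the next streaming run of this
+ * transaction can resume where this one stopped.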
+ */ +static void +ReorderBufferHandleConcurrentAbort(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, + CommandId command_id, + XLogRecPtr last_lsn, + ReorderBufferChange *specinsert) +{ + /* Discard the changes that we just streamed. */ + ReorderBufferTruncateTXN(rb, txn); + + /* Stop the stream. */ + rb->stream_stop(rb, txn, last_lsn); + + /* Remember the command ID and snapshot for the streaming run. */ + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + + ReorderBufferToastReset(rb, txn); + if (specinsert != NULL) + ReorderBufferReturnChange(rb, specinsert); +} + +/* + * Helper function for ReorderBufferCommit and ReorderBufferStreamTXN + * + * Send data of a transaction (and its subtransactions) to the + * output plugin. If streaming is true then data will be sent using stream API. + */ +static void +ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn, + volatile Snapshot snapshot_now, + volatile CommandId command_id, + bool streaming) +{ + bool using_subtxn; + MemoryContext ccxt = CurrentMemoryContext; + ReorderBufferIterTXNState *volatile iterstate = NULL; + volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr; + ReorderBufferChange *volatile specinsert = NULL; + volatile bool stream_started = false; /* build data to be able to lookup the CommandIds of catalog tuples */ ReorderBufferBuildTupleCidHash(rb, txn); @@ -1564,21 +1816,44 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, PG_TRY(); { ReorderBufferChange *change; - ReorderBufferChange *specinsert = NULL; if (using_subtxn) - BeginInternalSubTransaction("replay"); + BeginInternalSubTransaction(streaming? "stream" : "replay"); else StartTransactionCommand(); - rb->begin(rb, txn); - ReorderBufferIterTXNInit(rb, txn, &iterstate); while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL) { Relation relation = NULL; Oid reloid; + /* + * Start stream or begin transaction for the first change in the + * current stream. + */ + if (prev_lsn == InvalidXLogRecPtr) + { + if (streaming) + rb->stream_start(rb, txn, change->lsn); + else + rb->begin(rb, txn); + stream_started = true; + } + + /* + * Enforce correct ordering of changes, merged from multiple + * subtransactions. The changes may have the same LSN due to + * MULTI_INSERT xlog records. + */ + Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn); + + prev_lsn = change->lsn; + + /* Set the xid for concurrent abort check. */ + if (streaming) + SetupCheckXidLive(change->txn->xid); + switch (change->action) { case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: @@ -1655,7 +1930,8 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (!IsToastRelation(relation)) { ReorderBufferToastReplace(rb, txn, relation, change); - rb->apply_change(rb, txn, relation, change); + ReorderBufferApplyChange(rb, txn, relation, change, + streaming); /* * Only clear reassembled toast chunks if we're sure @@ -1695,7 +1971,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, specinsert = NULL; } - if (relation != NULL) + if (RelationIsValid(relation)) { RelationClose(relation); relation = NULL; @@ -1753,7 +2029,10 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, relations[nrelations++] = relation; } - rb->apply_truncate(rb, txn, nrelations, relations, change); + /* Apply the truncate. 
*/ + ReorderBufferApplyTruncate(rb, txn, nrelations, + relations, change, + streaming); for (i = 0; i < nrelations; i++) RelationClose(relations[i]); @@ -1762,10 +2041,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, } case REORDER_BUFFER_CHANGE_MESSAGE: - rb->message(rb, txn, change->lsn, true, - change->data.msg.prefix, - change->data.msg.message_size, - change->data.msg.message); + ReorderBufferApplyMessage(rb, txn, change, streaming); break; case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: @@ -1796,7 +2072,6 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, snapshot_now = change->data.snapshot; } - /* and continue with the new one */ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); break; @@ -1858,14 +2133,34 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, ReorderBufferIterTXNFinish(rb, iterstate); iterstate = NULL; - /* call commit callback */ - rb->commit(rb, txn, commit_lsn); + /* + * Done with current changes, call stream_stop callback for streaming + * transaction, commit callback otherwise. If we have sent + * start/begin. + */ + if (stream_started) + { + if (streaming) + rb->stream_stop(rb, txn, prev_lsn); + else + rb->commit(rb, txn, commit_lsn); + stream_started = false; + } /* this is just a sanity check against bad output plugin behaviour */ if (GetCurrentTransactionIdIfAny() != InvalidTransactionId) elog(ERROR, "output plugin used XID %u", GetCurrentTransactionId()); + /* + * Remember the command ID and snapshot if transaction is streaming + * otherwise free the snapshot if we have copied it. + */ + if (streaming) + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + else if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + /* cleanup */ TeardownHistoricSnapshot(false); @@ -1884,14 +2179,27 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); - if (snapshot_now->copied) - ReorderBufferFreeSnap(rb, snapshot_now); + /* + * If we are streaming the in-progress transaction then discard the + * changes that we just streamed, and mark the transactions as streamed + * (if they contained changes). Otherwise, remove all the changes and + * deallocate the ReorderBufferTXN. + */ + if (streaming) + { + ReorderBufferTruncateTXN(rb, txn); - /* remove potential on-disk data, and deallocate */ - ReorderBufferCleanupTXN(rb, txn); + /* Reset the CheckXidAlive */ + CheckXidAlive = InvalidTransactionId; + } + else + ReorderBufferCleanupTXN(rb, txn); } PG_CATCH(); { + MemoryContext ecxt = MemoryContextSwitchTo(ccxt); + ErrorData *errdata = CopyErrorData(); + /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */ if (iterstate) ReorderBufferIterTXNFinish(rb, iterstate); @@ -1911,17 +2219,122 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); - if (snapshot_now->copied) - ReorderBufferFreeSnap(rb, snapshot_now); + /* Reset the CheckXidAlive */ + if (streaming) + CheckXidAlive = InvalidTransactionId; - /* remove potential on-disk data, and deallocate */ - ReorderBufferCleanupTXN(rb, txn); + /* + * If the error code is ERRCODE_TRANSACTION_ROLLBACK, that means we + * have detected a concurrent abort of the (sub)transaction we are + * streaming. So just do the cleanup and return gracefully. + * Otherwise, Re-throw the error. 
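+	 * (In the re-throw case we first switch back to the error memory
+	 * context that was saved when entering the PG_CATCH block.)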
+	 */
+	if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK)
+	{
+		/*
+		 * We can get this error only in streaming mode, because only in
+		 * streaming mode do we send in-progress transactions.
+		 */
+		Assert(streaming);

-	PG_RE_THROW();
+		/*
+		 * In the TRY block we only stop the stream after we have sent all
+		 * the changes. So if we have detected a concurrent abort, the
+		 * stream cannot have been stopped yet.
+		 */
+		Assert(stream_started);
+
+		/* Cleanup the temporary error state. */
+		FlushErrorState();
+		FreeErrorData(errdata);
+		errdata = NULL;
+
+		/* Handle the concurrent abort. */
+		ReorderBufferHandleConcurrentAbort(rb, txn, snapshot_now,
+										   command_id, prev_lsn,
+										   specinsert);
+	}
+	else
+	{
+		ReorderBufferCleanupTXN(rb, txn);
+		MemoryContextSwitchTo(ecxt);
+		PG_RE_THROW();
+	}
 }
 PG_END_TRY();
}

+/*
+ * Perform the replay of a transaction and its non-aborted subtransactions.
+ *
+ * Subtransactions previously have to be processed by
+ * ReorderBufferCommitChild(), even if previously assigned to the toplevel
+ * transaction with ReorderBufferAssignChild.
+ *
+ * This interface is called once a toplevel commit is read for both streamed
+ * as well as non-streamed transactions. We iterate over the top and
+ * subtransactions (using a k-way merge) and replay the changes in lsn
+ * order.
+ */
+void
+ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
+					XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
+					TimestampTz commit_time,
+					RepOriginId origin_id, XLogRecPtr origin_lsn)
+{
+	ReorderBufferTXN *txn;
+	Snapshot	snapshot_now;
+	CommandId	command_id = FirstCommandId;
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+								false);
+
+	/* unknown transaction, nothing to replay */
+	if (txn == NULL)
+		return;
+
+	txn->final_lsn = commit_lsn;
+	txn->end_lsn = end_lsn;
+	txn->commit_time = commit_time;
+	txn->origin_id = origin_id;
+	txn->origin_lsn = origin_lsn;
+
+	/*
+	 * If the transaction was (partially) streamed, we need to commit it in
+	 * a 'streamed' way. That is, we first stream the remaining part of the
+	 * transaction, and then invoke the stream_commit message.
+	 *
+	 * XXX Called after everything (origin ID and LSN, ...) is stored in the
+	 * transaction, so we don't pass that directly.
+	 */
+	if (rbtxn_is_streamed(txn))
+	{
+		ReorderBufferStreamCommit(rb, txn);
+		return;
+	}
+
+	/*
+	 * If this transaction has no snapshot, it didn't make any changes to the
+	 * database, so there's nothing to decode. Note that
+	 * ReorderBufferCommitChild will have transferred any snapshots from
+	 * subtransactions if there were any.
+	 */
+	if (txn->base_snapshot == NULL)
+	{
+		Assert(txn->ninvalidations == 0);
+		ReorderBufferCleanupTXN(rb, txn);
+		return;
+	}
+
+	snapshot_now = txn->base_snapshot;
+
+	/*
+	 * Call the main routine to decode the changes and send them to the
+	 * output plugin.
+	 */
+	ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
+							command_id, false);
+}

 /*
  * Abort a transaction that possibly has previous changes. Needs to be first
  * called for subtransactions and then for the toplevel xid.
@@ -1946,6 +2359,13 @@ ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
 	if (txn == NULL)
 		return;

+	/*
+	 * If the (sub)transaction was streamed, notify the remote node about the
+	 * abort.
+	 */
+	if (rbtxn_is_streamed(txn))
+		rb->stream_abort(rb, txn, lsn);
+
 	/* cosmetic...
*/ txn->final_lsn = lsn; @@ -2015,6 +2435,13 @@ ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) if (txn == NULL) return; + /* + * If the (sub)transaction was streamed, notify the remote node about the + * abort. + */ + if (rbtxn_is_streamed(txn)) + rb->stream_abort(rb, txn, lsn); + /* cosmetic... */ txn->final_lsn = lsn; @@ -2150,8 +2577,17 @@ ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, } /* - * Update the memory accounting info. We track memory used by the whole - * reorder buffer and the transaction containing the change. + * Update memory counters to account for the new or removed change. + * + * We update two counters - in the reorder buffer, and in the transaction + * containing the change. The reorder buffer counter allows us to quickly + * decide if we reached the memory limit, the transaction counter allows + * us to quickly pick the largest transaction for eviction. + * + * When streaming is enabled, we need to update the toplevel transaction + * counters instead - we don't really care about subtransactions as we + * can't stream them individually anyway, and we only pick toplevel + * transactions for eviction. So only toplevel transactions matter. */ static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, @@ -2159,6 +2595,7 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, bool addition) { Size sz; + ReorderBufferTXN *txn; Assert(change->txn); @@ -2170,19 +2607,28 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID) return; + txn = change->txn; + + /* if subxact, and streaming supported, use the toplevel instead */ + if (txn->toptxn && ReorderBufferCanStream(rb)) + txn = txn->toptxn; + sz = ReorderBufferChangeSize(change); if (addition) { - change->txn->size += sz; + txn->size += sz; rb->size += sz; } else { - Assert((rb->size >= sz) && (change->txn->size >= sz)); - change->txn->size -= sz; + Assert((rb->size >= sz) && (txn->size >= sz)); + txn->size -= sz; rb->size -= sz; } + + Assert(txn->size <= rb->size); + Assert((txn->size >= 0) && (rb->size >= 0)); } /* @@ -2211,6 +2657,7 @@ ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, change->lsn = lsn; change->txn = txn; change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID; + change->txn = txn; dlist_push_tail(&txn->tuplecids, &change->node); txn->ntuplecids++; @@ -2295,6 +2742,16 @@ ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES; + + /* + * Mark toplevel transaction as having catalog changes too if one of its + * children has so that the ReorderBufferBuildTupleCidHash can conveniently + * check just toplevel transaction and decide whethe we need to build the + * hash table or not. In non-streaming mode we mark the toplevel + * transaction in DecodeCommit as we only stream on commit. + */ + if (txn->toptxn != NULL) + txn->toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES; } /* @@ -2398,6 +2855,38 @@ ReorderBufferLargestTXN(ReorderBuffer *rb) return largest; } +/* + * Find the largest toplevel transaction to evict (by streaming). + * + * This can be seen as an optimized version of ReorderBufferLargestTXN, which + * should give us the same transaction (because we don't update memory account + * for subtransaction with streaming, so it's always 0). But we can simply + * iterate over the limited number of toplevel transactions. 
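+ *
+ * (Subtransaction sizes are charged to the toplevel transaction in
+ * ReorderBufferChangeMemoryUpdate when streaming is supported, which is why
+ * comparing toplevel sizes alone is sufficient here.)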
+ */ +static ReorderBufferTXN * +ReorderBufferLargestTopTXN(ReorderBuffer *rb) +{ + dlist_iter iter; + ReorderBufferTXN *largest = NULL; + + dlist_foreach(iter, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *txn; + + txn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* if the current transaction is larger, remember it */ + if ((!largest) || (txn->size > largest->size)) + largest = txn; + } + + Assert(largest); + Assert(largest->size > 0); + Assert(largest->size <= rb->size); + + return largest; +} + /* * Check whether the logical_decoding_work_mem limit was reached, and if yes * pick the largest (sub)transaction at-a-time to evict and spill its changes to @@ -2430,11 +2919,38 @@ ReorderBufferCheckMemoryLimit(ReorderBuffer *rb) { /* * Pick the largest transaction (or subtransaction) and evict it from - * memory by serializing it to disk. + * memory by streaming, if supported. Otherwise spill to disk. */ - txn = ReorderBufferLargestTXN(rb); + if (ReorderBufferCanStream(rb)) + { + /* + * Pick the largest toplevel transaction and evict it from memory by + * streaming the already decoded part. + */ + txn = ReorderBufferLargestTopTXN(rb); - ReorderBufferSerializeTXN(rb, txn); + /* we know there has to be one, because the size is not zero */ + Assert(txn && !txn->toptxn); + Assert(txn->size > 0); + Assert(rb->size >= txn->size); + + ReorderBufferStreamTXN(rb, txn); + } + else + { + /* + * Pick the largest transaction (or subtransaction) and evict it from + * memory by serializing it to disk. + */ + txn = ReorderBufferLargestTXN(rb); + + /* we know there has to be one, because the size is not zero */ + Assert(txn); + Assert(txn->size > 0); + Assert(rb->size >= txn->size); + + ReorderBufferSerializeTXN(rb, txn); + } /* * After eviction, the transaction should have no entries in memory, @@ -2750,6 +3266,102 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, Assert(ondisk->change.action == change->action); } +static inline bool +ReorderBufferCanStream(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + + return ctx->streaming; +} + +/* + * Send data of a large transaction (and its subtransactions) to the + * output plugin, but using the stream API. + * + * XXX Do we need to check if the transaction has some changes to stream + * (maybe it got streamed right before the commit, which attempts to + * stream it again before the commit)? + */ +static void +ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + Snapshot snapshot_now; + CommandId command_id; + + /* We can never reach here for a sub transaction. */ + Assert(txn->toptxn == NULL); + + /* + * XXX Not sure if we can make any assumptions about base snapshot here, + * similarly to what ReorderBufferCommit() does. That relies on + * base_snapshot getting transferred from subxact in + * ReorderBufferCommitChild(), but that was not yet called as the + * transaction is in-progress. + * + * So just walk the subxacts and use the same logic here. But we only need + * to do that once, when the transaction is streamed for the first time. + * After that we need to reuse the snapshot from the previous run. 
+ */ + if (txn->snapshot_now == NULL) + { + dlist_iter subxact_i; + + /* make sure this transaction is streamed for the first time */ + Assert(!rbtxn_is_streamed(txn)); + + /* at the beginning we should have invalid command ID */ + Assert(txn->command_id == InvalidCommandId); + + dlist_foreach(subxact_i, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur); + ReorderBufferTransferSnapToParent(txn, subtxn); + } + + command_id = FirstCommandId; + snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot, + txn, command_id); + } + else + { + /* the transaction must have been already streamed */ + Assert(rbtxn_is_streamed(txn)); + + /* + * Nah, we already have snapshot from the previous streaming run. We + * assume new subxacts can't move the LSN backwards, and so can't beat + * the LSN condition in the previous branch (so no need to walk + * through subxacts again). In fact, we must not do that as we may be + * using snapshot half-way through the subxact. + */ + command_id = txn->command_id; + + /* + * We can not use txn->snapshot_now directly because after the last + * streaming run, we might have got some new sub-transactions. So we + * need to add them to the snapshot. + */ + snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now, + txn, command_id); + + /* Free the previously copied snapshot. */ + Assert(txn->snapshot_now->copied); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + } + + /* + * Access the main routine to decode the changes and send to output plugin. + */ + ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now, + command_id, true); + + Assert(dlist_is_empty(&txn->changes)); + Assert(txn->nentries == 0); + Assert(txn->nentries_mem == 0); +} + /* * Size of a change in memory. */ @@ -3868,6 +4480,16 @@ ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, BlockNumber blockno; bool updated_mapping = false; + /* + * Return unresolved if tuplecid_data is not valid. That's because when + * streaming in-progress transactions we may run into tuples with the CID + * before actually decoding them. Think e.g. about INSERT followed by + * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the + * INSERT. So in such cases we assume the CID is from the future command. + */ + if (tuplecid_data == NULL) + return false; + /* be careful about padding */ memset(&key, 0, sizeof(key)); diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index 65814af9f5..b3e2b3f64b 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -171,6 +171,7 @@ typedef struct ReorderBufferChange #define RBTXN_HAS_CATALOG_CHANGES 0x0001 #define RBTXN_IS_SUBXACT 0x0002 #define RBTXN_IS_SERIALIZED 0x0004 +#define RBTXN_IS_STREAMED 0x0008 /* Does the transaction have catalog changes? */ #define rbtxn_has_catalog_changes(txn) \ @@ -190,6 +191,24 @@ typedef struct ReorderBufferChange ((txn)->txn_flags & RBTXN_IS_SERIALIZED) != 0 \ ) +/* + * Has this transaction been streamed to downstream? + * + * (It's not possible to deduce this from nentries and nentries_mem for + * various reasons. For example, all changes may be in subtransactions in + * which case we'd have nentries==0 for the toplevel one, which would say + * nothing about the streaming. So we maintain this flag, but only for the + * toplevel transaction.) 
+ * + * Note: We never do both stream and serialize a transaction (we only spill + * to disk when streaming is not supported by the plugin), so only one of + * those two flags may be set at any given time. + */ +#define rbtxn_is_streamed(txn) \ +( \ + ((txn)->txn_flags & RBTXN_IS_STREAMED) != 0 \ +) + typedef struct ReorderBufferTXN { /* See above */ @@ -224,6 +243,11 @@ typedef struct ReorderBufferTXN */ XLogRecPtr final_lsn; + /* + * Toplevel transaction for this subxact (NULL for top-level). + */ + struct ReorderBufferTXN *toptxn; + /* * LSN pointing to the end of the commit record + 1. */ @@ -254,6 +278,13 @@ typedef struct ReorderBufferTXN XLogRecPtr base_snapshot_lsn; dlist_node base_snapshot_node; /* link in txns_by_base_snapshot_lsn */ + /* + * Snapshot/CID from the previous streaming run. Only valid for already + * streamed transactions (NULL/InvalidCommandId otherwise). + */ + Snapshot snapshot_now; + CommandId command_id; + /* * How many ReorderBufferChange's do we have in this txn. * -- 2.23.0 v27/v27-0004-Gracefully-handle-concurrent-aborts-of-uncommitt.patch000644 000765 000024 00000032536 13670411611 026444 0ustar00dilipkumarstaff000000 000000 From be7449ec75c6776e5ebc1b5237cfb88de7d5a193 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Thu, 9 Apr 2020 10:55:19 +0530 Subject: [PATCH v27 04/14] Gracefully handle concurrent aborts of uncommitted transactions that are being decoded alongside. When a transaction aborts, it's changes are considered unnecessary for other transactions. That means the changes may be either cleaned up by vacuum or removed from HOT chains (thus made inaccessible through indexes), and there may be other such consequences. When decoding committed transactions this is not an issue, and we never decode transactions that abort before the decoding starts. But for in-progress transactions, this may cause failures when the output plugin consults catalogs (both system and user-defined). We handle such failures by returning ERRCODE_TRANSACTION_ROLLBACK sqlerrcode from system table scan APIs to the backend decoding a specific uncommitted transaction. The decoding logic on the receipt of such an sqlerrcode aborts the ongoing decoding and returns gracefully. --- doc/src/sgml/logicaldecoding.sgml | 9 +++-- src/backend/access/heap/heapam.c | 10 ++++++ src/backend/access/index/genam.c | 53 ++++++++++++++++++++++++++++ src/backend/access/table/tableam.c | 8 +++++ src/backend/utils/time/snapmgr.c | 13 +++++++ src/include/access/tableam.h | 55 ++++++++++++++++++++++++++++++ src/include/utils/snapmgr.h | 2 ++ 7 files changed, 147 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml index 50cfd6fa47..ab689f8d19 100644 --- a/doc/src/sgml/logicaldecoding.sgml +++ b/doc/src/sgml/logicaldecoding.sgml @@ -433,9 +433,12 @@ typedef void (*LogicalOutputPluginInit) (struct OutputPluginCallbacks *cb); ALTER TABLE user_catalog_table SET (user_catalog_table = true); CREATE TABLE another_catalog_table(data text) WITH (user_catalog_table = true); - Any actions leading to transaction ID assignment are prohibited. That, among others, - includes writing to tables, performing DDL changes, and - calling pg_current_xact_id(). + Note that access to user catalog tables or regular system catalog tables + in the output plugins has to be done via the systable_* + scan APIs only. Access via the heap_* scan APIs will + error out. Additionally, any actions leading to transaction ID assignment + are prohibited. 
 doc/src/sgml/logicaldecoding.sgml  |  9 +++--
 src/backend/access/heap/heapam.c   | 10 ++++++
 src/backend/access/index/genam.c   | 53 ++++++++++++++++++++++++++++
 src/backend/access/table/tableam.c |  8 +++++
 src/backend/utils/time/snapmgr.c   | 13 +++++++
 src/include/access/tableam.h       | 55 ++++++++++++++++++++++++++++++
 src/include/utils/snapmgr.h        |  2 ++
 7 files changed, 147 insertions(+), 3 deletions(-)

diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml
index 50cfd6fa47..ab689f8d19 100644
--- a/doc/src/sgml/logicaldecoding.sgml
+++ b/doc/src/sgml/logicaldecoding.sgml
@@ -433,9 +433,12 @@ typedef void (*LogicalOutputPluginInit) (struct OutputPluginCallbacks *cb);
 ALTER TABLE user_catalog_table SET (user_catalog_table = true);
 CREATE TABLE another_catalog_table(data text) WITH (user_catalog_table = true);
-     Any actions leading to transaction ID assignment are prohibited. That, among others,
-     includes writing to tables, performing DDL changes, and
-     calling pg_current_xact_id().
+     Note that access to user catalog tables or regular system catalog tables
+     in the output plugins has to be done via the systable_* scan APIs only.
+     Access via the heap_* scan APIs will error out.  Additionally, any
+     actions leading to transaction ID assignment are prohibited.  That,
+     among others, includes writing to tables, performing DDL changes, and
+     calling pg_current_xact_id().
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 94eb37d48d..2d77107c4f 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1288,6 +1288,16 @@ heap_getnext(TableScanDesc sscan, ScanDirection direction)
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg_internal("only heap AM is supported")));
 
+	/*
+	 * We don't expect direct calls to heap_getnext with a valid
+	 * CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.  Normally we
+	 * have such a check at the tableam-level API, but heap_getnext is called
+	 * from many places, so we need to ensure it here as well.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected heap_getnext call during logical decoding");
+
 	/* Note: no locking manipulations needed */
 
 	if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index dfba5ae39a..446b8cbc86 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -28,6 +28,7 @@
 #include "lib/stringinfo.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/procarray.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
@@ -429,9 +430,36 @@ systable_beginscan(Relation heapRelation,
 		sysscan->iscan = NULL;
 	}
 
+	/*
+	 * If CheckXidAlive is set, set a flag to indicate that a system table
+	 * scan is in progress.  See the detailed comments in snapmgr.c where
+	 * these variables are declared.
+	 */
+	if (TransactionIdIsValid(CheckXidAlive))
+		bsysscan = true;
+
 	return sysscan;
 }
 
+/*
+ * HandleConcurrentAbort - Handle a concurrent abort of CheckXidAlive.
+ *
+ * Error out if CheckXidAlive has aborted.  We can't directly use
+ * TransactionIdDidAbort, as after a crash such a transaction might not have
+ * been marked as aborted.  See the detailed comments in snapmgr.c where the
+ * variable is declared.
+ */
+static inline void
+HandleConcurrentAbort(void)
+{
+	if (TransactionIdIsValid(CheckXidAlive) &&
+		!TransactionIdIsInProgress(CheckXidAlive) &&
+		!TransactionIdDidCommit(CheckXidAlive))
+		ereport(ERROR,
+				(errcode(ERRCODE_TRANSACTION_ROLLBACK),
+				 errmsg("transaction aborted during system catalog scan")));
+}
+
 /*
  * systable_getnext --- get next tuple in a heap-or-index scan
  *
@@ -481,6 +509,12 @@ systable_getnext(SysScanDesc sysscan)
 		}
 	}
 
+	/*
+	 * Handle a concurrent abort while fetching the catalog tuple during
+	 * logical streaming of a transaction.
+	 */
+	HandleConcurrentAbort();
+
 	return htup;
 }
 
@@ -517,6 +551,12 @@ systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup)
 											sysscan->slot,
 											freshsnap);
 
+	/*
+	 * Handle a concurrent abort while fetching the catalog tuple during
+	 * logical streaming of a transaction.
+	 */
+	HandleConcurrentAbort();
+
 	return result;
 }
 
@@ -545,6 +585,13 @@ systable_endscan(SysScanDesc sysscan)
 	if (sysscan->snapshot)
 		UnregisterSnapshot(sysscan->snapshot);
 
+	/*
+	 * Reset the bsysscan flag at the end of the systable scan.  See the
+	 * detailed comments in snapmgr.c where these variables are declared.
+	 */
+	if (TransactionIdIsValid(CheckXidAlive))
+		bsysscan = false;
+
 	pfree(sysscan);
 }
 
@@ -643,6 +690,12 @@ systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction)
 	if (htup && sysscan->iscan->xs_recheck)
 		elog(ERROR, "system catalog scans with lossy index conditions are not implemented");
 
+	/*
+	 * Handle a concurrent abort while fetching the catalog tuple during
+	 * logical streaming of a transaction.
+	 */
+	HandleConcurrentAbort();
+
 	return htup;
 }
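Note: the reasoning encoded in HandleConcurrentAbort() above can be stated
as a predicate, shown here as an illustrative-only sketch (the function name
xid_is_effectively_aborted is invented; the transam calls are real APIs): a
transaction that is neither in progress nor committed must be treated as
aborted, because a backend that crashed mid-transaction never stamps its
clog entry, so TransactionIdDidAbort() can return false for a de-facto
aborted xid.

    #include "access/transam.h"
    #include "storage/procarray.h"

    /* Illustrative sketch of the abort test used by HandleConcurrentAbort(). */
    static bool
    xid_is_effectively_aborted(TransactionId xid)
    {
        return TransactionIdIsValid(xid) &&
               !TransactionIdIsInProgress(xid) &&
               !TransactionIdDidCommit(xid);
    }
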
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index c814733b22..2f52b407c6 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -230,6 +230,14 @@ table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
 	Relation	rel = scan->rs_rd;
 	const TableAmRoutine *tableam = rel->rd_tableam;
 
+	/*
+	 * We don't expect direct calls to table_tuple_get_latest_tid with a
+	 * valid CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
+
 	/*
 	 * Since this can be called with user-supplied TID, don't trust the input
 	 * too much.
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 1c063c592c..9f1ecd123f 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -153,6 +153,19 @@ static Snapshot SecondarySnapshot = NULL;
 static Snapshot CatalogSnapshot = NULL;
 static Snapshot HistoricSnapshot = NULL;
 
+/*
+ * CheckXidAlive is an xid value pointing to a possibly ongoing (sub)
+ * transaction.  Currently it is used in logical decoding.  It's possible
+ * that such transactions get aborted while the decoding is ongoing, in
+ * which case we skip decoding that particular transaction.  To ensure
+ * that, we check whether CheckXidAlive has aborted after fetching each
+ * tuple from system tables.  We also ensure that during logical decoding
+ * we never directly access the tableam or heap APIs, because we check for
+ * concurrent aborts only in the systable_* APIs.
+ */
+TransactionId CheckXidAlive = InvalidTransactionId;
+bool		bsysscan = false;
+
 /*
  * These are updated by GetSnapshotData.  We initialize them this way
  * for the convenience of TransactionIdIsInProgress: even in bootstrap
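Note: the tableam.h hunks below repeat the same guard in every inline
wrapper.  To make the pattern explicit, the repeated check is equivalent to
the following hypothetical helper; assert_no_bypass_during_decoding is a
made-up name for illustration, and the patch deliberately keeps the check
inline in each wrapper instead.

    /*
     * Hypothetical helper, for illustration only: each table_* wrapper
     * errors out when reached while decoding an in-progress transaction
     * (CheckXidAlive is valid) outside a systable_* scan (bsysscan is
     * false), since only the systable_* layer performs the
     * concurrent-abort check.
     */
    static inline void
    assert_no_bypass_during_decoding(const char *func)
    {
        if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
            elog(ERROR, "unexpected %s call during logical decoding", func);
    }
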
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index eb18739c36..2b7d3df617 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -21,6 +21,7 @@
 #include "access/sdir.h"
 #include "utils/guc.h"
 #include "utils/rel.h"
+#include "utils/snapmgr.h"
 #include "utils/snapshot.h"
 
 
@@ -903,6 +904,15 @@ static inline bool
 table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 {
 	slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);
+
+	/*
+	 * We don't expect direct calls to table_scan_getnextslot with a valid
+	 * CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected table_scan_getnextslot call during logical decoding");
+
 	return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);
 }
 
@@ -1015,6 +1025,13 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 						TupleTableSlot *slot,
 						bool *call_again, bool *all_dead)
 {
+	/*
+	 * We don't expect direct calls to table_index_fetch_tuple with a valid
+	 * CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding");
+
 	return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
 													slot, call_again,
@@ -1054,6 +1071,14 @@ table_tuple_fetch_row_version(Relation rel,
 							  Snapshot snapshot,
 							  TupleTableSlot *slot)
 {
+	/*
+	 * We don't expect direct calls to table_tuple_fetch_row_version with a
+	 * valid CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding");
+
 	return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot);
 }
 
@@ -1710,6 +1735,14 @@ static inline bool
 table_scan_bitmap_next_block(TableScanDesc scan,
 							 struct TBMIterateResult *tbmres)
 {
+	/*
+	 * We don't expect direct calls to table_scan_bitmap_next_block with a
+	 * valid CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding");
+
 	return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan,
 														   tbmres);
 }
 
@@ -1727,6 +1760,14 @@ table_scan_bitmap_next_tuple(TableScanDesc scan,
 							 struct TBMIterateResult *tbmres,
 							 TupleTableSlot *slot)
 {
+	/*
+	 * We don't expect direct calls to table_scan_bitmap_next_tuple with a
+	 * valid CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding");
+
 	return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan,
 														   tbmres,
 														   slot);
 
@@ -1745,6 +1786,13 @@ static inline bool
 table_scan_sample_next_block(TableScanDesc scan,
 							 struct SampleScanState *scanstate)
 {
+	/*
+	 * We don't expect direct calls to table_scan_sample_next_block with a
+	 * valid CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected table_scan_sample_next_block call during logical decoding");
+
 	return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate);
 }
 
@@ -1761,6 +1809,13 @@ table_scan_sample_next_tuple(TableScanDesc scan,
 							 struct SampleScanState *scanstate,
 							 TupleTableSlot *slot)
 {
+	/*
+	 * We don't expect direct calls to table_scan_sample_next_tuple with a
+	 * valid CheckXidAlive for catalog or regular tables.  See the detailed
+	 * comments in snapmgr.c where these variables are declared.
+	 */
+	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+		elog(ERROR, "unexpected table_scan_sample_next_tuple call during logical decoding");
+
 	return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate,
 														   slot);
 }
 
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index b28d13ce84..5af6df698b 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -145,6 +145,8 @@ extern bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot);
 
 /* Support for catalog timetravel for logical decoding */
 struct HTAB;
+extern TransactionId CheckXidAlive;
+extern bool bsysscan;
 extern struct HTAB *HistoricSnapshotGetTupleCids(void);
 extern void SetupHistoricSnapshot(Snapshot snapshot_now, struct HTAB *tuplecids);
 extern void TeardownHistoricSnapshot(bool is_error);
-- 
2.23.0
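
Note: the commit message of the 0004 patch says the decoding logic catches
this sqlerrcode and returns gracefully; that catching side lives in the
streaming ReorderBuffer code rather than in this patch.  For illustration
only, a caller could recognize a concurrent abort roughly as in the sketch
below.  The function decode_changes_handling_concurrent_abort and its
context are invented for this example; only the error-handling calls
(PG_TRY, CopyErrorData, FlushErrorState, PG_RE_THROW, and friends) are real
PostgreSQL APIs.

    /*
     * Illustrative sketch: distinguish a concurrent abort, signalled via
     * ERRCODE_TRANSACTION_ROLLBACK, from any other error, and clean up
     * instead of propagating it.
     */
    static void
    decode_changes_handling_concurrent_abort(MemoryContext caller_cxt)
    {
        PG_TRY();
        {
            /* ... decode the in-progress transaction's changes here ... */
        }
        PG_CATCH();
        {
            ErrorData  *errdata;

            /* CopyErrorData requires switching out of the error context */
            MemoryContextSwitchTo(caller_cxt);
            errdata = CopyErrorData();

            if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK)
            {
                /* concurrent abort: discard partial work and carry on */
                FlushErrorState();
                FreeErrorData(errdata);
            }
            else
            {
                FreeErrorData(errdata);
                PG_RE_THROW();
            }
        }
        PG_END_TRY();
    }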