From 5be13fbfd31ab5af9265ac9cb3adcf0589bf2a8e Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy Date: Tue, 14 Jul 2020 07:23:14 +0530 Subject: [PATCH v3] Retry Cached Remote Connections For postgres_fdw Remote connections are cached for postgres_fdw in the local backend. There are high chances that the remote backend can no logner be avaiable i.e. it can get killed, the subsequent foreign queries from local backend/session that uses cached connection fails as the remote backend woule have become unavailable/killed. So, this patch, solves this problem, 1. local backend/session uses cached connection, but recieves error 2. upon receiving the first error, it deletes the cached entry 3. try to get new connection at the start of begin remote xact --- contrib/postgres_fdw/connection.c | 65 ++++++++++++++++--- .../postgres_fdw/expected/postgres_fdw.out | 55 ++++++++++++++++ contrib/postgres_fdw/sql/postgres_fdw.sql | 31 +++++++++ 3 files changed, 143 insertions(+), 8 deletions(-) diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 52d1fe3563..b7ae8dbbac 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -169,12 +169,6 @@ GetConnection(UserMapping *user, bool will_prep_stmt) disconnect_pg_server(entry); } - /* - * We don't check the health of cached connection here, because it would - * require some overhead. Broken connection will be detected when the - * connection is actually used. - */ - /* * If cache entry doesn't have a connection, we have to establish a new * connection. (If connect_pg_server throws an error, the cache entry @@ -205,9 +199,64 @@ GetConnection(UserMapping *user, bool will_prep_stmt) } /* - * Start a new transaction or subtransaction if needed. + * We check the health of cached connection here, while the remote xact is + * going to begin. Broken connection will be detected and if so, get a + * new connection and resubmit the remote xact begin. We retry the broken + * connections only once during each begin remote xact call, still the broken + * connection exist after that then, the query fails. */ - begin_remote_xact(entry); + PG_TRY(); + { + /* Start a new transaction or subtransaction if needed. */ + begin_remote_xact(entry); + } + PG_CATCH(); + { + if (entry->conn && + PQstatus(entry->conn) == CONNECTION_BAD && + entry->xact_depth <= 0) + { + /* server object will be used for log messages. */ + ForeignServer *server = GetForeignServer(user->serverid); + + /* + * xact_depth <=0, which means we are retrying only at the beginning + * of remote xact. This is sensible, since we can't retry in the + * middle of a remote xact. + */ + elog(DEBUG3, "postgres_fdw connection %p to the server \"%s\" is broken. retrying to connect", + entry->conn, server->servername); + + PQreset(entry->conn); + + if (entry->conn && PQstatus(entry->conn) == CONNECTION_OK) + elog(DEBUG3, "postgres_fdw connection retry to the server \"%s\" is successful", + server->servername); + else + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg("could not connect to server \"%s\"", + server->servername), + errdetail_internal("%s", pchomp(PQerrorMessage(entry->conn))))); + + /* Retry starting a new transaction. */ + begin_remote_xact(entry); + } + else + { + /* + * We are here, due to either some error other than CONNECTION_BAD + * occurred or connection may have broken during start of a + * subtransacation. Just, clear off any result, try rethrowing the + * error, so that it will be caught appropriately. + */ + PGresult *res = NULL; + res = PQgetResult(entry->conn); + PQclear(res); + PG_RE_THROW(); + } + } + PG_END_TRY(); /* Remember if caller will prepare statements */ entry->have_prep_stmt |= will_prep_stmt; diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 90db550b92..46511a340e 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -8974,3 +8974,58 @@ PREPARE TRANSACTION 'fdw_tpc'; ERROR: cannot PREPARE a transaction that has operated on postgres_fdw foreign tables ROLLBACK; WARNING: there is no transaction in progress +-- Retry cached connections at the beginning of the remote xact +-- in case remote backend is killed. +-- Let's use a different application name for remote connection, +-- so that this test will not kill other backends wrongly. +ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check'); +-- Generate a connection to remote. Local backend will cache it. +SELECT * FROM ft1 LIMIT 1; + c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 +------+----+----+----+----+----+------------+---- + 1111 | 2 | | | | | ft1 | +(1 row) + +-- Retrieve pid of remote backend with application name fdw_retry_check +-- and kill it intentionally here. Note that, local backend still has +-- the remote connection/backend info in it's cache. +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE +backend_type = 'client backend' AND application_name = 'fdw_retry_check'; + pg_terminate_backend +---------------------- + t +(1 row) + +-- Next query using the same foreign server should succeed if connection +-- retry works. +SELECT * FROM ft1 LIMIT 1; + c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 +------+----+----+----+----+----+------------+---- + 1111 | 2 | | | | | ft1 | +(1 row) + +-- Subtransaction - remote backend is killed in the middle of a remote +-- xact, and we don't do retry connection, hence the subsequent query +-- using the same foreign server should fail. +BEGIN; +DECLARE c CURSOR FOR SELECT * FROM ft1 LIMIT 1; +FETCH c; + c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 +------+----+----+----+----+----+------------+---- + 1111 | 2 | | | | | ft1 | +(1 row) + +SAVEPOINT s; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE +backend_type = 'client backend' AND application_name = 'fdw_retry_check'; + pg_terminate_backend +---------------------- + t +(1 row) + +SELECT * FROM ft1 LIMIT 1; -- should fail +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +CONTEXT: remote SQL command: SAVEPOINT s2 +COMMIT; diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 83971665e3..6f647450e7 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -2634,3 +2634,34 @@ SELECT count(*) FROM ft1; -- error here PREPARE TRANSACTION 'fdw_tpc'; ROLLBACK; + +-- Retry cached connections at the beginning of the remote xact +-- in case remote backend is killed. +-- Let's use a different application name for remote connection, +-- so that this test will not kill other backends wrongly. +ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check'); + +-- Generate a connection to remote. Local backend will cache it. +SELECT * FROM ft1 LIMIT 1; + +-- Retrieve pid of remote backend with application name fdw_retry_check +-- and kill it intentionally here. Note that, local backend still has +-- the remote connection/backend info in it's cache. +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE +backend_type = 'client backend' AND application_name = 'fdw_retry_check'; + +-- Next query using the same foreign server should succeed if connection +-- retry works. +SELECT * FROM ft1 LIMIT 1; + +-- Subtransaction - remote backend is killed in the middle of a remote +-- xact, and we don't do retry connection, hence the subsequent query +-- using the same foreign server should fail. +BEGIN; +DECLARE c CURSOR FOR SELECT * FROM ft1 LIMIT 1; +FETCH c; +SAVEPOINT s; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE +backend_type = 'client backend' AND application_name = 'fdw_retry_check'; +SELECT * FROM ft1 LIMIT 1; -- should fail +COMMIT; -- 2.25.1