From 952e6fbab1a735f677d8e6e92ba0aa8d53d9ab3e Mon Sep 17 00:00:00 2001 From: Craig Ringer Date: Tue, 10 Apr 2018 14:08:32 +0800 Subject: [PATCH v3] PANIC when we detect a possible fsync I/O error instead of retrying fsync Panic the server on fsync failure in places where we can't simply repeat the whole operation on retry. Most importantly, panic when fsync fails during a checkpoint. This will result in log messages like: PANIC: 58030: could not fsync file "base/12367/16386": Input/output error LOG: 00000: checkpointer process (PID 10799) was terminated by signal 6: Aborted and, if the condition persists during redo: LOG: 00000: checkpoint starting: end-of-recovery immediate PANIC: 58030: could not fsync file "base/12367/16386": Input/output error LOG: 00000: startup process (PID 10808) was terminated by signal 6: Aborted Why? In a number of places in PostgreSQL we responded to fsync() errors by retrying the fsync(), expecting that this would force the operating system to repeat any write attempts. The code assumed that fsync() would return an error on all subsequent calls until any I/O error was resolved. This is not what actually happens on some platforms, including Linux. The operating system may give up and drop dirty buffers for async writes on the floor and mark the page mapping as bad. The first fsync() clears any error flag from the page entry and/or our file descriptor. So a subsequent fsync() returns success, even though the data PostgreSQL wrote was really discarded. We have no way to find out which writes failed, and no way to ask the kernel to retry indefinitely, so all we can do is PANIC. Redo will attempt the write again, and if it fails again, it will also PANIC. This doesn't completely prevent fsync reliability issues, because it only handles cases where the kernel actually reports the error to us. It's entirely possible for a buffered write to be lost without causing fsync to report an error at all (see discussion below). 
Work on addressing those issues and documenting them is ongoing and will be committed separately. Because NFS on Linux performs an implicit fsync() on close(), we also PANIC on close() failures for non-temporary files managed by fd.c. See: * /message-id/CAMsr%2BYHh%2B5Oq4xziwwoEfhoTZgr07vdGG%2Bhu%3D1adXx59aTeaoQ%40mail.gmail.com * /message-id/20180427222842.in2e4mibx45zdth5@alap3.anarazel.de * https://lwn.net/Articles/752063/ * https://lwn.net/Articles/753650/ * https://lwn.net/Articles/752952/ * https://lwn.net/Articles/752613/ --- src/backend/access/heap/rewriteheap.c | 6 ++-- src/backend/access/transam/timeline.c | 4 +-- src/backend/access/transam/twophase.c | 2 +- src/backend/access/transam/xlog.c | 9 +++++- src/backend/replication/logical/snapbuild.c | 3 ++ src/backend/storage/file/fd.c | 50 +++++++++++++++++++++++++---- src/backend/storage/smgr/md.c | 22 +++++++++---- src/backend/utils/cache/relmapper.c | 2 +- src/backend/utils/error/elog.c | 11 +++++++ src/include/utils/elog.h | 3 +- 10 files changed, 91 insertions(+), 21 deletions(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 8d3c861a33..a86a2f3824 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -965,7 +965,7 @@ logical_end_heap_rewrite(RewriteState state) while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) { if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", src->path))); FileClose(src->vfd); @@ -1180,7 +1180,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r) */ pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC); if (pg_fsync(fd) != 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); @@ -1279,7 +1279,7 @@ 
CheckPointLogicalRewriteHeap(void) */ pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC); if (pg_fsync(fd) != 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index 61d36050c3..65e5ff6a82 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -406,7 +406,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC); if (pg_fsync(fd) != 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); pgstat_report_wait_end(); @@ -485,7 +485,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC); if (pg_fsync(fd) != 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); pgstat_report_wait_end(); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 65194db70e..350238e2fb 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1687,7 +1687,7 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) if (pg_fsync(fd) != 0) { CloseTransientFile(fd); - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync two-phase state file: %m"))); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index adbd6a2126..c6c6b64826 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3268,7 +3268,14 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) 
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); if (pg_fsync(fd) != 0) { + int save_errno = errno; + unlink(tmppath); close(fd); + errno = save_errno; + /* + * This fsync failure is not PANIC-worthy because it's still a temp + * file at this time. + */ ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); @@ -3435,7 +3442,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC); if (pg_fsync(fd) != 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); pgstat_report_wait_end(); diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 4123cdebcf..31ab7c1de9 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -1616,6 +1616,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) * fsync the file before renaming so that even if we crash after this we * have either a fully valid file or nothing. * + * It's safe to just ERROR on fsync() here because we'll retry the whole + * operation including the writes. + * * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has * some noticeable overhead since it's performed synchronously during * decoding? diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 441f18dcf5..9a29eaef7b 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -353,6 +353,17 @@ pg_fsync(int fd) /* * pg_fsync_no_writethrough --- same as fsync except does nothing if * enableFsync is off + * + * WARNING: It is unsafe to retry fsync() calls without repeating the preceding + * writes. fsync() clears the error flag on some platforms (including Linux, + * true up to at least 4.14) when it reports the error to the caller. 
A second + * call may return success even though writes are lost. Many callers test the + * return value and PANIC on failure so that redo repeats the writes. It is + * safe to ERROR instead if the whole operation can be retried without needing + * WAL redo. + * + * See https://lwn.net/Articles/752063/ + * and /message-id/CAMsr%2BYHh%2B5Oq4xziwwoEfhoTZgr07vdGG%2Bhu%3D1adXx59aTeaoQ%40mail.gmail.com */ int pg_fsync_no_writethrough(int fd) @@ -443,7 +454,12 @@ pg_flush_data(int fd, off_t offset, off_t nbytes) rc = sync_file_range(fd, offset, nbytes, SYNC_FILE_RANGE_WRITE); - /* don't error out, this is just a performance optimization */ + /* + * Don't error out, this is just a performance optimization. + * + * sync_file_range(SYNC_FILE_RANGE_WRITE) won't clear any error flags, + * so we don't have to worry about this impacting fsync reliability. + */ if (rc != 0) { ereport(WARNING, @@ -518,7 +534,12 @@ pg_flush_data(int fd, off_t offset, off_t nbytes) rc = msync(p, (size_t) nbytes, MS_ASYNC); if (rc != 0) { - ereport(WARNING, + /* + * We must panic here to preserve fsync reliability, + * as msync may clear the fsync error state on some + * OSes. See pg_fsync_no_writethrough(). + */ + ereport(PANIC, (errcode_for_file_access(), errmsg("could not flush dirty data: %m"))); /* NB: need to fall through to munmap()! */ @@ -1046,11 +1067,17 @@ LruDelete(File file) } /* - * Close the file. We aren't expecting this to fail; if it does, better - * to leak the FD than to mess up our internal state. + * Close the file. We aren't expecting this to fail; if it does, we need + * to PANIC on I/O errors for non-temporary files in case it's an + * important file. That's because NFS on Linux may do an implicit fsync() + * on close() which can cause similar issues to those discussed in the + * comments on pg_fsync. + * + * Otherwise, better to leak the FD than to mess up our internal state. 
*/ if (close(vfdP->fd)) - elog(LOG, "could not close file \"%s\": %m", vfdP->fileName); + elog(vfdP->fdstate & FD_DELETE_AT_CLOSE ? LOG : promote_ioerr_to_panic(LOG), + "could not close file \"%s\": %m", vfdP->fileName); vfdP->fd = VFD_CLOSED; --nfile; @@ -1754,7 +1781,14 @@ FileClose(File file) { /* close the file */ if (close(vfdP->fd)) - elog(LOG, "could not close file \"%s\": %m", vfdP->fileName); + { + /* + * We must panic on failure to close non-temporary files; see + * LruDelete. + */ + elog(vfdP->fdstate & FD_DELETE_AT_CLOSE ? LOG : promote_ioerr_to_panic(LOG), + "could not close file \"%s\": %m", vfdP->fileName); + } --nfile; vfdP->fd = VFD_CLOSED; @@ -3250,6 +3284,10 @@ looks_like_temp_rel_name(const char *name) * harmless cases such as read-only files in the data directory, and that's * not good either. * + * Importantly, on Linux (true in 4.14) and some other platforms, fsync errors + * will consume the error, causing a subsequent fsync to succeed even though + * the writes did not succeed. See pg_fsync_no_writethrough(). + * * Note we assume we're chdir'd into PGDATA to begin with. */ void diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2ec103e604..c604ec4e28 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1038,7 +1038,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(v->mdfd_vfd)))); @@ -1265,13 +1265,22 @@ mdsync(void) * _mdfd_getseg() and for FileSync, since fd.c might have * closed the file behind our back. * - * XXX is there any point in allowing more than one retry? - * Don't see one at the moment, but easy to change the - * test here if so. 
+ * It's unsafe to ignore failures for other errors, + * particularly EIO or (undocumented, but possible) ENOSPC. + * The first fsync() will clear any error flag on dirty + * buffers pending writeback and/or the file descriptor, so + * a second fsync reports success despite the buffers + * possibly not being written. (Verified on Linux 4.14). + * To cope with this we must PANIC and redo all writes + * since the last successful checkpoint. See discussion at: + * + * /message-id/CAMsr%2BYHh%2B5Oq4xziwwoEfhoTZgr07vdGG%2Bhu%3D1adXx59aTeaoQ%40mail.gmail.com + * + * for details. */ if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); @@ -1280,6 +1289,7 @@ mdsync(void) (errcode_for_file_access(), errmsg("could not fsync file \"%s\" but retrying: %m", path))); + pfree(path); /* @@ -1444,7 +1454,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) (errmsg("could not forward fsync request because request queue is full"))); if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c index 99d095f2df..662812b799 100644 --- a/src/backend/utils/cache/relmapper.c +++ b/src/backend/utils/cache/relmapper.c @@ -795,7 +795,7 @@ write_relmap_file(bool shared, RelMapFile *newmap, */ pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC); if (pg_fsync(fd) != 0) - ereport(ERROR, + ereport(promote_ioerr_to_panic(ERROR), (errcode_for_file_access(), errmsg("could not fsync relation mapping file \"%s\": %m", mapfilename))); diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 16531f7a0f..e202094642 100644 --- a/src/backend/utils/error/elog.c +++ 
b/src/backend/utils/error/elog.c @@ -692,6 +692,17 @@ errcode_for_socket_access(void) return 0; /* return value does not matter */ } +/* + * PostgreSQL needs to PANIC on EIO in some cases to preserve data integrity. + * See explanation on pg_fsync for details. This keeps that logic in one place. + */ +int +promote_ioerr_to_panic(int elevel) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + return (edata->saved_errno == EIO || edata->saved_errno == ENOSPC) ? PANIC : elevel; +} + /* * This macro handles expansion of a format string and associated parameters; diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index 7a9ba7f2ff..abd078075c 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -133,6 +133,8 @@ extern int errcode(int sqlerrcode); extern int errcode_for_file_access(void); extern int errcode_for_socket_access(void); +extern int promote_ioerr_to_panic(int elevel); + extern int errmsg(const char *fmt,...) pg_attribute_printf(1, 2); extern int errmsg_internal(const char *fmt,...) pg_attribute_printf(1, 2); @@ -182,7 +184,6 @@ extern int geterrcode(void); extern int geterrposition(void); extern int getinternalerrposition(void); - /*---------- * Old-style error reporting API: to be used in this way: * elog(ERROR, "portal \"%s\" not found", stmt->portalname); -- 2.14.3