diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 2ef21fb..356aed4 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2500,6 +2500,24 @@ include_dir 'conf.d' + + checkpoint_flush_to_disk (bool) + + checkpoint_flush_to_disk configuration parameter + + + + + When writing data for a checkpoint, hint the underlying OS that the + data must be sent to disk as soon as possible. This may help smoothing + disk I/O writes and avoid a stall when fsync is issued at the end of + the checkpoint, but it may also reduce average performance. + This setting may have no effect on some platforms. + The default is on on Linux, off otherwise. + + + + min_wal_size (integer) diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index f538698..1b658f2 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -558,6 +558,18 @@ + On Linux and POSIX platforms, + allows to hint the OS that pages written on checkpoints must be flushed + to disk quickly. Otherwise, these pages may be kept in cache for some time, + inducing a stall later when fsync is called to actually + complete the checkpoint. This setting helps to reduce transaction latency, + but it may also have a small adverse effect on the average transaction rate + at maximum throughput on some OS. It should be beneficial for high write + loads on HDD. This feature probably brings no benefit on SSD, as the I/O + write latency is small on such hardware, thus it may be disabled. + + + The number of WAL segment files in pg_xlog directory depends on min_wal_size, max_wal_size and the amount of WAL generated in previous checkpoint cycles. When old log diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 6a6fc3b..2a8f645 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -918,7 +918,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state) * Note that we deviate from the usual WAL coding practices here, * check the above "Logical rewrite support" comment for reasoning. */ - written = FileWrite(src->vfd, waldata_start, len); + written = FileWrite(src->vfd, waldata_start, len, false, NULL); if (written != len) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index cf4a6dc..4b5e9cd 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -203,7 +203,7 @@ btbuildempty(PG_FUNCTION_ARGS) /* Write the page. If archiving/streaming, XLOG it. */ PageSetChecksumInplace(metapage, BTREE_METAPAGE); smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, - (char *) metapage, true); + (char *) metapage, true, false, NULL); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, BTREE_METAPAGE, metapage, false); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index f95f67a..ea7a45d 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -315,7 +315,7 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) { /* overwriting a block we zero-filled before */ smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, - (char *) page, true); + (char *) page, true, false, NULL); } pfree(page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index bceee8d..b700efb 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -170,7 +170,7 @@ spgbuildempty(PG_FUNCTION_ARGS) /* Write the page. If archiving/streaming, XLOG it. */ PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, - (char *) page, true); + (char *) page, true, false, NULL); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, page, false); @@ -180,7 +180,7 @@ spgbuildempty(PG_FUNCTION_ARGS) PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO, - (char *) page, true); + (char *) page, true, false, NULL); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, SPGIST_ROOT_BLKNO, page, true); @@ -190,7 +190,7 @@ spgbuildempty(PG_FUNCTION_ARGS) PageSetChecksumInplace(page, SPGIST_NULL_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO, - (char *) page, true); + (char *) page, true, false, NULL); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, SPGIST_NULL_BLKNO, page, true); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 3b3a09e..e361907 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -665,7 +665,8 @@ ImmediateCheckpointRequested(void) * fraction between 0.0 meaning none, and 1.0 meaning all done. */ void -CheckpointWriteDelay(int flags, double progress) +CheckpointWriteDelay(int flags, double progress, + FileFlushContext * context, int ctx_size) { static int absorb_counter = WRITES_PER_ABSORB; @@ -700,6 +701,26 @@ CheckpointWriteDelay(int flags, double progress) */ pgstat_send_bgwriter(); +#if defined(HAVE_SYNC_FILE_RANGE) || defined(HAVE_POSIX_FADVISE) + + /* + * Before sleeping, flush written blocks for each tablespace. + */ + if (checkpoint_flush_to_disk) + { + int i; + + for (i = 0; i < ctx_size; i++) + { + if (context[i].ncalls != 0) + { + PerformFileFlush(&context[i]); + ResetFileFlushContext(&context[i]); + } + } + } +#endif /* HAVE_SYNC_FILE_RANGE || HAVE_POSIX_FADVISE */ + /* * This sleep used to be connected to bgwriter_delay, typically 200ms. * That resulted in more frequent wakeups if not much work to do. diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 09af13b..deacec1 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -105,6 +105,8 @@ int bgwriter_lru_maxpages = 100; double bgwriter_lru_multiplier = 2.0; bool track_io_timing = false; int effective_io_concurrency = 0; +/* hint to move writes to high priority */ +bool checkpoint_flush_to_disk = DEFAULT_CHECKPOINT_FLUSH_TO_DISK; bool checkpoint_sort = true; /* @@ -427,7 +429,8 @@ static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy); static void PinBuffer_Locked(volatile BufferDesc *buf); static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner); static void BufferSync(int flags); -static int SyncOneBuffer(int buf_id, bool skip_recently_used); +static int SyncOneBuffer(int buf_id, bool skip_recently_used, + bool flush_to_disk, FileFlushContext *context); static void WaitIO(volatile BufferDesc *buf); static bool StartBufferIO(volatile BufferDesc *buf, bool forInput); static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty, @@ -440,7 +443,8 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr); -static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln); +static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln, + bool flush_to_disk, FileFlushContext *context); static void AtProcExit_Buffers(int code, Datum arg); static void CheckForBufferLeaks(void); static int rnode_comparator(const void *p1, const void *p2); @@ -1107,7 +1111,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, smgr->smgr_rnode.node.dbNode, smgr->smgr_rnode.node.relNode); - FlushBuffer(buf, NULL); + FlushBuffer(buf, NULL, false, NULL); LWLockRelease(buf->content_lock); TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum, @@ -1770,6 +1774,7 @@ BufferSync(int flags) HTAB *spcBuffers; TableSpaceCheckpointStatus *spcStatus = NULL; int nb_spaces, space; + FileFlushContext * spcContext = NULL; /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); @@ -1857,10 +1862,12 @@ BufferSync(int flags) TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write); - /* Build checkpoint tablespace buffer status */ + /* Build checkpoint tablespace buffer status & flush context arrays */ nb_spaces = hash_get_num_entries(spcBuffers); spcStatus = (TableSpaceCheckpointStatus *) palloc(sizeof(TableSpaceCheckpointStatus) * nb_spaces); + spcContext = (FileFlushContext *) + palloc(sizeof(FileFlushContext) * nb_spaces); { int index = 0; @@ -1877,6 +1884,12 @@ BufferSync(int flags) /* should it be randomized? chosen with some criterion? */ spcStatus[index].index = 0; +#if defined(HAVE_SYNC_FILE_RANGE) || defined(HAVE_POSIX_FADVISE) + + ResetFileFlushContext(&spcContext[index]); + +#endif /* HAVE_SYNC_FILE_RANGE || HAVE_POSIX_FADVISE */ + index ++; } } @@ -1932,7 +1945,8 @@ BufferSync(int flags) */ if (bufHdr != NULL && bufHdr->flags & BM_CHECKPOINT_NEEDED) { - if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN) + if (SyncOneBuffer(buf_id, false, checkpoint_flush_to_disk, + &spcContext[space]) & BUF_WRITTEN) { TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); BgWriterStats.m_buf_written_checkpoints++; @@ -1942,7 +1956,8 @@ BufferSync(int flags) /* * Sleep to throttle our I/O rate. */ - CheckpointWriteDelay(flags, (double) num_written / num_to_write); + CheckpointWriteDelay(flags, (double) num_written / num_to_write, + spcContext, nb_spaces); } } @@ -1959,6 +1974,13 @@ BufferSync(int flags) if (spcStatus[space].index >= num_to_write || spcStatus[space].num_written >= spcStatus[space].num_to_write) { +#if defined(HAVE_SYNC_FILE_RANGE) || defined(HAVE_POSIX_FADVISE) + + PerformFileFlush(&spcContext[space]); + ResetFileFlushContext(&spcContext[space]); + +#endif /* HAVE_SYNC_FILE_RANGE || HAVE_POSIX_FADVISE */ + nb_spaces--; if (space != nb_spaces) spcStatus[space] = spcStatus[nb_spaces]; @@ -1969,6 +1991,8 @@ BufferSync(int flags) pfree(spcStatus); spcStatus = NULL; + pfree(spcContext); + spcContext = NULL; /* * Update checkpoint statistics. As noted above, this doesn't include @@ -2216,7 +2240,7 @@ BgBufferSync(void) /* Execute the LRU scan */ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) { - int buffer_state = SyncOneBuffer(next_to_clean, true); + int buffer_state = SyncOneBuffer(next_to_clean, true, false, NULL); if (++next_to_clean >= NBuffers) { @@ -2293,7 +2317,8 @@ BgBufferSync(void) * Note: caller must have done ResourceOwnerEnlargeBuffers. */ static int -SyncOneBuffer(int buf_id, bool skip_recently_used) +SyncOneBuffer(int buf_id, bool skip_recently_used, bool flush_to_disk, + FileFlushContext * context) { volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id); int result = 0; @@ -2334,7 +2359,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used) PinBuffer_Locked(bufHdr); LWLockAcquire(bufHdr->content_lock, LW_SHARED); - FlushBuffer(bufHdr, NULL); + FlushBuffer(bufHdr, NULL, flush_to_disk, context); LWLockRelease(bufHdr->content_lock); UnpinBuffer(bufHdr, true); @@ -2596,9 +2621,16 @@ BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, * * If the caller has an smgr reference for the buffer's relation, pass it * as the second parameter. If not, pass NULL. + * + * The third parameter tries to hint the OS that a high priority write is meant, + * possibly because io-throttling is already managed elsewhere. + * The last parameter holds the current flush context that accumulates flush + * requests to be performed in one call, instead of being performed on a buffer + * per buffer basis. */ static void -FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) +FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln, bool flush_to_disk, + FileFlushContext * context) { XLogRecPtr recptr; ErrorContextCallback errcallback; @@ -2687,7 +2719,9 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) buf->tag.forkNum, buf->tag.blockNum, bufToWrite, - false); + false, + flush_to_disk, + context); if (track_io_timing) { @@ -3109,7 +3143,9 @@ FlushRelationBuffers(Relation rel) bufHdr->tag.forkNum, bufHdr->tag.blockNum, localpage, - false); + false, + false, + NULL); bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); @@ -3143,7 +3179,7 @@ FlushRelationBuffers(Relation rel) { PinBuffer_Locked(bufHdr); LWLockAcquire(bufHdr->content_lock, LW_SHARED); - FlushBuffer(bufHdr, rel->rd_smgr); + FlushBuffer(bufHdr, rel->rd_smgr, false, NULL); LWLockRelease(bufHdr->content_lock); UnpinBuffer(bufHdr, true); } @@ -3195,7 +3231,7 @@ FlushDatabaseBuffers(Oid dbid) { PinBuffer_Locked(bufHdr); LWLockAcquire(bufHdr->content_lock, LW_SHARED); - FlushBuffer(bufHdr, NULL); + FlushBuffer(bufHdr, NULL, false, NULL); LWLockRelease(bufHdr->content_lock); UnpinBuffer(bufHdr, true); } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3144afe..114a0a6 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -208,7 +208,9 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bufHdr->tag.forkNum, bufHdr->tag.blockNum, localpage, - false); + false, + false, + NULL); /* Mark not-dirty now in case we error out below */ bufHdr->flags &= ~BM_DIRTY; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index ea4d689..fb3b383 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -317,7 +317,7 @@ BufFileDumpBuffer(BufFile *file) return; /* seek failed, give up */ file->offsets[file->curFile] = file->curOffset; } - bytestowrite = FileWrite(thisfile, file->buffer + wpos, bytestowrite); + bytestowrite = FileWrite(thisfile, file->buffer + wpos, bytestowrite, false, NULL); if (bytestowrite <= 0) return; /* failed to write */ file->offsets[file->curFile] += bytestowrite; diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 1ba4946..e880a9e 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1344,8 +1344,97 @@ retry: return returnCode; } +#if defined(HAVE_SYNC_FILE_RANGE) || defined(HAVE_POSIX_FADVISE) + +void +ResetFileFlushContext(FileFlushContext * context) +{ + context->fd = 0; + context->ncalls = 0; + context->offset = 0; + context->nbytes = 0; + context->filename = NULL; +} + +void +PerformFileFlush(FileFlushContext * context) +{ + if (context->ncalls != 0) + { + int rc; + +#if defined(HAVE_SYNC_FILE_RANGE) + + /* + * Linux: tell the memory manager to move these blocks to io so + * that they are considered for being actually written to disk. + */ + rc = sync_file_range(context->fd, context->offset, context->nbytes, + SYNC_FILE_RANGE_WRITE); + +#elif defined(HAVE_POSIX_FADVISE) + + /* + * Others: say that data should not be kept in memory... + * This is not exactly what we want to say, because we want to write + * the data for durability but we may need it later nevertheless. + * It seems that Linux would free the memory *if* the data has + * already been written do disk, else the "dontneed" call is ignored. + * For FreeBSD this may have the desired effect of moving the + * data to the io layer, although the system does not seem to + * take into account the provided offset & size, so it is rather + * rough... + */ + rc = posix_fadvise(context->fd, context->offset, context->nbytes, + POSIX_FADV_DONTNEED); + +#endif + + if (rc < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not flush block " INT64_FORMAT + " on " INT64_FORMAT " blocks in file \"%s\": %m", + context->offset / BLCKSZ, + context->nbytes / BLCKSZ, + context->filename))); + } +} + +void +FileAsynchronousFlush(FileFlushContext * context, + int fd, off_t offset, off_t nbytes, char * filename) +{ + if (context->ncalls != 0 && context->fd == fd) + { + /* same file: merge current flush with previous ones */ + off_t new_offset = offset < context->offset? offset: context->offset; + + context->nbytes = + (context->offset + context->nbytes > offset + nbytes ? + context->offset + context->nbytes : offset + nbytes) - + new_offset; + context->offset = new_offset; + context->ncalls ++; + } + else + { + /* other file: do flush previous file & reset flush accumulator */ + PerformFileFlush(context); + + context->fd = fd; + context->ncalls = 1; + context->offset = offset; + context->nbytes = nbytes; + context->filename = filename; + } +} + +#endif /* HAVE_SYNC_FILE_RANGE || HAVE_POSIX_FADVISE */ + int -FileWrite(File file, char *buffer, int amount) +FileWrite(File file, char *buffer, int amount, bool flush_to_disk, + FileFlushContext * context) { int returnCode; @@ -1395,6 +1484,28 @@ retry: if (returnCode >= 0) { + +#if defined(HAVE_SYNC_FILE_RANGE) || defined(HAVE_POSIX_FADVISE) + + /* + * Calling "write" tells the OS that pg wants to write some page to disk, + * however when it is really done is chosen by the OS. + * Depending on other disk activities this may be delayed significantly, + * maybe up to an "fsync" call, which could induce an IO write surge. + * When checkpointing pg is doing its own throttling and the result + * should really be written to disk with high priority, so as to meet + * the completion target. + * This call hints that such write have a higher priority. + */ + if (flush_to_disk && returnCode == amount && errno == 0) + { + FileAsynchronousFlush(context, + VfdCache[file].fd, VfdCache[file].seekPos, + amount, VfdCache[file].fileName); + } + +#endif /* HAVE_SYNC_FILE_RANGE || HAVE_POSIX_FADVISE */ + VfdCache[file].seekPos += returnCode; /* maintain fileSize and temporary_files_size if it's a temp file */ diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 42a43bb..dbf057f 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -531,7 +531,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, errmsg("could not seek to block %u in file \"%s\": %m", blocknum, FilePathName(v->mdfd_vfd)))); - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) + if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, false, NULL)) != BLCKSZ) { if (nbytes < 0) ereport(ERROR, @@ -738,7 +738,8 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + char *buffer, bool skipFsync, bool flush_to_disk, + FileFlushContext * context) { off_t seekpos; int nbytes; @@ -767,7 +768,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, errmsg("could not seek to block %u in file \"%s\": %m", blocknum, FilePathName(v->mdfd_vfd)))); - nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ); + nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, flush_to_disk, context); TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 244b4ea..2db3cd3 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -52,7 +52,8 @@ typedef struct f_smgr void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, char *buffer, bool skipFsync, + bool flush_to_disk, FileFlushContext *context); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); @@ -643,10 +644,11 @@ smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + char *buffer, bool skipFsync, bool flush_to_disk, + FileFlushContext * context) { (*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum, - buffer, skipFsync); + buffer, skipFsync, flush_to_disk, context); } /* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 1cd2aa0..95deb71 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -158,6 +158,7 @@ static bool check_bonjour(bool *newval, void **extra, GucSource source); static bool check_ssl(bool *newval, void **extra, GucSource source); static bool check_stage_log_stats(bool *newval, void **extra, GucSource source); static bool check_log_stats(bool *newval, void **extra, GucSource source); +static bool check_flush_to_disk(bool *newval, void **extra, GucSource source); static bool check_canonical_path(char **newval, void **extra, GucSource source); static bool check_timezone_abbreviations(char **newval, void **extra, GucSource source); static void assign_timezone_abbreviations(const char *newval, void *extra); @@ -1024,6 +1025,17 @@ static struct config_bool ConfigureNamesBool[] = }, { + {"checkpoint_flush_to_disk", PGC_SIGHUP, WAL_CHECKPOINTS, + gettext_noop("Hint that checkpoint's writes are high priority."), + NULL + }, + &checkpoint_flush_to_disk, + /* see bufmgr.h: true on Linux, false otherwise */ + DEFAULT_CHECKPOINT_FLUSH_TO_DISK, + check_flush_to_disk, NULL, NULL + }, + + { {"log_connections", PGC_SU_BACKEND, LOGGING_WHAT, gettext_noop("Logs each successful connection."), NULL @@ -9805,6 +9817,21 @@ check_log_stats(bool *newval, void **extra, GucSource source) } static bool +check_flush_to_disk(bool *newval, void **extra, GucSource source) +{ +/* This test must be consistent with the one in FileWrite (storage/file/fd.c) + */ +#if ! (defined(HAVE_SYNC_FILE_RANGE) || defined(HAVE_POSIX_FADVISE)) + /* just warn if it has no effect */ + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Setting \"checkpoint_flush_to_disk\" has no effect " + "on this platform."))); +#endif /* ! (HAVE_SYNC_FILE_RANGE || HAVE_POSIX_FADVISE) */ + return true; +} + +static bool check_canonical_path(char **newval, void **extra, GucSource source) { /* diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 8020c1c..e4cf2a1 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -202,6 +202,8 @@ #min_wal_size = 80MB #checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 #checkpoint_sort = on # sort buffers on checkpoint +#checkpoint_flush_to_disk = ? # send buffers to disk on checkpoint + # default is on if Linux, off otherwise #checkpoint_warning = 30s # 0 disables # - Archiving - diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index a49c208..f9c8ca1 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -16,6 +16,7 @@ #define _BGWRITER_H #include "storage/block.h" +#include "storage/fd.h" #include "storage/relfilenode.h" @@ -29,7 +30,8 @@ extern void BackgroundWriterMain(void) pg_attribute_noreturn(); extern void CheckpointerMain(void) pg_attribute_noreturn(); extern void RequestCheckpoint(int flags); -extern void CheckpointWriteDelay(int flags, double progress); +extern void CheckpointWriteDelay(int flags, double progress, + FileFlushContext * context, int ctx_size); extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b56802b..cd9d130 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -54,6 +54,14 @@ extern int bgwriter_lru_maxpages; extern double bgwriter_lru_multiplier; extern bool track_io_timing; extern int target_prefetch_pages; + +#ifdef HAVE_SYNC_FILE_RANGE +#define DEFAULT_CHECKPOINT_FLUSH_TO_DISK true +#else +#define DEFAULT_CHECKPOINT_FLUSH_TO_DISK false +#endif /* HAVE_SYNC_FILE_RANGE */ + +extern bool checkpoint_flush_to_disk; extern bool checkpoint_sort; /* in buf_init.c */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 7eabe09..c7b2a6d 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -59,6 +59,24 @@ extern int max_files_per_process; */ extern int max_safe_fds; +/* + * FileFlushContext structure: + * + * This is used to accumulate several flush requests on a file + * into a larger flush request. + * - fd: file descriptor of the file + * - ncalls: number of flushes merged together + * - offset: starting offset (minimum of all offsets) + * - nbytes: size (minimum extent to cover all flushed data) + * - filename: filename of fd for error messages + */ +typedef struct FileFlushContext { + int fd; + int ncalls; + off_t offset; + off_t nbytes; + char * filename; +} FileFlushContext; /* * prototypes for functions in fd.c @@ -70,7 +88,12 @@ extern File OpenTemporaryFile(bool interXact); extern void FileClose(File file); extern int FilePrefetch(File file, off_t offset, int amount); extern int FileRead(File file, char *buffer, int amount); -extern int FileWrite(File file, char *buffer, int amount); +extern void ResetFileFlushContext(FileFlushContext * context); +extern void PerformFileFlush(FileFlushContext * context); +extern void FileAsynchronousFlush(FileFlushContext * context, + int fd, off_t offset, off_t nbytes, char * filename); +extern int FileWrite(File file, char *buffer, int amount, bool flush_to_disk, + FileFlushContext * context); extern int FileSync(File file); extern off_t FileSeek(File file, off_t offset, int whence); extern int FileTruncate(File file, off_t offset); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 69a624f..a46a70c 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -16,6 +16,7 @@ #include "fmgr.h" #include "storage/block.h" +#include "storage/fd.h" #include "storage/relfilenode.h" @@ -95,7 +96,8 @@ extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum, extern void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, char *buffer, bool skipFsync, + bool flush_to_disk, FileFlushContext * context); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); @@ -120,8 +122,9 @@ extern void mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); -extern void mdwrite(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); +extern void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync, bool flush_to_disk, + FileFlushContext * context); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks);