diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 9e7bcf5..2ef21fb 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2457,6 +2457,28 @@ include_dir 'conf.d' + + checkpoint_sort (bool) + + checkpoint_sort configuration parameter + + + + + Whether to sort buffers before writing them out to disk on checkpoint. + On HDD storage, this setting groups together + neighboring pages written to disk, thus improving performance by + reducing random write activity. + This sorting should have a limited performance effect on SSD backends, + as such storage has good random write performance, but it may + help with wear leveling and so may be worth keeping enabled anyway. + The default is on. + This parameter can only be set in the postgresql.conf + file or on the server command line. + + + + checkpoint_warning (integer) diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index e3941c9..f538698 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -546,6 +546,18 @@ + When hard-disk drives (HDD) are used for data storage, + checkpoint_sort allows pages to be sorted + so that neighboring pages on disk will be flushed together by + checkpoints, reducing the random write load and improving performance. + If solid-state drives (SSD) are used, sorting pages provides no direct benefit + as their random write I/O performance is good: the feature can then + be disabled by setting checkpoint_sort to off. + It is possible that sorting may help with SSD wear leveling, so it may + be worth keeping on that account. + + + The number of WAL segment files in pg_xlog directory depends on min_wal_size, max_wal_size and the amount of WAL generated in previous checkpoint cycles. When old log diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 152d4ed..7291447 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7999,11 +7999,13 @@ LogCheckpointEnd(bool restartpoint) sync_secs, total_secs, longest_secs, + sort_secs, average_secs; int write_usecs, sync_usecs, total_usecs, longest_usecs, + sort_usecs, average_usecs; uint64 average_sync_time; @@ -8034,6 +8036,10 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_end_t, &total_secs, &total_usecs); + TimestampDifference(CheckpointStats.ckpt_sort_t, + CheckpointStats.ckpt_sort_end_t, + &sort_secs, &sort_usecs); + /* * Timing values returned from CheckpointStats are in microseconds. * Convert to the second plus microsecond form that TimestampDifference @@ -8052,8 +8058,8 @@ LogCheckpointEnd(bool restartpoint) elog(LOG, "%s complete: wrote %d buffers (%.1f%%); " "%d transaction log file(s) added, %d removed, %d recycled; " - "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " - "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " + "sort=%ld.%03d s, write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " "distance=%d kB, estimate=%d kB", restartpoint ?
"restartpoint" : "checkpoint", CheckpointStats.ckpt_bufs_written, @@ -8061,6 +8067,7 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_segs_added, CheckpointStats.ckpt_segs_removed, CheckpointStats.ckpt_segs_recycled, + sort_secs, sort_usecs / 1000, write_secs, write_usecs / 1000, sync_secs, sync_usecs / 1000, total_secs, total_usecs / 1000, diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 3ae2848..3bd5eab 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -65,7 +65,8 @@ void InitBufferPool(void) { bool foundBufs, - foundDescs; + foundDescs, + foundCpid; /* Align descriptors to a cacheline boundary. */ BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN( @@ -77,10 +78,14 @@ InitBufferPool(void) ShmemInitStruct("Buffer Blocks", NBuffers * (Size) BLCKSZ, &foundBufs); - if (foundDescs || foundBufs) + CheckpointBufferIds = (CheckpointSortItem *) + ShmemInitStruct("Checkpoint BufferIds", + NBuffers * sizeof(CheckpointSortItem), &foundCpid); + + if (foundDescs || foundBufs || foundCpid) { - /* both should be present or neither */ - Assert(foundDescs && foundBufs); + /* all should be present or neither */ + Assert(foundDescs && foundBufs && foundCpid); /* note: this path is only taken in EXEC_BACKEND case */ } else @@ -144,5 +149,8 @@ BufferShmemSize(void) /* size of stuff controlled by freelist.c */ size = add_size(size, StrategyShmemSize()); + /* size of checkpoint sort array in bufmgr.c */ + size = add_size(size, mul_size(NBuffers, sizeof(CheckpointSortItem))); + return size; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 8c0358e..09af13b 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -75,12 +75,37 @@ typedef struct PrivateRefCountEntry /* 64 bytes, about the size of a cache line on common systems */ #define REFCOUNT_ARRAY_ENTRIES 8 +/* + * Status of buffers to checkpoint for a particular tablespace, + * used internally in BufferSync. + * - space: oid of the tablespace + * - num_to_write: number of checkpoint pages counted for this tablespace + * - num_written: number of pages actually written out + * - index: scanning position in CheckpointBufferIds for this tablespace + */ +typedef struct TableSpaceCheckpointStatus { + Oid space; + int num_to_write; + int num_written; + int index; +} TableSpaceCheckpointStatus; + +/* + * Entry structure for table space to count hashtable, + * used internally in BufferSync. 
+ */ +typedef struct TableSpaceCountEntry { + Oid space; + int count; +} TableSpaceCountEntry; + /* GUC variables */ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; double bgwriter_lru_multiplier = 2.0; bool track_io_timing = false; int effective_io_concurrency = 0; +bool checkpoint_sort = true; /* * How many buffers PrefetchBuffer callers should try to stay ahead of their @@ -98,6 +123,9 @@ static bool IsForInput; /* local state for LockBufferForCleanup */ static volatile BufferDesc *PinCountWaitBuf = NULL; +/* array of buffer IDs & sort criteria of all buffers to checkpoint */ +CheckpointSortItem *CheckpointBufferIds = NULL; + /* * Backend-Private refcount management: * @@ -1622,6 +1650,106 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner) } } +/* checkpoint buffer comparison */ +static int +bufcmp(const void *pa, const void *pb) +{ + CheckpointSortItem + *a = (CheckpointSortItem *) pa, + *b = (CheckpointSortItem *) pb; + + /* compare relation */ + if (a->relNode < b->relNode) + return -1; + else if (a->relNode > b->relNode) + return 1; + /* same relation, compare fork */ + else if (a->forkNum < b->forkNum) + return -1; + else if (a->forkNum > b->forkNum) + return 1; + /* same relation/fork, hence same segmented "file"; compare block numbers, + * which map to different segments depending on their value. + */ + else if (a->blockNum < b->blockNum) + return -1; + else /* should not be the same block anyway... */ + return 1; +} + +/* + * Return the next buffer to write, or -1. + * This function balances buffers over tablespaces; see the comment inside. + */ +static int +NextBufferToWrite(TableSpaceCheckpointStatus *spcStatus, int nb_spaces, + int *pspace, int num_to_write, int num_written) +{ + int space = *pspace, buf_id = -1, index; + + /* + * Select a tablespace depending on the current overall progress. + * + * The progress ratio of each unfinished tablespace is compared to + * the overall progress ratio to find one which is not in advance + * (i.e. tablespace ratio <= overall ratio, + * i.e. tablespace written/to_write <= overall written/to_write). + * + * Existence: such a tablespace is bound to exist, otherwise the overall + * progress ratio would be inconsistent: with positive buffers to write + * (t1 & t2) and already written buffers (w1 & w2), we have: + * + * If w1/t1 > (w1+w2)/(t1+t2) # one tablespace is in advance + * => w1t1+w1t2 > w1t1+w2t1 => w1t2 > w2t1 => w1t2+w2t2 > w2t1+w2t2 + * => (w1+w2) / (t1+t2) > w2 / t2 # the other one is late + * + * The round robin ensures that each space is given some attention + * until it catches up with the overall ratio, before going to the next. + * + * Precision: using int32 computations for comparing fractions + * (w1 / t1 > w / t <=> w1 t > w t1) seems a bad idea as the values + * can overflow 32-bit integers: the limit would be sqrt(2**31) ~ + * 46340 buffers, i.e. a 362 MB checkpoint. So ensure that 64-bit + * integers are used in the comparison. + */ + while ((int64) spcStatus[space].num_written * num_to_write > + (int64) num_written * spcStatus[space].num_to_write) + space = (space + 1) % nb_spaces; /* round robin */ + + /* + * Find a valid buffer in the selected tablespace, + * by continuing the tablespace-specific buffer scan + * where it left off. + */ + index = spcStatus[space].index; + + while (index < num_to_write && buf_id == -1) + { + volatile BufferDesc *bufHdr; + + buf_id = CheckpointBufferIds[index].buf_id; + bufHdr = GetBufferDescriptor(buf_id); + + /* + * Skip if in another tablespace or not in the checkpoint anymore.
+ * No lock is acquired, see comments below. + */ + if (spcStatus[space].space != bufHdr->tag.rnode.spcNode || + !(bufHdr->flags & BM_CHECKPOINT_NEEDED)) + { + index++; + buf_id = -1; + } + } + + /* update tablespace writing status, will start over at next index */ + spcStatus[space].index = index + 1; + + *pspace = space; + + return buf_id; +} + /* * BufferSync -- Write out all dirty buffers in the pool. * @@ -1635,11 +1763,13 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner) static void BufferSync(int flags) { - int buf_id; - int num_to_scan; + int buf_id = -1; int num_to_write; int num_written; int mask = BM_DIRTY; + HTAB *spcBuffers; + TableSpaceCheckpointStatus *spcStatus = NULL; + int nb_spaces, space; /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); @@ -1670,6 +1800,18 @@ BufferSync(int flags) * certainly need to be written for the next checkpoint attempt, too. */ num_to_write = 0; + + /* initialize oid -> int buffer count hash table */ + { + HASHCTL ctl; + + MemSet(&ctl, 0, sizeof(HASHCTL)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(TableSpaceCountEntry); + spcBuffers = hash_create("Number of buffers to write per tablespace", + 16, &ctl, HASH_ELEM | HASH_BLOBS); + } + for (buf_id = 0; buf_id < NBuffers; buf_id++) { volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id); @@ -1682,32 +1824,99 @@ BufferSync(int flags) if ((bufHdr->flags & mask) == mask) { + Oid spc; + TableSpaceCountEntry *entry; + bool found; + bufHdr->flags |= BM_CHECKPOINT_NEEDED; + CheckpointBufferIds[num_to_write].buf_id = buf_id; + CheckpointBufferIds[num_to_write].relNode = bufHdr->tag.rnode.relNode; + CheckpointBufferIds[num_to_write].forkNum = bufHdr->tag.forkNum; + CheckpointBufferIds[num_to_write].blockNum = bufHdr->tag.blockNum; num_to_write++; + + /* keep track of per-tablespace buffers */ + spc = bufHdr->tag.rnode.spcNode; + entry = (TableSpaceCountEntry *) + hash_search(spcBuffers, (void *) &spc, HASH_ENTER, &found); + + if (found) + entry->count++; + else + entry->count = 1; } UnlockBufHdr(bufHdr); } if (num_to_write == 0) + { + hash_destroy(spcBuffers); return; /* nothing to do */ + } TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write); + /* Build checkpoint tablespace buffer status */ + nb_spaces = hash_get_num_entries(spcBuffers); + spcStatus = (TableSpaceCheckpointStatus *) + palloc(sizeof(TableSpaceCheckpointStatus) * nb_spaces); + + { + int index = 0; + HASH_SEQ_STATUS hseq; + TableSpaceCountEntry *entry; + + hash_seq_init(&hseq, spcBuffers); + while ((entry = (TableSpaceCountEntry *) hash_seq_search(&hseq))) + { + Assert(index < nb_spaces); + spcStatus[index].space = entry->space; + spcStatus[index].num_to_write = entry->count; + spcStatus[index].num_written = 0; + /* should it be randomized? chosen with some criterion? */ + spcStatus[index].index = 0; + + index++; + } + } + + hash_destroy(spcBuffers); + spcBuffers = NULL; + + /* sort buffer IDs to help find sequential writes */ + CheckpointStats.ckpt_sort_t = GetCurrentTimestamp(); + + if (checkpoint_sort) + { + qsort(CheckpointBufferIds, num_to_write, sizeof(CheckpointSortItem), + bufcmp); + } + + CheckpointStats.ckpt_sort_end_t = GetCurrentTimestamp(); + /* - * Loop over all buffers again, and write the ones (still) marked with - * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point - * since we might as well dump soon-to-be-recycled buffers first.
+ * Loop over buffers to write through CheckpointBufferIds, + * and write the ones (still) marked with BM_CHECKPOINT_NEEDED, + * using a round robin over tablespaces to balance writes, + * so that buffer writes move forward roughly proportionally for each + * tablespace. * - * Note that we don't read the buffer alloc count here --- that should be - * left untouched till the next BgBufferSync() call. + * Termination: whenever a tablespace is selected by the inner while loop + * (see the argument there), its index is incremented and will eventually + * reach num_to_write; the tablespace's scan is then marked as done and + * the number of (active) spaces is decremented, which will thus reach 0. */ - buf_id = StrategySyncStart(NULL, NULL); - num_to_scan = NBuffers; + space = 0; num_written = 0; - while (num_to_scan-- > 0) + + while (nb_spaces != 0) { - volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id); + volatile BufferDesc *bufHdr = NULL; + buf_id = NextBufferToWrite(spcStatus, nb_spaces, &space, + num_to_write, num_written); + if (buf_id != -1) + bufHdr = GetBufferDescriptor(buf_id); /* * We don't need to acquire the lock here, because we're only looking @@ -1721,39 +1930,46 @@ BufferSync(int flags) * write the buffer though we didn't need to. It doesn't seem worth * guarding against this, though. */ - if (bufHdr->flags & BM_CHECKPOINT_NEEDED) + if (bufHdr != NULL && (bufHdr->flags & BM_CHECKPOINT_NEEDED)) { if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN) { TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); BgWriterStats.m_buf_written_checkpoints++; + spcStatus[space].num_written++; num_written++; /* - * We know there are at most num_to_write buffers with - * BM_CHECKPOINT_NEEDED set; so we can stop scanning if - * num_written reaches num_to_write. - * - * Note that num_written doesn't include buffers written by - * other backends, or by the bgwriter cleaning scan. That - * means that the estimate of how much progress we've made is - * conservative, and also that this test will often fail to - * trigger. But it seems worth making anyway. - */ - if (num_written >= num_to_write) - break; - - /* * Sleep to throttle our I/O rate. */ CheckpointWriteDelay(flags, (double) num_written / num_to_write); } } - if (++buf_id >= NBuffers) - buf_id = 0; + /* + * Detect checkpoint end for a tablespace: either its scan is done + * or all of its buffers have been written out. If so, the last + * active tablespace's status is moved into the current slot and the + * next round will start on it; if the current slot was the last one, + * start over from the first slot. + * + * Note: a swap could be done instead in order to keep the + * information about the closed tablespace, but it is currently + * not used afterwards. + */ + if (spcStatus[space].index >= num_to_write || + spcStatus[space].num_written >= spcStatus[space].num_to_write) + { + nb_spaces--; + if (space != nb_spaces) + spcStatus[space] = spcStatus[nb_spaces]; + else + space = 0; + } } + pfree(spcStatus); + spcStatus = NULL; + /* * Update checkpoint statistics. As noted above, this doesn't include * buffers written by other backends or bgwriter scan.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8ebf424..1cd2aa0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1012,6 +1012,17 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + + { + {"checkpoint_sort", PGC_SIGHUP, WAL_CHECKPOINTS, + gettext_noop("Whether disk-page buffers are sorted on checkpoints."), + NULL + }, + &checkpoint_sort, + true, + NULL, NULL, NULL + }, + { {"log_connections", PGC_SU_BACKEND, LOGGING_WHAT, gettext_noop("Logs each successful connection."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 8c65287..8020c1c 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -201,6 +201,7 @@ #max_wal_size = 1GB #min_wal_size = 80MB #checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 +#checkpoint_sort = on # sort buffers on checkpoint #checkpoint_warning = 30s # 0 disables # - Archiving - diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 790ca66..11815a8 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -186,6 +186,8 @@ extern bool XLOG_DEBUG; typedef struct CheckpointStatsData { TimestampTz ckpt_start_t; /* start of checkpoint */ + TimestampTz ckpt_sort_t; /* start of buffer sorting */ + TimestampTz ckpt_sort_end_t; /* end of buffer sorting */ TimestampTz ckpt_write_t; /* start of flushing buffers */ TimestampTz ckpt_sync_t; /* start of fsyncs */ TimestampTz ckpt_sync_end_t; /* end of fsyncs */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 521ee1c..32f2006 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -210,6 +210,23 @@ extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; /* in localbuf.c */ extern BufferDesc *LocalBufferDescriptors; +/* in bufmgr.c */ + +/* + * Structure used to sort checkpoint buffers by file. + * + * This structure is allocated per buffer in shared memory, so it should be + * kept as small as possible. Maybe the sort criterion could be compacted + * to reduce the memory requirement and allow faster comparisons? + */ +typedef struct CheckpointSortItem { + int buf_id; + Oid relNode; + ForkNumber forkNum; /* hm... enum with only 4 values */ + BlockNumber blockNum; +} CheckpointSortItem; + +extern CheckpointSortItem *CheckpointBufferIds; /* * Internal routines: only called by bufmgr diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 0f59201..b56802b 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -54,6 +54,7 @@ extern int bgwriter_lru_maxpages; extern double bgwriter_lru_multiplier; extern bool track_io_timing; extern int target_prefetch_pages; +extern bool checkpoint_sort; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks;
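
For reference, here is a small standalone sketch (not part of the patch) of the two mechanisms BufferSync() gains above: sorting the checkpoint buffer array by (relNode, forkNum, blockNum) as bufcmp() does, and picking the next tablespace with the overflow-safe 64-bit cross-multiplication used by NextBufferToWrite(). The SortItem/SpaceStatus types, the item_cmp()/next_space() helpers and all the sample values are invented for illustration only and are not the PostgreSQL definitions; the sketch builds with any C99 compiler.

/*
 * Standalone illustration (not part of the patch): the sort order used by
 * bufcmp() and the tablespace balancing used by NextBufferToWrite(), with
 * simplified stand-in types.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct SortItem			/* simplified CheckpointSortItem */
{
	int			buf_id;
	uint32_t	relNode;		/* stand-in for Oid */
	int			forkNum;		/* stand-in for ForkNumber */
	uint32_t	blockNum;		/* stand-in for BlockNumber */
} SortItem;

/* same ordering as the patch: relation, then fork, then block number */
static int
item_cmp(const void *pa, const void *pb)
{
	const SortItem *a = pa;
	const SortItem *b = pb;

	if (a->relNode != b->relNode)
		return (a->relNode < b->relNode) ? -1 : 1;
	if (a->forkNum != b->forkNum)
		return (a->forkNum < b->forkNum) ? -1 : 1;
	if (a->blockNum != b->blockNum)
		return (a->blockNum < b->blockNum) ? -1 : 1;
	return 0;					/* cannot happen for real buffers */
}

typedef struct SpaceStatus		/* simplified TableSpaceCheckpointStatus */
{
	int			num_to_write;
	int			num_written;
} SpaceStatus;

/*
 * Pick a tablespace whose progress ratio is at or behind the overall ratio.
 * Fractions are compared by cross-multiplication in 64 bits, as in the
 * patch, so that large buffer counts cannot overflow the comparison.
 */
static int
next_space(SpaceStatus *spc, int nb_spaces, int space,
		   int num_to_write, int num_written)
{
	while ((int64_t) spc[space].num_written * num_to_write >
		   (int64_t) num_written * spc[space].num_to_write)
		space = (space + 1) % nb_spaces;	/* round robin */
	return space;
}

int
main(void)
{
	SortItem	items[] = {
		{0, 200, 0, 7}, {1, 100, 0, 9}, {2, 100, 0, 8}, {3, 100, 1, 0}
	};
	SpaceStatus spc[2] = {{30, 0}, {10, 0}};	/* 30 + 10 buffers to write */
	int			num_to_write = 40,
				num_written = 0,
				space = 0,
				i;

	/* sorting groups buffers by relation, fork and block */
	qsort(items, 4, sizeof(SortItem), item_cmp);
	for (i = 0; i < 4; i++)
		printf("buf %d: rel %u fork %d block %u\n", items[i].buf_id,
			   (unsigned) items[i].relNode, items[i].forkNum,
			   (unsigned) items[i].blockNum);

	/* simulate the write loop: each tablespace advances proportionally */
	while (num_written < num_to_write)
	{
		space = next_space(spc, 2, space, num_to_write, num_written);
		spc[space].num_written++;
		num_written++;
		if (num_written == 20)	/* halfway point */
			printf("halfway: space 0 %d/30, space 1 %d/10\n",
				   spc[0].num_written, spc[1].num_written);
	}
	return 0;
}

With 30 buffers in one tablespace and 10 in the other, the halfway line prints 15/30 and 5/10: the round robin keeps both tablespaces at the same relative progress, which is the balancing property the w1t2 > w2t1 argument in the NextBufferToWrite() comment guarantees.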