commit 689fc9b602c1379dbb58a785bdda4dc1e2fdc2bb Author: mithun Date: Tue Nov 29 10:43:01 2016 +0530 pg_autoprewarm_patch_02 diff --git a/contrib/pg_prewarm/Makefile b/contrib/pg_prewarm/Makefile index 7ad941e..8ec0411 100644 --- a/contrib/pg_prewarm/Makefile +++ b/contrib/pg_prewarm/Makefile @@ -1,7 +1,7 @@ # contrib/pg_prewarm/Makefile -MODULE_big = pg_prewarm -OBJS = pg_prewarm.o $(WIN32RES) +MODULES = pg_prewarm pg_autoprewarm +OBJS = pg_prewarm.o pg_autoprewarm.o $(WIN32RES) EXTENSION = pg_prewarm DATA = pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql diff --git a/contrib/pg_prewarm/README b/contrib/pg_prewarm/README new file mode 100644 index 0000000..047f436 --- /dev/null +++ b/contrib/pg_prewarm/README @@ -0,0 +1,29 @@ +# pg_autoprewarm. + +This a PostgreSQL contrib module which automatically dump all of the blocknums +present in buffer pool at a regular interval and at the time of server shutdown +(smart and fast mode only) and load these blocks when server restarts. + +Design: +------ +We have created a BG worker Auto Pre-warmer which during shutdown dumps all the +blocknum in buffer pool after sorting same. +Format of each entry is . +Auto Pre-warmer is started as soon as the postmaster is started we do not wait +for recovery to finish and database to reach a consistent state. If there is a +"dump_file" to load we start loading each block entry to buffer pool until +there is a free buffer. This way we do not replace any new blocks which was +loaded either by recovery process or querying clients. + +HOW TO USE: +----------- +Build and add the pg_autoprewarm to shared_preload_libraries. Auto Pre-warmer +process automatically do dumping of buffer pool block info and load them when +restarted. +Set pg_autoprewarm.buff_dump_interval in seconds to specify minimum interval +between two dumps. If pg_autoprewarm.buff_dump_interval is set to zero then +dumping based on timer is disabled. We only dump while server shutdown. + +TO DO: +------ +Add functionality to dump based on timer at regular interval. diff --git a/contrib/pg_prewarm/pg_autoprewarm.c b/contrib/pg_prewarm/pg_autoprewarm.c new file mode 100644 index 0000000..6796306 --- /dev/null +++ b/contrib/pg_prewarm/pg_autoprewarm.c @@ -0,0 +1,493 @@ +/*------------------------------------------------------------------------- + * + * pg_autoprewarm.c + * Automatically dumps and load buffer pool. + * + * contrib/pg_autoprewarm/pg_autoprewarm.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "miscadmin.h" +#include "postmaster/bgworker.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/shmem.h" +#include "pgstat.h" +#include "storage/buf_internals.h" +#include "storage/smgr.h" +#include "utils/memutils.h" +#include "utils/resowner.h" +#include "catalog/pg_class.h" +#include +#include "utils/guc.h" + +PG_MODULE_MAGIC; + +static void AutoPreWarmerMain (Datum main_arg); +static bool +load_block(RelFileNode rnode, char reltype, ForkNumber forkNum, + BlockNumber blockNum); + +/* Primary functions */ +void _PG_init(void); + +/* Secondary/supporting functions */ +static void sigtermHandler(SIGNAL_ARGS); + +/* flags set by signal handlers */ +static volatile sig_atomic_t got_sigterm = false; + +/* + * Signal handler for SIGTERM + * set our latch to wake it up. + */ +static void +sigtermHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + got_sigterm = true; + + if (MyProc) + SetLatch(&MyProc->procLatch); + + errno = save_errno; +} + +/* Meta-data of each persistent page buffer which is dumped and used to load. */ +typedef struct BlockInfoRecord +{ + Oid database; /* datbase */ + Oid spcNode; /* tablespace */ + Oid filenode; /* relation */ + ForkNumber forknum; /* fork number */ + BlockNumber blocknum; /* block number */ +}BlockInfoRecord; + +/* Try loading only once during startup. If any error do not retry. */ +static bool avoid_loading = false; + +/* + * And avoid dumping if we receive sigterm while loading. Also do not re-try if + * dump has failed previously. + */ +static bool avoid_dumping = false; + +int buff_dump_interval = 0; + +/* compare member elements to check if they are not equal. */ +#define cmp_member_elem(fld) \ +do { \ + if (a->fld < b->fld) \ + return -1; \ + else if (a->fld > b->fld) \ + return 1; \ +} while(0); + +/* + * sort_cmp_func - compare function used while qsorting BlockInfoRecord objects. + */ +static int +sort_cmp_func(const void *p, const void *q) +{ + BlockInfoRecord *a = (BlockInfoRecord *) p; + BlockInfoRecord *b = (BlockInfoRecord *) q; + + cmp_member_elem(database); + cmp_member_elem(spcNode); + cmp_member_elem(filenode); + cmp_member_elem(forknum); + cmp_member_elem(blocknum); + return 0; +} + +#define DUMP_FILENAME "autoprewarm" + +/* + * load_block - Load a given block. + */ +bool +load_block(RelFileNode rnode, char reltype, ForkNumber forkNum, + BlockNumber blockNum) +{ + Buffer buffer; + + /* Load the page only if there exist a free buffer. We do not want to + * replace an existing buffer. */ + if (have_free_buffer()) + { + SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + + /* + * Check if fork exists first otherwise we will not be able to use one + * free buffer for each non existing block. + */ + if (smgrexists(smgr, forkNum)) + { + buffer = ReadBufferForPrewarm(smgr, reltype, + forkNum, blockNum, + RBM_NORMAL, NULL); + if (!BufferIsValid(buffer)) + elog(LOG, "\n Skipped the buff page. \n"); + else + ReleaseBuffer(buffer); + } + + return true; + } + + return false; +} + +/* + * load_now - Main routine which reads from dump file and load each block. + * We try to load each blocknum read from DUMP_FILENAME until we have + * any free buffer left or SIGTERM is received. If we fail to load a block we + * ignore the ERROR and try to load next blocknum. This is because there is + * possibility that corresponding blocknum might have been deleted. + */ +static void load_now(void) +{ + static char dump_file_path[MAXPGPATH]; + FILE *file = NULL; + uint32 i, num_buffers = 0; + + avoid_loading = true; + + /* Check if file exists and open file in read mode. */ + snprintf(dump_file_path, sizeof(dump_file_path), "%s.save", DUMP_FILENAME); + file = fopen(dump_file_path, PG_BINARY_R); + + if (!file) + return; /* No file to load. */ + + if (fscanf(file,"<<%u>>", &num_buffers) != 1) + { + fclose(file); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("Error reading num of elements in \"%s\" for autoprewarm : %m", dump_file_path))); + } + + elog(LOG, "Num buffers : %d \n", num_buffers); + + for (i = 0; i < num_buffers; i++) + { + RelFileNode rnode; + uint32 forknum; + BlockNumber blocknum; + bool have_free_buf = true; + + if (got_sigterm) + { + /* + * Received shutdown while we were still loading the buffers. + * No need to dump at this stage. + */ + avoid_dumping = true; + break; + } + + if(!have_free_buf) + break; + + /* Get next block. */ + if (5 != fscanf(file, "%u,%u,%u,%u,%u\n", &rnode.dbNode, &rnode.spcNode, + &rnode.relNode, &forknum, &blocknum)) + break; /* No more valid entry hence stop processing. */ + + PG_TRY(); + { + have_free_buf = load_block(rnode, RELPERSISTENCE_PERMANENT, + (ForkNumber)forknum, blocknum); + } + PG_CATCH(); + { + /* Any error handle it and then try to load next buffer. */ + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + LWLockReleaseAll(); + AbortBufferIO(); + UnlockBuffers(); + + /* buffer pins are released here. */ + ResourceOwnerRelease(CurrentResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, true); + FlushErrorState(); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + } + PG_END_TRY(); + } + + fclose(file); + + elog(LOG, "loaded"); + return; +} + + +/* + * dump_now - Main routine which goes through each buffer header and dump + * their metadata in the format. + * . We Sort these data + * and then dump them. Sorting is necessary as it facilitates sequential read + * during load. Unlike load if we encounter any error we abort the dump. + */ +static void dump_now(void) +{ + static char dump_file_path[MAXPGPATH], + transient_dump_file_path[MAXPGPATH]; + uint32 i; + int ret; + uint32 num_buffers; + BlockInfoRecord *block_info_array; + BufferDesc *bufHdr; + FILE *file = NULL; + + if (avoid_dumping) + return; + + avoid_dumping = true; + block_info_array = (BlockInfoRecord *) palloc(sizeof(BlockInfoRecord) * NBuffers); + + for (num_buffers = 0, i = 0; i < NBuffers; i++) + { + uint32 buf_state; + + bufHdr = GetBufferDescriptor(i); + + /* Lock each buffer header before inspecting. */ + buf_state = LockBufHdr(bufHdr); + + /* Only valid and persistant page buffers are dumped. */ + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID) && + (buf_state & BM_PERMANENT)) + { + block_info_array[num_buffers].database = bufHdr->tag.rnode.dbNode; + block_info_array[num_buffers].spcNode = bufHdr->tag.rnode.spcNode; + block_info_array[num_buffers].filenode = bufHdr->tag.rnode.relNode; + block_info_array[num_buffers].forknum = bufHdr->tag.forkNum; + block_info_array[num_buffers].blocknum = bufHdr->tag.blockNum; + ++num_buffers; + } + + UnlockBufHdr(bufHdr, buf_state); + } + + /* Sorting now only to avoid sorting while loading. */ + pg_qsort(block_info_array, num_buffers, sizeof(BlockInfoRecord), sort_cmp_func); + + snprintf(transient_dump_file_path, sizeof(dump_file_path), + "%s.save.tmp", DUMP_FILENAME); + file = fopen(transient_dump_file_path, "w"); + if (file == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open \"%s\": %m", dump_file_path))); + + snprintf(dump_file_path, sizeof(dump_file_path), + "%s.save", DUMP_FILENAME); + + /* Write num_buffers first and then BlockMetaInfoRecords. */ + ret = fprintf(file, "<<%u>>\n", num_buffers); + if (ret < 0) + { + fclose(file); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("error writing to \"%s\" : %m", dump_file_path))); + } + + for (i = 0; i < num_buffers; i++) + { + ret = fprintf(file, "%u,%u,%u,%u,%u\n", + block_info_array[i].database, + block_info_array[i].spcNode, + block_info_array[i].filenode, + (uint32)block_info_array[i].forknum, + block_info_array[i].blocknum); + if (ret < 0) + { + fclose(file); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("error writing to \"%s\" : %m", dump_file_path))); + } + } + + pfree(block_info_array); + + /* + * Rename transient_dump_file_path to dump_file_path to make things + * permanent. + */ + ret = fclose(file); + if (ret != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("error closing \"%s\" : %m", transient_dump_file_path))); + + ret = unlink(dump_file_path); + if (ret != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unlink \"%s\" failed : %m", dump_file_path))); + + ret = rename(transient_dump_file_path, dump_file_path); + if (ret != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("Failed to rename \"%s\" to \"%s\" : %m", + transient_dump_file_path, dump_file_path))); + + if (!got_sigterm) + avoid_dumping = false; + + elog(LOG, "Buffer Dump: saved metadata of %d blocks", num_buffers); +} + +/* Extension's entry point. */ +void _PG_init(void) +{ + BackgroundWorker auto_prewarm; + + /* Define custom GUC variables. */ + DefineCustomIntVariable("pg_autoprewarm.buff_dump_interval", + "Sets the maximum time between two buffer pool dumps", + "If set to Zero, timer based dumping is diabled.", + &buff_dump_interval, + 0, + 0, INT_MAX / 1000, + PGC_POSTMASTER, + GUC_UNIT_S, + NULL, + NULL, + NULL); + + /* Register AutoPreWarmer. */ + MemSet(&auto_prewarm, 0, sizeof(auto_prewarm)); + auto_prewarm.bgw_main_arg = Int32GetDatum(0); + auto_prewarm.bgw_flags = BGWORKER_SHMEM_ACCESS; + + /* Register the Auto Pre-warmer background worker */ + auto_prewarm.bgw_start_time = BgWorkerStart_PostmasterStart; + auto_prewarm.bgw_restart_time = 0; /* Keep the Auto Pre-warmer running */ + auto_prewarm.bgw_main = AutoPreWarmerMain; + snprintf(auto_prewarm.bgw_name, BGW_MAXLEN, "Auto Pre-warmer"); + RegisterBackgroundWorker(&auto_prewarm); +} + +/* + * AutoPreWarmerMain -- Main entry point of Auto-prewarmer process. + * This is invoked as a background worker. + */ +static void AutoPreWarmerMain (Datum main_arg) +{ + MemoryContext autoprewarmer_context; + sigjmp_buf local_sigjmp_buf; + + /* Establish signal handlers before unblocking signals. */ + pqsignal(SIGTERM, sigtermHandler); + + /* + * Create a resource owner to keep track of our resources. + */ + CurrentResourceOwner = ResourceOwnerCreate(NULL, "AutoPreWarmer"); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. + */ + autoprewarmer_context = AllocSetContextCreate(TopMemoryContext, + "AutoPreWarmer", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + MemoryContextSwitchTo(autoprewarmer_context); + + + /* + * If an exception is encountered, processing resumes here. + * See notes in postgres.c about the design of this coding. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + LWLockReleaseAll(); + AbortBufferIO(); + UnlockBuffers(); + + /* buffer pins are released here. */ + ResourceOwnerRelease(CurrentResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, true); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + + MemoryContextSwitchTo(autoprewarmer_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(autoprewarmer_context); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* Close all open files after any error. */ + smgrcloseall(); + + /* Error while dumping is treated as fatal hence do proc_exit */ + if (avoid_dumping) + proc_exit(0); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* We're now ready to receive signals */ + BackgroundWorkerUnblockSignals(); + if (!avoid_loading) + load_now(); + while (!got_sigterm) + { + int rc; + int timeout = 10; + + if (buff_dump_interval) + timeout = buff_dump_interval; + + ResetLatch(&MyProc->procLatch); + rc = WaitLatch(&MyProc->procLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + timeout * 1000, PG_WAIT_EXTENSION); + + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + /* If buff_dump_interval is set then dump the buff pool. */ + if ((rc & WL_TIMEOUT) && buff_dump_interval) + dump_now(); + } + + /* One last buffer pool dump while postmaster shutdown. */ + dump_now(); +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 58b0a97..acce5d2 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -693,6 +693,20 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, mode, strategy, &hit); } +/* + * ReadBufferForPrewarm -- This new interface is for pg_autoprewarm. + */ +Buffer +ReadBufferForPrewarm(SMgrRelation smgr, char relpersistence, + ForkNumber forkNum, BlockNumber blockNum, + ReadBufferMode mode, BufferAccessStrategy strategy) +{ + bool hit; + + return ReadBuffer_common(smgr, relpersistence, forkNum, blockNum, + mode, strategy, &hit); +} + /* * ReadBuffer_common -- common logic for all ReadBuffer variants diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 88b90dc..8d267fa 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -169,6 +169,19 @@ ClockSweepTick(void) } /* + * have_free_buffer -- This function check whether there is a free buffer in + * buffer pool. Used by pg_autoprewarm module. + */ +bool +have_free_buffer() +{ + if (StrategyControl->firstFreeBuffer >= 0) + return true; + else + return false; +} + +/* * StrategyGetBuffer * * Called by the bufmgr to get the next candidate buffer to use in diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index c7da9f6..59a5277 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -317,6 +317,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); +extern bool have_free_buffer(void); /* buf_table.c */ extern Size BufTableShmemSize(int size); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 821bee5..495fa8e 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -16,6 +16,7 @@ #include "storage/block.h" #include "storage/buf.h" +#include "storage/smgr.h" #include "storage/bufpage.h" #include "storage/relfilenode.h" #include "utils/relcache.h" @@ -183,6 +184,10 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy); +extern Buffer ReadBufferForPrewarm(SMgrRelation smgr, char relpersistence, + ForkNumber forkNum, BlockNumber blockNum, + ReadBufferMode mode, + BufferAccessStrategy strategy); extern void ReleaseBuffer(Buffer buffer); extern void UnlockReleaseBuffer(Buffer buffer); extern void MarkBufferDirty(Buffer buffer);