diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index dcfef36591..8a9f14d4b2 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -54,6 +54,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/encryption.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/large_object.h" @@ -77,6 +78,7 @@ #include "pg_trace.h" extern uint32 bootstrap_data_checksum_version; +extern char *bootstrap_encryption_sample; /* File path names (all relative to $PGDATA) */ #define RECOVERY_COMMAND_FILE "recovery.conf" @@ -862,6 +864,7 @@ static void LocalSetXLogInsertAllowed(void); static void CreateEndOfRecoveryRecord(void); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); +static void XLogWritePages(char *from, int npages); static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic); @@ -1233,7 +1236,7 @@ ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, uint64 endbytepos; uint64 prevbytepos; - size = MAXALIGN(size); + size = XLOG_REC_ALIGN(size); /* All (non xlog-switch) records should contain data. 
*/ Assert(size > SizeOfXLogRecord); @@ -1287,7 +1290,7 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) uint64 startbytepos; uint64 endbytepos; uint64 prevbytepos; - uint32 size = MAXALIGN(SizeOfXLogRecord); + uint32 size = XLOG_REC_ALIGN(SizeOfXLogRecord); XLogRecPtr ptr; uint32 segleft; @@ -1583,7 +1586,7 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, else { /* Align the end position, so that the next record starts aligned */ - CurrPos = MAXALIGN64(CurrPos); + CurrPos = XLOG_REC_ALIGN(CurrPos); } if (CurrPos != EndPos) @@ -2467,8 +2470,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) { char *from; Size nbytes; - Size nleft; - int written; /* Need to seek in the file? */ if (openLogOff != startoffset) @@ -2485,30 +2486,40 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) /* OK to write the page(s) */ from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; nbytes = npages * (Size) XLOG_BLCKSZ; - nleft = nbytes; - do + if (data_encrypted) { - errno = 0; - pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); - written = write(openLogFile, from, nleft); - pgstat_report_wait_end(); - if (written <= 0) +#ifdef USE_ENCRYPTION + int i; + + /* + * XXX: use larger encryption buffer to enable larger writes + * and reduce number of syscalls? 
+ */ + for (i = 0; i < npages; i++) { - if (errno == EINTR) - continue; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log file %s " - "at offset %u, length %zu: %m", - XLogFileNameP(ThisTimeLineID, openLogSegNo), - openLogOff, nbytes))); - } - nleft -= written; - from += written; - } while (nleft > 0); + char buf[XLOG_BLCKSZ]; + char tweak[TWEAK_SIZE]; + + XLogEncryptionTweak(tweak, openLogSegNo, openLogOff); + encrypt_block(from, buf, XLOG_BLCKSZ, tweak); + pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); + XLogWritePages(buf, 1); + pgstat_report_wait_end(); + + from += XLOG_BLCKSZ; + openLogOff += XLOG_BLCKSZ; + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + else + { + XLogWritePages(from, npages); + openLogOff += nbytes; + } /* Update state for write */ - openLogOff += nbytes; npages = 0; /* @@ -2624,6 +2635,32 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) } } +static void +XLogWritePages(char *from, int npages) +{ + Size nleft = npages * (Size) XLOG_BLCKSZ; + int written; /* must be signed: write() returns -1 on failure */ + + do + { + errno = 0; + written = write(openLogFile, from, nleft); + if (written <= 0) + { + if (errno == EINTR) + continue; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log file %s " + "at offset %u, length %zu: %m", + XLogFileNameP(ThisTimeLineID, openLogSegNo), + openLogOff, npages * (Size) XLOG_BLCKSZ))); + } + nleft -= written; + from += written; + } while (nleft > 0); +} + /* * Record the LSN for an asynchronous transaction commit/abort * and nudge the WALWriter if there is work for it to do. @@ -4681,6 +4718,31 @@ ReadControlFile(void) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"."))); + /* + * Initialize encryption, but not if the current backend has already done + * that. 
+ */ + if (ControlFile->data_encrypted && !data_encrypted) + { + char sample[ENCRYPTION_SAMPLE_SIZE]; + + setup_encryption(false); + + memset(sample, 0, ENCRYPTION_SAMPLE_SIZE); + sample_encryption(sample); + if (memcmp(ControlFile->encryption_verification, sample, ENCRYPTION_SAMPLE_SIZE)) + ereport(FATAL, + (errmsg("invalid encryption key"), + errdetail("The passed encryption key does not match" + " database encryption key."))); + } + SetConfigOption("data_encryption", DataEncryptionEnabled() ? "yes" : "no", + PGC_INTERNAL, PGC_S_OVERRIDE); + + /* + * This calculation relies on data_encryption (in particular the header + * sizes do), so we could not do it earlier. + */ UsableBytesInSegment = (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) - (SizeOfXLogLongPHD - SizeOfXLogShortPHD); @@ -4768,6 +4830,16 @@ DataChecksumsEnabled(void) } /* + * Is this cluster encrypted? + */ +bool +DataEncryptionEnabled(void) +{ + Assert(ControlFile != NULL); + return (ControlFile->data_encrypted); +} + +/* * Returns a fake LSN for unlogged relations. 
* * Each call generates an LSN that is greater than any previous value @@ -5146,6 +5218,18 @@ BootStrapXLOG(void) use_existent = false; openLogFile = XLogFileInit(1, &use_existent, false); + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + XLogEncryptionTweak(tweak, 1, 0); + encrypt_block((char *) page, (char *) page, XLOG_BLCKSZ, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + /* Write the first page with the initial record */ errno = 0; pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE); @@ -5195,6 +5279,22 @@ BootStrapXLOG(void) ControlFile->wal_log_hints = wal_log_hints; ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = bootstrap_data_checksum_version; + ControlFile->data_encrypted = data_encrypted; + + if (data_encrypted) + { + char *sample; + + sample = palloc0(ENCRYPTION_SAMPLE_SIZE); + sample_encryption(sample); + + memcpy(ControlFile->encryption_verification, sample, + ENCRYPTION_SAMPLE_SIZE); + pfree(sample); + } + else + memset(ControlFile->encryption_verification, 0, + ENCRYPTION_SAMPLE_SIZE); /* some additional ControlFile fields are set in WriteControlFile() */ @@ -11710,6 +11810,18 @@ retry: Assert(targetPageOff == readOff); Assert(reqLen <= readLen); + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + XLogEncryptionTweak(tweak, readSegNo, readOff); + decrypt_block(readBuf, readBuf, XLOG_BLCKSZ, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + *readTLI = curFileTLI; /* diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5bea073a2b..148d436632 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -28,6 +28,7 @@ #include "miscadmin.h" #include "replication/origin.h" #include "storage/bufmgr.h" +#include "storage/encryption.h" #include "storage/proc.h" #include "utils/memutils.h" 
#include "pg_trace.h" diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index dd96cef8f0..be7a420b96 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -24,6 +24,7 @@ #include "catalog/pg_control.h" #include "common/pg_lzcompress.h" #include "replication/origin.h" +#include "storage/encryption.h" #ifndef FRONTEND #include "utils/memutils.h" @@ -317,9 +318,9 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) * * NB: Even though we use an XLogRecord pointer here, the whole record * header might not fit on this page. xl_tot_len is the first field of the - * struct, so it must be on this page (the records are MAXALIGNed), but we - * cannot access any other fields until we've verified that we got the - * whole header. + * struct, so it must be on this page (the records are XLOG_REC_ALIGNed) + * but we cannot access any other fields until we've verified that we got + * the whole header. 
*/ record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); total_len = record->xl_tot_len; @@ -461,8 +462,8 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); state->ReadRecPtr = RecPtr; - state->EndRecPtr = targetPagePtr + pageHeaderSize - + MAXALIGN(pageHeader->xlp_rem_len); + state->EndRecPtr = targetPagePtr + pageHeaderSize; + state->EndRecPtr += XLOG_REC_ALIGN(pageHeader->xlp_rem_len); } else { @@ -476,7 +477,7 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) if (!ValidXLogRecord(state, record, RecPtr)) goto err; - state->EndRecPtr = RecPtr + MAXALIGN(total_len); + state->EndRecPtr = RecPtr + XLOG_REC_ALIGN(total_len); state->ReadRecPtr = RecPtr; memcpy(state->readRecordBuf, record, total_len); @@ -532,6 +533,15 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) Assert((pageptr % XLOG_BLCKSZ) == 0); + /* + * Make sure that we only return data that can be decrypted in a sensible + * way. If the valid data ended in the middle of encryption block, then + * decryption of that last block would turn the contained data into + * garbage. 
+ */ + if (data_encrypted) + reqLen = XLOG_REC_ALIGN(reqLen); + XLByteToSeg(pageptr, targetSegNo, state->wal_segment_size); targetPageOff = XLogSegmentOffset(pageptr, state->wal_segment_size); @@ -954,9 +964,10 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) * next page header will contain the remaining length of the * continuation data * - * Note that record headers are MAXALIGN'ed + * Note that record headers are XLOG_REC_ALIGN'ed */ - if (MAXALIGN(header->xlp_rem_len) > (XLOG_BLCKSZ - pageHeaderSize)) + if (XLOG_REC_ALIGN(header->xlp_rem_len) > + (XLOG_BLCKSZ - pageHeaderSize)) tmpRecPtr = targetPagePtr + XLOG_BLCKSZ; else { @@ -965,7 +976,7 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) * tmpRecPtr to point to the first valid record */ tmpRecPtr = targetPagePtr + pageHeaderSize - + MAXALIGN(header->xlp_rem_len); + + XLOG_REC_ALIGN(header->xlp_rem_len); break; } } diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 4ecdc9220f..736059531d 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -25,6 +25,7 @@ #include "access/xlogutils.h" #include "miscadmin.h" #include "pgstat.h" +#include "storage/encryption.h" #include "storage/smgr.h" #include "utils/guc.h" #include "utils/hsearch.h" @@ -653,12 +654,15 @@ XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, * frontend). Probably these should be merged at some point. 
*/ static void -XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, - Size count) +XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, Size count) { - char *p; XLogRecPtr recptr; Size nbytes; +#ifdef USE_ENCRYPTION + char *decrypt_p = buf; /* start of the not-yet-decrypted data; must begin at buf */ + uint32 decryptOff; +#endif + char *p; /* state maintained across calls */ static int sendFile = -1; @@ -679,6 +683,9 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, int readbytes; startoff = XLogSegmentOffset(recptr, segsize); +#ifdef USE_ENCRYPTION + decryptOff = startoff; +#endif /* Do we need to switch to a different xlog segment? */ if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo, segsize) || @@ -758,6 +765,25 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, sendOff += readbytes; nbytes -= readbytes; p += readbytes; + + /* Decrypt completed blocks */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + while (decrypt_p + XLOG_BLCKSZ <= p) + { + char tweak[TWEAK_SIZE]; + + XLogEncryptionTweak(tweak, sendSegNo, decryptOff); + decrypt_block(decrypt_p, decrypt_p, XLOG_BLCKSZ, tweak); + + decrypt_p += XLOG_BLCKSZ; + decryptOff += XLOG_BLCKSZ; + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } } } diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 7e34bee63e..cf8df2f81d 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -23,6 +23,7 @@ #include "bootstrap/bootstrap.h" #include "catalog/index.h" #include "catalog/pg_collation.h" +#include "catalog/pg_control.h" #include "catalog/pg_type.h" #include "libpq/pqsignal.h" #include "miscadmin.h" @@ -36,6 +37,7 @@ #include "storage/bufmgr.h" #include "storage/bufpage.h" #include "storage/condition_variable.h" +#include "storage/encryption.h" #include "storage/ipc.h" #include "storage/proc.h" #include "tcop/tcopprot.h" @@ -49,7 +51,6 @@ uint32 bootstrap_data_checksum_version = 0; /* No checksum */ - #define 
ALLOC(t, c) \ ((t *) MemoryContextAllocZero(TopMemoryContext, (unsigned)(c) * sizeof(t))) @@ -223,7 +224,7 @@ AuxiliaryProcessMain(int argc, char *argv[]) /* If no -x argument, we are a CheckerProcess */ MyAuxProcType = CheckerProcess; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:x:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:FkK:r:x:X:-:")) != -1) { switch (flag) { @@ -249,6 +250,25 @@ AuxiliaryProcessMain(int argc, char *argv[]) case 'F': SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV); break; +#ifdef USE_OPENSSL + case 'K': + encryption_key_command = strdup(optarg); + + /* + * When auxiliary process (typically bootstrap) starts, the + * control file might not exist yet. In this case we also use + * encryption_key_command to indicate that the encryption is + * enabled. + * + * Postmaster should not set this variable. Instead, it just + * sets data_encrypted according to the control file and child + * processes inherit that. + */ + Assert(!IsUnderPostmaster); + data_encrypted = true; + + break; +#endif /* USE_OPENSSL */ case 'k': bootstrap_data_checksum_version = PG_DATA_CHECKSUM_VERSION; break; @@ -370,6 +390,20 @@ AuxiliaryProcessMain(int argc, char *argv[]) if (!IsUnderPostmaster) InitializeMaxBackends(); + /* + * If data_encryption is set because of command line argument, do the + * setup now. (If set by postmaster, postmaster should have performed the + * setup.) + * + * This should only be useful for the bootstrap process. Anyone else + * initializes the encryption via ReadControlFile(). 
+ */ + if (data_encrypted && MyAuxProcType == BootstrapProcess) + { + Assert(!IsUnderPostmaster); + setup_encryption(true); + } + BaseInit(); /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 5342f217c0..0645ac00ba 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -625,7 +625,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * * We don't need to copy subdirectories */ - copydir(srcpath, dstpath, false); + { + RelFileNode fromNode = {srctablespace, src_dboid, InvalidOid}; + RelFileNode toNode = {dsttablespace, dboid, InvalidOid}; + + copydir(srcpath, dstpath, &fromNode, &toNode); + } /* Record the filesystem change in XLOG */ { @@ -1254,7 +1259,12 @@ movedb(const char *dbname, const char *tblspcname) /* * Copy files from the old tablespace to the new one */ - copydir(src_dbpath, dst_dbpath, false); + { + RelFileNode fromNode = {src_tblspcoid, db_id, InvalidOid}; + RelFileNode toNode = {dst_tblspcoid, db_id, InvalidOid}; + + copydir(src_dbpath, dst_dbpath, &fromNode, &toNode); + } /* * Record the filesystem change in XLOG @@ -2116,7 +2126,12 @@ dbase_redo(XLogReaderState *record) * * We don't need to copy subdirectories */ - copydir(src_path, dst_path, false); + { + RelFileNode fromNode = {xlrec->src_tablespace_id, xlrec->src_db_id, InvalidOid}; + RelFileNode toNode = {xlrec->tablespace_id, xlrec->db_id, InvalidOid}; + + copydir(src_path, dst_path, &fromNode, &toNode); + } } else if (info == XLOG_DBASE_DROP) { diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 084573e77c..9254c855af 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -42,6 +42,7 @@ #include "catalog/pg_database.h" #include "catalog/pg_proc.h" #include "common/ip.h" +#include "lib/stringinfo.h" #include "libpq/libpq.h" #include "libpq/pqsignal.h" #include "mb/pg_wchar.h" @@ -52,6 +53,7 @@ #include "postmaster/postmaster.h" #include 
"replication/walsender.h" #include "storage/backendid.h" +#include "storage/encryption.h" #include "storage/dsm.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -105,6 +107,10 @@ #define PGSTAT_TAB_HASH_SIZE 512 #define PGSTAT_FUNCTION_HASH_SIZE 512 +/* + * Size of a buffer used to read stats files from file. + */ +#define PGSTAT_FILE_BUFFER_SIZE 1024 /* ---------- * Total number of backends including auxiliary @@ -309,6 +315,9 @@ static HTAB *pgstat_collect_oids(Oid catalogid); static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared); static void pgstat_setup_memcxt(void); +static StringInfo pgstat_setup_serial_buffer(MemoryContext *context); +static bool pgstat_read_file(StringInfo buf, const char *statfile, + FILE *fpin); static const char *pgstat_get_wait_activity(WaitEventActivity w); static const char *pgstat_get_wait_client(WaitEventClient w); @@ -337,6 +346,12 @@ static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int le static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); +static bool readFromStringInfo(StringInfo str, void *data, Size size); + +#ifdef USE_ENCRYPTION +static void pgstat_encryption_tweak(char *tweak, Oid dbid, bool permament); +#endif + /* ------------------------------------------------------------ * Public functions called from postmaster follow * ------------------------------------------------------------ @@ -3784,6 +3799,15 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_DSM_FILL_ZERO_WRITE: event_name = "DSMFillZeroWrite"; break; + case WAIT_EVENT_KDF_FILE_READ: + event_name = "KDFFileRead"; + break; + case WAIT_EVENT_KDF_FILE_SYNC: + event_name = "KDFFileSync"; + break; + case WAIT_EVENT_KDF_FILE_WRITE: + event_name = "KDFFileWrite"; + break; case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ: event_name = "LockFileAddToDataDirRead"; break; @@ -4680,6 +4704,8 @@ pgstat_write_statsfiles(bool permanent, bool 
allDbs) const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname; const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; int rc; + StringInfo buf; + MemoryContext serialContext; elog(DEBUG2, "writing stats file \"%s\"", statfile); @@ -4702,23 +4728,25 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) globalStats.stats_timestamp = GetCurrentTimestamp(); /* + * Memory for data serialization. + */ + buf = pgstat_setup_serial_buffer(&serialContext); + + /* * Write the file header --- currently just a format ID. */ format_id = PGSTAT_FILE_FORMAT_ID; - rc = fwrite(&format_id, sizeof(format_id), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + appendBinaryStringInfo(buf, (char *) &format_id, sizeof(format_id)); /* * Write global stats struct */ - rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + appendBinaryStringInfo(buf, (char *) &globalStats, sizeof(globalStats)); /* * Write archiver stats struct */ - rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + appendBinaryStringInfo(buf, (char *) &archiverStats, sizeof(archiverStats)); /* * Walk through the database table. @@ -4742,18 +4770,53 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) * Write out the DB entry. We don't write the tables or functions * pointers, since they're of no use to any other process. */ - fputc('D', fpout); - rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + appendBinaryStringInfo(buf, (char *) "D", 1); + appendBinaryStringInfo(buf, (char *) dbentry, + offsetof(PgStat_StatDBEntry, tables)); + } + + /* + * No more output to be done. 
+ */ + appendBinaryStringInfo(buf, (char *) "E", 1); + + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + /* + * Make sure the data is aligned to ENCRYPTION_BLOCK. + */ + if (buf->len % ENCRYPTION_BLOCK > 0) + { + char zerobuf[ENCRYPTION_BLOCK]; + + memset(zerobuf, 0, ENCRYPTION_BLOCK); + appendBinaryStringInfo(buf, + zerobuf, + ENCRYPTION_BLOCK - buf->len % ENCRYPTION_BLOCK); + } + + pgstat_encryption_tweak(tweak, InvalidOid, permanent); + encrypt_block(buf->data, buf->data, buf->len, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ } /* - * No more output to be done. Close the temp file and replace the old - * pgstat.stat with it. The ferror() check replaces testing for error - * after each individual fputc or fwrite above. + * Write the data to the file. */ - fputc('E', fpout); + rc = fwrite(buf->data, buf->len, 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* Free the temporary storage, including the per-call context itself. */ + MemoryContextDelete(serialContext); + /* + * Close the temp file and replace the old pgstat.stat with it. + */ if (ferror(fpout)) { ereport(LOG, @@ -4834,6 +4897,8 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) int rc; char tmpfile[MAXPGPATH]; char statfile[MAXPGPATH]; + StringInfo buf; + MemoryContext serialContext; get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH); get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH); @@ -4854,11 +4919,15 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) } /* + * Memory for data serialization. + */ + buf = pgstat_setup_serial_buffer(&serialContext); + + /* * Write the file header --- currently just a format ID. */ format_id = PGSTAT_FILE_FORMAT_ID; - rc = fwrite(&format_id, sizeof(format_id), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + appendBinaryStringInfo(buf, (char *) &format_id, sizeof(format_id)); /* * Walk through the database's access stats per table. 
@@ -4866,9 +4935,8 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) hash_seq_init(&tstat, dbentry->tables); while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL) { - fputc('T', fpout); - rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + appendBinaryStringInfo(buf, (char *) "T", 1); + appendBinaryStringInfo(buf, (char *) tabentry, sizeof(PgStat_StatTabEntry)); } /* @@ -4877,18 +4945,52 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) hash_seq_init(&fstat, dbentry->functions); while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL) { - fputc('F', fpout); - rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + appendBinaryStringInfo(buf, (char *) "F", 1); + appendBinaryStringInfo(buf, (char *) funcentry, sizeof(PgStat_StatFuncEntry)); } /* - * No more output to be done. Close the temp file and replace the old - * pgstat.stat with it. The ferror() check replaces testing for error - * after each individual fputc or fwrite above. + * No more output to be done. */ - fputc('E', fpout); + appendBinaryStringInfo(buf, (char *) "E", 1); + + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + /* + * Make sure the data is aligned to ENCRYPTION_BLOCK. + */ + if (buf->len % ENCRYPTION_BLOCK > 0) + { + char zerobuf[ENCRYPTION_BLOCK]; + memset(zerobuf, 0, ENCRYPTION_BLOCK); + appendBinaryStringInfo(buf, + zerobuf, + ENCRYPTION_BLOCK - buf->len % ENCRYPTION_BLOCK); + } + + pgstat_encryption_tweak(tweak, dbid, permanent); + encrypt_block(buf->data, buf->data, buf->len, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + + /* + * Write the data to the file. + */ + rc = fwrite(buf->data, buf->len, 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* Free the temporary storage. 
*/ + MemoryContextDelete(serialContext); /* drop the per-call context, not just its contents */ + + /* + * Close the temp file and replace the old pgstat.stat with it. + */ if (ferror(fpout)) { ereport(LOG, @@ -4955,6 +5057,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) int32 format_id; bool found; const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; + StringInfo buf; + MemoryContext serialContext; /* * The tables will live in pgStatLocalContext. @@ -5005,9 +5109,38 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) } /* + * Memory for data serialization. + */ + buf = pgstat_setup_serial_buffer(&serialContext); + + /* + * Read the data into memory. + */ + if (!pgstat_read_file(buf, statfile, fpin)) + goto done; + + /* + * Decrypt the data if it's encrypted. + */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + pgstat_encryption_tweak(tweak, InvalidOid, permanent); + decrypt_block(buf->data, buf->data, buf->len, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + + /* Prepare for reading. */ + buf->cursor = 0; + + /* * Verify it's of the expected format. */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || + if (!readFromStringInfo(buf, &format_id, sizeof(format_id)) || format_id != PGSTAT_FILE_FORMAT_ID) { ereport(pgStatRunningInCollector ? LOG : WARNING, @@ -5018,7 +5151,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) /* * Read global stats struct */ - if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats)) + if (!readFromStringInfo(buf, &globalStats, sizeof(globalStats))) { ereport(pgStatRunningInCollector ? 
LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); @@ -5039,7 +5172,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) /* * Read archiver stats struct */ - if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats)) + if (!readFromStringInfo(buf, &archiverStats, sizeof(archiverStats))) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); @@ -5053,15 +5186,24 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) */ for (;;) { - switch (fgetc(fpin)) + char kind; + + if (!readFromStringInfo(buf, &kind, 1)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + goto done; + } + + switch (kind) { /* * 'D' A PgStat_StatDBEntry struct describing a database * follows. */ case 'D': - if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables), - fpin) != offsetof(PgStat_StatDBEntry, tables)) + if (!readFromStringInfo(buf, &dbbuf, + offsetof(PgStat_StatDBEntry, tables))) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", @@ -5151,6 +5293,9 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) done: FreeFile(fpin); + /* Free the temporary storage, including the per-call context itself. */ + MemoryContextDelete(serialContext); + /* If requested to read the permanent file, also get rid of it. */ if (permanent) { @@ -5188,6 +5333,8 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, int32 format_id; bool found; char statfile[MAXPGPATH]; + StringInfo buf; + MemoryContext serialContext; get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH); @@ -5211,9 +5358,39 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, } /* + * Memory for data serialization. + */ + buf = pgstat_setup_serial_buffer(&serialContext); + + /* + * Read the data into memory. 
+ */ + + if (!pgstat_read_file(buf, statfile, fpin)) + goto done; + + /* + * Decrypt the data if it's encrypted. + */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + pgstat_encryption_tweak(tweak, databaseid, permanent); + decrypt_block(buf->data, buf->data, buf->len, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + + /* Prepare for reading. */ + buf->cursor = 0; + + /* * Verify it's of the expected format. */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || + if (!readFromStringInfo(buf, &format_id, sizeof(format_id)) || format_id != PGSTAT_FILE_FORMAT_ID) { ereport(pgStatRunningInCollector ? LOG : WARNING, @@ -5227,14 +5404,22 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, */ for (;;) { - switch (fgetc(fpin)) + char kind; + + if (!readFromStringInfo(buf, &kind, 1)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + goto done; + } + + switch (kind) { /* * 'T' A PgStat_StatTabEntry follows. */ case 'T': - if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry), - fpin) != sizeof(PgStat_StatTabEntry)) + if (!readFromStringInfo(buf, &tabbuf, sizeof(PgStat_StatTabEntry))) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", @@ -5267,8 +5452,8 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, * 'F' A PgStat_StatFuncEntry follows. */ case 'F': - if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry), - fpin) != sizeof(PgStat_StatFuncEntry)) + if (!readFromStringInfo(buf, &funcbuf, + sizeof(PgStat_StatFuncEntry))) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", @@ -5314,6 +5499,9 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, done: FreeFile(fpin); + /* Free the temporary storage. 
*/ + MemoryContextDelete(serialContext); /* drop the per-call context, not just its contents */ + if (permanent) { elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); @@ -5348,6 +5536,8 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, FILE *fpin; int32 format_id; const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; + StringInfo buf; + MemoryContext serialContext; /* * Try to open the stats file. As above, anything but ENOENT is worthy of @@ -5364,38 +5554,68 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, } /* + * Memory for data serialization. + */ + buf = pgstat_setup_serial_buffer(&serialContext); + + /* + * Read the data into memory. + */ + if (!pgstat_read_file(buf, statfile, fpin)) + goto done; + + /* + * Decrypt the data if it's encrypted. + */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + pgstat_encryption_tweak(tweak, InvalidOid, permanent); + decrypt_block(buf->data, buf->data, buf->len, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + + /* Prepare for reading. */ + buf->cursor = 0; + + /* * Verify it's of the expected format. */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || + if (!readFromStringInfo(buf, &format_id, sizeof(format_id)) || format_id != PGSTAT_FILE_FORMAT_ID) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); FreeFile(fpin); + MemoryContextDelete(serialContext); /* context is created per call; delete it */ return false; } /* * Read global stats struct */ - if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), - fpin) != sizeof(myGlobalStats)) + if (!readFromStringInfo(buf, &myGlobalStats, sizeof(myGlobalStats))) { ereport(pgStatRunningInCollector ? 
LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); FreeFile(fpin); + MemoryContextDelete(serialContext); /* context is created per call; delete it */ return false; } /* * Read archiver stats struct */ - if (fread(&myArchiverStats, 1, sizeof(myArchiverStats), - fpin) != sizeof(myArchiverStats)) + if (!readFromStringInfo(buf, &myArchiverStats, sizeof(myArchiverStats))) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); FreeFile(fpin); + MemoryContextDelete(serialContext); /* context is created per call; delete it */ return false; } @@ -5408,15 +5628,24 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, */ for (;;) { - switch (fgetc(fpin)) + char kind; + + if (!readFromStringInfo(buf, &kind, 1)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + goto done; + } + + switch (kind) { /* * 'D' A PgStat_StatDBEntry struct describing a database * follows. */ case 'D': - if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables), - fpin) != offsetof(PgStat_StatDBEntry, tables)) + if (!readFromStringInfo(buf, &dbentry, + offsetof(PgStat_StatDBEntry, tables))) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", @@ -5449,6 +5678,7 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, done: FreeFile(fpin); + MemoryContextDelete(serialContext); /* context is created per call; delete it */ return true; } @@ -5600,6 +5830,61 @@ pgstat_setup_memcxt(void) ALLOCSET_SMALL_SIZES); } +/* ---------- + * pgstat_setup_serial_buffer() - + * + * Allocate memory for data serialization and return the containing memory + * context. 
+ * + * ---------- + */ +static StringInfo +pgstat_setup_serial_buffer(MemoryContext *context) +{ + MemoryContext oldCxt; + StringInfo result; + + *context = AllocSetContextCreate(TopMemoryContext, + "Statistics encryption", + ALLOCSET_DEFAULT_SIZES); + oldCxt = MemoryContextSwitchTo(*context); + result = makeStringInfo(); + MemoryContextSwitchTo(oldCxt); + + return result; +} + +/* + * Read statistics data from file into memory. + */ +static bool +pgstat_read_file(StringInfo buf, const char *statfile, FILE *fpin) +{ + for (;;) + { + char bufread[PGSTAT_FILE_BUFFER_SIZE]; + Size bytesread; + + bytesread = fread(bufread, 1, PGSTAT_FILE_BUFFER_SIZE, fpin); + appendBinaryStringInfo(buf, bufread, bytesread); + + if (bytesread < PGSTAT_FILE_BUFFER_SIZE) + break; + } + + /* + * Check if the amount of data makes sense. + */ + if (buf->len == 0 || (data_encrypted && buf->len % ENCRYPTION_BLOCK > 0)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + return false; + } + + return true; +} + /* ---------- * pgstat_clear_snapshot() - @@ -6309,6 +6594,35 @@ pgstat_write_statsfile_needed(void) return false; } +/* + * Read "size" bytes into "data" from StringInfo. + * + * Return true iff there was enough data available. + * + * XXX Consider moving this function to stringinfo.c. 
+ */ +static bool +readFromStringInfo(StringInfo str, void *data, Size size) +{ + if (str->cursor + size > str->len) + return false; + + memcpy(data, str->data + str->cursor, size); + str->cursor += size; + + return true; +} + +#ifdef USE_ENCRYPTION +static void +pgstat_encryption_tweak(char *tweak, Oid dbid, bool permanent) +{ + memset(tweak, 0, TWEAK_SIZE); + memcpy(tweak, &dbid, sizeof(Oid)); + memcpy(tweak + sizeof(Oid), &permanent, sizeof(bool)); +} +#endif /* USE_ENCRYPTION */ + /* ---------- * pgstat_db_requested() - * diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index a4b53b33cd..907a2f5cf0 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -116,6 +116,7 @@ #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" +#include "storage/encryption.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" @@ -674,7 +675,7 @@ PostmasterMain(int argc, char *argv[]) * tcop/postgres.c (the option sets should not conflict) and with the * common help() function in main/main.c. */ - while ((opt = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:")) != -1) + while ((opt = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijK:k:lN:nOo:Pp:r:S:sTt:W:-:")) != -1) { switch (opt) { @@ -732,6 +733,12 @@ PostmasterMain(int argc, char *argv[]) /* only used by interactive backend */ break; +#ifdef USE_OPENSSL + case 'K': + encryption_key_command = strdup(optarg); + break; +#endif /* USE_OPENSSL */ + case 'k': SetConfigOption("unix_socket_directories", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; @@ -1175,6 +1182,11 @@ PostmasterMain(int argc, char *argv[]) /* * Set up shared memory and semaphores. + * + * This includes call of setup_encryption() as soon as we realize that + * "data_encrypted" field of the control file is set. 
The encryption must + * be initialized at the point so that "encryption_verification" field of + * the control file can be checked. */ reset_shared(PostPortNumber); diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 3f1eae38a9..c4fcb1e33f 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1102,7 +1102,7 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces, /* Exclude all forks for unlogged tables except the init fork */ if (isDbDir && parse_filename_for_nontemp_relation(de->d_name, &relOidChars, - &relForkNum)) + &relForkNum, NULL)) { /* Never exclude init forks */ if (relForkNum != INIT_FORKNUM) diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 3799ad4011..61bebca84a 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -70,6 +70,7 @@ #include "replication/slot.h" #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */ #include "storage/bufmgr.h" +#include "storage/encryption.h" #include "storage/fd.h" #include "storage/sinval.h" #include "utils/builtins.h" @@ -190,7 +191,7 @@ static void ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTX static void ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, - int fd, ReorderBufferChange *change); + int fd, XLogSegNo segno, ReorderBufferChange *change); static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, int *fd, XLogSegNo *segno); static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, @@ -216,6 +217,20 @@ static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *t Relation relation, 
ReorderBufferChange *change); +/* ---------------------------------------------- + * encryption / decryption of serialized changes + * ---------------------------------------------- + */ + +/* + * Compute encryption tweak for buffer change. + */ +#define REORDER_BUFFER_CHANGE_TWEAK(tweak, segno) \ + StaticAssertStmt(sizeof(XLogSegNo) <= TWEAK_SIZE, \ + "XLogSegNo does not fit into encryption tweak"); \ + memset((tweak), 0, TWEAK_SIZE); \ + memcpy((tweak), (char *) &segno, sizeof(XLogSegNo)) + /* * Allocate a new ReorderBuffer and clean out any old serialized state from * prior ReorderBuffer instances for the same slot. @@ -2134,7 +2149,7 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) errmsg("could not open file \"%s\": %m", path))); } - ReorderBufferSerializeChange(rb, txn, fd, change); + ReorderBufferSerializeChange(rb, txn, fd, curOpenSegNo, change); dlist_delete(&change->node); ReorderBufferReturnChange(rb, change); @@ -2155,12 +2170,26 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) */ static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, - int fd, ReorderBufferChange *change) + int fd, XLogSegNo segno, + ReorderBufferChange *change) { ReorderBufferDiskChange *ondisk; - Size sz = sizeof(ReorderBufferDiskChange); + Size sz, + sz_hdr; + char *outbuf; + + sz = sizeof(ReorderBufferDiskChange); - ReorderBufferSerializeReserve(rb, sz); + /* + * As the on-disk change has variable length, the header will have to be + * de-serialized separate, see ReorderBufferRestoreChanges(). Therefore we + * also need to encrypt it separate. Make sure the size is appropriate. 
+ */ + if (data_encrypted) + sz = TYPEALIGN(ENCRYPTION_BLOCK, sz); + + sz_hdr = sz; + ReorderBufferSerializeReserve(rb, sz_hdr); ondisk = (ReorderBufferDiskChange *) rb->outbuf; memcpy(&ondisk->change, change, sizeof(ReorderBufferChange)); @@ -2197,6 +2226,15 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, } /* make sure we have enough space */ + if (data_encrypted) + { + /* + * Encryption works with blocks. As the changes are stored + * and retrieved separately, we also have to decrypt them + * alone. + */ + sz = TYPEALIGN(ENCRYPTION_BLOCK, sz); + } ReorderBufferSerializeReserve(rb, sz); data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); @@ -2227,12 +2265,15 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, char *data; Size prefix_size = strlen(change->data.msg.prefix) + 1; + sz += prefix_size + change->data.msg.message_size + sizeof(Size) + sizeof(Size); + + if (data_encrypted) + sz = TYPEALIGN(ENCRYPTION_BLOCK, sz); ReorderBufferSerializeReserve(rb, sz); data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); - /* might have been reallocated above */ ondisk = (ReorderBufferDiskChange *) rb->outbuf; @@ -2265,11 +2306,13 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, ; /* make sure we have enough space */ + if (data_encrypted) + sz = TYPEALIGN(ENCRYPTION_BLOCK, sz); ReorderBufferSerializeReserve(rb, sz); + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); /* might have been reallocated above */ ondisk = (ReorderBufferDiskChange *) rb->outbuf; - memcpy(data, snap, sizeof(SnapshotData)); data += sizeof(SnapshotData); @@ -2298,8 +2341,50 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, ondisk->size = sz; + /* + * Encrypt the change if encryption is required. 
+ */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + REORDER_BUFFER_CHANGE_TWEAK(tweak, segno); + + enlarge_encryption_buffer(ondisk->size); + + /* + * Encrypt the header and the payload separate as explained at the top + * of the function. + */ + encrypt_block((char *) rb->outbuf, encryption_buffer, sz_hdr, + tweak); + + if (ondisk->size > sz_hdr) + encrypt_block((char *) rb->outbuf + sz_hdr, + encryption_buffer + sz_hdr, + ondisk->size - sz_hdr, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + + /* + * Make sure the correct buffer is written to disk. + */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + outbuf = encryption_buffer; +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + else + outbuf = rb->outbuf; + pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE); - if (write(fd, rb->outbuf, ondisk->size) != ondisk->size) + if (write(fd, outbuf, ondisk->size) != ondisk->size) { int save_errno = errno; @@ -2327,6 +2412,10 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, Size restored = 0; XLogSegNo last_segno; dlist_mutable_iter cleanup_iter; + Size sz_hdr; +#ifdef USE_ENCRYPTION + char encryption_tweak[TWEAK_SIZE]; +#endif Assert(txn->first_lsn != InvalidXLogRecPtr); Assert(txn->final_lsn != InvalidXLogRecPtr); @@ -2345,6 +2434,17 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size); + /* + * See ReorderBufferSerializeChange for explanation. + */ + sz_hdr = sizeof(ReorderBufferDiskChange); + if (data_encrypted) +#ifdef USE_ENCRYPTION + sz_hdr = TYPEALIGN(ENCRYPTION_BLOCK, sz_hdr); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + while (restored < max_changes_in_memory && *segno <= last_segno) { int readBytes; @@ -2386,9 +2486,9 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, * about the total size. 
If we couldn't read a record, we're at the * end of this file. */ - ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange)); + ReorderBufferSerializeReserve(rb, sz_hdr); pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_READ); - readBytes = read(*fd, rb->outbuf, sizeof(ReorderBufferDiskChange)); + readBytes = read(*fd, rb->outbuf, sz_hdr); pgstat_report_wait_end(); /* eof */ @@ -2403,38 +2503,64 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from reorderbuffer spill file: %m"))); - else if (readBytes != sizeof(ReorderBufferDiskChange)) + else if (readBytes != sz_hdr) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes", readBytes, - (uint32) sizeof(ReorderBufferDiskChange)))); + (uint32) sz_hdr))); + + /* + * Decrypt the header if the change is encrypted, so that we know the + * change size. + */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + REORDER_BUFFER_CHANGE_TWEAK(encryption_tweak, *segno); + decrypt_block(rb->outbuf, rb->outbuf, sz_hdr, encryption_tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } ondisk = (ReorderBufferDiskChange *) rb->outbuf; - ReorderBufferSerializeReserve(rb, - sizeof(ReorderBufferDiskChange) + ondisk->size); + ReorderBufferSerializeReserve(rb, ondisk->size); ondisk = (ReorderBufferDiskChange *) rb->outbuf; pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_READ); - readBytes = read(*fd, rb->outbuf + sizeof(ReorderBufferDiskChange), - ondisk->size - sizeof(ReorderBufferDiskChange)); + readBytes = read(*fd, rb->outbuf + sz_hdr, ondisk->size - sz_hdr); pgstat_report_wait_end(); if (readBytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from reorderbuffer spill file: %m"))); - else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange)) + else if (readBytes != ondisk->size - sz_hdr) ereport(ERROR, 
(errcode_for_file_access(), errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes", readBytes, - (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange))))); + (uint32) (ondisk->size - sz_hdr)))); + + /* + * ok, read a full change from disk, now decrypt it if it's encrypted + * and if there's some data beyond the header. + */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + if (ondisk->size > sz_hdr) + decrypt_block(rb->outbuf + sz_hdr, rb->outbuf + sz_hdr, + ondisk->size - sz_hdr, encryption_tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } /* - * ok, read a full change from disk, now restore it into proper - * in-memory format + * Restore the change into proper in-memory format */ ReorderBufferRestoreChange(rb, txn, rb->outbuf); restored++; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index efbede7629..ca0848dd20 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -47,12 +47,17 @@ #include "storage/fd.h" #include "storage/buffile.h" #include "storage/buf_internals.h" +#include "storage/encryption.h" +#include "utils/datetime.h" #include "utils/resowner.h" /* * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE. * The reason is that we'd like large BufFiles to be spread across multiple * tablespaces when available. + * + * The number of useful bytes is appended to each segment of shared encrypted + * file, see BufFile.useful. 
*/ #define MAX_PHYSICAL_FILESIZE 0x40000000 #define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ) @@ -68,6 +73,35 @@ struct BufFile /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */ File *files; /* palloc'd array with numFiles entries */ off_t *offsets; /* palloc'd array with numFiles entries */ +#ifdef USE_ENCRYPTION + + /* + * If the file is encrypted, only the whole buffer can be loaded / dumped + * --- see BufFileLoadBuffer() for more info --- whether its space is + * used up or not. Therefore we need to keep track of the actual on-disk + * size buffer of each component file, as it would be if there was no + * encryption. + * + * List would make coding simpler, however would not contribute to + * performance. Random access is important here. + */ + off_t *useful; + + /* + * The array may need to be expanded independent from extendBufFile() + * (i.e. earlier than the buffer gets dumped), so store the number of + * elements separate from numFiles. + */ + int nuseful; + + /* + * Segment number is used to compute encryption tweak so we must remember + * the original numbers of segments if the file is encrypted and if it was + * passed as target to BufFileAppend() at least once. If this field is + * NULL, ->curFile is used to compute the tweak. + */ + off_t *segnos; +#endif /* * offsets[i] is the current seek position of files[i]. We use this to @@ -106,6 +140,10 @@ static void BufFileLoadBuffer(BufFile *file); static void BufFileDumpBuffer(BufFile *file); static int BufFileFlush(BufFile *file); static File MakeNewSharedSegment(BufFile *file, int segment); +#ifdef USE_ENCRYPTION +static void BufFileTweak(char *tweak, BufFile *file); +static void ensureUsefulArraySize(BufFile *file, int required); +#endif /* * Create BufFile and perform the common initialization. 
@@ -125,6 +163,28 @@ makeBufFileCommon(int nfiles) file->pos = 0; file->nbytes = 0; + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + file->useful = (off_t *) palloc0(sizeof(off_t) * nfiles); + file->nuseful = nfiles; + file->segnos = NULL; + + /* + * The unused (trailing) part of the buffer should not contain + * undefined data: if we encrypt such a buffer and flush it to disk, + * the encrypted form of that "undefined part" can get zeroed due to + * seek and write beyond EOF. If such a buffer gets loaded and + * decrypted, the change of the undefined part to zeroes can affect + * the valid part if it does not end at block boundary. By setting the + * whole buffer to zeroes we ensure that the unused part always + * contains zeroes. + */ + MemSet(file->buffer, 0, BLCKSZ); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } return file; } @@ -172,6 +232,22 @@ extendBufFile(BufFile *file) (file->numFiles + 1) * sizeof(File)); file->offsets = (off_t *) repalloc(file->offsets, (file->numFiles + 1) * sizeof(off_t)); + + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + ensureUsefulArraySize(file, file->numFiles + 1); + + if (file->segnos) + { + file->segnos = (off_t *) repalloc(file->segnos, + (file->numFiles + 1) * sizeof(off_t)); + file->segnos[file->numFiles] = file->numFiles; + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } file->files[file->numFiles] = pfile; file->offsets[file->numFiles] = 0L; file->numFiles++; @@ -302,6 +378,7 @@ BufFileOpenShared(SharedFileSet *fileset, const char *name) files[nfiles] = SharedFileSetOpen(fileset, segment_name); if (files[nfiles] <= 0) break; + ++nfiles; CHECK_FOR_INTERRUPTS(); @@ -317,7 +394,64 @@ BufFileOpenShared(SharedFileSet *fileset, const char *name) errmsg("could not open BufFile \"%s\"", name))); file = makeBufFileCommon(nfiles); + + /* + * Shared encrypted segment should contain information on the number of + * useful bytes at the end. 
+ */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + off_t pos; + int i; + + for (i = 0; i < nfiles; i++) + { + int nbytes; + File segment = files[i]; + + pos = FileSeek(segment, -sizeof(off_t), SEEK_END); + + /* + * The word must immediately follow the last buffer of the + * segment. + */ + if (pos <= 0 || pos % BLCKSZ != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not find padding info in BufFile \"%s\"", + name))); + + nbytes = FileRead(segment, (char *) &file->useful[i], + sizeof(off_t), WAIT_EVENT_BUFFILE_READ); + if (nbytes != sizeof(off_t)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read padding info from BufFile \"%s\"", + name))); + Assert(file->useful[i] > 0); + + /* + * Keep ->offsets up-to-date. + */ + file->offsets[i] = FileTell(segment); + + CHECK_FOR_INTERRUPTS(); + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + file->files = files; + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + file->nuseful = nfiles; +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } file->readOnly = true; /* Can't write to files opened this way */ file->fileset = fileset; file->name = pstrdup(name); @@ -397,6 +531,17 @@ BufFileClose(BufFile *file) /* release the buffer space */ pfree(file->files); pfree(file->offsets); + + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + if (file->segnos) + pfree(file->segnos); + pfree(file->useful); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } pfree(file); } @@ -404,7 +549,7 @@ BufFileClose(BufFile *file) * BufFileLoadBuffer * * Load some data into buffer, if possible, starting from curOffset. - * At call, must have dirty = false, pos and nbytes = 0. + * At call, must have dirty = false, nbytes = 0. * On exit, nbytes is number of bytes loaded. 
*/ static void @@ -413,6 +558,22 @@ BufFileLoadBuffer(BufFile *file) File thisfile; /* + * Only whole multiple of ENCRYPTION_BLOCK can be encrypted / decrypted, + * but we choose to use BLCKSZ (i.e. BufFile buffer) as the unit. The + * point is that curOffset is a component of the encryption tweak, and all + * data within particular call of encrypt_block() / decrypt_block() must + * have the same tweak. So whichever unit we choose we must stick on it + * and never encrypt / decrypt multiple units at a time. + * + * BLCKSZ also seems better choice than ENCRYPTION_BLOCK for performance + * purposes. We assume that alignment to BLCKSZ implies alignment to + * ENCRYPTION_BLOCK. + */ + Assert((file->curOffset % BLCKSZ == 0 && + file->curOffset % ENCRYPTION_BLOCK == 0) || + !data_encrypted); + + /* * Advance to next component file if necessary and possible. */ if (file->curOffset >= MAX_PHYSICAL_FILESIZE && @@ -423,6 +584,20 @@ BufFileLoadBuffer(BufFile *file) } /* + * See makeBufFile(). + * + * Actually here we only handle the case of FileRead() returning zero + * bytes below. In contrast, if the buffer contains any data but it's not + * full, it should already have the trailing zeroes (encrypted) on disk. + * And as the encrypted buffer is always loaded in its entirety (i.e. EOF + * should only appear at buffer boundary if the data is encrypted), all + * unused bytes of the buffer should eventually be zeroes after the + * decryption. + */ + if (data_encrypted) + MemSet(file->buffer, 0, BLCKSZ); + + /* * May need to reposition physical file. */ thisfile = file->files[file->curFile]; @@ -445,6 +620,64 @@ BufFileLoadBuffer(BufFile *file) file->offsets[file->curFile] += file->nbytes; /* we choose not to advance curOffset here */ + if (data_encrypted && file->nbytes > 0) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + int nbytes = file->nbytes; + + /* + * The encrypted component file can only consist of whole number of + * our encryption units. 
(Only the whole buffers are dumped / loaded.) + * The only exception is that we're at the end of segment file and + * found the word indicating the number of useful bytes in the + * segment. This can only happen for shared file. + */ + if (nbytes % BLCKSZ != 0) + { + Assert(nbytes == sizeof(off_t) && file->fileset != NULL); + + /* + * This special word cannot start within the useful data. + */ + Assert(file->offsets[file->curFile] - nbytes >= + file->useful[file->curFile]); + + /* + * This metadata is hidden from the caller, so all it needs to know + * is that there's no real data at the end of the file. + */ + file->nbytes = 0; + return; + } + + BufFileTweak(tweak, file); + + /* + * The whole block is encrypted / decrypted at once as explained + * above. + */ + decrypt_block(file->buffer, file->buffer, BLCKSZ, tweak); + +#ifdef USE_ASSERT_CHECKING + + /* + * The unused part of the buffer which we've read from disk and + * decrypted should only contain zeroes, as explained in front of the + * MemSet() call. + */ + { + int i; + + for (i = file->nbytes; i < BLCKSZ; i++) + Assert(file->buffer[i] == 0); + } +#endif /* USE_ASSERT_CHECKING */ +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + if (file->nbytes > 0) pgBufferUsage.temp_blks_read++; } @@ -455,6 +688,9 @@ BufFileLoadBuffer(BufFile *file) * Dump buffer contents starting at curOffset. * At call, should have dirty = true, nbytes > 0. * On exit, dirty is cleared if successful write, and curOffset is advanced. + * + * XXX Consider separate function for encrypted buffer. (No loop is needed to + * dump the encrypted buffer.) */ static void BufFileDumpBuffer(BufFile *file) @@ -462,6 +698,52 @@ BufFileDumpBuffer(BufFile *file) int wpos = 0; int bytestowrite; File thisfile; + char *write_ptr; + + /* + * See comments in BufFileLoadBuffer(); + */ + Assert((file->curOffset % BLCKSZ == 0 && + file->curOffset % ENCRYPTION_BLOCK == 0) || + !data_encrypted); + + /* + * Caller's responsibility. 
+ */ + Assert(file->pos <= file->nbytes); + + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + BufFileTweak(tweak, file); + + /* + * The amount of data encrypted must be a multiple of + * ENCRYPTION_BLOCK. We meet this condition simply by encrypting the + * whole buffer. + * + * XXX Alternatively we could get the encrypted chunk length by + * rounding file->nbytes up to the nearest multiple of + * ENCRYPTION_BLOCK, and for decryption use the + * file->useful[file->curFile] value to find out how many blocks + * should be decrypted. That would reduce I/O if the buffer is mostly + * empty, but (BLCKSZ / ENCRYPTION_BLOCK) calls of encrypt_block() + * would be needed for full buffers. See also BufFileLoadBuffer() for + * explanation why we must stick on the unit of data amount encrypted + * / decrypted. + */ + if (encryption_buf_size < BLCKSZ) + enlarge_encryption_buffer(BLCKSZ); + encrypt_block(file->buffer, encryption_buffer, BLCKSZ, tweak); + write_ptr = encryption_buffer; +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + else + write_ptr = file->buffer; /* * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it @@ -469,8 +751,6 @@ BufFileDumpBuffer(BufFile *file) */ while (wpos < file->nbytes) { - off_t availbytes; - /* * Advance to next component file if necessary and possible. */ @@ -482,31 +762,56 @@ BufFileDumpBuffer(BufFile *file) file->curOffset = 0L; } - /* - * Determine how much we need to write into this file. 
- */ - bytestowrite = file->nbytes - wpos; - availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset; + if (!data_encrypted) + { + off_t availbytes; + + bytestowrite = file->nbytes - wpos; + availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset; - if ((off_t) bytestowrite > availbytes) - bytestowrite = (int) availbytes; + if ((off_t) bytestowrite > availbytes) + bytestowrite = (int) availbytes; + } + else + { + /* + * This condition plus the alignment of curOffset to BLCKSZ + * (checked above) ensure that the encrypted buffer never crosses + * component file boundary. + */ + StaticAssertStmt((MAX_PHYSICAL_FILESIZE % BLCKSZ) == 0, + "BLCKSZ is not whole multiple of MAX_PHYSICAL_FILESIZE"); + + /* + * Encrypted data is dumped all at once. + * + * Here we don't have to check availbytes because --- according to + * the assertions above --- curOffset should be lower than + * MAX_PHYSICAL_FILESIZE by non-zero multiple of BLCKSZ. + */ + bytestowrite = BLCKSZ; + } /* * May need to reposition physical file. */ thisfile = file->files[file->curFile]; + if (file->curOffset != file->offsets[file->curFile]) { if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset) return; /* seek failed, give up */ + file->offsets[file->curFile] = file->curOffset; } bytestowrite = FileWrite(thisfile, - file->buffer + wpos, + write_ptr + wpos, bytestowrite, WAIT_EVENT_BUFFILE_WRITE); - if (bytestowrite <= 0) + if (bytestowrite <= 0 || + (data_encrypted && bytestowrite != BLCKSZ)) return; /* failed to write */ + file->offsets[file->curFile] += bytestowrite; file->curOffset += bytestowrite; wpos += bytestowrite; @@ -515,25 +820,130 @@ BufFileDumpBuffer(BufFile *file) } file->dirty = false; - /* - * At this point, curOffset has been advanced to the end of the buffer, - * ie, its original value + nbytes. We need to make it point to the - * logical file position, ie, original value + pos, in case that is less - * (as could happen due to a small backwards seek in a dirty buffer!) 
- */ - file->curOffset -= (file->nbytes - file->pos); - if (file->curOffset < 0) /* handle possible segment crossing */ + if (!data_encrypted) { - file->curFile--; - Assert(file->curFile >= 0); - file->curOffset += MAX_PHYSICAL_FILESIZE; + /* + * At this point, curOffset has been advanced to the end of the + * buffer, ie, its original value + nbytes. We need to make it point + * to the logical file position, ie, original value + pos, in case + * that is less (as could happen due to a small backwards seek in a + * dirty buffer!) + */ + file->curOffset -= (file->nbytes - file->pos); + if (file->curOffset < 0) /* handle possible segment crossing */ + { + file->curFile--; + Assert(file->curFile >= 0); + file->curOffset += MAX_PHYSICAL_FILESIZE; + } + + /* + * Now we can set the buffer empty without changing the logical + * position + */ + file->pos = 0; + file->nbytes = 0; } + else + { + /* + * curOffset should be at buffer boundary and buffer is the smallest + * I/O unit for encrypted data. + */ + Assert(file->curOffset % BLCKSZ == 0); - /* - * Now we can set the buffer empty without changing the logical position - */ - file->pos = 0; - file->nbytes = 0; + /* + * The number of useful bytes needs to be written at the end of each + * encrypted segment of a shared file so that the other backends know + * how many bytes of the last buffer are useful. + */ + if (file->fileset != NULL) + { + off_t useful; + +#ifdef USE_ENCRYPTION + + /* + * nuseful may be increased earlier than numFiles but not later, + * so the corresponding entry should always exist in ->useful. + */ + Assert(file->curFile < file->nuseful); + + /* + * The number of useful bytes in the current segment file. + */ + useful = file->useful[file->curFile]; +#endif + + /* + * Have we dumped the last buffer of the segment, i.e. the one + * that can contain padding? + */ + if (file->curOffset >= useful) + { + int bytes_extra; + + /* + * Write the number of useful bytes in the segment. 
+ */ + bytes_extra = FileWrite(thisfile, + (char *) &useful, + sizeof(useful), + WAIT_EVENT_BUFFILE_WRITE); + if (bytes_extra != sizeof(useful)) + return; /* failed to write */ + + /* + * Remember the file position at OS level, but do not increase + * curOffset. That ensures that the next buffer appended will + * overwrite the "useful" value just written, instead of being + * appended to it. + * + * XXX The additional seek backwards at OS level when the next + * buffer is gonna be written is hard to avoid by writing this + * info higher in the stack (e.g. in BufFileWrite() or + * BufFileFlush()). Once we've cleared the "dirty" flag, we + * cannot recognize easily later that the number of useful + * bytes might need to be updated on disk because of this + * buffer. Is it worth introducing another bool field and + * making the logic trickier? + */ + file->offsets[file->curFile] += bytes_extra; + } + } + + if (file->pos >= BLCKSZ) + { + Assert(file->pos == BLCKSZ); + + /* + * curOffset points to the beginning of the next buffer, so just + * reset pos and nbytes. + */ + file->pos = 0; + file->nbytes = 0; + + /* See makeBufFile() */ + if (data_encrypted) + MemSet(file->buffer, 0, BLCKSZ); + } + else + { + /* + * Move curOffset to the beginning of the just-written buffer and + * preserve pos. + */ + file->curOffset -= BLCKSZ; + + /* + * At least pos bytes should be written even if the first change + * since now appears at pos == nbytes, but in fact the whole + * buffer will be written regardless pos. This is the price we pay + * for the choosing BLCKSZ as the I/O unit for encrypted data. + */ + file->nbytes = BLCKSZ; + } + } } /* @@ -556,18 +966,107 @@ BufFileRead(BufFile *file, void *ptr, size_t size) while (size > 0) { + int avail; + if (file->pos >= file->nbytes) { + /* + * Neither read nor write nor seek should leave pos greater than + * nbytes, regardless the data is encrypted or not. 
pos can only + * be greater if nbytes is zero --- this situation can be caused + * by BufFileSeek(). + */ + Assert(file->pos == file->nbytes || file->nbytes == 0); + + /* + * The Assert() above implies that pos is a whole multiple of + * BLCKSZ, so curOffset has to meet the same encryption-specific + * requirement too. + */ + Assert(file->curOffset % BLCKSZ == 0 || !data_encrypted); + /* Try to load more data into buffer. */ - file->curOffset += file->pos; - file->pos = 0; - file->nbytes = 0; - BufFileLoadBuffer(file); - if (file->nbytes <= 0) - break; /* no more data available */ + if (!data_encrypted || file->pos % BLCKSZ == 0) + { + file->curOffset += file->pos; + file->pos = 0; + file->nbytes = 0; + BufFileLoadBuffer(file); + if (file->nbytes <= 0) + break; /* no more data available */ + } + else + { + int nbytes_orig = file->nbytes; + + /* + * Given that BLCKSZ is the I/O unit for encrypted data (see + * comments in BufFileLoadBuffer()), we cannot add pos to + * curOffset because that would make it point outside block + * boundary. The only thing we can do is to reload the whole + * buffer and see if more data is eventually there than the + * previous load has fetched. + */ + BufFileLoadBuffer(file); + if (file->nbytes <= nbytes_orig) + break; /* no more data available */ + } + } + + avail = file->nbytes; + nthistime = avail - file->pos; + + /* + * The buffer can contain trailing zeroes because BLCKSZ is the I/O + * unit for encrypted data. These are not available for reading. + */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + off_t useful = file->useful[file->curFile]; + + /* + * The criterion is whether the useful data end within the + * currently loaded buffer. + */ + if (useful < file->curOffset + BLCKSZ) + { + /* + * Compute the number of bytes available in the current + * buffer. + */ + avail = useful - file->curOffset; + Assert(avail >= 0); + + /* + * An empty buffer can exist, e.g. after a seek to the end of + * the last component file. 
+ */ + if (avail == 0) + break; + + /* + * Seek beyond the current EOF, which was not followed by + * write, could have resulted in position outside the useful + * data + */ + if (file->pos > avail) + break; + + nthistime = avail - file->pos; + Assert(nthistime >= 0); + + /* + * Have we reached the end of the valid data? + */ + if (nthistime == 0) + break; + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ } - nthistime = file->nbytes - file->pos; if (nthistime > size) nthistime = size; Assert(nthistime > 0); @@ -609,11 +1108,28 @@ BufFileWrite(BufFile *file, void *ptr, size_t size) } else { - /* Hmm, went directly from reading to writing? */ + /* + * Hmm, went directly from reading to writing? + * + * As pos should be exactly BLCKSZ, there is nothing special + * to do about data_encrypted. Except for zeroing the buffer. + */ + Assert(file->pos == BLCKSZ); + file->curOffset += file->pos; file->pos = 0; file->nbytes = 0; + + /* See makeBufFile() */ + if (data_encrypted) + MemSet(file->buffer, 0, BLCKSZ); } + + /* + * If curOffset changed above, it should still meet the assumption + * that buffer is the I/O unit for encrypted data. + */ + Assert(file->curOffset % BLCKSZ == 0 || !data_encrypted); } nthistime = BLCKSZ - file->pos; @@ -627,6 +1143,65 @@ BufFileWrite(BufFile *file, void *ptr, size_t size) file->pos += nthistime; if (file->nbytes < file->pos) file->nbytes = file->pos; + + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + off_t new_offset; + int fileno = file->curFile; + + /* + * curFile does not necessarily correspond to the offset: it can + * still have the initial value if BufFileSeek() skipped the + * previous file w/o dumping anything of it. While curFile will be + * fixed during the next dump, we need valid fileno now. 
+ */ + if (file->curOffset >= MAX_PHYSICAL_FILESIZE) + { + /* + * Even BufFileSeek() should not allow curOffset to become + * more than MAX_PHYSICAL_FILESIZE (if caller passes higher + * offset, curFile gets increased instead). + */ + Assert(file->curOffset == MAX_PHYSICAL_FILESIZE); + + fileno++; + } + + /* + * fileno can point to a segment that does not exist on disk yet. + */ + ensureUsefulArraySize(file, fileno + 1); + + /* + * Update the "useful offset" of the underlying component file if + * we've added any useful data. + */ + new_offset = file->curOffset + file->pos; + + /* + * Make sure the offset is relative to the correct component file. + * It should have been adjusted during sequential write, but if + * we've just used BufFileSeek() to jump to segment boundary w/o + * writing, the value is relative to the start of the *previous* + * segment. + */ + if (file->curOffset % MAX_PHYSICAL_FILESIZE == 0) + new_offset %= MAX_PHYSICAL_FILESIZE; + + /* + * Adjust the number of useful bytes in the file if needed. This + * has to happen immediately, independent from + * BufFileDumpBuffer(), so that BufFileRead() works correctly + * anytime. + */ + if (new_offset > file->useful[fileno]) + file->useful[fileno] = new_offset; +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + ptr = (void *) ((char *) ptr + nthistime); size -= nthistime; nwritten += nthistime; @@ -741,9 +1316,43 @@ BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) return EOF; /* Seek is OK! */ file->curFile = newFile; - file->curOffset = newOffset; - file->pos = 0; - file->nbytes = 0; + if (!data_encrypted) + { + file->curOffset = newOffset; + file->pos = 0; + file->nbytes = 0; + } + else + { + /* + * Offset of an encrypted buffer must be a multiple of BLCKSZ. + */ + file->pos = newOffset % BLCKSZ; + file->curOffset = newOffset - file->pos; + + /* + * BufFileLoadBuffer() will set nbytes iff it can read something. 
+ */ + file->nbytes = 0; + + /* + * Load and decrypt the existing part of the buffer. + */ + BufFileLoadBuffer(file); + if (file->nbytes == 0) + { + /* + * The data requested is not in the file, but this is not an + * error. + */ + return 0; + } + + /* + * The whole buffer should have been loaded. + */ + Assert(file->nbytes == BLCKSZ); + } return 0; } @@ -774,6 +1383,55 @@ BufFileSeekBlock(BufFile *file, long blknum) SEEK_SET); } +#ifdef USE_ENCRYPTION +static void +BufFileTweak(char *tweak, BufFile *file) +{ + off_t block; + int curFile = file->curFile; + + /* + * If the file was produced by BufFileAppend(), we need the original + * curFile, as it was used originally for encryption. + */ + if (file->segnos) + curFile = file->segnos[curFile]; + + block = curFile * BUFFILE_SEG_SIZE + file->curOffset / BLCKSZ; + + /* + * The unused bytes should always be defined. + */ + memset(tweak, 0, TWEAK_SIZE); + StaticAssertStmt(sizeof(block) <= TWEAK_SIZE, + "block number does not fit into encryption tweak"); + memcpy(tweak, &block, sizeof(off_t)); +} + +/* + * Make sure that BufFile.useful array has the required size. + */ +static void +ensureUsefulArraySize(BufFile *file, int required) +{ + /* + * Does the array already have enough space? + */ + if (required <= file->nuseful) + return; + + /* + * It shouldn't be possible to jump beyond the end of the last segment, + * i.e. skip more than 1 segment. + */ + Assert(file->nuseful + 1 == required); + + file->useful = (off_t *) repalloc(file->useful, required * sizeof(off_t)); + file->useful[file->nuseful] = 0L; + file->nuseful++; +} +#endif + #ifdef NOT_USED /* * BufFileTellBlock --- block-oriented tell @@ -803,11 +1461,33 @@ BufFileSize(BufFile *file) { off_t lastFileSize; - /* Get the size of the last physical file by seeking to end. 
*/ - lastFileSize = FileSeek(file->files[file->numFiles - 1], 0, SEEK_END); - if (lastFileSize < 0) - return -1; - file->offsets[file->numFiles - 1] = lastFileSize; + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + /* + * "useful" should be initialized even for shared file, see + * BufFileOpenShared(). + */ + Assert(file->useful != NULL && file->nuseful >= file->numFiles); + + /* + * The number of useful bytes in the segment is what caller is + * interested in. + */ + lastFileSize = file->useful[file->nuseful - 1]; +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + else + { + /* Get the size of the last physical file by seeking to end. */ + lastFileSize = FileSeek(file->files[file->numFiles - 1], 0, SEEK_END); + if (lastFileSize < 0) + return -1; + + file->offsets[file->numFiles - 1] = lastFileSize; + } return ((file->numFiles - 1) * (off_t) MAX_PHYSICAL_FILESIZE) + lastFileSize; @@ -837,6 +1517,9 @@ BufFileAppend(BufFile *target, BufFile *source) { long startBlock = target->numFiles * BUFFILE_SEG_SIZE; int newNumFiles = target->numFiles + source->numFiles; +#ifdef USE_ENCRYPTION + int newNUseful = target->nuseful + source->nuseful; +#endif int i; Assert(target->fileset != NULL); @@ -851,11 +1534,75 @@ BufFileAppend(BufFile *target, BufFile *source) repalloc(target->files, sizeof(File) * newNumFiles); target->offsets = (off_t *) repalloc(target->offsets, sizeof(off_t) * newNumFiles); + for (i = target->numFiles; i < newNumFiles; i++) { - target->files[i] = source->files[i - target->numFiles]; - target->offsets[i] = source->offsets[i - target->numFiles]; + int n = i - target->numFiles; + + target->files[i] = source->files[n]; + target->offsets[i] = source->offsets[n]; } + + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + /* + * XXX As the typical use case is that parallel workers expose file to + * the leader, can we expect both target and source to have been + * exported, i.e. flushed? 
In such a case "nuseful" would have to be + * equal to "numFiles" for both input files and the code could get a + * bit simpler. It seems that at least source should be flushed, as + * source->readOnly is expected to be true above. + */ + target->useful = (off_t *) + repalloc(target->useful, sizeof(off_t) * newNUseful); + + for (i = target->nuseful; i < newNUseful; i++) + target->useful[i] = source->useful[i - target->nuseful]; + target->nuseful = newNUseful; + + /* + * File segments can appear at different position due to + * concatenation, so make sure we remember the original positions for + * the sake of encryption tweak. + */ + if (target->segnos == NULL) + { + /* + * If the target does not have the array yet, allocate it for both + * target and source and initialize the target part. + */ + target->segnos = (off_t *) palloc(newNumFiles * sizeof(off_t)); + for (i = 0; i < target->numFiles; i++) + target->segnos[i] = i; + } + else + { + /* + * Use the existing target part and add space for the source part. + */ + target->segnos = (off_t *) repalloc(target->segnos, + newNumFiles * sizeof(off_t)); + } + + /* + * The source segment number either equals to (0-based) index of the + * segment, or to an element of an already existing array. 
+ */ + for (i = target->numFiles; i < newNumFiles; i++) + { + off_t segno = i - target->numFiles; + + if (source->segnos == NULL) + target->segnos[i] = segno; + else + target->segnos[i] = source->segnos[segno]; + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + target->numFiles = newNumFiles; return startBlock; diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index 4a0d23b11e..eb4ad205e1 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -23,24 +23,30 @@ #include #include "storage/copydir.h" +#include "storage/encryption.h" #include "storage/fd.h" +#include "storage/reinit.h" +#include "storage/smgr.h" #include "miscadmin.h" #include "pgstat.h" /* * copydir: copy a directory * - * If recurse is false, subdirectories are ignored. Anything that's not - * a directory or a regular file is ignored. + * RelFileNode values must specify tablespace and database oids for source + * and target to support re-encryption if necessary. relNode value in provided + * structs will be clobbered. */ void -copydir(char *fromdir, char *todir, bool recurse) +copydir(char *fromdir, char *todir, RelFileNode *fromNode, RelFileNode *toNode) { DIR *xldir; struct dirent *xlde; char fromfile[MAXPGPATH * 2]; char tofile[MAXPGPATH * 2]; + Assert(!data_encrypted || (fromNode != NULL && toNode != NULL)); + if (MakePGDirectory(todir) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -67,14 +73,32 @@ copydir(char *fromdir, char *todir, bool recurse) (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", fromfile))); - if (S_ISDIR(fst.st_mode)) + if (S_ISREG(fst.st_mode)) { - /* recurse to handle subdirectories */ - if (recurse) - copydir(fromfile, tofile, true); + int oidchars; + ForkNumber forkNum; + int segment; + + /* + * For encrypted databases we need to reencrypt files with new + * tweaks. 
+ */ + if (data_encrypted && + parse_filename_for_nontemp_relation(xlde->d_name, + &oidchars, &forkNum, &segment)) + { + char oidbuf[OIDCHARS + 1]; + + memcpy(oidbuf, xlde->d_name, oidchars); + oidbuf[oidchars] = '\0'; + + /* We scribble over the provided RelFileNodes here */ + fromNode->relNode = toNode->relNode = atol(oidbuf); + copy_file(fromfile, tofile, fromNode, toNode, forkNum, forkNum, segment); + } + else + copy_file(fromfile, tofile, NULL, NULL, 0, 0, 0); } - else if (S_ISREG(fst.st_mode)) - copy_file(fromfile, tofile); } FreeDir(xldir); @@ -121,17 +145,24 @@ copydir(char *fromdir, char *todir, bool recurse) } /* - * copy one file + * copy one file. If decryption and reencryption may be needed specify + * relfilenodes for source and target. */ void -copy_file(char *fromfile, char *tofile) +copy_file(char *fromfile, char *tofile, RelFileNode *fromNode, + RelFileNode *toNode, ForkNumber fromForkNum, ForkNumber toForkNum, + int segment) { char *buffer; int srcfd; int dstfd; int nbytes; + int bytesread; off_t offset; off_t flush_offset; +#ifdef USE_ENCRYPTION + BlockNumber blockNum = segment * RELSEG_SIZE; +#endif /* Size of copy buffer (read and write requests) */ #define COPY_BUF_SIZE (8 * BLCKSZ) @@ -186,16 +217,55 @@ copy_file(char *fromfile, char *tofile) flush_offset = offset; } - pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ); - nbytes = read(srcfd, buffer, COPY_BUF_SIZE); - pgstat_report_wait_end(); - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\": %m", fromfile))); + /* + * Try to read as much as we fit in the buffer so we can deal with + * complete blocks if we need to reencrypt. 
+ */ + nbytes = 0; + while (nbytes < COPY_BUF_SIZE) + { + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ); + bytesread = read(srcfd, buffer + nbytes, COPY_BUF_SIZE - nbytes); + pgstat_report_wait_end(); + if (bytesread < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", fromfile))); + nbytes += bytesread; + if (bytesread == 0) + break; + } if (nbytes == 0) break; + + /* + * If encryption is in place, only complete blocks should have been + * read. Thus we do not have to care whether any encryption block is + * affected by the previous buffer contents. + */ + Assert(nbytes % BLCKSZ == 0 || (fromNode == NULL && toNode == NULL)); + errno = 0; + + /* + * If the database is encrypted we need to decrypt the data here and + * reencrypt it to adjust the tweak values of blocks. + */ + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + if (fromNode != NULL) + { + Assert(toNode != NULL); + blockNum = ReencryptBlock(buffer, nbytes / BLCKSZ, + fromNode, toNode, fromForkNum, + toForkNum, blockNum); + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE); if ((int) write(dstfd, buffer, nbytes) != nbytes) { diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 8dd51f1767..b3e3a50d30 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -2163,10 +2163,6 @@ FileSeek(File file, off_t offset, int whence) return vfdP->seekPos; } -/* - * XXX not actually used but here for completeness - */ -#ifdef NOT_USED off_t FileTell(File file) { @@ -2175,7 +2171,6 @@ FileTell(File file) file, VfdCache[file].fileName)); return VfdCache[file].seekPos; } -#endif int FileTruncate(File file, off_t offset, uint32 wait_event_info) diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index 74ff6c359b..70e138503e 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ 
-16,6 +16,7 @@ #include +#include "catalog/pg_tablespace.h" #include "common/relpath.h" #include "storage/copydir.h" #include "storage/fd.h" @@ -24,9 +25,9 @@ #include "utils/memutils.h" static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, - int op); + int op, Oid spcOid); static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, - int op); + int op, Oid spcOid, Oid dbOid); typedef struct { @@ -68,7 +69,7 @@ ResetUnloggedRelations(int op) /* * First process unlogged files in pg_default ($PGDATA/base) */ - ResetUnloggedRelationsInTablespaceDir("base", op); + ResetUnloggedRelationsInTablespaceDir("base", op, DEFAULTTABLESPACE_OID); /* * Cycle through directories for all non-default tablespaces. @@ -77,13 +78,16 @@ ResetUnloggedRelations(int op) while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL) { + Oid spcOid; + if (strcmp(spc_de->d_name, ".") == 0 || strcmp(spc_de->d_name, "..") == 0) continue; snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); - ResetUnloggedRelationsInTablespaceDir(temp_path, op); + spcOid = atoi(spc_de->d_name); + ResetUnloggedRelationsInTablespaceDir(temp_path, op, spcOid); } FreeDir(spc_dir); @@ -99,7 +103,8 @@ ResetUnloggedRelations(int op) * Process one tablespace directory for ResetUnloggedRelations */ static void -ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) +ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op, + Oid spcOid) { DIR *ts_dir; struct dirent *de; @@ -126,6 +131,8 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) while ((de = ReadDir(ts_dir, tsdirname)) != NULL) { + Oid dbOid; + /* * We're only interested in the per-database directories, which have * numeric names. Note that this code will also (properly) ignore "." 
@@ -134,9 +141,10 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) continue; + dbOid = atoi(de->d_name); snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s", tsdirname, de->d_name); - ResetUnloggedRelationsInDbspaceDir(dbspace_path, op); + ResetUnloggedRelationsInDbspaceDir(dbspace_path, op, spcOid, dbOid); } FreeDir(ts_dir); @@ -146,7 +154,8 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) * Process one per-dbspace directory for ResetUnloggedRelations */ static void -ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) +ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op, + Oid spcOid, Oid dbOid) { DIR *dbspace_dir; struct dirent *de; @@ -187,7 +196,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, - &forkNum)) + &forkNum, NULL)) continue; /* Also skip it unless this is the init fork. */ @@ -229,7 +238,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, - &forkNum)) + &forkNum, NULL)) continue; /* We never remove the init fork. */ @@ -279,13 +288,14 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) { ForkNumber forkNum; int oidchars; + int segment; char oidbuf[OIDCHARS + 1]; char srcpath[MAXPGPATH * 2]; char dstpath[MAXPGPATH]; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, - &forkNum)) + &forkNum, &segment)) continue; /* Also skip it unless this is the init fork. */ @@ -305,7 +315,13 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) /* OK, we're ready to perform the actual copy. 
*/ elog(DEBUG2, "copying %s to %s", srcpath, dstpath); - copy_file(srcpath, dstpath); + { + RelFileNode srcNode = {spcOid, dbOid, atol(oidbuf)}; + RelFileNode dstNode = srcNode; + + copy_file(srcpath, dstpath, &srcNode, &dstNode, + INIT_FORKNUM, MAIN_FORKNUM, segment); + } } FreeDir(dbspace_dir); @@ -327,7 +343,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, - &forkNum)) + &forkNum, NULL)) continue; /* Also skip it unless this is the init fork. */ @@ -372,9 +388,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) */ bool parse_filename_for_nontemp_relation(const char *name, int *oidchars, - ForkNumber *fork) + ForkNumber *fork, int *segment) { int pos; + int segstart = 0; /* Look for a non-empty string of digits (that isn't too long). */ for (pos = 0; isdigit((unsigned char) name[pos]); ++pos) @@ -401,6 +418,7 @@ parse_filename_for_nontemp_relation(const char *name, int *oidchars, { int segchar; + segstart = pos + 1; for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) ; if (segchar <= 1) @@ -411,5 +429,14 @@ parse_filename_for_nontemp_relation(const char *name, int *oidchars, /* Now we should be at the end. 
*/ if (name[pos] != '\0') return false; + + if (segment != NULL) + { + if (segstart == 0) + *segment = 0; + else + *segment = atoi(name + segstart); + } + return true; } diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index dfbda5458f..1c3d9463d4 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -17,6 +17,7 @@ #include "access/htup_details.h" #include "access/itup.h" #include "access/xlog.h" +#include "common/string.h" #include "storage/checksum.h" #include "utils/memdebug.h" #include "utils/memutils.h" @@ -81,11 +82,8 @@ bool PageIsVerified(Page page, BlockNumber blkno) { PageHeader p = (PageHeader) page; - size_t *pagebytes; - int i; bool checksum_failure = false; bool header_sane = false; - bool all_zeroes = false; uint16 checksum = 0; /* @@ -118,26 +116,8 @@ PageIsVerified(Page page, BlockNumber blkno) return true; } - /* - * Check all-zeroes case. Luckily BLCKSZ is guaranteed to always be a - * multiple of size_t - and it's much faster to compare memory using the - * native word size. - */ - StaticAssertStmt(BLCKSZ == (BLCKSZ / sizeof(size_t)) * sizeof(size_t), - "BLCKSZ has to be a multiple of sizeof(size_t)"); - - all_zeroes = true; - pagebytes = (size_t *) page; - for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++) - { - if (pagebytes[i] != 0) - { - all_zeroes = false; - break; - } - } - - if (all_zeroes) + /* Check all-zeroes case */ + if (IsAllZero((char *) page, BLCKSZ)) return true; /* diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile index 2b95cb0df1..0601ea7894 100644 --- a/src/backend/storage/smgr/Makefile +++ b/src/backend/storage/smgr/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/smgr top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = md.o smgr.o smgrtype.o +OBJS = encryption.o md.o smgr.o smgrtype.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/smgr/encryption.c b/src/backend/storage/smgr/encryption.c new file mode 100644 index 0000000000..cef36b1f5a --- /dev/null +++ b/src/backend/storage/smgr/encryption.c @@ -0,0 +1,814 @@ +/*------------------------------------------------------------------------- + * + * encryption.c + * This code handles encryption and decryption of data. + * + * Encryption is done by extension modules loaded by encryption_library GUC. + * The extension module must register itself and provide a cryptography + * implementation. Key setup is left to the extension module. + * + * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/storage/smgr/encryption.c + * + * NOTES + * This file is compiled as both front-end and backend code, so it + * may not use ereport, server-defined static variables, etc. 
+ * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "common/fe_memutils.h" +#include "common/sha2.h" +#include "common/string.h" +#include "catalog/pg_control.h" +#ifndef FRONTEND +#include "pgstat.h" +#endif /* FRONTEND */ +#include "storage/bufpage.h" +#include "storage/encryption.h" +#include "storage/fd.h" +#include "utils/memutils.h" +#include "miscadmin.h" +#include "fmgr.h" +#include "port.h" + +#ifdef USE_ENCRYPTION +#include +#include +#include +#endif + +#ifdef USE_ENCRYPTION +unsigned char *encryption_key = NULL; +const char *encryption_key_prefix = "encryption_key="; +const char *encryption_pwd_prefix = "encryption_password="; +#endif + +bool data_encrypted = false; + +#ifdef USE_ENCRYPTION +char *encryption_key_command = NULL; +char *encryption_buffer = NULL; +Size encryption_buf_size = 0; + +static bool initialized = false; + +static void setup_encryption_internal(void); +static char *run_encryption_key_command(bool *is_key_p, size_t *len_p); +static void evp_error(void); + +/* + * Pointer to the KDF parameters. + * + * XXX Rename this and the write / read functions so they contain the + * 'keysetup' string? + */ +KDFParamsData *KDFParams = NULL; + +/* + * Initialize KDFParamsData and write it to a file. + * + * This is very similar to WriteControlFile(). + */ +#ifndef FRONTEND +extern void +write_kdf_file(void) +{ + KDFParamsPBKDF2 *params; + int i, + fd; + + StaticAssertStmt(sizeof(KDFParamsData) <= KDF_PARAMS_FILE_SIZE, + "kdf file is too large for atomic disk writes"); + + /* + * The initialization should not be repeated. 
+ */ + Assert(KDFParams == NULL); + + KDFParams = MemoryContextAllocZero(TopMemoryContext, + KDF_PARAMS_FILE_SIZE); + KDFParams->function = KDF_OPENSSL_PKCS5_PBKDF2_HMAC_SHA; + params = &KDFParams->data.pbkdf2; + params->niter = ENCRYPTION_KDF_NITER; + for (i = 0; i < ENCRYPTION_KDF_SALT_LEN; i++) + params->salt[i] = (unsigned char) random(); + + /* Contents are protected with a CRC */ + INIT_CRC32C(KDFParams->crc); + COMP_CRC32C(KDFParams->crc, + (char *) KDFParams, + offsetof(KDFParamsData, crc)); + FIN_CRC32C(KDFParams->crc); + + fd = BasicOpenFile(KDF_PARAMS_FILE, + O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not create key setup file \"%s\": %m", + KDF_PARAMS_FILE))); + + pgstat_report_wait_start(WAIT_EVENT_KDF_FILE_WRITE); + if (write(fd, KDFParams, KDF_PARAMS_FILE_SIZE) != KDF_PARAMS_FILE_SIZE) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to key setup file: %m"))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_KDF_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync key setup file: %m"))); + pgstat_report_wait_end(); + + if (close(fd)) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close key setup file: %m"))); +} + +/* + * Read KDFParamsData from file and store it in local memory. + * + * postmaster should call the function early enough for any other process to + * inherit valid pointer to the data. 
+ */ +extern void +read_kdf_file(void) +{ + pg_crc32c crc; + int fd; + + KDFParams = MemoryContextAllocZero(TopMemoryContext, + KDF_PARAMS_FILE_SIZE); + + fd = BasicOpenFile(KDF_PARAMS_FILE, O_RDONLY | PG_BINARY); + + if (fd < 0) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open key setup file \"%s\": %m", + KDF_PARAMS_FILE))); + } + + pgstat_report_wait_start(WAIT_EVENT_KDF_FILE_READ); + + if (read(fd, KDFParams, sizeof(KDFParamsData)) != sizeof(KDFParamsData)) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read from key setup file: %m"))); + } + + pgstat_report_wait_end(); + + close(fd); + + /* Now check the CRC. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, + (char *) KDFParams, + offsetof(KDFParamsData, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, KDFParams->crc)) + ereport(FATAL, + (errmsg("incorrect checksum in key setup file"))); + + if (KDFParams->function != KDF_OPENSSL_PKCS5_PBKDF2_HMAC_SHA) + ereport(FATAL, + (errmsg("unsupported KDF function %d", KDFParams->function))); +} +#endif /* FRONTEND */ +#endif /* USE_ENCRYPTION */ + +/* + * Encrypts a fixed value into *buf to verify that encryption key is correct. + * Caller provided buf needs to be able to hold at least ENCRYPTION_SAMPLE_SIZE + * bytes. + */ +void +sample_encryption(char *buf) +{ +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + int i; + + for (i = 0; i < TWEAK_SIZE; i++) + tweak[i] = i; + + encrypt_block("postgresqlcrypt", buf, ENCRYPTION_SAMPLE_SIZE, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ +} + +/* + * Encrypts one block of data with a specified tweak value. May only be called + * when encryption_enabled is true. + * + * Input and output buffer may point to the same location. + * + * "size" must be a (non-zero) multiple of ENCRYPTION_BLOCK. + * + * "tweak" value must be TWEAK_SIZE bytes long. 
+ * + * All-zero blocks are not encrypted to correctly handle relation extension, + * and also to simplify handling of holes created by seek past EOF and + * consequent write (see buffile.c). + */ +void +encrypt_block(const char *input, char *output, Size size, const char *tweak) +{ +#ifdef USE_ENCRYPTION + Assert(size >= ENCRYPTION_BLOCK && size % ENCRYPTION_BLOCK == 0); + Assert(initialized); + + /* + * The EVP API does not seem to expect the output buffer to be equal to + * the input. Ensure that we pass separate pointers. + */ + if (input == output) + { + if (size > encryption_buf_size) + enlarge_encryption_buffer(size); + + memcpy(encryption_buffer, input, size); + input = encryption_buffer; + } + + if (IsAllZero(input, size)) + memset(output, 0, size); + else + { + int out_size; + EVP_CIPHER_CTX *ctx; + + if ((ctx = EVP_CIPHER_CTX_new()) == NULL) + evp_error(); + + if (EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, encryption_key, + (unsigned char *) tweak) != 1) + evp_error(); + + /* + * No padding is needed, the input block size should already be a + * multiple of ENCRYPTION_BLOCK_OPENSSL. + */ + EVP_CIPHER_CTX_set_padding(ctx, 0); + + Assert(EVP_CIPHER_CTX_block_size(ctx) == ENCRYPTION_BLOCK_OPENSSL); + Assert(EVP_CIPHER_CTX_iv_length(ctx) == TWEAK_SIZE); + Assert(EVP_CIPHER_CTX_key_length(ctx) == ENCRYPTION_KEY_LENGTH); + + /* + * Do the actual encryption. As the padding is disabled, + * EVP_EncryptFinal_ex() won't be needed. + */ + if (EVP_EncryptUpdate(ctx, (unsigned char *) output, &out_size, + (unsigned char *) input, size) != 1) + evp_error(); + + /* + * The input size is a multiple of ENCRYPTION_BLOCK_OPENSSL, so the + * output of AES-XTS should meet this condition. + */ + Assert(out_size == size); + + /* Free the context; EVP_CIPHER_CTX_cleanup() alone would leak it. */ + EVP_CIPHER_CTX_free(ctx); + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ +} + +/* + * Decrypts one block of data with a specified tweak value.
May only be called + * when encryption_enabled is true. + * + * Input and output buffer may point to the same location. + * + * "size" must be a (non-zero) multiple of ENCRYPTION_BLOCK. + * + * "tweak" value must be TWEAK_SIZE bytes long. + * + * All-zero blocks are not decrypted to correctly handle relation extension, + * and also to simplify handling of holes created by seek past EOF and + * consequent write (see buffile.c). + */ +void +decrypt_block(const char *input, char *output, Size size, const char *tweak) +{ +#ifdef USE_ENCRYPTION + Assert(size >= ENCRYPTION_BLOCK && size % ENCRYPTION_BLOCK == 0); + Assert(initialized); + + /* + * The EVP API does not seem to expect the output buffer to be equal to + * the input. Ensure that we pass separate pointers. + */ + if (input == output) + { + if (size > encryption_buf_size) + enlarge_encryption_buffer(size); + + memcpy(encryption_buffer, input, size); + input = encryption_buffer; + } + + if (IsAllZero(input, size)) + memset(output, 0, size); + else + { + int out_size; + EVP_CIPHER_CTX *ctx; + + if ((ctx = EVP_CIPHER_CTX_new()) == NULL) + evp_error(); + + if (EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, encryption_key, + (unsigned char *) tweak) != 1) + evp_error(); + + /* The same considerations apply below as those in encrypt_block(). */ + EVP_CIPHER_CTX_set_padding(ctx, 0); + Assert(EVP_CIPHER_CTX_block_size(ctx) == ENCRYPTION_BLOCK_OPENSSL); + Assert(EVP_CIPHER_CTX_iv_length(ctx) == TWEAK_SIZE); + Assert(EVP_CIPHER_CTX_key_length(ctx) == ENCRYPTION_KEY_LENGTH); + + if (EVP_DecryptUpdate(ctx, (unsigned char *) output, &out_size, + (unsigned char *) input, size) != 1) + evp_error(); + + Assert(out_size == size); + + /* Free the context; EVP_CIPHER_CTX_cleanup() alone would leak it. */ + EVP_CIPHER_CTX_free(ctx); + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ +} + +/* + * Report an error in a universal way so that caller does not have to care + * whether it executes in backend or front-end.
+ */ +void +encryption_error(bool fatal, char *message) +{ +#ifndef FRONTEND + elog(fatal ? FATAL : INFO, "%s", message); +#else + fprintf(stderr, "%s\n", message); + if (fatal) + exit(EXIT_FAILURE); +#endif +} + +/* + * Initialize encryption subsystem for use. Must be called before any + * encryptable data is read from or written to data directory. + */ +void +setup_encryption(bool bootstrap) +{ +#ifdef USE_ENCRYPTION + char *credentials; + bool is_key; + size_t len; + + credentials = run_encryption_key_command(&is_key, &len); + + /* + * Setup KDF if we need to derive the key from a password. + * + * Front-ends always need the key because some of them are not aware of + * the data directory and thus they'd need one more command line option to + * find the key setup file. + */ +#ifndef FRONTEND + if (!is_key) + { + if (bootstrap) + { + write_kdf_file(); + } + else + read_kdf_file(); + } +#endif /* FRONTEND */ + + setup_encryption_key(credentials, is_key, len); + pfree(credentials); + setup_encryption_internal(); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ +} + +#ifdef USE_ENCRYPTION +static void +setup_encryption_internal(void) +{ + /* + * Setup OpenSSL. + * + * None of these functions should return a value or raise error. + */ + ERR_load_crypto_strings(); + OpenSSL_add_all_algorithms(); + OPENSSL_config(NULL); + + /* + * It makes no sense to initialize the encryption multiple times. + */ + Assert(!initialized); + + initialized = true; +} +#endif /* USE_ENCRYPTION */ + +/* + * If credentials is a key, just copy it to encryption_key. If it's a + * password, derive the key from it. + */ +void +setup_encryption_key(char *credentials, bool is_key, size_t len) +{ +#ifdef USE_ENCRYPTION + Assert(credentials != NULL); + Assert(encryption_key == NULL); + + /* + * Although the function should be called in the TopMemoryContext, we + * should be pretty sure that the key does not become garbage due to + * pfree(). 
The cluster should crash in such a case, but if it did not
+	 * happen immediately, some data could be encrypted using an invalid key
+	 * and therefore become lost.
+	 */
+#ifndef FRONTEND
+	encryption_key = (unsigned char *) MemoryContextAllocZero(TopMemoryContext,
+			ENCRYPTION_KEY_LENGTH);
+#else
+	encryption_key = (unsigned char *) palloc(ENCRYPTION_KEY_LENGTH);
+#endif /* FRONTEND */
+	if (is_key)
+	{
+		/*
+		 * The key has a fixed size. This must be a comparison: the former
+		 * "len = ..." assignment made the assertion always true and
+		 * silently overwrote the caller-supplied length.
+		 */
+		Assert(len == ENCRYPTION_KEY_LENGTH);
+		memcpy(encryption_key, credentials, len);
+	}
+	else
+	{
+		KDFParamsPBKDF2 *params;
+		int		rc;
+
+		/*
+		 * The file contains password so we need the KDF parameters to turn it
+		 * to key.
+		 */
+		if (KDFParams == NULL)
+		{
+#ifndef FRONTEND
+			ereport(FATAL,
+					(errmsg("this instance does not accept encryption password"),
+					 errdetail("Encryption key was probably used to initialize the instance.")));
+#else
+			encryption_error(true,
+					"this instance does not accept encryption password.\n"
+					"Encryption key was probably used to initialize the instance.\n");
+#endif /* FRONTEND */
+		}
+
+		/*
+		 * Turn the password into the encryption key.
+		 */
+		params = &KDFParams->data.pbkdf2;
+		rc = PKCS5_PBKDF2_HMAC(credentials,
+				len,
+				params->salt,
+				ENCRYPTION_KDF_SALT_LEN,
+				params->niter,
+				EVP_sha(),
+				ENCRYPTION_KEY_LENGTH,
+				encryption_key);
+
+		if (rc != 1)
+		{
+#ifndef FRONTEND
+			ereport(FATAL,
+					(errmsg("failed to derive key from password")));
+#else
+			encryption_error(true, "failed to derive key from password");
+#endif /* FRONTEND */
+		}
+	}
+
+#else /* USE_ENCRYPTION */
+	ENCRYPTION_NOT_SUPPORTED_MSG;
+#endif /* USE_ENCRYPTION */
+}
+
+#ifdef USE_ENCRYPTION
+void
+enlarge_encryption_buffer(Size new_size)
+{
+	Assert(new_size > 0);
+
+	/*
+	 * Shrinkage is not the use case for this routine.
+	 */
+	if (new_size <= encryption_buf_size)
+		return;
+
+	/*
+	 * Allocate a new chunk if nothing is there yet, else reallocate the
+	 * existing one.
+	 */
+	if (encryption_buf_size == 0)
+#ifndef FRONTEND
+		encryption_buffer = (char *) MemoryContextAlloc(TopMemoryContext,
+				new_size);
+#else
+		encryption_buffer = (char *) palloc(new_size);
+#endif /* FRONTEND */
+	else
+		encryption_buffer = (char *) repalloc(encryption_buffer, new_size);
+	encryption_buf_size = new_size;
+}
+
+/*
+ * Run the command stored in encryption_key_command and return the key or
+ * password.
+ *
+ * *is_key_p receives true if the command returns key, false if it's
+ * password. *len_p receives length of the data.
+ *
+ * The command's output must start with either encryption_key_prefix or
+ * encryption_pwd_prefix, followed by the hex-encoded key or the password
+ * and exactly one trailing newline. Anything else is reported through
+ * encryption_error(true, ...). The result is palloc'd; the caller is
+ * expected to free it.
+ */
+static char *
+run_encryption_key_command(bool *is_key_p, size_t *len_p)
+{
+	FILE	   *fp;
+	char	   *buf,
+		   *result;
+	bool		is_key;
+	size_t		key_pref_len,
+			pwd_pref_len,
+			read_len,
+			bytes_read;
+	size_t		buf_size,
+			result_size;
+	int		i;
+
+	if (encryption_key_command == NULL ||
+		!strlen(encryption_key_command))
+	{
+		/*
+		 * encryption_key_command should have been set by initdb. It's weird
+		 * if it was not, but there's no better recommendation we can give the
+		 * user.
+		 */
+#ifndef FRONTEND
+		ereport(FATAL,
+				(errmsg("encryption key not provided"),
+				 errdetail("The database cluster was initialized with encryption"
+						" but the server was started without an encryption key."),
+				 errhint("Set the encryption_key_command configuration variable.")));
+#else /* FRONTEND */
+		encryption_error(true,
+				"The database cluster was initialized with encryption"
+				" but the server was started without an encryption key. "
+				"Set the encryption_key_command configuration variable.\n");
+#endif /* FRONTEND */
+	}
+
+	encryption_error(false,
+			psprintf("Executing \"%s\" to set up encryption key",
+				encryption_key_command));
+
+	fp = popen(encryption_key_command, "r");	/* the command's stdout is read below */
+	if (fp == NULL)
+		encryption_error(true,
+				psprintf("Failed to execute encryption_key_command \"%s\"",
+					encryption_key_command));
+
+	/*
+	 * Check which prefix the file starts with.
+	 *
+	 * The prefixes probably won't change after the release but they might
+	 * change during development. The reading logic should be generic so that
+	 * change of prefix length requires no additional coding.
+	 */
+	key_pref_len = strlen(encryption_key_prefix);
+	pwd_pref_len = strlen(encryption_pwd_prefix);
+
+	/*
+	 * The buffer must accommodate either prefix.
+	 */
+	buf_size = Max(key_pref_len, pwd_pref_len);
+	buf = (char *) palloc(buf_size);
+
+	/*
+	 * Read as few bytes as necessary so that we don't have to move back in
+	 * the buffer if the first comparison does not match.
+	 */
+	read_len = Min(key_pref_len, pwd_pref_len);
+
+	if (fread(buf, 1, read_len, fp) != read_len)
+		encryption_error(true, "Not enough data received from encryption_key_command");
+
+	if (read_len == key_pref_len &&
+		strncmp(buf, encryption_key_prefix, key_pref_len) == 0)
+		is_key = true;
+	else if (read_len == pwd_pref_len &&
+			 strncmp(buf, encryption_pwd_prefix, pwd_pref_len) == 0)
+		is_key = false;
+	else if (buf_size > read_len)
+	{
+		size_t		len_diff;
+
+		/*
+		 * Read enough data so that one of the prefixes must match.
+		 */
+		len_diff = buf_size - read_len;
+		if (fread(buf + read_len, 1, len_diff, fp) != len_diff)
+			encryption_error(true,
+					"Not enough data received from encryption_key_command");
+		read_len += len_diff;
+
+		/*
+		 * Try to match the prefixes again.
+		 */
+		if (read_len == key_pref_len &&
+			strncmp(buf, encryption_key_prefix, key_pref_len) == 0)
+			is_key = true;
+		else if (read_len == pwd_pref_len &&
+				 strncmp(buf, encryption_pwd_prefix, pwd_pref_len) == 0)
+			is_key = false;
+		else
+			encryption_error(true,
+					"Unknown data received from encryption_key_command");
+	}
+	else
+	{
+		/*
+		 * Both prefixes have the same length and neither matched. Without
+		 * this branch is_key would be used uninitialized below.
+		 */
+		encryption_error(true,
+				"Unknown data received from encryption_key_command");
+	}
+
+	*is_key_p = is_key;
+
+	/*
+	 * Read the actual credentials.
+	 */
+	read_len = is_key ? ENCRYPTION_KEY_LENGTH * 2 : ENCRYPTION_PWD_MAX_LENGTH;
+
+	/*
+	 * One extra character can be read and this should be nothing but line
+	 * delimiter, see below.
+	 */
+	read_len++;
+
+	if (read_len > buf_size)
+	{
+		buf = (char *) repalloc(buf, read_len);
+		buf_size = read_len;
+	}
+
+	bytes_read = fread(buf, 1, read_len, fp);
+
+	if ((is_key && bytes_read < read_len) ||
+		(!is_key && bytes_read < (ENCRYPTION_PWD_MIN_LENGTH + 1)))
+	{
+		if (feof(fp))
+			encryption_error(true,
+					"Not enough data provided by encryption_key_command");
+		else
+			encryption_error(true,
+					psprintf("encryption_key_command returned error code %d",
+						ferror(fp)));
+	}
+	Assert(bytes_read > 0);
+
+	if (buf[bytes_read - 1] != '\n')
+		encryption_error(true, "Encryption key is too long.");
+
+	/*
+	 * The credentials must not contain line delimiter.
+	 *
+	 * If that was allowed, the line delimiter would have to terminate
+	 * password even if more characters followed, however those extra
+	 * characters would be useless. Since distinguishing of the special (still
+	 * legal) case "password\n" would make the checks less simple and
+	 * since the delimiter itself is useless in the case of a key (because the
+	 * key has constant length), let's prohibit the line delimiters anywhere
+	 * except for the last position.
+	 */
+	for (i = 0; i < bytes_read; i++)
+		if (buf[i] == '\n' && i < (bytes_read - 1))
+			encryption_error(true,
+					"Neither password nor key may contain line break.");
+
+	/*
+	 * For a key the result size is different from the amount of data read.
+	 */
+	result_size = is_key ? ENCRYPTION_KEY_LENGTH : ENCRYPTION_PWD_MAX_LENGTH;
+
+	result = (char *) palloc(result_size);
+
+	if (is_key)
+	{
+		int		i;
+
+		/*
+		 * Decode the hexadecimal key by hand. "buf" was filled by fread()
+		 * and is not NUL-terminated, so it must not be handed to sscanf().
+		 * Hand decoding also rejects whitespace and sign characters that
+		 * "%2hhx" would accept.
+		 */
+		for (i = 0; i < ENCRYPTION_KEY_LENGTH; i++)
+		{
+			unsigned char byte = 0;
+			int		j;
+
+			for (j = 0; j < 2; j++)
+			{
+				char		c = buf[2 * i + j];
+				int		digit = 0;
+
+				if (c >= '0' && c <= '9')
+					digit = c - '0';
+				else if (c >= 'a' && c <= 'f')
+					digit = c - 'a' + 10;
+				else if (c >= 'A' && c <= 'F')
+					digit = c - 'A' + 10;
+				else
+					encryption_error(true,
+							psprintf("Invalid character in encryption key at position %d",
+								2 * i + j));
+
+				byte = (unsigned char) ((byte << 4) | digit);
+			}
+			result[i] = (char) byte;
+		}
+		*len_p = ENCRYPTION_KEY_LENGTH;
+	}
+	else
+	{
+		/*
+		 * Ignore the line delimiter.
+		 */
+		*len_p = bytes_read - 1;
+		memcpy(result, buf, *len_p);
+
+	}
+
+	/*
+	 * No extra data is allowed.
+	 */
+	if (fread(buf, 1, 1, fp) > 0)
+		encryption_error(true,
+				"Credentials are followed by useless data");
+
+
+	pfree(buf);
+	pclose(fp);
+
+	return result;
+}
+#endif /* USE_ENCRYPTION */
+
+/*
+ * Error callback for openssl.
+ *
+ * Prints the queued OpenSSL errors to stderr and then terminates: FATAL in
+ * the backend, exit(EXIT_FAILURE) in front-end programs.
+ */
+#ifdef USE_ENCRYPTION
+static void
+evp_error(void)
+{
+	ERR_print_errors_fp(stderr);
+#ifndef FRONTEND
+
+	/*
+	 * FATAL is the appropriate level because backend can hardly fix anything
+	 * if encryption / decryption has failed.
+	 *
+	 * XXX Do we yet need EVP_CIPHER_CTX_cleanup() here?
+	 */
+	elog(FATAL, "OpenSSL encountered error during encryption or decryption.");
+#else
+	/* Terminate the line, as encryption_error() does for its messages. */
+	fprintf(stderr,
+			"OpenSSL encountered error during encryption or decryption.\n");
+	exit(EXIT_FAILURE);
+#endif /* FRONTEND */
+}
+#endif /* USE_ENCRYPTION */
+
+/*
+ * Xlog is encrypted page at a time. Each xlog page gets a unique tweak via
+ * segment and offset. Unfortunately we can't include timeline because
+ * exitArchiveRecovery() can copy part of the last segment of the old timeline
+ * into the first segment of the new timeline.
+ *
+ * TODO Consider teaching exitArchiveRecovery() to decrypt the copied pages
+ * and encrypt them using a tweak that mentions the new timeline.
+ *
+ * The function is located here rather than some of the xlog*.c modules so
+ * that front-end applications can easily use it too.
+ */ +void +XLogEncryptionTweak(char *tweak, XLogSegNo segment, uint32 offset) +{ +#ifdef USE_ENCRYPTION + memset(tweak, 0, TWEAK_SIZE); + memcpy(tweak, &segment, sizeof(XLogSegNo)); + memcpy(tweak + sizeof(XLogSegNo), &offset, sizeof(offset)); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ +} diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2ec103e604..f831a52e18 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -32,6 +32,7 @@ #include "postmaster/bgwriter.h" #include "storage/fd.h" #include "storage/bufmgr.h" +#include "storage/encryption.h" #include "storage/relfilenode.h" #include "storage/smgr.h" #include "utils/hsearch.h" @@ -112,7 +113,14 @@ typedef struct _MdfdVec } MdfdVec; static MemoryContext MdCxt; /* context for all MdfdVec objects */ - +#ifdef USE_ENCRYPTION +/* + * encryption_buffer from encryption.h is not used here because of the special + * memory context. + */ +static char *md_encryption_buffer; +static char *md_encryption_tweak; +#endif /* * In some contexts (currently, standalone backends and the checkpointer) @@ -197,6 +205,11 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); +#ifdef USE_ENCRYPTION +static void mdtweak(char *tweak, RelFileNode *relnode, ForkNumber forknum, BlockNumber blocknum); +static void mdencrypt(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); +static void mddecrypt(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); +#endif /* * mdinit() -- Initialize private state for magnetic disk storage manager. 
@@ -241,6 +254,11 @@ mdinit(void) HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); pendingUnlinks = NIL; } + +#ifdef USE_ENCRYPTION + md_encryption_buffer = MemoryContextAllocZero(MdCxt, BLCKSZ); + md_encryption_tweak = MemoryContextAllocZero(MdCxt, TWEAK_SIZE); +#endif } /* @@ -536,6 +554,16 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, errmsg("could not seek to block %u in file \"%s\": %m", blocknum, FilePathName(v->mdfd_vfd)))); + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + mdencrypt(reln, forknum, blocknum, buffer); + buffer = md_encryption_buffer; +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { if (nbytes < 0) @@ -733,6 +761,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, off_t seekpos; int nbytes; MdfdVec *v; + char *buffer_read = buffer; TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -753,7 +782,15 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, errmsg("could not seek to block %u in file \"%s\": %m", blocknum, FilePathName(v->mdfd_vfd)))); - nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ); + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + buffer_read = md_encryption_buffer; +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + nbytes = FileRead(v->mdfd_vfd, buffer_read, BLCKSZ, WAIT_EVENT_DATA_FILE_READ); TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -788,6 +825,14 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum, FilePathName(v->mdfd_vfd), nbytes, BLCKSZ))); } + else if (data_encrypted) + { +#ifdef USE_ENCRYPTION + mddecrypt(reln, forknum, blocknum, buffer); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } } /* @@ -829,6 +874,15 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, 
BlockNumber blocknum, errmsg("could not seek to block %u in file \"%s\": %m", blocknum, FilePathName(v->mdfd_vfd)))); + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + mdencrypt(reln, forknum, blocknum, buffer); + buffer = md_encryption_buffer; +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE); TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, @@ -1950,3 +2004,58 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) /* note that this calculation will ignore any partial block at EOF */ return (BlockNumber) (len / BLCKSZ); } + +#ifdef USE_ENCRYPTION +/* + * md files are encrypted block at a time. Tweak will alias higher numbered + * forks for huge tables. + */ +static void +mdtweak(char *tweak, RelFileNode *relnode, ForkNumber forknum, BlockNumber blocknum) +{ + uint32 fork_and_block = (forknum << 24) ^ blocknum; + + memcpy(tweak, relnode, sizeof(RelFileNode)); + memcpy(tweak + sizeof(RelFileNode), &fork_and_block, 4); +} + +static void +mdencrypt(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +{ + mdtweak(md_encryption_tweak, &(reln->smgr_rnode.node), forknum, blocknum); + encrypt_block(buffer, md_encryption_buffer, BLCKSZ, md_encryption_tweak); +} + +static void +mddecrypt(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *dest) +{ + mdtweak(md_encryption_tweak, &(reln->smgr_rnode.node), forknum, blocknum); + decrypt_block(md_encryption_buffer, dest, BLCKSZ, md_encryption_tweak); +} + +/* + * Copying relations between tablespaces/databases means that the tweak values + * of each block will change. This function transcodes a series of blocks with + * new tweak values. Returns the new block number for convenience. 
+ */ +BlockNumber +ReencryptBlock(char *buffer, int blocks, + RelFileNode *srcNode, RelFileNode *dstNode, + ForkNumber srcForkNum, ForkNumber dstForkNum, + BlockNumber blockNum) +{ + char *cur; + char srcTweak[TWEAK_SIZE]; + char dstTweak[TWEAK_SIZE]; + + for (cur = buffer; cur < buffer + blocks * BLCKSZ; cur += BLCKSZ) + { + mdtweak(srcTweak, srcNode, srcForkNum, blockNum); + mdtweak(dstTweak, dstNode, dstForkNum, blockNum); + decrypt_block(cur, cur, BLCKSZ, srcTweak); + encrypt_block(cur, cur, BLCKSZ, dstTweak); + blockNum++; + } + return blockNum; +} +#endif /* USE_ENCRYPTION */ diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index f4133953be..d946729d17 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -63,6 +63,7 @@ #include "replication/walsender.h" #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" +#include "storage/encryption.h" #include "storage/ipc.h" #include "storage/proc.h" #include "storage/procsignal.h" @@ -3401,7 +3402,7 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx, * postmaster/postmaster.c (the option sets should not conflict) and with * the common help() function in main/main.c. 
*/ - while ((flag = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:v:W:-:")) != -1) + while ((flag = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijK:k:lN:nOo:Pp:r:S:sTt:v:W:-:")) != -1) { switch (flag) { @@ -3459,6 +3460,12 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx, UseSemiNewlineNewline = true; break; +#ifdef USE_OPENSSL + case 'K': + encryption_key_command = strdup(optarg); + break; +#endif /* USE_OPENSSL */ + case 'k': SetConfigOption("unix_socket_directories", optarg, ctx, gucsource); break; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 03b28c3604..1c878baf88 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1510,6 +1510,7 @@ ValidatePgVersion(const char *path) * GUC variables: lists of library names to be preloaded at postmaster * start and at backend start */ +char *encryption_library_string = NULL; char *session_preload_libraries_string = NULL; char *shared_preload_libraries_string = NULL; char *local_preload_libraries_string = NULL; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 859ef931e7..a9d86c6ea2 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -69,6 +69,7 @@ #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/dsm_impl.h" +#include "storage/encryption.h" #include "storage/standby.h" #include "storage/fd.h" #include "storage/large_object.h" @@ -193,6 +194,9 @@ static bool check_cluster_name(char **newval, void **extra, GucSource source); static const char *show_unix_socket_permissions(void); static const char *show_log_file_mode(void); static const char *show_data_directory_mode(void); +#ifdef USE_OPENSSL +static const char *show_encryption_key_command(void); +#endif /* USE_OPENSSL */ /* Private functions in guc-file.l that need to be called from guc.c */ static ConfigVariable *ProcessConfigFileInternal(GucContext context, @@ -1717,6 +1721,17 @@ 
static struct config_bool ConfigureNamesBool[] =
 	},
 
 	{
+		{"data_encryption", PGC_INTERNAL, PRESET_OPTIONS,
+			gettext_noop("Shows whether data encryption is turned on for this cluster."),
+			NULL,
+			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
+		},
+		&data_encrypted,
+		false,
+		NULL, NULL, NULL
+	},
+
+	{
 		{"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE,
 			gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."),
 			NULL
@@ -3882,6 +3897,19 @@ static struct config_string ConfigureNamesString[] =
 		NULL, NULL, NULL
 	},
 
+#ifdef USE_OPENSSL
+	{
+		{"encryption_key_command", PGC_POSTMASTER, 0,
+			gettext_noop("Sets the shell command that will be called to fetch database encryption key."),
+			NULL,
+			GUC_NOT_IN_SAMPLE	/* no GUC_IS_NAME: it would truncate the command to NAMEDATALEN-1 bytes */
+		},
+		&encryption_key_command,
+		NULL,
+		NULL, NULL, show_encryption_key_command
+	},
+#endif /* USE_OPENSSL */
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL
@@ -10784,4 +10812,15 @@ show_data_directory_mode(void)
 	return buf;
 }
 
+#ifdef USE_OPENSSL
+static const char *
+show_encryption_key_command(void)
+{
+	if (encryption_key_command)
+		return encryption_key_command;
+	else
+		return "(disabled)";
+}
+#endif /* USE_OPENSSL */
+
 #include "guc-file.c"
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 9e39baf466..50a22b15a9 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -88,6 +88,7 @@
 #authentication_timeout = 1min # 1s-600s
 #password_encryption = md5 # md5 or scram-sha-256
 #db_user_namespace = off
+#encryption_key_command = ''
 
 # GSSAPI using Kerberos
 #krb_server_keyfile = ''
diff --git a/src/bin/Makefile b/src/bin/Makefile
index 8c11060a2f..431c406a65 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -21,6 +21,7 @@ SUBDIRS = \
 	pg_controldata \
 	pg_ctl \
 	pg_dump \
+	pg_keysetup \
 	pg_resetwal \
 	pg_rewind \
 	pg_test_fsync \
diff --git a/src/bin/initdb/Makefile
b/src/bin/initdb/Makefile index 8c23941930..ea7af9144f 100644 --- a/src/bin/initdb/Makefile +++ b/src/bin/initdb/Makefile @@ -26,7 +26,7 @@ ifneq (,$(with_system_tzdata)) override CPPFLAGS += '-DSYSTEMTZDIR="$(with_system_tzdata)"' endif -OBJS= initdb.o findtimezone.o localtime.o encnames.o $(WIN32RES) +OBJS= initdb.o findtimezone.o localtime.o encnames.o encryption.o $(WIN32RES) all: initdb @@ -45,6 +45,9 @@ encnames.c: % : $(top_srcdir)/src/backend/utils/mb/% localtime.c: % : $(top_srcdir)/src/timezone/% rm -f $@ && $(LN_S) $< . +encryption.c: % : $(top_srcdir)/src/backend/storage/smgr/% + rm -f $@ && $(LN_S) $< . + install: all installdirs $(INSTALL_PROGRAM) initdb$(X) '$(DESTDIR)$(bindir)/initdb$(X)' @@ -55,7 +58,7 @@ uninstall: rm -f '$(DESTDIR)$(bindir)/initdb$(X)' clean distclean maintainer-clean: - rm -f initdb$(X) $(OBJS) encnames.c localtime.c + rm -f initdb$(X) $(OBJS) encnames.c localtime.c encryption.c rm -rf tmp_check # ensure that changes in datadir propagate into object file diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index ae22e7d9fb..e828f27bc3 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -68,6 +68,9 @@ #include "common/restricted_token.h" #include "common/username.h" #include "fe_utils/string_utils.h" +#include "storage/encryption.h" +#include "lib/ilist.h" +#include "mb/pg_wchar.h" #include "getaddrinfo.h" #include "getopt_long.h" #include "mb/pg_wchar.h" @@ -77,6 +80,12 @@ /* Ideally this would be in a .h file, but it hardly seems worth the trouble */ extern const char *select_default_timezone(const char *share_path); +typedef struct +{ + dlist_node list_node; + char *value; +} extra_option; + static const char *const auth_methods_host[] = { "trust", "reject", "scram-sha-256", "md5", "password", "ident", "radius", #ifdef ENABLE_GSS @@ -141,9 +150,11 @@ static bool do_sync = true; static bool sync_only = false; static bool show_setting = false; static bool data_checksums = false; +static char 
*encr_key_cmd_str = NULL; static char *xlog_dir = NULL; static char *str_wal_segment_size_mb = NULL; static int wal_segment_size_mb; +static dlist_head extra_options = DLIST_STATIC_INIT(extra_options); /* internal vars */ @@ -529,6 +540,7 @@ readfile(const char *path) fclose(infile); free(buffer); + result[n] = NULL; return result; @@ -1070,6 +1082,39 @@ pretty_wal_size(int segment_count) return result; } +static void +append_extra_options(char ***conflines) +{ + dlist_iter iter; + int n_extra = 0; + int n_current = 0; + int i = 0; + char **new_conflines; + + dlist_foreach(iter, &extra_options) + { + n_extra++; + } + while ((*conflines)[i++] != NULL) + n_current++; + + new_conflines = (char **) pg_malloc((n_current + n_extra + 1) * sizeof(char *)); + for (i = 0; i < n_current; i++) + new_conflines[i] = (*conflines)[i]; + + dlist_foreach(iter, &extra_options) + { + extra_option *opt = dlist_container(extra_option, list_node, iter.cur); + + new_conflines[i++] = opt->value; + } + + new_conflines[i] = NULL; + pg_free(*conflines); + *conflines = new_conflines; +} + + /* * set up all the config files */ @@ -1089,6 +1134,8 @@ setup_config(void) conflines = readfile(conf_file); + append_extra_options(&conflines); + snprintf(repltok, sizeof(repltok), "max_connections = %d", n_connections); conflines = replace_token(conflines, "#max_connections = 100", repltok); @@ -1231,6 +1278,17 @@ setup_config(void) "log_file_mode = 0640"); } +#ifdef USE_OPENSSL + if (encryption_key_command) + { + snprintf(repltok, sizeof(repltok), "encryption_key_command = '%s'", + encryption_key_command); + conflines = replace_token(conflines, + "#encryption_key_command = ''", repltok); + } +#endif /* USE_OPENSSL */ + + snprintf(path, sizeof(path), "%s/postgresql.conf", pg_data); writefile(path, conflines); @@ -1448,11 +1506,29 @@ bootstrap_template1(void) /* Also ensure backend isn't confused by this environment var: */ unsetenv("PGCLIENTENCODING"); +#ifdef USE_OPENSSL + /* Prepare the -K option for 
the backend. */
+	if (encryption_key_command)
+	{
+		size_t		len;
+
+		len = 3 + strlen(encryption_key_command) + 1;
+		encr_key_cmd_str = (char *) pg_malloc(len);
+		snprintf(encr_key_cmd_str, len, "-K %s", encryption_key_command);
+	}
+	else
+	{
+		encr_key_cmd_str = (char *) pg_malloc(1);
+		encr_key_cmd_str[0] = '\0';
+	}
+#endif /* USE_OPENSSL */
+
 	snprintf(cmd, sizeof(cmd),
-			 "\"%s\" --boot -x1 -X %u %s %s %s",
+			 "\"%s\" --boot -x1 -X %u %s %s %s %s",
 			 backend_exec,
 			 wal_segment_size_mb * (1024 * 1024),
 			 data_checksums ? "-k" : "",
+			 encr_key_cmd_str ? encr_key_cmd_str : "",	/* stays NULL when built without USE_OPENSSL */
 			 boot_options,
 			 debug ? "-d 5" : "");
 
@@ -2403,6 +2479,10 @@ usage(const char *progname)
 	printf(_("\nLess commonly used options:\n"));
 	printf(_(" -d, --debug generate lots of debugging output\n"));
 	printf(_(" -k, --data-checksums use data page checksums\n"));
+#ifdef USE_OPENSSL
+	printf(_(" -K, --encryption-key-command\n"
+			 " command that returns encryption key\n"));
+#endif /* USE_OPENSSL */
 	printf(_(" -L DIRECTORY where to find the input files\n"));
 	printf(_(" -n, --no-clean do not clean up after errors\n"));
 	printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n"));
@@ -2509,7 +2589,6 @@ setup_pgdata(void)
 	putenv(pgdata_set_env);
 }
 
-
 void
 setup_bin_paths(const char *argv0)
 {
@@ -3022,8 +3101,8 @@ initialize_data_directory(void)
 	fflush(stdout);
 
 	snprintf(cmd, sizeof(cmd),
-			 "\"%s\" %s template1 >%s",
-			 backend_exec, backend_options,
+			 "\"%s\" %s %s template1 >%s",
+			 backend_exec, backend_options, encr_key_cmd_str ? encr_key_cmd_str : "",
 			 DEVNULL);
 
 	PG_CMD_OPEN;
@@ -3064,6 +3143,23 @@ initialize_data_directory(void)
 	check_ok();
 }
 
+static void
+parse_extra_option_arg(char *optarg)
+{
+	extra_option *opt;
+
+	if (!strchr(optarg, '='))
+	{
+		fprintf(stderr, _("Option value is not in key=value format\n"));
+		exit(1);
+	}
+
+	opt = pg_malloc(sizeof(extra_option));	/* pg_malloc exits on OOM; plain malloc was unchecked */
+	if (asprintf(&opt->value, "%s\n", optarg) < 0)
+		exit(1);
+
+	dlist_push_tail(&extra_options, &opt->list_node);
+}
 
 int
 main(int argc, char *argv[])
@@ -3098,6 +3194,9 @@
main(int argc, char *argv[])
 		{"waldir", required_argument, NULL, 'X'},
 		{"wal-segsize", required_argument, NULL, 12},
 		{"data-checksums", no_argument, NULL, 'k'},
+#ifdef USE_OPENSSL
+		{"encryption-key-command", required_argument, NULL, 'K'},
+#endif /* USE_OPENSSL */
 		{"allow-group-access", no_argument, NULL, 'g'},
 		{NULL, 0, NULL, 0}
 	};
@@ -3140,7 +3239,7 @@ main(int argc, char *argv[])
 
 	/* process command-line options */
 
-	while ((c = getopt_long(argc, argv, "dD:E:kL:nNU:WA:sST:X:g", long_options, &option_index)) != -1)
+	while ((c = getopt_long(argc, argv, "c:dD:E:kK:L:nNU:WA:sST:X:g", long_options, &option_index)) != -1)
 	{
 		switch (c)
 		{
@@ -3192,6 +3291,14 @@ main(int argc, char *argv[])
 			case 'k':
 				data_checksums = true;
 				break;
+#ifdef USE_OPENSSL
+			case 'K':
+				encryption_key_command = pg_strdup(optarg);
+				break;
+#endif /* USE_OPENSSL */
+			case 'c':	/* extra "key=value" config line; needs "c:" in the option string */
+				parse_extra_option_arg(optarg);
+				break;
 			case 'L':
 				share_path = pg_strdup(optarg);
 				break;
@@ -3365,7 +3472,12 @@ main(int argc, char *argv[])
 	if (pwprompt || pwfilename)
 		get_su_pwd();
 
-	printf("\n");
+#ifdef USE_OPENSSL
+	if (encryption_key_command)
+		printf(_("Data encryption is enabled.\n"));
+	else
+		printf(_("Data encryption is disabled.\n"));
+#endif /* USE_OPENSSL */
 
 	initialize_data_directory();
 
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 895a51f89d..32c702a978 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -340,5 +340,14 @@ main(int argc, char *argv[])
 		   ControlFile->data_checksum_version);
 	printf(_("Mock authentication nonce: %s\n"),
 		   mock_auth_nonce_str);
+	printf(_("Data encryption: %s\n"),
+		   ControlFile->data_encrypted ?
_("on") : _("off")); + if (ControlFile->data_encrypted) + printf(_("Data encryption fingerprint: %08X%08X%08X%08X\n"), + htonl(((uint32 *) ControlFile->encryption_verification)[0]), + htonl(((uint32 *) ControlFile->encryption_verification)[1]), + htonl(((uint32 *) ControlFile->encryption_verification)[2]), + htonl(((uint32 *) ControlFile->encryption_verification)[3]) + ); return 0; } diff --git a/src/bin/pg_keysetup/Makefile b/src/bin/pg_keysetup/Makefile new file mode 100644 index 0000000000..c9fe1064d0 --- /dev/null +++ b/src/bin/pg_keysetup/Makefile @@ -0,0 +1,32 @@ +# src/bin/pg_keysetup/Makefile + +PGFILEDESC = "pg_keysetup - generate encryption key from a password" +PGAPPICON=win32 + +subdir = src/bin/pg_keysetup +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = pg_keysetup.o $(RMGRDESCOBJS) encryption.o $(WIN32RES) + +override CPPFLAGS := -DFRONTEND $(CPPFLAGS) + +all: pg_keysetup + +pg_keysetup: $(OBJS) | submake-libpgport + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + +encryption.c: % : $(top_srcdir)/src/backend/storage/smgr/% + rm -f $@ && $(LN_S) $< . + +install: all installdirs + $(INSTALL_PROGRAM) pg_keysetup$(X) '$(DESTDIR)$(bindir)/pg_keysetup$(X)' + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)' + +uninstall: + rm -f '$(DESTDIR)$(bindir)/pg_keysetup$(X)' + +clean distclean maintainer-clean: + rm -f pg_keysetup$(X) $(OBJS) encryption.c diff --git a/src/bin/pg_keysetup/pg_keysetup.c b/src/bin/pg_keysetup/pg_keysetup.c new file mode 100644 index 0000000000..68c33b26f7 --- /dev/null +++ b/src/bin/pg_keysetup/pg_keysetup.c @@ -0,0 +1,208 @@ +/*------------------------------------------------------------------------- + * + * pg_keysetup.c - Turn password into encryption key. 
+ * + * Copyright (c) 2013-2017, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_keysetup/pg_keysetup.c + *------------------------------------------------------------------------- + */ + +#define FRONTEND 1 +#include "postgres.h" + +#include +#include + +#include "port/pg_crc32c.h" +#include "storage/encryption.h" +#include "getopt_long.h" + +#ifdef USE_ENCRYPTION +static const char *progname; + +static void +usage(const char *progname) +{ + printf(_("%s derives encryption key from a password.\n\n"), + progname); + printf(_("Usage:\n")); + printf(_(" %s [OPTION]...\n"), progname); + printf(_("\nOptions:\n")); + printf(_(" [-D] DATADIR data directory\n")); + printf(_(" -?, --help show this help, then exit\n\n")); + printf(_("Password is read from stdin and the key is sent to stdout\n")); +} + +static void fatal_error(const char *fmt,...) pg_attribute_printf(1, 2); + +/* + * Big red button to push when things go horribly wrong. + */ +static void +fatal_error(const char *fmt,...) +{ + va_list args; + + fflush(stdout); + + fprintf(stderr, _("%s: FATAL: "), progname); + va_start(args, fmt); + vfprintf(stderr, _(fmt), args); + va_end(args); + fputc('\n', stderr); + + exit(EXIT_FAILURE); +} + +/* + * This function does the same as read_kdf_file() in the backend. However most + * of the calls look different on frontend side, it seems better to write it + * here from scratch than to use too many #ifdef-#else-#endif constructs. + */ +static void +read_kdf_file(char *path) +{ + pg_crc32c crc; + int fd; + + KDFParams = palloc(KDF_PARAMS_FILE_SIZE); + + fd = open(path, O_RDONLY | PG_BINARY, S_IRUSR); + + if (fd < 0) + fatal_error("could not open key setup file \"%s\": %m", + KDF_PARAMS_FILE); + + if (read(fd, KDFParams, sizeof(KDFParamsData)) != sizeof(KDFParamsData)) + fatal_error("could not read from key setup file: %m"); + + close(fd); + + /* Now check the CRC. 
*/
+	INIT_CRC32C(crc);
+	COMP_CRC32C(crc,
+			(char *) KDFParams,
+			offsetof(KDFParamsData, crc));
+	FIN_CRC32C(crc);
+
+	if (!EQ_CRC32C(crc, KDFParams->crc))
+		fatal_error("incorrect checksum in key setup file");
+
+	if (KDFParams->function != KDF_OPENSSL_PKCS5_PBKDF2_HMAC_SHA)
+		fatal_error("unsupported KDF function %d", KDFParams->function);
+}
+#endif /* USE_ENCRYPTION */
+
+int
+main(int argc, char **argv)
+{
+#ifdef USE_ENCRYPTION
+	int		c;
+	char	   *DataDir = NULL;
+	char		fpath[MAXPGPATH];
+	int		fd;
+	char		password[ENCRYPTION_PWD_MAX_LENGTH];
+	size_t		pwd_len,
+			i;
+
+	progname = get_progname(argv[0]);
+
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			usage(progname);
+			exit(0);
+		}
+		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+		{
+			/* Report this program's own name, not pg_controldata's. */
+			puts("pg_keysetup (PostgreSQL) " PG_VERSION);
+			exit(0);
+		}
+	}
+
+	while ((c = getopt(argc, argv, "D:")) != -1)
+	{
+		switch (c)
+		{
+			case 'D':
+				DataDir = optarg;
+				break;
+
+			default:
+				fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+				exit(1);
+		}
+	}
+
+	/* Fall back to a positional argument, then to $PGDATA. */
+	if (DataDir == NULL)
+	{
+		if (optind < argc)
+			DataDir = argv[optind++];
+		else
+			DataDir = getenv("PGDATA");
+	}
+
+	/* Complain if any arguments remain */
+	if (optind < argc)
+	{
+		fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"),
+				progname, argv[optind]);
+		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+				progname);
+		exit(1);
+	}
+
+	if (DataDir == NULL)
+	{
+		fprintf(stderr, _("%s: no data directory specified\n"), progname);
+		fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+		exit(1);
+	}
+
+	snprintf(fpath, MAXPGPATH, "%s/%s", DataDir, KDF_PARAMS_FILE);
+	fd = open(fpath, O_RDONLY | PG_BINARY, 0);
+	if (fd < 0)
+		fatal_error("could not open file \"%s\"", fpath);
+
+	read_kdf_file(fpath);
+	close(fd);
+
+	/*
+	 * Read the password.
+ */ + pwd_len = 0; + while (true) + { + int c = getchar(); + + if (c == EOF || c == '\n') + break; + + if (pwd_len >= ENCRYPTION_PWD_MAX_LENGTH) + fatal_error("The password is too long"); + + password[pwd_len++] = c; + } + + if (pwd_len < ENCRYPTION_PWD_MIN_LENGTH) + fatal_error("The password is too short"); + + /* + * Run the key derivation function. + */ + setup_encryption_key(password, false, pwd_len); + + /* + * Finally print the encryption key. + */ + for (i = 0; i < ENCRYPTION_KEY_LENGTH; i++) + printf("%.2x", encryption_key[i]); + printf("\n"); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + return 0; +} diff --git a/src/bin/pg_resetwal/Makefile b/src/bin/pg_resetwal/Makefile index e49d52b928..6ea9c515f7 100644 --- a/src/bin/pg_resetwal/Makefile +++ b/src/bin/pg_resetwal/Makefile @@ -15,13 +15,18 @@ subdir = src/bin/pg_resetwal top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS= pg_resetwal.o $(WIN32RES) +override CPPFLAGS := -DFRONTEND $(CPPFLAGS) + +OBJS= pg_resetwal.o encryption.o $(WIN32RES) all: pg_resetwal pg_resetwal: $(OBJS) | submake-libpgport $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) +encryption.c: % : $(top_srcdir)/src/backend/storage/smgr/% + rm -f $@ && $(LN_S) $< . 
+ install: all installdirs $(INSTALL_PROGRAM) pg_resetwal$(X) '$(DESTDIR)$(bindir)/pg_resetwal$(X)' @@ -32,7 +37,7 @@ uninstall: rm -f '$(DESTDIR)$(bindir)/pg_resetwal$(X)' clean distclean maintainer-clean: - rm -f pg_resetwal$(X) $(OBJS) + rm -f pg_resetwal$(X) $(OBJS) encryption.c rm -rf tmp_check check: diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 8cff535692..601ba83532 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -54,11 +54,11 @@ #include "common/fe_memutils.h" #include "common/file_perm.h" #include "common/restricted_token.h" +#include "storage/encryption.h" #include "storage/large_object.h" #include "pg_getopt.h" #include "getopt_long.h" - static ControlFileData ControlFile; /* pg_control values */ static XLogSegNo newXlogSegNo; /* new XLOG segment # */ static bool guessed = false; /* T if we had to guess at any values */ @@ -135,7 +135,7 @@ main(int argc, char *argv[]) } - while ((c = getopt_long(argc, argv, "c:D:e:fl:m:no:O:x:", long_options, NULL)) != -1) + while ((c = getopt_long(argc, argv, "c:D:e:fK:l:m:no:O:x:", long_options, NULL)) != -1) { switch (c) { @@ -278,6 +278,12 @@ main(int argc, char *argv[]) } break; +#ifdef USE_ENCRYPTION + case 'K': + encryption_key_command = strdup(optarg); + break; +#endif /* USE_ENCRYPTION */ + case 'l': if (strspn(optarg, "01234567890ABCDEFabcdef") != XLOG_FNAME_LEN) { @@ -415,6 +421,16 @@ main(int argc, char *argv[]) XLogFromFileName(log_fname, &minXlogTli, &minXlogSegNo, WalSegSz); /* + * If the data is encrypted, we'll also have to encrypt the XLOG record + * below. 
+ */ + if (ControlFile.data_encrypted) + { + setup_encryption(false); + data_encrypted = true; + } + + /* * Also look at existing segment files to set up newXlogSegNo */ FindEndOfXLOG(); @@ -1255,6 +1271,18 @@ WriteEmptyXLOG(void) FIN_CRC32C(crc); record->xl_crc = crc; + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + XLogEncryptionTweak(tweak, newXlogSegNo, 0); + encrypt_block(buffer, buffer, XLOG_BLCKSZ, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + /* Write the first page */ XLogFilePath(path, ControlFile.checkPointCopy.ThisTimeLineID, newXlogSegNo, WalSegSz); @@ -1318,6 +1346,10 @@ usage(void) printf(_(" [-D, --pgdata=]DATADIR data directory\n")); printf(_(" -e, --epoch=XIDEPOCH set next transaction ID epoch\n")); printf(_(" -f, --force force update to be done\n")); +#ifdef USE_ENCRYPTION + printf(_(" -K, --encryption-key-command\n" + " command that returns encryption key\n")); +#endif /* USE_ENCRYPTION */ printf(_(" -l, --next-wal-file=WALFILE set minimum starting location for new WAL\n")); printf(_(" -m, --multixact-ids=MXID,MXID set next and oldest multitransaction ID\n")); printf(_(" -n, --dry-run no update, just show what would be done\n")); diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index 2bcfcc61af..50263c0438 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -20,7 +20,7 @@ LDFLAGS_INTERNAL += $(libpq_pgport) OBJS = pg_rewind.o parsexlog.o xlogreader.o datapagemap.o timeline.o \ fetch.o file_ops.o copy_fetch.o libpq_fetch.o filemap.o logging.o \ - $(WIN32RES) + encryption.o $(WIN32RES) EXTRA_CLEAN = xlogreader.c @@ -32,6 +32,9 @@ pg_rewind: $(OBJS) | submake-libpq submake-libpgport xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% rm -f $@ && $(LN_S) $< . +encryption.c: % : $(top_srcdir)/src/backend/storage/smgr/% + rm -f $@ && $(LN_S) $< . 
+ install: all installdirs $(INSTALL_PROGRAM) pg_rewind$(X) '$(DESTDIR)$(bindir)/pg_rewind$(X)' @@ -42,7 +45,7 @@ uninstall: rm -f '$(DESTDIR)$(bindir)/pg_rewind$(X)' clean distclean maintainer-clean: - rm -f pg_rewind$(X) $(OBJS) xlogreader.c + rm -f pg_rewind$(X) $(OBJS) xlogreader.c encryption.c rm -rf tmp_check check: diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index b4c1f827a6..057287748d 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -23,7 +23,7 @@ #include "catalog/pg_control.h" #include "catalog/storage_xlog.h" #include "commands/dbcommands_xlog.h" - +#include "storage/encryption.h" /* * RmgrNames is an array of resource manager names, to make error messages @@ -316,6 +316,18 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, return -1; } + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + + XLogEncryptionTweak(tweak, xlogreadsegno, targetPageOff); + decrypt_block(readBuf, readBuf, XLOG_BLCKSZ, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + Assert(targetSegNo == xlogreadsegno); *pageTLI = targetHistory[private->tliIndex].tli; diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index b0fd3f66ac..c003ed072a 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -28,6 +28,7 @@ #include "common/restricted_token.h" #include "getopt_long.h" #include "storage/bufpage.h" +#include "storage/encryption.h" static void usage(const char *progname); @@ -69,6 +70,10 @@ usage(const char *progname) printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n")); printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n")); printf(_(" --source-server=CONNSTR source server to synchronize with\n")); +#ifdef USE_OPENSSL + printf(_(" -K, --encryption-key-command=COMMAND\n" + " command that returns encryption key\n")); +#endif /* USE_OPENSSL 
*/ printf(_(" -n, --dry-run stop before modifying anything\n")); printf(_(" -P, --progress write progress messages\n")); printf(_(" --debug write a lot of debug messages\n")); @@ -90,6 +95,9 @@ main(int argc, char **argv) {"dry-run", no_argument, NULL, 'n'}, {"progress", no_argument, NULL, 'P'}, {"debug", no_argument, NULL, 3}, +#ifdef USE_OPENSSL + {"encryption-key-command", required_argument, NULL, 'K'}, +#endif /* USE_OPENSSL */ {NULL, 0, NULL, 0} }; int option_index; @@ -124,7 +132,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "D:nP", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "D:K:nP", long_options, &option_index)) != -1) { switch (c) { @@ -154,6 +162,12 @@ main(int argc, char **argv) case 2: /* --source-server */ connstr_source = pg_strdup(optarg); break; +#ifdef USE_OPENSSL + case 4: /* --encryption-key-command */ + case 'K': + encryption_key_command = strdup(optarg); + break; +#endif /* USE_OPENSSL */ } } @@ -233,6 +247,16 @@ main(int argc, char **argv) sanityChecks(); /* + * Setup encryption if it's obvious that we'll have to deal with encrypted + * XLOG. + */ + if (ControlFile_target.data_encrypted) + { + setup_encryption(false); + data_encrypted = true; + } + + /* * If both clusters are already on the same timeline, there's nothing to * do. */ @@ -425,6 +449,24 @@ sanityChecks(void) ControlFile_source.state != DB_SHUTDOWNED && ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("source data directory must be shut down cleanly\n"); + + /* + * Since slave receives XLOG stream encrypted by master, handling + * differently encrypted clusters is not the typical use case for + * pg_rewind. Yet we should check the encryption. 
+ */ + if (ControlFile_source.data_encrypted || + ControlFile_target.data_encrypted) + { + if (ControlFile_source.data_encrypted != + ControlFile_target.data_encrypted) + pg_fatal("source and target server must be both unencrypted or both encrypted\n"); + + if (memcmp(ControlFile_source.encryption_verification, + ControlFile_target.encryption_verification, + ENCRYPTION_SAMPLE_SIZE)) + pg_fatal("both source and target server must use the same encryption key\n"); + } } /* diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 577db73f10..f698a51701 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -267,6 +267,23 @@ check_cluster_versions(void) GET_MAJOR_VERSION(new_cluster.bin_version)) pg_fatal("New cluster data and binary directories are from different major versions.\n"); + /* + * As this is the first version that supports encryption, we actually do + * not expect the old cluster to be encrypted. + * + * TODO Change this logic as soon as we merge the encryption into the + * next, even minor version (pg_upgrade seems to accept clusters that + * differ only in the minor version), or for the PG core patch. Eventually + * we'll support the upgrade if either both clusters are unencrypted, or + * if both are encrypted using the same key. In the latter case, we need + * to ensure that the PGENCRYPTIONKEY environment variable is passed to + * both clusters. 
+ */ + if (old_cluster.controldata.data_encrypted) + pg_fatal("Old cluster is encrypted.\n"); + if (new_cluster.controldata.data_encrypted) + pg_fatal("New cluster is encrypted.\n"); + check_ok(); } diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 0fe98a550e..d9fa8c63d1 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -58,6 +58,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) bool got_large_object = false; bool got_date_is_int = false; bool got_data_checksum_version = false; + bool got_data_encrypted = false; char *lc_collate = NULL; char *lc_ctype = NULL; char *lc_monetary = NULL; @@ -135,6 +136,13 @@ get_control_data(ClusterInfo *cluster, bool live_check) got_data_checksum_version = true; } + /* Only in <= 9.6 */ + if (GET_MAJOR_VERSION(cluster->major_version) <= 906) + { + cluster->controldata.data_encrypted = false; + got_data_encrypted = true; + } + /* we have the result of cmd in "output". so parse it line by line now */ while (fgets(bufin, sizeof(bufin), output)) { @@ -418,6 +426,34 @@ get_control_data(ClusterInfo *cluster, bool live_check) cluster->controldata.data_checksum_version = str2uint(p); got_data_checksum_version = true; } + else if ((p = strstr(bufin, "encryption fingerprint")) != NULL) + { + int i; + + p = strchr(p, ':'); + + if (p == NULL || strlen(p) <= 1) + pg_fatal("%d: controldata retrieval problem\n", __LINE__); + + cluster->controldata.data_encrypted = true; + + /* Skip the colon and any whitespace after it */ + p = strchr(p, ':'); + if (p == NULL || strlen(p) <= 1) + pg_fatal("%d: controldata retrieval problem\n", __LINE__); + p = strpbrk(p, "01234567890ABCDEF"); + if (p == NULL || strlen(p) <= 1) + pg_fatal("%d: controldata retrieval problem\n", __LINE__); + + /* Make sure it looks like a valid finerprint */ + if (strspn(p, "0123456789ABCDEF") != 32) + pg_fatal("%d: controldata retrieval problem\n", __LINE__); + + for (i = 0; i < 16; i++) + sscanf(p 
+ 2 * i, "%2hhx", + cluster->controldata.encryption_verification + i); + got_data_encrypted = true; + } } pclose(output); @@ -472,7 +508,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) !got_index || !got_toast || (!got_large_object && cluster->controldata.ctrl_ver >= LARGE_OBJECT_SIZE_PG_CONTROL_VER) || - !got_date_is_int || !got_data_checksum_version) + !got_date_is_int || !got_data_checksum_version || !got_data_encrypted) { if (cluster == &old_cluster) pg_log(PG_REPORT, @@ -538,6 +574,10 @@ get_control_data(ClusterInfo *cluster, bool live_check) if (!got_data_checksum_version) pg_log(PG_REPORT, " data checksum version\n"); + /* value added in Postgres 10 */ + if (!got_data_encrypted) + pg_log(PG_REPORT, " data encryption status\n"); + pg_fatal("Cannot continue without required control information, terminating\n"); } } @@ -602,6 +642,18 @@ check_control_data(ControlData *oldctrl, pg_fatal("old cluster uses data checksums but the new one does not\n"); else if (oldctrl->data_checksum_version != newctrl->data_checksum_version) pg_fatal("old and new cluster pg_controldata checksum versions do not match\n"); + + if (oldctrl->data_encrypted && !newctrl->data_encrypted) + pg_fatal("old cluster is encrypted, but the new one is not\n"); + else if (!oldctrl->data_encrypted && newctrl->data_encrypted) + pg_fatal("old cluster is not encrypted, but the new one is\n"); + else if (oldctrl->data_encrypted && newctrl->data_encrypted) + { + if (memcmp(oldctrl->encryption_verification, newctrl->encryption_verification, sizeof(oldctrl->encryption_verification)) != 0) /* compare fingerprint bytes, not array addresses */ + pg_fatal("encryption keys do not match between old and new cluster\n"); + else + pg_fatal("upgrading encrypted databases is not implemented yet\n"); /* TODO */ + } } diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 7e5e971294..66291cbf47 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -227,6 +227,8 @@ typedef struct bool date_is_int; bool float8_pass_by_value; bool 
data_checksum_version; + bool data_encrypted; + uint8 encryption_verification[16]; } ControlData; /* diff --git a/src/bin/pg_waldump/Makefile b/src/bin/pg_waldump/Makefile index f5957bd75a..5a8bd1fba9 100644 --- a/src/bin/pg_waldump/Makefile +++ b/src/bin/pg_waldump/Makefile @@ -7,8 +7,8 @@ subdir = src/bin/pg_waldump top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = pg_waldump.o compat.o xlogreader.o rmgrdesc.o \ - $(RMGRDESCOBJS) $(WIN32RES) +OBJS = pg_waldump.o compat.o xlogreader.o rmgrdesc.o encryption.o \ + $(RMGRDESCOBJS) encryption.o $(WIN32RES) override CPPFLAGS := -DFRONTEND $(CPPFLAGS) @@ -27,6 +27,9 @@ xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% $(RMGRDESCSOURCES): % : $(top_srcdir)/src/backend/access/rmgrdesc/% rm -f $@ && $(LN_S) $< . +encryption.c: % : $(top_srcdir)/src/backend/storage/smgr/% + rm -f $@ && $(LN_S) $< . + install: all installdirs $(INSTALL_PROGRAM) pg_waldump$(X) '$(DESTDIR)$(bindir)/pg_waldump$(X)' @@ -37,4 +40,5 @@ uninstall: rm -f '$(DESTDIR)$(bindir)/pg_waldump$(X)' clean distclean maintainer-clean: - rm -f pg_waldump$(X) $(OBJS) $(RMGRDESCSOURCES) xlogreader.c + rm -f pg_waldump$(X) $(OBJS) $(RMGRDESCSOURCES) xlogreader.c \ +encryption.c diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 5c4f38e597..0013d20785 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -21,12 +21,15 @@ #include "access/xlog_internal.h" #include "access/transam.h" #include "common/fe_memutils.h" +#include "storage/encryption.h" #include "getopt_long.h" #include "rmgrdesc.h" static const char *progname; +static char *str_wal_segment_size_mb = NULL; +static bool encr_inited = false; static int WalSegSz; typedef struct XLogDumpPrivate @@ -213,8 +216,85 @@ search_directory(const char *directory, const char *fname) if (read(fd, buf, XLOG_BLCKSZ) == XLOG_BLCKSZ) { - XLogLongPageHeader longhdr = (XLogLongPageHeader) buf; + XLogLongPageHeader longhdr; + 
if (data_encrypted) + { +#ifdef USE_ENCRYPTION + uint32 tli; + char tweak[TWEAK_SIZE]; + XLogSegNo segNo; + int wal_segment_size; + XLogPageHeader hdr; + + /* + * Segment size is contained in the encryption tweak so we + * cannot get the size from the header until the page is + * decrypted. User needs to pass the size if he uses + * non-default value. + * + * TODO The processing of wal-segsize is almost copy & pasted + * from initdb.c. Reuse in a better way. + */ + /* set wal segment size */ + if (str_wal_segment_size_mb == NULL) + wal_segment_size = DEFAULT_XLOG_SEG_SIZE; + else + { + char *endptr; + + /* check that the argument is a number */ + wal_segment_size = strtol(str_wal_segment_size_mb, &endptr, 10); + + /* verify that wal segment size is valid */ + if (endptr == str_wal_segment_size_mb || *endptr != '\0') + { + fprintf(stderr, + _("%s: argument of --wal-segsize must be a number\n"), + progname); + exit(1); + } + + wal_segment_size *= 1024 * 1024; + if (!IsValidWalSegSize(wal_segment_size)) + { + fprintf(stderr, + _("%s: argument of --wal-segsize must be a power of 2 between 1 and 1024\n"), + progname); + exit(1); + } + } + + XLogFromFileName(fname, &tli, &segNo, wal_segment_size); + XLogEncryptionTweak(tweak, segNo, 0); + setup_encryption(false); + encr_inited = true; + decrypt_block(buf, buf, XLOG_BLCKSZ, tweak); + + hdr = (XLogPageHeader) buf; + if (hdr->xlp_magic != XLOG_PAGE_MAGIC) + { + fprintf(stderr, + _("invalid magic number %04X. 
Make sure you've passed correct argument for --wal-segsize"), + hdr->xlp_magic); + exit(1); + } +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + else + { + if (str_wal_segment_size_mb != NULL) + { + fprintf(stderr, + _("%s: --wal-segsize is only accepted for encrypted WAL\n"), + progname); + exit(1); + } + } + + longhdr = (XLogLongPageHeader) buf; WalSegSz = longhdr->xlp_seg_size; if (!IsValidWalSegSize(WalSegSz)) @@ -453,6 +533,23 @@ XLogDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogDumpXLogRead(private->inpath, private->timeline, targetPagePtr, readBuff, count); + if (data_encrypted) + { +#ifdef USE_ENCRYPTION + char tweak[TWEAK_SIZE]; + XLogSegNo readSegNo; + uint32 readSegOff; + + XLByteToSeg(targetPagePtr, readSegNo, WalSegSz); + readSegOff = targetPagePtr % WalSegSz; + + XLogEncryptionTweak(tweak, readSegNo, readSegOff); + decrypt_block(readBuff, readBuff, count, tweak); +#else + ENCRYPTION_NOT_SUPPORTED_MSG; +#endif /* USE_ENCRYPTION */ + } + return count; } @@ -801,6 +898,12 @@ usage(void) printf(_(" -b, --bkp-details output detailed information about backup blocks\n")); printf(_(" -e, --end=RECPTR stop reading at WAL location RECPTR\n")); printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); +#ifdef USE_ENCRYPTION + printf(_(" -K, --encryption-key-command=COMMAND\n" + " command that returns encryption key\n")); + printf(_(" -w, --wal-segsize=SIZE\n" + " size of WAL segments, in megabytes\n")); +#endif /* USE_ENCRYPTION */ printf(_(" -n, --limit=N number of records to display\n")); printf(_(" -p, --path=PATH directory in which to find log segment files or a\n" " directory with a ./pg_wal that contains such files\n" @@ -835,6 +938,7 @@ main(int argc, char **argv) {"end", required_argument, NULL, 'e'}, {"follow", no_argument, NULL, 'f'}, {"help", no_argument, NULL, '?'}, + {"encryption-key-command", required_argument, NULL, 'K'}, {"limit", required_argument, NULL, 'n'}, {"path", 
required_argument, NULL, 'p'}, {"rmgr", required_argument, NULL, 'r'}, @@ -842,6 +946,7 @@ main(int argc, char **argv) {"timeline", required_argument, NULL, 't'}, {"xid", required_argument, NULL, 'x'}, {"version", no_argument, NULL, 'V'}, + {"wal-segsize", required_argument, NULL, 'w'}, {"stats", optional_argument, NULL, 'z'}, {NULL, 0, NULL, 0} }; @@ -877,7 +982,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "be:?fn:p:r:s:t:Vx:z", + while ((option = getopt_long(argc, argv, "be:?fK:n:p:r:s:t:Vw:x:z", long_options, &optindex)) != -1) { switch (option) @@ -901,6 +1006,12 @@ main(int argc, char **argv) usage(); exit(EXIT_SUCCESS); break; +#ifdef USE_ENCRYPTION + case 'K': + encryption_key_command = pg_strdup(optarg); + data_encrypted = true; + break; +#endif /* USE_ENCRYPTION */ case 'n': if (sscanf(optarg, "%d", &config.stop_after_records) != 1) { @@ -961,6 +1072,9 @@ main(int argc, char **argv) puts("pg_waldump (PostgreSQL) " PG_VERSION); exit(EXIT_SUCCESS); break; + case 'w': + str_wal_segment_size_mb = pg_strdup(optarg); + break; case 'x': if (sscanf(optarg, "%u", &config.filter_by_xid) != 1) { @@ -1108,6 +1222,12 @@ main(int argc, char **argv) /* done with argument parsing, do the actual work */ + if (data_encrypted && !encr_inited) + { + setup_encryption(false); + encr_inited = true; + } + /* we have everything we need, start reading */ xlogreader_state = XLogReaderAllocate(WalSegSz, XLogDumpReadPage, &private); diff --git a/src/common/string.c b/src/common/string.c index 3260d37a84..57ede1dd14 100644 --- a/src/common/string.c +++ b/src/common/string.c @@ -42,6 +42,45 @@ pg_str_endswith(const char *str, const char *end) return strcmp(str, end) == 0; } +/* + * Helper function to check if a page is completely empty. + * + * TODO Invent name that is more consistent with that of the other function(s) + * in this module. 
+ */ +bool +IsAllZero(const char *input, Size size) +{ + const char *pos = input; + const char *aligned_start = (char *) MAXALIGN64(input); + const char *end = input + size; + + /* Check 1 byte at a time until pos is aligned or the buffer ends */ + while (pos < aligned_start && pos < end) + if (*pos++ != 0) + return false; + + /* + * Run 8 parallel 8 byte checks in one iteration. On 2016 hardware + * slightly faster than 4 parallel checks. + */ + while (pos + 8 * sizeof(uint64) <= end) + { + uint64 *p = (uint64 *) pos; + + if ((p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7]) != 0) + return false; + pos += 8 * sizeof(uint64); + } + + /* Handle unaligned tail. */ + while (pos < end) + if (*pos++ != 0) + return false; + + return true; +} + /* * strtoint --- just like strtol, but returns int not long diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 421ba6d775..c7fea0f6cb 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -258,6 +258,7 @@ extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); extern bool DataChecksumsEnabled(void); +extern bool DataEncryptionEnabled(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 7c766836db..7499bd0ca9 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -52,7 +52,7 @@ typedef struct XLogPageHeaderData uint32 xlp_rem_len; /* total len of remaining data for record */ } XLogPageHeaderData; -#define SizeOfXLogShortPHD MAXALIGN(sizeof(XLogPageHeaderData)) +#define SizeOfXLogShortPHD XLOG_REC_ALIGN(sizeof(XLogPageHeaderData)) typedef XLogPageHeaderData *XLogPageHeader; @@ -69,7 +69,7 @@ typedef struct XLogLongPageHeaderData uint32 xlp_xlog_blcksz; /* just as a cross-check */ } XLogLongPageHeaderData; -#define SizeOfXLogLongPHD 
MAXALIGN(sizeof(XLogLongPageHeaderData)) +#define SizeOfXLogLongPHD XLOG_REC_ALIGN(sizeof(XLogLongPageHeaderData)) typedef XLogLongPageHeaderData *XLogLongPageHeader; diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 773d9e6eba..ce4386d592 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -94,6 +94,11 @@ typedef enum DBState } DBState; /* + * Number of bytes reserved to store encryption sample in ControlFileData. + */ +#define ENCRYPTION_SAMPLE_SIZE 16 + +/* * Contents of pg_control. */ @@ -227,6 +232,11 @@ typedef struct ControlFileData */ char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN]; + /* Is data directory encrypted? */ + bool data_encrypted; + /* Sample value for encryption key verification */ + uint8 encryption_verification[ENCRYPTION_SAMPLE_SIZE]; + /* CRC of all above ... MUST BE LAST! */ pg_crc32c crc; } ControlFileData; diff --git a/src/include/common/string.h b/src/include/common/string.h index 78a450192e..26fbfdc51e 100644 --- a/src/include/common/string.h +++ b/src/include/common/string.h @@ -13,5 +13,5 @@ extern bool pg_str_endswith(const char *str, const char *end); extern int strtoint(const char *pg_restrict str, char **pg_restrict endptr, int base); - +extern bool IsAllZero(const char *input, Size size); #endif /* COMMON_STRING_H */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index e167ee8fcb..bc01ace87a 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -429,6 +429,7 @@ extern void BaseInit(void); /* in utils/init/miscinit.c */ extern bool IgnoreSystemIndexes; extern PGDLLIMPORT bool process_shared_preload_libraries_in_progress; +extern char *encryption_library_string; extern char *session_preload_libraries_string; extern char *shared_preload_libraries_string; extern char *local_preload_libraries_string; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index be2f59239b..b6948fce83 100644 --- a/src/include/pgstat.h +++ 
b/src/include/pgstat.h @@ -874,6 +874,9 @@ typedef enum WAIT_EVENT_DATA_FILE_TRUNCATE, WAIT_EVENT_DATA_FILE_WRITE, WAIT_EVENT_DSM_FILL_ZERO_WRITE, + WAIT_EVENT_KDF_FILE_READ, + WAIT_EVENT_KDF_FILE_SYNC, + WAIT_EVENT_KDF_FILE_WRITE, WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ, WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC, WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE, diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h index 4fef3e2107..8aea3eb2ed 100644 --- a/src/include/storage/copydir.h +++ b/src/include/storage/copydir.h @@ -13,7 +13,9 @@ #ifndef COPYDIR_H #define COPYDIR_H -extern void copydir(char *fromdir, char *todir, bool recurse); -extern void copy_file(char *fromfile, char *tofile); +#include "storage/relfilenode.h" + +extern void copydir(char *fromdir, char *todir, RelFileNode *fromNode, RelFileNode *toNode); +extern void copy_file(char *fromfile, char *tofile, RelFileNode *fromNode, RelFileNode *toNode, ForkNumber fromForkNum, ForkNumber toForkNum, int segment); #endif /* COPYDIR_H */ diff --git a/src/include/storage/encryption.h b/src/include/storage/encryption.h new file mode 100644 index 0000000000..dab1bea600 --- /dev/null +++ b/src/include/storage/encryption.h @@ -0,0 +1,182 @@ +/*------------------------------------------------------------------------- + * + * encryption.h + * Full database encryption support + * + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/encryption.h + * + *------------------------------------------------------------------------- + */ +#ifndef ENCRYPTION_H +#define ENCRYPTION_H + +#include "access/xlogdefs.h" +#include "port/pg_crc32c.h" +#include "lib/ilist.h" + +/* + * OpenSSL is currently the only implementation of encryption we use. 
+ */ +#ifdef USE_OPENSSL +#define USE_ENCRYPTION +#endif + +/* + * Common error message issued when particular code path cannot be executed + * due to absence of the OpenSSL library. + */ +#define ENCRYPTION_NOT_SUPPORTED_MSG \ + encryption_error(true, \ + "data encryption cannot be used because SSL is not supported by this build\n" \ + "Compile with --with-openssl to use SSL connections.") +#ifdef USE_ENCRYPTION +/* + * Full database encryption key. + * + * EVP_aes_256_xts() needs the key twice as long as AES would do in general. + */ +#define ENCRYPTION_KEY_LENGTH 64 + +/* + * TODO Tune these values. + */ +#define ENCRYPTION_PWD_MIN_LENGTH 8 +#define ENCRYPTION_PWD_MAX_LENGTH 16 +#define ENCRYPTION_KDF_NITER 1048576 +#define ENCRYPTION_KDF_SALT_LEN 8 + +extern unsigned char *encryption_key; +#endif /* USE_ENCRYPTION */ + +/* + * The encrypted data is a series of blocks of size + * ENCRYPTION_BLOCK. Currently we use the EVP_aes_256_xts implementation. Make + * sure the following constants match if adopting another algorithm. + */ +#define ENCRYPTION_BLOCK 16 + +#ifdef USE_ENCRYPTION +/* + * The openssl EVP API refers to a block in terms of padding of the output + * chunk. That's the purpose of this constant. However the openssl + * implementation of AES XTS still uses the 16-byte block internally, as + * defined by ENCRYPTION_BLOCK. + */ +#define ENCRYPTION_BLOCK_OPENSSL 1 + +#define TWEAK_SIZE 16 + +#define KDF_PARAMS_FILE "global/pg_keysetup" +#define KDF_PARAMS_FILE_SIZE 512 + +/* + * Key derivation function. + */ +typedef enum KDFKind +{ + KDF_OPENSSL_PKCS5_PBKDF2_HMAC_SHA = 0 +} KFDKind; + +typedef struct KDFParamsPBKDF2 +{ + unsigned long int niter; + unsigned char salt[ENCRYPTION_KDF_SALT_LEN]; +} KDFParamsPBKDF2; + +/* + * Parameters of the key derivation function. + * + * The parameters are generated by initdb and stored into a file, which is + * then read during PG startup. This is similar to storing various settings in + * pg_control. 
However an existing KDF file is read only, so it does not have + * to be stored in shared memory. + */ +typedef struct KDFParamsData +{ + KFDKind function; + + /* + * Function-specific parameters. + */ + union + { + KDFParamsPBKDF2 pbkdf2; + } data; + + /* CRC of all above ... MUST BE LAST! */ + pg_crc32c crc; +} KDFParamsData; + +extern KDFParamsData *KDFParams; +#endif /* USE_ENCRYPTION */ + +extern PGDLLIMPORT bool data_encrypted; + +#ifdef USE_ENCRYPTION +#ifndef FRONTEND +extern void write_kdf_file(void); +extern void read_kdf_file(void); +#endif /* FRONTEND */ + +extern PGDLLIMPORT char *encryption_key_command; + +extern char *encryption_buffer; +extern Size encryption_buf_size; + +extern void enlarge_encryption_buffer(Size new_size); +#endif /* USE_ENCRYPTION */ + +/* + * If one XLOG record ended and the following one started in the same block, + * we'd have to either encrypt and decrypt both records together, or encrypt + * (after having zeroed the part of the block occupied by the other record) + * and decrypt them separate. Neither approach is compatible with streaming + * replication. In the first case we can't ask standby not to decrypt the + * first record until the second has been streamed. The second approach would + * imply streaming of two different versions of the same block two times. + * + * We avoid this problem by aligning XLOG records to the encryption block + * size. This way no adjacent XLOG records should appear in the same block. + * + * For similar reasons, the alignment to ENCRYPTION_BLOCK also has to be + * applied when storing changes to disk in reorderbuffer.c. Another module + * that takes the block into account is buffile.c. + * + * TODO If the configuration allows walsender to decrypt the XLOG stream + * before sending it, adjust this expression so that the additional padding is + * not added to XLOG records. 
(Since the XLOG alignment cannot change without + * initdb, the same would apply to the configuration variable that makes + * walsender perform the decryption. Does such a variable make sense?) + */ +#define DO_ENCRYPTION_BLOCK_ALIGN data_encrypted + +/* + * Use TYPEALIGN64 since besides record size we also need to align XLogRecPtr. + */ +#define ENCRYPTION_BLOCK_ALIGN(LEN) TYPEALIGN64(ENCRYPTION_BLOCK, (LEN)) + +/* + * Universal computation of XLOG record alignment. + */ +#define XLOG_REC_ALIGN(LEN) ((DO_ENCRYPTION_BLOCK_ALIGN) ?\ + ENCRYPTION_BLOCK_ALIGN(LEN) : MAXALIGN64(LEN)) + +extern void setup_encryption(bool bootstrap); +extern void setup_encryption_key(char *credentials, bool is_key, size_t len); +extern void sample_encryption(char *buf); +extern void encrypt_block(const char *input, char *output, Size size, + const char *tweak); +extern void decrypt_block(const char *input, char *output, Size size, + const char *tweak); +#ifdef USE_ENCRYPTION +extern void encryption_error(bool fatal, char *message); +#endif + +extern void XLogEncryptionTweak(char *tweak, XLogSegNo segment, + uint32 offset); + +#endif /* ENCRYPTION_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 8e7c9728f4..0567c68c0c 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -72,6 +72,7 @@ extern int FileRead(File file, char *buffer, int amount, uint32 wait_event_info) extern int FileWrite(File file, char *buffer, int amount, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); extern off_t FileSeek(File file, off_t offset, int whence); +extern off_t FileTell(File file); extern int FileTruncate(File file, off_t offset, uint32 wait_event_info); extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info); extern char *FilePathName(File file); diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h index a62703c647..5efd101d03 100644 --- a/src/include/storage/reinit.h +++ 
b/src/include/storage/reinit.h @@ -20,7 +20,8 @@ extern void ResetUnloggedRelations(int op); extern bool parse_filename_for_nontemp_relation(const char *name, - int *oidchars, ForkNumber *fork); + int *oidchars, ForkNumber *fork, + int *segment); #define UNLOGGED_RELATION_CLEANUP 0x0001 #define UNLOGGED_RELATION_INIT 0x0002 diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 558e4d8518..4866821897 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -144,4 +144,17 @@ extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum); extern void ForgetDatabaseFsyncRequests(Oid dbid); +#ifdef USE_OPENSSL +extern BlockNumber ReencryptBlock(char *buffer, int blocks, + RelFileNode *srcNode, RelFileNode *dstNode, + ForkNumber srcForkNum, ForkNumber dstForkNum, + BlockNumber blockNum); +#endif + +/* smgrtype.c */ +extern Datum smgrout(PG_FUNCTION_ARGS); +extern Datum smgrin(PG_FUNCTION_ARGS); +extern Datum smgreq(PG_FUNCTION_ARGS); +extern Datum smgrne(PG_FUNCTION_ARGS); + #endif /* SMGR_H */ diff --git a/src/test/modules/buffile/Makefile b/src/test/modules/buffile/Makefile new file mode 100644 index 0000000000..b658c0dd8d --- /dev/null +++ b/src/test/modules/buffile/Makefile @@ -0,0 +1,21 @@ +PG_CONFIG ?= pg_config +MODULE_big = buffile_test +OBJS = buffile.o $(WIN32RES) +PGFILEDESC = "buffile_test" + +EXTENSION = buffile +DATA = buffile--1.0.sql + +REGRESS = test_00 test_01 test_02 test_03 test_04 test_05 test_06 test_07 \ +test_08 test_09 test_10 test_11 test_12 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/buffile +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/buffile/README b/src/test/modules/buffile/README new file mode 100644 index 0000000000..9c80787797 --- /dev/null +++ b/src/test/modules/buffile/README @@ -0,0 +1,11 @@ +This extension was written to check if changes introduced due to cluster +encryption do not break buffile.c. + +Caution: To make the test cheaper, it was decided to adjust the segment size, +see + +#define MAX_PHYSICAL_FILESIZE (4 * BLCKSZ) + +in buffile.c. BLCKSZ is 8192 (the default). All the tests rely on this +value. So if you haven't compiled Postgres with this value, the tests will +create 1 GB files and they will fail. diff --git a/src/test/modules/buffile/buffile--1.0.sql b/src/test/modules/buffile/buffile--1.0.sql new file mode 100644 index 0000000000..82baa043f7 --- /dev/null +++ b/src/test/modules/buffile/buffile--1.0.sql @@ -0,0 +1,39 @@ +CREATE FUNCTION buffile_create() +RETURNS void +AS 'MODULE_PATHNAME', 'buffile_create' +LANGUAGE C; + +CREATE FUNCTION buffile_close() +RETURNS void +AS 'MODULE_PATHNAME', 'buffile_close' +LANGUAGE C; + +CREATE FUNCTION buffile_write(text) +RETURNS bigint +AS 'MODULE_PATHNAME', 'buffile_write' +LANGUAGE C; + +CREATE FUNCTION buffile_read(bigint) +RETURNS bytea +AS 'MODULE_PATHNAME', 'buffile_read' +LANGUAGE C; + +CREATE FUNCTION buffile_seek(int, bigint) +RETURNS int +AS 'MODULE_PATHNAME', 'buffile_seek' +LANGUAGE C; + +CREATE FUNCTION buffile_assert_fileno(int) +RETURNS void +AS 'MODULE_PATHNAME', 'buffile_assert_fileno' +LANGUAGE C; + +CREATE FUNCTION buffile_test_shared() +RETURNS void +AS 'MODULE_PATHNAME', 'buffile_test_shared' +LANGUAGE C; + +CREATE FUNCTION buffile_test_shared_append() +RETURNS void +AS 'MODULE_PATHNAME', 'buffile_test_shared_append' +LANGUAGE C; diff --git a/src/test/modules/buffile/buffile.c b/src/test/modules/buffile/buffile.c new file mode 100644 index 0000000000..45ad6778ab --- /dev/null +++ 
b/src/test/modules/buffile/buffile.c @@ -0,0 +1,353 @@ +#include "postgres.h" +#include "fmgr.h" +#include "lib/stringinfo.h" +#include "storage/buffile.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/resowner.h" + +PG_MODULE_MAGIC; + +/* + * To cover various corner cases, the tests assume MAX_PHYSICAL_FILESIZE to be + * exactly MAX_PHYSICAL_FILESIZE_TEST. + */ +#define MAX_PHYSICAL_FILESIZE_TEST (4 * BLCKSZ) + +static BufFile *bf = NULL; + +static void check_file(void); + +extern Datum buffile_create(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(buffile_create); +Datum +buffile_create(PG_FUNCTION_ARGS) +{ + MemoryContext old_cxt; + ResourceOwner old_ro; + + if (bf != NULL) + elog(ERROR, "file already exists"); + + old_cxt = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Make sure the file is not deleted across function calls. + */ + old_ro = CurrentResourceOwner; + CurrentResourceOwner = TopTransactionResourceOwner; + + bf = BufFileCreateTemp(false); + + CurrentResourceOwner = old_ro; + MemoryContextSwitchTo(old_cxt); + + PG_RETURN_VOID(); +} + +extern Datum buffile_close(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(buffile_close); +Datum +buffile_close(PG_FUNCTION_ARGS) +{ + if (bf == NULL) + elog(ERROR, "there's no file to close"); + + BufFileClose(bf); + bf = NULL; + + PG_RETURN_VOID(); +} + +extern Datum buffile_write(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(buffile_write); +Datum +buffile_write(PG_FUNCTION_ARGS) +{ + Datum d = PG_GETARG_DATUM(0); + char *s = TextDatumGetCString(d); + size_t res; + + check_file(); + res = BufFileWrite(bf, s, strlen(s)); + + PG_RETURN_INT64(res); +} + +extern Datum buffile_read(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(buffile_read); +Datum +buffile_read(PG_FUNCTION_ARGS) +{ + int64 size = PG_GETARG_INT64(0); + StringInfo buf = makeStringInfo(); + size_t res_size; + bytea *result; + + check_file(); + + enlargeStringInfo(buf, size); + res_size = 
BufFileRead(bf, buf->data, size); + buf->len = res_size; + + result = DatumGetByteaPP(DirectFunctionCall1(bytearecv, + PointerGetDatum(buf))); + PG_RETURN_BYTEA_P(result); +} + +extern Datum buffile_seek(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(buffile_seek); +Datum +buffile_seek(PG_FUNCTION_ARGS) +{ + int32 fileno = PG_GETARG_INT32(0); + int64 offset = PG_GETARG_INT64(1); + int32 res; + + check_file(); + res = BufFileSeek(bf, fileno, offset, SEEK_SET); + + PG_RETURN_INT32(res); +} + +extern Datum buffile_assert_fileno(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(buffile_assert_fileno); +Datum +buffile_assert_fileno(PG_FUNCTION_ARGS) +{ + int32 fileno_expected = PG_GETARG_INT32(0); + int32 fileno; + off_t offset; + + check_file(); + BufFileTell(bf, &fileno, &offset); + + if (fileno != fileno_expected) + { + /* + * Bring the backend down so that the following tests have no chance + * to create the 1GB files. + */ + elog(FATAL, "file number does not match"); + } + + PG_RETURN_VOID(); +} + +static void +check_file(void) +{ + if (bf == NULL) + elog(ERROR, "the file is not opened"); +} + +/* + * This test is especially important for shared encrypted files, see the + * comments below. + */ +extern Datum buffile_test_shared(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(buffile_test_shared); +Datum +buffile_test_shared(PG_FUNCTION_ARGS) +{ + dsm_segment *seg; + SharedFileSet *fileset; + BufFile *bf_1, + *bf_2; + char *data_1, + *data_2, + *data; + Size chunk_size_1, + chunk_size_2; + int fileno, + i; + off_t offset, + res, + total_size; + + /* + * The size is not important, we actually do not need the shared memory. + * The segment is only needed to initialize the fileset. + */ + seg = dsm_create(1024, 0); + + /* + * The fileset must survive error handling, so that dsm_detach works fine. + * (The typical use case is that the fileset is in shared memory.) 
+ */ + fileset = (SharedFileSet *) MemoryContextAlloc(TopTransactionContext, + sizeof(SharedFileSet)); + SharedFileSetInit(fileset, seg); + + bf_1 = BufFileCreateShared(fileset, "file_1"); + + /* + * Write more data than the buffer size, so that we can check that the + * number of "useful bytes" word is only appended at the end of the + * segment, not after each buffer. + */ + chunk_size_1 = BLCKSZ + 256; + data_1 = (char *) palloc(chunk_size_1); + memset(data_1, 1, chunk_size_1); + if (BufFileWrite(bf_1, data_1, chunk_size_1) != chunk_size_1) + elog(ERROR, "Failed to write data"); + pfree(data_1); + + /* + * Enforce buffer flush (The BufFileFlush() function is not exported). + * Thus the "useful bytes" metadata should appear at the current end of the + * first file segment. The next write will have to seek back to overwrite + * the metadata. + */ + BufFileTell(bf_1, &fileno, &offset); + if (BufFileSeek(bf_1, 0, 0, SEEK_SET) != 0) + elog(ERROR, "seek failed"); + if (BufFileSeek(bf_1, fileno, offset, SEEK_SET) != 0) + elog(ERROR, "seek failed"); + + /* + * Write another chunk that does not fit into the first segment file. Thus + * the "useful bytes" metadata should appear at the end of both segments. + */ + chunk_size_2 = 3 * BLCKSZ; + data_2 = (char *) palloc(chunk_size_2); + memset(data_2, 1, chunk_size_2); + if (BufFileWrite(bf_1, data_2, chunk_size_2) != chunk_size_2) + elog(ERROR, "Failed to write data"); + pfree(data_2); + BufFileClose(bf_1); + + /* + * The word indicating the number of "useful bytes" (i.e. the actual data + * w/o padding to buffer size) is stored at the end of each segment file. + * Check that this metadata is read correctly. 
+ */ + bf_2 = BufFileOpenShared(fileset, "file_1"); + total_size = BufFileSize(bf_2); + if (total_size != (chunk_size_1 + chunk_size_2)) + elog(ERROR, "Incorrect file size: %zu", total_size); + + data = (char *) palloc(total_size); + res = BufFileRead(bf_2, data, total_size); + if (res != total_size) + elog(ERROR, "Incorrect chunk size read: %zu", res); + for (i = 0; i < total_size; i++) + if (data[i] != 1) + elog(ERROR, "Unexpected data read from the file"); + pfree(data); + BufFileClose(bf_2); + + dsm_detach(seg); + + PG_RETURN_VOID(); +} + + +/* + * Test BufFileAppend(). + */ +extern Datum buffile_test_shared_append(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(buffile_test_shared_append); +Datum +buffile_test_shared_append(PG_FUNCTION_ARGS) +{ + dsm_segment *seg; + SharedFileSet *fileset; + BufFile *bf_1, + *bf_2, + *bf_3; + char *data; + Size chunk_size; + int fileno, + i; + off_t offset, + res, + total_size; + + seg = dsm_create(1024, 0); + + fileset = (SharedFileSet *) MemoryContextAlloc(TopTransactionContext, + sizeof(SharedFileSet)); + SharedFileSetInit(fileset, seg); + + /* + * XXX Does the chunk size matter much? + */ + chunk_size = 8; + data = (char *) palloc(chunk_size); + memset(data, 1, chunk_size); + + bf_1 = BufFileCreateShared(fileset, "file_1"); + if (BufFileWrite(bf_1, data, chunk_size) != chunk_size) + elog(ERROR, "Failed to write data"); + + bf_2 = BufFileCreateShared(fileset, "file_2"); + if (BufFileWrite(bf_2, data, chunk_size) != chunk_size) + elog(ERROR, "Failed to write data"); + + /* + * Make sure it's read-only so that BufFileAppend() can accept it as + * source. 
+ */ + BufFileClose(bf_2); + bf_2 = BufFileOpenShared(fileset, "file_2"); + + bf_3 = BufFileCreateShared(fileset, "file_3"); + if (BufFileWrite(bf_3, data, chunk_size) != chunk_size) + elog(ERROR, "Failed to write data"); + BufFileClose(bf_3); + bf_3 = BufFileOpenShared(fileset, "file_3"); + + BufFileAppend(bf_1, bf_2); + BufFileAppend(bf_1, bf_3); + + total_size = BufFileSize(bf_1); + + /* + * The result should contain complete segments of bf_1 and bf_2 and the + * valid part of bf_3. + */ + if (total_size != (2 * MAX_PHYSICAL_FILESIZE_TEST + chunk_size)) + elog(ERROR, "Incorrect total size of the appended data: %zu", + total_size); + + /* + * Check that data of the 2nd segment was decrypted correctly. + */ + if (BufFileSeek(bf_1, 1, 0, SEEK_SET) != 0) + elog(ERROR, "seek failed"); + res = BufFileRead(bf_1, data, chunk_size); + if (res != chunk_size) + elog(ERROR, "Incorrect chunk size read: %zu", res); + for (i = 0; i < chunk_size; i++) + if (data[i] != 1) + elog(ERROR, "Unexpected data read from the file"); + + /* + * And the same for the 3rd segment. + * + * TODO Reuse the code above by putting it into a function. 
+ */ + if (BufFileSeek(bf_1, 2, 0, SEEK_SET) != 0) + elog(ERROR, "seek failed"); + res = BufFileRead(bf_1, data, chunk_size); + if (res != chunk_size) + elog(ERROR, "Incorrect chunk size read: %zu", res); + for (i = 0; i < chunk_size; i++) + if (data[i] != 1) + elog(ERROR, "Unexpected data read from the file"); + + BufFileClose(bf_1); + dsm_detach(seg); + PG_RETURN_VOID(); +} diff --git a/src/test/modules/buffile/buffile.control b/src/test/modules/buffile/buffile.control new file mode 100644 index 0000000000..8472c5a348 --- /dev/null +++ b/src/test/modules/buffile/buffile.control @@ -0,0 +1,5 @@ +# buffile_test extension +comment = 'buffile_test' +default_version = '1.0' +module_pathname = '$libdir/buffile_test' +relocatable = true diff --git a/src/test/modules/buffile/expected/test_00.out b/src/test/modules/buffile/expected/test_00.out new file mode 100644 index 0000000000..637c33689f --- /dev/null +++ b/src/test/modules/buffile/expected/test_00.out @@ -0,0 +1,47 @@ +CREATE EXTENSION IF NOT EXISTS buffile; +-- This test only verifies that PG is compiled with a component file size of +-- 32 kB (i.e. 4 buffers of 8 kB) instead of 1 GB. That seems appropriate for +-- testing. Some other tests may rely on it. +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +-- Skip the first component file. +SELECT buffile_seek(0, 32768); + buffile_seek +-------------- + 0 +(1 row) + +-- Write the first byte of the second component file. We can't simply +-- buffile_seek() beyond the position 32768 as it would return EOF. +SELECT buffile_write('a'); + buffile_write +--------------- + 1 +(1 row) + +-- Enforce BufFileFlush(), which actually adds the component file. +SELECT buffile_read(1); + buffile_read +-------------- + \x +(1 row) + +-- Check that we're in the 2nd file, i.e. the file size is as expected. 
+SELECT buffile_assert_fileno(1); + buffile_assert_fileno +----------------------- + +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_01.out b/src/test/modules/buffile/expected/test_01.out new file mode 100644 index 0000000000..448bac171c --- /dev/null +++ b/src/test/modules/buffile/expected/test_01.out @@ -0,0 +1,59 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +SELECT buffile_seek(0, 1); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_write('abc'); + buffile_write +--------------- + 3 +(1 row) + +SELECT buffile_seek(0, 0); + buffile_seek +-------------- + 0 +(1 row) + +-- Check that the trailing zeroes are not fetched. +SELECT buffile_read(16); + buffile_read +-------------- + \x00616263 +(1 row) + +-- Adjust the number of useful bytes. +SELECT buffile_write('abc'); + buffile_write +--------------- + 3 +(1 row) + +-- ... and check again. +SELECT buffile_seek(0, 0); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(16); + buffile_read +------------------ + \x00616263616263 +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_02.out b/src/test/modules/buffile/expected/test_02.out new file mode 100644 index 0000000000..f783d0cb24 --- /dev/null +++ b/src/test/modules/buffile/expected/test_02.out @@ -0,0 +1,48 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +SELECT buffile_seek(0, 8189); + buffile_seek +-------------- + 0 +(1 row) + +-- Initialize the last 3 positions of the first buffer and the initial 3 +-- positions of the 2nd buffer. +SELECT buffile_write('abcdef'); + buffile_write +--------------- + 6 +(1 row) + +SELECT buffile_seek(0, 0); + buffile_seek +-------------- + 0 +(1 row) + +-- Read the first buffer. 
+SELECT length(buffile_read(8192)); + length +-------- + 8192 +(1 row) + +-- Only 3 bytes of the 2nd buffer should be fetched. +SELECT length(buffile_read(8192)); + length +-------- + 3 +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_03.out b/src/test/modules/buffile/expected/test_03.out new file mode 100644 index 0000000000..e899fa3b38 --- /dev/null +++ b/src/test/modules/buffile/expected/test_03.out @@ -0,0 +1,27 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +-- Read from an empty file. +SELECT buffile_seek(0, 8); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(16); + buffile_read +-------------- + \x +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_04.out b/src/test/modules/buffile/expected/test_04.out new file mode 100644 index 0000000000..1f8eeabe48 --- /dev/null +++ b/src/test/modules/buffile/expected/test_04.out @@ -0,0 +1,84 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +-- Write something near the end of the first buffer, but leave some trailing +-- space. +SELECT buffile_seek(0, 8184); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_write('abcd'); + buffile_write +--------------- + 4 +(1 row) + +-- Leave the 2nd buffer empty, as well as a few leading bytes. Thus we should +-- get a hole that spans the whole 2nd buffer as well as a few adjacent bytes +-- on each side. +SELECT buffile_seek(0, 2 * 8192 + 4); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_write('efgh'); + buffile_write +--------------- + 4 +(1 row) + +-- Check the initial part of the hole, which crosses the boundary of the 1st +-- and the 2nd buffer. 
+SELECT buffile_seek(0, 8184); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(16); + buffile_read +------------------------------------ + \x61626364000000000000000000000000 +(1 row) + +-- Check the trailing part of the whole, which crosses the boundary of the 2nd +-- and the 3rd buffer. +SELECT buffile_seek(0, 2 * 8192 - 8); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(16); + buffile_read +------------------------------------ + \x00000000000000000000000065666768 +(1 row) + +-- Check that the hole contains nothing but zeroes. +SELECT buffile_seek(0, 8192 - 4); + buffile_seek +-------------- + 0 +(1 row) + +SELECT btrim(buffile_read(8192 + 8), '\x00'); + btrim +------- + \x +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_05.out b/src/test/modules/buffile/expected/test_05.out new file mode 100644 index 0000000000..6a73147711 --- /dev/null +++ b/src/test/modules/buffile/expected/test_05.out @@ -0,0 +1,33 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +-- Seek does not extend the file if it's not followed by write. +SELECT buffile_seek(0, 1); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_seek(0, 0); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(2); + buffile_read +-------------- + \x +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_06.out b/src/test/modules/buffile/expected/test_06.out new file mode 100644 index 0000000000..def9af5112 --- /dev/null +++ b/src/test/modules/buffile/expected/test_06.out @@ -0,0 +1,41 @@ +-- This test shows that the first component file (segment) stays empty, read +-- stops prematurely even if it starts on that segment, even though it'd +-- otherwise receive some data from the following one. 
+BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +SELECT buffile_seek(0, 32768); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_write('a'); + buffile_write +--------------- + 1 +(1 row) + +SELECT buffile_seek(0, 32767); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(2); + buffile_read +-------------- + \x +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_07.out b/src/test/modules/buffile/expected/test_07.out new file mode 100644 index 0000000000..215280f0b0 --- /dev/null +++ b/src/test/modules/buffile/expected/test_07.out @@ -0,0 +1,39 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +-- Write data at component file boundary and try to read it. +SELECT buffile_seek(0, 32768); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_write('abcd'); + buffile_write +--------------- + 4 +(1 row) + +SELECT buffile_seek(0, 32768); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(8); + buffile_read +-------------- + \x61626364 +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_08.out b/src/test/modules/buffile/expected/test_08.out new file mode 100644 index 0000000000..7c1069930c --- /dev/null +++ b/src/test/modules/buffile/expected/test_08.out @@ -0,0 +1,39 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +-- Write data across component file boundary and try to read it. 
+SELECT buffile_seek(0, 32766); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_write('abcd'); + buffile_write +--------------- + 4 +(1 row) + +SELECT buffile_seek(0, 32766); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(8); + buffile_read +-------------- + \x61626364 +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_09.out b/src/test/modules/buffile/expected/test_09.out new file mode 100644 index 0000000000..b9d325b676 --- /dev/null +++ b/src/test/modules/buffile/expected/test_09.out @@ -0,0 +1,39 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +-- Write data across buffer boundary and try to read it. +SELECT buffile_seek(0, 8190); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_write('abcd'); + buffile_write +--------------- + 4 +(1 row) + +SELECT buffile_seek(0, 8190); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(8); + buffile_read +-------------- + \x61626364 +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_10.out b/src/test/modules/buffile/expected/test_10.out new file mode 100644 index 0000000000..8e457fdcda --- /dev/null +++ b/src/test/modules/buffile/expected/test_10.out @@ -0,0 +1,76 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +-- Write some data at the end of the buffer. +SELECT buffile_seek(0, 8188); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_write('abcd'); + buffile_write +--------------- + 4 +(1 row) + +SELECT buffile_seek(0, 8189); + buffile_seek +-------------- + 0 +(1 row) + +-- Enforce flush with the write position not at the end of the buffer. This is +-- special by not moving curOffset to the next buffer. 
+SELECT buffile_read(1); + buffile_read +-------------- + \x62 +(1 row) + +-- Therefore the next writes should eventually affect the original data. (Here +-- we also test going directly from read to write and vice versa.) +SELECT buffile_write('x'); + buffile_write +--------------- + 1 +(1 row) + +SELECT buffile_read(1); + buffile_read +-------------- + \x64 +(1 row) + +-- Start a new buffer, i.e. force flushing of the previous one. +SELECT buffile_write('z'); + buffile_write +--------------- + 1 +(1 row) + +-- Check that the 'x' and 'y' letters are in the first buffer, not in the +-- 2nd. (We read enough data to find any non-zero bytes in the 2nd buffer.) +SELECT buffile_seek(0, 8188); + buffile_seek +-------------- + 0 +(1 row) + +SELECT buffile_read(4 + 8192); + buffile_read +-------------- + \x616278647a +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_11.out b/src/test/modules/buffile/expected/test_11.out new file mode 100644 index 0000000000..c6804d08af --- /dev/null +++ b/src/test/modules/buffile/expected/test_11.out @@ -0,0 +1,34 @@ +BEGIN; +SELECT buffile_create(); + buffile_create +---------------- + +(1 row) + +SELECT buffile_write('abcd'); + buffile_write +--------------- + 4 +(1 row) + +-- Seek beyond EOF not followed by write. +SELECT buffile_seek(0, 5); + buffile_seek +-------------- + 0 +(1 row) + +-- Nothing should be fetched. 
+SELECT buffile_read(8); + buffile_read +-------------- + \x +(1 row) + +SELECT buffile_close(); + buffile_close +--------------- + +(1 row) + +COMMIT; diff --git a/src/test/modules/buffile/expected/test_12.out b/src/test/modules/buffile/expected/test_12.out new file mode 100644 index 0000000000..3dce7eaf0f --- /dev/null +++ b/src/test/modules/buffile/expected/test_12.out @@ -0,0 +1,12 @@ +SELECT buffile_test_shared(); + buffile_test_shared +--------------------- + +(1 row) + +SELECT buffile_test_shared_append(); + buffile_test_shared_append +---------------------------- + +(1 row) + diff --git a/src/test/modules/buffile/sql/test_00.sql b/src/test/modules/buffile/sql/test_00.sql new file mode 100644 index 0000000000..c907a5e195 --- /dev/null +++ b/src/test/modules/buffile/sql/test_00.sql @@ -0,0 +1,18 @@ +CREATE EXTENSION IF NOT EXISTS buffile; + +-- This test only verifies that PG is compiled with a component file size of +-- 32 kB (i.e. 4 buffers of 8 kB) instead of 1 GB. That seems appropriate for +-- testing. Some other tests may rely on it. +BEGIN; +SELECT buffile_create(); +-- Skip the first component file. +SELECT buffile_seek(0, 32768); +-- Write the first byte of the second component file. We can't simply +-- buffile_seek() beyond the position 32768 as it would return EOF. +SELECT buffile_write('a'); +-- Enforce BufFileFlush(), which actually adds the component file. +SELECT buffile_read(1); +-- Check that we're in the 2nd file, i.e. the file size is as expected. +SELECT buffile_assert_fileno(1); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_01.sql b/src/test/modules/buffile/sql/test_01.sql new file mode 100644 index 0000000000..8cc87de841 --- /dev/null +++ b/src/test/modules/buffile/sql/test_01.sql @@ -0,0 +1,14 @@ +BEGIN; +SELECT buffile_create(); +SELECT buffile_seek(0, 1); +SELECT buffile_write('abc'); +SELECT buffile_seek(0, 0); +-- Check that the trailing zeroes are not fetched. 
+SELECT buffile_read(16); +-- Adjust the number of useful bytes. +SELECT buffile_write('abc'); +-- ... and check again. +SELECT buffile_seek(0, 0); +SELECT buffile_read(16); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_02.sql b/src/test/modules/buffile/sql/test_02.sql new file mode 100644 index 0000000000..35d34722d7 --- /dev/null +++ b/src/test/modules/buffile/sql/test_02.sql @@ -0,0 +1,13 @@ +BEGIN; +SELECT buffile_create(); +SELECT buffile_seek(0, 8189); +-- Initialize the last 3 positions of the first buffer and the initial 3 +-- positions of the 2nd buffer. +SELECT buffile_write('abcdef'); +SELECT buffile_seek(0, 0); +-- Read the first buffer. +SELECT length(buffile_read(8192)); +-- Only 3 bytes of the 2nd buffer should be fetched. +SELECT length(buffile_read(8192)); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_03.sql b/src/test/modules/buffile/sql/test_03.sql new file mode 100644 index 0000000000..a95391f7c3 --- /dev/null +++ b/src/test/modules/buffile/sql/test_03.sql @@ -0,0 +1,7 @@ +BEGIN; +SELECT buffile_create(); +-- Read from an empty file. +SELECT buffile_seek(0, 8); +SELECT buffile_read(16); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_04.sql b/src/test/modules/buffile/sql/test_04.sql new file mode 100644 index 0000000000..64e8d39f94 --- /dev/null +++ b/src/test/modules/buffile/sql/test_04.sql @@ -0,0 +1,25 @@ +BEGIN; +SELECT buffile_create(); +-- Write something near the end of the first buffer, but leave some trailing +-- space. +SELECT buffile_seek(0, 8184); +SELECT buffile_write('abcd'); +-- Leave the 2nd buffer empty, as well as a few leading bytes. Thus we should +-- get a hole that spans the whole 2nd buffer as well as a few adjacent bytes +-- on each side. +SELECT buffile_seek(0, 2 * 8192 + 4); +SELECT buffile_write('efgh'); +-- Check the initial part of the hole, which crosses the boundary of the 1st +-- and the 2nd buffer. 
+SELECT buffile_seek(0, 8184); +SELECT buffile_read(16); +-- Check the trailing part of the whole, which crosses the boundary of the 2nd +-- and the 3rd buffer. +SELECT buffile_seek(0, 2 * 8192 - 8); +SELECT buffile_read(16); +-- Check that the hole contains nothing but zeroes. +SELECT buffile_seek(0, 8192 - 4); +SELECT btrim(buffile_read(8192 + 8), '\x00'); + +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_05.sql b/src/test/modules/buffile/sql/test_05.sql new file mode 100644 index 0000000000..5fd642e558 --- /dev/null +++ b/src/test/modules/buffile/sql/test_05.sql @@ -0,0 +1,8 @@ +BEGIN; +SELECT buffile_create(); +-- Seek does not extend the file if it's not followed by write. +SELECT buffile_seek(0, 1); +SELECT buffile_seek(0, 0); +SELECT buffile_read(2); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_06.sql b/src/test/modules/buffile/sql/test_06.sql new file mode 100644 index 0000000000..bbf506ec0e --- /dev/null +++ b/src/test/modules/buffile/sql/test_06.sql @@ -0,0 +1,11 @@ +-- This test shows that the first component file (segment) stays empty, read +-- stops prematurely even if it starts on that segment, even though it'd +-- otherwise receive some data from the following one. +BEGIN; +SELECT buffile_create(); +SELECT buffile_seek(0, 32768); +SELECT buffile_write('a'); +SELECT buffile_seek(0, 32767); +SELECT buffile_read(2); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_07.sql b/src/test/modules/buffile/sql/test_07.sql new file mode 100644 index 0000000000..ca78506ecb --- /dev/null +++ b/src/test/modules/buffile/sql/test_07.sql @@ -0,0 +1,9 @@ +BEGIN; +SELECT buffile_create(); +-- Write data at component file boundary and try to read it. 
+SELECT buffile_seek(0, 32768); +SELECT buffile_write('abcd'); +SELECT buffile_seek(0, 32768); +SELECT buffile_read(8); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_08.sql b/src/test/modules/buffile/sql/test_08.sql new file mode 100644 index 0000000000..0e5b1de383 --- /dev/null +++ b/src/test/modules/buffile/sql/test_08.sql @@ -0,0 +1,9 @@ +BEGIN; +SELECT buffile_create(); +-- Write data across component file boundary and try to read it. +SELECT buffile_seek(0, 32766); +SELECT buffile_write('abcd'); +SELECT buffile_seek(0, 32766); +SELECT buffile_read(8); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_09.sql b/src/test/modules/buffile/sql/test_09.sql new file mode 100644 index 0000000000..cc7060932e --- /dev/null +++ b/src/test/modules/buffile/sql/test_09.sql @@ -0,0 +1,9 @@ +BEGIN; +SELECT buffile_create(); +-- Write data across buffer boundary and try to read it. +SELECT buffile_seek(0, 8190); +SELECT buffile_write('abcd'); +SELECT buffile_seek(0, 8190); +SELECT buffile_read(8); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_10.sql b/src/test/modules/buffile/sql/test_10.sql new file mode 100644 index 0000000000..63af760d9f --- /dev/null +++ b/src/test/modules/buffile/sql/test_10.sql @@ -0,0 +1,25 @@ +BEGIN; +SELECT buffile_create(); +-- Write some data at the end of the buffer. +SELECT buffile_seek(0, 8188); +SELECT buffile_write('abcd'); +SELECT buffile_seek(0, 8189); +-- Enforce flush with the write position not at the end of the buffer. This is +-- special by not moving curOffset to the next buffer. +SELECT buffile_read(1); + +-- Therefore the next writes should eventually affect the original data. (Here +-- we also test going directly from read to write and vice versa.) +SELECT buffile_write('x'); +SELECT buffile_read(1); + +-- Start a new buffer, i.e. force flushing of the previous one. 
+SELECT buffile_write('z'); + +-- Check that the 'x' and 'y' letters are in the first buffer, not in the +-- 2nd. (We read enough data to find any non-zero bytes in the 2nd buffer.) +SELECT buffile_seek(0, 8188); +SELECT buffile_read(4 + 8192); + +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_11.sql b/src/test/modules/buffile/sql/test_11.sql new file mode 100644 index 0000000000..94300d253f --- /dev/null +++ b/src/test/modules/buffile/sql/test_11.sql @@ -0,0 +1,9 @@ +BEGIN; +SELECT buffile_create(); +SELECT buffile_write('abcd'); +-- Seek beyond EOF not followed by write. +SELECT buffile_seek(0, 5); +-- Nothing should be fetched. +SELECT buffile_read(8); +SELECT buffile_close(); +COMMIT; diff --git a/src/test/modules/buffile/sql/test_12.sql b/src/test/modules/buffile/sql/test_12.sql new file mode 100644 index 0000000000..a28ae395ef --- /dev/null +++ b/src/test/modules/buffile/sql/test_12.sql @@ -0,0 +1,2 @@ +SELECT buffile_test_shared(); +SELECT buffile_test_shared_append();