PG-1604 fix: preallocate one more record for the cache

dutow · dutow · commit ee37fc9677bb · 2025-08-18T08:20:50.000+01:00
There is at lesat one corner case scenario where we have to load the
last record into the cache during a write:

* replica crashes, receives last segment from primary
* replica replays last segment, reaches end
* replica activtes new key
* replica replays prepared transaction, has to use old keys again
* old key write function sees that we generated a new key, tries to load
  it

In this scenario we could get away by detecting that we are in a write,
and asserting if we tried to use the last key.

But in a release build assertions are not fired, and we would end up
writing some non encrypted data to disk, and later if we have to run
recovery failing.

It could be a FATAL, but that would still crash the server, and the next
startup would crash again and again...

Instead, to properly avoid this situation we preallocate memory for one
more key in the cache during initialization. Since we can only add one
extra key to the cache during the servers run, this means we no longer
try to allocate in the critical section in any corner case.

While this is not the nicest solution, it is simple and keeps the
current cache and decrypt/encrypt logic the same as before. Any other
solution would be more complex, and even more of a hack, as it would
require dealing with a possibly out of date cache.
diff --git a/contrib/pg_tde/src/access/pg_tde_xlog_keys.c b/contrib/pg_tde/src/access/pg_tde_xlog_keys.c
@@ -16,6 +16,7 @@
 #include "common/pg_tde_utils.h"
 #include "encryption/enc_aes.h"
 #include "encryption/enc_tde.h"
+#include "utils/palloc.h"
 
 #ifdef FRONTEND
 #include "pg_tde_fe.h"
@@ -43,7 +44,9 @@ typedef struct WalKeyFileEntry
 } WalKeyFileEntry;
 
 static WALKeyCacheRec *tde_wal_key_cache = NULL;
+static WALKeyCacheRec *tde_wal_extra_alloc = NULL;
 static WALKeyCacheRec *tde_wal_key_last_rec = NULL;
+static WalEncryptionKey *tde_wal_extra_key = NULL;
 
 static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key);
 static WalEncryptionKey *pg_tde_decrypt_wal_key(const TDEPrincipalKey *principal_key, WalKeyFileEntry *entry);
@@ -326,6 +329,34 @@ pg_tde_fetch_wal_keys(WalLocation start)
 	return return_wal_rec;
 }
 
+/*
+ * In special cases, we have to add one more record to the WAL key cache during write (in the critical section, when we can't allocate).
+ * This method is a helper to deal with that: when adding a single key, we potentially allocate 2 records.
+ * These variables help preallocate them, so in the critical section we can just use the already allocated objects, and update the cache with them.
+ *
+ * While this is somewhat a hack, it is also simple, safe, reliable, and way easier to implement than to refactor the cache or the decryption/encryption loop.
+ */
+void
+pg_tde_wal_cache_extra_palloc(void)
+{
+#ifndef FRONTEND
+	MemoryContext oldCtx;
+
+	oldCtx = MemoryContextSwitchTo(TopMemoryContext);
+#endif
+	if (tde_wal_extra_alloc == NULL)
+	{
+		tde_wal_extra_alloc = palloc0_object(WALKeyCacheRec);
+	}
+	if (tde_wal_extra_key == NULL)
+	{
+		tde_wal_extra_key = palloc0_object(WalEncryptionKey);
+	}
+#ifndef FRONTEND
+	MemoryContextSwitchTo(oldCtx);
+#endif
+}
+
 static WALKeyCacheRec *
 pg_tde_add_wal_key_to_cache(WalEncryptionKey *key)
 {
@@ -335,7 +366,8 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key)
 
 	oldCtx = MemoryContextSwitchTo(TopMemoryContext);
 #endif
-	wal_rec = palloc0_object(WALKeyCacheRec);
+	wal_rec = tde_wal_extra_alloc == NULL ? palloc0_object(WALKeyCacheRec) : tde_wal_extra_alloc;
+	tde_wal_extra_alloc = NULL;
 #ifndef FRONTEND
 	MemoryContextSwitchTo(oldCtx);
 #endif
@@ -553,7 +585,9 @@ pg_tde_write_wal_key_file_entry(const WalEncryptionKey *rel_key_data,
 static WalEncryptionKey *
 pg_tde_decrypt_wal_key(const TDEPrincipalKey *principal_key, WalKeyFileEntry *entry)
 {
-	WalEncryptionKey *key = palloc_object(WalEncryptionKey);
+	WalEncryptionKey *key = tde_wal_extra_key == NULL ? palloc_object(WalEncryptionKey) : tde_wal_extra_key;
+
+	tde_wal_extra_key = NULL;
 
 	Assert(principal_key);
 
diff --git a/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c b/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c
@@ -248,6 +248,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
 
 		/* cache is empty, prefetch keys from disk */
 		pg_tde_fetch_wal_keys(start);
+		pg_tde_wal_cache_extra_palloc();
 	}
 
 	if (key)
@@ -284,8 +285,8 @@ TDEXLogWriteEncryptedPagesOldKeys(int fd, const void *buf, size_t count, off_t o
 	memcpy(enc_buff, buf, count);
 
 	/*
-	 * This method potentially allocates, but only in very early execution
-	 * Shouldn't happen in a write, where we are in a critical section
+	 * This method potentially allocates, but only in very early execution Can
+	 * happen during a write, but we have one more cache entry preallocated.
 	 */
 	TDEXLogCryptBuffer(buf, enc_buff, count, offset, tli, segno, segSize);
 
@@ -356,7 +357,7 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
 	lastKeyUsable = (writeKeyLoc.lsn != 0);
 	afterWriteKey = wal_location_cmp(writeKeyLoc, loc) <= 0;
 
-	if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && !lastKeyUsable)
+	if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && !lastKeyUsable && afterWriteKey)
 	{
 		WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
 
@@ -442,10 +443,8 @@ TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
 		WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
 		WalLocation write_loc = {.tli = TDEXLogGetEncKeyTli(),.lsn = write_key_lsn};
 
-		Assert(last_key);
-
 		/* write has generated a new key, need to fetch it */
-		if (wal_location_cmp(last_key->start, write_loc) < 0)
+		if (last_key != NULL && wal_location_cmp(last_key->start, write_loc) < 0)
 		{
 			pg_tde_fetch_wal_keys(write_loc);
 
diff --git a/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h b/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h
@@ -82,5 +82,6 @@ extern WalEncryptionKey *pg_tde_read_last_wal_key(void);
 extern void pg_tde_save_server_key(const TDEPrincipalKey *principal_key, bool write_xlog);
 extern void pg_tde_save_server_key_redo(const TDESignedPrincipalKeyInfo *signed_key_info);
 extern void pg_tde_wal_last_key_set_location(WalLocation loc);
+extern void pg_tde_wal_cache_extra_palloc(void);
 
 #endif							/* PG_TDE_XLOG_KEYS_H */