summaryrefslogtreecommitdiff
path: root/src/backend/storage
diff options
context:
space:
mode:
authorThomas Munro2024-12-20 08:53:25 +0000
committerThomas Munro2024-12-20 10:57:02 +0000
commit38c579b08988e6f1a5bd74241d0a1001421d8015 (patch)
tree172ea3eee7ad5da273e6246b0dcd86c9a992e5e7 /src/backend/storage
parent02a8d0c45253eb54e57b1974c8627e5be3e1d852 (diff)
Fix corruption when relation truncation fails.
RelationTruncate() does three things, while holding an AccessExclusiveLock and preventing checkpoints: 1. Logs the truncation. 2. Drops buffers, even if they're dirty. 3. Truncates some number of files. Step 2 could previously be canceled if it had to wait for I/O, and step 3 could and still can fail in file APIs. All orderings of these operations have data corruption hazards if interrupted, so we can't give up until the whole operation is done. When dirty pages were discarded but the corresponding blocks were left on disk due to ERROR, old page versions could come back from disk, reviving deleted data (see pgsql-bugs #18146 and several like it). When primary and standby were allowed to disagree on relation size, standbys could panic (see pgsql-bugs #18426) or revive data unknown to visibility management on the primary (theorized). Changes: * WAL is now unconditionally flushed first * smgrtruncate() is now called in a critical section, preventing interrupts and causing PANIC on file API failure * smgrtruncate() has a new parameter for existing fork sizes, because it can't call smgrnblocks() itself inside a critical section The changes apply to RelationTruncate(), smgr_redo() and pg_truncate_visibility_map(). That last is also brought up to date with other evolutions of the truncation protocol. The VACUUM FileTruncate() failure mode had been discussed in older reports than the ones referenced below, with independent analysis from many people, but earlier theories on how to fix it were too complicated to back-patch. The more recently invented cancellation bug was diagnosed by Alexander Lakhin. Other corruption scenarios were spotted by me while iterating on this patch and earlier commit 75818b3a. Back-patch to all supported releases. Reviewed-by: Michael Paquier <[email protected]> Reviewed-by: Robert Haas <[email protected]> Reported-by: [email protected] Reported-by: Alexander Lakhin <[email protected]> Discussion: https://postgr.es/m/18146-04e908c662113ad5%40postgresql.org Discussion: https://postgr.es/m/18426-2d18da6586f152d6%40postgresql.org
Diffstat (limited to 'src/backend/storage')
-rw-r--r--src/backend/storage/smgr/md.c28
-rw-r--r--src/backend/storage/smgr/smgr.c14
2 files changed, 30 insertions, 12 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 4a897a83593..11fccda475f 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1154,19 +1154,21 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
/*
* mdtruncate() -- Truncate relation to specified number of blocks.
+ *
+ * Guaranteed not to allocate memory, so it can be used in a critical section.
+ * Caller must have called smgrnblocks() to obtain curnblk while holding a
+ * sufficient lock to prevent a change in relation size, and not used any smgr
+ * functions for this relation or handled interrupts in between. This makes
+ * sure we have opened all active segments, so that truncate loop will get
+ * them all!
*/
void
-mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+mdtruncate(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber curnblk, BlockNumber nblocks)
{
- BlockNumber curnblk;
BlockNumber priorblocks;
int curopensegs;
- /*
- * NOTE: mdnblocks makes sure we have opened all active segments, so that
- * truncation loop will get them all!
- */
- curnblk = mdnblocks(reln, forknum);
if (nblocks > curnblk)
{
/* Bogus request ... but no complaint if InRecovery */
@@ -1505,7 +1507,7 @@ _fdvec_resize(SMgrRelation reln,
reln->md_seg_fds[forknum] =
MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
}
- else
+ else if (nseg > reln->md_num_open_segs[forknum])
{
/*
* It doesn't seem worthwhile complicating the code to amortize
@@ -1517,6 +1519,16 @@ _fdvec_resize(SMgrRelation reln,
repalloc(reln->md_seg_fds[forknum],
sizeof(MdfdVec) * nseg);
}
+ else
+ {
+ /*
+ * We don't reallocate a smaller array, because we want mdtruncate()
+ * to be able to promise that it won't allocate memory, so that it is
+ * allowed in a critical section. This means that a bit of space in
+ * the array is now wasted, until the next time we add a segment and
+ * reallocate.
+ */
+ }
reln->md_num_open_segs[forknum] = nseg;
}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 925728eb6c1..36ad34aa6ac 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -101,7 +101,7 @@ typedef struct f_smgr
BlockNumber blocknum, BlockNumber nblocks);
BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks);
+ BlockNumber old_blocks, BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
} f_smgr;
@@ -719,10 +719,15 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
*
* The caller must hold AccessExclusiveLock on the relation, to ensure that
* other backends receive the smgr invalidation event that this function sends
- * before they access any forks of the relation again.
+ * before they access any forks of the relation again. The current size of
+ * the forks should be provided in old_nblocks. This function should normally
+ * be called in a critical section, but the current size must be checked
+ * outside the critical section, and no interrupts or smgr functions relating
+ * to this relation should be called in between.
*/
void
-smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks)
+smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
+ BlockNumber *old_nblocks, BlockNumber *nblocks)
{
int i;
@@ -750,7 +755,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
/* Make the cached size is invalid if we encounter an error. */
reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
- smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
+ smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
+ old_nblocks[i], nblocks[i]);
/*
* We might as well update the local smgr_cached_nblocks values. The