summaryrefslogtreecommitdiff
path: root/src/backend/access/gin
diff options
context:
space:
mode:
authorTeodor Sigaev2018-05-04 08:27:50 +0000
committerTeodor Sigaev2018-05-04 08:27:50 +0000
commit0bef1c0678d94436f80450d562a866bb6fa5e509 (patch)
treefbcc3b758f520971188a8d9f6727e482f23afe02 /src/backend/access/gin
parent7d8679975f650d1150d706c8b6a5f04f08dcdd00 (diff)
Re-think predicate locking on GIN indexes.
The principle behind the locking was not very well thought-out, and not documented. Add a section in the README to explain how it's supposed to work, and change the code so that it actually works that way. This fixes two bugs: 1. If fast update was turned on concurrently, subsequent inserts to the pending list would not conflict with predicate locks that were acquired earlier, on entry pages. The included 'predicate-gin-fastupdate' test demonstrates that. To fix, make all scans acquire a predicate lock on the metapage. That lock represents a scan of the pending list, whether or not there is a pending list at the moment. Forget about the optimization to skip locking/checking for locks, when fastupdate=off. 2. If a scan finds no match, it still needs to lock the entry page. The point of predicate locks is to lock the gabs between values, whether or not there is a match. The included 'predicate-gin-nomatch' test tests that case. In addition to those two bug fixes, this removes some unnecessary locking, following the principle laid out in the README. Because all items in a posting tree have the same key value, a lock on the posting tree root is enough to cover all the items. (With a very large posting tree, it would possibly be better to lock the posting tree leaf pages instead, so that a "skip scan" with a query like "A & B", you could avoid unnecessary conflict if a new tuple is inserted with A but !B. But let's keep this simple.) Also, some spelling fixes. Author: Heikki Linnakangas with some editorization by me Review: Andrey Borodin, Alexander Korotkov Discussion: https://www.postgresql.org/message-id/[email protected]
Diffstat (limited to 'src/backend/access/gin')
-rw-r--r--src/backend/access/gin/README34
-rw-r--r--src/backend/access/gin/ginbtree.c3
-rw-r--r--src/backend/access/gin/gindatapage.c7
-rw-r--r--src/backend/access/gin/ginfast.c8
-rw-r--r--src/backend/access/gin/ginget.c116
-rw-r--r--src/backend/access/gin/gininsert.c26
-rw-r--r--src/backend/access/gin/ginutil.c7
-rw-r--r--src/backend/access/gin/ginvacuum.c1
8 files changed, 104 insertions, 98 deletions
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README
index 990b5ffa581..cc434b1feb7 100644
--- a/src/backend/access/gin/README
+++ b/src/backend/access/gin/README
@@ -331,6 +331,40 @@ page-deletions safe; it stamps the deleted pages with an XID and keeps the
deleted pages around with the right-link intact until all concurrent scans
have finished.)
+Predicate Locking
+-----------------
+
+GIN supports predicate locking, for serializable snapshot isolation.
+A predicate locks represent that a scan has scanned a range of values. They
+are not concerned with physical pages as such, but the logical key values.
+A predicate lock on a page covers the key range that would belong on that
+page, whether or not there are any matching tuples there currently. In other
+words, a predicate lock on an index page covers the "gaps" between the index
+tuples. To minimize false positives, predicate locks are acquired at the
+finest level possible.
+
+* Like in the B-tree index, it is enough to lock only leaf pages, because all
+ insertions happen at the leaf level.
+
+* In an equality search (i.e. not a partial match search), if a key entry has
+ a posting tree, we lock the posting tree root page, to represent a lock on
+ just that key entry. Otherwise, we lock the entry tree page. We also lock
+ the entry tree page if no match is found, to lock the "gap" where the entry
+ would've been, had there been one.
+
+* In a partial match search, we lock all the entry leaf pages that we scan,
+ in addition to locks on posting tree roots, to represent the "gaps" between
+ values.
+
+* In addition to the locks on entry leaf pages and posting tree roots, all
+ scans grab a lock the metapage. This is to interlock with insertions to
+ the fast update pending list. An insertion to the pending list can really
+ belong anywhere in the tree, and the lock on the metapage represents that.
+
+The interlock for fastupdate pending lists means that with fastupdate=on,
+we effectively always grab a full-index lock, so you could get a lot of false
+positives.
+
Compatibility
-------------
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index 828c7074b70..030d0f44183 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -84,6 +84,9 @@ ginFindLeafPage(GinBtree btree, bool searchMode, Snapshot snapshot)
stack->parent = NULL;
stack->predictNumber = 1;
+ if (!searchMode)
+ CheckForSerializableConflictIn(btree->index, NULL, stack->buffer);
+
for (;;)
{
Page page;
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
index 59bf21744f5..aeaf8adab09 100644
--- a/src/backend/access/gin/gindatapage.c
+++ b/src/backend/access/gin/gindatapage.c
@@ -1812,8 +1812,8 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
blkno = BufferGetBlockNumber(buffer);
/*
- * Copy a predicate lock from entry tree leaf (containing posting list) to
- * posting tree.
+ * Copy any predicate locks from the entry tree leaf (containing posting
+ * list) to the posting tree.
*/
PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno);
@@ -1864,7 +1864,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
return blkno;
}
-void
+static void
ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
{
memset(btree, 0, sizeof(GinBtreeData));
@@ -1911,7 +1911,6 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
btree.itemptr = insertdata.items[insertdata.curitem];
stack = ginFindLeafPage(&btree, false, NULL);
- GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
ginInsertValue(&btree, stack, &insertdata, buildStats);
}
}
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index 615730b8e55..5f624cf6fac 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -31,6 +31,7 @@
#include "postmaster/autovacuum.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+#include "storage/predicate.h"
#include "utils/builtins.h"
/* GUC parameter */
@@ -245,6 +246,13 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
metapage = BufferGetPage(metabuffer);
+ /*
+ * An insertion to the pending list could logically belong anywhere in
+ * the tree, so it conflicts with all serializable scans. All scans
+ * acquire a predicate lock on the metabuffer to represent that.
+ */
+ CheckForSerializableConflictIn(index, NULL, metabuffer);
+
if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize)
{
/*
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index f3db7cc6405..ef3cd7dbe2a 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -36,20 +36,6 @@ typedef struct pendingPosition
/*
- * Place predicate lock on GIN page if needed.
- */
-static void
-GinPredicateLockPage(Relation index, BlockNumber blkno, Snapshot snapshot)
-{
- /*
- * When fast update is on then no need in locking pages, because we anyway
- * need to lock the whole index.
- */
- if (!GinGetUseFastUpdate(index))
- PredicateLockPage(index, blkno, snapshot);
-}
-
-/*
* Goes to the next page if current offset is outside of bounds
*/
static bool
@@ -68,7 +54,7 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot
stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
stack->blkno = BufferGetBlockNumber(stack->buffer);
stack->off = FirstOffsetNumber;
- GinPredicateLockPage(btree->index, stack->blkno, snapshot);
+ PredicateLockPage(btree->index, stack->blkno, snapshot);
}
return true;
@@ -100,11 +86,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
*/
for (;;)
{
- /*
- * Predicate lock each leaf page in posting tree
- */
- GinPredicateLockPage(index, BufferGetBlockNumber(buffer), snapshot);
-
page = BufferGetPage(buffer);
if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
{
@@ -158,7 +139,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
* Predicate lock entry leaf page, following pages will be locked by
* moveRightIfItNeeded()
*/
- GinPredicateLockPage(btree->index, stack->buffer, snapshot);
+ PredicateLockPage(btree->index, stack->buffer, snapshot);
for (;;)
{
@@ -253,6 +234,13 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
LockBuffer(stack->buffer, GIN_UNLOCK);
+ /*
+ * Acquire predicate lock on the posting tree. We already hold
+ * a lock on the entry page, but insertions to the posting tree
+ * don't check for conflicts on that level.
+ */
+ PredicateLockPage(btree->index, rootPostingTree, snapshot);
+
/* Collect all the TIDs in this entry's posting tree */
scanPostingTree(btree->index, scanEntry, rootPostingTree,
snapshot);
@@ -400,10 +388,6 @@ restartScanEntry:
{
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
- /* Predicate lock visited entry leaf page */
- GinPredicateLockPage(ginstate->index,
- BufferGetBlockNumber(stackEntry->buffer), snapshot);
-
if (GinIsPostingTree(itup))
{
BlockNumber rootPostingTree = GinGetPostingTree(itup);
@@ -412,6 +396,13 @@ restartScanEntry:
ItemPointerData minItem;
/*
+ * This is an equality scan, so lock the root of the posting tree.
+ * It represents a lock on the exact key value, and covers all the
+ * items in the posting tree.
+ */
+ PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
+
+ /*
* We should unlock entry page before touching posting tree to
* prevent deadlocks with vacuum processes. Because entry is never
* deleted from page and posting tree is never reduced to the
@@ -426,12 +417,6 @@ restartScanEntry:
entry->buffer = stack->buffer;
/*
- * Predicate lock visited posting tree page, following pages will
- * be locked by moveRightIfItNeeded or entryLoadMoreItems
- */
- GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
-
- /*
* We keep buffer pinned because we need to prevent deletion of
* page during scan. See GIN's vacuum implementation. RefCount is
* increased to keep buffer pinned after freeGinBtreeStack() call.
@@ -452,15 +437,38 @@ restartScanEntry:
freeGinBtreeStack(stack);
entry->isFinished = false;
}
- else if (GinGetNPosting(itup) > 0)
+ else
{
- entry->list = ginReadTuple(ginstate, entry->attnum, itup,
- &entry->nlist);
- entry->predictNumberResult = entry->nlist;
+ /*
+ * Lock the entry leaf page. This is more coarse-grained than
+ * necessary, because it will conflict with any insertions that
+ * land on the same leaf page, not only the exacty key we searched
+ * for. But locking an individual tuple would require updating
+ * that lock whenever it moves because of insertions or vacuums,
+ * which seems too complicated.
+ */
+ PredicateLockPage(ginstate->index,
+ BufferGetBlockNumber(stackEntry->buffer),
+ snapshot);
+ if (GinGetNPosting(itup) > 0)
+ {
+ entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+ &entry->nlist);
+ entry->predictNumberResult = entry->nlist;
- entry->isFinished = false;
+ entry->isFinished = false;
+ }
}
}
+ else
+ {
+ /*
+ * No entry found. Predicate lock the leaf page, to lock the place
+ * where the entry would've been, had there been one.
+ */
+ PredicateLockPage(ginstate->index,
+ BufferGetBlockNumber(stackEntry->buffer), snapshot);
+ }
if (needUnlock)
LockBuffer(stackEntry->buffer, GIN_UNLOCK);
@@ -533,7 +541,7 @@ startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
for (i = 0; i < key->nentries - 1; i++)
{
- /* Pass all entries <= i as false, and the rest as MAYBE */
+ /* Pass all entries <= i as FALSE, and the rest as MAYBE */
for (j = 0; j <= i; j++)
key->entryRes[entryIndexes[j]] = GIN_FALSE;
for (j = i + 1; j < key->nentries; j++)
@@ -673,8 +681,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
entry->btree.fullScan = false;
stack = ginFindLeafPage(&entry->btree, true, snapshot);
- GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(stack->buffer), snapshot);
-
/* we don't need the stack, just the buffer. */
entry->buffer = stack->buffer;
IncrBufferRefCount(entry->buffer);
@@ -719,10 +725,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
entry->buffer = ginStepRight(entry->buffer,
ginstate->index,
GIN_SHARE);
-
- GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
-
-
page = BufferGetPage(entry->buffer);
}
stepright = true;
@@ -1084,8 +1086,8 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
* lossy page even when none of the other entries match.
*
* Our strategy is to call the tri-state consistent function, with the
- * lossy-page entries set to MAYBE, and all the other entries false. If it
- * returns false, none of the lossy items alone are enough for a match, so
+ * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
+ * returns FALSE, none of the lossy items alone are enough for a match, so
* we don't need to return a lossy-page pointer. Otherwise, return a
* lossy-page pointer to indicate that the whole heap page must be
* checked. (On subsequent calls, we'll do nothing until minItem is past
@@ -1746,8 +1748,7 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
}
/*
- * Collect all matched rows from pending list into bitmap. Also function
- * takes PendingLockRelation if it's needed.
+ * Collect all matched rows from pending list into bitmap.
*/
static void
scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
@@ -1764,6 +1765,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
*ntids = 0;
+ /*
+ * Acquire predicate lock on the metapage, to conflict with any
+ * fastupdate insertions.
+ */
+ PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
+
LockBuffer(metabuffer, GIN_SHARE);
page = BufferGetPage(metabuffer);
TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
@@ -1777,24 +1784,9 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
{
/* No pending list, so proceed with normal scan */
UnlockReleaseBuffer(metabuffer);
-
- /*
- * If fast update is enabled, we acquire a predicate lock on the
- * entire relation as fast update postpones the insertion of tuples
- * into index structure due to which we can't detect rw conflicts.
- */
- if (GinGetUseFastUpdate(scan->indexRelation))
- PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
-
return;
}
- /*
- * Pending list is not empty, we need to lock the index doesn't despite on
- * fastupdate state
- */
- PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
-
pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
LockBuffer(pos.pendingBuffer, GIN_SHARE);
pos.firstOffset = FirstOffsetNumber;
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index cf218dd75d4..5281eb68238 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -219,7 +219,7 @@ ginEntryInsert(GinState *ginstate,
return;
}
- GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
+ CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer);
/* modify an existing leaf entry */
itup = addItemPointersToLeafTuple(ginstate, itup,
items, nitem, buildStats, stack->buffer);
@@ -228,7 +228,7 @@ ginEntryInsert(GinState *ginstate,
}
else
{
- GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
+ CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer);
/* no match, so construct a new leaf entry */
itup = buildFreshLeafTuple(ginstate, attnum, key, category,
items, nitem, buildStats, stack->buffer);
@@ -517,18 +517,6 @@ gininsert(Relation index, Datum *values, bool *isnull,
memset(&collector, 0, sizeof(GinTupleCollector));
- /*
- * With fastupdate on each scan and each insert begin with access to
- * pending list, so it effectively lock entire index. In this case we
- * aquire predicate lock and check for conflicts over index relation,
- * and hope that it will reduce locking overhead.
- *
- * Do not use GinCheckForSerializableConflictIn() here, because it
- * will do nothing (it does actual work only with fastupdate off).
- * Check for conflicts for entire index.
- */
- CheckForSerializableConflictIn(index, NULL, InvalidBuffer);
-
for (i = 0; i < ginstate->origTupdesc->natts; i++)
ginHeapTupleFastCollect(ginstate, &collector,
(OffsetNumber) (i + 1),
@@ -539,16 +527,6 @@ gininsert(Relation index, Datum *values, bool *isnull,
}
else
{
- GinStatsData stats;
-
- /*
- * Fastupdate is off but if pending list isn't empty then we need to
- * check conflicts with PredicateLockRelation in scanPendingInsert().
- */
- ginGetStats(index, &stats);
- if (stats.nPendingPages > 0)
- CheckForSerializableConflictIn(index, NULL, InvalidBuffer);
-
for (i = 0; i < ginstate->origTupdesc->natts; i++)
ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
values[i], isnull[i],
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 4367523dd98..0a32182dd7f 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -718,10 +718,3 @@ ginUpdateStats(Relation index, const GinStatsData *stats)
END_CRIT_SECTION();
}
-
-void
-GinCheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
-{
- if (!GinGetUseFastUpdate(relation))
- CheckForSerializableConflictIn(relation, tuple, buffer);
-}
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index dd8e31b8721..3104bc12b63 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -166,7 +166,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
START_CRIT_SECTION();
/* Unlink the page by changing left sibling's rightlink */
-
page = BufferGetPage(lBuffer);
GinPageGetOpaque(page)->rightlink = rightlink;