diff options
author | Teodor Sigaev | 2018-05-04 08:27:50 +0000 |
---|---|---|
committer | Teodor Sigaev | 2018-05-04 08:27:50 +0000 |
commit | 0bef1c0678d94436f80450d562a866bb6fa5e509 (patch) | |
tree | fbcc3b758f520971188a8d9f6727e482f23afe02 /src/backend/access/gin | |
parent | 7d8679975f650d1150d706c8b6a5f04f08dcdd00 (diff) |
Re-think predicate locking on GIN indexes.
The principle behind the locking was not very well thought-out, and not
documented. Add a section in the README to explain how it's supposed to
work, and change the code so that it actually works that way.
This fixes two bugs:
1. If fast update was turned on concurrently, subsequent inserts to the
pending list would not conflict with predicate locks that were acquired
earlier, on entry pages. The included 'predicate-gin-fastupdate' test
demonstrates that. To fix, make all scans acquire a predicate lock on
the metapage. That lock represents a scan of the pending list, whether
or not there is a pending list at the moment. Forget about the
optimization to skip locking/checking for locks, when fastupdate=off.
2. If a scan finds no match, it still needs to lock the entry page. The
point of predicate locks is to lock the gabs between values, whether
or not there is a match. The included 'predicate-gin-nomatch' test
tests that case.
In addition to those two bug fixes, this removes some unnecessary locking,
following the principle laid out in the README. Because all items in
a posting tree have the same key value, a lock on the posting tree root is
enough to cover all the items. (With a very large posting tree, it would
possibly be better to lock the posting tree leaf pages instead, so that a
"skip scan" with a query like "A & B", you could avoid unnecessary conflict
if a new tuple is inserted with A but !B. But let's keep this simple.)
Also, some spelling fixes.
Author: Heikki Linnakangas with some editorization by me
Review: Andrey Borodin, Alexander Korotkov
Discussion: https://www.postgresql.org/message-id/[email protected]
Diffstat (limited to 'src/backend/access/gin')
-rw-r--r-- | src/backend/access/gin/README | 34 | ||||
-rw-r--r-- | src/backend/access/gin/ginbtree.c | 3 | ||||
-rw-r--r-- | src/backend/access/gin/gindatapage.c | 7 | ||||
-rw-r--r-- | src/backend/access/gin/ginfast.c | 8 | ||||
-rw-r--r-- | src/backend/access/gin/ginget.c | 116 | ||||
-rw-r--r-- | src/backend/access/gin/gininsert.c | 26 | ||||
-rw-r--r-- | src/backend/access/gin/ginutil.c | 7 | ||||
-rw-r--r-- | src/backend/access/gin/ginvacuum.c | 1 |
8 files changed, 104 insertions, 98 deletions
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index 990b5ffa581..cc434b1feb7 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -331,6 +331,40 @@ page-deletions safe; it stamps the deleted pages with an XID and keeps the deleted pages around with the right-link intact until all concurrent scans have finished.) +Predicate Locking +----------------- + +GIN supports predicate locking, for serializable snapshot isolation. +A predicate locks represent that a scan has scanned a range of values. They +are not concerned with physical pages as such, but the logical key values. +A predicate lock on a page covers the key range that would belong on that +page, whether or not there are any matching tuples there currently. In other +words, a predicate lock on an index page covers the "gaps" between the index +tuples. To minimize false positives, predicate locks are acquired at the +finest level possible. + +* Like in the B-tree index, it is enough to lock only leaf pages, because all + insertions happen at the leaf level. + +* In an equality search (i.e. not a partial match search), if a key entry has + a posting tree, we lock the posting tree root page, to represent a lock on + just that key entry. Otherwise, we lock the entry tree page. We also lock + the entry tree page if no match is found, to lock the "gap" where the entry + would've been, had there been one. + +* In a partial match search, we lock all the entry leaf pages that we scan, + in addition to locks on posting tree roots, to represent the "gaps" between + values. + +* In addition to the locks on entry leaf pages and posting tree roots, all + scans grab a lock the metapage. This is to interlock with insertions to + the fast update pending list. An insertion to the pending list can really + belong anywhere in the tree, and the lock on the metapage represents that. + +The interlock for fastupdate pending lists means that with fastupdate=on, +we effectively always grab a full-index lock, so you could get a lot of false +positives. + Compatibility ------------- diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 828c7074b70..030d0f44183 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -84,6 +84,9 @@ ginFindLeafPage(GinBtree btree, bool searchMode, Snapshot snapshot) stack->parent = NULL; stack->predictNumber = 1; + if (!searchMode) + CheckForSerializableConflictIn(btree->index, NULL, stack->buffer); + for (;;) { Page page; diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 59bf21744f5..aeaf8adab09 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -1812,8 +1812,8 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, blkno = BufferGetBlockNumber(buffer); /* - * Copy a predicate lock from entry tree leaf (containing posting list) to - * posting tree. + * Copy any predicate locks from the entry tree leaf (containing posting + * list) to the posting tree. */ PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno); @@ -1864,7 +1864,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, return blkno; } -void +static void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) { memset(btree, 0, sizeof(GinBtreeData)); @@ -1911,7 +1911,6 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno, btree.itemptr = insertdata.items[insertdata.curitem]; stack = ginFindLeafPage(&btree, false, NULL); - GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer); ginInsertValue(&btree, stack, &insertdata, buildStats); } } diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 615730b8e55..5f624cf6fac 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -31,6 +31,7 @@ #include "postmaster/autovacuum.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "storage/predicate.h" #include "utils/builtins.h" /* GUC parameter */ @@ -245,6 +246,13 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); + /* + * An insertion to the pending list could logically belong anywhere in + * the tree, so it conflicts with all serializable scans. All scans + * acquire a predicate lock on the metabuffer to represent that. + */ + CheckForSerializableConflictIn(index, NULL, metabuffer); + if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) { /* diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index f3db7cc6405..ef3cd7dbe2a 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -36,20 +36,6 @@ typedef struct pendingPosition /* - * Place predicate lock on GIN page if needed. - */ -static void -GinPredicateLockPage(Relation index, BlockNumber blkno, Snapshot snapshot) -{ - /* - * When fast update is on then no need in locking pages, because we anyway - * need to lock the whole index. - */ - if (!GinGetUseFastUpdate(index)) - PredicateLockPage(index, blkno, snapshot); -} - -/* * Goes to the next page if current offset is outside of bounds */ static bool @@ -68,7 +54,7 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE); stack->blkno = BufferGetBlockNumber(stack->buffer); stack->off = FirstOffsetNumber; - GinPredicateLockPage(btree->index, stack->blkno, snapshot); + PredicateLockPage(btree->index, stack->blkno, snapshot); } return true; @@ -100,11 +86,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, */ for (;;) { - /* - * Predicate lock each leaf page in posting tree - */ - GinPredicateLockPage(index, BufferGetBlockNumber(buffer), snapshot); - page = BufferGetPage(buffer); if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0) { @@ -158,7 +139,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, * Predicate lock entry leaf page, following pages will be locked by * moveRightIfItNeeded() */ - GinPredicateLockPage(btree->index, stack->buffer, snapshot); + PredicateLockPage(btree->index, stack->buffer, snapshot); for (;;) { @@ -253,6 +234,13 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, LockBuffer(stack->buffer, GIN_UNLOCK); + /* + * Acquire predicate lock on the posting tree. We already hold + * a lock on the entry page, but insertions to the posting tree + * don't check for conflicts on that level. + */ + PredicateLockPage(btree->index, rootPostingTree, snapshot); + /* Collect all the TIDs in this entry's posting tree */ scanPostingTree(btree->index, scanEntry, rootPostingTree, snapshot); @@ -400,10 +388,6 @@ restartScanEntry: { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); - /* Predicate lock visited entry leaf page */ - GinPredicateLockPage(ginstate->index, - BufferGetBlockNumber(stackEntry->buffer), snapshot); - if (GinIsPostingTree(itup)) { BlockNumber rootPostingTree = GinGetPostingTree(itup); @@ -412,6 +396,13 @@ restartScanEntry: ItemPointerData minItem; /* + * This is an equality scan, so lock the root of the posting tree. + * It represents a lock on the exact key value, and covers all the + * items in the posting tree. + */ + PredicateLockPage(ginstate->index, rootPostingTree, snapshot); + + /* * We should unlock entry page before touching posting tree to * prevent deadlocks with vacuum processes. Because entry is never * deleted from page and posting tree is never reduced to the @@ -426,12 +417,6 @@ restartScanEntry: entry->buffer = stack->buffer; /* - * Predicate lock visited posting tree page, following pages will - * be locked by moveRightIfItNeeded or entryLoadMoreItems - */ - GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot); - - /* * We keep buffer pinned because we need to prevent deletion of * page during scan. See GIN's vacuum implementation. RefCount is * increased to keep buffer pinned after freeGinBtreeStack() call. @@ -452,15 +437,38 @@ restartScanEntry: freeGinBtreeStack(stack); entry->isFinished = false; } - else if (GinGetNPosting(itup) > 0) + else { - entry->list = ginReadTuple(ginstate, entry->attnum, itup, - &entry->nlist); - entry->predictNumberResult = entry->nlist; + /* + * Lock the entry leaf page. This is more coarse-grained than + * necessary, because it will conflict with any insertions that + * land on the same leaf page, not only the exacty key we searched + * for. But locking an individual tuple would require updating + * that lock whenever it moves because of insertions or vacuums, + * which seems too complicated. + */ + PredicateLockPage(ginstate->index, + BufferGetBlockNumber(stackEntry->buffer), + snapshot); + if (GinGetNPosting(itup) > 0) + { + entry->list = ginReadTuple(ginstate, entry->attnum, itup, + &entry->nlist); + entry->predictNumberResult = entry->nlist; - entry->isFinished = false; + entry->isFinished = false; + } } } + else + { + /* + * No entry found. Predicate lock the leaf page, to lock the place + * where the entry would've been, had there been one. + */ + PredicateLockPage(ginstate->index, + BufferGetBlockNumber(stackEntry->buffer), snapshot); + } if (needUnlock) LockBuffer(stackEntry->buffer, GIN_UNLOCK); @@ -533,7 +541,7 @@ startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key) for (i = 0; i < key->nentries - 1; i++) { - /* Pass all entries <= i as false, and the rest as MAYBE */ + /* Pass all entries <= i as FALSE, and the rest as MAYBE */ for (j = 0; j <= i; j++) key->entryRes[entryIndexes[j]] = GIN_FALSE; for (j = i + 1; j < key->nentries; j++) @@ -673,8 +681,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, entry->btree.fullScan = false; stack = ginFindLeafPage(&entry->btree, true, snapshot); - GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(stack->buffer), snapshot); - /* we don't need the stack, just the buffer. */ entry->buffer = stack->buffer; IncrBufferRefCount(entry->buffer); @@ -719,10 +725,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, entry->buffer = ginStepRight(entry->buffer, ginstate->index, GIN_SHARE); - - GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot); - - page = BufferGetPage(entry->buffer); } stepright = true; @@ -1084,8 +1086,8 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, * lossy page even when none of the other entries match. * * Our strategy is to call the tri-state consistent function, with the - * lossy-page entries set to MAYBE, and all the other entries false. If it - * returns false, none of the lossy items alone are enough for a match, so + * lossy-page entries set to MAYBE, and all the other entries FALSE. If it + * returns FALSE, none of the lossy items alone are enough for a match, so * we don't need to return a lossy-page pointer. Otherwise, return a * lossy-page pointer to indicate that the whole heap page must be * checked. (On subsequent calls, we'll do nothing until minItem is past @@ -1746,8 +1748,7 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) } /* - * Collect all matched rows from pending list into bitmap. Also function - * takes PendingLockRelation if it's needed. + * Collect all matched rows from pending list into bitmap. */ static void scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) @@ -1764,6 +1765,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) *ntids = 0; + /* + * Acquire predicate lock on the metapage, to conflict with any + * fastupdate insertions. + */ + PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot); + LockBuffer(metabuffer, GIN_SHARE); page = BufferGetPage(metabuffer); TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); @@ -1777,24 +1784,9 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) { /* No pending list, so proceed with normal scan */ UnlockReleaseBuffer(metabuffer); - - /* - * If fast update is enabled, we acquire a predicate lock on the - * entire relation as fast update postpones the insertion of tuples - * into index structure due to which we can't detect rw conflicts. - */ - if (GinGetUseFastUpdate(scan->indexRelation)) - PredicateLockRelation(scan->indexRelation, scan->xs_snapshot); - return; } - /* - * Pending list is not empty, we need to lock the index doesn't despite on - * fastupdate state - */ - PredicateLockRelation(scan->indexRelation, scan->xs_snapshot); - pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno); LockBuffer(pos.pendingBuffer, GIN_SHARE); pos.firstOffset = FirstOffsetNumber; diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index cf218dd75d4..5281eb68238 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -219,7 +219,7 @@ ginEntryInsert(GinState *ginstate, return; } - GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer); + CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer); /* modify an existing leaf entry */ itup = addItemPointersToLeafTuple(ginstate, itup, items, nitem, buildStats, stack->buffer); @@ -228,7 +228,7 @@ ginEntryInsert(GinState *ginstate, } else { - GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer); + CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer); /* no match, so construct a new leaf entry */ itup = buildFreshLeafTuple(ginstate, attnum, key, category, items, nitem, buildStats, stack->buffer); @@ -517,18 +517,6 @@ gininsert(Relation index, Datum *values, bool *isnull, memset(&collector, 0, sizeof(GinTupleCollector)); - /* - * With fastupdate on each scan and each insert begin with access to - * pending list, so it effectively lock entire index. In this case we - * aquire predicate lock and check for conflicts over index relation, - * and hope that it will reduce locking overhead. - * - * Do not use GinCheckForSerializableConflictIn() here, because it - * will do nothing (it does actual work only with fastupdate off). - * Check for conflicts for entire index. - */ - CheckForSerializableConflictIn(index, NULL, InvalidBuffer); - for (i = 0; i < ginstate->origTupdesc->natts; i++) ginHeapTupleFastCollect(ginstate, &collector, (OffsetNumber) (i + 1), @@ -539,16 +527,6 @@ gininsert(Relation index, Datum *values, bool *isnull, } else { - GinStatsData stats; - - /* - * Fastupdate is off but if pending list isn't empty then we need to - * check conflicts with PredicateLockRelation in scanPendingInsert(). - */ - ginGetStats(index, &stats); - if (stats.nPendingPages > 0) - CheckForSerializableConflictIn(index, NULL, InvalidBuffer); - for (i = 0; i < ginstate->origTupdesc->natts; i++) ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1), values[i], isnull[i], diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 4367523dd98..0a32182dd7f 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -718,10 +718,3 @@ ginUpdateStats(Relation index, const GinStatsData *stats) END_CRIT_SECTION(); } - -void -GinCheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer) -{ - if (!GinGetUseFastUpdate(relation)) - CheckForSerializableConflictIn(relation, tuple, buffer); -} diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index dd8e31b8721..3104bc12b63 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -166,7 +166,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn START_CRIT_SECTION(); /* Unlink the page by changing left sibling's rightlink */ - page = BufferGetPage(lBuffer); GinPageGetOpaque(page)->rightlink = rightlink; |