-rw-r--r--  src/backend/access/nbtree/nbtree.c      75
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c   715
-rw-r--r--  src/backend/access/nbtree/nbtutils.c    2
-rw-r--r--  src/include/access/nbtree.h             62
4 files changed, 388 insertions, 466 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 9b968aa0f29..f4f79f27062 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -66,7 +66,9 @@ typedef enum
*/
typedef struct BTParallelScanDescData
{
- BlockNumber btps_scanPage; /* latest or next page to be scanned */
+ BlockNumber btps_nextScanPage; /* next page to be scanned */
+ BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
+ * btps_nextScanPage */
BTPS_State btps_pageStatus; /* indicates whether next page is
* available for scan. see above for
* possible states of parallel scan. */
@@ -550,7 +552,8 @@ btinitparallelscan(void *target)
BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
SpinLockInit(&bt_target->btps_mutex);
- bt_target->btps_scanPage = InvalidBlockNumber;
+ bt_target->btps_nextScanPage = InvalidBlockNumber;
+ bt_target->btps_lastCurrPage = InvalidBlockNumber;
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
ConditionVariableInit(&bt_target->btps_cv);
}
@@ -575,7 +578,8 @@ btparallelrescan(IndexScanDesc scan)
* consistency.
*/
SpinLockAcquire(&btscan->btps_mutex);
- btscan->btps_scanPage = InvalidBlockNumber;
+ btscan->btps_nextScanPage = InvalidBlockNumber;
+ btscan->btps_lastCurrPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
SpinLockRelease(&btscan->btps_mutex);
}
@@ -591,18 +595,20 @@ btparallelrescan(IndexScanDesc scan)
* start just yet (only backends that call from _bt_first are capable of
* starting primitive index scans, which they indicate by passing first=true).
*
- * If the return value is true, *pageno returns the next or current page
- * of the scan (depending on the scan direction). An invalid block number
- * means the scan hasn't yet started, or that caller needs to start the next
- * primitive index scan (if it's the latter case we'll set so.needPrimScan).
- * The first time a participating process reaches the last page, it will return
- * true and set *pageno to P_NONE; after that, further attempts to seize the
- * scan will return false.
+ * If the return value is true, *next_scan_page returns the next page of the
+ * scan, and *last_curr_page returns the page that *next_scan_page came from.
+ * An invalid *next_scan_page means the scan hasn't yet started, or that
+ * caller needs to start the next primitive index scan (if it's the latter
+ * case we'll set so.needPrimScan). The first time a participating process
+ * reaches the last page, it will return true and set *next_scan_page to
+ * P_NONE; after that, further attempts to seize the scan will return false.
*
- * Callers should ignore the value of pageno if the return value is false.
+ * Callers should ignore the value of *next_scan_page and *last_curr_page if
+ * the return value is false.
*/
bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
+ BlockNumber *last_curr_page, bool first)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
bool exit_loop = false;
@@ -610,7 +616,17 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
- *pageno = P_NONE;
+ *next_scan_page = P_NONE;
+ *last_curr_page = InvalidBlockNumber;
+
+ /*
+ * Reset so->currPos, and initialize moreLeft/moreRight such that the next
+ * call to _bt_readnextpage treats this backend similarly to a serial
+ * backend that steps from *last_curr_page to *next_scan_page (unless this
+ * backend's so->currPos is initialized by _bt_readfirstpage before then).
+ */
+ BTScanPosInvalidate(so->currPos);
+ so->currPos.moreLeft = so->currPos.moreRight = true;
if (first)
{
@@ -660,7 +676,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
array->cur_elem = btscan->btps_arrElems[i];
skey->sk_argument = array->elem_values[array->cur_elem];
}
- *pageno = InvalidBlockNumber;
+ *next_scan_page = InvalidBlockNumber;
exit_loop = true;
}
else
@@ -688,7 +704,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
* of advancing it to a new page!
*/
btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
- *pageno = btscan->btps_scanPage;
+ *next_scan_page = btscan->btps_nextScanPage;
+ *last_curr_page = btscan->btps_lastCurrPage;
exit_loop = true;
}
SpinLockRelease(&btscan->btps_mutex);
@@ -703,17 +720,21 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
/*
* _bt_parallel_release() -- Complete the process of advancing the scan to a
- * new page. We now have the new value btps_scanPage; some other backend
+ * new page. We now have the new value btps_nextScanPage; another backend
* can now begin advancing the scan.
*
- * Callers whose scan uses array keys must save their scan_page argument so
+ * Callers whose scan uses array keys must save their curr_page argument so
* that it can be passed to _bt_parallel_primscan_schedule, should caller
- * determine that another primitive index scan is required. If that happens,
- * scan_page won't be scanned by any backend (unless the next primitive index
- * scan lands on scan_page).
+ * determine that another primitive index scan is required.
+ *
+ * Note: unlike the serial case, parallel scans don't need to remember both
+ * sibling links. next_scan_page is whichever link is next given the scan's
+ * direction. That's all we'll ever need, since the direction of a parallel
+ * scan can never change.
*/
void
-_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
+ BlockNumber curr_page)
{
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
@@ -722,7 +743,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
parallel_scan->ps_offset);
SpinLockAcquire(&btscan->btps_mutex);
- btscan->btps_scanPage = scan_page;
+ btscan->btps_nextScanPage = next_scan_page;
+ btscan->btps_lastCurrPage = curr_page;
btscan->btps_pageStatus = BTPARALLEL_IDLE;
SpinLockRelease(&btscan->btps_mutex);
ConditionVariableSignal(&btscan->btps_cv);
@@ -778,13 +800,13 @@ _bt_parallel_done(IndexScanDesc scan)
/*
* _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
*
- * Caller passes the block number most recently passed to _bt_parallel_release
+ * Caller passes the curr_page most recently passed to _bt_parallel_release
* by its backend. Caller successfully schedules the next primitive index scan
* if the shared parallel state hasn't been seized since caller's backend last
* advanced the scan.
*/
void
-_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -796,10 +818,11 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
parallel_scan->ps_offset);
SpinLockAcquire(&btscan->btps_mutex);
- if (btscan->btps_scanPage == prev_scan_page &&
+ if (btscan->btps_lastCurrPage == curr_page &&
btscan->btps_pageStatus == BTPARALLEL_IDLE)
{
- btscan->btps_scanPage = InvalidBlockNumber;
+ btscan->btps_nextScanPage = InvalidBlockNumber;
+ btscan->btps_lastCurrPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
/* Serialize scan's current array keys */
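The nbtree.c hunks above all serve the seize/advance/release handshake that lets exactly one parallel worker advance the scan at a time while the others wait on the condition variable. Below is a minimal, self-contained sketch of that pattern, not the real PostgreSQL code: a pthread mutex and condition variable stand in for the spinlock and ConditionVariable in BTParallelScanDescData, only the plain success path of _bt_parallel_seize is modelled (no NEED_PRIMSCAN or first-caller handling), and every *_sketch name is hypothetical.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t BlockNumber;

typedef enum { SCAN_IDLE, SCAN_ADVANCING, SCAN_DONE } ScanStateSketch;

/* Shared state: at most one worker may be advancing the scan. */
typedef struct SharedScanSketch
{
	pthread_mutex_t lock;
	pthread_cond_t	cv;
	ScanStateSketch state;
	BlockNumber		next_scan_page; /* like btps_nextScanPage */
	BlockNumber		last_curr_page; /* like btps_lastCurrPage */
} SharedScanSketch;

/*
 * Seize: wait until no other worker is advancing, then claim the right to
 * read next_scan_page (mirrors the plain success path of _bt_parallel_seize).
 */
static bool
scan_seize_sketch(SharedScanSketch *s, BlockNumber *next, BlockNumber *lastcurr)
{
	bool	ok = false;

	pthread_mutex_lock(&s->lock);
	while (s->state == SCAN_ADVANCING)
		pthread_cond_wait(&s->cv, &s->lock);
	if (s->state != SCAN_DONE)
	{
		s->state = SCAN_ADVANCING;
		*next = s->next_scan_page;
		*lastcurr = s->last_curr_page;
		ok = true;
	}
	pthread_mutex_unlock(&s->lock);
	return ok;
}

/*
 * Release: publish the sibling link of the page just read, plus the page it
 * was read from, so the next worker can proceed without waiting for this
 * worker to finish returning tuples (mirrors _bt_parallel_release).
 */
static void
scan_release_sketch(SharedScanSketch *s, BlockNumber next, BlockNumber curr)
{
	pthread_mutex_lock(&s->lock);
	s->next_scan_page = next;
	s->last_curr_page = curr;
	s->state = SCAN_IDLE;
	pthread_cond_signal(&s->cv);
	pthread_mutex_unlock(&s->lock);
}

Publishing both block numbers on release is the core of the patch: keeping the page the sibling link was copied from (last_curr_page) is what later lets a backend prove, when scheduling another primitive index scan, that nobody has advanced the scan past its position in the meantime.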
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index a662665b55f..2275553be78 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -43,12 +43,13 @@ static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum,
ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
-static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
-static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
- ScanDirection dir);
-static Buffer _bt_walk_left(Relation rel, Buffer buf);
+static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
+ ScanDirection dir);
+static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
+ BlockNumber lastcurrblkno, ScanDirection dir);
+static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
+ BlockNumber lastcurrblkno);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
-static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
/*
@@ -880,7 +881,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
{
Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- Buffer buf;
BTStack stack;
OffsetNumber offnum;
StrategyNumber strat;
@@ -889,10 +889,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ScanKeyData notnullkeys[INDEX_MAX_KEYS];
int keysz = 0;
int i;
- bool status;
StrategyNumber strat_total;
BTScanPosItem *currItem;
- BlockNumber blkno;
Assert(!BTScanPosIsValid(so->currPos));
@@ -924,7 +922,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
*/
if (scan->parallel_scan != NULL)
{
- status = _bt_parallel_seize(scan, &blkno, true);
+ BlockNumber blkno,
+ lastcurrblkno;
+ bool status;
+
+ status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, true);
/*
* Initialize arrays (when _bt_parallel_seize didn't already set up
@@ -942,7 +944,13 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
else if (blkno != InvalidBlockNumber)
{
- if (!_bt_parallel_readpage(scan, blkno, dir))
+ Assert(!so->needPrimScan);
+
+ /*
+ * We anticipated starting another primitive scan, but some other
+ * worker beat us to it
+ */
+ if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir))
return false;
goto readcomplete;
}
@@ -1392,12 +1400,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* position ourselves on the target leaf page.
*/
Assert(ScanDirectionIsBackward(dir) == inskey.backward);
- stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
+ stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
/* don't need to keep the stack around... */
_bt_freestack(stack);
- if (!BufferIsValid(buf))
+ if (!BufferIsValid(so->currPos.buf))
{
/*
* We only get here if the index is completely empty. Lock relation
@@ -1411,11 +1419,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
if (IsolationIsSerializable())
{
PredicateLockRelation(rel, scan->xs_snapshot);
- stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
+ stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
_bt_freestack(stack);
}
- if (!BufferIsValid(buf))
+ if (!BufferIsValid(so->currPos.buf))
{
/*
* Mark parallel scan as done, so that all the workers can finish
@@ -1427,17 +1435,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
}
- PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
-
- _bt_initialize_more_data(so, dir);
-
/* position to the precise item on the page */
- offnum = _bt_binsrch(rel, &inskey, buf);
- Assert(!BTScanPosIsValid(so->currPos));
- so->currPos.buf = buf;
+ offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
/*
- * Now load data from the first page of the scan.
+ * Now load data from the first page of the scan (usually the page
+ * currently in so->currPos.buf).
*
* If inskey.nextkey = false and inskey.backward = false, offnum is
* positioned at the first non-pivot tuple >= inskey.scankeys.
@@ -1455,24 +1458,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* for the page. For example, when inskey is both < the leaf page's high
* key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
*/
- if (!_bt_readpage(scan, dir, offnum, true))
- {
- /*
- * There's no actually-matching data on this page. Try to advance to
- * the next page. Return false if there's no matching data at all.
- */
- _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
- if (!_bt_steppage(scan, dir))
- return false;
- }
- else
- {
- /* We have at least one item to return as scan's first item */
- _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
- }
+ if (!_bt_readfirstpage(scan, offnum, dir))
+ return false;
readcomplete:
/* OK, itemIndex says what to return */
+ Assert(BTScanPosIsValid(so->currPos));
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_heaptid = currItem->heapTid;
if (scan->xs_want_itup)
@@ -1492,8 +1483,9 @@ readcomplete:
* heap tuple, and if requested, scan->xs_itup points to a copy of the
* index tuple. so->currPos is updated as needed.
*
- * On failure exit (no more tuples), we release pin and set
- * so->currPos.buf to InvalidBuffer.
+ * On failure exit (no more tuples), we invalidate so->currPos. It'll
+ * still be possible for the scan to return tuples by changing direction,
+ * though we'll need to call _bt_first anew in that other direction.
*/
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
@@ -1523,6 +1515,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
}
/* OK, itemIndex says what to return */
+ Assert(BTScanPosIsValid(so->currPos));
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_heaptid = currItem->heapTid;
if (scan->xs_want_itup)
@@ -1545,14 +1538,6 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
* that there can be no more matching tuples in the current scan direction
* (could just be for the current primitive index scan when scan has arrays).
*
- * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
- * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
- * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
- * its scan key was marked required), so _bt_first _must_ pass us an offnum
- * exactly at the beginning of where equal tuples are to be found. When we're
- * passed an offnum past the end of the page, we might still manage to stop
- * the scan on this page by calling _bt_checkkeys against the high key.
- *
* In the case of a parallel scan, caller must have called _bt_parallel_seize
* prior to calling this function; this function will invoke
* _bt_parallel_release before returning.
@@ -1563,6 +1548,7 @@ static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
bool firstPage)
{
+ Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
@@ -1573,27 +1559,36 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
int itemIndex,
indnatts;
- /*
- * We must have the buffer pinned and locked, but the usual macro can't be
- * used here; this function is what makes it good for currPos.
- */
- Assert(BufferIsValid(so->currPos.buf));
-
+ /* save the page/buffer block number, along with its sibling links */
page = BufferGetPage(so->currPos.buf);
opaque = BTPageGetOpaque(page);
+ so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
+ so->currPos.prevPage = opaque->btpo_prev;
+ so->currPos.nextPage = opaque->btpo_next;
+
+ Assert(!P_IGNORE(opaque));
+ Assert(BTScanPosIsPinned(so->currPos));
- /* allow next page be processed by parallel worker */
if (scan->parallel_scan)
{
+ /* allow next/prev page to be read by another worker without delay */
if (ScanDirectionIsForward(dir))
- pstate.prev_scan_page = opaque->btpo_next;
+ _bt_parallel_release(scan, so->currPos.nextPage,
+ so->currPos.currPage);
else
- pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
-
- _bt_parallel_release(scan, pstate.prev_scan_page);
+ _bt_parallel_release(scan, so->currPos.prevPage,
+ so->currPos.currPage);
}
- indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+ /* initialize remaining currPos fields (before moreLeft/moreRight) */
+ so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+ so->currPos.dir = dir;
+ so->currPos.nextTupleOffset = 0;
+
+ PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
+
+ /* initialize local variables */
+ indnatts = IndexRelationGetNumberOfAttributes(rel);
arrayKeys = so->numArrayKeys != 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
@@ -1613,35 +1608,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
pstate.targetdistance = 0;
/*
- * We note the buffer's block number so that we can release the pin later.
- * This allows us to re-read the buffer if it is needed again for hinting.
- */
- so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
-
- /*
- * We save the LSN of the page as we read it, so that we know whether it
- * safe to apply LP_DEAD hints to the page later. This allows us to drop
- * the pin for MVCC scans, which allows vacuum to avoid blocking.
- */
- so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
-
- /*
- * we must save the page's right-link while scanning it; this tells us
- * where to step right to after we're done with these items. There is no
- * corresponding need for the left-link, since splits always go right.
- */
- so->currPos.nextPage = opaque->btpo_next;
-
- /* initialize tuple workspace to empty */
- so->currPos.nextTupleOffset = 0;
-
- /*
- * Now that the current page has been made consistent, the macro should be
- * good.
- */
- Assert(BTScanPosIsPinned(so->currPos));
-
- /*
* Prechecking the value of the continuescan flag for the last item on the
* page (for backwards scan it will be the first item on a page). If we
* observe it to be true, then it should be true for all other items. This
@@ -1829,7 +1795,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
int truncatt;
- truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
+ truncatt = BTreeTupleGetNAtts(itup, rel);
pstate.prechecked = false; /* precheck didn't cover HIKEY */
_bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
}
@@ -1959,7 +1925,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* We don't need to visit page to the left when no more matches will
* be found there
*/
- if (!pstate.continuescan || P_LEFTMOST(opaque))
+ if (!pstate.continuescan)
so->currPos.moreLeft = false;
Assert(itemIndex >= 0);
@@ -2060,20 +2026,25 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
/*
* _bt_steppage() -- Step to next page containing valid data for scan
*
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
- * if pinned, we'll drop the pin before moving to next page. The buffer is
- * not locked on entry.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
+ * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
+ * the pin eagerly earlier on. The scan must have so->currPos.currPage set to
+ * a valid block, in any case.
*
- * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
- * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
- * to InvalidBuffer. We return true to indicate success.
+ * This is a wrapper on _bt_readnextpage that performs final steps for the
+ * current page. It sets up the _bt_readnextpage call using either local
+ * state saved in so->currPos by the most recent _bt_readpage call, or using
+ * shared parallel scan state (obtained by seizing the parallel scan here).
+ *
+ * Parallel scan callers that have already seized the scan should directly
+ * call _bt_readnextpage, rather than calling here.
*/
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- BlockNumber blkno = InvalidBlockNumber;
- bool status;
+ BlockNumber blkno,
+ lastcurrblkno;
Assert(BTScanPosIsValid(so->currPos));
@@ -2115,7 +2086,6 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
* In effect, btrestpos leaves advancing the arrays up to the first
* _bt_readpage call (that takes place after it has restored markPos).
*/
- Assert(so->markPos.dir == dir);
if (so->needPrimScan)
{
if (ScanDirectionIsForward(dir))
@@ -2123,319 +2093,288 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
else
so->markPos.moreLeft = true;
}
+
+ /* mark/restore not supported by parallel scans */
+ Assert(!scan->parallel_scan);
}
- if (ScanDirectionIsForward(dir))
+ BTScanPosUnpinIfPinned(so->currPos);
+
+ /* Walk to the next page with data */
+ if (!scan->parallel_scan)
{
- /* Walk right to the next page with data */
- if (scan->parallel_scan != NULL)
- {
- /*
- * Seize the scan to get the next block number; if the scan has
- * ended already, bail out.
- */
- status = _bt_parallel_seize(scan, &blkno, false);
- if (!status)
- {
- /* release the previous buffer, if pinned */
- BTScanPosUnpinIfPinned(so->currPos);
- BTScanPosInvalidate(so->currPos);
- return false;
- }
- }
- else
- {
- /* Not parallel, so use the previously-saved nextPage link. */
+ /* Not parallel, so use local state set by the last _bt_readpage */
+ if (ScanDirectionIsForward(dir))
blkno = so->currPos.nextPage;
- }
+ else
+ blkno = so->currPos.prevPage;
+ lastcurrblkno = so->currPos.currPage;
+ }
+ else
+ {
+ /*
+ * Seize the scan to get the nextPage and currPage from shared
+ * parallel state (saved from parallel scan's last _bt_readpage)
+ */
+ if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
+ return false;
+ }
- /* Remember we left a page with data */
- so->currPos.moreLeft = true;
+ return _bt_readnextpage(scan, blkno, lastcurrblkno, dir);
+}
+
+/*
+ * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
+ *
+ * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
+ * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
+ * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
+ * its scan key was marked required), so _bt_first _must_ pass us an offnum
+ * exactly at the beginning of where equal tuples are to be found. When we're
+ * passed an offnum past the end of the page, we might still manage to stop
+ * the scan on this page by calling _bt_checkkeys against the high key. See
+ * _bt_readpage for full details.
+ *
+ * On entry, so->currPos must be pinned and locked (so offnum stays valid).
+ * Parallel scan callers must have seized the scan before calling here.
+ *
+ * On exit, we'll have updated so->currPos and retained locks and pins
+ * according to the same rules as those laid out for _bt_readnextpage exit.
+ * Like _bt_readnextpage, our return value indicates if there are any matching
+ * records in the given direction.
+ *
+ * We always release the scan for a parallel scan caller, regardless of
+ * success or failure; we'll call _bt_parallel_release as soon as possible.
+ */
+static bool
+_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
- /* release the previous buffer, if pinned */
- BTScanPosUnpinIfPinned(so->currPos);
+ so->numKilled = 0; /* just paranoia */
+ so->markItemIndex = -1; /* ditto */
+
+ /* Initialize so->currPos (moreLeft/moreRight) for the first page */
+ if (so->needPrimScan)
+ {
+ Assert(so->numArrayKeys);
+
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = true;
+ so->needPrimScan = false;
}
- else
+ else if (ScanDirectionIsForward(dir))
{
- /* Remember we left a page with data */
+ so->currPos.moreLeft = false;
so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
- if (scan->parallel_scan != NULL)
- {
- /*
- * Seize the scan to get the current block number; if the scan has
- * ended already, bail out.
- */
- status = _bt_parallel_seize(scan, &blkno, false);
- BTScanPosUnpinIfPinned(so->currPos);
- if (!status)
- {
- BTScanPosInvalidate(so->currPos);
- return false;
- }
- }
- else
- {
- /* Not parallel, so just use our own notion of the current page */
- blkno = so->currPos.currPage;
- }
+ /*
+ * Attempt to load matching tuples from the page in so->currPos.buf.
+ *
+ * Note that _bt_readpage will finish initializing the so->currPos fields.
+ * _bt_readpage also releases the parallel scan (even when it returns false).
+ */
+ if (_bt_readpage(scan, dir, offnum, true))
+ {
+ /*
+ * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
+ * so->currPos.buf in preparation for btgettuple returning tuples.
+ */
+ Assert(BTScanPosIsPinned(so->currPos));
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ return true;
}
- if (!_bt_readnextpage(scan, blkno, dir))
- return false;
+ /* There's no actually-matching data on the page in so->currPos.buf */
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
- /* We have at least one item to return as scan's next item */
- _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ /* Call _bt_readnextpage using its _bt_steppage wrapper function */
+ if (!_bt_steppage(scan, dir))
+ return false;
+ /* _bt_readpage for a later page (now in so->currPos) succeeded */
return true;
}
/*
- * _bt_readnextpage() -- Read next page containing valid data for scan
+ * _bt_readnextpage() -- Read next page containing valid data for _bt_next
+ *
+ * Caller's blkno is the next interesting page's link, taken from either the
+ * previously-saved right link or left link. lastcurrblkno is the page that
+ * was current at the point where the blkno link was saved, which we use to
+ * reason about concurrent page splits/page deletions during backwards scans
+ * (_bt_parallel_seize also requires it, regardless of scan direction).
+ *
+ * On entry, caller shouldn't hold any locks or pins on any page (we work
+ * directly off of blkno and lastcurrblkno instead). Parallel scan callers
+ * must have seized the scan before calling here (blkno and lastcurrblkno
+ * arguments should come from the seized scan).
*
* On success exit, so->currPos is updated to contain data from the next
- * interesting page, and we return true. Caller must release the lock (and
- * maybe the pin) on the buffer on success exit.
+ * interesting page, and we return true (parallel scan callers should not use
+ * so->currPos to determine which page to scan next, though). We hold a pin
+ * on the buffer on success exit, except when _bt_drop_lock_and_maybe_pin
+ * decided it was safe to eagerly drop the pin (to avoid blocking VACUUM).
*
* If there are no more matching records in the given direction, we drop all
- * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
+ * locks and pins, invalidate so->currPos, and return false.
+ *
+ * We always release the scan for a parallel scan caller, regardless of
+ * success or failure; we'll call _bt_parallel_release as soon as possible.
*/
static bool
-_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
+ BlockNumber lastcurrblkno, ScanDirection dir)
{
+ Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- Relation rel;
Page page;
BTPageOpaque opaque;
- bool status;
- rel = scan->indexRelation;
+ Assert(so->currPos.currPage == lastcurrblkno || scan->parallel_scan != NULL);
+ Assert(!BTScanPosIsPinned(so->currPos));
+ /*
+ * Remember that the scan already read lastcurrblkno, a page to the left
+ * of blkno (or remember reading a page to the right, for backwards scans)
+ */
if (ScanDirectionIsForward(dir))
- {
- for (;;)
- {
- /*
- * if we're at end of scan, give up and mark parallel scan as
- * done, so that all the workers can finish their scan
- */
- if (blkno == P_NONE || !so->currPos.moreRight)
- {
- _bt_parallel_done(scan);
- BTScanPosInvalidate(so->currPos);
- return false;
- }
- /* check for interrupts while we're not holding any buffer lock */
- CHECK_FOR_INTERRUPTS();
- /* step right one page */
- so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(so->currPos.buf);
- opaque = BTPageGetOpaque(page);
- /* check for deleted page */
- if (!P_IGNORE(opaque))
- {
- PredicateLockPage(rel, blkno, scan->xs_snapshot);
- /* see if there are any matches on this page */
- /* note that this will clear moreRight if we can stop */
- if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
- break;
- }
- else if (scan->parallel_scan != NULL)
- {
- /* allow next page be processed by parallel worker */
- _bt_parallel_release(scan, opaque->btpo_next);
- }
-
- /* nope, keep going */
- if (scan->parallel_scan != NULL)
- {
- _bt_relbuf(rel, so->currPos.buf);
- status = _bt_parallel_seize(scan, &blkno, false);
- if (!status)
- {
- BTScanPosInvalidate(so->currPos);
- return false;
- }
- }
- else
- {
- blkno = opaque->btpo_next;
- _bt_relbuf(rel, so->currPos.buf);
- }
- }
- }
+ so->currPos.moreLeft = true;
else
+ so->currPos.moreRight = true;
+
+ for (;;)
{
/*
- * Should only happen in parallel cases, when some other backend
- * advanced the scan.
+ * if we're at end of scan, give up and mark parallel scan as done, so
+ * that all the workers can finish their scan
*/
- if (so->currPos.currPage != blkno)
- {
- BTScanPosUnpinIfPinned(so->currPos);
- so->currPos.currPage = blkno;
- }
-
- /* Done if we know that the left sibling link isn't of interest */
- if (!so->currPos.moreLeft)
+ if (blkno == P_NONE ||
+ (ScanDirectionIsForward(dir) ?
+ !so->currPos.moreRight : !so->currPos.moreLeft))
{
- BTScanPosUnpinIfPinned(so->currPos);
_bt_parallel_done(scan);
BTScanPosInvalidate(so->currPos);
return false;
}
- /*
- * Walk left to the next page with data. This is much more complex
- * than the walk-right case because of the possibility that the page
- * to our left splits while we are in flight to it, plus the
- * possibility that the page we were on gets deleted after we leave
- * it. See nbtree/README for details.
- *
- * It might be possible to rearrange this code to have less overhead
- * in pinning and locking, but that would require capturing the left
- * sibling block number when the page is initially read, and then
- * optimistically starting there (rather than pinning the page twice).
- * It is not clear that this would be worth the complexity.
- */
- if (BTScanPosIsPinned(so->currPos))
- _bt_lockbuf(rel, so->currPos.buf, BT_READ);
+ if (ScanDirectionIsForward(dir))
+ {
+ /* read blkno, but check for interrupts first */
+ CHECK_FOR_INTERRUPTS();
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ }
else
- so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
-
- for (;;)
{
- /* Done if we know that the left sibling link isn't of interest */
- if (!so->currPos.moreLeft)
+ /* read blkno, avoiding race (also checks for interrupts) */
+ so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
+ lastcurrblkno);
+ if (so->currPos.buf == InvalidBuffer)
{
- _bt_relbuf(rel, so->currPos.buf);
+ /* must have been a concurrent deletion of leftmost page */
_bt_parallel_done(scan);
BTScanPosInvalidate(so->currPos);
return false;
}
+ }
- /* Step to next physical page */
- so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
-
- /* if we're physically at end of index, return failure */
- if (so->currPos.buf == InvalidBuffer)
+ page = BufferGetPage(so->currPos.buf);
+ opaque = BTPageGetOpaque(page);
+ lastcurrblkno = blkno;
+ if (likely(!P_IGNORE(opaque)))
+ {
+ /* see if there are any matches on this page */
+ if (ScanDirectionIsForward(dir))
{
- _bt_parallel_done(scan);
- BTScanPosInvalidate(so->currPos);
- return false;
+ /* note that this will clear moreRight if we can stop */
+ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
+ break;
+ blkno = so->currPos.nextPage;
}
-
- /*
- * Okay, we managed to move left to a non-deleted page. Done if
- * it's not half-dead and contains matching tuples. Else loop back
- * and do it all again.
- */
- page = BufferGetPage(so->currPos.buf);
- opaque = BTPageGetOpaque(page);
- if (!P_IGNORE(opaque))
+ else
{
- PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
- /* see if there are any matches on this page */
/* note that this will clear moreLeft if we can stop */
if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
break;
+ blkno = so->currPos.prevPage;
}
- else if (scan->parallel_scan != NULL)
- {
- /* allow next page be processed by parallel worker */
- _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
- }
-
- /*
- * For parallel scans, get the last page scanned as it is quite
- * possible that by the time we try to seize the scan, some other
- * worker has already advanced the scan to a different page. We
- * must continue based on the latest page scanned by any worker.
- */
+ }
+ else
+ {
+ /* _bt_readpage not called, so do all this for ourselves */
+ if (ScanDirectionIsForward(dir))
+ blkno = opaque->btpo_next;
+ else
+ blkno = opaque->btpo_prev;
if (scan->parallel_scan != NULL)
- {
- _bt_relbuf(rel, so->currPos.buf);
- status = _bt_parallel_seize(scan, &blkno, false);
- if (!status)
- {
- BTScanPosInvalidate(so->currPos);
- return false;
- }
- so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
- }
+ _bt_parallel_release(scan, blkno, lastcurrblkno);
}
- }
- return true;
-}
+ /* no matching tuples on this page */
+ _bt_relbuf(rel, so->currPos.buf);
-/*
- * _bt_parallel_readpage() -- Read current page containing valid data for scan
- *
- * On success, release lock and maybe pin on buffer. We return true to
- * indicate success.
- */
-static bool
-_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
-{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
-
- Assert(!so->needPrimScan);
-
- _bt_initialize_more_data(so, dir);
-
- if (!_bt_readnextpage(scan, blkno, dir))
- return false;
+ /* parallel scan seizes another page (won't use so->currPos blkno) */
+ if (scan->parallel_scan != NULL &&
+ !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
- /* We have at least one item to return as scan's next item */
+ /*
+ * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
+ * so->currPos.buf in preparation for btgettuple returning tuples.
+ */
+ Assert(so->currPos.currPage == blkno);
+ Assert(BTScanPosIsPinned(so->currPos));
_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
return true;
}
/*
- * _bt_walk_left() -- step left one page, if possible
+ * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
+ * recovering from concurrent page splits/page deletions when necessary
+ *
+ * Called during backwards scans, to deal with their unique concurrency rules.
*
- * The given buffer must be pinned and read-locked. This will be dropped
- * before stepping left. On return, we have pin and read lock on the
- * returned page, instead.
+ * blkno points to the block number of the page that we expect to move the
+ * scan to. We'll successfully move the scan there when we find that its
+ * right sibling link still points to lastcurrblkno (the page we just read).
+ * Otherwise, we have to figure out which page is the correct one for the scan
+ * to now read the hard way, reasoning about concurrent splits and deletions.
+ * See nbtree/README.
*
- * Returns InvalidBuffer if there is no page to the left (no lock is held
- * in that case).
+ * On return, we have both a pin and a read lock on the returned page, whose
+ * block number will be set in *blkno. Returns InvalidBuffer if there is no
+ * page to the left (no lock or pin is held in that case).
*
* It is possible for the returned leaf page to be half-dead; caller must
* check that condition and step left again when required.
*/
static Buffer
-_bt_walk_left(Relation rel, Buffer buf)
+_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
+ BlockNumber lastcurrblkno)
{
- Page page;
- BTPageOpaque opaque;
-
- page = BufferGetPage(buf);
- opaque = BTPageGetOpaque(page);
+ BlockNumber origblkno = *blkno; /* detects circular links */
for (;;)
{
- BlockNumber obknum;
- BlockNumber lblkno;
- BlockNumber blkno;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
int tries;
- /* if we're at end of tree, release buf and return failure */
- if (P_LEFTMOST(opaque))
- {
- _bt_relbuf(rel, buf);
- break;
- }
- /* remember original page we are stepping left from */
- obknum = BufferGetBlockNumber(buf);
- /* step left */
- blkno = lblkno = opaque->btpo_prev;
- _bt_relbuf(rel, buf);
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
- buf = _bt_getbuf(rel, blkno, BT_READ);
+ buf = _bt_getbuf(rel, *blkno, BT_READ);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
@@ -2443,7 +2382,7 @@ _bt_walk_left(Relation rel, Buffer buf)
* If this isn't the page we want, walk right till we find what we
* want --- but go no more than four hops (an arbitrary limit). If we
* don't find the correct page by then, the most likely bet is that
- * the original page got deleted and isn't in the sibling chain at all
+ * lastcurrblkno got deleted and isn't in the sibling chain at all
* anymore, not that its left sibling got split more than four times.
*
* Note that it is correct to test P_ISDELETED not P_IGNORE here,
@@ -2452,21 +2391,27 @@ _bt_walk_left(Relation rel, Buffer buf)
tries = 0;
for (;;)
{
- if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+ if (likely(!P_ISDELETED(opaque) &&
+ opaque->btpo_next == lastcurrblkno))
{
/* Found desired page, return it */
return buf;
}
if (P_RIGHTMOST(opaque) || ++tries > 4)
break;
- blkno = opaque->btpo_next;
- buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ /* step right */
+ *blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
}
- /* Return to the original page to see what's up */
- buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+ /*
+ * Return to the original page (usually the page most recently read by
+ * _bt_readpage, which is passed by caller as lastcurrblkno) to see
+ * what's up with its prev sibling link
+ */
+ buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
if (P_ISDELETED(opaque))
@@ -2482,31 +2427,42 @@ _bt_walk_left(Relation rel, Buffer buf)
if (P_RIGHTMOST(opaque))
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
- blkno = opaque->btpo_next;
- buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ lastcurrblkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
if (!P_ISDELETED(opaque))
break;
}
-
- /*
- * Now return to top of loop, resetting obknum to point to this
- * nondeleted page, and try again.
- */
}
else
{
/*
- * It wasn't deleted; the explanation had better be that the page
- * to the left got split or deleted. Without this check, we'd go
- * into an infinite loop if there's anything wrong.
+ * Original lastcurrblkno wasn't deleted; the explanation had
+ * better be that the page to the left got split or deleted.
+ * Without this check, we risk going into an infinite loop.
*/
- if (opaque->btpo_prev == lblkno)
+ if (opaque->btpo_prev == origblkno)
elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
- obknum, RelationGetRelationName(rel));
- /* Okay to try again with new lblkno value */
+ lastcurrblkno, RelationGetRelationName(rel));
+ /* Okay to try again, since left sibling link changed */
}
+
+ /*
+ * Original lastcurrblkno from caller was concurrently deleted (could
+ * also have been a great many concurrent left sibling page splits).
+ * Found a non-deleted page that should now act as our lastcurrblkno.
+ */
+ if (P_LEFTMOST(opaque))
+ {
+ /* New lastcurrblkno has no left sibling (concurrently deleted) */
+ _bt_relbuf(rel, buf);
+ break;
+ }
+
+ /* Start from scratch with new lastcurrblkno's blkno/prev link */
+ *blkno = origblkno = opaque->btpo_prev;
+ _bt_relbuf(rel, buf);
}
return InvalidBuffer;
@@ -2606,7 +2562,6 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- Buffer buf;
Page page;
BTPageOpaque opaque;
OffsetNumber start;
@@ -2617,9 +2572,9 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
* version of _bt_search(). We don't maintain a stack since we know we
* won't need it.
*/
- buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
+ so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
- if (!BufferIsValid(buf))
+ if (!BufferIsValid(so->currPos.buf))
{
/*
* Empty index. Lock the whole relation, as nothing finer to lock
@@ -2630,8 +2585,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
return false;
}
- PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
- page = BufferGetPage(buf);
+ page = BufferGetPage(so->currPos.buf);
opaque = BTPageGetOpaque(page);
Assert(P_ISLEAF(opaque));
@@ -2654,31 +2608,14 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
start = 0; /* keep compiler quiet */
}
- /* remember which buffer we have pinned */
- so->currPos.buf = buf;
-
- _bt_initialize_more_data(so, dir);
-
/*
* Now load data from the first page of the scan.
*/
- if (!_bt_readpage(scan, dir, start, true))
- {
- /*
- * There's no actually-matching data on this page. Try to advance to
- * the next page. Return false if there's no matching data at all.
- */
- _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
- if (!_bt_steppage(scan, dir))
- return false;
- }
- else
- {
- /* We have at least one item to return as scan's first item */
- _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
- }
+ if (!_bt_readfirstpage(scan, start, dir))
+ return false;
/* OK, itemIndex says what to return */
+ Assert(BTScanPosIsValid(so->currPos));
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_heaptid = currItem->heapTid;
if (scan->xs_want_itup)
@@ -2686,33 +2623,3 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
return true;
}
-
-/*
- * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
- * from currPos
- */
-static inline void
-_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
-{
- so->currPos.dir = dir;
- if (so->needPrimScan)
- {
- Assert(so->numArrayKeys);
-
- so->currPos.moreLeft = true;
- so->currPos.moreRight = true;
- so->needPrimScan = false;
- }
- else if (ScanDirectionIsForward(dir))
- {
- so->currPos.moreLeft = false;
- so->currPos.moreRight = true;
- }
- else
- {
- so->currPos.moreLeft = true;
- so->currPos.moreRight = false;
- }
- so->numKilled = 0; /* just paranoia */
- so->markItemIndex = -1; /* ditto */
-}
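With the _bt_initialize_more_data helper removed (see the hunk above), _bt_readfirstpage now seeds so->currPos.moreLeft/moreRight itself. A standalone restatement of that seeding logic, using stand-in types rather than the real BTScanOpaque/BTScanPosData declarations (every name here is hypothetical):

#include <stdbool.h>

typedef enum { DIR_BACKWARD = -1, DIR_FORWARD = 1 } DirSketch;

typedef struct ScanPosFlagsSketch
{
	bool	moreLeft;		/* matches may remain left of currPage */
	bool	moreRight;		/* matches may remain right of currPage */
} ScanPosFlagsSketch;

/*
 * A primitive scan scheduled by array keys may have matches on both sides of
 * its first page; otherwise only the side being scanned towards can hold
 * further matches.  _bt_checkkeys later clears the flag for a direction once
 * it proves no more matches exist that way, ending _bt_readnextpage's loop.
 */
static void
init_more_flags_sketch(ScanPosFlagsSketch *pos, DirSketch dir, bool needPrimScan)
{
	if (needPrimScan)
		pos->moreLeft = pos->moreRight = true;
	else if (dir == DIR_FORWARD)
	{
		pos->moreLeft = false;
		pos->moreRight = true;
	}
	else
	{
		pos->moreLeft = true;
		pos->moreRight = false;
	}
}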
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 61ded00ca24..76be65123c8 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2407,7 +2407,7 @@ new_prim_scan:
so->needPrimScan = true; /* ...but call _bt_first again */
if (scan->parallel_scan)
- _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page);
+ _bt_parallel_primscan_schedule(scan, so->currPos.currPage);
/* Caller's tuple doesn't match the new qual */
return false;
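The call above now passes so->currPos.currPage instead of pstate.prev_scan_page: scheduling another primitive index scan may only succeed if the shared state still records that page as btps_lastCurrPage, i.e. if no backend has seized and advanced the scan since this backend released it. A simplified restatement of that guard (stand-in types, hypothetical names; the real _bt_parallel_primscan_schedule takes the scan descriptor, returns void, serializes the array keys, and holds the spinlock while testing):

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t BlockNumber;
#define INVALID_BLOCK_SKETCH ((BlockNumber) 0xFFFFFFFF)

typedef enum { PS_NEED_PRIMSCAN, PS_ADVANCING, PS_IDLE, PS_DONE } PrimScanStateSketch;

typedef struct ParallelScanSketch
{
	PrimScanStateSketch state;
	BlockNumber		next_scan_page; /* like btps_nextScanPage */
	BlockNumber		last_curr_page; /* like btps_lastCurrPage */
} ParallelScanSketch;

/*
 * Schedule a new primitive scan only if the shared state still describes the
 * page this backend released last; otherwise another backend already moved
 * the scan forward and the request is dropped.
 */
static bool
schedule_primscan_sketch(ParallelScanSketch *s, BlockNumber curr_page)
{
	if (s->last_curr_page == curr_page && s->state == PS_IDLE)
	{
		s->next_scan_page = INVALID_BLOCK_SKETCH;
		s->last_curr_page = INVALID_BLOCK_SKETCH;
		s->state = PS_NEED_PRIMSCAN;
		return true;
	}
	return false;
}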
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 90a68375a2e..5fb523ecec3 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -925,13 +925,13 @@ typedef BTVacuumPostingData *BTVacuumPosting;
* Index scans work a page at a time: we pin and read-lock the page, identify
* all the matching items on the page and save them in BTScanPosData, then
* release the read-lock while returning the items to the caller for
- * processing. This approach minimizes lock/unlock traffic. Note that we
- * keep the pin on the index page until the caller is done with all the items
- * (this is needed for VACUUM synchronization, see nbtree/README). When we
- * are ready to step to the next page, if the caller has told us any of the
- * items were killed, we re-lock the page to mark them killed, then unlock.
- * Finally we drop the pin and step to the next page in the appropriate
- * direction.
+ * processing. This approach minimizes lock/unlock traffic. We must always
+ * drop the lock to make it okay for caller to process the returned items.
+ * Whether or not we can also release the pin during this window will vary.
+ * We drop the pin eagerly (when safe) to avoid blocking progress by VACUUM
+ * (see nbtree/README section about making concurrent TID recycling safe).
+ * We'll always release both the lock and the pin on the current page before
+ * moving on to its sibling page.
*
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
@@ -950,28 +950,15 @@ typedef struct BTScanPosItem /* what we remember about each match */
typedef struct BTScanPosData
{
- Buffer buf; /* if valid, the buffer is pinned */
+ Buffer buf; /* currPage buf (invalid means unpinned) */
- XLogRecPtr lsn; /* pos in the WAL stream when page was read */
+ /* page details as of the saved position's call to _bt_readpage */
BlockNumber currPage; /* page referenced by items array */
- BlockNumber nextPage; /* page's right link when we scanned it */
+ BlockNumber prevPage; /* currPage's left link */
+ BlockNumber nextPage; /* currPage's right link */
+ XLogRecPtr lsn; /* currPage's LSN */
- /*
- * moreLeft and moreRight track whether we think there may be matching
- * index entries to the left and right of the current page, respectively.
- * We can clear the appropriate one of these flags when _bt_checkkeys()
- * sets BTReadPageState.continuescan = false.
- */
- bool moreLeft;
- bool moreRight;
-
- /*
- * Direction of the scan at the time that _bt_readpage was called.
- *
- * Used by btrestrpos to "restore" the scan's array keys by resetting each
- * array to its first element's value (first in this scan direction). This
- * avoids the need to directly track the array keys in btmarkpos.
- */
+ /* scan direction for the saved position's call to _bt_readpage */
ScanDirection dir;
/*
@@ -981,6 +968,13 @@ typedef struct BTScanPosData
int nextTupleOffset;
/*
+ * moreLeft and moreRight track whether we think there may be matching
+ * index entries to the left and right of the current page, respectively.
+ */
+ bool moreLeft;
+ bool moreRight;
+
+ /*
* The items array is always ordered in index order (ie, increasing
* indexoffset). When scanning backwards it is convenient to fill the
* array back-to-front, so we start at the last slot and fill downwards.
@@ -1021,11 +1015,8 @@ typedef BTScanPosData *BTScanPos;
)
#define BTScanPosInvalidate(scanpos) \
do { \
- (scanpos).currPage = InvalidBlockNumber; \
- (scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
- (scanpos).lsn = InvalidXLogRecPtr; \
- (scanpos).nextTupleOffset = 0; \
+ (scanpos).currPage = InvalidBlockNumber; \
} while (0)
/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
@@ -1091,7 +1082,6 @@ typedef struct BTReadPageState
OffsetNumber minoff; /* Lowest non-pivot tuple's offset */
OffsetNumber maxoff; /* Highest non-pivot tuple's offset */
IndexTuple finaltup; /* Needed by scans with array keys */
- BlockNumber prev_scan_page; /* previous _bt_parallel_release block */
Page page; /* Page being read */
/* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
@@ -1192,12 +1182,14 @@ extern int btgettreeheight(Relation rel);
/*
* prototypes for internal functions in nbtree.c
*/
-extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
- bool first);
-extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
+extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
+ BlockNumber *last_curr_page, bool first);
+extern void _bt_parallel_release(IndexScanDesc scan,
+ BlockNumber next_scan_page,
+ BlockNumber curr_page);
extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
- BlockNumber prev_scan_page);
+ BlockNumber curr_page);
/*
* prototypes for functions in nbtdedup.c
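The nbtree.h hunks above also trim BTScanPosInvalidate down to the two fields that the validity and pin tests actually depend on: currPage says whether the saved position is usable at all, while buf says whether a pin is still held (the pin may be dropped eagerly, per the revised lock/pin comment earlier in this file). A simplified restatement of that invariant with stand-in types, not the real Buffer/BlockNumber typedefs or the nbtree.h macros:

#include <stdbool.h>

typedef int Buffer;					/* stand-in; invalid when zero */
typedef unsigned int BlockNumber;	/* stand-in */
#define INVALID_BUFFER_SKETCH	((Buffer) 0)
#define INVALID_BLOCK_SKETCH	((BlockNumber) 0xFFFFFFFF)

typedef struct ScanPosSketch
{
	Buffer		buf;		/* valid only while a pin is held on currPage */
	BlockNumber currPage;	/* page whose matches are cached in the position */
} ScanPosSketch;

/* "valid" is about currPage, "pinned" is about buf: a position can remain
 * valid (and its cached items returnable) after the pin has been dropped. */
static inline bool
pos_is_pinned_sketch(const ScanPosSketch *pos)
{
	return pos->buf != INVALID_BUFFER_SKETCH;
}

static inline bool
pos_is_valid_sketch(const ScanPosSketch *pos)
{
	return pos->currPage != INVALID_BLOCK_SKETCH;
}

/* Mirrors the trimmed BTScanPosInvalidate: only these two fields need reset. */
static inline void
pos_invalidate_sketch(ScanPosSketch *pos)
{
	pos->buf = INVALID_BUFFER_SKETCH;
	pos->currPage = INVALID_BLOCK_SKETCH;
}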