summaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtsearch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtsearch.c')
-rw-r--r--src/backend/access/nbtree/nbtsearch.c249
1 files changed, 188 insertions, 61 deletions
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index e3fff90d8e4..d241e8ea1dc 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -907,7 +907,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
*/
if (!so->qual_ok)
{
- /* Notify any other workers that we're done with this scan key. */
_bt_parallel_done(scan);
return false;
}
@@ -917,10 +916,22 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* scan has not started, proceed to find out first leaf page in the usual
* way while keeping other participating processes waiting. If the scan
* has already begun, use the page number from the shared structure.
+ *
+ * When a parallel scan has another primitive index scan scheduled, a
+ * parallel worker will seize the scan for that purpose now. This is
+ * similar to the case where the top-level scan hasn't started.
*/
if (scan->parallel_scan != NULL)
{
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, true);
+
+ /*
+ * Initialize arrays (when _bt_parallel_seize didn't already set up
+ * the next primitive index scan)
+ */
+ if (so->numArrayKeys && !so->needPrimScan)
+ _bt_start_array_keys(scan, dir);
+
if (!status)
return false;
else if (blkno == P_NONE)
@@ -935,6 +946,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
goto readcomplete;
}
}
+ else if (so->numArrayKeys && !so->needPrimScan)
+ {
+ /*
+ * First _bt_first call (for current btrescan) without parallelism.
+ *
+ * Initialize arrays, and the corresponding scan keys that were just
+ * output by _bt_preprocess_keys.
+ */
+ _bt_start_array_keys(scan, dir);
+ }
/*----------
* Examine the scan keys to discover where we need to start the scan.
@@ -980,6 +1001,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
*
* The selected scan keys (at most one per index column) are remembered by
* storing their addresses into the local startKeys[] array.
+ *
+ * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
+ * the next primitive index scan (for scans with array keys) based in part
+ * on an understanding of how it'll enable us to reposition the scan.
+ * They're directly aware of how we'll sometimes cons up an explicit
+ * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
+ * symmetric "deduce NOT NULL" rule of their own. This allows top-level
+ * scans to skip large groups of NULLs through repeated deductions about
+ * key strictness (for a required inequality key) and whether NULLs in the
+ * key's index column are stored last or first (relative to non-NULLs).
+ * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
+ * need to be kept in sync.
*----------
*/
strat_total = BTEqualStrategyNumber;
@@ -1502,7 +1535,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
* We scan the current page starting at offnum and moving in the indicated
* direction. All items matching the scan keys are loaded into currPos.items.
* moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
- * that there can be no more matching tuples in the current scan direction.
+ * that there can be no more matching tuples in the current scan direction
+ * (could just be for the current primitive index scan when scan has arrays).
*
* _bt_first caller passes us an offnum returned by _bt_binsrch, which might
* be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
@@ -1527,11 +1561,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
- int itemIndex;
- bool continuescan;
- int indnatts;
- bool continuescanPrechecked;
- bool haveFirstMatch = false;
+ BTReadPageState pstate;
+ bool arrayKeys;
+ int itemIndex,
+ indnatts;
/*
* We must have the buffer pinned and locked, but the usual macro can't be
@@ -1546,16 +1579,32 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
if (scan->parallel_scan)
{
if (ScanDirectionIsForward(dir))
- _bt_parallel_release(scan, opaque->btpo_next);
+ pstate.prev_scan_page = opaque->btpo_next;
else
- _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
+
+ _bt_parallel_release(scan, pstate.prev_scan_page);
}
- continuescan = true; /* default assumption */
indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+ arrayKeys = so->numArrayKeys != 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
+ /* initialize page-level state that we'll pass to _bt_checkkeys */
+ pstate.dir = dir;
+ pstate.minoff = minoff;
+ pstate.maxoff = maxoff;
+ pstate.finaltup = NULL;
+ pstate.page = page;
+ pstate.offnum = InvalidOffsetNumber;
+ pstate.skip = InvalidOffsetNumber;
+ pstate.continuescan = true; /* default assumption */
+ pstate.prechecked = false;
+ pstate.firstmatch = false;
+ pstate.rechecks = 0;
+ pstate.targetdistance = 0;
+
/*
* We note the buffer's block number so that we can release the pin later.
* This allows us to re-read the buffer if it is needed again for hinting.
@@ -1598,10 +1647,34 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* corresponding value from the last item on the page. So checking with
* the last item on the page would give a more precise answer.
*
- * We skip this for the first page in the scan to evade the possible
- * slowdown of the point queries.
+ * We skip this for the first page read by each (primitive) scan, to avoid
+ * slowing down point queries. They typically don't stand to gain much
+ * when the optimization can be applied, and are more likely to notice the
+ * overhead of the precheck.
+ *
+ * The optimization is unsafe and must be avoided whenever _bt_checkkeys
+ * just set a low-order required array's key to the best available match
+ * for a truncated -inf attribute value from the prior page's high key
+ * (array element 0 is always the best available match in this scenario).
+ * It's quite likely that matches for array element 0 begin on this page,
+ * but the start of matches won't necessarily align with page boundaries.
+ * When the start of matches is somewhere in the middle of this page, it
+ * would be wrong to treat page's final non-pivot tuple as representative.
+ * Doing so might lead us to treat some of the page's earlier tuples as
+ * being part of a group of tuples thought to satisfy the required keys.
+ *
+ * Note: Conversely, in the case where the scan's arrays just advanced
+ * using the prior page's HIKEY _without_ advancement setting scanBehind,
+ * the start of matches must be aligned with page boundaries, which makes
+ * it safe to attempt the optimization here now. It's also safe when the
+ * prior page's HIKEY simply didn't need to advance any required array. In
+ * both cases we can safely assume that the _first_ tuple from this page
+ * must be >= the current set of array keys/equality constraints. And so
+ * if the final tuple is == those same keys (and also satisfies any
+ * required < or <= strategy scan keys) during the precheck, we can safely
+ * assume that this must also be true of all earlier tuples from the page.
*/
- if (!firstPage && minoff < maxoff)
+ if (!firstPage && !so->scanBehind && minoff < maxoff)
{
ItemId iid;
IndexTuple itup;
@@ -1609,22 +1682,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
itup = (IndexTuple) PageGetItem(page, iid);
- /*
- * Do the precheck. Note that we pass the pointer to the
- * 'continuescanPrechecked' to the 'continuescan' argument. That will
- * set flag to true if all required keys are satisfied and false
- * otherwise.
- */
- (void) _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescanPrechecked, false, false);
- }
- else
- {
- continuescanPrechecked = false;
+ /* Call with arrayKeys=false to avoid undesirable side-effects */
+ _bt_checkkeys(scan, &pstate, false, itup, indnatts);
+ pstate.prechecked = pstate.continuescan;
+ pstate.continuescan = true; /* reset */
}
if (ScanDirectionIsForward(dir))
{
+ /* SK_SEARCHARRAY forward scans must provide high key up front */
+ if (arrayKeys && !P_RIGHTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, P_HIKEY);
+
+ pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+ }
+
/* load items[] in ascending order */
itemIndex = 0;
@@ -1649,23 +1722,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
itup = (IndexTuple) PageGetItem(page, iid);
Assert(!BTreeTupleIsPivot(itup));
- passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan,
- continuescanPrechecked,
- haveFirstMatch);
+ pstate.offnum = offnum;
+ passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+ itup, indnatts);
/*
- * If the result of prechecking required keys was true, then in
- * assert-enabled builds we also recheck that the _bt_checkkeys()
- * result is the same.
+ * Check if we need to skip ahead to a later tuple (only possible
+ * when the scan uses array keys)
*/
- Assert((!continuescanPrechecked && haveFirstMatch) ||
- passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan, false, false));
+ if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+ {
+ Assert(!passes_quals && pstate.continuescan);
+ Assert(offnum < pstate.skip);
+
+ offnum = pstate.skip;
+ pstate.skip = InvalidOffsetNumber;
+ continue;
+ }
+
if (passes_quals)
{
/* tuple passes all scan key conditions */
- haveFirstMatch = true;
+ pstate.firstmatch = true;
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
@@ -1696,7 +1774,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
}
}
/* When !continuescan, there can't be any more matches, so stop */
- if (!continuescan)
+ if (!pstate.continuescan)
break;
offnum = OffsetNumberNext(offnum);
@@ -1713,17 +1791,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* only appear on non-pivot tuples on the right sibling page are
* common.
*/
- if (continuescan && !P_RIGHTMOST(opaque))
+ if (pstate.continuescan && !P_RIGHTMOST(opaque))
{
ItemId iid = PageGetItemId(page, P_HIKEY);
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
int truncatt;
truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
- _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false);
+ pstate.prechecked = false; /* precheck didn't cover HIKEY */
+ _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
}
- if (!continuescan)
+ if (!pstate.continuescan)
so->currPos.moreRight = false;
Assert(itemIndex <= MaxTIDsPerBTreePage);
@@ -1733,6 +1812,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
}
else
{
+ /* SK_SEARCHARRAY backward scans must provide final tuple up front */
+ if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, minoff);
+
+ pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+ }
+
/* load items[] in descending order */
itemIndex = MaxTIDsPerBTreePage;
@@ -1772,23 +1859,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
itup = (IndexTuple) PageGetItem(page, iid);
Assert(!BTreeTupleIsPivot(itup));
- passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan,
- continuescanPrechecked,
- haveFirstMatch);
+ pstate.offnum = offnum;
+ passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+ itup, indnatts);
/*
- * If the result of prechecking required keys was true, then in
- * assert-enabled builds we also recheck that the _bt_checkkeys()
- * result is the same.
+ * Check if we need to skip ahead to a later tuple (only possible
+ * when the scan uses array keys)
*/
- Assert((!continuescanPrechecked && !haveFirstMatch) ||
- passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan, false, false));
+ if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+ {
+ Assert(!passes_quals && pstate.continuescan);
+ Assert(offnum > pstate.skip);
+
+ offnum = pstate.skip;
+ pstate.skip = InvalidOffsetNumber;
+ continue;
+ }
+
if (passes_quals && tuple_alive)
{
/* tuple passes all scan key conditions */
- haveFirstMatch = true;
+ pstate.firstmatch = true;
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
@@ -1824,7 +1916,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
}
}
}
- if (!continuescan)
+ if (!pstate.continuescan)
{
/* there can't be any more matches, so stop */
so->currPos.moreLeft = false;
@@ -1970,6 +2062,31 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
so->currPos.nextTupleOffset);
so->markPos.itemIndex = so->markItemIndex;
so->markItemIndex = -1;
+
+ /*
+ * If we're just about to start the next primitive index scan
+ * (possible with a scan that has arrays keys, and needs to skip to
+ * continue in the current scan direction), moreLeft/moreRight only
+ * indicate the end of the current primitive index scan. They must
+ * never be taken to indicate that the top-level index scan has ended
+ * (that would be wrong).
+ *
+ * We could handle this case by treating the current array keys as
+ * markPos state. But depending on the current array state like this
+ * would add complexity. Instead, we just unset markPos's copy of
+ * moreRight or moreLeft (whichever might be affected), while making
+ * btrestpos reset the scan's arrays to their initial scan positions.
+ * In effect, btrestpos leaves advancing the arrays up to the first
+ * _bt_readpage call (that takes place after it has restored markPos).
+ */
+ Assert(so->markPos.dir == dir);
+ if (so->needPrimScan)
+ {
+ if (ScanDirectionIsForward(dir))
+ so->markPos.moreRight = true;
+ else
+ so->markPos.moreLeft = true;
+ }
}
if (ScanDirectionIsForward(dir))
@@ -1981,7 +2098,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
* Seize the scan to get the next block number; if the scan has
* ended already, bail out.
*/
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
/* release the previous buffer, if pinned */
@@ -2013,7 +2130,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
* Seize the scan to get the current block number; if the scan has
* ended already, bail out.
*/
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
BTScanPosUnpinIfPinned(so->currPos);
if (!status)
{
@@ -2097,7 +2214,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
if (scan->parallel_scan != NULL)
{
_bt_relbuf(rel, so->currPos.buf);
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
BTScanPosInvalidate(so->currPos);
@@ -2193,7 +2310,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
if (scan->parallel_scan != NULL)
{
_bt_relbuf(rel, so->currPos.buf);
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
BTScanPosInvalidate(so->currPos);
@@ -2218,6 +2335,8 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Assert(!so->needPrimScan);
+
_bt_initialize_more_data(so, dir);
if (!_bt_readnextpage(scan, blkno, dir))
@@ -2524,14 +2643,22 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
}
/*
- * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
- * for scan direction
+ * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
+ * from currPos
*/
static inline void
_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
{
- /* initialize moreLeft/moreRight appropriately for scan direction */
- if (ScanDirectionIsForward(dir))
+ so->currPos.dir = dir;
+ if (so->needPrimScan)
+ {
+ Assert(so->numArrayKeys);
+
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = true;
+ so->needPrimScan = false;
+ }
+ else if (ScanDirectionIsForward(dir))
{
so->currPos.moreLeft = false;
so->currPos.moreRight = true;