Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--   src/backend/access/nbtree/nbtinsert.c   395
1 file changed, 335 insertions, 60 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index d7566ba082f..e3336039125 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -17,9 +17,9 @@
 #include "access/nbtree.h"
 #include "access/nbtxlog.h"
-#include "access/tableam.h"
 #include "access/transam.h"
 #include "access/xloginsert.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
@@ -37,6 +37,7 @@ static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
 static OffsetNumber _bt_findinsertloc(Relation rel,
                                       BTInsertState insertstate,
                                       bool checkingunique,
+                                      bool indexUnchanged,
                                       BTStack stack,
                                       Relation heapRel);
 static void _bt_stepright(Relation rel, BTInsertState insertstate,
                           BTStack stack);
@@ -60,8 +61,16 @@ static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
                                 OffsetNumber itup_off, bool newfirstdataitem);
 static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
                                          BTInsertState insertstate,
-                                         bool lpdeadonly, bool checkingunique,
-                                         bool uniquedup);
+                                         bool simpleonly, bool checkingunique,
+                                         bool uniquedup, bool indexUnchanged);
+static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+                               OffsetNumber *deletable, int ndeletable,
+                               IndexTuple newitem, OffsetNumber minoff,
+                               OffsetNumber maxoff);
+static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
+                                   int ndeletable, IndexTuple newitem,
+                                   int *nblocks);
+static inline int _bt_blk_cmp(const void *arg1, const void *arg2);

 /*
  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
@@ -75,6 +84,11 @@ static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
  *      For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
  *      don't actually insert.
  *
+ *      indexUnchanged executor hint indicates if itup is from an
+ *      UPDATE that didn't logically change the indexed value, but
+ *      must nevertheless have a new entry to point to a successor
+ *      version.
+ *
  *      The result value is only significant for UNIQUE_CHECK_PARTIAL:
  *      it must be true if the entry is known unique, else false.
  *      (In the current implementation we'll also return true after a
@@ -83,7 +97,8 @@ static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
  */
 bool
 _bt_doinsert(Relation rel, IndexTuple itup,
-             IndexUniqueCheck checkUnique, Relation heapRel)
+             IndexUniqueCheck checkUnique, bool indexUnchanged,
+             Relation heapRel)
 {
     bool        is_unique = false;
     BTInsertStateData insertstate;
@@ -238,7 +253,7 @@ search:
         * checkingunique.
         */
        newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
-                                      stack, heapRel);
+                                      indexUnchanged, stack, heapRel);
        _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
                       itup, insertstate.itemsz, newitemoff,
                       insertstate.postingoff, false);
@@ -480,11 +495,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
             * items as quickly as we can. We only apply _bt_compare() when
             * we get to a non-killed item. We could reuse the bounds to
             * avoid _bt_compare() calls for known equal tuples, but it
-            * doesn't seem worth it. Workloads with heavy update activity
-            * tend to have many deduplication passes, so we'll often avoid
-            * most of those comparisons, too (we call _bt_compare() when the
-            * posting list tuple is initially encountered, though not when
-            * processing later TIDs from the same tuple).
+            * doesn't seem worth it.
             */
            if (!inposting)
                curitemid = PageGetItemId(page, offset);
@@ -777,6 +788,17 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
  *      room for the new tuple, this function moves right, trying to find a
  *      legal page that does.)
  *
+ *      If 'indexUnchanged' is true, this is for an UPDATE that didn't
+ *      logically change the indexed value, but must nevertheless have a new
+ *      entry to point to a successor version. This hint from the executor
+ *      will influence our behavior when the page might have to be split and
+ *      we must consider our options. Bottom-up index deletion can avoid
+ *      pathological version-driven page splits, but we only want to go to the
+ *      trouble of trying it when we already have moderate confidence that
+ *      it's appropriate. The hint should not significantly affect our
+ *      behavior over time unless practically all inserts on to the leaf page
+ *      get the hint.
+ *
  *      On exit, insertstate buffer contains the chosen insertion page, and
  *      the offset within that page is returned. If _bt_findinsertloc needed
  *      to move right, the lock and pin on the original page are released, and
@@ -793,6 +815,7 @@ static OffsetNumber
 _bt_findinsertloc(Relation rel,
                   BTInsertState insertstate,
                   bool checkingunique,
+                  bool indexUnchanged,
                   BTStack stack,
                   Relation heapRel)
 {
@@ -817,7 +840,7 @@ _bt_findinsertloc(Relation rel,
     if (itup_key->heapkeyspace)
     {
         /* Keep track of whether checkingunique duplicate seen */
-        bool        uniquedup = false;
+        bool        uniquedup = indexUnchanged;

         /*
          * If we're inserting into a unique index, we may have to walk right
@@ -874,14 +897,13 @@ _bt_findinsertloc(Relation rel,
         }

         /*
-         * If the target page is full, see if we can obtain enough space using
-         * one or more strategies (e.g. erasing LP_DEAD items, deduplication).
-         * Page splits are expensive, and should only go ahead when truly
-         * necessary.
+         * If the target page cannot fit newitem, try to avoid splitting the
+         * page on insert by performing deletion or deduplication now
          */
         if (PageGetFreeSpace(page) < insertstate->itemsz)
             _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
-                                         checkingunique, uniquedup);
+                                         checkingunique, uniquedup,
+                                         indexUnchanged);
     }
     else
     {
@@ -921,9 +943,9 @@ _bt_findinsertloc(Relation rel,
              */
             if (P_HAS_GARBAGE(opaque))
             {
-                /* Erase LP_DEAD items (won't deduplicate) */
+                /* Perform simple deletion */
                 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
-                                             checkingunique, false);
+                                             false, false, false);

                 if (PageGetFreeSpace(page) >= insertstate->itemsz)
                     break;      /* OK, now we have enough space */
@@ -970,14 +992,11 @@ _bt_findinsertloc(Relation rel,
         /*
          * There is an overlapping posting list tuple with its LP_DEAD bit
          * set. We don't want to unnecessarily unset its LP_DEAD bit while
-         * performing a posting list split, so delete all LP_DEAD items early.
-         * This is the only case where LP_DEAD deletes happen even though
-         * there is space for newitem on the page.
-         *
-         * This can only erase LP_DEAD items (it won't deduplicate).
+         * performing a posting list split, so perform simple index tuple
+         * deletion early.
          */
         _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
-                                     checkingunique, false);
+                                     false, false, false);

         /*
          * Do new binary search. New insert location cannot overlap with any
@@ -2606,21 +2625,19 @@ _bt_pgaddtup(Page page,
 }

 /*
- * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split by attempting
- * a variety of operations.
- *
- * There are two operations performed here: deleting items already marked
- * LP_DEAD, and deduplication. If both operations fail to free enough space
- * for the incoming item then caller will go on to split the page. We always
- * attempt our preferred strategy (which is to delete items whose LP_DEAD bit
- * are set) first. If that doesn't work out we move on to deduplication.
- *
- * Caller's checkingunique and uniquedup arguments help us decide if we should
- * perform deduplication, which is primarily useful with low cardinality data,
- * but can sometimes absorb version churn.
- *
- * Callers that only want us to look for/delete LP_DEAD items can ask for that
- * directly by passing true 'lpdeadonly' argument.
+ * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
+ *
+ * There are three operations performed here: simple index deletion, bottom-up
+ * index deletion, and deduplication. If all three operations fail to free
+ * enough space for the incoming item then caller will go on to split the
+ * page. We always consider simple deletion first. If that doesn't work out
+ * we consider alternatives. Callers that only want us to consider simple
+ * deletion (without any fallback) ask for that using the 'simpleonly'
+ * argument.
+ *
+ * We usually pick only one alternative "complex" operation when simple
+ * deletion alone won't prevent a page split. The 'checkingunique',
+ * 'uniquedup', and 'indexUnchanged' arguments are used for that.
  *
  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
  * level flag was found set. The flag was useful back when there wasn't
@@ -2638,12 +2655,13 @@ _bt_pgaddtup(Page page,
 static void
 _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
                              BTInsertState insertstate,
-                             bool lpdeadonly, bool checkingunique,
-                             bool uniquedup)
+                             bool simpleonly, bool checkingunique,
+                             bool uniquedup, bool indexUnchanged)
 {
     OffsetNumber deletable[MaxIndexTuplesPerPage];
     int         ndeletable = 0;
     OffsetNumber offnum,
+                minoff,
                 maxoff;
     Buffer      buffer = insertstate->buf;
     BTScanInsert itup_key = insertstate->itup_key;
@@ -2651,14 +2669,19 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
     BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

     Assert(P_ISLEAF(opaque));
-    Assert(lpdeadonly || itup_key->heapkeyspace);
+    Assert(simpleonly || itup_key->heapkeyspace);
+    Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));

     /*
      * Scan over all items to see which ones need to be deleted according to
-     * LP_DEAD flags.
+     * LP_DEAD flags. We'll usually manage to delete a few extra items that
+     * are not marked LP_DEAD in passing. Often the extra items that actually
+     * end up getting deleted are items that would have had their LP_DEAD bit
+     * set before long anyway (if we opted not to include them as extras).
      */
+    minoff = P_FIRSTDATAKEY(opaque);
     maxoff = PageGetMaxOffsetNumber(page);
-    for (offnum = P_FIRSTDATAKEY(opaque);
+    for (offnum = minoff;
          offnum <= maxoff;
          offnum = OffsetNumberNext(offnum))
     {
@@ -2670,7 +2693,8 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,

     if (ndeletable > 0)
     {
-        _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
+        _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
+                           insertstate->itup, minoff, maxoff);
         insertstate->bounds_valid = false;

         /* Return when a page split has already been avoided */
@@ -2682,37 +2706,288 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
     }

     /*
-     * Some callers only want to delete LP_DEAD items. Return early for these
-     * callers.
+     * We're done with simple deletion. Return early with callers that only
+     * call here so that simple deletion can be considered. This includes
+     * callers that explicitly ask for this and checkingunique callers that
+     * probably don't have any version churn duplicates on the page.
      *
      * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
      * return at this point (or when we go on the try either or both of our
      * other strategies and they also fail). We do not bother expending a
      * separate write to clear it, however. Caller will definitely clear it
-     * when it goes on to split the page (plus deduplication knows to clear
-     * the flag when it actually modifies the page).
+     * when it goes on to split the page (note also that the deduplication
+     * process will clear the flag in passing, just to keep things tidy).
      */
-    if (lpdeadonly)
-        return;
-
-    /*
-     * We can get called in the checkingunique case when there is no reason to
-     * believe that there are any duplicates on the page; we should at least
-     * still check for LP_DEAD items. If that didn't work out, give up and
-     * let caller split the page. Deduplication cannot be justified given
-     * there is no reason to think that there are duplicates.
-     */
-    if (checkingunique && !uniquedup)
+    if (simpleonly || (checkingunique && !uniquedup))
+    {
+        Assert(!indexUnchanged);
         return;
+    }

     /* Assume bounds about to be invalidated (this is almost certain now) */
     insertstate->bounds_valid = false;

     /*
-     * Perform deduplication pass, though only when it is enabled for the
-     * index and known to be safe (it must be an allequalimage index).
+     * Perform bottom-up index deletion pass when executor hint indicated that
+     * incoming item is logically unchanged, or for a unique index that is
+     * known to have physical duplicates for some other reason. (There is a
+     * large overlap between these two cases for a unique index. It's worth
+     * having both triggering conditions in order to apply the optimization in
+     * the event of successive related INSERT and DELETE statements.)
+     *
+     * We'll go on to do a deduplication pass when a bottom-up pass fails to
+     * delete an acceptable amount of free space (a significant fraction of
+     * the page, or space for the new item, whichever is greater).
+     *
+     * Note: Bottom-up index deletion uses the same equality/equivalence
+     * routines as deduplication internally. However, it does not merge
+     * together index tuples, so the same correctness considerations do not
+     * apply. We deliberately omit an index-is-allequalimage test here.
      */
+    if ((indexUnchanged || uniquedup) &&
+        _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
+        return;
+
+    /* Perform deduplication pass (when enabled and index-is-allequalimage) */
     if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
         _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
                        insertstate->itemsz, checkingunique);
 }
+
+/*
+ * _bt_simpledel_pass - Simple index tuple deletion pass.
+ *
+ * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
+ * of all such tuples are determined by caller (caller passes these to us as
+ * its 'deletable' argument).
+ *
+ * We might also delete extra index tuples that turn out to be safe to delete
+ * in passing (though they must be cheap to check in passing to begin with).
+ * There is no certainty that any extra tuples will be deleted, though. The
+ * high level goal of the approach we take is to get the most out of each call
+ * here (without noticeably increasing the per-call overhead compared to what
+ * we need to do just to be able to delete the page's LP_DEAD-marked index
+ * tuples).
+ *
+ * The number of extra index tuples that turn out to be deletable might
+ * greatly exceed the number of LP_DEAD-marked index tuples due to various
+ * locality related effects. For example, it's possible that the total number
+ * of table blocks (pointed to by all TIDs on the leaf page) is naturally
+ * quite low, in which case we might end up checking if it's possible to
+ * delete _most_ index tuples on the page (without the tableam needing to
+ * access additional table blocks). The tableam will sometimes stumble upon
+ * _many_ extra deletable index tuples in indexes where this pattern is
+ * common.
+ *
+ * See nbtree/README for further details on simple index tuple deletion.
+ */
+static void
+_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+                   OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
+                   OffsetNumber minoff, OffsetNumber maxoff)
+{
+    Page        page = BufferGetPage(buffer);
+    BlockNumber *deadblocks;
+    int         ndeadblocks;
+    TM_IndexDeleteOp delstate;
+    OffsetNumber offnum;
+
+    /* Get array of table blocks pointed to by LP_DEAD-set tuples */
+    deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
+                                &ndeadblocks);
+
+    /* Initialize tableam state that describes index deletion operation */
+    delstate.bottomup = false;
+    delstate.bottomupfreespace = 0;
+    delstate.ndeltids = 0;
+    delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
+    delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
+
+    for (offnum = minoff;
+         offnum <= maxoff;
+         offnum = OffsetNumberNext(offnum))
+    {
+        ItemId      itemid = PageGetItemId(page, offnum);
+        IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
+        TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
+        TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
+        BlockNumber tidblock;
+        void       *match;
+
+        if (!BTreeTupleIsPosting(itup))
+        {
+            tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
+            match = bsearch(&tidblock, deadblocks, ndeadblocks,
+                            sizeof(BlockNumber), _bt_blk_cmp);
+
+            if (!match)
+            {
+                Assert(!ItemIdIsDead(itemid));
+                continue;
+            }
+
+            /*
+             * TID's table block is among those pointed to by the TIDs from
+             * LP_DEAD-bit set tuples on page -- add TID to deltids
+             */
+            odeltid->tid = itup->t_tid;
+            odeltid->id = delstate.ndeltids;
+            ostatus->idxoffnum = offnum;
+            ostatus->knowndeletable = ItemIdIsDead(itemid);
+            ostatus->promising = false; /* unused */
+            ostatus->freespace = 0; /* unused */
+
+            delstate.ndeltids++;
+        }
+        else
+        {
+            int         nitem = BTreeTupleGetNPosting(itup);
+
+            for (int p = 0; p < nitem; p++)
+            {
+                ItemPointer tid = BTreeTupleGetPostingN(itup, p);
+
+                tidblock = ItemPointerGetBlockNumber(tid);
+                match = bsearch(&tidblock, deadblocks, ndeadblocks,
+                                sizeof(BlockNumber), _bt_blk_cmp);
+
+                if (!match)
+                {
+                    Assert(!ItemIdIsDead(itemid));
+                    continue;
+                }
+
+                /*
+                 * TID's table block is among those pointed to by the TIDs
+                 * from LP_DEAD-bit set tuples on page -- add TID to deltids
+                 */
+                odeltid->tid = *tid;
+                odeltid->id = delstate.ndeltids;
+                ostatus->idxoffnum = offnum;
+                ostatus->knowndeletable = ItemIdIsDead(itemid);
+                ostatus->promising = false; /* unused */
+                ostatus->freespace = 0; /* unused */
+
+                odeltid++;
+                ostatus++;
+                delstate.ndeltids++;
+            }
+        }
+    }
+
+    pfree(deadblocks);
+
+    Assert(delstate.ndeltids >= ndeletable);
+
+    /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
+    _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
+
+    pfree(delstate.deltids);
+    pfree(delstate.status);
+}
+
+/*
+ * _bt_deadblocks() -- Get LP_DEAD related table blocks.
+ *
+ * Builds sorted and unique-ified array of table block numbers from index
+ * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
+ * block from incoming newitem just in case it isn't among the LP_DEAD-related
+ * table blocks.
+ *
+ * Always counting the newitem's table block as an LP_DEAD related block makes
+ * sense because the cost is consistently low; it is practically certain that
+ * the table block will not incur a buffer miss in tableam. On the other hand
+ * the benefit is often quite high. There is a decent chance that there will
+ * be some deletable items from this block, since in general most garbage
+ * tuples became garbage in the recent past (in many cases this won't be the
+ * first logical row that core code added to/modified in table block
+ * recently).
+ *
+ * Returns final array, and sets *nblocks to its final size for caller.
+ */
+static BlockNumber *
+_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
+               IndexTuple newitem, int *nblocks)
+{
+    int         spacentids,
+                ntids;
+    BlockNumber *tidblocks;
+
+    /*
+     * Accumulate each TID's block in array whose initial size has space for
+     * one table block per LP_DEAD-set tuple (plus space for the newitem table
+     * block). Array will only need to grow when there are LP_DEAD-marked
+     * posting list tuples (which is not that common).
+     */
+    spacentids = ndeletable + 1;
+    ntids = 0;
+    tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
+
+    /*
+     * First add the table block for the incoming newitem. This is the one
+     * case where simple deletion can visit a table block that doesn't have
+     * any known deletable items.
+     */
+    Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
+    tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
+
+    for (int i = 0; i < ndeletable; i++)
+    {
+        ItemId      itemid = PageGetItemId(page, deletable[i]);
+        IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
+
+        Assert(ItemIdIsDead(itemid));
+
+        if (!BTreeTupleIsPosting(itup))
+        {
+            if (ntids + 1 > spacentids)
+            {
+                spacentids *= 2;
+                tidblocks = (BlockNumber *)
+                    repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+            }
+
+            tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
+        }
+        else
+        {
+            int         nposting = BTreeTupleGetNPosting(itup);
+
+            if (ntids + nposting > spacentids)
+            {
+                spacentids = Max(spacentids * 2, ntids + nposting);
+                tidblocks = (BlockNumber *)
+                    repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+            }
+
+            for (int j = 0; j < nposting; j++)
+            {
+                ItemPointer tid = BTreeTupleGetPostingN(itup, j);
+
+                tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
+            }
+        }
+    }
+
+    qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+    *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+
+    return tidblocks;
+}
+
+/*
+ * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
+ */
+static inline int
+_bt_blk_cmp(const void *arg1, const void *arg2)
+{
+    BlockNumber b1 = *((BlockNumber *) arg1);
+    BlockNumber b2 = *((BlockNumber *) arg2);
+
+    if (b1 < b2)
+        return -1;
+    else if (b1 > b2)
+        return 1;
+
+    return 0;
+}
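
The new simple deletion pass hinges on a small pattern: collect the table block numbers referenced by the page's LP_DEAD index tuples (plus newitem's table block), sort and unique-ify that array once, and then test every TID on the leaf page against it with a binary search. The standalone sketch below illustrates that pattern outside PostgreSQL; it is not part of the patch, and the BlockNumber typedef and the uniquify() helper are local stand-ins for the real definitions (the patch itself relies on block.h and the qunique() routine from lib/qunique.h).

/* Standalone illustration; BlockNumber and uniquify() are local stand-ins. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint32_t BlockNumber;

/* qsort/bsearch comparator, same shape as _bt_blk_cmp in the patch */
static int
blk_cmp(const void *arg1, const void *arg2)
{
    BlockNumber b1 = *((const BlockNumber *) arg1);
    BlockNumber b2 = *((const BlockNumber *) arg2);

    if (b1 < b2)
        return -1;
    if (b1 > b2)
        return 1;
    return 0;
}

/* Collapse adjacent duplicates in a sorted array; returns the new length */
static size_t
uniquify(BlockNumber *array, size_t nelems)
{
    size_t      nunique = 0;

    for (size_t i = 0; i < nelems; i++)
    {
        if (nunique == 0 || array[nunique - 1] != array[i])
            array[nunique++] = array[i];
    }
    return nunique;
}

int
main(void)
{
    /* Table blocks referenced by LP_DEAD items, plus the newitem's block */
    BlockNumber deadblocks[] = {42, 7, 42, 7, 7, 13, 99};
    size_t      nblocks = sizeof(deadblocks) / sizeof(deadblocks[0]);

    /* Sort and unique-ify once ... */
    qsort(deadblocks, nblocks, sizeof(BlockNumber), blk_cmp);
    nblocks = uniquify(deadblocks, nblocks);

    /* ... then each TID on the page can be checked in O(log n) */
    BlockNumber probes[] = {7, 8, 99};

    for (size_t i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
    {
        void       *match = bsearch(&probes[i], deadblocks, nblocks,
                                    sizeof(BlockNumber), blk_cmp);

        printf("block %u: %s\n", (unsigned) probes[i],
               match ? "considered by deletion pass" : "skipped");
    }
    return 0;
}

Sorting once and probing with bsearch() keeps the per-TID cost logarithmic, which matters because _bt_simpledel_pass considers every tuple on the leaf page, not just the LP_DEAD-marked ones, while still never asking the tableam to visit a table block that no known-dead TID (or newitem) points to.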