path: root/src/backend/access/nbtree/nbtinsert.c
Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c  395
1 file changed, 335 insertions(+), 60 deletions(-)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index d7566ba082f..e3336039125 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -17,9 +17,9 @@
#include "access/nbtree.h"
#include "access/nbtxlog.h"
-#include "access/tableam.h"
#include "access/transam.h"
#include "access/xloginsert.h"
+#include "lib/qunique.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
@@ -37,6 +37,7 @@ static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
static OffsetNumber _bt_findinsertloc(Relation rel,
BTInsertState insertstate,
bool checkingunique,
+ bool indexUnchanged,
BTStack stack,
Relation heapRel);
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
@@ -60,8 +61,16 @@ static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
BTInsertState insertstate,
- bool lpdeadonly, bool checkingunique,
- bool uniquedup);
+ bool simpleonly, bool checkingunique,
+ bool uniquedup, bool indexUnchanged);
+static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+ OffsetNumber *deletable, int ndeletable,
+ IndexTuple newitem, OffsetNumber minoff,
+ OffsetNumber maxoff);
+static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
+ int ndeletable, IndexTuple newitem,
+ int *nblocks);
+static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
/*
* _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
@@ -75,6 +84,11 @@ static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
* For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
* don't actually insert.
*
+ * indexUnchanged executor hint indicates if itup is from an
+ * UPDATE that didn't logically change the indexed value, but
+ * must nevertheless have a new entry to point to a successor
+ * version.
+ *
* The result value is only significant for UNIQUE_CHECK_PARTIAL:
* it must be true if the entry is known unique, else false.
* (In the current implementation we'll also return true after a
@@ -83,7 +97,8 @@ static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
*/
bool
_bt_doinsert(Relation rel, IndexTuple itup,
- IndexUniqueCheck checkUnique, Relation heapRel)
+ IndexUniqueCheck checkUnique, bool indexUnchanged,
+ Relation heapRel)
{
bool is_unique = false;
BTInsertStateData insertstate;
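For context, the new boolean reaches _bt_doinsert() through nbtree.c's btinsert() entry point, which is changed by the same patch but is outside this file's diff. A rough sketch of the post-patch wrapper (an approximation for orientation, not the literal committed code) looks like this:

bool
btinsert(Relation rel, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
         IndexUniqueCheck checkUnique,
         bool indexUnchanged,       /* executor's "logically unchanged" hint */
         IndexInfo *indexInfo)
{
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple from the column values */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    /* the hint is forwarded unmodified; all decisions happen in nbtinsert.c */
    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);

    return result;
}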
@@ -238,7 +253,7 @@ search:
* checkingunique.
*/
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
- stack, heapRel);
+ indexUnchanged, stack, heapRel);
_bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
itup, insertstate.itemsz, newitemoff,
insertstate.postingoff, false);
@@ -480,11 +495,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
* items as quickly as we can. We only apply _bt_compare() when
* we get to a non-killed item. We could reuse the bounds to
* avoid _bt_compare() calls for known equal tuples, but it
- * doesn't seem worth it. Workloads with heavy update activity
- * tend to have many deduplication passes, so we'll often avoid
- * most of those comparisons, too (we call _bt_compare() when the
- * posting list tuple is initially encountered, though not when
- * processing later TIDs from the same tuple).
+ * doesn't seem worth it.
*/
if (!inposting)
curitemid = PageGetItemId(page, offset);
@@ -777,6 +788,17 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
* room for the new tuple, this function moves right, trying to find a
* legal page that does.)
*
+ * If 'indexUnchanged' is true, this is for an UPDATE that didn't
+ * logically change the indexed value, but must nevertheless have a new
+ * entry to point to a successor version. This hint from the executor
+ * will influence our behavior when the page might have to be split and
+ * we must consider our options. Bottom-up index deletion can avoid
+ * pathological version-driven page splits, but we only want to go to the
+ * trouble of trying it when we already have moderate confidence that
+ * it's appropriate. The hint should not significantly affect our
+ * behavior over time unless practically all inserts on to the leaf page
+ * get the hint.
+ *
* On exit, insertstate buffer contains the chosen insertion page, and
* the offset within that page is returned. If _bt_findinsertloc needed
* to move right, the lock and pin on the original page are released, and
@@ -793,6 +815,7 @@ static OffsetNumber
_bt_findinsertloc(Relation rel,
BTInsertState insertstate,
bool checkingunique,
+ bool indexUnchanged,
BTStack stack,
Relation heapRel)
{
@@ -817,7 +840,7 @@ _bt_findinsertloc(Relation rel,
if (itup_key->heapkeyspace)
{
/* Keep track of whether checkingunique duplicate seen */
- bool uniquedup = false;
+ bool uniquedup = indexUnchanged;
/*
* If we're inserting into a unique index, we may have to walk right
@@ -874,14 +897,13 @@ _bt_findinsertloc(Relation rel,
}
/*
- * If the target page is full, see if we can obtain enough space using
- * one or more strategies (e.g. erasing LP_DEAD items, deduplication).
- * Page splits are expensive, and should only go ahead when truly
- * necessary.
+ * If the target page cannot fit newitem, try to avoid splitting the
+ * page on insert by performing deletion or deduplication now
*/
if (PageGetFreeSpace(page) < insertstate->itemsz)
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
- checkingunique, uniquedup);
+ checkingunique, uniquedup,
+ indexUnchanged);
}
else
{
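Pulling the relevant fragments of this diff together, the path the executor hint takes through split avoidance reduces to the following condensed excerpt (every line appears verbatim in the hunks above and below; this is only a summary, not compilable on its own):

/* _bt_findinsertloc(): the hint seeds the duplicates flag */
bool    uniquedup = indexUnchanged;
...
if (PageGetFreeSpace(page) < insertstate->itemsz)
    _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
                                 checkingunique, uniquedup,
                                 indexUnchanged);

/* _bt_delete_or_dedup_one_page(): simple deletion first, then fallbacks */
if (simpleonly || (checkingunique && !uniquedup))
    return;
if ((indexUnchanged || uniquedup) &&
    _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
    return;
if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
    _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
                   insertstate->itemsz, checkingunique);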
@@ -921,9 +943,9 @@ _bt_findinsertloc(Relation rel,
*/
if (P_HAS_GARBAGE(opaque))
{
- /* Erase LP_DEAD items (won't deduplicate) */
+ /* Perform simple deletion */
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
- checkingunique, false);
+ false, false, false);
if (PageGetFreeSpace(page) >= insertstate->itemsz)
break; /* OK, now we have enough space */
@@ -970,14 +992,11 @@ _bt_findinsertloc(Relation rel,
/*
* There is an overlapping posting list tuple with its LP_DEAD bit
* set. We don't want to unnecessarily unset its LP_DEAD bit while
- * performing a posting list split, so delete all LP_DEAD items early.
- * This is the only case where LP_DEAD deletes happen even though
- * there is space for newitem on the page.
- *
- * This can only erase LP_DEAD items (it won't deduplicate).
+ * performing a posting list split, so perform simple index tuple
+ * deletion early.
*/
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
- checkingunique, false);
+ false, false, false);
/*
* Do new binary search. New insert location cannot overlap with any
@@ -2606,21 +2625,19 @@ _bt_pgaddtup(Page page,
}
/*
- * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split by attempting
- * a variety of operations.
- *
- * There are two operations performed here: deleting items already marked
- * LP_DEAD, and deduplication. If both operations fail to free enough space
- * for the incoming item then caller will go on to split the page. We always
- * attempt our preferred strategy (which is to delete items whose LP_DEAD bit
- * are set) first. If that doesn't work out we move on to deduplication.
+ * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
*
- * Caller's checkingunique and uniquedup arguments help us decide if we should
- * perform deduplication, which is primarily useful with low cardinality data,
- * but can sometimes absorb version churn.
+ * There are three operations performed here: simple index deletion, bottom-up
+ * index deletion, and deduplication. If all three operations fail to free
+ * enough space for the incoming item then caller will go on to split the
+ * page. We always consider simple deletion first. If that doesn't work out
+ * we consider alternatives. Callers that only want us to consider simple
+ * deletion (without any fallback) ask for that using the 'simpleonly'
+ * argument.
*
- * Callers that only want us to look for/delete LP_DEAD items can ask for that
- * directly by passing true 'lpdeadonly' argument.
+ * We usually pick only one alternative "complex" operation when simple
+ * deletion alone won't prevent a page split. The 'checkingunique',
+ * 'uniquedup', and 'indexUnchanged' arguments are used for that.
*
* Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
* level flag was found set. The flag was useful back when there wasn't
@@ -2638,12 +2655,13 @@ _bt_pgaddtup(Page page,
static void
_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
BTInsertState insertstate,
- bool lpdeadonly, bool checkingunique,
- bool uniquedup)
+ bool simpleonly, bool checkingunique,
+ bool uniquedup, bool indexUnchanged)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
OffsetNumber offnum,
+ minoff,
maxoff;
Buffer buffer = insertstate->buf;
BTScanInsert itup_key = insertstate->itup_key;
@@ -2651,14 +2669,19 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque));
- Assert(lpdeadonly || itup_key->heapkeyspace);
+ Assert(simpleonly || itup_key->heapkeyspace);
+ Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
/*
* Scan over all items to see which ones need to be deleted according to
- * LP_DEAD flags.
+ * LP_DEAD flags. We'll usually manage to delete a few extra items that
+ * are not marked LP_DEAD in passing. Often the extra items that actually
+ * end up getting deleted are items that would have had their LP_DEAD bit
+ * set before long anyway (if we opted not to include them as extras).
*/
+ minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
- for (offnum = P_FIRSTDATAKEY(opaque);
+ for (offnum = minoff;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
@@ -2670,7 +2693,8 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
if (ndeletable > 0)
{
- _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
+ _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
+ insertstate->itup, minoff, maxoff);
insertstate->bounds_valid = false;
/* Return when a page split has already been avoided */
@@ -2682,37 +2706,288 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
}
/*
- * Some callers only want to delete LP_DEAD items. Return early for these
- * callers.
+ * We're done with simple deletion. Return early with callers that only
+ * call here so that simple deletion can be considered. This includes
+ * callers that explicitly ask for this and checkingunique callers that
+ * probably don't have any version churn duplicates on the page.
*
* Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
* return at this point (or when we go on to try either or both of our
* other strategies and they also fail). We do not bother expending a
* separate write to clear it, however. Caller will definitely clear it
- * when it goes on to split the page (plus deduplication knows to clear
- * the flag when it actually modifies the page).
+ * when it goes on to split the page (note also that the deduplication
+ * process will clear the flag in passing, just to keep things tidy).
*/
- if (lpdeadonly)
- return;
-
- /*
- * We can get called in the checkingunique case when there is no reason to
- * believe that there are any duplicates on the page; we should at least
- * still check for LP_DEAD items. If that didn't work out, give up and
- * let caller split the page. Deduplication cannot be justified given
- * there is no reason to think that there are duplicates.
- */
- if (checkingunique && !uniquedup)
+ if (simpleonly || (checkingunique && !uniquedup))
+ {
+ Assert(!indexUnchanged);
return;
+ }
/* Assume bounds about to be invalidated (this is almost certain now) */
insertstate->bounds_valid = false;
/*
- * Perform deduplication pass, though only when it is enabled for the
- * index and known to be safe (it must be an allequalimage index).
+ * Perform bottom-up index deletion pass when executor hint indicated that
+ * incoming item is logically unchanged, or for a unique index that is
+ * known to have physical duplicates for some other reason. (There is a
+ * large overlap between these two cases for a unique index. It's worth
+ * having both triggering conditions in order to apply the optimization in
+ * the event of successive related INSERT and DELETE statements.)
+ *
+ * We'll go on to do a deduplication pass when a bottom-up pass fails to
+ * delete an acceptable amount of free space (a significant fraction of
+ * the page, or space for the new item, whichever is greater).
+ *
+ * Note: Bottom-up index deletion uses the same equality/equivalence
+ * routines as deduplication internally. However, it does not merge
+ * together index tuples, so the same correctness considerations do not
+ * apply. We deliberately omit an index-is-allequalimage test here.
*/
+ if ((indexUnchanged || uniquedup) &&
+ _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
+ return;
+
+ /* Perform deduplication pass (when enabled and index-is-allequalimage) */
if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
_bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
insertstate->itemsz, checkingunique);
}
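The delstate fields that the new _bt_simpledel_pass() below fills in belong to the tableam interface (access/tableam.h), also introduced by this patch series. The field names are visible in the code here; the struct layout sketched below is approximate and shown only for orientation:

typedef struct TM_IndexDelete
{
    ItemPointerData tid;            /* table TID from index tuple */
    int16           id;             /* index into the status[] array */
} TM_IndexDelete;

typedef struct TM_IndexStatus
{
    OffsetNumber idxoffnum;         /* index page offset number for TID */
    bool         knowndeletable;    /* LP_DEAD-set, so certainly deletable */

    /* bottom-up deletion only; simple deletion passes false/0 */
    bool         promising;
    int16        freespace;
} TM_IndexStatus;

typedef struct TM_IndexDeleteOp
{
    bool            bottomup;           /* bottom-up (not simple) deletion? */
    int             bottomupfreespace;  /* bottom-up free space target */
    int             ndeltids;           /* valid entries in deltids/status */
    TM_IndexDelete *deltids;
    TM_IndexStatus *status;
} TM_IndexDeleteOp;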
+
+/*
+ * _bt_simpledel_pass - Simple index tuple deletion pass.
+ *
+ * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
+ * of all such tuples are determined by caller (caller passes these to us as
+ * its 'deletable' argument).
+ *
+ * We might also delete extra index tuples that turn out to be safe to delete
+ * in passing (though they must be cheap to check in passing to begin with).
+ * There is no certainty that any extra tuples will be deleted, though. The
+ * high level goal of the approach we take is to get the most out of each call
+ * here (without noticeably increasing the per-call overhead compared to what
+ * we need to do just to be able to delete the page's LP_DEAD-marked index
+ * tuples).
+ *
+ * The number of extra index tuples that turn out to be deletable might
+ * greatly exceed the number of LP_DEAD-marked index tuples due to various
+ * locality related effects. For example, it's possible that the total number
+ * of table blocks (pointed to by all TIDs on the leaf page) is naturally
+ * quite low, in which case we might end up checking if it's possible to
+ * delete _most_ index tuples on the page (without the tableam needing to
+ * access additional table blocks). The tableam will sometimes stumble upon
+ * _many_ extra deletable index tuples in indexes where this pattern is
+ * common.
+ *
+ * See nbtree/README for further details on simple index tuple deletion.
+ */
+static void
+_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+ OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
+ OffsetNumber minoff, OffsetNumber maxoff)
+{
+ Page page = BufferGetPage(buffer);
+ BlockNumber *deadblocks;
+ int ndeadblocks;
+ TM_IndexDeleteOp delstate;
+ OffsetNumber offnum;
+
+ /* Get array of table blocks pointed to by LP_DEAD-set tuples */
+ deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
+ &ndeadblocks);
+
+ /* Initialize tableam state that describes index deletion operation */
+ delstate.bottomup = false;
+ delstate.bottomupfreespace = 0;
+ delstate.ndeltids = 0;
+ delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
+ delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
+ TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
+ BlockNumber tidblock;
+ void *match;
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
+ match = bsearch(&tidblock, deadblocks, ndeadblocks,
+ sizeof(BlockNumber), _bt_blk_cmp);
+
+ if (!match)
+ {
+ Assert(!ItemIdIsDead(itemid));
+ continue;
+ }
+
+ /*
+ * TID's table block is among those pointed to by the TIDs from
+ * LP_DEAD-bit set tuples on page -- add TID to deltids
+ */
+ odeltid->tid = itup->t_tid;
+ odeltid->id = delstate.ndeltids;
+ ostatus->idxoffnum = offnum;
+ ostatus->knowndeletable = ItemIdIsDead(itemid);
+ ostatus->promising = false; /* unused */
+ ostatus->freespace = 0; /* unused */
+
+ delstate.ndeltids++;
+ }
+ else
+ {
+ int nitem = BTreeTupleGetNPosting(itup);
+
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer tid = BTreeTupleGetPostingN(itup, p);
+
+ tidblock = ItemPointerGetBlockNumber(tid);
+ match = bsearch(&tidblock, deadblocks, ndeadblocks,
+ sizeof(BlockNumber), _bt_blk_cmp);
+
+ if (!match)
+ {
+ Assert(!ItemIdIsDead(itemid));
+ continue;
+ }
+
+ /*
+ * TID's table block is among those pointed to by the TIDs
+ * from LP_DEAD-bit set tuples on page -- add TID to deltids
+ */
+ odeltid->tid = *tid;
+ odeltid->id = delstate.ndeltids;
+ ostatus->idxoffnum = offnum;
+ ostatus->knowndeletable = ItemIdIsDead(itemid);
+ ostatus->promising = false; /* unused */
+ ostatus->freespace = 0; /* unused */
+
+ odeltid++;
+ ostatus++;
+ delstate.ndeltids++;
+ }
+ }
+ }
+
+ pfree(deadblocks);
+
+ Assert(delstate.ndeltids >= ndeletable);
+
+ /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
+ _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
+
+ pfree(delstate.deltids);
+ pfree(delstate.status);
+}
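_bt_delitems_delete_check() itself lives in nbtpage.c and is not part of this file's diff. In rough outline (a simplification, not the committed code), it asks the table AM which of the accumulated TIDs are certainly dead and then physically deletes the corresponding index tuples:

/* simplified outline of the nbtpage.c side, for orientation only */
latestRemovedXid = table_index_delete_tuples(heapRel, delstate);

for (int i = 0; i < delstate->ndeltids; i++)
{
    TM_IndexStatus *status = &delstate->status[delstate->deltids[i].id];

    if (!status->knowndeletable)
        continue;               /* table AM could not prove this TID dead */

    /* remember status->idxoffnum for deletion or posting-list update */
}
/* ... then remove the collected items in a single WAL-logged operation */

The knowndeletable flag is set going in only for LP_DEAD items; the table AM may additionally mark some of the speculative extra TIDs as deletable while it visits the same table blocks anyway.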
+
+/*
+ * _bt_deadblocks() -- Get LP_DEAD related table blocks.
+ *
+ * Builds sorted and unique-ified array of table block numbers from index
+ * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
+ * block from incoming newitem just in case it isn't among the LP_DEAD-related
+ * table blocks.
+ *
+ * Always counting the newitem's table block as an LP_DEAD related block makes
+ * sense because the cost is consistently low; it is practically certain that
+ * the table block will not incur a buffer miss in tableam. On the other hand
+ * the benefit is often quite high. There is a decent chance that there will
+ * be some deletable items from this block, since in general most garbage
+ * tuples became garbage in the recent past (in many cases this won't be the
+ * first logical row that core code added to/modified in table block
+ * recently).
+ *
+ * Returns final array, and sets *nblocks to its final size for caller.
+ */
+static BlockNumber *
+_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
+ IndexTuple newitem, int *nblocks)
+{
+ int spacentids,
+ ntids;
+ BlockNumber *tidblocks;
+
+ /*
+ * Accumulate each TID's block in array whose initial size has space for
+ * one table block per LP_DEAD-set tuple (plus space for the newitem table
+ * block). Array will only need to grow when there are LP_DEAD-marked
+ * posting list tuples (which is not that common).
+ */
+ spacentids = ndeletable + 1;
+ ntids = 0;
+ tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
+
+ /*
+ * First add the table block for the incoming newitem. This is the one
+ * case where simple deletion can visit a table block that doesn't have
+ * any known deletable items.
+ */
+ Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
+
+ for (int i = 0; i < ndeletable; i++)
+ {
+ ItemId itemid = PageGetItemId(page, deletable[i]);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(ItemIdIsDead(itemid));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ if (ntids + 1 > spacentids)
+ {
+ spacentids *= 2;
+ tidblocks = (BlockNumber *)
+ repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+ }
+
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
+ }
+ else
+ {
+ int nposting = BTreeTupleGetNPosting(itup);
+
+ if (ntids + nposting > spacentids)
+ {
+ spacentids = Max(spacentids * 2, ntids + nposting);
+ tidblocks = (BlockNumber *)
+ repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+ }
+
+ for (int j = 0; j < nposting; j++)
+ {
+ ItemPointer tid = BTreeTupleGetPostingN(itup, j);
+
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
+ }
+ }
+ }
+
+ qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+ *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+
+ return tidblocks;
+}
+
+/*
+ * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
+ */
+static inline int
+_bt_blk_cmp(const void *arg1, const void *arg2)
+{
+ BlockNumber b1 = *((BlockNumber *) arg1);
+ BlockNumber b2 = *((BlockNumber *) arg2);
+
+ if (b1 < b2)
+ return -1;
+ else if (b1 > b2)
+ return 1;
+
+ return 0;
+}
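To make the sorted-block membership trick concrete, here is a small, self-contained illustration in plain C (not PostgreSQL code) of the same qsort + unique + bsearch pattern; in the patch itself the dedup step is lib/qunique.h's qunique(), which compacts adjacent duplicates of an already-sorted array and returns the new element count:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef uint32_t BlockNumber;

static int
blk_cmp(const void *arg1, const void *arg2)
{
    BlockNumber b1 = *((const BlockNumber *) arg1);
    BlockNumber b2 = *((const BlockNumber *) arg2);

    if (b1 < b2)
        return -1;
    if (b1 > b2)
        return 1;
    return 0;
}

/* stand-in for qunique(): squeeze out adjacent duplicates, in place */
static size_t
unique_blocks(BlockNumber *a, size_t n)
{
    size_t      j = 0;

    if (n == 0)
        return 0;
    for (size_t i = 1; i < n; i++)
    {
        if (a[i] != a[j])
            a[++j] = a[i];
    }
    return j + 1;
}

int
main(void)
{
    /* table blocks referenced by LP_DEAD TIDs, plus the newitem block */
    BlockNumber deadblocks[] = {7, 3, 7, 12, 3, 3};
    size_t      nblocks = sizeof(deadblocks) / sizeof(deadblocks[0]);
    BlockNumber probe = 12;

    qsort(deadblocks, nblocks, sizeof(BlockNumber), blk_cmp);
    nblocks = unique_blocks(deadblocks, nblocks);       /* {3, 7, 12} */

    if (bsearch(&probe, deadblocks, nblocks, sizeof(BlockNumber), blk_cmp))
        printf("block %u gets checked by this deletion pass\n",
               (unsigned) probe);
    return 0;
}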