From 2f4a7747025cd3288453fdabd520638e37e3633c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Mon, 18 Mar 2024 10:44:08 -0500
Subject: [PATCH v4 1/3] pg_lfind32(): Optimize processing remaining elements.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/pg_lfind.h | 42 +++++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..bef0e2d5be 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -95,15 +95,16 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
-	 * on a block of four registers.  Testing for SSE2 has showed this is ~40%
-	 * faster than using a block of two registers.
+	 * on a block of registers.  We first do as much processing as possible
+	 * with a block of 4 registers, then we try to process what remains with a
+	 * block of 2 registers.
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32		nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
-	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
+	uint32		tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
 	bool		assert_result = false;
@@ -157,6 +158,39 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	/*
+	 * Try processing the remaining elements using 2 registers instead of 4.
+	 */
+	nelem_per_iteration = 2 * nelem_per_vector;
+	tail_idx = nelem & ~(nelem_per_iteration - 1);
+
+	for (; i < tail_idx; i += nelem_per_iteration)
+	{
+		Vector32	vals1,
+					vals2,
+					result1,
+					result2,
+					result;
+
+		/* load the next block into 2 registers */
+		vector32_load(&vals1, &base[i]);
+		vector32_load(&vals2, &base[i + nelem_per_vector]);
+
+		/* compare each value to the key */
+		result1 = vector32_eq(keys, vals1);
+		result2 = vector32_eq(keys, vals2);
+
+		/* combine the results into a single variable */
+		result = vector32_or(result1, result2);
+
+		/* see if there was a match */
+		if (vector32_is_highbit_set(result))
+		{
+			Assert(assert_result);
+			return true;
+		}
+	}
 #endif							/* ! USE_NO_SIMD */
 
 	/* Process the remaining elements one at a time. */
-- 
2.25.1