From 205737c56c7e49e8de25e6b4afca6a96abbb4e60 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandbossart@gmail.com>
Date: Wed, 3 Aug 2022 09:49:04 -0700
Subject: [PATCH v7 1/2] Introduce optimized routine for linear searches
 through an array of integers.

If SSE2 is available, this function uses it to speed up the search.  Otherwise,
it uses a simple 'for' loop.  This is a prerequisite for a follow-up commit
that will use this function to optimize [sub]xip lookups in
XidInMVCCSnapshot(), but it can be used anywhere that might benefit from such
an optimization.

It might be worthwhile to add an ARM-specific code path to this function in the
future.

Author: Nathan Bossart
Reviewed-by: Andres Freund, John Naylor
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
 src/include/port/pg_lfind.h | 73 +++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 src/include/port/pg_lfind.h

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
new file mode 100644
index 0000000000..27721490a6
--- /dev/null
+++ b/src/include/port/pg_lfind.h
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_lfind.h
+ *	  Optimized linear search routines.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/include/port/pg_lfind.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_LFIND_H
+#define PG_LFIND_H
+
+#ifdef USE_SSE2
+#include "port/pg_bitutils.h"
+#endif
+
+/*
+ * pg_lfind32
+ *
+ * Linear search of the first 'nelem' entries of 'base' for 'key'.  Returns a
+ * pointer to the first entry equal to 'key', or NULL if there is none.
+ */
+static inline uint32 *
+pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
+{
+	uint32		pos = 0;
+
+	/* With SSE2, scan blocks of 16 elements at a time using vector compares. */
+#ifdef USE_SSE2
+	uint32		vec_end = nelem & ~0xF;	/* largest multiple of 16 <= nelem */
+	__m128i		needle = _mm_set1_epi32(key);	/* key in all four lanes */
+
+	for (; pos < vec_end; pos += 16)
+	{
+		/* unaligned loads of four consecutive 4-element chunks */
+		__m128i chunk0 = _mm_loadu_si128((__m128i *) &base[pos]);
+		__m128i chunk1 = _mm_loadu_si128((__m128i *) &base[pos + 4]);
+		__m128i chunk2 = _mm_loadu_si128((__m128i *) &base[pos + 8]);
+		__m128i chunk3 = _mm_loadu_si128((__m128i *) &base[pos + 12]);
+
+		/* lanes equal to the key become all-ones, all others all-zeros */
+		__m128i eq0 = _mm_cmpeq_epi32(needle, chunk0);
+		__m128i eq1 = _mm_cmpeq_epi32(needle, chunk1);
+		__m128i eq2 = _mm_cmpeq_epi32(needle, chunk2);
+		__m128i eq3 = _mm_cmpeq_epi32(needle, chunk3);
+
+		/* narrow 32 -> 16 -> 8 bits so each element maps to one byte, in order */
+		__m128i lo = _mm_packs_epi32(eq0, eq1);
+		__m128i hi = _mm_packs_epi32(eq2, eq3);
+		__m128i bytes = _mm_packs_epi16(lo, hi);
+		uint32 mask = _mm_movemask_epi8(bytes);	/* bit n <=> base[pos + n] */
+
+		/* a nonzero mask means a hit; its lowest set bit is the first match */
+		if (mask != 0)
+			return &base[pos + pg_rightmost_one_pos32(mask)];
+	}
+#endif
+
+	/* Scalar scan over whatever the vector loop (if any) did not cover. */
+	for (; pos < nelem; pos++)
+	{
+		if (base[pos] == key)
+			return &base[pos];
+	}
+
+	return NULL;
+}
+
+#endif							/* PG_LFIND_H */
-- 
2.25.1

