From d7864391c455ea77b8e555e40a358c59de1bd702 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Wed, 27 Mar 2024 16:39:24 -0500
Subject: [PATCH v16 1/1] AVX512 popcount support

---
 config/c-compiler.m4                 |  34 +++++++++
 configure                            | 100 +++++++++++++++++++++++++++
 configure.ac                         |  14 ++++
 meson.build                          |  35 ++++++++++
 src/Makefile.global.in               |   4 ++
 src/include/pg_config.h.in           |   3 +
 src/include/port/pg_bitutils.h       |  17 +++++
 src/makefiles/meson.build            |   3 +-
 src/port/Makefile                    |   6 ++
 src/port/meson.build                 |   6 +-
 src/port/pg_bitutils.c               |  56 ++++++---------
 src/port/pg_popcount_avx512.c        |  40 +++++++++++
 src/port/pg_popcount_avx512_choose.c |  61 ++++++++++++++++
 13 files changed, 340 insertions(+), 39 deletions(-)
 create mode 100644 src/port/pg_popcount_avx512.c
 create mode 100644 src/port/pg_popcount_avx512_choose.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 3268a780bb..7d13368b23 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -694,3 +694,37 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_LOONGARCH_CRC32C_INTRINSICS
+
+# PGAC_AVX512_POPCNT_INTRINSICS
+# -----------------------------
+# Check if the compiler supports the AVX512 POPCNT instructions using the
+# _mm512_setzero_si512, _mm512_loadu_si512, _mm512_popcnt_epi64,
+# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
+#
+# An optional compiler flag can be passed as argument
+# (e.g., -mavx512vpopcntdq).  If the intrinsics are supported, sets
+# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
+AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+  [const char buf@<:@sizeof(__m512i)@:>@;
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_POPCNT="$1"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_POPCNT_INTRINSICS
diff --git a/configure b/configure
index 36feeafbb2..86c471f4ec 100755
--- a/configure
+++ b/configure
@@ -647,6 +647,8 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_CRC
+CFLAGS_POPCNT
+PG_POPCNT_OBJS
 LIBOBJS
 OPENSSL
 ZSTD
@@ -17438,6 +17440,104 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
 
 fi
 
+# Check for AVX512 popcount intrinsics
+#
+PG_POPCNT_OBJS=""
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const char buf[sizeof(__m512i)];
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics_=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
+  CFLAGS_POPCNT=""
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const char buf[sizeof(__m512i)];
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" = x"yes"; then
+  CFLAGS_POPCNT="-mavx512vpopcntdq"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+fi
+if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+  PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+
+$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+fi
+
+
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/configure.ac b/configure.ac
index 57f734879e..b1aebb8583 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2063,6 +2063,20 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
   AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
 fi
 
+# Check for AVX512 popcount intrinsics
+#
+PG_POPCNT_OBJS=""
+PGAC_AVX512_POPCNT_INTRINSICS([])
+if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+  PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq])
+fi
+if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+  PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+  AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX512 popcount instructions with a runtime check.])
+fi
+AC_SUBST(PG_POPCNT_OBJS)
+AC_SUBST(CFLAGS_POPCNT)
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/meson.build b/meson.build
index 18b5be842e..fbd2aa3dbd 100644
--- a/meson.build
+++ b/meson.build
@@ -1996,6 +1996,41 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of AVX512 popcount intrinsics.
+###############################################################
+
+cflags_popcnt = []
+if host_cpu == 'x86' or host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+
+int main(void)
+{
+    const char buf[sizeof(__m512i)];
+    INT64 popcnt = 0;
+    __m512i accum = _mm512_setzero_si512();
+    const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+    const __m512i cnt = _mm512_popcnt_epi64(val);
+    accum = _mm512_add_epi64(accum, cnt);
+    popcnt = _mm512_reduce_add_epi64(accum);
+    /* return computed value, to prevent the above being optimized away */
+    return popcnt == 0;
+}
+'''
+
+  if cc.links(prog, name: 'AVX512 popcount without -mavx512vpopcntdq',
+        args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))])
+    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+  elif cc.links(prog, name: 'AVX512 popcount with -mavx512vpopcntdq',
+        args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq'])
+    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+    cflags_popcnt += '-mavx512vpopcntdq'
+  endif
+
+endif
+
 
 ###############################################################
 # Select CRC-32C implementation.
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 8b3f8c24e0..dec467b7dd 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -262,6 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
 CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
 CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
+CFLAGS_POPCNT = @CFLAGS_POPCNT@
 CFLAGS_CRC = @CFLAGS_CRC@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 CXXFLAGS = @CXXFLAGS@
@@ -758,6 +759,9 @@ LIBOBJS = @LIBOBJS@
 # files needed for the chosen CRC-32C implementation
 PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
 
+# files needed for the chosen popcount implementation
+PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
+
 LIBS := -lpgcommon -lpgport $(LIBS)
 
 # to make ws2_32.lib the last library
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 591e1ca3df..c271c06b74 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -680,6 +680,9 @@
 /* Define to 1 to build with assertion checks. (--enable-cassert) */
 #undef USE_ASSERT_CHECKING
 
+/* Define to 1 to use AVX512 popcount instructions with a runtime check. */
+#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+
 /* Define to 1 to build with Bonjour support. (--with-bonjour) */
 #undef USE_BONJOUR
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 53e5239717..fc8d34ad25 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -298,12 +298,29 @@ pg_ceil_log2_64(uint64 num)
 #endif
 #endif
 
+/*
+ * We can also try to use the AVX512 popcount instruction on some systems.
+ * The implementation of that is located in its own file because it may
+ * require special compiler flags that we don't want to apply to any other
+ * files.
+ *
+ * NB: We assume that the availability of AVX512 intrinsics implies
+ * TRY_POPCNT_FAST.
+ */
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+extern bool pg_popcount_avx512_available(void);
+extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+#endif
+
 #ifdef TRY_POPCNT_FAST
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
 
+/* Export pg_popcount_fast() for use in the AVX512 implementation. */
+extern uint64 pg_popcount_fast(const char *buf, int bytes);
+
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int	pg_popcount32(uint32 word);
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index b0f4178b3d..5a592ddaee 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -99,6 +99,7 @@ pgxs_kv = {
   'PERMIT_DECLARATION_AFTER_STATEMENT':
     ' '.join(cflags_no_decl_after_statement),
 
+  'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
   'CFLAGS_CRC': ' '.join(cflags_crc),
   'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
   'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
@@ -177,7 +178,7 @@ pgxs_empty = [
   'WANTED_LANGUAGES',
 
   # Not needed because we don't build the server / PLs with the generated makefile
-  'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
+  'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
   'DTRACEFLAGS', # only server has dtrace probes
 
   'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp',
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..7e154ac379 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS)
 OBJS = \
 	$(LIBOBJS) \
 	$(PG_CRC32C_OBJS) \
+	$(PG_POPCNT_OBJS) \
 	bsearch_arg.o \
 	chklocale.o \
 	inet_net_ntop.o \
@@ -92,6 +93,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
+pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
+
 #
 # Shared library versions of object files
 #
diff --git a/src/port/meson.build b/src/port/meson.build
index 92b593e6ef..7b93233428 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -84,6 +84,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
+  ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK'],
 
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -98,8 +100,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt}
+pgport_sources_cflags = {'crc': [], 'popcnt': []}
 
 foreach f : replace_funcs_neg
   func = f.get(0)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 1197696e97..ada3e777f7 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -114,7 +114,6 @@ static int	pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
-static uint64 pg_popcount_fast(const char *buf, int bytes);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
@@ -142,20 +141,18 @@ pg_popcount_available(void)
 	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
 }
 
-/*
- * These functions get called on the first call to pg_popcount32 etc.
- * They detect whether we can use the asm implementations, and replace
- * the function pointers so that subsequent calls are routed directly to
- * the chosen implementation.
- */
-static int
-pg_popcount32_choose(uint32 word)
+static inline void
+choose_popcount_functions(void)
 {
 	if (pg_popcount_available())
 	{
 		pg_popcount32 = pg_popcount32_fast;
 		pg_popcount64 = pg_popcount64_fast;
 		pg_popcount = pg_popcount_fast;
+#ifdef USE_AVX512_POPCOUNT_WITH_RUNTIME_CHECK
+		if (pg_popcount_avx512_available())
+			pg_popcount = pg_popcount_avx512;
+#endif
 	}
 	else
 	{
@@ -163,45 +160,32 @@ pg_popcount32_choose(uint32 word)
 		pg_popcount64 = pg_popcount64_slow;
 		pg_popcount = pg_popcount_slow;
 	}
+}
 
+/*
+ * These functions get called on the first call to pg_popcount32 etc.
+ * They detect whether we can use the asm implementations, and replace
+ * the function pointers so that subsequent calls are routed directly to
+ * the chosen implementation.
+ */
+static int
+pg_popcount32_choose(uint32 word)
+{
+	choose_popcount_functions();
 	return pg_popcount32(word);
 }
 
 static int
 pg_popcount64_choose(uint64 word)
 {
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
+	choose_popcount_functions();
 	return pg_popcount64(word);
 }
 
 static uint64
 pg_popcount_choose(const char *buf, int bytes)
 {
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
+	choose_popcount_functions();
 	return pg_popcount(buf, bytes);
 }
 
@@ -243,7 +227,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount_fast
  *		Returns the number of 1-bits in buf
  */
-static uint64
+uint64
 pg_popcount_fast(const char *buf, int bytes)
 {
 	uint64		popcnt = 0;
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
new file mode 100644
index 0000000000..c39db13f85
--- /dev/null
+++ b/src/port/pg_popcount_avx512.c
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512.c
+ *	  Holds the pg_popcount() implementation that uses AVX512 instructions.
+ *
+ * Copyright (c) 2019-2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include <immintrin.h>
+
+#include "port/pg_bitutils.h"
+
+/*
+ * pg_popcount_avx512
+ *		Returns the number of 1-bits in buf
+ */
+uint64
+pg_popcount_avx512(const char *buf, int bytes)
+{
+	uint64		popcnt;
+	__m512i		accum = _mm512_setzero_si512();
+
+	for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+	{
+		const		__m512i val = _mm512_loadu_si512((const __m512i *) buf);
+		const		__m512i cnt = _mm512_popcnt_epi64(val);
+
+		accum = _mm512_add_epi64(accum, cnt);
+		buf += sizeof(__m512i);
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accum);
+	return popcnt + pg_popcount_fast(buf, bytes);
+}
diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c
new file mode 100644
index 0000000000..62ebc515ce
--- /dev/null
+++ b/src/port/pg_popcount_avx512_choose.c
@@ -0,0 +1,61 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512_choose.c
+ *    Test whether we can use AVX512 POPCNT instructions.
+ *
+ * Copyright (c) 2019-2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/port/pg_popcount_avx512_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+#include "port/pg_bitutils.h"
+
+/*
+ * Return true if CPUID indicates that the AVX512 POPCNT instruction is
+ * available.
+ *
+ * NB: We assume the availability of AVX512 intrinsics implies availability of
+ * the required CPUID and XGETBV intrinsics.
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+
+	if ((exx[2] & (1 << 14)) != 0)	/* avx512vpopcntdq */
+	{
+		/* Check that the OS has enabled support for the ZMM registers. */
+#ifdef _MSC_VER
+		return (_xgetbv(0) & 0xe0) != 0;
+#else
+		uint64		xcr = 0;
+		uint32		high;
+		uint32		low;
+
+__asm__ __volatile__(" xgetbv\n":"=a"(low), "=d"(high):"c"(xcr));
+		return (low & 0xe0) != 0;
+#endif
+	}
+
+	return false;
+}
-- 
2.25.1

