author		Noah Misch	2019-10-19 03:20:52 +0000
committer	Noah Misch	2019-10-19 03:20:52 +0000
commit		30ee5d17c20dbb282a9952b3048d6ad52d56c371 (patch)
tree		312f0c13adacd8797fd7ef589fd6b826bff7a2cf /src/include/port/atomics
parent		89b4d7744c80ecb3f6bdf8a07ca711a043718db3 (diff)
For all ppc compilers, implement compare_exchange and fetch_add with asm.
This is more like how we handle s_lock.h and arch-x86.h.

Reviewed by Tom Lane.

Discussion: https://postgr.es/m/[email protected]
Diffstat (limited to 'src/include/port/atomics')
-rw-r--r--	src/include/port/atomics/arch-ppc.h	| 231
-rw-r--r--	src/include/port/atomics/generic-xlc.h	| 142
2 files changed, 231 insertions, 142 deletions
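
For context, a minimal caller-side sketch (not part of the patch) of how the new ppc implementations are reached: callers use the generic wrappers in src/include/port/atomics.h, which dispatch to the per-architecture pg_atomic_*_impl routines added below. The helper functions here are hypothetical and assume compilation inside the PostgreSQL tree.

#include "postgres.h"
#include "port/atomics.h"

/* hypothetical helper: hand out monotonically increasing slot numbers */
static uint32
claim_next_slot(volatile pg_atomic_uint32 *counter)
{
	/* pg_atomic_fetch_add_u32 returns the value seen before the addition */
	return pg_atomic_fetch_add_u32(counter, 1);
}

/* hypothetical helper: atomically move a state word from "from" to "to" */
static bool
try_transition(volatile pg_atomic_uint32 *state, uint32 from, uint32 to)
{
	uint32		expected = from;

	/* on failure, "expected" is overwritten with the value actually found */
	return pg_atomic_compare_exchange_u32(state, &expected, to);
}
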
diff --git a/src/include/port/atomics/arch-ppc.h b/src/include/port/atomics/arch-ppc.h
index 344b39449bd..568292d1d57 100644
--- a/src/include/port/atomics/arch-ppc.h
+++ b/src/include/port/atomics/arch-ppc.h
@@ -25,5 +25,236 @@
#define pg_write_barrier_impl() __asm__ __volatile__ ("lwsync" : : : "memory")
#endif
+#define PG_HAVE_ATOMIC_U32_SUPPORT
+typedef struct pg_atomic_uint32
+{
+ volatile uint32 value;
+} pg_atomic_uint32;
+
+/* 64bit atomics are only supported in 64bit mode */
+#ifdef __64BIT__
+#define PG_HAVE_ATOMIC_U64_SUPPORT
+typedef struct pg_atomic_uint64
+{
+ volatile uint64 value pg_attribute_aligned(8);
+} pg_atomic_uint64;
+
+#endif /* __64BIT__ */
+
+/*
+ * This mimics gcc __atomic_compare_exchange_n(..., __ATOMIC_SEQ_CST), but
+ * code generation differs at the end. __atomic_compare_exchange_n():
+ * 100: isync
+ * 104: mfcr r3
+ * 108: rlwinm r3,r3,3,31,31
+ * 10c: bne 120 <.eb+0x10>
+ * 110: clrldi r3,r3,63
+ * 114: addi r1,r1,112
+ * 118: blr
+ * 11c: nop
+ * 120: clrldi r3,r3,63
+ * 124: stw r9,0(r4)
+ * 128: addi r1,r1,112
+ * 12c: blr
+ *
+ * This:
+ * f0: isync
+ * f4: mfcr r9
+ * f8: rldicl. r3,r9,35,63
+ * fc: bne 104 <.eb>
+ * 100: stw r10,0(r4)
+ * 104: addi r1,r1,112
+ * 108: blr
+ *
+ * This implementation may or may not have materially different performance.
+ * It's not exploiting the fact that cr0 still holds the relevant comparison
+ * bits, set during the __asm__. One could fix that by moving more code into
+ * the __asm__. (That would remove the freedom to eliminate dead stores when
+ * the caller ignores "expected", but few callers do.)
+ *
+ * The cmpwi variant may be dead code. In gcc 7.2.0,
+ * __builtin_constant_p(*expected) always reports false.
+ * __atomic_compare_exchange_n() does use cmpwi when its second argument
+ * points to a constant. Hence, using this instead of
+ * __atomic_compare_exchange_n() nominally penalizes the generic.h
+ * pg_atomic_test_set_flag_impl(). Modern GCC will use the generic-gcc.h
+ * version, making the penalty theoretical only.
+ *
+ * Recognizing constant "newval" would be superfluous, because there's no
+ * immediate-operand version of stwcx.
+ */
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
+static inline bool
+pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
+ uint32 *expected, uint32 newval)
+{
+ uint32 found;
+ uint32 condition_register;
+ bool ret;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+ if (__builtin_constant_p(*expected) &&
+ *expected <= PG_INT16_MAX && *expected >= PG_INT16_MIN)
+ __asm__ __volatile__(
+ " sync \n"
+ " lwarx %0,0,%5 \n"
+ " cmpwi %0,%3 \n"
+ " bne $+12 \n" /* branch to isync */
+ " stwcx. %4,0,%5 \n"
+ " bne $-16 \n" /* branch to lwarx */
+ " isync \n"
+ " mfcr %1 \n"
+: "=&r"(found), "=r"(condition_register), "+m"(ptr->value)
+: "i"(*expected), "r"(newval), "r"(&ptr->value)
+: "memory", "cc");
+ else
+#endif
+ __asm__ __volatile__(
+ " sync \n"
+ " lwarx %0,0,%5 \n"
+ " cmpw %0,%3 \n"
+ " bne $+12 \n" /* branch to isync */
+ " stwcx. %4,0,%5 \n"
+ " bne $-16 \n" /* branch to lwarx */
+ " isync \n"
+ " mfcr %1 \n"
+: "=&r"(found), "=r"(condition_register), "+m"(ptr->value)
+: "r"(*expected), "r"(newval), "r"(&ptr->value)
+: "memory", "cc");
+
+ ret = (condition_register >> 29) & 1; /* test eq bit of cr0 */
+ if (!ret)
+ *expected = found;
+ return ret;
+}
+
+/*
+ * This mirrors gcc __sync_fetch_and_add().
+ *
+ * Like tas(), use constraint "=&b" to avoid allocating r0.
+ */
+#define PG_HAVE_ATOMIC_FETCH_ADD_U32
+static inline uint32
+pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
+{
+ uint32 _t;
+ uint32 res;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+ if (__builtin_constant_p(add_) &&
+ add_ <= PG_INT16_MAX && add_ >= PG_INT16_MIN)
+ __asm__ __volatile__(
+ " sync \n"
+ " lwarx %1,0,%4 \n"
+ " addi %0,%1,%3 \n"
+ " stwcx. %0,0,%4 \n"
+ " bne $-12 \n" /* branch to lwarx */
+ " isync \n"
+: "=&r"(_t), "=&b"(res), "+m"(ptr->value)
+: "i"(add_), "r"(&ptr->value)
+: "memory", "cc");
+ else
+#endif
+ __asm__ __volatile__(
+ " sync \n"
+ " lwarx %1,0,%4 \n"
+ " add %0,%1,%3 \n"
+ " stwcx. %0,0,%4 \n"
+ " bne $-12 \n" /* branch to lwarx */
+ " isync \n"
+: "=&r"(_t), "=&r"(res), "+m"(ptr->value)
+: "r"(add_), "r"(&ptr->value)
+: "memory", "cc");
+
+ return res;
+}
+
+#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
+
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
+static inline bool
+pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
+ uint64 *expected, uint64 newval)
+{
+ uint64 found;
+ uint32 condition_register;
+ bool ret;
+
+ /* Like u32, but s/lwarx/ldarx/; s/stwcx/stdcx/; s/cmpw/cmpd/ */
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+ if (__builtin_constant_p(*expected) &&
+ *expected <= PG_INT16_MAX && *expected >= PG_INT16_MIN)
+ __asm__ __volatile__(
+ " sync \n"
+ " ldarx %0,0,%5 \n"
+ " cmpdi %0,%3 \n"
+ " bne $+12 \n" /* branch to isync */
+ " stdcx. %4,0,%5 \n"
+ " bne $-16 \n" /* branch to ldarx */
+ " isync \n"
+ " mfcr %1 \n"
+: "=&r"(found), "=r"(condition_register), "+m"(ptr->value)
+: "i"(*expected), "r"(newval), "r"(&ptr->value)
+: "memory", "cc");
+ else
+#endif
+ __asm__ __volatile__(
+ " sync \n"
+ " ldarx %0,0,%5 \n"
+ " cmpd %0,%3 \n"
+ " bne $+12 \n" /* branch to isync */
+ " stdcx. %4,0,%5 \n"
+ " bne $-16 \n" /* branch to ldarx */
+ " isync \n"
+ " mfcr %1 \n"
+: "=&r"(found), "=r"(condition_register), "+m"(ptr->value)
+: "r"(*expected), "r"(newval), "r"(&ptr->value)
+: "memory", "cc");
+
+ ret = (condition_register >> 29) & 1; /* test eq bit of cr0 */
+ if (!ret)
+ *expected = found;
+ return ret;
+}
+
+#define PG_HAVE_ATOMIC_FETCH_ADD_U64
+static inline uint64
+pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
+{
+ uint64 _t;
+ uint64 res;
+
+ /* Like u32, but s/lwarx/ldarx/; s/stwcx/stdcx/ */
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+ if (__builtin_constant_p(add_) &&
+ add_ <= PG_INT16_MAX && add_ >= PG_INT16_MIN)
+ __asm__ __volatile__(
+ " sync \n"
+ " ldarx %1,0,%4 \n"
+ " addi %0,%1,%3 \n"
+ " stdcx. %0,0,%4 \n"
+ " bne $-12 \n" /* branch to ldarx */
+ " isync \n"
+: "=&r"(_t), "=&b"(res), "+m"(ptr->value)
+: "i"(add_), "r"(&ptr->value)
+: "memory", "cc");
+ else
+#endif
+ __asm__ __volatile__(
+ " sync \n"
+ " ldarx %1,0,%4 \n"
+ " add %0,%1,%3 \n"
+ " stdcx. %0,0,%4 \n"
+ " bne $-12 \n" /* branch to ldarx */
+ " isync \n"
+: "=&r"(_t), "=&r"(res), "+m"(ptr->value)
+: "r"(add_), "r"(&ptr->value)
+: "memory", "cc");
+
+ return res;
+}
+
+#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
+
/* per architecture manual doubleword accesses have single copy atomicity */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
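
A note on the "(condition_register >> 29) & 1" test in the new functions: mfcr copies the whole 32-bit condition register into a general-purpose register, and cr0 occupies its four most significant bits (LT, GT, EQ, SO), so the EQ flag lands at bit 29 counting from the least significant bit. A standalone sketch of the same extraction, for illustration only:

#include <stdbool.h>
#include <stdint.h>

/*
 * After mfcr, cr0 sits in the top nibble of the result:
 * LT = bit 31, GT = bit 30, EQ = bit 29, SO = bit 28 (LSB-0 numbering).
 */
static inline bool
cr0_eq_is_set(uint32_t condition_register)
{
	return (condition_register >> 29) & 1;
}
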
diff --git a/src/include/port/atomics/generic-xlc.h b/src/include/port/atomics/generic-xlc.h
deleted file mode 100644
index 8b5c7329706..00000000000
--- a/src/include/port/atomics/generic-xlc.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * generic-xlc.h
- * Atomic operations for IBM's CC
- *
- * Portions Copyright (c) 2013-2019, PostgreSQL Global Development Group
- *
- * NOTES:
- *
- * Documentation:
- * * Synchronization and atomic built-in functions
- * http://www-01.ibm.com/support/knowledgecenter/SSGH3R_13.1.2/com.ibm.xlcpp131.aix.doc/compiler_ref/bifs_sync_atomic.html
- *
- * src/include/port/atomics/generic-xlc.h
- *
- * -------------------------------------------------------------------------
- */
-
-#if defined(HAVE_ATOMICS)
-
-#define PG_HAVE_ATOMIC_U32_SUPPORT
-typedef struct pg_atomic_uint32
-{
- volatile uint32 value;
-} pg_atomic_uint32;
-
-
-/* 64bit atomics are only supported in 64bit mode */
-#ifdef __64BIT__
-#define PG_HAVE_ATOMIC_U64_SUPPORT
-typedef struct pg_atomic_uint64
-{
- volatile uint64 value pg_attribute_aligned(8);
-} pg_atomic_uint64;
-
-#endif /* __64BIT__ */
-
-#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
-static inline bool
-pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
- uint32 *expected, uint32 newval)
-{
- bool ret;
-
- /*
- * atomics.h specifies sequential consistency ("full barrier semantics")
- * for this interface. Since "lwsync" provides acquire/release
- * consistency only, do not use it here. GCC atomics observe the same
- * restriction; see its rs6000_pre_atomic_barrier().
- */
- __asm__ __volatile__ (" sync \n" ::: "memory");
-
- /*
- * XXX: __compare_and_swap is defined to take signed parameters, but that
- * shouldn't matter since we don't perform any arithmetic operations.
- */
- ret = __compare_and_swap((volatile int*)&ptr->value,
- (int *)expected, (int)newval);
-
- /*
- * xlc's documentation tells us:
- * "If __compare_and_swap is used as a locking primitive, insert a call to
- * the __isync built-in function at the start of any critical sections."
- *
- * The critical section begins immediately after __compare_and_swap().
- */
- __isync();
-
- return ret;
-}
-
-#define PG_HAVE_ATOMIC_FETCH_ADD_U32
-static inline uint32
-pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
-{
- uint32 _t;
- uint32 res;
-
- /*
- * xlc has a no-longer-documented __fetch_and_add() intrinsic. In xlc
- * 12.01.0000.0000, it emits a leading "sync" and trailing "isync". In
- * xlc 13.01.0003.0004, it emits neither. Hence, using the intrinsic
- * would add redundant syncs on xlc 12.
- */
- __asm__ __volatile__(
- " sync \n"
- " lwarx %1,0,%4 \n"
- " add %0,%1,%3 \n"
- " stwcx. %0,0,%4 \n"
- " bne $-12 \n" /* branch to lwarx */
- " isync \n"
-: "=&r"(_t), "=&r"(res), "+m"(ptr->value)
-: "r"(add_), "r"(&ptr->value)
-: "memory", "cc");
-
- return res;
-}
-
-#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
-
-#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
-static inline bool
-pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
- uint64 *expected, uint64 newval)
-{
- bool ret;
-
- __asm__ __volatile__ (" sync \n" ::: "memory");
-
- ret = __compare_and_swaplp((volatile long*)&ptr->value,
- (long *)expected, (long)newval);
-
- __isync();
-
- return ret;
-}
-
-#define PG_HAVE_ATOMIC_FETCH_ADD_U64
-static inline uint64
-pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
-{
- uint64 _t;
- uint64 res;
-
- /* Like u32, but s/lwarx/ldarx/; s/stwcx/stdcx/ */
- __asm__ __volatile__(
- " sync \n"
- " ldarx %1,0,%4 \n"
- " add %0,%1,%3 \n"
- " stdcx. %0,0,%4 \n"
- " bne $-12 \n" /* branch to ldarx */
- " isync \n"
-: "=&r"(_t), "=&r"(res), "+m"(ptr->value)
-: "r"(add_), "r"(&ptr->value)
-: "memory", "cc");
-
- return res;
-}
-
-#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
-
-#endif /* defined(HAVE_ATOMICS) */
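
The comment added to arch-ppc.h above contrasts the hand-written sequence with gcc's __atomic_compare_exchange_n(). For reference, a sketch of that builtin baseline, roughly what the generic-gcc.h fallback uses, written here as standalone C rather than as PostgreSQL code:

#include <stdbool.h>
#include <stdint.h>

/* strong, sequentially consistent compare-and-swap via the gcc builtin */
static inline bool
cas_u32_seq_cst(volatile uint32_t *value, uint32_t *expected, uint32_t newval)
{
	return __atomic_compare_exchange_n(value, expected, newval,
									   false,				/* strong variant */
									   __ATOMIC_SEQ_CST,	/* success order */
									   __ATOMIC_SEQ_CST);	/* failure order */
}
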