C4Real: Implement floating point calculation via SSE

Contrary to x87 FPU floating point calculations, using the SSE unit should be networking safe, as the internal precision of the SSE unit is 32 bits when using single precision values. Thus, the compiler's decision on when to spill registers to memory doesn't influence the end results of the calculation. Sine and cosine are calculated using a Taylor approximation after pulling the input values down to an interval of +-pi/4, with a maximum relative error of 1.1e-6, or absolute error of 6.0e-8.
2010-05-19 07:25:05 +02:00 · 2010-05-19 07:25:05 +02:00 · e3542d739d
parent c71c5f5983
commit e3542d739d
5 changed files with 238 additions and 2 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -351,6 +351,8 @@ set(OC_CLONK_SOURCES
 	src/lib/C4RealImpl_Fixed.cpp
 	src/lib/C4RealImpl_Fixed.h
 	src/lib/C4RealImpl_FPU.h
+	src/lib/C4RealImpl_SSE.cpp
+	src/lib/C4RealImpl_SSE.h
 	src/lib/C4Rect.cpp
 	src/lib/C4Rect.h
 	src/lib/C4RTF.cpp
--- a/Makefile.am
+++ b/Makefile.am
@ -102,6 +102,8 @@ src/lib/C4Real.h \
 src/lib/C4RealImpl_Fixed.cpp \
 src/lib/C4RealImpl_Fixed.h \
 src/lib/C4RealImpl_FPU.h \
+src/lib/C4RealImpl_SSE.cpp \
+src/lib/C4RealImpl_SSE.h \
 src/lib/C4Rect.cpp \
 src/lib/C4Rect.h \
 src/lib/C4RTF.cpp \
--- a/src/lib/C4Real.h
+++ b/src/lib/C4Real.h
@ -49,10 +49,10 @@
 #define C4REAL_MODE_SOFTWARE   1
 //#define C4REAL_MODE_SOFT_FLOAT 2
 #define C4REAL_MODE_FPU_FLOAT  3
-//#define C4REAL_MODE_SSE_FLOAT  4
+#define C4REAL_MODE_SSE_FLOAT  4

 #ifndef C4REAL_MODE
-#define C4REAL_MODE C4REAL_MODE_SOFTWARE
+#define C4REAL_MODE C4REAL_MODE_SSE_FLOAT
 #endif

 template<class C4RealImpl>
@ -131,9 +131,11 @@ public:

 typedef C4RealBase<class C4RealImpl_Fixed> C4Real_Fixed;
 typedef C4RealBase<float> C4Real_FPU_Float;
+typedef C4RealBase<class C4RealImpl_SSE> C4Real_SSE_Float;

 #include "C4RealImpl_Fixed.h"
 #include "C4RealImpl_FPU.h"
+#include "C4RealImpl_SSE.h"

 // *** wrap C4Real to requested C4RealBase instantiation

@ -141,11 +143,14 @@ typedef C4RealBase<float> C4Real_FPU_Float;
 typedef C4Real_Fixed C4Real;
 #elif C4REAL_MODE == C4REAL_MODE_FPU_FLOAT
 typedef C4Real_FPU_Float C4Real;
+#elif C4REAL_MODE == C4REAL_MODE_SSE_FLOAT
+typedef C4Real_SSE_Float C4Real;
 #endif

 // Instantiate other C4RealBases as well
 template class C4RealBase<C4RealImpl_Fixed>;
 template class C4RealBase<float>;
+template class C4RealBase<C4RealImpl_SSE>;

 // conversion
 inline float fixtof(const C4Real &x) { return static_cast<float>(x); }
--- a/src/lib/C4RealImpl_SSE.cpp
+++ b/src/lib/C4RealImpl_SSE.cpp
@ -0,0 +1,118 @@
+/*
+ * OpenClonk, http://www.openclonk.org
+ *
+ * Copyright (c) 2010  Nicolas Hake
+ *
+ * Portions might be copyrighted by other authors who have contributed
+ * to OpenClonk.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ * See isc_license.txt for full license and disclaimer.
+ *
+ * "Clonk" is a registered trademark of Matthes Bender.
+ * See clonk_trademark_license.txt for full license.
+ */
+
+#include "C4Include.h"
+#include "C4Real.h"
+
+// Constants required for sine approximation
+const __m128 C4RealImpl_SSE::cephes_deg2rad = _mm_set_ps1(0.017453292f); // pi/180
+const __m128 C4RealImpl_SSE::cephes_FOPI = _mm_set_ps1(1.27323954473516f); // 4/pi
+const __m128 C4RealImpl_SSE::cephes_scaling_factors[3] = {
+	_mm_set_ps1(0.78515625f), _mm_set_ps1(2.4187564849853515625e-4f), _mm_set_ps1(3.77489497744594108e-8f)
+};
+const __m128 C4RealImpl_SSE::cephes_appx_coeffs[3] = {
+	_mm_set_ps(-1.9515295891e-4f, 2.443315711809948e-5f, 2.443315711809948e-5f, -1.9515295891e-4f),
+	_mm_set_ps(8.3321608736e-3f, -1.388731625493765e-3f, -1.388731625493765e-3f, 8.3321608736e-3f),
+	_mm_set_ps(-1.6666654611e-1f, 4.166664568298827e-2f, 4.166664568298827e-2f, -1.6666654611e-1f)
+};
+const __m128 C4RealImpl_SSE::iee754_sign_mask = _mm_set_ps1(-0.0f);
+
+// Branchless approximation of sine and cosine values
+C4RealImpl_SSE C4RealImpl_SSE::SinCos(bool cosine) const
+{
+	// NOTE: Consider storing radians directly instead of degrees to avoid
+	// precision loss due to conversion
+	__m128 radian = _mm_mul_ps(value, cephes_deg2rad);
+#ifndef NDEBUG
+	float float_radians; _mm_store_ss(&float_radians, radian);
+#endif
+	// put rad into all parts of the xmm register
+	radian = _mm_shuffle_ps(radian, radian, 0);
+
+	// cephes library sine/cosine implementation
+	// This calculates a Taylor approximation of rank 7 with slightly
+	// modified coefficients to achieve better precision on the reduced
+	// input range -pi/4..pi/4.
+
+	union m128extract {
+		float f[4];
+		uint32_t i[4];
+		__m128 v;
+	};
+
+	// Store sign and take absolute value of input
+	__m128 sign = _mm_and_ps(radian, iee754_sign_mask);
+	radian = _mm_xor_ps(radian, sign);
+	m128extract sign_bits;
+	sign_bits.v = sign;
+
+	// Select octant of the unit circle
+	__m128 scaling = _mm_mul_ps(radian, C4RealImpl_SSE::cephes_FOPI);
+	int octant = _mm_cvttss_si32(scaling);
+	octant = (octant + 1) & ~1;
+	scaling = _mm_cvtsi32_ss(scaling, octant);
+	scaling = _mm_shuffle_ps(scaling, scaling, 0);
+	uint32_t flip_sign_sine = ((octant & 4) << 29);
+	octant &= 3;
+	uint32_t flip_sign_cosine = flip_sign_sine ^ ((octant & 2) << 30);
+	flip_sign_sine ^= sign_bits.i[0];
+
+	// map input to +-pi/4
+	// note that this get more and more imprecise for abs(radian) > 8192
+	radian = _mm_sub_ps(radian, _mm_mul_ps(scaling, cephes_scaling_factors[0]));
+	radian = _mm_sub_ps(radian, _mm_mul_ps(scaling, cephes_scaling_factors[1]));
+	radian = _mm_sub_ps(radian, _mm_mul_ps(scaling, cephes_scaling_factors[2]));
+
+	// run approximation, calculating four octants at once; correct result
+	// will be selected later
+	__m128 radiansq = _mm_mul_ps(radian, radian);
+	__m128 result = cephes_appx_coeffs[0];
+	result = _mm_mul_ps(result, radiansq);
+	result = _mm_add_ps(result, cephes_appx_coeffs[1]);
+	result = _mm_mul_ps(result, radiansq);
+	result = _mm_add_ps(result, cephes_appx_coeffs[2]);
+	result = _mm_mul_ps(result, radiansq);
+	
+	radiansq = _mm_shuffle_ps(radiansq, radian, _MM_SHUFFLE(0,0,0,0));
+	radiansq = _mm_shuffle_ps(radiansq, radiansq, _MM_SHUFFLE(2,0,0,2));
+	result = _mm_mul_ps(result, radiansq);
+	
+	radiansq = _mm_mul_ps(radiansq, _mm_set_ps(1.0f, -0.5f, -0.5f, 1.0f));
+	result = _mm_add_ps(result, radiansq);
+	result = _mm_add_ps(result, _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f));
+
+	// Select correct octant
+	m128extract rv;
+	rv.v = result;
+	int sinidx = octant;
+	int cosidx = octant ^ 2;
+
+	// adjust sign
+	rv.i[sinidx] ^= flip_sign_sine;
+	rv.i[cosidx] ^= flip_sign_cosine;
+
+	// relative error less than 1.1e-6 for input values between -360 deg
+	// and 360 deg
+	// relative error less than 1.2e-7 between -260 deg and 260 deg.
+	// absolute error less than 6.0e-8 between -4.2e7 and 4.2e7 deg.
+	assert(float_radians == 0.0f || std::abs((rv.f[sinidx] - std::sinf(float_radians)) / std::sinf(float_radians)) < 1.1e-6f);
+	assert(float_radians == 0.0f || std::abs((rv.f[cosidx] - std::cosf(float_radians)) / std::cosf(float_radians)) < 1.1e-6f);
+
+	uint32_t cosine_mask = cosine * ~0;
+	int idx = (cosine_mask & cosidx) | (~cosine_mask & sinidx);
+	return rv.f[idx];
+}
--- a/src/lib/C4RealImpl_SSE.h
+++ b/src/lib/C4RealImpl_SSE.h
@ -0,0 +1,109 @@
+/*
+ * OpenClonk, http://www.openclonk.org
+ *
+ * Copyright (c) 2010  Nicolas Hake
+ *
+ * Portions might be copyrighted by other authors who have contributed
+ * to OpenClonk.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ * See isc_license.txt for full license and disclaimer.
+ *
+ * "Clonk" is a registered trademark of Matthes Bender.
+ * See clonk_trademark_license.txt for full license.
+ */
+
+#ifndef INC_RealImpl_SSE
+#define INC_RealImpl_SSE
+
+#ifndef INC_C4Real
+#error C4RealImpl_SSE.h must not be included by itself; include C4Real.h instead
+#endif
+
+#include <cassert>
+#include <xmmintrin.h>
+
+class C4RealImpl_SSE
+{
+	friend C4Real_SSE_Float Sin(const C4Real_SSE_Float &);
+	friend C4Real_SSE_Float Cos(const C4Real_SSE_Float &);
+	__m128 value;
+
+	static const __m128 iee754_sign_mask; // -0.0
+	static const __m128 cephes_FOPI; // 4/pi
+	static const __m128 cephes_deg2rad; // pi/180
+	static const __m128 cephes_appx_coeffs[3]; // approximation coefficients
+	static const __m128 cephes_scaling_factors[3]; // factors for quick scaling to -pi/4..pi/4
+	C4RealImpl_SSE SinCos(bool cosine) const; // approximation of sine and cosine
+
+public:
+	inline C4RealImpl_SSE()
+		: value(_mm_setzero_ps())
+	{}
+	inline C4RealImpl_SSE(const C4RealImpl_SSE &rhs)
+		: value(rhs.value)
+	{}
+	inline C4RealImpl_SSE(int32_t iVal)
+		: value(_mm_cvtsi32_ss(value, iVal))
+	{}
+	inline C4RealImpl_SSE(float fVal)
+		: value(_mm_set_ss(fVal))
+	{}
+
+	operator int () const
+	{
+		return _mm_cvtss_si32(value);
+	}
+	operator float () const
+	{
+		float f;
+		_mm_store_ss(&f, value);
+		return f;
+	}
+
+	// Arithmetics
+	// We're using _ps intrinsics for everything except division because they
+	// have the same latency as their _ss counterparts, but their representa-
+	// tion is one byte shorter (0F xx instead of F3 0F xx).
+	// DIVPS is about half as fast as DIVSS, so we use the scalar instruction
+	// here.
+	C4RealImpl_SSE &operator += (const C4RealImpl_SSE &rhs) { value = _mm_add_ps(value, rhs.value); return *this; }
+	C4RealImpl_SSE &operator -= (const C4RealImpl_SSE &rhs) { value = _mm_sub_ps(value, rhs.value); return *this; }
+	C4RealImpl_SSE &operator *= (const C4RealImpl_SSE &rhs) { value = _mm_mul_ps(value, rhs.value); return *this; }
+	C4RealImpl_SSE &operator /= (const C4RealImpl_SSE &rhs) { value = _mm_div_ss(value, rhs.value); return *this; }
+
+	// Negation
+	C4RealImpl_SSE operator - () const
+	{
+		C4RealImpl_SSE nrv;
+		nrv -= *this;
+		return nrv;
+	}
+
+	// Comparison
+	// COMISS is faster on newer CPUs than CMPSS, also we get a nice return
+	// value from the intrinsic instead of having to store parts of the XMM
+	// register to a variable to read.
+	bool operator < (const C4RealImpl_SSE &rhs) const { return _mm_comilt_ss(value, rhs.value) != 0; }
+	bool operator <= (const C4RealImpl_SSE &rhs) const { return _mm_comile_ss(value, rhs.value) != 0; }
+	bool operator == (const C4RealImpl_SSE &rhs) const { return _mm_comieq_ss(value, rhs.value) != 0; }
+	bool operator >= (const C4RealImpl_SSE &rhs) const { return _mm_comige_ss(value, rhs.value) != 0; }
+	bool operator > (const C4RealImpl_SSE &rhs) const { return _mm_comigt_ss(value, rhs.value) != 0; }
+	bool operator != (const C4RealImpl_SSE &rhs) const { return _mm_comineq_ss(value, rhs.value) != 0; }
+
+	operator bool () const { return _mm_comineq_ss(value, _mm_setzero_ps()) != 0; }
+	bool operator ! () const { return _mm_comieq_ss(value, _mm_setzero_ps()) != 0; }
+};
+
+inline C4Real_SSE_Float Sin(const C4Real_SSE_Float &real)
+{
+	return C4Real_SSE_Float(static_cast<float>(real.value.SinCos(false)));
+}
+inline C4Real_SSE_Float Cos(const C4Real_SSE_Float &real)
+{
+	return C4Real_SSE_Float(static_cast<float>(real.value.SinCos(true)));
+}
+
+#endif