[babl/sse-conversions-2013] SSE Float: draft 5
- From: Daniel Sabo <daniels src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl/sse-conversions-2013] SSE Float: draft 5
- Date: Tue, 2 Apr 2013 05:01:18 +0000 (UTC)
commit 389120e9993cc6c99beff17eaa13b85d09994631
Author: Daniel Sabo <DanielSabo gmail com>
Date: Mon Apr 1 21:50:13 2013 -0700
SSE Float: draft 5
Include an alternate conversion for RaGaBaA -> RGBA. Depending on
the CPU either spin or shuffle is significantly faster. Unless
I can find a consistently fast version I'm going to let them fight
it out in the babl startup benchmarks.
extensions/sse-float.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 96 insertions(+), 3 deletions(-)
---
diff --git a/extensions/sse-float.c b/extensions/sse-float.c
index 96695d4..954e359 100644
--- a/extensions/sse-float.c
+++ b/extensions/sse-float.c
@@ -95,7 +95,7 @@ conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples)
}
static long
-conv_rgbAF_linear_rgbaF_linear (const float *src, float *dst, long samples)
+conv_rgbAF_linear_rgbaF_linear_shuffle (const float *src, float *dst, long samples)
{
long i = 0;
long remainder;
@@ -162,6 +162,81 @@ conv_rgbAF_linear_rgbaF_linear (const float *src, float *dst, long samples)
return samples;
}
+static long
+conv_rgbAF_linear_rgbaF_linear_spin (const float *src, float *dst, long samples)
+{
+ long i = 0;
+ long remainder;
+
+ if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+ {
+ const long n = samples;
+ const __v4sf *s = (const __v4sf*) src;
+ __v4sf *d = (__v4sf*)dst;
+ const __v4sf zero = _mm_setzero_ps();
+ const __v4sf one = _mm_set_ss(1.0f);
+
+ for ( ; i < n; i += 1)
+ {
+ __v4sf pre_abgr0, abgr0, rgba0, raaaa0;
+
+
+ rgba0 = *s;
+ /* Rotate to ABGR */
+ pre_abgr0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(0, 1, 2, 3));
+
+ if (_mm_ucomile_ss(pre_abgr0, zero))
+ {
+ /* Zero RGB */
+ abgr0 = zero;
+ }
+ else
+ {
+ /* Un-Premultiply */
+ raaaa0 = _mm_div_ss(one, pre_abgr0);
+
+ /* Expand reciprocal */
+ raaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)raaaa0, _MM_SHUFFLE(0, 0, 0, 0));
+
+ /* Un-Premultiply */
+ abgr0 = pre_abgr0 * raaaa0;
+ }
+
+ /* Move the original alpha value back in */
+ abgr0 = _mm_move_ss(abgr0, pre_abgr0);
+
+ /* Rotate to ABGR */
+ rgba0 = (__v4sf)_mm_shuffle_epi32((__m128i)abgr0, _MM_SHUFFLE(0, 1, 2, 3));
+
+ *d++ = rgba0;
+ s++;
+ }
+ _mm_empty ();
+ }
+
+ dst += i * 4;
+ src += i * 4;
+ remainder = samples - i;
+ while (remainder--)
+ {
+ float alpha = src[3];
+ float recip;
+ if (alpha <= 0.0f)
+ recip = 0.0f;
+ else
+ recip = 1.0f/alpha;
+ dst[0] = src[0] * recip;
+ dst[1] = src[1] * recip;
+ dst[2] = src[2] * recip;
+ dst[3] = alpha;
+
+ src += 4;
+ dst += 4;
+ }
+
+ return samples;
+}
+
#endif /* defined(USE_SSE2) */
#define o(src, dst) \
@@ -195,8 +270,26 @@ init (void)
(babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
{
- o (rgbaF_linear, rgbAF_linear);
- o (rgbAF_linear, rgbaF_linear);
+ babl_conversion_new(rgbaF_linear,
+ rgbAF_linear,
+ "linear",
+ conv_rgbaF_linear_rgbAF_linear,
+ NULL);
+
+ /* Which of these is faster varies by CPU, and the difference
+ * is big enough that it's worthwhile to include both and
+ * let them fight it out in the babl benchmarks.
+ */
+ babl_conversion_new(rgbAF_linear,
+ rgbaF_linear,
+ "linear",
+ conv_rgbAF_linear_rgbaF_linear_shuffle,
+ NULL);
+ babl_conversion_new(rgbAF_linear,
+ rgbaF_linear,
+ "linear",
+ conv_rgbAF_linear_rgbaF_linear_spin,
+ NULL);
}
#endif /* defined(USE_SSE2) */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]