[babl] Add support for hardware half<->float conversions
- From: Daniel Sabo <daniels src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl] Add support for hardware half<->float conversions
- Date: Fri, 1 Jan 2016 20:40:20 +0000 (UTC)
commit 0068fb5745870c50ea428294f7ecd3dcf733eaf7
Author: Daniel Sabo <DanielSabo gmail com>
Date: Sun Dec 27 07:29:55 2015 -0800
Add support for hardware half<->float conversions
These instructions require an Ivy Bridge or newer processor, so I've only
been able to test them under the Intel Software Development Emulator.
babl/babl-cpuaccel.c | 6 +-
babl/babl-cpuaccel.h | 3 +
configure.ac | 22 ++++
extensions/Makefile.am | 3 +
extensions/sse-half.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 303 insertions(+), 1 deletions(-)
---
diff --git a/babl/babl-cpuaccel.c b/babl/babl-cpuaccel.c
index 4e1683e..59fdcdd 100644
--- a/babl/babl-cpuaccel.c
+++ b/babl/babl-cpuaccel.c
@@ -118,7 +118,8 @@ enum
ARCH_X86_INTEL_FEATURE_SSSE3 = 1 << 9,
ARCH_X86_INTEL_FEATURE_SSE4_1 = 1 << 19,
ARCH_X86_INTEL_FEATURE_SSE4_2 = 1 << 20,
- ARCH_X86_INTEL_FEATURE_AVX = 1 << 28
+ ARCH_X86_INTEL_FEATURE_AVX = 1 << 28,
+ ARCH_X86_INTEL_FEATURE_F16C = 1 << 29,
};
#if !defined(ARCH_X86_64) && (defined(PIC) || defined(__PIC__))
@@ -244,6 +245,9 @@ arch_accel_intel (void)
if (ecx & ARCH_X86_INTEL_FEATURE_SSE4_1)
caps |= BABL_CPU_ACCEL_X86_SSE4_1;
+
+ if (ecx & ARCH_X86_INTEL_FEATURE_F16C)
+ caps |= BABL_CPU_ACCEL_X86_F16C;
#endif /* USE_SSE */
}
#endif /* USE_MMX */
diff --git a/babl/babl-cpuaccel.h b/babl/babl-cpuaccel.h
index 57eb118..8040d73 100644
--- a/babl/babl-cpuaccel.h
+++ b/babl/babl-cpuaccel.h
@@ -32,6 +32,9 @@ typedef enum
BABL_CPU_ACCEL_X86_SSE3 = 0x02000000,
BABL_CPU_ACCEL_X86_SSSE3 = 0x00800000,
BABL_CPU_ACCEL_X86_SSE4_1 = 0x00400000,
+ /* BABL_CPU_ACCEL_X86_SSE4_2 = 0x00200000, */
+ /* BABL_CPU_ACCEL_X86_AVX = 0x00080000, */
+ BABL_CPU_ACCEL_X86_F16C = 0x00040000,
/* powerpc accelerations */
BABL_CPU_ACCEL_PPC_ALTIVEC = 0x04000000,
diff --git a/configure.ac b/configure.ac
index f09c7ac..28e9af0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -303,6 +303,10 @@ AC_ARG_ENABLE(sse4_1,
[ --enable-sse4_1 enable SSE4_1 support (default=auto)],,
enable_sse4_1=$enable_sse)
+AC_ARG_ENABLE(f16c,
+ [ --enable-f16c enable hardware half-float support (default=auto)],,
+ enable_f16c=$enable_sse)
+
if test "x$enable_mmx" = xyes; then
BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx')
SSE_EXTRA_CFLAGS=
@@ -378,6 +382,24 @@ if test "x$enable_mmx" = xyes; then
fi
fi
+  if test "x$enable_f16c" = xyes; then
+    BABL_DETECT_CFLAGS(f16c_flag, '-mf16c')
+    # Keep the F16C flags in their own variable. extensions/Makefile.am reads
+    # F16C_EXTRA_CFLAGS for sse-half.la and overwriting SSE4_1_EXTRA_CFLAGS
+    # here would leak -mf16c into every extension built with the SSE4.1 flags.
+    F16C_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $f16c_flag"
+    AC_SUBST(F16C_EXTRA_CFLAGS)
+
+    AC_MSG_CHECKING(whether we can compile half-floating point code)
+
+    CFLAGS="$CFLAGS $sse_flag $f16c_flag"
+
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],[_mm_cvtph_ps ((__m128i)_mm_setzero_ps());])],
+      AC_DEFINE(USE_F16C, 1, [Define to 1 if f16c intrinsics are available.])
+      AC_MSG_RESULT(yes)
+    ,
+      enable_f16c=no
+      AC_MSG_RESULT(no)
+      AC_MSG_WARN([The compiler does not support f16c intrinsics.])
+    )
+  fi
+
fi
,
enable_mmx=no
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index cd7e893..c06aa8f 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -32,6 +32,7 @@ ext_LTLIBRARIES = \
sse2-int8.la \
sse2-int16.la \
sse4-int8.la \
+ sse-half.la \
two-table.la \
ycbcr.la
@@ -50,6 +51,7 @@ sse2_float_la_SOURCES = sse2-float.c
sse2_int8_la_SOURCES = sse2-int8.c
sse2_int16_la_SOURCES = sse2-int16.c
sse4_int8_la_SOURCES = sse4-int8.c
+sse_half_la_SOURCES = sse-half.c
two_table_la_SOURCES = two-table.c two-table-tables.h
ycbcr_la_SOURCES = ycbcr.c
float_la_SOURCES = float.c
@@ -62,3 +64,4 @@ sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
sse4_int8_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS)
+sse_half_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS) $(F16C_EXTRA_CFLAGS)
diff --git a/extensions/sse-half.c b/extensions/sse-half.c
new file mode 100644
index 0000000..ca57ceb
--- /dev/null
+++ b/extensions/sse-half.c
@@ -0,0 +1,270 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2015 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE4_1) && defined(USE_F16C)
+
+#include <immintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+/**
+ * conv_yHalf_yF:
+ * @src: packed 16-bit half-float bit patterns; no alignment is required
+ * @dst: output buffer with room for @samples floats; no alignment is required
+ * @samples: number of values to convert
+ *
+ * Widens half-precision values to single precision with _mm_cvtph_ps,
+ * four values per iteration and then one at a time for the remainder.
+ *
+ * Returns: @samples, unchanged.
+ */
+static inline long
+conv_yHalf_yF (const uint16_t *src, float *dst, long samples)
+{
+  long n = samples;
+
+  while (n >= 4)
+    {
+      /* _mm_loadl_epi64 loads 64 bits without an alignment requirement and
+       * zeroes the upper half of the register.  Dereferencing the buffer
+       * through a uint64_t pointer would break aliasing and alignment rules,
+       * and _mm_insert_epi64 is only available when targeting x86-64. */
+      __m128i in_val  = _mm_loadl_epi64 ((const __m128i *) src);
+      __m128  out_val = _mm_cvtph_ps (in_val);
+
+      _mm_storeu_ps (dst, out_val);
+
+      src += 4;
+      dst += 4;
+      n   -= 4;
+    }
+
+  /* Test n > 0 rather than n != 0 so a negative count cannot run away. */
+  while (n > 0)
+    {
+      /* The 16-bit value is zero-extended into lane 0; all other lanes are 0. */
+      __m128i in_val  = _mm_cvtsi32_si128 (*src++);
+      __m128  out_val = _mm_cvtph_ps (in_val);
+
+      _mm_store_ss (dst++, out_val);
+      n -= 1;
+    }
+
+  return samples;
+}
+
+/* YA half -> YA float.  Each pixel has two components, so the buffer is
+ * converted as 2 * samples scalars and the result is scaled back to pixels. */
+static long
+conv_yaHalf_yaF (const uint16_t *src, float *dst, long samples)
+{
+  const long components = samples * 2;
+  const long converted  = conv_yHalf_yF (src, dst, components);
+
+  return converted / 2;
+}
+
+/* RGB half -> RGB float.  Each pixel has three components, so the buffer is
+ * converted as 3 * samples scalars and the result is scaled back to pixels. */
+static long
+conv_rgbHalf_rgbF (const uint16_t *src, float *dst, long samples)
+{
+  const long components = samples * 3;
+  const long converted  = conv_yHalf_yF (src, dst, components);
+
+  return converted / 3;
+}
+
+/* RGBA half -> RGBA float.  Each pixel has four components, so the buffer is
+ * converted as 4 * samples scalars and the result is scaled back to pixels. */
+static long
+conv_rgbaHalf_rgbaF (const uint16_t *src, float *dst, long samples)
+{
+  const long components = samples * 4;
+  const long converted  = conv_yHalf_yF (src, dst, components);
+
+  return converted / 4;
+}
+
+/**
+ * conv_yF_yHalf:
+ * @src: single-precision input values; no alignment is required
+ * @dst: output buffer with room for @samples 16-bit half-float bit patterns;
+ *       no alignment is required
+ * @samples: number of values to convert
+ *
+ * Narrows single-precision values to half precision with _mm_cvtps_ph using
+ * round-to-nearest, four values per iteration and then one at a time for the
+ * remainder.
+ *
+ * Returns: @samples, unchanged.
+ */
+static inline long
+conv_yF_yHalf (const float *src, uint16_t *dst, long samples)
+{
+  long n = samples;
+
+  while (n >= 4)
+    {
+      __m128  in_val  = _mm_loadu_ps (src);
+      __m128i out_val = _mm_cvtps_ph (in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+      /* Only the low 64 bits (four half values) are written. */
+      _mm_storel_epi64 ((__m128i *) dst, out_val);
+
+      src += 4;
+      dst += 4;
+      n   -= 4;
+    }
+
+  /* Test n > 0 rather than n != 0 so a negative count cannot run away. */
+  while (n > 0)
+    {
+      __m128  in_val  = _mm_load_ss (src++);
+      __m128i out_val = _mm_cvtps_ph (in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+      *dst++ = (uint16_t) _mm_extract_epi16 (out_val, 0);
+      n -= 1;
+    }
+
+  return samples;
+}
+
+/* YA float -> YA half.  Each pixel has two components, so the buffer is
+ * converted as 2 * samples scalars and the result is scaled back to pixels. */
+static long
+conv_yaF_yaHalf (const float *src, uint16_t *dst, long samples)
+{
+  const long components = samples * 2;
+  const long converted  = conv_yF_yHalf (src, dst, components);
+
+  return converted / 2;
+}
+
+/* RGB float -> RGB half.  Each pixel has three components, so the buffer is
+ * converted as 3 * samples scalars and the result is scaled back to pixels. */
+static long
+conv_rgbF_rgbHalf (const float *src, uint16_t *dst, long samples)
+{
+  const long components = samples * 3;
+  const long converted  = conv_yF_yHalf (src, dst, components);
+
+  return converted / 3;
+}
+
+/* RGBA float -> RGBA half.  Each pixel has four components, so the buffer is
+ * converted as 4 * samples scalars and the result is scaled back to pixels. */
+static long
+conv_rgbaF_rgbaHalf (const float *src, uint16_t *dst, long samples)
+{
+  const long components = samples * 4;
+  const long converted  = conv_yF_yHalf (src, dst, components);
+
+  return converted / 4;
+}
+
+#endif /* defined(USE_SSE4_1) && defined(USE_F16C) */
+
+int init (void);
+
+/* Extension entry point.
+ *
+ * When built with both USE_SSE4_1 and USE_F16C this creates the "half" and
+ * "float" formats for the RGBA, RGB, YA and Y models (plain and primed
+ * variants) and, if the CPU reports both SSE4.1 and F16C support, registers
+ * the half<->float conversions between them.  Without those build flags the
+ * function does nothing.  Always returns 0. */
+int
+init (void)
+{
+#if defined(USE_SSE4_1) && defined(USE_F16C)
+  /* RGBA */
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"), babl_type ("float"),
+    babl_component ("R"), babl_component ("G"),
+    babl_component ("B"), babl_component ("A"),
+    NULL);
+  const Babl *rgbaHalf_linear = babl_format_new (
+    babl_model ("RGBA"), babl_type ("half"),
+    babl_component ("R"), babl_component ("G"),
+    babl_component ("B"), babl_component ("A"),
+    NULL);
+  const Babl *rgbaF_gamma = babl_format_new (
+    babl_model ("R'G'B'A"), babl_type ("float"),
+    babl_component ("R'"), babl_component ("G'"),
+    babl_component ("B'"), babl_component ("A"),
+    NULL);
+  const Babl *rgbaHalf_gamma = babl_format_new (
+    babl_model ("R'G'B'A"), babl_type ("half"),
+    babl_component ("R'"), babl_component ("G'"),
+    babl_component ("B'"), babl_component ("A"),
+    NULL);
+
+  /* RGB */
+  const Babl *rgbF_linear = babl_format_new (
+    babl_model ("RGB"), babl_type ("float"),
+    babl_component ("R"), babl_component ("G"), babl_component ("B"),
+    NULL);
+  const Babl *rgbHalf_linear = babl_format_new (
+    babl_model ("RGB"), babl_type ("half"),
+    babl_component ("R"), babl_component ("G"), babl_component ("B"),
+    NULL);
+  const Babl *rgbF_gamma = babl_format_new (
+    babl_model ("R'G'B'"), babl_type ("float"),
+    babl_component ("R'"), babl_component ("G'"), babl_component ("B'"),
+    NULL);
+  const Babl *rgbHalf_gamma = babl_format_new (
+    babl_model ("R'G'B'"), babl_type ("half"),
+    babl_component ("R'"), babl_component ("G'"), babl_component ("B'"),
+    NULL);
+
+  /* YA */
+  const Babl *yaF_linear = babl_format_new (
+    babl_model ("YA"), babl_type ("float"),
+    babl_component ("Y"), babl_component ("A"),
+    NULL);
+  const Babl *yaHalf_linear = babl_format_new (
+    babl_model ("YA"), babl_type ("half"),
+    babl_component ("Y"), babl_component ("A"),
+    NULL);
+  const Babl *yaF_gamma = babl_format_new (
+    babl_model ("Y'A"), babl_type ("float"),
+    babl_component ("Y'"), babl_component ("A"),
+    NULL);
+  const Babl *yaHalf_gamma = babl_format_new (
+    babl_model ("Y'A"), babl_type ("half"),
+    babl_component ("Y'"), babl_component ("A"),
+    NULL);
+
+  /* Y */
+  const Babl *yF_linear = babl_format_new (
+    babl_model ("Y"), babl_type ("float"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yHalf_linear = babl_format_new (
+    babl_model ("Y"), babl_type ("half"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yF_gamma = babl_format_new (
+    babl_model ("Y'"), babl_type ("float"),
+    babl_component ("Y'"),
+    NULL);
+  const Babl *yHalf_gamma = babl_format_new (
+    babl_model ("Y'"), babl_type ("half"),
+    babl_component ("Y'"),
+    NULL);
+
+/* Registers conv_<src>_<dst> for both the _linear and the _gamma format
+ * pair; the same routine serves both pairs. */
+#define CONV(src, dst) \
+  do \
+    { \
+      babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
+      babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
+    } \
+  while (0)
+
+  /* Only register the conversions when the CPU reports both features. */
+  if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE4_1)
+    {
+      if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_F16C)
+        {
+          /* half -> float */
+          CONV (rgbaHalf, rgbaF);
+          CONV (rgbHalf, rgbF);
+          CONV (yaHalf, yaF);
+          CONV (yHalf, yF);
+
+          /* float -> half */
+          CONV (rgbaF, rgbaHalf);
+          CONV (rgbF, rgbHalf);
+          CONV (yaF, yaHalf);
+          CONV (yF, yHalf);
+        }
+    }
+
+#endif /* defined(USE_SSE4_1) && defined(USE_F16C) */
+
+  return 0;
+}
+
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]