[babl/wip/msvc: 5/6] build: Enable AVX2 for Visual Studio builds
- From: Chun-wei Fan <fanchunwei src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl/wip/msvc: 5/6] build: Enable AVX2 for Visual Studio builds
- Date: Tue, 21 Jan 2020 10:20:05 +0000 (UTC)
commit e33a41225289cd2a3cabb8c07e0c60c3e66771f4
Author: Chun-wei Fan <fanchunwei src gnome org>
Date: Tue Jan 21 18:11:40 2020 +0800
build: Enable AVX2 for Visual Studio builds
The Visual Studio compiler that we support is capable of building AVX2
intrinsics, so let's enable AVX2 when building for x86 and x64. Update
the code to not use GCC-specific vector types and operators.
extensions/avx2-int8.c | 104 ++++++++++++++++++++++++-------------------------
meson.build | 6 +++
2 files changed, 58 insertions(+), 52 deletions(-)
---
diff --git a/extensions/avx2-int8.c b/extensions/avx2-int8.c
index b6d516566..96f769f3c 100644
--- a/extensions/avx2-int8.c
+++ b/extensions/avx2-int8.c
@@ -74,11 +74,11 @@ conv_yF_linear_y8_gamma (const Babl *conversion,
uint8_t *dst,
long samples)
{
- const __v8sf *src_vec;
+ const __m256 *src_vec;
__m256i *dst_vec;
- const __v8sf scale = _mm256_set1_ps (SCALE);
- const __v8sf zero = _mm256_setzero_ps ();
- const __v8sf half = _mm256_set1_ps (0.5f);
+ const __m256 scale = _mm256_set1_ps (SCALE);
+ const __m256 zero = _mm256_setzero_ps ();
+ const __m256 half = _mm256_set1_ps (0.5f);
while ((uintptr_t) src % 32 && samples > 0)
{
@@ -87,7 +87,7 @@ conv_yF_linear_y8_gamma (const Babl *conversion,
samples--;
}
- src_vec = (const __v8sf *) src;
+ src_vec = (const __m256 *) src;
dst_vec = (__m256i *) dst;
while (samples >= 32)
@@ -96,17 +96,17 @@ conv_yF_linear_y8_gamma (const Babl *conversion,
__m256i i16_01, i16_23;
__m256i i8_0123;
- #define CVT8(i) \
- do \
- { \
- __v8sf yyyyyyyy; \
- \
- yyyyyyyy = scale * src_vec[i] + half; \
- yyyyyyyy = _mm256_max_ps (yyyyyyyy, zero); \
- yyyyyyyy = _mm256_min_ps (yyyyyyyy, scale); \
- i32_##i = _mm256_cvttps_epi32 (yyyyyyyy); \
- i32_##i = _mm256_i32gather_epi32 (linear_to_gamma, i32_##i, 4); \
- } \
+ #define CVT8(i) \
+ do \
+ { \
+ __m256 yyyyyyyy; \
+ \
+ yyyyyyyy = _mm256_add_ps (_mm256_mul_ps (scale, src_vec[i]), half); \
+ yyyyyyyy = _mm256_max_ps (yyyyyyyy, zero); \
+ yyyyyyyy = _mm256_min_ps (yyyyyyyy, scale); \
+ i32_##i = _mm256_cvttps_epi32 (yyyyyyyy); \
+ i32_##i = _mm256_i32gather_epi32 (linear_to_gamma, i32_##i, 4); \
+ } \
while (0)
CVT8 (0);
@@ -154,12 +154,12 @@ conv_yaF_linear_ya8_gamma (const Babl *conversion,
{
if ((uintptr_t) src % 8 == 0)
{
- const __v8sf *src_vec;
+ const __m256 *src_vec;
__m256i *dst_vec;
- const __v8sf scale = _mm256_setr_ps (SCALE, 255.0f, SCALE, 255.0f,
+ const __m256 scale = _mm256_setr_ps (SCALE, 255.0f, SCALE, 255.0f,
SCALE, 255.0f, SCALE, 255.0f);
- const __v8sf zero = _mm256_setzero_ps ();
- const __v8sf half = _mm256_set1_ps (0.5f);
+ const __m256 zero = _mm256_setzero_ps ();
+ const __m256 half = _mm256_set1_ps (0.5f);
const __m256i mask = _mm256_setr_epi32 (-1, 0, -1, 0,
-1, 0, -1, 0);
@@ -171,7 +171,7 @@ conv_yaF_linear_ya8_gamma (const Babl *conversion,
samples--;
}
- src_vec = (const __v8sf *) src;
+ src_vec = (const __m256 *) src;
dst_vec = (__m256i *) dst;
while (samples >= 16)
@@ -180,19 +180,19 @@ conv_yaF_linear_ya8_gamma (const Babl *conversion,
__m256i i16_01, i16_23;
__m256i i8_0123;
- #define CVT8(i) \
- do \
- { \
- __v8sf yayayaya; \
- \
- yayayaya = scale * src_vec[i] + half; \
- yayayaya = _mm256_max_ps (yayayaya, zero); \
- yayayaya = _mm256_min_ps (yayayaya, scale); \
- i32_##i = _mm256_cvttps_epi32 (yayayaya); \
- i32_##i = _mm256_mask_i32gather_epi32 (i32_##i, \
- linear_to_gamma, \
- i32_##i, mask, 4); \
- } \
+ #define CVT8(i) \
+ do \
+ { \
+ __m256 yayayaya; \
+ \
+ yayayaya = _mm256_add_ps (_mm256_mul_ps (scale, src_vec[i]), half); \
+ yayayaya = _mm256_max_ps (yayayaya, zero); \
+ yayayaya = _mm256_min_ps (yayayaya, scale); \
+ i32_##i = _mm256_cvttps_epi32 (yayayaya); \
+ i32_##i = _mm256_mask_i32gather_epi32 (i32_##i, \
+ linear_to_gamma, \
+ i32_##i, mask, 4); \
+ } \
while (0)
CVT8 (0);
@@ -251,12 +251,12 @@ conv_rgbaF_linear_rgba8_gamma (const Babl *conversion,
{
if ((uintptr_t) src % 16 == 0)
{
- const __v8sf *src_vec;
+ const __m256 *src_vec;
__m256i *dst_vec;
- const __v8sf scale = _mm256_setr_ps (SCALE, SCALE, SCALE, 255.0f,
+ const __m256 scale = _mm256_setr_ps (SCALE, SCALE, SCALE, 255.0f,
SCALE, SCALE, SCALE, 255.0f);
- const __v8sf zero = _mm256_setzero_ps ();
- const __v8sf half = _mm256_set1_ps (0.5f);
+ const __m256 zero = _mm256_setzero_ps ();
+ const __m256 half = _mm256_set1_ps (0.5f);
const __m256i mask = _mm256_setr_epi32 (-1, -1, -1, 0,
-1, -1, -1, 0);
@@ -270,7 +270,7 @@ conv_rgbaF_linear_rgba8_gamma (const Babl *conversion,
samples--;
}
- src_vec = (const __v8sf *) src;
+ src_vec = (const __m256 *) src;
dst_vec = (__m256i *) dst;
while (samples >= 8)
@@ -279,19 +279,19 @@ conv_rgbaF_linear_rgba8_gamma (const Babl *conversion,
__m256i i16_01, i16_23;
__m256i i8_0123;
- #define CVT8(i) \
- do \
- { \
- __v8sf rgbargba; \
- \
- rgbargba = scale * src_vec[i] + half; \
- rgbargba = _mm256_max_ps (rgbargba, zero); \
- rgbargba = _mm256_min_ps (rgbargba, scale); \
- i32_##i = _mm256_cvttps_epi32 (rgbargba); \
- i32_##i = _mm256_mask_i32gather_epi32 (i32_##i, \
- linear_to_gamma, \
- i32_##i, mask, 4); \
- } \
+ #define CVT8(i) \
+ do \
+ { \
+ __m256 rgbargba; \
+ \
+ rgbargba = _mm256_add_ps (_mm256_mul_ps (scale, src_vec[i]), half); \
+ rgbargba = _mm256_max_ps (rgbargba, zero); \
+ rgbargba = _mm256_min_ps (rgbargba, scale); \
+ i32_##i = _mm256_cvttps_epi32 (rgbargba); \
+ i32_##i = _mm256_mask_i32gather_epi32 (i32_##i, \
+ linear_to_gamma, \
+ i32_##i, mask, 4); \
+ } \
while (0)
CVT8 (0);
diff --git a/meson.build b/meson.build
index 0fb394cfc..248aa0792 100644
--- a/meson.build
+++ b/meson.build
@@ -297,6 +297,12 @@ if cc.get_id() == 'msvc' and have_x86
if get_option('enable-sse4_1')
conf.set('USE_SSE4_1', 1, description:
'Define to 1 if sse4.1 assembly is available.')
+
+ # avx2 assembly
+ if get_option('enable-avx2')
+ conf.set('USE_AVX2', 1, description:
+ 'Define to 1 if avx2 assembly is available.')
+ endif
endif
endif
endif
[Date Prev][Date Next] [Thread Prev][Thread Next]
[Thread Index] [Date Index] [Author Index]