[babl/wip/msvc: 5/6] build: Enable AVX2 for Visual Studio builds



commit e33a41225289cd2a3cabb8c07e0c60c3e66771f4
Author: Chun-wei Fan <fanchunwei src gnome org>
Date:   Tue Jan 21 18:11:40 2020 +0800

    build: Enable AVX2 for Visual Studio builds
    
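    The Visual Studio compiler that we support is capable of building AVX2
    intrinsics, so let's enable AVX2 when building for x86 and x64.  Update
    the code to use the portable __m256 type and the _mm256_mul_ps() /
    _mm256_add_ps() intrinsics instead of the GCC-specific __v8sf vector
    type and its arithmetic operators.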

 extensions/avx2-int8.c | 104 ++++++++++++++++++++++++-------------------------
 meson.build            |   6 +++
 2 files changed, 58 insertions(+), 52 deletions(-)
---
diff --git a/extensions/avx2-int8.c b/extensions/avx2-int8.c
index b6d516566..96f769f3c 100644
--- a/extensions/avx2-int8.c
+++ b/extensions/avx2-int8.c
@@ -74,11 +74,11 @@ conv_yF_linear_y8_gamma (const Babl  *conversion,
                          uint8_t     *dst,
                          long         samples)
 {
-  const __v8sf *src_vec;
+  const __m256 *src_vec;
   __m256i      *dst_vec;
-  const __v8sf  scale = _mm256_set1_ps (SCALE);
-  const __v8sf  zero  = _mm256_setzero_ps ();
-  const __v8sf  half  = _mm256_set1_ps (0.5f);
+  const __m256  scale = _mm256_set1_ps (SCALE);
+  const __m256  zero  = _mm256_setzero_ps ();
+  const __m256  half  = _mm256_set1_ps (0.5f);
 
   while ((uintptr_t) src % 32 && samples > 0)
     {
@@ -87,7 +87,7 @@ conv_yF_linear_y8_gamma (const Babl  *conversion,
       samples--;
     }
 
-  src_vec = (const __v8sf  *) src;
+  src_vec = (const __m256  *) src;
   dst_vec = (__m256i       *) dst;
 
   while (samples >= 32)
@@ -96,17 +96,17 @@ conv_yF_linear_y8_gamma (const Babl  *conversion,
       __m256i i16_01,       i16_23;
       __m256i i8_0123;
 
-      #define CVT8(i)                                                        \
-        do                                                                   \
-          {                                                                  \
-            __v8sf yyyyyyyy;                                                 \
-                                                                             \
-            yyyyyyyy = scale * src_vec[i] + half;                            \
-            yyyyyyyy = _mm256_max_ps (yyyyyyyy, zero);                       \
-            yyyyyyyy = _mm256_min_ps (yyyyyyyy, scale);                      \
-            i32_##i  = _mm256_cvttps_epi32 (yyyyyyyy);                       \
-            i32_##i  = _mm256_i32gather_epi32 (linear_to_gamma, i32_##i, 4); \
-          }                                                                  \
+      #define CVT8(i)                                                           \
+        do                                                                      \
+          {                                                                     \
+            __m256 yyyyyyyy;                                                    \
+                                                                                \
+            yyyyyyyy = _mm256_add_ps (_mm256_mul_ps (scale, src_vec[i]), half); \
+            yyyyyyyy = _mm256_max_ps (yyyyyyyy, zero);                          \
+            yyyyyyyy = _mm256_min_ps (yyyyyyyy, scale);                         \
+            i32_##i  = _mm256_cvttps_epi32 (yyyyyyyy);                          \
+            i32_##i  = _mm256_i32gather_epi32 (linear_to_gamma, i32_##i, 4);    \
+          }                                                                     \
         while (0)
 
       CVT8 (0);
@@ -154,12 +154,12 @@ conv_yaF_linear_ya8_gamma (const Babl  *conversion,
 {
   if ((uintptr_t) src % 8 == 0)
     {
-      const __v8sf  *src_vec;
+      const __m256  *src_vec;
       __m256i       *dst_vec;
-      const __v8sf   scale = _mm256_setr_ps (SCALE, 255.0f, SCALE, 255.0f,
+      const __m256   scale = _mm256_setr_ps (SCALE, 255.0f, SCALE, 255.0f,
                                              SCALE, 255.0f, SCALE, 255.0f);
-      const __v8sf   zero  = _mm256_setzero_ps ();
-      const __v8sf   half  = _mm256_set1_ps (0.5f);
+      const __m256   zero  = _mm256_setzero_ps ();
+      const __m256   half  = _mm256_set1_ps (0.5f);
       const __m256i  mask  = _mm256_setr_epi32 (-1, 0, -1, 0,
                                                 -1, 0, -1, 0);
 
@@ -171,7 +171,7 @@ conv_yaF_linear_ya8_gamma (const Babl  *conversion,
           samples--;
         }
 
-      src_vec = (const __v8sf  *) src;
+      src_vec = (const __m256  *) src;
       dst_vec = (__m256i       *) dst;
 
       while (samples >= 16)
@@ -180,19 +180,19 @@ conv_yaF_linear_ya8_gamma (const Babl  *conversion,
           __m256i i16_01,       i16_23;
           __m256i i8_0123;
 
-          #define CVT8(i)                                                  \
-            do                                                             \
-              {                                                            \
-                __v8sf yayayaya;                                           \
-                                                                           \
-                yayayaya = scale * src_vec[i] + half;                      \
-                yayayaya = _mm256_max_ps (yayayaya, zero);                 \
-                yayayaya = _mm256_min_ps (yayayaya, scale);                \
-                i32_##i  = _mm256_cvttps_epi32 (yayayaya);                 \
-                i32_##i  = _mm256_mask_i32gather_epi32 (i32_##i,           \
-                                                        linear_to_gamma,   \
-                                                        i32_##i, mask, 4); \
-              }                                                            \
+          #define CVT8(i)                                                           \
+            do                                                                      \
+              {                                                                     \
+                __m256 yayayaya;                                                    \
+                                                                                    \
+                yayayaya = _mm256_add_ps (_mm256_mul_ps (scale, src_vec[i]), half); \
+                yayayaya = _mm256_max_ps (yayayaya, zero);                          \
+                yayayaya = _mm256_min_ps (yayayaya, scale);                         \
+                i32_##i  = _mm256_cvttps_epi32 (yayayaya);                          \
+                i32_##i  = _mm256_mask_i32gather_epi32 (i32_##i,                    \
+                                                        linear_to_gamma,            \
+                                                        i32_##i, mask, 4);          \
+              }                                                                     \
             while (0)
 
           CVT8 (0);
@@ -251,12 +251,12 @@ conv_rgbaF_linear_rgba8_gamma (const Babl  *conversion,
 {
   if ((uintptr_t) src % 16 == 0)
     {
-      const __v8sf  *src_vec;
+      const __m256  *src_vec;
       __m256i       *dst_vec;
-      const __v8sf   scale = _mm256_setr_ps (SCALE, SCALE, SCALE, 255.0f,
+      const __m256   scale = _mm256_setr_ps (SCALE, SCALE, SCALE, 255.0f,
                                              SCALE, SCALE, SCALE, 255.0f);
-      const __v8sf   zero  = _mm256_setzero_ps ();
-      const __v8sf   half  = _mm256_set1_ps (0.5f);
+      const __m256   zero  = _mm256_setzero_ps ();
+      const __m256   half  = _mm256_set1_ps (0.5f);
       const __m256i  mask  = _mm256_setr_epi32 (-1, -1, -1, 0,
                                                 -1, -1, -1, 0);
 
@@ -270,7 +270,7 @@ conv_rgbaF_linear_rgba8_gamma (const Babl  *conversion,
           samples--;
         }
 
-      src_vec = (const __v8sf  *) src;
+      src_vec = (const __m256  *) src;
       dst_vec = (__m256i       *) dst;
 
       while (samples >= 8)
@@ -279,19 +279,19 @@ conv_rgbaF_linear_rgba8_gamma (const Babl  *conversion,
           __m256i i16_01,       i16_23;
           __m256i i8_0123;
 
-          #define CVT8(i)                                                  \
-            do                                                             \
-              {                                                            \
-                __v8sf rgbargba;                                           \
-                                                                           \
-                rgbargba = scale * src_vec[i] + half;                      \
-                rgbargba = _mm256_max_ps (rgbargba, zero);                 \
-                rgbargba = _mm256_min_ps (rgbargba, scale);                \
-                i32_##i  = _mm256_cvttps_epi32 (rgbargba);                 \
-                i32_##i  = _mm256_mask_i32gather_epi32 (i32_##i,           \
-                                                        linear_to_gamma,   \
-                                                        i32_##i, mask, 4); \
-              }                                                            \
+          #define CVT8(i)                                                           \
+            do                                                                      \
+              {                                                                     \
+                __m256 rgbargba;                                                    \
+                                                                                    \
+                rgbargba = _mm256_add_ps (_mm256_mul_ps (scale, src_vec[i]), half); \
+                rgbargba = _mm256_max_ps (rgbargba, zero);                          \
+                rgbargba = _mm256_min_ps (rgbargba, scale);                         \
+                i32_##i  = _mm256_cvttps_epi32 (rgbargba);                          \
+                i32_##i  = _mm256_mask_i32gather_epi32 (i32_##i,                    \
+                                                        linear_to_gamma,            \
+                                                        i32_##i, mask, 4);          \
+              }                                                                     \
             while (0)
 
           CVT8 (0);
diff --git a/meson.build b/meson.build
index 0fb394cfc..248aa0792 100644
--- a/meson.build
+++ b/meson.build
@@ -297,6 +297,12 @@ if cc.get_id() == 'msvc' and have_x86
         if get_option('enable-sse4_1')
           conf.set('USE_SSE4_1', 1, description:
                    'Define to 1 if sse4.1 assembly is available.')
+
+          # avx2 assembly
+          if get_option('enable-avx2')
+            conf.set('USE_AVX2', 1, description:
+              'Define to 1 if avx2 assembly is available.')
+          endif
         endif
       endif
     endif


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]