[gegl] buffer: add more SIMD variants of buffer resamplers

From: Øyvind "pippin" Kolås <ok src gnome org>
To: commits-list gnome org
Cc:
Subject: [gegl] buffer: add more SIMD variants of buffer resamplers
Date: Sun, 16 Jan 2022 07:01:59 +0000 (UTC)
commit 3b84a690e242d360657ec9cd3d7958d7c52902ba
Author: Øyvind Kolås <pippin gimp org>
Date:   Sun Jan 16 04:24:26 2022 +0100

    buffer: add more SIMD variants of buffer resamplers
    
    This is for scaling for display and generation of mipmaps.

 gegl/buffer/gegl-algorithms-2x2-downscale.inc |    2 +-
 gegl/buffer/gegl-algorithms-bilinear.inc      |    2 +-
 gegl/buffer/gegl-algorithms-boxfilter.inc     |    2 +-
 gegl/buffer/gegl-algorithms-x86-64-v2.c       |    4 +
 gegl/buffer/gegl-algorithms-x86-64-v3.c       |    4 +
 gegl/buffer/gegl-algorithms.c                 | 1102 ++++++++++++-------------
 gegl/buffer/gegl-algorithms.h                 |  159 +---
 gegl/buffer/gegl-buffer-private.h             |   36 +
 gegl/buffer/gegl-buffer.c                     |  174 ++++
 gegl/buffer/gegl-tile-handler-zoom.c          |   13 +-
 gegl/buffer/meson.build                       |   15 +
 gegl/gegl-init.c                              |   11 +-
 gegl/meson.build                              |   10 +-
 meson.build                                   |    8 +
 14 files changed, 830 insertions(+), 712 deletions(-)
---
diff --git a/gegl/buffer/gegl-algorithms-2x2-downscale.inc b/gegl/buffer/gegl-algorithms-2x2-downscale.inc
index e2a8cc786..2946fa436 100644
--- a/gegl/buffer/gegl-algorithms-2x2-downscale.inc
+++ b/gegl/buffer/gegl-algorithms-2x2-downscale.inc
@@ -1,6 +1,6 @@
 #define S(a)   ((DOWNSCALE_SUM)(a))
 
-void
+static void
 DOWNSCALE_FUNCNAME (const Babl *format,
                     gint        src_width,
                     gint        src_height,
diff --git a/gegl/buffer/gegl-algorithms-bilinear.inc b/gegl/buffer/gegl-algorithms-bilinear.inc
index 36486fcaf..1beb569fb 100644
--- a/gegl/buffer/gegl-algorithms-bilinear.inc
+++ b/gegl/buffer/gegl-algorithms-bilinear.inc
@@ -1,4 +1,4 @@
-void
+static void
 BILINEAR_FUNCNAME (guchar                    *dest_buf,
                    const guchar              *source_buf,
                    const GeglRectangle *dst_rect,
diff --git a/gegl/buffer/gegl-algorithms-boxfilter.inc b/gegl/buffer/gegl-algorithms-boxfilter.inc
index 1c6f46c7a..fd67dce67 100644
--- a/gegl/buffer/gegl-algorithms-boxfilter.inc
+++ b/gegl/buffer/gegl-algorithms-boxfilter.inc
@@ -1,4 +1,4 @@
-void
+static void
 BOXFILTER_FUNCNAME (guchar                    *dest_buf,
                     const guchar              *source_buf,
                     const GeglRectangle *dst_rect,
diff --git a/gegl/buffer/gegl-algorithms-x86-64-v2.c b/gegl/buffer/gegl-algorithms-x86-64-v2.c
new file mode 100644
index 000000000..393edd0e3
--- /dev/null
+++ b/gegl/buffer/gegl-algorithms-x86-64-v2.c
@@ -0,0 +1,4 @@
+
+#define GEGL_SIMD_SUFFIX(symbol)  symbol##_x86_64_v2
+
+#include "gegl-algorithms.c"
diff --git a/gegl/buffer/gegl-algorithms-x86-64-v3.c b/gegl/buffer/gegl-algorithms-x86-64-v3.c
new file mode 100644
index 000000000..b992476fe
--- /dev/null
+++ b/gegl/buffer/gegl-algorithms-x86-64-v3.c
@@ -0,0 +1,4 @@
+
+#define GEGL_SIMD_SUFFIX(symbol)  symbol##_x86_64_v3
+
+#include "gegl-algorithms.c"
diff --git a/gegl/buffer/gegl-algorithms.c b/gegl/buffer/gegl-algorithms.c
index 6545ee97c..5299b771a 100644
--- a/gegl/buffer/gegl-algorithms.c
+++ b/gegl/buffer/gegl-algorithms.c
@@ -31,15 +31,16 @@
 
 #include <math.h>
 
-void gegl_downscale_2x2 (const Babl *format,
-                         gint        src_width,
-                         gint        src_height,
-                         guchar     *src_data,
-                         gint        src_rowstride,
-                         guchar     *dst_data,
-                         gint        dst_rowstride)
+void
+GEGL_SIMD_SUFFIX(gegl_downscale_2x2) (const Babl *format,
+                                      gint        src_width,
+                                      gint        src_height,
+                                      guchar     *src_data,
+                                      gint        src_rowstride,
+                                      guchar     *dst_data,
+                                      gint        dst_rowstride)
 {
-  gegl_downscale_2x2_get_fun (format)(format, src_width, src_height,
+  GEGL_SIMD_SUFFIX(gegl_downscale_2x2_get_fun) (format)(format, src_width, src_height,
                                               src_data, src_rowstride,
                                               dst_data, dst_rowstride);;
 }
@@ -54,93 +55,10 @@ static void inline *align_16 (unsigned char *ret)
   return ret;
 }
 
-static void
-gegl_downscale_2x2_generic (const Babl *format,
-                            gint        src_width,
-                            gint        src_height,
-                            guchar     *src_data,
-                            gint        src_rowstride,
-                            guchar     *dst_data,
-                            gint        dst_rowstride)
-{
-  const Babl *tmp_format = babl_format_with_space ("RGBA float", format);
-  const Babl *from_fish  = babl_fish (format, tmp_format);
-  const Babl *to_fish    = babl_fish (tmp_format, format);
-  const gint tmp_bpp     = 4 * 4;
-  gint dst_width         = src_width / 2;
-  gint dst_height        = src_height / 2;
-  gint in_tmp_rowstride  = src_width * tmp_bpp;
-  gint out_tmp_rowstride = dst_width * tmp_bpp;
-  gint do_free = 0;
-
-  void *in_tmp;
-  void *out_tmp;
-
-  if (src_height * in_tmp_rowstride + dst_height * out_tmp_rowstride < GEGL_ALLOCA_THRESHOLD)
-  {
-    in_tmp = align_16 (alloca (src_height * in_tmp_rowstride + 16));
-    out_tmp = align_16 (alloca (dst_height * out_tmp_rowstride + 16));
-  }
-  else
-  {
-    in_tmp = gegl_scratch_alloc (src_height * in_tmp_rowstride);
-    out_tmp = gegl_scratch_alloc (dst_height * out_tmp_rowstride);
-    do_free = 1;
-  }
-
-  babl_process_rows (from_fish,
-                     src_data, src_rowstride,
-                     in_tmp,   in_tmp_rowstride,
-                     src_width, src_height);
-  gegl_downscale_2x2_float (tmp_format, src_width, src_height,
-                            in_tmp,  in_tmp_rowstride,
-                            out_tmp, out_tmp_rowstride);
-  babl_process_rows (to_fish,
-                     out_tmp,   out_tmp_rowstride,
-                     dst_data,  dst_rowstride,
-                     dst_width, dst_height);
-
-  if (do_free)
-   {
-     gegl_scratch_free (out_tmp);
-     gegl_scratch_free (in_tmp);
-   }
-}
-
-#define LUT_DIVISOR 16
-
-static uint16_t lut_u8_to_u16[256];
-static float    lut_u8_to_u16f[256];
-static uint8_t  lut_u16_to_u8[65536/LUT_DIVISOR];
-
-void _gegl_init_u8_lut (void);
-void _gegl_init_u8_lut (void)
-{
-  static int lut_inited = 0;
-  uint8_t u8_ramp[256];
-  uint16_t u16_ramp[65536/LUT_DIVISOR];
-  int i;
-
-  if (lut_inited)
-    return;
-  for (i = 0; i < 256; i++) u8_ramp[i]=i;
-  for (i = 0; i < 65536/LUT_DIVISOR; i++) u16_ramp[i]=i * LUT_DIVISOR;
-  babl_process (babl_fish (babl_format ("Y' u8"), babl_format("Y u16")),
-                &u8_ramp[0], &lut_u8_to_u16[0],
-                256);
-  for (i = 0; i < 256; i++)
-  {
-    lut_u8_to_u16[i] = lut_u8_to_u16[i]/LUT_DIVISOR;
-    lut_u8_to_u16f[i] = lut_u8_to_u16[i];
-  }
-
-  babl_process (babl_fish (babl_format ("Y u16"), babl_format("Y' u8")),
-                &u16_ramp[0], &lut_u16_to_u8[0],
-                65536/LUT_DIVISOR);
-
-  lut_inited = 1;
-}
 
+extern uint16_t gegl_lut_u8_to_u16[256];
+extern float    gegl_lut_u8_to_u16f[256];
+extern uint8_t  gegl_lut_u16_to_u8[65536/GEGL_ALGORITHMS_LUT_DIVISOR];
 
 
 static void
@@ -239,8 +157,8 @@ gegl_boxfilter_u8_nl (guchar              *dest_buf,
               const gfloat m = middle_weight;
               const gfloat b = bottom_weight;
 
-#define C(val)                     lut_u8_to_u16f[(val)]
-#define BOXFILTER_ROUND(val)       lut_u16_to_u8[((int)((val)+0.5f))]
+#define C(val)                     gegl_lut_u8_to_u16f[(val)]
+#define BOXFILTER_ROUND(val)       gegl_lut_u16_to_u8[((int)((val)+0.5f))]
 #define BOXFILTER_ROUND_ALPHA(val) ((int)((val)+0.5f))
               dst[0] = BOXFILTER_ROUND(
                 (C(src[0][0]) * t + C(src[3][0]) * m + C(src[6][0]) * b) * l +
@@ -482,8 +400,8 @@ gegl_bilinear_u8_nl (guchar              *dest_buf,
     }\
 }while(0)
 
-#define C(val)                    lut_u8_to_u16f[(val)]
-#define BILINEAR_ROUND(val)       lut_u16_to_u8[((int)((val)+0.5f))]
+#define C(val)                    gegl_lut_u8_to_u16f[(val)]
+#define BILINEAR_ROUND(val)       gegl_lut_u16_to_u8[((int)((val)+0.5f))]
 #define BILINEAR_ROUND_ALPHA(val) ((int)((val)+0.5f))
 
    switch (components)
@@ -701,57 +619,57 @@ break;\
       switch (components)
       {
         CASE(1,
-            ((uint8_t *)dst)[0] = lut_u16_to_u8[ (lut_u8_to_u16[aa[0]] +
-                                                  lut_u8_to_u16[ab[0]] +
-                                                  lut_u8_to_u16[ba[0]] +
-                                                  lut_u8_to_u16[bb[0]])>>2 ];);
+            ((uint8_t *)dst)[0] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[0]] +
+                                                  gegl_lut_u8_to_u16[ab[0]] +
+                                                  gegl_lut_u8_to_u16[ba[0]] +
+                                                  gegl_lut_u8_to_u16[bb[0]])>>2 ];);
         CASE(2,
-            ((uint8_t *)dst)[0] = lut_u16_to_u8[ (lut_u8_to_u16[aa[0]] +
-                                                  lut_u8_to_u16[ab[0]] +
-                                                  lut_u8_to_u16[ba[0]] +
-                                                  lut_u8_to_u16[bb[0]])>>2 ];
-            ((uint8_t *)dst)[1] = lut_u16_to_u8[ (lut_u8_to_u16[aa[1]] +
-                                                  lut_u8_to_u16[ab[1]] +
-                                                  lut_u8_to_u16[ba[1]] +
-                                                  lut_u8_to_u16[bb[1]])>>2 ];);
+            ((uint8_t *)dst)[0] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[0]] +
+                                                  gegl_lut_u8_to_u16[ab[0]] +
+                                                  gegl_lut_u8_to_u16[ba[0]] +
+                                                  gegl_lut_u8_to_u16[bb[0]])>>2 ];
+            ((uint8_t *)dst)[1] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[1]] +
+                                                  gegl_lut_u8_to_u16[ab[1]] +
+                                                  gegl_lut_u8_to_u16[ba[1]] +
+                                                  gegl_lut_u8_to_u16[bb[1]])>>2 ];);
         CASE(3,
-            ((uint8_t *)dst)[0] = lut_u16_to_u8[ (lut_u8_to_u16[aa[0]] +
-                                                  lut_u8_to_u16[ab[0]] +
-                                                  lut_u8_to_u16[ba[0]] +
-                                                  lut_u8_to_u16[bb[0]])>>2 ];
-            ((uint8_t *)dst)[1] = lut_u16_to_u8[ (lut_u8_to_u16[aa[1]] +
-                                                  lut_u8_to_u16[ab[1]] +
-                                                  lut_u8_to_u16[ba[1]] +
-                                                  lut_u8_to_u16[bb[1]])>>2 ];
-            ((uint8_t *)dst)[2] = lut_u16_to_u8[ (lut_u8_to_u16[aa[2]] +
-                                                  lut_u8_to_u16[ab[2]] +
-                                                  lut_u8_to_u16[ba[2]] +
-                                                  lut_u8_to_u16[bb[2]])>>2 ];);
+            ((uint8_t *)dst)[0] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[0]] +
+                                                  gegl_lut_u8_to_u16[ab[0]] +
+                                                  gegl_lut_u8_to_u16[ba[0]] +
+                                                  gegl_lut_u8_to_u16[bb[0]])>>2 ];
+            ((uint8_t *)dst)[1] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[1]] +
+                                                  gegl_lut_u8_to_u16[ab[1]] +
+                                                  gegl_lut_u8_to_u16[ba[1]] +
+                                                  gegl_lut_u8_to_u16[bb[1]])>>2 ];
+            ((uint8_t *)dst)[2] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[2]] +
+                                                  gegl_lut_u8_to_u16[ab[2]] +
+                                                  gegl_lut_u8_to_u16[ba[2]] +
+                                                  gegl_lut_u8_to_u16[bb[2]])>>2 ];);
         CASE(4,
-            ((uint8_t *)dst)[0] = lut_u16_to_u8[ (lut_u8_to_u16[aa[0]] +
-                                                  lut_u8_to_u16[ab[0]] +
-                                                  lut_u8_to_u16[ba[0]] +
-                                                  lut_u8_to_u16[bb[0]])>>2 ];
-            ((uint8_t *)dst)[1] = lut_u16_to_u8[ (lut_u8_to_u16[aa[1]] +
-                                                  lut_u8_to_u16[ab[1]] +
-                                                  lut_u8_to_u16[ba[1]] +
-                                                  lut_u8_to_u16[bb[1]])>>2 ];
-            ((uint8_t *)dst)[2] = lut_u16_to_u8[ (lut_u8_to_u16[aa[2]] +
-                                                  lut_u8_to_u16[ab[2]] +
-                                                  lut_u8_to_u16[ba[2]] +
-                                                  lut_u8_to_u16[bb[2]])>>2 ];
-            ((uint8_t *)dst)[3] = lut_u16_to_u8[ (lut_u8_to_u16[aa[3]] +
-                                                  lut_u8_to_u16[ab[3]] +
-                                                  lut_u8_to_u16[ba[3]] +
-                                                  lut_u8_to_u16[bb[3]])>>2 ];);
+            ((uint8_t *)dst)[0] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[0]] +
+                                                  gegl_lut_u8_to_u16[ab[0]] +
+                                                  gegl_lut_u8_to_u16[ba[0]] +
+                                                  gegl_lut_u8_to_u16[bb[0]])>>2 ];
+            ((uint8_t *)dst)[1] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[1]] +
+                                                  gegl_lut_u8_to_u16[ab[1]] +
+                                                  gegl_lut_u8_to_u16[ba[1]] +
+                                                  gegl_lut_u8_to_u16[bb[1]])>>2 ];
+            ((uint8_t *)dst)[2] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[2]] +
+                                                  gegl_lut_u8_to_u16[ab[2]] +
+                                                  gegl_lut_u8_to_u16[ba[2]] +
+                                                  gegl_lut_u8_to_u16[bb[2]])>>2 ];
+            ((uint8_t *)dst)[3] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[3]] +
+                                                  gegl_lut_u8_to_u16[ab[3]] +
+                                                  gegl_lut_u8_to_u16[ba[3]] +
+                                                  gegl_lut_u8_to_u16[bb[3]])>>2 ];);
         default:
          CASE(0,
             for (gint i = 0; i < components; i++)
               ((uint8_t *)dst)[i] =
-                lut_u16_to_u8[ (lut_u8_to_u16[aa[i]] +
-                                lut_u8_to_u16[ab[i]] +
-                                lut_u8_to_u16[ba[i]] +
-                                lut_u8_to_u16[bb[i]])>>2 ];);
+                gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[i]] +
+                                gegl_lut_u8_to_u16[ab[i]] +
+                                gegl_lut_u8_to_u16[ba[i]] +
+                                gegl_lut_u8_to_u16[bb[i]])>>2 ];);
       }
 }
 
@@ -775,33 +693,33 @@ gegl_downscale_2x2_u8_nl_alpha (const Babl *format,
       switch (components)
       {
         CASE(2,
-            ((uint8_t *)dst)[0] = lut_u16_to_u8[ (lut_u8_to_u16[aa[0]] +
-                                                  lut_u8_to_u16[ab[0]] +
-                                                  lut_u8_to_u16[ba[0]] +
-                                                  lut_u8_to_u16[bb[0]])>>2 ];
+            ((uint8_t *)dst)[0] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[0]] +
+                                                  gegl_lut_u8_to_u16[ab[0]] +
+                                                  gegl_lut_u8_to_u16[ba[0]] +
+                                                  gegl_lut_u8_to_u16[bb[0]])>>2 ];
             ((uint8_t *)dst)[1] = (aa[1] + ab[1] + ba[1] + bb[1])>>2;);
         CASE(4,
-            ((uint8_t *)dst)[0] = lut_u16_to_u8[ (lut_u8_to_u16[aa[0]] +
-                                                  lut_u8_to_u16[ab[0]] +
-                                                  lut_u8_to_u16[ba[0]] +
-                                                  lut_u8_to_u16[bb[0]])>>2 ];
-            ((uint8_t *)dst)[1] = lut_u16_to_u8[ (lut_u8_to_u16[aa[1]] +
-                                                  lut_u8_to_u16[ab[1]] +
-                                                  lut_u8_to_u16[ba[1]] +
-                                                  lut_u8_to_u16[bb[1]])>>2 ];
-            ((uint8_t *)dst)[2] = lut_u16_to_u8[ (lut_u8_to_u16[aa[2]] +
-                                                  lut_u8_to_u16[ab[2]] +
-                                                  lut_u8_to_u16[ba[2]] +
-                                                  lut_u8_to_u16[bb[2]])>>2 ];
+            ((uint8_t *)dst)[0] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[0]] +
+                                                  gegl_lut_u8_to_u16[ab[0]] +
+                                                  gegl_lut_u8_to_u16[ba[0]] +
+                                                  gegl_lut_u8_to_u16[bb[0]])>>2 ];
+            ((uint8_t *)dst)[1] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[1]] +
+                                                  gegl_lut_u8_to_u16[ab[1]] +
+                                                  gegl_lut_u8_to_u16[ba[1]] +
+                                                  gegl_lut_u8_to_u16[bb[1]])>>2 ];
+            ((uint8_t *)dst)[2] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[2]] +
+                                                  gegl_lut_u8_to_u16[ab[2]] +
+                                                  gegl_lut_u8_to_u16[ba[2]] +
+                                                  gegl_lut_u8_to_u16[bb[2]])>>2 ];
             ((uint8_t *)dst)[3] = (aa[3] + ab[3] + ba[3] + bb[3])>>2;);
         default:
          CASE(0,
             for (gint i = 0; i < components - 1; i++)
               ((uint8_t *)dst)[i] =
-                lut_u16_to_u8[ (lut_u8_to_u16[aa[i]] +
-                                lut_u8_to_u16[ab[i]] +
-                                lut_u8_to_u16[ba[i]] +
-                                lut_u8_to_u16[bb[i]])>>2 ];
+                gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[i]] +
+                                gegl_lut_u8_to_u16[ab[i]] +
+                                gegl_lut_u8_to_u16[ba[i]] +
+                                gegl_lut_u8_to_u16[bb[i]])>>2 ];
             ((uint8_t *)dst)[components-1] = (aa[components-1] + ab[components-1] + ba[components-1] + 
bb[components-1])>>2;);
       }
 #undef CASE
@@ -837,18 +755,18 @@ gegl_downscale_2x2_u8_rgba (const Babl *format,
       for (x = 0; x < src_width / 2; x++)
         {
 
-          ((uint8_t *)dst)[0] = lut_u16_to_u8[ (lut_u8_to_u16[aa[0]] +
-                                                lut_u8_to_u16[ab[0]] +
-                                                lut_u8_to_u16[ba[0]] +
-                                                lut_u8_to_u16[bb[0]])>>2 ];
-          ((uint8_t *)dst)[1] = lut_u16_to_u8[ (lut_u8_to_u16[aa[1]] +
-                                                lut_u8_to_u16[ab[1]] +
-                                                lut_u8_to_u16[ba[1]] +
-                                                lut_u8_to_u16[bb[1]])>>2 ];
-          ((uint8_t *)dst)[2] = lut_u16_to_u8[ (lut_u8_to_u16[aa[2]] +
-                                                lut_u8_to_u16[ab[2]] +
-                                                lut_u8_to_u16[ba[2]] +
-                                                lut_u8_to_u16[bb[2]])>>2 ];
+          ((uint8_t *)dst)[0] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[0]] +
+                                                gegl_lut_u8_to_u16[ab[0]] +
+                                                gegl_lut_u8_to_u16[ba[0]] +
+                                                gegl_lut_u8_to_u16[bb[0]])>>2 ];
+          ((uint8_t *)dst)[1] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[1]] +
+                                                gegl_lut_u8_to_u16[ab[1]] +
+                                                gegl_lut_u8_to_u16[ba[1]] +
+                                                gegl_lut_u8_to_u16[bb[1]])>>2 ];
+          ((uint8_t *)dst)[2] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[2]] +
+                                                gegl_lut_u8_to_u16[ab[2]] +
+                                                gegl_lut_u8_to_u16[ba[2]] +
+                                                gegl_lut_u8_to_u16[bb[2]])>>2 ];
           ((uint8_t *)dst)[3] = (aa[3] + ab[3] + ba[3] + bb[3])>>2;
 
           dst += bpp;
@@ -890,18 +808,18 @@ gegl_downscale_2x2_u8_rgb (const Babl *format,
       for (x = 0; x < src_width / 2; x++)
         {
 
-          ((uint8_t *)dst)[0] = lut_u16_to_u8[ (lut_u8_to_u16[aa[0]] +
-                                                lut_u8_to_u16[ab[0]] +
-                                                lut_u8_to_u16[ba[0]] +
-                                                lut_u8_to_u16[bb[0]])>>2 ];
-          ((uint8_t *)dst)[1] = lut_u16_to_u8[ (lut_u8_to_u16[aa[1]] +
-                                                lut_u8_to_u16[ab[1]] +
-                                                lut_u8_to_u16[ba[1]] +
-                                                lut_u8_to_u16[bb[1]])>>2 ];
-          ((uint8_t *)dst)[2] = lut_u16_to_u8[ (lut_u8_to_u16[aa[2]] +
-                                                lut_u8_to_u16[ab[2]] +
-                                                lut_u8_to_u16[ba[2]] +
-                                                lut_u8_to_u16[bb[2]])>>2 ];
+          ((uint8_t *)dst)[0] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[0]] +
+                                                gegl_lut_u8_to_u16[ab[0]] +
+                                                gegl_lut_u8_to_u16[ba[0]] +
+                                                gegl_lut_u8_to_u16[bb[0]])>>2 ];
+          ((uint8_t *)dst)[1] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[1]] +
+                                                gegl_lut_u8_to_u16[ab[1]] +
+                                                gegl_lut_u8_to_u16[ba[1]] +
+                                                gegl_lut_u8_to_u16[bb[1]])>>2 ];
+          ((uint8_t *)dst)[2] = gegl_lut_u16_to_u8[ (gegl_lut_u8_to_u16[aa[2]] +
+                                                gegl_lut_u8_to_u16[ab[2]] +
+                                                gegl_lut_u8_to_u16[ba[2]] +
+                                                gegl_lut_u8_to_u16[bb[2]])>>2 ];
           dst += bpp;
           aa += bpp * 2;
           ab += bpp * 2;
@@ -931,60 +849,14 @@ gegl_downscale_2x2_u8_rgb (const Babl *format,
 #define gegl_downscale_2x2_u8_nl_alpha ((void) gegl_downscale_2x2_u8_nl_alpha, \
                                         gegl_downscale_2x2_u8_nl)
 
-
-GeglDownscale2x2Fun gegl_downscale_2x2_get_fun (const Babl *format)
-{
-  const Babl *comp_type = babl_format_get_type (format, 0);
-  const Babl *model     = babl_format_get_model (format);
-  BablModelFlag model_flags = babl_get_model_flags (model);
-  
-  if ((model_flags & BABL_MODEL_FLAG_LINEAR)||
-      (model_flags & BABL_MODEL_FLAG_CMYK))
-  {
-    if (comp_type == gegl_babl_float())
-    {
-      return gegl_downscale_2x2_float;
-    }
-    else if (comp_type == gegl_babl_u8())
-    {
-      return gegl_downscale_2x2_u8;
-    }
-    else if (comp_type == gegl_babl_u16())
-    {
-      return gegl_downscale_2x2_u16;
-    }
-    else if (comp_type == gegl_babl_u32())
-    {
-      return gegl_downscale_2x2_u32;
-    }
-    else if (comp_type == gegl_babl_double())
-    {
-      return gegl_downscale_2x2_double;
-    }
-  }
-  if (comp_type == gegl_babl_u8())
-  {
-    if (format == gegl_babl_rgba_u8())
-      return gegl_downscale_2x2_u8_rgba;
-    if (format == gegl_babl_rgb_u8())
-      return gegl_downscale_2x2_u8_rgb;
-
-    if (babl_format_has_alpha (format))
-      return gegl_downscale_2x2_u8_nl_alpha;
-    else
-      return gegl_downscale_2x2_u8_nl;
-  }
-  return gegl_downscale_2x2_generic;
-}
-
 void
-gegl_downscale_2x2_nearest (const Babl *format,
-                            gint        src_width,
-                            gint        src_height,
-                            guchar     *src_data,
-                            gint        src_rowstride,
-                            guchar     *dst_data,
-                            gint        dst_rowstride)
+GEGL_SIMD_SUFFIX(gegl_downscale_2x2_nearest) (const Babl *format,
+                                              gint        src_width,
+                                              gint        src_height,
+                                              guchar     *src_data,
+                                              gint        src_rowstride,
+                                              guchar     *dst_data,
+                                              gint        dst_rowstride)
 {
   gint bpp = babl_format_get_bytes_per_pixel (format);
   gint y;
@@ -1007,68 +879,421 @@ gegl_downscale_2x2_nearest (const Babl *format,
     }
 }
 
-static void
-gegl_resample_boxfilter_generic (guchar       *dest_buf,
-                                 const guchar *source_buf,
-                                 const GeglRectangle *dst_rect,
-                                 const GeglRectangle *src_rect,
-                                 gint  s_rowstride,
-                                 gdouble scale,
-                                 const Babl *format,
-                                 gint bpp,
-                                 gint d_rowstride)
-{
-  const Babl *tmp_format = babl_format_with_space ("RGBA float", format);
-  const Babl *from_fish  = babl_fish (format, tmp_format);
-  const Babl *to_fish    = babl_fish (tmp_format, format);
-
-  const gint tmp_bpp     = 4 * 4;
-  gint in_tmp_rowstride  = src_rect->width * tmp_bpp;
-  gint out_tmp_rowstride = dst_rect->width * tmp_bpp;
-  gint do_free = 0;
-
-  guchar *in_tmp, *out_tmp;
 
-  if (src_rect->height * in_tmp_rowstride + dst_rect->height * out_tmp_rowstride < GEGL_ALLOCA_THRESHOLD)
+void /* Nearest-neighbour resample: each dst pixel is a verbatim bpp-byte copy of one src pixel. */
+GEGL_SIMD_SUFFIX(gegl_resample_nearest) (guchar              *dst,
+                                         const guchar        *src,
+                                         const GeglRectangle *dst_rect,
+                                         const GeglRectangle *src_rect,
+                                         const gint           src_stride,
+                                         const gdouble        scale,
+                                         const gint           bpp,
+                                         const gint           dst_stride)
+{
+  gint jj[dst_rect->width]; /* per-column byte offset of the sampled pixel within a source row */
+  gint x, y;
+  for (x = 0; x < dst_rect->width; x++)
   {
-    in_tmp = align_16 (alloca (src_rect->height * in_tmp_rowstride + 16));
-    out_tmp = align_16 (alloca (dst_rect->height * out_tmp_rowstride + 16));
+    const gfloat sx = (dst_rect->x + .5 + x) / scale - src_rect->x;
+    jj[x] = int_floorf (sx ) * bpp;
   }
-  else
+
+#define IMPL(...) do{ \
+  for (y = 0; y < dst_rect->height; y++)\
+    {\
+      const gfloat sy = (dst_rect->y + .5 + y) / scale - src_rect->y;\
+      const gint   ii = int_floorf (sy);\
+      gint *ijj = &jj[0];\
+      guchar *d = &dst[y*dst_stride];\
+      const guchar *s = &src[ii * src_stride];\
+      for (x = 0; x < dst_rect->width; x++)\
+        {\
+          __VA_ARGS__;\
+          d += bpp; \
+        }\
+    }\
+  }while(0)
+
+  switch(bpp)
   {
-    in_tmp  = gegl_scratch_alloc (src_rect->height * in_tmp_rowstride);
-    out_tmp = gegl_scratch_alloc (dst_rect->height * out_tmp_rowstride);
-    do_free = 1;
+    case 1:IMPL(
+             d[0] = s[*(ijj++)];
+           );
+    break;
+    case 2:IMPL(
+             uint16_t* d16 = (void*) d;
+             const uint16_t* s16 = (void*) &s[*(ijj++)];
+             d16[0] = s16[0];
+           );
+    break;
+    case 3:IMPL(
+             d[0] = s[*ijj];
+             d[1] = s[*ijj + 1];
+             d[2] = s[*(ijj++) + 2];
+           );
+    break;
+    case 5:IMPL(
+             uint32_t* d32 = (void*) d;
+             const uint32_t* s32 = (void*) &s[*(ijj++)];
+             d32[0] = s32[0];
+             d[4] = ((const guchar *) s32)[4]; /* 5th byte of the sampled pixel; s alone is the row start */
+           );
+    break;
+    case 4:IMPL(
+             uint32_t* d32 = (void*) d;
+             const uint32_t* s32 = (void*) &s[*(ijj++)];
+             d32[0] = s32[0];
+           );
+    break;
+    case 6:IMPL(
+             uint32_t* d32 = (void*) d;
+             const uint32_t* s32 = (void*) &s[*(ijj++)];
+             d32[0] = s32[0];
+             d[4] = ((const guchar *) s32)[4]; /* trailing bytes come from the sampled pixel, */
+             d[5] = ((const guchar *) s32)[5]; /* not from offsets 4/5 of the row start       */
+           );
+    break;
+    case 8:IMPL(
+             uint64_t* d64 = (void*) d;
+             const uint64_t* s64 = (void*) &s[*(ijj++)];
+             d64[0] = s64[0];
+           );
+    break;
+    case 12:IMPL(
+             uint32_t* d32 = (void*) d;
+             const uint32_t* s32 = (void*) &s[*(ijj++)];
+             d32[0] = s32[0];
+             d32[1] = s32[1];
+             d32[2] = s32[2];
+           );
+    break;
+    case 16:IMPL(
+             uint64_t* d64 = (void*) d;
+             const uint64_t* s64 = (void*) &s[*(ijj++)];
+             d64[0] = s64[0];
+             d64[1] = s64[1];
+           );
+    break;
+    default:
+         IMPL(
+          memcpy (&d[0], &s[*(ijj++)], bpp);
+           );
+    break;
   }
+#undef IMPL
+}
 
-  babl_process_rows (from_fish,
-                     source_buf, s_rowstride,
-                     in_tmp, in_tmp_rowstride,
-                     src_rect->width, src_rect->height);
-
-  gegl_resample_boxfilter_float (out_tmp, in_tmp, dst_rect, src_rect,
-                                 in_tmp_rowstride, scale, tmp_format, tmp_bpp, out_tmp_rowstride);
+#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_double
+#define BOXFILTER_TYPE       gdouble
+#define BOXFILTER_TEMP_TYPE  gdouble
+#define BOXFILTER_ROUND(val) (val)
+#include "gegl-algorithms-boxfilter.inc"
+#undef BOXFILTER_FUNCNAME
+#undef BOXFILTER_TYPE
+#undef BOXFILTER_TEMP_TYPE
+#undef BOXFILTER_ROUND
 
-  babl_process_rows (to_fish,
-                     out_tmp,  out_tmp_rowstride,
-                     dest_buf, d_rowstride,
-                     dst_rect->width, dst_rect->height);
+#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_float
+#define BOXFILTER_TYPE       gfloat
+#define BOXFILTER_TEMP_TYPE  gfloat
+#define BOXFILTER_ROUND(val) (val)
+#include "gegl-algorithms-boxfilter.inc"
+#undef BOXFILTER_FUNCNAME
+#undef BOXFILTER_TYPE
+#undef BOXFILTER_TEMP_TYPE
+#undef BOXFILTER_ROUND
 
-  if (do_free)
-    {
-      gegl_scratch_free (out_tmp);
+#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_u8 /* guchar instantiation of the box-filter template */
+#define BOXFILTER_TYPE       guchar
+#define BOXFILTER_TEMP_TYPE  guchar /* NOTE(review): same width as the sample type; confirm the .inc never needs more range */
+#define BOXFILTER_ROUND(val) ((int)((val)+0.5f)) /* add 0.5 then truncate: rounds non-negative values to nearest */
+#include "gegl-algorithms-boxfilter.inc"
+#undef BOXFILTER_FUNCNAME
+#undef BOXFILTER_TYPE
+#undef BOXFILTER_TEMP_TYPE
+#undef BOXFILTER_ROUND
+
+#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_u16 /* guint16 instantiation of the box-filter template */
+#define BOXFILTER_TYPE       guint16
+#define BOXFILTER_TEMP_TYPE  guint16 /* NOTE(review): same width as the sample type; confirm the .inc never needs more range */
+#define BOXFILTER_ROUND(val) ((int)((val)+0.5f)) /* add 0.5 then truncate: rounds non-negative values to nearest */
+#include "gegl-algorithms-boxfilter.inc"
+#undef BOXFILTER_FUNCNAME
+#undef BOXFILTER_TYPE
+#undef BOXFILTER_TEMP_TYPE
+#undef BOXFILTER_ROUND
+
+static inline guint32 _gegl_trunc_u32(guint64 value) /* saturating guint64 -> guint32 narrowing */
+{
+  if ((guint64) value > G_MAXUINT32) /* the cast is a no-op: value is already guint64 */
+    return G_MAXUINT32; /* clamp to the largest guint32 instead of wrapping */
+  return value; /* fits in 32 bits, returned unchanged */
+}
+
+#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_u32 /* guint32 instantiation of the box-filter template */
+#define BOXFILTER_TYPE       guint32
+#define BOXFILTER_TEMP_TYPE  guint64 /* wider than the sample type, unlike the u8/u16 instantiations */
+#define BOXFILTER_ROUND(val) _gegl_trunc_u32((val)+0.5f) /* add 0.5; the guint64 parameter drops the fraction, then the value is clamped to guint32 */
+#include "gegl-algorithms-boxfilter.inc"
+#undef BOXFILTER_FUNCNAME
+#undef BOXFILTER_TEMP_TYPE
+#undef BOXFILTER_TYPE
+#undef BOXFILTER_ROUND
+
+#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_double /* gdouble instantiation of the 2x2 downscale template */
+#define DOWNSCALE_TYPE     gdouble
+#define DOWNSCALE_SUM      gdouble /* type the template's S() macro casts samples to */
+#define DOWNSCALE_DIVISOR  4.0 /* presumably divides the sum of the four samples of a 2x2 cell - confirm in the .inc */
+#include "gegl-algorithms-2x2-downscale.inc"
+#undef DOWNSCALE_FUNCNAME
+#undef DOWNSCALE_TYPE
+#undef DOWNSCALE_SUM
+#undef DOWNSCALE_DIVISOR
+
+#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_float /* gfloat instantiation; also called by gegl_downscale_2x2_generic2 */
+#define DOWNSCALE_TYPE     gfloat
+#define DOWNSCALE_SUM      gfloat
+#define DOWNSCALE_DIVISOR  4.0f
+#include "gegl-algorithms-2x2-downscale.inc"
+#undef DOWNSCALE_FUNCNAME
+#undef DOWNSCALE_TYPE
+#undef DOWNSCALE_SUM
+#undef DOWNSCALE_DIVISOR
+
+#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_u32 /* guint32 instantiation */
+#define DOWNSCALE_TYPE     guint32
+#define DOWNSCALE_SUM      guint64 /* 64-bit sum type: four guint32 samples can exceed 32 bits */
+#define DOWNSCALE_DIVISOR  4 /* integer divisor for the integer instantiations */
+#include "gegl-algorithms-2x2-downscale.inc"
+#undef DOWNSCALE_FUNCNAME
+#undef DOWNSCALE_TYPE
+#undef DOWNSCALE_SUM
+#undef DOWNSCALE_DIVISOR
+
+#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_u16 /* guint16 instantiation */
+#define DOWNSCALE_TYPE     guint16
+#define DOWNSCALE_SUM      guint /* sum type wider than the 16-bit samples */
+#define DOWNSCALE_DIVISOR  4
+#include "gegl-algorithms-2x2-downscale.inc"
+#undef DOWNSCALE_FUNCNAME
+#undef DOWNSCALE_TYPE
+#undef DOWNSCALE_SUM
+#undef DOWNSCALE_DIVISOR
+
+#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_u8 /* guint8 instantiation */
+#define DOWNSCALE_TYPE     guint8
+#define DOWNSCALE_SUM      guint /* sum type wider than the 8-bit samples */
+#define DOWNSCALE_DIVISOR  4
+#include "gegl-algorithms-2x2-downscale.inc"
+#undef DOWNSCALE_FUNCNAME
+#undef DOWNSCALE_TYPE
+#undef DOWNSCALE_SUM
+#undef DOWNSCALE_DIVISOR
+
+
+#define BILINEAR_FUNCNAME   gegl_resample_bilinear_double /* gdouble instantiation of the bilinear template */
+#define BILINEAR_TYPE       gdouble
+#define BILINEAR_ROUND(val) (val) /* floating-point samples: the value is passed through unrounded */
+#include "gegl-algorithms-bilinear.inc"
+#undef BILINEAR_FUNCNAME
+#undef BILINEAR_TYPE
+#undef BILINEAR_ROUND
+
+#define BILINEAR_FUNCNAME   gegl_resample_bilinear_float /* gfloat instantiation */
+#define BILINEAR_TYPE       gfloat
+#define BILINEAR_ROUND(val) (val) /* floating-point samples: the value is passed through unrounded */
+#include "gegl-algorithms-bilinear.inc"
+#undef BILINEAR_FUNCNAME
+#undef BILINEAR_TYPE
+#undef BILINEAR_ROUND
+
+#define BILINEAR_FUNCNAME   gegl_resample_bilinear_u8 /* guchar instantiation */
+#define BILINEAR_TYPE       guchar
+#define BILINEAR_ROUND(val) ((int)((val)+0.5f)) /* add 0.5 then truncate: rounds non-negative values to nearest */
+#include "gegl-algorithms-bilinear.inc"
+#undef BILINEAR_FUNCNAME
+#undef BILINEAR_TYPE
+#undef BILINEAR_ROUND
+
+#define BILINEAR_FUNCNAME   gegl_resample_bilinear_u16 /* guint16 instantiation */
+#define BILINEAR_TYPE       guint16
+#define BILINEAR_ROUND(val) ((int)((val)+0.5f)) /* add 0.5 then truncate: rounds non-negative values to nearest */
+#include "gegl-algorithms-bilinear.inc"
+#undef BILINEAR_FUNCNAME
+#undef BILINEAR_TYPE
+#undef BILINEAR_ROUND
+
+#define BILINEAR_FUNCNAME   gegl_resample_bilinear_u32 /* guint32 instantiation */
+#define BILINEAR_TYPE       guint32
+#define BILINEAR_ROUND(val) _gegl_trunc_u32((val)+0.5f) /* add 0.5, then clamp into guint32 via _gegl_trunc_u32 */
+#include "gegl-algorithms-bilinear.inc"
+#undef BILINEAR_FUNCNAME
+#undef BILINEAR_TYPE
+#undef BILINEAR_ROUND
+
+static void /* fallback 2x2 downscale: convert to "RGBA float", downscale, convert back to format */
+gegl_downscale_2x2_generic2 (const Babl *format,
+                             gint        src_width,
+                             gint        src_height,
+                             guchar     *src_data,
+                             gint        src_rowstride,
+                             guchar     *dst_data,
+                             gint        dst_rowstride)
+{
+  const Babl *tmp_format = babl_format_with_space ("RGBA float", format); /* working format; presumably shares format's space */
+  const Babl *from_fish  = babl_fish (format, tmp_format); /* conversion format -> tmp_format */
+  const Babl *to_fish    = babl_fish (tmp_format, format); /* conversion tmp_format -> format */
+  const gint tmp_bpp     = 4 * 4; /* bytes per "RGBA float" pixel: 4 components, assuming 4-byte floats */
+  gint dst_width         = src_width / 2; /* integer division: odd sizes round down */
+  gint dst_height        = src_height / 2;
+  gint in_tmp_rowstride  = src_width * tmp_bpp; /* temporaries are tightly packed, no row padding */
+  gint out_tmp_rowstride = dst_width * tmp_bpp;
+  gint do_free = 0; /* set to 1 when the temporaries come from gegl_scratch_alloc */
+  /* Float copies of the source (in_tmp) and of the downscaled result (out_tmp). */
+  void *in_tmp;
+  void *out_tmp;
+  /* NOTE(review): sizes below are computed in gint; confirm callers never pass inputs large enough to overflow. */
+  if (src_height * in_tmp_rowstride + dst_height * out_tmp_rowstride < GEGL_ALLOCA_THRESHOLD)
+  {
+    in_tmp = align_16 (alloca (src_height * in_tmp_rowstride + 16)); /* +16: slack for align_16, presumably rounding the pointer up */
+    out_tmp = align_16 (alloca (dst_height * out_tmp_rowstride + 16));
+  }
+  else
+  {
+    in_tmp = gegl_scratch_alloc (src_height * in_tmp_rowstride); /* above the threshold: use the scratch allocator instead of alloca */
+    out_tmp = gegl_scratch_alloc (dst_height * out_tmp_rowstride);
+    do_free = 1;
+  }
+
+  babl_process_rows (from_fish, /* step 1: source rows -> float temporaries */
+                     src_data, src_rowstride,
+                     in_tmp,   in_tmp_rowstride,
+                     src_width, src_height);
+  gegl_downscale_2x2_float (tmp_format, src_width, src_height, /* step 2: downscale in float */
+                            in_tmp,  in_tmp_rowstride,
+                            out_tmp, out_tmp_rowstride);
+  babl_process_rows (to_fish, /* step 3: float result -> destination in the caller's format */
+                     out_tmp,   out_tmp_rowstride,
+                     dst_data,  dst_rowstride,
+                     dst_width, dst_height);
+
+  if (do_free) /* only scratch allocations are freed explicitly; the alloca path has nothing to free */
+   {
+     gegl_scratch_free (out_tmp);
+     gegl_scratch_free (in_tmp);
+   }
+}
+
+GeglDownscale2x2Fun GEGL_SIMD_SUFFIX(gegl_downscale_2x2_get_fun) (const Babl *format)
+{ /* Returns the 2x2 downscale function to use for format; every path returns a function, never NULL. */
+  const Babl *comp_type = babl_format_get_type (format, 0); /* data type of component 0 */
+  const Babl *model     = babl_format_get_model (format);
+  BablModelFlag model_flags = babl_get_model_flags (model);
+  /* Models flagged linear or CMYK: dispatch on the component type alone. */
+  if ((model_flags & BABL_MODEL_FLAG_LINEAR)||
+      (model_flags & BABL_MODEL_FLAG_CMYK))
+  {
+    if (comp_type == gegl_babl_float())
+    {
+      return gegl_downscale_2x2_float;
+    }
+    else if (comp_type == gegl_babl_u8())
+    {
+      return gegl_downscale_2x2_u8;
+    }
+    else if (comp_type == gegl_babl_u16())
+    {
+      return gegl_downscale_2x2_u16;
+    }
+    else if (comp_type == gegl_babl_u32())
+    {
+      return gegl_downscale_2x2_u32;
+    }
+    else if (comp_type == gegl_babl_double())
+    {
+      return gegl_downscale_2x2_double;
+    }
+  } /* no component type matched: fall through to the checks below */
+  if (comp_type == gegl_babl_u8()) /* u8 formats whose model is neither linear nor CMYK */
+  {
+    if (format == gegl_babl_rgba_u8()) /* exact-format matches get dedicated kernels */
+      return gegl_downscale_2x2_u8_rgba;
+    if (format == gegl_babl_rgb_u8())
+      return gegl_downscale_2x2_u8_rgb;
+
+    if (babl_format_has_alpha (format)) /* remaining u8 formats: choose by presence of alpha ("nl" presumably means non-linear) */
+      return gegl_downscale_2x2_u8_nl_alpha;
+    else
+      return gegl_downscale_2x2_u8_nl;
+  }
+  return gegl_downscale_2x2_generic2; /* everything else: convert through "RGBA float" */
+}
+
+
+static void /* fallback box-filter resample: convert to "RGBA float", filter, convert back to format */
+gegl_resample_boxfilter_generic2 (guchar       *dest_buf,
+                                  const guchar *source_buf,
+                                  const GeglRectangle *dst_rect,
+                                  const GeglRectangle *src_rect,
+                                  gint  s_rowstride,
+                                  gdouble scale,
+                                  const Babl *format,
+                                  gint bpp, /* not read in this body; tmp_bpp is used for the float temporaries */
+                                  gint d_rowstride)
+{
+  const Babl *tmp_format = babl_format_with_space ("RGBA float", format); /* working format; presumably shares format's space */
+  const Babl *from_fish  = babl_fish (format, tmp_format); /* conversion format -> tmp_format */
+  const Babl *to_fish    = babl_fish (tmp_format, format); /* conversion tmp_format -> format */
+
+  const gint tmp_bpp     = 4 * 4; /* bytes per "RGBA float" pixel: 4 components, assuming 4-byte floats */
+  gint in_tmp_rowstride  = src_rect->width * tmp_bpp; /* temporaries are tightly packed, no row padding */
+  gint out_tmp_rowstride = dst_rect->width * tmp_bpp;
+  gint do_free = 0; /* set to 1 when the temporaries come from gegl_scratch_alloc */
+
+  guchar *in_tmp, *out_tmp; /* float copies of the source and of the resampled result */
+  /* NOTE(review): sizes below are computed in gint; confirm callers never pass rectangles large enough to overflow. */
+  if (src_rect->height * in_tmp_rowstride + dst_rect->height * out_tmp_rowstride < GEGL_ALLOCA_THRESHOLD)
+  {
+    in_tmp = align_16 (alloca (src_rect->height * in_tmp_rowstride + 16)); /* +16: slack for align_16, presumably rounding the pointer up */
+    out_tmp = align_16 (alloca (dst_rect->height * out_tmp_rowstride + 16));
+  }
+  else
+  {
+    in_tmp  = gegl_scratch_alloc (src_rect->height * in_tmp_rowstride); /* above the threshold: use the scratch allocator instead of alloca */
+    out_tmp = gegl_scratch_alloc (dst_rect->height * out_tmp_rowstride);
+    do_free = 1;
+  }
+
+  babl_process_rows (from_fish, /* step 1: source rows -> float temporaries */
+                     source_buf, s_rowstride,
+                     in_tmp, in_tmp_rowstride,
+                     src_rect->width, src_rect->height);
+
+  gegl_resample_boxfilter_float (out_tmp, in_tmp, dst_rect, src_rect, /* step 2: box filter in float */
+                                 in_tmp_rowstride, scale, tmp_format, tmp_bpp, out_tmp_rowstride);
+
+  babl_process_rows (to_fish, /* step 3: float result -> destination in the caller's format */
+                     out_tmp,  out_tmp_rowstride,
+                     dest_buf, d_rowstride,
+                     dst_rect->width, dst_rect->height);
+
+  if (do_free) /* only scratch allocations are freed explicitly; the alloca path has nothing to free */
+    {
+      gegl_scratch_free (out_tmp);
       gegl_scratch_free (in_tmp);
     }
 }
 
-void gegl_resample_boxfilter (guchar              *dest_buf,
-                              const guchar        *source_buf,
-                              const GeglRectangle *dst_rect,
-                              const GeglRectangle *src_rect,
-                              gint                 s_rowstride,
-                              gdouble              scale,
-                              const Babl          *format,
-                              gint                 d_rowstride)
+
+void
+GEGL_SIMD_SUFFIX(gegl_resample_boxfilter) (guchar              *dest_buf,
+                                           const guchar        *source_buf,
+                                           const GeglRectangle *dst_rect,
+                                           const GeglRectangle *src_rect,
+                                           gint                 s_rowstride,
+                                           gdouble              scale,
+                                           const Babl          *format,
+                                           gint                 d_rowstride)
 {
   void (*func) (guchar *dest_buf,
                 const guchar        *source_buf,
@@ -1078,7 +1303,7 @@ void gegl_resample_boxfilter (guchar              *dest_buf,
                 gdouble              scale,
                 const Babl          *format,
                 gint                 bpp,
-                gint                 d_rowstride) = gegl_resample_boxfilter_generic;
+                gint                 d_rowstride) = gegl_resample_boxfilter_generic2;
 
 
   const Babl *model     = babl_format_get_model (format);
@@ -1119,15 +1344,16 @@ void gegl_resample_boxfilter (guchar              *dest_buf,
 
 }
 
+
 static void
-gegl_resample_bilinear_generic (guchar              *dest_buf,
-                                const guchar        *source_buf,
-                                const GeglRectangle *dst_rect,
-                                const GeglRectangle *src_rect,
-                                gint                 s_rowstride,
-                                gdouble              scale,
-                                const Babl          *format,
-                                gint                 d_rowstride)
+gegl_resample_bilinear_generic2 (guchar              *dest_buf,
+                                 const guchar        *source_buf,
+                                 const GeglRectangle *dst_rect,
+                                 const GeglRectangle *src_rect,
+                                 gint                 s_rowstride,
+                                 gdouble              scale,
+                                 const Babl          *format,
+                                 gint                 d_rowstride)
 {
   const Babl *tmp_format = babl_format_with_space ("RGBA float", format);
   const Babl *from_fish  = babl_fish (format, tmp_format);
@@ -1173,14 +1399,15 @@ gegl_resample_bilinear_generic (guchar              *dest_buf,
     }
 }
 
-void gegl_resample_bilinear (guchar              *dest_buf,
-                             const guchar        *source_buf,
-                             const GeglRectangle *dst_rect,
-                             const GeglRectangle *src_rect,
-                             gint                 s_rowstride,
-                             gdouble              scale,
-                             const Babl          *format,
-                             gint                 d_rowstride)
+void
+GEGL_SIMD_SUFFIX(gegl_resample_bilinear) (guchar              *dest_buf,
+                                          const guchar        *source_buf,
+                                          const GeglRectangle *dst_rect,
+                                          const GeglRectangle *src_rect,
+                                          gint                 s_rowstride,
+                                          gdouble              scale,
+                                          const Babl          *format,
+                                          gint                 d_rowstride)
 {
   const Babl *model     = babl_format_get_model (format);
   const Babl *comp_type  = babl_format_get_type (format, 0);
@@ -1207,8 +1434,8 @@ void gegl_resample_bilinear (guchar              *dest_buf,
       gegl_resample_bilinear_double (dest_buf, source_buf, dst_rect, src_rect,
                                      s_rowstride, scale, bpp, d_rowstride);
     else
-      gegl_resample_bilinear_generic (dest_buf, source_buf, dst_rect, src_rect,
-                                      s_rowstride, scale, format, d_rowstride);
+      gegl_resample_bilinear_generic2 (dest_buf, source_buf, dst_rect, src_rect,
+                                       s_rowstride, scale, format, d_rowstride);
     }
   else
     {
@@ -1224,262 +1451,11 @@ void gegl_resample_bilinear (guchar              *dest_buf,
         }
       else
         {
-          gegl_resample_bilinear_generic (dest_buf, source_buf,
-                                          dst_rect, src_rect,
-                                          s_rowstride, scale, format,
-                                          d_rowstride);
+          gegl_resample_bilinear_generic2 (dest_buf, source_buf,
+                                           dst_rect, src_rect,
+                                           s_rowstride, scale, format,
+                                           d_rowstride);
         }
     }
 }
 
-void
-gegl_resample_nearest (guchar              *dst,
-                       const guchar        *src,
-                       const GeglRectangle *dst_rect,
-                       const GeglRectangle *src_rect,
-                       const gint           src_stride,
-                       const gdouble        scale,
-                       const gint           bpp,
-                       const gint           dst_stride)
-{
-  gint jj[dst_rect->width];
-  gint x, y;
-  for (x = 0; x < dst_rect->width; x++)
-  {
-    const gfloat sx = (dst_rect->x + .5 + x) / scale - src_rect->x;
-    jj[x] = int_floorf (sx ) * bpp;
-  }
-
-#define IMPL(...) do{ \
-  for (y = 0; y < dst_rect->height; y++)\
-    {\
-      const gfloat sy = (dst_rect->y + .5 + y) / scale - src_rect->y;\
-      const gint   ii = int_floorf (sy);\
-      gint *ijj = &jj[0];\
-      guchar *d = &dst[y*dst_stride];\
-      const guchar *s = &src[ii * src_stride];\
-      for (x = 0; x < dst_rect->width; x++)\
-        {\
-          __VA_ARGS__;\
-          d += bpp; \
-        }\
-    }\
-  }while(0)
-
-  switch(bpp)
-  {
-    case 1:IMPL(
-             d[0] = s[*(ijj++)];
-           );
-    break;
-    case 2:IMPL(
-             uint16_t* d16 = (void*) d;
-             const uint16_t* s16 = (void*) &s[*(ijj++)];
-             d16[0] = s16[0];
-           );
-    break;
-    case 3:IMPL(
-             d[0] = s[*ijj];
-             d[1] = s[*ijj + 1];
-             d[2] = s[*(ijj++) + 2];
-           );
-    break;
-    case 5:IMPL(
-             uint32_t* d32 = (void*) d;
-             const uint32_t* s32 = (void*) &s[*(ijj++)];
-             d32[0] = s32[0];
-             d[4] = s[4];
-           );
-    break;
-    case 4:IMPL(
-             uint32_t* d32 = (void*) d;
-             const uint32_t* s32 = (void*) &s[*(ijj++)];
-             d32[0] = s32[0];
-           );
-    break;
-    case 6:IMPL(
-             uint32_t* d32 = (void*) d;
-             const uint32_t* s32 = (void*) &s[*(ijj++)];
-             d32[0] = s32[0];
-             d[4] = s[4];
-             d[5] = s[5];
-           );
-    break;
-    case 8:IMPL(
-             uint64_t* d64 = (void*) d;
-             const uint64_t* s64 = (void*) &s[*(ijj++)];
-             d64[0] = s64[0];
-           );
-    break;
-    case 12:IMPL(
-             uint32_t* d32 = (void*) d;
-             const uint32_t* s32 = (void*) &s[*(ijj++)];
-             d32[0] = s32[0];
-             d32[1] = s32[1];
-             d32[2] = s32[2];
-           );
-    break;
-    case 16:IMPL(
-             uint64_t* d64 = (void*) d;
-             const uint64_t* s64 = (void*) &s[*(ijj++)];
-             d64[0] = s64[0];
-             d64[1] = s64[1];
-           );
-    break;
-    default:
-         IMPL(
-          memcpy (&d[0], &s[*(ijj++)], bpp);
-           );
-    break;
-  }
-#undef IMPL
-}
-
-#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_double
-#define BOXFILTER_TYPE       gdouble
-#define BOXFILTER_TEMP_TYPE  gdouble
-#define BOXFILTER_ROUND(val) (val)
-#include "gegl-algorithms-boxfilter.inc"
-#undef BOXFILTER_FUNCNAME
-#undef BOXFILTER_TYPE
-#undef BOXFILTER_TEMP_TYPE
-#undef BOXFILTER_ROUND
-
-#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_float
-#define BOXFILTER_TYPE       gfloat
-#define BOXFILTER_TEMP_TYPE  gfloat
-#define BOXFILTER_ROUND(val) (val)
-#include "gegl-algorithms-boxfilter.inc"
-#undef BOXFILTER_FUNCNAME
-#undef BOXFILTER_TYPE
-#undef BOXFILTER_TEMP_TYPE
-#undef BOXFILTER_ROUND
-
-#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_u8
-#define BOXFILTER_TYPE       guchar
-#define BOXFILTER_TEMP_TYPE  guchar
-#define BOXFILTER_ROUND(val) ((int)((val)+0.5f))
-#include "gegl-algorithms-boxfilter.inc"
-#undef BOXFILTER_FUNCNAME
-#undef BOXFILTER_TYPE
-#undef BOXFILTER_TEMP_TYPE
-#undef BOXFILTER_ROUND
-
-#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_u16
-#define BOXFILTER_TYPE       guint16
-#define BOXFILTER_TEMP_TYPE  guint16
-#define BOXFILTER_ROUND(val) ((int)((val)+0.5f))
-#include "gegl-algorithms-boxfilter.inc"
-#undef BOXFILTER_FUNCNAME
-#undef BOXFILTER_TYPE
-#undef BOXFILTER_TEMP_TYPE
-#undef BOXFILTER_ROUND
-
-static inline guint32 _gegl_trunc_u32(guint64 value)
-{
-  if ((guint64) value > G_MAXUINT32)
-    return G_MAXUINT32;
-  return value;
-}
-
-#define BOXFILTER_FUNCNAME   gegl_resample_boxfilter_u32
-#define BOXFILTER_TYPE       guint32
-#define BOXFILTER_TEMP_TYPE  guint64
-#define BOXFILTER_ROUND(val) _gegl_trunc_u32((val)+0.5f)
-#include "gegl-algorithms-boxfilter.inc"
-#undef BOXFILTER_FUNCNAME
-#undef BOXFILTER_TEMP_TYPE
-#undef BOXFILTER_TYPE
-#undef BOXFILTER_ROUND
-
-#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_double
-#define DOWNSCALE_TYPE     gdouble
-#define DOWNSCALE_SUM      gdouble
-#define DOWNSCALE_DIVISOR  4.0
-#include "gegl-algorithms-2x2-downscale.inc"
-#undef DOWNSCALE_FUNCNAME
-#undef DOWNSCALE_TYPE
-#undef DOWNSCALE_SUM
-#undef DOWNSCALE_DIVISOR
-
-#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_float
-#define DOWNSCALE_TYPE     gfloat
-#define DOWNSCALE_SUM      gfloat
-#define DOWNSCALE_DIVISOR  4.0f
-#include "gegl-algorithms-2x2-downscale.inc"
-#undef DOWNSCALE_FUNCNAME
-#undef DOWNSCALE_TYPE
-#undef DOWNSCALE_SUM
-#undef DOWNSCALE_DIVISOR
-
-#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_u32
-#define DOWNSCALE_TYPE     guint32
-#define DOWNSCALE_SUM      guint64
-#define DOWNSCALE_DIVISOR  4
-#include "gegl-algorithms-2x2-downscale.inc"
-#undef DOWNSCALE_FUNCNAME
-#undef DOWNSCALE_TYPE
-#undef DOWNSCALE_SUM
-#undef DOWNSCALE_DIVISOR
-
-#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_u16
-#define DOWNSCALE_TYPE     guint16
-#define DOWNSCALE_SUM      guint
-#define DOWNSCALE_DIVISOR  4
-#include "gegl-algorithms-2x2-downscale.inc"
-#undef DOWNSCALE_FUNCNAME
-#undef DOWNSCALE_TYPE
-#undef DOWNSCALE_SUM
-#undef DOWNSCALE_DIVISOR
-
-#define DOWNSCALE_FUNCNAME gegl_downscale_2x2_u8
-#define DOWNSCALE_TYPE     guint8
-#define DOWNSCALE_SUM      guint
-#define DOWNSCALE_DIVISOR  4
-#include "gegl-algorithms-2x2-downscale.inc"
-#undef DOWNSCALE_FUNCNAME
-#undef DOWNSCALE_TYPE
-#undef DOWNSCALE_SUM
-#undef DOWNSCALE_DIVISOR
-
-
-#define BILINEAR_FUNCNAME   gegl_resample_bilinear_double
-#define BILINEAR_TYPE       gdouble
-#define BILINEAR_ROUND(val) (val)
-#include "gegl-algorithms-bilinear.inc"
-#undef BILINEAR_FUNCNAME
-#undef BILINEAR_TYPE
-#undef BILINEAR_ROUND
-
-#define BILINEAR_FUNCNAME   gegl_resample_bilinear_float
-#define BILINEAR_TYPE       gfloat
-#define BILINEAR_ROUND(val) (val)
-#include "gegl-algorithms-bilinear.inc"
-#undef BILINEAR_FUNCNAME
-#undef BILINEAR_TYPE
-#undef BILINEAR_ROUND
-
-#define BILINEAR_FUNCNAME   gegl_resample_bilinear_u8
-#define BILINEAR_TYPE       guchar
-#define BILINEAR_ROUND(val) ((int)((val)+0.5f))
-#include "gegl-algorithms-bilinear.inc"
-#undef BILINEAR_FUNCNAME
-#undef BILINEAR_TYPE
-#undef BILINEAR_ROUND
-
-#define BILINEAR_FUNCNAME   gegl_resample_bilinear_u16
-#define BILINEAR_TYPE       guint16
-#define BILINEAR_ROUND(val) ((int)((val)+0.5f))
-#include "gegl-algorithms-bilinear.inc"
-#undef BILINEAR_FUNCNAME
-#undef BILINEAR_TYPE
-#undef BILINEAR_ROUND
-
-#define BILINEAR_FUNCNAME   gegl_resample_bilinear_u32
-#define BILINEAR_TYPE       guint32
-#define BILINEAR_ROUND(val) _gegl_trunc_u32((val)+0.5f)
-#include "gegl-algorithms-bilinear.inc"
-#undef BILINEAR_FUNCNAME
-#undef BILINEAR_TYPE
-#undef BILINEAR_ROUND
-
diff --git a/gegl/buffer/gegl-algorithms.h b/gegl/buffer/gegl-algorithms.h
index a057f9c36..9cd4de864 100644
--- a/gegl/buffer/gegl-algorithms.h
+++ b/gegl/buffer/gegl-algorithms.h
@@ -19,12 +19,17 @@
 #ifndef __GEGL_ALGORITHMS_H__
 #define __GEGL_ALGORITHMS_H__
 
+
 #include "gegl-buffer.h"
 G_BEGIN_DECLS
 
 #define GEGL_SCALE_EPSILON 1.e-6
 
-void gegl_downscale_2x2 (const Babl *format,
+#ifndef GEGL_SIMD_SUFFIX /* presumably predefined by the per-architecture (x86-64-v2/v3) translation units - confirm */
+#define GEGL_SIMD_SUFFIX(symbol)  symbol##_generic /* default: paste "_generic" onto the symbol name */
+#endif
+
+void GEGL_SIMD_SUFFIX(gegl_downscale_2x2) (const Babl *format,
                          gint        src_width,
                          gint        src_height,
                          guchar     *src_data,
@@ -32,23 +37,6 @@ void gegl_downscale_2x2 (const Babl *format,
                          guchar     *dst_data,
                          gint        dst_rowstride);
 
-
-void gegl_downscale_2x2_double (const Babl *format,
-                                gint        src_width,
-                                gint        src_height,
-                                guchar     *src_data,
-                                gint        src_rowstride,
-                                guchar     *dst_data,
-                                gint        dst_rowstride);
-
-void gegl_downscale_2x2_float (const Babl *format,
-                               gint        src_width,
-                               gint        src_height,
-                               guchar     *src_data,
-                               gint        src_rowstride,
-                               guchar     *dst_data,
-                               gint        dst_rowstride);
-
 typedef void (*GeglDownscale2x2Fun) (const Babl *format,
                                      gint    src_width,
                                      gint    src_height,
@@ -57,31 +45,7 @@ typedef void (*GeglDownscale2x2Fun) (const Babl *format,
                                      guchar *dst_data,
                                      gint    dst_rowstride);
 
-void gegl_downscale_2x2_u32 (const Babl *format,
-                             gint        src_width,
-                             gint        src_height,
-                             guchar     *src_data,
-                             gint        src_rowstride,
-                             guchar     *dst_data,
-                             gint        dst_rowstride);
-
-void gegl_downscale_2x2_u16 (const Babl *format,
-                             gint        src_width,
-                             gint        src_height,
-                             guchar     *src_data,
-                             gint        src_rowstride,
-                             guchar     *dst_data,
-                             gint        dst_rowstride);
-
-void gegl_downscale_2x2_u8 (const Babl *format,
-                            gint        src_width,
-                            gint        src_height,
-                            guchar     *src_data,
-                            gint        src_rowstride,
-                            guchar     *dst_data,
-                            gint        dst_rowstride);
-
-void gegl_downscale_2x2_nearest (const Babl *format,
+void GEGL_SIMD_SUFFIX(gegl_downscale_2x2_nearest) (const Babl *format,
                                  gint        src_width,
                                  gint        src_height,
                                  guchar     *src_data,
@@ -93,7 +57,7 @@ void gegl_downscale_2x2_nearest (const Babl *format,
  * available for #format fall back to nearest neighbor.
  * #scale is assumed to be between 0.5 and +inf.
  */
-void gegl_resample_boxfilter (guchar              *dest_buf,
+void GEGL_SIMD_SUFFIX(gegl_resample_boxfilter) (guchar              *dest_buf,
                               const guchar        *source_buf,
                               const GeglRectangle *dst_rect,
                               const GeglRectangle *src_rect,
@@ -102,60 +66,10 @@ void gegl_resample_boxfilter (guchar              *dest_buf,
                               const Babl          *format,
                               gint                 d_rowstride);
 
-void gegl_resample_boxfilter_double (guchar              *dest_buf,
-                                     const guchar        *source_buf,
-                                     const GeglRectangle *dst_rect,
-                                     const GeglRectangle *src_rect,
-                                     gint                 s_rowstride,
-                                     gdouble              scale,
-                                     const Babl          *format,
-                                     gint                 bpp,
-                                     gint                 d_rowstride);
-
-void gegl_resample_boxfilter_float (guchar              *dest_buf,
-                                    const guchar        *source_buf,
-                                    const GeglRectangle *dst_rect,
-                                    const GeglRectangle *src_rect,
-                                    gint                 s_rowstride,
-                                    gdouble              scale,
-                                    const Babl          *format,
-                                    gint                 bpp,
-                                    gint                 d_rowstride);
-
-void gegl_resample_boxfilter_u32 (guchar              *dest_buf,
-                                  const guchar        *source_buf,
-                                  const GeglRectangle *dst_rect,
-                                  const GeglRectangle *src_rect,
-                                  gint                 s_rowstride,
-                                  gdouble              scale,
-                                  const Babl          *format,
-                                  gint                 bpp,
-                                  gint                 d_rowstride);
-
-void gegl_resample_boxfilter_u16 (guchar              *dest_buf,
-                                  const guchar        *source_buf,
-                                  const GeglRectangle *dst_rect,
-                                  const GeglRectangle *src_rect,
-                                  gint                 s_rowstride,
-                                  gdouble              scale,
-                                  const Babl          *format,
-                                  gint                 bpp,
-                                  gint                 d_rowstride);
-
-void gegl_resample_boxfilter_u8 (guchar              *dest_buf,
-                                 const guchar        *source_buf,
-                                 const GeglRectangle *dst_rect,
-                                 const GeglRectangle *src_rect,
-                                 gint                 s_rowstride,
-                                 gdouble              scale,
-                                 const Babl          *format,
-                                 gint                 bpp,
-                                 gint                 d_rowstride);
-
 /* Attempt to resample with a 2x2 bilinear filter, if no implementation is
  * available for #format fall back to nearest neighbor.
  */
-void gegl_resample_bilinear (guchar              *dest_buf,
+void GEGL_SIMD_SUFFIX(gegl_resample_bilinear) (guchar              *dest_buf,
                              const guchar        *source_buf,
                              const GeglRectangle *dst_rect,
                              const GeglRectangle *src_rect,
@@ -164,52 +78,7 @@ void gegl_resample_bilinear (guchar              *dest_buf,
                              const Babl          *format,
                              gint                 d_rowstride);
 
-void gegl_resample_bilinear_double (guchar              *dest_buf,
-                                    const guchar        *source_buf,
-                                    const GeglRectangle *dst_rect,
-                                    const GeglRectangle *src_rect,
-                                    gint                 s_rowstride,
-                                    gdouble              scale,
-                                    gint                 bpp,
-                                    gint                 d_rowstride);
-
-void gegl_resample_bilinear_float (guchar              *dest_buf,
-                                   const guchar        *source_buf,
-                                   const GeglRectangle *dst_rect,
-                                   const GeglRectangle *src_rect,
-                                   gint                 s_rowstride,
-                                   gdouble              scale,
-                                   gint                 bpp,
-                                   gint                 d_rowstride);
-
-void gegl_resample_bilinear_u32 (guchar              *dest_buf,
-                                 const guchar        *source_buf,
-                                 const GeglRectangle *dst_rect,
-                                 const GeglRectangle *src_rect,
-                                 gint                 s_rowstride,
-                                 gdouble              scale,
-                                 gint                 bpp,
-                                 gint                 d_rowstride);
-
-void gegl_resample_bilinear_u16 (guchar              *dest_buf,
-                                 const guchar        *source_buf,
-                                 const GeglRectangle *dst_rect,
-                                 const GeglRectangle *src_rect,
-                                 gint                 s_rowstride,
-                                 gdouble              scale,
-                                 gint                 bpp,
-                                 gint                 d_rowstride);
-
-void gegl_resample_bilinear_u8 (guchar              *dest_buf,
-                                const guchar        *source_buf,
-                                const GeglRectangle *dst_rect,
-                                const GeglRectangle *src_rect,
-                                gint                 s_rowstride,
-                                gdouble              scale,
-                                gint                 bpp,
-                                gint                 d_rowstride);
-
-void gegl_resample_nearest (guchar              *dst,
+void GEGL_SIMD_SUFFIX(gegl_resample_nearest) (guchar              *dst,
                             const guchar        *src,
                             const GeglRectangle *dst_rect,
                             const GeglRectangle *src_rect,
@@ -218,8 +87,14 @@ void gegl_resample_nearest (guchar              *dst,
                             gint                 bpp,
                             gint                 dst_stride);
 
-GeglDownscale2x2Fun gegl_downscale_2x2_get_fun (const Babl *format);
+GeglDownscale2x2Fun GEGL_SIMD_SUFFIX(gegl_downscale_2x2_get_fun) (const Babl *format);
+
+#ifdef ARCH_X86_64
+GeglDownscale2x2Fun gegl_downscale_2x2_get_fun_x86_64_v2 (const Babl *format);
+GeglDownscale2x2Fun gegl_downscale_2x2_get_fun_x86_64_v3 (const Babl *format);
+#endif
 
+#define GEGL_ALGORITHMS_LUT_DIVISOR 16
 
 G_END_DECLS
 
diff --git a/gegl/buffer/gegl-buffer-private.h b/gegl/buffer/gegl-buffer-private.h
index 62e4716d2..d15330dba 100644
--- a/gegl/buffer/gegl-buffer-private.h
+++ b/gegl/buffer/gegl-buffer-private.h
@@ -217,6 +217,42 @@ extern void (*gegl_buffer_ext_flush) (GeglBuffer *buffer, const GeglRectangle *r
 extern void (*gegl_buffer_ext_invalidate) (GeglBuffer *buffer, const GeglRectangle *rect);
 
 
+extern void (*gegl_resample_bilinear) (guchar *dest_buf, /* this and the three pointers below are defined in gegl-buffer.c */
+                                       const guchar *source_buf,
+                                       const GeglRectangle *dst_rect,
+                                       const GeglRectangle *src_rect,
+                                       gint                 s_rowstride,
+                                       gdouble              scale,
+                                       const Babl          *format,
+                                       gint                 d_rowstride);
+
+extern void (*gegl_resample_boxfilter)(guchar *dest_buf,
+                                       const guchar *source_buf,
+                                       const GeglRectangle *dst_rect,
+                                       const GeglRectangle *src_rect,
+                                       gint                 s_rowstride,
+                                       gdouble              scale,
+                                       const Babl          *format,
+                                       gint                 d_rowstride);
+
+extern void (*gegl_resample_nearest)(guchar *dest_buf,
+                                     const guchar *source_buf,
+                                     const GeglRectangle *dst_rect,
+                                     const GeglRectangle *src_rect,
+                                     gint                 s_rowstride,
+                                     gdouble              scale,
+                                     const gint           bpp,
+                                     gint                 d_rowstride);
+
+extern void (*gegl_downscale_2x2) (const Babl *format,
+                                   gint        src_width,
+                                   gint        src_height,
+                                   guchar     *src_data,
+                                   gint        src_rowstride,
+                                   guchar     *dst_data,
+                                   gint        dst_rowstride);
+
+
 #ifndef __GEGL_TILE_H__
 #define gegl_tile_get_data(tile)  ((tile)->data)
 #endif
diff --git a/gegl/buffer/gegl-buffer.c b/gegl/buffer/gegl-buffer.c
index ed1a1b5be..6b5980ee1 100644
--- a/gegl/buffer/gegl-buffer.c
+++ b/gegl/buffer/gegl-buffer.c
@@ -44,6 +44,7 @@
 #include "gegl-tile-backend-swap.h"
 #include "gegl-tile-backend-ram.h"
 #include "gegl-buffer-formats.h"
+#include "gegl-algorithms.h"
 
 #ifdef GEGL_ENABLE_DEBUG
 #define DEBUG_ALLOCATIONS (gegl_debug_flags & GEGL_DEBUG_BUFFER_ALLOC)
@@ -1303,3 +1304,176 @@ gegl_buffer_get_tile (GeglBuffer *buffer,
 void (*gegl_tile_handler_cache_ext_flush) (void *cache, const GeglRectangle *rect)=NULL;
 void (*gegl_buffer_ext_flush) (GeglBuffer *buffer, const GeglRectangle *rect)=NULL;
 void (*gegl_buffer_ext_invalidate) (GeglBuffer *buffer, const GeglRectangle *rect)=NULL;
+/* Bilinear resampler entry point: starts at the generic build; _gegl_init_buffer() may replace it with an x86-64 variant. */
+void (*gegl_resample_bilinear) (guchar *dest_buf,
+                                const guchar *source_buf,
+                                const GeglRectangle *dst_rect,
+                                const GeglRectangle *src_rect,
+                                gint                 s_rowstride,
+                                gdouble              scale,
+                                const Babl          *format,
+                                gint                 d_rowstride) =
+      gegl_resample_bilinear_generic;
+
+/* Box-filter resampler entry point: starts at the generic build; _gegl_init_buffer() may replace it with an x86-64 variant. */
+void (*gegl_resample_boxfilter) (guchar *dest_buf,
+                                 const guchar *source_buf,
+                                 const GeglRectangle *dst_rect,
+                                 const GeglRectangle *src_rect,
+                                 gint                 s_rowstride,
+                                 gdouble              scale,
+                                 const Babl          *format,
+                                 gint                 d_rowstride) =
+      gegl_resample_boxfilter_generic;
+
+/* Nearest-neighbour resampler entry point (takes bpp rather than a Babl format): starts at the generic build. */
+void (*gegl_resample_nearest) (guchar *dest_buf,
+                               const guchar *source_buf,
+                               const GeglRectangle *dst_rect,
+                               const GeglRectangle *src_rect,
+                               gint                 s_rowstride,
+                               gdouble              scale,
+                               const gint           bpp,
+                               gint                 d_rowstride) =
+      gegl_resample_nearest_generic;
+/* 2x2 downscale entry point: starts at the generic build; _gegl_init_buffer() may replace it with an x86-64 variant. */
+void (*gegl_downscale_2x2) (const Babl *format,
+                         gint        src_width,
+                         gint        src_height,
+                         guchar     *src_data,
+                         gint        src_rowstride,
+                         guchar     *dst_data,
+                         gint        dst_rowstride) =
+      gegl_downscale_2x2_generic;
+
+#ifdef ARCH_X86_64
+/* Prototypes of the _x86_64_v2 / _x86_64_v3 variants that _gegl_init_buffer() installs; NOTE(review): presumably built from gegl-algorithms-x86-64-v2.c / -v3.c -- confirm. */
+void gegl_resample_bilinear_x86_64_v2 (guchar *dest_buf,
+                                       const guchar *source_buf,
+                                       const GeglRectangle *dst_rect,
+                                       const GeglRectangle *src_rect,
+                                       gint                 s_rowstride,
+                                       gdouble              scale,
+                                       const Babl          *format,
+                                       gint                 d_rowstride);
+
+
+void gegl_resample_boxfilter_x86_64_v2 (guchar *dest_buf,
+                                       const guchar *source_buf,
+                                       const GeglRectangle *dst_rect,
+                                       const GeglRectangle *src_rect,
+                                       gint                 s_rowstride,
+                                       gdouble              scale,
+                                       const Babl          *format,
+                                       gint                 d_rowstride);
+
+
+void gegl_resample_nearest_x86_64_v2 (guchar *dest_buf,
+                                      const guchar *source_buf,
+                                      const GeglRectangle *dst_rect,
+                                      const GeglRectangle *src_rect,
+                                      gint                 s_rowstride,
+                                      gdouble              scale,
+                                      const gint           bpp,
+                                      gint                 d_rowstride);
+
+void gegl_downscale_2x2_x86_64_v2 (const Babl *format,
+                                   gint        src_width,
+                                   gint        src_height,
+                                   guchar     *src_data,
+                                   gint        src_rowstride,
+                                   guchar     *dst_data,
+                                   gint        dst_rowstride);
+
+
+
+void gegl_resample_bilinear_x86_64_v3 (guchar *dest_buf,
+                                       const guchar *source_buf,
+                                       const GeglRectangle *dst_rect,
+                                       const GeglRectangle *src_rect,
+                                       gint                 s_rowstride,
+                                       gdouble              scale,
+                                       const Babl          *format,
+                                       gint                 d_rowstride);
+
+
+void gegl_resample_boxfilter_x86_64_v3 (guchar *dest_buf,
+                                       const guchar *source_buf,
+                                       const GeglRectangle *dst_rect,
+                                       const GeglRectangle *src_rect,
+                                       gint                 s_rowstride,
+                                       gdouble              scale,
+                                       const Babl          *format,
+                                       gint                 d_rowstride);
+
+
+void gegl_resample_nearest_x86_64_v3 (guchar *dest_buf,
+                                      const guchar *source_buf,
+                                      const GeglRectangle *dst_rect,
+                                      const GeglRectangle *src_rect,
+                                      gint                 s_rowstride,
+                                      gdouble              scale,
+                                      const gint           bpp,
+                                      gint                 d_rowstride);
+
+void gegl_downscale_2x2_x86_64_v3 (const Babl *format,
+                                   gint        src_width,
+                                   gint        src_height,
+                                   guchar     *src_data,
+                                   gint        src_rowstride,
+                                   guchar     *dst_data,
+                                   gint        dst_rowstride);
+
+#endif
+/* Lookup tables filled once by _gegl_init_buffer() through babl conversions between "Y' u8" and "Y u16". */
+guint16 gegl_lut_u8_to_u16[256]; /* stored divided by GEGL_ALGORITHMS_LUT_DIVISOR */
+gfloat  gegl_lut_u8_to_u16f[256]; /* same values as gegl_lut_u8_to_u16, kept as floats */
+guint8  gegl_lut_u16_to_u8[65536/GEGL_ALGORITHMS_LUT_DIVISOR]; /* one entry per GEGL_ALGORITHMS_LUT_DIVISOR-sized step of u16 */
+
+/* One-time setup: fills the u8<->u16 lookup tables, then (on ARCH_X86_64) picks resampler variants for x86_64_version (2 or 3; other values change nothing). */
+void _gegl_init_buffer (int x86_64_version);
+void _gegl_init_buffer (int x86_64_version)
+{
+  static int inited = 0; /* set on the first call; later calls return immediately */
+  guint8 u8_ramp[256];
+  guint16 u16_ramp[65536/GEGL_ALGORITHMS_LUT_DIVISOR];
+  int i;
+
+  if (inited)
+    return;
+  inited = 1;
+
+  for (i = 0; i < 256; i++) u8_ramp[i]=i; /* every u8 value, in order */
+  for (i = 0; i < 65536/GEGL_ALGORITHMS_LUT_DIVISOR; i++) u16_ramp[i]=i * GEGL_ALGORITHMS_LUT_DIVISOR; /* u16 range sampled every DIVISOR steps */
+  babl_process (babl_fish (babl_format ("Y' u8"), babl_format("Y u16")), /* convert the u8 ramp into the u16 table */
+                &u8_ramp[0], &gegl_lut_u8_to_u16[0],
+                256);
+  for (i = 0; i < 256; i++)
+  {
+    gegl_lut_u8_to_u16[i]  = gegl_lut_u8_to_u16[i]/GEGL_ALGORITHMS_LUT_DIVISOR; /* NOTE(review): presumably rescaled so values can index gegl_lut_u16_to_u8 -- confirm */
+    gegl_lut_u8_to_u16f[i] = gegl_lut_u8_to_u16[i]; /* float copy of the already-divided value */
+  }
+
+  babl_process (babl_fish (babl_format ("Y u16"), babl_format("Y' u8")), /* convert the sampled u16 ramp into the u8 table */
+                &u16_ramp[0], &gegl_lut_u16_to_u8[0],
+                65536/GEGL_ALGORITHMS_LUT_DIVISOR);
+#ifdef ARCH_X86_64
+  switch (x86_64_version)
+  {
+    case 0:
+    case 1: break; /* leave the dispatch pointers as they are */
+    case 2:
+      gegl_resample_bilinear  = gegl_resample_bilinear_x86_64_v2;
+      gegl_resample_boxfilter = gegl_resample_boxfilter_x86_64_v2;
+      gegl_resample_nearest   = gegl_resample_nearest_x86_64_v2;
+      gegl_downscale_2x2      = gegl_downscale_2x2_x86_64_v2;
+      break;
+    case 3:
+      gegl_resample_bilinear  = gegl_resample_bilinear_x86_64_v3;
+      gegl_resample_boxfilter = gegl_resample_boxfilter_x86_64_v3;
+      gegl_resample_nearest   = gegl_resample_nearest_x86_64_v3;
+      gegl_downscale_2x2      = gegl_downscale_2x2_x86_64_v3;
+      break;
+  }
+#endif
+}
diff --git a/gegl/buffer/gegl-tile-handler-zoom.c b/gegl/buffer/gegl-tile-handler-zoom.c
index 4674f1323..c5d48a9d7 100644
--- a/gegl/buffer/gegl-tile-handler-zoom.c
+++ b/gegl/buffer/gegl-tile-handler-zoom.c
@@ -32,6 +32,7 @@
 #include "gegl-tile-storage.h"
 #include "gegl-buffer-private.h"
 #include "gegl-algorithms.h"
+#include "gegl-cpuaccel.h"
 
 
 G_DEFINE_TYPE (GeglTileHandlerZoom, gegl_tile_handler_zoom,
@@ -61,7 +62,17 @@ downscale (GeglTileHandlerZoom *zoom,
       if (src)
         {
           if (!zoom->downscale_2x2)
-            zoom->downscale_2x2 = gegl_downscale_2x2_get_fun (format);
+          {
+#ifdef ARCH_X86_64
+             GeglCpuAccelFlags cpu_accel = gegl_cpu_accel_get_support ();
+             if (cpu_accel & GEGL_CPU_ACCEL_X86_64_V3)
+               zoom->downscale_2x2 = gegl_downscale_2x2_get_fun_x86_64_v3 (format);
+             else if (cpu_accel & GEGL_CPU_ACCEL_X86_64_V2)
+               zoom->downscale_2x2 = gegl_downscale_2x2_get_fun_x86_64_v2 (format);
+             else
+#endif
+             zoom->downscale_2x2 = gegl_downscale_2x2_get_fun_generic (format);
+          }
 
           zoom->downscale_2x2 (format,
                                width, height,
diff --git a/gegl/buffer/meson.build b/gegl/buffer/meson.build
index e997d71eb..7097743fc 100644
--- a/gegl/buffer/meson.build
+++ b/gegl/buffer/meson.build
@@ -1,3 +1,18 @@
+if host_cpu_family == 'x86_64'
+  # Build the -v2 and -v3 algorithm sources as separate static libraries so each is compiled with its own x86_64_v*_flags.
+  lib_gegl_x86_64_v2 = static_library('gegl-x86-64-v2', 'gegl-algorithms-x86-64-v2.c',
+    include_directories:[geglInclude, rootInclude],
+    dependencies:[glib, babl],
+    c_args: [gegl_cflags ] + x86_64_v2_flags
+  )
+
+  lib_gegl_x86_64_v3 = static_library('gegl-x86-64-v3', 'gegl-algorithms-x86-64-v3.c',
+    include_directories:[geglInclude, rootInclude],
+    dependencies:[glib, babl],
+    c_args: [gegl_cflags ] + x86_64_v3_flags
+  )
+endif
+
 gegl_sources += files(
   'gegl-algorithms.c',
   'gegl-buffer-access.c',
diff --git a/gegl/gegl-init.c b/gegl/gegl-init.c
index d9af0842b..0744209a4 100644
--- a/gegl/gegl-init.c
+++ b/gegl/gegl-init.c
@@ -83,6 +83,7 @@ guint gegl_debug_flags = 0;
 #include "graph/gegl-node-private.h"
 #include "gegl-random-private.h"
 #include "gegl-parallel-private.h"
+#include "gegl-cpuaccel.h"
 
 static gboolean  gegl_post_parse_hook (GOptionContext *context,
                                        GOptionGroup   *group,
@@ -164,7 +165,7 @@ gboolean gegl_is_main_thread (void)
   return g_thread_self () == main_thread;
 }
 
-void _gegl_init_u8_lut (void);
+void _gegl_init_buffer (int x86_64_version);
 
 void
 gegl_init (gint    *argc,
@@ -540,7 +541,13 @@ gegl_post_parse_hook (GOptionContext *context,
   gegl_config_parse_env (config);
 
   babl_init ();
-  _gegl_init_u8_lut ();
+
+  GeglCpuAccelFlags cpu_accel = gegl_cpu_accel_get_support ();
+  int x86_64_version = 0;
+  if (cpu_accel & GEGL_CPU_ACCEL_X86_64_V2) x86_64_version = 2;
+  if (cpu_accel & GEGL_CPU_ACCEL_X86_64_V3) x86_64_version = 3;
+
+  _gegl_init_buffer (x86_64_version);
 
 #ifdef GEGL_ENABLE_DEBUG
   {
diff --git a/gegl/meson.build b/gegl/meson.build
index 86c7a6fa1..1781db293 100644
--- a/gegl/meson.build
+++ b/gegl/meson.build
@@ -1,4 +1,3 @@
-
 gegl_library_build_dir = meson.current_build_dir()
 
 geglInclude = include_directories(
@@ -96,6 +95,13 @@ opencl_dep = declare_dependency(
     link_with : [gegl_sources],
 )
 
+
+if host_cpu_family == 'x86_64'
+  x86_64_extra = [lib_gegl_x86_64_v2, lib_gegl_x86_64_v3]
+else
+  x86_64_extra = []
+endif
+
 gegl_lib = library(api_name,
   gegl_sources,
   include_directories: [rootInclude, geglInclude],
@@ -107,6 +113,8 @@ gegl_lib = library(api_name,
     gmodule,
   ],
   c_args: gegl_cflags,
+
+  link_with: x86_64_extra,
   link_args: gegl_ldflags,
   install: true,
   version: so_version,
diff --git a/meson.build b/meson.build
index 9aea7da45..9044dd429 100644
--- a/meson.build
+++ b/meson.build
@@ -98,6 +98,8 @@ dep_ver += {
 }
 
 
+
+
 ################################################################################
 # Project infos
 
@@ -203,6 +205,12 @@ cflags_cpp = cflags_common + cflags_cpp
 add_project_arguments(cc.get_supported_arguments(cflags_c), language: 'c')
 add_project_arguments(cpp.get_supported_arguments(cflags_cpp), language: 'cpp')
 
+
+if host_cpu_family == 'x86_64'
+  x86_64_v2_flags = cc.get_supported_arguments(['-march=x86-64','-msse2', '-msse2','-msse4.1','-msse4.2','-mpopcnt','-mssse3'])
+  x86_64_v3_flags = x86_64_v2_flags + cc.get_supported_arguments(['-mavx','-mavx2','-mf16c','-mfma','-mmovbe', '-mbmi', '-mbmi2'])
+endif
+
 ################################################################################
 # Build Utilities
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]