[babl] babl: add u8 code paths to trc/matrix space conversion
- From: Øyvind "pippin" Kolås <ok src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl] babl: add u8 code paths to trc/matrix space conversion
- Date: Sun, 23 Jan 2022 04:23:16 +0000 (UTC)
commit 1eec1880691f66915c64f939008f5aff164933b1
Author: Øyvind Kolås <pippin gimp org>
Date: Sun Jan 23 05:21:21 2022 +0100
babl: add u8 code paths to trc/matrix space conversion
This is relevant for ARM; on modern x86_64, at least, it is cheaper to
convert to/from float with external loops.
babl/base/babl-rgb-converter.c | 117 +++++++++++++++++++++--------------------
babl/base/babl-trc.c | 9 +++-
babl/base/babl-trc.h | 98 ++++++++++++++++++++++++++++++++++
3 files changed, 167 insertions(+), 57 deletions(-)
---
diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c
index 0f7e2fb..be0159a 100644
--- a/babl/base/babl-rgb-converter.c
+++ b/babl/base/babl-rgb-converter.c
@@ -3,8 +3,6 @@
#include "base/util.h"
#include "babl-trc.h"
#include "babl-base.h"
-///////////////////
-
static void
prep_conversion (const Babl *babl)
@@ -60,9 +58,26 @@ prep_conversion (const Babl *babl)
} \
}while(0)
-#define TRC_OUT(rgba_in, rgba_out) do{\
+#define TRC_IN_u8(rgba_in, rgba_out) do{ /* u8 -> float via source_space's TRCs; uses source_space and samples from the expansion site */ \
+ if ((source_space->space.trc[0] == source_space->space.trc[1]) && \
+ (source_space->space.trc[1] == source_space->space.trc[2])) /* all three trc entries compare equal */ \
+ { \
+ const Babl *trc = (void*)source_space->space.trc[0]; \
+ _babl_trc_to_linear_buf_u8_generic(trc, rgba_in, rgba_out, 4, 4, 3, samples); /* single call: stride 4 in and out, 3 components, 4th element untouched */ \
+ } \
+ else \
{ \
unsigned int c; \
+ for (c = 0; c < 3; c ++) \
+ { \
+ const Babl *trc = (void*)source_space->space.trc[c]; \
+ _babl_trc_to_linear_buf_u8_generic (trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); /* one strided call per channel, offset by c, using that channel's own trc */ \
+ } \
+ } \
+}while(0)
+
+#define TRC_OUT(rgba_in, rgba_out) do{\
+ { \
if ((destination_space->space.trc[0] == destination_space->space.trc[1]) && \
(destination_space->space.trc[1] == destination_space->space.trc[2])) \
{ \
@@ -71,6 +86,7 @@ prep_conversion (const Babl *babl)
} \
else \
{ \
+ unsigned int c; \
for (c = 0; c < 3; c ++) \
{ \
const Babl *trc = (void*)destination_space->space.trc[c]; \
@@ -81,6 +97,26 @@ prep_conversion (const Babl *babl)
} while(0)
+#define TRC_OUT_u8(rgba_in, rgba_out) do{ /* float -> u8 via destination_space's TRCs; uses destination_space and samples from the expansion site */\
+ { \
+ if ((destination_space->space.trc[0] == destination_space->space.trc[1]) && \
+ (destination_space->space.trc[1] == destination_space->space.trc[2])) /* all three trc entries compare equal */ \
+ { \
+ const Babl *trc = (void*)destination_space->space.trc[0]; \
+ _babl_trc_from_linear_buf_u8_generic(trc, rgba_in, rgba_out, 4, 4, 3, samples); /* single call: stride 4 in and out, 3 components, 4th element untouched */ \
+ } \
+ else \
+ { \
+ unsigned int c; \
+ for (c = 0; c < 3; c ++) \
+ { \
+ const Babl *trc = (void*)destination_space->space.trc[c]; \
+ _babl_trc_from_linear_buf_u8_generic(trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); /* one strided call per channel, offset by c, using that channel's own trc */ \
+ } \
+ } \
+ }\
+} while(0)
+
static inline void
@@ -146,38 +182,18 @@ universal_nonlinear_rgba_u8_converter (const Babl *conversion,
void *data)
{
const Babl *destination_space = conversion->conversion.destination->format.space;
-
+ const Babl *source_space = babl_conversion_get_source_space (conversion);
float * matrixf = data;
- float * in_trc_lut_red = matrixf + 9;
- float * in_trc_lut_green = in_trc_lut_red + 256;
- float * in_trc_lut_blue = in_trc_lut_green + 256;
- unsigned int i;
uint8_t *rgba_in_u8 = (void*)src_char;
uint8_t *rgba_out_u8 = (void*)dst_char;
-
- float rgb[4*samples];
-
- for (i = 0; i < samples; i++)
+ float rgba[4*samples];
+ for (int i = 0; i < samples * 4; i+=4)
{
- rgb[i*4+0]=in_trc_lut_red[rgba_in_u8[i*4+0]];
- rgb[i*4+1]=in_trc_lut_green[rgba_in_u8[i*4+1]];
- rgb[i*4+2]=in_trc_lut_blue[rgba_in_u8[i*4+2]];
- rgba_out_u8[i*4+3] = rgba_in_u8[i*4+3];
- }
-
- babl_matrix_mul_vectorff_buf4 (matrixf, rgb, rgb, samples);
-
- {
- const Babl *from_trc_red = (void*)destination_space->space.trc[0];
- const Babl *from_trc_green = (void*)destination_space->space.trc[1];
- const Babl *from_trc_blue = (void*)destination_space->space.trc[2];
- for (i = 0; i < samples * 4; i+=4)
- {
- rgba_out_u8[i+0] = babl_trc_from_linear (from_trc_red, rgb[i+0]) * 255.5f;
- rgba_out_u8[i+1] = babl_trc_from_linear (from_trc_green, rgb[i+1]) * 255.5f;
- rgba_out_u8[i+2] = babl_trc_from_linear (from_trc_blue, rgb[i+2]) * 255.5f;
- }
+ rgba_out_u8[i+3] = rgba_in_u8[i+3];
}
+ TRC_IN_u8(rgba_in_u8, rgba);
+ babl_matrix_mul_vectorff_buf4 (matrixf, rgba, rgba, samples);
+ TRC_OUT_u8(rgba, rgba_out_u8);
}
@@ -255,7 +271,7 @@ universal_nonlinear_rgb_u8_converter (const Babl *conversion,
rgba_out[i*4+0]=in_trc_lut_red[rgb_in_u8[i*3+0]];
rgba_out[i*4+1]=in_trc_lut_green[rgb_in_u8[i*3+1]];
rgba_out[i*4+2]=in_trc_lut_blue[rgb_in_u8[i*3+2]];
- rgba_out[i*4+3]=rgb_in_u8[i*3+2] * 255.5f;
+ rgba_out[i*4+3]=rgb_in_u8[i*3+2] * 255.0f;
}
babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
@@ -265,7 +281,7 @@ universal_nonlinear_rgb_u8_converter (const Babl *conversion,
for (i = 0; i < samples; i++)
for (unsigned int c = 0; c < 3; c ++)
- rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255.5f;
+ rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255.0f;
}
}
@@ -301,7 +317,6 @@ static inline void babl_matrix_mul_vectorff_buf4_sse2 (const float *mat,
#undef m
-
static inline void
universal_nonlinear_rgba_converter_sse2 (const Babl *conversion,
unsigned char *__restrict__ src_char,
@@ -344,36 +359,24 @@ universal_nonlinear_rgba_u8_converter_sse2 (const Babl *conversion,
long samples,
void *data)
{
+ const Babl *source_space = conversion->conversion.source->format.space;
const Babl *destination_space = conversion->conversion.destination->format.space;
float * matrixf = data;
- float * in_trc_lut_red = matrixf + 9;
- float * in_trc_lut_green = in_trc_lut_red + 256;
- float * in_trc_lut_blue = in_trc_lut_green + 256;
- unsigned int i;
uint8_t *rgba_in_u8 = (void*)src_char;
uint8_t *rgba_out_u8 = (void*)dst_char;
- float rgba_out[4*samples];
+ float rgba[4*samples];
- for (i = 0; i < samples * 4; i+= 4)
+ for (int i = 0; i < samples*4; i+=4)
{
- rgba_out[i+0]=in_trc_lut_red[rgba_in_u8[i+0]];
- rgba_out[i+1]=in_trc_lut_green[rgba_in_u8[i+1]];
- rgba_out[i+2]=in_trc_lut_blue[rgba_in_u8[i+2]];
rgba_out_u8[i+3] = rgba_in_u8[i+3];
}
+ TRC_IN_u8(rgba_in_u8, rgba);
- babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
-
- {
- int c;
- TRC_OUT(rgba_out, rgba_out);
+ babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba, rgba, samples);
- for (i = 0; i < samples * 4; i+= 4)
- for (c = 0; c < 3; c ++)
- rgba_out_u8[i+c] = rgba_out[i+c] * 255.5f;
- }
+ TRC_OUT_u8(rgba, rgba_out_u8);
}
static inline void
@@ -409,7 +412,7 @@ universal_nonlinear_rgb_u8_converter_sse2 (const Babl *conversion,
for (i = 0; i < samples; i++)
for (unsigned c = 0; c < 3; c ++)
- rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255.5f;
+ rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255 + 0.5f;
}
}
@@ -530,6 +533,7 @@ add_rgb_adapter (Babl *babl,
else
#endif
{
+#if 1
prep_conversion(babl_conversion_new(
babl_format_with_space("RGBA float", space),
babl_format_with_space("RGBA float", babl),
@@ -551,7 +555,8 @@ add_rgb_adapter (Babl *babl,
babl_format_with_space("R'G'B'A float", space),
"linear", universal_nonlinear_rgba_converter,
NULL));
-
+#endif
+#if 1
prep_conversion(babl_conversion_new(
babl_format_with_space("R'G'B'A float", space),
babl_format_with_space("RGBA float", babl),
@@ -562,6 +567,7 @@ add_rgb_adapter (Babl *babl,
babl_format_with_space("RGBA float", space),
"linear", universal_nonlinear_rgb_linear_converter,
NULL));
+#endif
prep_conversion(babl_conversion_new(
babl_format_with_space("R'G'B'A u8", space),
@@ -573,7 +579,7 @@ add_rgb_adapter (Babl *babl,
babl_format_with_space("R'G'B'A u8", space),
"linear", universal_nonlinear_rgba_u8_converter,
NULL));
-
+#if 1
prep_conversion(babl_conversion_new(
babl_format_with_space("R'G'B' u8", space),
babl_format_with_space("R'G'B' u8", babl),
@@ -595,8 +601,8 @@ add_rgb_adapter (Babl *babl,
babl_format_with_space("R'G'B'A float", babl),
"linear", universal_linear_rgb_nonlinear_converter,
NULL));
+#endif
}
-#if 0
prep_conversion(babl_conversion_new(
babl_format_with_space("RGB float", space),
babl_format_with_space("RGB float", babl),
@@ -607,7 +613,6 @@ add_rgb_adapter (Babl *babl,
babl_format_with_space("RGB float", space),
"linear", universal_rgb_converter,
NULL));
-#endif
prep_conversion(babl_conversion_new(
babl_format_with_space("Y float", space),
babl_format_with_space("Y float", babl),
diff --git a/babl/base/babl-trc.c b/babl/base/babl-trc.c
index 6cb4900..a2f1d6e 100644
--- a/babl/base/babl-trc.c
+++ b/babl/base/babl-trc.c
@@ -211,7 +211,7 @@ _babl_trc_formula_srgb_to_linear (const Babl *trc_,
float e = trc->lut[5];
float f = trc->lut[6];
- if (x >= d)
+ if (x >= d) // OPT can be reduced to be branchless
{
return _babl_trc_gamma_to_linear ((Babl *) trc, a * x + b) + e;
}
@@ -364,6 +364,8 @@ _babl_trc_from_linear_buf_generic (const Babl *trc_,
}
}
+
+
static inline void _babl_trc_linear_buf (const Babl *trc_,
const float *__restrict__ in,
float *__restrict__ out,
@@ -494,6 +496,8 @@ BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
trc_db[i].fun_to_linear_buf = _babl_trc_to_linear_buf_generic;
trc_db[i].fun_from_linear_buf = _babl_trc_from_linear_buf_generic;
+ trc_db[i].fun_from_linear_buf_u8 = _babl_trc_from_linear_buf_u8_generic;
+ trc_db[i].fun_to_linear_buf_u8 = _babl_trc_to_linear_buf_u8_generic;
switch (trc_db[i].type)
{
@@ -502,12 +506,14 @@ BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
trc_db[i].fun_from_linear = _babl_trc_linear;
trc_db[i].fun_from_linear_buf = _babl_trc_linear_buf;
trc_db[i].fun_to_linear_buf = _babl_trc_linear_buf;
+ //trc_db[i].fun_to_linear_buf_u8 = _babl_trc_linear_buf_u8;
break;
case BABL_TRC_FORMULA_GAMMA:
trc_db[i].fun_to_linear = _babl_trc_gamma_to_linear;
trc_db[i].fun_from_linear = _babl_trc_gamma_from_linear;
trc_db[i].fun_to_linear_buf = _babl_trc_gamma_to_linear_buf;
trc_db[i].fun_from_linear_buf = _babl_trc_gamma_from_linear_buf;
+ //trc_db[i].fun_from_linear_buf_u8 = _babl_trc_gamma_from_linear_buf_u8;
trc_db[i].poly_gamma_to_linear_x0 = POLY_GAMMA_X0;
trc_db[i].poly_gamma_to_linear_x1 = POLY_GAMMA_X1;
@@ -582,6 +588,7 @@ BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
trc_db[i].fun_to_linear = _babl_trc_srgb_to_linear;
trc_db[i].fun_from_linear = _babl_trc_srgb_from_linear;
trc_db[i].fun_from_linear_buf = _babl_trc_srgb_from_linear_buf;
+ //trc_db[i].fun_from_linear_buf_u8 = _babl_trc_srgb_from_linear_buf_u8;
trc_db[i].fun_to_linear_buf = _babl_trc_srgb_to_linear_buf;
break;
case BABL_TRC_LUT:
diff --git a/babl/base/babl-trc.h b/babl/base/babl-trc.h
index 79b6891..4556c96 100644
--- a/babl/base/babl-trc.h
+++ b/babl/base/babl-trc.h
@@ -51,6 +51,13 @@ typedef struct
int out_gap,
int components,
int count);
+ void (*fun_to_linear_buf_u8)(const Babl *trc,
+ const uint8_t *in,
+ float *out,
+ int in_gap,
+ int out_gap,
+ int components,
+ int count);
void (*fun_from_linear_buf)(const Babl *trc,
const float *in,
float *out,
@@ -58,6 +65,13 @@ typedef struct
int out_gap,
int components,
int count);
+ void (*fun_from_linear_buf_u8)(const Babl *trc,
+ const float *in,
+ uint8_t *out,
+ int in_gap,
+ int out_gap,
+ int components,
+ int count);
BablPolynomial poly_gamma_to_linear;
float poly_gamma_to_linear_x0;
float poly_gamma_to_linear_x1;
@@ -67,6 +81,8 @@ typedef struct
float *lut;
float *inv_lut;
char name[128];
+ int valid_u8_lut;
+ float u8_lut[256];
} BablTRC;
static inline void babl_trc_from_linear_buf (const Babl *trc_,
@@ -79,6 +95,26 @@ static inline void babl_trc_from_linear_buf (const Babl *trc_,
trc->fun_from_linear_buf (trc_, in, out, in_gap, out_gap, components, count);
}
+/* Dispatch a float -> u8 buffer conversion to the fun_from_linear_buf_u8
+ * callback stored in the TRC; all arguments are forwarded unchanged.
+ */
+static inline void
+babl_trc_from_linear_buf_u8 (const Babl  *trc_,
+                             const float *in,
+                             uint8_t     *out,
+                             int          in_gap,
+                             int          out_gap,
+                             int          components,
+                             int          count)
+{
+  BablTRC *self = (void *) trc_;
+
+  (*self->fun_from_linear_buf_u8) (trc_, in, out,
+                                   in_gap, out_gap,
+                                   components, count);
+}
+
+/* Dispatch a u8 -> float buffer conversion to the fun_to_linear_buf_u8
+ * callback stored in the TRC; all arguments are forwarded unchanged.
+ */
+static inline void
+babl_trc_to_linear_buf_u8 (const Babl    *trc_,
+                           const uint8_t *in,
+                           float         *out,
+                           int            in_gap,
+                           int            out_gap,
+                           int            components,
+                           int            count)
+{
+  BablTRC *self = (void *) trc_;
+
+  (*self->fun_to_linear_buf_u8) (trc_, in, out,
+                                 in_gap, out_gap,
+                                 components, count);
+}
+
static inline void babl_trc_to_linear_buf (const Babl *trc_,
const float *in, float *out,
int in_gap, int out_gap,
@@ -104,4 +140,66 @@ static inline float babl_trc_to_linear (const Babl *trc_, float value)
void
babl_trc_class_init_generic (void);
+
+/* Convert a strided buffer of float samples to u8 by passing each sample
+ * through the TRC's scalar fun_from_linear callback, scaling by 255 and
+ * rounding to nearest.
+ *
+ *   trc_        TRC whose fun_from_linear is applied to every sample
+ *   in          float input; sample c of pixel i is in[in_gap * i + c]
+ *   out         u8 output; sample c of pixel i is out[out_gap * i + c]
+ *   in_gap      distance between pixels in the input, in floats
+ *   out_gap     distance between pixels in the output, in bytes
+ *   components  number of consecutive samples converted per pixel
+ *   count       number of pixels
+ *
+ * Results are clamped to 0..255.  The clamp is done on the floating point
+ * value before narrowing, so negative, oversized and NaN results never
+ * reach an out-of-range float -> integer conversion.
+ */
+static inline void
+_babl_trc_from_linear_buf_u8_generic (const Babl *trc_,
+                                      const float *__restrict__ in,
+                                      uint8_t *__restrict__ out,
+                                      int in_gap,
+                                      int out_gap,
+                                      int components,
+                                      int count)
+{
+  BablTRC *trc = (void*)trc_;
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+    /* Same work as the generic loop below, with literal strides and
+     * component count for the 4-wide, 3-component case. */
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < 3; c ++)
+      {
+        double val = trc->fun_from_linear (trc_, in[4 * i + c]) * 255.0 + 0.5;
+        /* NaN fails both comparisons and therefore maps to 0 */
+        out[4 * i + c] = val >= 255.0 ? 255 : (val > 0.0 ? (uint8_t) val : 0);
+      }
+  }
+  else
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+      {
+        double val = trc->fun_from_linear (trc_, in[in_gap * i + c]) * 255.0 + 0.5;
+        /* in-range values must be stored as-is, not replaced by 0 */
+        out[out_gap * i + c] = val >= 255.0 ? 255 : (val > 0.0 ? (uint8_t) val : 0);
+      }
+  }
+}
+
+/* Convert a strided buffer of u8 samples to float through a 256-entry
+ * table stored in the TRC (u8_lut).  The table is filled on first use by
+ * calling the TRC's scalar fun_to_linear on i / 255.0f for i = 0..255 and
+ * is marked ready through valid_u8_lut.
+ *
+ *   in_gap / out_gap  distance between pixels in the input / output,
+ *                     counted in elements
+ *   components        number of consecutive samples converted per pixel
+ *   count             number of pixels
+ *
+ * NOTE(review): the lazy table fill is not synchronized in this function;
+ * confirm that callers cannot reach it concurrently for the same TRC.
+ */
+static inline void
+_babl_trc_to_linear_buf_u8_generic (const Babl *trc_,
+                                    const uint8_t *__restrict__ in,
+                                    float *__restrict__ out,
+                                    int in_gap,
+                                    int out_gap,
+                                    int components,
+                                    int count)
+{
+  BablTRC *self = (void *) trc_;
+
+  if (self->valid_u8_lut == 0)
+    {
+      int code = 0;
+
+      while (code < 256)
+        {
+          self->u8_lut[code] = self->fun_to_linear (trc_, code / 255.0f);
+          code++;
+        }
+      self->valid_u8_lut = 1;
+    }
+
+  if (in_gap == 4 && out_gap == 4 && components == 3)
+    {
+      /* 4-wide pixels, three samples each: literal strides */
+      for (int px = 0; px < count; px++)
+        {
+          int base = 4 * px;
+
+          for (int ch = 0; ch < 3; ch++)
+            out[base + ch] = self->u8_lut[in[base + ch]];
+        }
+      return;
+    }
+
+  for (int px = 0; px < count; px++)
+    {
+      for (int ch = 0; ch < components; ch++)
+        out[out_gap * px + ch] = self->u8_lut[in[in_gap * px + ch]];
+    }
+}
+
+
#endif
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]