[babl] make trampoline for lut processing
- From: Øyvind "pippin" Kolås <ok src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl] make trampoline for lut processing
- Date: Mon, 24 Jan 2022 06:32:36 +0000 (UTC)
commit b3e884edf3b5c58fb4c2cede1346bd8a9d9c4a1e
Author: Øyvind Kolås <pippin gimp org>
Date: Mon Jan 24 07:05:12 2022 +0100
make trampoline for lut processing
babl/babl-fish-path.c | 137 +++++++----------------------------------
babl/babl-internal.h | 13 ++++
babl/babl.c | 41 ++++++++++--
babl/base/babl-rgb-converter.c | 122 ++++++++++++++++++++++++++++++++++++
4 files changed, 193 insertions(+), 120 deletions(-)
---
diff --git a/babl/babl-fish-path.c b/babl/babl-fish-path.c
index f709c3fd1..7278ec2a1 100644
--- a/babl/babl-fish-path.c
+++ b/babl/babl-fish-path.c
@@ -76,7 +76,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi,
static inline void
-process_conversion_path (BablList *path,
+_babl_process_conversion_path (BablList *path,
const void *source_buffer,
int source_bpp,
void *destination_buffer,
@@ -756,118 +756,6 @@ babl_gc_fishes (void)
// is responsibility of higher layers
}
-static int babl_fish_lut_process_maybe (const Babl *babl,
- const char *source,
- const char *destination,
- long n,
- void *data)
-{
- uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
- ((Babl*)babl)->fish.pixels += n;
-
-
- if (!lut && babl->fish.pixels > 256 * 128)
- {
-#if 0
- fprintf (stderr, "building LUT for %s to %s\n",
- babl_get_name (babl->conversion.source),
- babl_get_name (babl->conversion.destination));
-#endif
- lut = malloc (256 * 256 * 256 * 4);
- if (babl->fish_path.source_bpp == 8)
- {
- uint64_t *lut_in = malloc (256 * 256 * 256 * 8);
- for (int o = 0; o < 256 * 256 * 256; o++)
- {
- uint64_t v = o;
- uint64_t v0 = v & 0xff;
- uint64_t v1 = (v & 0xff00) >> 8;
- uint64_t v2 = (v & 0xff0000) >> 16;
-
-#if 1
- // gives same results... but purer white is better?
- v0 = (v0 << 8) | (((v0&1)?0xff:0)<<0);
- v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16);
- v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32);
-#else
- v0 = (v0 << 8);
- v1 = (v1 << 24);
- v2 = (v2 << 40);
-#endif
- lut_in[o] = v;
- }
-
- process_conversion_path (babl->fish_path.conversion_list,
- lut_in,
- babl->fish_path.source_bpp,
- lut,
- babl->fish_path.dest_bpp,
- 256*256*256);
- free (lut_in);
- }
- else
- {
- for (int o = 0; o < 256 * 256 * 256; o++)
- lut[o] = o;
- process_conversion_path (babl->fish_path.conversion_list,
- lut,
- babl->fish_path.source_bpp,
- lut,
- babl->fish_path.dest_bpp,
- 256*256*256);
- }
- // XXX : there is still a micro race, if lost we should only
- // leak a LUT not produce wrong results.
- if (babl->fish_path.u8_lut == NULL)
- {
- (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut;
-
- }
- else
- {
- free (lut);
- lut = (uint32_t*)babl->fish_path.u8_lut;
- }
- }
- if (lut)
- {
- if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet
- { // half and u16 need their
- // own separate handling
- uint32_t *src = (uint32_t*)source;
- uint32_t *dst = (uint32_t*)destination;
- lut = (uint32_t*)babl->fish_path.u8_lut;
- while (n--)
- {
- uint32_t col_a = *src++;
- uint32_t col_b = *src++;
- uint32_t col;
-
- uint32_t c_ar = ((col_a & 0xff000000)|
- ((col_a & 0x0000ff00) << 8));
- uint32_t c_gb = ((col_b & 0xff000000)|
- ((col_b & 0x0000ff00) << 8))>>16;
- col = c_ar|c_gb;
-
- *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
- }
- }
- else
- {
- uint32_t *src = (uint32_t*)source;
- uint32_t *dst = (uint32_t*)destination;
- lut = (uint32_t*)babl->fish_path.u8_lut;
- while (n--)
- {
- uint32_t col = *src++;
- *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
- }
- }
- BABL(babl)->fish_path.last_lut_use = babl_ticks ();
- return 1;
- }
- return 0;
-}
static void
babl_fish_path_process (const Babl *babl,
@@ -895,7 +783,7 @@ babl_fish_path_process (const Babl *babl,
conv_counter = 0;
}
}
- process_conversion_path (babl->fish_path.conversion_list,
+ _babl_process_conversion_path (babl->fish_path.conversion_list,
source,
babl->fish_path.source_bpp,
destination,
@@ -1037,7 +925,7 @@ static void inline *align_16 (unsigned char *ret)
}
static inline void
-process_conversion_path (BablList *path,
+_babl_process_conversion_path (BablList *path,
const void *source_buffer,
int source_bpp,
void *destination_buffer,
@@ -1109,6 +997,23 @@ process_conversion_path (BablList *path,
}
}
+void
+babl_process_conversion_path (BablList *path,
+ const void *source_buffer,
+ int source_bpp,
+ void *destination_buffer,
+ int dest_bpp,
+ long n)
+{
+ _babl_process_conversion_path (path,
+ source_buffer,
+ source_bpp,
+ destination_buffer,
+ dest_bpp,
+ n);
+}
+
+
static void
init_path_instrumentation (FishPathInstrumentation *fpi,
Babl *fmt_source,
@@ -1244,7 +1149,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi,
/* calculate this path's view of what the result should be */
ticks_start = babl_ticks ();
for (int i = 0; i < BABL_TEST_ITER; i ++)
- process_conversion_path (path, fpi->source, source_bpp, fpi->destination,
+ _babl_process_conversion_path (path, fpi->source, source_bpp, fpi->destination,
dest_bpp, fpi->num_test_pixels);
ticks_end = babl_ticks ();
*path_cost = (ticks_end - ticks_start);
diff --git a/babl/babl-internal.h b/babl/babl-internal.h
index ec6008b6d..4377ec379 100644
--- a/babl/babl-internal.h
+++ b/babl/babl-internal.h
@@ -373,6 +373,12 @@ extern const Babl *
extern const Babl *
(*babl_trc_lookup_by_name) (const char *name);
+extern int (*babl_fish_lut_process_maybe) (const Babl *babl,
+ const char *source,
+ const char *destination,
+ long n,
+ void *data);
+
void babl_space_to_xyz (const Babl *space, const double *rgb, double *xyz);
void babl_space_from_xyz (const Babl *space, const double *xyz, double *rgb);
@@ -473,5 +479,12 @@ _babl_space_for_lcms (const char *icc_data, int icc_length); // XXX pass profile
void
babl_trc_class_init (void);
+void
+babl_process_conversion_path (BablList *path,
+ const void *source_buffer,
+ int source_bpp,
+ void *destination_buffer,
+ int dest_bpp,
+ long n);
#endif
diff --git a/babl/babl.c b/babl/babl.c
index 515fa09b0..7bfe60f6a 100644
--- a/babl/babl.c
+++ b/babl/babl.c
@@ -200,6 +200,19 @@ void (*babl_base_init) (void) = babl_base_init_generic;
const Babl * babl_trc_lookup_by_name_generic (const char *name);
+int babl_fish_lut_process_maybe_generic (const Babl *babl,
+ const char *source,
+ const char *destination,
+ long n,
+ void *data);
+
+int (*babl_fish_lut_process_maybe) (const Babl *babl,
+ const char *source,
+ const char *destination,
+ long n,
+ void *data) =
+ babl_fish_lut_process_maybe_generic;
+
const Babl *
babl_trc_new_generic (const char *name,
@@ -222,15 +235,25 @@ const Babl *
float *lut) = babl_trc_new_generic;
#ifdef ARCH_X86_64
+
+int babl_fish_lut_process_maybe_x86_64_v2 (const Babl *babl,
+ const char *source,
+ const char *destination,
+ long n,
+ void *data);
+int babl_fish_lut_process_maybe_x86_64_v3 (const Babl *babl,
+ const char *source,
+ const char *destination,
+ long n,
+ void *data);
+
void babl_base_init_x86_64_v2 (void);
void babl_base_init_x86_64_v3 (void);
void _babl_space_add_universal_rgb_x86_64_v2 (const Babl *space);
void _babl_space_add_universal_rgb_x86_64_v3 (const Babl *space);
-const Babl *
-babl_trc_lookup_by_name_x86_64_v2 (const char *name);
-const Babl *
-babl_trc_lookup_by_name_x86_64_v3 (const char *name);
+const Babl * babl_trc_lookup_by_name_x86_64_v2 (const char *name);
+const Babl * babl_trc_lookup_by_name_x86_64_v3 (const char *name);
const Babl *
babl_trc_new_x86_64_v2 (const char *name,
@@ -247,6 +270,13 @@ babl_trc_new_x86_64_v3 (const char *name,
#endif
#ifdef ARCH_ARM
+
+int babl_fish_lut_process_maybe_arm_neon (const Babl *babl,
+ const char *source,
+ const char *destination,
+ long n,
+ void *data);
+
void babl_base_init_arm_neon (void);
void _babl_space_add_universal_rgb_arm_neon (const Babl *space);
@@ -268,6 +298,7 @@ static void simd_init (void)
BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
if ((accel & BABL_CPU_ACCEL_X86_64_V3) == BABL_CPU_ACCEL_X86_64_V3)
{
+ babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v3;
babl_base_init = babl_base_init_x86_64_v2; /// !!
// this is correct,
// it performs better
@@ -278,6 +309,7 @@ static void simd_init (void)
}
else if ((accel & BABL_CPU_ACCEL_X86_64_V2) == BABL_CPU_ACCEL_X86_64_V2)
{
+ babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v2;
babl_base_init = babl_base_init_x86_64_v2;
babl_trc_new = babl_trc_new_x86_64_v2;
babl_trc_lookup_by_name = babl_trc_lookup_by_name_x86_64_v2;
@@ -288,6 +320,7 @@ static void simd_init (void)
BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
if ((accel & BABL_CPU_ACCEL_ARM_NEON) == BABL_CPU_ACCEL_ARM_NEON)
{
+ babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_arm_neon;
babl_base_init = babl_base_init_arm_neon;
babl_trc_new = babl_trc_new_arm_neon;
babl_trc_lookup_by_name = babl_trc_lookup_by_name_arm_neon;
diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c
index 3f4da04d3..5c3d2ca08 100644
--- a/babl/base/babl-rgb-converter.c
+++ b/babl/base/babl-rgb-converter.c
@@ -533,3 +533,125 @@ BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space)
{
babl_space_class_for_each (add_rgb_adapter, (void*)space);
}
+
+void
+babl_process_conversion_path (BablList *path,
+ const void *source_buffer,
+ int source_bpp,
+ void *destination_buffer,
+ int dest_bpp,
+ long n);
+
+int BABL_SIMD_SUFFIX(babl_fish_lut_process_maybe) (const Babl *babl,
+ const char *source,
+ const char *destination,
+ long n,
+ void *data)
+{
+ uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
+ ((Babl*)babl)->fish.pixels += n;
+
+
+ if (!lut && babl->fish.pixels > 256 * 128)
+ {
+#if 0
+ fprintf (stderr, "building LUT for %s to %s\n",
+ babl_get_name (babl->conversion.source),
+ babl_get_name (babl->conversion.destination));
+#endif
+ lut = malloc (256 * 256 * 256 * 4);
+ if (babl->fish_path.source_bpp == 8)
+ {
+ uint64_t *lut_in = malloc (256 * 256 * 256 * 8);
+ for (int o = 0; o < 256 * 256 * 256; o++)
+ {
+ uint64_t v = o;
+ uint64_t v0 = v & 0xff;
+ uint64_t v1 = (v & 0xff00) >> 8;
+ uint64_t v2 = (v & 0xff0000) >> 16;
+
+#if 1
+ // gives same results... but purer white is better?
+ v0 = (v0 << 8) | (((v0&1)?0xff:0)<<0);
+ v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16);
+ v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32);
+#else
+ v0 = (v0 << 8);
+ v1 = (v1 << 24);
+ v2 = (v2 << 40);
+#endif
+ lut_in[o] = v;
+ }
+
+ babl_process_conversion_path (babl->fish_path.conversion_list,
+ lut_in,
+ babl->fish_path.source_bpp,
+ lut,
+ babl->fish_path.dest_bpp,
+ 256*256*256);
+ free (lut_in);
+ }
+ else
+ {
+ for (int o = 0; o < 256 * 256 * 256; o++)
+ lut[o] = o;
+ babl_process_conversion_path (babl->fish_path.conversion_list,
+ lut,
+ babl->fish_path.source_bpp,
+ lut,
+ babl->fish_path.dest_bpp,
+ 256*256*256);
+ }
+ // XXX : there is still a micro race, if lost we should only
+ // leak a LUT not produce wrong results.
+ if (babl->fish_path.u8_lut == NULL)
+ {
+ (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut;
+
+ }
+ else
+ {
+ free (lut);
+ lut = (uint32_t*)babl->fish_path.u8_lut;
+ }
+ }
+ if (lut)
+ {
+ if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet
+ { // half and u16 need their
+ // own separate handling
+ uint32_t *src = (uint32_t*)source;
+ uint32_t *dst = (uint32_t*)destination;
+ lut = (uint32_t*)babl->fish_path.u8_lut;
+ while (n--)
+ {
+ uint32_t col_a = *src++;
+ uint32_t col_b = *src++;
+ uint32_t col;
+
+ uint32_t c_ar = ((col_a & 0xff000000)|
+ ((col_a & 0x0000ff00) << 8));
+ uint32_t c_gb = ((col_b & 0xff000000)|
+ ((col_b & 0x0000ff00) << 8))>>16;
+ col = c_ar|c_gb;
+
+ *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
+ }
+ }
+ else
+ {
+ uint32_t *src = (uint32_t*)source;
+ uint32_t *dst = (uint32_t*)destination;
+ lut = (uint32_t*)babl->fish_path.u8_lut;
+ while (n--)
+ {
+ uint32_t col = *src++;
+ *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
+ }
+ }
+ BABL(babl)->fish_path.last_lut_use = babl_ticks ();
+ return 1;
+ }
+ return 0;
+}
+
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]