[babl] avx2-int8: add gamma u8 -> linear float conversions



commit 57bd10c3f114a2f785ca6325204f035665d2892e
Author: Ell <ell_se yahoo com>
Date:   Sun Apr 12 17:42:16 2020 +0300

    avx2-int8: add gamma u8 -> linear float conversions
    
    Add AVX2 conversions from u8 Y', Y'A, R'G'B', and R'G'B'A to float
    Y, YA, RGB, and RGBA, respectively.  The conversions use an LUT
    together with the AVX2 gather instructions to process 8 values at
    once.  Depending on the formats and cache utilization, the new
    conversions are between 1.25x and 2.2x faster than the existing
    conversions.

 extensions/avx2-int8-tables.h | 517 ++++++++++++++++++++++++++++++++++++++++++
 extensions/avx2-int8.c        | 184 +++++++++++++++
 2 files changed, 701 insertions(+)
---
diff --git a/extensions/avx2-int8-tables.h b/extensions/avx2-int8-tables.h
index 8e3c6fb2a..cfc5208bc 100644
--- a/extensions/avx2-int8-tables.h
+++ b/extensions/avx2-int8-tables.h
@@ -4098,3 +4098,520 @@ static const int32_t linear_to_gamma[65536] =
 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
 };
+
+static const float gamma_to_linear[512] = /* two 256-entry u8 -> float LUTs */
+{
+0x0p+0, /* [0..255]: gamma-encoded u8 -> linear; presumably the sRGB curve */
+0x1.3e45677c176f7p-12,
+0x1.3e45677c176f7p-11,
+0x1.dd681b3a23272p-11,
+0x1.3e45677c176f7p-10,
+0x1.8dd6c15b1d4b4p-10,
+0x1.dd681b3a23272p-10,
+0x1.167cba8c94818p-9,
+0x1.3e45677c176f7p-9,
+0x1.660e146b9a5d5p-9,
+0x1.8dd6c15b1d4b4p-9,
+0x1.b6a31b5259c99p-9,
+0x1.e1e31d70c99ddp-9,
+0x1.07c38bf8583a9p-8,
+0x1.1fcc2beed6421p-8,
+0x1.390ffaf95e279p-8,
+0x1.53936cc7bc928p-8,
+0x1.6f5addb50c915p-8,
+0x1.8c6a94031b561p-8,
+0x1.aac6c0fb97351p-8,
+0x1.ca7381f9f602bp-8,
+0x1.eb74e160978dp-8,
+0x1.06e76bbda92b8p-7,
+0x1.18c2a5a8a8044p-7,
+0x1.2b4e09b3f0ae3p-7,
+0x1.3e8b7b3bde965p-7,
+0x1.527cd60af8b85p-7,
+0x1.6723eea8d3709p-7,
+0x1.7c8292a3db6b3p-7,
+0x1.929a88d67b521p-7,
+0x1.a96d91a8016bdp-7,
+0x1.c0fd67499fab6p-7,
+0x1.d94bbdefd740ep-7,
+0x1.f25a44089883fp-7,
+0x1.061551372c694p-6,
+0x1.135f3e4c2cce2p-6,
+0x1.210bb8642b172p-6,
+0x1.2f1b8c1ae46bdp-6,
+0x1.3d8f839b79c0bp-6,
+0x1.4c6866b3e9fa4p-6,
+0x1.5ba6fae794313p-6,
+0x1.6b4c0380d2deep-6,
+0x1.7b5841a1bf3acp-6,
+0x1.8bcc74542addbp-6,
+0x1.9ca95898dc8b5p-6,
+0x1.adefa9761c02p-6,
+0x1.bfa0200597bd9p-6,
+0x1.d1bb7381aec1fp-6,
+0x1.e442595227bcap-6,
+0x1.f73585185e1b5p-6,
+0x1.054ad45d76878p-5,
+0x1.0f31ba386ff26p-5,
+0x1.194fcb663747bp-5,
+0x1.23a55e62a662ap-5,
+0x1.2e32c8e148d11p-5,
+0x1.38f85fd21eacfp-5,
+0x1.43f67766310ffp-5,
+0x1.4f2d6313fa8dp-5,
+0x1.5a9d759ba5edp-5,
+0x1.6647010b254eep-5,
+0x1.722a56c2239eep-5,
+0x1.7e47c775d2427p-5,
+0x1.8a9fa33494b07p-5,
+0x1.973239698b9ccp-5,
+0x1.a3ffd8e001389p-5,
+0x1.b108cfc6b7fbcp-5,
+0x1.be4d6bb31d522p-5,
+0x1.cbcdf9a4616f2p-5,
+0x1.d98ac60675833p-5,
+0x1.e7841cb4f16dfp-5,
+0x1.f5ba48fde2048p-5,
+0x1.0216cad240765p-4,
+0x1.096f2671eb815p-4,
+0x1.10e65c38a5192p-4,
+0x1.187c90bf8bce2p-4,
+0x1.2031e85f5d6dap-4,
+0x1.28068731a1952p-4,
+0x1.2ffa9111cb94bp-4,
+0x1.380e299e53f92p-4,
+0x1.40417439ca10fp-4,
+0x1.4894940bddbfbp-4,
+0x1.5107ac0261e59p-4,
+0x1.599aded247aacp-4,
+0x1.624e4ef892ed4p-4,
+0x1.6b221ebb4817ep-4,
+0x1.7416702a539d1p-4,
+0x1.7d2b65206b527p-4,
+0x1.86611f43e9e6ap-4,
+0x1.8fb7c007a4a7p-4,
+0x1.992f68abbbc89p-4,
+0x1.a2c83a3e6566dp-4,
+0x1.ac82559cb3644p-4,
+0x1.b65ddb7354604p-4,
+0x1.c05aec3f4fe5ep-4,
+0x1.ca79a84ebe03p-4,
+0x1.d4ba2fc17a6a5p-4,
+0x1.df1ca289d34b8p-4,
+0x1.e9a1206d34003p-4,
+0x1.f447c904cbb4ep-4,
+0x1.ff10bbbe302c2p-4,
+0x1.04fe0bedfe5f1p-3,
+0x1.0a84fe3b36d8fp-3,
+0x1.101d443dfc06fp-3,
+0x1.15c6ed58eefdfp-3,
+0x1.1b8208da5fefp-3,
+0x1.214ea5fc9514ap-3,
+0x1.272cd3e610123p-3,
+0x1.2d1ca1a9d1cfbp-3,
+0x1.331e1e479cdf5p-3,
+0x1.393158ac3674ep-3,
+0x1.3f565fb1a5fd5p-3,
+0x1.458d421f735dfp-3,
+0x1.4bd60eaae3e73p-3,
+0x1.5230d3f736034p-3,
+0x1.589da095dbaa1p-3,
+0x1.5f1c8306b3a3cp-3,
+0x1.65ad89b841a2bp-3,
+0x1.6c50c307e53bfp-3,
+0x1.73063d420fc8p-3,
+0x1.79ce06a279303p-3,
+0x1.80a82d5453b5dp-3,
+0x1.8794bf727eb3fp-3,
+0x1.8e93cb07b8679p-3,
+0x1.95a55e0ecec0bp-3,
+0x1.9cc98672cf47ep-3,
+0x1.a400520f3619cp-3,
+0x1.ab49ceb01c003p-3,
+0x1.b2a60a1263b0ap-3,
+0x1.ba1511e3e632dp-3,
+0x1.c196f3c39e76fp-3,
+0x1.c92bbd41d41fep-3,
+0x1.d0d37be045851p-3,
+0x1.d88e3d1250f68p-3,
+0x1.e05c0e3d1d3ep-3,
+0x1.e83cfcb7c16fp-3,
+0x1.f03115cb6bfd3p-3,
+0x1.f83866b38924dp-3,
+0x1.00297e4ef4553p-2,
+0x1.044072557177ap-2,
+0x1.086115f6beb3ap-2,
+0x1.0c8b6fb5c735ep-2,
+0x1.10bf860ef039ap-2,
+0x1.14fd5f782a5a6p-2,
+0x1.1945026102997p-2,
+0x1.1d967532b31b1p-2,
+0x1.21f1be50339e7p-2,
+0x1.2656e41649ae3p-2,
+0x1.2ac5ecdb988f8p-2,
+0x1.2f3edef0b0ed8p-2,
+0x1.33c1c0a020438p-2,
+0x1.384e982e800b1p-2,
+0x1.3ce56bda84a81p-2,
+0x1.418641dd0c1bcp-2,
+0x1.463120692c7afp-2,
+0x1.4ae60dac4229dp-2,
+0x1.4fa50fcdfde15p-2,
+0x1.546e2cf0727a9p-2,
+0x1.59416b3022858p-2,
+0x1.5e1ed0a40daabp-2,
+0x1.6306635dbdd7bp-2,
+0x1.67f82969543a2p-2,
+0x1.6cf428cd96079p-2,
+0x1.71fa678bf915dp-2,
+0x1.770aeba0b042ap-2,
+0x1.7c25bb02b7ac5p-2,
+0x1.814adba3e0bd9p-2,
+0x1.867a5370de0b1p-2,
+0x1.8bb428514f067p-2,
+0x1.90f86027cb84ep-2,
+0x1.964700d1ef1b1p-2,
+0x1.9ba0102864521p-2,
+0x1.a10393feefafdp-2,
+0x1.a67192247a9bep-2,
+0x1.abea10631e195p-2,
+0x1.b16d14802d5cap-2,
+0x1.b6faa43c403bbp-2,
+0x1.bc92c5533d785p-2,
+0x1.c2357d7c64e5dp-2,
+0x1.c7e2d26a596dep-2,
+0x1.cd9ac9cb2aef2p-2,
+0x1.d35d69485ffc5p-2,
+0x1.d92ab686ff782p-2,
+0x1.df02b7279a10dp-2,
+0x1.e4e570c6539c5p-2,
+0x1.ead2e8faec526p-2,
+0x1.f0cb2558c9ea4p-2,
+0x1.f6ce2b6f00983p-2,
+0x1.fcdc00c85bec2p-2,
+0x1.017a5575b3cb2p-1,
+0x1.048c17ad3c04bp-1,
+0x1.07a349c9d9837p-1,
+0x1.0abfee888c05p-1,
+0x1.0de208a4444c8p-1,
+0x1.11099ad5e83ebp-1,
+0x1.1436a7d456eefp-1,
+0x1.176932546ca12p-1,
+0x1.1aa13d0906bdap-1,
+0x1.1ddecaa307b85p-1,
+0x1.2121ddd15aecep-1,
+0x1.246a7940f86d1p-1,
+0x1.27b89f9ce8c4bp-1,
+0x1.2b0c538e48b07p-1,
+0x1.2e6597bc4ccap-1,
+0x1.31c46ecc4528dp-1,
+0x1.3528db61a0f73p-1,
+0x1.3892e01df1fccp-1,
+0x1.3c027fa0f01ebp-1,
+0x1.3f77bc887cd3bp-1,
+0x1.42f29970a68f8p-1,
+0x1.467318f3ac22dp-1,
+0x1.49f93daa00113p-1,
+0x1.4d850a2a4bde1p-1,
+0x1.51168109734e5p-1,
+0x1.54ada4da97a1bp-1,
+0x1.584a782f1ac23p-1,
+0x1.5becfd96a2698p-1,
+0x1.5f95379f1b3edp-1,
+0x1.634328d4bbe97p-1,
+0x1.66f6d3c2081cfp-1,
+0x1.6ab03aefd39aap-1,
+0x1.6e6f60e5452b1p-1,
+0x1.72344827d98f6p-1,
+0x1.75fef33b6669bp-1,
+0x1.79cf64a21d1e2p-1,
+0x1.7da59edc8dabp-1,
+0x1.8181a469a9787p-1,
+0x1.856377c6c6224p-1,
+0x1.894b1b6fa0377p-1,
+0x1.8d3891de5df49p-1,
+0x1.912bdd8b91f45p-1,
+0x1.952500ee3dda5p-1,
+0x1.9923fe7bd4f67p-1,
+0x1.9d28d8a83edfcp-1,
+0x1.a13391e5da09fp-1,
+0x1.a5442ca57e52ep-1,
+0x1.a95aab567f88fp-1,
+0x1.ad771066afec2p-1,
+0x1.b1995e4262a69p-1,
+0x1.b5c197546e3f8p-1,
+0x1.b9efbe062f086p-1,
+0x1.be23d4bf8981bp-1,
+0x1.c25ddde6ecbbbp-1,
+0x1.c69ddbe154af1p-1,
+0x1.cae3d1124c90bp-1,
+0x1.cf2fbfdbf11f1p-1,
+0x1.d381aa9ef2e82p-1,
+0x1.d7d993ba988d4p-1,
+0x1.dc377d8cc0fd5p-1,
+0x1.e09b6a71e5aa6p-1,
+0x1.e5055cc51cbb4p-1,
+0x1.e97556e01b351p-1,
+0x1.edeb5b1b37216p-1,
+0x1.f2676bcd69adep-1,
+0x1.f6e98b4c51466p-1,
+0x1.fb71bbec33ab2p-1,
+0x1p+0,
+
+0x0p+0, /* [256..511]: entry 256 + i holds i / 255 (no gamma curve) */
+0x1.010101010101p-8,
+0x1.010101010101p-7,
+0x1.8181818181818p-7,
+0x1.010101010101p-6,
+0x1.4141414141414p-6,
+0x1.8181818181818p-6,
+0x1.c1c1c1c1c1c1cp-6,
+0x1.010101010101p-5,
+0x1.2121212121212p-5,
+0x1.4141414141414p-5,
+0x1.6161616161616p-5,
+0x1.8181818181818p-5,
+0x1.a1a1a1a1a1a1ap-5,
+0x1.c1c1c1c1c1c1cp-5,
+0x1.e1e1e1e1e1e1ep-5,
+0x1.010101010101p-4,
+0x1.1111111111111p-4,
+0x1.2121212121212p-4,
+0x1.3131313131313p-4,
+0x1.4141414141414p-4,
+0x1.5151515151515p-4,
+0x1.6161616161616p-4,
+0x1.7171717171717p-4,
+0x1.8181818181818p-4,
+0x1.9191919191919p-4,
+0x1.a1a1a1a1a1a1ap-4,
+0x1.b1b1b1b1b1b1bp-4,
+0x1.c1c1c1c1c1c1cp-4,
+0x1.d1d1d1d1d1d1dp-4,
+0x1.e1e1e1e1e1e1ep-4,
+0x1.f1f1f1f1f1f1fp-4,
+0x1.010101010101p-3,
+0x1.0909090909091p-3,
+0x1.1111111111111p-3,
+0x1.1919191919192p-3,
+0x1.2121212121212p-3,
+0x1.2929292929293p-3,
+0x1.3131313131313p-3,
+0x1.3939393939394p-3,
+0x1.4141414141414p-3,
+0x1.4949494949495p-3,
+0x1.5151515151515p-3,
+0x1.5959595959596p-3,
+0x1.6161616161616p-3,
+0x1.6969696969697p-3,
+0x1.7171717171717p-3,
+0x1.7979797979798p-3,
+0x1.8181818181818p-3,
+0x1.8989898989899p-3,
+0x1.9191919191919p-3,
+0x1.999999999999ap-3,
+0x1.a1a1a1a1a1a1ap-3,
+0x1.a9a9a9a9a9a9bp-3,
+0x1.b1b1b1b1b1b1bp-3,
+0x1.b9b9b9b9b9b9cp-3,
+0x1.c1c1c1c1c1c1cp-3,
+0x1.c9c9c9c9c9c9dp-3,
+0x1.d1d1d1d1d1d1dp-3,
+0x1.d9d9d9d9d9d9ep-3,
+0x1.e1e1e1e1e1e1ep-3,
+0x1.e9e9e9e9e9e9fp-3,
+0x1.f1f1f1f1f1f1fp-3,
+0x1.f9f9f9f9f9fap-3,
+0x1.010101010101p-2,
+0x1.050505050505p-2,
+0x1.0909090909091p-2,
+0x1.0d0d0d0d0d0d1p-2,
+0x1.1111111111111p-2,
+0x1.1515151515151p-2,
+0x1.1919191919192p-2,
+0x1.1d1d1d1d1d1d2p-2,
+0x1.2121212121212p-2,
+0x1.2525252525252p-2,
+0x1.2929292929293p-2,
+0x1.2d2d2d2d2d2d3p-2,
+0x1.3131313131313p-2,
+0x1.3535353535353p-2,
+0x1.3939393939394p-2,
+0x1.3d3d3d3d3d3d4p-2,
+0x1.4141414141414p-2,
+0x1.4545454545454p-2,
+0x1.4949494949495p-2,
+0x1.4d4d4d4d4d4d5p-2,
+0x1.5151515151515p-2,
+0x1.5555555555555p-2,
+0x1.5959595959596p-2,
+0x1.5d5d5d5d5d5d6p-2,
+0x1.6161616161616p-2,
+0x1.6565656565656p-2,
+0x1.6969696969697p-2,
+0x1.6d6d6d6d6d6d7p-2,
+0x1.7171717171717p-2,
+0x1.7575757575757p-2,
+0x1.7979797979798p-2,
+0x1.7d7d7d7d7d7d8p-2,
+0x1.8181818181818p-2,
+0x1.8585858585858p-2,
+0x1.8989898989899p-2,
+0x1.8d8d8d8d8d8d9p-2,
+0x1.9191919191919p-2,
+0x1.9595959595959p-2,
+0x1.999999999999ap-2,
+0x1.9d9d9d9d9d9dap-2,
+0x1.a1a1a1a1a1a1ap-2,
+0x1.a5a5a5a5a5a5ap-2,
+0x1.a9a9a9a9a9a9bp-2,
+0x1.adadadadadadbp-2,
+0x1.b1b1b1b1b1b1bp-2,
+0x1.b5b5b5b5b5b5bp-2,
+0x1.b9b9b9b9b9b9cp-2,
+0x1.bdbdbdbdbdbdcp-2,
+0x1.c1c1c1c1c1c1cp-2,
+0x1.c5c5c5c5c5c5cp-2,
+0x1.c9c9c9c9c9c9dp-2,
+0x1.cdcdcdcdcdcddp-2,
+0x1.d1d1d1d1d1d1dp-2,
+0x1.d5d5d5d5d5d5dp-2,
+0x1.d9d9d9d9d9d9ep-2,
+0x1.ddddddddddddep-2,
+0x1.e1e1e1e1e1e1ep-2,
+0x1.e5e5e5e5e5e5ep-2,
+0x1.e9e9e9e9e9e9fp-2,
+0x1.ededededededfp-2,
+0x1.f1f1f1f1f1f1fp-2,
+0x1.f5f5f5f5f5f5fp-2,
+0x1.f9f9f9f9f9fap-2,
+0x1.fdfdfdfdfdfep-2,
+0x1.010101010101p-1,
+0x1.030303030303p-1,
+0x1.050505050505p-1,
+0x1.070707070707p-1,
+0x1.0909090909091p-1,
+0x1.0b0b0b0b0b0b1p-1,
+0x1.0d0d0d0d0d0d1p-1,
+0x1.0f0f0f0f0f0f1p-1,
+0x1.1111111111111p-1,
+0x1.1313131313131p-1,
+0x1.1515151515151p-1,
+0x1.1717171717171p-1,
+0x1.1919191919192p-1,
+0x1.1b1b1b1b1b1b2p-1,
+0x1.1d1d1d1d1d1d2p-1,
+0x1.1f1f1f1f1f1f2p-1,
+0x1.2121212121212p-1,
+0x1.2323232323232p-1,
+0x1.2525252525252p-1,
+0x1.2727272727272p-1,
+0x1.2929292929293p-1,
+0x1.2b2b2b2b2b2b3p-1,
+0x1.2d2d2d2d2d2d3p-1,
+0x1.2f2f2f2f2f2f3p-1,
+0x1.3131313131313p-1,
+0x1.3333333333333p-1,
+0x1.3535353535353p-1,
+0x1.3737373737373p-1,
+0x1.3939393939394p-1,
+0x1.3b3b3b3b3b3b4p-1,
+0x1.3d3d3d3d3d3d4p-1,
+0x1.3f3f3f3f3f3f4p-1,
+0x1.4141414141414p-1,
+0x1.4343434343434p-1,
+0x1.4545454545454p-1,
+0x1.4747474747474p-1,
+0x1.4949494949495p-1,
+0x1.4b4b4b4b4b4b5p-1,
+0x1.4d4d4d4d4d4d5p-1,
+0x1.4f4f4f4f4f4f5p-1,
+0x1.5151515151515p-1,
+0x1.5353535353535p-1,
+0x1.5555555555555p-1,
+0x1.5757575757575p-1,
+0x1.5959595959596p-1,
+0x1.5b5b5b5b5b5b6p-1,
+0x1.5d5d5d5d5d5d6p-1,
+0x1.5f5f5f5f5f5f6p-1,
+0x1.6161616161616p-1,
+0x1.6363636363636p-1,
+0x1.6565656565656p-1,
+0x1.6767676767676p-1,
+0x1.6969696969697p-1,
+0x1.6b6b6b6b6b6b7p-1,
+0x1.6d6d6d6d6d6d7p-1,
+0x1.6f6f6f6f6f6f7p-1,
+0x1.7171717171717p-1,
+0x1.7373737373737p-1,
+0x1.7575757575757p-1,
+0x1.7777777777777p-1,
+0x1.7979797979798p-1,
+0x1.7b7b7b7b7b7b8p-1,
+0x1.7d7d7d7d7d7d8p-1,
+0x1.7f7f7f7f7f7f8p-1,
+0x1.8181818181818p-1,
+0x1.8383838383838p-1,
+0x1.8585858585858p-1,
+0x1.8787878787878p-1,
+0x1.8989898989899p-1,
+0x1.8b8b8b8b8b8b9p-1,
+0x1.8d8d8d8d8d8d9p-1,
+0x1.8f8f8f8f8f8f9p-1,
+0x1.9191919191919p-1,
+0x1.9393939393939p-1,
+0x1.9595959595959p-1,
+0x1.9797979797979p-1,
+0x1.999999999999ap-1,
+0x1.9b9b9b9b9b9bap-1,
+0x1.9d9d9d9d9d9dap-1,
+0x1.9f9f9f9f9f9fap-1,
+0x1.a1a1a1a1a1a1ap-1,
+0x1.a3a3a3a3a3a3ap-1,
+0x1.a5a5a5a5a5a5ap-1,
+0x1.a7a7a7a7a7a7ap-1,
+0x1.a9a9a9a9a9a9bp-1,
+0x1.ababababababbp-1,
+0x1.adadadadadadbp-1,
+0x1.afafafafafafbp-1,
+0x1.b1b1b1b1b1b1bp-1,
+0x1.b3b3b3b3b3b3bp-1,
+0x1.b5b5b5b5b5b5bp-1,
+0x1.b7b7b7b7b7b7bp-1,
+0x1.b9b9b9b9b9b9cp-1,
+0x1.bbbbbbbbbbbbcp-1,
+0x1.bdbdbdbdbdbdcp-1,
+0x1.bfbfbfbfbfbfcp-1,
+0x1.c1c1c1c1c1c1cp-1,
+0x1.c3c3c3c3c3c3cp-1,
+0x1.c5c5c5c5c5c5cp-1,
+0x1.c7c7c7c7c7c7cp-1,
+0x1.c9c9c9c9c9c9dp-1,
+0x1.cbcbcbcbcbcbdp-1,
+0x1.cdcdcdcdcdcddp-1,
+0x1.cfcfcfcfcfcfdp-1,
+0x1.d1d1d1d1d1d1dp-1,
+0x1.d3d3d3d3d3d3dp-1,
+0x1.d5d5d5d5d5d5dp-1,
+0x1.d7d7d7d7d7d7dp-1,
+0x1.d9d9d9d9d9d9ep-1,
+0x1.dbdbdbdbdbdbep-1,
+0x1.ddddddddddddep-1,
+0x1.dfdfdfdfdfdfep-1,
+0x1.e1e1e1e1e1e1ep-1,
+0x1.e3e3e3e3e3e3ep-1,
+0x1.e5e5e5e5e5e5ep-1,
+0x1.e7e7e7e7e7e7ep-1,
+0x1.e9e9e9e9e9e9fp-1,
+0x1.ebebebebebebfp-1,
+0x1.ededededededfp-1,
+0x1.efefefefefeffp-1,
+0x1.f1f1f1f1f1f1fp-1,
+0x1.f3f3f3f3f3f3fp-1,
+0x1.f5f5f5f5f5f5fp-1,
+0x1.f7f7f7f7f7f7fp-1,
+0x1.f9f9f9f9f9fap-1,
+0x1.fbfbfbfbfbfcp-1,
+0x1.fdfdfdfdfdfep-1,
+0x1p+0
+};
diff --git a/extensions/avx2-int8.c b/extensions/avx2-int8.c
index b6d516566..a3ded4d7a 100644
--- a/extensions/avx2-int8.c
+++ b/extensions/avx2-int8.c
@@ -338,6 +338,184 @@ conv_rgbaF_linear_rgba8_gamma (const Babl  *conversion,
 #undef CVT1
 #undef CVTA1
 
+#define CVT1(src, dst) /* gamma u8 -> linear float; advances src and dst */ \
+  (*dst++ = gamma_to_linear[*src++])
+
+#define CVTA1(src, dst) /* same, but reads the table at index + 256 */ \
+  (*dst++ = gamma_to_linear[*src++ + 256])
+
+/* Convert `samples` gamma-encoded u8 values at `src` to linear floats at
+ * `dst`, using the first 256 entries of gamma_to_linear as a lookup table.
+ *
+ * conversion: not used
+ * src:        input bytes (no alignment requirement)
+ * dst:        output floats, written with 32-byte vector stores once aligned
+ * samples:    number of values to convert
+ */
+static inline void
+conv_y8_gamma_yF_linear (const Babl    *conversion,
+                         const uint8_t *src,
+                         float         *dst,
+                         long           samples)
+{
+  /* scalar prologue: advance until `dst` is 32-byte aligned, so that the
+   * vector stores below are aligned
+   */
+  for (; samples > 0 && (uintptr_t) dst % 32 != 0; samples--)
+    {
+      *dst++ = gamma_to_linear[*src++];
+    }
+
+  /* vector body: 16 input bytes -> two gathers of 8 floats each */
+  for (; samples >= 16; samples -= 16)
+    {
+      const __m128i bytes = _mm_loadu_si128 ((const __m128i *) src);
+      /* swap the 64-bit halves so that bytes 8..15 end up in the low half */
+      const __m128i upper = _mm_shuffle_epi32 (bytes,
+                                               _MM_SHUFFLE (1, 0, 3, 2));
+      __v8sf       *out   = (__v8sf *) dst;
+
+      /* zero-extend 8 bytes to 8 int32 indices, then gather 8 floats */
+      out[0] = _mm256_i32gather_ps (gamma_to_linear,
+                                    _mm256_cvtepu8_epi32 (bytes), 4);
+      out[1] = _mm256_i32gather_ps (gamma_to_linear,
+                                    _mm256_cvtepu8_epi32 (upper), 4);
+
+      src += 16;
+      dst += 16;
+    }
+
+  /* scalar epilogue: fewer than 16 values remain */
+  for (; samples > 0; samples--)
+    {
+      *dst++ = gamma_to_linear[*src++];
+    }
+}
+
+/* Convert `samples` gamma-encoded u8 Y'A pixels at `src` to linear float YA
+ * pixels at `dst`.  Gray is looked up in the first 256 entries of
+ * gamma_to_linear, alpha in the next 256.  `conversion` is not used.
+ */
+static inline void
+conv_ya8_gamma_yaF_linear (const Babl    *conversion,
+                           const uint8_t *src,
+                           float         *dst,
+                           long           samples)
+{
+  const __m128i *src_vec;
+  __v8sf        *dst_vec;
+  /* per-lane table offsets: even lanes are gray, odd lanes are alpha */
+  const __m256i  offset = _mm256_setr_epi32 (0, 256, 0, 256,
+                                             0, 256, 0, 256);
+
+  /* convert single pixels until `dst` is 32-byte aligned */
+  for (; (uintptr_t) dst % 32 && samples > 0; samples--)
+    {
+      CVT1  (src, dst);
+      CVTA1 (src, dst);
+    }
+
+  src_vec = (const __m128i *) src;
+  dst_vec = (__v8sf        *) dst;
+
+  /* 16 input bytes = 8 pixels per iteration, gathered 4 pixels at a time */
+  for (; samples >= 8; samples -= 8)
+    {
+      __m128i i8_01 = _mm_loadu_si128 (src_vec++);
+      __m256i i32_0 = _mm256_cvtepu8_epi32 (i8_01);
+
+      /* explicit 32-bit add; GNU `+=` on __m256i would add 64-bit lanes */
+      i32_0      = _mm256_add_epi32 (i32_0, offset);
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+
+      /* swap the 64-bit halves to bring bytes 8..15 into the low half */
+      i8_01      = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));
+      i32_0      = _mm256_cvtepu8_epi32 (i8_01);
+      i32_0      = _mm256_add_epi32 (i32_0, offset);
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+    }
+
+  src = (const uint8_t *) src_vec;
+  dst = (float         *) dst_vec;
+
+  /* convert the remaining pixels one at a time */
+  for (; samples > 0; samples--)
+    {
+      CVT1  (src, dst);
+      CVTA1 (src, dst);
+    }
+}
+
+static inline void /* u8 R'G'B' -> linear float RGB */
+conv_rgb8_gamma_rgbF_linear (const Babl    *conversion,
+                             const uint8_t *src,
+                             float         *dst,
+                             long           samples)
+{ /* no alpha: all 3 * samples components go through the gray conversion */
+  conv_y8_gamma_yF_linear (conversion, src, dst, 3 * samples);
+}
+
+/* Convert `samples` gamma-encoded u8 R'G'B'A pixels at `src` to linear float
+ * RGBA pixels at `dst`.  Color is looked up in the first 256 entries of
+ * gamma_to_linear, alpha in the next 256.  `conversion` is not used.
+ */
+static inline void
+conv_rgba8_gamma_rgbaF_linear (const Babl    *conversion,
+                               const uint8_t *src,
+                               float         *dst,
+                               long           samples)
+{
+  const __m128i *src_vec;
+  __v8sf        *dst_vec;
+  /* per-lane table offsets: every fourth lane is alpha */
+  const __m256i  offset = _mm256_setr_epi32 (0, 0, 0, 256,
+                                             0, 0, 0, 256);
+
+  /* convert single pixels until `dst` is 32-byte aligned */
+  for (; (uintptr_t) dst % 32 && samples > 0; samples--)
+    {
+      CVT1  (src, dst);
+      CVT1  (src, dst);
+      CVT1  (src, dst);
+      CVTA1 (src, dst);
+    }
+
+  src_vec = (const __m128i *) src;
+  dst_vec = (__v8sf        *) dst;
+
+  /* 16 input bytes = 4 pixels per iteration, gathered 2 pixels at a time */
+  for (; samples >= 4; samples -= 4)
+    {
+      __m128i i8_01 = _mm_loadu_si128 (src_vec++);
+      __m256i i32_0 = _mm256_cvtepu8_epi32 (i8_01);
+
+      /* explicit 32-bit add; GNU `+=` on __m256i would add 64-bit lanes */
+      i32_0      = _mm256_add_epi32 (i32_0, offset);
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+
+      /* swap the 64-bit halves to bring bytes 8..15 into the low half */
+      i8_01      = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));
+      i32_0      = _mm256_cvtepu8_epi32 (i8_01);
+      i32_0      = _mm256_add_epi32 (i32_0, offset);
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+    }
+
+  src = (const uint8_t *) src_vec;
+  dst = (float         *) dst_vec;
+
+  /* convert the remaining pixels one at a time */
+  for (; samples > 0; samples--)
+    {
+      CVT1  (src, dst);
+      CVT1  (src, dst);
+      CVT1  (src, dst);
+      CVTA1 (src, dst);
+    }
+}
+
+#undef CVT1
+#undef CVTA1
+
 #endif /* defined(USE_AVX2) */
 
 int init (void);
@@ -407,6 +585,12 @@ init (void)
                            dst ## _gamma,                             \
                            "linear",                                  \
                            conv_ ## src ## _linear_ ## dst ## _gamma, \
+                           NULL);                                     \
+                                                                      \
+      babl_conversion_new (dst ## _gamma,                             \
+                           src ## _linear,                            \
+                           "linear",                                  \
+                           conv_ ## dst ## _gamma_ ## src ## _linear, \
                            NULL);                                     \
     }                                                                 \
   while (0)


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]