[gnome-remote-desktop] hwaccel-nvidia: Add CUDA kernels for damage region detection



commit 9c16b9dc5d7a479ae5d55f317a79a77e790f6af0
Author: Pascal Nowack <Pascal Nowack gmx de>
Date:   Fri Dec 24 21:31:23 2021 +0100

    hwaccel-nvidia: Add CUDA kernels for damage region detection
    
    As already mentioned in previous commits, damage region detection on
    the GPU is much faster than damage region detection on the CPU.
    Although the CPU has a higher clock rate, the GPU has many more cores,
    which makes it more suitable for parallel tasks.
    
    The damage detection on the GPU is done in four parts.
    
    Part 1, The actual damage detection:
    The check_damaged_pixel CUDA kernel checks for each pixel whether it
    is damaged or not.
    To simplify the implementation and to avoid accessing global memory
    too often, the pixel values are read as 32-bit unsigned integers.
    Before this kernel is run, the global region_is_damaged value is set
    to 0.
    After running this kernel, gnome-remote-desktop knows whether the
    region is damaged or not.
    Since NVENC does not make use of damage regions, only the
    region_is_damaged value is retrieved when using NVENC.
    The rest of the region will still be retrieved, as clients like
    xfreerdp still use it for optimization purposes.
    This retrieval runs later, while NVENC encodes the data, as CUDA
    kernels and data transfers between the GPU and the CPU can still run
    while NVENC is encoding, thus saving time.
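    For illustration, a launch of this kernel via the CUDA driver API
    could look like the following minimal sketch. The buffer, stride and
    stream variables are assumed to be set up elsewhere, and routing
    cuMemsetD8Async and cuLaunchKernel through cuda_funcs is an
    assumption as well; the actual launch code only lands in a later
    commit.

      CUdeviceptr damage_array, current_data, previous_data;
      CUdeviceptr region_is_damaged;  /* single uint8_t on the GPU */
      uint32_t damage_array_stride, data_stride;
      uint32_t data_width, data_height;
      CUstream cu_stream;
      /* ... buffers allocated and filled elsewhere ... */

      void *args[] = { &damage_array, &region_is_damaged,
                       &current_data, &previous_data,
                       &damage_array_stride, &data_width,
                       &data_height, &data_stride };

      /* Clear the global flag, then check one pixel per thread. */
      cuda_funcs->cuMemsetD8Async (region_is_damaged, 0, 1, cu_stream);
      cuda_funcs->cuLaunchKernel (hwaccel_nvidia->cu_chk_dmg_pxl,
                                  (data_width + 31) / 32,
                                  (data_height + 31) / 32, 1,
                                  32, 32, 1, 0, cu_stream, args, NULL);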
    
    Part 2, Combining the columns:
    As gnome-remote-desktop uses damage tiles of size 64x64, especially
    for RFX (Progressive), the damage values need to be combined.
    For this, the combine_damage_array_cols kernel runs multiple times.
    First, all neighbours are combined, i.e. the damage values of pixels
    0 and 1 are combined into pixel 0, and the damage values of pixels 2
    and 3 are combined into pixel 2.
    In the next round, the combine_shift is increased, so the damage
    values of pixels 0 and 2 are combined into pixel 0, and the damage
    values of pixels 4 and 6 into pixel 4.
    The same happens to all other pixels.
    In total, the combine_damage_array_cols kernel runs 6 times, with the
    combine_shift value increased in each run.
    After this, every 64th value in a row combines the damage values of
    up to 64 pixels of that row.
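    A hypothetical host-side loop for these six runs, reusing the names
    from the sketch above (the damage array has the same width and
    height in pixels as the frame data), could look as follows. Each
    run halves the number of columns that still carry independent
    damage values.

      for (uint32_t combine_shift = 0; combine_shift < 6; combine_shift++)
        {
          /* One thread per remaining pair of columns. */
          uint32_t pair_size = 1 << (combine_shift + 1);
          uint32_t cols = (data_width + pair_size - 1) / pair_size;
          void *args[] = { &damage_array, &data_width, &data_height,
                           &damage_array_stride, &combine_shift };

          cuda_funcs->cuLaunchKernel (hwaccel_nvidia->cu_cmb_dmg_arr_cols,
                                      (cols + 31) / 32,
                                      (data_height + 31) / 32, 1,
                                      32, 32, 1, 0, cu_stream, args, NULL);
        }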
    
    Part 3, Combining the rows:
    This is pretty much the same as part 2, except that this kernel runs
    on the rows instead of the columns.
    
    Part 4, Simplifying the damage array:
    Now that every 64x64th value combines all damage values of its tile,
    the resulting damage array is simplified.
    In case the data needs to be transferred to the CPU, this drastically
    increases the speed, as the simplified damage array is very small.
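    A matching sketch for this part, including the now cheap download;
    again all names besides cu_simplify_dmg_arr are assumptions, and the
    host buffer handling is purely illustrative:

      uint32_t tiles_x = (data_width + 63) / 64;
      uint32_t tiles_y = (data_height + 63) / 64;
      uint8_t *host_damage_array = g_malloc (tiles_x * tiles_y);
      CUdeviceptr dst_damage_array;  /* tiles_x * tiles_y bytes,
                                      * allocated elsewhere */
      void *args[] = { &dst_damage_array, &damage_array,
                       &tiles_x,  /* dst stride: tightly packed */
                       &data_width, &data_height,
                       &damage_array_stride };

      /* One thread per 64x64 tile copies its combined damage value. */
      cuda_funcs->cuLaunchKernel (hwaccel_nvidia->cu_simplify_dmg_arr,
                                  (tiles_x + 31) / 32,
                                  (tiles_y + 31) / 32, 1,
                                  32, 32, 1, 0, cu_stream, args, NULL);
      /* Only tiles_x * tiles_y bytes need to cross the bus now. */
      cuda_funcs->cuMemcpyDtoHAsync (host_damage_array, dst_damage_array,
                                     tiles_x * tiles_y, cu_stream);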
    
    These CUDA kernels will later be used by the CUDA damage detection
    class.

 data/grd-cuda-damage-utils_30.ptx | 251 ++++++++++++++++++++++++++++++++++++++
 data/meson.build                  |   3 +
 src/grd-cuda-damage-utils.cu      | 138 +++++++++++++++++++++
 src/grd-hwaccel-nvidia.c          |  82 ++++++++++---
 4 files changed, 460 insertions(+), 14 deletions(-)
---
diff --git a/data/grd-cuda-damage-utils_30.ptx b/data/grd-cuda-damage-utils_30.ptx
new file mode 100644
index 00000000..d8b8e1e0
--- /dev/null
+++ b/data/grd-cuda-damage-utils_30.ptx
@@ -0,0 +1,251 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-27506705
+// Cuda compilation tools, release 10.2, V10.2.89
+// Based on LLVM 3.4svn
+//
+
+.version 6.5
+.target sm_30
+.address_size 64
+
+       // .globl       check_damaged_pixel
+
+.visible .entry check_damaged_pixel(
+       .param .u64 check_damaged_pixel_param_0,
+       .param .u64 check_damaged_pixel_param_1,
+       .param .u64 check_damaged_pixel_param_2,
+       .param .u64 check_damaged_pixel_param_3,
+       .param .u32 check_damaged_pixel_param_4,
+       .param .u32 check_damaged_pixel_param_5,
+       .param .u32 check_damaged_pixel_param_6,
+       .param .u32 check_damaged_pixel_param_7
+)
+{
+       .reg .pred      %p<5>;
+       .reg .b16       %rs<5>;
+       .reg .b32       %r<17>;
+       .reg .b64       %rd<14>;
+
+
+       ld.param.u64    %rd1, [check_damaged_pixel_param_0];
+       ld.param.u64    %rd2, [check_damaged_pixel_param_1];
+       ld.param.u64    %rd3, [check_damaged_pixel_param_2];
+       ld.param.u64    %rd4, [check_damaged_pixel_param_3];
+       ld.param.u32    %r3, [check_damaged_pixel_param_4];
+       ld.param.u32    %r5, [check_damaged_pixel_param_5];
+       ld.param.u32    %r6, [check_damaged_pixel_param_6];
+       ld.param.u32    %r4, [check_damaged_pixel_param_7];
+       mov.u32         %r7, %ntid.x;
+       mov.u32         %r8, %ctaid.x;
+       mov.u32         %r9, %tid.x;
+       mad.lo.s32      %r1, %r7, %r8, %r9;
+       mov.u32         %r10, %ntid.y;
+       mov.u32         %r11, %ctaid.y;
+       mov.u32         %r12, %tid.y;
+       mad.lo.s32      %r2, %r10, %r11, %r12;
+       setp.ge.u32     %p1, %r2, %r6;
+       setp.ge.u32     %p2, %r1, %r5;
+       or.pred         %p3, %p1, %p2;
+       @%p3 bra        BB0_4;
+
+       cvta.to.global.u64      %rd5, %rd4;
+       mad.lo.s32      %r13, %r2, %r4, %r1;
+       mul.wide.u32    %rd6, %r13, 4;
+       add.s64         %rd7, %rd5, %rd6;
+       cvta.to.global.u64      %rd8, %rd3;
+       add.s64         %rd9, %rd8, %rd6;
+       ld.global.u32   %r14, [%rd9];
+       ld.global.u32   %r15, [%rd7];
+       setp.eq.s32     %p4, %r15, %r14;
+       mov.u16         %rs4, 0;
+       @%p4 bra        BB0_3;
+
+       cvta.to.global.u64      %rd10, %rd2;
+       mov.u16         %rs4, 1;
+       st.global.u8    [%rd10], %rs4;
+
+BB0_3:
+       mad.lo.s32      %r16, %r2, %r3, %r1;
+       cvt.u64.u32     %rd11, %r16;
+       cvta.to.global.u64      %rd12, %rd1;
+       add.s64         %rd13, %rd12, %rd11;
+       st.global.u8    [%rd13], %rs4;
+
+BB0_4:
+       ret;
+}
+
+       // .globl       combine_damage_array_cols
+.visible .entry combine_damage_array_cols(
+       .param .u64 combine_damage_array_cols_param_0,
+       .param .u32 combine_damage_array_cols_param_1,
+       .param .u32 combine_damage_array_cols_param_2,
+       .param .u32 combine_damage_array_cols_param_3,
+       .param .u32 combine_damage_array_cols_param_4
+)
+{
+       .reg .pred      %p<6>;
+       .reg .b16       %rs<3>;
+       .reg .b32       %r<20>;
+       .reg .b64       %rd<7>;
+
+
+       ld.param.u64    %rd2, [combine_damage_array_cols_param_0];
+       ld.param.u32    %r5, [combine_damage_array_cols_param_1];
+       ld.param.u32    %r8, [combine_damage_array_cols_param_2];
+       ld.param.u32    %r6, [combine_damage_array_cols_param_3];
+       ld.param.u32    %r7, [combine_damage_array_cols_param_4];
+       cvta.to.global.u64      %rd1, %rd2;
+       mov.u32         %r9, %ntid.x;
+       mov.u32         %r10, %ctaid.x;
+       mov.u32         %r11, %tid.x;
+       mad.lo.s32      %r12, %r9, %r10, %r11;
+       mov.u32         %r13, %ntid.y;
+       mov.u32         %r14, %ctaid.y;
+       mov.u32         %r15, %tid.y;
+       mad.lo.s32      %r1, %r13, %r14, %r15;
+       add.s32         %r16, %r7, 1;
+       shl.b32         %r2, %r12, %r16;
+       setp.ge.u32     %p1, %r1, %r8;
+       setp.ge.u32     %p2, %r2, %r5;
+       or.pred         %p3, %p1, %p2;
+       @%p3 bra        BB1_4;
+
+       mov.u32         %r17, 1;
+       shl.b32         %r3, %r17, %r7;
+       add.s32         %r18, %r2, %r3;
+       setp.ge.u32     %p4, %r18, %r5;
+       @%p4 bra        BB1_4;
+
+       mad.lo.s32      %r4, %r1, %r6, %r2;
+       add.s32         %r19, %r4, %r3;
+       cvt.u64.u32     %rd3, %r19;
+       add.s64         %rd4, %rd1, %rd3;
+       ld.global.u8    %rs1, [%rd4];
+       setp.eq.s16     %p5, %rs1, 0;
+       @%p5 bra        BB1_4;
+
+       cvt.u64.u32     %rd5, %r4;
+       add.s64         %rd6, %rd1, %rd5;
+       mov.u16         %rs2, 1;
+       st.global.u8    [%rd6], %rs2;
+
+BB1_4:
+       ret;
+}
+
+       // .globl       combine_damage_array_rows
+.visible .entry combine_damage_array_rows(
+       .param .u64 combine_damage_array_rows_param_0,
+       .param .u32 combine_damage_array_rows_param_1,
+       .param .u32 combine_damage_array_rows_param_2,
+       .param .u32 combine_damage_array_rows_param_3,
+       .param .u32 combine_damage_array_rows_param_4
+)
+{
+       .reg .pred      %p<6>;
+       .reg .b16       %rs<3>;
+       .reg .b32       %r<21>;
+       .reg .b64       %rd<7>;
+
+
+       ld.param.u64    %rd2, [combine_damage_array_rows_param_0];
+       ld.param.u32    %r7, [combine_damage_array_rows_param_1];
+       ld.param.u32    %r4, [combine_damage_array_rows_param_2];
+       ld.param.u32    %r5, [combine_damage_array_rows_param_3];
+       ld.param.u32    %r6, [combine_damage_array_rows_param_4];
+       cvta.to.global.u64      %rd1, %rd2;
+       mov.u32         %r8, %ntid.x;
+       mov.u32         %r9, %ctaid.x;
+       mov.u32         %r10, %tid.x;
+       mad.lo.s32      %r1, %r8, %r9, %r10;
+       mov.u32         %r11, %ntid.y;
+       mov.u32         %r12, %ctaid.y;
+       mov.u32         %r13, %tid.y;
+       mad.lo.s32      %r14, %r11, %r12, %r13;
+       add.s32         %r15, %r6, 1;
+       shl.b32         %r2, %r14, %r15;
+       setp.ge.u32     %p1, %r2, %r4;
+       setp.ge.u32     %p2, %r1, %r7;
+       or.pred         %p3, %p1, %p2;
+       @%p3 bra        BB2_4;
+
+       mov.u32         %r16, 1;
+       shl.b32         %r17, %r16, %r6;
+       add.s32         %r18, %r2, %r17;
+       setp.ge.u32     %p4, %r18, %r4;
+       @%p4 bra        BB2_4;
+
+       mad.lo.s32      %r3, %r2, %r5, %r1;
+       shl.b32         %r19, %r5, %r6;
+       add.s32         %r20, %r3, %r19;
+       cvt.u64.u32     %rd3, %r20;
+       add.s64         %rd4, %rd1, %rd3;
+       ld.global.u8    %rs1, [%rd4];
+       setp.eq.s16     %p5, %rs1, 0;
+       @%p5 bra        BB2_4;
+
+       cvt.u64.u32     %rd5, %r3;
+       add.s64         %rd6, %rd1, %rd5;
+       mov.u16         %rs2, 1;
+       st.global.u8    [%rd6], %rs2;
+
+BB2_4:
+       ret;
+}
+
+       // .globl       simplify_damage_array
+.visible .entry simplify_damage_array(
+       .param .u64 simplify_damage_array_param_0,
+       .param .u64 simplify_damage_array_param_1,
+       .param .u32 simplify_damage_array_param_2,
+       .param .u32 simplify_damage_array_param_3,
+       .param .u32 simplify_damage_array_param_4,
+       .param .u32 simplify_damage_array_param_5
+)
+{
+       .reg .pred      %p<4>;
+       .reg .b16       %rs<2>;
+       .reg .b32       %r<17>;
+       .reg .b64       %rd<9>;
+
+
+       ld.param.u64    %rd1, [simplify_damage_array_param_0];
+       ld.param.u64    %rd2, [simplify_damage_array_param_1];
+       ld.param.u32    %r5, [simplify_damage_array_param_2];
+       ld.param.u32    %r7, [simplify_damage_array_param_3];
+       ld.param.u32    %r8, [simplify_damage_array_param_4];
+       ld.param.u32    %r6, [simplify_damage_array_param_5];
+       mov.u32         %r9, %ctaid.x;
+       mov.u32         %r10, %ntid.x;
+       mov.u32         %r11, %tid.x;
+       mad.lo.s32      %r1, %r10, %r9, %r11;
+       mov.u32         %r12, %ntid.y;
+       mov.u32         %r13, %ctaid.y;
+       mov.u32         %r14, %tid.y;
+       mad.lo.s32      %r2, %r12, %r13, %r14;
+       shl.b32         %r3, %r1, 6;
+       shl.b32         %r4, %r2, 6;
+       setp.ge.u32     %p1, %r4, %r8;
+       setp.ge.u32     %p2, %r3, %r7;
+       or.pred         %p3, %p1, %p2;
+       @%p3 bra        BB3_2;
+
+       cvta.to.global.u64      %rd3, %rd2;
+       mad.lo.s32      %r15, %r4, %r6, %r3;
+       mad.lo.s32      %r16, %r2, %r5, %r1;
+       cvt.u64.u32     %rd4, %r15;
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.u8    %rs1, [%rd5];
+       cvt.u64.u32     %rd6, %r16;
+       cvta.to.global.u64      %rd7, %rd1;
+       add.s64         %rd8, %rd7, %rd6;
+       st.global.u8    [%rd8], %rs1;
+
+BB3_2:
+       ret;
+}
+
+
diff --git a/data/meson.build b/data/meson.build
index ac276bf4..4077c091 100644
--- a/data/meson.build
+++ b/data/meson.build
@@ -1,4 +1,7 @@
 if have_rdp
+  install_data(['grd-cuda-damage-utils_30.ptx'],
+    install_dir: grd_datadir,
+  )
   install_data(['grd-cuda-avc-utils_30.ptx'],
     install_dir: grd_datadir,
   )
diff --git a/src/grd-cuda-damage-utils.cu b/src/grd-cuda-damage-utils.cu
new file mode 100644
index 00000000..d5913f09
--- /dev/null
+++ b/src/grd-cuda-damage-utils.cu
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2021 Pascal Nowack
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <stdint.h>
+
+extern "C"
+{
+  __global__ void
+  check_damaged_pixel (uint8_t  *damage_array,
+                       uint8_t  *region_is_damaged,
+                       uint32_t *current_data,
+                       uint32_t *previous_data,
+                       uint32_t  damage_array_stride,
+                       uint32_t  data_width,
+                       uint32_t  data_height,
+                       uint32_t  data_stride)
+  {
+    uint32_t data_pos;
+    uint8_t damaged = 0;
+    uint32_t x, y;
+
+    x = blockIdx.x * blockDim.x + threadIdx.x;
+    y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= data_width || y >= data_height)
+      return;
+
+    data_pos = y * data_stride + x;
+    if (previous_data[data_pos] != current_data[data_pos])
+      {
+        damaged = 1;
+        *region_is_damaged = 1;
+      }
+
+    damage_array[y * damage_array_stride + x] = damaged;
+  }
+
+  __global__ void
+  combine_damage_array_cols (uint8_t  *damage_array,
+                             uint32_t  damage_array_width,
+                             uint32_t  damage_array_height,
+                             uint32_t  damage_array_stride,
+                             uint32_t  combine_shift)
+  {
+    uint32_t data_pos;
+    uint32_t neighbour_offset;
+    uint32_t x, y;
+    uint32_t sx;
+
+    sx = blockIdx.x * blockDim.x + threadIdx.x;
+    y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    x = sx << combine_shift + 1;
+
+    if (x >= damage_array_width || y >= damage_array_height)
+      return;
+
+    neighbour_offset = 1 << combine_shift;
+    if (x + neighbour_offset >= damage_array_width)
+      return;
+
+    data_pos = y * damage_array_stride + x;
+    if (damage_array[data_pos + neighbour_offset])
+      damage_array[data_pos] = 1;
+  }
+
+  __global__ void
+  combine_damage_array_rows (uint8_t  *damage_array,
+                             uint32_t  damage_array_width,
+                             uint32_t  damage_array_height,
+                             uint32_t  damage_array_stride,
+                             uint32_t  combine_shift)
+  {
+    uint32_t data_pos;
+    uint32_t neighbour_offset;
+    uint32_t x, y;
+    uint32_t sy;
+
+    x = blockIdx.x * blockDim.x + threadIdx.x;
+    sy = blockIdx.y * blockDim.y + threadIdx.y;
+
+    y = sy << combine_shift + 1;
+
+    if (x >= damage_array_width || y >= damage_array_height)
+      return;
+
+    neighbour_offset = 1 << combine_shift;
+    if (y + neighbour_offset >= damage_array_height)
+      return;
+
+    data_pos = y * damage_array_stride + x;
+    if (damage_array[data_pos + neighbour_offset * damage_array_stride])
+      damage_array[data_pos] = 1;
+  }
+
+  __global__ void
+  simplify_damage_array (uint8_t  *dst_damage_array,
+                         uint8_t  *src_damage_array,
+                         uint32_t  dst_damage_array_stride,
+                         uint32_t  src_damage_array_width,
+                         uint32_t  src_damage_array_height,
+                         uint32_t  src_damage_array_stride)
+  {
+    uint32_t src_data_pos, dst_data_pos;
+    uint32_t sx, sy;
+    uint32_t x, y;
+
+    sx = blockIdx.x * blockDim.x + threadIdx.x;
+    sy = blockIdx.y * blockDim.y + threadIdx.y;
+
+    x = sx << 6;
+    y = sy << 6;
+
+    if (x >= src_damage_array_width || y >= src_damage_array_height)
+      return;
+
+    src_data_pos = y * src_damage_array_stride + x;
+    dst_data_pos = sy * dst_damage_array_stride + sx;
+
+    dst_damage_array[dst_data_pos] = src_damage_array[src_data_pos];
+  }
+}
diff --git a/src/grd-hwaccel-nvidia.c b/src/grd-hwaccel-nvidia.c
index 44862523..43dafd18 100644
--- a/src/grd-hwaccel-nvidia.c
+++ b/src/grd-hwaccel-nvidia.c
@@ -77,6 +77,12 @@ struct _GrdHwAccelNvidia
   CUcontext cu_context;
   gboolean initialized;
 
+  CUmodule cu_module_dmg_utils;
+  CUfunction cu_chk_dmg_pxl;
+  CUfunction cu_cmb_dmg_arr_cols;
+  CUfunction cu_cmb_dmg_arr_rows;
+  CUfunction cu_simplify_dmg_arr;
+
   CUmodule cu_module_avc_utils;
   CUfunction cu_bgrx_to_yuv420;
 
@@ -651,6 +657,40 @@ run_function_in_egl_thread (GrdHwAccelNvidia       *hwaccel_nvidia,
   grd_sync_point_clear (&sync_point);
 }
 
+static gboolean
+load_cuda_module (GrdHwAccelNvidia *hwaccel_nvidia,
+                  CUmodule         *module,
+                  const char       *name,
+                  const char       *ptx_instructions)
+{
+  CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
+
+  if (cuda_funcs->cuModuleLoadData (module, ptx_instructions) != CUDA_SUCCESS)
+    {
+      g_warning ("[HWAccel.CUDA] Failed to load %s module", name);
+      return FALSE;
+    }
+
+  return TRUE;
+}
+
+static gboolean
+load_cuda_function (GrdHwAccelNvidia *hwaccel_nvidia,
+                    CUfunction       *function,
+                    CUmodule          module,
+                    const char       *name)
+{
+  CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
+
+  if (cuda_funcs->cuModuleGetFunction (function, module, name) != CUDA_SUCCESS)
+    {
+      g_warning ("[HWAccel.CUDA] Failed to get kernel %s", name);
+      return FALSE;
+    }
+
+  return TRUE;
+}
+
 GrdHwAccelNvidia *
 grd_hwaccel_nvidia_new (GrdEglThread *egl_thread)
 {
@@ -661,6 +701,8 @@ grd_hwaccel_nvidia_new (GrdEglThread *egl_thread)
   unsigned int cu_device_count = 0;
   CudaFunctions *cuda_funcs;
   NvencFunctions *nvenc_funcs;
+  g_autofree char *dmg_ptx_path = NULL;
+  g_autofree char *dmg_ptx_instructions = NULL;
   g_autofree char *avc_ptx_path = NULL;
   g_autofree char *avc_ptx_instructions = NULL;
   g_autoptr (GError) error = NULL;
@@ -749,24 +791,34 @@ grd_hwaccel_nvidia_new (GrdEglThread *egl_thread)
 
   hwaccel_nvidia->initialized = TRUE;
 
+  dmg_ptx_path = g_strdup_printf ("%s/grd-cuda-damage-utils_30.ptx", GRD_DATA_DIR);
   avc_ptx_path = g_strdup_printf ("%s/grd-cuda-avc-utils_30.ptx", GRD_DATA_DIR);
-  if (!g_file_get_contents (avc_ptx_path, &avc_ptx_instructions, NULL, &error))
+
+  if (!g_file_get_contents (dmg_ptx_path, &dmg_ptx_instructions, NULL, &error) ||
+      !g_file_get_contents (avc_ptx_path, &avc_ptx_instructions, NULL, &error))
     g_error ("[HWAccel.CUDA] Failed to read PTX instructions: %s", error->message);
 
-  if (cuda_funcs->cuModuleLoadData (&hwaccel_nvidia->cu_module_avc_utils,
-                                    avc_ptx_instructions) != CUDA_SUCCESS)
-    {
-      g_warning ("[HWAccel.CUDA] Failed to load CUDA module");
-      return NULL;
-    }
+  if (!load_cuda_module (hwaccel_nvidia, &hwaccel_nvidia->cu_module_dmg_utils,
+                         "damage utils", dmg_ptx_instructions))
+    return NULL;
 
-  if (cuda_funcs->cuModuleGetFunction (&hwaccel_nvidia->cu_bgrx_to_yuv420,
-                                       hwaccel_nvidia->cu_module_avc_utils,
-                                       "convert_2x2_bgrx_area_to_yuv420_nv12") != CUDA_SUCCESS)
-    {
-      g_warning ("[HWAccel.CUDA] Failed to get AVC CUDA kernel");
-      return NULL;
-    }
+  if (!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_chk_dmg_pxl,
+                           hwaccel_nvidia->cu_module_dmg_utils, "check_damaged_pixel") ||
+      !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_cmb_dmg_arr_cols,
+                           hwaccel_nvidia->cu_module_dmg_utils, "combine_damage_array_cols") ||
+      !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_cmb_dmg_arr_rows,
+                           hwaccel_nvidia->cu_module_dmg_utils, "combine_damage_array_rows") ||
+      !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_simplify_dmg_arr,
+                           hwaccel_nvidia->cu_module_dmg_utils, "simplify_damage_array"))
+    return NULL;
+
+  if (!load_cuda_module (hwaccel_nvidia, &hwaccel_nvidia->cu_module_avc_utils,
+                         "AVC utils", avc_ptx_instructions))
+    return NULL;
+
+  if (!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_bgrx_to_yuv420,
+                           hwaccel_nvidia->cu_module_avc_utils, "convert_2x2_bgrx_area_to_yuv420_nv12"))
+    return NULL;
 
   return g_steal_pointer (&hwaccel_nvidia);
 }
@@ -788,6 +840,8 @@ grd_hwaccel_nvidia_dispose (GObject *object)
 
   g_clear_pointer (&hwaccel_nvidia->cu_module_avc_utils,
                    hwaccel_nvidia->cuda_funcs->cuModuleUnload);
+  g_clear_pointer (&hwaccel_nvidia->cu_module_dmg_utils,
+                   hwaccel_nvidia->cuda_funcs->cuModuleUnload);
 
   g_clear_pointer (&hwaccel_nvidia->cuda_lib, dlclose);
   g_clear_pointer (&hwaccel_nvidia->extra_cuda_funcs, g_free);

