[gnome-remote-desktop] hwaccel-nvidia: Reduce global memory access in BGRX_TO_YUV420 kernel
- From: Jonas Ådahl <jadahl src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnome-remote-desktop] hwaccel-nvidia: Reduce global memory access in BGRX_TO_YUV420 kernel
- Date: Thu, 3 Mar 2022 20:42:10 +0000 (UTC)
commit 8b5367dcf1c08cf1a6b6cdfbdd33a5747ee222e2
Author: Pascal Nowack <Pascal Nowack gmx de>
Date: Mon Jan 31 17:16:06 2022 +0100
hwaccel-nvidia: Reduce global memory access in BGRX_TO_YUV420 kernel
Global memory access (accessing data on the heap) on the GPU is slow
during compute operations.
While in most situations it cannot be avoided, its usage can still be
decreased.
Since every pixel in every frame received via PipeWire has a size of
4 bytes, read each pixel in the BGRX_TO_YUV420 CUDA kernel as a single
uint32, instead of reading it three times, one uint8 per component.
Then retrieve the component values from the local uint32_t variable.
For FullHD frames on a GTX 660, this reduces the overall runtime of the
BGRX_TO_YUV420 CUDA kernel by about 140-150µs: from about 330µs to
182µs (1st quartile) and from about 412µs to 266µs (3rd quartile).
The average runtime drops from about 377µs to 228µs and the median
drops from about 394.5µs to 226µs.
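In isolation, the access-pattern change looks like this (a minimal
sketch with illustrative names such as src8, src32 and pos; the actual
kernel code is in the diff below):

    /* Before: three 1-byte loads from global memory per pixel */
    uint8_t b = src8[pos + 0];
    uint8_t g = src8[pos + 1];
    uint8_t r = src8[pos + 2];

    /* After: one 4-byte load from global memory; the components are
     * then extracted from the local variable, i.e. from a register */
    uint32_t bgrx = src32[pos];
    uint8_t b = *(((uint8_t *) &bgrx) + 0);
    uint8_t g = *(((uint8_t *) &bgrx) + 1);
    uint8_t r = *(((uint8_t *) &bgrx) + 2);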
data/grd-cuda-avc-utils_30.ptx | 272 ++++++++++++++++++++---------------------
src/grd-cuda-avc-utils.cu | 47 ++++---
src/grd-hwaccel-nvidia.c | 12 +-
3 files changed, 166 insertions(+), 165 deletions(-)
---
diff --git a/data/grd-cuda-avc-utils_30.ptx b/data/grd-cuda-avc-utils_30.ptx
index 17266d4c..b14b193c 100644
--- a/data/grd-cuda-avc-utils_30.ptx
+++ b/data/grd-cuda-avc-utils_30.ptx
@@ -19,23 +19,21 @@
.param .u16 convert_2x2_bgrx_area_to_yuv420_nv12_param_3,
.param .u16 convert_2x2_bgrx_area_to_yuv420_nv12_param_4,
.param .u16 convert_2x2_bgrx_area_to_yuv420_nv12_param_5,
- .param .u16 convert_2x2_bgrx_area_to_yuv420_nv12_param_6,
- .param .u16 convert_2x2_bgrx_area_to_yuv420_nv12_param_7
+ .param .u16 convert_2x2_bgrx_area_to_yuv420_nv12_param_6
)
{
.reg .pred %p<17>;
- .reg .b16 %rs<23>;
- .reg .b32 %r<129>;
- .reg .b64 %rd<36>;
+ .reg .b16 %rs<18>;
+ .reg .b32 %r<127>;
+ .reg .b64 %rd<40>;
ld.param.u64 %rd6, [convert_2x2_bgrx_area_to_yuv420_nv12_param_0];
ld.param.u64 %rd7, [convert_2x2_bgrx_area_to_yuv420_nv12_param_1];
ld.param.u16 %rs7, [convert_2x2_bgrx_area_to_yuv420_nv12_param_2];
ld.param.u16 %rs8, [convert_2x2_bgrx_area_to_yuv420_nv12_param_3];
- ld.param.u16 %rs9, [convert_2x2_bgrx_area_to_yuv420_nv12_param_4];
+ ld.param.u16 %rs9, [convert_2x2_bgrx_area_to_yuv420_nv12_param_5];
ld.param.u16 %rs10, [convert_2x2_bgrx_area_to_yuv420_nv12_param_6];
- ld.param.u16 %rs11, [convert_2x2_bgrx_area_to_yuv420_nv12_param_7];
mov.u32 %r30, %ntid.x;
mov.u32 %r31, %ctaid.x;
mov.u32 %r32, %tid.x;
@@ -45,10 +43,10 @@
mov.u32 %r35, %tid.y;
mad.lo.s32 %r2, %r33, %r34, %r35;
and.b32 %r36, %r1, 65535;
- ld.param.u16 %r37, [convert_2x2_bgrx_area_to_yuv420_nv12_param_5];
+ ld.param.u16 %r37, [convert_2x2_bgrx_area_to_yuv420_nv12_param_4];
shr.u32 %r38, %r37, 1;
and.b32 %r3, %r2, 65535;
- cvt.u32.u16 %r4, %rs10;
+ cvt.u32.u16 %r4, %rs9;
shr.u32 %r5, %r4, 1;
setp.ge.u32 %p1, %r3, %r5;
setp.ge.u32 %p2, %r36, %r38;
@@ -56,170 +54,168 @@
@%p3 bra BB0_10;
cvta.to.global.u64 %rd8, %rd6;
- cvt.u32.u16 %r42, %rs9;
- and.b32 %r43, %r1, 32767;
- shl.b32 %r44, %r1, 1;
- and.b32 %r6, %r44, 65534;
- mov.u32 %r45, 1;
- shl.b32 %r46, %r2, 1;
- and.b32 %r7, %r46, 65534;
- mul.lo.s32 %r47, %r7, %r42;
- cvt.u64.u32 %rd9, %r47;
- shl.b32 %r48, %r43, 3;
- cvt.u64.u32 %rd10, %r48;
+ shl.b32 %r42, %r1, 1;
+ and.b32 %r6, %r42, 65534;
+ mov.u32 %r43, 1;
+ shl.b32 %r44, %r2, 1;
+ and.b32 %r7, %r44, 65534;
+ cvt.u32.u16 %r45, %rs7;
+ mul.lo.s32 %r46, %r7, %r45;
+ cvt.u64.u32 %rd9, %r46;
+ cvt.u64.u32 %rd10, %r6;
add.s64 %rd1, %rd9, %rd10;
cvta.to.global.u64 %rd11, %rd7;
- add.s64 %rd2, %rd11, %rd1;
+ shl.b64 %rd12, %rd1, 2;
+ add.s64 %rd2, %rd11, %rd12;
setp.lt.u32 %p4, %r7, %r5;
- shl.b32 %r49, %r2, 2;
- sub.s32 %r50, %r45, %r4;
- selp.b32 %r51, 0, %r50, %p4;
- mov.u32 %r126, 0;
- add.s32 %r52, %r51, %r49;
- cvt.u64.u32 %rd12, %r52;
- and.b64 %rd13, %rd12, 65535;
- cvt.u64.u16 %rd14, %rs11;
- mul.lo.s64 %rd15, %rd13, %rd14;
- cvt.u64.u32 %rd16, %r6;
- add.s64 %rd17, %rd15, %rd16;
+ shl.b32 %r47, %r2, 2;
+ sub.s32 %r48, %r43, %r4;
+ selp.b32 %r49, 0, %r48, %p4;
+ mov.u32 %r124, 0;
+ add.s32 %r50, %r49, %r47;
+ cvt.u64.u32 %rd13, %r50;
+ and.b64 %rd14, %rd13, 65535;
+ cvt.u64.u16 %rd15, %rs10;
+ mul.lo.s64 %rd16, %rd14, %rd15;
+ add.s64 %rd17, %rd16, %rd10;
add.s64 %rd3, %rd8, %rd17;
- add.s32 %r53, %r7, 1;
- and.b32 %r54, %r53, 65535;
- setp.lt.u32 %p5, %r54, %r5;
- shl.b32 %r55, %r53, 1;
- selp.b32 %r56, 0, %r50, %p5;
- add.s32 %r57, %r56, %r55;
- cvt.u64.u32 %rd18, %r57;
+ add.s32 %r51, %r7, 1;
+ and.b32 %r52, %r51, 65535;
+ setp.lt.u32 %p5, %r52, %r5;
+ shl.b32 %r53, %r51, 1;
+ selp.b32 %r54, 0, %r48, %p5;
+ add.s32 %r55, %r54, %r53;
+ cvt.u64.u32 %rd18, %r55;
and.b64 %rd19, %rd18, 65535;
- mul.lo.s64 %rd20, %rd19, %rd14;
- add.s64 %rd21, %rd20, %rd16;
+ mul.lo.s64 %rd20, %rd19, %rd15;
+ add.s64 %rd21, %rd20, %rd10;
add.s64 %rd4, %rd8, %rd21;
- shr.u32 %r58, %r4, 2;
- setp.lt.u32 %p6, %r3, %r58;
- sub.s32 %r59, %r45, %r5;
- selp.b32 %r60, 0, %r59, %p6;
- shl.b32 %r61, %r3, 1;
- add.s32 %r62, %r61, %r60;
- cvt.u64.u32 %rd22, %r62;
+ shr.u32 %r56, %r4, 2;
+ setp.lt.u32 %p6, %r3, %r56;
+ sub.s32 %r57, %r43, %r5;
+ selp.b32 %r58, 0, %r57, %p6;
+ shl.b32 %r59, %r3, 1;
+ add.s32 %r60, %r59, %r58;
+ cvt.u64.u32 %rd22, %r60;
and.b64 %rd23, %rd22, 65535;
- cvt.u64.u16 %rd24, %rs10;
+ cvt.u64.u16 %rd24, %rs9;
add.s64 %rd25, %rd23, %rd24;
- mul.lo.s64 %rd26, %rd25, %rd14;
- add.s64 %rd27, %rd26, %rd16;
+ mul.lo.s64 %rd26, %rd25, %rd15;
+ add.s64 %rd27, %rd26, %rd10;
add.s64 %rd5, %rd8, %rd27;
- cvt.u32.u16 %r63, %rs7;
- setp.ge.u32 %p7, %r6, %r63;
- cvt.u32.u16 %r64, %rs8;
- setp.ge.u32 %p8, %r7, %r64;
- mov.u16 %rs21, 0;
+ setp.ge.u32 %p7, %r6, %r45;
+ cvt.u32.u16 %r61, %rs8;
+ setp.ge.u32 %p8, %r7, %r61;
+ mov.u16 %rs16, 0;
or.pred %p9, %p7, %p8;
- mov.u16 %rs20, %rs21;
- mov.u32 %r127, %r126;
- mov.u32 %r128, %r126;
+ mov.u16 %rs15, %rs16;
+ mov.u32 %r125, %r124;
+ mov.u32 %r126, %r124;
@%p9 bra BB0_3;
- ld.global.u8 %rs13, [%rd2];
- cvt.u32.u16 %r128, %rs13;
- ld.global.u8 %r127, [%rd2+1];
- ld.global.u8 %r126, [%rd2+2];
- mul.wide.u16 %r65, %rs13, 18;
- mad.lo.s32 %r66, %r127, 183, %r65;
- mad.lo.s32 %r67, %r126, 54, %r66;
- shr.u32 %r68, %r67, 8;
- cvt.u16.u32 %rs20, %r68;
+ ld.global.u32 %r62, [%rd2];
+ and.b32 %r126, %r62, 255;
+ bfe.u32 %r125, %r62, 8, 8;
+ bfe.u32 %r124, %r62, 16, 8;
+ mul.lo.s32 %r63, %r126, 18;
+ mad.lo.s32 %r64, %r124, 54, %r63;
+ mad.lo.s32 %r65, %r125, 183, %r64;
+ shr.u32 %r66, %r65, 8;
+ cvt.u16.u32 %rs15, %r66;
BB0_3:
- st.global.u8 [%rd3], %rs20;
+ st.global.u8 [%rd3], %rs15;
add.s32 %r14, %r6, 1;
- setp.ge.u32 %p11, %r14, %r63;
+ setp.ge.u32 %p11, %r14, %r45;
or.pred %p12, %p11, %p8;
@%p12 bra BB0_5;
- ld.global.u8 %rs15, [%rd2+4];
- cvt.u32.u16 %r71, %rs15;
- add.s32 %r128, %r71, %r128;
- ld.global.u8 %r72, [%rd2+5];
- add.s32 %r127, %r72, %r127;
- ld.global.u8 %r73, [%rd2+6];
- add.s32 %r126, %r73, %r126;
- mul.wide.u16 %r74, %rs15, 18;
- mad.lo.s32 %r75, %r72, 183, %r74;
- mad.lo.s32 %r76, %r73, 54, %r75;
- shr.u32 %r77, %r76, 8;
- cvt.u16.u32 %rs21, %r77;
+ ld.global.u32 %r69, [%rd2+4];
+ and.b32 %r70, %r69, 255;
+ add.s32 %r126, %r70, %r126;
+ bfe.u32 %r71, %r69, 8, 8;
+ add.s32 %r125, %r71, %r125;
+ bfe.u32 %r72, %r69, 16, 8;
+ add.s32 %r124, %r72, %r124;
+ mul.lo.s32 %r73, %r70, 18;
+ mad.lo.s32 %r74, %r72, 54, %r73;
+ mad.lo.s32 %r75, %r71, 183, %r74;
+ shr.u32 %r76, %r75, 8;
+ cvt.u16.u32 %rs16, %r76;
BB0_5:
- setp.lt.u32 %p13, %r6, %r63;
- st.global.u8 [%rd3+1], %rs21;
- and.b32 %r83, %r2, 32767;
- shl.b32 %r84, %r83, 1;
- add.s32 %r85, %r84, 1;
- setp.lt.u32 %p14, %r85, %r64;
+ setp.lt.u32 %p13, %r6, %r45;
+ st.global.u8 [%rd3+1], %rs16;
+ and.b32 %r82, %r2, 32767;
+ shl.b32 %r83, %r82, 1;
+ add.s32 %r84, %r83, 1;
+ setp.lt.u32 %p14, %r84, %r61;
and.pred %p15, %p13, %p14;
@%p15 bra BB0_7;
bra.uni BB0_6;
BB0_7:
- cvt.u64.u16 %rd28, %rs9;
+ cvt.u64.u16 %rd28, %rs7;
add.s64 %rd29, %rd1, %rd28;
- add.s64 %rd31, %rd11, %rd29;
- ld.global.u8 %rs18, [%rd31];
- cvt.u32.u16 %r88, %rs18;
- add.s32 %r128, %r88, %r128;
- ld.global.u8 %r89, [%rd31+1];
- add.s32 %r127, %r89, %r127;
- ld.global.u8 %r90, [%rd31+2];
- add.s32 %r126, %r90, %r126;
- mul.wide.u16 %r91, %rs18, 18;
- mad.lo.s32 %r92, %r89, 183, %r91;
- mad.lo.s32 %r93, %r90, 54, %r92;
+ shl.b64 %rd31, %rd29, 2;
+ add.s64 %rd32, %rd11, %rd31;
+ ld.global.u32 %r87, [%rd32];
+ and.b32 %r88, %r87, 255;
+ add.s32 %r126, %r88, %r126;
+ bfe.u32 %r89, %r87, 8, 8;
+ add.s32 %r125, %r89, %r125;
+ bfe.u32 %r90, %r87, 16, 8;
+ add.s32 %r124, %r90, %r124;
+ mul.lo.s32 %r91, %r88, 18;
+ mad.lo.s32 %r92, %r90, 54, %r91;
+ mad.lo.s32 %r93, %r89, 183, %r92;
shr.u32 %r94, %r93, 8;
st.global.u8 [%rd4], %r94;
- mov.u16 %rs22, 0;
+ mov.u16 %rs17, 0;
@%p11 bra BB0_9;
- add.s32 %r96, %r42, 4;
- and.b32 %r97, %r96, 65535;
- cvt.u64.u32 %rd32, %r97;
- add.s64 %rd33, %rd1, %rd32;
- add.s64 %rd35, %rd11, %rd33;
- ld.global.u8 %rs19, [%rd35];
- cvt.u32.u16 %r98, %rs19;
- add.s32 %r128, %r98, %r128;
- ld.global.u8 %r99, [%rd35+1];
- add.s32 %r127, %r99, %r127;
- ld.global.u8 %r100, [%rd35+2];
- add.s32 %r126, %r100, %r126;
- mul.wide.u16 %r101, %rs19, 18;
- mad.lo.s32 %r102, %r99, 183, %r101;
- mad.lo.s32 %r103, %r100, 54, %r102;
- shr.u32 %r104, %r103, 8;
- cvt.u16.u32 %rs22, %r104;
+ add.s64 %rd34, %rd28, 1;
+ and.b64 %rd35, %rd34, 65535;
+ add.s64 %rd36, %rd1, %rd35;
+ shl.b64 %rd38, %rd36, 2;
+ add.s64 %rd39, %rd11, %rd38;
+ ld.global.u32 %r95, [%rd39];
+ and.b32 %r96, %r95, 255;
+ add.s32 %r126, %r96, %r126;
+ bfe.u32 %r97, %r95, 8, 8;
+ add.s32 %r125, %r97, %r125;
+ bfe.u32 %r98, %r95, 16, 8;
+ add.s32 %r124, %r98, %r124;
+ mul.lo.s32 %r99, %r96, 18;
+ mad.lo.s32 %r100, %r98, 54, %r99;
+ mad.lo.s32 %r101, %r97, 183, %r100;
+ shr.u32 %r102, %r101, 8;
+ cvt.u16.u32 %rs17, %r102;
bra.uni BB0_9;
BB0_6:
- mov.u16 %rs22, 0;
- st.global.u8 [%rd4], %rs22;
+ mov.u16 %rs17, 0;
+ st.global.u8 [%rd4], %rs17;
BB0_9:
- st.global.u8 [%rd4+1], %rs22;
- bfe.u32 %r105, %r126, 2, 8;
- mul.lo.s32 %r106, %r105, -29;
- bfe.u32 %r107, %r127, 2, 8;
- mad.lo.s32 %r108, %r107, -99, %r106;
- bfe.u32 %r109, %r128, 2, 8;
- shl.b32 %r110, %r109, 7;
- add.s32 %r111, %r108, %r110;
- shr.u32 %r112, %r111, 8;
- add.s32 %r113, %r112, 128;
- st.global.u8 [%rd5], %r113;
- shl.b32 %r114, %r126, 5;
- and.b32 %r115, %r114, 32640;
- mad.lo.s32 %r116, %r107, -116, %r115;
- mad.lo.s32 %r117, %r109, -12, %r116;
- shr.u32 %r118, %r117, 8;
- add.s32 %r119, %r118, 128;
- st.global.u8 [%rd5+1], %r119;
+ st.global.u8 [%rd4+1], %rs17;
+ bfe.u32 %r103, %r124, 2, 8;
+ mul.lo.s32 %r104, %r103, -29;
+ bfe.u32 %r105, %r125, 2, 8;
+ mad.lo.s32 %r106, %r105, -99, %r104;
+ bfe.u32 %r107, %r126, 2, 8;
+ shl.b32 %r108, %r107, 7;
+ add.s32 %r109, %r106, %r108;
+ shr.u32 %r110, %r109, 8;
+ add.s32 %r111, %r110, 128;
+ st.global.u8 [%rd5], %r111;
+ shl.b32 %r112, %r124, 5;
+ and.b32 %r113, %r112, 32640;
+ mad.lo.s32 %r114, %r105, -116, %r113;
+ mad.lo.s32 %r115, %r107, -12, %r114;
+ shr.u32 %r116, %r115, 8;
+ add.s32 %r117, %r116, 128;
+ st.global.u8 [%rd5+1], %r117;
BB0_10:
ret;
diff --git a/src/grd-cuda-avc-utils.cu b/src/grd-cuda-avc-utils.cu
index 9bca1ec1..7ce7f295 100644
--- a/src/grd-cuda-avc-utils.cu
+++ b/src/grd-cuda-avc-utils.cu
@@ -73,16 +73,17 @@ extern "C"
__global__ void
convert_2x2_bgrx_area_to_yuv420_nv12 (uint8_t *dst_data,
- uint8_t *src_data,
+ uint32_t *src_data,
uint16_t src_width,
uint16_t src_height,
- uint16_t src_stride,
uint16_t aligned_width,
uint16_t aligned_height,
uint16_t aligned_stride)
{
- uint8_t *src, *dst_y0, *dst_y1, *dst_y2, *dst_y3, *dst_u, *dst_v;
+ uint8_t *dst_y0, *dst_y1, *dst_y2, *dst_y3, *dst_u, *dst_v;
+ uint32_t *src_u32;
uint16_t s0, s1, s2, s3;
+ uint32_t bgrx;
int32_t r_a, g_a, b_a;
uint8_t r, g, b;
uint16_t x_1x1, y_1x1;
@@ -102,9 +103,9 @@ extern "C"
* -------------
*/
s0 = 0;
- s1 = 4;
- s2 = src_stride;
- s3 = src_stride + 4;
+ s1 = 1;
+ s2 = src_width;
+ s3 = src_width + 1;
/*
* Technically, the correct positions for the Y data in the resulting NV12
* image would be the following:
@@ -130,7 +131,7 @@ extern "C"
x_1x1 = x_2x2 << 1;
y_1x1 = y_2x2 << 1;
- src = src_data + y_1x1 * src_stride + (x_1x1 << 2);
+ src_u32 = src_data + y_1x1 * src_width + x_1x1;
dst_y0 = dst_data +
nv12_get_interlaced_y_1x1 (y_1x1, aligned_height) * aligned_stride +
@@ -148,9 +149,11 @@ extern "C"
/* d_0 */
if (x_1x1 < src_width && y_1x1 < src_height)
{
- b_a = b = src[s0 + 0];
- g_a = g = src[s0 + 1];
- r_a = r = src[s0 + 2];
+ bgrx = src_u32[s0];
+
+ b_a = b = *(((uint8_t *) &bgrx) + 0);
+ g_a = g = *(((uint8_t *) &bgrx) + 1);
+ r_a = r = *(((uint8_t *) &bgrx) + 2);
*dst_y0 = rgb_to_y (r, g, b);
}
else
@@ -163,10 +166,12 @@ extern "C"
if (x_1x1 + 1 < src_width && y_1x1 < src_height)
{
+ bgrx = src_u32[s1];
+
/* d_1 */
- b_a += b = src[s1 + 0];
- g_a += g = src[s1 + 1];
- r_a += r = src[s1 + 2];
+ b_a += b = *(((uint8_t *) &bgrx) + 0);
+ g_a += g = *(((uint8_t *) &bgrx) + 1);
+ r_a += r = *(((uint8_t *) &bgrx) + 2);
*dst_y1 = rgb_to_y (r, g, b);
}
else
@@ -176,18 +181,22 @@ extern "C"
if (x_1x1 < src_width && y_1x1 + 1 < src_height)
{
+ bgrx = src_u32[s2];
+
/* d_2 */
- b_a += b = src[s2 + 0];
- g_a += g = src[s2 + 1];
- r_a += r = src[s2 + 2];
+ b_a += b = *(((uint8_t *) &bgrx) + 0);
+ g_a += g = *(((uint8_t *) &bgrx) + 1);
+ r_a += r = *(((uint8_t *) &bgrx) + 2);
*dst_y2 = rgb_to_y (r, g, b);
if (x_1x1 + 1 < src_width)
{
+ bgrx = src_u32[s3];
+
/* d_3 */
- b_a += b = src[s3 + 0];
- g_a += g = src[s3 + 1];
- r_a += r = src[s3 + 2];
+ b_a += b = *(((uint8_t *) &bgrx) + 0);
+ g_a += g = *(((uint8_t *) &bgrx) + 1);
+ r_a += r = *(((uint8_t *) &bgrx) + 2);
*dst_y3 = rgb_to_y (r, g, b);
}
else
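As the regenerated PTX above shows, nvcc lowers the per-component byte
accesses on the local uint32_t to bfe.u32 (bit field extract)
instructions operating on a register, not to further memory loads. An
equivalent way to write the extraction would be explicit shifts and
masks (a sketch; the patch itself uses the uint8_t-pointer casts shown
above):

    uint32_t bgrx = src_u32[s0];
    uint8_t b = (uint8_t) ((bgrx >>  0) & 0xff); /* byte 0: blue  */
    uint8_t g = (uint8_t) ((bgrx >>  8) & 0xff); /* byte 1: green */
    uint8_t r = (uint8_t) ((bgrx >> 16) & 0xff); /* byte 2: red   */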
diff --git a/src/grd-hwaccel-nvidia.c b/src/grd-hwaccel-nvidia.c
index fad95cbe..6708fb0f 100644
--- a/src/grd-hwaccel-nvidia.c
+++ b/src/grd-hwaccel-nvidia.c
@@ -411,10 +411,9 @@ grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia *hwaccel_nvidia,
NV_ENC_REGISTER_RESOURCE register_res = {0};
NV_ENC_MAP_INPUT_RESOURCE map_input_res = {0};
NV_ENC_PIC_PARAMS pic_params = {0};
- uint16_t src_stride;
unsigned int grid_dim_x, grid_dim_y, grid_dim_z;
unsigned int block_dim_x, block_dim_y, block_dim_z;
- void *args[8];
+ void *args[7];
if (!g_hash_table_lookup_extended (hwaccel_nvidia->encode_sessions,
GUINT_TO_POINTER (encode_session_id),
@@ -432,8 +431,6 @@ grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia *hwaccel_nvidia,
aligned_width * (aligned_height + aligned_height / 2)))
return FALSE;
- src_stride = src_width * 4;
-
/* Threads per blocks */
block_dim_x = 32;
block_dim_y = 8;
@@ -449,10 +446,9 @@ grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia *hwaccel_nvidia,
args[1] = &src_data;
args[2] = &src_width;
args[3] = &src_height;
- args[4] = &src_stride;
- args[5] = &aligned_width;
- args[6] = &aligned_height;
- args[7] = &aligned_width;
+ args[4] = &aligned_width;
+ args[5] = &aligned_height;
+ args[6] = &aligned_width;
if (hwaccel_nvidia->cuda_funcs->cuLaunchKernel (
hwaccel_nvidia->cu_bgrx_to_yuv420, grid_dim_x, grid_dim_y, grid_dim_z,
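For reference, cuLaunchKernel passes the kernel arguments as an array
of pointers, one per kernel parameter in declaration order, which is
why dropping src_stride shrinks args[] from 8 to 7 entries and shifts
the indices of the aligned_* arguments. The CUDA Driver API signature
is:

    CUresult cuLaunchKernel (CUfunction   f,
                             unsigned int gridDimX,
                             unsigned int gridDimY,
                             unsigned int gridDimZ,
                             unsigned int blockDimX,
                             unsigned int blockDimY,
                             unsigned int blockDimZ,
                             unsigned int sharedMemBytes,
                             CUstream     hStream,
                             void       **kernelParams, /* args[] here */
                             void       **extra);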