-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcudaProcessFrame.cpp
executable file
·170 lines (137 loc) · 5.78 KB
/
cudaProcessFrame.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
/**
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* This example demonstrates how to use the Video Decode Library with CUDA
* bindings to interop between CUDA and DX9 textures for the purpose of post
* processing video.
*/
#include "cudaProcessFrame.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "dynlink_cuda.h" // <cuda.h>
#include "helper_cuda_drvapi.h"
#include "dynlink_builtin_types.h"
// Symbols named "constHueColorSpaceMat" and "constAlpha". updateConstantMemory_drvapi()
// resolves globals with these names in a loaded module via cuModuleGetGlobal()
// and copies host data into them.
// constHueColorSpaceMat: 9 floats for the YUV2RGB transformation; presumably
// filled from the array that setColorSpaceMatrix() writes - verify against caller.
__constant__ float constHueColorSpaceMat[9];
// NOTE(review): declared as float here, but updateConstantMemory_drvapi() copies
// a uint32 bit pattern (0xff << 24) into the global of this name - confirm which
// type the device code expects.
__constant__ float constAlpha;
// Copies the colour-space matrix and the alpha constant into the module
// globals named "constHueColorSpaceMat" and "constAlpha".
//
// Parameters:
//   module - module that is searched for the two global symbols.
//   hueCSC - host array copied into "constHueColorSpaceMat". The number of
//            bytes copied is the symbol size reported by cuModuleGetGlobal(),
//            so the array must be at least that large.
//
// Returns CUDA_SUCCESS when both symbols were found and both copies were
// accepted; otherwise the result code of the first driver call that failed.
extern "C"
CUresult updateConstantMemory_drvapi(CUmodule module, float *hueCSC)
{
    CUdeviceptr d_constHueCSC = 0, d_constAlpha = 0;
    size_t d_cscBytes = 0, d_alphaBytes = 0;

    // First grab the global device pointers from the module. A failed lookup
    // leaves the pointer/size unusable, so stop before copying anything.
    CUresult error = cuModuleGetGlobal(&d_constHueCSC, &d_cscBytes, module, "constHueColorSpaceMat");

    if (CUDA_SUCCESS != error)
    {
        fprintf(stderr, "updateConstantMemory_drvapi() cuModuleGetGlobal (constHueColorSpaceMat) failed, retval = %d\n", (int)error);
        return error;
    }

    error = cuModuleGetGlobal(&d_constAlpha, &d_alphaBytes, module, "constAlpha");

    if (CUDA_SUCCESS != error)
    {
        fprintf(stderr, "updateConstantMemory_drvapi() cuModuleGetGlobal (constAlpha) failed, retval = %d\n", (int)error);
        return error;
    }

    // Copy the matrix to the device symbol.
    error = cuMemcpyHtoD(d_constHueCSC,
                         reinterpret_cast<const void *>(hueCSC),
                         d_cscBytes);
    // Project helper from helper_cuda_drvapi.h, kept exactly where the
    // original code invoked it.
    getLastCudaDrvErrorMsg("cuMemcpyHtoD (d_constHueCSC) copy to Constant Memory failed");

    if (CUDA_SUCCESS != error)
    {
        return error;
    }

    // Alpha is uploaded as a 32-bit pattern with 0xff in the top byte.
    uint32 cudaAlpha = ((uint32)0xff << 24);

    // Never read past the local variable, even if the symbol is reported as
    // being larger than 4 bytes.
    size_t alphaCopyBytes = (d_alphaBytes < sizeof(cudaAlpha)) ? d_alphaBytes : sizeof(cudaAlpha);

    error = cuMemcpyHtoD(d_constAlpha,
                         reinterpret_cast<const void *>(&cudaAlpha),
                         alphaCopyBytes);
    getLastCudaDrvErrorMsg("cuMemcpyHtoD (constAlpha) copy to Constant Memory failed");

    return error;
}
// Writes a hue-adjusted 3x3 (9 element) colour conversion matrix into hueCSC.
//
// Parameters:
//   CSC    - selects the coefficient set: ITU601 or ITU709. For any other
//            value hueCSC is left untouched.
//   hueCSC - output array; elements [0..8] are written.
//   hue    - angle passed to sin()/cos() to rotate the coefficients.
extern "C"
void setColorSpaceMatrix(eColorSpace CSC, float *hueCSC, float hue)
{
    const float s = sin(hue);
    const float c = cos(hue);

    // Per-standard constants; the matrix layout below is shared by both.
    float firstColumn;   // elements 0, 3 and 6
    float row0Coef;      // magnitude used in elements 1 and 2
    float row1CoefA;     // first magnitude used in elements 4 and 5
    float row1CoefB;     // second magnitude used in elements 4 and 5
    float row2Coef;      // magnitude used in elements 7 and 8

    if (CSC == ITU601)
    {
        // CCIR 601
        firstColumn = 1.1644f;
        row0Coef    = 1.5960f;
        row1CoefA   = 0.3918f;
        row1CoefB   = 0.8130f;
        row2Coef    = 2.0172f;
    }
    else if (CSC == ITU709)
    {
        // CCIR 709
        firstColumn = 1.0f;
        row0Coef    = 1.57480f;
        row1CoefA   = 0.18732f;
        row1CoefB   = 0.46812f;
        row2Coef    = 1.85560f;
    }
    else
    {
        // Unknown colour space: nothing is written.
        return;
    }

    hueCSC[0] = firstColumn;
    hueCSC[1] = s * row0Coef;
    hueCSC[2] = c * row0Coef;

    hueCSC[3] = firstColumn;
    hueCSC[4] = (c * -row1CoefA) - (s * row1CoefB);
    hueCSC[5] = (s * row1CoefA) - (c * row1CoefB);

    hueCSC[6] = firstColumn;
    hueCSC[7] = c * row2Coef;
    hueCSC[8] = s * -row2Coef;
}
// Launches the NV12 to ARGB conversion kernel through the CUDA Driver API.
//
// Parameters:
//   d_srcNV12       - device pointer passed to the kernel as the source surface.
//   nSourcePitch    - pitch of the source surface, forwarded to the kernel.
//   nBytesPerSample - forwarded to the kernel as its third argument.
//   d_dstARGB       - device pointer passed to the kernel as the destination.
//   nDestPitch      - pitch of the destination surface, forwarded to the kernel.
//   width, height   - image dimensions; they size the launch grid and are
//                     forwarded to the kernel.
//   fpFunc          - kernel function handle to launch.
//   streamID        - stream the launch is issued on.
//
// Launch layout: 32x16 thread blocks. Each thread outputs 2 pixels, so the
// grid width covers width/2 columns (rounded up); the grid height covers
// height rows (rounded up).
//
// Returns the result of the launch call. A failure is also reported on stderr.
extern "C"
CUresult cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch, int nBytesPerSample,
                                 CUdeviceptr d_dstARGB, size_t nDestPitch,
                                 uint32 width, uint32 height,
                                 CUfunction fpFunc, CUstream streamID)
{
    CUresult status;

    // Each thread will output 2 pixels at a time. The grid size width is half
    // as large because of this
    dim3 block(32, 16, 1);
    dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);

#if __CUDA_API_VERSION >= 4000
    // CUDA 4.0+ Driver API: kernel parameters are passed as an array of
    // pointers, in kernel-signature order.
    void *args[] = { &d_srcNV12, &nSourcePitch, &nBytesPerSample,
                     &d_dstARGB, &nDestPitch,
                     &width, &height
                   };

    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // Older Driver API launch method (CUDA 1.0 to 3.2): parameters are written
    // one by one into the parameter buffer at explicitly aligned offsets.
    checkCudaErrors(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // device pointer for Source Surface (cuParamSetv also handles 64-bit pointers)
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_srcNV12, sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // set the bytes per sample, so this path passes the same 7 parameters in
    // the same order as the cuLaunchKernel path above
    ALIGN_OFFSET(offset, __alignof(nBytesPerSample));
    checkCudaErrors(cuParamSeti(fpFunc, offset, nBytesPerSample));
    offset += sizeof(nBytesPerSample);

    // device pointer for Destination Surface; re-align because the preceding
    // int may leave the offset unaligned for a pointer-sized value
    ALIGN_OFFSET(offset, __alignof(d_dstARGB));
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_dstARGB, sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    ALIGN_OFFSET(offset, __alignof(nDestPitch));
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nDestPitch, sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    checkCudaErrors(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    checkCudaErrors(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    checkCudaErrors(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions.
    // Assign to the existing `status`; redeclaring it here does not compile.
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n", (void *)fpFunc, (int)status);
    }

    return status;
}