cudacvtfloat.cu
1.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
namespace br { namespace cuda { namespace cudacvtfloat {

  /**
   * Converts one element of a row-major 8-bit buffer to float.
   *
   * Launch layout: 2D grid of 2D blocks. The x dimension indexes columns and
   * the y dimension indexes rows; each thread handles at most one element.
   * Threads whose (row, col) falls outside rows x cols exit without touching
   * memory, so the grid may overshoot the image size.
   *
   * @param src   device pointer to rows*cols unsigned chars (read only)
   * @param dst   device pointer to rows*cols floats (written)
   * @param rows  number of rows
   * @param cols  number of columns; also the row stride, in elements, of both buffers
   */
  __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
    // position of this thread inside the 2D launch
    const int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
    const int colInd = blockIdx.x*blockDim.x + threadIdx.x;

    // bounds check: the grid is rounded up to whole blocks
    if (rowInd >= rows || colInd >= cols) {
      return;
    }

    // widen before multiplying so rows*cols above INT_MAX cannot wrap the index
    const size_t index = (size_t)rowInd*(size_t)cols + (size_t)colInd;
    dst[index] = (float)src[index];
  }

  /**
   * Host entry point: uploads src to the device, runs the conversion kernel,
   * copies the float result into dst and releases the device buffers.
   *
   * The function returns void, so a CUDA failure cannot be reported to the
   * caller. On the first failing step the remaining steps are skipped and the
   * device buffers are still freed; dst is then left without a valid result.
   * Non-positive rows or cols is treated as "nothing to do".
   *
   * @param src   host pointer to rows*cols unsigned chars
   * @param dst   host pointer with room for rows*cols floats
   * @param rows  number of rows
   * @param cols  number of columns
   */
  void wrapper(const unsigned char* src, float* dst, int rows, int cols) {
    // nothing to convert; also avoids a zero-sized grid below
    if (rows <= 0 || cols <= 0) {
      return;
    }

    // element count in size_t so the byte sizes cannot overflow int
    const size_t count = (size_t)rows * (size_t)cols;
    const size_t srcBytes = count * sizeof(unsigned char);
    const size_t dstBytes = count * sizeof(float);

    unsigned char* cudaSrc = 0;
    float* cudaDst = 0;

    cudaError_t err = cudaMalloc(&cudaSrc, srcBytes);
    if (err == cudaSuccess) {
      err = cudaMalloc(&cudaDst, dstBytes);
    }
    if (err == cudaSuccess) {
      err = cudaMemcpy(cudaSrc, src, srcBytes, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
      const dim3 threadsPerBlock(8, 8);
      // ceiling division: just enough blocks to cover every column and row
      const dim3 blocks(
        (cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
        (rows + threadsPerBlock.y - 1) / threadsPerBlock.y
      );

      // execution configuration order is <<<grid, block>>>
      kernel<<<blocks, threadsPerBlock>>>(cudaSrc, cudaDst, rows, cols);

      // a kernel launch returns nothing; a rejected configuration shows up here
      err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
      // copy the data back to the destination
      err = cudaMemcpy(dst, cudaDst, dstBytes, cudaMemcpyDeviceToHost);
    }

    // always release the device buffers; cudaFree on a null pointer does nothing
    cudaFree(cudaDst);
    cudaFree(cudaSrc);

    (void)err;  // no channel to report the status through the void interface
  }

}}}