Commit 35d31b6ea034a981ada8e46bcd1e420bd1baed96
1 parent
68d75003
allocate the GPU source and destination buffers only once, up front, instead of on every projection call
improves performance by about 10%
Showing
1 changed file
with
9 additions
and
12 deletions
openbr/plugins/cuda/cudapca.cu
| @@ -58,6 +58,8 @@ namespace br { namespace cuda { | @@ -58,6 +58,8 @@ namespace br { namespace cuda { | ||
| 58 | 58 | ||
| 59 | float* cudaEvPtr; int _evRows; int _evCols; | 59 | float* cudaEvPtr; int _evRows; int _evCols; |
| 60 | float* cudaMeanPtr; int _meanElems; | 60 | float* cudaMeanPtr; int _meanElems; |
| 61 | + float* _cudaSrcPtr; | ||
| 62 | + float* _cudaDstPtr; | ||
| 61 | 63 | ||
| 62 | void cudapca_initwrapper() { | 64 | void cudapca_initwrapper() { |
| 63 | 65 | ||
| @@ -74,6 +76,9 @@ namespace br { namespace cuda { | @@ -74,6 +76,9 @@ namespace br { namespace cuda { | ||
| 74 | // copy the mean to the GPU | 76 | // copy the mean to the GPU |
| 75 | cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float)); | 77 | cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float)); |
| 76 | cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice); | 78 | cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice); |
| 79 | + | ||
| 80 | + cudaMalloc(&_cudaSrcPtr, _meanElems*sizeof(float)); | ||
| 81 | + cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float)); | ||
| 77 | } | 82 | } |
| 78 | 83 | ||
| 79 | void cudapca_trainwrapper() { | 84 | void cudapca_trainwrapper() { |
| @@ -173,27 +178,19 @@ namespace br { namespace cuda { | @@ -173,27 +178,19 @@ namespace br { namespace cuda { | ||
| 173 | 178 | ||
| 174 | void cudapca_projectwrapper(float* src, float* dst) { | 179 | void cudapca_projectwrapper(float* src, float* dst) { |
| 175 | // copy the image to the GPU | 180 | // copy the image to the GPU |
| 176 | - float* cudaSrcPtr; | ||
| 177 | - cudaMalloc(&cudaSrcPtr, _meanElems*sizeof(float)); | ||
| 178 | - cudaMemcpy(cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); | ||
| 179 | - | ||
| 180 | - float* cudaDstPtr; | ||
| 181 | - cudaMalloc(&cudaDstPtr, _evCols*sizeof(float)); | 181 | + cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); |
| 182 | 182 | ||
| 183 | // subtract out the mean of the image (mean is 1xpixels in size) | 183 | // subtract out the mean of the image (mean is 1xpixels in size) |
| 184 | int threadsPerBlock = 64; | 184 | int threadsPerBlock = 64; |
| 185 | int numBlocks = _meanElems / threadsPerBlock; | 185 | int numBlocks = _meanElems / threadsPerBlock; |
| 186 | - cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaMeanPtr, _meanElems); | 186 | + cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, cudaMeanPtr, _meanElems); |
| 187 | 187 | ||
| 188 | // perform the multiplication | 188 | // perform the multiplication |
| 189 | threadsPerBlock = 64; | 189 | threadsPerBlock = 64; |
| 190 | numBlocks = _evCols / threadsPerBlock; | 190 | numBlocks = _evCols / threadsPerBlock; |
| 191 | - cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaDstPtr, cudaEvPtr, _evRows, _evCols); | 191 | + cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, _cudaDstPtr, cudaEvPtr, _evRows, _evCols); |
| 192 | 192 | ||
| 193 | // copy the data back to the CPU | 193 | // copy the data back to the CPU |
| 194 | - cudaMemcpy(dst, cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); | ||
| 195 | - | ||
| 196 | - cudaFree(cudaSrcPtr); | ||
| 197 | - cudaFree(cudaDstPtr); | 194 | + cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); |
| 198 | } | 195 | } |
| 199 | }} | 196 | }} |