Commit 35d31b6ea034a981ada8e46bcd1e420bd1baed96
1 parent
68d75003
Allocate the GPU source and destination buffers once up front instead of on every projection call
Saves about 10% on performance
Showing
1 changed file
with
9 additions
and
12 deletions
openbr/plugins/cuda/cudapca.cu
| ... | ... | @@ -58,6 +58,8 @@ namespace br { namespace cuda { |
| 58 | 58 | |
| 59 | 59 | float* cudaEvPtr; int _evRows; int _evCols; |
| 60 | 60 | float* cudaMeanPtr; int _meanElems; |
| 61 | + float* _cudaSrcPtr; | |
| 62 | + float* _cudaDstPtr; | |
| 61 | 63 | |
| 62 | 64 | void cudapca_initwrapper() { |
| 63 | 65 | |
| ... | ... | @@ -74,6 +76,9 @@ namespace br { namespace cuda { |
| 74 | 76 | // copy the mean to the GPU |
| 75 | 77 | cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float)); |
| 76 | 78 | cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice); |
| 79 | + | |
| 80 | + cudaMalloc(&_cudaSrcPtr, _meanElems*sizeof(float)); | |
| 81 | + cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float)); | |
| 77 | 82 | } |
| 78 | 83 | |
| 79 | 84 | void cudapca_trainwrapper() { |
| ... | ... | @@ -173,27 +178,19 @@ namespace br { namespace cuda { |
| 173 | 178 | |
| 174 | 179 | void cudapca_projectwrapper(float* src, float* dst) { |
| 175 | 180 | // copy the image to the GPU |
| 176 | - float* cudaSrcPtr; | |
| 177 | - cudaMalloc(&cudaSrcPtr, _meanElems*sizeof(float)); | |
| 178 | - cudaMemcpy(cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); | |
| 179 | - | |
| 180 | - float* cudaDstPtr; | |
| 181 | - cudaMalloc(&cudaDstPtr, _evCols*sizeof(float)); | |
| 181 | + cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); | |
| 182 | 182 | |
| 183 | 183 | // subtract out the mean of the image (mean is 1xpixels in size) |
| 184 | 184 | int threadsPerBlock = 64; |
| 185 | 185 | int numBlocks = _meanElems / threadsPerBlock; |
| 186 | - cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaMeanPtr, _meanElems); | |
| 186 | + cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, cudaMeanPtr, _meanElems); | |
| 187 | 187 | |
| 188 | 188 | // perform the multiplication |
| 189 | 189 | threadsPerBlock = 64; |
| 190 | 190 | numBlocks = _evCols / threadsPerBlock; |
| 191 | - cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaDstPtr, cudaEvPtr, _evRows, _evCols); | |
| 191 | + cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, _cudaDstPtr, cudaEvPtr, _evRows, _evCols); | |
| 192 | 192 | |
| 193 | 193 | // copy the data back to the CPU |
| 194 | - cudaMemcpy(dst, cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); | |
| 195 | - | |
| 196 | - cudaFree(cudaSrcPtr); | |
| 197 | - cudaFree(cudaDstPtr); | |
| 194 | + cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); | |
| 198 | 195 | } |
| 199 | 196 | }} | ... | ... |