Commit 35d31b6ea034a981ada8e46bcd1e420bd1baed96

Authored by DepthDeluxe
1 parent 68d75003

Allocate the GPU source and destination buffers once up front instead of on every projection call

Improves performance by about 10%
Showing 1 changed file with 9 additions and 12 deletions
openbr/plugins/cuda/cudapca.cu
... ... @@ -58,6 +58,8 @@ namespace br { namespace cuda {
58 58  
59 59 float* cudaEvPtr; int _evRows; int _evCols;
60 60 float* cudaMeanPtr; int _meanElems;
  61 + float* _cudaSrcPtr;
  62 + float* _cudaDstPtr;
61 63  
62 64 void cudapca_initwrapper() {
63 65  
... ... @@ -74,6 +76,9 @@ namespace br { namespace cuda {
74 76 // copy the mean to the GPU
75 77 cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float));
76 78 cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice);
  79 +
  80 + cudaMalloc(&_cudaSrcPtr, _meanElems*sizeof(float));
  81 + cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float));
77 82 }
78 83  
79 84 void cudapca_trainwrapper() {
... ... @@ -173,27 +178,19 @@ namespace br { namespace cuda {
173 178  
174 179 void cudapca_projectwrapper(float* src, float* dst) {
175 180 // copy the image to the GPU
176   - float* cudaSrcPtr;
177   - cudaMalloc(&cudaSrcPtr, _meanElems*sizeof(float));
178   - cudaMemcpy(cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
179   -
180   - float* cudaDstPtr;
181   - cudaMalloc(&cudaDstPtr, _evCols*sizeof(float));
  181 + cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
182 182  
183 183 // subtract out the mean of the image (mean is 1xpixels in size)
184 184 int threadsPerBlock = 64;
185 185 int numBlocks = _meanElems / threadsPerBlock;
186   - cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaMeanPtr, _meanElems);
  186 + cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, cudaMeanPtr, _meanElems);
187 187  
188 188 // perform the multiplication
189 189 threadsPerBlock = 64;
190 190 numBlocks = _evCols / threadsPerBlock;
191   - cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaDstPtr, cudaEvPtr, _evRows, _evCols);
  191 + cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, _cudaDstPtr, cudaEvPtr, _evRows, _evCols);
192 192  
193 193 // copy the data back to the CPU
194   - cudaMemcpy(dst, cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
195   -
196   - cudaFree(cudaSrcPtr);
197   - cudaFree(cudaDstPtr);
  194 + cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
198 195 }
199 196 }}
... ...