Commit 35d31b6ea034a981ada8e46bcd1e420bd1baed96

Authored by DepthDeluxe
1 parent 68d75003

allocate device memory only once, at the beginning, instead of on every project call

improves performance by about 10%
Showing 1 changed file with 9 additions and 12 deletions
openbr/plugins/cuda/cudapca.cu
@@ -58,6 +58,8 @@ namespace br { namespace cuda { @@ -58,6 +58,8 @@ namespace br { namespace cuda {
58 58
59 float* cudaEvPtr; int _evRows; int _evCols; 59 float* cudaEvPtr; int _evRows; int _evCols;
60 float* cudaMeanPtr; int _meanElems; 60 float* cudaMeanPtr; int _meanElems;
  61 + float* _cudaSrcPtr;
  62 + float* _cudaDstPtr;
61 63
62 void cudapca_initwrapper() { 64 void cudapca_initwrapper() {
63 65
@@ -74,6 +76,9 @@ namespace br { namespace cuda { @@ -74,6 +76,9 @@ namespace br { namespace cuda {
74 // copy the mean to the GPU 76 // copy the mean to the GPU
75 cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float)); 77 cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float));
76 cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice); 78 cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice);
  79 +
  80 + cudaMalloc(&_cudaSrcPtr, _meanElems*sizeof(float));
  81 + cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float));
77 } 82 }
78 83
79 void cudapca_trainwrapper() { 84 void cudapca_trainwrapper() {
@@ -173,27 +178,19 @@ namespace br { namespace cuda { @@ -173,27 +178,19 @@ namespace br { namespace cuda {
173 178
174 void cudapca_projectwrapper(float* src, float* dst) { 179 void cudapca_projectwrapper(float* src, float* dst) {
175 // copy the image to the GPU 180 // copy the image to the GPU
176 - float* cudaSrcPtr;  
177 - cudaMalloc(&cudaSrcPtr, _meanElems*sizeof(float));  
178 - cudaMemcpy(cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);  
179 -  
180 - float* cudaDstPtr;  
181 - cudaMalloc(&cudaDstPtr, _evCols*sizeof(float)); 181 + cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
182 182
183 // subtract out the mean of the image (mean is 1xpixels in size) 183 // subtract out the mean of the image (mean is 1xpixels in size)
184 int threadsPerBlock = 64; 184 int threadsPerBlock = 64;
185 int numBlocks = _meanElems / threadsPerBlock; 185 int numBlocks = _meanElems / threadsPerBlock;
186 - cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaMeanPtr, _meanElems); 186 + cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, cudaMeanPtr, _meanElems);
187 187
188 // perform the multiplication 188 // perform the multiplication
189 threadsPerBlock = 64; 189 threadsPerBlock = 64;
190 numBlocks = _evCols / threadsPerBlock; 190 numBlocks = _evCols / threadsPerBlock;
191 - cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaDstPtr, cudaEvPtr, _evRows, _evCols); 191 + cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, _cudaDstPtr, cudaEvPtr, _evRows, _evCols);
192 192
193 // copy the data back to the CPU 193 // copy the data back to the CPU
194 - cudaMemcpy(dst, cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);  
195 -  
196 - cudaFree(cudaSrcPtr);  
197 - cudaFree(cudaDstPtr); 194 + cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
198 } 195 }
199 }} 196 }}