diff --git a/openbr/core/boost.cpp b/openbr/core/boost.cpp
index 874f974..50ceb2d 100644
--- a/openbr/core/boost.cpp
+++ b/openbr/core/boost.cpp
@@ -1,4 +1,6 @@
 #include <opencv2/imgproc/imgproc.hpp>
+#include <openbr/core/opencvutils.h>
+#include <QDebug>
 
 #include "boost.h"
 #include "cxmisc.h"
@@ -131,10 +133,10 @@ static CvMat* cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, b
 
 //------------------------------------- FeatureEvaluator ---------------------------------------
 
-void FeatureEvaluator::init(Representation *_representation, int _maxSampleCount)
+void FeatureEvaluator::init(Representation *_representation, int maxSampleCount)
 {
     representation = _representation;
-    cls.create( (int)_maxSampleCount, 1, CV_32FC1 );
+    cls.create(maxSampleCount, 1, CV_32FC1);
 }
 
 void FeatureEvaluator::setImage(const Template &src, uchar clsLabel, int idx)
@@ -173,10 +175,13 @@ struct CascadeBoostTrainData : CvDTreeTrainData
     virtual void setData(const FeatureEvaluator* _featureEvaluator,
                          int _numSamples, int _precalcValBufSize, int _precalcIdxBufSize,
                          const CvDTreeParams& _params=CvDTreeParams());
+
+    void initVarType();
     void precalculate();
 
     virtual CvDTreeNode* subsample_data(const CvMat* _subsample_idx);
 
+    virtual const int* getBufferValues(CvDTreeNode* n, int* labelsBuf, uint64_t vi);
     virtual const int* get_class_labels(CvDTreeNode* n, int* labelsBuf);
     virtual const int* get_cv_labels(CvDTreeNode* n, int* labelsBuf);
     virtual const int* get_sample_indices(CvDTreeNode* n, int* indicesBuf);
@@ -188,8 +193,10 @@ struct CascadeBoostTrainData : CvDTreeTrainData
     virtual void free_train_data();
 
     const FeatureEvaluator* featureEvaluator;
+
     cv::Mat valCache; // precalculated feature values (CV_32FC1)
     CvMat _resp; // for casting
+
     int numPrecalcVal, numPrecalcIdx, channels;
 };
 
@@ -370,6 +377,19 @@ CascadeBoostTrainData::CascadeBoostTrainData(const FeatureEvaluator* _featureEva
     shared = true;
     set_params( _params );
     max_c_count = MAX( 2, featureEvaluator->getMaxCatCount() );
+
+    initVarType();
+
+    int maxSplitSize = cvAlign(sizeof(CvDTreeSplit) + (MAX(0,max_c_count - 33)/32)*sizeof(int),sizeof(void*));
+    int treeBlockSize = MAX((int)sizeof(CvDTreeNode)*8, maxSplitSize);
+    treeBlockSize = MAX(treeBlockSize + BlockSizeDelta, MinBlockSize);
+    tree_storage = cvCreateMemStorage( treeBlockSize );
+    node_heap = cvCreateSet( 0, sizeof(node_heap[0]), sizeof(CvDTreeNode), tree_storage );
+    split_heap = cvCreateSet( 0, sizeof(split_heap[0]), maxSplitSize, tree_storage );
+}
+
+void CascadeBoostTrainData::initVarType()
+{
     var_type = cvCreateMat( 1, var_count + 2, CV_32SC(channels) );
     if ( featureEvaluator->getMaxCatCount() > 0 )
     {
@@ -392,13 +412,6 @@ CascadeBoostTrainData::CascadeBoostTrainData(const FeatureEvaluator* _featureEva
     }
     var_type->data.i[var_count] = cat_var_count;
     var_type->data.i[var_count+1] = cat_var_count+1;
-
-    int maxSplitSize = cvAlign(sizeof(CvDTreeSplit) + (MAX(0,max_c_count - 33)/32)*sizeof(int),sizeof(void*));
-    int treeBlockSize = MAX((int)sizeof(CvDTreeNode)*8, maxSplitSize);
-    treeBlockSize = MAX(treeBlockSize + BlockSizeDelta, MinBlockSize);
-    tree_storage = cvCreateMemStorage( treeBlockSize );
-    node_heap = cvCreateSet( 0, sizeof(node_heap[0]), sizeof(CvDTreeNode), tree_storage );
-    split_heap = cvCreateSet( 0, sizeof(split_heap[0]), maxSplitSize, tree_storage );
 }
 
 CascadeBoostTrainData::CascadeBoostTrainData(const FeatureEvaluator* _featureEvaluator,
@@ -449,14 +462,20 @@ void CascadeBoostTrainData::setData( const FeatureEvaluator* _featureEvaluator,
     if (sample_count < 65536)
         is_buf_16u = true;
 
+    // 1048576 is the number of bytes in a megabyte
     numPrecalcVal = min( cvRound((double)_precalcValBufSize*1048576. / (sizeof(float)*sample_count)), var_count );
-    numPrecalcIdx = min( cvRound((double)_precalcIdxBufSize*1048576. /
-                ((is_buf_16u ? sizeof(unsigned short) : sizeof (int))*sample_count)), var_count );
+    qDebug("MB required to cache all %d features: %d", var_count, (sizeof(float)*(uint64_t)sample_count*var_count)/1048576);
+    qDebug("Features cached: %.2f", float(numPrecalcVal)/var_count);
+
+    numPrecalcIdx = min( cvRound((double)_precalcIdxBufSize*1048576. / ((is_buf_16u ? sizeof(unsigned short) : sizeof (int))*sample_count)), var_count );
+    qDebug("MB required to cache all %d sorted indices: %d", var_count, ((is_buf_16u ? sizeof(unsigned short) : sizeof (int))*(uint64_t)sample_count*var_count)/1048576);
+    qDebug("Indices cached: %.2f", float(numPrecalcIdx)/var_count);
 
     assert( numPrecalcIdx >= 0 && numPrecalcVal >= 0 );
 
     valCache.create( numPrecalcVal, sample_count, CV_32FC1 );
     var_type = cvCreateMat( 1, var_count + 2, CV_32SC(channels) );
+
     if ( featureEvaluator->getMaxCatCount() > 0 )
     {
         numPrecalcIdx = 0;
@@ -478,15 +497,18 @@ void CascadeBoostTrainData::setData( const FeatureEvaluator* _featureEvaluator,
     }
     var_type->data.i[var_count] = cat_var_count;
     var_type->data.i[var_count+1] = cat_var_count+1;
+
+    initVarType();
+
     work_var_count = ( cat_var_count ? 0 : numPrecalcIdx ) + 1/*cv_lables*/;
     buf_count = 2;
 
     buf_size = -1; // the member buf_size is obsolete
 
     effective_buf_size = (uint64)(work_var_count + 1)*(uint64)sample_count * buf_count; // this is the total size of "CvMat buf" to be allocated
+
     effective_buf_width = sample_count;
     effective_buf_height = work_var_count+1;
-
     if (effective_buf_width >= effective_buf_height)
         effective_buf_height *= buf_count;
     else
@@ -509,8 +531,7 @@ void CascadeBoostTrainData::setData( const FeatureEvaluator* _featureEvaluator,
     // now calculate the maximum size of split,
     // create memory storage that will keep nodes and splits of the decision tree
     // allocate root node and the buffer for the whole training data
-    int maxSplitSize = cvAlign(sizeof(CvDTreeSplit) +
-        (MAX(0,sample_count - 33)/32)*sizeof(int),sizeof(void*));
+    int maxSplitSize = cvAlign(sizeof(CvDTreeSplit) + (MAX(0,sample_count - 33)/32)*sizeof(int),sizeof(void*));
     int treeBlockSize = MAX((int)sizeof(CvDTreeNode)*8, maxSplitSize);
     treeBlockSize = MAX(treeBlockSize + BlockSizeDelta, MinBlockSize);
     tree_storage = cvCreateMemStorage( treeBlockSize );
@@ -527,9 +548,9 @@ void CascadeBoostTrainData::setData( const FeatureEvaluator* _featureEvaluator,
 
     // set sample labels
     if (is_buf_16u)
-        udst = (unsigned short*)(buf->data.s + work_var_count*sample_count);
+        udst = (unsigned short*)(buf->data.s + (uint64)work_var_count*sample_count);
     else
-        idst = buf->data.i + work_var_count*sample_count;
+        idst = buf->data.i + (uint64)work_var_count*sample_count;
 
     for (int si = 0; si < sample_count; si++)
     {
@@ -568,7 +589,7 @@ const int* CascadeBoostTrainData::get_class_labels( CvDTreeNode* n, int* labelsB
     int nodeSampleCount = n->sample_count;
     int rStep = CV_IS_MAT_CONT( responses->type ) ? 1 : responses->step / CV_ELEM_SIZE( responses->type );
 
-    int* sampleIndicesBuf = labelsBuf; //
+    int* sampleIndicesBuf = labelsBuf;
     const int* sampleIndices = get_sample_indices(n, sampleIndicesBuf);
     for( int si = 0; si < nodeSampleCount; si++ )
     {
@@ -578,79 +599,85 @@ const int* CascadeBoostTrainData::get_class_labels( CvDTreeNode* n, int* labelsB
     return labelsBuf;
 }
 
+const int* CascadeBoostTrainData::getBufferValues(CvDTreeNode* n, int* indicesBuf, uint64_t vi)
+{
+    const int* cat_values = 0;
+    if (!is_buf_16u)
+        cat_values = buf->data.i + n->buf_idx*get_length_subbuf() + vi*sample_count + n->offset;
+    else {
+        const unsigned short* short_values = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() + vi*sample_count + n->offset);
+        for (int i = 0; i < n->sample_count; i++)
+            indicesBuf[i] = short_values[i];
+        cat_values = indicesBuf;
+    }
+
+    return cat_values;
+}
+
 const int* CascadeBoostTrainData::get_sample_indices( CvDTreeNode* n, int* indicesBuf )
 {
-    return CvDTreeTrainData::get_cat_var_data( n, get_work_var_count(), indicesBuf );
+    return getBufferValues(n,indicesBuf,get_work_var_count());
 }
 
-const int* CascadeBoostTrainData::get_cv_labels( CvDTreeNode* n, int* labels_buf )
+const int* CascadeBoostTrainData::get_cv_labels( CvDTreeNode* n, int* indicesBuf )
 {
-    return CvDTreeTrainData::get_cat_var_data( n, get_work_var_count() - 1, labels_buf );
+    return getBufferValues(n,indicesBuf,get_work_var_count()-1);
 }
 
-void CascadeBoostTrainData::get_ord_var_data( CvDTreeNode* n, int vi, float* ordValuesBuf, int* sortedIndicesBuf,
-        const float** ordValues, const int** sortedIndices, int* sampleIndicesBuf )
+void CascadeBoostTrainData::get_ord_var_data( CvDTreeNode* n, int vi, float* ordValuesBuf, int* sortedIndicesBuf, const float** ordValues, const int** sortedIndices, int* sampleIndicesBuf )
 {
     int nodeSampleCount = n->sample_count;
+
+    // For this node, get our sample indices
     const int* sampleIndices = get_sample_indices(n, sampleIndicesBuf);
 
-    if ( vi < numPrecalcIdx )
-    {
-        if( !is_buf_16u )
-            *sortedIndices = buf->data.i + n->buf_idx*get_length_subbuf() + vi*sample_count + n->offset;
-        else
-        {
-            const unsigned short* shortIndices = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() +
-                                                    vi*sample_count + n->offset );
-            for( int i = 0; i < nodeSampleCount; i++ )
+    // For this feature (this code refers to features as values, hence vi == value index),
+    // have we precalculated (presorted) the training samples by their feature response?
+    if (vi < numPrecalcIdx) {
+        if (!is_buf_16u)
+            *sortedIndices = buf->data.i + n->buf_idx*get_length_subbuf() + (uint64)vi*sample_count + n->offset;
+        else {
+            const unsigned short* shortIndices = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() + (uint64)vi*sample_count + n->offset );
+            for (int i = 0; i < nodeSampleCount; i++)
                 sortedIndicesBuf[i] = shortIndices[i];
-
             *sortedIndices = sortedIndicesBuf;
         }
 
-        if( vi < numPrecalcVal )
-        {
-            for( int i = 0; i < nodeSampleCount; i++ )
-            {
+        // For this feature, have we precalculated all of the feature responses?
+        if (vi < numPrecalcVal) {
+            for (int i = 0; i < nodeSampleCount; i++) {
                 int idx = (*sortedIndices)[i];
                 idx = sampleIndices[idx];
-                ordValuesBuf[i] =  valCache.at<float>( vi, idx);
+                ordValuesBuf[i] = valCache.at<float>(vi, idx);
             }
-        }
-        else
-        {
-            for( int i = 0; i < nodeSampleCount; i++ )
-            {
+        } else {
+            for (int i = 0; i < nodeSampleCount; i++) {
                 int idx = (*sortedIndices)[i];
                 idx = sampleIndices[idx];
-                ordValuesBuf[i] = (*featureEvaluator)( vi, idx);
+                ordValuesBuf[i] = (*featureEvaluator)(vi, idx);
             }
         }
-    }
-    else // vi >= numPrecalcIdx
-    {
+    } else {
         cv::AutoBuffer<float> abuf(nodeSampleCount);
         float* sampleValues = &abuf[0];
 
-        if ( vi < numPrecalcVal )
-        {
-            for( int i = 0; i < nodeSampleCount; i++ )
-            {
+        if (vi < numPrecalcVal) {
+            for (int i = 0; i < nodeSampleCount; i++) {
                 sortedIndicesBuf[i] = i;
                 sampleValues[i] = valCache.at<float>( vi, sampleIndices[i] );
             }
-        }
-        else
-        {
-            for( int i = 0; i < nodeSampleCount; i++ )
-            {
+        } else {
+            for (int i = 0; i < nodeSampleCount; i++) {
                 sortedIndicesBuf[i] = i;
                 sampleValues[i] = (*featureEvaluator)( vi, sampleIndices[i]);
             }
         }
+
         icvSortIntAux( sortedIndicesBuf, nodeSampleCount, &sampleValues[0] );
-        for( int i = 0; i < nodeSampleCount; i++ )
+
+        for (int i = 0; i < nodeSampleCount; i++)
             ordValuesBuf[i] = (&sampleValues[0])[sortedIndicesBuf[i]];
+
         *sortedIndices = sortedIndicesBuf;
     }
 
@@ -660,7 +687,8 @@ void CascadeBoostTrainData::get_ord_var_data( CvDTreeNode* n, int vi, float* ord
 const int* CascadeBoostTrainData::get_cat_var_data( CvDTreeNode* n, int vi, int* catValuesBuf )
 {
     int nodeSampleCount = n->sample_count;
-    int* sampleIndicesBuf = catValuesBuf; //
+    int* sampleIndicesBuf = catValuesBuf;
+
     const int* sampleIndices = get_sample_indices(n, sampleIndicesBuf);
 
     if ( vi < numPrecalcVal )
@@ -691,112 +719,117 @@ float CascadeBoostTrainData::getVarValue( int vi, int si )
     return (*featureEvaluator)( vi, si );
 }
 
-struct FeatureIdxOnlyPrecalc : ParallelLoopBody
+struct Precalc : ParallelLoopBody
+{
+    const FeatureEvaluator* featureEvaluator;
+    int sampleCount;
+
+    Precalc(const FeatureEvaluator* featureEvaluator, int sampleCount) :
+        featureEvaluator(featureEvaluator),
+        sampleCount(sampleCount)
+    {}
+
+    virtual void operator()(const Range& range) const = 0;
+};
+
+struct IndexPrecalc : Precalc
 {
-    FeatureIdxOnlyPrecalc( const FeatureEvaluator* _featureEvaluator, CvMat* _buf, int _sample_count, bool _is_buf_16u )
+    int* idst;
+    unsigned short* udst;
+    bool isBufShort;
+
+    IndexPrecalc(const FeatureEvaluator* featureEvaluator, CvMat* buf, int sampleCount, bool isBufShort) :
+        Precalc(featureEvaluator, sampleCount),
+        isBufShort(isBufShort)
+    {
+        udst = (unsigned short*)buf->data.s;
+        idst = buf->data.i;
+    }
+
+    void setBuffer(int fi, int si) const
+    {
+        if (isBufShort) *(udst + (uint64)fi*sampleCount + si) = (unsigned short)si;
+        else            *(idst + (uint64)fi*sampleCount + si) = si;
+    }
+
+    void sortBuffer(int fi, float *valCachePtr) const
     {
-        featureEvaluator = _featureEvaluator;
-        sample_count = _sample_count;
-        udst = (unsigned short*)_buf->data.s;
-        idst = _buf->data.i;
-        is_buf_16u = _is_buf_16u;
+        if (isBufShort) icvSortUShAux(udst + (uint64)fi*sampleCount, sampleCount, valCachePtr);
+        else            icvSortIntAux(idst + (uint64)fi*sampleCount, sampleCount, valCachePtr);
     }
-    void operator()( const Range& range ) const
+
+    virtual void operator()(const Range& range) const
     {
-        cv::AutoBuffer<float> valCache(sample_count);
+        cv::AutoBuffer<float> valCache(sampleCount);
         float* valCachePtr = (float*)valCache;
-        for ( int fi = range.start; fi < range.end; fi++)
-        {
-            for( int si = 0; si < sample_count; si++ )
-            {
-                valCachePtr[si] = (*featureEvaluator)( fi, si );
-                if ( is_buf_16u )
-                    *(udst + fi*sample_count + si) = (unsigned short)si;
-                else
-                    *(idst + fi*sample_count + si) = si;
+        for (int fi = range.start; fi < range.end; fi++) {
+            for (int si = 0; si < sampleCount; si++) {
+                valCachePtr[si] = (*featureEvaluator)(fi, si);
+                setBuffer(fi, si);
             }
-            if ( is_buf_16u )
-                icvSortUShAux( udst + fi*sample_count, sample_count, valCachePtr );
-            else
-                icvSortIntAux( idst + fi*sample_count, sample_count, valCachePtr );
+            sortBuffer(fi, valCachePtr);
         }
     }
-    const FeatureEvaluator* featureEvaluator;
-    int sample_count;
-    int* idst;
-    unsigned short* udst;
-    bool is_buf_16u;
 };
 
-struct FeatureValAndIdxPrecalc : ParallelLoopBody
+struct FeatureAndIndexPrecalc : IndexPrecalc
 {
-    FeatureValAndIdxPrecalc( const FeatureEvaluator* _featureEvaluator, CvMat* _buf, Mat* _valCache, int _sample_count, bool _is_buf_16u )
-    {
-        featureEvaluator = _featureEvaluator;
-        valCache = _valCache;
-        sample_count = _sample_count;
-        udst = (unsigned short*)_buf->data.s;
-        idst = _buf->data.i;
-        is_buf_16u = _is_buf_16u;
-    }
-    void operator()( const Range& range ) const
+    Mat *valCache;
+
+    FeatureAndIndexPrecalc(const FeatureEvaluator* featureEvaluator, CvMat* buf, Mat* valCache, int sampleCount, bool isBufShort) :
+        IndexPrecalc(featureEvaluator, buf, sampleCount, isBufShort),
+        valCache(valCache)
+    {}
+
+    virtual void operator()(const Range& range) const
     {
-        for ( int fi = range.start; fi < range.end; fi++)
-        {
-            for( int si = 0; si < sample_count; si++ )
-            {
-                valCache->at<float>(fi,si) = (*featureEvaluator)( fi, si );
-                if ( is_buf_16u )
-                    *(udst + fi*sample_count + si) = (unsigned short)si;
-                else
-                    *(idst + fi*sample_count + si) = si;
+        for (int fi = range.start; fi < range.end; fi++) {
+            for (int si = 0; si < sampleCount; si++) {
+                valCache->at<float>(fi,si) = (*featureEvaluator)(fi, si);
+                setBuffer(fi, si);
             }
-            if ( is_buf_16u )
-                icvSortUShAux( udst + fi*sample_count, sample_count, valCache->ptr<float>(fi) );
-            else
-                icvSortIntAux( idst + fi*sample_count, sample_count, valCache->ptr<float>(fi) );
+            sortBuffer(fi, valCache->ptr<float>(fi));
         }
     }
-    const FeatureEvaluator* featureEvaluator;
-    Mat* valCache;
-    int sample_count;
-    int* idst;
-    unsigned short* udst;
-    bool is_buf_16u;
 };
 
-struct FeatureValOnlyPrecalc : ParallelLoopBody
+struct FeaturePrecalc : Precalc
 {
-    FeatureValOnlyPrecalc( const FeatureEvaluator* _featureEvaluator, Mat* _valCache, int _sample_count )
-    {
-        featureEvaluator = _featureEvaluator;
-        valCache = _valCache;
-        sample_count = _sample_count;
-    }
-    void operator()( const Range& range ) const
+    Mat *valCache;
+
+    FeaturePrecalc(const FeatureEvaluator* featureEvaluator, Mat* valCache, int sampleCount) :
+        Precalc(featureEvaluator, sampleCount),
+        valCache(valCache)
+    {}
+
+    virtual void operator()(const Range& range) const
     {
-        for ( int fi = range.start; fi < range.end; fi++)
-            for( int si = 0; si < sample_count; si++ )
-                valCache->at<float>(fi,si) = (*featureEvaluator)( fi, si );
+        for (int fi = range.start; fi < range.end; fi++)
+            for (int si = 0; si < sampleCount; si++)
+                valCache->at<float>(fi,si) = (*featureEvaluator)(fi, si);
     }
-    const FeatureEvaluator* featureEvaluator;
-    Mat* valCache;
-    int sample_count;
 };
 
 void CascadeBoostTrainData::precalculate()
 {
-    int minNum = MIN( numPrecalcVal, numPrecalcIdx);
+    int minPrecalc = std::min(numPrecalcVal, numPrecalcIdx);
+
+    qDebug() << "Starting precalculation...";
 
     QTime time;
     time.start();
 
-    parallel_for_( Range(numPrecalcVal, numPrecalcIdx),
-                   FeatureIdxOnlyPrecalc(featureEvaluator, buf, sample_count, is_buf_16u!=0) );
-    parallel_for_( Range(0, minNum),
-                   FeatureValAndIdxPrecalc(featureEvaluator, buf, &valCache, sample_count, is_buf_16u!=0) );
-    parallel_for_( Range(minNum, numPrecalcVal),
-                   FeatureValOnlyPrecalc(featureEvaluator, &valCache, sample_count) );
+    // Compute features and sort training samples for feature indices we are not going to cache
+    parallel_for_(Range(numPrecalcVal, numPrecalcIdx),
+                  IndexPrecalc(featureEvaluator, buf, sample_count, is_buf_16u!=0));
+
+    // Compute features and sort training samples for features indices we are going to cache
+    parallel_for_(Range(0, minPrecalc),
+                  FeatureAndIndexPrecalc(featureEvaluator, buf, &valCache, sample_count, is_buf_16u!=0));
+
+    // Compute feature values for feature indices for which we are not going to sort training samples
+    parallel_for_(Range(minPrecalc, numPrecalcVal),
+                  FeaturePrecalc(featureEvaluator, &valCache, sample_count));
 
     cout << "Precalculation time (ms): " << time.elapsed() << endl;
 }
@@ -830,89 +863,84 @@ CvDTreeNode* CascadeBoostTree::predict( int sampleIdx ) const
     return node;
 }
 
+// This function splits the training data from the parent node into training
+// data for both child nodes
 void CascadeBoostTree::split_node_data( CvDTreeNode* node )
 {
-    int n = node->sample_count, nl, nr, scount = data->sample_count;
-    char* dir = (char*)data->direction->data.ptr;
     CvDTreeNode *left = 0, *right = 0;
+
+    uint64_t nodeSampleCount = node->sample_count;
+    uint64_t sampleCount = data->sample_count;
+    uint64_t nLeft, nRight;
+
+    int workVarCount = data->get_work_var_count();
+
+    char* dir = (char*)data->direction->data.ptr;
     int* newIdx = data->split_buf->data.i;
     int newBufIdx = data->get_child_buf_idx( node );
-    int workVarCount = data->get_work_var_count();
     CvMat* buf = data->buf;
     size_t length_buf_row = data->get_length_subbuf();
-    cv::AutoBuffer<uchar> inn_buf(n*(3*sizeof(int)+sizeof(float)));
+    cv::AutoBuffer<uchar> inn_buf(nodeSampleCount*(3*sizeof(int)+sizeof(float)));
     int* tempBuf = (int*)(uchar*)inn_buf;
-    bool splitInputData;
 
     complete_node_dir(node);
 
-    for( int i = nl = nr = 0; i < n; i++ )
+    for (uint64_t i = nLeft = nRight = 0; i < nodeSampleCount; i++)
     {
         int d = dir[i];
         // initialize new indices for splitting ordered variables
-        newIdx[i] = (nl & (d-1)) | (nr & -d); // d ? ri : li
-        nr += d;
-        nl += d^1;
+        newIdx[i] = (nLeft & (d-1)) | (nRight & -d); // d ? ri : li
+        nRight += d;
+        nLeft += d^1;
     }
 
-    node->left = left = data->new_node( node, nl, newBufIdx, node->offset );
-    node->right = right = data->new_node( node, nr, newBufIdx, node->offset + nl );
+    node->left = left = data->new_node( node, nLeft, newBufIdx, node->offset );
+    node->right = right = data->new_node( node, nRight, newBufIdx, node->offset + nLeft );
 
-    splitInputData = node->depth + 1 < data->params.max_depth &&
-        (node->left->sample_count > data->params.min_sample_count ||
-        node->right->sample_count > data->params.min_sample_count);
+    bool splitInputData = node->depth + 1 < data->params.max_depth && (node->left->sample_count > data->params.min_sample_count || node->right->sample_count > data->params.min_sample_count);
 
-    // split ordered variables, keep both halves sorted.
-    for( int vi = 0; vi < ((CascadeBoostTrainData*)data)->numPrecalcIdx; vi++ )
-    {
+    const int numPreculatedIndices = ((CascadeBoostTrainData*)data)->numPrecalcIdx;
+    for (int vi = 0; vi < numPreculatedIndices; vi++) {
         int ci = data->get_var_type(vi);
         if( ci >= 0 || !splitInputData )
             continue;
 
         int n1 = node->get_num_valid(vi);
-        float *src_val_buf = (float*)(tempBuf + n);
-        int *src_sorted_idx_buf = (int*)(src_val_buf + n);
-        int *src_sample_idx_buf = src_sorted_idx_buf + n;
+        float *src_val_buf = (float*)(tempBuf + nodeSampleCount);
+        int *src_sorted_idx_buf = (int*)(src_val_buf + nodeSampleCount);
+        int *src_sample_idx_buf = src_sorted_idx_buf + nodeSampleCount;
         const int* src_sorted_idx = 0;
         const float* src_val = 0;
+
         data->get_ord_var_data(node, vi, src_val_buf, src_sorted_idx_buf, &src_val, &src_sorted_idx, src_sample_idx_buf);
 
-        for(int i = 0; i < n; i++)
+        for(uint64_t i = 0; i < nodeSampleCount; i++)
             tempBuf[i] = src_sorted_idx[i];
 
-        if (data->is_buf_16u)
-        {
+        if (data->is_buf_16u) {
             ushort *ldst, *rdst;
-            ldst = (ushort*)(buf->data.s + left->buf_idx*length_buf_row +
-                vi*scount + left->offset);
-            rdst = (ushort*)(ldst + nl);
+            ldst = (ushort*)(buf->data.s + left->buf_idx*length_buf_row + vi*sampleCount + left->offset);
+            rdst = (ushort*)(ldst + nLeft);
 
             // split sorted
-            for( int i = 0; i < n1; i++ )
-            {
+            for (int i = 0; i < n1; i++) {
                 int idx = tempBuf[i];
                 int d = dir[idx];
                 idx = newIdx[idx];
-                if (d)
-                {
+                if (d) {
                     *rdst = (ushort)idx;
                     rdst++;
-                }
-                else
-                {
+                } else {
                     *ldst = (ushort)idx;
                     ldst++;
                 }
             }
-            CV_Assert( n1 == n );
         }
         else
         {
             int *ldst, *rdst;
-            ldst = buf->data.i + left->buf_idx*length_buf_row +
-                vi*scount + left->offset;
-            rdst = buf->data.i + right->buf_idx*length_buf_row +
-                vi*scount + right->offset;
+            ldst = buf->data.i + left->buf_idx*length_buf_row + vi*sampleCount + left->offset;
+            rdst = buf->data.i + right->buf_idx*length_buf_row + vi*sampleCount + right->offset;
 
             // split sorted
             for( int i = 0; i < n1; i++ )
@@ -931,34 +959,26 @@ void CascadeBoostTree::split_node_data( CvDTreeNode* node )
                     ldst++;
                 }
             }
-            CV_Assert( n1 == n );
         }
     }
 
     // split cv_labels using newIdx relocation table
-    int *src_lbls_buf = tempBuf + n;
+    int *src_lbls_buf = tempBuf + nodeSampleCount;
     const int* src_lbls = data->get_cv_labels(node, src_lbls_buf);
 
-    for(int i = 0; i < n; i++)
+    for(uint64_t i = 0; i < nodeSampleCount; i++)
         tempBuf[i] = src_lbls[i];
 
-    if (data->is_buf_16u)
-    {
-        unsigned short *ldst = (unsigned short *)(buf->data.s + left->buf_idx*length_buf_row +
-            (workVarCount-1)*scount + left->offset);
-        unsigned short *rdst = (unsigned short *)(buf->data.s + right->buf_idx*length_buf_row +
-            (workVarCount-1)*scount + right->offset);
+    if (data->is_buf_16u) {
+        unsigned short *ldst = (unsigned short *)(buf->data.s + left->buf_idx*length_buf_row + (workVarCount-1)*sampleCount + left->offset);
+        unsigned short *rdst = (unsigned short *)(buf->data.s + right->buf_idx*length_buf_row + (workVarCount-1)*sampleCount + right->offset);
 
-        for( int i = 0; i < n; i++ )
-        {
+        for( uint64_t i = 0; i < nodeSampleCount; i++ ) {
             int idx = tempBuf[i];
-            if (dir[i])
-            {
+            if (dir[i]) {
                 *rdst = (unsigned short)idx;
                 rdst++;
-            }
-            else
-            {
+            } else {
                 *ldst = (unsigned short)idx;
                 ldst++;
             }
@@ -967,12 +987,10 @@ void CascadeBoostTree::split_node_data( CvDTreeNode* node )
     }
     else
     {
-        int *ldst = buf->data.i + left->buf_idx*length_buf_row +
-            (workVarCount-1)*scount + left->offset;
-        int *rdst = buf->data.i + right->buf_idx*length_buf_row +
-            (workVarCount-1)*scount + right->offset;
+        int *ldst = buf->data.i + left->buf_idx*length_buf_row + (workVarCount-1)*sampleCount + left->offset;
+        int *rdst = buf->data.i + right->buf_idx*length_buf_row + (workVarCount-1)*sampleCount + right->offset;
 
-        for( int i = 0; i < n; i++ )
+        for( uint64_t i = 0; i < nodeSampleCount; i++ )
         {
             int idx = tempBuf[i];
             if (dir[i])
@@ -989,28 +1007,21 @@ void CascadeBoostTree::split_node_data( CvDTreeNode* node )
     }
 
     // split sample indices
-    int *sampleIdx_src_buf = tempBuf + n;
+    int *sampleIdx_src_buf = tempBuf + nodeSampleCount;
     const int* sampleIdx_src = data->get_sample_indices(node, sampleIdx_src_buf);
 
-    for(int i = 0; i < n; i++)
+    for(uint64_t i = 0; i < nodeSampleCount; i++)
         tempBuf[i] = sampleIdx_src[i];
 
-    if (data->is_buf_16u)
-    {
-        unsigned short* ldst = (unsigned short*)(buf->data.s + left->buf_idx*length_buf_row +
-            workVarCount*scount + left->offset);
-        unsigned short* rdst = (unsigned short*)(buf->data.s + right->buf_idx*length_buf_row +
-            workVarCount*scount + right->offset);
-        for (int i = 0; i < n; i++)
-        {
+    if (data->is_buf_16u) {
+        unsigned short* ldst = (unsigned short*)(buf->data.s + left->buf_idx*length_buf_row + workVarCount*sampleCount + left->offset);
+        unsigned short* rdst = (unsigned short*)(buf->data.s + right->buf_idx*length_buf_row + workVarCount*sampleCount + right->offset);
+        for (uint64_t i = 0; i < nodeSampleCount; i++) {
             unsigned short idx = (unsigned short)tempBuf[i];
-            if (dir[i])
-            {
+            if (dir[i]) {
                 *rdst = idx;
                 rdst++;
-            }
-            else
-            {
+            } else {
                 *ldst = idx;
                 ldst++;
             }
@@ -1018,15 +1029,14 @@ void CascadeBoostTree::split_node_data( CvDTreeNode* node )
     }
     else
     {
-        int* ldst = buf->data.i + left->buf_idx*length_buf_row +
-            workVarCount*scount + left->offset;
-        int* rdst = buf->data.i + right->buf_idx*length_buf_row +
-            workVarCount*scount + right->offset;
-        for (int i = 0; i < n; i++)
+        int* ldst = buf->data.i + left->buf_idx*length_buf_row + workVarCount*sampleCount + left->offset;
+        int* rdst = buf->data.i + right->buf_idx*length_buf_row + workVarCount*sampleCount + right->offset;
+        for (uint64_t i = 0; i < nodeSampleCount; i++)
         {
             int idx = tempBuf[i];
             if (dir[i])
             {
+
                 *rdst = idx;
                 rdst++;
             }
@@ -1038,10 +1048,10 @@ void CascadeBoostTree::split_node_data( CvDTreeNode* node )
         }
     }
 
-    for( int vi = 0; vi < data->var_count; vi++ )
-    {
-        left->set_num_valid(vi, (int)(nl));
-        right->set_num_valid(vi, (int)(nr));
+    const int variableCount = data->var_count;
+    for (int vi = 0; vi < variableCount; vi++) {
+        left->set_num_valid(vi, nLeft);
+        right->set_num_valid(vi, nRight);
     }
 
     // deallocate the parent node data that is not needed anymore
@@ -1052,7 +1062,8 @@ void CascadeBoostTree::split_node_data( CvDTreeNode* node )
 
 void CascadeBoost::train(const FeatureEvaluator* _featureEvaluator,
                          int _numSamples,
-                         int _precalcValBufSize, int _precalcIdxBufSize,
+                         int _precalcValBufSize,
+                         int _precalcIdxBufSize,
                          int _channels,
                          const CascadeBoostParams& _params)
 {
@@ -1074,8 +1085,7 @@ void CascadeBoost::train(const FeatureEvaluator* _featureEvaluator,
     cout << "|  N |    HR   |    FA   |" << endl;
     cout << "+----+---------+---------+" << endl;
 
-    do
-    {
+    do {
         CascadeBoostTree* tree = new CascadeBoostTree;
         if (!tree->train( data, subsample_mask, this)) {
             delete tree;
@@ -1085,12 +1095,13 @@ void CascadeBoost::train(const FeatureEvaluator* _featureEvaluator,
         classifiers.append(tree);
         update_weights(tree);
         trim_weights();
-        if (cvCountNonZero(subsample_mask) == 0)
+        if (cvCountNonZero(subsample_mask) == 0) {
             return;
+	    }
     }
     while (!isErrDesired() && (classifiers.size() < params.weak_count));
 
-    clear();
+    //clear();
 }
 
 float CascadeBoost::predict(int sampleIdx, bool returnSum) const
@@ -1101,6 +1112,7 @@ float CascadeBoost::predict(int sampleIdx, bool returnSum) const
 
     if (!returnSum)
         sum = sum < threshold - CV_THRESHOLD_EPS ? 0.0 : 1.0;
+
     return (float)sum;
 }
 
@@ -1125,6 +1137,7 @@ void CascadeBoost::update_weights(CvBoostTree* tree)
                        ( !tree ? n*sizeof(int) : 0 );
     cv::AutoBuffer<uchar> inn_buf(inn_buf_size);
     uchar* cur_inn_buf_pos = (uchar*)inn_buf;
+
     if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )
     {
         step = CV_IS_MAT_CONT(data->responses_copy->type) ?
@@ -1133,6 +1146,7 @@ void CascadeBoost::update_weights(CvBoostTree* tree)
         sampleIdxBuf = (int*)cur_inn_buf_pos; cur_inn_buf_pos = (uchar*)(sampleIdxBuf + n);
         sampleIdx = data->get_sample_indices( data->data_root, sampleIdxBuf );
     }
+
     CvMat* buf = data->buf;
     size_t length_buf_row = data->get_length_subbuf();
     if( !tree ) // before training the first tree, initialize weights and other parameters
@@ -1156,37 +1170,26 @@ void CascadeBoost::update_weights(CvBoostTree* tree)
         weights = cvCreateMat( 1, n, CV_64F );
         subtree_weights = cvCreateMat( 1, n + 2, CV_64F );
 
-        if (data->is_buf_16u)
-        {
-            unsigned short* labels = (unsigned short*)(buf->data.s + data->data_root->buf_idx*length_buf_row +
-                data->data_root->offset + (data->work_var_count-1)*data->sample_count);
-            for( int i = 0; i < n; i++ )
-            {
-                // save original categorical responses {0,1}, convert them to {-1,1}
-                orig_response->data.i[i] = classLabels[i]*2 - 1;
-                // make all the samples active at start.
-                // later, in trim_weights() deactivate/reactive again some, if need
-                subsample_mask->data.ptr[i] = (uchar)1;
-                // make all the initial weights the same.
-                weights->data.db[i] = w0*p[classLabels[i]];
-                // set the labels to find (from within weak tree learning proc)
-                // the particular sample weight, and where to store the response.
+        // set the labels to find (from within weak tree learning proc)
+        // the particular sample weight, and where to store the response.
+        if (data->is_buf_16u) {
+            unsigned short* labels = (unsigned short*)(buf->data.s + data->data_root->buf_idx*length_buf_row + data->data_root->offset + (uint64)(data->work_var_count-1)*data->sample_count);
+            for (int i = 0; i < n; i++)
                 labels[i] = (unsigned short)i;
-            }
-        }
-        else
-        {
-            int* labels = buf->data.i + data->data_root->buf_idx*length_buf_row +
-                data->data_root->offset + (data->work_var_count-1)*data->sample_count;
-
+        } else {
+            int* labels = buf->data.i + data->data_root->buf_idx*length_buf_row + data->data_root->offset + (uint64)(data->work_var_count-1)*data->sample_count;
             for( int i = 0; i < n; i++ )
-            {
-                // save original categorical responses {0,1}, convert them to {-1,1}
-                orig_response->data.i[i] = classLabels[i]*2 - 1;
-                subsample_mask->data.ptr[i] = (uchar)1;
-                weights->data.db[i] = w0*p[classLabels[i]];
                 labels[i] = i;
-            }
+        }
+
+        for (int i = 0; i < n; i++) {
+            // save original categorical responses {0,1}, convert them to {-1,1}
+            orig_response->data.i[i] = classLabels[i]*2 - 1;
+            // make all the samples active at start.
+            // later, in trim_weights() deactivate/reactive again some, if need
+            subsample_mask->data.ptr[i] = (uchar)1;
+            // make all the initial weights the same.
+            weights->data.db[i] = w0*p[classLabels[i]];
         }
 
         if( params.boost_type == LOGIT )