1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 
31 #ifndef OPENCV_FLANN_KDTREE_INDEX_H_
32 #define OPENCV_FLANN_KDTREE_INDEX_H_
33 
34 #include <algorithm>
35 #include <map>
36 #include <cassert>
37 #include <cstring>
38 
39 #include "general.h"
40 #include "nn_index.h"
41 #include "dynamic_bitset.h"
42 #include "matrix.h"
43 #include "result_set.h"
44 #include "heap.h"
45 #include "allocator.h"
46 #include "random.h"
47 #include "saving.h"
48 
49 
50 namespace cvflann
51 {
52 
53 struct KDTreeIndexParams : public IndexParams
54 {
55     KDTreeIndexParams(int trees = 4)
56     {
57         (*this)["algorithm"] = FLANN_INDEX_KDTREE;
58         (*this)["trees"] = trees;
59     }
60 };
61 
62 
63 /**
64  * Randomized kd-tree index
65  *
66  * Contains the k-d trees and other information for indexing a set of points
67  * for nearest-neighbor matching.
68  */
69 template <typename Distance>
70 class KDTreeIndex : public NNIndex<Distance>
71 {
72 public:
73     typedef typename Distance::ElementType ElementType;
74     typedef typename Distance::ResultType DistanceType;
75 
76 
77     /**
78      * KDTree constructor
79      *
80      * Params:
81      *          inputData = dataset with the input features
82      *          params = parameters passed to the kdtree algorithm
83      */
84     KDTreeIndex(const Matrix<ElementType>& inputData, const IndexParams& params = KDTreeIndexParams(),
85                 Distance d = Distance() ) :
dataset_(inputData)86         dataset_(inputData), index_params_(params), distance_(d)
87     {
88         size_ = dataset_.rows;
89         veclen_ = dataset_.cols;
90 
91         trees_ = get_param(index_params_,"trees",4);
92         tree_roots_ = new NodePtr[trees_];
93 
94         // Create a permutable array of indices to the input vectors.
95         vind_.resize(size_);
96         for (size_t i = 0; i < size_; ++i) {
97             vind_[i] = int(i);
98         }
99 
100         mean_ = new DistanceType[veclen_];
101         var_ = new DistanceType[veclen_];
102     }
103 
104 
105     KDTreeIndex(const KDTreeIndex&);
106     KDTreeIndex& operator=(const KDTreeIndex&);
107 
108     /**
109      * Standard destructor
110      */
~KDTreeIndex()111     ~KDTreeIndex()
112     {
113         if (tree_roots_!=NULL) {
114             delete[] tree_roots_;
115         }
116         delete[] mean_;
117         delete[] var_;
118     }
119 
120     /**
121      * Builds the index
122      */
buildIndex()123     void buildIndex() CV_OVERRIDE
124     {
125         /* Construct the randomized trees. */
126         for (int i = 0; i < trees_; i++) {
127             /* Randomize the order of vectors to allow for unbiased sampling. */
128 #ifndef OPENCV_FLANN_USE_STD_RAND
129             cv::randShuffle(vind_);
130 #else
131             std::random_shuffle(vind_.begin(), vind_.end());
132 #endif
133 
134             tree_roots_[i] = divideTree(&vind_[0], int(size_) );
135         }
136     }
137 
138 
getType()139     flann_algorithm_t getType() const CV_OVERRIDE
140     {
141         return FLANN_INDEX_KDTREE;
142     }
143 
144 
saveIndex(FILE * stream)145     void saveIndex(FILE* stream) CV_OVERRIDE
146     {
147         save_value(stream, trees_);
148         for (int i=0; i<trees_; ++i) {
149             save_tree(stream, tree_roots_[i]);
150         }
151     }
152 
153 
154 
loadIndex(FILE * stream)155     void loadIndex(FILE* stream) CV_OVERRIDE
156     {
157         load_value(stream, trees_);
158         if (tree_roots_!=NULL) {
159             delete[] tree_roots_;
160         }
161         tree_roots_ = new NodePtr[trees_];
162         for (int i=0; i<trees_; ++i) {
163             load_tree(stream,tree_roots_[i]);
164         }
165 
166         index_params_["algorithm"] = getType();
167         index_params_["trees"] = tree_roots_;
168     }
169 
170     /**
171      *  Returns size of index.
172      */
size()173     size_t size() const CV_OVERRIDE
174     {
175         return size_;
176     }
177 
178     /**
179      * Returns the length of an index feature.
180      */
veclen()181     size_t veclen() const CV_OVERRIDE
182     {
183         return veclen_;
184     }
185 
186     /**
187      * Computes the inde memory usage
188      * Returns: memory used by the index
189      */
usedMemory()190     int usedMemory() const CV_OVERRIDE
191     {
192         return int(pool_.usedMemory+pool_.wastedMemory+dataset_.rows*sizeof(int));  // pool memory and vind array memory
193     }
194 
195     /**
196      * Find set of nearest neighbors to vec. Their indices are stored inside
197      * the result object.
198      *
199      * Params:
200      *     result = the result object in which the indices of the nearest-neighbors are stored
201      *     vec = the vector for which to search the nearest neighbors
202      *     maxCheck = the maximum number of restarts (in a best-bin-first manner)
203      */
findNeighbors(ResultSet<DistanceType> & result,const ElementType * vec,const SearchParams & searchParams)204     void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
205     {
206         int maxChecks = get_param(searchParams,"checks", 32);
207         float epsError = 1+get_param(searchParams,"eps",0.0f);
208 
209         if (maxChecks==FLANN_CHECKS_UNLIMITED) {
210             getExactNeighbors(result, vec, epsError);
211         }
212         else {
213             getNeighbors(result, vec, maxChecks, epsError);
214         }
215     }
216 
getParameters()217     IndexParams getParameters() const CV_OVERRIDE
218     {
219         return index_params_;
220     }
221 
222 private:
223 
224 
225     /*--------------------- Internal Data Structures --------------------------*/
226     struct Node
227     {
228         /**
229          * Dimension used for subdivision.
230          */
231         int divfeat;
232         /**
233          * The values used for subdivision.
234          */
235         DistanceType divval;
236         /**
237          * The child nodes.
238          */
239         Node* child1, * child2;
240     };
241     typedef Node* NodePtr;
242     typedef BranchStruct<NodePtr, DistanceType> BranchSt;
243     typedef BranchSt* Branch;
244 
245 
246 
save_tree(FILE * stream,NodePtr tree)247     void save_tree(FILE* stream, NodePtr tree)
248     {
249         save_value(stream, *tree);
250         if (tree->child1!=NULL) {
251             save_tree(stream, tree->child1);
252         }
253         if (tree->child2!=NULL) {
254             save_tree(stream, tree->child2);
255         }
256     }
257 
258 
load_tree(FILE * stream,NodePtr & tree)259     void load_tree(FILE* stream, NodePtr& tree)
260     {
261         tree = pool_.allocate<Node>();
262         load_value(stream, *tree);
263         if (tree->child1!=NULL) {
264             load_tree(stream, tree->child1);
265         }
266         if (tree->child2!=NULL) {
267             load_tree(stream, tree->child2);
268         }
269     }
270 
271 
272     /**
273      * Create a tree node that subdivides the list of vecs from vind[first]
274      * to vind[last].  The routine is called recursively on each sublist.
275      * Place a pointer to this new tree node in the location pTree.
276      *
277      * Params: pTree = the new node to create
278      *                  first = index of the first vector
279      *                  last = index of the last vector
280      */
divideTree(int * ind,int count)281     NodePtr divideTree(int* ind, int count)
282     {
283         NodePtr node = pool_.allocate<Node>(); // allocate memory
284 
285         /* If too few exemplars remain, then make this a leaf node. */
286         if ( count == 1) {
287             node->child1 = node->child2 = NULL;    /* Mark as leaf node. */
288             node->divfeat = *ind;    /* Store index of this vec. */
289         }
290         else {
291             int idx;
292             int cutfeat;
293             DistanceType cutval;
294             meanSplit(ind, count, idx, cutfeat, cutval);
295 
296             node->divfeat = cutfeat;
297             node->divval = cutval;
298             node->child1 = divideTree(ind, idx);
299             node->child2 = divideTree(ind+idx, count-idx);
300         }
301 
302         return node;
303     }
304 
305 
306     /**
307      * Choose which feature to use in order to subdivide this set of vectors.
308      * Make a random choice among those with the highest variance, and use
309      * its variance as the threshold value.
310      */
meanSplit(int * ind,int count,int & index,int & cutfeat,DistanceType & cutval)311     void meanSplit(int* ind, int count, int& index, int& cutfeat, DistanceType& cutval)
312     {
313         memset(mean_,0,veclen_*sizeof(DistanceType));
314         memset(var_,0,veclen_*sizeof(DistanceType));
315 
316         /* Compute mean values.  Only the first SAMPLE_MEAN values need to be
317             sampled to get a good estimate.
318          */
319         int cnt = std::min((int)SAMPLE_MEAN+1, count);
320         for (int j = 0; j < cnt; ++j) {
321             ElementType* v = dataset_[ind[j]];
322             for (size_t k=0; k<veclen_; ++k) {
323                 mean_[k] += v[k];
324             }
325         }
326         for (size_t k=0; k<veclen_; ++k) {
327             mean_[k] /= cnt;
328         }
329 
330         /* Compute variances (no need to divide by count). */
331         for (int j = 0; j < cnt; ++j) {
332             ElementType* v = dataset_[ind[j]];
333             for (size_t k=0; k<veclen_; ++k) {
334                 DistanceType dist = v[k] - mean_[k];
335                 var_[k] += dist * dist;
336             }
337         }
338         /* Select one of the highest variance indices at random. */
339         cutfeat = selectDivision(var_);
340         cutval = mean_[cutfeat];
341 
342         int lim1, lim2;
343         planeSplit(ind, count, cutfeat, cutval, lim1, lim2);
344 
345         if (lim1>count/2) index = lim1;
346         else if (lim2<count/2) index = lim2;
347         else index = count/2;
348 
349         /* If either list is empty, it means that all remaining features
350          * are identical. Split in the middle to maintain a balanced tree.
351          */
352         if ((lim1==count)||(lim2==0)) index = count/2;
353     }
354 
355 
356     /**
357      * Select the top RAND_DIM largest values from v and return the index of
358      * one of these selected at random.
359      */
selectDivision(DistanceType * v)360     int selectDivision(DistanceType* v)
361     {
362         int num = 0;
363         size_t topind[RAND_DIM];
364 
365         /* Create a list of the indices of the top RAND_DIM values. */
366         for (size_t i = 0; i < veclen_; ++i) {
367             if ((num < RAND_DIM)||(v[i] > v[topind[num-1]])) {
368                 /* Put this element at end of topind. */
369                 if (num < RAND_DIM) {
370                     topind[num++] = i;            /* Add to list. */
371                 }
372                 else {
373                     topind[num-1] = i;         /* Replace last element. */
374                 }
375                 /* Bubble end value down to right location by repeated swapping. */
376                 int j = num - 1;
377                 while (j > 0  &&  v[topind[j]] > v[topind[j-1]]) {
378                     std::swap(topind[j], topind[j-1]);
379                     --j;
380                 }
381             }
382         }
383         /* Select a random integer in range [0,num-1], and return that index. */
384         int rnd = rand_int(num);
385         return (int)topind[rnd];
386     }
387 
388 
389     /**
390      *  Subdivide the list of points by a plane perpendicular on axe corresponding
391      *  to the 'cutfeat' dimension at 'cutval' position.
392      *
393      *  On return:
394      *  dataset[ind[0..lim1-1]][cutfeat]<cutval
395      *  dataset[ind[lim1..lim2-1]][cutfeat]==cutval
396      *  dataset[ind[lim2..count]][cutfeat]>cutval
397      */
planeSplit(int * ind,int count,int cutfeat,DistanceType cutval,int & lim1,int & lim2)398     void planeSplit(int* ind, int count, int cutfeat, DistanceType cutval, int& lim1, int& lim2)
399     {
400         /* Move vector indices for left subtree to front of list. */
401         int left = 0;
402         int right = count-1;
403         for (;; ) {
404             while (left<=right && dataset_[ind[left]][cutfeat]<cutval) ++left;
405             while (left<=right && dataset_[ind[right]][cutfeat]>=cutval) --right;
406             if (left>right) break;
407             std::swap(ind[left], ind[right]); ++left; --right;
408         }
409         lim1 = left;
410         right = count-1;
411         for (;; ) {
412             while (left<=right && dataset_[ind[left]][cutfeat]<=cutval) ++left;
413             while (left<=right && dataset_[ind[right]][cutfeat]>cutval) --right;
414             if (left>right) break;
415             std::swap(ind[left], ind[right]); ++left; --right;
416         }
417         lim2 = left;
418     }
419 
420     /**
421      * Performs an exact nearest neighbor search. The exact search performs a full
422      * traversal of the tree.
423      */
getExactNeighbors(ResultSet<DistanceType> & result,const ElementType * vec,float epsError)424     void getExactNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, float epsError)
425     {
426         //		checkID -= 1;  /* Set a different unique ID for each search. */
427 
428         if (trees_ > 1) {
429             fprintf(stderr,"It doesn't make any sense to use more than one tree for exact search");
430         }
431         if (trees_>0) {
432             searchLevelExact(result, vec, tree_roots_[0], 0.0, epsError);
433         }
434         assert(result.full());
435     }
436 
437     /**
438      * Performs the approximate nearest-neighbor search. The search is approximate
439      * because the tree traversal is abandoned after a given number of descends in
440      * the tree.
441      */
getNeighbors(ResultSet<DistanceType> & result,const ElementType * vec,int maxCheck,float epsError)442     void getNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, int maxCheck, float epsError)
443     {
444         int i;
445         BranchSt branch;
446 
447         int checkCount = 0;
448         Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
449         DynamicBitset checked(size_);
450 
451         /* Search once through each tree down to root. */
452         for (i = 0; i < trees_; ++i) {
453             searchLevel(result, vec, tree_roots_[i], 0, checkCount, maxCheck, epsError, heap, checked);
454         }
455 
456         /* Keep searching other branches from heap until finished. */
457         while ( heap->popMin(branch) && (checkCount < maxCheck || !result.full() )) {
458             searchLevel(result, vec, branch.node, branch.mindist, checkCount, maxCheck, epsError, heap, checked);
459         }
460 
461         delete heap;
462 
463         assert(result.full());
464     }
465 
466 
467     /**
468      *  Search starting from a given node of the tree.  Based on any mismatches at
469      *  higher levels, all exemplars below this level must have a distance of
470      *  at least "mindistsq".
471      */
searchLevel(ResultSet<DistanceType> & result_set,const ElementType * vec,NodePtr node,DistanceType mindist,int & checkCount,int maxCheck,float epsError,Heap<BranchSt> * heap,DynamicBitset & checked)472     void searchLevel(ResultSet<DistanceType>& result_set, const ElementType* vec, NodePtr node, DistanceType mindist, int& checkCount, int maxCheck,
473                      float epsError, Heap<BranchSt>* heap, DynamicBitset& checked)
474     {
475         if (result_set.worstDist()<mindist) {
476             //			printf("Ignoring branch, too far\n");
477             return;
478         }
479 
480         /* If this is a leaf node, then do check and return. */
481         if ((node->child1 == NULL)&&(node->child2 == NULL)) {
482             /*  Do not check same node more than once when searching multiple trees.
483                 Once a vector is checked, we set its location in vind to the
484                 current checkID.
485              */
486             int index = node->divfeat;
487             if ( checked.test(index) || ((checkCount>=maxCheck)&& result_set.full()) ) return;
488             checked.set(index);
489             checkCount++;
490 
491             DistanceType dist = distance_(dataset_[index], vec, veclen_);
492             result_set.addPoint(dist,index);
493 
494             return;
495         }
496 
497         /* Which child branch should be taken first? */
498         ElementType val = vec[node->divfeat];
499         DistanceType diff = val - node->divval;
500         NodePtr bestChild = (diff < 0) ? node->child1 : node->child2;
501         NodePtr otherChild = (diff < 0) ? node->child2 : node->child1;
502 
503         /* Create a branch record for the branch not taken.  Add distance
504             of this feature boundary (we don't attempt to correct for any
505             use of this feature in a parent node, which is unlikely to
506             happen and would have only a small effect).  Don't bother
507             adding more branches to heap after halfway point, as cost of
508             adding exceeds their value.
509          */
510 
511         DistanceType new_distsq = mindist + distance_.accum_dist(val, node->divval, node->divfeat);
512         //		if (2 * checkCount < maxCheck  ||  !result.full()) {
513         if ((new_distsq*epsError < result_set.worstDist())||  !result_set.full()) {
514             heap->insert( BranchSt(otherChild, new_distsq) );
515         }
516 
517         /* Call recursively to search next level down. */
518         searchLevel(result_set, vec, bestChild, mindist, checkCount, maxCheck, epsError, heap, checked);
519     }
520 
521     /**
522      * Performs an exact search in the tree starting from a node.
523      */
searchLevelExact(ResultSet<DistanceType> & result_set,const ElementType * vec,const NodePtr node,DistanceType mindist,const float epsError)524     void searchLevelExact(ResultSet<DistanceType>& result_set, const ElementType* vec, const NodePtr node, DistanceType mindist, const float epsError)
525     {
526         /* If this is a leaf node, then do check and return. */
527         if ((node->child1 == NULL)&&(node->child2 == NULL)) {
528             int index = node->divfeat;
529             DistanceType dist = distance_(dataset_[index], vec, veclen_);
530             result_set.addPoint(dist,index);
531             return;
532         }
533 
534         /* Which child branch should be taken first? */
535         ElementType val = vec[node->divfeat];
536         DistanceType diff = val - node->divval;
537         NodePtr bestChild = (diff < 0) ? node->child1 : node->child2;
538         NodePtr otherChild = (diff < 0) ? node->child2 : node->child1;
539 
540         /* Create a branch record for the branch not taken.  Add distance
541             of this feature boundary (we don't attempt to correct for any
542             use of this feature in a parent node, which is unlikely to
543             happen and would have only a small effect).  Don't bother
544             adding more branches to heap after halfway point, as cost of
545             adding exceeds their value.
546          */
547 
548         DistanceType new_distsq = mindist + distance_.accum_dist(val, node->divval, node->divfeat);
549 
550         /* Call recursively to search next level down. */
551         searchLevelExact(result_set, vec, bestChild, mindist, epsError);
552 
553         if (new_distsq*epsError<=result_set.worstDist()) {
554             searchLevelExact(result_set, vec, otherChild, new_distsq, epsError);
555         }
556     }
557 
558 
559 private:
560 
561     enum
562     {
563         /**
564          * To improve efficiency, only SAMPLE_MEAN random values are used to
565          * compute the mean and variance at each level when building a tree.
566          * A value of 100 seems to perform as well as using all values.
567          */
568         SAMPLE_MEAN = 100,
569         /**
570          * Top random dimensions to consider
571          *
572          * When creating random trees, the dimension on which to subdivide is
573          * selected at random from among the top RAND_DIM dimensions with the
574          * highest variance.  A value of 5 works well.
575          */
576         RAND_DIM=5
577     };
578 
579 
580     /**
581      * Number of randomized trees that are used
582      */
583     int trees_;
584 
585     /**
586      *  Array of indices to vectors in the dataset.
587      */
588     std::vector<int> vind_;
589 
590     /**
591      * The dataset used by this index
592      */
593     const Matrix<ElementType> dataset_;
594 
595     IndexParams index_params_;
596 
597     size_t size_;
598     size_t veclen_;
599 
600 
601     DistanceType* mean_;
602     DistanceType* var_;
603 
604 
605     /**
606      * Array of k-d trees used to find neighbours.
607      */
608     NodePtr* tree_roots_;
609 
610     /**
611      * Pooled memory allocator.
612      *
613      * Using a pooled memory allocator is more efficient
614      * than allocating memory directly when there is a large
615      * number small of memory allocations.
616      */
617     PooledAllocator pool_;
618 
619     Distance distance_;
620 
621 
622 };   // class KDTreeForest
623 
624 }
625 
626 #endif //OPENCV_FLANN_KDTREE_INDEX_H_
627