1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 
31 /***********************************************************************
32  * Author: Vincent Rabaud
33  *************************************************************************/
34 
35 #ifndef OPENCV_FLANN_LSH_TABLE_H_
36 #define OPENCV_FLANN_LSH_TABLE_H_
37 
38 #include <algorithm>
39 #include <iostream>
40 #include <iomanip>
41 #include <limits.h>
42 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
43 #ifdef __GXX_EXPERIMENTAL_CXX0X__
44 #  define USE_UNORDERED_MAP 1
45 #else
46 #  define USE_UNORDERED_MAP 0
47 #endif
48 #if USE_UNORDERED_MAP
49 #include <unordered_map>
50 #else
51 #include <map>
52 #endif
53 #include <math.h>
54 #include <stddef.h>
55 
56 #include "dynamic_bitset.h"
57 #include "matrix.h"
58 
59 namespace cvflann
60 {
61 
62 namespace lsh
63 {
64 
65 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
66 
67 /** What is stored in an LSH bucket
68  */
69 typedef uint32_t FeatureIndex;
70 /** The id from which we can get a bucket back in an LSH table
71  */
72 typedef unsigned int BucketKey;
73 
74 /** A bucket in an LSH table
75  */
76 typedef std::vector<FeatureIndex> Bucket;
77 
78 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
79 
80 /** POD for stats about an LSH table
81  */
82 struct LshStats
83 {
84     std::vector<unsigned int> bucket_sizes_;
85     size_t n_buckets_;
86     size_t bucket_size_mean_;
87     size_t bucket_size_median_;
88     size_t bucket_size_min_;
89     size_t bucket_size_max_;
90     size_t bucket_size_std_dev;
91     /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
92      */
93     std::vector<std::vector<unsigned int> > size_histogram_;
94 };
95 
96 /** Overload the << operator for LshStats
97  * @param out the streams
98  * @param stats the stats to display
99  * @return the streams
100  */
101 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
102 {
103     int w = 20;
104     out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
105     << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
106     << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
107     << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
108     << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
109     << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
110     << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
111 
112     // Display the histogram
113     out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
114     << std::setiosflags(std::ios::left);
115     for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
116              stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ",  ";
117 
118     return out;
119 }
120 
121 
122 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
123 
124 /** Lsh hash table. As its key is a sub-feature, and as usually
125  * the size of it is pretty small, we keep it as a continuous memory array.
126  * The value is an index in the corpus of features (we keep it as an unsigned
127  * int for pure memory reasons, it could be a size_t)
128  */
129 template<typename ElementType>
130 class LshTable
131 {
132 public:
133     /** A container of all the feature indices. Optimized for space
134      */
135 #if USE_UNORDERED_MAP
136     typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
137 #else
138     typedef std::map<BucketKey, Bucket> BucketsSpace;
139 #endif
140 
141     /** A container of all the feature indices. Optimized for speed
142      */
143     typedef std::vector<Bucket> BucketsSpeed;
144 
145     /** Default constructor
146      */
LshTable()147     LshTable()
148     {
149         key_size_ = 0;
150         feature_size_ = 0;
151         speed_level_ = kArray;
152     }
153 
154     /** Default constructor
155      * Create the mask and allocate the memory
156      * @param feature_size is the size of the feature (considered as a ElementType[])
157      * @param key_size is the number of bits that are turned on in the feature
158      */
LshTable(unsigned int feature_size,unsigned int key_size)159     LshTable(unsigned int feature_size, unsigned int key_size)
160     {
161         feature_size_ = feature_size;
162         (void)key_size;
163         std::cerr << "LSH is not implemented for that type" << std::endl;
164         assert(0);
165     }
166 
167     /** Add a feature to the table
168      * @param value the value to store for that feature
169      * @param feature the feature itself
170      */
add(unsigned int value,const ElementType * feature)171     void add(unsigned int value, const ElementType* feature)
172     {
173         // Add the value to the corresponding bucket
174         BucketKey key = (lsh::BucketKey)getKey(feature);
175 
176         switch (speed_level_) {
177         case kArray:
178             // That means we get the buckets from an array
179             buckets_speed_[key].push_back(value);
180             break;
181         case kBitsetHash:
182             // That means we can check the bitset for the presence of a key
183             key_bitset_.set(key);
184             buckets_space_[key].push_back(value);
185             break;
186         case kHash:
187         {
188             // That means we have to check for the hash table for the presence of a key
189             buckets_space_[key].push_back(value);
190             break;
191         }
192         }
193     }
194 
195     /** Add a set of features to the table
196      * @param dataset the values to store
197      */
add(Matrix<ElementType> dataset)198     void add(Matrix<ElementType> dataset)
199     {
200 #if USE_UNORDERED_MAP
201         buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
202 #endif
203         // Add the features to the table
204         for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
205         // Now that the table is full, optimize it for speed/space
206         optimize();
207     }
208 
209     /** Get a bucket given the key
210      * @param key
211      * @return
212      */
getBucketFromKey(BucketKey key)213     inline const Bucket* getBucketFromKey(BucketKey key) const
214     {
215         // Generate other buckets
216         switch (speed_level_) {
217         case kArray:
218             // That means we get the buckets from an array
219             return &buckets_speed_[key];
220             break;
221         case kBitsetHash:
222             // That means we can check the bitset for the presence of a key
223             if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
224             else return 0;
225             break;
226         case kHash:
227         {
228             // That means we have to check for the hash table for the presence of a key
229             BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
230             bucket_it = buckets_space_.find(key);
231             // Stop here if that bucket does not exist
232             if (bucket_it == bucket_end) return 0;
233             else return &bucket_it->second;
234             break;
235         }
236         }
237         return 0;
238     }
239 
240     /** Compute the sub-signature of a feature
241      */
getKey(const ElementType *)242     size_t getKey(const ElementType* /*feature*/) const
243     {
244         std::cerr << "LSH is not implemented for that type" << std::endl;
245         assert(0);
246         return 1;
247     }
248 
249     /** Get statistics about the table
250      * @return
251      */
252     LshStats getStats() const;
253 
254 private:
255     /** defines the speed fo the implementation
256      * kArray uses a vector for storing data
257      * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
258      * kHash uses a hash map only
259      */
260     enum SpeedLevel
261     {
262         kArray, kBitsetHash, kHash
263     };
264 
265     /** Initialize some variables
266      */
initialize(size_t key_size)267     void initialize(size_t key_size)
268     {
269         const size_t key_size_lower_bound = 1;
270         //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t
271         const size_t key_size_upper_bound = (std::min)(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT);
272         if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound)
273         {
274             CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound));
275         }
276 
277         speed_level_ = kHash;
278         key_size_ = (unsigned)key_size;
279     }
280 
281     /** Optimize the table for speed/space
282      */
optimize()283     void optimize()
284     {
285         // If we are already using the fast storage, no need to do anything
286         if (speed_level_ == kArray) return;
287 
288         // Use an array if it will be more than half full
289         if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
290             speed_level_ = kArray;
291             // Fill the array version of it
292             buckets_speed_.resize(size_t(1) << key_size_);
293             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
294 
295             // Empty the hash table
296             buckets_space_.clear();
297             return;
298         }
299 
300         // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
301         // for the vector) or less than 512MB (key_size_ <= 30)
302         if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
303              >= (size_t(1) << key_size_)) || (key_size_ <= 32)) {
304             speed_level_ = kBitsetHash;
305             key_bitset_.resize(size_t(1) << key_size_);
306             key_bitset_.reset();
307             // Try with the BucketsSpace
308             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
309         }
310         else {
311             speed_level_ = kHash;
312             key_bitset_.clear();
313         }
314     }
315 
316     /** The vector of all the buckets if they are held for speed
317      */
318     BucketsSpeed buckets_speed_;
319 
320     /** The hash table of all the buckets in case we cannot use the speed version
321      */
322     BucketsSpace buckets_space_;
323 
324     /** What is used to store the data */
325     SpeedLevel speed_level_;
326 
327     /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
328      * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
329      */
330     DynamicBitset key_bitset_;
331 
332     /** The size of the sub-signature in bits
333      */
334     unsigned int key_size_;
335 
336     unsigned int feature_size_;
337 
338     // Members only used for the unsigned char specialization
339     /** The mask to apply to a feature to get the hash key
340      * Only used in the unsigned char case
341      */
342     std::vector<size_t> mask_;
343 };
344 
345 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
346 // Specialization for unsigned char
347 
348 template<>
LshTable(unsigned int feature_size,unsigned int subsignature_size)349 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
350 {
351     feature_size_ = feature_size;
352     initialize(subsignature_size);
353     // Allocate the mask
354     mask_ = std::vector<size_t>((feature_size * sizeof(char) + sizeof(size_t) - 1) / sizeof(size_t), 0);
355 
356     // A bit brutal but fast to code
357     std::vector<int> indices(feature_size * CHAR_BIT);
358     for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = (int)i;
359 #ifndef OPENCV_FLANN_USE_STD_RAND
360     cv::randShuffle(indices);
361 #else
362     std::random_shuffle(indices.begin(), indices.end());
363 #endif
364 
365     // Generate a random set of order of subsignature_size_ bits
366     for (unsigned int i = 0; i < key_size_; ++i) {
367         size_t index = indices[i];
368 
369         // Set that bit in the mask
370         size_t divisor = CHAR_BIT * sizeof(size_t);
371         size_t idx = index / divisor; //pick the right size_t index
372         mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
373     }
374 
375     // Set to 1 if you want to display the mask for debug
376 #if 0
377     {
378         size_t bcount = 0;
379         BOOST_FOREACH(size_t mask_block, mask_){
380             out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
381                 << std::endl;
382             bcount += __builtin_popcountll(mask_block);
383         }
384         out << "bit count : " << std::dec << bcount << std::endl;
385         out << "mask size : " << mask_.size() << std::endl;
386         return out;
387     }
388 #endif
389 }
390 
391 /** Return the Subsignature of a feature
392  * @param feature the feature to analyze
393  */
394 template<>
getKey(const unsigned char * feature)395 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
396 {
397     // no need to check if T is dividable by sizeof(size_t) like in the Hamming
398     // distance computation as we have a mask
399     // FIXIT: This is bad assumption, because we reading tail bytes after of the allocated features buffer
400     const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
401 
402     // Figure out the subsignature of the feature
403     // Given the feature ABCDEF, and the mask 001011, the output will be
404     // 000CEF
405     size_t subsignature = 0;
406     size_t bit_index = 1;
407 
408     for (unsigned i = 0; i < feature_size_; i += sizeof(size_t)) {
409         // get the mask and signature blocks
410         size_t feature_block;
411         if (i <= feature_size_ - sizeof(size_t))
412         {
413             feature_block = *feature_block_ptr;
414         }
415         else
416         {
417             size_t tmp = 0;
418             memcpy(&tmp, feature_block_ptr, feature_size_ - i); // preserve bytes order
419             feature_block = tmp;
420         }
421         size_t mask_block = mask_[i / sizeof(size_t)];
422         while (mask_block) {
423             // Get the lowest set bit in the mask block
424             size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
425             // Add it to the current subsignature if necessary
426             subsignature += (feature_block & lowest_bit) ? bit_index : 0;
427             // Reset the bit in the mask block
428             mask_block ^= lowest_bit;
429             // increment the bit index for the subsignature
430             bit_index <<= 1;
431         }
432         // Check the next feature block
433         ++feature_block_ptr;
434     }
435     return subsignature;
436 }
437 
438 template<>
getStats()439 inline LshStats LshTable<unsigned char>::getStats() const
440 {
441     LshStats stats;
442     stats.bucket_size_mean_ = 0;
443     if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
444         stats.n_buckets_ = 0;
445         stats.bucket_size_median_ = 0;
446         stats.bucket_size_min_ = 0;
447         stats.bucket_size_max_ = 0;
448         return stats;
449     }
450 
451     if (!buckets_speed_.empty()) {
452         for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
453             stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size());
454             stats.bucket_size_mean_ += pbucket->size();
455         }
456         stats.bucket_size_mean_ /= buckets_speed_.size();
457         stats.n_buckets_ = buckets_speed_.size();
458     }
459     else {
460         for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
461             stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size());
462             stats.bucket_size_mean_ += x->second.size();
463         }
464         stats.bucket_size_mean_ /= buckets_space_.size();
465         stats.n_buckets_ = buckets_space_.size();
466     }
467 
468     std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
469 
470     //  BOOST_FOREACH(int size, stats.bucket_sizes_)
471     //          std::cout << size << " ";
472     //  std::cout << std::endl;
473     stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
474     stats.bucket_size_min_ = stats.bucket_sizes_.front();
475     stats.bucket_size_max_ = stats.bucket_sizes_.back();
476 
477     // TODO compute mean and std
478     /*float mean, stddev;
479        stats.bucket_size_mean_ = mean;
480        stats.bucket_size_std_dev = stddev;*/
481 
482     // Include a histogram of the buckets
483     unsigned int bin_start = 0;
484     unsigned int bin_end = 20;
485     bool is_new_bin = true;
486     for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
487          != end; )
488         if (*iterator < bin_end) {
489             if (is_new_bin) {
490                 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
491                 stats.size_histogram_.back()[0] = bin_start;
492                 stats.size_histogram_.back()[1] = bin_end - 1;
493                 is_new_bin = false;
494             }
495             ++stats.size_histogram_.back()[2];
496             ++iterator;
497         }
498         else {
499             bin_start += 20;
500             bin_end += 20;
501             is_new_bin = true;
502         }
503 
504     return stats;
505 }
506 
507 // End the two namespaces
508 }
509 }
510 
511 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
512 
513 #endif /* OPENCV_FLANN_LSH_TABLE_H_ */
514