1 /***********************************************************************
2 * Software License Agreement (BSD License)
3 *
4 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6 *
7 * THE BSD LICENSE
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *************************************************************************/
30
31 /***********************************************************************
32 * Author: Vincent Rabaud
33 *************************************************************************/
34
35 #ifndef OPENCV_FLANN_LSH_TABLE_H_
36 #define OPENCV_FLANN_LSH_TABLE_H_
37
38 #include <algorithm>
39 #include <iostream>
40 #include <iomanip>
41 #include <limits.h>
42 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
43 #ifdef __GXX_EXPERIMENTAL_CXX0X__
44 # define USE_UNORDERED_MAP 1
45 #else
46 # define USE_UNORDERED_MAP 0
47 #endif
48 #if USE_UNORDERED_MAP
49 #include <unordered_map>
50 #else
51 #include <map>
52 #endif
53 #include <math.h>
54 #include <stddef.h>
55
56 #include "dynamic_bitset.h"
57 #include "matrix.h"
58
59 namespace cvflann
60 {
61
62 namespace lsh
63 {
64
65 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
66
67 /** What is stored in an LSH bucket
68 */
69 typedef uint32_t FeatureIndex;
70 /** The id from which we can get a bucket back in an LSH table
71 */
72 typedef unsigned int BucketKey;
73
74 /** A bucket in an LSH table
75 */
76 typedef std::vector<FeatureIndex> Bucket;
77
78 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
79
80 /** POD for stats about an LSH table
81 */
82 struct LshStats
83 {
84 std::vector<unsigned int> bucket_sizes_;
85 size_t n_buckets_;
86 size_t bucket_size_mean_;
87 size_t bucket_size_median_;
88 size_t bucket_size_min_;
89 size_t bucket_size_max_;
90 size_t bucket_size_std_dev;
91 /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
92 */
93 std::vector<std::vector<unsigned int> > size_histogram_;
94 };
95
96 /** Overload the << operator for LshStats
97 * @param out the streams
98 * @param stats the stats to display
99 * @return the streams
100 */
101 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
102 {
103 int w = 20;
104 out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
105 << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
106 << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
107 << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
108 << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
109 << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
110 << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
111
112 // Display the histogram
113 out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
114 << std::setiosflags(std::ios::left);
115 for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
116 stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ", ";
117
118 return out;
119 }
120
121
122 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
123
124 /** Lsh hash table. As its key is a sub-feature, and as usually
125 * the size of it is pretty small, we keep it as a continuous memory array.
126 * The value is an index in the corpus of features (we keep it as an unsigned
127 * int for pure memory reasons, it could be a size_t)
128 */
129 template<typename ElementType>
130 class LshTable
131 {
132 public:
133 /** A container of all the feature indices. Optimized for space
134 */
135 #if USE_UNORDERED_MAP
136 typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
137 #else
138 typedef std::map<BucketKey, Bucket> BucketsSpace;
139 #endif
140
141 /** A container of all the feature indices. Optimized for speed
142 */
143 typedef std::vector<Bucket> BucketsSpeed;
144
145 /** Default constructor
146 */
LshTable()147 LshTable()
148 {
149 key_size_ = 0;
150 feature_size_ = 0;
151 speed_level_ = kArray;
152 }
153
154 /** Default constructor
155 * Create the mask and allocate the memory
156 * @param feature_size is the size of the feature (considered as a ElementType[])
157 * @param key_size is the number of bits that are turned on in the feature
158 */
LshTable(unsigned int feature_size,unsigned int key_size)159 LshTable(unsigned int feature_size, unsigned int key_size)
160 {
161 feature_size_ = feature_size;
162 (void)key_size;
163 std::cerr << "LSH is not implemented for that type" << std::endl;
164 assert(0);
165 }
166
167 /** Add a feature to the table
168 * @param value the value to store for that feature
169 * @param feature the feature itself
170 */
add(unsigned int value,const ElementType * feature)171 void add(unsigned int value, const ElementType* feature)
172 {
173 // Add the value to the corresponding bucket
174 BucketKey key = (lsh::BucketKey)getKey(feature);
175
176 switch (speed_level_) {
177 case kArray:
178 // That means we get the buckets from an array
179 buckets_speed_[key].push_back(value);
180 break;
181 case kBitsetHash:
182 // That means we can check the bitset for the presence of a key
183 key_bitset_.set(key);
184 buckets_space_[key].push_back(value);
185 break;
186 case kHash:
187 {
188 // That means we have to check for the hash table for the presence of a key
189 buckets_space_[key].push_back(value);
190 break;
191 }
192 }
193 }
194
195 /** Add a set of features to the table
196 * @param dataset the values to store
197 */
add(Matrix<ElementType> dataset)198 void add(Matrix<ElementType> dataset)
199 {
200 #if USE_UNORDERED_MAP
201 buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
202 #endif
203 // Add the features to the table
204 for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
205 // Now that the table is full, optimize it for speed/space
206 optimize();
207 }
208
209 /** Get a bucket given the key
210 * @param key
211 * @return
212 */
getBucketFromKey(BucketKey key)213 inline const Bucket* getBucketFromKey(BucketKey key) const
214 {
215 // Generate other buckets
216 switch (speed_level_) {
217 case kArray:
218 // That means we get the buckets from an array
219 return &buckets_speed_[key];
220 break;
221 case kBitsetHash:
222 // That means we can check the bitset for the presence of a key
223 if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
224 else return 0;
225 break;
226 case kHash:
227 {
228 // That means we have to check for the hash table for the presence of a key
229 BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
230 bucket_it = buckets_space_.find(key);
231 // Stop here if that bucket does not exist
232 if (bucket_it == bucket_end) return 0;
233 else return &bucket_it->second;
234 break;
235 }
236 }
237 return 0;
238 }
239
240 /** Compute the sub-signature of a feature
241 */
getKey(const ElementType *)242 size_t getKey(const ElementType* /*feature*/) const
243 {
244 std::cerr << "LSH is not implemented for that type" << std::endl;
245 assert(0);
246 return 1;
247 }
248
249 /** Get statistics about the table
250 * @return
251 */
252 LshStats getStats() const;
253
254 private:
255 /** defines the speed fo the implementation
256 * kArray uses a vector for storing data
257 * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
258 * kHash uses a hash map only
259 */
260 enum SpeedLevel
261 {
262 kArray, kBitsetHash, kHash
263 };
264
265 /** Initialize some variables
266 */
initialize(size_t key_size)267 void initialize(size_t key_size)
268 {
269 const size_t key_size_lower_bound = 1;
270 //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t
271 const size_t key_size_upper_bound = (std::min)(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT);
272 if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound)
273 {
274 CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound));
275 }
276
277 speed_level_ = kHash;
278 key_size_ = (unsigned)key_size;
279 }
280
281 /** Optimize the table for speed/space
282 */
optimize()283 void optimize()
284 {
285 // If we are already using the fast storage, no need to do anything
286 if (speed_level_ == kArray) return;
287
288 // Use an array if it will be more than half full
289 if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
290 speed_level_ = kArray;
291 // Fill the array version of it
292 buckets_speed_.resize(size_t(1) << key_size_);
293 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
294
295 // Empty the hash table
296 buckets_space_.clear();
297 return;
298 }
299
300 // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
301 // for the vector) or less than 512MB (key_size_ <= 30)
302 if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
303 >= (size_t(1) << key_size_)) || (key_size_ <= 32)) {
304 speed_level_ = kBitsetHash;
305 key_bitset_.resize(size_t(1) << key_size_);
306 key_bitset_.reset();
307 // Try with the BucketsSpace
308 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
309 }
310 else {
311 speed_level_ = kHash;
312 key_bitset_.clear();
313 }
314 }
315
316 /** The vector of all the buckets if they are held for speed
317 */
318 BucketsSpeed buckets_speed_;
319
320 /** The hash table of all the buckets in case we cannot use the speed version
321 */
322 BucketsSpace buckets_space_;
323
324 /** What is used to store the data */
325 SpeedLevel speed_level_;
326
327 /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
328 * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
329 */
330 DynamicBitset key_bitset_;
331
332 /** The size of the sub-signature in bits
333 */
334 unsigned int key_size_;
335
336 unsigned int feature_size_;
337
338 // Members only used for the unsigned char specialization
339 /** The mask to apply to a feature to get the hash key
340 * Only used in the unsigned char case
341 */
342 std::vector<size_t> mask_;
343 };
344
345 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
346 // Specialization for unsigned char
347
348 template<>
LshTable(unsigned int feature_size,unsigned int subsignature_size)349 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
350 {
351 feature_size_ = feature_size;
352 initialize(subsignature_size);
353 // Allocate the mask
354 mask_ = std::vector<size_t>((feature_size * sizeof(char) + sizeof(size_t) - 1) / sizeof(size_t), 0);
355
356 // A bit brutal but fast to code
357 std::vector<int> indices(feature_size * CHAR_BIT);
358 for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = (int)i;
359 #ifndef OPENCV_FLANN_USE_STD_RAND
360 cv::randShuffle(indices);
361 #else
362 std::random_shuffle(indices.begin(), indices.end());
363 #endif
364
365 // Generate a random set of order of subsignature_size_ bits
366 for (unsigned int i = 0; i < key_size_; ++i) {
367 size_t index = indices[i];
368
369 // Set that bit in the mask
370 size_t divisor = CHAR_BIT * sizeof(size_t);
371 size_t idx = index / divisor; //pick the right size_t index
372 mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
373 }
374
375 // Set to 1 if you want to display the mask for debug
376 #if 0
377 {
378 size_t bcount = 0;
379 BOOST_FOREACH(size_t mask_block, mask_){
380 out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
381 << std::endl;
382 bcount += __builtin_popcountll(mask_block);
383 }
384 out << "bit count : " << std::dec << bcount << std::endl;
385 out << "mask size : " << mask_.size() << std::endl;
386 return out;
387 }
388 #endif
389 }
390
391 /** Return the Subsignature of a feature
392 * @param feature the feature to analyze
393 */
394 template<>
getKey(const unsigned char * feature)395 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
396 {
397 // no need to check if T is dividable by sizeof(size_t) like in the Hamming
398 // distance computation as we have a mask
399 // FIXIT: This is bad assumption, because we reading tail bytes after of the allocated features buffer
400 const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
401
402 // Figure out the subsignature of the feature
403 // Given the feature ABCDEF, and the mask 001011, the output will be
404 // 000CEF
405 size_t subsignature = 0;
406 size_t bit_index = 1;
407
408 for (unsigned i = 0; i < feature_size_; i += sizeof(size_t)) {
409 // get the mask and signature blocks
410 size_t feature_block;
411 if (i <= feature_size_ - sizeof(size_t))
412 {
413 feature_block = *feature_block_ptr;
414 }
415 else
416 {
417 size_t tmp = 0;
418 memcpy(&tmp, feature_block_ptr, feature_size_ - i); // preserve bytes order
419 feature_block = tmp;
420 }
421 size_t mask_block = mask_[i / sizeof(size_t)];
422 while (mask_block) {
423 // Get the lowest set bit in the mask block
424 size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
425 // Add it to the current subsignature if necessary
426 subsignature += (feature_block & lowest_bit) ? bit_index : 0;
427 // Reset the bit in the mask block
428 mask_block ^= lowest_bit;
429 // increment the bit index for the subsignature
430 bit_index <<= 1;
431 }
432 // Check the next feature block
433 ++feature_block_ptr;
434 }
435 return subsignature;
436 }
437
438 template<>
getStats()439 inline LshStats LshTable<unsigned char>::getStats() const
440 {
441 LshStats stats;
442 stats.bucket_size_mean_ = 0;
443 if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
444 stats.n_buckets_ = 0;
445 stats.bucket_size_median_ = 0;
446 stats.bucket_size_min_ = 0;
447 stats.bucket_size_max_ = 0;
448 return stats;
449 }
450
451 if (!buckets_speed_.empty()) {
452 for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
453 stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size());
454 stats.bucket_size_mean_ += pbucket->size();
455 }
456 stats.bucket_size_mean_ /= buckets_speed_.size();
457 stats.n_buckets_ = buckets_speed_.size();
458 }
459 else {
460 for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
461 stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size());
462 stats.bucket_size_mean_ += x->second.size();
463 }
464 stats.bucket_size_mean_ /= buckets_space_.size();
465 stats.n_buckets_ = buckets_space_.size();
466 }
467
468 std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
469
470 // BOOST_FOREACH(int size, stats.bucket_sizes_)
471 // std::cout << size << " ";
472 // std::cout << std::endl;
473 stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
474 stats.bucket_size_min_ = stats.bucket_sizes_.front();
475 stats.bucket_size_max_ = stats.bucket_sizes_.back();
476
477 // TODO compute mean and std
478 /*float mean, stddev;
479 stats.bucket_size_mean_ = mean;
480 stats.bucket_size_std_dev = stddev;*/
481
482 // Include a histogram of the buckets
483 unsigned int bin_start = 0;
484 unsigned int bin_end = 20;
485 bool is_new_bin = true;
486 for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
487 != end; )
488 if (*iterator < bin_end) {
489 if (is_new_bin) {
490 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
491 stats.size_histogram_.back()[0] = bin_start;
492 stats.size_histogram_.back()[1] = bin_end - 1;
493 is_new_bin = false;
494 }
495 ++stats.size_histogram_.back()[2];
496 ++iterator;
497 }
498 else {
499 bin_start += 20;
500 bin_end += 20;
501 is_new_bin = true;
502 }
503
504 return stats;
505 }
506
507 // End the two namespaces
508 }
509 }
510
511 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
512
513 #endif /* OPENCV_FLANN_LSH_TABLE_H_ */
514