@@ -147,6 +147,7 @@ class LshTable
147
147
LshTable ()
148
148
{
149
149
key_size_ = 0 ;
150
+ feature_size_ = 0 ;
150
151
speed_level_ = kArray ;
151
152
}
152
153
@@ -157,7 +158,7 @@ class LshTable
157
158
*/
158
159
LshTable (unsigned int feature_size, unsigned int key_size)
159
160
{
160
- ( void ) feature_size;
161
+ feature_size_ = feature_size;
161
162
(void )key_size;
162
163
std::cerr << " LSH is not implemented for that type" << std::endl;
163
164
assert (0 );
@@ -332,6 +333,8 @@ class LshTable
332
333
*/
333
334
unsigned int key_size_;
334
335
336
+ unsigned int feature_size_;
337
+
335
338
// Members only used for the unsigned char specialization
336
339
/* * The mask to apply to a feature to get the hash key
337
340
* Only used in the unsigned char case
@@ -345,9 +348,10 @@ class LshTable
345
348
template <>
346
349
inline LshTable<unsigned char >::LshTable(unsigned int feature_size, unsigned int subsignature_size)
347
350
{
351
+ feature_size_ = feature_size;
348
352
initialize (subsignature_size);
349
353
// Allocate the mask
350
- mask_ = std::vector<size_t >((size_t ) ceil (( float )( feature_size * sizeof (char )) / ( float ) sizeof (size_t ) ), 0 );
354
+ mask_ = std::vector<size_t >((feature_size * sizeof (char ) + sizeof ( size_t ) - 1 ) / sizeof (size_t ), 0 );
351
355
352
356
// A bit brutal but fast to code
353
357
std::vector<size_t > indices (feature_size * CHAR_BIT);
@@ -392,6 +396,7 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons
392
396
{
393
397
// no need to check if T is divisible by sizeof(size_t) like in the Hamming
394
398
// distance computation as we have a mask
399
+ // FIXIT: This is a bad assumption, because we are reading tail bytes past the end of the allocated feature buffer
395
400
const size_t * feature_block_ptr = reinterpret_cast <const size_t *> ((const void *)feature);
396
401
397
402
// Figure out the subsignature of the feature
@@ -400,10 +405,20 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons
400
405
size_t subsignature = 0 ;
401
406
size_t bit_index = 1 ;
402
407
403
- for (std::vector< size_t >::const_iterator pmask_block = mask_. begin (); pmask_block != mask_. end (); ++pmask_block ) {
408
+ for (unsigned i = 0 ; i < feature_size_; i += sizeof ( size_t ) ) {
404
409
// get the mask and signature blocks
405
- size_t feature_block = *feature_block_ptr;
406
- size_t mask_block = *pmask_block;
410
+ size_t feature_block;
411
+ if (i <= feature_size_ - sizeof (size_t ))
412
+ {
413
+ feature_block = *feature_block_ptr;
414
+ }
415
+ else
416
+ {
417
+ size_t tmp = 0 ;
418
+ memcpy (&tmp, feature_block_ptr, feature_size_ - i); // preserve bytes order
419
+ feature_block = tmp;
420
+ }
421
+ size_t mask_block = mask_[i / sizeof (size_t )];
407
422
while (mask_block) {
408
423
// Get the lowest set bit in the mask block
409
424
size_t lowest_bit = mask_block & (-(ptrdiff_t )mask_block);
0 commit comments