hash 函数

比较常用的是 murmurhash3 :
murmurhash3.h

1
2
3
4
5
6
7
8
9
#ifndef MURMURHASH3_H_
#define MURMURHASH3_H_

// Referenec: https://github.com/goossaert/hashmap
#include <stdint.h>

void murmurhash3_x64_128 (const void* key, int len, uint32_t seed, void * out);

#endif // MURMURHASH3_H_

murmurhash3.cpp

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#include <stdlib.h>

#include "murmurhash3.hpp"

#define FORCE_INLINE inline __attribute__((always_inline))

inline uint64_t rotl64 (uint64_t x, int8_t r) {
return (x << r) | (x >> (64 - r));
}

#define ROTL64(x, y) rotl64(x, y)

#define BIG_CONSTANT(x) (x##LLU)

FORCE_INLINE uint64_t getblock64 (const uint64_t * p, int i) {
return p[i];
}

FORCE_INLINE uint64_t fmix64 (uint64_t k) {
k ^= k >> 33;
k *= BIG_CONSTANT(0xff51afd7ed558ccd);
k ^= k >> 33;
k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
k ^= k >> 33;

return k;
}

void murmurhash3_x64_128 (const void * key, const int len,
const uint32_t seed, void * out) {
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;

uint64_t h1 = seed;
uint64_t h2 = seed;

const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

// body
const uint64_t * blocks = (const uint64_t *)(data);

int i = 0;
for (; i < nblocks; i++) {
uint64_t k1 = getblock64(blocks,i*2+0);
uint64_t k2 = getblock64(blocks,i*2+1);
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
}

// tail
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

uint64_t k1 = 0;
uint64_t k2 = 0;

switch (len & 15) {
case 15: k2 ^= ((uint64_t)tail[14]) << 48;
case 14: k2 ^= ((uint64_t)tail[13]) << 40;
case 13: k2 ^= ((uint64_t)tail[12]) << 32;
case 12: k2 ^= ((uint64_t)tail[11]) << 24;
case 11: k2 ^= ((uint64_t)tail[10]) << 16;
case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
case 9 : k2 ^= ((uint64_t)tail[ 8]) << 0;
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

case 8: k1 ^= ((uint64_t)tail[ 7]) << 56;
case 7: k1 ^= ((uint64_t)tail[ 6]) << 48;
case 6: k1 ^= ((uint64_t)tail[ 5]) << 40;
case 5: k1 ^= ((uint64_t)tail[ 4]) << 32;
case 4: k1 ^= ((uint64_t)tail[ 3]) << 24;
case 3: k1 ^= ((uint64_t)tail[ 2]) << 16;
case 2: k1 ^= ((uint64_t)tail[ 1]) << 8;
case 1: k1 ^= ((uint64_t)tail[ 0]) << 0;
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
};

//----------
// finalization

h1 ^= len; h2 ^= len;

h1 += h2;
h2 += h1;

h1 = fmix64(h1);
h2 = fmix64(h2);

h1 += h2;
h2 += h1;

((uint64_t*)out)[0] = h1;
((uint64_t*)out)[1] = h2;
}

调用方法:

1
2
3
4
char out[16];
uint64_t key = 98723849823;
murmurhash3_x64_128((void*)(&key), sizeof(key), 0, out);
uint64_t hashcode = *((uint64_t*) out) ^ *((uint64_t*) (out + 8));// xor 不改变随机数分布

hash 表冲突解决策略:

比较常用的是 robin hood hashing:
robin_hood_hash.hpp

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#ifndef ROBIN_HOOD_HASHTABLE_H_
#define ROBIN_HOOD_HASHTABLE_H_
#include <utility>
#include <algorithm>
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

// 根据具体场景优化定制的的 robin hood hashtable, 通用型不强.
// key 的类型为 uint64_t, 即 winfoid 的类型, value 的类型为 uint32_t.
// 只支持插入和查找, 不支持删除, 不支持自动扩容.
// 使用时需要用户自己指定 capacity, 这个 capacity 决定了最终的 load factor

typedef uint64_t K;
typedef uint32_t V;

class RobinHoodHashtable {
public:
struct Item {
K key;
V value;
Item() : key(0), value(0) {}
Item(K k, V v) : key(k), value(v) {}
};

RobinHoodHashtable(): _capacity(0), _item_num(0), _buffer(nullptr), _probing_num(nullptr) {}

~RobinHoodHashtable() {
if (_buffer != nullptr) {
delete [] _buffer;
}
if (_probing_num != nullptr) {
delete [] _probing_num;
}
}

// 初始化 hashtable, 设置 capacity
bool init(uint32_t capacity) {
_capacity = capacity;
_buffer = new (std::nothrow) Item[_capacity];
if (_buffer == nullptr) {
std::cout << "new buffer array failed." << std::endl;
return false;
}
_probing_num = new (std::nothrow) int32_t[_capacity];
if (_probing_num == nullptr) {
std::cout << "new probing_num array failed." << std::endl;
return false;
}
std::fill_n(_probing_num, _capacity, -1);
return true;
}

// 插入
bool insert(K key, V value) {
if (_item_num >= _capacity) {
std::cout << "you need to increase capacity" << std::endl;
return false;
}

uint32_t pos = _key_to_pos(key);
uint32_t probing_count = 0;
while (true) {
// 如果 pos 未被占用就放在这个位置
if (_probing_num[pos] == -1) {
new (_buffer + pos) Item(key, value);
_probing_num[pos] = probing_count;
break;
}
// 相同的 key 就覆盖 value
if (key == _buffer[pos].key) {
return false;
}
// 否则继续探测下一个位置
uint32_t cur_probing_count = _probing_num[pos];
if (probing_count > cur_probing_count) {
std::swap(key, _buffer[pos].key);
std::swap(value, _buffer[pos].value);
_probing_num[pos] = probing_count;
probing_count = cur_probing_count;
}
pos = ++pos >= _capacity ? pos % _capacity : pos;
++probing_count;
}
++_item_num;
return true;
}

// 根据 key 查找, 如果找到返回 <true, value>, 没有找到返回<false, 0>
__attribute__((always_inline)) std::pair<bool, V> find(const K& key) {
uint32_t pos = _key_to_pos(key);
uint32_t probing_count = 0;
while (true) {
if (_probing_num[pos] == -1 || probing_count > _probing_num[pos]) {
return std::make_pair(false, 0);
} else if (_buffer[pos].key == key) {
return std::make_pair(true, _buffer[pos].value);
}
pos = ++pos >= _capacity ? pos % _capacity : pos;
++probing_count;
}
}

private:
uint32_t _item_num;
uint32_t _capacity;
// 存储 k-v 对的缓冲区
Item * _buffer;
// 记录一个元素的探测长度, -1 表示当前位置没有元素
int32_t* _probing_num;
// 将 key 转换为pos, 通常可以通过murmurhash3_x64_128(key) 计算 hashcode, 然后在计算 pos, 但实验发现 murmurhash3_x64_128 本身也比较耗时, 直接对 key 进行取余效率更高. 但这种情况只适合数据分布比较均匀的情况, 否则有可能出现insert 和 find 冲突大大增加的情况.
__attribute__((always_inline)) uint32_t _key_to_pos(const K& key) {
uint32_t* p = (uint32_t*) &key;
uint32_t hashcode = *p ^ *(p+1);
uint32_t pos = hashcode >= _capacity ? hashcode % _capacity : hashcode;
return pos;
}
};
#endif // ROBIN_HOOD_HASHTABLE_H_

Reference