哈希表核心原理与 C++ 实现详解

一、哈希的概念

哈希（Hash）又称散列，是组织数据的一种方式，本质为：将键值 key 与存储位置建立映射关系，查找时通过哈希函数计算出 key 的存储位置快速查找。

1. 哈希冲突（碰撞）：借助哈希函数将 N 个值映射到大小为 M 的哈希表中（M≥N），而不同的 key 可能会映射到同一个位置上，称为哈希冲突，是不可避免的。

如哈希函数 h(key) = key % M（除法散列法）取 M = 200 时，203 % 200 = 3、3 % 200 = 3，即 203 与 3 这两个不同的 key 被映射到同一个位置 3，发生哈希冲突。

2. 负载因子（load factor）：映射存储了 N 个值，哈希表大小为 M，则负载因子 = N / M。

负载因子越大，哈希冲突概率越高，空间利用率越高；相反，负载因子越小，哈希冲突概率越低，空间利用率越低。所以，要保持负载因子达到一定的平衡才能保持效率。

3. 关键词 key 类型需要支持整型转换：key 为整数好做映射计算。

二、直接定址法

每个关键字的值（或其 ASCII 码）就是其存储位置的下标（计数排序）。

适用于整型（如 float、string 等非整型类型不支持）且范围比较集中的数据 => 简单高效。

三、哈希函数

尽可能将 N 个值以概率均分到哈希表 M 空间。

1. 除留余数法／除法散列法 (最常用)

h(key)＝ key ％ M

该方法不关注 key 的大小，只在乎 key 的个数。

① M 尽量避免 2^x，10^x 的值： a. M 若为 2^x，则％M 本质相当于保留 key 的后 x 位（二进制下），后 x 位相同的值哈希值相同。如对于 01001101 (77) % 2^4 (16) = 1101 (13)，结果仅与后 4 位有关，前 (32 - 4) 位所表示的数值均能被 2^4 整除，故算出的哈希值没有特征性（参与计算的位数越多，哈希值越具有特征性，哈希冲突越少发生）。 b. M 若为 10^x，则％M 本质相当于保留 key 后 x 位（十进制下），问题与 2^x 一致。

②建议 M 取不太接近 2^x 的一个质数。

③JAVA 中 HashMap 则采用 2^x 作为 M：不用取模，直接位运算（相对于位运算，取模的效率要低很多），且扩容直接 M*2 即可（C++ 采用设置好的质数）。

// Two alternative ways to reduce key into [0, 2^16); use one or the other.
int hashi = key % (1 << 16); // slow: generic modulo (pow() returns double, which is not a valid operand of %)
int hashi = key & ((1 << 16) - 1); // fast: bit mask; equals the modulo above for non-negative key

为防止前 32-x 位没有参与运算，将后 x 位与前 32-x 位再异或。

int n = 16;
// Keep the low n bits of key. The mask must be (1 << n) - 1; 1 << (n - 1) would select one single bit.
int hashi = key & ((1 << n) - 1);
// Fold the remaining high (32 - n) bits into the result (assumes a 32-bit key).
// The unsigned cast prevents sign-extension when key is negative; for n == 16 the shift
// amount is the same as the former (32 - n).
hashi = hashi ^ (static_cast<unsigned>(key) >> n);

但是当初始的 x < 16 时，会出现异或后数值大于 M 的情况。

2. 乘法散列法

h(key) = floor(M × ((A × key) % 1.0))

a. 用 key 乘以常数 A（0 < A < 1），并抽取 key * A 的小数部分，即 (A × key) % 1.0。 b. 再用 M 乘以该小数部分，并向下取整。

3. 全域散列法

该方法为给散列函数增加随机性，防止恶意攻击。

四、开放定址法

1. 线性探测

a. 以发生冲突的位置开始，依次向后线性探测，直到找到下一个没有存储数据的位置为止（如果走到表尾就绕回到表头）。

#pragma once

#include <algorithm>
#include <assert.h>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using namespace std;

// State of one slot in the open-addressing table.
enum STATE
{
    EXIST,  // slot holds a live entry
    EMPTY,  // slot is unused; reaching it ends a probe sequence
    DELETE  // slot held an entry that was erased (tombstone); probing continues past it
};

// Table of bucket counts, following the approach used by stl_hashtable.h:
// each entry is a prime roughly twice as large as the previous one.
static const int __stl_num_primes = 28;
static const unsigned long __stl_prime_list[__stl_num_primes] = {
    53, 97, 193, 389, 769,
    1543, 3079, 6151, 12289, 24593,
    49157, 98317, 196613, 393241, 786433,
    1572869, 3145739, 6291469, 12582917, 25165843,
    50331653, 100663319, 201326611, 402653189, 805306457,
    1610612741, 3221225473ul, 4294967291ul
};

// Returns the smallest listed prime that is >= n, or the largest listed prime
// when n exceeds every entry.
inline unsigned long __stl_next_prime(unsigned long n)
{
    const unsigned long* first = __stl_prime_list;
    const unsigned long* last = __stl_prime_list + __stl_num_primes;
    const unsigned long* pos = std::lower_bound(first, last, n);
    return pos == last ? *(last - 1) : *pos;
}

// Functor that converts a key into an unsigned integer usable for bucket arithmetic.
// The primary template works for any key type that can be cast to size_t.
template<class K>
struct hash_func
{
    size_t operator()(const K& key) const
    {
        // C-style cast kept on purpose so that every key type accepted before
        // (including pointer keys) is still accepted.
        return (size_t)key;
    }
};

// Specialization for string keys (strings are common keys).
template<>
struct hash_func<string>
{
    // BKDR hash: ret = ret * 131 + ch. The multiplication has to be applied to the
    // running value; adding ch * 131 alone is order-insensitive, so "ab" and "ba"
    // would always collide.
    size_t operator()(const string& s) const
    {
        size_t ret = 0;
        for (unsigned char e : s)
        {
            ret = ret * 131 + e;
        }
        return ret;
    }
};

// Java-HashMap-style index computation for a table of 2^16 slots.
// The bits above the low 16 are XOR-folded into the low 16 so that they take part
// in the result, then the value is masked so it is always < 2^16.
// Defined after hash_func because its default template argument needs that declaration.
template<class K, class Hash = hash_func<K>>
size_t table_size(const K& key)
{
    const size_t x = 16;
    const size_t num = static_cast<size_t>(1) << x; // 2^16 slots
    const size_t h = Hash()(key);
    return (h ^ (h >> x)) & (num - 1);
}

namespace hash_tables
{
    // One slot of the open-addressing table: the stored pair plus the slot state.
    template<class K, class V>
    struct hash_data
    {
        pair<K, V> _data;
        STATE _state = EMPTY;
    };

    // Hash table using open addressing with linear probing.
    // K must be convertible to an integer by Hash and must support ==.
    // K and V must be default-constructible because every slot stores a pair<K, V>.
    // Pointers returned by insert()/find() are invalidated by any later insert().
    template<class K, class V, class Hash = hash_func<K>>
    class hash_table
    {
    public:
        using data_type = pair<K, V>;
        using hash_type = hash_data<K, V>;
        using key_type = K;

    public:
        // Starts with the smallest bucket count in the prime table.
        hash_table()
            : _tables(__stl_prime_list[0])
            , _size(0)
        {}

        // Inserts data using division hashing (hash % bucket count) plus linear probing.
        // Returns { slot, true } on success, or { nullptr, false } when the key is
        // already present (or, in the degenerate case, when no slot is free).
        pair<hash_type*, bool> insert(data_type data)
        {
            // Duplicate keys are not inserted.
            if (find(data.first).second)
                return { nullptr, false };

            // Keep (live entries + tombstones) / buckets at or below 0.7. Tombstones are
            // counted because they lengthen probe sequences exactly like live entries.
            if ((_size + _tombstones + 1) * 10 > _tables.size() * 7)
            {
                // Grow only when live entries alone exceed the limit; otherwise a
                // same-size rebuild that drops the tombstones is enough.
                const bool live_heavy = (_size + 1) * 10 > _tables.size() * 7;
                rehash(live_heavy ? __stl_next_prime(_tables.size() + 1) : _tables.size());
            }

            hash_type* slot = free_slot(data.first);
            if (slot == nullptr)
                return { nullptr, false };

            if (slot->_state == DELETE)
                --_tombstones; // a tombstone is being reused

            slot->_data = std::move(data);
            slot->_state = EXIST;
            ++_size;
            return { slot, true };
        }

        // Looks key up by linear probing.
        // Returns { slot, true } when a live entry with that key exists, otherwise
        // { nullptr, false }. Tombstones are skipped, never reported as matches.
        pair<hash_type*, bool> find(key_type key)
        {
            const size_t hash0 = _hash(key) % _tables.size();
            size_t hashi = hash0;

            // linear_search() advances i, so the loop visits each slot at most once
            // and terminates even if the table contains no EMPTY slot.
            for (size_t i = 0; i < _tables.size(); )
            {
                hash_type& slot = _tables[hashi];
                if (slot._state == EMPTY)
                    break;
                if (slot._state == EXIST && slot._data.first == key)
                    return { &slot, true };
                hashi = linear_search(hash0, i);
            }
            return { nullptr, false };
        }

        // Erases key by turning its slot into a tombstone.
        // Returns true when an entry was erased, false when the key was absent.
        bool erase(key_type key)
        {
            hash_type* slot = find(key).first;
            if (slot == nullptr)
                return false;

            slot->_state = DELETE;
            --_size;
            ++_tombstones;
            return true;
        }

    private:
        vector<hash_type> _tables;
        size_t _size = 0;        // number of EXIST slots
        size_t _tombstones = 0;  // number of DELETE slots
        Hash _hash;

    private:
        // Returns the first slot on key's probe sequence that is not EXIST
        // (EMPTY or DELETE), or nullptr when every slot is occupied.
        hash_type* free_slot(const K& key)
        {
            const size_t hash0 = _hash(key) % _tables.size();
            size_t hashi = hash0;
            for (size_t i = 0; i < _tables.size(); )
            {
                if (_tables[hashi]._state != EXIST)
                    return &_tables[hashi];
                hashi = linear_search(hash0, i);
            }
            return nullptr;
        }

        // Rebuilds the table with bucket_count slots. Only live entries are carried
        // over; they must be re-placed because their index depends on the bucket count.
        void rehash(size_t bucket_count)
        {
            vector<hash_type> old(bucket_count);
            old.swap(_tables); // _tables is now fresh and empty, old holds the previous slots
            _tombstones = 0;

            for (auto& slot : old)
            {
                if (slot._state != EXIST)
                    continue;
                hash_type* target = free_slot(slot._data.first);
                target->_data = std::move(slot._data);
                target->_state = EXIST;
            }
        }

        // Linear probing: advances the probe counter i and returns the i-th slot
        // after hash0, wrapping around at the end of the table.
        size_t linear_search(size_t hash0, size_t& i)
        {
            ++i;
            hash0 = (hash0 + i) % _tables.size();
            return hash0;
        }

        // Quadratic probing in both directions: successive calls yield
        // hash0 + 1^2, hash0 - 1^2, hash0 + 2^2, hash0 - 2^2, ...
        // flag carries the direction between calls: 1 means the plus side was just
        // returned, size_t(-1) means the minus side (callers start with size_t(-1)).
        // i stops growing once it exceeds half the table size. Expects hash0 < table size.
        size_t bidirectional_search(size_t hash0, size_t& i, size_t& flag)
        {
            const size_t m = _tables.size();
            const size_t minus = static_cast<size_t>(-1);

            if (flag == minus)
            {
                // Previous probe was the minus side (or this is the first call):
                // move to the next distance and probe the plus side.
                flag = 1;
                if (i <= m / 2)
                    ++i;
            }
            else
            {
                flag = minus;
            }

            // Subtract modulo m explicitly instead of multiplying by an unsigned -1,
            // which would wrap around 2^N rather than around m.
            const size_t step = ((i % m) * (i % m)) % m;
            return flag == 1 ? (hash0 + step) % m : (hash0 + m - step) % m;
        }
    };
}

// 哈希桶 namespace hash_buckets { template<class K, class V> struct hash_node // 采用单链表的形式 { pair<K, V> _data; hash_node* _next = nullptr; }; template<class K, class V, class Hash = hash_func<K>> class hash_table { public: using key_type = K; using data_type = pair<K, V>; using node_type = hash_node<K, V>; using link_type = node_type*; using self = hash_table<K, V, Hash>; public: hash_table() : _tables(__stl_prime_list[0]), _size(0) {} ~hash_table() { for (auto& head : _tables) { link_type cur = head; while (cur) { head = cur->_next; delete cur; cur = head; } } _tables.~vector(); } hash_table(const hash_table& ht) { self _tmpht; _tmpht._tables.resize(ht._tables.size()); for (const auto& head : ht._tables) { link_type cur = head; while (cur) { link_type next = cur->_next; _tmpht.insert(cur->_data); cur = next; } } _tables.swap(_tmpht._tables); _size = ht._size; } hash_table& operator=(hash_table ht) { _tables.swap(ht._tables); _size = ht._size; return *this; } public: pair<link_type, bool> insert(data_type data) { if (find(data.first).second == true) return { nullptr, false }; // 负载因子大于等于 1 时扩容 if ((_size / _tables.size()) >= 1) // 均为 size_t,最后结果会保留整数 { // 采用直接移动节点的方法，否则需要再创造新节点删除旧节点 vector<link_type> newtables(__stl_prime_list[_tables.size() + 1]); //vector<link_type> newtables((_tables.size()) * 2); for (auto& head : _tables) { link_type cur = head; while (cur) { link_type next = cur->_next; size_t hash0 = _hash(cur->_data.first) % newtables.size(); if (newtables[hash0] != nullptr) cur->_next = newtables[hash0]; newtables[hash0] = cur; cur = next; } } _tables.swap(newtables); } size_t hash0 = _hash(data.first) % _tables.size(); link_type newnode = new node_type; newnode->_data = data; if(_tables[hash0] != nullptr) newnode->_next = _tables[hash0]; _tables[hash0] = newnode; ++_size; return { _tables[hash0], true }; } pair<link_type, bool> find(key_type key) { size_t hash0 = _hash(key) % _tables.size(); link_type cur = _tables[hash0]; while (cur) { if (cur->_data.first 
== key) return { cur, true }; cur = cur->_next; } return { nullptr, false }; } bool erase(key_type key) { pair<link_type, bool> ret = find(key); if (ret.second == false) return false; else { size_t hash0 = _hash(key) % _tables.size(); link_type cur = _tables[hash0]; link_type prev = _tables[hash0]; while (cur) { if (cur->_data.first == key) { if (cur == _tables[hash0]) _tables[hash0] = cur->_next; else prev->_next = cur->_next; delete cur; --_size; return true; } prev = cur; cur = cur->_next; } return false; } } private: vector<link_type> _tables; // 便于实现 iterator size_t _size = 0; Hash _hash; }; }

哈希表核心原理与 C++ 实现详解

一、哈希的概念

二、直接定址法

三、哈希函数

1. 除留余数法／除法散列法 (最常用)

2. 乘法散列法

3. 全域散列法

四、开放定址法

1. 线性探测

更多推荐文章

相关免费在线工具

2. 二次探测

3. 双重散列探测

五、链地址法

六、整型转换

1. 一般采用仿函数的方式转换

2. 对于 string 类型的整型转换

3. hash 表对于 key 类型的要求

更多推荐文章

相关免费在线工具

哈希表核心原理与 C++ 实现详解

一、哈希的概念

二、直接定址法

三、哈希函数

1. 除留余数法／除法散列法 (最常用)

2. 乘法散列法

3. 全域散列法

四、开放定址法

1. 线性探测

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

2. 二次探测

3. 双重散列探测

五、链地址法

六、整型转换

1. 一般采用仿函数的方式转换

2. 对于 string 类型的整型转换

3. hash 表对于 key 类型的要求

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具