C++ 哈希表核心机制：从哈希冲突到负载因子

文章配图

C++ 的两个参考文档

非官方文档：cplusplus
准官方文档（同步更新）：C++ 官方参考文档

set 和 multiset 的参考文档：set、multiset
map 和 multimap 的参考文档：map、multimap
unordered_set 和 unordered_multiset 的参考文档：unordered_set、unordered_multiset

文章配图

前情提示

文章配图

1 ~> 初识哈希

哈希 (hash) 又称散列，故哈希表又称散列表，是一种组织数据的方式。“哈希”是 hash 的音译，“散列”则是意译；从“散列”这个译名来看，有散乱排列的意思。哈希的本质就是通过哈希函数把关键字 Key 跟存储位置建立一个映射关系，查找时通过这个哈希函数计算出 Key 存储的位置，进行快速查找。

2 ~> 直接定址法

2.1 概念

#pragma once

#include <algorithm>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Ascending list of primes used as table sizes by both hash tables below.
static const int __stl_num_primes = 28;
static const unsigned long __stl_prime_list[__stl_num_primes] =
{
	53, 97, 193, 389, 769,
	1543, 3079, 6151, 12289, 24593,
	49157, 98317, 196613, 393241, 786433,
	1572869, 3145739, 6291469, 12582917, 25165843,
	50331653, 100663319, 201326611, 402653189, 805306457,
	1610612741, 3221225473, 4294967291
};

// Returns the first prime in __stl_prime_list that is >= n.
// When n is larger than every entry, the largest entry is returned instead.
inline unsigned long __stl_next_prime(unsigned long n)
{
	const unsigned long* first = __stl_prime_list;
	const unsigned long* last = __stl_prime_list + __stl_num_primes;
	const unsigned long* pos = std::lower_bound(first, last, n);
	return pos == last ? *(last - 1) : *pos;
}

// Default hash functor: converts the key to size_t with a plain cast.
// Key types for which that cast is not valid need a specialization or a
// user-supplied functor passed as the table's Hash template argument.
template<class K>
struct HashFunc
{
	size_t operator()(const K& key) const
	{
		return (size_t)key;
	}
};

// Specialization for std::string: folds every character into the hash with a
// multiplier of 131 (BKDR-style), so the characters' order affects the result.
template<>
struct HashFunc<std::string>
{
	size_t operator()(const std::string& key) const
	{
		size_t hash = 0;
		for (auto ch : key)
		{
			hash += ch;
			hash *= 131;
		}
		return hash;
	}
};

// Hash table using open addressing with linear probing.
namespace open_address
{
	// Slot state. DELETE marks a tombstone: the slot is free for reuse, but
	// lookups must keep probing past it.
	enum State
	{
		EMPTY,
		EXIST,
		DELETE
	};

	// One slot of the table: the stored key/value pair plus its state.
	template<class K, class V>
	struct HashData
	{
		std::pair<K, V> _kv;
		State _state = EMPTY;
	};

	template<class K, class V, class Hash = HashFunc<K>>
	class HashTable
	{
	public:
		// Starts with __stl_next_prime(1) slots, all EMPTY.
		HashTable()
			:_tables(__stl_next_prime(1))
		{}

		// Inserts kv. Returns false (and changes nothing) if the key is
		// already present, true otherwise.
		bool Insert(const std::pair<K, V>& kv)
		{
			if (Find(kv.first))
				return false;

			// Grow once the load factor reaches 0.7. A plain resize() of
			// _tables is not enough: every element's slot depends on the
			// table size, so all live elements are re-inserted into a
			// larger table, which also drops the tombstones.
			if (static_cast<double>(_n) / static_cast<double>(_tables.size()) >= 0.7)
			{
				HashTable<K, V, Hash> newht;
				newht._tables.resize(__stl_next_prime(_tables.size() + 1));
				for (size_t i = 0; i < _tables.size(); i++)
				{
					if (_tables[i]._state == EXIST)
					{
						newht.Insert(_tables[i]._kv);
					}
				}
				_tables.swap(newht._tables);
			}

			Hash hs;
			// Reduce modulo size(), not capacity(): operator[] is only valid
			// for indices below size().
			const size_t size = _tables.size();
			const size_t hash0 = hs(kv.first) % size;

			// Linear probing with wrap-around; EMPTY and DELETE slots are
			// both usable for the new element.
			size_t hashi = hash0;
			for (size_t i = 1; _tables[hashi]._state == EXIST; ++i)
			{
				hashi = (hash0 + i) % size;
			}

			_tables[hashi]._kv = kv;
			_tables[hashi]._state = EXIST;
			++_n;

			return true;
		}

		// Returns a pointer to the slot holding key, or nullptr if absent.
		HashData<K, V>* Find(const K& key)
		{
			Hash hs;
			const size_t size = _tables.size();
			const size_t hash0 = hs(key) % size;

			// Probe at most `size` slots. Tombstones are not counted in the
			// load factor, so after enough insert/erase cycles the table can
			// contain no EMPTY slot at all; an unbounded "until EMPTY" loop
			// would then never terminate for a missing key.
			for (size_t i = 0; i < size; ++i)
			{
				HashData<K, V>& slot = _tables[(hash0 + i) % size];
				if (slot._state == EMPTY)
				{
					return nullptr;
				}
				if (slot._state == EXIST && slot._kv.first == key)
				{
					return &slot;
				}
			}

			return nullptr;
		}

		// Removes key by turning its slot into a tombstone.
		// Returns true if the key was present.
		bool Erase(const K& key)
		{
			HashData<K, V>* ret = Find(key);
			if (ret == nullptr)
			{
				return false;
			}

			ret->_state = DELETE;
			--_n;
			return true;
		}

	private:
		std::vector<HashData<K, V>> _tables; // slot array
		size_t _n = 0;                       // number of EXIST slots
	};
}

// Hash table using separate chaining: every bucket is a singly linked list.
namespace Hash_bucket
{
	// Node of a bucket's linked list.
	template<class K, class V>
	struct HashNode
	{
		std::pair<K, V> _kv;
		HashNode<K, V>* _next;

		HashNode(const std::pair<K, V>& kv)
			:_kv(kv)
			,_next(nullptr)
		{}
	};

	template<class K, class V, class Hash = HashFunc<K>>
	class HashTable
	{
		typedef HashNode<K, V> Node;
	public:
		// Starts with __stl_next_prime(1) empty buckets.
		HashTable()
			:_tables(__stl_next_prime(1), nullptr)
			,_n(0)
		{}

		// Deep copy. The table owns its nodes through raw pointers, so the
		// compiler-generated copy would share them and both destructors
		// would delete the same nodes.
		HashTable(const HashTable& other)
			:_tables(other._tables.size(), nullptr)
			,_n(0)
		{
			try
			{
				for (size_t i = 0; i < other._tables.size(); ++i)
				{
					// Append at the tail so each chain keeps its order.
					Node** tail = &_tables[i];
					for (const Node* cur = other._tables[i]; cur; cur = cur->_next)
					{
						*tail = new Node(cur->_kv);
						tail = &(*tail)->_next;
						++_n;
					}
				}
			}
			catch (...)
			{
				// The destructor does not run for a partially constructed
				// object, so release what was allocated so far.
				Clear();
				throw;
			}
		}

		// Copy-and-swap assignment: `other` is already a deep copy, and the
		// previous contents are released when it goes out of scope.
		HashTable& operator=(HashTable other)
		{
			_tables.swap(other._tables);
			std::swap(_n, other._n);
			return *this;
		}

		// Frees every node in every bucket.
		~HashTable()
		{
			Clear();
		}

		// Inserts kv. Returns false (and changes nothing) if the key is
		// already present, true otherwise.
		bool Insert(const std::pair<K, V>& kv)
		{
			if (Find(kv.first))
				return false;

			Hash hs; // every key goes through hs before the modulo

			// Grow when the load factor reaches 1.
			if (_n == _tables.size())
			{
				// Existing nodes are relinked into the new bucket array
				// instead of being re-inserted, so no node is allocated or
				// freed during the rehash.
				std::vector<Node*> newtables(__stl_next_prime(_tables.size() + 1), nullptr);
				for (size_t i = 0; i < _tables.size(); i++)
				{
					Node* cur = _tables[i];
					while (cur)
					{
						Node* next = cur->_next;

						// Push front into the destination bucket.
						size_t hashi = hs(cur->_kv.first) % newtables.size();
						cur->_next = newtables[hashi];
						newtables[hashi] = cur;

						cur = next;
					}
					_tables[i] = nullptr;
				}
				_tables.swap(newtables);
			}

			size_t hashi = hs(kv.first) % _tables.size();

			// Push front.
			Node* newnode = new Node(kv);
			newnode->_next = _tables[hashi];
			_tables[hashi] = newnode;
			++_n;

			return true;
		}

		// Returns the node holding key, or nullptr if absent.
		Node* Find(const K& key)
		{
			Hash hs;
			size_t hashi = hs(key) % _tables.size();
			for (Node* cur = _tables[hashi]; cur; cur = cur->_next)
			{
				if (cur->_kv.first == key)
				{
					return cur;
				}
			}
			return nullptr;
		}

		// Unlinks and deletes the node holding key.
		// Returns true if the key was present.
		bool Erase(const K& key)
		{
			Hash hs;
			size_t hashi = hs(key) % _tables.size();
			Node* prev = nullptr;
			Node* cur = _tables[hashi];
			while (cur)
			{
				if (cur->_kv.first == key)
				{
					if (prev == nullptr)
					{
						// Removing the first node of the bucket.
						_tables[hashi] = cur->_next;
					}
					else
					{
						prev->_next = cur->_next;
					}

					--_n;
					delete cur;
					return true;
				}

				prev = cur;
				cur = cur->_next;
			}

			return false;
		}

	private:
		// Deletes all nodes, resets every bucket to nullptr and _n to 0.
		void Clear()
		{
			for (size_t i = 0; i < _tables.size(); i++)
			{
				Node* cur = _tables[i];
				while (cur)
				{
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				_tables[i] = nullptr;
			}
			_n = 0;
		}

		std::vector<Node*> _tables; // bucket array: head pointer of each chain
		size_t _n;                  // number of stored elements
	};
}

#define _CRT_SECURE_NO_WARNINGS 1 #include<iostream> #include<unordered_map> using namespace std; #include"HashTable.h" namespace open_address { void TestHT1() { int a[] = { 19,30,5,36,13,20,21,12,58 }; HashTable<int, int> ht; for (auto e : a) { ht.Insert({ e,e }); } ht.Insert({ 2,2 }); ht.Insert({ 22,22 }); cout << ht.Find(5) << endl; cout << ht.Find(58) << endl; ht.Erase(5); cout << ht.Find(5) << endl; cout << ht.Find(58) << endl; //for (size_t i = 0; i < 100; i++) //{ // ht.Insert({ rand(),i }); //} } struct HashFuncString { // BKDR size_t operator()(const string& key) { size_t hash = 0; for (auto ch : key) { hash += ch; hash *= 131; } return hash; } }; void TestHT2() { //HashTable<string, string, HashFuncString> dict; HashTable<string, string> dict; dict.Insert({ "string","字符串" }); // string 无法取模 dict.Insert({ "string","字符串 1" }); dict.Insert({ "left","左边" }); dict.Insert({ "right","右边" }); cout << dict.Find("string") << endl; cout << dict.Find("left") << endl; cout << dict.Find("left ") << endl; HashFuncString hfs; cout << hfs("abcd") << endl; cout << hfs("acbd") << endl; cout << hfs("aadd") << endl; unordered_map<string, string> dictmap; dictmap.insert({ "string","字符串" }); // 编译报错，需要自己实现 Hash 的仿函数把 key 转成整形 //unordered_map<pair<string, int>, string> um; //um.insert({ {"string", 1}, "字符串" }); } } namespace Hash_bucket { void TestHT1() { int a[] = { 19,30,5,36,13,20,21,12,58 }; HashTable<int, int> ht; for (auto e : a) { ht.Insert({ e,e }); } ht.Insert({ 2,2 }); ht.Insert({ 22,22 }); ht.Insert({ 44,44 }); // 这两个过了就说明代码没问题了：先删 58 再删 36 ht.Erase(58); ht.Erase(36); } void TestHT2() { HashTable<string, string> dict; dict.Insert({ "string","字符串" }); // string 无法取模 dict.Insert({ "string","字符串 1" }); dict.Insert({ "left","左边" }); dict.Insert({ "right","右边" }); cout << dict.Find("string") << endl; cout << dict.Find("left") << endl; cout << dict.Find("left ") << endl; } } int main() { // open_address::TestHT1(); //open_address::TestHT2(); Hash_bucket::TestHT1(); 
//Hash_bucket::TestHT2(); return 0; }

C++ 哈希表核心机制：从哈希冲突到负载因子