Hash#(理解,实现与应用)_随笔

Hash#/(理解,实现与应用)

哈希概念

常见的哈希函数

直接定址法

除留余数法

闭散列和开散列

闭散列

开散列

封装unordered_map

封装unordered_set

哈希的应用

位图

布隆过滤器

哈希切割

unordered系列的关联式容器之所以效率比较高，是因为其底层使用了哈希结构哈希概念顺序结构以及平衡树中，元素关键码与其存储位置之间没有对应的关系，因此在查找一个元素时，必须要经过关键码的多次比较. 顺序查找时间复杂度为 O(N) ，平衡树中为树的高度，即 O( log2 N)，搜索的效率取决于搜索过程中元素的比较次数。理想的搜索方法：可以不经过任何比较，一次直接从表中得到要搜索的元素。如果构造一种存储结构，通过某种函数(hashFunc)使元素的存储位置与它的关键码之间能够建立一一映射的关系，那么在查找时通过该函数可以很快找到该元素。当向该结构中：插入元素根据待插入元素的关键码，以此函数计算出该元素的存储位置并按此位置进行存放搜索元素对元素的关键码进行同样的计算，把求得的函数值当做元素的存储位置，在结构中按此位置取元素比较，若关键码相等，则搜索成功该方式即为哈希(散列)方法，哈希方法中使用的转换函数称为哈希(散列)函数，构造出来的结构称为哈希表 (Hash Table)(或者称散列表) 常见的哈希函数直接定址法

取关键字的某个线性函数为散列地址： Hash （ Key ） = A*Key + B 优点：简单、均匀缺点：需要事先知道关键字的分布情况使用场景：适合查找比较小且连续的情况

除留余数法

设散列表中允许的地址数为m ，取一个不大于 m ，但最接近或者等于 m 的质数 p 作为除数，按照哈希函数：Hash(key) = key% p(p<=m),将关键码转换成哈希地址

哈希冲突: 不同关键字通过相同哈希函数计算出相同的哈希地址，该种现象称为哈希冲突

那么如何解决哈希冲突呢?

闭散列和开散列闭散列

也叫开放定址法,当发生哈希冲突时，如果哈希表未被装满，说明在哈希表中必然还有空位置，那么可以把key存放到冲突位置中的“下一个” 空位置中去。

那么这个位置如何去找呢?

线性探测二次探测

线性探测:通过哈希映射出来的位置已经冲突,那就需要往后线性找一个空位置存数据

Hash(key)=key%len + i(i=0,1,2,3...)

二次探测:通过哈希映射出来的位置已经冲突,那就需要往后次方性找一个空位置存数据

Hash(key)=key%len + i^2(i=0,1,2,3...)

闭散列实现

namespace CLOSE_HASH
{
	enum State{EMPTY,EXITS,DELETE};
	template
	struct HashDate
	{
		pair _kv;
		State _state=EMPTY;
	};
	// 特化
	template
	struct Hash
	{
		size_t operator()(const K& key)
		{
			return key;
		}
	};
	template<>
	struct Hash
	{
		// "int"  "insert" 
		// 字符串转成对应一个整形值，因为整形才能取模算映射位置
		// 期望->字符串不同，转出的整形值尽量不同
		// "abcd" "bcad"
		// "abbb" "abca"
		size_t operator()(const string& s)
		{
			// BKDR Hash
			size_t value = 0;
			for (auto ch : s)
			{
				value += ch;
				value *= 131;
			}

			return value;
		}
	};

	template>
	class HashTable
	{
	public:
		bool Insert(const pair& kv)
		{
			HashDate* ret = find(kv.first);
			if (_table.size() == 0)
			{
				_table.resize(10);
			}
			else if(_size*10 / _table.size() > 7)
			{
				HashTable newHT;
				newHT._table.resize(_table.size() * 2);

				for (auto& e : _table)
				{
					newHT.Insert(e._kv);
				}
				_table.swap(newHT._table);
			}
			KHashFunc hf;
			size_t start = hf(kv.first) % _table.size();
			size_t index = start;
			size_t i = 1;
			while (_table[index]._state == EXITS)
			{
				index = start + i;
				index %= _table.size();
				++i;
				//index += i ^ 2;
			}
			_table[index]._kv = kv;
			_table[index]._state = EXITS;
			++_size;
			return true;
		}
		HashDate* find(const K& key)
		{
			KHashFunc hf;
			if (_table.size() == 0)
			{
				return nullptr;
			}
			size_t i = 1;
			size_t start = hf(key) % _table.size();
			size_t index = start;
			while (_table[index]._state != EMPTY)
			{
				if (_table[index]._kv.first==key&&_table[index]._state==EXITS)
				{
					return &_table[index];
				}
				index = start + i;
				index %= _table.size();
				++i;
			}
			return nullptr;
		}
		bool Erase(const K& key)
		{
			HashDate* ret = find(key);
			if (ret == nullptr)
			{
				return false;
			}
			else
			{
				ret->_state = DELETE;
				return true;
			}
		}
	private:
		vector> _table;
		size_t _size=0;//存储的有效数据的个数
	};
}

开散列

开散列法又叫链地址法 ( 开链法 ) ，首先对关键码集合用散列函数计算散列地址，具有相同地址的关键码归于同一子集合，每一个子集合称为一个桶，各个桶中的元素通过一个单链表链接起来，各链表的头结点存储在哈希表中,也叫哈希桶

namespace OpenHash
{
	template
	struct Hash
	{
		size_t operator()(const K& key)
		{
			return key;
		}
	};
	// 特化
	template<>
	struct Hash < string >
	{
		// "int"  "insert" 
		// 字符串转成对应一个整形值，因为整形才能取模算映射位置
		// 期望->字符串不同，转出的整形值尽量不同
		// "abcd" "bcad"
		// "abbb" "abca"
		size_t operator()(const string& s)
		{
			// BKDR Hash
			size_t value = 0;
			for (auto ch : s)
			{
				value += ch;
				value *= 131;
			}

			return value;
		}
	};

	template
	struct HashNode
	{
		HashNode* _next;
		T _data;

		HashNode(const T& data)
			:_next(nullptr)
			, _data(data)
		{}
	};

	// 前置声明
	template
	class HashTable;

	// 迭代器
	template>
	struct __HTIterator
	{
		typedef HashNode Node;
		typedef __HTIterator Self;
		typedef HashTable HT;
		Node* _node;
		HT* _pht;

		__HTIterator(Node* node, HT* pht)
			:_node(node)
			, _pht(pht)
		{}

		Self& operator++()
		{
			// 1、当前桶中还有数据，那么就在当前桶往后走
			if (_node->_next)
			{
				_node = _node->_next;
			}
			// 2、当前桶走完了，需要往下一个桶去走。
			else
			{
				//size_t index = HashFunc()(KeyOfT()(_node->_data)) % _pht->_table.size();
				KeyOfT kot;
				HashFunc hf;
				size_t index = hf(kot(_node->_data)) % _pht->_table.size();

				++index;
				while (index < _pht->_table.size())
				{
					if (_pht->_table[index])
					{
						_node = _pht->_table[index];
						return *this;
					}
					else
					{
						++index;
					}
				}
				_node = nullptr;
			}

			return *this;
		}
		T& operator*()
		{
			return _node->_data;
		}
		T* operator->()
		{
			return &_node->_data;
		}
		bool operator != (const Self& s) const
		{
			return _node != s._node;
		}
		bool operator == (const Self& s) const
		{
			return _node == s.node;
		}
	};

	template>
	class HashTable
	{
		typedef HashNode Node;
		template
		friend struct __HTIterator;
	public:
		typedef __HTIterator iterator;

		HashTable() = default; // 显示指定生成默认构造

		HashTable(const HashTable& ht)
		{
			_n = ht._n;
			_table.resize(ht._table.size());
			for (size_t i = 0; i < ht._table.size(); i++)
			{
				Node* cur = ht._table[i];
				while (cur)
				{
					Node* copy = new Node(cur->_data);
					// 头插到新表
					copy->_next = _table[i];
					_table[i] = copy;

					cur = cur->_next;
				}
			}
		}

		HashTable& operator=(HashTable ht)
		{
			_table.swap(ht._table);
			swap(_n, ht._n);

			return *this;
		}

		~HashTable()
		{
			for (size_t i = 0; i < _table.size(); ++i)
			{
				Node* cur = _table[i];
				while (cur)
				{
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				_table[i] = nullptr;
			}
		}
		iterator begin()
		{
			size_t i = 0;
			while (i < _table.size())
			{
				if (_table[i])
				{
					return iterator(_table[i], this);
				}
				++i;
			}
			return end();
		}
		iterator end()
		{
			return iterator(nullptr, this);
		}
		size_t GetNextPrime(size_t prime)
		{
			const int PRIMECOUNT = 28;
			static const size_t primeList[PRIMECOUNT] =
			{
				53ul, 97ul, 193ul, 389ul, 769ul,
				1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
				49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
				1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
				50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
				1610612741ul, 3221225473ul, 4294967291ul
			};

			size_t i = 0;
			for (; i < PRIMECOUNT; ++i)
			{
				if (primeList[i] > prime)
					return primeList[i];
			}

			return primeList[i];
		}

		pair Insert(const T& data)
		{
			KeyOfT kot;
			// 找到了
			auto ret = Find(kot(data));
			if (ret != end())
				return make_pair(ret, false);

			HashFunc hf;
			// 负载因子到1时，进行增容
			if (_n == _table.size())
			{
				vector newtable;
				//size_t newSize = _table.size() == 0 ? 8 : _table.size() * 2;
				//newtable.resize(newSize, nullptr);
				newtable.resize(GetNextPrime(_table.size()));

				// 遍历取旧表中节点，重新算映射到新表中的位置，挂到新表中
				for (size_t i = 0; i < _table.size(); ++i)
				{
					if (_table[i])
					{
						Node* cur = _table[i];
						while (cur)
						{
							Node* next = cur->_next;
							size_t index = hf(kot(cur->_data)) % newtable.size();
							// 头插
							cur->_next = newtable[index];
							newtable[index] = cur;

							cur = next;
						}
						_table[i] = nullptr;
					}
				}

				_table.swap(newtable);
			}

			size_t index = hf(kot(data)) % _table.size();
			Node* newnode = new Node(data);

			// 头插
			newnode->_next = _table[index];
			_table[index] = newnode;
			++_n;

			return make_pair(iterator(newnode, this), true);
		}

		iterator Find(const K& key)
		{
			if (_table.size() == 0)
			{
				return end();
			}
			KeyOfT kot;
			HashFunc hf;
			size_t index = hf(key) % _table.size();
			Node* cur = _table[index];
			while (cur)
			{
				if (kot(cur->_data) == key)
				{
					return iterator(cur, this);
				}
				else
				{
					cur = cur->_next;
				}
			}

			return end();
		}

		bool Erase(const K& key)
		{
			size_t index = hf(key) % _table.size();
			Node* prev = nullptr;
			Node* cur = _table[index];
			while (cur)
			{
				if (kot(cur->_data) == key)
				{
					if (_table[index] == cur)
					{
						_table[index] = cur->_next;
					}
					else
					{
						prev->_next = cur->_next;
					}

					--_n;
					delete cur;
					return true;
				}
				prev = cur;
				cur = cur->_next;
			}
			return false;
		}
	private:
		vector _table;
		size_t _n = 0;         // 有效数据的个数
	};
}

封装unordered_map

namespace ljx
{
	template
	class unordered_map
	{
		struct MapKeyOfT
		{
			const K& operator()(const pair& kv)
			{
				return kv.first;
			}
		};
	public:
		typedef typename OpenHash::HashTable, MapKeyOfT>::iterator iterator;
		iterator begin()
		{
			return _ht.begin();
		}
		iterator end()
		{
			return _ht.end();
		}
		pair insert(const pair& kv)
		{
			return _ht.Insert(kv);
		}
		V& operator[](const K& key)
		{
			pair ret = _ht.Insert(make_pair(key, V()));
			return ret.first->second;
		}
	private:
		OpenHash::HashTable, MapKeyOfT> _ht;
	};
}

封装unordered_set

namespace ljx
{
	template
	class unordered_set
	{
		struct SetKeyOfT
		{
			const K& operator()(const K& k)
			{
				return k;
			}
		};
	public:
		typedef typename OpenHash::HashTable::iterator iterator;
		iterator begin()
		{
			return _ht.begin();
		}
		iterator end()
		{
			return _ht.end();
		}
		pair insert(const K k)
		{
			return _ht.Insert(k);
		}
	private:
		OpenHash::HashTable _ht;
	};
}

哈希的应用位图所谓位图，就是用每一位来存放某种状态，适用于海量数据，数据无重复的场景。通常是用来判断某个数据存不存在的。实现

namespace Y
{
	template
	class BitSet
	{
	public:
		BitSet()
		{
			_bits.resize(N / 32 + 1, 0);
		}

		// 把x映射的位标记成1
		void Set(size_t x)
		{
			assert(x < N);

			// 算出x映射的位在第i个整数
			// 算出x映射的位在这个整数的第j个位
			size_t i = x / 32;
			size_t j = x % 32;

			// _bits[i] 的第j位标记成1，并且不影响他的其他位
			_bits[i] |= (1 << j);
		}

		void Reset(size_t x)
		{
			assert(x < N);

			size_t i = x / 32;
			size_t j = x % 32;

			// _bits[i] 的第j位标记成0，并且不影响他的其他位
			_bits[i] &= (~(1 << j));
		}

		bool Test(size_t x)
		{
			assert(x < N);

			size_t i = x / 32;
			size_t j = x % 32;

			// 如果第j位是1，结果是非0，非0就是真
			// 如果第j为是0，结果是0，0就是假
			return _bits[i] & (1 << j);
		}
	private:
		vector _bits;
	};
}

应用于求交集,快速查找一个数是否在一个集合中.

优点:节省空间,速度快缺点:只能处理整形

布隆过滤器

布隆过滤器是由布隆（Burton Howard Bloom）在1970年提出的一种紧凑型的、比较巧妙的概率型数据结构，特点是高效地插入和查询，可以用来告诉你 “某样东西一定不存在或者可能存在”，它是用多个哈希函数，将一个数据映射到位图结构中。此种方式不仅可以提升查询效率，也可以节省大量的内存空间

布隆过滤器的思想是将一个元素用多个哈希函数映射到一个位图中，因此被映射到的位置的比特位一定为1 。所以可以按照以下方式进行查找：分别计算每个哈希值对应的比特位置存储的是否为零，只要有一个为零，代表该元素一定不在哈希表中，否则可能在哈希表中。

判断在不准确,存在误判,判断不在,准确;针对更多的是字符串

一般不支持删除

有一种删除的办法,使用多个比特位作为计数器,多个值映射时,++计数,删除时,--计数;

struct HashBKDR
{
	// "int"  "insert" 
	// 字符串转成对应一个整形值，因为整形才能取模算映射位置
	// 期望->字符串不同，转出的整形值尽量不同
	// "abcd" "bcad"
	// "abbb" "abca"
	size_t operator()(const std::string& s)
	{
		// BKDR Hash
		size_t value = 0;
		for (auto ch : s)
		{
			value += ch;
			value *= 131;
		}

		return value;
	}
};

struct HashAP
{
	// "int"  "insert" 
	// 字符串转成对应一个整形值，因为整形才能取模算映射位置
	// 期望->字符串不同，转出的整形值尽量不同
	// "abcd" "bcad"
	// "abbb" "abca"
	size_t operator()(const std::string& s)
	{
		// AP Hash
		register size_t hash = 0;
		size_t ch;
		for (long i = 0; i < s.size(); i++)
		{
			ch = s[i];
			if ((i & 1) == 0)
			{
				hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
			}
			else
			{
				hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
			}
		}
		return hash;
	}
};

struct HashDJB
{
	// "int"  "insert" 
	// 字符串转成对应一个整形值，因为整形才能取模算映射位置
	// 期望->字符串不同，转出的整形值尽量不同
	// "abcd" "bcad"
	// "abbb" "abca"
	size_t operator()(const std::string& s)
	{
		// BKDR Hash
		register size_t hash = 5381;
		for (auto ch : s)
		{
			hash += (hash << 5) + ch;
		}

		return hash;
	}
};

template
class BloomFilter
{
public:
	void Set(const K& key)
	{
		//Hash1 hf1;
		//size_t i1 = hf1(key);
		size_t i1 = Hash1()(key) % N;
		size_t i2 = Hash2()(key) % N;
		size_t i3 = Hash3()(key) % N;

		cout << i1 << " " << i2 << " " << i3 << endl;

		_bitset.Set(i1);
		_bitset.Set(i2);
		_bitset.Set(i3);
	}

	bool Test(const K& key)
	{
		size_t i1 = Hash1()(key) % N;
		if (_bitset.Test(i1) == false)
		{
			return false;
		}

		size_t i2 = Hash2()(key) % N;
		if (_bitset.Test(i2) == false)
		{
			return false;
		}

		size_t i3 = Hash3()(key) % N;
		if (_bitset.Test(i3) == false)
		{
			return false;
		}

		// 这里3个位都在，有可能是其他key占了，在是不准确的，存在误判
		// 不在是准确的
		return true; 
	}

private:
	bit::BitSet _bitset;
	bit::vector _bitset;

};

void TestBloomFilter()
{
	

	BloomFilter<600> bf;

	size_t N = 100;
	std::vector v1;
	for (size_t i = 0; i < N; ++i)
	{
		std::string url = "https://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html";
		url += std::to_string(1234 + i);
		v1.push_back(url);
	}

	for (auto& str : v1)
	{
		bf.Set(str);
	}

	for (auto& str : v1)
	{
		cout << bf.Test(str) << endl;
	}
	cout << endl << endl;

	std::vector v2;
	for (size_t i = 0; i < N; ++i)
	{
		std::string url = "https://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html";
		url += std::to_string(6789 + i);
		v2.push_back(url);
	}

	size_t n2 = 0;
	for (auto& str : v2)
	{
		if (bf.Test(str))
		{
			++n2;
		}
	}
	cout << "相似字符串误判率:" << (double)n2 / (double)N << endl;

	std::vector v3;
	for (size_t i = 0; i < N; ++i)
	{
		std::string url = "https://zhuanlan.zhihu.com/p/43263751";
		url += std::to_string(6789 + i);
		v3.push_back(url);
	}

	size_t n3 = 0;
	for (auto& str : v3)
	{
		if (bf.Test(str))
		{
			++n3;
		}
	}
	cout << "不相似字符串误判率:" << (double)n3 / (double)N << endl;

}

哈希切割

哈希切割就是一种哈希的切割思想,通过切分成不均匀的部分,方便 *** 作

通过切分把相同元素放到同一个下标的文件中,然后去比较

比如给一个超过100G大小的log fifile, log中存着IP地址, 设计算法找到出现次数最多的IP地址？

方法:

统计次数

欢迎分享，转载请注明来源：内存溢出

原文地址: https://www.outofmemory.cn/zaji/5721975.html

Hash#(理解,实现与应用)

发表评论

评论列表（0条）