練習37：哈希表 · 笨辦法學C

# 練習37：哈希表 > 原文：[Exercise 37: Hashmaps](http://c.learncodethehardway.org/book/ex37.html) > 譯者：[飛龍](https://github.com/wizardforcel) 哈希表（`HashMap`、`HashTable`以及`Dictionary`）廣泛用于許多動態編程語言來儲存鍵值對的數據。哈希表通過在鍵上執行“哈希”運算產生整數，之后使用它來尋找相應的桶來獲取或儲存值。它是非常快速的使用數據結構，因為它適用于任何數據并且易于實現。下面是哈希表（也叫作字典）的一個使用示例： ```c fruit_weights = {'Apples': 10, 'Oranges': 100, 'Grapes': 1.0} for key, value in fruit_weights.items(): print key, "=", value ``` 幾乎所有現代語言都具備這種特性，所以許多人寫完代碼都不知道它實際上如何工作。通過在C中創建`Hashmap`數據結構，我會向你展示它的工作原理。我會從頭文件開始，來談論整個數據結構。 ```c #ifndef _lcthw_Hashmap_h #define _lcthw_Hashmap_h #include <stdint.h> #include <lcthw/darray.h> #define DEFAULT_NUMBER_OF_BUCKETS 100 typedef int (*Hashmap_compare)(void *a, void *b); typedef uint32_t (*Hashmap_hash)(void *key); typedef struct Hashmap { DArray *buckets; Hashmap_compare compare; Hashmap_hash hash; } Hashmap; typedef struct HashmapNode { void *key; void *data; uint32_t hash; } HashmapNode; typedef int (*Hashmap_traverse_cb)(HashmapNode *node); Hashmap *Hashmap_create(Hashmap_compare compare, Hashmap_hash); void Hashmap_destroy(Hashmap *map); int Hashmap_set(Hashmap *map, void *key, void *data); void *Hashmap_get(Hashmap *map, void *key); int Hashmap_traverse(Hashmap *map, Hashmap_traverse_cb traverse_cb); void *Hashmap_delete(Hashmap *map, void *key); #endif ``` 這個結構就是`Hashmap`，含有許多`HashmapNode`節點。觀察`Hashmap`你會看到它類似這樣： `DArray *buckets` 一個動態數組，設置為100個桶的固定大小。每個桶會含有一個`DArray`，來實際存檔`HashmapNode`對。 `Hashmap_compare compare` 這是一個比較函數，被`Hashmap`用于實際用過鍵尋找元素。它應該和其它的比較函數類似，并且默認設置為`bstrcmp`來比較字符串。 `Hashmap_hash` 這是哈希函數，它用于接收鍵，處理它的內容，之后產生一個`uint32_t`索引數值。之后你會看到默認的實現。這些告訴了你數據如何存儲，但是用作`buckets`的`DArray`還沒有創建。要記住它具有二層結構； + 第一層有100個桶，數據基于它們的哈希值儲存在桶中。 + 每個桶都是一個`DArray`，其中含有`HashmapNode`，添加時只是簡單地附加到末尾。 `HashMapNode`由下面三個元素組成： `void *key` 鍵值對的鍵。 `void *value` 鍵值對的值。 `uint32_t hash` 計算出的哈希值，它用于使查找該節點更加迅速，只要判斷鍵是否相等。有文件的剩余部分沒有新的東西，所以我現在可以向你展示`hashmap.c`的實現了： ```c #undef NDEBUG #include <stdint.h> #include <lcthw/hashmap.h> #include <lcthw/dbg.h> #include <lcthw/bstrlib.h> static int default_compare(void *a, void *b) { return bstrcmp((bstring)a, (bstring)b); } /** * Simple Bob Jenkins's hash algorithm taken from the * wikipedia description. */ static uint32_t default_hash(void *a) { size_t len = blength((bstring)a); char *key = bdata((bstring)a); uint32_t hash = 0; uint32_t i = 0; for(hash = i = 0; i < len; ++i) { hash += key[i]; hash += (hash << 10); hash ^= (hash >> 6); } hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash; } Hashmap *Hashmap_create(Hashmap_compare compare, Hashmap_hash hash) { Hashmap *map = calloc(1, sizeof(Hashmap)); check_mem(map); map->compare = compare == NULL ? default_compare : compare; map->hash = hash == NULL ? default_hash : hash; map->buckets = DArray_create(sizeof(DArray *), DEFAULT_NUMBER_OF_BUCKETS); map->buckets->end = map->buckets->max; // fake out expanding it check_mem(map->buckets); return map; error: if(map) { Hashmap_destroy(map); } return NULL; } void Hashmap_destroy(Hashmap *map) { int i = 0; int j = 0; if(map) { if(map->buckets) { for(i = 0; i < DArray_count(map->buckets); i++) { DArray *bucket = DArray_get(map->buckets, i); if(bucket) { for(j = 0; j < DArray_count(bucket); j++) { free(DArray_get(bucket, j)); } DArray_destroy(bucket); } } DArray_destroy(map->buckets); } free(map); } } static inline HashmapNode *Hashmap_node_create(int hash, void *key, void *data) { HashmapNode *node = calloc(1, sizeof(HashmapNode)); check_mem(node); node->key = key; node->data = data; node->hash = hash; return node; error: return NULL; } static inline DArray *Hashmap_find_bucket(Hashmap *map, void *key, int create, uint32_t *hash_out) { uint32_t hash = map->hash(key); int bucket_n = hash % DEFAULT_NUMBER_OF_BUCKETS; check(bucket_n >= 0, "Invalid bucket found: %d", bucket_n); *hash_out = hash; // store it for the return so the caller can use it DArray *bucket = DArray_get(map->buckets, bucket_n); if(!bucket && create) { // new bucket, set it up bucket = DArray_create(sizeof(void *), DEFAULT_NUMBER_OF_BUCKETS); check_mem(bucket); DArray_set(map->buckets, bucket_n, bucket); } return bucket; error: return NULL; } int Hashmap_set(Hashmap *map, void *key, void *data) { uint32_t hash = 0; DArray *bucket = Hashmap_find_bucket(map, key, 1, &hash); check(bucket, "Error can't create bucket."); HashmapNode *node = Hashmap_node_create(hash, key, data); check_mem(node); DArray_push(bucket, node); return 0; error: return -1; } static inline int Hashmap_get_node(Hashmap *map, uint32_t hash, DArray *bucket, void *key) { int i = 0; for(i = 0; i < DArray_end(bucket); i++) { debug("TRY: %d", i); HashmapNode *node = DArray_get(bucket, i); if(node->hash == hash && map->compare(node->key, key) == 0) { return i; } } return -1; } void *Hashmap_get(Hashmap *map, void *key) { uint32_t hash = 0; DArray *bucket = Hashmap_find_bucket(map, key, 0, &hash); if(!bucket) return NULL; int i = Hashmap_get_node(map, hash, bucket, key); if(i == -1) return NULL; HashmapNode *node = DArray_get(bucket, i); check(node != NULL, "Failed to get node from bucket when it should exist."); return node->data; error: // fallthrough return NULL; } int Hashmap_traverse(Hashmap *map, Hashmap_traverse_cb traverse_cb) { int i = 0; int j = 0; int rc = 0; for(i = 0; i < DArray_count(map->buckets); i++) { DArray *bucket = DArray_get(map->buckets, i); if(bucket) { for(j = 0; j < DArray_count(bucket); j++) { HashmapNode *node = DArray_get(bucket, j); rc = traverse_cb(node); if(rc != 0) return rc; } } } return 0; } void *Hashmap_delete(Hashmap *map, void *key) { uint32_t hash = 0; DArray *bucket = Hashmap_find_bucket(map, key, 0, &hash); if(!bucket) return NULL; int i = Hashmap_get_node(map, hash, bucket, key); if(i == -1) return NULL; HashmapNode *node = DArray_get(bucket, i); void *data = node->data; free(node); HashmapNode *ending = DArray_pop(bucket); if(ending != node) { // alright looks like it's not the last one, swap it DArray_set(bucket, i, ending); } return data; } ``` 這個實現中并沒有什么復雜的東西，但是`default_hash`和`Hashmap_find_bucket`需要一些解釋。當你使用`Hashmap_create`時，你可以傳入任何定制的比較和哈希函數。但是如果你沒有則會使用`default_compare`和`default_hash`函數。需要觀察的第一件事，是`default_hash`的行為。這是一個簡單的哈希函數，叫做“Jenkins hash”，以Bob Jenkins的名字命名。我從[維基百科](http://en.wikipedia.org/wiki/Jenkins_hash_function)上獲得了這個算法。它僅僅遍歷鍵（`bstring`）的每個字節來計算哈希，以便得出`uint32_t`的結果。它使用一些加法和異或運算來實現。哈希函數有很多中，它們具有不同的特性，然而一旦你選擇了一種，就需要一種方法來使用它找到正確的桶。`Hashmap_find_bucket`像這樣實現它： + 首先調用` map->hash(key)`來獲得鍵的哈希值。 + 之后使用`hash % DEFAULT_NUMBER_OF_BUCKETS`，這樣無論哈希值有多大，都能找到匹配的桶。 + 找到桶之后，它是個`DArray`，可能還沒有創建，這取決與`create`變量的內容。 + 一旦找到了正確的`DArray`桶，就會將它返回，并且`hash_out`變量用于向調用者提供所找到的哈希值。其它函數都使用`Hashmap_find_bucket`來完成工作： + 設置鍵值對涉及到找到正確的桶，之后創建`HashmapNode`，將它添加到桶中。 + 獲取鍵值涉及到找到正確的桶，之后找到匹配`hash`和`key`的`HashmapNode`。 + 刪除元素也需要找到正確的桶，找到所需的節點，之后通過與末尾的節點交換位置來刪除。你需要學習的唯一一個其他函數是`Hashmap_travers`，它僅僅遍歷每個桶，對于任何含有值的桶，在每個值上調用`traverse_cb`。這就是掃描整個`Hashmap`的辦法。 ## 單元測試最后你需要編寫單元測試，對于所有這些操作： ```c #include "minunit.h" #include <lcthw/hashmap.h> #include <assert.h> #include <lcthw/bstrlib.h> Hashmap *map = NULL; static int traverse_called = 0; struct tagbstring test1 = bsStatic("test data 1"); struct tagbstring test2 = bsStatic("test data 2"); struct tagbstring test3 = bsStatic("xest data 3"); struct tagbstring expect1 = bsStatic("THE VALUE 1"); struct tagbstring expect2 = bsStatic("THE VALUE 2"); struct tagbstring expect3 = bsStatic("THE VALUE 3"); static int traverse_good_cb(HashmapNode *node) { debug("KEY: %s", bdata((bstring)node->key)); traverse_called++; return 0; } static int traverse_fail_cb(HashmapNode *node) { debug("KEY: %s", bdata((bstring)node->key)); traverse_called++; if(traverse_called == 2) { return 1; } else { return 0; } } char *test_create() { map = Hashmap_create(NULL, NULL); mu_assert(map != NULL, "Failed to create map."); return NULL; } char *test_destroy() { Hashmap_destroy(map); return NULL; } char *test_get_set() { int rc = Hashmap_set(map, &test1, &expect1); mu_assert(rc == 0, "Failed to set &test1"); bstring result = Hashmap_get(map, &test1); mu_assert(result == &expect1, "Wrong value for test1."); rc = Hashmap_set(map, &test2, &expect2); mu_assert(rc == 0, "Failed to set test2"); result = Hashmap_get(map, &test2); mu_assert(result == &expect2, "Wrong value for test2."); rc = Hashmap_set(map, &test3, &expect3); mu_assert(rc == 0, "Failed to set test3"); result = Hashmap_get(map, &test3); mu_assert(result == &expect3, "Wrong value for test3."); return NULL; } char *test_traverse() { int rc = Hashmap_traverse(map, traverse_good_cb); mu_assert(rc == 0, "Failed to traverse."); mu_assert(traverse_called == 3, "Wrong count traverse."); traverse_called = 0; rc = Hashmap_traverse(map, traverse_fail_cb); mu_assert(rc == 1, "Failed to traverse."); mu_assert(traverse_called == 2, "Wrong count traverse for fail."); return NULL; } char *test_delete() { bstring deleted = (bstring)Hashmap_delete(map, &test1); mu_assert(deleted != NULL, "Got NULL on delete."); mu_assert(deleted == &expect1, "Should get test1"); bstring result = Hashmap_get(map, &test1); mu_assert(result == NULL, "Should delete."); deleted = (bstring)Hashmap_delete(map, &test2); mu_assert(deleted != NULL, "Got NULL on delete."); mu_assert(deleted == &expect2, "Should get test2"); result = Hashmap_get(map, &test2); mu_assert(result == NULL, "Should delete."); deleted = (bstring)Hashmap_delete(map, &test3); mu_assert(deleted != NULL, "Got NULL on delete."); mu_assert(deleted == &expect3, "Should get test3"); result = Hashmap_get(map, &test3); mu_assert(result == NULL, "Should delete."); return NULL; } char *all_tests() { mu_suite_start(); mu_run_test(test_create); mu_run_test(test_get_set); mu_run_test(test_traverse); mu_run_test(test_delete); mu_run_test(test_destroy); return NULL; } RUN_TESTS(all_tests); ``` 需要學習的唯一一件事情就是我在單元測試的頂端使用了`bstring`的特性來創建靜態字符串用于測試。我使用`tagbstring`和`bsStatic`在7~13行創建他們。 ## 如何改進這是一個非常簡單的`Hashmap`實現，就像書中的大多數其他數據結構那樣。我的目標不是讓你以非常快的速度來掌握數據結構。通常這些討論起來非常復雜，并且會讓你偏離真正的基礎和實用的數據結構。我的目標是提供一個易于理解的起始點，然后再改進或理解它們如何實現。對于這和練習，下面是你能夠用于改進這個實現的方法： + 你可以對每個桶進行排序，使它們有序。這會增加你的插入時間，但是減少尋找時間，因為你可以使用二分搜索來尋找每個節點。到現在為止它遍歷桶中的所有節點來尋找元素。 + 你可以動態設定桶的數量，或者讓調用者指定每個`Hashmap`中的該值。 + 你可以使用更好的`default_hash`函數，有許多這樣的函數。 + 這個實現以及幾乎所有實現都有將一些特定的鍵存到一個桶中的風險。這會使你的程序運行速度變慢，因為它使`Hashmap`的處理過程變成了處理單個的`DArray`。如果你對桶中的數組排序會有幫助，但是你可以僅僅使用更好的哈希函數來避免，并且對于真正的偏執狂，你可以添加一個隨機的鹽，讓鍵不可預測。 + 你可以刪掉不歪有任何節點的桶來節約空間，或者將空的桶當如緩存中，便于節約創建和銷毀它們的開銷。 + 現在為止它可以添加已存在的元素，編寫一個替代的實現，使它只能夠添加不存在的元素。像往常一樣，你需要瀏覽每個函數，并且使之健壯。`Hashmap`也可以使用一些調試設置，來執行不變量檢查。 ## 附加題 + 研究你最喜歡的編程語言的`Hashmap`實現，了解它們具有什么特性。 + 找到`Hashmap`的主要缺點，以及如何避免它們。例如，它們不做特定的修改就不能保存順序，并且當你基于鍵的一部分來查找元素時，它們就不能生效。 + 編寫單元測試來展示將鍵都填充到`Hashmap`的一個桶中所帶來的缺陷，之后測試這樣如何影響性能。一個實現它的好方法，就是把桶的數量減少到一個愚蠢的數值，比如1。