From 76a4579c2b6790eaf7b72cfbd38775b72b9047a8 Mon Sep 17 00:00:00 2001 From: charlesgregory Date: Tue, 21 Apr 2020 19:04:04 -0400 Subject: [PATCH 01/10] ported khashl. needs work on caching hashes --- dub.json | 15 +- source/dklib/benchmark_khash.d | 5 +- source/dklib/khashl.d | 699 +++++++++++++++++++++++++++++++++ 3 files changed, 717 insertions(+), 2 deletions(-) create mode 100644 source/dklib/khashl.d diff --git a/dub.json b/dub.json index c6b4901..df6bcb2 100644 --- a/dub.json +++ b/dub.json @@ -6,5 +6,18 @@ "license": "MIT", "dependencies": { }, - "excludedSourceFiles": ["source/dklib/benchmark_khash.d"] + "configurations": [ + { + "name": "default", + "targetType": "library", + "excludedSourceFiles": ["source/dklib/benchmark_khash.d"] + }, + { + "name": "bench", + "targetType": "executable", + "dependencies": { + "emsi_containers":"~>0.7" + } + } + ] } diff --git a/source/dklib/benchmark_khash.d b/source/dklib/benchmark_khash.d index fd7d74d..d5f612e 100644 --- a/source/dklib/benchmark_khash.d +++ b/source/dklib/benchmark_khash.d @@ -2,7 +2,8 @@ dependency "emsi_containers" version="~>0.7" dependency "dklib" path="../.." 
+/ -import khash; +import dklib.khash; +import dklib.khashl; import containers; import std.datetime.stopwatch : StopWatch, AutoStart; @@ -61,9 +62,11 @@ int main() testContainerInsert!(HashMap, "HashMap"); testContainerInsert!(khash, "khash"); + testContainerInsert!(khashl, "khashl"); testContainerLookup!(HashMap, "HashMap"); testContainerLookup!(khash, "khash"); + testContainerLookup!(khashl, "khashl"); return 0; } diff --git a/source/dklib/khashl.d b/source/dklib/khashl.d new file mode 100644 index 0000000..56466b4 --- /dev/null +++ b/source/dklib/khashl.d @@ -0,0 +1,699 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + Copyright (c) 2019 James S Blachly, MD + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +module dklib.khashl; + +import std.traits : isNumeric, isSomeString, isSigned, hasMember; +import core.stdc.stdint; // uint32_t, etc. +import core.memory; // GC + +/*! + @header + + Generic hash table library. 
+ */ + +enum AC_VERSION_KHASHL_H = "0.1"; + +import core.stdc.stdlib; +import core.stdc.string; +import core.stdc.limits; + +/* compiler specific configuration */ + +alias khint32_t = uint; + +alias khint64_t = ulong; + +alias khint_t = khint32_t; +alias khiter_t = khint_t; + +pragma(inline, true) +{ + auto __kh_used(T)(const(khint32_t)* flag, T i) + { + return (flag[i >> 5] >> (i & 0x1fU) & 1U); + } + void __kh_set_used(T)(khint32_t* flag, T i) + { + (flag[i >> 5] |= 1U << (i & 0x1fU)); + } + void __kh_set_unused(T)(khint32_t* flag, T i) + { + (flag[i >> 5] &= ~(1U << (i & 0x1fU))); + } + + khint_t __kh_h2b(khint_t hash, khint_t bits) + { + return hash * 2654435769U >> (32 - bits); + } + + auto __kh_fsize(khint_t m){ + return ((m) < 32? 1 : (m)>>5); + } +} + +alias kcalloc = calloc; + +alias kmalloc = malloc; + +alias krealloc = realloc; + +alias kfree = free; + +/// Straight port of khash's generic C approach +template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = true) +{ + static assert(!isSigned!KT, "Numeric key types must be unsigned -- try uint instead of int, etc."); + + alias __hash_func = kh_hash!KT.kh_hash_func; + alias __hash_equal= kh_equal!(Bucket,cached).kh_hash_equal; + + alias kh_t = khashl; /// klib uses 'kh_t' struct name + + struct Bucket { + KT key; + static if(kh_is_map) VT val; + static if(cached) khint_t hash; + } + + struct khashl // @suppress(dscanner.style.phobos_naming_convention) + { + khint_t bits, count; + khint32_t *used; + Bucket * keys; + + ~this() + { + //kh_destroy(&this); // the free(h) at the end of kh_destroy will SIGSEGV + static if (useGC) { + GC.removeRange(this.keys); + } + kfree(cast(void*) this.keys); + kfree(cast(void*) this.used); + } + + /// Lookup by key + ref VT opIndex(KT key) + { + Bucket ins; + ins.key = key; + static if(cached) ins.hash = __hash_func(ins.key); + auto x = kh_get(&this, ins); + return this.keys[x].val; + } + + /// Assign by key + void opIndexAssign(VT val, KT key) + { + 
int absent; + Bucket ins; + ins.key = key; + static if(cached) ins.hash = __hash_func(ins.key); + auto x = kh_put(&this, ins, &absent); + this.keys[x].val = val; + static if(cached) this.keys[x].hash = __hash_func(ins.key); + } + + /// remove key/value pair + void remove(KT key) + { + Bucket ins; + ins.key = key; + static if(cached) ins.hash = __hash_func(ins.key); + auto x = kh_get(&this, ins); + kh_del(&this, x); + } + + /// Get or create if does not exist; mirror built-in hashmap + /// https://dlang.org/spec/hash-map.html#inserting_if_not_present + ref VT require(KT key, lazy VT initval) + { + static assert (kh_is_map == true, "require() not sensible in a hash set"); + Bucket ins; + ins.key = key; + static if(cached) ins.hash = __hash_func(ins.key); + auto x = kh_get(&this, ins); + if (x == kh_end(&this)) { + // not present + int absent; + x = kh_put(&this, ins, &absent); + this.keys[x].val = initval; + static if(cached) this.keys[x].hash = __hash_func(ins.key); + } + return this.keys[x].val; + } + + /// Return an InputRange over the keys. + /// Manipulating the hash table during iteration results in undefined behavior. 
+ /// Returns: Voldemort type + auto byKey() + { + /** Manipulating the hash table during iteration results in undefined behavior */ + struct KeyRange + { + kh_t* kh; + khint_t itr; + bool empty() // non-const as may call popFront + { + //return (this.itr == kh_end(this.kh)); + if (this.itr == kh_end(this.kh)) return true; + // Handle the case of deleted keys + else if (__kh_used(this.kh.used, this.itr) == 0) { + while(__kh_used(this.kh.used, this.itr) == 0) { + this.popFront(); + if (this.itr == kh_end(this.kh)) return true; + } + return false; + } + return false; + } + ref KT front() + { + return kh.keys[this.itr].key; + } + void popFront() + { + if(this.itr < kh_end(this.kh)) { + this.itr++; + } + } + } + return KeyRange(&this); + } + } + + void kh_clear(kh_t* h); + int kh_resize(kh_t* h, khint_t new_n_buckets); + khint_t kh_putp(kh_t* h, Bucket * key, int* absent); + khint_t kh_put(kh_t* h, Bucket key, int* absent); + int kh_del(kh_t* h, khint_t i); + + deprecated("kept for source-level homology; use D-style RAII") + kh_t* kh_init() + { + return cast(kh_t*) kcalloc(1, kh_t.sizeof); + } + + deprecated("kept for source-level homology; kfree(h) will SIGSEGV when called as kh_destroy(&this)") + void kh_destroy(kh_t* h) + { + if (h) + { + kfree(cast(void*) h.keys); + kfree(cast(void*) h.used); + kfree(h); + } + } + + void kh_clear(kh_t* h) + { + if (h && h.used) + { + uint32_t n_buckets = 1U << h.bits; + memset(h.used, 0, __kh_fsize(n_buckets) * khint32_t.sizeof); + h.count = 0; + } + } + + khint_t kh_getp(const(kh_t)* h, Bucket * key) + { + khint_t i, last, n_buckets, mask; + if (h.keys == null) return 0; + n_buckets = 1U << h.bits; + mask = n_buckets - 1U; + i = last = __kh_h2b(__hash_func((*key).key), h.bits); + while (__kh_used(h.used, i) && !__hash_equal(h.keys[i], *key)) { + i = (i + 1U) & mask; + if (i == last) return n_buckets; + } + return !__kh_used(h.used, i)? 
n_buckets : i; + } + khint_t kh_get(const(kh_t) *h, Bucket key) { return kh_getp(h, &key); } + + int kh_resize(kh_t *h, khint_t new_n_buckets) + { + khint32_t * new_used = null; + khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; + while ((x >>= 1) != 0) ++j; + if (new_n_buckets & (new_n_buckets - 1)) ++j; + new_bits = j > 2? j : 2; + new_n_buckets = 1U << new_bits; + if (h.count > (new_n_buckets>>1) + (new_n_buckets>>2)) return 0; /* requested size is too small */ + new_used = cast(khint32_t*)kmalloc(__kh_fsize(new_n_buckets) * khint32_t.sizeof); + memset(new_used, 0, __kh_fsize(new_n_buckets) * khint32_t.sizeof); + if (!new_used) return -1; /* not enough memory */ + n_buckets = h.keys? 1U< new_n_buckets) /* shrink the hash table */ + h.keys = cast(Bucket*)krealloc(cast(void *)h.keys, new_n_buckets * Bucket.sizeof); + kfree(h.used); /* free the working space */ + h.used = new_used, h.bits = new_bits; + return 0; + } + + khint_t kh_putp(kh_t *h, Bucket * key, int *absent) + { + khint_t n_buckets, i, last, mask; + n_buckets = h.keys? 1U<= (n_buckets>>1) + (n_buckets>>2)) { /* rehashing */ + if (kh_resize(h, n_buckets + 1U) < 0) + return n_buckets; + n_buckets = 1U< i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) + h.keys[i] = h.keys[j], i = j; + } + __kh_set_unused(h.used, i); + --h.count; + return 1; + } + + auto kh_bucket(const(kh_t)* h, khint_t x) + { + return h.keys[x]; + } + + auto kh_key(const(kh_t)* h, khint_t x) + { + return h.keys[x].key; + } + + auto kh_val(const(kh_t)* h, khint_t x) + { + return h.keys[x].val; + } + + auto kh_end(const(kh_t)* h) + { + return kh_capacity(h); + } + + auto kh_size(const(kh_t)* h) + { + return h.count; + } + + auto kh_capacity(const(kh_t)* h) + { + return h.keys ? 
1U<> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; + } + + auto kh_hash_func(T)(T key) + if (is(T == ulong) || is(T == uint64_t) || is(T == khint64_t)) + { + key = ~key + (key << 21); + key = key ^ key >> 24; + key = (key + (key << 3)) + (key << 8); + key = key ^ key >> 14; + key = (key + (key << 2)) + (key << 4); + key = key ^ key >> 28; + key = key + (key << 31); + return cast(khint_t) key; + } + + khint_t kh_hash_str(const(char)* s) + { + khint_t h = cast(khint_t)*s; + if (h) for (++s; *s; ++s) h = (h << 5) - h + cast(khint_t)*s; + return h; + } + + auto kh_hash_func(T)(T* key) + if(is(T == char) || is(T == const(char)) || is(T == immutable(char))) + { + return kh_hash_str(key); + } + + auto kh_hash_func(T)(T key) + if(isSomeString!T) + { + // rewrite __ac_X31_hash_string for D string/smart array + if (key.length == 0) return 0; + khint_t h = key[0]; + for (int i=1; i Date: Tue, 21 Apr 2020 21:17:13 -0400 Subject: [PATCH 02/10] improvements to khashl hash caching --- source/dklib/benchmark_khash.d | 10 +++++++-- source/dklib/khashl.d | 40 ++++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/source/dklib/benchmark_khash.d b/source/dklib/benchmark_khash.d index d5f612e..289548c 100644 --- a/source/dklib/benchmark_khash.d +++ b/source/dklib/benchmark_khash.d @@ -18,9 +18,14 @@ int main() enum NUMBER_OF_ITEMS = 500_000; - void testContainerInsert(alias Container, string ContainerName)() + void testContainerInsert(alias Container, string ContainerName, bool cached = false)() { - auto c = Container!(string, int)(); + static if(cached){ + static assert(ContainerName == "khashl (cached)"); + auto c = Container!(string, int,true,true,true)(); + }else{ + auto c = Container!(string, int)(); + } StopWatch sw = StopWatch(AutoStart.yes); foreach (i; 0 .. 
NUMBER_OF_ITEMS) @@ -63,6 +68,7 @@ int main() testContainerInsert!(HashMap, "HashMap"); testContainerInsert!(khash, "khash"); testContainerInsert!(khashl, "khashl"); + testContainerInsert!(khashl, "khashl (cached)",true); testContainerLookup!(HashMap, "HashMap"); testContainerLookup!(khash, "khash"); diff --git a/source/dklib/khashl.d b/source/dklib/khashl.d index 56466b4..531644c 100644 --- a/source/dklib/khashl.d +++ b/source/dklib/khashl.d @@ -84,7 +84,8 @@ alias krealloc = realloc; alias kfree = free; -/// Straight port of khash's generic C approach +/// Straight port of khashl's generic C approach +/// Can use cached-hashes for faster comparison of string key hashes template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = true) { static assert(!isSigned!KT, "Numeric key types must be unsigned -- try uint instead of int, etc."); @@ -135,7 +136,7 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = static if(cached) ins.hash = __hash_func(ins.key); auto x = kh_put(&this, ins, &absent); this.keys[x].val = val; - static if(cached) this.keys[x].hash = __hash_func(ins.key); + static if(cached) this.keys[x].hash = ins.hash; } /// remove key/value pair @@ -162,7 +163,7 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = int absent; x = kh_put(&this, ins, &absent); this.keys[x].val = initval; - static if(cached) this.keys[x].hash = __hash_func(ins.key); + static if(cached) this.keys[x].hash = ins.hash; } return this.keys[x].val; } @@ -245,7 +246,11 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = if (h.keys == null) return 0; n_buckets = 1U << h.bits; mask = n_buckets - 1U; - i = last = __kh_h2b(__hash_func((*key).key), h.bits); + + /// if using caching, don't rehash key + static if(cached) i = last = __kh_h2b((*key).hash, h.bits); + else i = last = __kh_h2b(__hash_func((*key).key), h.bits); + while (__kh_used(h.used, i) && !__hash_equal(h.keys[i], 
*key)) { i = (i + 1U) & mask; if (i == last) return n_buckets; @@ -280,7 +285,11 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = __kh_set_unused(h.used, j); while (1) { /* kick-out process; sort of like in Cuckoo hashing */ khint_t i; - i = __kh_h2b(__hash_func(key.key), new_bits); + + /// if using caching, don't rehash key + static if(cached) i = __kh_h2b(key.hash, new_bits); + else i = __kh_h2b(__hash_func(key.key), new_bits); + while (__kh_used(new_used, i)) i = (i + 1) & new_mask; __kh_set_used(new_used, i); if (i < n_buckets && __kh_used(h.used, i)) { /* kick out the existing element */ @@ -310,7 +319,12 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = n_buckets = 1U< i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) h.keys[i] = h.keys[j], i = j; } @@ -434,6 +452,9 @@ pragma(inline, true) } // end pragma(inline, true) } // end template kh_hash +/// In order to take advantage of cached-hashes +/// our equality function will actually take the bucket type as opposed to just the key. +/// This allows it to access both the store hash and the key itself. template kh_equal(T, bool cached) { pragma(inline,true) @@ -457,14 +478,15 @@ pragma(inline,true) static if(cached) return (a.hash == b.hash) && (strcmp(a, b) == 0); else return (strcmp(a.key, b.key) == 0); } + bool kh_hash_equal(T)(T a, T b) if(isSomeString!(typeof(__traits(getMember,T,"key")))) { static if(cached) return (a.hash == b.hash) && (a.key == b.key); else return (a.key == b.key); } -} -} +} // end pragma(inline, true) +} // end template kh_equal /* --- END OF HASH FUNCTIONS --- */ /* Other convenient macros... 
*/ From bca6747bbeb4e1ff55063118a14b834152bb9716 Mon Sep 17 00:00:00 2001 From: charlesgregory Date: Thu, 23 Apr 2020 12:54:41 -0400 Subject: [PATCH 03/10] Updated MIT license for khashl Added more comments to khashl Updated unittests to actually work for khashl Added khashl to package.d --- dub.selections.json | 2 + source/dklib/khashl.d | 278 +++++++++-------------------------------- source/dklib/package.d | 1 + 3 files changed, 64 insertions(+), 217 deletions(-) diff --git a/dub.selections.json b/dub.selections.json index 322586b..bd336a2 100644 --- a/dub.selections.json +++ b/dub.selections.json @@ -1,5 +1,7 @@ { "fileVersion": 1, "versions": { + "emsi_containers": "0.7.0", + "stdx-allocator": "2.77.5" } } diff --git a/source/dklib/khashl.d b/source/dklib/khashl.d index 531644c..dfccc7e 100644 --- a/source/dklib/khashl.d +++ b/source/dklib/khashl.d @@ -1,29 +1,25 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - Copyright (c) 2019 James S Blachly, MD - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. +/* The MIT License + Copyright (c) 2019 by Attractive Chaos + Copyright (c) 2019 James S Blachly, MD + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. */ - module dklib.khashl; import std.traits : isNumeric, isSomeString, isSigned, hasMember; @@ -84,8 +80,24 @@ alias krealloc = realloc; alias kfree = free; -/// Straight port of khashl's generic C approach -/// Can use cached-hashes for faster comparison of string key hashes +/* Straight port of khashl's generic C approach + Khashl is hash table that performs deletions without tombstones + The main benefit over khash is that it uses less memory however it is + also faster than khash if no deletions are involved. 
+ + Can use cached-hashes for faster comparison of string key hashes + Attractive chaos has this to say about caching hash values (https://attractivechaos.wordpress.com/): + + When we use long strings as keys, comparing two keys may take significant time. + This comparison is often unnecessary. Note that the hash of a string is a good + summary of the string. If two strings are different, their hashes are often + different. We can cache the hash and only compare two keys when their hashes are + equal. It is possible to implement the idea with any hash table implementations. + + If hash-caching is used we use compile time statements to change the bucket type to include + the hash member. We also change the logic to make equality statements check hash equality before + checking the key equality and the put and get methods to make sure they don't recompute the hashes. +**/ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = true) { static assert(!isSigned!KT, "Numeric key types must be unsigned -- try uint instead of int, etc."); @@ -122,7 +134,7 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = { Bucket ins; ins.key = key; - static if(cached) ins.hash = __hash_func(ins.key); + static if(cached) ins.hash = __hash_func(ins.key); //cache the hash auto x = kh_get(&this, ins); return this.keys[x].val; } @@ -133,10 +145,10 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = int absent; Bucket ins; ins.key = key; - static if(cached) ins.hash = __hash_func(ins.key); + static if(cached) ins.hash = __hash_func(ins.key); //cache the hash auto x = kh_put(&this, ins, &absent); this.keys[x].val = val; - static if(cached) this.keys[x].hash = ins.hash; + static if(cached) this.keys[x].hash = ins.hash; //cache the hash } /// remove key/value pair @@ -144,7 +156,7 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = { Bucket ins; ins.key = key; - static 
if(cached) ins.hash = __hash_func(ins.key); + static if(cached) ins.hash = __hash_func(ins.key); //cache the hash auto x = kh_get(&this, ins); kh_del(&this, x); } @@ -156,14 +168,14 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = static assert (kh_is_map == true, "require() not sensible in a hash set"); Bucket ins; ins.key = key; - static if(cached) ins.hash = __hash_func(ins.key); + static if(cached) ins.hash = __hash_func(ins.key); //cache the hash auto x = kh_get(&this, ins); if (x == kh_end(&this)) { // not present int absent; x = kh_put(&this, ins, &absent); this.keys[x].val = initval; - static if(cached) this.keys[x].hash = ins.hash; + static if(cached) this.keys[x].hash = ins.hash; //cache the hash } return this.keys[x].val; } @@ -459,12 +471,16 @@ template kh_equal(T, bool cached) { pragma(inline,true) { + /// Assert that we are using a bucket type with key member static assert(hasMember!(T, "key")); + + /// Assert that we are using a bucket type with hash member if using hash-caching static if(cached) static assert(hasMember!(T, "hash")); bool kh_hash_equal(T)(T a, T b) if (isNumeric!(typeof(__traits(getMember,T,"key")))) { + /// There is no benefit to caching hashes for integer keys (I think) static assert (cached == false, "No reason to cache hash for integer keys"); return (a.key == b.key); } @@ -475,6 +491,8 @@ pragma(inline,true) is(typeof(__traits(getMember,T,"key")) == const(char)) || is(typeof(__traits(getMember,T,"key")) == immutable(char))) { + /// If using hash-caching we check equality of the hashes first + /// before checking the equality of keys themselves static if(cached) return (a.hash == b.hash) && (strcmp(a, b) == 0); else return (strcmp(a.key, b.key) == 0); } @@ -482,6 +500,8 @@ pragma(inline,true) bool kh_hash_equal(T)(T a, T b) if(isSomeString!(typeof(__traits(getMember,T,"key")))) { + /// If using hash-caching we check equality of the hashes first + /// before checking the equality of keys 
themselves static if(cached) return (a.hash == b.hash) && (a.key == b.key); else return (a.key == b.key); } @@ -489,192 +509,16 @@ pragma(inline,true) } // end template kh_equal /* --- END OF HASH FUNCTIONS --- */ -/* Other convenient macros... */ - -/*! - @abstract Type of the hash table. - @param name Name of the hash table [symbol] - */ -//#define khash_t(name) kh_##name##_t -// Moved into template khash(KT, VT) - -/*! @function - @abstract Initiate a hash table. - @param name Name of the hash table [symbol] - @return Pointer to the hash table [khash_t(name)*] - */ -//#define kh_init(name) kh_init_##name() -// Moved into template khash(KT, VT) - -/*! @function - @abstract Destroy a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -//#define kh_destroy(name, h) kh_destroy_##name(h) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Reset a hash table without deallocating memory. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -//#define kh_clear(name, h) kh_clear_##name(h) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Resize a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param s New size [khint_t] - */ -//#define kh_resize(name, h, s) kh_resize_##name(h, s) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Insert a key to the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @param r Extra return code: -1 if the operation failed; - 0 if the key is present in the hash table; - 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] - @return Iterator to the inserted element [khint_t] - */ -//#define kh_put(name, h, k, r) kh_put_##name(h, k, r) -// Moved into template khash(KT, VT) - -/*! 
@function - @abstract Retrieve a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] - */ -//#define kh_get(name, h, k) kh_get_##name(h, k) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Remove a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Iterator to the element to be deleted [khint_t] - */ -//#define kh_del(name, h, k) kh_del_##name(h, k) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Test whether a bucket contains data. - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return 1 if containing data; 0 otherwise [int] - */ -//#define kh_exist(h, x) (!__ac_iseither((h).flags, (x))) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Get key given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Key [type of keys] - */ -//#define kh_key(h, x) ((h).keys[x]) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Get value given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Value [type of values] - @discussion For hash sets, calling this results in segfault. - */ -//#define kh_val(h, x) ((h).vals[x]) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Alias of kh_val() - */ -//#define kh_value(h, x) ((h).vals[x]) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Get the start iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The start iterator [khint_t] - */ -//#define kh_begin(h) (khint_t)(0) -// Moved into template khash(KT, VT) - -/*! 
@function - @abstract Get the end iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The end iterator [khint_t] - */ -//#define kh_end(h) ((h).n_buckets) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Get the number of elements in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of elements in the hash table [khint_t] - */ -//#define kh_size(h) ((h).size) -// Moved into template khash(KT, VT) -/*! @function - @abstract Get the number of buckets in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of buckets in the hash table [khint_t] - */ -//#define kh_n_buckets(h) ((h).n_buckets) -// Moved into template khash(KT, VT) - -/++ foreach: TODO - -/*! @function - @abstract Iterate over the entries in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param kvar Variable to which key will be assigned - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -auto kh_foreach(kh_t* h, kvar, vvar, code) -{ - khint_t __i; - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { - if (!kh_exist(h, __i)) continue; - kvar = kh_key(h, __i); - vvar = kh_val(h, __i); - code; - } -} - -/*! 
@function - @abstract Iterate over the values in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ - auto kh_foreach_value(kh_t* h, vvar, code) - { - khint_t __i; - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { - if (!kh_exist(h, __i)) continue; - vvar = kh_val(h, __i); - code; - } - } -+/ unittest { import std.stdio : writeln, writefln; - writeln("khash unit tests"); + writeln("khashl unit tests"); // test: numeric key type must be unsigned - assert(__traits(compiles, khash!(int, int)) is false); - assert(__traits(compiles, khash!(uint,int)) is true); + assert(__traits(compiles, khashl!(int, int)) is false); + assert(__traits(compiles, khashl!(uint,int)) is true); // auto kh = khash!(uint, char).kh_init(); @@ -688,7 +532,7 @@ unittest // khash!(uint, char).kh_destroy(kh); - auto kh = khash!(uint, char)(); + auto kh = khashl!(uint, char)(); kh[5] = 'J'; assert(kh[5] == 'J'); @@ -699,19 +543,19 @@ unittest /*foreach(k; kh.byKey()) writefln("Key: %s", k);*/ import std.array : array; - assert(kh.byKey().array == [5, 1, 99]); + assert(kh.byKey().array == [1, 99, 5]); // test: byKey on Empty hash table - auto kh_empty = khash!(uint, char)(); // @suppress(dscanner.suspicious.unmodified) + auto kh_empty = khashl!(uint, char)(); // @suppress(dscanner.suspicious.unmodified) assert(kh_empty.byKey.array == []); // test: keytype string - auto kh_string = khash!(string, int)(); + auto kh_string = khashl!(string, int)(); kh_string["test"] = 5; assert( kh_string["test"] == 5 ); // test: valtype string - auto kh_valstring = khash!(uint, string)(); + auto kh_valstring = khashl!(uint, string)(); kh_valstring[42] = "Adams"; assert( kh_valstring[42] == "Adams" ); diff --git a/source/dklib/package.d b/source/dklib/package.d index 4659002..a440dc1 100644 --- a/source/dklib/package.d +++ b/source/dklib/package.d @@ -24,3 +24,4 @@ module dklib; public import 
dklib.khash; +public import dklib.khashl; From 5663e1b04a2f0d753c46b327e7d7cc18b7af8ea9 Mon Sep 17 00:00:00 2001 From: James Blachly Date: Fri, 21 Aug 2020 20:45:58 +0000 Subject: [PATCH 04/10] README update --- README.md | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 00603fb..0463e16 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ from attractivechaos' generic C approach: https://github.com/attractivechaos/kli ## Fast -Comparison with emsi_containers HashMap: +Comparison with [emsi containers](https://github.com/dlang-community/containers) HashMap: Time, in msec, for n=500,000 operations benchmarked on linux VM; ldc2 -release @@ -15,9 +15,50 @@ Time, in msec, for n=500,000 operations benchmarked on linux VM; ldc2 -release | Retrieve (Serial) | 145 | 26 | | Retrieve (Random) | 282 | 84 | + ## Notes Key type may be numeric, C style string, D style string. -If numeric, must be unsigned +If numeric, must be unsigned. + +May be used as a hash map (default) or a hash set. To use as a hash set, +pass optional third template parameter `kh_is_map = false`. + +By default, memory allocated by the hashmap will be scanned by the GC. +(pass optional fourth template parameter `useGC = false` to disable) + +Can undergo static initialization (e.g. define as struct member +with no extra init code needed in struct ctor), unlike +[emsi containers](https://github.com/dlang-community/containers) HashMap. + + +## API + +### Declaration +```D +auto map = khash!(keytype, valuetype)(); +``` + +### Assignment / Insert +```D +map["monty"] = "python"; +``` + +### Retrieval +```D +auto val = map[key]; +``` + +### Retrieve or Create +```D +auto val = map.require("fruit", "apple"); +``` + +### Iteration +```D +foreach(x; map.byKey) { +...
+} +``` ## Examples @@ -54,3 +95,9 @@ assert( kh_valstring[42] == "Adams" ); const auto fw = kh_string.require("flammenwerfer", 21); assert(fw == 21); ``` + +## BUGS + +Please let me know what you find. +There may be a double free bug when making a hashmap of hashmaps. + From 15de0952a42547155457636060982760fa443c18 Mon Sep 17 00:00:00 2001 From: Nathan Sashihara <21227491+n8sh@users.noreply.github.com> Date: Wed, 26 Aug 2020 09:12:04 -0700 Subject: [PATCH 05/10] Fix: khash with string key not compiling with dmd if inlining enabled Added a build type to dub.json that can be used to test this: dub test --compiler=dmd --build=unittest-inline --- dub.json | 5 +++++ source/dklib/khash.d | 39 +++++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/dub.json b/dub.json index c6b4901..a19e755 100644 --- a/dub.json +++ b/dub.json @@ -6,5 +6,10 @@ "license": "MIT", "dependencies": { }, + "buildTypes": { + "unittest-inline": { + "buildOptions": ["unittests", "optimize", "inline"] + } + }, "excludedSourceFiles": ["source/dklib/benchmark_khash.d"] } diff --git a/source/dklib/khash.d b/source/dklib/khash.d index e9d8fdd..2c564c7 100644 --- a/source/dklib/khash.d +++ b/source/dklib/khash.d @@ -471,13 +471,6 @@ pragma(inline, true) return cast(khint32_t) ((key)>>33^(key)^(key)<<11); } - khint_t __ac_X31_hash_string(const(char)* s) - { - khint_t h = cast(khint_t)*s; - if (h) for (++s; *s; ++s) h = (h << 5) - h + cast(khint_t)*s; - return h; - } - auto kh_hash_func(T)(T* key) if(is(T == char) || is(T == const(char)) || is(T == immutable(char))) { @@ -490,17 +483,6 @@ pragma(inline, true) return (strcmp(a, b) == 0); } - auto kh_hash_func(T)(T key) - if(isSomeString!T) - { - // rewrite __ac_X31_hash_string for D string/smart array - if (key.length == 0) return 0; - khint_t h = key[0]; - for (int i=1; i Date: Wed, 26 Aug 2020 14:03:16 -0400 Subject: [PATCH 06/10] moved and renamed benchmarks --- .../benchmark_khash.d => 
examples/bench/benchmark_hashmaps.d | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename source/dklib/benchmark_khash.d => examples/bench/benchmark_hashmaps.d (100%) diff --git a/source/dklib/benchmark_khash.d b/examples/bench/benchmark_hashmaps.d similarity index 100% rename from source/dklib/benchmark_khash.d rename to examples/bench/benchmark_hashmaps.d From 5fb115cb30c2ddb6cc918de22685385db8d5b4df Mon Sep 17 00:00:00 2001 From: charlesgregory Date: Tue, 21 Apr 2020 19:04:04 -0400 Subject: [PATCH 07/10] ported khashl. needs work on caching hashes --- dub.json | 3 +- source/dklib/benchmark_khash.d | 5 +- source/dklib/khashl.d | 699 +++++++++++++++++++++++++++++++++ 3 files changed, 704 insertions(+), 3 deletions(-) create mode 100644 source/dklib/khashl.d diff --git a/dub.json b/dub.json index a19e755..efb4b16 100644 --- a/dub.json +++ b/dub.json @@ -10,6 +10,5 @@ "unittest-inline": { "buildOptions": ["unittests", "optimize", "inline"] } - }, - "excludedSourceFiles": ["source/dklib/benchmark_khash.d"] + } } diff --git a/source/dklib/benchmark_khash.d b/source/dklib/benchmark_khash.d index fd7d74d..d5f612e 100644 --- a/source/dklib/benchmark_khash.d +++ b/source/dklib/benchmark_khash.d @@ -2,7 +2,8 @@ dependency "emsi_containers" version="~>0.7" dependency "dklib" path="../.." 
+/ -import khash; +import dklib.khash; +import dklib.khashl; import containers; import std.datetime.stopwatch : StopWatch, AutoStart; @@ -61,9 +62,11 @@ int main() testContainerInsert!(HashMap, "HashMap"); testContainerInsert!(khash, "khash"); + testContainerInsert!(khashl, "khashl"); testContainerLookup!(HashMap, "HashMap"); testContainerLookup!(khash, "khash"); + testContainerLookup!(khashl, "khashl"); return 0; } diff --git a/source/dklib/khashl.d b/source/dklib/khashl.d new file mode 100644 index 0000000..56466b4 --- /dev/null +++ b/source/dklib/khashl.d @@ -0,0 +1,699 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + Copyright (c) 2019 James S Blachly, MD + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +module dklib.khashl; + +import std.traits : isNumeric, isSomeString, isSigned, hasMember; +import core.stdc.stdint; // uint32_t, etc. +import core.memory; // GC + +/*! + @header + + Generic hash table library. 
+ */ + +enum AC_VERSION_KHASHL_H = "0.1"; + +import core.stdc.stdlib; +import core.stdc.string; +import core.stdc.limits; + +/* compiler specific configuration */ + +alias khint32_t = uint; + +alias khint64_t = ulong; + +alias khint_t = khint32_t; +alias khiter_t = khint_t; + +pragma(inline, true) +{ + auto __kh_used(T)(const(khint32_t)* flag, T i) + { + return (flag[i >> 5] >> (i & 0x1fU) & 1U); + } + void __kh_set_used(T)(khint32_t* flag, T i) + { + (flag[i >> 5] |= 1U << (i & 0x1fU)); + } + void __kh_set_unused(T)(khint32_t* flag, T i) + { + (flag[i >> 5] &= ~(1U << (i & 0x1fU))); + } + + khint_t __kh_h2b(khint_t hash, khint_t bits) + { + return hash * 2654435769U >> (32 - bits); + } + + auto __kh_fsize(khint_t m){ + return ((m) < 32? 1 : (m)>>5); + } +} + +alias kcalloc = calloc; + +alias kmalloc = malloc; + +alias krealloc = realloc; + +alias kfree = free; + +/// Straight port of khash's generic C approach +template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = true) +{ + static assert(!isSigned!KT, "Numeric key types must be unsigned -- try uint instead of int, etc."); + + alias __hash_func = kh_hash!KT.kh_hash_func; + alias __hash_equal= kh_equal!(Bucket,cached).kh_hash_equal; + + alias kh_t = khashl; /// klib uses 'kh_t' struct name + + struct Bucket { + KT key; + static if(kh_is_map) VT val; + static if(cached) khint_t hash; + } + + struct khashl // @suppress(dscanner.style.phobos_naming_convention) + { + khint_t bits, count; + khint32_t *used; + Bucket * keys; + + ~this() + { + //kh_destroy(&this); // the free(h) at the end of kh_destroy will SIGSEGV + static if (useGC) { + GC.removeRange(this.keys); + } + kfree(cast(void*) this.keys); + kfree(cast(void*) this.used); + } + + /// Lookup by key + ref VT opIndex(KT key) + { + Bucket ins; + ins.key = key; + static if(cached) ins.hash = __hash_func(ins.key); + auto x = kh_get(&this, ins); + return this.keys[x].val; + } + + /// Assign by key + void opIndexAssign(VT val, KT key) + { + 
int absent; + Bucket ins; + ins.key = key; + static if(cached) ins.hash = __hash_func(ins.key); + auto x = kh_put(&this, ins, &absent); + this.keys[x].val = val; + static if(cached) this.keys[x].hash = __hash_func(ins.key); + } + + /// remove key/value pair + void remove(KT key) + { + Bucket ins; + ins.key = key; + static if(cached) ins.hash = __hash_func(ins.key); + auto x = kh_get(&this, ins); + kh_del(&this, x); + } + + /// Get or create if does not exist; mirror built-in hashmap + /// https://dlang.org/spec/hash-map.html#inserting_if_not_present + ref VT require(KT key, lazy VT initval) + { + static assert (kh_is_map == true, "require() not sensible in a hash set"); + Bucket ins; + ins.key = key; + static if(cached) ins.hash = __hash_func(ins.key); + auto x = kh_get(&this, ins); + if (x == kh_end(&this)) { + // not present + int absent; + x = kh_put(&this, ins, &absent); + this.keys[x].val = initval; + static if(cached) this.keys[x].hash = __hash_func(ins.key); + } + return this.keys[x].val; + } + + /// Return an InputRange over the keys. + /// Manipulating the hash table during iteration results in undefined behavior. 
+ /// Returns: Voldemort type + auto byKey() + { + /** Manipulating the hash table during iteration results in undefined behavior */ + struct KeyRange + { + kh_t* kh; + khint_t itr; + bool empty() // non-const as may call popFront + { + //return (this.itr == kh_end(this.kh)); + if (this.itr == kh_end(this.kh)) return true; + // Handle the case of deleted keys + else if (__kh_used(this.kh.used, this.itr) == 0) { + while(__kh_used(this.kh.used, this.itr) == 0) { + this.popFront(); + if (this.itr == kh_end(this.kh)) return true; + } + return false; + } + return false; + } + ref KT front() + { + return kh.keys[this.itr].key; + } + void popFront() + { + if(this.itr < kh_end(this.kh)) { + this.itr++; + } + } + } + return KeyRange(&this); + } + } + + void kh_clear(kh_t* h); + int kh_resize(kh_t* h, khint_t new_n_buckets); + khint_t kh_putp(kh_t* h, Bucket * key, int* absent); + khint_t kh_put(kh_t* h, Bucket key, int* absent); + int kh_del(kh_t* h, khint_t i); + + deprecated("kept for source-level homology; use D-style RAII") + kh_t* kh_init() + { + return cast(kh_t*) kcalloc(1, kh_t.sizeof); + } + + deprecated("kept for source-level homology; kfree(h) will SIGSEGV when called as kh_destroy(&this)") + void kh_destroy(kh_t* h) + { + if (h) + { + kfree(cast(void*) h.keys); + kfree(cast(void*) h.used); + kfree(h); + } + } + + void kh_clear(kh_t* h) + { + if (h && h.used) + { + uint32_t n_buckets = 1U << h.bits; + memset(h.used, 0, __kh_fsize(n_buckets) * khint32_t.sizeof); + h.count = 0; + } + } + + khint_t kh_getp(const(kh_t)* h, Bucket * key) + { + khint_t i, last, n_buckets, mask; + if (h.keys == null) return 0; + n_buckets = 1U << h.bits; + mask = n_buckets - 1U; + i = last = __kh_h2b(__hash_func((*key).key), h.bits); + while (__kh_used(h.used, i) && !__hash_equal(h.keys[i], *key)) { + i = (i + 1U) & mask; + if (i == last) return n_buckets; + } + return !__kh_used(h.used, i)? 
n_buckets : i; + } + khint_t kh_get(const(kh_t) *h, Bucket key) { return kh_getp(h, &key); } + + int kh_resize(kh_t *h, khint_t new_n_buckets) + { + khint32_t * new_used = null; + khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; + while ((x >>= 1) != 0) ++j; + if (new_n_buckets & (new_n_buckets - 1)) ++j; + new_bits = j > 2? j : 2; + new_n_buckets = 1U << new_bits; + if (h.count > (new_n_buckets>>1) + (new_n_buckets>>2)) return 0; /* requested size is too small */ + new_used = cast(khint32_t*)kmalloc(__kh_fsize(new_n_buckets) * khint32_t.sizeof); + memset(new_used, 0, __kh_fsize(new_n_buckets) * khint32_t.sizeof); + if (!new_used) return -1; /* not enough memory */ + n_buckets = h.keys? 1U< new_n_buckets) /* shrink the hash table */ + h.keys = cast(Bucket*)krealloc(cast(void *)h.keys, new_n_buckets * Bucket.sizeof); + kfree(h.used); /* free the working space */ + h.used = new_used, h.bits = new_bits; + return 0; + } + + khint_t kh_putp(kh_t *h, Bucket * key, int *absent) + { + khint_t n_buckets, i, last, mask; + n_buckets = h.keys? 1U<= (n_buckets>>1) + (n_buckets>>2)) { /* rehashing */ + if (kh_resize(h, n_buckets + 1U) < 0) + return n_buckets; + n_buckets = 1U< i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) + h.keys[i] = h.keys[j], i = j; + } + __kh_set_unused(h.used, i); + --h.count; + return 1; + } + + auto kh_bucket(const(kh_t)* h, khint_t x) + { + return h.keys[x]; + } + + auto kh_key(const(kh_t)* h, khint_t x) + { + return h.keys[x].key; + } + + auto kh_val(const(kh_t)* h, khint_t x) + { + return h.keys[x].val; + } + + auto kh_end(const(kh_t)* h) + { + return kh_capacity(h); + } + + auto kh_size(const(kh_t)* h) + { + return h.count; + } + + auto kh_capacity(const(kh_t)* h) + { + return h.keys ? 
1U<> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; + } + + auto kh_hash_func(T)(T key) + if (is(T == ulong) || is(T == uint64_t) || is(T == khint64_t)) + { + key = ~key + (key << 21); + key = key ^ key >> 24; + key = (key + (key << 3)) + (key << 8); + key = key ^ key >> 14; + key = (key + (key << 2)) + (key << 4); + key = key ^ key >> 28; + key = key + (key << 31); + return cast(khint_t) key; + } + + khint_t kh_hash_str(const(char)* s) + { + khint_t h = cast(khint_t)*s; + if (h) for (++s; *s; ++s) h = (h << 5) - h + cast(khint_t)*s; + return h; + } + + auto kh_hash_func(T)(T* key) + if(is(T == char) || is(T == const(char)) || is(T == immutable(char))) + { + return kh_hash_str(key); + } + + auto kh_hash_func(T)(T key) + if(isSomeString!T) + { + // rewrite __ac_X31_hash_string for D string/smart array + if (key.length == 0) return 0; + khint_t h = key[0]; + for (int i=1; i Date: Tue, 21 Apr 2020 21:17:13 -0400 Subject: [PATCH 08/10] improvements to khashl hash caching --- source/dklib/benchmark_khash.d | 10 +++++++-- source/dklib/khashl.d | 40 ++++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/source/dklib/benchmark_khash.d b/source/dklib/benchmark_khash.d index d5f612e..289548c 100644 --- a/source/dklib/benchmark_khash.d +++ b/source/dklib/benchmark_khash.d @@ -18,9 +18,14 @@ int main() enum NUMBER_OF_ITEMS = 500_000; - void testContainerInsert(alias Container, string ContainerName)() + void testContainerInsert(alias Container, string ContainerName, bool cached = false)() { - auto c = Container!(string, int)(); + static if(cached){ + static assert(ContainerName == "khashl (cached)"); + auto c = Container!(string, int,true,true,true)(); + }else{ + auto c = Container!(string, int)(); + } StopWatch sw = StopWatch(AutoStart.yes); foreach (i; 0 .. 
NUMBER_OF_ITEMS) @@ -63,6 +68,7 @@ int main() testContainerInsert!(HashMap, "HashMap"); testContainerInsert!(khash, "khash"); testContainerInsert!(khashl, "khashl"); + testContainerInsert!(khashl, "khashl (cached)",true); testContainerLookup!(HashMap, "HashMap"); testContainerLookup!(khash, "khash"); diff --git a/source/dklib/khashl.d b/source/dklib/khashl.d index 56466b4..531644c 100644 --- a/source/dklib/khashl.d +++ b/source/dklib/khashl.d @@ -84,7 +84,8 @@ alias krealloc = realloc; alias kfree = free; -/// Straight port of khash's generic C approach +/// Straight port of khashl's generic C approach +/// Can use cached-hashes for faster comparison of string key hashes template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = true) { static assert(!isSigned!KT, "Numeric key types must be unsigned -- try uint instead of int, etc."); @@ -135,7 +136,7 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = static if(cached) ins.hash = __hash_func(ins.key); auto x = kh_put(&this, ins, &absent); this.keys[x].val = val; - static if(cached) this.keys[x].hash = __hash_func(ins.key); + static if(cached) this.keys[x].hash = ins.hash; } /// remove key/value pair @@ -162,7 +163,7 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = int absent; x = kh_put(&this, ins, &absent); this.keys[x].val = initval; - static if(cached) this.keys[x].hash = __hash_func(ins.key); + static if(cached) this.keys[x].hash = ins.hash; } return this.keys[x].val; } @@ -245,7 +246,11 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = if (h.keys == null) return 0; n_buckets = 1U << h.bits; mask = n_buckets - 1U; - i = last = __kh_h2b(__hash_func((*key).key), h.bits); + + /// if using caching, don't rehash key + static if(cached) i = last = __kh_h2b((*key).hash, h.bits); + else i = last = __kh_h2b(__hash_func((*key).key), h.bits); + while (__kh_used(h.used, i) && !__hash_equal(h.keys[i], 
*key)) { i = (i + 1U) & mask; if (i == last) return n_buckets; @@ -280,7 +285,11 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = __kh_set_unused(h.used, j); while (1) { /* kick-out process; sort of like in Cuckoo hashing */ khint_t i; - i = __kh_h2b(__hash_func(key.key), new_bits); + + /// if using caching, don't rehash key + static if(cached) i = __kh_h2b(key.hash, new_bits); + else i = __kh_h2b(__hash_func(key.key), new_bits); + while (__kh_used(new_used, i)) i = (i + 1) & new_mask; __kh_set_used(new_used, i); if (i < n_buckets && __kh_used(h.used, i)) { /* kick out the existing element */ @@ -310,7 +319,12 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = n_buckets = 1U< i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) h.keys[i] = h.keys[j], i = j; } @@ -434,6 +452,9 @@ pragma(inline, true) } // end pragma(inline, true) } // end template kh_hash +/// In order to take advantage of cached-hashes +/// our equality function will actually take the bucket type as opposed to just the key. +/// This allows it to access both the store hash and the key itself. template kh_equal(T, bool cached) { pragma(inline,true) @@ -457,14 +478,15 @@ pragma(inline,true) static if(cached) return (a.hash == b.hash) && (strcmp(a, b) == 0); else return (strcmp(a.key, b.key) == 0); } + bool kh_hash_equal(T)(T a, T b) if(isSomeString!(typeof(__traits(getMember,T,"key")))) { static if(cached) return (a.hash == b.hash) && (a.key == b.key); else return (a.key == b.key); } -} -} +} // end pragma(inline, true) +} // end template kh_equal /* --- END OF HASH FUNCTIONS --- */ /* Other convenient macros... 
*/ From f486f20d003a04a208ef4ff7acb0aaedcb42b876 Mon Sep 17 00:00:00 2001 From: charlesgregory Date: Thu, 23 Apr 2020 12:54:41 -0400 Subject: [PATCH 09/10] Updated MIT license for khashl Added more comments to khashl Updated unittests to actually work for khashl Added khashl to package.d --- dub.selections.json | 2 + source/dklib/khashl.d | 278 +++++++++-------------------------------- source/dklib/package.d | 1 + 3 files changed, 64 insertions(+), 217 deletions(-) diff --git a/dub.selections.json b/dub.selections.json index 322586b..bd336a2 100644 --- a/dub.selections.json +++ b/dub.selections.json @@ -1,5 +1,7 @@ { "fileVersion": 1, "versions": { + "emsi_containers": "0.7.0", + "stdx-allocator": "2.77.5" } } diff --git a/source/dklib/khashl.d b/source/dklib/khashl.d index 531644c..dfccc7e 100644 --- a/source/dklib/khashl.d +++ b/source/dklib/khashl.d @@ -1,29 +1,25 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - Copyright (c) 2019 James S Blachly, MD - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. +/* The MIT License + Copyright (c) 2019 by Attractive Chaos + Copyright (c) 2019 James S Blachly, MD + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. */ - module dklib.khashl; import std.traits : isNumeric, isSomeString, isSigned, hasMember; @@ -84,8 +80,24 @@ alias krealloc = realloc; alias kfree = free; -/// Straight port of khashl's generic C approach -/// Can use cached-hashes for faster comparison of string key hashes +/* Straight port of khashl's generic C approach + Khashl is a hash table that performs deletions without tombstones. + The main benefit over khash is that it uses less memory; however, it is + also faster than khash if no deletions are involved.
+ + Can use cached hashes for faster comparison of string keys + Attractive chaos has this to say about caching hash values (https://attractivechaos.wordpress.com/): + + When we use long strings as keys, comparing two keys may take significant time. + This comparison is often unnecessary. Note that the hash of a string is a good + summary of the string. If two strings are different, their hashes are often + different. We can cache the hash and only compare two keys when their hashes are + equal. It is possible to implement the idea with any hash table implementations. + + If hash-caching is used we use compile time statements to change the bucket type to include + the hash member. We also change the logic to make equality statements check hash equality before + checking key equality, and change the put and get methods so they don't recompute the hashes. +**/ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = true) { static assert(!isSigned!KT, "Numeric key types must be unsigned -- try uint instead of int, etc."); @@ -122,7 +134,7 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = { Bucket ins; ins.key = key; - static if(cached) ins.hash = __hash_func(ins.key); + static if(cached) ins.hash = __hash_func(ins.key); //cache the hash auto x = kh_get(&this, ins); return this.keys[x].val; } @@ -133,10 +145,10 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = int absent; Bucket ins; ins.key = key; - static if(cached) ins.hash = __hash_func(ins.key); + static if(cached) ins.hash = __hash_func(ins.key); //cache the hash auto x = kh_put(&this, ins, &absent); this.keys[x].val = val; - static if(cached) this.keys[x].hash = ins.hash; + static if(cached) this.keys[x].hash = ins.hash; //cache the hash } /// remove key/value pair @@ -144,7 +156,7 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = { Bucket ins; ins.key = key; - static
if(cached) ins.hash = __hash_func(ins.key); + static if(cached) ins.hash = __hash_func(ins.key); //cache the hash auto x = kh_get(&this, ins); kh_del(&this, x); } @@ -156,14 +168,14 @@ template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = static assert (kh_is_map == true, "require() not sensible in a hash set"); Bucket ins; ins.key = key; - static if(cached) ins.hash = __hash_func(ins.key); + static if(cached) ins.hash = __hash_func(ins.key); //cache the hash auto x = kh_get(&this, ins); if (x == kh_end(&this)) { // not present int absent; x = kh_put(&this, ins, &absent); this.keys[x].val = initval; - static if(cached) this.keys[x].hash = ins.hash; + static if(cached) this.keys[x].hash = ins.hash; //cache the hash } return this.keys[x].val; } @@ -459,12 +471,16 @@ template kh_equal(T, bool cached) { pragma(inline,true) { + /// Assert that we are using a bucket type with key member static assert(hasMember!(T, "key")); + + /// Assert that we are using a bucket type with hash member if using hash-caching static if(cached) static assert(hasMember!(T, "hash")); bool kh_hash_equal(T)(T a, T b) if (isNumeric!(typeof(__traits(getMember,T,"key")))) { + /// There is no benefit to caching hashes for integer keys (I think) static assert (cached == false, "No reason to cache hash for integer keys"); return (a.key == b.key); } @@ -475,6 +491,8 @@ pragma(inline,true) is(typeof(__traits(getMember,T,"key")) == const(char)) || is(typeof(__traits(getMember,T,"key")) == immutable(char))) { + /// If using hash-caching we check equality of the hashes first + /// before checking the equality of keys themselves static if(cached) return (a.hash == b.hash) && (strcmp(a, b) == 0); else return (strcmp(a.key, b.key) == 0); } @@ -482,6 +500,8 @@ pragma(inline,true) bool kh_hash_equal(T)(T a, T b) if(isSomeString!(typeof(__traits(getMember,T,"key")))) { + /// If using hash-caching we check equality of the hashes first + /// before checking the equality of keys 
themselves static if(cached) return (a.hash == b.hash) && (a.key == b.key); else return (a.key == b.key); } @@ -489,192 +509,16 @@ pragma(inline,true) } // end template kh_equal /* --- END OF HASH FUNCTIONS --- */ -/* Other convenient macros... */ - -/*! - @abstract Type of the hash table. - @param name Name of the hash table [symbol] - */ -//#define khash_t(name) kh_##name##_t -// Moved into template khash(KT, VT) - -/*! @function - @abstract Initiate a hash table. - @param name Name of the hash table [symbol] - @return Pointer to the hash table [khash_t(name)*] - */ -//#define kh_init(name) kh_init_##name() -// Moved into template khash(KT, VT) - -/*! @function - @abstract Destroy a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -//#define kh_destroy(name, h) kh_destroy_##name(h) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Reset a hash table without deallocating memory. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -//#define kh_clear(name, h) kh_clear_##name(h) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Resize a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param s New size [khint_t] - */ -//#define kh_resize(name, h, s) kh_resize_##name(h, s) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Insert a key to the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @param r Extra return code: -1 if the operation failed; - 0 if the key is present in the hash table; - 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] - @return Iterator to the inserted element [khint_t] - */ -//#define kh_put(name, h, k, r) kh_put_##name(h, k, r) -// Moved into template khash(KT, VT) - -/*! 
@function - @abstract Retrieve a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] - */ -//#define kh_get(name, h, k) kh_get_##name(h, k) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Remove a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Iterator to the element to be deleted [khint_t] - */ -//#define kh_del(name, h, k) kh_del_##name(h, k) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Test whether a bucket contains data. - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return 1 if containing data; 0 otherwise [int] - */ -//#define kh_exist(h, x) (!__ac_iseither((h).flags, (x))) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Get key given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Key [type of keys] - */ -//#define kh_key(h, x) ((h).keys[x]) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Get value given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Value [type of values] - @discussion For hash sets, calling this results in segfault. - */ -//#define kh_val(h, x) ((h).vals[x]) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Alias of kh_val() - */ -//#define kh_value(h, x) ((h).vals[x]) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Get the start iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The start iterator [khint_t] - */ -//#define kh_begin(h) (khint_t)(0) -// Moved into template khash(KT, VT) - -/*! 
@function - @abstract Get the end iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The end iterator [khint_t] - */ -//#define kh_end(h) ((h).n_buckets) -// Moved into template khash(KT, VT) - -/*! @function - @abstract Get the number of elements in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of elements in the hash table [khint_t] - */ -//#define kh_size(h) ((h).size) -// Moved into template khash(KT, VT) -/*! @function - @abstract Get the number of buckets in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of buckets in the hash table [khint_t] - */ -//#define kh_n_buckets(h) ((h).n_buckets) -// Moved into template khash(KT, VT) - -/++ foreach: TODO - -/*! @function - @abstract Iterate over the entries in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param kvar Variable to which key will be assigned - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -auto kh_foreach(kh_t* h, kvar, vvar, code) -{ - khint_t __i; - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { - if (!kh_exist(h, __i)) continue; - kvar = kh_key(h, __i); - vvar = kh_val(h, __i); - code; - } -} - -/*! 
@function - @abstract Iterate over the values in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ - auto kh_foreach_value(kh_t* h, vvar, code) - { - khint_t __i; - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { - if (!kh_exist(h, __i)) continue; - vvar = kh_val(h, __i); - code; - } - } -+/ unittest { import std.stdio : writeln, writefln; - writeln("khash unit tests"); + writeln("khashl unit tests"); // test: numeric key type must be unsigned - assert(__traits(compiles, khash!(int, int)) is false); - assert(__traits(compiles, khash!(uint,int)) is true); + assert(__traits(compiles, khashl!(int, int)) is false); + assert(__traits(compiles, khashl!(uint,int)) is true); // auto kh = khash!(uint, char).kh_init(); @@ -688,7 +532,7 @@ unittest // khash!(uint, char).kh_destroy(kh); - auto kh = khash!(uint, char)(); + auto kh = khashl!(uint, char)(); kh[5] = 'J'; assert(kh[5] == 'J'); @@ -699,19 +543,19 @@ unittest /*foreach(k; kh.byKey()) writefln("Key: %s", k);*/ import std.array : array; - assert(kh.byKey().array == [5, 1, 99]); + assert(kh.byKey().array == [1, 99, 5]); // test: byKey on Empty hash table - auto kh_empty = khash!(uint, char)(); // @suppress(dscanner.suspicious.unmodified) + auto kh_empty = khashl!(uint, char)(); // @suppress(dscanner.suspicious.unmodified) assert(kh_empty.byKey.array == []); // test: keytype string - auto kh_string = khash!(string, int)(); + auto kh_string = khashl!(string, int)(); kh_string["test"] = 5; assert( kh_string["test"] == 5 ); // test: valtype string - auto kh_valstring = khash!(uint, string)(); + auto kh_valstring = khashl!(uint, string)(); kh_valstring[42] = "Adams"; assert( kh_valstring[42] == "Adams" ); diff --git a/source/dklib/package.d b/source/dklib/package.d index 4659002..a440dc1 100644 --- a/source/dklib/package.d +++ b/source/dklib/package.d @@ -24,3 +24,4 @@ module dklib; public import 
dklib.khash; +public import dklib.khashl; From 016842f6b54ab34ca1f2c72c720ffe50dc34a990 Mon Sep 17 00:00:00 2001 From: charlesgregory Date: Wed, 26 Aug 2020 14:03:16 -0400 Subject: [PATCH 10/10] moved and renamed benchmarks --- .../benchmark_khash.d => examples/bench/benchmark_hashmaps.d | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename source/dklib/benchmark_khash.d => examples/bench/benchmark_hashmaps.d (100%) diff --git a/source/dklib/benchmark_khash.d b/examples/bench/benchmark_hashmaps.d similarity index 100% rename from source/dklib/benchmark_khash.d rename to examples/bench/benchmark_hashmaps.d