1 module perfecthash.generator; 2 3 // Based on: http://ilan.schnell-web.net/prog/perfect-hash/algo.html 4 // which is an illustration of an algorithm by Z. J. Czech, G. Havas and B.S. Majewski which is described in their 1992 paper "An optimal algorithm for generating minimal perfect hash functions" which appeared in Information Processing Letters, 43(5):257-264, 1992. 5 6 import std.range : ElementType; 7 import std.array : Appender; 8 import std.typecons : Tuple; 9 10 alias KeyValue = Tuple!(string, "key", uint, "value"); 11 12 struct Vertex { 13 size_t value; 14 size_t edgeIndex; 15 } 16 17 struct Edge { 18 size_t[2] vertices; 19 size_t hash; 20 } 21 22 struct PerfectHashFunction { 23 const(size_t)[] G; 24 HashFunction[2] hashFunction; 25 this(const(size_t)[] G, HashFunction[2] hashFunction) @safe nothrow { 26 this.G = G; 27 this.hashFunction = [hashFunction[0].clone, hashFunction[1].clone]; 28 } 29 size_t opCall(string key) @safe nothrow pure { 30 return (G[hashFunction[0](key)] + G[hashFunction[1](key)]) % G.length; 31 } 32 } 33 34 auto createPerfectHashFunction(Random)(string[] keys, Random rnd, ulong maxHnsecs = 1_000_000) @safe nothrow { 35 import std.range : enumerate; 36 import std.algorithm : map; 37 import std.array : array; 38 return createPerfectHashFunction(keys.enumerate.map!(k => KeyValue(k.value, cast(uint)k.index)).array(), rnd, maxHnsecs); 39 } 40 41 auto createPerfectHashFunction(Range, Random)(Range keys, Random rnd, ulong maxHnsecs = 1_000_000, size_t startN = size_t.max) @safe nothrow if (is(ElementType!Range == KeyValue)) { 42 import std.algorithm : map, maxElement, max; 43 import std.array : array; 44 import std.datetime : Clock; 45 auto start = Clock.currStdTime(); 46 auto maxKeyLength = keys.map!(k => k.key.length).maxElement(); 47 auto k = keys.length; 48 auto n = startN == size_t.max ? k * 2 : startN; 49 assert(n > k); 50 Vertex[] vertices = new Vertex[n]; 51 Edge[] edges = new Edge[k * 2]; 52 HashFunction[2] hashFunctions = [HashFunction(maxKeyLength), HashFunction(maxKeyLength)]; 53 PerfectHashFunction bestEffort; 54 size_t[] rawBits = new size_t[vertices.length / size_t.sizeof + 1]; 55 Appender!(ToCheck[]) toCheck; 56 for(;;) { 57 if (Clock.currStdTime() - start > maxHnsecs && bestEffort.G.length != 0) 58 break; 59 hashFunctions[0].randomize(n, 0, rnd); 60 hashFunctions[1].randomize(n, 1, rnd); 61 if (!calcEdges(keys, hashFunctions, edges, vertices)) 62 continue; 63 if (assignVertexValuesOnlyWhenACyclic(edges, vertices, n, rawBits, toCheck)) 64 continue; 65 bestEffort = PerfectHashFunction(extractG(vertices).array(), hashFunctions); 66 if (n == k) 67 break; 68 if (Clock.currStdTime() - start > maxHnsecs) 69 break; 70 n = n - (max(1, (n - k) / 20)); 71 vertices = vertices[0 .. n]; 72 vertices[] = Vertex(0,0); 73 } 74 return bestEffort; 75 } 76 77 auto recreateEdgesAndVertices(Range)(PerfectHashFunction hf, Range keys) { 78 import std.exception : enforce; 79 import std.typecons : tuple; 80 auto n = hf.G.length; 81 auto k = keys.length; 82 Vertex[] vertices = new Vertex[n]; 83 Edge[] edges = new Edge[k * 2]; 84 size_t[] rawBits = new size_t[vertices.length / size_t.sizeof + 1]; 85 Appender!(ToCheck[]) toCheck; 86 enforce(calcEdges(keys, hf.hashFunction, edges, vertices), "invalid hash function"); 87 enforce(!assignVertexValuesOnlyWhenACyclic(edges, vertices, n, rawBits, toCheck), "invalid hash function"); 88 return tuple!("edges", "vertices")(edges, vertices); 89 } 90 91 unittest { 92 import std.random; 93 import std.range : enumerate; 94 import std.algorithm : map, maxElement; 95 import std.array : array; 96 import unit_threaded; 97 import std.stdio; 98 99 auto rnd = Random(9812); 100 auto keys = [ 101 "aa", "bb", "cc", "dd", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", 102 "mm", "nn", "oo", "pp", "qq", "rr", "ss", "tt", "uu", "vv", "ww", "xx", "yy", 103 "zz" 104 ].enumerate.map!(e => KeyValue(e[1], cast(uint) e[0])).array(); 105 foreach(_; 0 .. 100) { 106 auto ph = createPerfectHashFunction(keys, rnd); 107 foreach (kv; keys) 108 { 109 ph(kv.key).shouldEqual(kv.value); 110 } 111 } 112 } 113 114 version (unittest) { 115 string randomIdentifier(Random)(ref Random rnd, size_t len = 6) { 116 import std.random : uniform; 117 import std.conv : text; 118 import std.algorithm : map; 119 import std.range : iota; 120 121 enum chars = "abcdefghijklmnopqrstuvwxyz"; 122 return iota(0, len).map!(_ => chars[uniform(0, chars.length, rnd)]).text(); 123 } 124 } 125 126 @("keys.random.5") 127 unittest { 128 import std.random; 129 import std.range : enumerate, iota; 130 import std.algorithm : map, maxElement; 131 import std.array : array; 132 import unit_threaded; 133 import std.stdio; 134 135 auto rnd = Random(9812); 136 auto len = 3365; 137 auto kvs = iota(0, len).map!(_ => rnd.randomIdentifier()).enumerate.map!(e => KeyValue(e[1], cast(uint)e[0])).array(); 138 139 foreach(_; 0 .. 100) { 140 auto ph = kvs.createPerfectHashFunction(rnd); 141 foreach (kv; kvs) { 142 ph(kv.key).shouldEqual(kv.value); 143 } 144 } 145 } 146 147 string toDModule(ref PerfectHashFunction fun, string symbolName, string moduleName = "") @safe pure { 148 import std.conv : to; 149 import std.string : replace; 150 size_t N = fun.G.length; 151 size_t coeffLen = fun.hashFunction[0].coeff.length; 152 string NType = N > ushort.max ? "uint" : N > ubyte.max ? "ushort" : "ubyte"; 153 auto func = q{auto $symbolName(string key) @safe nothrow pure { 154 static $NType hash(alias coeff, size_t offset)(string key) { 155 size_t t; 156 foreach(idx, c; key) 157 t += (c + offset) * coeff[idx % $coeffLen]; 158 return t % $N; 159 } 160 static immutable $NType[$N] G = $G; 161 static immutable $NType[$coeffLen] coeffA = $coeffA; 162 static immutable $NType[$coeffLen] coeffB = $coeffB; 163 return (G[hash!(coeffA, 0)(key)] + G[hash!(coeffB, 1)(key)]) % $N; 164 }}.replace("$NType", NType).replace("$moduleName", moduleName).replace("$symbolName", symbolName).replace("$N", N.to!string).replace("$coeffLen", coeffLen.to!string).replace("$G", fun.G.to!string).replace("$coeffA", fun.hashFunction[0].coeff.to!string).replace("$coeffB", fun.hashFunction[1].coeff.to!string); 165 if (moduleName.length > 0) 166 return "module $moduleName;\n".replace("$moduleName", moduleName) ~ func; 167 return func; 168 } 169 170 @safe nothrow unittest { 171 import std.random; 172 auto keys = ["asdf", "fdsa", "car", "garage", "elephant", "kangeroo"]; 173 auto rnd = Random(9123); 174 foreach(i; 0..10000) { 175 auto ph = createPerfectHashFunction(keys, rnd, 10); 176 foreach(idx, key; keys) 177 assert(ph(key) == idx); 178 } 179 } 180 181 @safe nothrow unittest { 182 import std.random; 183 auto keys = [KeyValue("asdf",0), KeyValue("fdsa",1), KeyValue("car",1), KeyValue("garage",2), KeyValue("elephant",2), KeyValue("kangeroo",0)]; 184 auto rnd = Random(9812); 185 foreach(i; 0..10000) { 186 auto ph = createPerfectHashFunction(keys, rnd, 10); 187 foreach(key; keys) 188 assert(ph(key.key) == key.value); 189 } 190 } 191 192 auto extractG(const ref Vertex[] vertices) { 193 import std.algorithm : map; 194 return vertices.map!(v => v.value); 195 } 196 197 bool calcEdges(Range)(Range keys, const HashFunction[2] hashFunction, ref Edge[] edges, ref Vertex[] vertices) if (is(ElementType!Range == KeyValue)) { 198 import std.algorithm : multiSort; 199 assert(edges.length == 2*keys.length); 200 foreach(idx, key; keys) { 201 auto a = hashFunction[0](key.key); 202 auto b = hashFunction[1](key.key); 203 if (a == b) // each edge must point to 2 distinct vertices 204 return false; 205 edges[idx * 2].vertices[0] = a; 206 edges[idx * 2].vertices[1] = b; 207 edges[idx * 2].hash = key.value; 208 edges[idx * 2 + 1].vertices[0] = b; 209 edges[idx * 2 + 1].vertices[1] = a; 210 edges[idx * 2 + 1].hash = key.value; 211 } 212 edges.multiSort!("a.vertices[0] < b.vertices[0]", "a.vertices[1] < b.vertices[1]"); 213 size_t pos = 0; 214 foreach(idx, ref v; vertices) { 215 if (idx < edges[pos].vertices[0]) { 216 v.edgeIndex = size_t.max; 217 v.value = 0; 218 } else if (idx == edges[pos].vertices[0]) { 219 v.value = 0; 220 v.edgeIndex = pos; 221 pos++; 222 for(;;) { 223 if (pos < edges.length) { 224 if (edges[pos].vertices[0] != idx) 225 break; 226 else if (edges[pos].vertices[1] == edges[pos - 1].vertices[1]) // cannot have duplicated edges 227 return false; 228 } else { 229 if (vertices.length > idx + 1) 230 vertices[idx + 1 .. $] = Vertex(0, size_t.max); 231 return true; 232 } 233 pos++; 234 } 235 } 236 } 237 return true; 238 } 239 240 @("calcEdges") 241 unittest { 242 import std.random; 243 import std.range : enumerate; 244 import std.algorithm : map, maxElement; 245 import std.array : array; 246 import unit_threaded; 247 import std.stdio; 248 249 auto rnd = Random(9812); 250 auto keys = [ 251 "aa", "bb", "cc", "dd", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", 252 "mm", "nn", "oo", "pp", "qq", "rr", "ss", "tt", "uu", "vv", "ww", "xx", "yy", 253 "zz" 254 ].enumerate.map!(e => KeyValue(e[1], cast(uint) e[0])).array(); 255 auto k = keys.length; 256 auto n = k*3; 257 auto maxKeyLength = keys.maxElement!(k => k.key.length).key.length; 258 HashFunction[2] hashFunctions = [ 259 HashFunction(maxKeyLength), HashFunction(maxKeyLength) 260 ]; 261 Vertex[] vertices = new Vertex[n]; 262 Edge[] edges = new Edge[k * 2]; 263 264 for(int x = 0;x < 100000;) { 265 hashFunctions[0].randomize(n, 0, rnd); 266 hashFunctions[1].randomize(n, 1, rnd); 267 if (calcEdges(keys, hashFunctions, edges, vertices)) { 268 foreach(idx, v; vertices) { 269 if (v.edgeIndex == size_t.max) 270 continue; 271 edges[v.edgeIndex].vertices[0].shouldEqual(idx); 272 } 273 x++; 274 } 275 } 276 } 277 278 // since the edges are sorted by vertex indices, we can get all outgoing edges of a vertex 279 // by starting at the index in the edges list for the particular vertex and extract all edges 280 // as long as it starts from that vertex 281 auto outgoing(const ref Edge[] edges, const ref Vertex vertex) @safe nothrow { 282 struct Impl { 283 size_t idx; 284 size_t vertexId; 285 const Edge[] edge; 286 bool empty() @safe nothrow { 287 return idx >= edge.length || edge[idx].vertices[0] != vertexId; 288 } 289 auto front() @safe nothrow { 290 return edge[idx]; 291 } 292 void popFront() @safe nothrow { 293 idx++; 294 } 295 } 296 assert(vertex.edgeIndex != size_t.max); 297 return Impl(vertex.edgeIndex, edges[vertex.edgeIndex].vertices[0], edges); 298 } 299 300 struct ToCheck { 301 size_t vertexId; 302 size_t parentId; 303 } 304 305 auto assumeNoThrow(T)(lazy T t) @trusted { 306 try { 307 return t(); 308 } catch (Exception e) { 309 assert(0, e.message); 310 } 311 } 312 313 @safe nothrow unittest { 314 auto edges = [Edge([0,1],0), Edge([0,2],2), Edge([1,0],0), Edge([1,2],1), Edge([2,0],2), Edge([2,1],1)]; 315 auto vertices = [Vertex(0, 0), Vertex(0, 2), Vertex(0, 4)]; 316 size_t[] rawBits = new size_t[vertices.length / size_t.sizeof + 1]; 317 Appender!(ToCheck[]) toCheck; 318 assert(assignVertexValuesOnlyWhenACyclic(edges, vertices, 3, rawBits, toCheck)); 319 } 320 321 // returns true when cyclic 322 bool assignVertexValuesOnlyWhenACyclic(const ref Edge[] edges, ref Vertex[] vertices, size_t n, size_t[] rawBits, ref Appender!(ToCheck[]) toCheck) @trusted nothrow { 323 import std.bitmanip : BitArray; 324 import std.array : Appender; 325 rawBits[] = 0; 326 auto bitArray = BitArray(rawBits, rawBits.length * size_t.sizeof * 8); 327 assert(bitArray.length >= vertices.length, "need as many bits as vertices"); 328 toCheck.shrinkTo(0).assumeNoThrow(); 329 foreach(idx, startVertex; vertices) { 330 if (bitArray[idx]) 331 continue; 332 startVertex.value = 0; 333 if (startVertex.edgeIndex == size_t.max) 334 continue; 335 bitArray[idx] = true; 336 foreach(edge; edges.outgoing(startVertex)) { 337 auto neighbour = edge.vertices[1]; 338 toCheck.put(ToCheck(neighbour, idx)); 339 bitArray[neighbour] = true; 340 auto diff = (n - startVertex.value + edge.hash) % n; 341 vertices[neighbour].value = diff; 342 } 343 while (toCheck.data.length > 0) { 344 auto check = toCheck.data[$-1]; 345 auto vertex = vertices[check.vertexId]; 346 toCheck.shrinkTo(toCheck.data.length - 1).assumeNoThrow(); 347 if (vertex.edgeIndex == size_t.max) 348 continue; 349 foreach(edge; edges.outgoing(vertex)) { 350 assert(edge.vertices[0] == check.vertexId); 351 auto neighbour = edge.vertices[1]; 352 if (neighbour == check.parentId) 353 continue; 354 if (bitArray[neighbour]) 355 return true; 356 auto diff = (n - vertex.value + edge.hash) % n; 357 vertices[neighbour].value = diff; 358 bitArray[neighbour] = true; 359 toCheck.put(ToCheck(neighbour, check.vertexId)); 360 } 361 } 362 } 363 return false; 364 } 365 366 struct HashFunction { 367 size_t[] coeff; 368 size_t n; 369 size_t o; 370 size_t opCall(string key) const @safe nothrow pure { 371 size_t t; 372 foreach(idx, c; key) 373 t += (c+o) * coeff[idx % coeff.length]; 374 return t % n; 375 } 376 this(size_t k) @safe nothrow { 377 coeff.length = k; 378 } 379 void randomize(Random)(size_t n, size_t o, auto ref Random rnd) @safe nothrow { 380 import std.random : uniform; 381 this.n = n; 382 this.o = o; 383 foreach(ref v; coeff[0..coeff.length]) 384 v = uniform(0, n, rnd).assumeNoThrow(); 385 } 386 auto clone() { 387 auto clone = this; 388 clone.coeff = coeff.dup(); 389 return clone; 390 } 391 }