1 module perfecthash.generator;
2 
3 // Based on: http://ilan.schnell-web.net/prog/perfect-hash/algo.html
4 // which is an illustration of an algorithm by Z. J. Czech, G. Havas and B.S. Majewski which is described in their 1992 paper "An optimal algorithm for generating minimal perfect hash functions" which appeared in Information Processing Letters, 43(5):257-264, 1992.
5 
6 import std.range : ElementType;
7 import std.array : Appender;
8 import std.typecons : Tuple;
9 
10 alias KeyValue = Tuple!(string, "key", uint, "value");
11 
12 struct Vertex {
13   size_t value;
14   size_t edgeIndex;
15 }
16 
17 struct Edge {
18   size_t[2] vertices;
19   size_t hash;
20 }
21 
22 struct PerfectHashFunction {
23   const(size_t)[] G;
24   HashFunction[2] hashFunction;
25   this(const(size_t)[] G, HashFunction[2] hashFunction) @safe nothrow {
26     this.G = G;
27     this.hashFunction = [hashFunction[0].clone, hashFunction[1].clone];
28   }
29   size_t opCall(string key) @safe nothrow pure {
30     return (G[hashFunction[0](key)] + G[hashFunction[1](key)]) % G.length;
31   }
32 }
33 
34 auto createPerfectHashFunction(Random)(string[] keys, Random rnd, ulong maxHnsecs = 1_000_000) @safe nothrow {
35   import std.range : enumerate;
36   import std.algorithm : map;
37   import std.array : array;
38   return createPerfectHashFunction(keys.enumerate.map!(k => KeyValue(k.value, cast(uint)k.index)).array(), rnd, maxHnsecs);
39 }
40 
41 auto createPerfectHashFunction(Range, Random)(Range keys, Random rnd, ulong maxHnsecs = 1_000_000, size_t startN = size_t.max) @safe nothrow if (is(ElementType!Range == KeyValue)) {
42   import std.algorithm : map, maxElement, max;
43   import std.array : array;
44   import std.datetime : Clock;
45   auto start = Clock.currStdTime();
46   auto maxKeyLength = keys.map!(k => k.key.length).maxElement();
47   auto k = keys.length;
48   auto n = startN == size_t.max ? k * 2 : startN;
49   assert(n > k);
50   Vertex[] vertices = new Vertex[n];
51   Edge[] edges = new Edge[k * 2];
52   HashFunction[2] hashFunctions = [HashFunction(maxKeyLength), HashFunction(maxKeyLength)];
53   PerfectHashFunction bestEffort;
54   size_t[] rawBits = new size_t[vertices.length / size_t.sizeof + 1];
55   Appender!(ToCheck[]) toCheck;
56   for(;;) {
57     if (Clock.currStdTime() - start > maxHnsecs && bestEffort.G.length != 0)
58       break;
59     hashFunctions[0].randomize(n, 0, rnd);
60     hashFunctions[1].randomize(n, 1, rnd);
61     if (!calcEdges(keys, hashFunctions, edges, vertices))
62       continue;
63     if (assignVertexValuesOnlyWhenACyclic(edges, vertices, n, rawBits, toCheck))
64       continue;
65     bestEffort = PerfectHashFunction(extractG(vertices).array(), hashFunctions);
66     if (n == k)
67       break;
68     if (Clock.currStdTime() - start > maxHnsecs)
69       break;
70     n = n - (max(1, (n - k) / 20));
71     vertices = vertices[0 .. n];
72     vertices[] = Vertex(0,0);
73   }
74   return bestEffort;
75 }
76 
77 auto recreateEdgesAndVertices(Range)(PerfectHashFunction hf, Range keys) {
78   import std.exception : enforce;
79   import std.typecons : tuple;
80   auto n = hf.G.length;
81   auto k = keys.length;
82   Vertex[] vertices = new Vertex[n];
83   Edge[] edges = new Edge[k * 2];
84   size_t[] rawBits = new size_t[vertices.length / size_t.sizeof + 1];
85   Appender!(ToCheck[]) toCheck;
86   enforce(calcEdges(keys, hf.hashFunction, edges, vertices), "invalid hash function");
87   enforce(!assignVertexValuesOnlyWhenACyclic(edges, vertices, n, rawBits, toCheck), "invalid hash function");
88   return tuple!("edges", "vertices")(edges, vertices);
89 }
90 
91 unittest {
92   import std.random;
93   import std.range : enumerate;
94   import std.algorithm : map, maxElement;
95   import std.array : array;
96   import unit_threaded;
97   import std.stdio;
98 
99   auto rnd = Random(9812);
100   auto keys = [
101     "aa", "bb", "cc", "dd", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll",
102     "mm", "nn", "oo", "pp", "qq", "rr", "ss", "tt", "uu", "vv", "ww", "xx", "yy",
103     "zz"
104   ].enumerate.map!(e => KeyValue(e[1], cast(uint) e[0])).array();
105   foreach(_; 0 .. 100) {
106     auto ph = createPerfectHashFunction(keys, rnd);
107     foreach (kv; keys)
108     {
109       ph(kv.key).shouldEqual(kv.value);
110     }
111   }
112 }
113 
114 version (unittest) {
115   string randomIdentifier(Random)(ref Random rnd, size_t len = 6) {
116     import std.random : uniform;
117     import std.conv : text;
118     import std.algorithm : map;
119     import std.range : iota;
120 
121     enum chars = "abcdefghijklmnopqrstuvwxyz";
122     return iota(0, len).map!(_ => chars[uniform(0, chars.length, rnd)]).text();
123   }
124 }
125 
126 @("keys.random.5")
127 unittest {
128   import std.random;
129   import std.range : enumerate, iota;
130   import std.algorithm : map, maxElement;
131   import std.array : array;
132   import unit_threaded;
133   import std.stdio;
134 
135   auto rnd = Random(9812);
136   auto len = 3365;
137   auto kvs = iota(0, len).map!(_ => rnd.randomIdentifier()).enumerate.map!(e => KeyValue(e[1], cast(uint)e[0])).array();
138 
139   foreach(_; 0 .. 100) {
140     auto ph = kvs.createPerfectHashFunction(rnd);
141     foreach (kv; kvs) {
142       ph(kv.key).shouldEqual(kv.value);
143     }
144   }
145 }
146 
147 string toDModule(ref PerfectHashFunction fun, string symbolName, string moduleName = "") @safe pure {
148   import std.conv : to;
149   import std.string : replace;
150   size_t N = fun.G.length;
151   size_t coeffLen = fun.hashFunction[0].coeff.length;
152   string NType = N > ushort.max ? "uint" : N > ubyte.max ? "ushort" : "ubyte";
153   auto func = q{auto $symbolName(string key) @safe nothrow pure {
154     static $NType hash(alias coeff, size_t offset)(string key) {
155         size_t t;
156         foreach(idx, c; key)
157             t += (c + offset) * coeff[idx % $coeffLen];
158         return t % $N;
159     }
160     static immutable $NType[$N] G = $G;
161     static immutable $NType[$coeffLen] coeffA = $coeffA;
162     static immutable $NType[$coeffLen] coeffB = $coeffB;
163     return (G[hash!(coeffA, 0)(key)] + G[hash!(coeffB, 1)(key)]) % $N;
164 }}.replace("$NType", NType).replace("$moduleName", moduleName).replace("$symbolName", symbolName).replace("$N", N.to!string).replace("$coeffLen", coeffLen.to!string).replace("$G", fun.G.to!string).replace("$coeffA", fun.hashFunction[0].coeff.to!string).replace("$coeffB", fun.hashFunction[1].coeff.to!string);
165   if (moduleName.length > 0)
166     return "module $moduleName;\n".replace("$moduleName", moduleName) ~ func;
167   return func;
168 }
169 
170 @safe nothrow unittest {
171   import std.random;
172   auto keys = ["asdf", "fdsa", "car", "garage", "elephant", "kangeroo"];
173   auto rnd = Random(9123);
174   foreach(i; 0..10000) {
175     auto ph = createPerfectHashFunction(keys, rnd, 10);
176     foreach(idx, key; keys)
177       assert(ph(key) == idx);
178   }
179 }
180 
181 @safe nothrow unittest {
182   import std.random;
183   auto keys = [KeyValue("asdf",0), KeyValue("fdsa",1), KeyValue("car",1), KeyValue("garage",2), KeyValue("elephant",2), KeyValue("kangeroo",0)];
184   auto rnd = Random(9812);
185   foreach(i; 0..10000) {
186     auto ph = createPerfectHashFunction(keys, rnd, 10);
187     foreach(key; keys)
188       assert(ph(key.key) == key.value);
189   }
190 }
191 
192 auto extractG(const ref Vertex[] vertices) {
193   import std.algorithm : map;
194   return vertices.map!(v => v.value);
195 }
196 
197 bool calcEdges(Range)(Range keys, const HashFunction[2] hashFunction, ref Edge[] edges, ref Vertex[] vertices) if (is(ElementType!Range == KeyValue)) {
198   import std.algorithm : multiSort;
199   assert(edges.length == 2*keys.length);
200   foreach(idx, key; keys) {
201     auto a = hashFunction[0](key.key);
202     auto b = hashFunction[1](key.key);
203     if (a == b) // each edge must point to 2 distinct vertices
204       return false;
205     edges[idx * 2].vertices[0] = a;
206     edges[idx * 2].vertices[1] = b;
207     edges[idx * 2].hash = key.value;
208     edges[idx * 2 + 1].vertices[0] = b;
209     edges[idx * 2 + 1].vertices[1] = a;
210     edges[idx * 2 + 1].hash = key.value;
211   }
212   edges.multiSort!("a.vertices[0] < b.vertices[0]", "a.vertices[1] < b.vertices[1]");
213   size_t pos = 0;
214   foreach(idx, ref v; vertices) {
215     if (idx < edges[pos].vertices[0]) {
216       v.edgeIndex = size_t.max;
217       v.value = 0;
218     } else if (idx == edges[pos].vertices[0]) {
219       v.value = 0;
220       v.edgeIndex = pos;
221       pos++;
222       for(;;) {
223         if (pos < edges.length) {
224           if (edges[pos].vertices[0] != idx)
225             break;
226           else if (edges[pos].vertices[1] == edges[pos - 1].vertices[1]) // cannot have duplicated edges
227             return false;
228         } else {
229           if (vertices.length > idx + 1)
230             vertices[idx + 1 .. $] = Vertex(0, size_t.max);
231           return true;
232         }
233         pos++;
234       }
235     }
236   }
237   return true;
238 }
239 
240 @("calcEdges")
241 unittest {
242   import std.random;
243   import std.range : enumerate;
244   import std.algorithm : map, maxElement;
245   import std.array : array;
246   import unit_threaded;
247   import std.stdio;
248 
249   auto rnd = Random(9812);
250   auto keys = [
251     "aa", "bb", "cc", "dd", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll",
252     "mm", "nn", "oo", "pp", "qq", "rr", "ss", "tt", "uu", "vv", "ww", "xx", "yy",
253     "zz"
254                ].enumerate.map!(e => KeyValue(e[1], cast(uint) e[0])).array();
255   auto k = keys.length;
256   auto n = k*3;
257   auto maxKeyLength = keys.maxElement!(k => k.key.length).key.length;
258   HashFunction[2] hashFunctions = [
259     HashFunction(maxKeyLength), HashFunction(maxKeyLength)
260   ];
261   Vertex[] vertices = new Vertex[n];
262   Edge[] edges = new Edge[k * 2];
263 
264   for(int x = 0;x < 100000;) {
265     hashFunctions[0].randomize(n, 0, rnd);
266     hashFunctions[1].randomize(n, 1, rnd);
267   if (calcEdges(keys, hashFunctions, edges, vertices)) {
268     foreach(idx, v; vertices) {
269       if (v.edgeIndex == size_t.max)
270         continue;
271       edges[v.edgeIndex].vertices[0].shouldEqual(idx);
272     }
273     x++;
274   }
275   }
276 }
277 
278 // since the edges are sorted by vertex indices, we can get all outgoing edges of a vertex
279 // by starting at the index in the edges list for the particular vertex and extract all edges
280 // as long as it starts from that vertex
281 auto outgoing(const ref Edge[] edges, const ref Vertex vertex) @safe nothrow {
282   struct Impl {
283     size_t idx;
284     size_t vertexId;
285     const Edge[] edge;
286     bool empty() @safe nothrow {
287       return idx >= edge.length || edge[idx].vertices[0] != vertexId;
288     }
289     auto front() @safe nothrow {
290       return edge[idx];
291     }
292     void popFront() @safe nothrow {
293       idx++;
294     }
295   }
296   assert(vertex.edgeIndex != size_t.max);
297   return Impl(vertex.edgeIndex, edges[vertex.edgeIndex].vertices[0], edges);
298 }
299 
300 struct ToCheck {
301   size_t vertexId;
302   size_t parentId;
303 }
304 
305 auto assumeNoThrow(T)(lazy T t) @trusted {
306   try {
307     return t();
308   } catch (Exception e) {
309     assert(0, e.message);
310   }
311 }
312 
313 @safe nothrow unittest {
314   auto edges = [Edge([0,1],0), Edge([0,2],2), Edge([1,0],0), Edge([1,2],1), Edge([2,0],2), Edge([2,1],1)];
315   auto vertices = [Vertex(0, 0), Vertex(0, 2), Vertex(0, 4)];
316   size_t[] rawBits = new size_t[vertices.length / size_t.sizeof + 1];
317   Appender!(ToCheck[]) toCheck;
318   assert(assignVertexValuesOnlyWhenACyclic(edges, vertices, 3, rawBits, toCheck));
319 }
320 
321 // returns true when cyclic
322 bool assignVertexValuesOnlyWhenACyclic(const ref Edge[] edges, ref Vertex[] vertices, size_t n, size_t[] rawBits, ref Appender!(ToCheck[]) toCheck) @trusted nothrow {
323   import std.bitmanip : BitArray;
324   import std.array : Appender;
325   rawBits[] = 0;
326   auto bitArray = BitArray(rawBits, rawBits.length * size_t.sizeof * 8);
327   assert(bitArray.length >= vertices.length, "need as many bits as vertices");
328   toCheck.shrinkTo(0).assumeNoThrow();
329   foreach(idx, startVertex; vertices) {
330     if (bitArray[idx])
331       continue;
332     startVertex.value = 0;
333     if (startVertex.edgeIndex == size_t.max)
334       continue;
335     bitArray[idx] = true;
336     foreach(edge; edges.outgoing(startVertex)) {
337       auto neighbour = edge.vertices[1];
338       toCheck.put(ToCheck(neighbour, idx));
339       bitArray[neighbour] = true;
340       auto diff = (n - startVertex.value + edge.hash) % n;
341       vertices[neighbour].value = diff;
342     }
343     while (toCheck.data.length > 0) {
344       auto check = toCheck.data[$-1];
345       auto vertex = vertices[check.vertexId];
346       toCheck.shrinkTo(toCheck.data.length - 1).assumeNoThrow();
347       if (vertex.edgeIndex == size_t.max)
348         continue;
349       foreach(edge; edges.outgoing(vertex)) {
350         assert(edge.vertices[0] == check.vertexId);
351         auto neighbour = edge.vertices[1];
352         if (neighbour == check.parentId)
353           continue;
354         if (bitArray[neighbour])
355           return true;
356         auto diff = (n - vertex.value + edge.hash) % n;
357         vertices[neighbour].value = diff;
358         bitArray[neighbour] = true;
359         toCheck.put(ToCheck(neighbour, check.vertexId));
360       }
361     }
362   }
363   return false;
364 }
365 
366 struct HashFunction {
367   size_t[] coeff;
368   size_t n;
369   size_t o;
370   size_t opCall(string key) const @safe nothrow pure {
371     size_t t;
372     foreach(idx, c; key)
373       t += (c+o) * coeff[idx % coeff.length];
374     return t % n;
375   }
376   this(size_t k) @safe nothrow {
377     coeff.length = k;
378   }
379   void randomize(Random)(size_t n, size_t o, auto ref Random rnd) @safe nothrow {
380     import std.random : uniform;
381     this.n = n;
382     this.o = o;
383     foreach(ref v; coeff[0..coeff.length])
384       v = uniform(0, n, rnd).assumeNoThrow();
385   }
386   auto clone() {
387     auto clone = this;
388     clone.coeff = coeff.dup();
389     return clone;
390   }
391 }