1 #----------------------------------------------------------------------
4 # Perl module that constructs minimal perfect hash functions
6 # This code constructs a minimal perfect hash function for the given
7 # set of keys, using an algorithm described in
8 # "An optimal algorithm for generating minimal perfect hash functions"
9 # by Czech, Havas and Majewski in Information Processing Letters,
10 # 43(5):256-264, October 1992.
11 # This implementation is loosely based on NetBSD's "nbperf",
12 # which was written by Joerg Sonnenberger.
14 # The resulting hash function is perfect in the sense that if the presented
15 # key is one of the original set, it will return the key's index in the set
16 # (in range 0..N-1). However, the caller must still verify the match,
17 # as false positives are possible. Also, the hash function may return
18 # values that are out of range (negative or >= N), due to summing unrelated
19 # hashtable entries. This indicates that the presented key is definitely
23 # Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
24 # Portions Copyright (c) 1994, Regents of the University of California
26 # src/tools/PerfectHash.pm
28 #----------------------------------------------------------------------
36 # At runtime, we'll compute two simple hash functions of the input key,
37 # and use them to index into a mapping table. The hash functions are just
38 # multiply-and-add in uint32 arithmetic, with different multipliers and
39 # initial seeds. All the complexity in this module is concerned with
40 # selecting hash parameters that will work and building the mapping table.
42 # We support making case-insensitive hash functions, though this only
43 # works for a strict-ASCII interpretation of case insensitivity,
44 # ie, A-Z maps onto a-z and nothing else.
49 # Construct a C function implementing a perfect hash for the given keys.
50 # The C function definition is returned as a string.
52 # The keys should be passed as an array reference. They can be any set
53 # of Perl strings; it is caller's responsibility that there not be any
54 # duplicates. (Note that the "strings" can be binary data, but hashing
55 # e.g. OIDs has endianness hazards that callers must overcome.)
57 # The name to use for the function is specified as the second argument.
58 # It will be a global function by default, but the caller may prepend
59 # "static " to the result string if it wants a static function.
61 # Additional options can be specified as keyword-style arguments:
64 # If specified as true, the hash function is case-insensitive, for the
65 # limited idea of case-insensitivity explained above.
67 # fixed_key_length => N
68 # If specified, all keys are assumed to have length N bytes, and the
69 # hash function signature will be just "int f(const void *key)"
70 # rather than "int f(const void *key, size_t keylen)".
72 sub generate_hash_function
74 my ($keys_ref, $funcname, %options) = @_;
76 # It's not worth passing this around as a parameter; just use a global.
77 $case_fold = $options{case_fold
} || 0;
79 # Try different hash function parameters until we find a set that works
80 # for these keys. The multipliers are chosen to be primes that are cheap
81 # to calculate via shift-and-add, so don't change them without care.
82 # (Commonly, random seeds are tried, but we want reproducible results
83 # from this program so we don't do that.)
90 for ($hash_seed1 = 0; $hash_seed1 < 10; $hash_seed1++)
93 for ($hash_seed2 = 0; $hash_seed2 < 10; $hash_seed2++)
95 foreach (17, 31, 127, 8191)
97 $hash_mult2 = $_; # "foreach $hash_mult2" doesn't work
98 @subresult = _construct_hash_table
(
99 $keys_ref, $hash_mult1, $hash_mult2,
100 $hash_seed1, $hash_seed2);
101 last FIND_PARAMS
if @subresult;
106 # Choke if we couldn't find a workable set of parameters.
107 die "failed to generate perfect hash" if !@subresult;
109 # Extract info from _construct_hash_table's result array.
110 my $elemtype = $subresult[0];
111 my @hashtab = @
{ $subresult[1] };
112 my $nhash = scalar(@hashtab);
114 # OK, construct the hash function definition including the hash table.
116 $f .= sprintf "int\n";
117 if (defined $options{fixed_key_length
})
119 $f .= sprintf "%s(const void *key)\n{\n", $funcname;
123 $f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname;
125 $f .= sprintf "\tstatic const %s h[%d] = {\n\t\t", $elemtype, $nhash;
126 for (my $i = 0; $i < $nhash; $i++)
129 $f .= sprintf "%d", $hashtab[$i];
130 next if ($i == $nhash - 1);
132 # Optional indentation and newline, with eight items per line.
134 ($i % 8 == 7 ?
"\n\t\t" : ' ' x
(6 - length($hashtab[$i])));
136 $f .= sprintf "\n" if ($nhash % 8 != 0);
137 $f .= sprintf "\t};\n\n";
138 $f .= sprintf "\tconst unsigned char *k = (const unsigned char *) key;\n";
139 $f .= sprintf "\tsize_t\t\tkeylen = %d;\n", $options{fixed_key_length
}
140 if (defined $options{fixed_key_length
});
141 $f .= sprintf "\tuint32\t\ta = %d;\n", $hash_seed1;
142 $f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed2;
143 $f .= sprintf "\twhile (keylen--)\n\t{\n";
144 $f .= sprintf "\t\tunsigned char c = *k++";
145 $f .= sprintf " | 0x20" if $case_fold; # see comment below
146 $f .= sprintf ";\n\n";
147 $f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1;
148 $f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2;
149 $f .= sprintf "\t}\n";
150 $f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash;
157 # Calculate a hash function as the run-time code will do.
159 # If we are making a case-insensitive hash function, we implement that
160 # by OR'ing 0x20 into each byte of the key. This correctly transforms
161 # upper-case ASCII into lower-case ASCII, while not changing digits or
162 # dollar signs. (It does change '_', as well as other characters not
163 # likely to appear in keywords; this has little effect on the hash's
164 # ability to discriminate keywords.)
167 my ($key, $mult, $seed) = @_;
170 for my $c (split //, $key)
173 $cn |= 0x20 if $case_fold;
174 $result = ($result * $mult + $cn) % 4294967296;
180 # Attempt to construct a mapping table for a minimal perfect hash function
181 # for the given keys, using the specified hash parameters.
183 # Returns an array containing the mapping table element type name as the
184 # first element, and a ref to an array of the table values as the second.
186 # Returns an empty array on failure; then caller should choose different
187 # hash parameter(s) and try again.
188 sub _construct_hash_table
190 my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed1, $hash_seed2) = @_;
191 my @keys = @
{$keys_ref};
193 # This algorithm is based on a graph whose edges correspond to the
194 # keys and whose vertices correspond to entries of the mapping table.
195 # A key's edge links the two vertices whose indexes are the outputs of
196 # the two hash functions for that key. For K keys, the mapping
197 # table must have at least 2*K+1 entries, guaranteeing that there's at
198 # least one unused entry. (In principle, larger mapping tables make it
199 # easier to find a workable hash and increase the number of inputs that
200 # can be rejected due to touching unused hashtable entries. In practice,
201 # neither effect seems strong enough to justify using a larger table.)
202 my $nedges = scalar @keys; # number of edges
203 my $nverts = 2 * $nedges + 1; # number of vertices
205 # However, it would be very bad if $nverts were exactly equal to either
206 # $hash_mult1 or $hash_mult2: effectively, that hash function would be
207 # sensitive to only the last byte of each key. Cases where $nverts is a
208 # multiple of either multiplier likewise lose information. (But $nverts
209 # can't actually divide them, if they've been intelligently chosen as
210 # primes.) We can avoid such problems by adjusting the table size.
211 while ($nverts % $hash_mult1 == 0
212 || $nverts % $hash_mult2 == 0)
217 # Initialize the array of edges.
219 foreach my $kw (@keys)
221 # Calculate hashes for this key.
222 # The hashes are immediately reduced modulo the mapping table size.
223 my $hash1 = _calc_hash
($kw, $hash_mult1, $hash_seed1) % $nverts;
224 my $hash2 = _calc_hash
($kw, $hash_mult2, $hash_seed2) % $nverts;
226 # If the two hashes are the same for any key, we have to fail
227 # since this edge would itself form a cycle in the graph.
228 return () if $hash1 == $hash2;
230 # Add the edge for this key.
231 push @E, { left
=> $hash1, right
=> $hash2 };
234 # Initialize the array of vertices, giving them all empty lists
235 # of associated edges. (The lists will be hashes of edge numbers.)
237 for (my $v = 0; $v < $nverts; $v++)
239 push @V, { edges
=> {} };
242 # Insert each edge in the lists of edges connected to its vertices.
243 for (my $e = 0; $e < $nedges; $e++)
245 my $v = $E[$e]{left
};
246 $V[$v]{edges
}->{$e} = 1;
249 $V[$v]{edges
}->{$e} = 1;
252 # Now we attempt to prove the graph acyclic.
253 # A cycle-free graph is either empty or has some vertex of degree 1.
254 # Removing the edge attached to that vertex doesn't change this property,
255 # so doing that repeatedly will reduce the size of the graph.
256 # If the graph is empty at the end of the process, it was acyclic.
257 # We track the order of edge removal so that the next phase can process
258 # them in reverse order of removal.
259 my @output_order = ();
261 # Consider each vertex as a possible starting point for edge-removal.
262 for (my $startv = 0; $startv < $nverts; $startv++)
266 # If vertex v is of degree 1 (i.e. exactly 1 edge connects to it),
267 # remove that edge, and then consider the edge's other vertex to see
268 # if it is now of degree 1. The inner loop repeats until reaching a
269 # vertex not of degree 1.
270 while (scalar(keys(%{ $V[$v]{edges
} })) == 1)
272 # Unlink its only edge.
273 my $e = (keys(%{ $V[$v]{edges
} }))[0];
274 delete($V[$v]{edges
}->{$e});
276 # Unlink the edge from its other vertex, too.
277 my $v2 = $E[$e]{left
};
278 $v2 = $E[$e]{right
} if ($v2 == $v);
279 delete($V[$v2]{edges
}->{$e});
281 # Push e onto the front of the output-order list.
282 unshift @output_order, $e;
284 # Consider v2 on next iteration of inner loop.
289 # We succeeded only if all edges were removed from the graph.
290 return () if (scalar(@output_order) != $nedges);
292 # OK, build the hash table of size $nverts.
293 my @hashtab = (0) x
$nverts;
294 # We need a "visited" flag array in this step, too.
295 my @visited = (0) x
$nverts;
297 # The goal is that for any key, the sum of the hash table entries for
298 # its first and second hash values is the desired output (i.e., the key
299 # number). By assigning hash table values in the selected edge order,
300 # we can guarantee that that's true. This works because the edge first
301 # removed from the graph (and hence last to be visited here) must have
302 # at least one vertex it shared with no other edge; hence it will have at
303 # least one vertex (hashtable entry) still unvisited when we reach it here,
304 # and we can assign that unvisited entry a value that makes the sum come
305 # out as we wish. By induction, the same holds for all the other edges.
306 foreach my $e (@output_order)
308 my $l = $E[$e]{left
};
309 my $r = $E[$e]{right
};
312 # $hashtab[$r] might be zero, or some previously assigned value.
313 $hashtab[$l] = $e - $hashtab[$r];
317 die "oops, doubly used hashtab entry" if $visited[$r];
318 # $hashtab[$l] might be zero, or some previously assigned value.
319 $hashtab[$r] = $e - $hashtab[$l];
321 # Now freeze both of these hashtab entries.
326 # Detect range of values needed in hash table.
329 for (my $v = 0; $v < $nverts; $v++)
331 $hmin = $hashtab[$v] if $hashtab[$v] < $hmin;
332 $hmax = $hashtab[$v] if $hashtab[$v] > $hmax;
335 # Choose width of hashtable entries. In addition to the actual values,
336 # we need to be able to store a flag for unused entries, and we wish to
337 # have the property that adding any other entry value to the flag gives
338 # an out-of-range result (>= $nedges).
344 && $hmin + 0x7F >= $nedges)
350 elsif ($hmin >= -0x7FFF
352 && $hmin + 0x7FFF >= $nedges)
356 $unused_flag = 0x7FFF;
358 elsif ($hmin >= -0x7FFFFFFF
359 && $hmax <= 0x7FFFFFFF
360 && $hmin + 0x3FFFFFFF >= $nedges)
364 $unused_flag = 0x3FFFFFFF;
368 die "hash table values too wide";
371 # Set any unvisited hashtable entries to $unused_flag.
372 for (my $v = 0; $v < $nverts; $v++)
374 $hashtab[$v] = $unused_flag if !$visited[$v];
377 return ($elemtype, \
@hashtab);