src/tools/PerfectHash.pm

   1 #----------------------------------------------------------------------
   2 #
   3 # PerfectHash.pm
   4 #    Perl module that constructs minimal perfect hash functions
   5 #
   6 # This code constructs a minimal perfect hash function for the given
   7 # set of keys, using an algorithm described in
   8 # "An optimal algorithm for generating minimal perfect hash functions"
   9 # by Czech, Havas and Majewski in Information Processing Letters,
  10 # 43(5):256-264, October 1992.
  11 # This implementation is loosely based on NetBSD's "nbperf",
  12 # which was written by Joerg Sonnenberger.
  13 #
  14 # The resulting hash function is perfect in the sense that if the presented
  15 # key is one of the original set, it will return the key's index in the set
  16 # (in range 0..N-1).  However, the caller must still verify the match,
  17 # as false positives are possible.  Also, the hash function may return
  18 # values that are out of range (negative or >= N), due to summing unrelated
  19 # hashtable entries.  This indicates that the presented key is definitely
  20 # not in the set.
  21 #
  22 #
  23 # Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  24 # Portions Copyright (c) 1994, Regents of the University of California
  25 #
  26 # src/tools/PerfectHash.pm
  27 #
  28 #----------------------------------------------------------------------
  29
  30 package PerfectHash;
  31
  32 use strict;
  33 use warnings;
  34
  35
  36 # At runtime, we'll compute two simple hash functions of the input key,
  37 # and use them to index into a mapping table.  The hash functions are just
  38 # multiply-and-add in uint32 arithmetic, with different multipliers and
  39 # initial seeds.  All the complexity in this module is concerned with
  40 # selecting hash parameters that will work and building the mapping table.
  41
  42 # We support making case-insensitive hash functions, though this only
  43 # works for a strict-ASCII interpretation of case insensitivity,
  44 # ie, A-Z maps onto a-z and nothing else.
  45 my $case_fold = 0;
  46
  47
  48 #
  49 # Construct a C function implementing a perfect hash for the given keys.
  50 # The C function definition is returned as a string.
  51 #
  52 # The keys should be passed as an array reference.  They can be any set
  53 # of Perl strings; it is caller's responsibility that there not be any
  54 # duplicates.  (Note that the "strings" can be binary data, but hashing
  55 # e.g. OIDs has endianness hazards that callers must overcome.)
  56 #
  57 # The name to use for the function is specified as the second argument.
  58 # It will be a global function by default, but the caller may prepend
  59 # "static " to the result string if it wants a static function.
  60 #
  61 # Additional options can be specified as keyword-style arguments:
  62 #
  63 # case_fold => bool
  64 # If specified as true, the hash function is case-insensitive, for the
  65 # limited idea of case-insensitivity explained above.
  66 #
  67 # fixed_key_length => N
  68 # If specified, all keys are assumed to have length N bytes, and the
  69 # hash function signature will be just "int f(const void *key)"
  70 # rather than "int f(const void *key, size_t keylen)".
  71 #
  72 sub generate_hash_function
  73 {
  74         my ($keys_ref, $funcname, %options) = @_;
  75
  76         # It's not worth passing this around as a parameter; just use a global.
  77         $case_fold = $options{case_fold} || 0;
  78
  79         # Try different hash function parameters until we find a set that works
  80         # for these keys.  The multipliers are chosen to be primes that are cheap
  81         # to calculate via shift-and-add, so don't change them without care.
  82         # (Commonly, random seeds are tried, but we want reproducible results
  83         # from this program so we don't do that.)
  84         my $hash_mult1 = 257;
  85         my $hash_mult2;
  86         my $hash_seed1;
  87         my $hash_seed2;
  88         my @subresult;
  89   FIND_PARAMS:
  90         for ($hash_seed1 = 0; $hash_seed1 < 10; $hash_seed1++)
  91         {
  92
  93                 for ($hash_seed2 = 0; $hash_seed2 < 10; $hash_seed2++)
  94                 {
  95                         foreach (17, 31, 127, 8191)
  96                         {
  97                                 $hash_mult2 = $_;    # "foreach $hash_mult2" doesn't work
  98                                 @subresult = _construct_hash_table(
  99                                         $keys_ref, $hash_mult1, $hash_mult2,
 100                                         $hash_seed1, $hash_seed2);
 101                                 last FIND_PARAMS if @subresult;
 102                         }
 103                 }
 104         }
 105
 106         # Choke if we couldn't find a workable set of parameters.
 107         die "failed to generate perfect hash" if !@subresult;
 108
 109         # Extract info from _construct_hash_table's result array.
 110         my $elemtype = $subresult[0];
 111         my @hashtab = @{ $subresult[1] };
 112         my $nhash = scalar(@hashtab);
 113
 114         # OK, construct the hash function definition including the hash table.
 115         my $f = '';
 116         $f .= sprintf "int\n";
 117         if (defined $options{fixed_key_length})
 118         {
 119                 $f .= sprintf "%s(const void *key)\n{\n", $funcname;
 120         }
 121         else
 122         {
 123                 $f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname;
 124         }
 125         $f .= sprintf "\tstatic const %s h[%d] = {\n\t\t", $elemtype, $nhash;
 126         for (my $i = 0; $i < $nhash; $i++)
 127         {
 128                 # Hash element.
 129                 $f .= sprintf "%d", $hashtab[$i];
 130                 next if ($i == $nhash - 1);
 131
 132                 # Optional indentation and newline, with eight items per line.
 133                 $f .= sprintf ",%s",
 134                   ($i % 8 == 7 ? "\n\t\t" : ' ' x (6 - length($hashtab[$i])));
 135         }
 136         $f .= sprintf "\n" if ($nhash % 8 != 0);
 137         $f .= sprintf "\t};\n\n";
 138         $f .= sprintf "\tconst unsigned char *k = (const unsigned char *) key;\n";
 139         $f .= sprintf "\tsize_t\t\tkeylen = %d;\n", $options{fixed_key_length}
 140           if (defined $options{fixed_key_length});
 141         $f .= sprintf "\tuint32\t\ta = %d;\n", $hash_seed1;
 142         $f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed2;
 143         $f .= sprintf "\twhile (keylen--)\n\t{\n";
 144         $f .= sprintf "\t\tunsigned char c = *k++";
 145         $f .= sprintf " | 0x20" if $case_fold;                 # see comment below
 146         $f .= sprintf ";\n\n";
 147         $f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1;
 148         $f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2;
 149         $f .= sprintf "\t}\n";
 150         $f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash;
 151         $f .= sprintf "}\n";
 152
 153         return $f;
 154 }
 155
 156
 157 # Calculate a hash function as the run-time code will do.
 158 #
 159 # If we are making a case-insensitive hash function, we implement that
 160 # by OR'ing 0x20 into each byte of the key.  This correctly transforms
 161 # upper-case ASCII into lower-case ASCII, while not changing digits or
 162 # dollar signs.  (It does change '_', as well as other characters not
 163 # likely to appear in keywords; this has little effect on the hash's
 164 # ability to discriminate keywords.)
 165 sub _calc_hash
 166 {
 167         my ($key, $mult, $seed) = @_;
 168
 169         my $result = $seed;
 170         for my $c (split //, $key)
 171         {
 172                 my $cn = ord($c);
 173                 $cn |= 0x20 if $case_fold;
 174                 $result = ($result * $mult + $cn) % 4294967296;
 175         }
 176         return $result;
 177 }
 178
 179
 180 # Attempt to construct a mapping table for a minimal perfect hash function
 181 # for the given keys, using the specified hash parameters.
 182 #
 183 # Returns an array containing the mapping table element type name as the
 184 # first element, and a ref to an array of the table values as the second.
 185 #
 186 # Returns an empty array on failure; then caller should choose different
 187 # hash parameter(s) and try again.
 188 sub _construct_hash_table
 189 {
 190         my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed1, $hash_seed2) = @_;
 191         my @keys = @{$keys_ref};
 192
 193         # This algorithm is based on a graph whose edges correspond to the
 194         # keys and whose vertices correspond to entries of the mapping table.
 195         # A key's edge links the two vertices whose indexes are the outputs of
 196         # the two hash functions for that key.  For K keys, the mapping
 197         # table must have at least 2*K+1 entries, guaranteeing that there's at
 198         # least one unused entry.  (In principle, larger mapping tables make it
 199         # easier to find a workable hash and increase the number of inputs that
 200         # can be rejected due to touching unused hashtable entries.  In practice,
 201         # neither effect seems strong enough to justify using a larger table.)
 202         my $nedges = scalar @keys;       # number of edges
 203         my $nverts = 2 * $nedges + 1;    # number of vertices
 204
 205         # However, it would be very bad if $nverts were exactly equal to either
 206         # $hash_mult1 or $hash_mult2: effectively, that hash function would be
 207         # sensitive to only the last byte of each key.  Cases where $nverts is a
 208         # multiple of either multiplier likewise lose information.  (But $nverts
 209         # can't actually divide them, if they've been intelligently chosen as
 210         # primes.)  We can avoid such problems by adjusting the table size.
 211         while ($nverts % $hash_mult1 == 0
 212                 || $nverts % $hash_mult2 == 0)
 213         {
 214                 $nverts++;
 215         }
 216
 217         # Initialize the array of edges.
 218         my @E = ();
 219         foreach my $kw (@keys)
 220         {
 221                 # Calculate hashes for this key.
 222                 # The hashes are immediately reduced modulo the mapping table size.
 223                 my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed1) % $nverts;
 224                 my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed2) % $nverts;
 225
 226                 # If the two hashes are the same for any key, we have to fail
 227                 # since this edge would itself form a cycle in the graph.
 228                 return () if $hash1 == $hash2;
 229
 230                 # Add the edge for this key.
 231                 push @E, { left => $hash1, right => $hash2 };
 232         }
 233
 234         # Initialize the array of vertices, giving them all empty lists
 235         # of associated edges.  (The lists will be hashes of edge numbers.)
 236         my @V = ();
 237         for (my $v = 0; $v < $nverts; $v++)
 238         {
 239                 push @V, { edges => {} };
 240         }
 241
 242         # Insert each edge in the lists of edges connected to its vertices.
 243         for (my $e = 0; $e < $nedges; $e++)
 244         {
 245                 my $v = $E[$e]{left};
 246                 $V[$v]{edges}->{$e} = 1;
 247
 248                 $v = $E[$e]{right};
 249                 $V[$v]{edges}->{$e} = 1;
 250         }
 251
 252         # Now we attempt to prove the graph acyclic.
 253         # A cycle-free graph is either empty or has some vertex of degree 1.
 254         # Removing the edge attached to that vertex doesn't change this property,
 255         # so doing that repeatedly will reduce the size of the graph.
 256         # If the graph is empty at the end of the process, it was acyclic.
 257         # We track the order of edge removal so that the next phase can process
 258         # them in reverse order of removal.
 259         my @output_order = ();
 260
 261         # Consider each vertex as a possible starting point for edge-removal.
 262         for (my $startv = 0; $startv < $nverts; $startv++)
 263         {
 264                 my $v = $startv;
 265
 266                 # If vertex v is of degree 1 (i.e. exactly 1 edge connects to it),
 267                 # remove that edge, and then consider the edge's other vertex to see
 268                 # if it is now of degree 1.  The inner loop repeats until reaching a
 269                 # vertex not of degree 1.
 270                 while (scalar(keys(%{ $V[$v]{edges} })) == 1)
 271                 {
 272                         # Unlink its only edge.
 273                         my $e = (keys(%{ $V[$v]{edges} }))[0];
 274                         delete($V[$v]{edges}->{$e});
 275
 276                         # Unlink the edge from its other vertex, too.
 277                         my $v2 = $E[$e]{left};
 278                         $v2 = $E[$e]{right} if ($v2 == $v);
 279                         delete($V[$v2]{edges}->{$e});
 280
 281                         # Push e onto the front of the output-order list.
 282                         unshift @output_order, $e;
 283
 284                         # Consider v2 on next iteration of inner loop.
 285                         $v = $v2;
 286                 }
 287         }
 288
 289         # We succeeded only if all edges were removed from the graph.
 290         return () if (scalar(@output_order) != $nedges);
 291
 292         # OK, build the hash table of size $nverts.
 293         my @hashtab = (0) x $nverts;
 294         # We need a "visited" flag array in this step, too.
 295         my @visited = (0) x $nverts;
 296
 297         # The goal is that for any key, the sum of the hash table entries for
 298         # its first and second hash values is the desired output (i.e., the key
 299         # number).  By assigning hash table values in the selected edge order,
 300         # we can guarantee that that's true.  This works because the edge first
 301         # removed from the graph (and hence last to be visited here) must have
 302         # at least one vertex it shared with no other edge; hence it will have at
 303         # least one vertex (hashtable entry) still unvisited when we reach it here,
 304         # and we can assign that unvisited entry a value that makes the sum come
 305         # out as we wish.  By induction, the same holds for all the other edges.
 306         foreach my $e (@output_order)
 307         {
 308                 my $l = $E[$e]{left};
 309                 my $r = $E[$e]{right};
 310                 if (!$visited[$l])
 311                 {
 312                         # $hashtab[$r] might be zero, or some previously assigned value.
 313                         $hashtab[$l] = $e - $hashtab[$r];
 314                 }
 315                 else
 316                 {
 317                         die "oops, doubly used hashtab entry" if $visited[$r];
 318                         # $hashtab[$l] might be zero, or some previously assigned value.
 319                         $hashtab[$r] = $e - $hashtab[$l];
 320                 }
 321                 # Now freeze both of these hashtab entries.
 322                 $visited[$l] = 1;
 323                 $visited[$r] = 1;
 324         }
 325
 326         # Detect range of values needed in hash table.
 327         my $hmin = $nedges;
 328         my $hmax = 0;
 329         for (my $v = 0; $v < $nverts; $v++)
 330         {
 331                 $hmin = $hashtab[$v] if $hashtab[$v] < $hmin;
 332                 $hmax = $hashtab[$v] if $hashtab[$v] > $hmax;
 333         }
 334
 335         # Choose width of hashtable entries.  In addition to the actual values,
 336         # we need to be able to store a flag for unused entries, and we wish to
 337         # have the property that adding any other entry value to the flag gives
 338         # an out-of-range result (>= $nedges).
 339         my $elemtype;
 340         my $unused_flag;
 341
 342         if (   $hmin >= -0x7F
 343                 && $hmax <= 0x7F
 344                 && $hmin + 0x7F >= $nedges)
 345         {
 346                 # int8 will work
 347                 $elemtype = 'int8';
 348                 $unused_flag = 0x7F;
 349         }
 350         elsif ($hmin >= -0x7FFF
 351                 && $hmax <= 0x7FFF
 352                 && $hmin + 0x7FFF >= $nedges)
 353         {
 354                 # int16 will work
 355                 $elemtype = 'int16';
 356                 $unused_flag = 0x7FFF;
 357         }
 358         elsif ($hmin >= -0x7FFFFFFF
 359                 && $hmax <= 0x7FFFFFFF
 360                 && $hmin + 0x3FFFFFFF >= $nedges)
 361         {
 362                 # int32 will work
 363                 $elemtype = 'int32';
 364                 $unused_flag = 0x3FFFFFFF;
 365         }
 366         else
 367         {
 368                 die "hash table values too wide";
 369         }
 370
 371         # Set any unvisited hashtable entries to $unused_flag.
 372         for (my $v = 0; $v < $nverts; $v++)
 373         {
 374                 $hashtab[$v] = $unused_flag if !$visited[$v];
 375         }
 376
 377         return ($elemtype, \@hashtab);
 378 }
 379
 380 1;