lib/fstrcmp.c

   1 /* Functions to make fuzzy comparisons between strings
   2    Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2024 Free
   3    Software Foundation, Inc.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17
  18
  19 #include <config.h>
  20
  21 /* Specification.  */
  22 #include "fstrcmp.h"
  23
  24 #include <string.h>
  25 #include <stddef.h>
  26 #include <stdio.h>
  27 #include <stdint.h>
  28 #include <stdlib.h>
  29 #include <limits.h>
  30
  31 #include "glthread/lock.h"
  32 #include "glthread/tls.h"
  33 #include "minmax.h"
  34 #include "xalloc.h"
  35
  36
  37 #define ELEMENT char
  38 #define EQUAL(x,y) ((x) == (y))
  39 #define OFFSET ptrdiff_t
  40 #define OFFSET_MAX PTRDIFF_MAX
  41 #define EXTRA_CONTEXT_FIELDS \
  42   /* The number of edits beyond which the computation can be aborted. */ \
  43   ptrdiff_t edit_count_limit; \
  44   /* The number of edits (= number of elements inserted, plus the number of \
  45      elements deleted), temporarily minus edit_count_limit. */ \
  46   ptrdiff_t edit_count;
  47 #define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++
  48 #define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++
  49 #define NOTE_ORDERED false
  50 #define EARLY_ABORT(ctxt) ctxt->edit_count > 0
  51 /* We don't need USE_HEURISTIC, since it is unlikely in typical uses of
  52    fstrcmp().  */
  53 #include "diffseq.h"
  54
  55
  56 /* Because fstrcmp is typically called multiple times, attempt to minimize
  57    the number of memory allocations performed.  Thus, let a call reuse the
  58    memory already allocated by the previous call, if it is sufficient.
  59    To make it multithread-safe, without need for a lock that protects the
  60    already allocated memory, store the allocated memory per thread.  Free
  61    it only when the thread exits.  */
  62
  63 static gl_tls_key_t buffer_key; /* TLS key for a 'ptrdiff_t *' */
  64 static gl_tls_key_t bufmax_key; /* TLS key for a 'uintptr_t' */
  65
  66 static void
  67 keys_init (void)
  68 {
  69   gl_tls_key_init (buffer_key, free);
  70   gl_tls_key_init (bufmax_key, NULL);
  71   /* The per-thread initial values are NULL and 0, respectively.  */
  72 }
  73
  74 /* Ensure that keys_init is called once only.  */
  75 gl_once_define(static, keys_init_once)
  76
  77 void
  78 fstrcmp_free_resources (void)
  79 {
  80   ptrdiff_t *buffer;
  81
  82   gl_once (keys_init_once, keys_init);
  83   buffer = gl_tls_get (buffer_key);
  84   if (buffer != NULL)
  85     {
  86       gl_tls_set (buffer_key, NULL);
  87       gl_tls_set (bufmax_key, (void *) (uintptr_t) 0);
  88       free (buffer);
  89     }
  90 }
  91
  92
  93 /* In the code below, branch probabilities were measured by Ralf Wildenhues,
  94    by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many
  95    values of LL.  The probability indicates that the condition evaluates
  96    to true; whether that leads to a branch or a non-branch in the code,
  97    depends on the compiler's reordering of basic blocks.  */
  98
  99
 100 double
 101 fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
 102 {
 103   struct context ctxt;
 104   size_t xvec_length = strlen (string1);
 105   size_t yvec_length = strlen (string2);
 106   size_t length_sum = xvec_length + yvec_length;
 107   ptrdiff_t i;
 108
 109   ptrdiff_t fdiag_len;
 110   ptrdiff_t *buffer;
 111   uintptr_t bufmax;
 112
 113   /* short-circuit obvious comparisons */
 114   if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */
 115     return length_sum == 0;
 116
 117   if (! (xvec_length <= length_sum
 118          && length_sum <= MIN (UINTPTR_MAX, PTRDIFF_MAX) - 3))
 119     xalloc_die ();
 120
 121   if (lower_bound > 0)
 122     {
 123       /* Compute a quick upper bound.
 124          Each edit is an insertion or deletion of an element, hence modifies
 125          the length of the sequence by at most 1.
 126          Therefore, when starting from a sequence X and ending at a sequence Y,
 127          with N edits,  | yvec_length - xvec_length | <= N.  (Proof by
 128          induction over N.)
 129          So, at the end, we will have
 130            edit_count >= | xvec_length - yvec_length |.
 131          and hence
 132            result
 133              = (xvec_length + yvec_length - edit_count)
 134                / (xvec_length + yvec_length)
 135              <= (xvec_length + yvec_length - | yvec_length - xvec_length |)
 136                 / (xvec_length + yvec_length)
 137              = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length).
 138        */
 139       ptrdiff_t length_min = MIN (xvec_length, yvec_length);
 140       volatile double upper_bound = 2.0 * length_min / length_sum;
 141
 142       if (upper_bound < lower_bound) /* Prob: 74% */
 143         /* Return an arbitrary value < LOWER_BOUND.  */
 144         return 0.0;
 145
 146 #if CHAR_BIT <= 8
 147       /* When X and Y are both small, avoid the overhead of setting up an
 148          array of size 256.  */
 149       if (length_sum >= 20) /* Prob: 99% */
 150         {
 151           /* Compute a less quick upper bound.
 152              Each edit is an insertion or deletion of a character, hence
 153              modifies the occurrence count of a character by 1 and leaves the
 154              other occurrence counts unchanged.
 155              Therefore, when starting from a sequence X and ending at a
 156              sequence Y, and denoting the occurrence count of C in X with
 157              OCC (X, C), with N edits,
 158                sum_C | OCC (X, C) - OCC (Y, C) | <= N.
 159              (Proof by induction over N.)
 160              So, at the end, we will have
 161                edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |,
 162              and hence
 163                result
 164                  = (xvec_length + yvec_length - edit_count)
 165                    / (xvec_length + yvec_length)
 166                  <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |)
 167                     / (xvec_length + yvec_length).
 168            */
 169           ptrdiff_t occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */
 170           ptrdiff_t sum;
 171           double dsum;
 172
 173           /* Determine the occurrence counts in X.  */
 174           memset (occ_diff, 0, sizeof (occ_diff));
 175           for (i = xvec_length - 1; i >= 0; i--)
 176             occ_diff[(unsigned char) string1[i]]++;
 177           /* Subtract the occurrence counts in Y.  */
 178           for (i = yvec_length - 1; i >= 0; i--)
 179             occ_diff[(unsigned char) string2[i]]--;
 180           /* Sum up the absolute values.  */
 181           sum = 0;
 182           for (i = 0; i <= UCHAR_MAX; i++)
 183             {
 184               ptrdiff_t d = occ_diff[i];
 185               sum += (d >= 0 ? d : -d);
 186             }
 187
 188           dsum = sum;
 189           upper_bound = 1.0 - dsum / length_sum;
 190
 191           if (upper_bound < lower_bound) /* Prob: 66% */
 192             /* Return an arbitrary value < LOWER_BOUND.  */
 193             return 0.0;
 194         }
 195 #endif
 196     }
 197
 198   /* set the info for each string.  */
 199   ctxt.xvec = string1;
 200   ctxt.yvec = string2;
 201
 202   /* Set TOO_EXPENSIVE to be approximate square root of input size,
 203      bounded below by 4096.  */
 204   ctxt.too_expensive = 1;
 205   for (i = xvec_length + yvec_length; i != 0; i >>= 2)
 206     ctxt.too_expensive <<= 1;
 207   if (ctxt.too_expensive < 4096)
 208     ctxt.too_expensive = 4096;
 209
 210   /* Allocate memory for fdiag and bdiag from a thread-local pool.  */
 211   fdiag_len = length_sum + 3;
 212   gl_once (keys_init_once, keys_init);
 213   buffer = gl_tls_get (buffer_key);
 214   bufmax = (uintptr_t) gl_tls_get (bufmax_key);
 215   if (fdiag_len > bufmax)
 216     {
 217       /* Need more memory.  */
 218       bufmax = 2 * bufmax;
 219       if (fdiag_len > bufmax)
 220         bufmax = fdiag_len;
 221       /* Calling xrealloc would be a waste: buffer's contents does not need
 222          to be preserved.  */
 223       free (buffer);
 224       buffer = xnmalloc (bufmax, 2 * sizeof *buffer);
 225       gl_tls_set (buffer_key, buffer);
 226       gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
 227     }
 228   ctxt.fdiag = buffer + yvec_length + 1;
 229   ctxt.bdiag = ctxt.fdiag + fdiag_len;
 230
 231   /* The edit_count is only ever increased.  The computation can be aborted
 232      when
 233        (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length)
 234        < lower_bound,
 235      or equivalently
 236        edit_count > (xvec_length + yvec_length) * (1 - lower_bound)
 237      or equivalently
 238        edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)).
 239      We need to add an epsilon inside the floor(...) argument, to neutralize
 240      rounding errors.  */
 241   ctxt.edit_count_limit =
 242     (lower_bound < 1.0
 243      ? (ptrdiff_t) (length_sum * (1.0 - lower_bound + 0.000001))
 244      : 0);
 245
 246   /* Now do the main comparison algorithm */
 247   ctxt.edit_count = - ctxt.edit_count_limit;
 248   if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */
 249     /* The edit_count passed the limit.  Hence the result would be
 250        < lower_bound.  We can return any value < lower_bound instead.  */
 251     return 0.0;
 252   ctxt.edit_count += ctxt.edit_count_limit;
 253
 254   /* The result is
 255         ((number of chars in common) / (average length of the strings)).
 256      The numerator is
 257         = xvec_length - (number of calls to NOTE_DELETE)
 258         = yvec_length - (number of calls to NOTE_INSERT)
 259         = 1/2 * (xvec_length + yvec_length - (number of edits)).
 260      This is admittedly biased towards finding that the strings are
 261      similar, however it does produce meaningful results.  */
 262   return ((double) (xvec_length + yvec_length - ctxt.edit_count)
 263           / (xvec_length + yvec_length));
 264 }