2 * Daitch-Mokotoff Soundex
4 * Copyright (c) 2023, PostgreSQL Global Development Group
6 * This module was originally sponsored by Finance Norway /
7 * Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
9 * The implementation of the Daitch-Mokotoff Soundex System aims at correctness
10 * and high performance, and can be summarized as follows:
12 * - The processing of each phoneme is initiated by an O(1) table lookup.
13 * - For phonemes containing more than one character, a coding tree is traversed
14 * to process the complete phoneme.
15 * - The (alternate) soundex codes are produced digit by digit in-place in
16 * another tree structure.
20 * https://www.avotaynu.com/soundex.htm
21 * https://www.jewishgen.org/InfoFiles/Soundex.html
22 * https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex
23 * https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php)
24 * https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java)
25 * https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm)
27 * A few notes on other implementations:
29 * - All other known implementations have the same unofficial rules for "UE",
30 * these are also adapted by this implementation (0, 1, NC).
31 * - The only other known implementation which is capable of generating all
32 * correct soundex codes in all cases is the JOS Soundex Calculator at
33 * https://www.jewishgen.org/jos/jossound.htm
34 * - "J" is considered (only) a vowel in dmlat.php
35 * - The official rules for "RS" are commented out in dmlat.php
36 * - Identical code digits for adjacent letters are not collapsed correctly in
37 * dmsoundex.php when double digit codes are involved. E.g. "BESST" yields
38 * 744300 instead of 743000 as for "BEST".
39 * - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java
40 * - "Y" is not considered a vowel in DaitchMokotoffSoundex.java
45 #include "catalog/pg_type.h"
46 #include "mb/pg_wchar.h"
47 #include "utils/array.h"
48 #include "utils/builtins.h"
49 #include "utils/memutils.h"
53 * The soundex coding chart table is adapted from
54 * https://www.jewishgen.org/InfoFiles/Soundex.html
55 * See daitch_mokotoff_header.pl for details.
58 /* Generated coding chart table */
59 #include "daitch_mokotoff.h"
61 #define DM_CODE_DIGITS 6
63 /* Node in soundex code tree */
64 typedef struct dm_node
66 int soundex_length
; /* Length of generated soundex code */
67 char soundex
[DM_CODE_DIGITS
]; /* Soundex code */
68 int is_leaf
; /* Candidate for complete soundex code */
69 int last_update
; /* Letter number for last update of node */
70 char code_digit
; /* Last code digit, 0 - 9 */
73 * One or two alternate code digits leading to this node. If there are two
74 * digits, one of them is always an 'X'. Repeated code digits and 'X' lead
75 * back to the same node.
77 char prev_code_digits
[2];
78 /* One or two alternate code digits moving forward. */
79 char next_code_digits
[2];
80 /* ORed together code index(es) used to reach current node. */
83 /* Possible nodes branching out from this node - digits 0-9. */
84 struct dm_node
*children
[10];
85 /* Next node in linked list. Alternating index for each iteration. */
86 struct dm_node
*next
[2];
89 /* Template for new node in soundex code tree. */
90 static const dm_node start_node
= {
92 .soundex
= "000000", /* Six digits */
96 .prev_code_digits
= {'\0', '\0'},
97 .next_code_digits
= {'\0', '\0'},
104 /* Dummy soundex codes at end of input. */
105 static const dm_codes end_codes
[2] =
112 /* Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF. */
113 static const char iso8859_1_to_ascii_upper
[] =
114 "`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~ ! ?AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDSAAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY";
116 /* Internal C implementation */
117 static bool daitch_mokotoff_coding(const char *word
, ArrayBuildState
*soundex
);
120 PG_FUNCTION_INFO_V1(daitch_mokotoff
);
123 daitch_mokotoff(PG_FUNCTION_ARGS
)
125 text
*arg
= PG_GETARG_TEXT_PP(0);
128 ArrayBuildState
*soundex
;
129 MemoryContext old_ctx
,
132 /* Work in a temporary context to simplify cleanup. */
133 tmp_ctx
= AllocSetContextCreate(CurrentMemoryContext
,
134 "daitch_mokotoff temporary context",
135 ALLOCSET_DEFAULT_SIZES
);
136 old_ctx
= MemoryContextSwitchTo(tmp_ctx
);
138 /* We must convert the string to UTF-8 if it isn't already. */
139 string
= pg_server_to_any(text_to_cstring(arg
), VARSIZE_ANY_EXHDR(arg
),
142 /* The result is built in this ArrayBuildState. */
143 soundex
= initArrayResult(TEXTOID
, tmp_ctx
, false);
145 if (!daitch_mokotoff_coding(string
, soundex
))
147 /* No encodable characters in input */
148 MemoryContextSwitchTo(old_ctx
);
149 MemoryContextDelete(tmp_ctx
);
153 retval
= makeArrayResult(soundex
, old_ctx
);
155 MemoryContextSwitchTo(old_ctx
);
156 MemoryContextDelete(tmp_ctx
);
158 PG_RETURN_DATUM(retval
);
162 /* Initialize soundex code tree node for next code digit. */
164 initialize_node(dm_node
*node
, int last_update
)
166 if (node
->last_update
< last_update
)
168 node
->prev_code_digits
[0] = node
->next_code_digits
[0];
169 node
->prev_code_digits
[1] = node
->next_code_digits
[1];
170 node
->next_code_digits
[0] = '\0';
171 node
->next_code_digits
[1] = '\0';
172 node
->prev_code_index
= node
->next_code_index
;
173 node
->next_code_index
= 0;
175 node
->last_update
= last_update
;
180 /* Update soundex code tree node with next code digit. */
182 add_next_code_digit(dm_node
*node
, int code_index
, char code_digit
)
184 /* OR in index 1 or 2. */
185 node
->next_code_index
|= code_index
;
187 if (!node
->next_code_digits
[0])
188 node
->next_code_digits
[0] = code_digit
;
189 else if (node
->next_code_digits
[0] != code_digit
)
190 node
->next_code_digits
[1] = code_digit
;
194 /* Mark soundex code tree node as leaf. */
196 set_leaf(dm_node
*first_node
[2], dm_node
*last_node
[2],
197 dm_node
*node
, int ix_node
)
203 if (first_node
[ix_node
] == NULL
)
204 first_node
[ix_node
] = node
;
206 last_node
[ix_node
]->next
[ix_node
] = node
;
208 last_node
[ix_node
] = node
;
209 node
->next
[ix_node
] = NULL
;
214 /* Find next node corresponding to code digit, or create a new node. */
216 find_or_create_child_node(dm_node
*parent
, char code_digit
,
217 ArrayBuildState
*soundex
)
219 int i
= code_digit
- '0';
220 dm_node
**nodes
= parent
->children
;
221 dm_node
*node
= nodes
[i
];
225 /* Found existing child node. Skip completed nodes. */
226 return node
->soundex_length
< DM_CODE_DIGITS
? node
: NULL
;
229 /* Create new child node. */
230 node
= palloc_object(dm_node
);
234 memcpy(node
->soundex
, parent
->soundex
, sizeof(parent
->soundex
));
235 node
->soundex_length
= parent
->soundex_length
;
236 node
->soundex
[node
->soundex_length
++] = code_digit
;
237 node
->code_digit
= code_digit
;
238 node
->next_code_index
= node
->prev_code_index
;
240 if (node
->soundex_length
< DM_CODE_DIGITS
)
246 /* Append completed soundex code to output array. */
247 text
*out
= cstring_to_text_with_len(node
->soundex
,
250 accumArrayResult(soundex
,
251 PointerGetDatum(out
),
254 CurrentMemoryContext
);
260 /* Update node for next code digit(s). */
262 update_node(dm_node
*first_node
[2], dm_node
*last_node
[2],
263 dm_node
*node
, int ix_node
,
264 int letter_no
, int prev_code_index
, int next_code_index
,
265 const char *next_code_digits
, int digit_no
,
266 ArrayBuildState
*soundex
)
269 char next_code_digit
= next_code_digits
[digit_no
];
270 int num_dirty_nodes
= 0;
271 dm_node
*dirty_nodes
[2];
273 initialize_node(node
, letter_no
);
275 if (node
->prev_code_index
&& !(node
->prev_code_index
& prev_code_index
))
278 * If the sound (vowel / consonant) of this letter encoding doesn't
279 * correspond to the coding index of the previous letter, we skip this
280 * letter encoding. Note that currently, only "J" can be either a
281 * vowel or a consonant.
286 if (next_code_digit
== 'X' ||
288 (node
->prev_code_digits
[0] == next_code_digit
||
289 node
->prev_code_digits
[1] == next_code_digit
)))
291 /* The code digit is the same as one of the previous (i.e. not added). */
292 dirty_nodes
[num_dirty_nodes
++] = node
;
295 if (next_code_digit
!= 'X' &&
297 node
->prev_code_digits
[0] != next_code_digit
||
298 node
->prev_code_digits
[1]))
300 /* The code digit is different from one of the previous (i.e. added). */
301 node
= find_or_create_child_node(node
, next_code_digit
, soundex
);
304 initialize_node(node
, letter_no
);
305 dirty_nodes
[num_dirty_nodes
++] = node
;
309 for (i
= 0; i
< num_dirty_nodes
; i
++)
311 /* Add code digit leading to the current node. */
312 add_next_code_digit(dirty_nodes
[i
], next_code_index
, next_code_digit
);
314 if (next_code_digits
[++digit_no
])
316 update_node(first_node
, last_node
, dirty_nodes
[i
], ix_node
,
317 letter_no
, prev_code_index
, next_code_index
,
318 next_code_digits
, digit_no
,
323 /* Add incomplete leaf node to linked list. */
324 set_leaf(first_node
, last_node
, dirty_nodes
[i
], ix_node
);
330 /* Update soundex tree leaf nodes. */
332 update_leaves(dm_node
*first_node
[2], int *ix_node
, int letter_no
,
333 const dm_codes
*codes
, const dm_codes
*next_codes
,
334 ArrayBuildState
*soundex
)
343 int ix_node_next
= (*ix_node
+ 1) & 1; /* Alternating index: 0, 1 */
345 /* Initialize for new linked list of leaves. */
346 first_node
[ix_node_next
] = NULL
;
347 last_node
[ix_node_next
] = NULL
;
349 /* Process all nodes. */
350 for (node
= first_node
[*ix_node
]; node
; node
= node
->next
[*ix_node
])
352 /* One or two alternate code sequences. */
353 for (i
= 0; i
< 2 && (code
= codes
[i
]) && code
[0][0]; i
++)
355 /* Coding for previous letter - before vowel: 1, all other: 2 */
356 int prev_code_index
= (code
[0][0] > '1') + 1;
358 /* One or two alternate next code sequences. */
359 for (j
= 0; j
< 2 && (next_code
= next_codes
[j
]) && next_code
[0][0]; j
++)
361 /* Determine which code to use. */
364 /* This is the first letter. */
367 else if (next_code
[0][0] <= '1')
369 /* The next letter is a vowel. */
374 /* All other cases. */
378 /* One or two sequential code digits. */
379 update_node(first_node
, last_node
, node
, ix_node_next
,
380 letter_no
, prev_code_index
, code_index
,
387 *ix_node
= ix_node_next
;
392 * Return next character, converted from UTF-8 to uppercase ASCII.
393 * *ix is the current string index and is incremented by the character length.
396 read_char(const unsigned char *str
, int *ix
)
398 /* Substitute character for skipped code points. */
399 const char na
= '\x1a';
402 /* Decode UTF-8 character to ISO 10646 code point. */
404 c
= utf8_to_unicode(str
);
406 /* Advance *ix, but (for safety) not if we've reached end of string. */
408 *ix
+= pg_utf_mblen(str
);
411 if (c
>= (unsigned char) '[' && c
<= (unsigned char) ']')
413 /* ASCII characters [, \, and ] are reserved for conversions below. */
418 /* Other non-lowercase ASCII characters can be used as-is. */
423 /* ISO-8859-1 code point; convert to upper-case ASCII via table. */
424 return iso8859_1_to_ascii_upper
[c
- 0x60];
428 /* Conversion of non-ASCII characters in the coding chart. */
431 case 0x0104: /* LATIN CAPITAL LETTER A WITH OGONEK */
432 case 0x0105: /* LATIN SMALL LETTER A WITH OGONEK */
434 case 0x0118: /* LATIN CAPITAL LETTER E WITH OGONEK */
435 case 0x0119: /* LATIN SMALL LETTER E WITH OGONEK */
437 case 0x0162: /* LATIN CAPITAL LETTER T WITH CEDILLA */
438 case 0x0163: /* LATIN SMALL LETTER T WITH CEDILLA */
439 case 0x021A: /* LATIN CAPITAL LETTER T WITH COMMA BELOW */
440 case 0x021B: /* LATIN SMALL LETTER T WITH COMMA BELOW */
449 /* Read next ASCII character, skipping any characters not in [A-\]]. */
451 read_valid_char(const char *str
, int *ix
)
455 while ((c
= read_char((const unsigned char *) str
, ix
)) != '\0')
457 if (c
>= 'A' && c
<= ']')
465 /* Return sound coding for "letter" (letter sequence) */
466 static const dm_codes
*
467 read_letter(const char *str
, int *ix
)
473 const dm_letter
*letters
;
474 const dm_codes
*codes
;
476 /* First letter in sequence. */
477 if ((c
= read_valid_char(str
, ix
)) == '\0')
480 letters
= &letter_
[c
- 'A'];
481 codes
= letters
->codes
;
484 /* Any subsequent letters in sequence. */
485 while ((letters
= letters
->letters
) && (c
= read_valid_char(str
, &i
)))
487 for (j
= 0; (cmp
= letters
[j
].letter
); j
++)
492 letters
= &letters
[j
];
495 /* Coding for letter sequence found. */
496 codes
= letters
->codes
;
504 /* The sequence of letters has no coding. */
514 * Generate all Daitch-Mokotoff soundex codes for word,
515 * adding them to the "soundex" ArrayBuildState.
516 * Returns false if string has no encodable characters, else true.
519 daitch_mokotoff_coding(const char *word
, ArrayBuildState
*soundex
)
524 const dm_codes
*codes
,
526 dm_node
*first_node
[2],
530 if (!(codes
= read_letter(word
, &i
)))
532 /* No encodable character in input. */
536 /* Starting point. */
537 first_node
[ix_node
] = palloc_object(dm_node
);
538 *first_node
[ix_node
] = start_node
;
541 * Loop until either the word input is exhausted, or all generated soundex
542 * codes are completed to six digits.
544 while (codes
&& first_node
[ix_node
])
546 next_codes
= read_letter(word
, &i
);
548 /* Update leaf nodes. */
549 update_leaves(first_node
, &ix_node
, letter_no
,
550 codes
, next_codes
? next_codes
: end_codes
,
557 /* Append all remaining (incomplete) soundex codes to output array. */
558 for (node
= first_node
[ix_node
]; node
; node
= node
->next
[ix_node
])
560 text
*out
= cstring_to_text_with_len(node
->soundex
,
563 accumArrayResult(soundex
,
564 PointerGetDatum(out
),
567 CurrentMemoryContext
);