* lisp/character-fold.el: Add back multi-char matching
[emacs.git] / lisp / character-fold.el
blobfb28bae7281ffe3300e8dacf46337fa2ca7405aa
1 ;;; character-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-
3 ;; Copyright (C) 2015 Free Software Foundation, Inc.
5 ;; Maintainer: emacs-devel@gnu.org
6 ;; Keywords: matching
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
23 ;;; Code:
25 (eval-and-compile (put 'character-fold-table 'char-table-extra-slots 1))
27 (defconst character-fold-table
28 (eval-when-compile
29 (let ((equiv (make-char-table 'character-fold-table))
30 (equiv-multi (make-char-table 'character-fold-table))
31 (table (unicode-property-table-internal 'decomposition)))
32 (set-char-table-extra-slot equiv 0 equiv-multi)
34 ;; Ensure the table is populated.
35 (let ((func (char-table-extra-slot table 1)))
36 (map-char-table (lambda (char v)
37 (when (consp char)
38 (funcall func (car char) v table)))
39 table))
41 ;; Compile a list of all complex characters that each simple
42 ;; character should match.
43 ;; In summary this loop does 3 things:
44 ;; - A complex character might be allowed to match its decomp.
45 ;; - The decomp is allowed to match the complex character.
46 ;; - A single char of the decomp might be allowed to match the
47 ;; character.
48 ;; Some examples in the comments below.
49 (map-char-table
50 (lambda (char decomp)
51 (when (consp decomp)
52 ;; Skip trivial cases like ?a decomposing to (?a).
53 (unless (and (not (cdr decomp))
54 (eq char (car decomp)))
55 (if (symbolp (car decomp))
56 ;; Discard a possible formatting tag.
57 (setq decomp (cdr decomp))
58 ;; If there's no formatting tag, ensure that char matches
59 ;; its decomp exactly. This is because we want 'ä' to
60 ;; match 'ä', but we don't want '¹' to match '1'.
61 (aset equiv char
62 (cons (apply #'string decomp)
63 (aref equiv char))))
65 ;; Allow the entire decomp to match char. If decomp has
66 ;; multiple characters, this is done by adding an entry
67 ;; to the alist of the first character in decomp. This
68 ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to
69 ;; match '¹'.
70 (let ((make-decomp-match-char
71 (lambda (decomp char)
72 (if (cdr decomp)
73 (aset equiv-multi (car decomp)
74 (cons (cons (apply #'string (cdr decomp))
75 (regexp-quote (string char)))
76 (aref equiv-multi (car decomp))))
77 (aset equiv (car decomp)
78 (cons (char-to-string char)
79 (aref equiv (car decomp))))))))
80 (funcall make-decomp-match-char decomp char)
81 ;; Do it again, without the non-spacing characters.
82 ;; This allows 'a' to match 'ä'.
83 (let ((simpler-decomp nil)
84 (found-one nil))
85 (dolist (c decomp)
86 (if (> (get-char-code-property c 'canonical-combining-class) 0)
87 (setq found-one t)
88 (push c simpler-decomp)))
89 (when (and simpler-decomp found-one)
90 (funcall make-decomp-match-char simpler-decomp char)
91 ;; Finally, if the decomp only had one spacing
92 ;; character, we allow this character to match the
93 ;; decomp. This is to let 'a' match 'ä'.
94 (unless (cdr simpler-decomp)
95 (aset equiv (car simpler-decomp)
96 (cons (apply #'string decomp)
97 (aref equiv (car simpler-decomp)))))))))))
98 table)
100 ;; Add some manual entries.
101 (dolist (it '((?\" """ "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»")
102 (?' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "󠀢" "❮" "❯" "‹" "›")
103 (?` "❛" "‘" "‛" "󠀢" "❮" "‹")))
104 (let ((idx (car it))
105 (chars (cdr it)))
106 (aset equiv idx (append chars (aref equiv idx)))))
108 ;; Convert the lists of characters we compiled into regexps.
109 (map-char-table
110 (lambda (char dec-list)
111 (let ((re (regexp-opt (cons (char-to-string char) dec-list))))
112 (if (consp char)
113 (set-char-table-range equiv char re)
114 (aset equiv char re))))
115 equiv)
116 equiv))
117 "Used for folding characters of the same group during search.
118 This is a char-table with the `character-fold-table' subtype.
120 Let us refer to the character in question by char-x.
121 Each entry is either nil (meaning char-x only matches literally)
122 or a regexp. This regexp should match anything that char-x can
123 match by itself \(including char-x). For instance, the default
124 regexp for the ?+ character is \"[+⁺₊﬩﹢+]\".
126 This table also has one extra slot which is also a char-table.
127 Each entry in the extra slot is an alist used for multi-character
128 matching (which may be nil). The elements of the alist should
129 have the form (SUFFIX . OTHER-REGEXP). If the characters after
130 char-x are equal to SUFFIX, then this combination of char-x +
131 SUFFIX is allowed to match OTHER-REGEXP. This is in addition to
132 char-x being allowed to match REGEXP.
133 For instance, the default alist for ?f includes:
134 \((\"fl\" . \"\") (\"fi\" . \"\")
135 (\"i\" . \"\") (\"f\" . \"\"))
137 Exceptionally for the space character (32), ALIST is ignored.")
139 (defun character-fold--make-space-string (n)
140 "Return a string that matches N spaces."
141 (format "\\(?:%s\\|%s\\)"
142 (make-string n ?\s)
143 (apply #'concat
144 (make-list n (or (aref character-fold-table ?\s) " ")))))
146 ;;;###autoload
147 (defun character-fold-to-regexp (string &optional _lax from)
148 "Return a regexp matching anything that character-folds into STRING.
149 Any character in STRING that has an entry in
150 `character-fold-table' is replaced with that entry (which is a
151 regexp) and other characters are `regexp-quote'd.
153 If the resulting regexp would be too long for Emacs to handle,
154 just return the result of calling `regexp-quote' on STRING.
156 FROM is for internal use. It specifies an index in the STRING
157 from which to start."
158 (let* ((spaces 0)
159 (multi-char-table (char-table-extra-slot character-fold-table 0))
160 (lower-case-table (current-case-table))
161 (upper-case-table (char-table-extra-slot lower-case-table 0))
162 (i (or from 0))
163 (end (length string))
164 (out nil))
165 ;; When the user types a space, we want to match the table entry
166 ;; for ?\s, which is generally a regexp like "[ ...]". However,
167 ;; the `search-spaces-regexp' variable doesn't "see" spaces inside
168 ;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)"
169 ;; instead (to manually expose a space). Furthermore, the lax
170 ;; search engine acts on a bunch of spaces, not on individual
171 ;; spaces, so if the string contains sequential spaces like " ", we
172 ;; need to keep them grouped together like this: "\\( \\|[ ...][ ...]\\)".
173 (while (< i end)
174 (pcase (aref string i)
175 (`?\s (setq spaces (1+ spaces)))
176 (c (when (> spaces 0)
177 (push (character-fold--make-space-string spaces) out)
178 (setq spaces 0))
179 (let ((regexp (or (aref character-fold-table c)
180 (regexp-quote (string c))))
181 (alist nil))
182 ;; Long string. The regexp would probably be too long.
183 (unless (> end 50)
184 (setq alist (aref multi-char-table c))
185 (when case-fold-search
186 (let ((other-c (aref lower-case-table c)))
187 (when (or (not other-c)
188 (eq other-c c))
189 (setq other-c (aref upper-case-table c)))
190 (when other-c
191 (setq alist (append alist (aref multi-char-table other-c)))
192 (setq regexp (concat "\\(?:" regexp "\\|"
193 (or (aref character-fold-table other-c)
194 (regexp-quote (string other-c)))
195 "\\)"))))))
196 (push (let ((matched-entries nil)
197 (max-length 0))
198 (dolist (entry alist)
199 (let* ((suffix (car entry))
200 (len-suf (length suffix)))
201 (when (eq (compare-strings suffix 0 nil
202 string (1+ i) (+ i 1 len-suf)
203 nil)
205 (push (cons len-suf (cdr entry)) matched-entries)
206 (setq max-length (max max-length len-suf)))))
207 ;; If no suffixes matched, just go on.
208 (if (not matched-entries)
209 regexp
210 ;;; If N suffixes match, we "branch" out into N+1 executions for the
211 ;;; length of the longest match. This means "fix" will match "fix" but
212 ;;; not "fⅸ", but it's necessary to keep the regexp size from scaling
213 ;;; exponentially. See https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html
214 (let ((subs (substring string (1+ i) (+ i 1 max-length))))
215 ;; `i' is still going to inc by 1 below.
216 (setq i (+ i max-length))
217 (concat
218 "\\(?:"
219 (mapconcat (lambda (entry)
220 (let ((length (car entry))
221 (suffix-regexp (cdr entry)))
222 (concat suffix-regexp
223 (character-fold-to-regexp subs nil length))))
224 `((0 . ,regexp) . ,matched-entries) "\\|")
225 "\\)"))))
226 out))))
227 (setq i (1+ i)))
228 (when (> spaces 0)
229 (push (character-fold--make-space-string spaces) out))
230 (let ((regexp (apply #'concat (nreverse out))))
231 ;; Limited by `MAX_BUF_SIZE' in `regex.c'.
232 (if (> (length regexp) 10000)
233 (regexp-quote string)
234 regexp))))
237 ;;; Commands provided for completeness.
238 (defun character-fold-search-forward (string &optional bound noerror count)
239 "Search forward for a character-folded version of STRING.
240 STRING is converted to a regexp with `character-fold-to-regexp',
241 which is searched for with `re-search-forward'.
242 BOUND NOERROR COUNT are passed to `re-search-forward'."
243 (interactive "sSearch: ")
244 (re-search-forward (character-fold-to-regexp string) bound noerror count))
246 (defun character-fold-search-backward (string &optional bound noerror count)
247 "Search backward for a character-folded version of STRING.
248 STRING is converted to a regexp with `character-fold-to-regexp',
249 which is searched for with `re-search-backward'.
250 BOUND NOERROR COUNT are passed to `re-search-backward'."
251 (interactive "sSearch: ")
252 (re-search-backward (character-fold-to-regexp string) bound noerror count))
254 (provide 'character-fold)
256 ;;; character-fold.el ends here