1 ;;; character-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-
3 ;; Copyright (C) 2015 Free Software Foundation, Inc.
5 ;; Maintainer: emacs-devel@gnu.org
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
;; Global switch for character folding; its default value is nil.
;; character-fold-to-regexp tests this variable first and falls back to
;; a plain regexp-quote of its argument when the value is nil.
;; NOTE(review): the docstring below is not terminated within this
;; excerpt; its closing line(s) appear to be missing -- confirm against
;; the complete file before editing the text.
27 (defvar character-fold-search nil
28 "Non-nil if searches should fold similar characters.
29 This means some characters will match entire groups of characters.
30 For instance, \" will match all variants of double quotes, and
31 the letter a will match all of its accented versions (and then
;; Constant holding the folding table consulted by
;; character-fold-to-regexp.  The visible code proceeds in four steps:
;;   1. walk the decomposition property table so it is populated,
;;   2. collect, per simple character, the strings it should match,
;;   3. add hand-written entries for three quote-like characters,
;;   4. turn every collected list into a single regexp.
;; NOTE(review): several lines of this form are absent from this
;; excerpt.  The bindings of dec, d, k, fold-decomp, idx and chars, and
;; the calls that drive the two-argument lambdas, are not visible here;
;; comments below describe only what the visible lines do.
34 (defconst character-fold-table
;; equiv -- new char-table created with the symbol character-fold-table;
;;          this is the table the later steps fill in.
;; table -- the internal property table for decomposition.
;; func  -- whatever is stored in extra slot 1 of that property table.
36 (let* ((equiv (make-char-table 'character-fold-table
))
37 (table (unicode-property-table-internal 'decomposition
))
38 (func (char-table-extra-slot table
1)))
39 ;; Ensure the table is populated.
;; Only keys that are conses are handled: func is called with the car
;; of the key, the value, and the table itself.
;; NOTE(review): presumably the cons keys are character ranges handed
;; over by a char-table mapping call that is not visible here -- confirm.
41 (lambda (i v
) (when (consp i
) (funcall func
(car i
) v table
)))
44 ;; Compile a list of all complex characters that each simple
45 ;; character should match.
49 ;; Discard a possible formatting tag.
;; A symbol at the head of dec is what is treated as such a tag.
50 (when (symbolp (car dec
))
52 ;; Skip trivial cases like ?a decomposing to (?a).
53 (unless (or (and (eq i
(car dec
))
;; Scan while elements remain in d and nothing has been found yet.
58 (while (and d
(not found
))
60 ;; Is k a number or letter, per unicode standard?
;; found becomes the non-nil tail returned by memq when the general
;; category of k is one of the listed letter (L*) or number (N*) codes.
61 (setq found
(memq (get-char-code-property k
'general-category
)
62 '(Lu Ll Lt Lm Lo Nd Nl No
))))
64 ;; Check if the decomposition has more than one letter,
65 ;; because then we don't want the first letter to match
;; fold-decomp is cleared as soon as a k with a letter or number
;; category is seen while fold-decomp is still set.
68 (when (and fold-decomp
69 (memq (get-char-code-property k
'general-category
)
70 '(Lu Ll Lt Lm Lo Nd Nl No
)))
71 (setq fold-decomp nil
)))
72 ;; If there's no number or letter on the
73 ;; decomposition, take the first character in it.
74 (setq found
(car-safe dec
)))
75 ;; Finally, we only fold multi-char decomposition if at
76 ;; least one of the chars is non-spacing (combining).
;; fold-decomp is reset to nil and then set back to t by the first k
;; whose canonical combining class is greater than 0.
78 (setq fold-decomp nil
)
80 (when (and (not fold-decomp
)
81 (> (get-char-code-property k
'canonical-combining-class
) 0))
82 (setq fold-decomp t
))))
83 ;; Add i to the list of characters that k can
84 ;; represent. Also possibly add its decomposition, so we can
85 ;; match multi-char representations like (format "a%c" 769)
;; Nothing is recorded when found is nil or when i and k are the same
;; character.  chars is i as a one-character string consed onto the
;; list already stored for k in equiv.
86 (when (and found
(not (eq i k
)))
87 (let ((chars (cons (char-to-string i
) (aref equiv k
))))
;; The whole decomposition, joined into one string, goes in front.
90 (cons (apply #'string dec
) chars
)
94 ;; Add some manual entries.
;; Each entry is a list: a key character (double quote, apostrophe,
;; backquote) followed by the strings that key should also match.
95 (dolist (it '((?
\" """ "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»")
96 (?
' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "" "❮" "❯" "‹" "›")
97 (?
` "❛" "‘" "‛" "" "❮" "‹")))
;; The manual strings are placed in front of whatever equiv already
;; holds for idx; existing entries are kept, not replaced.
100 (aset equiv idx
(append chars
(aref equiv idx
)))))
102 ;; Convert the lists of characters we compiled into regexps.
;; The regexp for key i is built by regexp-opt from i itself (as a
;; string) plus the list v collected for it, then stored back in equiv.
104 (lambda (i v
) (let ((re (regexp-opt (cons (char-to-string i
) v
))))
106 (set-char-table-range equiv i re
)
110 "Used for folding characters of the same group during search.")
;; Build a search regexp from STRING using character-fold-table.
;;   STRING -- the text to convert.
;;   LAX    -- optional; when non-nil, whitespace in STRING matches a
;;             run of whitespace (see the per-character lambda below).
;; NOTE(review): in this excerpt the mapcar call has no visible sequence
;; argument and nothing visibly joins its result; those lines appear to
;; be missing -- confirm against the complete file.
113 (defun character-fold-to-regexp (string &optional lax
)
114 "Return a regexp matching anything that character-folds into STRING.
115 If `character-fold-search' is nil, `regexp-quote' string.
116 Otherwise, any character in STRING that has an entry in
117 `character-fold-table' is replaced with that entry (which is a
118 regexp) and other characters are `regexp-quote'd.
119 If LAX is non-nil, any single whitespace character is allowed to
120 match any number of times."
;; Folding is applied only when character-fold-search is non-nil.
121 (if character-fold-search
;; Per-character translation: with LAX set, a space, tab, carriage
;; return or newline becomes the whitespace bracket expression below.
123 (mapcar (lambda (c) (if (and lax
(memq c
'(?\s ?
\t ?
\r ?
\n)))
;; One or more characters from the listed whitespace set.
;; NOTE(review): \x2d is the hyphen character; sitting between \x2002
;; and \x200a it presumably forms a range inside the brackets -- confirm.
124 "[ \t\n\r\xa0\x2002\x2d\x200a\x202f\x205f\x3000]+"
;; Otherwise use the table entry for c, or c itself regexp-quoted when
;; the table has no entry (aref returns nil).
125 (or (aref character-fold-table c
)
126 (regexp-quote (string c
)))))
;; The whole STRING passed through regexp-quote.
128 (regexp-quote string
)))
130 ;;; character-fold.el ends here