; Typo fixes, mostly repeated words
[emacs.git] / lisp / net / puny.el
blobaf9b031bf217f84fdd45cd4442c85cfd0270cfe9
1 ;;; puny.el --- translate non-ASCII domain names to ASCII
3 ;; Copyright (C) 2015-2017 Free Software Foundation, Inc.
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
6 ;; Keywords: mail, net
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
23 ;;; Commentary:
25 ;; Written by looking at
26 ;; http://stackoverflow.com/questions/183485/can-anyone-recommend-a-good-free-javascript-for-punycode-to-unicode-conversion
28 ;;; Code:
30 (require 'seq)
32 (defun puny-encode-domain (domain)
33 "Encode DOMAIN according to the IDNA/punycode algorithm.
34 For instance, \"fśf.org\" => \"xn--ff-2sa.org\"."
35 ;; The vast majority of domain names are not IDNA domain names, so
36 ;; add a check first to avoid doing unnecessary work.
37 (if (string-match "\\'[[:ascii:]]+\\'" domain)
38 domain
39 (mapconcat 'puny-encode-string (split-string domain "[.]") ".")))
41 (defun puny-encode-string (string)
42 "Encode STRING according to the IDNA/punycode algorithm.
43 This is used to encode non-ASCII domain names.
44 For instance, \"bücher\" => \"xn--bcher-kva\"."
45 (let ((ascii (seq-filter (lambda (char)
46 (< char 128))
47 string)))
48 (if (= (length ascii) (length string))
49 string
50 (concat "xn--"
51 (if (null ascii)
53 (concat ascii "-"))
54 (puny-encode-complex (length ascii) string)))))
56 (defun puny-decode-domain (domain)
57 "Decode DOMAIN according to the IDNA/punycode algorithm.
58 For instance, \"xn--ff-2sa.org\" => \"fśf.org\"."
59 (mapconcat 'puny-decode-string (split-string domain "[.]") "."))
61 (defun puny-decode-string (string)
62 "Decode an IDNA/punycode-encoded string.
63 For instance \"xn--bcher-kva\" => \"bücher\"."
64 (if (string-match "\\`xn--" string)
65 (puny-decode-string-internal (substring string 4))
66 string))
68 (defconst puny-initial-n 128)
69 (defconst puny-initial-bias 72)
70 (defconst puny-base 36)
71 (defconst puny-damp 700)
72 (defconst puny-tmin 1)
73 (defconst puny-tmax 26)
74 (defconst puny-skew 28)
76 ;; 0-25 a-z
77 ;; 26-36 0-9
78 (defun puny-encode-digit (d)
79 (if (< d 26)
80 (+ ?a d)
81 (+ ?0 (- d 26))))
83 (defun puny-adapt (delta num-points first-time)
84 (let ((delta (if first-time
85 (/ delta puny-damp)
86 (/ delta 2)))
87 (k 0))
88 (setq delta (+ delta (/ delta num-points)))
89 (while (> delta (/ (* (- puny-base puny-tmin)
90 puny-tmax)
91 2))
92 (setq delta (/ delta (- puny-base puny-tmin))
93 k (+ k puny-base)))
94 (+ k (/ (* (1+ (- puny-base puny-tmin)) delta)
95 (+ delta puny-skew)))))
97 (defun puny-encode-complex (insertion-points string)
98 (let ((n puny-initial-n)
99 (delta 0)
100 (bias puny-initial-bias)
101 (h insertion-points)
102 result m ijv q)
103 (while (< h (length string))
104 (setq ijv (cl-loop for char across string
105 when (>= char n)
106 minimize char))
107 (setq m ijv)
108 (setq delta (+ delta (* (- m n) (+ h 1)))
109 n m)
110 (cl-loop for char across string
111 when (< char n)
112 do (cl-incf delta)
113 when (= char ijv)
114 do (progn
115 (setq q delta)
116 (cl-loop with k = puny-base
117 for t1 = (cond
118 ((<= k bias)
119 puny-tmin)
120 ((>= k (+ bias puny-tmax))
121 puny-tmax)
123 (- k bias)))
124 while (>= q t1)
125 do (push (puny-encode-digit
126 (+ t1 (mod (- q t1)
127 (- puny-base t1))))
128 result)
129 do (setq q (/ (- q t1) (- puny-base t1))
130 k (+ k puny-base)))
131 (push (puny-encode-digit q) result)
132 (setq bias (puny-adapt delta (+ h 1) (= h insertion-points))
133 delta 0
134 h (1+ h))))
135 (cl-incf delta)
136 (cl-incf n))
137 (nreverse result)))
139 (defun puny-decode-digit (cp)
140 (cond
141 ((<= cp ?9)
142 (+ (- cp ?0) 26))
143 ((<= cp ?Z)
144 (- cp ?A))
145 ((<= cp ?z)
146 (- cp ?a))
148 puny-base)))
150 (defun puny-decode-string-internal (string)
151 (with-temp-buffer
152 (insert string)
153 ;; The encoded chars are after any final dash, else the whole string.
154 (let ((encoded (buffer-substring
155 (if (search-backward "-" nil 'move)
156 (1+ (point))
157 (point))
158 (point-max)))
159 (ic 0)
160 (i 0)
161 (bias puny-initial-bias)
162 (n puny-initial-n)
163 out)
164 (delete-region (point) (point-max))
165 (while (< ic (length encoded))
166 (let ((old-i i)
167 (w 1)
168 (k puny-base)
169 digit t1)
170 (cl-loop do (progn
171 (setq digit (puny-decode-digit (aref encoded ic)))
172 (cl-incf ic)
173 (cl-incf i (* digit w))
174 (setq t1 (cond
175 ((<= k bias)
176 puny-tmin)
177 ((>= k (+ bias puny-tmax))
178 puny-tmax)
180 (- k bias)))))
181 while (>= digit t1)
182 do (setq w (* w (- puny-base t1))
183 k (+ k puny-base)))
184 (setq out (1+ (buffer-size)))
185 (setq bias (puny-adapt (- i old-i) out (= old-i 0))))
187 (setq n (+ n (/ i out))
188 i (mod i out))
189 (goto-char (point-min))
190 (forward-char i)
191 (insert (format "%c" n))
192 (cl-incf i)))
193 (buffer-string)))
195 ;; http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
196 ;; http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
198 (defun puny-highly-restrictive-string-p (string)
199 "Say whether STRING is \"highly restrictive\" in the Unicode IDNA sense.
200 See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
201 for details. The main idea is that if you're mixing
202 scripts (like latin and cyrillic), you may confuse the user by
203 using homographs."
204 (let ((scripts
205 (delq
207 (seq-uniq
208 (seq-map (lambda (char)
209 (if (memq char
210 ;; These characters are always allowed
211 ;; in any string.
212 '(#x0027 ; APOSTROPHE
213 #x002D ; HYPHEN-MINUS
214 #x002E ; FULL STOP
215 #x003A ; COLON
216 #x00B7 ; MIDDLE DOT
217 #x058A ; ARMENIAN HYPHEN
218 #x05F3 ; HEBREW PUNCTUATION GERESH
219 #x05F4 ; HEBREW PUNCTUATION GERSHAYIM
220 #x0F0B ; TIBETAN MARK INTERSYLLABIC TSHEG
221 #x200C ; ZERO WIDTH NON-JOINER*
222 #x200D ; ZERO WIDTH JOINER*
223 #x2010 ; HYPHEN
224 #x2019 ; RIGHT SINGLE QUOTATION MARK
225 #x2027 ; HYPHENATION POINT
226 #x30A0 ; KATAKANA-HIRAGANA DOUBLE HYPHEN
227 #x30FB)) ; KATAKANA MIDDLE DOT
229 (aref char-script-table char)))
230 string)))))
232 ;; Every character uses the same script.
233 (= (length scripts) 1)
234 (seq-some 'identity
235 (mapcar (lambda (list)
236 (seq-every-p (lambda (script)
237 (memq script list))
238 scripts))
239 '((latin han hiragana kana)
240 (latin han bopomofo)
241 (latin han hangul)))))))
243 (defun puny-highly-restrictive-domain-p (domain)
244 "Say whether DOMAIN is \"highly restrictive\" in the Unicode IDNA sense.
245 See `puny-highly-restrictive-string-p' for further details."
246 (seq-every-p 'puny-highly-restrictive-string-p (split-string domain "[.]")))
248 (provide 'puny)
250 ;;; puny.el ends here