(gnus-blocked-images): Clarify privacy implications
[emacs.git] / lisp / net / puny.el
blobefa11cf178d341aad865b8d67d9faea51458da16
1 ;;; puny.el --- translate non-ASCII domain names to ASCII
3 ;; Copyright (C) 2015-2018 Free Software Foundation, Inc.
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
6 ;; Keywords: mail, net
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
23 ;;; Commentary:
25 ;; Written by looking at
26 ;; http://stackoverflow.com/questions/183485/can-anyone-recommend-a-good-free-javascript-for-punycode-to-unicode-conversion
28 ;;; Code:
30 (eval-when-compile (require 'cl-lib))
31 (require 'seq)
33 (defun puny-encode-domain (domain)
34 "Encode DOMAIN according to the IDNA/punycode algorithm.
35 For instance, \"fśf.org\" => \"xn--ff-2sa.org\"."
36 ;; The vast majority of domain names are not IDNA domain names, so
37 ;; add a check first to avoid doing unnecessary work.
38 (if (string-match "\\'[[:ascii:]]+\\'" domain)
39 domain
40 (mapconcat 'puny-encode-string (split-string domain "[.]") ".")))
42 (defun puny-encode-string (string)
43 "Encode STRING according to the IDNA/punycode algorithm.
44 This is used to encode non-ASCII domain names.
45 For instance, \"bücher\" => \"xn--bcher-kva\"."
46 (let ((ascii (seq-filter (lambda (char)
47 (< char 128))
48 string)))
49 (if (= (length ascii) (length string))
50 string
51 (concat "xn--"
52 (if (null ascii)
54 (concat ascii "-"))
55 (puny-encode-complex (length ascii) string)))))
57 (defun puny-decode-domain (domain)
58 "Decode DOMAIN according to the IDNA/punycode algorithm.
59 For instance, \"xn--ff-2sa.org\" => \"fśf.org\"."
60 (mapconcat 'puny-decode-string (split-string domain "[.]") "."))
62 (defun puny-decode-string (string)
63 "Decode an IDNA/punycode-encoded string.
64 For instance \"xn--bcher-kva\" => \"bücher\"."
65 (if (string-match "\\`xn--" string)
66 (puny-decode-string-internal (substring string 4))
67 string))
69 (defconst puny-initial-n 128)
70 (defconst puny-initial-bias 72)
71 (defconst puny-base 36)
72 (defconst puny-damp 700)
73 (defconst puny-tmin 1)
74 (defconst puny-tmax 26)
75 (defconst puny-skew 28)
77 ;; 0-25 a-z
78 ;; 26-36 0-9
79 (defun puny-encode-digit (d)
80 (if (< d 26)
81 (+ ?a d)
82 (+ ?0 (- d 26))))
84 (defun puny-adapt (delta num-points first-time)
85 (let ((delta (if first-time
86 (/ delta puny-damp)
87 (/ delta 2)))
88 (k 0))
89 (setq delta (+ delta (/ delta num-points)))
90 (while (> delta (/ (* (- puny-base puny-tmin)
91 puny-tmax)
92 2))
93 (setq delta (/ delta (- puny-base puny-tmin))
94 k (+ k puny-base)))
95 (+ k (/ (* (1+ (- puny-base puny-tmin)) delta)
96 (+ delta puny-skew)))))
98 (defun puny-encode-complex (insertion-points string)
99 (let ((n puny-initial-n)
100 (delta 0)
101 (bias puny-initial-bias)
102 (h insertion-points)
103 result m ijv q)
104 (while (< h (length string))
105 (setq ijv (cl-loop for char across string
106 when (>= char n)
107 minimize char))
108 (setq m ijv)
109 (setq delta (+ delta (* (- m n) (+ h 1)))
110 n m)
111 (cl-loop for char across string
112 when (< char n)
113 do (cl-incf delta)
114 when (= char ijv)
115 do (progn
116 (setq q delta)
117 (cl-loop with k = puny-base
118 for t1 = (cond
119 ((<= k bias)
120 puny-tmin)
121 ((>= k (+ bias puny-tmax))
122 puny-tmax)
124 (- k bias)))
125 while (>= q t1)
126 do (push (puny-encode-digit
127 (+ t1 (mod (- q t1)
128 (- puny-base t1))))
129 result)
130 do (setq q (/ (- q t1) (- puny-base t1))
131 k (+ k puny-base)))
132 (push (puny-encode-digit q) result)
133 (setq bias (puny-adapt delta (+ h 1) (= h insertion-points))
134 delta 0
135 h (1+ h))))
136 (cl-incf delta)
137 (cl-incf n))
138 (nreverse result)))
140 (defun puny-decode-digit (cp)
141 (cond
142 ((<= cp ?9)
143 (+ (- cp ?0) 26))
144 ((<= cp ?Z)
145 (- cp ?A))
146 ((<= cp ?z)
147 (- cp ?a))
149 puny-base)))
151 (defun puny-decode-string-internal (string)
152 (with-temp-buffer
153 (insert string)
154 ;; The encoded chars are after any final dash, else the whole string.
155 (let ((encoded (buffer-substring
156 (if (search-backward "-" nil 'move)
157 (1+ (point))
158 (point))
159 (point-max)))
160 (ic 0)
161 (i 0)
162 (bias puny-initial-bias)
163 (n puny-initial-n)
164 out)
165 (delete-region (point) (point-max))
166 (while (< ic (length encoded))
167 (let ((old-i i)
168 (w 1)
169 (k puny-base)
170 digit t1)
171 (cl-loop do (progn
172 (setq digit (puny-decode-digit (aref encoded ic)))
173 (cl-incf ic)
174 (cl-incf i (* digit w))
175 (setq t1 (cond
176 ((<= k bias)
177 puny-tmin)
178 ((>= k (+ bias puny-tmax))
179 puny-tmax)
181 (- k bias)))))
182 while (>= digit t1)
183 do (setq w (* w (- puny-base t1))
184 k (+ k puny-base)))
185 (setq out (1+ (buffer-size)))
186 (setq bias (puny-adapt (- i old-i) out (= old-i 0))))
188 (setq n (+ n (/ i out))
189 i (mod i out))
190 (goto-char (point-min))
191 (forward-char i)
192 (insert (format "%c" n))
193 (cl-incf i)))
194 (buffer-string)))
196 ;; http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
197 ;; http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
199 (defun puny-highly-restrictive-string-p (string)
200 "Say whether STRING is \"highly restrictive\" in the Unicode IDNA sense.
201 See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
202 for details. The main idea is that if you're mixing
203 scripts (like latin and cyrillic), you may confuse the user by
204 using homographs."
205 (let ((scripts
206 (delq
208 (seq-uniq
209 (seq-map (lambda (char)
210 (if (memq char
211 ;; These characters are always allowed
212 ;; in any string.
213 '(#x0027 ; APOSTROPHE
214 #x002D ; HYPHEN-MINUS
215 #x002E ; FULL STOP
216 #x003A ; COLON
217 #x00B7 ; MIDDLE DOT
218 #x058A ; ARMENIAN HYPHEN
219 #x05F3 ; HEBREW PUNCTUATION GERESH
220 #x05F4 ; HEBREW PUNCTUATION GERSHAYIM
221 #x0F0B ; TIBETAN MARK INTERSYLLABIC TSHEG
222 #x200C ; ZERO WIDTH NON-JOINER*
223 #x200D ; ZERO WIDTH JOINER*
224 #x2010 ; HYPHEN
225 #x2019 ; RIGHT SINGLE QUOTATION MARK
226 #x2027 ; HYPHENATION POINT
227 #x30A0 ; KATAKANA-HIRAGANA DOUBLE HYPHEN
228 #x30FB)) ; KATAKANA MIDDLE DOT
230 (aref char-script-table char)))
231 string)))))
233 ;; Every character uses the same script.
234 (= (length scripts) 1)
235 (seq-some 'identity
236 (mapcar (lambda (list)
237 (seq-every-p (lambda (script)
238 (memq script list))
239 scripts))
240 '((latin han hiragana kana)
241 (latin han bopomofo)
242 (latin han hangul)))))))
244 (defun puny-highly-restrictive-domain-p (domain)
245 "Say whether DOMAIN is \"highly restrictive\" in the Unicode IDNA sense.
246 See `puny-highly-restrictive-string-p' for further details."
247 (seq-every-p 'puny-highly-restrictive-string-p (split-string domain "[.]")))
249 (provide 'puny)
251 ;;; puny.el ends here