Merge from origin/emacs-25
[emacs.git] / lisp / net / puny.el
blob50bde85287d60c9f31f714b4db463480067ba3ae
1 ;;; puny.el --- translate non-ASCII domain names to ASCII
3 ;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
6 ;; Keywords: mail, net
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
23 ;;; Commentary:
25 ;; Written by looking at
26 ;; http://stackoverflow.com/questions/183485/can-anyone-recommend-a-good-free-javascript-for-punycode-to-unicode-conversion
28 ;;; Code:
30 (require 'seq)
32 (defun puny-encode-domain (domain)
33 "Encode DOMAIN according to the IDNA/punycode algorithm.
34 For instance, \"fśf.org\" => \"xn--ff-2sa.org\"."
35 ;; The vast majority of domain names are not IDNA domain names, so
36 ;; add a check first to avoid doing unnecessary work.
37 (if (string-match "\\'[[:ascii:]]+\\'" domain)
38 domain
39 (mapconcat 'puny-encode-string (split-string domain "[.]") ".")))
41 (defun puny-encode-string (string)
42 "Encode STRING according to the IDNA/punycode algorithm.
43 This is used to encode non-ASCII domain names.
44 For instance, \"bücher\" => \"xn--bcher-kva\"."
45 (let ((ascii (seq-filter (lambda (char)
46 (< char 128))
47 string)))
48 (if (= (length ascii) (length string))
49 string
50 (concat "xn--"
51 (if (null ascii)
53 (concat ascii "-"))
54 (puny-encode-complex (length ascii) string)))))
56 (defun puny-decode-domain (domain)
57 "Decode DOMAIN according to the IDNA/punycode algorithm.
58 For instance, \"xn--ff-2sa.org\" => \"fśf.org\"."
59 (mapconcat 'puny-decode-string (split-string domain "[.]") "."))
61 (defun puny-decode-string (string)
62 "Decode an IDNA/punycode-encoded string.
63 For instance \"xn--bcher-kva\" => \"bücher\"."
64 (if (string-match "\\`xn--" string)
65 (puny-decode-string-internal (substring string 4))
66 string))
68 (defconst puny-initial-n 128)
69 (defconst puny-initial-bias 72)
70 (defconst puny-base 36)
71 (defconst puny-damp 700)
72 (defconst puny-tmin 1)
73 (defconst puny-tmax 26)
74 (defconst puny-skew 28)
76 ;; 0-25 a-z
77 ;; 26-36 0-9
78 (defun puny-encode-digit (d)
79 (if (< d 26)
80 (+ ?a d)
81 (+ ?0 (- d 26))))
83 (defun puny-adapt (delta num-points first-time)
84 (let ((delta (if first-time
85 (/ delta puny-damp)
86 (/ delta 2)))
87 (k 0))
88 (setq delta (+ delta (/ delta num-points)))
89 (while (> delta (/ (* (- puny-base puny-tmin)
90 puny-tmax)
91 2))
92 (setq delta (/ delta (- puny-base puny-tmin))
93 k (+ k puny-base)))
94 (+ k (/ (* (1+ (- puny-base puny-tmin)) delta)
95 (+ delta puny-skew)))))
97 (defun puny-encode-complex (insertion-points string)
98 (let ((n puny-initial-n)
99 (delta 0)
100 (bias puny-initial-bias)
101 (h insertion-points)
102 result m ijv q)
103 (while (< h (length string))
104 (setq ijv (cl-loop for char across string
105 when (>= char n)
106 minimize char))
107 (setq m ijv)
108 (setq delta (+ delta (* (- m n) (+ h 1)))
109 n m)
110 (cl-loop for char across string
111 when (< char n)
112 do (cl-incf delta)
113 when (= char ijv)
114 do (progn
115 (setq q delta)
116 (cl-loop with k = puny-base
117 for t1 = (cond
118 ((<= k bias)
119 puny-tmin)
120 ((>= k (+ bias puny-tmax))
121 puny-tmax)
123 (- k bias)))
124 while (>= q t1)
125 do (push (puny-encode-digit
126 (+ t1 (mod (- q t1)
127 (- puny-base t1))))
128 result)
129 do (setq q (/ (- q t1) (- puny-base t1))
130 k (+ k puny-base)))
131 (push (puny-encode-digit q) result)
132 (setq bias (puny-adapt delta (+ h 1) (= h insertion-points))
133 delta 0
134 h (1+ h))))
135 (cl-incf delta)
136 (cl-incf n))
137 (nreverse result)))
139 (defun puny-decode-digit (cp)
140 (cond
141 ((<= cp ?9)
142 (+ (- cp ?0) 26))
143 ((<= cp ?Z)
144 (- cp ?A))
145 ((<= cp ?z)
146 (- cp ?a))
148 puny-base)))
150 (defun puny-decode-string-internal (string)
151 (with-temp-buffer
152 (insert string)
153 (goto-char (point-max))
154 (search-backward "-" nil (point-min))
155 ;; The encoded chars are after the final dash.
156 (let ((encoded (buffer-substring (1+ (point)) (point-max)))
157 (ic 0)
158 (i 0)
159 (bias puny-initial-bias)
160 (n puny-initial-n)
161 out)
162 (delete-region (point) (point-max))
163 (while (< ic (length encoded))
164 (let ((old-i i)
165 (w 1)
166 (k puny-base)
167 digit t1)
168 (cl-loop do (progn
169 (setq digit (puny-decode-digit (aref encoded ic)))
170 (cl-incf ic)
171 (cl-incf i (* digit w))
172 (setq t1 (cond
173 ((<= k bias)
174 puny-tmin)
175 ((>= k (+ bias puny-tmax))
176 puny-tmax)
178 (- k bias)))))
179 while (>= digit t1)
180 do (setq w (* w (- puny-base t1))
181 k (+ k puny-base)))
182 (setq out (1+ (buffer-size)))
183 (setq bias (puny-adapt (- i old-i) out (= old-i 0))))
185 (setq n (+ n (/ i out))
186 i (mod i out))
187 (goto-char (point-min))
188 (forward-char i)
189 (insert (format "%c" n))
190 (cl-incf i)))
191 (buffer-string)))
193 ;; http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
194 ;; http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
196 (defun puny-highly-restrictive-string-p (string)
197 "Say whether STRING is \"highly restrictive\" in the Unicode IDNA sense.
198 See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
199 for details. The main idea is that if you're mixing
200 scripts (like latin and cyrillic), you may confuse the user by
201 using homographs."
202 (let ((scripts
203 (delq
205 (seq-uniq
206 (seq-map (lambda (char)
207 (if (memq char
208 ;; These characters are always allowed
209 ;; in any string.
210 '(#x0027 ; APOSTROPHE
211 #x002D ; HYPHEN-MINUS
212 #x002E ; FULL STOP
213 #x003A ; COLON
214 #x00B7 ; MIDDLE DOT
215 #x058A ; ARMENIAN HYPHEN
216 #x05F3 ; HEBREW PUNCTUATION GERESH
217 #x05F4 ; HEBREW PUNCTUATION GERSHAYIM
218 #x0F0B ; TIBETAN MARK INTERSYLLABIC TSHEG
219 #x200C ; ZERO WIDTH NON-JOINER*
220 #x200D ; ZERO WIDTH JOINER*
221 #x2010 ; HYPHEN
222 #x2019 ; RIGHT SINGLE QUOTATION MARK
223 #x2027 ; HYPHENATION POINT
224 #x30A0 ; KATAKANA-HIRAGANA DOUBLE HYPHEN
225 #x30FB)) ; KATAKANA MIDDLE DOT
227 (aref char-script-table char)))
228 string)))))
230 ;; Every character uses the same script.
231 (= (length scripts) 1)
232 (seq-some 'identity
233 (mapcar (lambda (list)
234 (seq-every-p (lambda (script)
235 (memq script list))
236 scripts))
237 '((latin han hiragana kana)
238 (latin han bopomofo)
239 (latin han hangul)))))))
241 (defun puny-highly-restrictive-domain-p (domain)
242 "Say whether DOMAIN is \"highly restrictive\" in the Unicode IDNA sense.
243 See `puny-highly-restrictive-string-p' for further details."
244 (seq-every-p 'puny-highly-restrictive-string-p (split-string domain "[.]")))
246 (provide 'puny)
248 ;;; puny.el ends here