1 ;;; puny.el --- translate non-ASCII domain names to ASCII
3 ;; Copyright (C) 2015-2018 Free Software Foundation, Inc.
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
25 ;; Written by looking at
26 ;; http://stackoverflow.com/questions/183485/can-anyone-recommend-a-good-free-javascript-for-punycode-to-unicode-conversion
30 (eval-when-compile (require 'cl-lib
))
33 (defun puny-encode-domain (domain)
34 "Encode DOMAIN according to the IDNA/punycode algorithm.
35 For instance, \"fśf.org\" => \"xn--ff-2sa.org\"."
36 ;; The vast majority of domain names are not IDNA domain names, so
37 ;; add a check first to avoid doing unnecessary work.
38 (if (string-match "\\'[[:ascii:]]+\\'" domain
)
40 (mapconcat 'puny-encode-string
(split-string domain
"[.]") ".")))
42 (defun puny-encode-string (string)
43 "Encode STRING according to the IDNA/punycode algorithm.
44 This is used to encode non-ASCII domain names.
45 For instance, \"bücher\" => \"xn--bcher-kva\"."
46 (let ((ascii (seq-filter (lambda (char)
49 (if (= (length ascii
) (length string
))
55 (puny-encode-complex (length ascii
) string
)))))
57 (defun puny-decode-domain (domain)
58 "Decode DOMAIN according to the IDNA/punycode algorithm.
59 For instance, \"xn--ff-2sa.org\" => \"fśf.org\"."
60 (mapconcat 'puny-decode-string
(split-string domain
"[.]") "."))
62 (defun puny-decode-string (string)
63 "Decode an IDNA/punycode-encoded string.
64 For instance \"xn--bcher-kva\" => \"bücher\"."
65 (if (string-match "\\`xn--" string
)
66 (puny-decode-string-internal (substring string
4))
69 (defconst puny-initial-n
128)
70 (defconst puny-initial-bias
72)
71 (defconst puny-base
36)
72 (defconst puny-damp
700)
73 (defconst puny-tmin
1)
74 (defconst puny-tmax
26)
75 (defconst puny-skew
28)
79 (defun puny-encode-digit (d)
84 (defun puny-adapt (delta num-points first-time
)
85 (let ((delta (if first-time
89 (setq delta
(+ delta
(/ delta num-points
)))
90 (while (> delta
(/ (* (- puny-base puny-tmin
)
93 (setq delta
(/ delta
(- puny-base puny-tmin
))
95 (+ k
(/ (* (1+ (- puny-base puny-tmin
)) delta
)
96 (+ delta puny-skew
)))))
98 (defun puny-encode-complex (insertion-points string
)
99 (let ((n puny-initial-n
)
101 (bias puny-initial-bias
)
104 (while (< h
(length string
))
105 (setq ijv
(cl-loop for char across string
109 (setq delta
(+ delta
(* (- m n
) (+ h
1)))
111 (cl-loop for char across string
117 (cl-loop with k
= puny-base
121 ((>= k
(+ bias puny-tmax
))
126 do
(push (puny-encode-digit
130 do
(setq q
(/ (- q t1
) (- puny-base t1
))
132 (push (puny-encode-digit q
) result
)
133 (setq bias
(puny-adapt delta
(+ h
1) (= h insertion-points
))
140 (defun puny-decode-digit (cp)
151 (defun puny-decode-string-internal (string)
154 ;; The encoded chars are after any final dash, else the whole string.
155 (let ((encoded (buffer-substring
156 (if (search-backward "-" nil
'move
)
162 (bias puny-initial-bias
)
165 (delete-region (point) (point-max))
166 (while (< ic
(length encoded
))
172 (setq digit
(puny-decode-digit (aref encoded ic
)))
174 (cl-incf i
(* digit w
))
178 ((>= k
(+ bias puny-tmax
))
183 do
(setq w
(* w
(- puny-base t1
))
185 (setq out
(1+ (buffer-size)))
186 (setq bias
(puny-adapt (- i old-i
) out
(= old-i
0))))
188 (setq n
(+ n
(/ i out
))
190 (goto-char (point-min))
192 (insert (format "%c" n
))
196 ;; http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
197 ;; http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
199 (defun puny-highly-restrictive-string-p (string)
200 "Say whether STRING is \"highly restrictive\" in the Unicode IDNA sense.
201 See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
202 for details. The main idea is that if you're mixing
203 scripts (like latin and cyrillic), you may confuse the user by
209 (seq-map (lambda (char)
211 ;; These characters are always allowed
213 '(#x0027
; APOSTROPHE
214 #x002D
; HYPHEN-MINUS
218 #x058A
; ARMENIAN HYPHEN
219 #x05F3
; HEBREW PUNCTUATION GERESH
220 #x05F4
; HEBREW PUNCTUATION GERSHAYIM
221 #x0F0B
; TIBETAN MARK INTERSYLLABIC TSHEG
222 #x200C
; ZERO WIDTH NON-JOINER*
223 #x200D
; ZERO WIDTH JOINER*
225 #x2019
; RIGHT SINGLE QUOTATION MARK
226 #x2027
; HYPHENATION POINT
227 #x30A0
; KATAKANA-HIRAGANA DOUBLE HYPHEN
228 #x30FB
)) ; KATAKANA MIDDLE DOT
230 (aref char-script-table char
)))
233 ;; Every character uses the same script.
234 (= (length scripts
) 1)
236 (mapcar (lambda (list)
237 (seq-every-p (lambda (script)
240 '((latin han hiragana kana
)
242 (latin han hangul
)))))))
244 (defun puny-highly-restrictive-domain-p (domain)
245 "Say whether DOMAIN is \"highly restrictive\" in the Unicode IDNA sense.
246 See `puny-highly-restrictive-string-p' for further details."
247 (seq-every-p 'puny-highly-restrictive-string-p
(split-string domain
"[.]")))
251 ;;; puny.el ends here