1 ;;; puny.el --- translate non-ASCII domain names to ASCII
3 ;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
25 ;; Written by looking at
26 ;; http://stackoverflow.com/questions/183485/can-anyone-recommend-a-good-free-javascript-for-punycode-to-unicode-conversion
32 (defun puny-encode-domain (domain)
33 "Encode DOMAIN according to the IDNA/punycode algorithm.
34 For instance, \"fśf.org\" => \"xn--ff-2sa.org\"."
35 ;; The vast majority of domain names are not IDNA domain names, so
36 ;; add a check first to avoid doing unnecessary work.
37 (if (string-match "\\'[[:ascii:]]+\\'" domain
)
39 (mapconcat 'puny-encode-string
(split-string domain
"[.]") ".")))
41 (defun puny-encode-string (string)
42 "Encode STRING according to the IDNA/punycode algorithm.
43 This is used to encode non-ASCII domain names.
44 For instance, \"bücher\" => \"xn--bcher-kva\"."
45 (let ((ascii (seq-filter (lambda (char)
48 (if (= (length ascii
) (length string
))
54 (puny-encode-complex (length ascii
) string
)))))
56 (defun puny-decode-domain (domain)
57 "Decode DOMAIN according to the IDNA/punycode algorithm.
58 For instance, \"xn--ff-2sa.org\" => \"fśf.org\"."
59 (mapconcat 'puny-decode-string
(split-string domain
"[.]") "."))
61 (defun puny-decode-string (string)
62 "Decode an IDNA/punycode-encoded string.
63 For instance \"xn--bcher-kva\" => \"bücher\"."
64 (if (string-match "\\`xn--" string
)
65 (puny-decode-string-internal (substring string
4))
68 (defconst puny-initial-n
128)
69 (defconst puny-initial-bias
72)
70 (defconst puny-base
36)
71 (defconst puny-damp
700)
72 (defconst puny-tmin
1)
73 (defconst puny-tmax
26)
74 (defconst puny-skew
28)
78 (defun puny-encode-digit (d)
83 (defun puny-adapt (delta num-points first-time
)
84 (let ((delta (if first-time
88 (setq delta
(+ delta
(/ delta num-points
)))
89 (while (> delta
(/ (* (- puny-base puny-tmin
)
92 (setq delta
(/ delta
(- puny-base puny-tmin
))
94 (+ k
(/ (* (1+ (- puny-base puny-tmin
)) delta
)
95 (+ delta puny-skew
)))))
97 (defun puny-encode-complex (insertion-points string
)
98 (let ((n puny-initial-n
)
100 (bias puny-initial-bias
)
103 (while (< h
(length string
))
104 (setq ijv
(cl-loop for char across string
108 (setq delta
(+ delta
(* (- m n
) (+ h
1)))
110 (cl-loop for char across string
116 (cl-loop with k
= puny-base
120 ((>= k
(+ bias puny-tmax
))
125 do
(push (puny-encode-digit
129 do
(setq q
(/ (- q t1
) (- puny-base t1
))
131 (push (puny-encode-digit q
) result
)
132 (setq bias
(puny-adapt delta
(+ h
1) (= h insertion-points
))
139 (defun puny-decode-digit (cp)
150 (defun puny-decode-string-internal (string)
153 (goto-char (point-max))
154 (search-backward "-" nil
(point-min))
155 ;; The encoded chars are after the final dash.
156 (let ((encoded (buffer-substring (1+ (point)) (point-max)))
159 (bias puny-initial-bias
)
162 (delete-region (point) (point-max))
163 (while (< ic
(length encoded
))
169 (setq digit
(puny-decode-digit (aref encoded ic
)))
171 (cl-incf i
(* digit w
))
175 ((>= k
(+ bias puny-tmax
))
180 do
(setq w
(* w
(- puny-base t1
))
182 (setq out
(1+ (buffer-size)))
183 (setq bias
(puny-adapt (- i old-i
) out
(= old-i
0))))
185 (setq n
(+ n
(/ i out
))
187 (goto-char (point-min))
189 (insert (format "%c" n
))
193 ;; http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
194 ;; http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
196 (defun puny-highly-restrictive-string-p (string)
197 "Say whether STRING is \"highly restrictive\" in the Unicode IDNA sense.
198 See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
199 for details. The main idea is that if you're mixing
200 scripts (like latin and cyrillic), you may confuse the user by
206 (seq-map (lambda (char)
208 ;; These characters are always allowed
210 '(#x0027
; APOSTROPHE
211 #x002D
; HYPHEN-MINUS
215 #x058A
; ARMENIAN HYPHEN
216 #x05F3
; HEBREW PUNCTUATION GERESH
217 #x05F4
; HEBREW PUNCTUATION GERSHAYIM
218 #x0F0B
; TIBETAN MARK INTERSYLLABIC TSHEG
219 #x200C
; ZERO WIDTH NON-JOINER*
220 #x200D
; ZERO WIDTH JOINER*
222 #x2019
; RIGHT SINGLE QUOTATION MARK
223 #x2027
; HYPHENATION POINT
224 #x30A0
; KATAKANA-HIRAGANA DOUBLE HYPHEN
225 #x30FB
)) ; KATAKANA MIDDLE DOT
227 (aref char-script-table char
)))
230 ;; Every character uses the same script.
231 (= (length scripts
) 1)
233 (mapcar (lambda (list)
234 (seq-every-p (lambda (script)
237 '((latin han hiragana kana
)
239 (latin han hangul
)))))))
241 (defun puny-highly-restrictive-domain-p (domain)
242 "Say whether DOMAIN is \"highly restrictive\" in the Unicode IDNA sense.
243 See `puny-highly-restrictive-string-p' for further details."
244 (seq-every-p 'puny-highly-restrictive-string-p
(split-string domain
"[.]")))
248 ;;; puny.el ends here