// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3.  If not see
// <http://www.gnu.org/licenses/>.
19 // This file provides functions for punycode conversion
20 // See https://datatracker.ietf.org/doc/html/rfc3492
22 #include "rust-system.h"
23 #include "rust-unicode.h"
// Bootstring parameters, set to the values Punycode uses.
// https://tools.ietf.org/html/rfc3492#section-4.

// Number of distinct digit values (a-z and 0-9).
constexpr uint32_t BASE = 36;
// Lower and upper bound of the per-position threshold `t'.
constexpr uint32_t TMIN = 1;
constexpr uint32_t TMAX = 26;
// Constants used by the bias adaptation function.
constexpr uint32_t SKEW = 38;
constexpr uint32_t DAMP = 700;
// Bias used before the first delta has been encoded.
constexpr uint32_t INITIAL_BIAS = 72;
// First code point value that is considered for delta encoding.
constexpr uint32_t INITIAL_N = 128;
// Separates the copied basic code points from the encoded deltas.
constexpr char DELIMITER = '-';
40 extract_basic_string (const std::vector
<Codepoint
> &src
)
42 std::string basic_string
;
46 basic_string
+= c
.as_string ();
52 adapt_bias (uint32_t delta
, const uint32_t n_points
, const bool is_first
)
54 delta
/= is_first
? DAMP
: 2;
55 delta
+= delta
/ n_points
;
58 while (delta
> (BASE
- TMIN
) * TMAX
/ 2)
63 return k
+ (BASE
- TMIN
+ 1) * delta
/ (delta
+ SKEW
);
// Compute LHS - RHS clamped to the range [MIN, MAX].
//
// Returns MIN when LHS - RHS would be at most MIN (including the case
// where LHS <= RHS and the subtraction would wrap), MAX when it would be
// at least MAX, and the plain difference otherwise.  The lower bound is
// tested first, so MIN wins if both bounds apply.
uint32_t
clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
             const uint32_t max)
{
  // Compare the difference rather than `min + rhs' / `max + rhs': those
  // sums wrap around for large RHS and would select the wrong branch.
  if (lhs <= rhs)
    return min;

  const uint32_t diff = lhs - rhs;
  if (diff <= min)
    return min;
  else if (diff >= max)
    return max;
  else
    return diff;
}
79 min_gt_or_eq (const std::vector
<Codepoint
> &l
, const uint32_t threshold
)
81 uint32_t min
= UINT32_MAX
;
83 if (c
.value
>= threshold
&& c
.value
< min
)
// Map the digit value D to its ASCII character: 0..25 become 'a'..'z'
// (97..122) and 26..35 become '0'..'9' (48..57).  D is expected to be
// below 36; larger values are not checked.
char
encode_digit (const uint32_t d)
{
  return static_cast<char> (d + 22 + (d < 26 ? 75 : 0));
}
94 tl::optional
<std::string
>
95 encode_punycode (const Utf8String
&input
)
97 std::vector
<Codepoint
> input_chars
= input
.get_chars ();
99 uint32_t n
= INITIAL_N
;
101 uint32_t bias
= INITIAL_BIAS
;
103 std::string output
= extract_basic_string (input_chars
);
104 uint32_t h
= output
.size ();
105 const uint32_t b
= h
;
109 while (h
< input_chars
.size ())
111 const uint32_t m
= min_gt_or_eq (input_chars
, n
);
113 if (m
- n
> ((UINT32_MAX
- delta
) / (h
+ 1)))
116 delta
+= (m
- n
) * (h
+ 1);
119 for (const auto c
: input_chars
)
123 else if (c
.value
== n
)
126 // encode as a variable length integer
127 for (uint32_t k
= 1;; k
++)
129 const uint32_t kb
= k
* BASE
;
130 const uint32_t t
= clamped_sub (TMIN
, kb
, bias
, TMAX
);
134 output
+= encode_digit (t
+ (q
- t
) % (BASE
- t
));
135 q
= (q
- t
) / (BASE
- t
);
137 output
+= encode_digit (q
);
139 bias
= adapt_bias (delta
, h
+ 1, h
== b
);
158 encode_assert (const std::string
&input
, const std::string
&expected
)
160 Rust::Utf8String input_utf8
161 = Rust::Utf8String::make_utf8_string (input
).value ();
162 std::string actual
= Rust::encode_punycode (input_utf8
).value ();
163 ASSERT_EQ (actual
, expected
);
167 rust_punycode_encode_test ()
169 encode_assert ("abc", "abc-");
170 encode_assert ("12345", "12345-");
171 encode_assert ("香港", "j6w193g");
173 // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
174 encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn");
175 encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye");
176 encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb");
177 encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a");
180 } // namespace selftest