1 /* idna.c Convert to or from IDN strings.
2 * Copyright (C) 2002 Simon Josefsson
4 * This file is part of Libstringprep.
6 * Libstringprep is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libstringprep is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libstringprep; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 * @in: input array with unicode code points.
26 * @inlen: length of input array with unicode code points.
27 * @out: output zero terminated string that must have room for at
28 * least 63 characters plus the terminating zero.
29 * @allowunassigned: boolean value as per IDNA specification.
30 * @usestd3asciirules: boolean value as per IDNA specification.
32 * The ToASCII operation takes a sequence of Unicode code points that make
33 * up one label and transforms it into a sequence of code points in the
34 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
35 * resulting sequence are equivalent labels.
37 * It is important to note that the ToASCII operation can fail. ToASCII
38 * fails if any step of it fails. If any step of the ToASCII operation
39 * fails on any label in a domain name, that domain name MUST NOT be used
40 * as an internationalized domain name. The method for dealing with this
41 * failure is application-specific.
43 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
44 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
45 * sequence of ASCII code points or a failure condition.
47 * ToASCII never alters a sequence of code points that are all in the ASCII
48 * range to begin with (although it could fail). Applying the ToASCII
49 * operation multiple times has exactly the same effect as applying it just once.
53 idna_to_ascii (const unsigned long *in
, size_t inlen
,
54 char *out
, int allowunassigned
, int usestd3asciirules
)
60 src
= malloc (sizeof (in
[0]) * inlen
+ 1);
62 return IDNA_MALLOC_ERROR
;
64 memcpy (src
, in
, sizeof (in
[0]) * inlen
);
68 * ToASCII consists of the following steps:
70 * 1. If all code points in the sequence are in the ASCII range (0..7F)
71 * then skip to step 3.
79 for (i
= 0; src
[i
]; i
++)
87 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
88 * an error. The AllowUnassigned flag is used in [NAMEPREP].
94 p
= stringprep_ucs4_to_utf8 (src
, inlen
, NULL
, NULL
);
96 return IDNA_MALLOC_ERROR
;
98 p
= realloc (p
, BUFSIZ
);
100 return IDNA_MALLOC_ERROR
;
103 rc
= stringprep_nameprep (p
, BUFSIZ
);
105 rc
= stringprep_nameprep_no_unassigned (p
, BUFSIZ
);
107 if (rc
!= STRINGPREP_OK
)
108 return IDNA_STRINGPREP_ERROR
;
112 src
= stringprep_utf8_to_ucs4 (p
, -1, NULL
);
117 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
119 * (a) Verify the absence of non-LDH ASCII code points; that is,
120 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
122 * (b) Verify the absence of leading and trailing hyphen-minus;
123 * that is, the absence of U+002D at the beginning and end of the sequence.
127 if (usestd3asciirules
)
131 for (i
= 0; src
[i
]; i
++)
132 if (src
[i
] <= 0x2C || src
[i
] == 0x2E || src
[i
] == 0x2F ||
133 (src
[i
] >= 0x3A && src
[i
] <= 0x40) ||
134 (src
[i
] >= 0x5B && src
[i
] <= 0x60) ||
135 (src
[i
] >= 0x7B && src
[i
] <= 0x7F))
136 return IDNA_CONTAINS_LDH
;
138 if (src
[0] == 0x002D || (i
> 0 && src
[i
- 1] == 0x002D))
139 return IDNA_CONTAINS_MINUS
;
143 * 4. If all code points in the sequence are in the ASCII range
144 * (0..7F), then skip to step 8.
152 for (i
= 0; src
[i
]; i
++)
160 * 5. Verify that the sequence does NOT begin with the ACE prefix.
169 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
170 * and fail if there is an error.
172 for (len
= 0; src
[len
]; len
++)
175 outlen
= 63 - strlen (IDNA_ACE_PREFIX
);
176 rc
= punycode_encode (len
, src
, NULL
,
177 &outlen
, &out
[strlen (IDNA_ACE_PREFIX
)]);
178 if (rc
!= PUNYCODE_SUCCESS
)
179 return IDNA_PUNYCODE_ERROR
;
180 out
[strlen (IDNA_ACE_PREFIX
) + outlen
] = '\0';
183 * 7. Prepend the ACE prefix.
186 memcpy (out
, IDNA_ACE_PREFIX
, strlen (IDNA_ACE_PREFIX
));
189 * 8. Verify that the number of code points is in the range 1 to 63 inclusive.
194 if (strlen (out
) < 1 || strlen (out
) > 63)
195 return IDNA_INVALID_LENGTH
;
201 idna_to_unicode_internal (const unsigned long *in
, size_t inlen
,
202 unsigned long *out
, size_t * outlen
,
203 int allowunassigned
, int usestd3asciirules
,
204 char *utf8in
, size_t utf8len
)
210 * 1. If all code points in the sequence are in the ASCII range (0..7F)
211 * then skip to step 3.
219 for (i
= 0; in
[i
]; i
++)
227 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
228 * error. (If step 3 of ToASCII is also performed here, it will not
229 * affect the overall behavior of ToUnicode, but it is not
230 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
234 rc
= stringprep_nameprep (utf8in
, utf8len
);
236 rc
= stringprep_nameprep_no_unassigned (utf8in
, utf8len
);
238 if (rc
!= STRINGPREP_OK
)
239 return IDNA_STRINGPREP_ERROR
;
241 /* 3. Verify that the sequence begins with the ACE prefix, and save a
242 * copy of the sequence.
246 if (memcmp (IDNA_ACE_PREFIX
, utf8in
, strlen (IDNA_ACE_PREFIX
)) != 0)
247 return IDNA_NO_ACE_PREFIX
;
249 /* 4. Remove the ACE prefix.
252 memmove (utf8in
, &utf8in
[strlen (IDNA_ACE_PREFIX
)],
253 strlen (utf8in
) - strlen (IDNA_ACE_PREFIX
) + 1);
255 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
256 * and fail if there is an error. Save a copy of the result of this step.
260 rc
= punycode_decode (strlen(utf8in
), utf8in
, outlen
, out
, NULL
);
261 if (rc
!= PUNYCODE_SUCCESS
)
262 return IDNA_PUNYCODE_ERROR
;
267 rc
= idna_to_ascii (out
, *outlen
, tmpout
,
268 allowunassigned
, usestd3asciirules
);
269 if (rc
!= IDNA_SUCCESS
)
272 /* 7. Verify that the result of step 6 matches the saved copy from
273 * step 3, using a case-insensitive ASCII comparison.
276 if (strcasecmp(utf8in
, tmpout
+ strlen(IDNA_ACE_PREFIX
)) != 0)
277 return IDNA_ROUNDTRIP_VERIFY_ERROR
;
279 /* 8. Return the saved copy from step 5.
286 * @in: input array with unicode code points.
287 * @inlen: length of input array with unicode code points.
288 * @out: output array with unicode code points.
289 * @outlen: on input, maximum size of output array with unicode code points,
290 * on exit, actual size of output array with unicode code points.
291 * @allowunassigned: boolean value as per IDNA specification.
292 * @usestd3asciirules: boolean value as per IDNA specification.
294 * The ToUnicode operation takes a sequence of Unicode code points
295 * that make up one label and returns a sequence of Unicode code
296 * points. If the input sequence is a label in ACE form, then the
297 * result is an equivalent internationalized label that is not in ACE
298 * form, otherwise the original sequence is returned unaltered.
300 * ToUnicode never fails. If any step fails, then the original input
301 * sequence is returned immediately in that step.
303 * The ToUnicode output never contains more code points than its
304 * input. Note that the number of octets needed to represent a
305 * sequence of code points depends on the particular character encoding used.
308 * The inputs to ToUnicode are a sequence of code points, the
309 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
310 * ToUnicode is always a sequence of Unicode code points.
312 * Return value: Returns error condition, but it must only be used for
313 * debugging purposes. The output buffer is always
314 * guaranteed to contain the correct data according to
315 * the specification (sans malloc induced errors). NB!
316 * This means that you normally ignore the return code
317 * from this function, as checking it means breaking the standard.
321 idna_to_unicode (const unsigned long *in
, size_t inlen
,
322 unsigned long *out
, size_t * outlen
,
323 int allowunassigned
, int usestd3asciirules
)
326 int outlensave
= *outlen
;
329 p
= stringprep_ucs4_to_utf8 (in
, inlen
, NULL
, NULL
);
331 return IDNA_MALLOC_ERROR
;
333 p
= realloc (p
, BUFSIZ
);
335 return IDNA_MALLOC_ERROR
;
337 rc
= idna_to_unicode_internal (in
, inlen
, out
, outlen
,
338 allowunassigned
, usestd3asciirules
,
340 if (rc
!= IDNA_SUCCESS
)
343 sizeof (in
[0]) * (inlen
< outlensave
? inlen
: outlensave
));