1 /* idna.c Convert to or from IDN strings.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 * @in: input array with unicode code points.
29 * @inlen: length of input array with unicode code points.
30 * @out: output zero terminated string that must have room for at
31 * least 63 characters plus the terminating zero.
32 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
34 * The ToASCII operation takes a sequence of Unicode code points that make
35 * up one label and transforms it into a sequence of code points in the
36 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
37 * resulting sequence are equivalent labels.
39 * It is important to note that the ToASCII operation can fail. ToASCII
40 * fails if any step of it fails. If any step of the ToASCII operation
41 * fails on any label in a domain name, that domain name MUST NOT be used
42 * as an internationalized domain name. The method for dealing with this
43 * failure is application-specific.
45 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
46 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
47 * sequence of ASCII code points or a failure condition.
49 * ToASCII never alters a sequence of code points that are all in the ASCII
50 * range to begin with (although it could fail). Applying the ToASCII
51 * operation multiple times has exactly the same effect as applying it just once.
54 * Return value: Returns 0 on success, or an error code.
57 idna_to_ascii_4i (const uint32_t * in
, size_t inlen
, char *out
, int flags
)
60 uint32_t *src
; /* XXX don't need to copy data? */
64 * ToASCII consists of the following steps:
66 * 1. If all code points in the sequence are in the ASCII range (0..7F)
67 * then skip to step 3.
75 for (i
= 0; i
< inlen
; i
++)
80 src
= malloc (sizeof (in
[0]) * (inlen
+ 1));
82 return IDNA_MALLOC_ERROR
;
84 memcpy (src
, in
, sizeof (in
[0]) * inlen
);
92 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
93 * an error. The AllowUnassigned flag is used in [NAMEPREP].
99 p
= stringprep_ucs4_to_utf8 (in
, inlen
, NULL
, NULL
);
101 return IDNA_MALLOC_ERROR
;
106 len
= 2 * len
+ 10; /* XXX better guess? */
107 p
= realloc (p
, len
);
109 return IDNA_MALLOC_ERROR
;
111 if (flags
& IDNA_ALLOW_UNASSIGNED
)
112 rc
= stringprep_nameprep (p
, len
);
114 rc
= stringprep_nameprep_no_unassigned (p
, len
);
116 while (rc
== STRINGPREP_TOO_SMALL_BUFFER
);
118 if (rc
!= STRINGPREP_OK
)
121 return IDNA_STRINGPREP_ERROR
;
124 src
= stringprep_utf8_to_ucs4 (p
, -1, NULL
);
131 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
133 * (a) Verify the absence of non-LDH ASCII code points; that is,
134 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
136 * (b) Verify the absence of leading and trailing hyphen-minus;
137 * that is, the absence of U+002D at the beginning and end of the sequence.
141 if (flags
& IDNA_USE_STD3_ASCII_RULES
)
145 for (i
= 0; src
[i
]; i
++)
146 if (src
[i
] <= 0x2C || src
[i
] == 0x2E || src
[i
] == 0x2F ||
147 (src
[i
] >= 0x3A && src
[i
] <= 0x40) ||
148 (src
[i
] >= 0x5B && src
[i
] <= 0x60) ||
149 (src
[i
] >= 0x7B && src
[i
] <= 0x7F))
152 return IDNA_CONTAINS_LDH
;
155 if (src
[0] == 0x002D || (i
> 0 && src
[i
- 1] == 0x002D))
158 return IDNA_CONTAINS_MINUS
;
163 * 4. If all code points in the sequence are in the ASCII range
164 * (0..7F), then skip to step 8.
172 for (i
= 0; src
[i
]; i
++)
176 /* copy string to output buffer if we are about to skip to step8 */
187 * 5. Verify that the sequence does NOT begin with the ACE prefix.
196 for (i
= 0; match
&& i
< strlen (IDNA_ACE_PREFIX
); i
++)
197 if (((uint32_t) IDNA_ACE_PREFIX
[i
] & 0xFF) != src
[i
])
202 return IDNA_CONTAINS_ACE_PREFIX
;
207 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
208 * and fail if there is an error.
210 for (len
= 0; src
[len
]; len
++)
213 outlen
= 63 - strlen (IDNA_ACE_PREFIX
);
214 rc
= punycode_encode (len
, src
, NULL
,
215 &outlen
, &out
[strlen (IDNA_ACE_PREFIX
)]);
217 if (rc
!= PUNYCODE_SUCCESS
)
218 return IDNA_PUNYCODE_ERROR
;
219 out
[strlen (IDNA_ACE_PREFIX
) + outlen
] = '\0';
222 * 7. Prepend the ACE prefix.
225 memcpy (out
, IDNA_ACE_PREFIX
, strlen (IDNA_ACE_PREFIX
));
228 * 8. Verify that the number of code points is in the range 1 to 63 inclusive.
233 if (strlen (out
) < 1 || strlen (out
) > 63)
234 return IDNA_INVALID_LENGTH
;
240 idna_to_unicode_internal (char *utf8in
, size_t utf8len
,
241 uint32_t * out
, size_t * outlen
, int flags
)
247 * 1. If all code points in the sequence are in the ASCII range (0..7F)
248 * then skip to step 3.
256 for (i
= 0; utf8in
[i
]; i
++)
257 if (utf8in
[i
] & ~0x7F)
264 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
265 * error. (If step 3 of ToASCII is also performed here, it will not
266 * affect the overall behavior of ToUnicode, but it is not
267 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
270 if (flags
& IDNA_ALLOW_UNASSIGNED
)
271 rc
= stringprep_nameprep (utf8in
, utf8len
);
273 rc
= stringprep_nameprep_no_unassigned (utf8in
, utf8len
);
275 if (rc
!= STRINGPREP_OK
)
276 return IDNA_STRINGPREP_ERROR
;
278 /* 3. Verify that the sequence begins with the ACE prefix, and save a
279 * copy of the sequence.
283 if (memcmp (IDNA_ACE_PREFIX
, utf8in
, strlen (IDNA_ACE_PREFIX
)) != 0)
284 return IDNA_NO_ACE_PREFIX
;
286 /* 4. Remove the ACE prefix.
289 memmove (utf8in
, &utf8in
[strlen (IDNA_ACE_PREFIX
)],
290 strlen (utf8in
) - strlen (IDNA_ACE_PREFIX
) + 1);
292 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
293 * and fail if there is an error. Save a copy of the result of this step.
297 (*outlen
)--; /* reserve one for the zero */
299 rc
= punycode_decode (strlen (utf8in
), utf8in
, outlen
, out
, NULL
);
300 if (rc
!= PUNYCODE_SUCCESS
)
301 return IDNA_PUNYCODE_ERROR
;
303 out
[*outlen
] = 0; /* add zero */
308 rc
= idna_to_ascii_4i (out
, *outlen
, tmpout
, flags
);
309 if (rc
!= IDNA_SUCCESS
)
312 /* 7. Verify that the result of step 6 matches the saved copy from
313 * step 3, using a case-insensitive ASCII comparison.
316 if (strcasecmp (utf8in
, tmpout
+ strlen (IDNA_ACE_PREFIX
)) != 0)
317 return IDNA_ROUNDTRIP_VERIFY_ERROR
;
319 /* 8. Return the saved copy from step 5.
326 * idna_to_unicode_44i
327 * @in: input array with unicode code points.
328 * @inlen: length of input array with unicode code points.
329 * @out: output array with unicode code points.
330 * @outlen: on input, maximum size of output array with unicode code points,
331 * on exit, actual size of output array with unicode code points.
332 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
334 * The ToUnicode operation takes a sequence of Unicode code points
335 * that make up one label and returns a sequence of Unicode code
336 * points. If the input sequence is a label in ACE form, then the
337 * result is an equivalent internationalized label that is not in ACE
338 * form, otherwise the original sequence is returned unaltered.
340 * ToUnicode never fails. If any step fails, then the original input
341 * sequence is returned immediately in that step.
343 * The ToUnicode output never contains more code points than its
344 * input. Note that the number of octets needed to represent a
345 * sequence of code points depends on the particular character encoding used.
348 * The inputs to ToUnicode are a sequence of code points, the
349 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
350 * ToUnicode is always a sequence of Unicode code points.
352 * Return value: Returns error condition, but it must only be used for
353 * debugging purposes. The output buffer is always
354 * guaranteed to contain the correct data according to
355 * the specification (sans malloc induced errors). NB!
356 * This means that you normally ignore the return code
357 * from this function, as checking it means breaking the standard.
361 idna_to_unicode_44i (const uint32_t * in
, size_t inlen
,
362 uint32_t * out
, size_t * outlen
, int flags
)
365 size_t outlensave
= *outlen
;
368 p
= stringprep_ucs4_to_utf8 (in
, inlen
, NULL
, NULL
);
370 return IDNA_MALLOC_ERROR
;
372 p
= realloc (p
, BUFSIZ
);
374 return IDNA_MALLOC_ERROR
;
376 rc
= idna_to_unicode_internal (p
, BUFSIZ
, out
, outlen
, flags
);
377 if (rc
!= IDNA_SUCCESS
)
379 memcpy (out
, in
, sizeof (in
[0]) * (inlen
< outlensave
?
380 inlen
: outlensave
));
389 /* Wrappers that handle several labels */
393 * @input: zero terminated input Unicode string.
394 * @output: pointer to newly allocated output string.
395 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
397 * Convert UCS-4 domain name to ASCII string. The domain name may
398 * contain several labels, separated by dots. The output buffer must
399 * be deallocated by the caller.
401 * Return value: Returns IDNA_SUCCESS on success, or error code.
404 idna_to_ascii_4z (const uint32_t * input
, char **output
, int flags
)
406 const uint32_t *start
= input
;
407 const uint32_t *end
= input
;
418 /* 1) Whenever dots are used as label separators, the following
419 characters MUST be recognized as dots: U+002E (full stop),
420 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
421 U+FF61 (halfwidth ideographic full stop). */
424 *end
!= 0x3002 && *end
!= 0xFF0E && *end
!= 0xFF61; end
++)
427 /* Handle empty trailing labels. The RFC is not clear on this,
428 the text that mandates this behaviour is inside a parenthesis in
429 the terminology section. */
430 if (end
== start
&& *end
== '\0')
432 strcpy (buf
, out
? "" : ".");
436 rc
= idna_to_ascii_4i (start
, end
- start
, buf
, flags
);
437 if (rc
!= IDNA_SUCCESS
)
443 out
= realloc (out
, strlen (out
) + 1 + strlen (buf
) + 1);
445 return IDNA_MALLOC_ERROR
;
451 out
= (char *) malloc (strlen (buf
) + 1);
453 return IDNA_MALLOC_ERROR
;
468 * @input: zero terminated input UTF-8 string.
469 * @output: pointer to newly allocated output string.
470 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
472 * Convert UTF-8 domain name to ASCII string. The domain name may
473 * contain several labels, separated by dots. The output buffer must
474 * be deallocated by the caller.
476 * Return value: Returns IDNA_SUCCESS on success, or error code.
479 idna_to_ascii_8z (const char *input
, char **output
, int flags
)
485 ucs4
= stringprep_utf8_to_ucs4 (input
, -1, &ucs4len
);
487 return IDNA_ICONV_ERROR
;
489 rc
= idna_to_ascii_4z (ucs4
, output
, flags
);
499 * @input: zero terminated input string encoded in the current locale's character set.
500 * @output: pointer to newly allocated output string.
501 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
503 * Convert domain name in the locale's encoding to ASCII string. The
504 * domain name may contain several labels, separated by dots. The
505 * output buffer must be deallocated by the caller.
507 * Return value: Returns IDNA_SUCCESS on success, or error code.
510 idna_to_ascii_lz (const char *input
, char **output
, int flags
)
515 utf8
= stringprep_locale_to_utf8 (input
);
517 return IDNA_ICONV_ERROR
;
519 rc
= idna_to_ascii_8z (utf8
, output
, flags
);
527 * idna_to_unicode_4z4z:
528 * @input: zero-terminated Unicode string.
529 * @output: pointer to newly allocated output Unicode string.
530 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
532 * Convert possibly ACE encoded domain name in UCS-4 format into a
533 * UCS-4 string. The domain name may contain several labels,
534 * separated by dots. The output buffer must be deallocated by the caller.
537 * Return value: Returns IDNA_SUCCESS on success, or error code.
540 idna_to_unicode_4z4z (const uint32_t * input
, uint32_t ** output
, int flags
)
542 const uint32_t *start
= input
;
543 const uint32_t *end
= input
;
546 uint32_t *out
= NULL
;
556 /* 1) Whenever dots are used as label separators, the following
557 characters MUST be recognized as dots: U+002E (full stop),
558 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
559 U+FF61 (halfwidth ideographic full stop). */
562 *end
!= 0x3002 && *end
!= 0xFF0E && *end
!= 0xFF61; end
++)
565 buflen
= end
- start
;
566 buf
= malloc (sizeof (buf
[0]) * (buflen
+ 1));
568 return IDNA_MALLOC_ERROR
;
570 rc
= idna_to_unicode_44i (start
, end
- start
, buf
, &buflen
, flags
);
571 /* don't check rc as per specification! */
575 out
= realloc (out
, sizeof (out
[0]) * (outlen
+ 1 + buflen
+ 1));
577 return IDNA_MALLOC_ERROR
;
578 out
[outlen
++] = 0x002E; /* '.' (full stop) */
579 memcpy (out
+ outlen
, buf
, sizeof (buf
[0]) * buflen
);
601 * idna_to_unicode_8z4z:
602 * @input: zero-terminated UTF-8 string.
603 * @output: pointer to newly allocated output Unicode string.
604 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
606 * Convert possibly ACE encoded domain name in UTF-8 format into a
607 * UCS-4 string. The domain name may contain several labels,
608 * separated by dots. The output buffer must be deallocated by the caller.
611 * Return value: Returns IDNA_SUCCESS on success, or error code.
614 idna_to_unicode_8z4z (const char *input
, uint32_t ** output
, int flags
)
620 ucs4
= stringprep_utf8_to_ucs4 (input
, -1, &ucs4len
);
622 return IDNA_ICONV_ERROR
;
624 rc
= idna_to_unicode_4z4z (ucs4
, output
, flags
);
631 * idna_to_unicode_8z8z:
632 * @input: zero-terminated UTF-8 string.
633 * @output: pointer to newly allocated output UTF-8 string.
634 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
636 * Convert possibly ACE encoded domain name in UTF-8 format into a
637 * UTF-8 string. The domain name may contain several labels,
638 * separated by dots. The output buffer must be deallocated by the caller.
641 * Return value: Returns IDNA_SUCCESS on success, or error code.
644 idna_to_unicode_8z8z (const char *input
, char **output
, int flags
)
649 rc
= idna_to_unicode_8z4z (input
, &ucs4
, flags
);
650 *output
= stringprep_ucs4_to_utf8 (ucs4
, -1, NULL
, NULL
);
654 return IDNA_ICONV_ERROR
;
660 * idna_to_unicode_8zlz:
661 * @input: zero-terminated UTF-8 string.
662 * @output: pointer to newly allocated output string encoded in the
663 * current locale's character set.
664 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
666 * Convert possibly ACE encoded domain name in UTF-8 format into a
667 * string encoded in the current locale's character set. The domain
668 * name may contain several labels, separated by dots. The output
669 * buffer must be deallocated by the caller.
671 * Return value: Returns IDNA_SUCCESS on success, or error code.
674 idna_to_unicode_8zlz (const char *input
, char **output
, int flags
)
679 rc
= idna_to_unicode_8z8z (input
, &utf8
, flags
);
680 *output
= stringprep_utf8_to_locale (utf8
);
684 return IDNA_ICONV_ERROR
;
690 * idna_to_unicode_lzlz:
691 * @input: zero-terminated string encoded in the current locale's character set.
693 * @output: pointer to newly allocated output string encoded in the
694 * current locale's character set.
695 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
697 * Convert possibly ACE encoded domain name in the locale's character
698 * set into a string encoded in the current locale's character set.
699 * The domain name may contain several labels, separated by dots. The
700 * output buffer must be deallocated by the caller.
702 * Return value: Returns IDNA_SUCCESS on success, or error code.
705 idna_to_unicode_lzlz (const char *input
, char **output
, int flags
)
710 utf8
= stringprep_locale_to_utf8 (input
);
712 return IDNA_ICONV_ERROR
;
714 rc
= idna_to_unicode_8zlz (utf8
, output
, flags
);
723 * The IANA allocated prefix to use for IDNA. "xn--"