1 /* idna.c --- Convert to or from IDN strings.
2 * Copyright (C) 2002, 2003, 2004 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 #include <stringprep.h>
33 #define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \
34 (c) == 0xFF0E || (c) == 0xFF61)
40 * @in: input array with unicode code points.
41 * @inlen: length of input array with unicode code points.
42 * @out: output zero terminated string that must have room for at
43 * least 63 characters plus the terminating zero.
44 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
45 * %IDNA_USE_STD3_ASCII_RULES.
47 * The ToASCII operation takes a sequence of Unicode code points that make
48 * up one label and transforms it into a sequence of code points in the
49 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
50 * resulting sequence are equivalent labels.
52 * It is important to note that the ToASCII operation can fail. ToASCII
53 * fails if any step of it fails. If any step of the ToASCII operation
54 * fails on any label in a domain name, that domain name MUST NOT be used
55 * as an internationalized domain name. The method for dealing with this
56 * failure is application-specific.
58 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
59 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
60 * sequence of ASCII code points or a failure condition.
62 * ToASCII never alters a sequence of code points that are all in the ASCII
63 * range to begin with (although it could fail). Applying the ToASCII
64 * operation multiple times has exactly the same effect as applying it just once.
67 * Return value: Returns 0 on success, or an #Idna_rc error code.
70 idna_to_ascii_4i (const uint32_t * in
, size_t inlen
, char *out
, int flags
)
73 uint32_t *src
; /* XXX don't need to copy data? */
77 * ToASCII consists of the following steps:
79 * 1. If all code points in the sequence are in the ASCII range (0..7F)
80 * then skip to step 3.
88 for (i
= 0; i
< inlen
; i
++)
93 src
= malloc (sizeof (in
[0]) * (inlen
+ 1));
95 return IDNA_MALLOC_ERROR
;
97 memcpy (src
, in
, sizeof (in
[0]) * inlen
);
105 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
106 * an error. The AllowUnassigned flag is used in [NAMEPREP].
112 p
= stringprep_ucs4_to_utf8 (in
, inlen
, NULL
, NULL
);
114 return IDNA_MALLOC_ERROR
;
121 len
= 2 * len
+ 10; /* XXX better guess? */
122 newp
= realloc (p
, len
);
126 return IDNA_MALLOC_ERROR
;
130 if (flags
& IDNA_ALLOW_UNASSIGNED
)
131 rc
= stringprep_nameprep (p
, len
);
133 rc
= stringprep_nameprep_no_unassigned (p
, len
);
135 while (rc
== STRINGPREP_TOO_SMALL_BUFFER
);
137 if (rc
!= STRINGPREP_OK
)
140 return IDNA_STRINGPREP_ERROR
;
143 src
= stringprep_utf8_to_ucs4 (p
, -1, NULL
);
150 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
152 * (a) Verify the absence of non-LDH ASCII code points; that is,
153 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
155 * (b) Verify the absence of leading and trailing hyphen-minus;
156 * that is, the absence of U+002D at the beginning and end of the sequence.
160 if (flags
& IDNA_USE_STD3_ASCII_RULES
)
164 for (i
= 0; src
[i
]; i
++)
165 if (src
[i
] <= 0x2C || src
[i
] == 0x2E || src
[i
] == 0x2F ||
166 (src
[i
] >= 0x3A && src
[i
] <= 0x40) ||
167 (src
[i
] >= 0x5B && src
[i
] <= 0x60) ||
168 (src
[i
] >= 0x7B && src
[i
] <= 0x7F))
171 return IDNA_CONTAINS_NON_LDH
;
174 if (src
[0] == 0x002D || (i
> 0 && src
[i
- 1] == 0x002D))
177 return IDNA_CONTAINS_MINUS
;
182 * 4. If all code points in the sequence are in the ASCII range
183 * (0..7F), then skip to step 8.
191 for (i
= 0; src
[i
]; i
++)
195 /* copy string to output buffer if we are about to skip to step8 */
206 * 5. Verify that the sequence does NOT begin with the ACE prefix.
215 for (i
= 0; match
&& i
< strlen (IDNA_ACE_PREFIX
); i
++)
216 if (((uint32_t) IDNA_ACE_PREFIX
[i
] & 0xFF) != src
[i
])
221 return IDNA_CONTAINS_ACE_PREFIX
;
226 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
227 * and fail if there is an error.
229 for (len
= 0; src
[len
]; len
++)
232 outlen
= 63 - strlen (IDNA_ACE_PREFIX
);
233 rc
= punycode_encode (len
, src
, NULL
,
234 &outlen
, &out
[strlen (IDNA_ACE_PREFIX
)]);
235 if (rc
!= PUNYCODE_SUCCESS
)
238 return IDNA_PUNYCODE_ERROR
;
240 out
[strlen (IDNA_ACE_PREFIX
) + outlen
] = '\0';
243 * 7. Prepend the ACE prefix.
246 memcpy (out
, IDNA_ACE_PREFIX
, strlen (IDNA_ACE_PREFIX
));
249 * 8. Verify that the number of code points is in the range 1 to 63
250 * inclusive (0 is excluded).
255 if (strlen (out
) < 1 || strlen (out
) > 63)
256 return IDNA_INVALID_LENGTH
;
261 /* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */
263 idna_to_unicode_internal (char *utf8in
,
264 uint32_t * out
, size_t * outlen
, int flags
)
268 size_t utf8len
= strlen (utf8in
) + 1;
272 * ToUnicode consists of the following steps:
274 * 1. If the sequence contains any code points outside the ASCII range
275 * (0..7F) then proceed to step 2, otherwise skip to step 3.
283 for (i
= 0; utf8in
[i
]; i
++)
284 if (utf8in
[i
] & ~0x7F)
291 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
292 * error. (If step 3 of ToASCII is also performed here, it will not
293 * affect the overall behavior of ToUnicode, but it is not
294 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
298 char *newp
= realloc (utf8in
, utf8len
+ addlen
);
302 return IDNA_MALLOC_ERROR
;
305 if (flags
& IDNA_ALLOW_UNASSIGNED
)
306 rc
= stringprep_nameprep (utf8in
, utf8len
+ addlen
);
308 rc
= stringprep_nameprep_no_unassigned (utf8in
, utf8len
+ addlen
);
311 while (rc
== STRINGPREP_TOO_SMALL_BUFFER
);
313 if (rc
!= STRINGPREP_OK
)
316 return IDNA_STRINGPREP_ERROR
;
319 /* 3. Verify that the sequence begins with the ACE prefix, and save a
320 * copy of the sequence.
324 if (memcmp (IDNA_ACE_PREFIX
, utf8in
, strlen (IDNA_ACE_PREFIX
)) != 0)
327 return IDNA_NO_ACE_PREFIX
;
330 /* 4. Remove the ACE prefix.
333 memmove (utf8in
, &utf8in
[strlen (IDNA_ACE_PREFIX
)],
334 strlen (utf8in
) - strlen (IDNA_ACE_PREFIX
) + 1);
336 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
337 * and fail if there is an error. Save a copy of the result of this step.
341 (*outlen
)--; /* reserve one for the zero */
343 rc
= punycode_decode (strlen (utf8in
), utf8in
, outlen
, out
, NULL
);
344 if (rc
!= PUNYCODE_SUCCESS
)
347 return IDNA_PUNYCODE_ERROR
;
350 out
[*outlen
] = 0; /* add zero */
355 rc
= idna_to_ascii_4i (out
, *outlen
, tmpout
, flags
);
356 if (rc
!= IDNA_SUCCESS
)
362 /* 7. Verify that the result of step 6 matches the saved copy from
363 * step 3, using a case-insensitive ASCII comparison.
366 if (strcasecmp (utf8in
, tmpout
+ strlen (IDNA_ACE_PREFIX
)) != 0)
369 return IDNA_ROUNDTRIP_VERIFY_ERROR
;
372 /* 8. Return the saved copy from step 5.
380 * idna_to_unicode_44i
381 * @in: input array with unicode code points.
382 * @inlen: length of input array with unicode code points.
383 * @out: output array with unicode code points.
384 * @outlen: on input, maximum size of output array with unicode code points,
385 * on exit, actual size of output array with unicode code points.
386 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
387 * %IDNA_USE_STD3_ASCII_RULES.
389 * The ToUnicode operation takes a sequence of Unicode code points
390 * that make up one label and returns a sequence of Unicode code
391 * points. If the input sequence is a label in ACE form, then the
392 * result is an equivalent internationalized label that is not in ACE
393 * form, otherwise the original sequence is returned unaltered.
395 * ToUnicode never fails. If any step fails, then the original input
396 * sequence is returned immediately in that step.
398 * The Punycode decoder can never output more code points than it
399 * inputs, but Nameprep can, and therefore ToUnicode can. Note that
400 * the number of octets needed to represent a sequence of code points
401 * depends on the particular character encoding used.
403 * The inputs to ToUnicode are a sequence of code points, the
404 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
405 * ToUnicode is always a sequence of Unicode code points.
407 * Return value: Returns #Idna_rc error condition, but it must only be
408 * used for debugging purposes. The output buffer is always
409 * guaranteed to contain the correct data according to the
410 * specification (sans malloc induced errors). NB! This means that
411 * you normally ignore the return code from this function, as
412 * checking it means breaking the standard.
415 idna_to_unicode_44i (const uint32_t * in
, size_t inlen
,
416 uint32_t * out
, size_t * outlen
, int flags
)
419 size_t outlensave
= *outlen
;
422 p
= stringprep_ucs4_to_utf8 (in
, inlen
, NULL
, NULL
);
424 return IDNA_MALLOC_ERROR
;
426 rc
= idna_to_unicode_internal (p
, out
, outlen
, flags
);
427 if (rc
!= IDNA_SUCCESS
)
429 memcpy (out
, in
, sizeof (in
[0]) * (inlen
< outlensave
?
430 inlen
: outlensave
));
434 /* p is freed in idna_to_unicode_internal. */
439 /* Wrappers that handle several labels */
443 * @input: zero terminated input Unicode string.
444 * @output: pointer to newly allocated output string.
445 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
446 * %IDNA_USE_STD3_ASCII_RULES.
448 * Convert UCS-4 domain name to ASCII string. The domain name may
449 * contain several labels, separated by dots. The output buffer must
450 * be deallocated by the caller.
452 * Return value: Returns IDNA_SUCCESS on success, or error code.
455 idna_to_ascii_4z (const uint32_t * input
, char **output
, int flags
)
457 const uint32_t *start
= input
;
458 const uint32_t *end
= input
;
463 /* 1) Whenever dots are used as label separators, the following
464 characters MUST be recognized as dots: U+002E (full stop),
465 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
466 U+FF61 (halfwidth ideographic full stop). */
470 /* Handle implicit zero-length root label. */
471 *output
= malloc (1);
473 return IDNA_MALLOC_ERROR
;
474 strcpy (*output
, "");
478 if (DOTP (input
[0]) && input
[1] == 0)
480 /* Handle explicit zero-length root label. */
481 *output
= malloc (2);
483 return IDNA_MALLOC_ERROR
;
484 strcpy (*output
, ".");
493 for (; *end
&& !DOTP (*end
); end
++)
496 if (*end
== '\0' && start
== end
)
498 /* Handle explicit zero-length root label. */
503 rc
= idna_to_ascii_4i (start
, end
- start
, buf
, flags
);
504 if (rc
!= IDNA_SUCCESS
)
510 char *newp
= realloc (out
, strlen (out
) + 1 + strlen (buf
) + 1);
514 return IDNA_MALLOC_ERROR
;
522 out
= (char *) malloc (strlen (buf
) + 1);
524 return IDNA_MALLOC_ERROR
;
539 * @input: zero terminated input UTF-8 string.
540 * @output: pointer to newly allocated output string.
541 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
542 * %IDNA_USE_STD3_ASCII_RULES.
544 * Convert UTF-8 domain name to ASCII string. The domain name may
545 * contain several labels, separated by dots. The output buffer must
546 * be deallocated by the caller.
548 * Return value: Returns IDNA_SUCCESS on success, or error code.
551 idna_to_ascii_8z (const char *input
, char **output
, int flags
)
557 ucs4
= stringprep_utf8_to_ucs4 (input
, -1, &ucs4len
);
559 return IDNA_ICONV_ERROR
;
561 rc
= idna_to_ascii_4z (ucs4
, output
, flags
);
571 * @input: zero terminated input string encoded in the current locale's character set.
573 * @output: pointer to newly allocated output string.
574 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
575 * %IDNA_USE_STD3_ASCII_RULES.
577 * Convert domain name in the locale's encoding to ASCII string. The
578 * domain name may contain several labels, separated by dots. The
579 * output buffer must be deallocated by the caller.
581 * Return value: Returns IDNA_SUCCESS on success, or error code.
584 idna_to_ascii_lz (const char *input
, char **output
, int flags
)
589 utf8
= stringprep_locale_to_utf8 (input
);
591 return IDNA_ICONV_ERROR
;
593 rc
= idna_to_ascii_8z (utf8
, output
, flags
);
601 * idna_to_unicode_4z4z:
602 * @input: zero-terminated Unicode string.
603 * @output: pointer to newly allocated output Unicode string.
604 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
605 * %IDNA_USE_STD3_ASCII_RULES.
607 * Convert possibly ACE encoded domain name in UCS-4 format into a
608 * UCS-4 string. The domain name may contain several labels,
609 * separated by dots. The output buffer must be deallocated by the caller.
612 * Return value: Returns IDNA_SUCCESS on success, or error code.
615 idna_to_unicode_4z4z (const uint32_t * input
, uint32_t ** output
, int flags
)
617 const uint32_t *start
= input
;
618 const uint32_t *end
= input
;
621 uint32_t *out
= NULL
;
631 for (; *end
&& !DOTP (*end
); end
++)
634 buflen
= end
- start
;
635 buf
= malloc (sizeof (buf
[0]) * (buflen
+ 1));
637 return IDNA_MALLOC_ERROR
;
639 rc
= idna_to_unicode_44i (start
, end
- start
, buf
, &buflen
, flags
);
640 /* don't check rc as per specification! */
644 uint32_t *newp
= realloc (out
,
646 * (outlen
+ 1 + buflen
+ 1));
651 return IDNA_MALLOC_ERROR
;
654 out
[outlen
++] = 0x002E; /* '.' (full stop) */
655 memcpy (out
+ outlen
, buf
, sizeof (buf
[0]) * buflen
);
677 * idna_to_unicode_8z4z:
678 * @input: zero-terminated UTF-8 string.
679 * @output: pointer to newly allocated output Unicode string.
680 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
681 * %IDNA_USE_STD3_ASCII_RULES.
683 * Convert possibly ACE encoded domain name in UTF-8 format into a
684 * UCS-4 string. The domain name may contain several labels,
685 * separated by dots. The output buffer must be deallocated by the caller.
688 * Return value: Returns IDNA_SUCCESS on success, or error code.
691 idna_to_unicode_8z4z (const char *input
, uint32_t ** output
, int flags
)
697 ucs4
= stringprep_utf8_to_ucs4 (input
, -1, &ucs4len
);
699 return IDNA_ICONV_ERROR
;
701 rc
= idna_to_unicode_4z4z (ucs4
, output
, flags
);
708 * idna_to_unicode_8z8z:
709 * @input: zero-terminated UTF-8 string.
710 * @output: pointer to newly allocated output UTF-8 string.
711 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
712 * %IDNA_USE_STD3_ASCII_RULES.
714 * Convert possibly ACE encoded domain name in UTF-8 format into a
715 * UTF-8 string. The domain name may contain several labels,
716 * separated by dots. The output buffer must be deallocated by the caller.
719 * Return value: Returns IDNA_SUCCESS on success, or error code.
722 idna_to_unicode_8z8z (const char *input
, char **output
, int flags
)
727 rc
= idna_to_unicode_8z4z (input
, &ucs4
, flags
);
728 *output
= stringprep_ucs4_to_utf8 (ucs4
, -1, NULL
, NULL
);
732 return IDNA_ICONV_ERROR
;
738 * idna_to_unicode_8zlz:
739 * @input: zero-terminated UTF-8 string.
740 * @output: pointer to newly allocated output string encoded in the
741 * current locale's character set.
742 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
743 * %IDNA_USE_STD3_ASCII_RULES.
745 * Convert possibly ACE encoded domain name in UTF-8 format into a
746 * string encoded in the current locale's character set. The domain
747 * name may contain several labels, separated by dots. The output
748 * buffer must be deallocated by the caller.
750 * Return value: Returns IDNA_SUCCESS on success, or error code.
753 idna_to_unicode_8zlz (const char *input
, char **output
, int flags
)
758 rc
= idna_to_unicode_8z8z (input
, &utf8
, flags
);
759 *output
= stringprep_utf8_to_locale (utf8
);
763 return IDNA_ICONV_ERROR
;
769 * idna_to_unicode_lzlz:
770 * @input: zero-terminated string encoded in the current locale's character set.
772 * @output: pointer to newly allocated output string encoded in the
773 * current locale's character set.
774 * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
775 * %IDNA_USE_STD3_ASCII_RULES.
777 * Convert possibly ACE encoded domain name in the locale's character
778 * set into a string encoded in the current locale's character set.
779 * The domain name may contain several labels, separated by dots. The
780 * output buffer must be deallocated by the caller.
782 * Return value: Returns IDNA_SUCCESS on success, or error code.
785 idna_to_unicode_lzlz (const char *input
, char **output
, int flags
)
790 utf8
= stringprep_locale_to_utf8 (input
);
792 return IDNA_ICONV_ERROR
;
794 rc
= idna_to_unicode_8zlz (utf8
, output
, flags
);
803 * The IANA allocated prefix to use for IDNA. "xn--"
808 * @IDNA_SUCCESS: Successful operation. This value is guaranteed to
809 * always be zero, the remaining ones are only guaranteed to hold
810 * non-zero values, for logical comparison purposes.
811 * @IDNA_STRINGPREP_ERROR: Error during string preparation.
812 * @IDNA_PUNYCODE_ERROR: Error during punycode operation.
813 * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that
814 * the string contains non-LDH ASCII characters.
815 * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that
816 * the string contains a leading or trailing hyphen-minus (U+002D).
817 * @IDNA_INVALID_LENGTH: The final output string is not within the
818 * (inclusive) range 1 to 63 characters.
819 * @IDNA_NO_ACE_PREFIX: The string does not contain the ACE prefix (for ToUnicode).
821 * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output
822 * string does not equal the input.
823 * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for ToASCII).
825 * @IDNA_ICONV_ERROR: Could not convert string in locale encoding.
826 * @IDNA_MALLOC_ERROR: Could not allocate buffer (this is typically a fatal error).
828 * @IDNA_DLOPEN_ERROR: Could not dlopen the libcidn DSO (only used
829 * internally in libc).
831 * Enumerated return codes of idna_to_ascii_4i(),
832 * idna_to_unicode_44i() functions (and functions derived from those
833 * functions). The value 0 is guaranteed to always correspond to success.
840 * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned
841 * Unicode code points.
842 * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3
843 * rules (i.e., normal host name rules).
845 * Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc.