Don't use () when talking about the name of a function.
[libidn.git] / idna.c
blob117ff5288f7e96fe12a9ce3cfa08835838cf1ae8
1 /* idna.c Convert to or from IDN strings.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "internal.h"
24 /* Core functions */
26 /**
27 * idna_to_ascii_4i
28 * @in: input array with unicode code points.
29 * @inlen: length of input array with unicode code points.
30 * @out: output zero terminated string that must have room for at
31 * least 63 characters plus the terminating zero.
32 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
34 * The ToASCII operation takes a sequence of Unicode code points that make
35 * up one label and transforms it into a sequence of code points in the
36 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
37 * resulting sequence are equivalent labels.
39 * It is important to note that the ToASCII operation can fail. ToASCII
40 * fails if any step of it fails. If any step of the ToASCII operation
41 * fails on any label in a domain name, that domain name MUST NOT be used
42 * as an internationalized domain name. The method for deadling with this
43 * failure is application-specific.
45 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
46 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
47 * sequence of ASCII code points or a failure condition.
49 * ToASCII never alters a sequence of code points that are all in the ASCII
50 * range to begin with (although it could fail). Applying the ToASCII
51 * operation multiple times has exactly the same effect as applying it just
52 * once.
54 * Return value: Returns 0 on success, or an error code.
56 int
57 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
59 size_t len, outlen;
60 uint32_t *src; /* XXX don't need to copy data? */
61 int rc;
65 * ToASCII consists of the following steps:
67 * 1. If all code points in the sequence are in the ASCII range (0..7F)
68 * then skip to step 3.
72 size_t i;
73 int inasciirange;
75 inasciirange = 1;
76 for (i = 0; i < inlen; i++)
77 if (in[i] > 0x7F)
78 inasciirange = 0;
79 if (inasciirange)
81 src = malloc (sizeof (in[0]) * (inlen + 1));
82 if (src == NULL)
83 return IDNA_MALLOC_ERROR;
85 memcpy (src, in, sizeof (in[0]) * inlen);
86 src[inlen] = 0;
88 goto step3;
93 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
94 * an error. The AllowUnassigned flag is used in [NAMEPREP].
98 char *p;
100 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
101 if (p == NULL)
102 return IDNA_MALLOC_ERROR;
104 len = strlen (p);
107 len = 2 * len + 10; /* XXX better guess? */
108 p = realloc (p, len);
109 if (p == NULL)
110 return IDNA_MALLOC_ERROR;
112 if (flags & IDNA_ALLOW_UNASSIGNED)
113 rc = stringprep_nameprep (p, len);
114 else
115 rc = stringprep_nameprep_no_unassigned (p, len);
117 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
119 if (rc != STRINGPREP_OK)
121 free (p);
122 return IDNA_STRINGPREP_ERROR;
125 src = stringprep_utf8_to_ucs4 (p, -1, NULL);
127 free (p);
130 step3:
132 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
134 * (a) Verify the absence of non-LDH ASCII code points; that is,
135 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
137 * (b) Verify the absence of leading and trailing hyphen-minus;
138 * that is, the absence of U+002D at the beginning and end of
139 * the sequence.
142 if (flags & IDNA_USE_STD3_ASCII_RULES)
144 size_t i;
146 for (i = 0; src[i]; i++)
147 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
148 (src[i] >= 0x3A && src[i] <= 0x40) ||
149 (src[i] >= 0x5B && src[i] <= 0x60) ||
150 (src[i] >= 0x7B && src[i] <= 0x7F))
152 free (src);
153 return IDNA_CONTAINS_LDH;
156 if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
158 free (src);
159 return IDNA_CONTAINS_MINUS;
164 * 4. If all code points in the sequence are in the ASCII range
165 * (0..7F), then skip to step 8.
169 size_t i;
170 int inasciirange;
172 inasciirange = 1;
173 for (i = 0; src[i]; i++)
175 if (src[i] > 0x7F)
176 inasciirange = 0;
177 /* copy string to output buffer if we are about to skip to step8 */
178 if (i < 64)
179 out[i] = src[i];
181 if (i < 64)
182 out[i] = '\0';
183 if (inasciirange)
184 goto step8;
188 * 5. Verify that the sequence does NOT begin with the ACE prefix.
193 size_t i;
194 int match;
196 match = 1;
197 for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
198 if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
199 match = 0;
200 if (match)
202 free (src);
203 return IDNA_CONTAINS_ACE_PREFIX;
208 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
209 * and fail if there is an error.
211 for (len = 0; src[len]; len++)
213 src[len] = '\0';
214 outlen = 63 - strlen (IDNA_ACE_PREFIX);
215 rc = punycode_encode (len, src, NULL,
216 &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
217 free (src);
218 if (rc != PUNYCODE_SUCCESS)
219 return IDNA_PUNYCODE_ERROR;
220 out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
223 * 7. Prepend the ACE prefix.
226 memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
229 * 8. Verify that the number of code points is in the range 1 to 63
230 * inclusive.
233 step8:
234 if (strlen (out) < 1 || strlen (out) > 63)
235 return IDNA_INVALID_LENGTH;
237 return IDNA_SUCCESS;
240 static int
241 idna_to_unicode_internal (char *utf8in, size_t utf8len,
242 uint32_t * out, size_t * outlen, int flags)
244 int rc;
245 char tmpout[64];
248 * 1. If all code points in the sequence are in the ASCII range (0..7F)
249 * then skip to step 3.
253 size_t i;
254 int inasciirange;
256 inasciirange = 1;
257 for (i = 0; utf8in[i]; i++)
258 if (utf8in[i] & ~0x7F)
259 inasciirange = 0;
260 if (inasciirange)
261 goto step3;
265 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
266 * error. (If step 3 of ToASCII is also performed here, it will not
267 * affect the overall behavior of ToUnicode, but it is not
268 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
271 if (flags & IDNA_ALLOW_UNASSIGNED)
272 rc = stringprep_nameprep (utf8in, utf8len);
273 else
274 rc = stringprep_nameprep_no_unassigned (utf8in, utf8len);
276 if (rc != STRINGPREP_OK)
277 return IDNA_STRINGPREP_ERROR;
279 /* 3. Verify that the sequence begins with the ACE prefix, and save a
280 * copy of the sequence.
283 step3:
284 if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
285 return IDNA_NO_ACE_PREFIX;
287 /* 4. Remove the ACE prefix.
290 memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
291 strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
293 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
294 * and fail if there is an error. Save a copy of the result of
295 * this step.
298 (*outlen)--; /* reserve one for the zero */
300 rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
301 if (rc != PUNYCODE_SUCCESS)
302 return IDNA_PUNYCODE_ERROR;
304 out[*outlen] = 0; /* add zero */
306 /* 6. Apply ToASCII.
309 rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
310 if (rc != IDNA_SUCCESS)
311 return rc;
313 /* 7. Verify that the result of step 6 matches the saved copy from
314 * step 3, using a case-insensitive ASCII comparison.
317 if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
318 return IDNA_ROUNDTRIP_VERIFY_ERROR;
320 /* 8. Return the saved copy from step 5.
323 return IDNA_SUCCESS;
327 * idna_to_unicode_44i
328 * @in: input array with unicode code points.
329 * @inlen: length of input array with unicode code points.
330 * @out: output array with unicode code points.
331 * @outlen: on input, maximum size of output array with unicode code points,
332 * on exit, actual size of output array with unicode code points.
333 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
335 * The ToUnicode operation takes a sequence of Unicode code points
336 * that make up one label and returns a sequence of Unicode code
337 * points. If the input sequence is a label in ACE form, then the
338 * result is an equivalent internationalized label that is not in ACE
339 * form, otherwise the original sequence is returned unaltered.
341 * ToUnicode never fails. If any step fails, then the original input
342 * sequence is returned immediately in that step.
344 * The ToUnicode output never contains more code points than its
345 * input. Note that the number of octets needed to represent a
346 * sequence of code points depends on the particular character
347 * encoding used.
349 * The inputs to ToUnicode are a sequence of code points, the
350 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
351 * ToUnicode is always a sequence of Unicode code points.
353 * Return value: Returns error condition, but it must only be used for
354 * debugging purposes. The output buffer is always
355 * guaranteed to contain the correct data according to
356 * the specification (sans malloc induced errors). NB!
357 * This means that you normally ignore the return code
358 * from this function, as checking it means breaking the
359 * standard.
362 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
363 uint32_t * out, size_t * outlen, int flags)
365 int rc;
366 size_t outlensave = *outlen;
367 char *p;
369 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
370 if (p == NULL)
371 return IDNA_MALLOC_ERROR;
373 p = realloc (p, BUFSIZ);
374 if (p == NULL)
375 return IDNA_MALLOC_ERROR;
377 rc = idna_to_unicode_internal (p, BUFSIZ, out, outlen, flags);
378 if (rc != IDNA_SUCCESS)
380 memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
381 inlen : outlensave));
382 *outlen = inlen;
385 free (p);
387 return rc;
390 /* Wrappers that handle several labels */
393 * idna_to_ascii_4z:
394 * @input: zero terminated input Unicode string.
395 * @output: pointer to newly allocated output string.
396 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
398 * Convert UCS-4 domain name to ASCII string. The domain name may
399 * contain several labels, separated by dots. The output buffer must
400 * be deallocated by the caller.
402 * Return value: Returns IDNA_SUCCESS on success, or error code.
405 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
407 const uint32_t *start = input;
408 const uint32_t *end = input;
409 char buf[64];
410 char *out = NULL;
411 int rc;
413 *output = NULL;
417 end = start;
419 /* 1) Whenever dots are used as label separators, the following
420 characters MUST be recognized as dots: U+002E (full stop),
421 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
422 U+FF61 (halfwidth ideographic full stop). */
423 for (; *end &&
424 *end != 0x002E &&
425 *end != 0x3002 && *end != 0xFF0E && *end != 0xFF61; end++)
428 /* Handle empty trailing labels. The RFC is not clear on this,
429 the text that mandate this behaviour inside a parenthesis in
430 the terminology section. */
431 if (end == start && *end == '\0')
433 strcpy(buf, out ? "" : ".");
435 else
437 rc = idna_to_ascii_4i (start, end - start, buf, flags);
438 if (rc != IDNA_SUCCESS)
439 return rc;
442 if (out)
444 out = realloc (out, strlen (out) + 1 + strlen (buf) + 1);
445 if (!out)
446 return IDNA_MALLOC_ERROR;
447 strcat (out, ".");
448 strcat (out, buf);
450 else
452 out = (char *) strdup (buf);
453 if (!out)
454 return IDNA_MALLOC_ERROR;
457 start = end + 1;
459 while (*end);
461 *output = out;
463 return IDNA_SUCCESS;
467 * idna_to_ascii_8z:
468 * @input: zero terminated input UTF-8 string.
469 * @output: pointer to newly allocated output string.
470 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
472 * Convert UTF-8 domain name to ASCII string. The domain name may
473 * contain several labels, separated by dots. The output buffer must
474 * be deallocated by the caller.
476 * Return value: Returns IDNA_SUCCESS on success, or error code.
479 idna_to_ascii_8z (const char *input, char **output, int flags)
481 uint32_t *ucs4;
482 size_t ucs4len;
483 int rc;
485 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
486 if (!ucs4)
487 return IDNA_ICONV_ERROR;
489 rc = idna_to_ascii_4z (ucs4, output, flags);
491 free (ucs4);
493 return rc;
498 * idna_to_ascii_lz:
499 * @input: zero terminated input UTF-8 string.
500 * @output: pointer to newly allocated output string.
501 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
503 * Convert domain name in the locale's encoding to ASCII string. The
504 * domain name may contain several labels, separated by dots. The
505 * output buffer must be deallocated by the caller.
507 * Return value: Returns IDNA_SUCCESS on success, or error code.
510 idna_to_ascii_lz (const char *input, char **output, int flags)
512 char *utf8;
513 int rc;
515 utf8 = stringprep_locale_to_utf8 (input);
516 if (!utf8)
517 return IDNA_ICONV_ERROR;
519 rc = idna_to_ascii_8z (utf8, output, flags);
521 free (utf8);
523 return rc;
527 * idna_to_unicode_4z4z:
528 * @input: zero-terminated Unicode string.
529 * @output: pointer to newly allocated output Unicode string.
530 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
532 * Convert possibly ACE encoded domain name in UCS-4 format into a
533 * UCS-4 string. The domain name may contain several labels,
534 * separated by dots. The output buffer must be deallocated by the
535 * caller.
537 * Return value: Returns IDNA_SUCCESS on success, or error code.
540 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
542 const uint32_t *start = input;
543 const uint32_t *end = input;
544 uint32_t *buf;
545 size_t buflen;
546 uint32_t *out = NULL;
547 size_t outlen = 0;
548 int rc;
550 *output = NULL;
554 end = start;
556 /* 1) Whenever dots are used as label separators, the following
557 characters MUST be recognized as dots: U+002E (full stop),
558 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
559 U+FF61 (halfwidth ideographic full stop). */
560 for (; *end &&
561 *end != 0x002E &&
562 *end != 0x3002 && *end != 0xFF0E && *end != 0xFF61; end++)
565 buflen = end - start;
566 buf = malloc (sizeof (buf[0]) * (buflen + 1));
567 if (!buf)
568 return IDNA_MALLOC_ERROR;
570 rc = idna_to_unicode_44i (start, end - start, buf, &buflen, flags);
571 /* don't check rc as per specification! */
573 if (out)
575 out = realloc (out, sizeof (out[0]) * (outlen + 1 + buflen + 1));
576 if (!out)
577 return IDNA_MALLOC_ERROR;
578 out[outlen++] = 0x002E; /* '.' (full stop) */
579 memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
580 outlen += buflen;
581 out[outlen] = 0x0;
582 free (buf);
584 else
586 out = buf;
587 outlen = buflen;
588 out[outlen] = 0x0;
591 start = end + 1;
593 while (*end);
595 *output = out;
597 return IDNA_SUCCESS;
601 * idna_to_unicode_8z4z:
602 * @input: zero-terminated UTF-8 string.
603 * @output: pointer to newly allocated output Unicode string.
604 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
606 * Convert possibly ACE encoded domain name in UTF-8 format into a
607 * UCS-4 string. The domain name may contain several labels,
608 * separated by dots. The output buffer must be deallocated by the
609 * caller.
611 * Return value: Returns IDNA_SUCCESS on success, or error code.
614 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
616 uint32_t *ucs4;
617 size_t ucs4len;
618 int rc;
620 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
621 if (!ucs4)
622 return IDNA_ICONV_ERROR;
624 rc = idna_to_unicode_4z4z (ucs4, output, flags);
625 free (ucs4);
627 return rc;
631 * idna_to_unicode_8z8z:
632 * @input: zero-terminated UTF-8 string.
633 * @output: pointer to newly allocated output UTF-8 string.
634 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
636 * Convert possibly ACE encoded domain name in UTF-8 format into a
637 * UTF-8 string. The domain name may contain several labels,
638 * separated by dots. The output buffer must be deallocated by the
639 * caller.
641 * Return value: Returns IDNA_SUCCESS on success, or error code.
644 idna_to_unicode_8z8z (const char *input, char **output, int flags)
646 uint32_t *ucs4;
647 int rc;
649 rc = idna_to_unicode_8z4z (input, &ucs4, flags);
650 *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
651 free (ucs4);
653 if (!*output)
654 return IDNA_ICONV_ERROR;
656 return rc;
660 * idna_to_unicode_8zlz:
661 * @input: zero-terminated UTF-8 string.
662 * @output: pointer to newly allocated output string encoded in the
663 * current locale's character set.
664 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
666 * Convert possibly ACE encoded domain name in UTF-8 format into a
667 * string encoded in the current locale's character set. The domain
668 * name may contain several labels, separated by dots. The output
669 * buffer must be deallocated by the caller.
671 * Return value: Returns IDNA_SUCCESS on success, or error code.
674 idna_to_unicode_8zlz (const char *input, char **output, int flags)
676 char *utf8;
677 int rc;
679 rc = idna_to_unicode_8z8z (input, &utf8, flags);
680 *output = stringprep_utf8_to_locale (utf8);
681 free (utf8);
683 if (!*output)
684 return IDNA_ICONV_ERROR;
686 return rc;
690 * idna_to_unicode_lzlz:
691 * @input: zero-terminated string encoded in the current locale's
692 * character set.
693 * @output: pointer to newly allocated output string encoded in the
694 * current locale's character set.
695 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
697 * Convert possibly ACE encoded domain name in the locale's character
698 * set into a string encoded in the current locale's character set.
699 * The domain name may contain several labels, separated by dots. The
700 * output buffer must be deallocated by the caller.
702 * Return value: Returns IDNA_SUCCESS on success, or error code.
705 idna_to_unicode_lzlz (const char *input, char **output, int flags)
707 char *utf8;
708 int rc;
710 utf8 = stringprep_locale_to_utf8 (input);
711 if (!utf8)
712 return IDNA_ICONV_ERROR;
714 rc = idna_to_unicode_8zlz (utf8, output, flags);
715 free (utf8);
717 return rc;
721 /* Deprecated interfaces */
724 * idna_to_ascii
725 * @in: input array with unicode code points.
726 * @inlen: length of input array with unicode code points.
727 * @out: output zero terminated string that must have room for at
728 * least 63 characters plus the terminating zero.
729 * @allowunassigned: whether to allow unassigned code points.
730 * @usestd3asciirules: whether to check input for STD3 compliance.
732 * The ToASCII operation takes a sequence of Unicode code points that make
733 * up one label and transforms it into a sequence of code points in the
734 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
735 * resulting sequence are equivalent labels.
737 * It is important to note that the ToASCII operation can fail. ToASCII
738 * fails if any step of it fails. If any step of the ToASCII operation
739 * fails on any label in a domain name, that domain name MUST NOT be used
740 * as an internationalized domain name. The method for deadling with this
741 * failure is application-specific.
743 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
744 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
745 * sequence of ASCII code points or a failure condition.
747 * ToASCII never alters a sequence of code points that are all in the ASCII
748 * range to begin with (although it could fail). Applying the ToASCII
749 * operation multiple times has exactly the same effect as applying it just
750 * once.
752 * Return value: Returns 0 on success, or an error code.
755 idna_to_ascii (const unsigned long *in, size_t inlen,
756 char *out, int allowunassigned, int usestd3asciirules)
758 int rc;
759 int flags = 0;
760 uint32_t *tmp;
761 size_t i;
763 tmp = malloc (sizeof (tmp[0]) * inlen);
764 if (!tmp)
765 return IDNA_MALLOC_ERROR;
767 if (allowunassigned)
768 flags |= IDNA_ALLOW_UNASSIGNED;
769 if (usestd3asciirules)
770 flags |= IDNA_USE_STD3_ASCII_RULES;
772 for (i = 0; i < inlen; i++)
773 tmp[i] = in[i];
774 rc = idna_to_ascii_4i (tmp, inlen, out, flags);
775 free (tmp);
777 return rc;
781 * idna_to_unicode
782 * @in: input array with unicode code points.
783 * @inlen: length of input array with unicode code points.
784 * @out: output array with unicode code points.
785 * @outlen: on input, maximum size of output array with unicode code points,
786 * on exit, actual size of output array with unicode code points.
787 * @allowunassigned: whether to allow unassigned code points.
788 * @usestd3asciirules: whether to check input for STD3 compliance.
790 * The ToUnicode operation takes a sequence of Unicode code points
791 * that make up one label and returns a sequence of Unicode code
792 * points. If the input sequence is a label in ACE form, then the
793 * result is an equivalent internationalized label that is not in ACE
794 * form, otherwise the original sequence is returned unaltered.
796 * ToUnicode never fails. If any step fails, then the original input
797 * sequence is returned immediately in that step.
799 * The ToUnicode output never contains more code points than its
800 * input. Note that the number of octets needed to represent a
801 * sequence of code points depends on the particular character
802 * encoding used.
804 * The inputs to ToUnicode are a sequence of code points, the
805 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
806 * ToUnicode is always a sequence of Unicode code points.
808 * Return value: Returns error condition, but it must only be used for
809 * debugging purposes. The output buffer is always
810 * guaranteed to contain the correct data according to
811 * the specification (sans malloc induced errors). NB!
812 * This means that you normally ignore the return code
813 * from this function, as checking it means breaking the
814 * standard.
817 idna_to_unicode (const unsigned long *in, size_t inlen,
818 unsigned long *out, size_t * outlen,
819 int allowunassigned, int usestd3asciirules)
821 int rc;
822 int flags = 0;
823 uint32_t *tmpin;
824 uint32_t *tmpout;
825 size_t i;
827 tmpin = malloc (sizeof (tmpin[0]) * inlen);
828 if (!tmpin)
829 return IDNA_MALLOC_ERROR;
830 tmpout = malloc (sizeof (tmpout[0]) * *outlen);
831 if (!tmpout)
832 return IDNA_MALLOC_ERROR;
834 if (allowunassigned)
835 flags |= IDNA_ALLOW_UNASSIGNED;
836 if (usestd3asciirules)
837 flags |= IDNA_USE_STD3_ASCII_RULES;
839 for (i = 0; i < inlen; i++)
840 tmpin[i] = in[i];
841 rc = idna_to_unicode_44i (tmpin, inlen, tmpout, outlen, flags);
842 free (tmpin);
844 for (i = 0; i < *outlen; i++)
845 out[i] = tmpout[i];
847 free (tmpout);
849 return rc;
854 * idna_to_ascii_from_ucs4:
855 * @input: zero terminated input Unicode string.
856 * @output: pointer to newly allocated output string.
857 * @allowunassigned: whether to allow unassigned code points.
858 * @usestd3asciirules: whether to check input for STD3 compliance.
860 * Convert UCS-4 domain name to ASCII string. The domain name may
861 * contain several labels, separated by dots. The output buffer must
862 * be deallocated by the caller.
864 * Return value: Returns IDNA_SUCCESS on success, or error code.
867 idna_to_ascii_from_ucs4 (const unsigned long *input, char **output,
868 int allowunassigned, int usestd3asciirules)
870 size_t inlen;
871 int rc;
872 int flags = 0;
873 uint32_t *tmp;
874 size_t i;
876 for (inlen = 0; input[inlen]; inlen++)
878 tmp = malloc (sizeof (tmp[0]) * (inlen + 1));
879 if (!tmp)
880 return IDNA_MALLOC_ERROR;
882 if (allowunassigned)
883 flags |= IDNA_ALLOW_UNASSIGNED;
884 if (usestd3asciirules)
885 flags |= IDNA_USE_STD3_ASCII_RULES;
887 for (i = 0; i < inlen; i++)
888 tmp[i] = input[i];
889 tmp[i] = 0;
890 rc = idna_to_ascii_4z (tmp, output, flags);
891 free (tmp);
893 return rc;
897 * idna_to_ascii_from_utf8:
898 * @input: zero terminated input UTF-8 string.
899 * @output: pointer to newly allocated output string.
900 * @allowunassigned: whether to allow unassigned code points.
901 * @usestd3asciirules: whether to check input for STD3 compliance.
903 * Convert UTF-8 domain name to ASCII string. The domain name may
904 * contain several labels, separated by dots. The output buffer must
905 * be deallocated by the caller.
907 * Return value: Returns IDNA_SUCCESS on success, or error code.
910 idna_to_ascii_from_utf8 (const char *input, char **output,
911 int allowunassigned, int usestd3asciirules)
913 int rc;
914 int flags = 0;
916 if (allowunassigned)
917 flags |= IDNA_ALLOW_UNASSIGNED;
918 if (usestd3asciirules)
919 flags |= IDNA_USE_STD3_ASCII_RULES;
921 rc = idna_to_ascii_8z (input, output, flags);
923 return rc;
927 * idna_to_ascii_from_locale:
928 * @input: zero terminated input UTF-8 string.
929 * @output: pointer to newly allocated output string.
930 * @allowunassigned: whether to allow unassigned code points.
931 * @usestd3asciirules: whether to check input for STD3 compliance.
933 * Convert domain name in the locale's encoding to ASCII string. The
934 * domain name may contain several labels, separated by dots. The
935 * output buffer must be deallocated by the caller.
937 * Return value: Returns IDNA_SUCCESS on success, or error code.
940 idna_to_ascii_from_locale (const char *input, char **output,
941 int allowunassigned, int usestd3asciirules)
943 int rc;
944 int flags = 0;
946 if (allowunassigned)
947 flags |= IDNA_ALLOW_UNASSIGNED;
948 if (usestd3asciirules)
949 flags |= IDNA_USE_STD3_ASCII_RULES;
951 rc = idna_to_ascii_lz (input, output, flags);
953 return rc;
957 * idna_to_unicode_ucs4_from_ucs4:
958 * @input: zero-terminated Unicode string.
959 * @output: pointer to newly allocated output Unicode string.
960 * @allowunassigned: whether to allow unassigned code points.
961 * @usestd3asciirules: whether to check input for STD3 compliance.
963 * Convert possibly ACE encoded domain name in UCS-4 format into a
964 * UCS-4 string. The domain name may contain several labels,
965 * separated by dots. The output buffer must be deallocated by the
966 * caller.
968 * Return value: Returns IDNA_SUCCESS on success, or error code.
971 idna_to_unicode_ucs4_from_ucs4 (const unsigned long *input,
972 unsigned long **output,
973 int allowunassigned, int usestd3asciirules)
975 size_t inlen, tmpoutlen;
976 int rc;
977 int flags = 0;
978 uint32_t *tmpin;
979 uint32_t *tmpout;
980 size_t i;
982 for (inlen = 0; input[inlen]; inlen++)
984 tmpin = malloc (sizeof (tmpin[0]) * (inlen + 1));
985 if (!tmpin)
986 return IDNA_MALLOC_ERROR;
988 if (allowunassigned)
989 flags |= IDNA_ALLOW_UNASSIGNED;
990 if (usestd3asciirules)
991 flags |= IDNA_USE_STD3_ASCII_RULES;
993 for (i = 0; i < inlen; i++)
994 tmpin[i] = input[i];
995 tmpin[i] = 0;
996 rc = idna_to_unicode_4z4z (tmpin, &tmpout, flags);
997 free (tmpin);
999 for (tmpoutlen = 0; tmpout[tmpoutlen]; tmpoutlen++)
1002 *output = malloc (sizeof (output[0]) * (tmpoutlen + 1));
1003 if (!*output)
1004 return IDNA_MALLOC_ERROR;
1006 for (i = 0; i < tmpoutlen; i++)
1007 (*output)[i] = tmpout[i];
1008 (*output)[i] = 0;
1010 return rc;
1014 * idna_to_unicode_ucs4_from_utf8:
1015 * @input: zero-terminated UTF-8 string.
1016 * @output: pointer to newly allocated output Unicode string.
1017 * @allowunassigned: whether to allow unassigned code points.
1018 * @usestd3asciirules: whether to check input for STD3 compliance.
1020 * Convert possibly ACE encoded domain name in UTF-8 format into a
1021 * UCS-4 string. The domain name may contain several labels,
1022 * separated by dots. The output buffer must be deallocated by the
1023 * caller.
1025 * Return value: Returns IDNA_SUCCESS on success, or error code.
1028 idna_to_unicode_ucs4_from_utf8 (const char *input, unsigned long **output,
1029 int allowunassigned, int usestd3asciirules)
1031 size_t tmpinlen, tmpoutlen;
1032 int rc;
1033 int flags = 0;
1034 uint32_t *tmpin;
1035 uint32_t *tmpout;
1036 size_t i;
1038 tmpin = stringprep_utf8_to_ucs4 (input, -1, &tmpinlen);
1039 if (!tmpin)
1040 return IDNA_ICONV_ERROR;
1042 if (allowunassigned)
1043 flags |= IDNA_ALLOW_UNASSIGNED;
1044 if (usestd3asciirules)
1045 flags |= IDNA_USE_STD3_ASCII_RULES;
1047 rc = idna_to_unicode_4z4z (tmpin, &tmpout, flags);
1048 free (tmpin);
1050 for (tmpoutlen = 0; tmpout[tmpoutlen]; tmpoutlen++)
1053 *output = malloc (sizeof (output[0]) * (tmpoutlen + 1));
1054 if (!*output)
1055 return IDNA_MALLOC_ERROR;
1057 for (i = 0; i < tmpoutlen; i++)
1058 (*output)[i] = tmpout[i];
1059 (*output)[i] = 0;
1061 return rc;
1065 * idna_to_unicode_utf8_from_utf8:
1066 * @input: zero-terminated UTF-8 string.
1067 * @output: pointer to newly allocated output UTF-8 string.
1068 * @allowunassigned: whether to allow unassigned code points.
1069 * @usestd3asciirules: whether to check input for STD3 compliance.
1071 * Convert possibly ACE encoded domain name in UTF-8 format into a
1072 * UTF-8 string. The domain name may contain several labels,
1073 * separated by dots. The output buffer must be deallocated by the
1074 * caller.
1076 * Return value: Returns IDNA_SUCCESS on success, or error code.
1079 idna_to_unicode_utf8_from_utf8 (const char *input, char **output,
1080 int allowunassigned, int usestd3asciirules)
1082 int flags = 0;
1083 int rc;
1085 if (allowunassigned)
1086 flags |= IDNA_ALLOW_UNASSIGNED;
1087 if (usestd3asciirules)
1088 flags |= IDNA_USE_STD3_ASCII_RULES;
1090 rc = idna_to_unicode_8z8z (input, output, flags);
1092 return rc;
1096 * idna_to_unicode_locale_from_utf8:
1097 * @input: zero-terminated UTF-8 string.
1098 * @output: pointer to newly allocated output string encoded in the
1099 * current locale's character set.
1100 * @allowunassigned: whether to allow unassigned code points.
1101 * @usestd3asciirules: whether to check input for STD3 compliance.
1103 * Convert possibly ACE encoded domain name in UTF-8 format into a
1104 * string encoded in the current locale's character set. The
1105 * The domain name may contain several labels, separated by dots. The
1106 * output buffer must be deallocated by the caller.
1108 * Return value: Returns IDNA_SUCCESS on success, or error code.
1111 idna_to_unicode_locale_from_utf8 (const char *input, char **output,
1112 int allowunassigned, int usestd3asciirules)
1114 int flags = 0;
1115 int rc;
1117 if (allowunassigned)
1118 flags |= IDNA_ALLOW_UNASSIGNED;
1119 if (usestd3asciirules)
1120 flags |= IDNA_USE_STD3_ASCII_RULES;
1122 rc = idna_to_unicode_8zlz (input, output, flags);
1124 return rc;
1128 * idna_to_unicode_locale_from_locale:
1129 * @input: zero-terminated string encoded in the current locale's
1130 * character set.
1131 * @output: pointer to newly allocated output string encoded in the
1132 * current locale's character set.
1133 * @allowunassigned: whether to allow unassigned code points.
1134 * @usestd3asciirules: whether to check input for STD3 compliance.
1136 * Convert possibly ACE encoded domain name in the locale's character
1137 * set into a string encoded in the current locale's character set.
1138 * The domain name may contain several labels, separated by dots. The
1139 * output buffer must be deallocated by the caller.
1141 * Return value: Returns IDNA_SUCCESS on success, or error code.
1144 idna_to_unicode_locale_from_locale (const char *input, char **output,
1145 int allowunassigned,
1146 int usestd3asciirules)
1148 int flags = 0;
1149 int rc;
1151 if (allowunassigned)
1152 flags |= IDNA_ALLOW_UNASSIGNED;
1153 if (usestd3asciirules)
1154 flags |= IDNA_USE_STD3_ASCII_RULES;
1156 rc = idna_to_unicode_lzlz (input, output, flags);
1158 return rc;
1162 /* Deprecated interfaces (even older) */
/**
 * idna_ucs4_to_ace:
 * @input: zero terminated input Unicode string.
 * @output: pointer to newly allocated output string.
 *
 * Convert a UCS-4 domain name to an ASCII string.  Both the
 * AllowUnassigned flag and the std3asciirules flag are false.  The
 * domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of idna_to_ascii_from_ucs4
 * and will be removed in future versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 */
int
idna_ucs4_to_ace (const unsigned long *input, char **output)
{
  /* The legacy entry point has no way to request either policy.  */
  const int allow_unassigned = 0;
  const int use_std3_ascii_rules = 0;

  return idna_to_ascii_from_ucs4 (input, output, allow_unassigned,
                                  use_std3_ascii_rules);
}
/**
 * idna_utf8_to_ace:
 * @input: zero terminated input UTF-8 string.
 * @output: pointer to newly allocated output string.
 *
 * Convert a UTF-8 domain name to an ASCII string.  Both the
 * AllowUnassigned flag and the std3asciirules flag are false.  The
 * domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of idna_to_ascii_from_utf8
 * and will be removed in future versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 */
int
idna_utf8_to_ace (const char *input, char **output)
{
  /* Forward with both boolean policy arguments switched off.  */
  const int status = idna_to_ascii_from_utf8 (input, output, 0, 0);

  return status;
}
/**
 * idna_locale_to_ace:
 * @input: zero terminated input string encoded in the current
 *   locale's character set.
 * @output: pointer to newly allocated output string.
 *
 * Convert a domain name in the locale's encoding to an ASCII string.
 * Both the AllowUnassigned flag and the std3asciirules flag are false.
 * The domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of idna_to_ascii_from_locale
 * and will be removed in future versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 */
int
idna_locale_to_ace (const char *input, char **output)
{
  /* The legacy entry point has no way to request either policy.  */
  const int allow_unassigned = 0;
  const int use_std3_ascii_rules = 0;

  return idna_to_ascii_from_locale (input, output, allow_unassigned,
                                    use_std3_ascii_rules);
}
/**
 * idna_ucs4ace_to_ucs4:
 * @input: zero-terminated Unicode string.
 * @output: pointer to newly allocated output Unicode string.
 *
 * Convert a possibly ACE encoded domain name in UCS-4 format into a
 * UCS-4 string.  Both the AllowUnassigned flag and the std3asciirules
 * flag are false.  The domain name may contain several labels,
 * separated by dots.  The output buffer must be deallocated by the
 * caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_ucs4_from_ucs4 and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 */
int
idna_ucs4ace_to_ucs4 (const unsigned long *input, unsigned long **output)
{
  /* Forward with both boolean policy arguments switched off.  */
  const int status = idna_to_unicode_ucs4_from_ucs4 (input, output, 0, 0);

  return status;
}
/**
 * idna_utf8ace_to_ucs4:
 * @input: zero-terminated UTF-8 string.
 * @output: pointer to newly allocated output Unicode string.
 *
 * Convert a possibly ACE encoded domain name in UTF-8 format into a
 * UCS-4 string.  Both the AllowUnassigned flag and the std3asciirules
 * flag are false.  The domain name may contain several labels,
 * separated by dots.  The output buffer must be deallocated by the
 * caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_ucs4_from_utf8 and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 */
int
idna_utf8ace_to_ucs4 (const char *input, unsigned long **output)
{
  /* The legacy entry point has no way to request either policy.  */
  const int allow_unassigned = 0;
  const int use_std3_ascii_rules = 0;

  return idna_to_unicode_ucs4_from_utf8 (input, output, allow_unassigned,
                                         use_std3_ascii_rules);
}
/**
 * idna_utf8ace_to_utf8:
 * @input: zero-terminated UTF-8 string.
 * @output: pointer to newly allocated output UTF-8 string.
 *
 * Convert a possibly ACE encoded domain name in UTF-8 format into a
 * UTF-8 string.  Both the AllowUnassigned flag and the std3asciirules
 * flag are false.  The domain name may contain several labels,
 * separated by dots.  The output buffer must be deallocated by the
 * caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_utf8_from_utf8 and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 */
int
idna_utf8ace_to_utf8 (const char *input, char **output)
{
  /* Forward with both boolean policy arguments switched off.  */
  const int status = idna_to_unicode_utf8_from_utf8 (input, output, 0, 0);

  return status;
}
/**
 * idna_utf8ace_to_locale:
 * @input: zero-terminated UTF-8 string.
 * @output: pointer to newly allocated output string encoded in the
 *   current locale's character set.
 *
 * Convert a possibly ACE encoded domain name in UTF-8 format into a
 * string encoded in the current locale's character set.  Both the
 * AllowUnassigned flag and the std3asciirules flag are false.  The
 * domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_locale_from_utf8 and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 */
int
idna_utf8ace_to_locale (const char *input, char **output)
{
  /* The legacy entry point has no way to request either policy.  */
  const int allow_unassigned = 0;
  const int use_std3_ascii_rules = 0;

  return idna_to_unicode_locale_from_utf8 (input, output, allow_unassigned,
                                           use_std3_ascii_rules);
}
/**
 * idna_localeace_to_locale:
 * @input: zero-terminated string encoded in the current locale's
 *   character set.
 * @output: pointer to newly allocated output string encoded in the
 *   current locale's character set.
 *
 * Convert a possibly ACE encoded domain name in the locale's character
 * set into a string encoded in the current locale's character set.
 * Both the AllowUnassigned flag and the std3asciirules flag are false.
 * The domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_locale_from_locale and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 */
int
idna_localeace_to_locale (const char *input, char **output)
{
  /* Forward with both boolean policy arguments switched off.  */
  const int status = idna_to_unicode_locale_from_locale (input, output, 0, 0);

  return status;
}