Ignore idn-int.h.
[libidn.git] / idna.c
blob41cb232ee2bfdf468ba1743cbc5af9a6c4908f6d
1 /* idna.c Convert to or from IDN strings.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "internal.h"
24 /* Core functions */
26 /**
27 * idna_to_ascii_4i
28 * @in: input array with unicode code points.
29 * @inlen: length of input array with unicode code points.
30 * @out: output zero terminated string that must have room for at
31 * least 63 characters plus the terminating zero.
32 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
34 * The ToASCII operation takes a sequence of Unicode code points that make
35 * up one label and transforms it into a sequence of code points in the
36 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
37 * resulting sequence are equivalent labels.
39 * It is important to note that the ToASCII operation can fail. ToASCII
40 * fails if any step of it fails. If any step of the ToASCII operation
41 * fails on any label in a domain name, that domain name MUST NOT be used
42 * as an internationalized domain name. The method for deadling with this
43 * failure is application-specific.
45 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
46 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
47 * sequence of ASCII code points or a failure condition.
49 * ToASCII never alters a sequence of code points that are all in the ASCII
50 * range to begin with (although it could fail). Applying the ToASCII
51 * operation multiple times has exactly the same effect as applying it just
52 * once.
54 * Return value: Returns 0 on success, or an error code.
56 int
57 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
59 size_t len, outlen;
60 uint32_t *src; /* XXX don't need to copy data? */
61 int rc;
65 * ToASCII consists of the following steps:
67 * 1. If all code points in the sequence are in the ASCII range (0..7F)
68 * then skip to step 3.
72 size_t i;
73 int inasciirange;
75 inasciirange = 1;
76 for (i = 0; i < inlen; i++)
77 if (in[i] > 0x7F)
78 inasciirange = 0;
79 if (inasciirange)
81 src = malloc (sizeof (in[0]) * (inlen + 1));
82 if (src == NULL)
83 return IDNA_MALLOC_ERROR;
85 memcpy (src, in, sizeof (in[0]) * inlen);
86 src[inlen] = 0;
88 goto step3;
93 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
94 * an error. The AllowUnassigned flag is used in [NAMEPREP].
98 char *p;
100 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
101 if (p == NULL)
102 return IDNA_MALLOC_ERROR;
104 len = strlen (p);
107 len = 2 * len + 10; /* XXX better guess? */
108 p = realloc (p, len);
109 if (p == NULL)
110 return IDNA_MALLOC_ERROR;
112 if (flags & IDNA_ALLOW_UNASSIGNED)
113 rc = stringprep_nameprep (p, len);
114 else
115 rc = stringprep_nameprep_no_unassigned (p, len);
117 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
119 if (rc != STRINGPREP_OK)
121 free (p);
122 return IDNA_STRINGPREP_ERROR;
125 src = stringprep_utf8_to_ucs4 (p, -1, NULL);
127 free (p);
130 step3:
132 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
134 * (a) Verify the absence of non-LDH ASCII code points; that is,
135 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
137 * (b) Verify the absence of leading and trailing hyphen-minus;
138 * that is, the absence of U+002D at the beginning and end of
139 * the sequence.
142 if (flags & IDNA_USE_STD3_ASCII_RULES)
144 size_t i;
146 for (i = 0; src[i]; i++)
147 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
148 (src[i] >= 0x3A && src[i] <= 0x40) ||
149 (src[i] >= 0x5B && src[i] <= 0x60) ||
150 (src[i] >= 0x7B && src[i] <= 0x7F))
152 free (src);
153 return IDNA_CONTAINS_LDH;
156 if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
158 free (src);
159 return IDNA_CONTAINS_MINUS;
164 * 4. If all code points in the sequence are in the ASCII range
165 * (0..7F), then skip to step 8.
169 size_t i;
170 int inasciirange;
172 inasciirange = 1;
173 for (i = 0; src[i]; i++)
175 if (src[i] > 0x7F)
176 inasciirange = 0;
177 /* copy string to output buffer if we are about to skip to step8 */
178 if (i < 64)
179 out[i] = src[i];
181 if (i < 64)
182 out[i] = '\0';
183 if (inasciirange)
184 goto step8;
188 * 5. Verify that the sequence does NOT begin with the ACE prefix.
193 size_t i;
194 int match;
196 match = 1;
197 for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
198 if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
199 match = 0;
200 if (match)
202 free (src);
203 return IDNA_CONTAINS_ACE_PREFIX;
208 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
209 * and fail if there is an error.
211 for (len = 0; src[len]; len++)
213 src[len] = '\0';
214 outlen = 63 - strlen (IDNA_ACE_PREFIX);
215 rc = punycode_encode (len, src, NULL,
216 &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
217 free (src);
218 if (rc != PUNYCODE_SUCCESS)
219 return IDNA_PUNYCODE_ERROR;
220 out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
223 * 7. Prepend the ACE prefix.
226 memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
229 * 8. Verify that the number of code points is in the range 1 to 63
230 * inclusive.
233 step8:
234 if (strlen (out) < 1 || strlen (out) > 63)
235 return IDNA_INVALID_LENGTH;
237 return IDNA_SUCCESS;
240 static int
241 idna_to_unicode_internal (char *utf8in, size_t utf8len,
242 uint32_t * out, size_t * outlen, int flags)
244 int rc;
245 char tmpout[64];
248 * 1. If all code points in the sequence are in the ASCII range (0..7F)
249 * then skip to step 3.
253 size_t i;
254 int inasciirange;
256 inasciirange = 1;
257 for (i = 0; utf8in[i]; i++)
258 if (utf8in[i] & ~0x7F)
259 inasciirange = 0;
260 if (inasciirange)
261 goto step3;
265 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
266 * error. (If step 3 of ToASCII is also performed here, it will not
267 * affect the overall behavior of ToUnicode, but it is not
268 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
271 if (flags & IDNA_ALLOW_UNASSIGNED)
272 rc = stringprep_nameprep (utf8in, utf8len);
273 else
274 rc = stringprep_nameprep_no_unassigned (utf8in, utf8len);
276 if (rc != STRINGPREP_OK)
277 return IDNA_STRINGPREP_ERROR;
279 /* 3. Verify that the sequence begins with the ACE prefix, and save a
280 * copy of the sequence.
283 step3:
284 if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
285 return IDNA_NO_ACE_PREFIX;
287 /* 4. Remove the ACE prefix.
290 memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
291 strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
293 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
294 * and fail if there is an error. Save a copy of the result of
295 * this step.
298 (*outlen)--; /* reserve one for the zero */
300 rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
301 if (rc != PUNYCODE_SUCCESS)
302 return IDNA_PUNYCODE_ERROR;
304 out[*outlen] = 0; /* add zero */
306 /* 6. Apply ToASCII.
309 rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
310 if (rc != IDNA_SUCCESS)
311 return rc;
313 /* 7. Verify that the result of step 6 matches the saved copy from
314 * step 3, using a case-insensitive ASCII comparison.
317 if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
318 return IDNA_ROUNDTRIP_VERIFY_ERROR;
320 /* 8. Return the saved copy from step 5.
323 return IDNA_SUCCESS;
327 * idna_to_unicode_44i
328 * @in: input array with unicode code points.
329 * @inlen: length of input array with unicode code points.
330 * @out: output array with unicode code points.
331 * @outlen: on input, maximum size of output array with unicode code points,
332 * on exit, actual size of output array with unicode code points.
333 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
335 * The ToUnicode operation takes a sequence of Unicode code points
336 * that make up one label and returns a sequence of Unicode code
337 * points. If the input sequence is a label in ACE form, then the
338 * result is an equivalent internationalized label that is not in ACE
339 * form, otherwise the original sequence is returned unaltered.
341 * ToUnicode never fails. If any step fails, then the original input
342 * sequence is returned immediately in that step.
344 * The ToUnicode output never contains more code points than its
345 * input. Note that the number of octets needed to represent a
346 * sequence of code points depends on the particular character
347 * encoding used.
349 * The inputs to ToUnicode are a sequence of code points, the
350 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
351 * ToUnicode is always a sequence of Unicode code points.
353 * Return value: Returns error condition, but it must only be used for
354 * debugging purposes. The output buffer is always
355 * guaranteed to contain the correct data according to
356 * the specification (sans malloc induced errors). NB!
357 * This means that you normally ignore the return code
358 * from this function, as checking it means breaking the
359 * standard.
362 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
363 uint32_t * out, size_t * outlen, int flags)
365 int rc;
366 size_t outlensave = *outlen;
367 char *p;
369 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
370 if (p == NULL)
371 return IDNA_MALLOC_ERROR;
373 p = realloc (p, BUFSIZ);
374 if (p == NULL)
375 return IDNA_MALLOC_ERROR;
377 rc = idna_to_unicode_internal (p, BUFSIZ, out, outlen, flags);
378 if (rc != IDNA_SUCCESS)
380 memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
381 inlen : outlensave));
382 *outlen = inlen;
385 free (p);
387 return rc;
390 /* Wrappers that handle several labels */
393 * idna_to_ascii_4z:
394 * @input: zero terminated input Unicode string.
395 * @output: pointer to newly allocated output string.
396 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
398 * Convert UCS-4 domain name to ASCII string. The domain name may
399 * contain several labels, separated by dots. The output buffer must
400 * be deallocated by the caller.
402 * Return value: Returns IDNA_SUCCESS on success, or error code.
405 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
407 const uint32_t *start = input;
408 const uint32_t *end = input;
409 char buf[64];
410 char *out = NULL;
411 int rc;
413 *output = NULL;
417 end = start;
419 /* 1) Whenever dots are used as label separators, the following
420 characters MUST be recognized as dots: U+002E (full stop),
421 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
422 U+FF61 (halfwidth ideographic full stop). */
423 for (; *end &&
424 *end != 0x002E &&
425 *end != 0x3002 && *end != 0xFF0E && *end != 0xFF61; end++)
428 rc = idna_to_ascii_4i (start, end - start, buf, flags);
429 if (rc != IDNA_SUCCESS)
430 return rc;
432 if (out)
434 out = realloc (out, strlen (out) + 1 + strlen (buf) + 1);
435 if (!out)
436 return IDNA_MALLOC_ERROR;
437 strcat (out, ".");
438 strcat (out, buf);
440 else
442 out = (char *) strdup (buf);
443 if (!out)
444 return IDNA_MALLOC_ERROR;
447 start = end + 1;
449 while (*end);
451 *output = out;
453 return IDNA_SUCCESS;
457 * idna_to_ascii_8z:
458 * @input: zero terminated input UTF-8 string.
459 * @output: pointer to newly allocated output string.
460 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
462 * Convert UTF-8 domain name to ASCII string. The domain name may
463 * contain several labels, separated by dots. The output buffer must
464 * be deallocated by the caller.
466 * Return value: Returns IDNA_SUCCESS on success, or error code.
469 idna_to_ascii_8z (const char *input, char **output, int flags)
471 uint32_t *ucs4;
472 size_t ucs4len;
473 int rc;
475 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
476 if (!ucs4)
477 return IDNA_ICONV_ERROR;
479 rc = idna_to_ascii_4z (ucs4, output, flags);
481 free (ucs4);
483 return rc;
488 * idna_to_ascii_lz:
489 * @input: zero terminated input UTF-8 string.
490 * @output: pointer to newly allocated output string.
491 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
493 * Convert domain name in the locale's encoding to ASCII string. The
494 * domain name may contain several labels, separated by dots. The
495 * output buffer must be deallocated by the caller.
497 * Return value: Returns IDNA_SUCCESS on success, or error code.
500 idna_to_ascii_lz (const char *input, char **output, int flags)
502 char *utf8;
503 int rc;
505 utf8 = stringprep_locale_to_utf8 (input);
506 if (!utf8)
507 return IDNA_ICONV_ERROR;
509 rc = idna_to_ascii_8z (utf8, output, flags);
511 free (utf8);
513 return rc;
517 * idna_to_unicode_4z4z:
518 * @input: zero-terminated Unicode string.
519 * @output: pointer to newly allocated output Unicode string.
520 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
522 * Convert possibly ACE encoded domain name in UCS-4 format into a
523 * UCS-4 string. The domain name may contain several labels,
524 * separated by dots. The output buffer must be deallocated by the
525 * caller.
527 * Return value: Returns IDNA_SUCCESS on success, or error code.
530 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
532 const uint32_t *start = input;
533 const uint32_t *end = input;
534 uint32_t *buf;
535 size_t buflen;
536 uint32_t *out = NULL;
537 size_t outlen = 0;
538 int rc;
540 *output = NULL;
544 end = start;
546 /* 1) Whenever dots are used as label separators, the following
547 characters MUST be recognized as dots: U+002E (full stop),
548 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
549 U+FF61 (halfwidth ideographic full stop). */
550 for (; *end &&
551 *end != 0x002E &&
552 *end != 0x3002 && *end != 0xFF0E && *end != 0xFF61; end++)
555 buflen = end - start;
556 buf = malloc (sizeof (buf[0]) * (buflen + 1));
557 if (!buf)
558 return IDNA_MALLOC_ERROR;
560 rc = idna_to_unicode_44i (start, end - start, buf, &buflen, flags);
561 /* don't check rc as per specification! */
563 if (out)
565 out = realloc (out, sizeof (out[0]) * (outlen + 1 + buflen + 1));
566 if (!out)
567 return IDNA_MALLOC_ERROR;
568 out[outlen++] = 0x002E; /* '.' (full stop) */
569 memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
570 outlen += buflen;
571 out[outlen] = 0x0;
572 free (buf);
574 else
576 out = buf;
577 outlen = buflen;
578 out[outlen] = 0x0;
581 start = end + 1;
583 while (*end);
585 *output = out;
587 return IDNA_SUCCESS;
591 * idna_to_unicode_8z4z:
592 * @input: zero-terminated UTF-8 string.
593 * @output: pointer to newly allocated output Unicode string.
594 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
596 * Convert possibly ACE encoded domain name in UTF-8 format into a
597 * UCS-4 string. The domain name may contain several labels,
598 * separated by dots. The output buffer must be deallocated by the
599 * caller.
601 * Return value: Returns IDNA_SUCCESS on success, or error code.
604 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
606 uint32_t *ucs4;
607 size_t ucs4len;
608 int rc;
610 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
611 if (!ucs4)
612 return IDNA_ICONV_ERROR;
614 rc = idna_to_unicode_4z4z (ucs4, output, flags);
615 free (ucs4);
617 return rc;
621 * idna_to_unicode_8z8z:
622 * @input: zero-terminated UTF-8 string.
623 * @output: pointer to newly allocated output UTF-8 string.
624 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
626 * Convert possibly ACE encoded domain name in UTF-8 format into a
627 * UTF-8 string. The domain name may contain several labels,
628 * separated by dots. The output buffer must be deallocated by the
629 * caller.
631 * Return value: Returns IDNA_SUCCESS on success, or error code.
634 idna_to_unicode_8z8z (const char *input, char **output, int flags)
636 uint32_t *ucs4;
637 int rc;
639 rc = idna_to_unicode_8z4z (input, &ucs4, flags);
640 *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
641 free (ucs4);
643 if (!*output)
644 return IDNA_ICONV_ERROR;
646 return rc;
650 * idna_to_unicode_8zlz:
651 * @input: zero-terminated UTF-8 string.
652 * @output: pointer to newly allocated output string encoded in the
653 * current locale's character set.
654 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
656 * Convert possibly ACE encoded domain name in UTF-8 format into a
657 * string encoded in the current locale's character set. The domain
658 * name may contain several labels, separated by dots. The output
659 * buffer must be deallocated by the caller.
661 * Return value: Returns IDNA_SUCCESS on success, or error code.
664 idna_to_unicode_8zlz (const char *input, char **output, int flags)
666 char *utf8;
667 int rc;
669 rc = idna_to_unicode_8z8z (input, &utf8, flags);
670 *output = stringprep_utf8_to_locale (utf8);
671 free (utf8);
673 if (!*output)
674 return IDNA_ICONV_ERROR;
676 return rc;
680 * idna_to_unicode_lzlz:
681 * @input: zero-terminated string encoded in the current locale's
682 * character set.
683 * @output: pointer to newly allocated output string encoded in the
684 * current locale's character set.
685 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
687 * Convert possibly ACE encoded domain name in the locale's character
688 * set into a string encoded in the current locale's character set.
689 * The domain name may contain several labels, separated by dots. The
690 * output buffer must be deallocated by the caller.
692 * Return value: Returns IDNA_SUCCESS on success, or error code.
695 idna_to_unicode_lzlz (const char *input, char **output, int flags)
697 char *utf8;
698 int rc;
700 utf8 = stringprep_locale_to_utf8 (input);
701 if (!utf8)
702 return IDNA_ICONV_ERROR;
704 rc = idna_to_unicode_8zlz (utf8, output, flags);
705 free (utf8);
707 return rc;
711 /* Deprecated interfaces */
714 * idna_to_ascii
715 * @in: input array with unicode code points.
716 * @inlen: length of input array with unicode code points.
717 * @out: output zero terminated string that must have room for at
718 * least 63 characters plus the terminating zero.
719 * @allowunassigned: whether to allow unassigned code points.
720 * @usestd3asciirules: whether to check input for STD3 compliance.
722 * The ToASCII operation takes a sequence of Unicode code points that make
723 * up one label and transforms it into a sequence of code points in the
724 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
725 * resulting sequence are equivalent labels.
727 * It is important to note that the ToASCII operation can fail. ToASCII
728 * fails if any step of it fails. If any step of the ToASCII operation
729 * fails on any label in a domain name, that domain name MUST NOT be used
730 * as an internationalized domain name. The method for deadling with this
731 * failure is application-specific.
733 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
734 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
735 * sequence of ASCII code points or a failure condition.
737 * ToASCII never alters a sequence of code points that are all in the ASCII
738 * range to begin with (although it could fail). Applying the ToASCII
739 * operation multiple times has exactly the same effect as applying it just
740 * once.
742 * Return value: Returns 0 on success, or an error code.
745 idna_to_ascii (const unsigned long *in, size_t inlen,
746 char *out, int allowunassigned, int usestd3asciirules)
748 int rc;
749 int flags = 0;
750 uint32_t *tmp;
751 size_t i;
753 tmp = malloc (sizeof (tmp[0]) * inlen);
754 if (!tmp)
755 return IDNA_MALLOC_ERROR;
757 if (allowunassigned)
758 flags |= IDNA_ALLOW_UNASSIGNED;
759 if (usestd3asciirules)
760 flags |= IDNA_USE_STD3_ASCII_RULES;
762 for (i = 0; i < inlen; i++)
763 tmp[i] = in[i];
764 rc = idna_to_ascii_4i (tmp, inlen, out, flags);
765 free (tmp);
767 return rc;
771 * idna_to_unicode
772 * @in: input array with unicode code points.
773 * @inlen: length of input array with unicode code points.
774 * @out: output array with unicode code points.
775 * @outlen: on input, maximum size of output array with unicode code points,
776 * on exit, actual size of output array with unicode code points.
777 * @allowunassigned: whether to allow unassigned code points.
778 * @usestd3asciirules: whether to check input for STD3 compliance.
780 * The ToUnicode operation takes a sequence of Unicode code points
781 * that make up one label and returns a sequence of Unicode code
782 * points. If the input sequence is a label in ACE form, then the
783 * result is an equivalent internationalized label that is not in ACE
784 * form, otherwise the original sequence is returned unaltered.
786 * ToUnicode never fails. If any step fails, then the original input
787 * sequence is returned immediately in that step.
789 * The ToUnicode output never contains more code points than its
790 * input. Note that the number of octets needed to represent a
791 * sequence of code points depends on the particular character
792 * encoding used.
794 * The inputs to ToUnicode are a sequence of code points, the
795 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
796 * ToUnicode is always a sequence of Unicode code points.
798 * Return value: Returns error condition, but it must only be used for
799 * debugging purposes. The output buffer is always
800 * guaranteed to contain the correct data according to
801 * the specification (sans malloc induced errors). NB!
802 * This means that you normally ignore the return code
803 * from this function, as checking it means breaking the
804 * standard.
807 idna_to_unicode (const unsigned long *in, size_t inlen,
808 unsigned long *out, size_t * outlen,
809 int allowunassigned, int usestd3asciirules)
811 int rc;
812 int flags = 0;
813 uint32_t *tmpin;
814 uint32_t *tmpout;
815 size_t i;
817 tmpin = malloc (sizeof (tmpin[0]) * inlen);
818 if (!tmpin)
819 return IDNA_MALLOC_ERROR;
820 tmpout = malloc (sizeof (tmpout[0]) * *outlen);
821 if (!tmpout)
822 return IDNA_MALLOC_ERROR;
824 if (allowunassigned)
825 flags |= IDNA_ALLOW_UNASSIGNED;
826 if (usestd3asciirules)
827 flags |= IDNA_USE_STD3_ASCII_RULES;
829 for (i = 0; i < inlen; i++)
830 tmpin[i] = in[i];
831 rc = idna_to_unicode_44i (tmpin, inlen, tmpout, outlen, flags);
832 free (tmpin);
834 for (i = 0; i < *outlen; i++)
835 out[i] = tmpout[i];
837 free (tmpout);
839 return rc;
844 * idna_to_ascii_from_ucs4:
845 * @input: zero terminated input Unicode string.
846 * @output: pointer to newly allocated output string.
847 * @allowunassigned: whether to allow unassigned code points.
848 * @usestd3asciirules: whether to check input for STD3 compliance.
850 * Convert UCS-4 domain name to ASCII string. The domain name may
851 * contain several labels, separated by dots. The output buffer must
852 * be deallocated by the caller.
854 * Return value: Returns IDNA_SUCCESS on success, or error code.
857 idna_to_ascii_from_ucs4 (const unsigned long *input, char **output,
858 int allowunassigned, int usestd3asciirules)
860 size_t inlen;
861 int rc;
862 int flags = 0;
863 uint32_t *tmp;
864 size_t i;
866 for (inlen = 0; input[inlen]; inlen++)
868 tmp = malloc (sizeof (tmp[0]) * (inlen + 1));
869 if (!tmp)
870 return IDNA_MALLOC_ERROR;
872 if (allowunassigned)
873 flags |= IDNA_ALLOW_UNASSIGNED;
874 if (usestd3asciirules)
875 flags |= IDNA_USE_STD3_ASCII_RULES;
877 for (i = 0; i < inlen; i++)
878 tmp[i] = input[i];
879 tmp[i] = 0;
880 rc = idna_to_ascii_4z (tmp, output, flags);
881 free (tmp);
883 return rc;
887 * idna_to_ascii_from_utf8:
888 * @input: zero terminated input UTF-8 string.
889 * @output: pointer to newly allocated output string.
890 * @allowunassigned: whether to allow unassigned code points.
891 * @usestd3asciirules: whether to check input for STD3 compliance.
893 * Convert UTF-8 domain name to ASCII string. The domain name may
894 * contain several labels, separated by dots. The output buffer must
895 * be deallocated by the caller.
897 * Return value: Returns IDNA_SUCCESS on success, or error code.
900 idna_to_ascii_from_utf8 (const char *input, char **output,
901 int allowunassigned, int usestd3asciirules)
903 int rc;
904 int flags = 0;
906 if (allowunassigned)
907 flags |= IDNA_ALLOW_UNASSIGNED;
908 if (usestd3asciirules)
909 flags |= IDNA_USE_STD3_ASCII_RULES;
911 rc = idna_to_ascii_8z (input, output, flags);
913 return rc;
917 * idna_to_ascii_from_locale:
918 * @input: zero terminated input UTF-8 string.
919 * @output: pointer to newly allocated output string.
920 * @allowunassigned: whether to allow unassigned code points.
921 * @usestd3asciirules: whether to check input for STD3 compliance.
923 * Convert domain name in the locale's encoding to ASCII string. The
924 * domain name may contain several labels, separated by dots. The
925 * output buffer must be deallocated by the caller.
927 * Return value: Returns IDNA_SUCCESS on success, or error code.
930 idna_to_ascii_from_locale (const char *input, char **output,
931 int allowunassigned, int usestd3asciirules)
933 int rc;
934 int flags = 0;
936 if (allowunassigned)
937 flags |= IDNA_ALLOW_UNASSIGNED;
938 if (usestd3asciirules)
939 flags |= IDNA_USE_STD3_ASCII_RULES;
941 rc = idna_to_ascii_lz (input, output, flags);
943 return rc;
947 * idna_to_unicode_ucs4_from_ucs4:
948 * @input: zero-terminated Unicode string.
949 * @output: pointer to newly allocated output Unicode string.
950 * @allowunassigned: whether to allow unassigned code points.
951 * @usestd3asciirules: whether to check input for STD3 compliance.
953 * Convert possibly ACE encoded domain name in UCS-4 format into a
954 * UCS-4 string. The domain name may contain several labels,
955 * separated by dots. The output buffer must be deallocated by the
956 * caller.
958 * Return value: Returns IDNA_SUCCESS on success, or error code.
961 idna_to_unicode_ucs4_from_ucs4 (const unsigned long *input,
962 unsigned long **output,
963 int allowunassigned, int usestd3asciirules)
965 size_t inlen, tmpoutlen;
966 int rc;
967 int flags = 0;
968 uint32_t *tmpin;
969 uint32_t *tmpout;
970 size_t i;
972 for (inlen = 0; input[inlen]; inlen++)
974 tmpin = malloc (sizeof (tmpin[0]) * (inlen + 1));
975 if (!tmpin)
976 return IDNA_MALLOC_ERROR;
978 if (allowunassigned)
979 flags |= IDNA_ALLOW_UNASSIGNED;
980 if (usestd3asciirules)
981 flags |= IDNA_USE_STD3_ASCII_RULES;
983 for (i = 0; i < inlen; i++)
984 tmpin[i] = input[i];
985 tmpin[i] = 0;
986 rc = idna_to_unicode_4z4z (tmpin, &tmpout, flags);
987 free (tmpin);
989 for (tmpoutlen = 0; tmpout[tmpoutlen]; tmpoutlen++)
992 *output = malloc (sizeof (output[0]) * (tmpoutlen + 1));
993 if (!*output)
994 return IDNA_MALLOC_ERROR;
996 for (i = 0; i < tmpoutlen; i++)
997 (*output)[i] = tmpout[i];
998 (*output)[i] = 0;
1000 return rc;
1004 * idna_to_unicode_ucs4_from_utf8:
1005 * @input: zero-terminated UTF-8 string.
1006 * @output: pointer to newly allocated output Unicode string.
1007 * @allowunassigned: whether to allow unassigned code points.
1008 * @usestd3asciirules: whether to check input for STD3 compliance.
1010 * Convert possibly ACE encoded domain name in UTF-8 format into a
1011 * UCS-4 string. The domain name may contain several labels,
1012 * separated by dots. The output buffer must be deallocated by the
1013 * caller.
1015 * Return value: Returns IDNA_SUCCESS on success, or error code.
1018 idna_to_unicode_ucs4_from_utf8 (const char *input, unsigned long **output,
1019 int allowunassigned, int usestd3asciirules)
1021 size_t tmpinlen, tmpoutlen;
1022 int rc;
1023 int flags = 0;
1024 uint32_t *tmpin;
1025 uint32_t *tmpout;
1026 size_t i;
1028 tmpin = stringprep_utf8_to_ucs4 (input, -1, &tmpinlen);
1029 if (!tmpin)
1030 return IDNA_ICONV_ERROR;
1032 if (allowunassigned)
1033 flags |= IDNA_ALLOW_UNASSIGNED;
1034 if (usestd3asciirules)
1035 flags |= IDNA_USE_STD3_ASCII_RULES;
1037 rc = idna_to_unicode_4z4z (tmpin, &tmpout, flags);
1038 free (tmpin);
1040 for (tmpoutlen = 0; tmpout[tmpoutlen]; tmpoutlen++)
1043 *output = malloc (sizeof (output[0]) * (tmpoutlen + 1));
1044 if (!*output)
1045 return IDNA_MALLOC_ERROR;
1047 for (i = 0; i < tmpoutlen; i++)
1048 (*output)[i] = tmpout[i];
1049 (*output)[i] = 0;
1051 return rc;
1055 * idna_to_unicode_utf8_from_utf8:
1056 * @input: zero-terminated UTF-8 string.
1057 * @output: pointer to newly allocated output UTF-8 string.
1058 * @allowunassigned: whether to allow unassigned code points.
1059 * @usestd3asciirules: whether to check input for STD3 compliance.
1061 * Convert possibly ACE encoded domain name in UTF-8 format into a
1062 * UTF-8 string. The domain name may contain several labels,
1063 * separated by dots. The output buffer must be deallocated by the
1064 * caller.
1066 * Return value: Returns IDNA_SUCCESS on success, or error code.
1069 idna_to_unicode_utf8_from_utf8 (const char *input, char **output,
1070 int allowunassigned, int usestd3asciirules)
1072 int flags = 0;
1073 int rc;
1075 if (allowunassigned)
1076 flags |= IDNA_ALLOW_UNASSIGNED;
1077 if (usestd3asciirules)
1078 flags |= IDNA_USE_STD3_ASCII_RULES;
1080 rc = idna_to_unicode_8z8z (input, output, flags);
1082 return rc;
1086 * idna_to_unicode_locale_from_utf8:
1087 * @input: zero-terminated UTF-8 string.
1088 * @output: pointer to newly allocated output string encoded in the
1089 * current locale's character set.
1090 * @allowunassigned: whether to allow unassigned code points.
1091 * @usestd3asciirules: whether to check input for STD3 compliance.
1093 * Convert possibly ACE encoded domain name in UTF-8 format into a
1094 * string encoded in the current locale's character set. The
1095 * The domain name may contain several labels, separated by dots. The
1096 * output buffer must be deallocated by the caller.
1098 * Return value: Returns IDNA_SUCCESS on success, or error code.
1101 idna_to_unicode_locale_from_utf8 (const char *input, char **output,
1102 int allowunassigned, int usestd3asciirules)
1104 int flags = 0;
1105 int rc;
1107 if (allowunassigned)
1108 flags |= IDNA_ALLOW_UNASSIGNED;
1109 if (usestd3asciirules)
1110 flags |= IDNA_USE_STD3_ASCII_RULES;
1112 rc = idna_to_unicode_8zlz (input, output, flags);
1114 return rc;
1118 * idna_to_unicode_locale_from_locale:
1119 * @input: zero-terminated string encoded in the current locale's
1120 * character set.
1121 * @output: pointer to newly allocated output string encoded in the
1122 * current locale's character set.
1123 * @allowunassigned: whether to allow unassigned code points.
1124 * @usestd3asciirules: whether to check input for STD3 compliance.
1126 * Convert possibly ACE encoded domain name in the locale's character
1127 * set into a string encoded in the current locale's character set.
1128 * The domain name may contain several labels, separated by dots. The
1129 * output buffer must be deallocated by the caller.
1131 * Return value: Returns IDNA_SUCCESS on success, or error code.
1134 idna_to_unicode_locale_from_locale (const char *input, char **output,
1135 int allowunassigned,
1136 int usestd3asciirules)
1138 int flags = 0;
1139 int rc;
1141 if (allowunassigned)
1142 flags |= IDNA_ALLOW_UNASSIGNED;
1143 if (usestd3asciirules)
1144 flags |= IDNA_USE_STD3_ASCII_RULES;
1146 rc = idna_to_unicode_lzlz (input, output, flags);
1148 return rc;
1152 /* Deprecated interfaces (even older) */
/**
 * idna_ucs4_to_ace:
 * @input: zero terminated input Unicode string.
 * @output: pointer to newly allocated output string.
 *
 * Convert a UCS-4 domain name to an ASCII string.  Both the
 * AllowUnassigned flag and the std3asciirules flag are false.  The
 * domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of idna_to_ascii_from_ucs4()
 * and will be removed in future versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 **/
int
idna_ucs4_to_ace (const unsigned long *input, char **output)
{
  /* The legacy interface always runs with both options disabled.  */
  const int allowunassigned = 0;
  const int usestd3asciirules = 0;

  return idna_to_ascii_from_ucs4 (input, output,
                                  allowunassigned, usestd3asciirules);
}
/**
 * idna_utf8_to_ace:
 * @input: zero terminated input UTF-8 string.
 * @output: pointer to newly allocated output string.
 *
 * Convert a UTF-8 domain name to an ASCII string.  Both the
 * AllowUnassigned flag and the std3asciirules flag are false.  The
 * domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of idna_to_ascii_from_utf8()
 * and will be removed in future versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 **/
int
idna_utf8_to_ace (const char *input, char **output)
{
  /* The legacy interface always runs with both options disabled.  */
  const int allowunassigned = 0;
  const int usestd3asciirules = 0;

  return idna_to_ascii_from_utf8 (input, output,
                                  allowunassigned, usestd3asciirules);
}
/**
 * idna_locale_to_ace:
 * @input: zero terminated input string encoded in the current
 *   locale's character set.
 * @output: pointer to newly allocated output string.
 *
 * Convert a domain name in the locale's encoding to an ASCII string.
 * Both the AllowUnassigned flag and the std3asciirules flag are false.
 * The domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of idna_to_ascii_from_locale()
 * and will be removed in future versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 **/
int
idna_locale_to_ace (const char *input, char **output)
{
  /* The legacy interface always runs with both options disabled.  */
  const int allowunassigned = 0;
  const int usestd3asciirules = 0;

  return idna_to_ascii_from_locale (input, output,
                                    allowunassigned, usestd3asciirules);
}
/**
 * idna_ucs4ace_to_ucs4:
 * @input: zero-terminated Unicode string.
 * @output: pointer to newly allocated output Unicode string.
 *
 * Convert a possibly ACE encoded domain name in UCS-4 format into a
 * UCS-4 string.  Both the AllowUnassigned flag and the std3asciirules
 * flag are false.  The domain name may contain several labels,
 * separated by dots.  The output buffer must be deallocated by the
 * caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_ucs4_from_ucs4() and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 **/
int
idna_ucs4ace_to_ucs4 (const unsigned long *input, unsigned long **output)
{
  /* The legacy interface always runs with both options disabled.  */
  const int allowunassigned = 0;
  const int usestd3asciirules = 0;

  return idna_to_unicode_ucs4_from_ucs4 (input, output,
                                         allowunassigned,
                                         usestd3asciirules);
}
/**
 * idna_utf8ace_to_ucs4:
 * @input: zero-terminated UTF-8 string.
 * @output: pointer to newly allocated output Unicode string.
 *
 * Convert a possibly ACE encoded domain name in UTF-8 format into a
 * UCS-4 string.  Both the AllowUnassigned flag and the std3asciirules
 * flag are false.  The domain name may contain several labels,
 * separated by dots.  The output buffer must be deallocated by the
 * caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_ucs4_from_utf8() and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 **/
int
idna_utf8ace_to_ucs4 (const char *input, unsigned long **output)
{
  /* The legacy interface always runs with both options disabled.  */
  const int allowunassigned = 0;
  const int usestd3asciirules = 0;

  return idna_to_unicode_ucs4_from_utf8 (input, output,
                                         allowunassigned,
                                         usestd3asciirules);
}
/**
 * idna_utf8ace_to_utf8:
 * @input: zero-terminated UTF-8 string.
 * @output: pointer to newly allocated output UTF-8 string.
 *
 * Convert a possibly ACE encoded domain name in UTF-8 format into a
 * UTF-8 string.  Both the AllowUnassigned flag and the std3asciirules
 * flag are false.  The domain name may contain several labels,
 * separated by dots.  The output buffer must be deallocated by the
 * caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_utf8_from_utf8() and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 **/
int
idna_utf8ace_to_utf8 (const char *input, char **output)
{
  /* The legacy interface always runs with both options disabled.  */
  const int allowunassigned = 0;
  const int usestd3asciirules = 0;

  return idna_to_unicode_utf8_from_utf8 (input, output,
                                         allowunassigned,
                                         usestd3asciirules);
}
/**
 * idna_utf8ace_to_locale:
 * @input: zero-terminated UTF-8 string.
 * @output: pointer to newly allocated output string encoded in the
 *   current locale's character set.
 *
 * Convert a possibly ACE encoded domain name in UTF-8 format into a
 * string encoded in the current locale's character set.  Both the
 * AllowUnassigned flag and the std3asciirules flag are false.  The
 * domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_locale_from_utf8() and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 **/
int
idna_utf8ace_to_locale (const char *input, char **output)
{
  /* The legacy interface always runs with both options disabled.  */
  const int allowunassigned = 0;
  const int usestd3asciirules = 0;

  return idna_to_unicode_locale_from_utf8 (input, output,
                                           allowunassigned,
                                           usestd3asciirules);
}
/**
 * idna_localeace_to_locale:
 * @input: zero-terminated string encoded in the current locale's
 *   character set.
 * @output: pointer to newly allocated output string encoded in the
 *   current locale's character set.
 *
 * Convert a possibly ACE encoded domain name in the locale's character
 * set into a string encoded in the current locale's character set.
 * Both the AllowUnassigned flag and the std3asciirules flag are false.
 * The domain name may contain several labels, separated by dots.  The
 * output buffer must be deallocated by the caller.
 *
 * This function is deprecated in favor of
 * idna_to_unicode_locale_from_locale() and will be removed in future
 * versions.
 *
 * Return value: Returns IDNA_SUCCESS on success, or error code.
 **/
int
idna_localeace_to_locale (const char *input, char **output)
{
  /* The legacy interface always runs with both options disabled.  */
  const int allowunassigned = 0;
  const int usestd3asciirules = 0;

  return idna_to_unicode_locale_from_locale (input, output,
                                             allowunassigned,
                                             usestd3asciirules);
}