Rename stringprep_generic_* to stringprep_rfc3454_*.
[libidn.git] / lib / idna.c
blobc692946654a0ab5a93f842e6e6cbd0515bcf7cee
1 /* idna.c Convert to or from IDN strings.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "internal.h"
24 /* Core functions */
26 /**
27 * idna_to_ascii_4i
28 * @in: input array with unicode code points.
29 * @inlen: length of input array with unicode code points.
30 * @out: output zero terminated string that must have room for at
31 * least 63 characters plus the terminating zero.
32 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
34 * The ToASCII operation takes a sequence of Unicode code points that make
35 * up one label and transforms it into a sequence of code points in the
36 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
37 * resulting sequence are equivalent labels.
39 * It is important to note that the ToASCII operation can fail. ToASCII
40 * fails if any step of it fails. If any step of the ToASCII operation
41 * fails on any label in a domain name, that domain name MUST NOT be used
42 * as an internationalized domain name. The method for dealing with this
43 * failure is application-specific.
45 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
46 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
47 * sequence of ASCII code points or a failure condition.
49 * ToASCII never alters a sequence of code points that are all in the ASCII
50 * range to begin with (although it could fail). Applying the ToASCII
51 * operation multiple times has exactly the same effect as applying it just
52 * once.
54 * Return value: Returns 0 on success, or an error code.
56 int
57 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
59 size_t len, outlen;
60 uint32_t *src; /* XXX don't need to copy data? */
61 int rc;
64 * ToASCII consists of the following steps:
66 * 1. If all code points in the sequence are in the ASCII range (0..7F)
67 * then skip to step 3.
71 size_t i;
72 int inasciirange;
74 inasciirange = 1;
75 for (i = 0; i < inlen; i++)
76 if (in[i] > 0x7F)
77 inasciirange = 0;
78 if (inasciirange)
80 src = malloc (sizeof (in[0]) * (inlen + 1));
81 if (src == NULL)
82 return IDNA_MALLOC_ERROR;
84 memcpy (src, in, sizeof (in[0]) * inlen);
85 src[inlen] = 0;
87 goto step3;
92 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
93 * an error. The AllowUnassigned flag is used in [NAMEPREP].
97 char *p;
99 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
100 if (p == NULL)
101 return IDNA_MALLOC_ERROR;
103 len = strlen (p);
106 len = 2 * len + 10; /* XXX better guess? */
107 p = realloc (p, len);
108 if (p == NULL)
109 return IDNA_MALLOC_ERROR;
111 if (flags & IDNA_ALLOW_UNASSIGNED)
112 rc = stringprep_nameprep (p, len);
113 else
114 rc = stringprep_nameprep_no_unassigned (p, len);
116 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
118 if (rc != STRINGPREP_OK)
120 free (p);
121 return IDNA_STRINGPREP_ERROR;
124 src = stringprep_utf8_to_ucs4 (p, -1, NULL);
126 free (p);
129 step3:
131 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
133 * (a) Verify the absence of non-LDH ASCII code points; that is,
134 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
136 * (b) Verify the absence of leading and trailing hyphen-minus;
137 * that is, the absence of U+002D at the beginning and end of
138 * the sequence.
141 if (flags & IDNA_USE_STD3_ASCII_RULES)
143 size_t i;
145 for (i = 0; src[i]; i++)
146 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
147 (src[i] >= 0x3A && src[i] <= 0x40) ||
148 (src[i] >= 0x5B && src[i] <= 0x60) ||
149 (src[i] >= 0x7B && src[i] <= 0x7F))
151 free (src);
152 return IDNA_CONTAINS_LDH;
155 if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
157 free (src);
158 return IDNA_CONTAINS_MINUS;
163 * 4. If all code points in the sequence are in the ASCII range
164 * (0..7F), then skip to step 8.
168 size_t i;
169 int inasciirange;
171 inasciirange = 1;
172 for (i = 0; src[i]; i++)
174 if (src[i] > 0x7F)
175 inasciirange = 0;
176 /* copy string to output buffer if we are about to skip to step8 */
177 if (i < 64)
178 out[i] = src[i];
180 if (i < 64)
181 out[i] = '\0';
182 if (inasciirange)
183 goto step8;
187 * 5. Verify that the sequence does NOT begin with the ACE prefix.
192 size_t i;
193 int match;
195 match = 1;
196 for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
197 if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
198 match = 0;
199 if (match)
201 free (src);
202 return IDNA_CONTAINS_ACE_PREFIX;
207 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
208 * and fail if there is an error.
210 for (len = 0; src[len]; len++)
212 src[len] = '\0';
213 outlen = 63 - strlen (IDNA_ACE_PREFIX);
214 rc = punycode_encode (len, src, NULL,
215 &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
216 free (src);
217 if (rc != PUNYCODE_SUCCESS)
218 return IDNA_PUNYCODE_ERROR;
219 out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
222 * 7. Prepend the ACE prefix.
225 memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
228 * 8. Verify that the number of code points is in the range 1 to 63
229 * inclusive.
232 step8:
233 if (strlen (out) < 1 || strlen (out) > 63)
234 return IDNA_INVALID_LENGTH;
236 return IDNA_SUCCESS;
239 static int
240 idna_to_unicode_internal (char *utf8in, size_t utf8len,
241 uint32_t * out, size_t * outlen, int flags)
243 int rc;
244 char tmpout[64];
247 * 1. If all code points in the sequence are in the ASCII range (0..7F)
248 * then skip to step 3.
252 size_t i;
253 int inasciirange;
255 inasciirange = 1;
256 for (i = 0; utf8in[i]; i++)
257 if (utf8in[i] & ~0x7F)
258 inasciirange = 0;
259 if (inasciirange)
260 goto step3;
264 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
265 * error. (If step 3 of ToASCII is also performed here, it will not
266 * affect the overall behavior of ToUnicode, but it is not
267 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
270 if (flags & IDNA_ALLOW_UNASSIGNED)
271 rc = stringprep_nameprep (utf8in, utf8len);
272 else
273 rc = stringprep_nameprep_no_unassigned (utf8in, utf8len);
275 if (rc != STRINGPREP_OK)
276 return IDNA_STRINGPREP_ERROR;
278 /* 3. Verify that the sequence begins with the ACE prefix, and save a
279 * copy of the sequence.
282 step3:
283 if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
284 return IDNA_NO_ACE_PREFIX;
286 /* 4. Remove the ACE prefix.
289 memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
290 strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
292 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
293 * and fail if there is an error. Save a copy of the result of
294 * this step.
297 (*outlen)--; /* reserve one for the zero */
299 rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
300 if (rc != PUNYCODE_SUCCESS)
301 return IDNA_PUNYCODE_ERROR;
303 out[*outlen] = 0; /* add zero */
305 /* 6. Apply ToASCII.
308 rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
309 if (rc != IDNA_SUCCESS)
310 return rc;
312 /* 7. Verify that the result of step 6 matches the saved copy from
313 * step 3, using a case-insensitive ASCII comparison.
316 if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
317 return IDNA_ROUNDTRIP_VERIFY_ERROR;
319 /* 8. Return the saved copy from step 5.
322 return IDNA_SUCCESS;
326 * idna_to_unicode_44i
327 * @in: input array with unicode code points.
328 * @inlen: length of input array with unicode code points.
329 * @out: output array with unicode code points.
330 * @outlen: on input, maximum size of output array with unicode code points,
331 * on exit, actual size of output array with unicode code points.
332 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
334 * The ToUnicode operation takes a sequence of Unicode code points
335 * that make up one label and returns a sequence of Unicode code
336 * points. If the input sequence is a label in ACE form, then the
337 * result is an equivalent internationalized label that is not in ACE
338 * form, otherwise the original sequence is returned unaltered.
340 * ToUnicode never fails. If any step fails, then the original input
341 * sequence is returned immediately in that step.
343 * The ToUnicode output never contains more code points than its
344 * input. Note that the number of octets needed to represent a
345 * sequence of code points depends on the particular character
346 * encoding used.
348 * The inputs to ToUnicode are a sequence of code points, the
349 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
350 * ToUnicode is always a sequence of Unicode code points.
352 * Return value: Returns error condition, but it must only be used for
353 * debugging purposes. The output buffer is always
354 * guaranteed to contain the correct data according to
355 * the specification (sans malloc induced errors). NB!
356 * This means that you normally ignore the return code
357 * from this function, as checking it means breaking the
358 * standard.
361 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
362 uint32_t * out, size_t * outlen, int flags)
364 int rc;
365 size_t outlensave = *outlen;
366 char *p;
368 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
369 if (p == NULL)
370 return IDNA_MALLOC_ERROR;
372 p = realloc (p, BUFSIZ);
373 if (p == NULL)
374 return IDNA_MALLOC_ERROR;
376 rc = idna_to_unicode_internal (p, BUFSIZ, out, outlen, flags);
377 if (rc != IDNA_SUCCESS)
379 memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
380 inlen : outlensave));
381 *outlen = inlen;
384 free (p);
386 return rc;
389 /* Wrappers that handle several labels */
392 * idna_to_ascii_4z:
393 * @input: zero terminated input Unicode string.
394 * @output: pointer to newly allocated output string.
395 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
397 * Convert UCS-4 domain name to ASCII string. The domain name may
398 * contain several labels, separated by dots. The output buffer must
399 * be deallocated by the caller.
401 * Return value: Returns IDNA_SUCCESS on success, or error code.
404 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
406 const uint32_t *start = input;
407 const uint32_t *end = input;
408 char buf[64];
409 char *out = NULL;
410 int rc;
412 *output = NULL;
416 end = start;
418 /* 1) Whenever dots are used as label separators, the following
419 characters MUST be recognized as dots: U+002E (full stop),
420 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
421 U+FF61 (halfwidth ideographic full stop). */
422 for (; *end &&
423 *end != 0x002E &&
424 *end != 0x3002 && *end != 0xFF0E && *end != 0xFF61; end++)
427 /* Handle empty trailing labels. The RFC is not clear on this,
428 the text that mandate this behaviour inside a parenthesis in
429 the terminology section. */
430 if (end == start && *end == '\0')
432 strcpy (buf, out ? "" : ".");
434 else
436 rc = idna_to_ascii_4i (start, end - start, buf, flags);
437 if (rc != IDNA_SUCCESS)
438 return rc;
441 if (out)
443 out = realloc (out, strlen (out) + 1 + strlen (buf) + 1);
444 if (!out)
445 return IDNA_MALLOC_ERROR;
446 strcat (out, ".");
447 strcat (out, buf);
449 else
451 out = (char *) malloc (strlen (buf) + 1);
452 if (!out)
453 return IDNA_MALLOC_ERROR;
454 strcpy (out, buf);
457 start = end + 1;
459 while (*end);
461 *output = out;
463 return IDNA_SUCCESS;
467 * idna_to_ascii_8z:
468 * @input: zero terminated input UTF-8 string.
469 * @output: pointer to newly allocated output string.
470 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
472 * Convert UTF-8 domain name to ASCII string. The domain name may
473 * contain several labels, separated by dots. The output buffer must
474 * be deallocated by the caller.
476 * Return value: Returns IDNA_SUCCESS on success, or error code.
479 idna_to_ascii_8z (const char *input, char **output, int flags)
481 uint32_t *ucs4;
482 size_t ucs4len;
483 int rc;
485 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
486 if (!ucs4)
487 return IDNA_ICONV_ERROR;
489 rc = idna_to_ascii_4z (ucs4, output, flags);
491 free (ucs4);
493 return rc;
498 * idna_to_ascii_lz:
499 * @input: zero terminated input string encoded in the current locale's character set.
500 * @output: pointer to newly allocated output string.
501 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
503 * Convert domain name in the locale's encoding to ASCII string. The
504 * domain name may contain several labels, separated by dots. The
505 * output buffer must be deallocated by the caller.
507 * Return value: Returns IDNA_SUCCESS on success, or error code.
510 idna_to_ascii_lz (const char *input, char **output, int flags)
512 char *utf8;
513 int rc;
515 utf8 = stringprep_locale_to_utf8 (input);
516 if (!utf8)
517 return IDNA_ICONV_ERROR;
519 rc = idna_to_ascii_8z (utf8, output, flags);
521 free (utf8);
523 return rc;
527 * idna_to_unicode_4z4z:
528 * @input: zero-terminated Unicode string.
529 * @output: pointer to newly allocated output Unicode string.
530 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
532 * Convert possibly ACE encoded domain name in UCS-4 format into a
533 * UCS-4 string. The domain name may contain several labels,
534 * separated by dots. The output buffer must be deallocated by the
535 * caller.
537 * Return value: Returns IDNA_SUCCESS on success, or error code.
540 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
542 const uint32_t *start = input;
543 const uint32_t *end = input;
544 uint32_t *buf;
545 size_t buflen;
546 uint32_t *out = NULL;
547 size_t outlen = 0;
548 int rc;
550 *output = NULL;
554 end = start;
556 /* 1) Whenever dots are used as label separators, the following
557 characters MUST be recognized as dots: U+002E (full stop),
558 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
559 U+FF61 (halfwidth ideographic full stop). */
560 for (; *end &&
561 *end != 0x002E &&
562 *end != 0x3002 && *end != 0xFF0E && *end != 0xFF61; end++)
565 buflen = end - start;
566 buf = malloc (sizeof (buf[0]) * (buflen + 1));
567 if (!buf)
568 return IDNA_MALLOC_ERROR;
570 rc = idna_to_unicode_44i (start, end - start, buf, &buflen, flags);
571 /* don't check rc as per specification! */
573 if (out)
575 out = realloc (out, sizeof (out[0]) * (outlen + 1 + buflen + 1));
576 if (!out)
577 return IDNA_MALLOC_ERROR;
578 out[outlen++] = 0x002E; /* '.' (full stop) */
579 memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
580 outlen += buflen;
581 out[outlen] = 0x0;
582 free (buf);
584 else
586 out = buf;
587 outlen = buflen;
588 out[outlen] = 0x0;
591 start = end + 1;
593 while (*end);
595 *output = out;
597 return IDNA_SUCCESS;
601 * idna_to_unicode_8z4z:
602 * @input: zero-terminated UTF-8 string.
603 * @output: pointer to newly allocated output Unicode string.
604 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
606 * Convert possibly ACE encoded domain name in UTF-8 format into a
607 * UCS-4 string. The domain name may contain several labels,
608 * separated by dots. The output buffer must be deallocated by the
609 * caller.
611 * Return value: Returns IDNA_SUCCESS on success, or error code.
614 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
616 uint32_t *ucs4;
617 size_t ucs4len;
618 int rc;
620 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
621 if (!ucs4)
622 return IDNA_ICONV_ERROR;
624 rc = idna_to_unicode_4z4z (ucs4, output, flags);
625 free (ucs4);
627 return rc;
631 * idna_to_unicode_8z8z:
632 * @input: zero-terminated UTF-8 string.
633 * @output: pointer to newly allocated output UTF-8 string.
634 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
636 * Convert possibly ACE encoded domain name in UTF-8 format into a
637 * UTF-8 string. The domain name may contain several labels,
638 * separated by dots. The output buffer must be deallocated by the
639 * caller.
641 * Return value: Returns IDNA_SUCCESS on success, or error code.
644 idna_to_unicode_8z8z (const char *input, char **output, int flags)
646 uint32_t *ucs4;
647 int rc;
649 rc = idna_to_unicode_8z4z (input, &ucs4, flags);
650 *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
651 free (ucs4);
653 if (!*output)
654 return IDNA_ICONV_ERROR;
656 return rc;
660 * idna_to_unicode_8zlz:
661 * @input: zero-terminated UTF-8 string.
662 * @output: pointer to newly allocated output string encoded in the
663 * current locale's character set.
664 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
666 * Convert possibly ACE encoded domain name in UTF-8 format into a
667 * string encoded in the current locale's character set. The domain
668 * name may contain several labels, separated by dots. The output
669 * buffer must be deallocated by the caller.
671 * Return value: Returns IDNA_SUCCESS on success, or error code.
674 idna_to_unicode_8zlz (const char *input, char **output, int flags)
676 char *utf8;
677 int rc;
679 rc = idna_to_unicode_8z8z (input, &utf8, flags);
680 *output = stringprep_utf8_to_locale (utf8);
681 free (utf8);
683 if (!*output)
684 return IDNA_ICONV_ERROR;
686 return rc;
690 * idna_to_unicode_lzlz:
691 * @input: zero-terminated string encoded in the current locale's
692 * character set.
693 * @output: pointer to newly allocated output string encoded in the
694 * current locale's character set.
695 * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
697 * Convert possibly ACE encoded domain name in the locale's character
698 * set into a string encoded in the current locale's character set.
699 * The domain name may contain several labels, separated by dots. The
700 * output buffer must be deallocated by the caller.
702 * Return value: Returns IDNA_SUCCESS on success, or error code.
705 idna_to_unicode_lzlz (const char *input, char **output, int flags)
707 char *utf8;
708 int rc;
710 utf8 = stringprep_locale_to_utf8 (input);
711 if (!utf8)
712 return IDNA_ICONV_ERROR;
714 rc = idna_to_unicode_8zlz (utf8, output, flags);
715 free (utf8);
717 return rc;
721 * IDNA_ACE_PREFIX
723 * The IANA allocated prefix to use for IDNA. "xn--"