Add.
[libidn.git] / idna.c
blobc3537adcf39e144e116b0c16da3e4470d2f01a5e
1 /* idna.c Convert to or from IDN strings.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "internal.h"
24 /**
25 * idna_to_ascii
26 * @in: input array with unicode code points.
27 * @inlen: length of input array with unicode code points.
28 * @out: output zero terminated string that must have room for at
29 * least 63 characters plus the terminating zero.
30 * @allowunassigned: boolean value as per IDNA specification.
31 * @usestd3asciirules: boolean value as per IDNA specification.
33 * The ToASCII operation takes a sequence of Unicode code points that make
34 * up one label and transforms it into a sequence of code points in the
35 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
36 * resulting sequence are equivalent labels.
38 * It is important to note that the ToASCII operation can fail. ToASCII
39 * fails if any step of it fails. If any step of the ToASCII operation
40 * fails on any label in a domain name, that domain name MUST NOT be used
41 * as an internationalized domain name. The method for deadling with this
42 * failure is application-specific.
44 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
45 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
46 * sequence of ASCII code points or a failure condition.
48 * ToASCII never alters a sequence of code points that are all in the ASCII
49 * range to begin with (although it could fail). Applying the ToASCII
50 * operation multiple times has exactly the same effect as applying it just
51 * once.
53 * Return value: Returns 0 on success, or an error code.
55 int
56 idna_to_ascii (const unsigned long *in, size_t inlen,
57 char *out, int allowunassigned, int usestd3asciirules)
59 size_t len, outlen;
60 unsigned long *src; /* XXX don't need to copy data? */
61 int rc;
65 * ToASCII consists of the following steps:
67 * 1. If all code points in the sequence are in the ASCII range (0..7F)
68 * then skip to step 3.
72 size_t i;
73 int inasciirange;
75 inasciirange = 1;
76 for (i = 0; in[i]; i++)
77 if (in[i] > 0x7F)
78 inasciirange = 0;
79 if (inasciirange)
81 src = malloc (sizeof (in[0]) * (inlen + 1));
82 if (src == NULL)
83 return IDNA_MALLOC_ERROR;
85 memcpy (src, in, sizeof (in[0]) * inlen);
86 src[inlen] = 0;
88 goto step3;
93 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
94 * an error. The AllowUnassigned flag is used in [NAMEPREP].
98 char *p;
100 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
101 if (p == NULL)
102 return IDNA_MALLOC_ERROR;
104 len = strlen(p);
107 len = 2 * len + 10; /* XXX better guess? */
108 p = realloc (p, len);
109 if (p == NULL)
110 return IDNA_MALLOC_ERROR;
112 if (allowunassigned)
113 rc = stringprep_nameprep (p, len);
114 else
115 rc = stringprep_nameprep_no_unassigned (p, len);
117 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
119 if (rc != STRINGPREP_OK)
121 free(p);
122 return IDNA_STRINGPREP_ERROR;
125 src = stringprep_utf8_to_ucs4 (p, -1, NULL);
127 free(p);
130 step3:
132 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
134 * (a) Verify the absence of non-LDH ASCII code points; that is,
135 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
137 * (b) Verify the absence of leading and trailing hyphen-minus;
138 * that is, the absence of U+002D at the beginning and end of
139 * the sequence.
142 if (usestd3asciirules)
144 size_t i;
146 for (i = 0; src[i]; i++)
147 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
148 (src[i] >= 0x3A && src[i] <= 0x40) ||
149 (src[i] >= 0x5B && src[i] <= 0x60) ||
150 (src[i] >= 0x7B && src[i] <= 0x7F))
152 free(src);
153 return IDNA_CONTAINS_LDH;
156 if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
158 free(src);
159 return IDNA_CONTAINS_MINUS;
164 * 4. If all code points in the sequence are in the ASCII range
165 * (0..7F), then skip to step 8.
169 size_t i;
170 int inasciirange;
172 inasciirange = 1;
173 for (i = 0; src[i]; i++)
175 if (src[i] > 0x7F)
176 inasciirange = 0;
177 /* copy string to output buffer if we are about to skip to step8 */
178 if (i < 64)
179 out[i] = src[i];
181 if (i < 64)
182 out[i] = '\0';
183 if (inasciirange)
184 goto step8;
188 * 5. Verify that the sequence does NOT begin with the ACE prefix.
193 size_t i;
194 int match;
196 match = 1;
197 for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
198 if (((unsigned long)IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
199 match = 0;
200 if (match)
202 free(src);
203 return IDNA_CONTAINS_ACE_PREFIX;
208 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
209 * and fail if there is an error.
211 for (len = 0; src[len]; len++)
213 src[len] = '\0';
214 outlen = 63 - strlen (IDNA_ACE_PREFIX);
215 rc = punycode_encode (len, src, NULL,
216 &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
217 free(src);
218 if (rc != PUNYCODE_SUCCESS)
219 return IDNA_PUNYCODE_ERROR;
220 out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
223 * 7. Prepend the ACE prefix.
226 memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
229 * 8. Verify that the number of code points is in the range 1 to 63
230 * inclusive.
233 step8:
234 if (strlen (out) < 1 || strlen (out) > 63)
235 return IDNA_INVALID_LENGTH;
237 return IDNA_SUCCESS;
240 static int
241 idna_to_unicode_internal (const unsigned long *in, size_t inlen,
242 unsigned long *out, size_t * outlen,
243 int allowunassigned, int usestd3asciirules,
244 char *utf8in, size_t utf8len)
246 int rc;
247 char tmpout[64];
250 * 1. If all code points in the sequence are in the ASCII range (0..7F)
251 * then skip to step 3.
255 size_t i;
256 int inasciirange;
258 inasciirange = 1;
259 for (i = 0; in[i]; i++)
260 if (in[i] > 0x7F)
261 inasciirange = 0;
262 if (inasciirange)
263 goto step3;
267 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
268 * error. (If step 3 of ToASCII is also performed here, it will not
269 * affect the overall behavior of ToUnicode, but it is not
270 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
273 if (allowunassigned)
274 rc = stringprep_nameprep (utf8in, utf8len);
275 else
276 rc = stringprep_nameprep_no_unassigned (utf8in, utf8len);
278 if (rc != STRINGPREP_OK)
279 return IDNA_STRINGPREP_ERROR;
281 /* 3. Verify that the sequence begins with the ACE prefix, and save a
282 * copy of the sequence.
285 step3:
286 if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
287 return IDNA_NO_ACE_PREFIX;
289 /* 4. Remove the ACE prefix.
292 memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
293 strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
295 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
296 * and fail if there is an error. Save a copy of the result of
297 * this step.
300 (*outlen)--; /* reserve one for the zero */
302 rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
303 if (rc != PUNYCODE_SUCCESS)
304 return IDNA_PUNYCODE_ERROR;
306 out[*outlen] = 0; /* add zero */
308 /* 6. Apply ToASCII.
311 rc = idna_to_ascii (out, *outlen, tmpout,
312 allowunassigned, usestd3asciirules);
313 if (rc != IDNA_SUCCESS)
314 return rc;
316 /* 7. Verify that the result of step 6 matches the saved copy from
317 * step 3, using a case-insensitive ASCII comparison.
320 if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
321 return IDNA_ROUNDTRIP_VERIFY_ERROR;
323 /* 8. Return the saved copy from step 5.
326 return IDNA_SUCCESS;
330 * idna_to_unicode
331 * @in: input array with unicode code points.
332 * @inlen: length of input array with unicode code points.
333 * @out: output array with unicode code points.
334 * @outlen: on input, maximum size of output array with unicode code points,
335 * on exit, actual size of output array with unicode code points.
336 * @allowunassigned: boolean value as per IDNA specification.
337 * @usestd3asciirules: boolean value as per IDNA specification.
339 * The ToUnicode operation takes a sequence of Unicode code points
340 * that make up one label and returns a sequence of Unicode code
341 * points. If the input sequence is a label in ACE form, then the
342 * result is an equivalent internationalized label that is not in ACE
343 * form, otherwise the original sequence is returned unaltered.
345 * ToUnicode never fails. If any step fails, then the original input
346 * sequence is returned immediately in that step.
348 * The ToUnicode output never contains more code points than its
349 * input. Note that the number of octets needed to represent a
350 * sequence of code points depends on the particular character
351 * encoding used.
353 * The inputs to ToUnicode are a sequence of code points, the
354 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
355 * ToUnicode is always a sequence of Unicode code points.
357 * Return value: Returns error condition, but it must only be used for
358 * debugging purposes. The output buffer is always
359 * guaranteed to contain the correct data according to
360 * the specification (sans malloc induced errors). NB!
361 * This means that you normally ignore the return code
362 * from this function, as checking it means breaking the
363 * standard.
366 idna_to_unicode (const unsigned long *in, size_t inlen,
367 unsigned long *out, size_t * outlen,
368 int allowunassigned, int usestd3asciirules)
370 int rc;
371 size_t outlensave = *outlen;
372 char *p;
374 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
375 if (p == NULL)
376 return IDNA_MALLOC_ERROR;
378 p = realloc (p, BUFSIZ);
379 if (p == NULL)
380 return IDNA_MALLOC_ERROR;
382 rc = idna_to_unicode_internal (in, inlen, out, outlen,
383 allowunassigned, usestd3asciirules,
384 p, BUFSIZ);
385 if (rc != IDNA_SUCCESS)
387 memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
388 inlen : outlensave));
389 *outlen = inlen;
392 free (p);
394 return rc;
398 * idna_ucs4_to_ace:
399 * @input: zero terminated input Unicode string.
400 * @output: pointer to newly allocated output string.
402 * Convert UCS-4 domain name to ASCII string. The AllowUnassigned
403 * flag is false and std3asciirules flag is false. The domain name
404 * may contain several labels, separated by dots. The output buffer
405 * must be deallocated by the caller.
407 * Return value: Returns IDNA_SUCCESS on success, or error code.
410 idna_ucs4_to_ace (const unsigned long *input, char **output)
412 const unsigned long *start = input;
413 const unsigned long *end = input;
414 char buf[64];
415 char *out = NULL;
416 int rc;
418 *output = NULL;
422 end = start;
424 /* 1) Whenever dots are used as label separators, the following
425 characters MUST be recognized as dots: U+002E (full stop),
426 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
427 U+FF61 (halfwidth ideographic full stop). */
428 for (; *end &&
429 *end != 0x002E &&
430 *end != 0x3002 && *end != 0xFF0E && *end != 0xFF61; end++)
433 rc = idna_to_ascii (start, end - start, buf, 0, 0);
434 if (rc != IDNA_SUCCESS)
435 return rc;
437 if (out)
439 out = realloc (out, strlen (out) + 1 + strlen (buf) + 1);
440 if (!out)
441 return IDNA_MALLOC_ERROR;
442 strcat (out, ".");
443 strcat (out, buf);
445 else
447 out = strdup (buf);
448 if (!out)
449 return IDNA_MALLOC_ERROR;
452 start = end + 1;
454 while (*end);
456 *output = out;
458 return IDNA_SUCCESS;
462 * idna_utf8_to_ace:
463 * @input: zero terminated input UTF-8 string.
464 * @output: pointer to newly allocated output string.
466 * Convert UTF-8 domain name to ASCII string. The AllowUnassigned
467 * flag is false and std3asciirules flag is false. The domain name
468 * may contain several labels, separated by dots. The output buffer
469 * must be deallocated by the caller.
471 * Return value: Returns IDNA_SUCCESS on success, or error code.
474 idna_utf8_to_ace (const char *input, char **output)
476 unsigned long *ucs4;
477 size_t ucs4len;
478 int rc;
480 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
481 if (!ucs4)
482 return IDNA_ICONV_ERROR;
484 rc = idna_ucs4_to_ace (ucs4, output);
485 free (ucs4);
487 return rc;
491 * idna_locale_to_ace:
492 * @input: zero terminated input UTF-8 string.
493 * @output: pointer to newly allocated output string.
495 * Convert domain name in the locale's encoding to ASCII string. The
496 * AllowUnassigned flag is false and std3asciirules flag is false.
497 * The domain name may contain several labels, separated by dots. The
498 * output buffer must be deallocated by the caller.
500 * Return value: Returns IDNA_SUCCESS on success, or error code.
503 idna_locale_to_ace (const char *input, char **output)
505 char *utf8;
506 int rc;
508 utf8 = stringprep_locale_to_utf8 (input);
509 if (!utf8)
510 return IDNA_ICONV_ERROR;
512 rc = idna_utf8_to_ace (utf8, output);
513 free (utf8);
515 return rc;
/* Transforms a (possibly) ACE domain name into Unicode. Every label
   which is not ACE will be output unchanged so you can safely use
   this routine. The output will be encoded in UTF-8. The output must
   be allocated and freed by you. The returned int is a status
   code. */
525 * idna_ucs4ace_to_ucs4:
526 * @input: zero-terminated Unicode string.
527 * @output: pointer to newly allocated output Unicode string.
529 * Convert possibly ACE encoded domain name in UCS-4 format into a
530 * UCS-4 string. The AllowUnassigned flag is false and std3asciirules
531 * flag is false. The domain name may contain several labels,
532 * separated by dots. The output buffer must be deallocated by the
533 * caller.
535 * Return value: Returns IDNA_SUCCESS on success, or error code.
538 idna_ucs4ace_to_ucs4 (const unsigned long *input, unsigned long **output)
540 const unsigned long *start = input;
541 const unsigned long *end = input;
542 unsigned long *buf;
543 size_t buflen;
544 unsigned long *out = NULL;
545 size_t outlen = 0;
546 int rc;
548 *output = NULL;
552 end = start;
554 /* 1) Whenever dots are used as label separators, the following
555 characters MUST be recognized as dots: U+002E (full stop),
556 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
557 U+FF61 (halfwidth ideographic full stop). */
558 for (; *end &&
559 *end != 0x002E &&
560 *end != 0x3002 && *end != 0xFF0E && *end != 0xFF61; end++)
563 buflen = end - start;
564 buf = malloc (sizeof (buf[0]) * (buflen + 1));
565 if (!buf)
566 return IDNA_MALLOC_ERROR;
568 rc = idna_to_unicode (start, end - start, buf, &buflen, 0, 0);
569 /* don't check rc as per specification! */
571 if (out)
573 out = realloc (out, sizeof (out[0]) * (outlen + 1 + buflen + 1));
574 if (!out)
575 return IDNA_MALLOC_ERROR;
576 out[outlen++] = 0x002E; /* '.' (full stop) */
577 memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
578 outlen += buflen;
579 out[outlen] = 0x0;
580 free (buf);
582 else
584 out = buf;
585 outlen = buflen;
586 out[outlen] = 0x0;
589 start = end + 1;
591 while (*end);
593 *output = out;
595 return IDNA_SUCCESS;
599 * idna_utf8ace_to_ucs4:
600 * @input: zero-terminated UTF-8 string.
601 * @output: pointer to newly allocated output Unicode string.
603 * Convert possibly ACE encoded domain name in UTF-8 format into a
604 * UCS-4 string. The AllowUnassigned flag is false and std3asciirules
605 * flag is false. The domain name may contain several labels,
606 * separated by dots. The output buffer must be deallocated by the
607 * caller.
609 * Return value: Returns IDNA_SUCCESS on success, or error code.
612 idna_utf8ace_to_ucs4 (const char *input, unsigned long **output)
614 unsigned long *ucs4;
615 size_t ucs4len;
616 int rc;
618 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
619 if (!ucs4)
620 return IDNA_ICONV_ERROR;
622 rc = idna_ucs4ace_to_ucs4 (ucs4, output);
623 free (ucs4);
625 return rc;
629 * idna_utf8ace_to_utf8:
630 * @input: zero-terminated UTF-8 string.
631 * @output: pointer to newly allocated output UTF-8 string.
633 * Convert possibly ACE encoded domain name in UTF-8 format into a
634 * UTF-8 string. The AllowUnassigned flag is false and std3asciirules
635 * flag is false. The domain name may contain several labels,
636 * separated by dots. The output buffer must be deallocated by the
637 * caller.
639 * Return value: Returns IDNA_SUCCESS on success, or error code.
642 idna_utf8ace_to_utf8 (const char *input, char **output)
644 unsigned long *ucs4;
645 int rc;
647 rc = idna_utf8ace_to_ucs4 (input, &ucs4);
648 *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
649 free (ucs4);
651 if (!*output)
652 return IDNA_ICONV_ERROR;
654 return rc;
658 * idna_utf8ace_to_locale:
659 * @input: zero-terminated UTF-8 string.
660 * @output: pointer to newly allocated output string encoded in the
661 * current locale's character set.
663 * Convert possibly ACE encoded domain name in UTF-8 format into a
664 * string encoded in the current locale's character set. The
665 * AllowUnassigned flag is false and std3asciirules flag is false.
666 * The domain name may contain several labels, separated by dots. The
667 * output buffer must be deallocated by the caller.
669 * Return value: Returns IDNA_SUCCESS on success, or error code.
672 idna_utf8ace_to_locale (const char *input, char **output)
674 char *utf8;
675 int rc;
677 rc = idna_utf8ace_to_utf8 (input, &utf8);
678 *output = stringprep_utf8_to_locale (utf8);
679 free (utf8);
681 if (!*output)
682 return IDNA_ICONV_ERROR;
684 return rc;
688 * idna_localeace_to_locale:
689 * @input: zero-terminated string encoded in the current locale's
690 * character set.
691 * @output: pointer to newly allocated output string encoded in the
692 * current locale's character set.
694 * Convert possibly ACE encoded domain name in the locale's character
695 * set into a string encoded in the current locale's character set.
696 * The AllowUnassigned flag is false and std3asciirules flag is false.
697 * The domain name may contain several labels, separated by dots. The
698 * output buffer must be deallocated by the caller.
700 * Return value: Returns IDNA_SUCCESS on success, or error code.
703 idna_localeace_to_locale (const char *input, char **output)
705 char *utf8;
706 int rc;
708 utf8 = stringprep_locale_to_utf8 (input);
709 if (!utf8)
710 return IDNA_ICONV_ERROR;
712 rc = idna_utf8ace_to_locale (utf8, output);
713 free (utf8);
715 return rc;