1 /* idna.c Convert to or from IDN strings.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 * @in: input array with unicode code points.
27 * @inlen: length of input array with unicode code points.
28 * @out: output zero terminated string that must have room for at
29 * least 63 characters plus the terminating zero.
30 * @allowunassigned: boolean value as per IDNA specification.
31 * @usestd3asciirules: boolean value as per IDNA specification.
33 * The ToASCII operation takes a sequence of Unicode code points that make
34 * up one label and transforms it into a sequence of code points in the
35 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
36 * resulting sequence are equivalent labels.
38 * It is important to note that the ToASCII operation can fail. ToASCII
39 * fails if any step of it fails. If any step of the ToASCII operation
40 * fails on any label in a domain name, that domain name MUST NOT be used
41 * as an internationalized domain name. The method for dealing with this
42 * failure is application-specific.
44 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
45 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
46 * sequence of ASCII code points or a failure condition.
48 * ToASCII never alters a sequence of code points that are all in the ASCII
49 * range to begin with (although it could fail). Applying the ToASCII
50 * operation multiple times has exactly the same effect as applying it just once.
53 * Return value: Returns 0 on success, or an error code.
56 idna_to_ascii (const unsigned long *in
, size_t inlen
,
57 char *out
, int allowunassigned
, int usestd3asciirules
)
60 unsigned long *src
; /* XXX don't need to copy data? */
65 * ToASCII consists of the following steps:
67 * 1. If all code points in the sequence are in the ASCII range (0..7F)
68 * then skip to step 3.
76 for (i
= 0; in
[i
]; i
++)
81 src
= malloc (sizeof (in
[0]) * (inlen
+ 1));
83 return IDNA_MALLOC_ERROR
;
85 memcpy (src
, in
, sizeof (in
[0]) * inlen
);
93 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
94 * an error. The AllowUnassigned flag is used in [NAMEPREP].
100 p
= stringprep_ucs4_to_utf8 (in
, inlen
, NULL
, NULL
);
102 return IDNA_MALLOC_ERROR
;
107 len
= 2 * len
+ 10; /* XXX better guess? */
108 p
= realloc (p
, len
);
110 return IDNA_MALLOC_ERROR
;
113 rc
= stringprep_nameprep (p
, len
);
115 rc
= stringprep_nameprep_no_unassigned (p
, len
);
117 while (rc
== STRINGPREP_TOO_SMALL_BUFFER
);
119 if (rc
!= STRINGPREP_OK
)
122 return IDNA_STRINGPREP_ERROR
;
125 src
= stringprep_utf8_to_ucs4 (p
, -1, NULL
);
132 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
134 * (a) Verify the absence of non-LDH ASCII code points; that is,
135 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
137 * (b) Verify the absence of leading and trailing hyphen-minus;
138 * that is, the absence of U+002D at the beginning and end of
142 if (usestd3asciirules
)
146 for (i
= 0; src
[i
]; i
++)
147 if (src
[i
] <= 0x2C || src
[i
] == 0x2E || src
[i
] == 0x2F ||
148 (src
[i
] >= 0x3A && src
[i
] <= 0x40) ||
149 (src
[i
] >= 0x5B && src
[i
] <= 0x60) ||
150 (src
[i
] >= 0x7B && src
[i
] <= 0x7F))
153 return IDNA_CONTAINS_LDH
;
156 if (src
[0] == 0x002D || (i
> 0 && src
[i
- 1] == 0x002D))
159 return IDNA_CONTAINS_MINUS
;
164 * 4. If all code points in the sequence are in the ASCII range
165 * (0..7F), then skip to step 8.
173 for (i
= 0; src
[i
]; i
++)
177 /* copy string to output buffer if we are about to skip to step8 */
188 * 5. Verify that the sequence does NOT begin with the ACE prefix.
197 for (i
= 0; match
&& i
< strlen (IDNA_ACE_PREFIX
); i
++)
198 if (((unsigned long)IDNA_ACE_PREFIX
[i
] & 0xFF) != src
[i
])
203 return IDNA_CONTAINS_ACE_PREFIX
;
208 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
209 * and fail if there is an error.
211 for (len
= 0; src
[len
]; len
++)
214 outlen
= 63 - strlen (IDNA_ACE_PREFIX
);
215 rc
= punycode_encode (len
, src
, NULL
,
216 &outlen
, &out
[strlen (IDNA_ACE_PREFIX
)]);
218 if (rc
!= PUNYCODE_SUCCESS
)
219 return IDNA_PUNYCODE_ERROR
;
220 out
[strlen (IDNA_ACE_PREFIX
) + outlen
] = '\0';
223 * 7. Prepend the ACE prefix.
226 memcpy (out
, IDNA_ACE_PREFIX
, strlen (IDNA_ACE_PREFIX
));
229 * 8. Verify that the number of code points is in the range 1 to 63 inclusive.
234 if (strlen (out
) < 1 || strlen (out
) > 63)
235 return IDNA_INVALID_LENGTH
;
241 idna_to_unicode_internal (const unsigned long *in
, size_t inlen
,
242 unsigned long *out
, size_t * outlen
,
243 int allowunassigned
, int usestd3asciirules
,
244 char *utf8in
, size_t utf8len
)
250 * 1. If all code points in the sequence are in the ASCII range (0..7F)
251 * then skip to step 3.
259 for (i
= 0; in
[i
]; i
++)
267 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
268 * error. (If step 3 of ToASCII is also performed here, it will not
269 * affect the overall behavior of ToUnicode, but it is not
270 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
274 rc
= stringprep_nameprep (utf8in
, utf8len
);
276 rc
= stringprep_nameprep_no_unassigned (utf8in
, utf8len
);
278 if (rc
!= STRINGPREP_OK
)
279 return IDNA_STRINGPREP_ERROR
;
281 /* 3. Verify that the sequence begins with the ACE prefix, and save a
282 * copy of the sequence.
286 if (memcmp (IDNA_ACE_PREFIX
, utf8in
, strlen (IDNA_ACE_PREFIX
)) != 0)
287 return IDNA_NO_ACE_PREFIX
;
289 /* 4. Remove the ACE prefix.
292 memmove (utf8in
, &utf8in
[strlen (IDNA_ACE_PREFIX
)],
293 strlen (utf8in
) - strlen (IDNA_ACE_PREFIX
) + 1);
295 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
296 * and fail if there is an error. Save a copy of the result of
300 (*outlen
)--; /* reserve one for the zero */
302 rc
= punycode_decode (strlen (utf8in
), utf8in
, outlen
, out
, NULL
);
303 if (rc
!= PUNYCODE_SUCCESS
)
304 return IDNA_PUNYCODE_ERROR
;
306 out
[*outlen
] = 0; /* add zero */
311 rc
= idna_to_ascii (out
, *outlen
, tmpout
,
312 allowunassigned
, usestd3asciirules
);
313 if (rc
!= IDNA_SUCCESS
)
316 /* 7. Verify that the result of step 6 matches the saved copy from
317 * step 3, using a case-insensitive ASCII comparison.
320 if (strcasecmp (utf8in
, tmpout
+ strlen (IDNA_ACE_PREFIX
)) != 0)
321 return IDNA_ROUNDTRIP_VERIFY_ERROR
;
323 /* 8. Return the saved copy from step 5.
331 * @in: input array with unicode code points.
332 * @inlen: length of input array with unicode code points.
333 * @out: output array with unicode code points.
334 * @outlen: on input, maximum size of output array with unicode code points,
335 * on exit, actual size of output array with unicode code points.
336 * @allowunassigned: boolean value as per IDNA specification.
337 * @usestd3asciirules: boolean value as per IDNA specification.
339 * The ToUnicode operation takes a sequence of Unicode code points
340 * that make up one label and returns a sequence of Unicode code
341 * points. If the input sequence is a label in ACE form, then the
342 * result is an equivalent internationalized label that is not in ACE
343 * form, otherwise the original sequence is returned unaltered.
345 * ToUnicode never fails. If any step fails, then the original input
346 * sequence is returned immediately in that step.
348 * The ToUnicode output never contains more code points than its
349 * input. Note that the number of octets needed to represent a
350 * sequence of code points depends on the particular character
353 * The inputs to ToUnicode are a sequence of code points, the
354 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
355 * ToUnicode is always a sequence of Unicode code points.
357 * Return value: Returns error condition, but it must only be used for
358 * debugging purposes. The output buffer is always
359 * guaranteed to contain the correct data according to
360 * the specification (sans malloc induced errors). NB!
361 * This means that you normally ignore the return code
362 * from this function, as checking it means breaking the
366 idna_to_unicode (const unsigned long *in
, size_t inlen
,
367 unsigned long *out
, size_t * outlen
,
368 int allowunassigned
, int usestd3asciirules
)
371 size_t outlensave
= *outlen
;
374 p
= stringprep_ucs4_to_utf8 (in
, inlen
, NULL
, NULL
);
376 return IDNA_MALLOC_ERROR
;
378 p
= realloc (p
, BUFSIZ
);
380 return IDNA_MALLOC_ERROR
;
382 rc
= idna_to_unicode_internal (in
, inlen
, out
, outlen
,
383 allowunassigned
, usestd3asciirules
,
385 if (rc
!= IDNA_SUCCESS
)
387 memcpy (out
, in
, sizeof (in
[0]) * (inlen
< outlensave
?
388 inlen
: outlensave
));
399 * @input: zero terminated input Unicode string.
400 * @output: pointer to newly allocated output string.
402 * Convert UCS-4 domain name to ASCII string. The AllowUnassigned
403 * flag is false and std3asciirules flag is false. The domain name
404 * may contain several labels, separated by dots. The output buffer
405 * must be deallocated by the caller.
407 * Return value: Returns IDNA_SUCCESS on success, or error code.
410 idna_ucs4_to_ace (const unsigned long *input
, char **output
)
412 const unsigned long *start
= input
;
413 const unsigned long *end
= input
;
424 /* 1) Whenever dots are used as label separators, the following
425 characters MUST be recognized as dots: U+002E (full stop),
426 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
427 U+FF61 (halfwidth ideographic full stop). */
430 *end
!= 0x3002 && *end
!= 0xFF0E && *end
!= 0xFF61; end
++)
433 rc
= idna_to_ascii (start
, end
- start
, buf
, 0, 0);
434 if (rc
!= IDNA_SUCCESS
)
439 out
= realloc (out
, strlen (out
) + 1 + strlen (buf
) + 1);
441 return IDNA_MALLOC_ERROR
;
449 return IDNA_MALLOC_ERROR
;
463 * @input: zero terminated input UTF-8 string.
464 * @output: pointer to newly allocated output string.
466 * Convert UTF-8 domain name to ASCII string. The AllowUnassigned
467 * flag is false and std3asciirules flag is false. The domain name
468 * may contain several labels, separated by dots. The output buffer
469 * must be deallocated by the caller.
471 * Return value: Returns IDNA_SUCCESS on success, or error code.
474 idna_utf8_to_ace (const char *input
, char **output
)
480 ucs4
= stringprep_utf8_to_ucs4 (input
, -1, &ucs4len
);
482 return IDNA_ICONV_ERROR
;
484 rc
= idna_ucs4_to_ace (ucs4
, output
);
491 * idna_locale_to_ace:
492 * @input: zero terminated input string encoded in the current locale's character set.
493 * @output: pointer to newly allocated output string.
495 * Convert domain name in the locale's encoding to ASCII string. The
496 * AllowUnassigned flag is false and std3asciirules flag is false.
497 * The domain name may contain several labels, separated by dots. The
498 * output buffer must be deallocated by the caller.
500 * Return value: Returns IDNA_SUCCESS on success, or error code.
503 idna_locale_to_ace (const char *input
, char **output
)
508 utf8
= stringprep_locale_to_utf8 (input
);
510 return IDNA_ICONV_ERROR
;
512 rc
= idna_utf8_to_ace (utf8
, output
);
518 /* Transforms a (possibly) ACE domain name into Unicode. Every label
519 which is not ACE will be output unchanged so you can safely use
520 this routine. The output will be encoded in UTF-8. The output must
521 be allocated and freed by you. The returned int is a status
525 * idna_ucs4ace_to_ucs4:
526 * @input: zero-terminated Unicode string.
527 * @output: pointer to newly allocated output Unicode string.
529 * Convert possibly ACE encoded domain name in UCS-4 format into a
530 * UCS-4 string. The AllowUnassigned flag is false and std3asciirules
531 * flag is false. The domain name may contain several labels,
532 * separated by dots. The output buffer must be deallocated by the caller.
535 * Return value: Returns IDNA_SUCCESS on success, or error code.
538 idna_ucs4ace_to_ucs4 (const unsigned long *input
, unsigned long **output
)
540 const unsigned long *start
= input
;
541 const unsigned long *end
= input
;
544 unsigned long *out
= NULL
;
554 /* 1) Whenever dots are used as label separators, the following
555 characters MUST be recognized as dots: U+002E (full stop),
556 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
557 U+FF61 (halfwidth ideographic full stop). */
560 *end
!= 0x3002 && *end
!= 0xFF0E && *end
!= 0xFF61; end
++)
563 buflen
= end
- start
;
564 buf
= malloc (sizeof (buf
[0]) * (buflen
+ 1));
566 return IDNA_MALLOC_ERROR
;
568 rc
= idna_to_unicode (start
, end
- start
, buf
, &buflen
, 0, 0);
569 /* don't check rc as per specification! */
573 out
= realloc (out
, sizeof (out
[0]) * (outlen
+ 1 + buflen
+ 1));
575 return IDNA_MALLOC_ERROR
;
576 out
[outlen
++] = 0x002E; /* '.' (full stop) */
577 memcpy (out
+ outlen
, buf
, sizeof (buf
[0]) * buflen
);
599 * idna_utf8ace_to_ucs4:
600 * @input: zero-terminated UTF-8 string.
601 * @output: pointer to newly allocated output Unicode string.
603 * Convert possibly ACE encoded domain name in UTF-8 format into a
604 * UCS-4 string. The AllowUnassigned flag is false and std3asciirules
605 * flag is false. The domain name may contain several labels,
606 * separated by dots. The output buffer must be deallocated by the caller.
609 * Return value: Returns IDNA_SUCCESS on success, or error code.
612 idna_utf8ace_to_ucs4 (const char *input
, unsigned long **output
)
618 ucs4
= stringprep_utf8_to_ucs4 (input
, -1, &ucs4len
);
620 return IDNA_ICONV_ERROR
;
622 rc
= idna_ucs4ace_to_ucs4 (ucs4
, output
);
629 * idna_utf8ace_to_utf8:
630 * @input: zero-terminated UTF-8 string.
631 * @output: pointer to newly allocated output UTF-8 string.
633 * Convert possibly ACE encoded domain name in UTF-8 format into a
634 * UTF-8 string. The AllowUnassigned flag is false and std3asciirules
635 * flag is false. The domain name may contain several labels,
636 * separated by dots. The output buffer must be deallocated by the caller.
639 * Return value: Returns IDNA_SUCCESS on success, or error code.
642 idna_utf8ace_to_utf8 (const char *input
, char **output
)
647 rc
= idna_utf8ace_to_ucs4 (input
, &ucs4
);
648 *output
= stringprep_ucs4_to_utf8 (ucs4
, -1, NULL
, NULL
);
652 return IDNA_ICONV_ERROR
;
658 * idna_utf8ace_to_locale:
659 * @input: zero-terminated UTF-8 string.
660 * @output: pointer to newly allocated output string encoded in the
661 * current locale's character set.
663 * Convert possibly ACE encoded domain name in UTF-8 format into a
664 * string encoded in the current locale's character set. The
665 * AllowUnassigned flag is false and std3asciirules flag is false.
666 * The domain name may contain several labels, separated by dots. The
667 * output buffer must be deallocated by the caller.
669 * Return value: Returns IDNA_SUCCESS on success, or error code.
672 idna_utf8ace_to_locale (const char *input
, char **output
)
677 rc
= idna_utf8ace_to_utf8 (input
, &utf8
);
678 *output
= stringprep_utf8_to_locale (utf8
);
682 return IDNA_ICONV_ERROR
;
688 * idna_localeace_to_locale:
689 * @input: zero-terminated string encoded in the current locale's character set.
691 * @output: pointer to newly allocated output string encoded in the
692 * current locale's character set.
694 * Convert possibly ACE encoded domain name in the locale's character
695 * set into a string encoded in the current locale's character set.
696 * The AllowUnassigned flag is false and std3asciirules flag is false.
697 * The domain name may contain several labels, separated by dots. The
698 * output buffer must be deallocated by the caller.
700 * Return value: Returns IDNA_SUCCESS on success, or error code.
703 idna_localeace_to_locale (const char *input
, char **output
)
708 utf8
= stringprep_locale_to_utf8 (input
);
710 return IDNA_ICONV_ERROR
;
712 rc
= idna_utf8ace_to_locale (utf8
, output
);