dist dummy.c.
[libidn.git] / idna.c
blobb8c132bb9a7de7e0164048d0fdbf8fa3b28a832a
1 /* idna.c Convert to or from IDN strings.
2 * Copyright (C) 2002 Simon Josefsson
4 * This file is part of Libstringprep.
6 * Libstringprep is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libstringprep is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libstringprep; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "internal.h"
24 /**
25 * idna_to_ascii
26 * @in: input array with unicode code points.
27 * @inlen: length of input array with unicode code points.
28 * @out: output zero terminated string that must have room for at
29 * least 63 characters plus the terminating zero.
30 * @allowunassigned: boolean value as per IDNA specification.
31 * @usestd3asciirules: boolean value as per IDNA specification.
33 * The ToASCII operation takes a sequence of Unicode code points that make
34 * up one label and transforms it into a sequence of code points in the
35 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
36 * resulting sequence are equivalent labels.
38 * It is important to note that the ToASCII operation can fail. ToASCII
39 * fails if any step of it fails. If any step of the ToASCII operation
40 * fails on any label in a domain name, that domain name MUST NOT be used
41 * as an internationalized domain name. The method for deadling with this
42 * failure is application-specific.
44 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
45 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
46 * sequence of ASCII code points or a failure condition.
48 * ToASCII never alters a sequence of code points that are all in the ASCII
49 * range to begin with (although it could fail). Applying the ToASCII
50 * operation multiple times has exactly the same effect as applying it just
51 * once.
53 * Return value: Returns 0 on success, or an error code.
55 int
56 idna_to_ascii (const unsigned long *in, size_t inlen,
57 char *out, int allowunassigned, int usestd3asciirules)
59 size_t len, outlen;
60 unsigned long *src;
61 int rc;
63 src = malloc (sizeof (in[0]) * inlen + 1);
64 if (src == NULL)
65 return IDNA_MALLOC_ERROR;
67 memcpy (src, in, sizeof (in[0]) * inlen);
68 src[inlen] = 0;
71 * ToASCII consists of the following steps:
73 * 1. If all code points in the sequence are in the ASCII range (0..7F)
74 * then skip to step 3.
78 size_t i;
79 int inasciirange;
81 inasciirange = 1;
82 for (i = 0; src[i]; i++)
83 if (src[i] > 0x7F)
84 inasciirange = 0;
85 if (inasciirange)
86 goto step3;
90 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
91 * an error. The AllowUnassigned flag is used in [NAMEPREP].
95 char *p;
97 p = stringprep_ucs4_to_utf8 (src, inlen, NULL, NULL);
98 if (p == NULL)
99 return IDNA_MALLOC_ERROR;
101 p = realloc (p, BUFSIZ);
102 if (p == NULL)
103 return IDNA_MALLOC_ERROR;
105 if (allowunassigned)
106 rc = stringprep_nameprep (p, BUFSIZ);
107 else
108 rc = stringprep_nameprep_no_unassigned (p, BUFSIZ);
110 if (rc != STRINGPREP_OK)
111 return IDNA_STRINGPREP_ERROR;
113 free (src);
115 src = stringprep_utf8_to_ucs4 (p, -1, NULL);
118 step3:
120 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
122 * (a) Verify the absence of non-LDH ASCII code points; that is,
123 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
125 * (b) Verify the absence of leading and trailing hyphen-minus;
126 * that is, the absence of U+002D at the beginning and end of
127 * the sequence.
130 if (usestd3asciirules)
132 size_t i;
134 for (i = 0; src[i]; i++)
135 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
136 (src[i] >= 0x3A && src[i] <= 0x40) ||
137 (src[i] >= 0x5B && src[i] <= 0x60) ||
138 (src[i] >= 0x7B && src[i] <= 0x7F))
139 return IDNA_CONTAINS_LDH;
141 if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
142 return IDNA_CONTAINS_MINUS;
146 * 4. If all code points in the sequence are in the ASCII range
147 * (0..7F), then skip to step 8.
151 size_t i;
152 int inasciirange;
154 inasciirange = 1;
155 for (i = 0; src[i]; i++)
156 if (src[i] > 0x7F)
157 inasciirange = 0;
158 if (inasciirange)
159 goto step8;
163 * 5. Verify that the sequence does NOT begin with the ACE prefix.
168 /* XXX */
172 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
173 * and fail if there is an error.
175 for (len = 0; src[len]; len++)
177 src[len] = '\0';
178 outlen = 63 - strlen (IDNA_ACE_PREFIX);
179 rc = punycode_encode (len, src, NULL,
180 &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
181 if (rc != PUNYCODE_SUCCESS)
182 return IDNA_PUNYCODE_ERROR;
183 out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
186 * 7. Prepend the ACE prefix.
189 memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
192 * 8. Verify that the number of code points is in the range 1 to 63
193 * inclusive.
196 step8:
197 if (strlen (out) < 1 || strlen (out) > 63)
198 return IDNA_INVALID_LENGTH;
200 return IDNA_SUCCESS;
203 static int
204 idna_to_unicode_internal (const unsigned long *in, size_t inlen,
205 unsigned long *out, size_t * outlen,
206 int allowunassigned, int usestd3asciirules,
207 char *utf8in, size_t utf8len)
209 int rc;
210 char tmpout[64];
213 * 1. If all code points in the sequence are in the ASCII range (0..7F)
214 * then skip to step 3.
218 size_t i;
219 int inasciirange;
221 inasciirange = 1;
222 for (i = 0; in[i]; i++)
223 if (in[i] > 0x7F)
224 inasciirange = 0;
225 if (inasciirange)
226 goto step3;
230 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
231 * error. (If step 3 of ToASCII is also performed here, it will not
232 * affect the overall behavior of ToUnicode, but it is not
233 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
236 if (allowunassigned)
237 rc = stringprep_nameprep (utf8in, utf8len);
238 else
239 rc = stringprep_nameprep_no_unassigned (utf8in, utf8len);
241 if (rc != STRINGPREP_OK)
242 return IDNA_STRINGPREP_ERROR;
244 /* 3. Verify that the sequence begins with the ACE prefix, and save a
245 * copy of the sequence.
248 step3:
249 if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
250 return IDNA_NO_ACE_PREFIX;
252 /* 4. Remove the ACE prefix.
255 memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
256 strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
258 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
259 * and fail if there is an error. Save a copy of the result of
260 * this step.
263 rc = punycode_decode (strlen(utf8in), utf8in, outlen, out, NULL);
264 if (rc != PUNYCODE_SUCCESS)
265 return IDNA_PUNYCODE_ERROR;
267 /* 6. Apply ToASCII.
270 rc = idna_to_ascii (out, *outlen, tmpout,
271 allowunassigned, usestd3asciirules);
272 if (rc != IDNA_SUCCESS)
273 return rc;
275 /* 7. Verify that the result of step 6 matches the saved copy from
276 * step 3, using a case-insensitive ASCII comparison.
279 if (strcasecmp(utf8in, tmpout + strlen(IDNA_ACE_PREFIX)) != 0)
280 return IDNA_ROUNDTRIP_VERIFY_ERROR;
282 /* 8. Return the saved copy from step 5.
285 return IDNA_SUCCESS;
289 * idna_to_unicode
290 * @in: input array with unicode code points.
291 * @inlen: length of input array with unicode code points.
292 * @out: output array with unicode code points.
293 * @outlen: on input, maximum size of output array with unicode code points,
294 * on exit, actual size of output array with unicode code points.
295 * @allowunassigned: boolean value as per IDNA specification.
296 * @usestd3asciirules: boolean value as per IDNA specification.
298 * The ToUnicode operation takes a sequence of Unicode code points
299 * that make up one label and returns a sequence of Unicode code
300 * points. If the input sequence is a label in ACE form, then the
301 * result is an equivalent internationalized label that is not in ACE
302 * form, otherwise the original sequence is returned unaltered.
304 * ToUnicode never fails. If any step fails, then the original input
305 * sequence is returned immediately in that step.
307 * The ToUnicode output never contains more code points than its
308 * input. Note that the number of octets needed to represent a
309 * sequence of code points depends on the particular character
310 * encoding used.
312 * The inputs to ToUnicode are a sequence of code points, the
313 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
314 * ToUnicode is always a sequence of Unicode code points.
316 * Return value: Returns error condition, but it must only be used for
317 * debugging purposes. The output buffer is always
318 * guaranteed to contain the correct data according to
319 * the specification (sans malloc induced errors). NB!
320 * This means that you normally ignore the return code
321 * from this function, as checking it means breaking the
322 * standard.
325 idna_to_unicode (const unsigned long *in, size_t inlen,
326 unsigned long *out, size_t * outlen,
327 int allowunassigned, int usestd3asciirules)
329 int rc;
330 int outlensave = *outlen;
331 char *p;
333 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
334 if (p == NULL)
335 return IDNA_MALLOC_ERROR;
337 p = realloc (p, BUFSIZ);
338 if (p == NULL)
339 return IDNA_MALLOC_ERROR;
341 rc = idna_to_unicode_internal (in, inlen, out, outlen,
342 allowunassigned, usestd3asciirules,
343 p, BUFSIZ);
344 if (rc != IDNA_SUCCESS)
346 memcpy(out, in,
347 sizeof (in[0]) * (inlen < outlensave ? inlen : outlensave));
348 *outlen = inlen;
351 free(p);
353 return rc;