Add unfinished old IDNA stuff.
[libidn.git] / idna.c
blob218da67ef084bb11ec46902642fe8c78aa1a5bd2
/* idna.c	Convert to or from IDN strings.
 * Copyright (C) 2002  Simon Josefsson
 *
 * This file is part of Libstringprep.
 *
 * Libstringprep is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libstringprep is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libstringprep; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
22 #include "internal.h"
24 /** idna_to_ascii
25 * @in: input array with unicode code points.
26 * @inlen: length of input array with unicode code points.
27 * @out: output zero terminated string that must have room for at
28 * least 63 characters plus the terminating zero.
29 * @allowunassigned: boolean value as per IDNA specification.
30 * @usestd3asciirules: boolean value as per IDNA specification.
32 * The ToASCII operation takes a sequence of Unicode code points that make
33 * up one label and transforms it into a sequence of code points in the
34 * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
35 * resulting sequence are equivalent labels.
37 * It is important to note that the ToASCII operation can fail. ToASCII
38 * fails if any step of it fails. If any step of the ToASCII operation
39 * fails on any label in a domain name, that domain name MUST NOT be used
40 * as an internationalized domain name. The method for deadling with this
41 * failure is application-specific.
43 * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
44 * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
45 * sequence of ASCII code points or a failure condition.
47 * ToASCII never alters a sequence of code points that are all in the ASCII
48 * range to begin with (although it could fail). Applying the ToASCII
49 * operation multiple times has exactly the same effect as applying it just
50 * once.
52 int
53 idna_to_ascii (const unsigned long *in, size_t inlen,
54 char *out,
55 int allowunassigned, int usestd3asciirules)
57 size_t len, outlen;
58 unsigned long *src;
59 int rc;
61 src = malloc(sizeof(in[0]) * inlen + 1);
62 if (src == NULL)
63 return IDNA_MALLOC_ERROR;
65 memcpy(src, in, sizeof(in[0]) * inlen);
66 src[inlen] = 0;
69 * ToASCII consists of the following steps:
71 * 1. If all code points in the sequence are in the ASCII range (0..7F)
72 * then skip to step 3.
76 size_t i;
77 int inasciirange;
79 inasciirange = 1;
80 for (i = 0; src[i]; i++)
81 if (src[i] > 0x7F)
82 inasciirange = 0;
83 if (inasciirange)
84 goto step3;
88 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
89 * an error. The AllowUnassigned flag is used in [NAMEPREP].
93 char *p;
95 p = stringprep_ucs4_to_utf8 (src, inlen, NULL, NULL);
96 if (p == NULL)
97 return IDNA_MALLOC_ERROR;
99 p = realloc(p, BUFSIZ);
100 if (p == NULL)
101 return IDNA_MALLOC_ERROR;
103 if (allowunassigned)
104 rc = stringprep_nameprep(p, BUFSIZ);
105 else
106 rc = stringprep_nameprep_no_unassigned(p, BUFSIZ);
108 if (rc != STRINGPREP_OK)
109 return IDNA_STRINGPREP_ERROR;
111 free(src);
113 src = stringprep_utf8_to_ucs4(p, -1, NULL);
116 step3:
118 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
120 * (a) Verify the absence of non-LDH ASCII code points; that is,
121 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
123 * (b) Verify the absence of leading and trailing hyphen-minus;
124 * that is, the absence of U+002D at the beginning and end of
125 * the sequence.
128 if (usestd3asciirules)
130 size_t i;
132 for (i = 0; src[i]; i++)
133 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
134 (src[i] >= 0x3A && src[i] <= 0x40) ||
135 (src[i] >= 0x5B && src[i] <= 0x60) ||
136 (src[i] >= 0x7B && src[i] <= 0x7F))
137 return IDNA_CONTAINS_LDH;
139 if (src[0] == 0x002D || (i > 0 && src[i-1] == 0x002D))
140 return IDNA_CONTAINS_MINUS;
144 * 4. If all code points in the sequence are in the ASCII range
145 * (0..7F), then skip to step 8.
149 size_t i;
150 int inasciirange;
152 inasciirange = 1;
153 for (i = 0; src[i]; i++)
154 if (src[i] > 0x7F)
155 inasciirange = 0;
156 if (inasciirange)
157 goto step8;
161 * 5. Verify that the sequence does NOT begin with the ACE prefix.
166 /* XXX */
170 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
171 * and fail if there is an error.
173 for (len = 0; src[len]; len++)
175 src[len] = '\0';
176 outlen = 63 - strlen(IDNA_ACE_PREFIX);
177 rc = punycode_encode (len, src, NULL,
178 &outlen, &out[strlen(IDNA_ACE_PREFIX)]);
179 if (rc != PUNYCODE_SUCCESS)
180 return IDNA_PUNYCODE_ERROR;
181 if (outlen > 63)
182 return IDNA_PUNYCODE_ERROR;
183 out[strlen(IDNA_ACE_PREFIX) + outlen] = '\0';
186 * 7. Prepend the ACE prefix.
189 memcpy(out, IDNA_ACE_PREFIX, strlen(IDNA_ACE_PREFIX));
192 * 8. Verify that the number of code points is in the range 1 to 63
193 * inclusive.
196 step8:
197 if (strlen(out) < 1 || strlen(out) > 63)
198 return IDNA_INVALID_LENGTH;
200 return IDNA_SUCCESS;
203 /** idna_to_unicode
204 * @in: input array with unicode code points.
205 * @inlen: length of input array with unicode code points.
206 * @out: output array with unicode code points.
207 * @outlen: on input, maximum size of output array with unicode code points,
208 * on exit, actual size of output array with unicode code points.
209 * @allowunassigned: boolean value as per IDNA specification.
210 * @usestd3asciirules: boolean value as per IDNA specification.
212 * The ToUnicode operation takes a sequence of Unicode code points
213 * that make up one label and returns a sequence of Unicode code
214 * points. If the input sequence is a label in ACE form, then the
215 * result is an equivalent internationalized label that is not in ACE
216 * form, otherwise the original sequence is returned unaltered.
218 * ToUnicode never fails. If any step fails, then the original input
219 * sequence is returned immediately in that step.
221 * The ToUnicode output never contains more code points than its
222 * input. Note that the number of octets needed to represent a
223 * sequence of code points depends on the particular character
224 * encoding used.
226 * The inputs to ToUnicode are a sequence of code points, the
227 * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
228 * ToUnicode is always a sequence of Unicode code points.
231 idna_to_unicode (const unsigned long *in, size_t inlen,
232 unsigned long *out, size_t *outlen,
233 int allowunassigned, int usestd3asciirules)
235 char *p;
236 int rc;
237 char *src;
240 * 1. If all code points in the sequence are in the ASCII range (0..7F)
241 * then skip to step 3.
245 size_t i;
246 int inasciirange;
248 inasciirange = 1;
249 for (i = 0; in[i]; i++)
250 if (in[i] > 0x7F)
251 inasciirange = 0;
252 if (inasciirange)
253 goto step3;
257 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
258 * error. (If step 3 of ToASCII is also performed here, it will not
259 * affect the overall behavior of ToUnicode, but it is not
260 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
263 p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
264 if (p == NULL)
265 return IDNA_MALLOC_ERROR;
267 p = realloc(p, BUFSIZ);
268 if (p == NULL)
269 return IDNA_MALLOC_ERROR;
271 if (allowunassigned)
272 rc = stringprep_nameprep(p, BUFSIZ);
273 else
274 rc = stringprep_nameprep_no_unassigned(p, BUFSIZ);
276 if (rc != STRINGPREP_OK)
277 return IDNA_STRINGPREP_ERROR;
279 free(src);
281 src = stringprep_utf8_to_ucs4(p, -1, NULL);
283 /* 3. Verify that the sequence begins with the ACE prefix, and save a
284 * copy of the sequence.
287 step3:
288 if (memcmp(IDNA_ACE_PREFIX, p, strlen(IDNA_ACE_PREFIX)) != 0)
289 return IDNA_NO_ACE_PREFIX;
291 /* 4. Remove the ACE prefix.
294 memmove(p, &p[strlen(IDNA_ACE_PREFIX)], strlen(p)-strlen(IDNA_ACE_PREFIX));
296 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
297 * and fail if there is an error. Save a copy of the result of
298 * this step.
303 /* 6. Apply ToASCII.
306 /* 7. Verify that the result of step 6 matches the saved copy from
307 * step 3, using a case-insensitive ASCII comparison.
310 /* 8. Return the saved copy from step 5.
313 return IDNA_SUCCESS;