Use new API.
[libidn.git] / lib / tld.c
blob72d8560ca55df9896106288af9b7c367ee76adad
1 /* tld.c --- Handle TLD restriction checking.
2 * Copyright (C) 2003, 2004 Free Software Foundation, Inc.
4 * Author: Thomas Jacob, Internet24.de
6 * This file is part of GNU Libidn.
8 * GNU Libidn is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * GNU Libidn is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with GNU Libidn; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
25 #include <stringprep.h>
27 /* Get strcmp(). */
28 #include <string.h>
30 /* Get specifications. */
31 #include <tld.h>
33 /**
34 * tld_get_table:
35 * @tld_str: TLD name (e.g. "com") as zero terminated ASCII byte string.
36 * @xtra_tlds: Additional well-formed info-structures for TLDs or %NULL.
38 * Return value: Return structure corresponding to TLD @tld_str, first
39 * looking through @xtra_tlds then thru built-in list, or %NULL if no
40 * such structure found.
42 const Tld_table *
43 tld_get_table (const char *tld_str, const Tld_table ** xtra_tlds)
45 const Tld_table **tld = NULL;
47 if (!tld_str)
48 return NULL;
50 /* First search custom tlds. */
51 if (xtra_tlds)
52 for (tld = xtra_tlds; *tld; tld++)
53 if (!strcmp ((*tld)->name, tld_str))
54 return *tld;
56 /* Then search the internal stuff. */
57 for (tld = tld_table; *tld; tld++)
58 if (!strcmp ((*tld)->name, tld_str))
59 return *tld;
61 return NULL;
/* Non-zero if C is one of the dot code points: U+002E (full stop),
   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop) or
   U+FF61 (halfwidth ideographic full stop).  C is evaluated up to
   four times, so it must not have side effects. */
#define DOTP(c)                 \
  ((c) == 0x002E                \
   || (c) == 0x3002             \
   || (c) == 0xFF0E             \
   || (c) == 0xFF61)
68 * tld_checkchar:
69 * @ch: 32 bit unicode character to check.
70 * @tld: Tld_table data structure to check @ch against
72 * Verify if @ch is either in [a-z0-9-.] or mentioned
73 * as a legal character in @tld.
75 * Return value: Return %TLD_SUCCESS if @ch is a legal character for
76 * the TLD @tld or if @tld is %NULL, %TLD_ILLEGAL if @ch is not a
77 * legal as defined by @tld.
79 static int
80 _tld_checkchar (uint32_t ch, const Tld_table * tld)
82 const Tld_table_element *p;
83 size_t i;
84 int found = 0;
86 if (!tld)
87 return TLD_SUCCESS;
89 /* Check for [-a-z0-9.]. */
90 if ((ch >= 0x61 && ch <= 0x7A) ||
91 (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch))
92 return TLD_SUCCESS;
94 /* FIXME: replace searches by bsearch like stuff. */
96 for (p = *tld->valid, i = 0; i < tld->nvalid; i++, p++)
97 if (ch >= p->start && ch <= p->end)
98 return TLD_SUCCESS;
100 return TLD_ILLEGAL;
104 * tld_gettld_4i:
105 * @in: Array of unicode code points to process (Does not need to be
106 * zero terminated).
107 * @inlen: Number of unicode code points.
108 * @out: Zero terminated ascii result string pointer.
110 * Isolate the top-level domain of @in and return it as
111 * an ascii string in @out.
113 * Return value: Return %TLD_SUCCESS on success, the corresponding
114 * error code otherwise.
117 tld_gettld_4i (const uint32_t * in, size_t inlen, char **out)
119 const uint32_t *ipos;
120 size_t olen;
122 *out = NULL;
123 if (!in || (inlen <= 0))
124 return TLD_NODATA;
126 ipos = &in[inlen - 1];
127 olen = 0;
128 /* Scan backwards for non(latin)letters. */
129 while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) ||
130 (*ipos >= 0x61 && *ipos <= 0x7A)))
131 ipos--, olen++;
133 if (olen > 0 && DOTP (*ipos)) /* Found something that appears a TLD. */
135 char *out_s = malloc (sizeof (char) * (olen + 1));
136 char *opos = out_s;
138 if (!opos)
139 return TLD_MALLOC_ERROR;
141 ipos++;
142 /* Transcribe to lowercase ascii string. */
143 for (; ipos < &in[inlen]; ipos++, opos++)
144 *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20;
145 *opos = 0;
146 *out = out_s;
147 return TLD_SUCCESS;
150 return TLD_NOTLD;
155 * tld_gettld_4z:
156 * @in: Zero terminated array of unicode code points to process.
157 * @out: Zero terminated ascii result string pointer.
159 * Isolate the top-level domain of @in and return it as
160 * an ascii string in @out.
162 * Return value: Returns %TLD_SUCCESS on success, the corresponding
163 * error code otherwise.
166 tld_gettld_4z (const uint32_t * in, char **out)
168 const uint32_t *ipos = in;
170 if (!in)
171 return TLD_NODATA;
173 while (*ipos)
174 ipos++;
176 return tld_gettld_4i (in, ipos - in, out);
181 * tld_check_4it
182 * @in: Array of unicode code points to process (Does not need to be
183 * zero terminated).
184 * @inlen: Number of unicode code points.
185 * @errpos: Position of offending character is returned here.
186 * @tld: Data structure representing the restrictions for
187 * which the input should be tested.
189 * Test each of the code points in @in for whether or not
190 * they are allowed by the data structure in @tld, return
191 * the position of the first character for which this is not
192 * the case in @errpos.
194 * Return value: Returns %TLD_SUCCESS if all code points
195 * are valid or when @tld is null, %TLD_ILLEGAL if a
196 * character is not allowed, or additional error codes on
197 * general failure conditions.
200 tld_check_4it (const uint32_t * in, size_t inlen, size_t * errpos,
201 const Tld_table * tld)
203 const uint32_t *ipos;
204 int rc;
206 if (!tld) /* No data for TLD so everything is valid. */
207 return TLD_SUCCESS;
209 ipos = in;
210 while (ipos < &in[inlen])
212 rc = _tld_checkchar (*ipos, tld);
213 if (rc != TLD_SUCCESS)
215 if (errpos)
217 *errpos = ipos - in;
219 return rc;
221 ipos++;
223 return TLD_SUCCESS;
227 * tld_check_4zt
228 * @in: Zero terminated array of unicode code points to process.
229 * @errpos: Position of offending character is returned here.
230 * @tld: Data structure representing the restrictions for
231 * which the input should be tested.
233 * Test each of the code points in @in for whether or not
234 * they are allowed by the data structure in @tld, return
235 * the position of the first character for which this is not
236 * the case in @errpos.
238 * Return value: Returns %TLD_SUCCESS if all code points
239 * are valid or when @tld is null, %TLD_ILLEGAL if a
240 * character is not allowed, or additional error codes on
241 * general failure conditions.
244 tld_check_4zt (const uint32_t * in, size_t * errpos, const Tld_table * tld)
246 const uint32_t *ipos = in;
248 if (!ipos)
249 return TLD_NODATA;
251 while (*ipos)
252 ipos++;
254 return tld_check_4it (in, ipos - in, errpos, tld);
258 * tld_check_4i
259 * @in: Array of unicode code points to process (Does not need to be
260 * zero terminated).
261 * @inlen: Number of unicode code points.
262 * @errpos: Position of offending character is returned here.
263 * @xtra_tlds: An array of additional domain restriction structures
264 * that complement and supersede the built-in information.
266 * Test each of the code points in @in for whether or not they are
267 * allowed by the information in @xtra_tlds or by the built-in TLD
268 * restriction data. When data for the same TLD is available both
269 * internally and in @xtra_tlds, the information in @xtra_tlds takes
270 * precedence. If several entries for a specific TLD are found, the
271 * first one is used. If @xtra_tlds is %NULL, only the built-in
272 * information is used. The position of the first offending character
273 * is returned in @errpos.
275 * Return value: Returns %TLD_SUCCESS if all code points
276 * are valid or when @tld is null, %TLD_ILLEGAL if a
277 * character is not allowed, or additional error codes on
278 * general failure conditions.
281 tld_check_4i (const uint32_t * in, size_t inlen, size_t * errpos,
282 const Tld_table ** xtra_tlds)
284 const uint32_t *ipos;
285 const Tld_table *tld;
286 char *domain;
287 int rc;
289 *errpos = 0;
290 /* Get TLD name. */
291 rc = tld_gettld_4i (in, inlen, &domain);
293 if (rc != TLD_SUCCESS)
295 if (rc == TLD_NOTLD) /* No TLD, say OK */
296 return TLD_SUCCESS;
297 else
298 return rc;
301 /* Retrieve appropriate data structure. */
302 tld = tld_get_table (domain, xtra_tlds);
303 free (domain);
305 return tld_check_4it (in, inlen, errpos, tld);
309 * tld_check_4z
310 * @in: Zero-terminated array of unicode code points to process.
311 * @errpos: Position of offending character is returned here.
312 * @xtra_tlds: An array of additional domain restriction structures
313 * that complement and supersede the built-in information.
315 * Test each of the code points in @in for whether or not they are
316 * allowed by the information in @xtra_tlds or by the built-in TLD
317 * restriction data. When data for the same TLD is available both
318 * internally and in @xtra_tlds, the information in @xtra_tlds takes
319 * precedence. If several entries for a specific TLD are found, the
320 * first one is used. If @xtra_tlds is %NULL, only the built-in
321 * information is used. The position of the first offending character
322 * is returned in @errpos.
324 * Return value: Returns %TLD_SUCCESS if all code points
325 * are valid or when @tld is null, %TLD_ILLEGAL if a
326 * character is not allowed, or additional error codes on
327 * general failure conditions.
330 tld_check_4z (const uint32_t * in, size_t * errpos,
331 const Tld_table ** xtra_tlds)
333 const uint32_t *ipos = in;
335 if (!ipos)
336 return TLD_NODATA;
338 while (*ipos)
339 ipos++;
341 return tld_check_4i (in, ipos - in, errpos, xtra_tlds);
345 * tld_check_8z
346 * @in: Zero-terminated UTF8 string to process.
347 * @errpos: Position of offending character is returned here.
348 * @xtra_tlds: An array of additional domain restriction structures
349 * that complement and supersede the built-in information.
351 * Test each of the characters in @in for whether or not they are
352 * allowed by the information in @xtra_tlds or by the built-in TLD
353 * restriction data. When data for the same TLD is available both
354 * internally and in @xtra_tlds, the information in @xtra_tlds takes
355 * precedence. If several entries for a specific TLD are found, the
356 * first one is used. If @xtra_tlds is %NULL, only the built-in
357 * information is used. The position of the first offending character
358 * is returned in @errpos. Note that the error position refers to the
359 * decoded character offset rather than the byte position in the
360 * string.
362 * Return value: Returns %TLD_SUCCESS if all characters
363 * are valid or when @tld is null, %TLD_ILLEGAL if a
364 * character is not allowed, or additional error codes on
365 * general failure conditions.
368 tld_check_8z (const char *in, size_t * errpos, const Tld_table ** xtra_tlds)
370 uint32_t *iucs;
371 size_t ilen;
372 int rc;
374 if (!in)
375 return TLD_NODATA;
377 iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen);
379 if (!iucs)
380 return TLD_MALLOC_ERROR;
382 rc = tld_check_4i (iucs, ilen, errpos, xtra_tlds);
384 free (iucs);
386 return rc;
390 * tld_check_lz
391 * @in: Zero-terminated string in the current locales encoding to process.
392 * @errpos: Position of offending character is returned here.
393 * @xtra_tlds: An array of additional domain restriction structures
394 * that complement and supersede the built-in information.
396 * Test each of the characters in @in for whether or not they are
397 * allowed by the information in @xtra_tlds or by the built-in TLD
398 * restriction data. When data for the same TLD is available both
399 * internally and in @xtra_tlds, the information in @xtra_tlds takes
400 * precedence. If several entries for a specific TLD are found, the
401 * first one is used. If @xtra_tlds is %NULL, only the built-in
402 * information is used. The position of the first offending character
403 * is returned in @errpos. Note that the error position refers to the
404 * decoded character offset rather than the byte position in the
405 * string.
407 * Return value: Returns %TLD_SUCCESS if all characters
408 * are valid or when @tld is null, %TLD_ILLEGAL if a
409 * character is not allowed, or additional error codes on
410 * general failure conditions.
413 tld_check_lz (const char *in, size_t * errpos, const Tld_table ** xtra_tlds)
415 char *utf8;
416 int rc;
418 if (!in)
419 return TLD_NODATA;
421 utf8 = stringprep_locale_to_utf8 (in);
422 if (!utf8)
423 return TLD_ICONV_ERROR;
426 rc = tld_check_8z (utf8, errpos, xtra_tlds);
428 free (utf8);
430 return rc;
434 * Tldchk_rc:
435 * @TLD_SUCCESS: Successful operation. This value is guaranteed to
436 * always be zero, the remaining ones are only guaranteed to hold
437 * non-zero values, for logical comparison purposes.
438 * @TLD_ILLEGAL: Illegal character found.
439 * @TLD_NODATA: No input data was provided.
440 * @TLD_MALLOC_ERROR: Error during memory allocation.
441 * @TLD_ICONV_ERROR: Error during iconv string conversion.
442 * @TLD_NOTLD: No top-level domain found in domain string.
444 * Enumerated return codes of the TLD checking functions.
445 * The value 0 is guaranteed to always correspond to success.