(Tld_table): Change valid definition.
[libidn.git] / lib / tld.c
blob94d163da7a46b19326657ee28876ca61aec4943d
1 /* tld.c --- Handle TLD restriction checking.
2 * Copyright (C) 2003, 2004 Free Software Foundation, Inc.
4 * Author: Thomas Jacob, Internet24.de
6 * This file is part of GNU Libidn.
8 * GNU Libidn is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * GNU Libidn is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with GNU Libidn; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
#include <stringprep.h>

/* Get malloc, calloc, free. */
#include <stdlib.h>

/* Get strcmp, strlen. */
#include <string.h>

/* Get specifications. */
#include <tld.h>
33 /* Array of built-in domain restriction structures. See tlds.c. */
34 extern const Tld_table *_tld_tables[];
36 /**
37 * tld_get_table:
38 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
39 * @tables: Zero terminated array of info-structures for TLDs.
41 * Get the TLD table for a named TLD by searching through the given
42 * TLD table array.
44 * Return value: Return structure corresponding to TLD @tld by going
45 * thru @tables, or return %NULL if no such structure is found.
47 const Tld_table *
48 tld_get_table (const char *tld, const Tld_table ** tables)
50 const Tld_table **tldtable = NULL;
52 if (!tld || !tables)
53 return NULL;
55 for (tldtable = tables; *tldtable; tldtable++)
56 if (!strcmp ((*tldtable)->name, tld))
57 return *tldtable;
59 return NULL;
62 /**
63 * tld_default_table:
64 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
65 * @overrides: Additional well-formed info-structures for TLDs, or %NULL
66 * to only use library deault tables.
68 * Get the TLD table for a named TLD, using the internal defaults,
69 * possibly overrided by the (optional) supplied tables.
71 * Return value: Return structure corresponding to TLD @tld_str, first
72 * looking through @overrides then thru built-in list, or %NULL if no
73 * such structure found.
75 const Tld_table *
76 tld_default_table (const char *tld, const Tld_table ** overrides)
78 const Tld_table *tldtable = NULL;
80 if (!tld)
81 return NULL;
83 if (overrides)
84 tldtable = tld_get_table (tld, overrides);
86 if (!tldtable)
87 tldtable = tld_get_table (tld, _tld_tables);
89 return tldtable;
/* Non-zero if C is one of the four code points this file accepts as a
   dot: U+002E, U+3002, U+FF0E or U+FF61.  C is evaluated several
   times, so it must not have side effects. */
#define DOTP(c) \
  ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
95 /**
96 * tld_get_4:
97 * @in: Array of unicode code points to process. Does not need to be
98 * zero terminated.
99 * @inlen: Number of unicode code points.
100 * @out: Zero terminated ascii result string pointer.
102 * Isolate the top-level domain of @in and return it as an ASCII
103 * string in @out.
105 * Return value: Return %TLD_SUCCESS on success, the corresponding
106 * error code otherwise.
109 tld_get_4 (const uint32_t * in, size_t inlen, char **out)
111 const uint32_t *ipos;
112 size_t olen;
114 *out = NULL;
115 if (!in || inlen == 0)
116 return TLD_NODATA;
118 ipos = &in[inlen - 1];
119 olen = 0;
120 /* Scan backwards for non(latin)letters. */
121 while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) ||
122 (*ipos >= 0x61 && *ipos <= 0x7A)))
123 ipos--, olen++;
125 if (olen > 0 && DOTP (*ipos)) /* Found something that appears a TLD. */
127 char *out_s = malloc (sizeof (char) * (olen + 1));
128 char *opos = out_s;
130 if (!opos)
131 return TLD_MALLOC_ERROR;
133 ipos++;
134 /* Transcribe to lowercase ascii string. */
135 for (; ipos < &in[inlen]; ipos++, opos++)
136 *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20;
137 *opos = 0;
138 *out = out_s;
139 return TLD_SUCCESS;
142 return TLD_NOTLD;
146 * tld_get_4z:
147 * @in: Zero terminated array of unicode code points to process.
148 * @out: Zero terminated ascii result string pointer.
150 * Isolate the top-level domain of @in and return it as an ASCII
151 * string in @out.
153 * Return value: Returns %TLD_SUCCESS on success, the corresponding
154 * error code otherwise.
157 tld_get_4z (const uint32_t * in, char **out)
159 const uint32_t *ipos = in;
161 if (!in)
162 return TLD_NODATA;
164 while (*ipos)
165 ipos++;
167 return tld_get_4 (in, ipos - in, out);
171 * tld_get_z:
172 * @in: Zero terminated character array to process.
173 * @out: Zero terminated ascii result string pointer.
175 * Isolate the top-level domain of @in and return it as an ASCII
176 * string in @out. The input string @in may be UTF-8, ISO-8859-1 or
177 * any ASCII compatible character encoding.
179 * Return value: Returns %TLD_SUCCESS on success, the corresponding
180 * error code otherwise.
183 tld_get_z (const char *in, char **out)
185 uint32_t *iucs;
186 size_t i, ilen;
187 int rc;
189 ilen = strlen (in);
190 iucs = calloc (ilen, sizeof (*iucs));
192 if (!iucs)
193 return TLD_MALLOC_ERROR;
195 for (i = 0; i < ilen; i++)
196 iucs[i] = in[i];
198 rc = tld_get_4 (iucs, ilen, out);
200 free (iucs);
202 return rc;
206 * tld_checkchar:
207 * @ch: 32 bit unicode character to check.
208 * @tld: Tld_table data structure to check @ch against
210 * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid
211 * character in @tld.
213 * Return value: Return %TLD_SUCCESS if @ch is a valid character for
214 * the TLD @tld or if @tld is %NULL, %TLD_INVALID if @ch is invalid as
215 * defined by @tld.
217 static int
218 _tld_checkchar (uint32_t ch, const Tld_table * tld)
220 const Tld_table_element *s, *e, *m;
222 if (!tld)
223 return TLD_SUCCESS;
225 /* Check for [-a-z0-9.]. */
226 if ((ch >= 0x61 && ch <= 0x7A) ||
227 (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch))
228 return TLD_SUCCESS;
230 s = *tld->valid;
231 e = s + tld->nvalid;
232 while (s < e)
234 m = s + ((e - s) >> 1);
235 if (ch < m->start)
236 e = m;
237 else if (ch > m->end)
238 s = m + 1;
239 else
240 return TLD_SUCCESS;
243 return TLD_INVALID;
247 * tld_check_4t
248 * @in: Array of unicode code points to process. Does not need to be
249 * zero terminated.
250 * @inlen: Number of unicode code points.
251 * @errpos: Position of offending character is returned here.
252 * @tld: Data structure representing the restrictions for
253 * which the input should be tested.
255 * Test each of the code points in @in for whether or not
256 * they are allowed by the data structure in @tld, return
257 * the position of the first character for which this is not
258 * the case in @errpos.
260 * Return value: Returns %TLD_SUCCESS if all code points
261 * are valid or when @tld is null, %TLD_INVALID if a
262 * character is not allowed, or additional error codes on
263 * general failure conditions.
266 tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos,
267 const Tld_table * tld)
269 const uint32_t *ipos;
270 int rc;
272 if (!tld) /* No data for TLD so everything is valid. */
273 return TLD_SUCCESS;
275 ipos = in;
276 while (ipos < &in[inlen])
278 rc = _tld_checkchar (*ipos, tld);
279 if (rc != TLD_SUCCESS)
281 if (errpos)
282 *errpos = ipos - in;
283 return rc;
285 ipos++;
287 return TLD_SUCCESS;
291 * tld_check_4tz
292 * @in: Zero terminated array of unicode code points to process.
293 * @errpos: Position of offending character is returned here.
294 * @tld: Data structure representing the restrictions for
295 * which the input should be tested.
297 * Test each of the code points in @in for whether or not
298 * they are allowed by the data structure in @tld, return
299 * the position of the first character for which this is not
300 * the case in @errpos.
302 * Return value: Returns %TLD_SUCCESS if all code points
303 * are valid or when @tld is null, %TLD_INVALID if a
304 * character is not allowed, or additional error codes on
305 * general failure conditions.
308 tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld)
310 const uint32_t *ipos = in;
312 if (!ipos)
313 return TLD_NODATA;
315 while (*ipos)
316 ipos++;
318 return tld_check_4t (in, ipos - in, errpos, tld);
322 * tld_check_4
323 * @in: Array of unicode code points to process. Does not need to be
324 * zero terminated.
325 * @inlen: Number of unicode code points.
326 * @errpos: Position of offending character is returned here.
327 * @overrides: An array of additional domain restriction structures
328 * that complement and supersede the built-in information.
330 * Test each of the code points in @in for whether or not they are
331 * allowed by the information in @overrides or by the built-in TLD
332 * restriction data. When data for the same TLD is available both
333 * internally and in @overrides, the information in @overrides takes
334 * precedence. If several entries for a specific TLD are found, the
335 * first one is used. If @overrides is %NULL, only the built-in
336 * information is used. The position of the first offending character
337 * is returned in @errpos.
339 * Return value: Returns %TLD_SUCCESS if all code points
340 * are valid or when @tld is null, %TLD_INVALID if a
341 * character is not allowed, or additional error codes on
342 * general failure conditions.
345 tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos,
346 const Tld_table ** overrides)
348 const Tld_table *tld;
349 char *domain;
350 int rc;
352 if (errpos)
353 *errpos = 0;
355 /* Get TLD name. */
356 rc = tld_get_4 (in, inlen, &domain);
358 if (rc != TLD_SUCCESS)
360 if (rc == TLD_NOTLD) /* No TLD, say OK */
361 return TLD_SUCCESS;
362 else
363 return rc;
366 /* Retrieve appropriate data structure. */
367 tld = tld_default_table (domain, overrides);
368 free (domain);
370 return tld_check_4t (in, inlen, errpos, tld);
374 * tld_check_4z
375 * @in: Zero-terminated array of unicode code points to process.
376 * @errpos: Position of offending character is returned here.
377 * @overrides: An array of additional domain restriction structures
378 * that complement and supersede the built-in information.
380 * Test each of the code points in @in for whether or not they are
381 * allowed by the information in @overrides or by the built-in TLD
382 * restriction data. When data for the same TLD is available both
383 * internally and in @overrides, the information in @overrides takes
384 * precedence. If several entries for a specific TLD are found, the
385 * first one is used. If @overrides is %NULL, only the built-in
386 * information is used. The position of the first offending character
387 * is returned in @errpos.
389 * Return value: Returns %TLD_SUCCESS if all code points
390 * are valid or when @tld is null, %TLD_INVALID if a
391 * character is not allowed, or additional error codes on
392 * general failure conditions.
395 tld_check_4z (const uint32_t * in, size_t * errpos,
396 const Tld_table ** overrides)
398 const uint32_t *ipos = in;
400 if (!ipos)
401 return TLD_NODATA;
403 while (*ipos)
404 ipos++;
406 return tld_check_4 (in, ipos - in, errpos, overrides);
410 * tld_check_8z
411 * @in: Zero-terminated UTF8 string to process.
412 * @errpos: Position of offending character is returned here.
413 * @overrides: An array of additional domain restriction structures
414 * that complement and supersede the built-in information.
416 * Test each of the characters in @in for whether or not they are
417 * allowed by the information in @overrides or by the built-in TLD
418 * restriction data. When data for the same TLD is available both
419 * internally and in @overrides, the information in @overrides takes
420 * precedence. If several entries for a specific TLD are found, the
421 * first one is used. If @overrides is %NULL, only the built-in
422 * information is used. The position of the first offending character
423 * is returned in @errpos. Note that the error position refers to the
424 * decoded character offset rather than the byte position in the
425 * string.
427 * Return value: Returns %TLD_SUCCESS if all characters
428 * are valid or when @tld is null, %TLD_INVALID if a
429 * character is not allowed, or additional error codes on
430 * general failure conditions.
433 tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides)
435 uint32_t *iucs;
436 size_t ilen;
437 int rc;
439 if (!in)
440 return TLD_NODATA;
442 iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen);
444 if (!iucs)
445 return TLD_MALLOC_ERROR;
447 rc = tld_check_4 (iucs, ilen, errpos, overrides);
449 free (iucs);
451 return rc;
455 * tld_check_lz
456 * @in: Zero-terminated string in the current locales encoding to process.
457 * @errpos: Position of offending character is returned here.
458 * @overrides: An array of additional domain restriction structures
459 * that complement and supersede the built-in information.
461 * Test each of the characters in @in for whether or not they are
462 * allowed by the information in @overrides or by the built-in TLD
463 * restriction data. When data for the same TLD is available both
464 * internally and in @overrides, the information in @overrides takes
465 * precedence. If several entries for a specific TLD are found, the
466 * first one is used. If @overrides is %NULL, only the built-in
467 * information is used. The position of the first offending character
468 * is returned in @errpos. Note that the error position refers to the
469 * decoded character offset rather than the byte position in the
470 * string.
472 * Return value: Returns %TLD_SUCCESS if all characters
473 * are valid or when @tld is null, %TLD_INVALID if a
474 * character is not allowed, or additional error codes on
475 * general failure conditions.
478 tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides)
480 char *utf8;
481 int rc;
483 if (!in)
484 return TLD_NODATA;
486 utf8 = stringprep_locale_to_utf8 (in);
487 if (!utf8)
488 return TLD_ICONV_ERROR;
491 rc = tld_check_8z (utf8, errpos, overrides);
493 free (utf8);
495 return rc;
499 * Tld_rc:
500 * @TLD_SUCCESS: Successful operation. This value is guaranteed to
501 * always be zero, the remaining ones are only guaranteed to hold
502 * non-zero values, for logical comparison purposes.
503 * @TLD_INVALID: Invalid character found.
504 * @TLD_NODATA: No input data was provided.
505 * @TLD_MALLOC_ERROR: Error during memory allocation.
506 * @TLD_ICONV_ERROR: Error during iconv string conversion.
507 * @TLD_NOTLD: No top-level domain found in domain string.
509 * Enumerated return codes of the TLD checking functions.
510 * The value 0 is guaranteed to always correspond to success.