1 /* tld.c --- Handle TLD restriction checking.
2 * Copyright (C) 2003, 2004 Free Software Foundation, Inc.
4 * Author: Thomas Jacob, Internet24.de
6 * This file is part of GNU Libidn.
8 * GNU Libidn is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * GNU Libidn is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with GNU Libidn; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
25 #include <stringprep.h>
30 /* Get specifications. */
33 /* Array of built-in domain restriction structures. See tlds.c. */
34 extern const Tld_table
*_tld_tables
[];
38 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
39 * @tables: Zero terminated array of info-structures for TLDs.
41 * Get the TLD table for a named TLD by searching through the given
44 * Return value: Return structure corresponding to TLD @tld by going
45 * thru @tables, or return %NULL if no such structure is found.
48 tld_get_table (const char *tld
, const Tld_table
** tables
)
50 const Tld_table
**tldtable
= NULL
;
55 for (tldtable
= tables
; *tldtable
; tldtable
++)
56 if (!strcmp ((*tldtable
)->name
, tld
))
64 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
65 * @overrides: Additional well-formed info-structures for TLDs, or %NULL
66 * to only use library deault tables.
68 * Get the TLD table for a named TLD, using the internal defaults,
69 * possibly overrided by the (optional) supplied tables.
71 * Return value: Return structure corresponding to TLD @tld_str, first
72 * looking through @overrides then thru built-in list, or %NULL if no
73 * such structure found.
76 tld_default_table (const char *tld
, const Tld_table
** overrides
)
78 const Tld_table
*tldtable
= NULL
;
84 tldtable
= tld_get_table (tld
, overrides
);
87 tldtable
= tld_get_table (tld
, _tld_tables
);
/* Non-zero if C is one of the code points this file accepts as a dot:
   U+002E (full stop), U+3002 (ideographic full stop), U+FF0E
   (fullwidth full stop) or U+FF61 (halfwidth ideographic full stop).
   C is evaluated more than once, so it must be free of side effects.  */
#define DOTP(c)         \
  ((c) == 0x002E        \
   || (c) == 0x3002     \
   || (c) == 0xFF0E     \
   || (c) == 0xFF61)
97 * @in: Array of unicode code points to process. Does not need to be
99 * @inlen: Number of unicode code points.
100 * @out: Zero terminated ascii result string pointer.
102 * Isolate the top-level domain of @in and return it as an ASCII
105 * Return value: Return %TLD_SUCCESS on success, the corresponding
106 * error code otherwise.
109 tld_get_4 (const uint32_t * in
, size_t inlen
, char **out
)
111 const uint32_t *ipos
;
115 if (!in
|| inlen
== 0)
118 ipos
= &in
[inlen
- 1];
120 /* Scan backwards for non(latin)letters. */
121 while (ipos
>= in
&& ((*ipos
>= 0x41 && *ipos
<= 0x5A) ||
122 (*ipos
>= 0x61 && *ipos
<= 0x7A)))
125 if (olen
> 0 && DOTP (*ipos
)) /* Found something that appears a TLD. */
127 char *out_s
= malloc (sizeof (char) * (olen
+ 1));
131 return TLD_MALLOC_ERROR
;
134 /* Transcribe to lowercase ascii string. */
135 for (; ipos
< &in
[inlen
]; ipos
++, opos
++)
136 *opos
= *ipos
> 0x5A ? *ipos
: *ipos
+ 0x20;
147 * @in: Zero terminated array of unicode code points to process.
148 * @out: Zero terminated ascii result string pointer.
150 * Isolate the top-level domain of @in and return it as an ASCII
153 * Return value: Returns %TLD_SUCCESS on success, the corresponding
154 * error code otherwise.
157 tld_get_4z (const uint32_t * in
, char **out
)
159 const uint32_t *ipos
= in
;
167 return tld_get_4 (in
, ipos
- in
, out
);
172 * @in: Zero terminated character array to process.
173 * @out: Zero terminated ascii result string pointer.
175 * Isolate the top-level domain of @in and return it as an ASCII
176 * string in @out. The input string @in may be UTF-8, ISO-8859-1 or
177 * any ASCII compatible character encoding.
179 * Return value: Returns %TLD_SUCCESS on success, the corresponding
180 * error code otherwise.
183 tld_get_z (const char *in
, char **out
)
190 iucs
= calloc (ilen
, sizeof (*iucs
));
193 return TLD_MALLOC_ERROR
;
195 for (i
= 0; i
< ilen
; i
++)
198 rc
= tld_get_4 (iucs
, ilen
, out
);
207 * @ch: 32 bit unicode character to check.
208 * @tld: Tld_table data structure to check @ch against
210 * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid
213 * Return value: Return %TLD_SUCCESS if @ch is a valid character for
214 * the TLD @tld or if @tld is %NULL, %TLD_INVALID if @ch is invalid as
218 _tld_checkchar (uint32_t ch
, const Tld_table
* tld
)
220 const Tld_table_element
*s
, *e
, *m
;
225 /* Check for [-a-z0-9.]. */
226 if ((ch
>= 0x61 && ch
<= 0x7A) ||
227 (ch
>= 0x30 && ch
<= 0x39) || ch
== 0x2D || DOTP (ch
))
234 m
= s
+ ((e
- s
) >> 1);
237 else if (ch
> m
->end
)
248 * @in: Array of unicode code points to process. Does not need to be
250 * @inlen: Number of unicode code points.
251 * @errpos: Position of offending character is returned here.
252 * @tld: Data structure representing the restrictions for
253 * which the input should be tested.
255 * Test each of the code points in @in for whether or not
256 * they are allowed by the data structure in @tld, return
257 * the position of the first character for which this is not
258 * the case in @errpos.
260 * Return value: Returns %TLD_SUCCESS if all code points
261 * are valid or when @tld is null, %TLD_INVALID if a
262 * character is not allowed, or additional error codes on
263 * general failure conditions.
266 tld_check_4t (const uint32_t * in
, size_t inlen
, size_t * errpos
,
267 const Tld_table
* tld
)
269 const uint32_t *ipos
;
272 if (!tld
) /* No data for TLD so everything is valid. */
276 while (ipos
< &in
[inlen
])
278 rc
= _tld_checkchar (*ipos
, tld
);
279 if (rc
!= TLD_SUCCESS
)
292 * @in: Zero terminated array of unicode code points to process.
293 * @errpos: Position of offending character is returned here.
294 * @tld: Data structure representing the restrictions for
295 * which the input should be tested.
297 * Test each of the code points in @in for whether or not
298 * they are allowed by the data structure in @tld, return
299 * the position of the first character for which this is not
300 * the case in @errpos.
302 * Return value: Returns %TLD_SUCCESS if all code points
303 * are valid or when @tld is null, %TLD_INVALID if a
304 * character is not allowed, or additional error codes on
305 * general failure conditions.
308 tld_check_4tz (const uint32_t * in
, size_t * errpos
, const Tld_table
* tld
)
310 const uint32_t *ipos
= in
;
318 return tld_check_4t (in
, ipos
- in
, errpos
, tld
);
323 * @in: Array of unicode code points to process. Does not need to be
325 * @inlen: Number of unicode code points.
326 * @errpos: Position of offending character is returned here.
327 * @overrides: An array of additional domain restriction structures
328 * that complement and supersede the built-in information.
330 * Test each of the code points in @in for whether or not they are
331 * allowed by the information in @overrides or by the built-in TLD
332 * restriction data. When data for the same TLD is available both
333 * internally and in @overrides, the information in @overrides takes
334 * precedence. If several entries for a specific TLD are found, the
335 * first one is used. If @overrides is %NULL, only the built-in
336 * information is used. The position of the first offending character
337 * is returned in @errpos.
339 * Return value: Returns %TLD_SUCCESS if all code points
340 * are valid or when @tld is null, %TLD_INVALID if a
341 * character is not allowed, or additional error codes on
342 * general failure conditions.
345 tld_check_4 (const uint32_t * in
, size_t inlen
, size_t * errpos
,
346 const Tld_table
** overrides
)
348 const Tld_table
*tld
;
356 rc
= tld_get_4 (in
, inlen
, &domain
);
358 if (rc
!= TLD_SUCCESS
)
360 if (rc
== TLD_NOTLD
) /* No TLD, say OK */
366 /* Retrieve appropriate data structure. */
367 tld
= tld_default_table (domain
, overrides
);
370 return tld_check_4t (in
, inlen
, errpos
, tld
);
375 * @in: Zero-terminated array of unicode code points to process.
376 * @errpos: Position of offending character is returned here.
377 * @overrides: An array of additional domain restriction structures
378 * that complement and supersede the built-in information.
380 * Test each of the code points in @in for whether or not they are
381 * allowed by the information in @overrides or by the built-in TLD
382 * restriction data. When data for the same TLD is available both
383 * internally and in @overrides, the information in @overrides takes
384 * precedence. If several entries for a specific TLD are found, the
385 * first one is used. If @overrides is %NULL, only the built-in
386 * information is used. The position of the first offending character
387 * is returned in @errpos.
389 * Return value: Returns %TLD_SUCCESS if all code points
390 * are valid or when @tld is null, %TLD_INVALID if a
391 * character is not allowed, or additional error codes on
392 * general failure conditions.
395 tld_check_4z (const uint32_t * in
, size_t * errpos
,
396 const Tld_table
** overrides
)
398 const uint32_t *ipos
= in
;
406 return tld_check_4 (in
, ipos
- in
, errpos
, overrides
);
411 * @in: Zero-terminated UTF8 string to process.
412 * @errpos: Position of offending character is returned here.
413 * @overrides: An array of additional domain restriction structures
414 * that complement and supersede the built-in information.
416 * Test each of the characters in @in for whether or not they are
417 * allowed by the information in @overrides or by the built-in TLD
418 * restriction data. When data for the same TLD is available both
419 * internally and in @overrides, the information in @overrides takes
420 * precedence. If several entries for a specific TLD are found, the
421 * first one is used. If @overrides is %NULL, only the built-in
422 * information is used. The position of the first offending character
423 * is returned in @errpos. Note that the error position refers to the
424 * decoded character offset rather than the byte position in the
427 * Return value: Returns %TLD_SUCCESS if all characters
428 * are valid or when @tld is null, %TLD_INVALID if a
429 * character is not allowed, or additional error codes on
430 * general failure conditions.
433 tld_check_8z (const char *in
, size_t * errpos
, const Tld_table
** overrides
)
442 iucs
= stringprep_utf8_to_ucs4 (in
, -1, &ilen
);
445 return TLD_MALLOC_ERROR
;
447 rc
= tld_check_4 (iucs
, ilen
, errpos
, overrides
);
456 * @in: Zero-terminated string in the current locales encoding to process.
457 * @errpos: Position of offending character is returned here.
458 * @overrides: An array of additional domain restriction structures
459 * that complement and supersede the built-in information.
461 * Test each of the characters in @in for whether or not they are
462 * allowed by the information in @overrides or by the built-in TLD
463 * restriction data. When data for the same TLD is available both
464 * internally and in @overrides, the information in @overrides takes
465 * precedence. If several entries for a specific TLD are found, the
466 * first one is used. If @overrides is %NULL, only the built-in
467 * information is used. The position of the first offending character
468 * is returned in @errpos. Note that the error position refers to the
469 * decoded character offset rather than the byte position in the
472 * Return value: Returns %TLD_SUCCESS if all characters
473 * are valid or when @tld is null, %TLD_INVALID if a
474 * character is not allowed, or additional error codes on
475 * general failure conditions.
478 tld_check_lz (const char *in
, size_t * errpos
, const Tld_table
** overrides
)
486 utf8
= stringprep_locale_to_utf8 (in
);
488 return TLD_ICONV_ERROR
;
491 rc
= tld_check_8z (utf8
, errpos
, overrides
);
500 * @TLD_SUCCESS: Successful operation. This value is guaranteed to
501 * always be zero, the remaining ones are only guaranteed to hold
502 * non-zero values, for logical comparison purposes.
503 * @TLD_INVALID: Invalid character found.
504 * @TLD_NODATA: No input data was provided.
505 * @TLD_MALLOC_ERROR: Error during memory allocation.
506 * @TLD_ICONV_ERROR: Error during iconv string conversion.
507 * @TLD_NOTLD: No top-level domain found in domain string.
509 * Enumerated return codes of the TLD checking functions.
510 * The value 0 is guaranteed to always correspond to success.