/* tld.c --- Handle TLD restriction checking.
 * Copyright (C) 2003, 2004  Free Software Foundation, Inc.
 *
 * Author: Thomas Jacob, Internet24.de
 *
 * This file is part of GNU Libidn.
 *
 * GNU Libidn is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * GNU Libidn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GNU Libidn; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */
/* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
#include <stringprep.h>

/* Get malloc, free. */
#include <stdlib.h>

/* Get strcmp. */
#include <string.h>

/* Get specifications. */
#include <tld.h>
35 * @tld_str: TLD name (e.g. "com") as zero terminated ASCII byte string.
36 * @xtra_tlds: Additional well-formed info-structures for TLDs or %NULL.
38 * Return value: Return structure corresponding to TLD @tld_str, first
39 * looking through @xtra_tlds then thru built-in list, or %NULL if no
40 * such structure found.
43 tld_get_table (const char *tld_str
, const Tld_table
** xtra_tlds
)
45 const Tld_table
**tld
= NULL
;
50 /* First search custom tlds. */
52 for (tld
= xtra_tlds
; *tld
; tld
++)
53 if (!strcmp ((*tld
)->name
, tld_str
))
56 /* Then search the internal stuff. */
57 for (tld
= tld_table
; *tld
; tld
++)
58 if (!strcmp ((*tld
)->name
, tld_str
))
/* True if code point C is one of the four dots treated as a label
   separator here: U+002E (full stop), U+3002 (ideographic full stop),
   U+FF0E (fullwidth full stop) or U+FF61 (halfwidth ideographic full
   stop).  C is evaluated more than once, so it must not have side
   effects.  */
#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 ||	\
		 (c) == 0xFF0E || (c) == 0xFF61)
69 * @ch: 32 bit unicode character to check.
70 * @tld: Tld_table data structure to check @ch against
72 * Verify if @ch is either in [a-z0-9-.] or mentioned
73 * as a legal character in @tld.
75 * Return value: Return %TLD_SUCCESS if @ch is a legal character for
76 * the TLD @tld or if @tld is %NULL, %TLD_ILLEGAL if @ch is not a
77 * legal as defined by @tld.
80 _tld_checkchar (uint32_t ch
, const Tld_table
* tld
)
82 const Tld_table_element
*p
;
89 /* Check for [-a-z0-9.]. */
90 if ((ch
>= 0x61 && ch
<= 0x7A) ||
91 (ch
>= 0x30 && ch
<= 0x39) || ch
== 0x2D || DOTP (ch
))
94 /* FIXME: replace searches by bsearch like stuff. */
96 for (p
= *tld
->valid
, i
= 0; i
< tld
->nvalid
; i
++, p
++)
97 if (ch
>= p
->start
&& ch
<= p
->end
)
105 * @in: Array of unicode code points to process (Does not need to be
107 * @inlen: Number of unicode code points.
108 * @out: Zero terminated ascii result string pointer.
110 * Isolate the top-level domain of @in and return it as
111 * an ascii string in @out.
113 * Return value: Return %TLD_SUCCESS on success, the corresponding
114 * error code otherwise.
117 tld_gettld_4i (const uint32_t * in
, size_t inlen
, char **out
)
119 const uint32_t *ipos
;
123 if (!in
|| (inlen
<= 0))
126 ipos
= &in
[inlen
- 1];
128 /* Scan backwards for non(latin)letters. */
129 while (ipos
>= in
&& ((*ipos
>= 0x41 && *ipos
<= 0x5A) ||
130 (*ipos
>= 0x61 && *ipos
<= 0x7A)))
133 if (olen
> 0 && DOTP (*ipos
)) /* Found something that appears a TLD. */
135 char *out_s
= malloc (sizeof (char) * (olen
+ 1));
139 return TLD_MALLOC_ERROR
;
142 /* Transcribe to lowercase ascii string. */
143 for (; ipos
< &in
[inlen
]; ipos
++, opos
++)
144 *opos
= *ipos
> 0x5A ? *ipos
: *ipos
+ 0x20;
156 * @in: Zero terminated array of unicode code points to process.
157 * @out: Zero terminated ascii result string pointer.
159 * Isolate the top-level domain of @in and return it as
160 * an ascii string in @out.
162 * Return value: Returns %TLD_SUCCESS on success, the corresponding
163 * error code otherwise.
166 tld_gettld_4z (const uint32_t * in
, char **out
)
168 const uint32_t *ipos
= in
;
176 return tld_gettld_4i (in
, ipos
- in
, out
);
182 * @in: Array of unicode code points to process (Does not need to be
184 * @inlen: Number of unicode code points.
185 * @errpos: Position of offending character is returned here.
186 * @tld: Data structure representing the restrictions for
187 * which the input should be tested.
189 * Test each of the code points in @in for whether or not
190 * they are allowed by the data structure in @tld, return
191 * the position of the first character for which this is not
192 * the case in @errpos.
194 * Return value: Returns %TLD_SUCCESS if all code points
195 * are valid or when @tld is null, %TLD_ILLEGAL if a
196 * character is not allowed, or additional error codes on
197 * general failure conditions.
200 tld_check_4it (const uint32_t * in
, size_t inlen
, size_t * errpos
,
201 const Tld_table
* tld
)
203 const uint32_t *ipos
;
206 if (!tld
) /* No data for TLD so everything is valid. */
210 while (ipos
< &in
[inlen
])
212 rc
= _tld_checkchar (*ipos
, tld
);
213 if (rc
!= TLD_SUCCESS
)
228 * @in: Zero terminated array of unicode code points to process.
229 * @errpos: Position of offending character is returned here.
230 * @tld: Data structure representing the restrictions for
231 * which the input should be tested.
233 * Test each of the code points in @in for whether or not
234 * they are allowed by the data structure in @tld, return
235 * the position of the first character for which this is not
236 * the case in @errpos.
238 * Return value: Returns %TLD_SUCCESS if all code points
239 * are valid or when @tld is null, %TLD_ILLEGAL if a
240 * character is not allowed, or additional error codes on
241 * general failure conditions.
244 tld_check_4zt (const uint32_t * in
, size_t * errpos
, const Tld_table
* tld
)
246 const uint32_t *ipos
= in
;
254 return tld_check_4it (in
, ipos
- in
, errpos
, tld
);
259 * @in: Array of unicode code points to process (Does not need to be
261 * @inlen: Number of unicode code points.
262 * @errpos: Position of offending character is returned here.
263 * @xtra_tlds: An array of additional domain restriction structures
264 * that complement and supersede the built-in information.
266 * Test each of the code points in @in for whether or not they are
267 * allowed by the information in @xtra_tlds or by the built-in TLD
268 * restriction data. When data for the same TLD is available both
269 * internally and in @xtra_tlds, the information in @xtra_tlds takes
270 * precedence. If several entries for a specific TLD are found, the
271 * first one is used. If @xtra_tlds is %NULL, only the built-in
272 * information is used. The position of the first offending character
273 * is returned in @errpos.
275 * Return value: Returns %TLD_SUCCESS if all code points
276 * are valid or when @tld is null, %TLD_ILLEGAL if a
277 * character is not allowed, or additional error codes on
278 * general failure conditions.
281 tld_check_4i (const uint32_t * in
, size_t inlen
, size_t * errpos
,
282 const Tld_table
** xtra_tlds
)
284 const uint32_t *ipos
;
285 const Tld_table
*tld
;
291 rc
= tld_gettld_4i (in
, inlen
, &domain
);
293 if (rc
!= TLD_SUCCESS
)
295 if (rc
== TLD_NOTLD
) /* No TLD, say OK */
301 /* Retrieve appropriate data structure. */
302 tld
= tld_get_table (domain
, xtra_tlds
);
305 return tld_check_4it (in
, inlen
, errpos
, tld
);
310 * @in: Zero-terminated array of unicode code points to process.
311 * @errpos: Position of offending character is returned here.
312 * @xtra_tlds: An array of additional domain restriction structures
313 * that complement and supersede the built-in information.
315 * Test each of the code points in @in for whether or not they are
316 * allowed by the information in @xtra_tlds or by the built-in TLD
317 * restriction data. When data for the same TLD is available both
318 * internally and in @xtra_tlds, the information in @xtra_tlds takes
319 * precedence. If several entries for a specific TLD are found, the
320 * first one is used. If @xtra_tlds is %NULL, only the built-in
321 * information is used. The position of the first offending character
322 * is returned in @errpos.
324 * Return value: Returns %TLD_SUCCESS if all code points
325 * are valid or when @tld is null, %TLD_ILLEGAL if a
326 * character is not allowed, or additional error codes on
327 * general failure conditions.
330 tld_check_4z (const uint32_t * in
, size_t * errpos
,
331 const Tld_table
** xtra_tlds
)
333 const uint32_t *ipos
= in
;
341 return tld_check_4i (in
, ipos
- in
, errpos
, xtra_tlds
);
346 * @in: Zero-terminated UTF8 string to process.
347 * @errpos: Position of offending character is returned here.
348 * @xtra_tlds: An array of additional domain restriction structures
349 * that complement and supersede the built-in information.
351 * Test each of the characters in @in for whether or not they are
352 * allowed by the information in @xtra_tlds or by the built-in TLD
353 * restriction data. When data for the same TLD is available both
354 * internally and in @xtra_tlds, the information in @xtra_tlds takes
355 * precedence. If several entries for a specific TLD are found, the
356 * first one is used. If @xtra_tlds is %NULL, only the built-in
357 * information is used. The position of the first offending character
358 * is returned in @errpos. Note that the error position refers to the
359 * decoded character offset rather than the byte position in the
362 * Return value: Returns %TLD_SUCCESS if all characters
363 * are valid or when @tld is null, %TLD_ILLEGAL if a
364 * character is not allowed, or additional error codes on
365 * general failure conditions.
368 tld_check_8z (const char *in
, size_t * errpos
, const Tld_table
** xtra_tlds
)
377 iucs
= stringprep_utf8_to_ucs4 (in
, -1, &ilen
);
380 return TLD_MALLOC_ERROR
;
382 rc
= tld_check_4i (iucs
, ilen
, errpos
, xtra_tlds
);
391 * @in: Zero-terminated string in the current locales encoding to process.
392 * @errpos: Position of offending character is returned here.
393 * @xtra_tlds: An array of additional domain restriction structures
394 * that complement and supersede the built-in information.
396 * Test each of the characters in @in for whether or not they are
397 * allowed by the information in @xtra_tlds or by the built-in TLD
398 * restriction data. When data for the same TLD is available both
399 * internally and in @xtra_tlds, the information in @xtra_tlds takes
400 * precedence. If several entries for a specific TLD are found, the
401 * first one is used. If @xtra_tlds is %NULL, only the built-in
402 * information is used. The position of the first offending character
403 * is returned in @errpos. Note that the error position refers to the
404 * decoded character offset rather than the byte position in the
407 * Return value: Returns %TLD_SUCCESS if all characters
408 * are valid or when @tld is null, %TLD_ILLEGAL if a
409 * character is not allowed, or additional error codes on
410 * general failure conditions.
413 tld_check_lz (const char *in
, size_t * errpos
, const Tld_table
** xtra_tlds
)
421 utf8
= stringprep_locale_to_utf8 (in
);
423 return TLD_ICONV_ERROR
;
426 rc
= tld_check_8z (utf8
, errpos
, xtra_tlds
);
/*
 * @TLD_SUCCESS: Successful operation.  This value is guaranteed to
 *   always be zero, the remaining ones are only guaranteed to hold
 *   non-zero values, for logical comparison purposes.
 * @TLD_ILLEGAL: Illegal character found.
 * @TLD_NODATA: No input data was provided.
 * @TLD_MALLOC_ERROR: Error during memory allocation.
 * @TLD_ICONV_ERROR: Error during iconv string conversion.
 * @TLD_NOTLD: No top-level domain found in domain string.
 *
 * Enumerated return codes of the TLD checking functions.
 * The value 0 is guaranteed to always correspond to success.
 */