2007-12-17 Roland McGrath <roland@redhat.com>
[glibc.git] / libidn / stringprep.c
blob6041e3937a2471f9329a3e23abe0bc5965076508
1 /* stringprep.c --- Core stringprep implementation.
2 * Copyright (C) 2002, 2003, 2004 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #if HAVE_CONFIG_H
23 # include "config.h"
24 #endif
26 #include <stdlib.h>
27 #include <string.h>
29 #include "stringprep.h"
31 static ssize_t
32 stringprep_find_character_in_table (uint32_t ucs4,
33 const Stringprep_table_element * table)
35 ssize_t i;
37 /* This is where typical uses of Libidn spends very close to all CPU
38 time and causes most cache misses. One could easily do a binary
39 search instead. Before rewriting this, I want hard evidence this
40 slowness is at all relevant in typical applications. (I don't
41 dispute optimization may improve matters significantly, I'm
42 mostly interested in having someone give real-world benchmark on
43 the impact of libidn.) */
45 for (i = 0; table[i].start || table[i].end; i++)
46 if (ucs4 >= table[i].start &&
47 ucs4 <= (table[i].end ? table[i].end : table[i].start))
48 return i;
50 return -1;
53 static ssize_t
54 stringprep_find_string_in_table (uint32_t * ucs4,
55 size_t ucs4len,
56 size_t * tablepos,
57 const Stringprep_table_element * table)
59 size_t j;
60 ssize_t pos;
62 for (j = 0; j < ucs4len; j++)
63 if ((pos = stringprep_find_character_in_table (ucs4[j], table)) != -1)
65 if (tablepos)
66 *tablepos = pos;
67 return j;
70 return -1;
73 static int
74 stringprep_apply_table_to_string (uint32_t * ucs4,
75 size_t * ucs4len,
76 size_t maxucs4len,
77 const Stringprep_table_element * table)
79 ssize_t pos;
80 size_t i, maplen;
82 while ((pos = stringprep_find_string_in_table (ucs4, *ucs4len,
83 &i, table)) != -1)
85 for (maplen = STRINGPREP_MAX_MAP_CHARS;
86 maplen > 0 && table[i].map[maplen - 1] == 0; maplen--)
89 if (*ucs4len - 1 + maplen >= maxucs4len)
90 return STRINGPREP_TOO_SMALL_BUFFER;
92 memmove (&ucs4[pos + maplen], &ucs4[pos + 1],
93 sizeof (uint32_t) * (*ucs4len - pos - 1));
94 memcpy (&ucs4[pos], table[i].map, sizeof (uint32_t) * maplen);
95 *ucs4len = *ucs4len - 1 + maplen;
98 return STRINGPREP_OK;
/* INVERTED(x) yields X with the most significant bit of an
   `unsigned long' masked off, i.e. it is non-zero iff X has any bit
   set below that top bit.
   NOTE(review): the name suggests a test for complemented flag values
   (such as ~STRINGPREP_NO_NFKC); confirm against stringprep.h that
   this mask is the intended one.  */
#define INVERTED(x) ((x) & ((~0UL) >> 1))

/* UNAPPLICAPLEFLAGS(flags, profileflags) is non-zero when either
     - INVERTED (profileflags) is zero, PROFILEFLAGS itself is
       non-zero, and PROFILEFLAGS shares no bit with FLAGS; or
     - INVERTED (profileflags) is non-zero and PROFILEFLAGS shares at
       least one bit with FLAGS.
   It is always zero when PROFILEFLAGS is zero.  stringprep_4i() skips
   a profile step when this evaluates to true.

   Every use of a macro parameter is parenthesized so that arguments
   which are themselves expressions (e.g. `a | b') are combined as a
   unit instead of being split by operator precedence.  */
#define UNAPPLICAPLEFLAGS(flags, profileflags)                          \
  ((!INVERTED (profileflags) && !((profileflags) & (flags))            \
    && (profileflags))                                                  \
   || (INVERTED (profileflags) && ((profileflags) & (flags))))
107 * stringprep_4i:
108 * @ucs4: input/output array with string to prepare.
109 * @len: on input, length of input array with Unicode code points,
110 * on exit, length of output array with Unicode code points.
111 * @maxucs4len: maximum length of input/output array.
112 * @flags: stringprep profile flags, or 0.
113 * @profile: pointer to stringprep profile to use.
115 * Prepare the input UCS-4 string according to the stringprep profile,
116 * and write back the result to the input string.
118 * The input is not required to be zero terminated (@ucs4[@len] = 0).
119 * The output will not be zero terminated unless @ucs4[@len] = 0.
120 * Instead, see stringprep_4zi() if your input is zero terminated or
121 * if you want the output to be.
123 * Since the stringprep operation can expand the string, @maxucs4len
124 * indicates how large the buffer holding the string is. This function
125 * will not read or write to code points outside that size.
127 * The @flags are one of Stringprep_profile_flags, or 0.
129 * The @profile contain the instructions to perform. Your application
130 * can define new profiles, possibly re-using the generic stringprep
131 * tables that always will be part of the library, or use one of the
132 * currently supported profiles.
134 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
137 stringprep_4i (uint32_t * ucs4, size_t * len, size_t maxucs4len,
138 Stringprep_profile_flags flags,
139 const Stringprep_profile * profile)
141 size_t i, j;
142 ssize_t k;
143 size_t ucs4len = *len;
144 int rc;
146 for (i = 0; profile[i].operation; i++)
148 switch (profile[i].operation)
150 case STRINGPREP_NFKC:
152 uint32_t *q = 0;
154 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
155 break;
157 if (flags & STRINGPREP_NO_NFKC && !profile[i].flags)
158 /* Profile requires NFKC, but callee asked for no NFKC. */
159 return STRINGPREP_FLAG_ERROR;
161 q = stringprep_ucs4_nfkc_normalize (ucs4, ucs4len);
162 if (!q)
163 return STRINGPREP_NFKC_FAILED;
165 for (ucs4len = 0; q[ucs4len]; ucs4len++)
168 if (ucs4len >= maxucs4len)
170 free (q);
171 return STRINGPREP_TOO_SMALL_BUFFER;
174 memcpy (ucs4, q, ucs4len * sizeof (ucs4[0]));
176 free (q);
178 break;
180 case STRINGPREP_PROHIBIT_TABLE:
181 k = stringprep_find_string_in_table (ucs4, ucs4len,
182 NULL, profile[i].table);
183 if (k != -1)
184 return STRINGPREP_CONTAINS_PROHIBITED;
185 break;
187 case STRINGPREP_UNASSIGNED_TABLE:
188 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
189 break;
190 if (flags & STRINGPREP_NO_UNASSIGNED)
192 k = stringprep_find_string_in_table
193 (ucs4, ucs4len, NULL, profile[i].table);
194 if (k != -1)
195 return STRINGPREP_CONTAINS_UNASSIGNED;
197 break;
199 case STRINGPREP_MAP_TABLE:
200 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
201 break;
202 rc = stringprep_apply_table_to_string
203 (ucs4, &ucs4len, maxucs4len, profile[i].table);
204 if (rc != STRINGPREP_OK)
205 return rc;
206 break;
208 case STRINGPREP_BIDI_PROHIBIT_TABLE:
209 case STRINGPREP_BIDI_RAL_TABLE:
210 case STRINGPREP_BIDI_L_TABLE:
211 break;
213 case STRINGPREP_BIDI:
215 int done_prohibited = 0;
216 int done_ral = 0;
217 int done_l = 0;
218 int contains_ral = -1;
219 int contains_l = -1;
221 for (j = 0; profile[j].operation; j++)
222 if (profile[j].operation == STRINGPREP_BIDI_PROHIBIT_TABLE)
224 done_prohibited = 1;
225 k = stringprep_find_string_in_table (ucs4, ucs4len,
226 NULL,
227 profile[j].table);
228 if (k != -1)
229 return STRINGPREP_BIDI_CONTAINS_PROHIBITED;
231 else if (profile[j].operation == STRINGPREP_BIDI_RAL_TABLE)
233 done_ral = 1;
234 if (stringprep_find_string_in_table
235 (ucs4, ucs4len, NULL, profile[j].table) != -1)
236 contains_ral = j;
238 else if (profile[j].operation == STRINGPREP_BIDI_L_TABLE)
240 done_l = 1;
241 if (stringprep_find_string_in_table
242 (ucs4, ucs4len, NULL, profile[j].table) != -1)
243 contains_l = j;
246 if (!done_prohibited || !done_ral || !done_l)
247 return STRINGPREP_PROFILE_ERROR;
249 if (contains_ral != -1 && contains_l != -1)
250 return STRINGPREP_BIDI_BOTH_L_AND_RAL;
252 if (contains_ral != -1)
254 if (!(stringprep_find_character_in_table
255 (ucs4[0], profile[contains_ral].table) != -1 &&
256 stringprep_find_character_in_table
257 (ucs4[ucs4len - 1], profile[contains_ral].table) != -1))
258 return STRINGPREP_BIDI_LEADTRAIL_NOT_RAL;
261 break;
263 default:
264 return STRINGPREP_PROFILE_ERROR;
265 break;
269 *len = ucs4len;
271 return STRINGPREP_OK;
274 static int
275 stringprep_4zi_1 (uint32_t * ucs4, size_t ucs4len, size_t maxucs4len,
276 Stringprep_profile_flags flags,
277 const Stringprep_profile * profile)
279 int rc;
281 rc = stringprep_4i (ucs4, &ucs4len, maxucs4len, flags, profile);
282 if (rc != STRINGPREP_OK)
283 return rc;
285 if (ucs4len >= maxucs4len)
286 return STRINGPREP_TOO_SMALL_BUFFER;
288 ucs4[ucs4len] = 0;
290 return STRINGPREP_OK;
294 * stringprep_4zi:
295 * @ucs4: input/output array with zero terminated string to prepare.
296 * @maxucs4len: maximum length of input/output array.
297 * @flags: stringprep profile flags, or 0.
298 * @profile: pointer to stringprep profile to use.
300 * Prepare the input zero terminated UCS-4 string according to the
301 * stringprep profile, and write back the result to the input string.
303 * Since the stringprep operation can expand the string, @maxucs4len
304 * indicate how large the buffer holding the string is. This function
305 * will not read or write to code points outside that size.
307 * The @flags are one of Stringprep_profile_flags, or 0.
309 * The @profile contain the instructions to perform. Your application
310 * can define new profiles, possibly re-using the generic stringprep
311 * tables that always will be part of the library, or use one of the
312 * currently supported profiles.
314 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
317 stringprep_4zi (uint32_t * ucs4, size_t maxucs4len,
318 Stringprep_profile_flags flags,
319 const Stringprep_profile * profile)
321 size_t ucs4len;
323 for (ucs4len = 0; ucs4len < maxucs4len && ucs4[ucs4len] != 0; ucs4len++)
326 return stringprep_4zi_1 (ucs4, ucs4len, maxucs4len, flags, profile);
330 * stringprep:
331 * @in: input/output array with string to prepare.
332 * @maxlen: maximum length of input/output array.
333 * @flags: stringprep profile flags, or 0.
334 * @profile: pointer to stringprep profile to use.
336 * Prepare the input zero terminated UTF-8 string according to the
337 * stringprep profile, and write back the result to the input string.
339 * Note that you must convert strings entered in the systems locale
340 * into UTF-8 before using this function, see
341 * stringprep_locale_to_utf8().
343 * Since the stringprep operation can expand the string, @maxlen
344 * indicate how large the buffer holding the string is. This function
345 * will not read or write to characters outside that size.
347 * The @flags are one of Stringprep_profile_flags, or 0.
349 * The @profile contain the instructions to perform. Your application
350 * can define new profiles, possibly re-using the generic stringprep
351 * tables that always will be part of the library, or use one of the
352 * currently supported profiles.
354 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
357 stringprep (char *in,
358 size_t maxlen,
359 Stringprep_profile_flags flags,
360 const Stringprep_profile * profile)
362 int rc;
363 char *utf8 = NULL;
364 uint32_t *ucs4 = NULL;
365 size_t ucs4len, maxucs4len, adducs4len = 50;
369 if (ucs4)
370 free (ucs4);
371 ucs4 = stringprep_utf8_to_ucs4 (in, -1, &ucs4len);
372 maxucs4len = ucs4len + adducs4len;
373 uint32_t *newp = realloc (ucs4, maxucs4len * sizeof (uint32_t));
374 if (!newp)
376 free (ucs4);
377 return STRINGPREP_MALLOC_ERROR;
379 ucs4 = newp;
381 rc = stringprep_4i (ucs4, &ucs4len, maxucs4len, flags, profile);
382 adducs4len += 50;
384 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
385 if (rc != STRINGPREP_OK)
387 free (ucs4);
388 return rc;
391 utf8 = stringprep_ucs4_to_utf8 (ucs4, ucs4len, 0, 0);
392 free (ucs4);
393 if (!utf8)
394 return STRINGPREP_MALLOC_ERROR;
396 if (strlen (utf8) >= maxlen)
398 free (utf8);
399 return STRINGPREP_TOO_SMALL_BUFFER;
402 strcpy (in, utf8); /* flawfinder: ignore */
404 free (utf8);
406 return STRINGPREP_OK;
410 * stringprep_profile:
411 * @in: input array with UTF-8 string to prepare.
412 * @out: output variable with pointer to newly allocated string.
413 * @profile: name of stringprep profile to use.
414 * @flags: stringprep profile flags, or 0.
416 * Prepare the input zero terminated UTF-8 string according to the
417 * stringprep profile, and return the result in a newly allocated
418 * variable.
420 * Note that you must convert strings entered in the systems locale
421 * into UTF-8 before using this function, see
422 * stringprep_locale_to_utf8().
424 * The output @out variable must be deallocated by the caller.
426 * The @flags are one of Stringprep_profile_flags, or 0.
428 * The @profile specifies the name of the stringprep profile to use.
429 * It must be one of the internally supported stringprep profiles.
431 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
434 stringprep_profile (const char *in,
435 char **out,
436 const char *profile, Stringprep_profile_flags flags)
438 const Stringprep_profiles *p;
439 char *str = NULL;
440 size_t len = strlen (in) + 1;
441 int rc;
443 for (p = &stringprep_profiles[0]; p->name; p++)
444 if (strcmp (p->name, profile) == 0)
445 break;
447 if (!p || !p->name || !p->tables)
448 return STRINGPREP_UNKNOWN_PROFILE;
452 if (str)
453 free (str);
454 str = (char *) malloc (len);
455 if (str == NULL)
456 return STRINGPREP_MALLOC_ERROR;
458 strcpy (str, in);
460 rc = stringprep (str, len, flags, p->tables);
461 len += 50;
463 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
465 if (rc == STRINGPREP_OK)
466 *out = str;
467 else
468 free (str);
470 return rc;
473 /*! \mainpage GNU Internationalized Domain Name Library
475 * \section intro Introduction
477 * GNU Libidn is an implementation of the Stringprep, Punycode and IDNA
478 * specifications defined by the IETF Internationalized Domain Names
479 * (IDN) working group, used for internationalized domain names. The
480 * package is available under the GNU Lesser General Public License.
482 * The library contains a generic Stringprep implementation that does
483 * Unicode 3.2 NFKC normalization, mapping and prohibition of
484 * characters, and bidirectional character handling. Profiles for
485 * Nameprep, iSCSI, SASL and XMPP are included. Punycode and ASCII
486 * Compatible Encoding (ACE) via IDNA are supported. A mechanism to
487 * define Top-Level Domain (TLD) specific validation tables, and to
488 * compare strings against those tables, is included. Default tables
489 * for some TLDs are also included.
491 * The Stringprep API consists of two main functions, one for
492 * converting data from the system's native representation into UTF-8,
493 * and one function to perform the Stringprep processing. Adding a
494 * new Stringprep profile for your application within the API is
495 * straightforward. The Punycode API consists of one encoding
496 * function and one decoding function. The IDNA API consists of the
497 * ToASCII and ToUnicode functions, as well as a high-level interface
498 * for converting entire domain names to and from the ACE encoded
499 * form. The TLD API consists of one set of functions to extract the
500 * TLD name from a domain string, one set of functions to locate the
501 * proper TLD table to use based on the TLD name, and core functions
502 * to validate a string against a TLD table, and some utility wrappers
503 * to perform all the steps in one call.
505 * The library is used by, e.g., GNU SASL and Shishi to process user
506 * names and passwords. Libidn can be built into GNU Libc to enable a
507 * new system-wide getaddrinfo() flag for IDN processing.
509 * Libidn is developed for the GNU/Linux system, but runs on over 20 Unix
510 * platforms (including Solaris, IRIX, AIX, and Tru64) and Windows.
511 * Libidn is written in C and (parts of) the API is accessible from C,
512 * C++, Emacs Lisp, Python and Java.
514 * The project web page:\n
515 * http://www.gnu.org/software/libidn/
517 * The software archive:\n
518 * ftp://alpha.gnu.org/pub/gnu/libidn/
520 * For more information see:\n
521 * http://www.ietf.org/html.charters/idn-charter.html\n
522 * http://www.ietf.org/rfc/rfc3454.txt (stringprep specification)\n
523 * http://www.ietf.org/rfc/rfc3490.txt (idna specification)\n
524 * http://www.ietf.org/rfc/rfc3491.txt (nameprep specification)\n
525 * http://www.ietf.org/rfc/rfc3492.txt (punycode specification)\n
526 * http://www.ietf.org/internet-drafts/draft-ietf-ips-iscsi-string-prep-04.txt\n
527 * http://www.ietf.org/internet-drafts/draft-ietf-krb-wg-utf8-profile-01.txt\n
528 * http://www.ietf.org/internet-drafts/draft-ietf-sasl-anon-00.txt\n
529 * http://www.ietf.org/internet-drafts/draft-ietf-sasl-saslprep-00.txt\n
530 * http://www.ietf.org/internet-drafts/draft-ietf-xmpp-nodeprep-01.txt\n
531 * http://www.ietf.org/internet-drafts/draft-ietf-xmpp-resourceprep-01.txt\n
533 * Further information and paid contract development:\n
534 * Simon Josefsson <simon@josefsson.org>
536 * \section examples Examples
538 * \include example.c
539 * \include example3.c
540 * \include example4.c
541 * \include example5.c
545 * STRINGPREP_VERSION
547 * String defined via CPP denoting the header file version number.
548 * Used together with stringprep_check_version() to verify header file
549 * and run-time library consistency.
553 * STRINGPREP_MAX_MAP_CHARS
555 * Maximum number of code points that can replace a single code point,
556 * during stringprep mapping.
560 * Stringprep_rc:
561 * @STRINGPREP_OK: Successful operation. This value is guaranteed to
562 * always be zero, the remaining ones are only guaranteed to hold
563 * non-zero values, for logical comparison purposes.
564 * @STRINGPREP_CONTAINS_UNASSIGNED: String contain unassigned Unicode
565 * code points, which is forbidden by the profile.
566 * @STRINGPREP_CONTAINS_PROHIBITED: String contain code points
567 * prohibited by the profile.
568 * @STRINGPREP_BIDI_BOTH_L_AND_RAL: String contain code points with
569 * conflicting bidirection category.
570 * @STRINGPREP_BIDI_LEADTRAIL_NOT_RAL: Leading and trailing character
571 * in string not of proper bidirectional category.
572 * @STRINGPREP_BIDI_CONTAINS_PROHIBITED: Contains prohibited code
573 * points detected by bidirectional code.
574 * @STRINGPREP_TOO_SMALL_BUFFER: Buffer handed to function was too
575 * small. This usually indicate a problem in the calling
576 * application.
577 * @STRINGPREP_PROFILE_ERROR: The stringprep profile was inconsistent.
578 * This usually indicate an internal error in the library.
579 * @STRINGPREP_FLAG_ERROR: The supplied flag conflicted with profile.
580 * This usually indicate a problem in the calling application.
581 * @STRINGPREP_UNKNOWN_PROFILE: The supplied profile name was not
582 * known to the library.
583 * @STRINGPREP_NFKC_FAILED: The Unicode NFKC operation failed. This
584 * usually indicate an internal error in the library.
585 * @STRINGPREP_MALLOC_ERROR: The malloc() was out of memory. This is
586 * usually a fatal error.
588 * Enumerated return codes of stringprep(), stringprep_profile()
589 * functions (and macros using those functions). The value 0 is
590 * guaranteed to always correspond to success.
594 * Stringprep_profile_flags:
595 * @STRINGPREP_NO_NFKC: Disable the NFKC normalization, as well as
596 * selecting the non-NFKC case folding tables. Usually the profile
597 * specifies BIDI and NFKC settings, and applications should not
598 * override it unless in special situations.
599 * @STRINGPREP_NO_BIDI: Disable the BIDI step. Usually the profile
600 * specifies BIDI and NFKC settings, and applications should not
601 * override it unless in special situations.
602 * @STRINGPREP_NO_UNASSIGNED: Make the library return with an error if
603 * string contains unassigned characters according to profile.
605 * Stringprep profile flags.
609 * Stringprep_profile_steps:
611 * Various steps in the stringprep algorithm. You really want to
612 * study the source code to understand this one. Only useful if you
613 * want to add another profile.
617 * stringprep_nameprep:
618 * @in: input/output array with string to prepare.
619 * @maxlen: maximum length of input/output array.
621 * Prepare the input UTF-8 string according to the nameprep profile.
622 * The AllowUnassigned flag is true, use
623 * stringprep_nameprep_no_unassigned() if you want a false
624 * AllowUnassigned. Returns 0 iff successful, or an error code.
628 * stringprep_nameprep_no_unassigned:
629 * @in: input/output array with string to prepare.
630 * @maxlen: maximum length of input/output array.
632 * Prepare the input UTF-8 string according to the nameprep profile.
633 * The AllowUnassigned flag is false, use stringprep_nameprep() for
634 * true AllowUnassigned. Returns 0 iff successful, or an error code.
638 * stringprep_iscsi:
639 * @in: input/output array with string to prepare.
640 * @maxlen: maximum length of input/output array.
642 * Prepare the input UTF-8 string according to the draft iSCSI
643 * stringprep profile. Returns 0 iff successful, or an error code.
647 * stringprep_plain:
648 * @in: input/output array with string to prepare.
649 * @maxlen: maximum length of input/output array.
651 * Prepare the input UTF-8 string according to the draft SASL
652 * ANONYMOUS profile. Returns 0 iff successful, or an error code.
656 * stringprep_xmpp_nodeprep:
657 * @in: input/output array with string to prepare.
658 * @maxlen: maximum length of input/output array.
660 * Prepare the input UTF-8 string according to the draft XMPP node
661 * identifier profile. Returns 0 iff successful, or an error code.
665 * stringprep_xmpp_resourceprep:
666 * @in: input/output array with string to prepare.
667 * @maxlen: maximum length of input/output array.
669 * Prepare the input UTF-8 string according to the draft XMPP resource
670 * identifier profile. Returns 0 iff successful, or an error code.