*** empty log message ***
[libidn.git] / lib / stringprep.c
blobc2d0e61491b14ed8e0d9eb404ed74ad2f3aa7975
1 /* stringprep.c --- Core stringprep implementation.
2 * Copyright (C) 2002, 2003, 2004 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 #ifdef HAVE_CONFIG_H
23 # include "config.h"
24 #endif
26 #include <stdlib.h>
27 #include <string.h>
29 #include "stringprep.h"
31 static ssize_t
32 stringprep_find_character_in_table (uint32_t ucs4,
33 const Stringprep_table_element * table)
35 ssize_t i;
37 /* This is where typical uses of Libidn spends very close to all CPU
38 time and causes most cache misses. One could easily do a binary
39 search instead. Before rewriting this, I want hard evidence this
40 slowness is at all relevant in typical applications. (I don't
41 dispute optimization may improve matters significantly, I'm
42 mostly interested in having someone give real-world benchmark on
43 the impact of libidn.) */
45 for (i = 0; table[i].start || table[i].end; i++)
46 if (ucs4 >= table[i].start &&
47 ucs4 <= (table[i].end ? table[i].end : table[i].start))
48 return i;
50 return -1;
53 static ssize_t
54 stringprep_find_string_in_table (uint32_t * ucs4,
55 size_t ucs4len,
56 size_t * tablepos,
57 const Stringprep_table_element * table)
59 size_t j;
60 ssize_t pos;
62 for (j = 0; j < ucs4len; j++)
63 if ((pos = stringprep_find_character_in_table (ucs4[j], table)) != -1)
65 if (tablepos)
66 *tablepos = pos;
67 return j;
70 return -1;
73 static int
74 stringprep_apply_table_to_string (uint32_t * ucs4,
75 size_t * ucs4len,
76 size_t maxucs4len,
77 const Stringprep_table_element * table)
79 ssize_t pos;
80 size_t i, maplen;
82 while ((pos = stringprep_find_string_in_table (ucs4, *ucs4len,
83 &i, table)) != -1)
85 for (maplen = STRINGPREP_MAX_MAP_CHARS;
86 maplen > 0 && table[i].map[maplen - 1] == 0; maplen--)
89 if (*ucs4len - 1 + maplen >= maxucs4len)
90 return STRINGPREP_TOO_SMALL_BUFFER;
92 memmove (&ucs4[pos + maplen], &ucs4[pos + 1],
93 sizeof (uint32_t) * (*ucs4len - pos - 1));
94 memcpy (&ucs4[pos], table[i].map, sizeof (uint32_t) * maplen);
95 *ucs4len = *ucs4len - 1 + maplen;
98 return STRINGPREP_OK;
/* INVERTED(x) keeps every bit of X except the most significant bit of
   an unsigned long; it is nonzero when any of those lower bits is
   set. */
#define INVERTED(x) ((x) & ((~0UL) >> 1))

/* UNAPPLICAPLEFLAGS(flags, profileflags) is nonzero in two cases:

   - PROFILEFLAGS is nonzero, INVERTED(PROFILEFLAGS) is zero, and
     PROFILEFLAGS shares no bit with FLAGS; or
   - INVERTED(PROFILEFLAGS) is nonzero and PROFILEFLAGS shares at
     least one bit with FLAGS.

   stringprep_4i() skips a profile step when this is nonzero.  Both
   arguments are fully parenthesized so that compound expressions
   (e.g. "a | b") are grouped as written by the caller. */
#define UNAPPLICAPLEFLAGS(flags, profileflags)                          \
  ((!INVERTED(profileflags) && !((profileflags) & (flags))              \
    && (profileflags)) ||                                               \
   ( INVERTED(profileflags) && ((profileflags) & (flags))))
107 * stringprep_4i - prepare internationalized string
108 * @ucs4: input/output array with string to prepare.
109 * @len: on input, length of input array with Unicode code points,
110 * on exit, length of output array with Unicode code points.
111 * @maxucs4len: maximum length of input/output array.
112 * @flags: a #Stringprep_profile_flags value, or 0.
113 * @profile: pointer to #Stringprep_profile to use.
115 * Prepare the input UCS-4 string according to the stringprep profile,
116 * and write back the result to the input string.
118 * The input is not required to be zero terminated (@ucs4[@len] = 0).
119 * The output will not be zero terminated unless @ucs4[@len] = 0.
120 * Instead, see stringprep_4zi() if your input is zero terminated or
121 * if you want the output to be.
123 * Since the stringprep operation can expand the string, @maxucs4len
124 * indicate how large the buffer holding the string is. This function
125 * will not read or write to code points outside that size.
127 * The @flags are one of #Stringprep_profile_flags values, or 0.
129 * The @profile contain the #Stringprep_profile instructions to
130 * perform. Your application can define new profiles, possibly
131 * re-using the generic stringprep tables that always will be part of
132 * the library, or use one of the currently supported profiles.
134 * Return value: Returns %STRINGPREP_OK iff successful, or an
135 * #Stringprep_rc error code.
138 stringprep_4i (uint32_t * ucs4, size_t * len, size_t maxucs4len,
139 Stringprep_profile_flags flags,
140 const Stringprep_profile * profile)
142 size_t i, j;
143 ssize_t k;
144 size_t ucs4len = *len;
145 int rc;
147 for (i = 0; profile[i].operation; i++)
149 switch (profile[i].operation)
151 case STRINGPREP_NFKC:
153 uint32_t *q = 0;
155 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
156 break;
158 if (flags & STRINGPREP_NO_NFKC && !profile[i].flags)
159 /* Profile requires NFKC, but callee asked for no NFKC. */
160 return STRINGPREP_FLAG_ERROR;
162 q = stringprep_ucs4_nfkc_normalize (ucs4, ucs4len);
163 if (!q)
164 return STRINGPREP_NFKC_FAILED;
166 for (ucs4len = 0; q[ucs4len]; ucs4len++)
169 if (ucs4len >= maxucs4len)
171 free (q);
172 return STRINGPREP_TOO_SMALL_BUFFER;
175 memcpy (ucs4, q, ucs4len * sizeof (ucs4[0]));
177 free (q);
179 break;
181 case STRINGPREP_PROHIBIT_TABLE:
182 k = stringprep_find_string_in_table (ucs4, ucs4len,
183 NULL, profile[i].table);
184 if (k != -1)
185 return STRINGPREP_CONTAINS_PROHIBITED;
186 break;
188 case STRINGPREP_UNASSIGNED_TABLE:
189 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
190 break;
191 if (flags & STRINGPREP_NO_UNASSIGNED)
193 k = stringprep_find_string_in_table
194 (ucs4, ucs4len, NULL, profile[i].table);
195 if (k != -1)
196 return STRINGPREP_CONTAINS_UNASSIGNED;
198 break;
200 case STRINGPREP_MAP_TABLE:
201 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
202 break;
203 rc = stringprep_apply_table_to_string
204 (ucs4, &ucs4len, maxucs4len, profile[i].table);
205 if (rc != STRINGPREP_OK)
206 return rc;
207 break;
209 case STRINGPREP_BIDI_PROHIBIT_TABLE:
210 case STRINGPREP_BIDI_RAL_TABLE:
211 case STRINGPREP_BIDI_L_TABLE:
212 break;
214 case STRINGPREP_BIDI:
216 int done_prohibited = 0;
217 int done_ral = 0;
218 int done_l = 0;
219 int contains_ral = -1;
220 int contains_l = -1;
222 for (j = 0; profile[j].operation; j++)
223 if (profile[j].operation == STRINGPREP_BIDI_PROHIBIT_TABLE)
225 done_prohibited = 1;
226 k = stringprep_find_string_in_table (ucs4, ucs4len,
227 NULL,
228 profile[j].table);
229 if (k != -1)
230 return STRINGPREP_BIDI_CONTAINS_PROHIBITED;
232 else if (profile[j].operation == STRINGPREP_BIDI_RAL_TABLE)
234 done_ral = 1;
235 if (stringprep_find_string_in_table
236 (ucs4, ucs4len, NULL, profile[j].table) != -1)
237 contains_ral = j;
239 else if (profile[j].operation == STRINGPREP_BIDI_L_TABLE)
241 done_l = 1;
242 if (stringprep_find_string_in_table
243 (ucs4, ucs4len, NULL, profile[j].table) != -1)
244 contains_l = j;
247 if (!done_prohibited || !done_ral || !done_l)
248 return STRINGPREP_PROFILE_ERROR;
250 if (contains_ral != -1 && contains_l != -1)
251 return STRINGPREP_BIDI_BOTH_L_AND_RAL;
253 if (contains_ral != -1)
255 if (!(stringprep_find_character_in_table
256 (ucs4[0], profile[contains_ral].table) != -1 &&
257 stringprep_find_character_in_table
258 (ucs4[ucs4len - 1], profile[contains_ral].table) != -1))
259 return STRINGPREP_BIDI_LEADTRAIL_NOT_RAL;
262 break;
264 default:
265 return STRINGPREP_PROFILE_ERROR;
266 break;
270 *len = ucs4len;
272 return STRINGPREP_OK;
275 static int
276 stringprep_4zi_1 (uint32_t * ucs4, size_t ucs4len, size_t maxucs4len,
277 Stringprep_profile_flags flags,
278 const Stringprep_profile * profile)
280 int rc;
282 rc = stringprep_4i (ucs4, &ucs4len, maxucs4len, flags, profile);
283 if (rc != STRINGPREP_OK)
284 return rc;
286 if (ucs4len >= maxucs4len)
287 return STRINGPREP_TOO_SMALL_BUFFER;
289 ucs4[ucs4len] = 0;
291 return STRINGPREP_OK;
295 * stringprep_4zi - prepare internationalized string
296 * @ucs4: input/output array with zero terminated string to prepare.
297 * @maxucs4len: maximum length of input/output array.
298 * @flags: a #Stringprep_profile_flags value, or 0.
299 * @profile: pointer to #Stringprep_profile to use.
301 * Prepare the input zero terminated UCS-4 string according to the
302 * stringprep profile, and write back the result to the input string.
304 * Since the stringprep operation can expand the string, @maxucs4len
305 * indicate how large the buffer holding the string is. This function
306 * will not read or write to code points outside that size.
308 * The @flags are one of #Stringprep_profile_flags values, or 0.
310 * The @profile contain the #Stringprep_profile instructions to
311 * perform. Your application can define new profiles, possibly
312 * re-using the generic stringprep tables that always will be part of
313 * the library, or use one of the currently supported profiles.
315 * Return value: Returns %STRINGPREP_OK iff successful, or an
316 * #Stringprep_rc error code.
319 stringprep_4zi (uint32_t * ucs4, size_t maxucs4len,
320 Stringprep_profile_flags flags,
321 const Stringprep_profile * profile)
323 size_t ucs4len;
325 for (ucs4len = 0; ucs4len < maxucs4len && ucs4[ucs4len] != 0; ucs4len++)
328 return stringprep_4zi_1 (ucs4, ucs4len, maxucs4len, flags, profile);
332 * stringprep - prepare internationalized string
333 * @in: input/ouput array with string to prepare.
334 * @maxlen: maximum length of input/output array.
335 * @flags: a #Stringprep_profile_flags value, or 0.
336 * @profile: pointer to #Stringprep_profile to use.
338 * Prepare the input zero terminated UTF-8 string according to the
339 * stringprep profile, and write back the result to the input string.
341 * Note that you must convert strings entered in the systems locale
342 * into UTF-8 before using this function, see
343 * stringprep_locale_to_utf8().
345 * Since the stringprep operation can expand the string, @maxlen
346 * indicate how large the buffer holding the string is. This function
347 * will not read or write to characters outside that size.
349 * The @flags are one of #Stringprep_profile_flags values, or 0.
351 * The @profile contain the #Stringprep_profile instructions to
352 * perform. Your application can define new profiles, possibly
353 * re-using the generic stringprep tables that always will be part of
354 * the library, or use one of the currently supported profiles.
356 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
359 stringprep (char *in,
360 size_t maxlen,
361 Stringprep_profile_flags flags,
362 const Stringprep_profile * profile)
364 int rc;
365 char *utf8 = NULL;
366 uint32_t *ucs4 = NULL;
367 size_t ucs4len, maxucs4len, adducs4len = 50;
371 uint32_t *newp;
373 if (ucs4)
374 free (ucs4);
375 ucs4 = stringprep_utf8_to_ucs4 (in, -1, &ucs4len);
376 maxucs4len = ucs4len + adducs4len;
377 newp = realloc (ucs4, maxucs4len * sizeof (uint32_t));
378 if (!newp)
380 free (ucs4);
381 return STRINGPREP_MALLOC_ERROR;
383 ucs4 = newp;
385 rc = stringprep_4i (ucs4, &ucs4len, maxucs4len, flags, profile);
386 adducs4len += 50;
388 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
389 if (rc != STRINGPREP_OK)
391 free (ucs4);
392 return rc;
395 utf8 = stringprep_ucs4_to_utf8 (ucs4, ucs4len, 0, 0);
396 free (ucs4);
397 if (!utf8)
398 return STRINGPREP_MALLOC_ERROR;
400 if (strlen (utf8) >= maxlen)
402 free (utf8);
403 return STRINGPREP_TOO_SMALL_BUFFER;
406 strcpy (in, utf8); /* flawfinder: ignore */
408 free (utf8);
410 return STRINGPREP_OK;
414 * stringprep_profile - prepare internationalized string
415 * @in: input array with UTF-8 string to prepare.
416 * @out: output variable with pointer to newly allocate string.
417 * @profile: name of stringprep profile to use.
418 * @flags: a #Stringprep_profile_flags value, or 0.
420 * Prepare the input zero terminated UTF-8 string according to the
421 * stringprep profile, and return the result in a newly allocated
422 * variable.
424 * Note that you must convert strings entered in the systems locale
425 * into UTF-8 before using this function, see
426 * stringprep_locale_to_utf8().
428 * The output @out variable must be deallocated by the caller.
430 * The @flags are one of #Stringprep_profile_flags values, or 0.
432 * The @profile specifies the name of the stringprep profile to use.
433 * It must be one of the internally supported stringprep profiles.
435 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
438 stringprep_profile (const char *in,
439 char **out,
440 const char *profile, Stringprep_profile_flags flags)
442 const Stringprep_profiles *p;
443 char *str = NULL;
444 size_t len = strlen (in) + 1;
445 int rc;
447 for (p = &stringprep_profiles[0]; p->name; p++)
448 if (strcmp (p->name, profile) == 0)
449 break;
451 if (!p || !p->name || !p->tables)
452 return STRINGPREP_UNKNOWN_PROFILE;
456 if (str)
457 free (str);
458 str = (char *) malloc (len);
459 if (str == NULL)
460 return STRINGPREP_MALLOC_ERROR;
462 strcpy (str, in);
464 rc = stringprep (str, len, flags, p->tables);
465 len += 50;
467 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
469 if (rc == STRINGPREP_OK)
470 *out = str;
471 else
472 free (str);
474 return rc;
477 /*! \mainpage GNU Internationalized Domain Name Library
479 * \section intro Introduction
481 * GNU Libidn is an implementation of the Stringprep, Punycode and IDNA
482 * specifications defined by the IETF Internationalized Domain Names
483 * (IDN) working group, used for internationalized domain names. The
484 * package is available under the GNU Lesser General Public License.
486 * The library contains a generic Stringprep implementation that does
487 * Unicode 3.2 NFKC normalization, mapping and prohibition of
488 * characters, and bidirectional character handling. Profiles for
489 * Nameprep, iSCSI, SASL and XMPP are included. Punycode and ASCII
490 * Compatible Encoding (ACE) via IDNA are supported. A mechanism to
491 * define Top-Level Domain (TLD) specific validation tables, and to
492 * compare strings against those tables, is included. Default tables
493 * for some TLDs are also included.
495 * The Stringprep API consists of two main functions, one for
496 * converting data from the system's native representation into UTF-8,
497 * and one function to perform the Stringprep processing. Adding a
498 * new Stringprep profile for your application within the API is
499 * straightforward. The Punycode API consists of one encoding
500 * function and one decoding function. The IDNA API consists of the
501 * ToASCII and ToUnicode functions, as well as a high-level interface
502 * for converting entire domain names to and from the ACE encoded
503 * form. The TLD API consists of one set of functions to extract the
504 * TLD name from a domain string, one set of functions to locate the
505 * proper TLD table to use based on the TLD name, and core functions
506 * to validate a string against a TLD table, and some utility wrappers
507 * to perform all the steps in one call.
509 * The library is used by, e.g., GNU SASL and Shishi to process user
510 * names and passwords. Libidn can be built into GNU Libc to enable a
511 * new system-wide getaddrinfo() flag for IDN processing.
513 * Libidn is developed for the GNU/Linux system, but runs on over 20 Unix
514 * platforms (including Solaris, IRIX, AIX, and Tru64) and Windows.
515 * Libidn is written in C and (parts of) the API is accessible from C,
516 * C++, Emacs Lisp, Python and Java.
518 * The project web page:\n
519 * http://www.gnu.org/software/libidn/
521 * The software archive:\n
522 * ftp://alpha.gnu.org/pub/gnu/libidn/
524 * For more information see:\n
525 * http://www.ietf.org/html.charters/idn-charter.html\n
526 * http://www.ietf.org/rfc/rfc3454.txt (stringprep specification)\n
527 * http://www.ietf.org/rfc/rfc3490.txt (idna specification)\n
528 * http://www.ietf.org/rfc/rfc3491.txt (nameprep specification)\n
529 * http://www.ietf.org/rfc/rfc3492.txt (punycode specification)\n
530 * http://www.ietf.org/internet-drafts/draft-ietf-ips-iscsi-string-prep-04.txt\n
531 * http://www.ietf.org/internet-drafts/draft-ietf-krb-wg-utf8-profile-01.txt\n
532 * http://www.ietf.org/internet-drafts/draft-ietf-sasl-anon-00.txt\n
533 * http://www.ietf.org/internet-drafts/draft-ietf-sasl-saslprep-00.txt\n
534 * http://www.ietf.org/internet-drafts/draft-ietf-xmpp-nodeprep-01.txt\n
535 * http://www.ietf.org/internet-drafts/draft-ietf-xmpp-resourceprep-01.txt\n
537 * Further information and paid contract development:\n
538 * Simon Josefsson <simon@josefsson.org>
540 * \section examples Examples
542 * \include example.c
543 * \include example3.c
544 * \include example4.c
545 * \include example5.c
549 * STRINGPREP_VERSION
551 * String defined via CPP denoting the header file version number.
552 * Used together with stringprep_check_version() to verify header file
553 * and run-time library consistency.
557 * STRINGPREP_MAX_MAP_CHARS
559 * Maximum number of code points that can replace a single code point,
560 * during stringprep mapping.
564 * Stringprep_rc:
565 * @STRINGPREP_OK: Successful operation. This value is guaranteed to
566 * always be zero, the remaining ones are only guaranteed to hold
567 * non-zero values, for logical comparison purposes.
568 * @STRINGPREP_CONTAINS_UNASSIGNED: String contains unassigned Unicode
569 * code points, which is forbidden by the profile.
570 * @STRINGPREP_CONTAINS_PROHIBITED: String contains code points
571 * prohibited by the profile.
572 * @STRINGPREP_BIDI_BOTH_L_AND_RAL: String contains code points with
573 * conflicting bidirectional categories.
574 * @STRINGPREP_BIDI_LEADTRAIL_NOT_RAL: Leading and trailing character
575 * in string not of proper bidirectional category.
576 * @STRINGPREP_BIDI_CONTAINS_PROHIBITED: Contains prohibited code
577 * points detected by bidirectional code.
578 * @STRINGPREP_TOO_SMALL_BUFFER: Buffer handed to function was too
579 * small. This usually indicates a problem in the calling
580 * application.
581 * @STRINGPREP_PROFILE_ERROR: The stringprep profile was inconsistent.
582 * This usually indicates an internal error in the library.
583 * @STRINGPREP_FLAG_ERROR: The supplied flag conflicted with profile.
584 * This usually indicates a problem in the calling application.
585 * @STRINGPREP_UNKNOWN_PROFILE: The supplied profile name was not
586 * known to the library.
587 * @STRINGPREP_NFKC_FAILED: The Unicode NFKC operation failed. This
588 * usually indicates an internal error in the library.
589 * @STRINGPREP_MALLOC_ERROR: The malloc() was out of memory. This is
590 * usually a fatal error.
592 * Enumerated return codes of stringprep(), stringprep_profile()
593 * functions (and macros using those functions). The value 0 is
594 * guaranteed to always correspond to success.
598 * Stringprep_profile_flags:
599 * @STRINGPREP_NO_NFKC: Disable the NFKC normalization, as well as
600 * selecting the non-NFKC case folding tables. Usually the profile
601 * specifies BIDI and NFKC settings, and applications should not
602 * override it unless in special situations.
603 * @STRINGPREP_NO_BIDI: Disable the BIDI step. Usually the profile
604 * specifies BIDI and NFKC settings, and applications should not
605 * override it unless in special situations.
606 * @STRINGPREP_NO_UNASSIGNED: Make the library return with an error if
607 * string contains unassigned characters according to profile.
609 * Stringprep profile flags.
613 * Stringprep_profile_steps:
615 * Various steps in the stringprep algorithm. You really want to
616 * study the source code to understand this one. Only useful if you
617 * want to add another profile.
621 * stringprep_nameprep:
622 * @in: input/output array with string to prepare.
623 * @maxlen: maximum length of input/output array.
625 * Prepare the input UTF-8 string according to the nameprep profile.
626 * The AllowUnassigned flag is true, use
627 * stringprep_nameprep_no_unassigned() if you want a false
628 * AllowUnassigned. Returns 0 iff successful, or an error code.
632 * stringprep_nameprep_no_unassigned:
633 * @in: input/output array with string to prepare.
634 * @maxlen: maximum length of input/output array.
636 * Prepare the input UTF-8 string according to the nameprep profile.
637 * The AllowUnassigned flag is false, use stringprep_nameprep() for
638 * true AllowUnassigned. Returns 0 iff successful, or an error code.
642 * stringprep_iscsi:
643 * @in: input/output array with string to prepare.
644 * @maxlen: maximum length of input/output array.
646 * Prepare the input UTF-8 string according to the draft iSCSI
647 * stringprep profile. Returns 0 iff successful, or an error code.
651 * stringprep_plain:
652 * @in: input/output array with string to prepare.
653 * @maxlen: maximum length of input/output array.
655 * Prepare the input UTF-8 string according to the draft SASL
656 * ANONYMOUS profile. Returns 0 iff successful, or an error code.
660 * stringprep_xmpp_nodeprep:
661 * @in: input/ouput array with string to prepare.
662 * @maxlen: maximum length of input/output array.
664 * Prepare the input UTF-8 string according to the draft XMPP node
665 * identifier profile. Returns 0 iff successful, or an error code.
669 * stringprep_xmpp_resourceprep:
670 * @in: input/ouput array with string to prepare.
671 * @maxlen: maximum length of input/output array.
673 * Prepare the input UTF-8 string according to the draft XMPP resource
674 * identifier profile. Returns 0 iff successful, or an error code.