Fix non-ASCII case of SSE4.2 strcasstr.
[glibc.git] / libidn / stringprep.c
blobf5c9fae0735df5ea96dbbfc94e15e3f027fb9683
1 /* stringprep.c --- Core stringprep implementation.
2 * Copyright (C) 2002, 2003, 2004 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #if HAVE_CONFIG_H
23 # include "config.h"
24 #endif
26 #include <stdlib.h>
27 #include <string.h>
29 #include "stringprep.h"
31 static ssize_t
32 stringprep_find_character_in_table (uint32_t ucs4,
33 const Stringprep_table_element * table)
35 ssize_t i;
37 /* This is where typical uses of Libidn spends very close to all CPU
38 time and causes most cache misses. One could easily do a binary
39 search instead. Before rewriting this, I want hard evidence this
40 slowness is at all relevant in typical applications. (I don't
41 dispute optimization may improve matters significantly, I'm
42 mostly interested in having someone give real-world benchmark on
43 the impact of libidn.) */
45 for (i = 0; table[i].start || table[i].end; i++)
46 if (ucs4 >= table[i].start &&
47 ucs4 <= (table[i].end ? table[i].end : table[i].start))
48 return i;
50 return -1;
53 static ssize_t
54 stringprep_find_string_in_table (uint32_t * ucs4,
55 size_t ucs4len,
56 size_t * tablepos,
57 const Stringprep_table_element * table)
59 size_t j;
60 ssize_t pos;
62 for (j = 0; j < ucs4len; j++)
63 if ((pos = stringprep_find_character_in_table (ucs4[j], table)) != -1)
65 if (tablepos)
66 *tablepos = pos;
67 return j;
70 return -1;
73 static int
74 stringprep_apply_table_to_string (uint32_t * ucs4,
75 size_t * ucs4len,
76 size_t maxucs4len,
77 const Stringprep_table_element * table)
79 ssize_t pos;
80 size_t i, maplen;
82 while ((pos = stringprep_find_string_in_table (ucs4, *ucs4len,
83 &i, table)) != -1)
85 for (maplen = STRINGPREP_MAX_MAP_CHARS;
86 maplen > 0 && table[i].map[maplen - 1] == 0; maplen--)
89 if (*ucs4len - 1 + maplen >= maxucs4len)
90 return STRINGPREP_TOO_SMALL_BUFFER;
92 memmove (&ucs4[pos + maplen], &ucs4[pos + 1],
93 sizeof (uint32_t) * (*ucs4len - pos - 1));
94 memcpy (&ucs4[pos], table[i].map, sizeof (uint32_t) * maplen);
95 *ucs4len = *ucs4len - 1 + maplen;
98 return STRINGPREP_OK;
/* INVERTED(x): X with the most significant bit of an unsigned long
   masked off.  The result is non-zero whenever X has any other bit
   set.  */
#define INVERTED(x) ((x) & ((~0UL) >> 1))

/* UNAPPLICAPLEFLAGS(flags, profileflags): non-zero when either
     - INVERTED (profileflags) is zero, PROFILEFLAGS itself is
       non-zero, and PROFILEFLAGS shares no bit with FLAGS; or
     - INVERTED (profileflags) is non-zero and PROFILEFLAGS shares at
       least one bit with FLAGS.
   Callers use a non-zero result to skip the profile step.

   Every argument is parenthesised at each use so that expressions
   such as `a | b' can be passed safely.  Both arguments are evaluated
   more than once, so they must be free of side effects.  */
#define UNAPPLICAPLEFLAGS(flags, profileflags) \
  ((!INVERTED (profileflags) && !((profileflags) & (flags)) \
    && (profileflags)) || \
   ( INVERTED (profileflags) && ((profileflags) & (flags))))
107 * stringprep_4i:
108 * @ucs4: input/output array with string to prepare.
109 * @len: on input, length of input array with Unicode code points,
110 * on exit, length of output array with Unicode code points.
111 * @maxucs4len: maximum length of input/output array.
112 * @flags: stringprep profile flags, or 0.
113 * @profile: pointer to stringprep profile to use.
115 * Prepare the input UCS-4 string according to the stringprep profile,
116 * and write back the result to the input string.
118 * The input is not required to be zero terminated (@ucs4[@len] = 0).
119 * The output will not be zero terminated unless @ucs4[@len] = 0.
120 * Instead, see stringprep_4zi() if your input is zero terminated or
121 * if you want the output to be.
123 * Since the stringprep operation can expand the string, @maxucs4len
124 * indicate how large the buffer holding the string is. This function
125 * will not read or write to code points outside that size.
127 * The @flags are one of Stringprep_profile_flags, or 0.
129 * The @profile contain the instructions to perform. Your application
130 * can define new profiles, possibly re-using the generic stringprep
131 * tables that always will be part of the library, or use one of the
132 * currently supported profiles.
134 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
137 stringprep_4i (uint32_t * ucs4, size_t * len, size_t maxucs4len,
138 Stringprep_profile_flags flags,
139 const Stringprep_profile * profile)
141 size_t i, j;
142 ssize_t k;
143 size_t ucs4len = *len;
144 int rc;
146 for (i = 0; profile[i].operation; i++)
148 switch (profile[i].operation)
150 case STRINGPREP_NFKC:
152 uint32_t *q = 0;
154 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
155 break;
157 if (flags & STRINGPREP_NO_NFKC && !profile[i].flags)
158 /* Profile requires NFKC, but callee asked for no NFKC. */
159 return STRINGPREP_FLAG_ERROR;
161 q = stringprep_ucs4_nfkc_normalize (ucs4, ucs4len);
162 if (!q)
163 return STRINGPREP_NFKC_FAILED;
165 for (ucs4len = 0; q[ucs4len]; ucs4len++)
168 if (ucs4len >= maxucs4len)
170 free (q);
171 return STRINGPREP_TOO_SMALL_BUFFER;
174 memcpy (ucs4, q, ucs4len * sizeof (ucs4[0]));
176 free (q);
178 break;
180 case STRINGPREP_PROHIBIT_TABLE:
181 k = stringprep_find_string_in_table (ucs4, ucs4len,
182 NULL, profile[i].table);
183 if (k != -1)
184 return STRINGPREP_CONTAINS_PROHIBITED;
185 break;
187 case STRINGPREP_UNASSIGNED_TABLE:
188 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
189 break;
190 if (flags & STRINGPREP_NO_UNASSIGNED)
192 k = stringprep_find_string_in_table
193 (ucs4, ucs4len, NULL, profile[i].table);
194 if (k != -1)
195 return STRINGPREP_CONTAINS_UNASSIGNED;
197 break;
199 case STRINGPREP_MAP_TABLE:
200 if (UNAPPLICAPLEFLAGS (flags, profile[i].flags))
201 break;
202 rc = stringprep_apply_table_to_string
203 (ucs4, &ucs4len, maxucs4len, profile[i].table);
204 if (rc != STRINGPREP_OK)
205 return rc;
206 break;
208 case STRINGPREP_BIDI_PROHIBIT_TABLE:
209 case STRINGPREP_BIDI_RAL_TABLE:
210 case STRINGPREP_BIDI_L_TABLE:
211 break;
213 case STRINGPREP_BIDI:
215 int done_prohibited = 0;
216 int done_ral = 0;
217 int done_l = 0;
218 int contains_ral = -1;
219 int contains_l = -1;
221 for (j = 0; profile[j].operation; j++)
222 if (profile[j].operation == STRINGPREP_BIDI_PROHIBIT_TABLE)
224 done_prohibited = 1;
225 k = stringprep_find_string_in_table (ucs4, ucs4len,
226 NULL,
227 profile[j].table);
228 if (k != -1)
229 return STRINGPREP_BIDI_CONTAINS_PROHIBITED;
231 else if (profile[j].operation == STRINGPREP_BIDI_RAL_TABLE)
233 done_ral = 1;
234 if (stringprep_find_string_in_table
235 (ucs4, ucs4len, NULL, profile[j].table) != -1)
236 contains_ral = j;
238 else if (profile[j].operation == STRINGPREP_BIDI_L_TABLE)
240 done_l = 1;
241 if (stringprep_find_string_in_table
242 (ucs4, ucs4len, NULL, profile[j].table) != -1)
243 contains_l = j;
246 if (!done_prohibited || !done_ral || !done_l)
247 return STRINGPREP_PROFILE_ERROR;
249 if (contains_ral != -1 && contains_l != -1)
250 return STRINGPREP_BIDI_BOTH_L_AND_RAL;
252 if (contains_ral != -1)
254 if (!(stringprep_find_character_in_table
255 (ucs4[0], profile[contains_ral].table) != -1 &&
256 stringprep_find_character_in_table
257 (ucs4[ucs4len - 1], profile[contains_ral].table) != -1))
258 return STRINGPREP_BIDI_LEADTRAIL_NOT_RAL;
261 break;
263 default:
264 return STRINGPREP_PROFILE_ERROR;
265 break;
269 *len = ucs4len;
271 return STRINGPREP_OK;
274 static int
275 stringprep_4zi_1 (uint32_t * ucs4, size_t ucs4len, size_t maxucs4len,
276 Stringprep_profile_flags flags,
277 const Stringprep_profile * profile)
279 int rc;
281 rc = stringprep_4i (ucs4, &ucs4len, maxucs4len, flags, profile);
282 if (rc != STRINGPREP_OK)
283 return rc;
285 if (ucs4len >= maxucs4len)
286 return STRINGPREP_TOO_SMALL_BUFFER;
288 ucs4[ucs4len] = 0;
290 return STRINGPREP_OK;
294 * stringprep_4zi:
295 * @ucs4: input/output array with zero terminated string to prepare.
296 * @maxucs4len: maximum length of input/output array.
297 * @flags: stringprep profile flags, or 0.
298 * @profile: pointer to stringprep profile to use.
300 * Prepare the input zero terminated UCS-4 string according to the
301 * stringprep profile, and write back the result to the input string.
303 * Since the stringprep operation can expand the string, @maxucs4len
304 * indicate how large the buffer holding the string is. This function
305 * will not read or write to code points outside that size.
307 * The @flags are one of Stringprep_profile_flags, or 0.
309 * The @profile contain the instructions to perform. Your application
310 * can define new profiles, possibly re-using the generic stringprep
311 * tables that always will be part of the library, or use one of the
312 * currently supported profiles.
314 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
317 stringprep_4zi (uint32_t * ucs4, size_t maxucs4len,
318 Stringprep_profile_flags flags,
319 const Stringprep_profile * profile)
321 size_t ucs4len;
323 for (ucs4len = 0; ucs4len < maxucs4len && ucs4[ucs4len] != 0; ucs4len++)
326 return stringprep_4zi_1 (ucs4, ucs4len, maxucs4len, flags, profile);
330 * stringprep:
331 * @in: input/ouput array with string to prepare.
332 * @maxlen: maximum length of input/output array.
333 * @flags: stringprep profile flags, or 0.
334 * @profile: pointer to stringprep profile to use.
336 * Prepare the input zero terminated UTF-8 string according to the
337 * stringprep profile, and write back the result to the input string.
339 * Note that you must convert strings entered in the systems locale
340 * into UTF-8 before using this function, see
341 * stringprep_locale_to_utf8().
343 * Since the stringprep operation can expand the string, @maxlen
344 * indicate how large the buffer holding the string is. This function
345 * will not read or write to characters outside that size.
347 * The @flags are one of Stringprep_profile_flags, or 0.
349 * The @profile contain the instructions to perform. Your application
350 * can define new profiles, possibly re-using the generic stringprep
351 * tables that always will be part of the library, or use one of the
352 * currently supported profiles.
354 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
357 stringprep (char *in,
358 size_t maxlen,
359 Stringprep_profile_flags flags,
360 const Stringprep_profile * profile)
362 int rc;
363 char *utf8 = NULL;
364 uint32_t *ucs4 = NULL;
365 size_t ucs4len, maxucs4len, adducs4len = 50;
369 free (ucs4);
370 ucs4 = stringprep_utf8_to_ucs4 (in, -1, &ucs4len);
371 maxucs4len = ucs4len + adducs4len;
372 uint32_t *newp = realloc (ucs4, maxucs4len * sizeof (uint32_t));
373 if (!newp)
375 free (ucs4);
376 return STRINGPREP_MALLOC_ERROR;
378 ucs4 = newp;
380 rc = stringprep_4i (ucs4, &ucs4len, maxucs4len, flags, profile);
381 adducs4len += 50;
383 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
384 if (rc != STRINGPREP_OK)
386 free (ucs4);
387 return rc;
390 utf8 = stringprep_ucs4_to_utf8 (ucs4, ucs4len, 0, 0);
391 free (ucs4);
392 if (!utf8)
393 return STRINGPREP_MALLOC_ERROR;
395 if (strlen (utf8) >= maxlen)
397 free (utf8);
398 return STRINGPREP_TOO_SMALL_BUFFER;
401 strcpy (in, utf8); /* flawfinder: ignore */
403 free (utf8);
405 return STRINGPREP_OK;
409 * stringprep_profile:
410 * @in: input array with UTF-8 string to prepare.
411 * @out: output variable with pointer to newly allocate string.
412 * @profile: name of stringprep profile to use.
413 * @flags: stringprep profile flags, or 0.
415 * Prepare the input zero terminated UTF-8 string according to the
416 * stringprep profile, and return the result in a newly allocated
417 * variable.
419 * Note that you must convert strings entered in the systems locale
420 * into UTF-8 before using this function, see
421 * stringprep_locale_to_utf8().
423 * The output @out variable must be deallocated by the caller.
425 * The @flags are one of Stringprep_profile_flags, or 0.
427 * The @profile specifies the name of the stringprep profile to use.
428 * It must be one of the internally supported stringprep profiles.
430 * Return value: Returns %STRINGPREP_OK iff successful, or an error code.
433 stringprep_profile (const char *in,
434 char **out,
435 const char *profile, Stringprep_profile_flags flags)
437 const Stringprep_profiles *p;
438 char *str = NULL;
439 size_t len = strlen (in) + 1;
440 int rc;
442 for (p = &stringprep_profiles[0]; p->name; p++)
443 if (strcmp (p->name, profile) == 0)
444 break;
446 if (!p || !p->name || !p->tables)
447 return STRINGPREP_UNKNOWN_PROFILE;
451 free (str);
452 str = (char *) malloc (len);
453 if (str == NULL)
454 return STRINGPREP_MALLOC_ERROR;
456 strcpy (str, in);
458 rc = stringprep (str, len, flags, p->tables);
459 len += 50;
461 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
463 if (rc == STRINGPREP_OK)
464 *out = str;
465 else
466 free (str);
468 return rc;
471 /*! \mainpage GNU Internationalized Domain Name Library
473 * \section intro Introduction
475 * GNU Libidn is an implementation of the Stringprep, Punycode and IDNA
476 * specifications defined by the IETF Internationalized Domain Names
477 * (IDN) working group, used for internationalized domain names. The
478 * package is available under the GNU Lesser General Public License.
480 * The library contains a generic Stringprep implementation that does
481 * Unicode 3.2 NFKC normalization, mapping and prohibition of
482 * characters, and bidirectional character handling. Profiles for
483 * Nameprep, iSCSI, SASL and XMPP are included. Punycode and ASCII
484 * Compatible Encoding (ACE) via IDNA are supported. A mechanism to
485 * define Top-Level Domain (TLD) specific validation tables, and to
486 * compare strings against those tables, is included. Default tables
487 * for some TLDs are also included.
489 * The Stringprep API consists of two main functions, one for
490 * converting data from the system's native representation into UTF-8,
491 * and one function to perform the Stringprep processing. Adding a
492 * new Stringprep profile for your application within the API is
493 * straightforward. The Punycode API consists of one encoding
494 * function and one decoding function. The IDNA API consists of the
495 * ToASCII and ToUnicode functions, as well as a high-level interface
496 * for converting entire domain names to and from the ACE encoded
497 * form. The TLD API consists of one set of functions to extract the
498 * TLD name from a domain string, one set of functions to locate the
499 * proper TLD table to use based on the TLD name, and core functions
500 * to validate a string against a TLD table, and some utility wrappers
501 * to perform all the steps in one call.
503 * The library is used by, e.g., GNU SASL and Shishi to process user
504 * names and passwords. Libidn can be built into GNU Libc to enable a
505 * new system-wide getaddrinfo() flag for IDN processing.
507 * Libidn is developed for the GNU/Linux system, but runs on over 20 Unix
508 * platforms (including Solaris, IRIX, AIX, and Tru64) and Windows.
509 * Libidn is written in C and (parts of) the API is accessible from C,
510 * C++, Emacs Lisp, Python and Java.
512 * The project web page:\n
513 * http://www.gnu.org/software/libidn/
515 * The software archive:\n
516 * ftp://alpha.gnu.org/pub/gnu/libidn/
518 * For more information see:\n
519 * http://www.ietf.org/html.charters/idn-charter.html\n
520 * http://www.ietf.org/rfc/rfc3454.txt (stringprep specification)\n
521 * http://www.ietf.org/rfc/rfc3490.txt (idna specification)\n
522 * http://www.ietf.org/rfc/rfc3491.txt (nameprep specification)\n
523 * http://www.ietf.org/rfc/rfc3492.txt (punycode specification)\n
524 * http://www.ietf.org/internet-drafts/draft-ietf-ips-iscsi-string-prep-04.txt\n
525 * http://www.ietf.org/internet-drafts/draft-ietf-krb-wg-utf8-profile-01.txt\n
526 * http://www.ietf.org/internet-drafts/draft-ietf-sasl-anon-00.txt\n
527 * http://www.ietf.org/internet-drafts/draft-ietf-sasl-saslprep-00.txt\n
528 * http://www.ietf.org/internet-drafts/draft-ietf-xmpp-nodeprep-01.txt\n
529 * http://www.ietf.org/internet-drafts/draft-ietf-xmpp-resourceprep-01.txt\n
531 * Further information and paid contract development:\n
532 * Simon Josefsson <simon@josefsson.org>
534 * \section examples Examples
536 * \include example.c
537 * \include example3.c
538 * \include example4.c
539 * \include example5.c
543 * STRINGPREP_VERSION
545 * String defined via CPP denoting the header file version number.
546 * Used together with stringprep_check_version() to verify header file
547 * and run-time library consistency.
551 * STRINGPREP_MAX_MAP_CHARS
553 * Maximum number of code points that can replace a single code point,
554 * during stringprep mapping.
558 * Stringprep_rc:
559 * @STRINGPREP_OK: Successful operation. This value is guaranteed to
560 * always be zero, the remaining ones are only guaranteed to hold
561 * non-zero values, for logical comparison purposes.
562 * @STRINGPREP_CONTAINS_UNASSIGNED: String contains unassigned Unicode
563 * code points, which is forbidden by the profile.
564 * @STRINGPREP_CONTAINS_PROHIBITED: String contains code points
565 * prohibited by the profile.
566 * @STRINGPREP_BIDI_BOTH_L_AND_RAL: String contains code points with
567 * conflicting bidirectional category.
568 * @STRINGPREP_BIDI_LEADTRAIL_NOT_RAL: Leading and trailing character
569 * in string not of proper bidirectional category.
570 * @STRINGPREP_BIDI_CONTAINS_PROHIBITED: Contains prohibited code
571 * points detected by bidirectional code.
572 * @STRINGPREP_TOO_SMALL_BUFFER: Buffer handed to function was too
573 * small. This usually indicates a problem in the calling
574 * application.
575 * @STRINGPREP_PROFILE_ERROR: The stringprep profile was inconsistent.
576 * This usually indicates an internal error in the library.
577 * @STRINGPREP_FLAG_ERROR: The supplied flag conflicted with profile.
578 * This usually indicates a problem in the calling application.
579 * @STRINGPREP_UNKNOWN_PROFILE: The supplied profile name was not
580 * known to the library.
581 * @STRINGPREP_NFKC_FAILED: The Unicode NFKC operation failed. This
582 * usually indicates an internal error in the library.
583 * @STRINGPREP_MALLOC_ERROR: The malloc() was out of memory. This is
584 * usually a fatal error.
586 * Enumerated return codes of stringprep(), stringprep_profile()
587 * functions (and macros using those functions). The value 0 is
588 * guaranteed to always correspond to success.
592 * Stringprep_profile_flags:
593 * @STRINGPREP_NO_NFKC: Disable the NFKC normalization, as well as
594 * selecting the non-NFKC case folding tables. Usually the profile
595 * specifies BIDI and NFKC settings, and applications should not
596 * override it unless in special situations.
597 * @STRINGPREP_NO_BIDI: Disable the BIDI step. Usually the profile
598 * specifies BIDI and NFKC settings, and applications should not
599 * override it unless in special situations.
600 * @STRINGPREP_NO_UNASSIGNED: Make the library return with an error if
601 * string contains unassigned characters according to profile.
603 * Stringprep profile flags.
607 * Stringprep_profile_steps:
609 * Various steps in the stringprep algorithm. You really want to
610 * study the source code to understand this one. Only useful if you
611 * want to add another profile.
615 * stringprep_nameprep:
616 * @in: input/output array with string to prepare.
617 * @maxlen: maximum length of input/output array.
619 * Prepare the input UTF-8 string according to the nameprep profile.
620 * The AllowUnassigned flag is true, use
621 * stringprep_nameprep_no_unassigned() if you want a false
622 * AllowUnassigned. Returns 0 iff successful, or an error code.
626 * stringprep_nameprep_no_unassigned:
627 * @in: input/output array with string to prepare.
628 * @maxlen: maximum length of input/output array.
630 * Prepare the input UTF-8 string according to the nameprep profile.
631 * The AllowUnassigned flag is false, use stringprep_nameprep() for
632 * true AllowUnassigned. Returns 0 iff successful, or an error code.
636 * stringprep_iscsi:
637 * @in: input/output array with string to prepare.
638 * @maxlen: maximum length of input/output array.
640 * Prepare the input UTF-8 string according to the draft iSCSI
641 * stringprep profile. Returns 0 iff successful, or an error code.
645 * stringprep_plain:
646 * @in: input/output array with string to prepare.
647 * @maxlen: maximum length of input/output array.
649 * Prepare the input UTF-8 string according to the draft SASL
650 * ANONYMOUS profile. Returns 0 iff successful, or an error code.
654 * stringprep_xmpp_nodeprep:
655 * @in: input/output array with string to prepare.
656 * @maxlen: maximum length of input/output array.
658 * Prepare the input UTF-8 string according to the draft XMPP node
659 * identifier profile. Returns 0 iff successful, or an error code.
663 * stringprep_xmpp_resourceprep:
664 * @in: input/output array with string to prepare.
665 * @maxlen: maximum length of input/output array.
667 * Prepare the input UTF-8 string according to the draft XMPP resource
668 * identifier profile. Returns 0 iff successful, or an error code.