lib/punycode.c

   1 /* punycode.c   Implementation of punycode used to ASCII encode IDN's.
   2  * Copyright (C) 2002, 2003  Simon Josefsson
   3  *
   4  * This file is part of GNU Libidn.
   5  *
   6  * GNU Libidn is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * GNU Libidn is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with GNU Libidn; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 /*
  23  * This file is derived from RFC 3492 written by Adam M. Costello.
  24  *
  25  * Disclaimer and license: Regarding this entire document or any
  26  * portion of it (including the pseudocode and C code), the author
  27  * makes no guarantees and is not responsible for any damage resulting
  28  * from its use.  The author grants irrevocable permission to anyone
  29  * to use, modify, and distribute it in any way that does not diminish
  30  * the rights of anyone else to use, modify, and distribute it,
  31  * provided that redistributed derivative works do not contain
  32  * misleading author or version information.  Derivative works need
  33  * not be licensed under similar terms.
  34  *
  35  * Copyright (C) The Internet Society (2003).  All Rights Reserved.
  36  *
  37  * This document and translations of it may be copied and furnished to
  38  * others, and derivative works that comment on or otherwise explain it
  39  * or assist in its implementation may be prepared, copied, published
  40  * and distributed, in whole or in part, without restriction of any
  41  * kind, provided that the above copyright notice and this paragraph are
  42  * included on all such copies and derivative works.  However, this
  43  * document itself may not be modified in any way, such as by removing
  44  * the copyright notice or references to the Internet Society or other
  45  * Internet organizations, except as needed for the purpose of
  46  * developing Internet standards in which case the procedures for
  47  * copyrights defined in the Internet Standards process must be
  48  * followed, or as required to translate it into languages other than
  49  * English.
  50  *
  51  * The limited permissions granted above are perpetual and will not be
  52  * revoked by the Internet Society or its successors or assigns.
  53  *
  54  * This document and the information contained herein is provided on an
  55  * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
  56  * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
  57  * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
  58  * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
  59  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
  60  */
  61
  62 #include "internal.h"
  63
  64 /*** Bootstring parameters for Punycode ***/
  65
  66 enum
  67 { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
  68   initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
  69 };
  70
  71 /* basic(cp) tests whether cp is a basic code point: */
  72 #define basic(cp) ((punycode_uint)(cp) < 0x80)
  73
  74 /* delim(cp) tests whether cp is a delimiter: */
  75 #define delim(cp) ((cp) == delimiter)
  76
  77 /* decode_digit(cp) returns the numeric value of a basic code */
  78 /* point (for use in representing integers) in the range 0 to */
  79 /* base-1, or base if cp is does not represent a value.       */
  80
  81 static punycode_uint
  82 decode_digit (punycode_uint cp)
  83 {
  84   return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
  85     cp - 97 < 26 ? cp - 97 : base;
  86 }
  87
  88 /* encode_digit(d,flag) returns the basic code point whose value      */
  89 /* (when used for representing integers) is d, which needs to be in   */
  90 /* the range 0 to base-1.  The lowercase form is used unless flag is  */
  91 /* nonzero, in which case the uppercase form is used.  The behavior   */
  92 /* is undefined if flag is nonzero and digit d has no uppercase form. */
  93
  94 static char
  95 encode_digit (punycode_uint d, int flag)
  96 {
  97   return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
  98   /*  0..25 map to ASCII a..z or A..Z */
  99   /* 26..35 map to ASCII 0..9         */
 100 }
 101
 102 /* flagged(bcp) tests whether a basic code point is flagged */
 103 /* (uppercase).  The behavior is undefined if bcp is not a  */
 104 /* basic code point.                                        */
 105
 106 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
 107
 108 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
 109 /* if flag is zero, uppercase if flag is nonzero, and returns    */
 110 /* the resulting code point.  The code point is unchanged if it  */
 111 /* is caseless.  The behavior is undefined if bcp is not a basic */
 112 /* code point.                                                   */
 113
 114 static char
 115 encode_basic (punycode_uint bcp, int flag)
 116 {
 117   bcp -= (bcp - 97 < 26) << 5;
 118   return bcp + ((!flag && (bcp - 65 < 26)) << 5);
 119 }
 120
 121 /*** Platform-specific constants ***/
 122
 123 /* maxint is the maximum value of a punycode_uint variable: */
 124 static const punycode_uint maxint = -1;
 125 /* Because maxint is unsigned, -1 becomes the maximum value. */
 126
 127 /*** Bias adaptation function ***/
 128
 129 static punycode_uint
 130 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
 131 {
 132   punycode_uint k;
 133
 134   delta = firsttime ? delta / damp : delta >> 1;
 135   /* delta >> 1 is a faster way of doing delta / 2 */
 136   delta += delta / numpoints;
 137
 138   for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
 139     {
 140       delta /= base - tmin;
 141     }
 142
 143   return k + (base - tmin + 1) * delta / (delta + skew);
 144 }
 145
 146 /*** Main encode function ***/
 147
 148 /**
 149  * punycode_encode:
 150  * @input_length: The @input_length is the number of code points in the @input.
 151  * @input: The @input is represented as an array of Unicode code points
 152  *         (not code units; surrogate pairs are not allowed).
 153  * @case_flags: The @case_flags array holds @input_length boolean
 154  *              values, where nonzero suggests that the corresponding
 155  *              Unicode character be forced to uppercase after being
 156  *              decoded (if possible), and zero suggests that it be
 157  *              forced to lowercase (if possible).  ASCII code points
 158  *              are encoded literally, except that ASCII letters are
 159  *              forced to uppercase or lowercase according to the
 160  *              corresponding uppercase flags.  If @case_flags is a
 161  *              %NULL pointer then ASCII letters are left as they are,
 162  *              and other code points are treated as if their
 163  *              uppercase flags were zero.
 164  * @output_length: The @output_length is an in/out argument: the caller
 165  *                 passes in the maximum number of code points that it
 166  *                 can receive, and on successful return it will
 167  *                 contain the number of code points actually output.
 168  * @output: The @output will be represented as an array of ASCII code
 169  *          points.  The output string is *not* zero-terminated; it
 170  *          will contain zeros if and only if the input contains
 171  *          zeros. (Of course the caller can leave room for a
 172  *          terminator and add one if needed.)
 173  *
 174  * Converts Unicode to Punycode.
 175  *
 176  * Return value: The return value can be any of the Punycode_status
 177  *               values defined above except %PUNYCODE_BAD_INPUT; if
 178  *               not %PUNYCODE_SUCCESS, then @output_size and @output
 179  *               might contain garbage.
 180  **/
 181 int
 182 punycode_encode (size_t input_length,
 183                  const punycode_uint input[],
 184                  const unsigned char case_flags[],
 185                  size_t * output_length, char output[])
 186 {
 187   punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
 188
 189   if (input_length > maxint || *output_length > maxint)
 190     return punycode_bad_input;
 191
 192   /* Initialize the state: */
 193
 194   n = initial_n;
 195   delta = out = 0;
 196   max_out = *output_length;
 197   bias = initial_bias;
 198
 199   /* Handle the basic code points: */
 200   for (j = 0; j < input_length; ++j)
 201     {
 202       if (basic (input[j]))
 203         {
 204           if (max_out - out < 2)
 205             return punycode_big_output;
 206           output[out++] =
 207             case_flags ?
 208             (punycode_uint) encode_basic (input[j], case_flags[j]) : input[j];
 209         }
 210       /* else if (input[j] < n) return punycode_bad_input; */
 211       /* (not needed for Punycode with unsigned code points) */
 212     }
 213
 214   h = b = out;
 215
 216   /* h is the number of code points that have been handled, b is the  */
 217   /* number of basic code points, and out is the number of characters */
 218   /* that have been output.                                           */
 219
 220   if (b > 0)
 221     output[out++] = delimiter;
 222
 223   /* Main encoding loop: */
 224
 225   while (h < input_length)
 226     {
 227       /* All non-basic code points < n have been     */
 228       /* handled already.  Find the next larger one: */
 229
 230       for (m = maxint, j = 0; j < input_length; ++j)
 231         {
 232           /* if (basic(input[j])) continue; */
 233           /* (not needed for Punycode) */
 234           if (input[j] >= n && input[j] < m)
 235             m = input[j];
 236         }
 237
 238       /* Increase delta enough to advance the decoder's    */
 239       /* <n,i> state to <m,0>, but guard against overflow: */
 240
 241       if (m - n > (maxint - delta) / (h + 1))
 242         return punycode_overflow;
 243       delta += (m - n) * (h + 1);
 244       n = m;
 245
 246       for (j = 0; j < input_length; ++j)
 247         {
 248           /* Punycode does not need to check whether input[j] is basic: */
 249           if (input[j] < n /* || basic(input[j]) */ )
 250             {
 251               if (++delta == 0)
 252                 return punycode_overflow;
 253             }
 254
 255           if (input[j] == n)
 256             {
 257               /* Represent delta as a generalized variable-length integer: */
 258
 259               for (q = delta, k = base;; k += base)
 260                 {
 261                   if (out >= max_out)
 262                     return punycode_big_output;
 263                   t = k <= bias /* + tmin */ ? tmin :   /* +tmin not needed */
 264                     k >= bias + tmax ? tmax : k - bias;
 265                   if (q < t)
 266                     break;
 267                   output[out++] = encode_digit (t + (q - t) % (base - t), 0);
 268                   q = (q - t) / (base - t);
 269                 }
 270
 271               output[out++] = encode_digit (q, case_flags && case_flags[j]);
 272               bias = adapt (delta, h + 1, h == b);
 273               delta = 0;
 274               ++h;
 275             }
 276         }
 277
 278       ++delta, ++n;
 279     }
 280
 281   *output_length = out;
 282   return punycode_success;
 283 }
 284
 285 /*** Main decode function ***/
 286
 287 /**
 288  * punycode_decode:
 289  * @input_length: The @input_length is the number of code points in the input.
 290  * @input: The @input is represented as an array of ASCII code points.
 291  * @output_length: The @output_length is an in/out argument: the caller
 292  *                 passes in the maximum number of code points that it
 293  *                 can receive, and on successful return it will
 294  *                 contain the actual number of code points output.
 295  * @output: The output will be represented as an array of Unicode code
 296  *          points.
 297  * @case_flags: The @case_flags array needs room for at least
 298  *              @output_length values, or it can be a %NULL pointer if
 299  *              the case information is not needed.  A nonzero flag
 300  *              suggests that the corresponding Unicode character be
 301  *              forced to uppercase by the caller (if possible), while
 302  *              zero suggests that it be forced to lowercase (if
 303  *              possible).  ASCII code points are output already in
 304  *              the proper case, but their flags will be set
 305  *              appropriately so that applying the flags would be
 306  *              harmless.
 307  *
 308  * Converts Punycode to Unicode.
 309  *
 310  * Return value: The return value can be any of the Punycode_status
 311  *               values defined above; if not %PUNYCODE_SUCCESS, then
 312  *               @output_length, @output, and @case_flags might contain
 313  *               garbage.  On success, the decoder will never need to
 314  *               write an @output_length greater than @input_length,
 315  *               because of how the encoding is defined.
 316  *
 317  **/
 318 int
 319 punycode_decode (size_t input_length,
 320                  const char input[],
 321                  size_t * output_length,
 322                  punycode_uint output[], unsigned char case_flags[])
 323 {
 324   punycode_uint n, out, i, max_out, bias, b, j, in, oldi, w, k, digit, t;
 325
 326   if (input_length > maxint || *output_length > maxint)
 327     return punycode_bad_input;
 328
 329   /* Initialize the state: */
 330
 331   n = initial_n;
 332   out = i = 0;
 333   max_out = *output_length;
 334   bias = initial_bias;
 335
 336   /* Handle the basic code points:  Let b be the number of input code */
 337   /* points before the last delimiter, or 0 if there is none, then    */
 338   /* copy the first b code points to the output.                      */
 339
 340   for (b = j = 0; j < input_length; ++j)
 341     if (delim (input[j]))
 342       b = j;
 343   if (b > max_out)
 344     return punycode_big_output;
 345
 346   for (j = 0; j < b; ++j)
 347     {
 348       if (case_flags)
 349         case_flags[out] = flagged (input[j]);
 350       if (!basic (input[j]))
 351         return punycode_bad_input;
 352       output[out++] = input[j];
 353     }
 354
 355   /* Main decoding loop:  Start just after the last delimiter if any  */
 356   /* basic code points were copied; start at the beginning otherwise. */
 357
 358   for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
 359     {
 360
 361       /* in is the index of the next character to be consumed, and */
 362       /* out is the number of code points in the output array.     */
 363
 364       /* Decode a generalized variable-length integer into delta,  */
 365       /* which gets added to i.  The overflow checking is easier   */
 366       /* if we increase i as we go, then subtract off its starting */
 367       /* value at the end to obtain delta.                         */
 368
 369       for (oldi = i, w = 1, k = base;; k += base)
 370         {
 371           if (in >= input_length)
 372             return punycode_bad_input;
 373           digit = decode_digit (input[in++]);
 374           if (digit >= base)
 375             return punycode_bad_input;
 376           if (digit > (maxint - i) / w)
 377             return punycode_overflow;
 378           i += digit * w;
 379           t = k <= bias /* + tmin */ ? tmin :   /* +tmin not needed */
 380             k >= bias + tmax ? tmax : k - bias;
 381           if (digit < t)
 382             break;
 383           if (w > maxint / (base - t))
 384             return punycode_overflow;
 385           w *= (base - t);
 386         }
 387
 388       bias = adapt (i - oldi, out + 1, oldi == 0);
 389
 390       /* i was supposed to wrap around from out+1 to 0,   */
 391       /* incrementing n each time, so we'll fix that now: */
 392
 393       if (i / (out + 1) > maxint - n)
 394         return punycode_overflow;
 395       n += i / (out + 1);
 396       i %= (out + 1);
 397
 398       /* Insert n at position i of the output: */
 399
 400       /* not needed for Punycode: */
 401       /* if (decode_digit(n) <= base) return punycode_invalid_input; */
 402       if (out >= max_out)
 403         return punycode_big_output;
 404
 405       if (case_flags)
 406         {
 407           memmove (case_flags + i + 1, case_flags + i, out - i);
 408
 409           /* Case of last character determines uppercase flag: */
 410           case_flags[i] = flagged (input[in - 1]);
 411         }
 412
 413       memmove (output + i + 1, output + i, (out - i) * sizeof *output);
 414       output[i++] = n;
 415     }
 416
 417   *output_length = out;
 418   return punycode_success;
 419 }
 420
 421 /**
 422  * punycode_uint
 423  *
 424  * Unicode code point data type, this is always a 32 bit unsigned
 425  * integer.
 426  */
 427
 428 /**
 429  * Punycode_status
 430  * @PUNYCODE_SUCCESS: Successful operation.  This value is guaranteed
 431  *   to always be zero, the remaining ones are only guaranteed to hold
 432  *   non-zero values, for logical comparison purposes.
 433  * @PUNYCODE_BAD_INPUT: Input is invalid.
 434  * @PUNYCODE_BIG_OUTPUT: Output would exceed the space provided.
 435  * @PUNYCODE_OVERFLOW: Input needs wider integers to process.
 436  *
 437  * Enumerated return codes of punycode_encode() and punycode_decode().
 438  * The value 0 is guaranteed to always correspond to success.
 439  */