idna.c

   1 /* idna.c       Convert to or from IDN strings.
   2  * Copyright (C) 2002  Simon Josefsson
   3  *
   4  * This file is part of Libstringprep.
   5  *
   6  * Libstringprep is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libstringprep is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libstringprep; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #include "internal.h"
  23
  24 /** idna_to_ascii
  25  * @in: input array with unicode code points.
  26  * @inlen: length of input array with unicode code points.
  27  * @out: output zero terminated string that must have room for at
  28  *       least 63 characters plus the terminating zero.
  29  * @allowunassigned: boolean value as per IDNA specification.
  30  * @usestd3asciirules: boolean value as per IDNA specification.
  31  *
  32  * The ToASCII operation takes a sequence of Unicode code points that make
  33  * up one label and transforms it into a sequence of code points in the
  34  * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
  35  * resulting sequence are equivalent labels.
  36  *
  37  * It is important to note that the ToASCII operation can fail. ToASCII
  38  * fails if any step of it fails. If any step of the ToASCII operation
  39  * fails on any label in a domain name, that domain name MUST NOT be used
  40  * as an internationalized domain name. The method for deadling with this
  41  * failure is application-specific.
  42  *
  43  * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
  44  * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
  45  * sequence of ASCII code points or a failure condition.
  46  *
  47  * ToASCII never alters a sequence of code points that are all in the ASCII
  48  * range to begin with (although it could fail). Applying the ToASCII
  49  * operation multiple times has exactly the same effect as applying it just
  50  * once.
  51  */
  52 int
  53 idna_to_ascii (const unsigned long *in, size_t inlen,
  54                char *out, int allowunassigned, int usestd3asciirules)
  55 {
  56   size_t len, outlen;
  57   unsigned long *src;
  58   int rc;
  59
  60   src = malloc (sizeof (in[0]) * inlen + 1);
  61   if (src == NULL)
  62     return IDNA_MALLOC_ERROR;
  63
  64   memcpy (src, in, sizeof (in[0]) * inlen);
  65   src[inlen] = 0;
  66
  67   /*
  68    * ToASCII consists of the following steps:
  69    *
  70    * 1. If all code points in the sequence are in the ASCII range (0..7F)
  71    * then skip to step 3.
  72    */
  73
  74   {
  75     size_t i;
  76     int inasciirange;
  77
  78     inasciirange = 1;
  79     for (i = 0; src[i]; i++)
  80       if (src[i] > 0x7F)
  81         inasciirange = 0;
  82     if (inasciirange)
  83       goto step3;
  84   }
  85
  86   /*
  87    * 2. Perform the steps specified in [NAMEPREP] and fail if there is
  88    * an error. The AllowUnassigned flag is used in [NAMEPREP].
  89    */
  90
  91   {
  92     char *p;
  93
  94     p = stringprep_ucs4_to_utf8 (src, inlen, NULL, NULL);
  95     if (p == NULL)
  96       return IDNA_MALLOC_ERROR;
  97
  98     p = realloc (p, BUFSIZ);
  99     if (p == NULL)
 100       return IDNA_MALLOC_ERROR;
 101
 102     if (allowunassigned)
 103       rc = stringprep_nameprep (p, BUFSIZ);
 104     else
 105       rc = stringprep_nameprep_no_unassigned (p, BUFSIZ);
 106
 107     if (rc != STRINGPREP_OK)
 108       return IDNA_STRINGPREP_ERROR;
 109
 110     free (src);
 111
 112     src = stringprep_utf8_to_ucs4 (p, -1, NULL);
 113   }
 114
 115 step3:
 116   /*
 117    * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
 118    *
 119    * (a) Verify the absence of non-LDH ASCII code points; that is,
 120    * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
 121    *
 122    * (b) Verify the absence of leading and trailing hyphen-minus;
 123    * that is, the absence of U+002D at the beginning and end of
 124    * the sequence.
 125    */
 126
 127   if (usestd3asciirules)
 128     {
 129       size_t i;
 130
 131       for (i = 0; src[i]; i++)
 132         if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
 133             (src[i] >= 0x3A && src[i] <= 0x40) ||
 134             (src[i] >= 0x5B && src[i] <= 0x60) ||
 135             (src[i] >= 0x7B && src[i] <= 0x7F))
 136           return IDNA_CONTAINS_LDH;
 137
 138       if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
 139         return IDNA_CONTAINS_MINUS;
 140     }
 141
 142   /*
 143    * 4. If all code points in the sequence are in the ASCII range
 144    * (0..7F), then skip to step 8.
 145    */
 146
 147   {
 148     size_t i;
 149     int inasciirange;
 150
 151     inasciirange = 1;
 152     for (i = 0; src[i]; i++)
 153       if (src[i] > 0x7F)
 154         inasciirange = 0;
 155     if (inasciirange)
 156       goto step8;
 157   }
 158
 159   /*
 160    * 5. Verify that the sequence does NOT begin with the ACE prefix.
 161    *
 162    */
 163
 164   {
 165     /* XXX */
 166   }
 167
 168   /*
 169    * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
 170    * and fail if there is an error.
 171    */
 172   for (len = 0; src[len]; len++)
 173     ;
 174   src[len] = '\0';
 175   outlen = 63 - strlen (IDNA_ACE_PREFIX);
 176   rc = punycode_encode (len, src, NULL,
 177                         &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
 178   if (rc != PUNYCODE_SUCCESS)
 179     return IDNA_PUNYCODE_ERROR;
 180   out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
 181
 182   /*
 183    * 7. Prepend the ACE prefix.
 184    */
 185
 186   memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
 187
 188   /*
 189    * 8. Verify that the number of code points is in the range 1 to 63
 190    * inclusive.
 191    */
 192
 193 step8:
 194   if (strlen (out) < 1 || strlen (out) > 63)
 195     return IDNA_INVALID_LENGTH;
 196
 197   return IDNA_SUCCESS;
 198 }
 199
 200 static int
 201 idna_to_unicode_internal (const unsigned long *in, size_t inlen,
 202                           unsigned long *out, size_t * outlen,
 203                           int allowunassigned, int usestd3asciirules,
 204                           char *utf8in, size_t utf8len)
 205 {
 206   int rc;
 207   char tmpout[64];
 208
 209   /*
 210    * 1. If all code points in the sequence are in the ASCII range (0..7F)
 211    * then skip to step 3.
 212    */
 213
 214   {
 215     size_t i;
 216     int inasciirange;
 217
 218     inasciirange = 1;
 219     for (i = 0; in[i]; i++)
 220       if (in[i] > 0x7F)
 221         inasciirange = 0;
 222     if (inasciirange)
 223       goto step3;
 224   }
 225
 226   /*
 227    * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
 228    * error. (If step 3 of ToASCII is also performed here, it will not
 229    * affect the overall behavior of ToUnicode, but it is not
 230    * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
 231    */
 232
 233   if (allowunassigned)
 234     rc = stringprep_nameprep (utf8in, utf8len);
 235   else
 236     rc = stringprep_nameprep_no_unassigned (utf8in, utf8len);
 237
 238   if (rc != STRINGPREP_OK)
 239     return IDNA_STRINGPREP_ERROR;
 240
 241   /* 3. Verify that the sequence begins with the ACE prefix, and save a
 242    * copy of the sequence.
 243    */
 244
 245 step3:
 246   if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
 247     return IDNA_NO_ACE_PREFIX;
 248
 249   /* 4. Remove the ACE prefix.
 250    */
 251
 252   memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
 253            strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
 254
 255   /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
 256    * and fail if there is an error. Save a copy of the result of
 257    * this step.
 258    */
 259
 260   rc = punycode_decode (strlen(utf8in), utf8in, outlen, out, NULL);
 261   if (rc != PUNYCODE_SUCCESS)
 262     return IDNA_PUNYCODE_ERROR;
 263
 264   /* 6. Apply ToASCII.
 265    */
 266
 267   rc = idna_to_ascii (out, *outlen, tmpout,
 268                       allowunassigned, usestd3asciirules);
 269   if (rc != IDNA_SUCCESS)
 270     return rc;
 271
 272   /* 7. Verify that the result of step 6 matches the saved copy from
 273    * step 3, using a case-insensitive ASCII comparison.
 274    */
 275
 276   if (strcasecmp(utf8in, tmpout + strlen(IDNA_ACE_PREFIX)) != 0)
 277     return IDNA_ROUNDTRIP_VERIFY_ERROR;
 278
 279   /* 8. Return the saved copy from step 5.
 280    */
 281
 282   return IDNA_SUCCESS;
 283 }
 284
 285 /** idna_to_unicode
 286  * @in: input array with unicode code points.
 287  * @inlen: length of input array with unicode code points.
 288  * @out: output array with unicode code points.
 289  * @outlen: on input, maximum size of output array with unicode code points,
 290  *          on exit, actual size of output array with unicode code points.
 291  * @allowunassigned: boolean value as per IDNA specification.
 292  * @usestd3asciirules: boolean value as per IDNA specification.
 293  *
 294  * The ToUnicode operation takes a sequence of Unicode code points
 295  * that make up one label and returns a sequence of Unicode code
 296  * points. If the input sequence is a label in ACE form, then the
 297  * result is an equivalent internationalized label that is not in ACE
 298  * form, otherwise the original sequence is returned unaltered.
 299  *
 300  * ToUnicode never fails. If any step fails, then the original input
 301  * sequence is returned immediately in that step.
 302  *
 303  * The ToUnicode output never contains more code points than its
 304  * input.  Note that the number of octets needed to represent a
 305  * sequence of code points depends on the particular character
 306  * encoding used.
 307  *
 308  * The inputs to ToUnicode are a sequence of code points, the
 309  * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
 310  * ToUnicode is always a sequence of Unicode code points.
 311  *
 312  * Return value: Returns error condition, but it must only be used for
 313  *               debugging purposes.  The output buffer is always
 314  *               guaranteed to contain the correct data according to
 315  *               the specification (sans malloc induced errors).  NB!
 316  *               This means that you normally ignore the return code
 317  *               from this function, as checking it means breaking the
 318  *               standard.
 319  */
 320 int
 321 idna_to_unicode (const unsigned long *in, size_t inlen,
 322                  unsigned long *out, size_t * outlen,
 323                  int allowunassigned, int usestd3asciirules)
 324 {
 325   int rc;
 326   int outlensave = *outlen;
 327   char *p;
 328
 329   p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
 330   if (p == NULL)
 331     return IDNA_MALLOC_ERROR;
 332
 333   p = realloc (p, BUFSIZ);
 334   if (p == NULL)
 335     return IDNA_MALLOC_ERROR;
 336
 337   rc = idna_to_unicode_internal (in, inlen, out, outlen,
 338                                  allowunassigned, usestd3asciirules,
 339                                  p, BUFSIZ);
 340   if (rc != IDNA_SUCCESS)
 341     {
 342       memcpy(out, in,
 343              sizeof (in[0]) * (inlen < outlensave ? inlen : outlensave));
 344       *outlen = inlen;
 345     }
 346
 347   free(p);
 348
 349   return rc;
 350 }