idna.c

   1 /* idna.c       Convert to or from IDN strings.
   2  * Copyright (C) 2002  Simon Josefsson
   3  *
   4  * This file is part of Libstringprep.
   5  *
   6  * Libstringprep is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libstringprep is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libstringprep; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #include "internal.h"
  23
  24 /**
  25  * idna_to_ascii
  26  * @in: input array with unicode code points.
  27  * @inlen: length of input array with unicode code points.
  28  * @out: output zero terminated string that must have room for at
  29  *       least 63 characters plus the terminating zero.
  30  * @allowunassigned: boolean value as per IDNA specification.
  31  * @usestd3asciirules: boolean value as per IDNA specification.
  32  *
  33  * The ToASCII operation takes a sequence of Unicode code points that make
  34  * up one label and transforms it into a sequence of code points in the
  35  * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
  36  * resulting sequence are equivalent labels.
  37  *
  38  * It is important to note that the ToASCII operation can fail. ToASCII
  39  * fails if any step of it fails. If any step of the ToASCII operation
  40  * fails on any label in a domain name, that domain name MUST NOT be used
  41  * as an internationalized domain name. The method for deadling with this
  42  * failure is application-specific.
  43  *
  44  * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
  45  * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
  46  * sequence of ASCII code points or a failure condition.
  47  *
  48  * ToASCII never alters a sequence of code points that are all in the ASCII
  49  * range to begin with (although it could fail). Applying the ToASCII
  50  * operation multiple times has exactly the same effect as applying it just
  51  * once.
  52  *
  53  * Return value: Returns 0 on success, or an error code.
  54  */
  55 int
  56 idna_to_ascii (const unsigned long *in, size_t inlen,
  57                char *out, int allowunassigned, int usestd3asciirules)
  58 {
  59   size_t len, outlen;
  60   unsigned long *src;
  61   int rc;
  62
  63   src = malloc (sizeof (in[0]) * inlen + 1);
  64   if (src == NULL)
  65     return IDNA_MALLOC_ERROR;
  66
  67   memcpy (src, in, sizeof (in[0]) * inlen);
  68   src[inlen] = 0;
  69
  70   /*
  71    * ToASCII consists of the following steps:
  72    *
  73    * 1. If all code points in the sequence are in the ASCII range (0..7F)
  74    * then skip to step 3.
  75    */
  76
  77   {
  78     size_t i;
  79     int inasciirange;
  80
  81     inasciirange = 1;
  82     for (i = 0; src[i]; i++)
  83       if (src[i] > 0x7F)
  84         inasciirange = 0;
  85     if (inasciirange)
  86       goto step3;
  87   }
  88
  89   /*
  90    * 2. Perform the steps specified in [NAMEPREP] and fail if there is
  91    * an error. The AllowUnassigned flag is used in [NAMEPREP].
  92    */
  93
  94   {
  95     char *p;
  96
  97     p = stringprep_ucs4_to_utf8 (src, inlen, NULL, NULL);
  98     if (p == NULL)
  99       return IDNA_MALLOC_ERROR;
 100
 101     p = realloc (p, BUFSIZ);
 102     if (p == NULL)
 103       return IDNA_MALLOC_ERROR;
 104
 105     if (allowunassigned)
 106       rc = stringprep_nameprep (p, BUFSIZ);
 107     else
 108       rc = stringprep_nameprep_no_unassigned (p, BUFSIZ);
 109
 110     if (rc != STRINGPREP_OK)
 111       return IDNA_STRINGPREP_ERROR;
 112
 113     free (src);
 114
 115     src = stringprep_utf8_to_ucs4 (p, -1, NULL);
 116   }
 117
 118 step3:
 119   /*
 120    * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
 121    *
 122    * (a) Verify the absence of non-LDH ASCII code points; that is,
 123    * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
 124    *
 125    * (b) Verify the absence of leading and trailing hyphen-minus;
 126    * that is, the absence of U+002D at the beginning and end of
 127    * the sequence.
 128    */
 129
 130   if (usestd3asciirules)
 131     {
 132       size_t i;
 133
 134       for (i = 0; src[i]; i++)
 135         if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
 136             (src[i] >= 0x3A && src[i] <= 0x40) ||
 137             (src[i] >= 0x5B && src[i] <= 0x60) ||
 138             (src[i] >= 0x7B && src[i] <= 0x7F))
 139           return IDNA_CONTAINS_LDH;
 140
 141       if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
 142         return IDNA_CONTAINS_MINUS;
 143     }
 144
 145   /*
 146    * 4. If all code points in the sequence are in the ASCII range
 147    * (0..7F), then skip to step 8.
 148    */
 149
 150   {
 151     size_t i;
 152     int inasciirange;
 153
 154     inasciirange = 1;
 155     for (i = 0; src[i]; i++)
 156       if (src[i] > 0x7F)
 157         inasciirange = 0;
 158     if (inasciirange)
 159       goto step8;
 160   }
 161
 162   /*
 163    * 5. Verify that the sequence does NOT begin with the ACE prefix.
 164    *
 165    */
 166
 167   {
 168     /* XXX */
 169   }
 170
 171   /*
 172    * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
 173    * and fail if there is an error.
 174    */
 175   for (len = 0; src[len]; len++)
 176     ;
 177   src[len] = '\0';
 178   outlen = 63 - strlen (IDNA_ACE_PREFIX);
 179   rc = punycode_encode (len, src, NULL,
 180                         &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
 181   if (rc != PUNYCODE_SUCCESS)
 182     return IDNA_PUNYCODE_ERROR;
 183   out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
 184
 185   /*
 186    * 7. Prepend the ACE prefix.
 187    */
 188
 189   memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
 190
 191   /*
 192    * 8. Verify that the number of code points is in the range 1 to 63
 193    * inclusive.
 194    */
 195
 196 step8:
 197   if (strlen (out) < 1 || strlen (out) > 63)
 198     return IDNA_INVALID_LENGTH;
 199
 200   return IDNA_SUCCESS;
 201 }
 202
 203 static int
 204 idna_to_unicode_internal (const unsigned long *in, size_t inlen,
 205                           unsigned long *out, size_t * outlen,
 206                           int allowunassigned, int usestd3asciirules,
 207                           char *utf8in, size_t utf8len)
 208 {
 209   int rc;
 210   char tmpout[64];
 211
 212   /*
 213    * 1. If all code points in the sequence are in the ASCII range (0..7F)
 214    * then skip to step 3.
 215    */
 216
 217   {
 218     size_t i;
 219     int inasciirange;
 220
 221     inasciirange = 1;
 222     for (i = 0; in[i]; i++)
 223       if (in[i] > 0x7F)
 224         inasciirange = 0;
 225     if (inasciirange)
 226       goto step3;
 227   }
 228
 229   /*
 230    * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
 231    * error. (If step 3 of ToASCII is also performed here, it will not
 232    * affect the overall behavior of ToUnicode, but it is not
 233    * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
 234    */
 235
 236   if (allowunassigned)
 237     rc = stringprep_nameprep (utf8in, utf8len);
 238   else
 239     rc = stringprep_nameprep_no_unassigned (utf8in, utf8len);
 240
 241   if (rc != STRINGPREP_OK)
 242     return IDNA_STRINGPREP_ERROR;
 243
 244   /* 3. Verify that the sequence begins with the ACE prefix, and save a
 245    * copy of the sequence.
 246    */
 247
 248 step3:
 249   if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
 250     return IDNA_NO_ACE_PREFIX;
 251
 252   /* 4. Remove the ACE prefix.
 253    */
 254
 255   memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
 256            strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
 257
 258   /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
 259    * and fail if there is an error. Save a copy of the result of
 260    * this step.
 261    */
 262
 263   rc = punycode_decode (strlen(utf8in), utf8in, outlen, out, NULL);
 264   if (rc != PUNYCODE_SUCCESS)
 265     return IDNA_PUNYCODE_ERROR;
 266
 267   /* 6. Apply ToASCII.
 268    */
 269
 270   rc = idna_to_ascii (out, *outlen, tmpout,
 271                       allowunassigned, usestd3asciirules);
 272   if (rc != IDNA_SUCCESS)
 273     return rc;
 274
 275   /* 7. Verify that the result of step 6 matches the saved copy from
 276    * step 3, using a case-insensitive ASCII comparison.
 277    */
 278
 279   if (strcasecmp(utf8in, tmpout + strlen(IDNA_ACE_PREFIX)) != 0)
 280     return IDNA_ROUNDTRIP_VERIFY_ERROR;
 281
 282   /* 8. Return the saved copy from step 5.
 283    */
 284
 285   return IDNA_SUCCESS;
 286 }
 287
 288 /**
 289  * idna_to_unicode
 290  * @in: input array with unicode code points.
 291  * @inlen: length of input array with unicode code points.
 292  * @out: output array with unicode code points.
 293  * @outlen: on input, maximum size of output array with unicode code points,
 294  *          on exit, actual size of output array with unicode code points.
 295  * @allowunassigned: boolean value as per IDNA specification.
 296  * @usestd3asciirules: boolean value as per IDNA specification.
 297  *
 298  * The ToUnicode operation takes a sequence of Unicode code points
 299  * that make up one label and returns a sequence of Unicode code
 300  * points. If the input sequence is a label in ACE form, then the
 301  * result is an equivalent internationalized label that is not in ACE
 302  * form, otherwise the original sequence is returned unaltered.
 303  *
 304  * ToUnicode never fails. If any step fails, then the original input
 305  * sequence is returned immediately in that step.
 306  *
 307  * The ToUnicode output never contains more code points than its
 308  * input.  Note that the number of octets needed to represent a
 309  * sequence of code points depends on the particular character
 310  * encoding used.
 311  *
 312  * The inputs to ToUnicode are a sequence of code points, the
 313  * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
 314  * ToUnicode is always a sequence of Unicode code points.
 315  *
 316  * Return value: Returns error condition, but it must only be used for
 317  *               debugging purposes.  The output buffer is always
 318  *               guaranteed to contain the correct data according to
 319  *               the specification (sans malloc induced errors).  NB!
 320  *               This means that you normally ignore the return code
 321  *               from this function, as checking it means breaking the
 322  *               standard.
 323  */
 324 int
 325 idna_to_unicode (const unsigned long *in, size_t inlen,
 326                  unsigned long *out, size_t * outlen,
 327                  int allowunassigned, int usestd3asciirules)
 328 {
 329   int rc;
 330   int outlensave = *outlen;
 331   char *p;
 332
 333   p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
 334   if (p == NULL)
 335     return IDNA_MALLOC_ERROR;
 336
 337   p = realloc (p, BUFSIZ);
 338   if (p == NULL)
 339     return IDNA_MALLOC_ERROR;
 340
 341   rc = idna_to_unicode_internal (in, inlen, out, outlen,
 342                                  allowunassigned, usestd3asciirules,
 343                                  p, BUFSIZ);
 344   if (rc != IDNA_SUCCESS)
 345     {
 346       memcpy(out, in,
 347              sizeof (in[0]) * (inlen < outlensave ? inlen : outlensave));
 348       *outlen = inlen;
 349     }
 350
 351   free(p);
 352
 353   return rc;
 354 }