idna.c

   1 /* idna.c       Convert to or from IDN strings.
   2  * Copyright (C) 2002  Simon Josefsson
   3  *
   4  * This file is part of Libstringprep.
   5  *
   6  * Libstringprep is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libstringprep is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libstringprep; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #include "internal.h"
  23
  24 /** idna_to_ascii
  25  * @in: input array with unicode code points.
  26  * @inlen: length of input array with unicode code points.
  27  * @out: output zero terminated string that must have room for at
  28  *       least 63 characters plus the terminating zero.
  29  * @allowunassigned: boolean value as per IDNA specification.
  30  * @usestd3asciirules: boolean value as per IDNA specification.
  31  *
  32  * The ToASCII operation takes a sequence of Unicode code points that make
  33  * up one label and transforms it into a sequence of code points in the
  34  * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
  35  * resulting sequence are equivalent labels.
  36  *
  37  * It is important to note that the ToASCII operation can fail. ToASCII
  38  * fails if any step of it fails. If any step of the ToASCII operation
  39  * fails on any label in a domain name, that domain name MUST NOT be used
  40  * as an internationalized domain name. The method for deadling with this
  41  * failure is application-specific.
  42  *
  43  * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
  44  * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
  45  * sequence of ASCII code points or a failure condition.
  46  *
  47  * ToASCII never alters a sequence of code points that are all in the ASCII
  48  * range to begin with (although it could fail). Applying the ToASCII
  49  * operation multiple times has exactly the same effect as applying it just
  50  * once.
  51  */
  52 int
  53 idna_to_ascii (const unsigned long *in, size_t inlen,
  54                char *out,
  55                int allowunassigned, int usestd3asciirules)
  56 {
  57   size_t len, outlen;
  58   unsigned long *src;
  59   int rc;
  60
  61   src = malloc(sizeof(in[0]) * inlen + 1);
  62   if (src == NULL)
  63     return IDNA_MALLOC_ERROR;
  64
  65   memcpy(src, in, sizeof(in[0]) * inlen);
  66   src[inlen] = 0;
  67
  68   /*
  69    * ToASCII consists of the following steps:
  70    *
  71    * 1. If all code points in the sequence are in the ASCII range (0..7F)
  72    * then skip to step 3.
  73    */
  74
  75   {
  76     size_t i;
  77     int inasciirange;
  78
  79     inasciirange = 1;
  80     for (i = 0; src[i]; i++)
  81       if (src[i] > 0x7F)
  82         inasciirange = 0;
  83     if (inasciirange)
  84       goto step3;
  85   }
  86
  87   /*
  88    * 2. Perform the steps specified in [NAMEPREP] and fail if there is
  89    * an error. The AllowUnassigned flag is used in [NAMEPREP].
  90    */
  91
  92   {
  93     char *p;
  94
  95     p = stringprep_ucs4_to_utf8 (src, inlen, NULL, NULL);
  96     if (p == NULL)
  97       return IDNA_MALLOC_ERROR;
  98
  99     p = realloc(p, BUFSIZ);
 100     if (p == NULL)
 101       return IDNA_MALLOC_ERROR;
 102
 103     if (allowunassigned)
 104       rc = stringprep_nameprep(p, BUFSIZ);
 105     else
 106       rc = stringprep_nameprep_no_unassigned(p, BUFSIZ);
 107
 108     if (rc != STRINGPREP_OK)
 109       return IDNA_STRINGPREP_ERROR;
 110
 111     free(src);
 112
 113     src = stringprep_utf8_to_ucs4(p, -1, NULL);
 114   }
 115
 116  step3:
 117   /*
 118    * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
 119    *
 120    * (a) Verify the absence of non-LDH ASCII code points; that is,
 121    * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
 122    *
 123    * (b) Verify the absence of leading and trailing hyphen-minus;
 124    * that is, the absence of U+002D at the beginning and end of
 125    * the sequence.
 126    */
 127
 128   if (usestd3asciirules)
 129     {
 130       size_t i;
 131
 132       for (i = 0; src[i]; i++)
 133         if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
 134             (src[i] >= 0x3A && src[i] <= 0x40) ||
 135             (src[i] >= 0x5B && src[i] <= 0x60) ||
 136             (src[i] >= 0x7B && src[i] <= 0x7F))
 137           return IDNA_CONTAINS_LDH;
 138
 139       if (src[0] == 0x002D || (i > 0 && src[i-1] == 0x002D))
 140         return IDNA_CONTAINS_MINUS;
 141     }
 142
 143   /*
 144    * 4. If all code points in the sequence are in the ASCII range
 145    * (0..7F), then skip to step 8.
 146    */
 147
 148   {
 149     size_t i;
 150     int inasciirange;
 151
 152     inasciirange = 1;
 153     for (i = 0; src[i]; i++)
 154       if (src[i] > 0x7F)
 155         inasciirange = 0;
 156     if (inasciirange)
 157       goto step8;
 158   }
 159
 160   /*
 161    * 5. Verify that the sequence does NOT begin with the ACE prefix.
 162    *
 163    */
 164
 165   {
 166     /* XXX */
 167   }
 168
 169   /*
 170    * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
 171    * and fail if there is an error.
 172    */
 173   for (len = 0; src[len]; len++)
 174     ;
 175   src[len] = '\0';
 176   outlen = 63 - strlen(IDNA_ACE_PREFIX);
 177   rc = punycode_encode (len, src, NULL,
 178                         &outlen, &out[strlen(IDNA_ACE_PREFIX)]);
 179   if (rc != PUNYCODE_SUCCESS)
 180     return IDNA_PUNYCODE_ERROR;
 181   if (outlen > 63)
 182     return IDNA_PUNYCODE_ERROR;
 183   out[strlen(IDNA_ACE_PREFIX) + outlen] = '\0';
 184
 185   /*
 186    * 7. Prepend the ACE prefix.
 187    */
 188
 189   memcpy(out, IDNA_ACE_PREFIX, strlen(IDNA_ACE_PREFIX));
 190
 191   /*
 192    * 8. Verify that the number of code points is in the range 1 to 63
 193    * inclusive.
 194    */
 195
 196  step8:
 197   if (strlen(out) < 1 || strlen(out) > 63)
 198     return IDNA_INVALID_LENGTH;
 199
 200   return IDNA_SUCCESS;
 201 }
 202
 203 /** idna_to_unicode
 204  * @in: input array with unicode code points.
 205  * @inlen: length of input array with unicode code points.
 206  * @out: output array with unicode code points.
 207  * @outlen: on input, maximum size of output array with unicode code points,
 208  *          on exit, actual size of output array with unicode code points.
 209  * @allowunassigned: boolean value as per IDNA specification.
 210  * @usestd3asciirules: boolean value as per IDNA specification.
 211  *
 212  * The ToUnicode operation takes a sequence of Unicode code points
 213  * that make up one label and returns a sequence of Unicode code
 214  * points. If the input sequence is a label in ACE form, then the
 215  * result is an equivalent internationalized label that is not in ACE
 216  * form, otherwise the original sequence is returned unaltered.
 217  *
 218  * ToUnicode never fails. If any step fails, then the original input
 219  * sequence is returned immediately in that step.
 220  *
 221  * The ToUnicode output never contains more code points than its
 222  * input.  Note that the number of octets needed to represent a
 223  * sequence of code points depends on the particular character
 224  * encoding used.
 225  *
 226  * The inputs to ToUnicode are a sequence of code points, the
 227  * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
 228  * ToUnicode is always a sequence of Unicode code points.
 229  */
 230 int
 231 idna_to_unicode (const unsigned long *in, size_t inlen,
 232                  unsigned long *out, size_t *outlen,
 233                  int allowunassigned, int usestd3asciirules)
 234 {
 235   char *p;
 236   int rc;
 237   char *src;
 238
 239   /*
 240    * 1. If all code points in the sequence are in the ASCII range (0..7F)
 241    * then skip to step 3.
 242    */
 243
 244   {
 245     size_t i;
 246     int inasciirange;
 247
 248     inasciirange = 1;
 249     for (i = 0; in[i]; i++)
 250       if (in[i] > 0x7F)
 251         inasciirange = 0;
 252     if (inasciirange)
 253       goto step3;
 254   }
 255
 256   /*
 257    * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
 258    * error. (If step 3 of ToASCII is also performed here, it will not
 259    * affect the overall behavior of ToUnicode, but it is not
 260    * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
 261    */
 262
 263   p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
 264   if (p == NULL)
 265     return IDNA_MALLOC_ERROR;
 266
 267   p = realloc(p, BUFSIZ);
 268   if (p == NULL)
 269     return IDNA_MALLOC_ERROR;
 270
 271   if (allowunassigned)
 272     rc = stringprep_nameprep(p, BUFSIZ);
 273   else
 274     rc = stringprep_nameprep_no_unassigned(p, BUFSIZ);
 275
 276   if (rc != STRINGPREP_OK)
 277     return IDNA_STRINGPREP_ERROR;
 278
 279   free(src);
 280
 281   src = stringprep_utf8_to_ucs4(p, -1, NULL);
 282
 283   /* 3. Verify that the sequence begins with the ACE prefix, and save a
 284    * copy of the sequence.
 285    */
 286
 287  step3:
 288   if (memcmp(IDNA_ACE_PREFIX, p, strlen(IDNA_ACE_PREFIX)) != 0)
 289     return IDNA_NO_ACE_PREFIX;
 290
 291   /* 4. Remove the ACE prefix.
 292    */
 293
 294   memmove(p, &p[strlen(IDNA_ACE_PREFIX)], strlen(p)-strlen(IDNA_ACE_PREFIX));
 295
 296   /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
 297    * and fail if there is an error. Save a copy of the result of
 298    * this step.
 299    */
 300
 301
 302
 303   /* 6. Apply ToASCII.
 304    */
 305
 306   /* 7. Verify that the result of step 6 matches the saved copy from
 307    * step 3, using a case-insensitive ASCII comparison.
 308    */
 309
 310   /* 8. Return the saved copy from step 5.
 311    */
 312
 313   return IDNA_SUCCESS;
 314 }