iconvdata/bug-iconv12.c

   1 /* bug 19727: Testing UTF conversions with UTF16 surrogates as input.
   2    Copyright (C) 2016-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include <errno.h>
  22 #include <string.h>
  23 #include <inttypes.h>
  24 #include <iconv.h>
  25 #include <byteswap.h>
  26
  27 static int
  28 run_conversion (const char *from, const char *to, char *inbuf, size_t inbuflen,
  29                 int exp_errno, int line)
  30 {
  31   char outbuf[16];
  32   iconv_t cd;
  33   char *inptr;
  34   size_t inlen;
  35   char *outptr;
  36   size_t outlen;
  37   size_t n;
  38   int e;
  39   int fails = 0;
  40
  41   cd = iconv_open (to, from);
  42   if (cd == (iconv_t) -1)
  43     {
  44       printf ("line %d: cannot convert from %s to %s: %m\n", line, from, to);
  45       return 1;
  46     }
  47
  48   inptr = (char *) inbuf;
  49   inlen = inbuflen;
  50   outptr = outbuf;
  51   outlen = sizeof (outbuf);
  52
  53   errno = 0;
  54   n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
  55   e = errno;
  56
  57   if (exp_errno == 0)
  58     {
  59       if (n == (size_t) -1)
  60         {
  61           puts ("n should be >= 0, but n == -1");
  62           fails ++;
  63         }
  64
  65       if (e != 0)
  66         {
  67           printf ("errno should be 0: 'Success', but errno == %d: '%s'\n"
  68                   , e, strerror(e));
  69           fails ++;
  70         }
  71     }
  72   else
  73     {
  74       if (n != (size_t) -1)
  75         {
  76           printf ("n should be -1, but n == %zd\n", n);
  77           fails ++;
  78         }
  79
  80       if (e != exp_errno)
  81         {
  82           printf ("errno should be %d: '%s', but errno == %d: '%s'\n"
  83                   , exp_errno, strerror (exp_errno), e, strerror (e));
  84           fails ++;
  85         }
  86     }
  87
  88   iconv_close (cd);
  89
  90   if (fails > 0)
  91     {
  92       printf ("Errors in line %d while converting %s to %s.\n\n"
  93               , line, from, to);
  94     }
  95
  96   return fails;
  97 }
  98
  99 static int
 100 do_test (void)
 101 {
 102   int fails = 0;
 103   char buf[4];
 104
 105   /* This test runs iconv() with UTF character in range of an UTF16 surrogate.
 106      UTF-16 high surrogate is in range 0xD800..0xDBFF and
 107      UTF-16 low surrogate is in range 0xDC00..0xDFFF.
 108      Converting from or to UTF-xx has to report errors in those cases.
 109      In UTF-16, surrogate pairs with a high surrogate in front of a low
 110      surrogate is valid.  */
 111
 112   /* Use RUN_UCS4_UTF32_INPUT to test conversion ...
 113
 114      ... from INTERNAL to UTF-xx[LE|BE]:
 115      Converting from UCS4 to UTF-xx[LE|BE] first converts UCS4 to INTERNAL
 116      without checking for UTF-16 surrogate values
 117      and then converts from INTERNAL to UTF-xx[LE|BE].
 118      The latter conversion has to report an error in those cases.
 119
 120      ... from UTF-32[LE|BE] to INTERNAL:
 121      Converting directly from UTF-32LE to UTF-8|16 is needed,
 122      because e.g. s390x has iconv-modules which converts directly.  */
 123 #define RUN_UCS4_UTF32_INPUT(b0, b1, b2, b3, err, line)                 \
 124   buf[0] = b0;                                                          \
 125   buf[1] = b1;                                                          \
 126   buf[2] = b2;                                                          \
 127   buf[3] = b3;                                                          \
 128   fails += run_conversion ("UCS4", "UTF-8", buf, 4, err, line);         \
 129   fails += run_conversion ("UCS4", "UTF-16LE", buf, 4, err, line);      \
 130   fails += run_conversion ("UCS4", "UTF-16BE", buf, 4, err, line);      \
 131   fails += run_conversion ("UCS4", "UTF-32LE", buf, 4, err, line);      \
 132   fails += run_conversion ("UCS4", "UTF-32BE", buf, 4, err, line);      \
 133   fails += run_conversion ("UTF-32BE", "WCHAR_T", buf, 4, err, line);   \
 134   fails += run_conversion ("UTF-32BE", "UTF-8", buf, 4, err, line);     \
 135   fails += run_conversion ("UTF-32BE", "UTF-16LE", buf, 4, err, line);  \
 136   fails += run_conversion ("UTF-32BE", "UTF-16BE", buf, 4, err, line);  \
 137   buf[0] = b3;                                                          \
 138   buf[1] = b2;                                                          \
 139   buf[2] = b1;                                                          \
 140   buf[3] = b0;                                                          \
 141   fails += run_conversion ("UTF-32LE", "WCHAR_T", buf, 4, err, line);   \
 142   fails += run_conversion ("UTF-32LE", "UTF-8", buf, 4, err, line);     \
 143   fails += run_conversion ("UTF-32LE", "UTF-16LE", buf, 4, err, line);  \
 144   fails += run_conversion ("UTF-32LE", "UTF-16BE", buf, 4, err, line);
 145
 146   /* Use UCS4/UTF32 input of 0xD7FF.  */
 147   RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xD7, 0xFF, 0, __LINE__);
 148
 149   /* Use UCS4/UTF32 input of 0xD800.  */
 150   RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xD8, 0x00, EILSEQ, __LINE__);
 151
 152   /* Use UCS4/UTF32 input of 0xDBFF.  */
 153   RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDB, 0xFF, EILSEQ, __LINE__);
 154
 155   /* Use UCS4/UTF32 input of 0xDC00.  */
 156   RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDC, 0x00, EILSEQ, __LINE__);
 157
 158   /* Use UCS4/UTF32 input of 0xDFFF.  */
 159   RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDF, 0xFF, EILSEQ, __LINE__);
 160
 161   /* Use UCS4/UTF32 input of 0xE000.  */
 162   RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xE0, 0x00, 0, __LINE__);
 163
 164
 165   /* Use RUN_UTF16_INPUT to test conversion from UTF16[LE|BE] to INTERNAL.
 166      Converting directly from UTF-16 to UTF-8|32 is needed,
 167      because e.g. s390x has iconv-modules which converts directly.
 168      Use len == 2 or 4 to specify one or two UTF-16 characters.  */
 169 #define RUN_UTF16_INPUT(b0, b1, b2, b3, len, err, line)                 \
 170   buf[0] = b0;                                                          \
 171   buf[1] = b1;                                                          \
 172   buf[2] = b2;                                                          \
 173   buf[3] = b3;                                                          \
 174   fails += run_conversion ("UTF-16BE", "WCHAR_T", buf, len, err, line); \
 175   fails += run_conversion ("UTF-16BE", "UTF-8", buf, len, err, line);   \
 176   fails += run_conversion ("UTF-16BE", "UTF-32LE", buf, len, err, line); \
 177   fails += run_conversion ("UTF-16BE", "UTF-32BE", buf, len, err, line); \
 178   buf[0] = b1;                                                          \
 179   buf[1] = b0;                                                          \
 180   buf[2] = b3;                                                          \
 181   buf[3] = b2;                                                          \
 182   fails += run_conversion ("UTF-16LE", "WCHAR_T", buf, len, err, line); \
 183   fails += run_conversion ("UTF-16LE", "UTF-8", buf, len, err, line);   \
 184   fails += run_conversion ("UTF-16LE", "UTF-32LE", buf, len, err, line); \
 185   fails += run_conversion ("UTF-16LE", "UTF-32BE", buf, len, err, line);
 186
 187   /* Use UTF16 input of 0xD7FF.  */
 188   RUN_UTF16_INPUT (0xD7, 0xFF, 0xD7, 0xFF, 4, 0, __LINE__);
 189
 190   /* Use [single] UTF16 high surrogate 0xD800 [with a valid character behind].
 191      And check an UTF16 surrogate pair [without valid low surrogate].  */
 192   RUN_UTF16_INPUT (0xD8, 0x0, 0x0, 0x0, 2, EINVAL, __LINE__);
 193   RUN_UTF16_INPUT (0xD8, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
 194   RUN_UTF16_INPUT (0xD8, 0x0, 0xD8, 0x0, 4, EILSEQ, __LINE__);
 195   RUN_UTF16_INPUT (0xD8, 0x0, 0xE0, 0x0, 4, EILSEQ, __LINE__);
 196   RUN_UTF16_INPUT (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);
 197
 198   /* Use [single] UTF16 high surrogate 0xDBFF [with a valid character behind].
 199      And check an UTF16 surrogate pair [without valid low surrogate].  */
 200   RUN_UTF16_INPUT (0xDB, 0xFF, 0x0, 0x0, 2, EINVAL, __LINE__);
 201   RUN_UTF16_INPUT (0xDB, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
 202   RUN_UTF16_INPUT (0xDB, 0xFF, 0xDB, 0xFF, 4, EILSEQ, __LINE__);
 203   RUN_UTF16_INPUT (0xDB, 0xFF, 0xE0, 0x0, 4, EILSEQ, __LINE__);
 204   RUN_UTF16_INPUT (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);
 205
 206   /* Use single UTF16 low surrogate 0xDC00 [with a valid character behind].
 207      And check an UTF16 surrogate pair [without valid high surrogate].   */
 208   RUN_UTF16_INPUT (0xDC, 0x0, 0x0, 0x0, 2, EILSEQ, __LINE__);
 209   RUN_UTF16_INPUT (0xDC, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
 210   RUN_UTF16_INPUT (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);
 211   RUN_UTF16_INPUT (0xD7, 0xFF, 0xDC, 0x0, 4, EILSEQ, __LINE__);
 212   RUN_UTF16_INPUT (0xDC, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);
 213   RUN_UTF16_INPUT (0xE0, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);
 214
 215   /* Use single UTF16 low surrogate 0xDFFF [with a valid character behind].
 216      And check an UTF16 surrogate pair [without valid high surrogate].   */
 217   RUN_UTF16_INPUT (0xDF, 0xFF, 0x0, 0x0, 2, EILSEQ, __LINE__);
 218   RUN_UTF16_INPUT (0xDF, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
 219   RUN_UTF16_INPUT (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);
 220   RUN_UTF16_INPUT (0xD7, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
 221   RUN_UTF16_INPUT (0xDF, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
 222   RUN_UTF16_INPUT (0xE0, 0x0, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
 223
 224   /* Use UCS4/UTF32 input of 0xE000.  */
 225   RUN_UTF16_INPUT (0xE0, 0x0, 0xE0, 0x0, 4, 0, __LINE__);
 226
 227
 228   /* Use RUN_UTF8_3BYTE_INPUT to test conversion from UTF-8 to INTERNAL.
 229      Converting directly from UTF-8 to UTF-16|32 is needed,
 230      because e.g. s390x has iconv-modules which converts directly.  */
 231 #define RUN_UTF8_3BYTE_INPUT(b0, b1, b2, err, line)                     \
 232   buf[0] = b0;                                                          \
 233   buf[1] = b1;                                                          \
 234   buf[2] = b2;                                                          \
 235   fails += run_conversion ("UTF-8", "WCHAR_T", buf, 3, err, line);      \
 236   fails += run_conversion ("UTF-8", "UTF-16LE", buf, 3, err, line);     \
 237   fails += run_conversion ("UTF-8", "UTF-16BE", buf, 3, err, line);     \
 238   fails += run_conversion ("UTF-8", "UTF-32LE", buf, 3, err, line);     \
 239   fails += run_conversion ("UTF-8", "UTF-32BE", buf, 3, err, line);
 240
 241   /* Use UTF-8 input of 0xD7FF.  */
 242   RUN_UTF8_3BYTE_INPUT (0xED, 0x9F, 0xBF, 0, __LINE__);
 243
 244   /* Use UTF-8 input of 0xD800.  */
 245   RUN_UTF8_3BYTE_INPUT (0xED, 0xA0, 0x80, EILSEQ, __LINE__);
 246
 247   /* Use UTF-8 input of 0xDBFF.  */
 248   RUN_UTF8_3BYTE_INPUT (0xED, 0xAF, 0xBF, EILSEQ, __LINE__);
 249
 250   /* Use UTF-8 input of 0xDC00.  */
 251   RUN_UTF8_3BYTE_INPUT (0xED, 0xB0, 0x80, EILSEQ, __LINE__);
 252
 253   /* Use UTF-8 input of 0xDFFF.  */
 254   RUN_UTF8_3BYTE_INPUT (0xED, 0xBF, 0xBF, EILSEQ, __LINE__);
 255
 256   /* Use UTF-8 input of 0xF000.  */
 257   RUN_UTF8_3BYTE_INPUT (0xEF, 0x80, 0x80, 0, __LINE__);
 258
 259   return fails > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
 260 }
 261
/* TEST_FUNCTION expands to a call of do_test.
   NOTE(review): presumably this is the hook that the test-skeleton.c
   include below uses to run the test -- confirm against that file.  */
 262 #define TEST_FUNCTION do_test ()
 263 #include "../test-skeleton.c"