src/fe_utils/mbprint.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * Multibyte character printing support for frontend code
   4  *
   5  *
   6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  * src/fe_utils/mbprint.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #include "postgres_fe.h"
  14
  15 #include "fe_utils/mbprint.h"
  16
  17 #include "libpq-fe.h"
  18
  19
  20 /*
  21  * To avoid version-skew problems, this file must not use declarations
  22  * from pg_wchar.h: the encoding IDs we are dealing with are determined
  23  * by the libpq.so we are linked with, and that might not match the
  24  * numbers we see at compile time.  (If this file were inside libpq,
  25  * the problem would go away...)
  26  *
  27  * Hence, we have our own definition of pg_wchar, and we get the values
  28  * of any needed encoding IDs on-the-fly.
  29  */
  30
  31 typedef unsigned int pg_wchar;
  32
  33 static int
  34 pg_get_utf8_id(void)
  35 {
  36         static int      utf8_id = -1;
  37
  38         if (utf8_id < 0)
  39                 utf8_id = pg_char_to_encoding("utf8");
  40         return utf8_id;
  41 }
  42
  43 #define PG_UTF8         pg_get_utf8_id()
  44
  45
  46 /*
  47  * Convert a UTF-8 character to a Unicode code point.
  48  * This is a one-character version of pg_utf2wchar_with_len.
  49  *
  50  * No error checks here, c must point to a long-enough string.
  51  */
  52 static pg_wchar
  53 utf8_to_unicode(const unsigned char *c)
  54 {
  55         if ((*c & 0x80) == 0)
  56                 return (pg_wchar) c[0];
  57         else if ((*c & 0xe0) == 0xc0)
  58                 return (pg_wchar) (((c[0] & 0x1f) << 6) |
  59                                                    (c[1] & 0x3f));
  60         else if ((*c & 0xf0) == 0xe0)
  61                 return (pg_wchar) (((c[0] & 0x0f) << 12) |
  62                                                    ((c[1] & 0x3f) << 6) |
  63                                                    (c[2] & 0x3f));
  64         else if ((*c & 0xf8) == 0xf0)
  65                 return (pg_wchar) (((c[0] & 0x07) << 18) |
  66                                                    ((c[1] & 0x3f) << 12) |
  67                                                    ((c[2] & 0x3f) << 6) |
  68                                                    (c[3] & 0x3f));
  69         else
  70                 /* that is an invalid code on purpose */
  71                 return 0xffffffff;
  72 }
  73
  74
  75 /*
  76  * Unicode 3.1 compliant validation : for each category, it checks the
  77  * combination of each byte to make sure it maps to a valid range. It also
  78  * returns -1 for the following UCS values: ucs > 0x10ffff ucs & 0xfffe =
  79  * 0xfffe 0xfdd0 < ucs < 0xfdef ucs & 0xdb00 = 0xd800 (surrogates)
  80  */
  81 static int
  82 utf_charcheck(const unsigned char *c)
  83 {
  84         if ((*c & 0x80) == 0)
  85                 return 1;
  86         else if ((*c & 0xe0) == 0xc0)
  87         {
  88                 /* two-byte char */
  89                 if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
  90                         return 2;
  91                 return -1;
  92         }
  93         else if ((*c & 0xf0) == 0xe0)
  94         {
  95                 /* three-byte char */
  96                 if (((c[1] & 0xc0) == 0x80) &&
  97                         (((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
  98                         ((c[2] & 0xc0) == 0x80))
  99                 {
 100                         int                     z = c[0] & 0x0f;
 101                         int                     yx = ((c[1] & 0x3f) << 6) | (c[0] & 0x3f);
 102                         int                     lx = yx & 0x7f;
 103
 104                         /* check 0xfffe/0xffff, 0xfdd0..0xfedf range, surrogates */
 105                         if (((z == 0x0f) &&
 106                                  (((yx & 0xffe) == 0xffe) ||
 107                                   (((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) ||
 108                                 ((z == 0x0d) && ((yx & 0xb00) == 0x800)))
 109                                 return -1;
 110                         return 3;
 111                 }
 112                 return -1;
 113         }
 114         else if ((*c & 0xf8) == 0xf0)
 115         {
 116                 int                     u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);
 117
 118                 /* four-byte char */
 119                 if (((c[1] & 0xc0) == 0x80) &&
 120                         (u > 0x00) && (u <= 0x10) &&
 121                         ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
 122                 {
 123                         /* test for 0xzzzzfffe/0xzzzzfffff */
 124                         if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
 125                                 ((c[3] & 0x3e) == 0x3e))
 126                                 return -1;
 127                         return 4;
 128                 }
 129                 return -1;
 130         }
 131         return -1;
 132 }
 133
 134
 135 static void
 136 mb_utf_validate(unsigned char *pwcs)
 137 {
 138         unsigned char *p = pwcs;
 139
 140         while (*pwcs)
 141         {
 142                 int                     len;
 143
 144                 if ((len = utf_charcheck(pwcs)) > 0)
 145                 {
 146                         if (p != pwcs)
 147                         {
 148                                 int                     i;
 149
 150                                 for (i = 0; i < len; i++)
 151                                         *p++ = *pwcs++;
 152                         }
 153                         else
 154                         {
 155                                 pwcs += len;
 156                                 p += len;
 157                         }
 158                 }
 159                 else
 160                         /* we skip the char */
 161                         pwcs++;
 162         }
 163         if (p != pwcs)
 164                 *p = '\0';
 165 }
 166
 167 /*
 168  * public functions : wcswidth and mbvalidate
 169  */
 170
 171 /*
 172  * pg_wcswidth is the dumb display-width function.
 173  * It assumes that everything will appear on one line.
 174  * OTOH it is easier to use than pg_wcssize if this applies to you.
 175  */
 176 int
 177 pg_wcswidth(const char *pwcs, size_t len, int encoding)
 178 {
 179         int                     width = 0;
 180
 181         while (len > 0)
 182         {
 183                 int                     chlen,
 184                                         chwidth;
 185
 186                 chlen = PQmblen(pwcs, encoding);
 187                 if (len < (size_t) chlen)
 188                         break;                          /* Invalid string */
 189
 190                 chwidth = PQdsplen(pwcs, encoding);
 191                 if (chwidth > 0)
 192                         width += chwidth;
 193
 194                 pwcs += chlen;
 195                 len -= chlen;
 196         }
 197         return width;
 198 }
 199
 200 /*
 201  * pg_wcssize takes the given string in the given encoding and returns three
 202  * values:
 203  *        result_width: Width in display characters of the longest line in string
 204  *        result_height: Number of lines in display output
 205  *        result_format_size: Number of bytes required to store formatted
 206  *              representation of string
 207  *
 208  * This MUST be kept in sync with pg_wcsformat!
 209  */
 210 void
 211 pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
 212                    int *result_width, int *result_height, int *result_format_size)
 213 {
 214         int                     w,
 215                                 chlen = 0,
 216                                 linewidth = 0;
 217         int                     width = 0;
 218         int                     height = 1;
 219         int                     format_size = 0;
 220
 221         for (; *pwcs && len > 0; pwcs += chlen)
 222         {
 223                 chlen = PQmblen((const char *) pwcs, encoding);
 224                 if (len < (size_t) chlen)
 225                         break;
 226                 w = PQdsplen((const char *) pwcs, encoding);
 227
 228                 if (chlen == 1)                 /* single-byte char */
 229                 {
 230                         if (*pwcs == '\n')      /* Newline */
 231                         {
 232                                 if (linewidth > width)
 233                                         width = linewidth;
 234                                 linewidth = 0;
 235                                 height += 1;
 236                                 format_size += 1;       /* For NUL char */
 237                         }
 238                         else if (*pwcs == '\r') /* Linefeed */
 239                         {
 240                                 linewidth += 2;
 241                                 format_size += 2;
 242                         }
 243                         else if (*pwcs == '\t') /* Tab */
 244                         {
 245                                 do
 246                                 {
 247                                         linewidth++;
 248                                         format_size++;
 249                                 } while (linewidth % 8 != 0);
 250                         }
 251                         else if (w < 0)         /* Other control char */
 252                         {
 253                                 linewidth += 4;
 254                                 format_size += 4;
 255                         }
 256                         else                            /* Output it as-is */
 257                         {
 258                                 linewidth += w;
 259                                 format_size += 1;
 260                         }
 261                 }
 262                 else if (w < 0)                 /* Non-ascii control char */
 263                 {
 264                         linewidth += 6;         /* \u0000 */
 265                         format_size += 6;
 266                 }
 267                 else                                    /* All other chars */
 268                 {
 269                         linewidth += w;
 270                         format_size += chlen;
 271                 }
 272                 len -= chlen;
 273         }
 274         if (linewidth > width)
 275                 width = linewidth;
 276         format_size += 1;                       /* For NUL char */
 277
 278         /* Set results */
 279         if (result_width)
 280                 *result_width = width;
 281         if (result_height)
 282                 *result_height = height;
 283         if (result_format_size)
 284                 *result_format_size = format_size;
 285 }
 286
 287 /*
 288  *      Format a string into one or more "struct lineptr" lines.
 289  *      lines[i].ptr == NULL indicates the end of the array.
 290  *
 291  * This MUST be kept in sync with pg_wcssize!
 292  */
 293 void
 294 pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
 295                          struct lineptr *lines, int count)
 296 {
 297         int                     w,
 298                                 chlen = 0;
 299         int                     linewidth = 0;
 300         unsigned char *ptr = lines->ptr;        /* Pointer to data area */
 301
 302         for (; *pwcs && len > 0; pwcs += chlen)
 303         {
 304                 chlen = PQmblen((const char *) pwcs, encoding);
 305                 if (len < (size_t) chlen)
 306                         break;
 307                 w = PQdsplen((const char *) pwcs, encoding);
 308
 309                 if (chlen == 1)                 /* single-byte char */
 310                 {
 311                         if (*pwcs == '\n')      /* Newline */
 312                         {
 313                                 *ptr++ = '\0';
 314                                 lines->width = linewidth;
 315                                 linewidth = 0;
 316                                 lines++;
 317                                 count--;
 318                                 if (count <= 0)
 319                                         exit(1);        /* Screwup */
 320
 321                                 /* make next line point to remaining memory */
 322                                 lines->ptr = ptr;
 323                         }
 324                         else if (*pwcs == '\r') /* Linefeed */
 325                         {
 326                                 strcpy((char *) ptr, "\\r");
 327                                 linewidth += 2;
 328                                 ptr += 2;
 329                         }
 330                         else if (*pwcs == '\t') /* Tab */
 331                         {
 332                                 do
 333                                 {
 334                                         *ptr++ = ' ';
 335                                         linewidth++;
 336                                 } while (linewidth % 8 != 0);
 337                         }
 338                         else if (w < 0)         /* Other control char */
 339                         {
 340                                 sprintf((char *) ptr, "\\x%02X", *pwcs);
 341                                 linewidth += 4;
 342                                 ptr += 4;
 343                         }
 344                         else                            /* Output it as-is */
 345                         {
 346                                 linewidth += w;
 347                                 *ptr++ = *pwcs;
 348                         }
 349                 }
 350                 else if (w < 0)                 /* Non-ascii control char */
 351                 {
 352                         if (encoding == PG_UTF8)
 353                                 sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
 354                         else
 355                         {
 356                                 /*
 357                                  * This case cannot happen in the current code because only
 358                                  * UTF-8 signals multibyte control characters. But we may need
 359                                  * to support it at some stage
 360                                  */
 361                                 sprintf((char *) ptr, "\\u????");
 362                         }
 363                         ptr += 6;
 364                         linewidth += 6;
 365                 }
 366                 else                                    /* All other chars */
 367                 {
 368                         int                     i;
 369
 370                         for (i = 0; i < chlen; i++)
 371                                 *ptr++ = pwcs[i];
 372                         linewidth += w;
 373                 }
 374                 len -= chlen;
 375         }
 376         lines->width = linewidth;
 377         *ptr++ = '\0';                          /* Terminate formatted string */
 378
 379         if (count <= 0)
 380                 exit(1);                                /* Screwup */
 381
 382         (lines + 1)->ptr = NULL;        /* terminate line array */
 383 }
 384
 385
 386 /*
 387  * Encoding validation: delete any unvalidatable characters from the string
 388  *
 389  * This seems redundant with existing functionality elsewhere?
 390  */
 391 unsigned char *
 392 mbvalidate(unsigned char *pwcs, int encoding)
 393 {
 394         if (encoding == PG_UTF8)
 395                 mb_utf_validate(pwcs);
 396         else
 397         {
 398                 /*
 399                  * other encodings needing validation should add their own routines
 400                  * here
 401                  */
 402         }
 403
 404         return pwcs;
 405 }