src/backend/utils/mb/wchar.c

   1 /*
   2  * conversion functions between pg_wchar and multibyte streams.
   3  * Tatsuo Ishii
   4  * src/backend/utils/mb/wchar.c
   5  *
   6  */
   7 /* can be used in either frontend or backend */
   8 #ifdef FRONTEND
   9 #include "postgres_fe.h"
  10 #else
  11 #include "postgres.h"
  12 #endif
  13
  14 #include "mb/pg_wchar.h"
  15
  16
  17 /*
  18  * Operations on multi-byte encodings are driven by a table of helper
  19  * functions.
  20  *
  21  * To add an encoding support, define mblen(), dsplen() and verifier() for
  22  * the encoding.  For server-encodings, also define mb2wchar() and wchar2mb()
  23  * conversion functions.
  24  *
  25  * These functions generally assume that their input is validly formed.
  26  * The "verifier" functions, further down in the file, have to be more
  27  * paranoid.
  28  *
  29  * We expect that mblen() does not need to examine more than the first byte
  30  * of the character to discover the correct length.  GB18030 is an exception
  31  * to that rule, though, as it also looks at second byte.  But even that
  32  * behaves in a predictable way, if you only pass the first byte: it will
  33  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
  34  * good enough for all current uses.
  35  *
  36  * Note: for the display output of psql to work properly, the return values
  37  * of the dsplen functions must conform to the Unicode standard. In particular
  38  * the NUL character is zero width and control characters are generally
  39  * width -1. It is recommended that non-ASCII encodings refer their ASCII
  40  * subset to the ASCII routines to ensure consistency.
  41  */
  42
  43 /*
  44  * SQL/ASCII
  45  */
  46 static int
  47 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  48 {
  49         int                     cnt = 0;
  50
  51         while (len > 0 && *from)
  52         {
  53                 *to++ = *from++;
  54                 len--;
  55                 cnt++;
  56         }
  57         *to = 0;
  58         return cnt;
  59 }
  60
  61 static int
  62 pg_ascii_mblen(const unsigned char *s)
  63 {
  64         return 1;
  65 }
  66
  67 static int
  68 pg_ascii_dsplen(const unsigned char *s)
  69 {
  70         if (*s == '\0')
  71                 return 0;
  72         if (*s < 0x20 || *s == 0x7f)
  73                 return -1;
  74
  75         return 1;
  76 }
  77
  78 /*
  79  * EUC
  80  */
  81 static int
  82 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  83 {
  84         int                     cnt = 0;
  85
  86         while (len > 0 && *from)
  87         {
  88                 if (*from == SS2 && len >= 2)   /* JIS X 0201 (so called "1 byte
  89                                                                                  * KANA") */
  90                 {
  91                         from++;
  92                         *to = (SS2 << 8) | *from++;
  93                         len -= 2;
  94                 }
  95                 else if (*from == SS3 && len >= 3)      /* JIS X 0212 KANJI */
  96                 {
  97                         from++;
  98                         *to = (SS3 << 16) | (*from++ << 8);
  99                         *to |= *from++;
 100                         len -= 3;
 101                 }
 102                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
 103                 {
 104                         *to = *from++ << 8;
 105                         *to |= *from++;
 106                         len -= 2;
 107                 }
 108                 else                                    /* must be ASCII */
 109                 {
 110                         *to = *from++;
 111                         len--;
 112                 }
 113                 to++;
 114                 cnt++;
 115         }
 116         *to = 0;
 117         return cnt;
 118 }
 119
 120 static inline int
 121 pg_euc_mblen(const unsigned char *s)
 122 {
 123         int                     len;
 124
 125         if (*s == SS2)
 126                 len = 2;
 127         else if (*s == SS3)
 128                 len = 3;
 129         else if (IS_HIGHBIT_SET(*s))
 130                 len = 2;
 131         else
 132                 len = 1;
 133         return len;
 134 }
 135
 136 static inline int
 137 pg_euc_dsplen(const unsigned char *s)
 138 {
 139         int                     len;
 140
 141         if (*s == SS2)
 142                 len = 2;
 143         else if (*s == SS3)
 144                 len = 2;
 145         else if (IS_HIGHBIT_SET(*s))
 146                 len = 2;
 147         else
 148                 len = pg_ascii_dsplen(s);
 149         return len;
 150 }
 151
 152 /*
 153  * EUC_JP
 154  */
 155 static int
 156 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 157 {
 158         return pg_euc2wchar_with_len(from, to, len);
 159 }
 160
 161 static int
 162 pg_eucjp_mblen(const unsigned char *s)
 163 {
 164         return pg_euc_mblen(s);
 165 }
 166
 167 static int
 168 pg_eucjp_dsplen(const unsigned char *s)
 169 {
 170         int                     len;
 171
 172         if (*s == SS2)
 173                 len = 1;
 174         else if (*s == SS3)
 175                 len = 2;
 176         else if (IS_HIGHBIT_SET(*s))
 177                 len = 2;
 178         else
 179                 len = pg_ascii_dsplen(s);
 180         return len;
 181 }
 182
 183 /*
 184  * EUC_KR
 185  */
 186 static int
 187 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 188 {
 189         return pg_euc2wchar_with_len(from, to, len);
 190 }
 191
 192 static int
 193 pg_euckr_mblen(const unsigned char *s)
 194 {
 195         return pg_euc_mblen(s);
 196 }
 197
 198 static int
 199 pg_euckr_dsplen(const unsigned char *s)
 200 {
 201         return pg_euc_dsplen(s);
 202 }
 203
 204 /*
 205  * EUC_CN
 206  *
 207  */
 208 static int
 209 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 210 {
 211         int                     cnt = 0;
 212
 213         while (len > 0 && *from)
 214         {
 215                 if (*from == SS2 && len >= 3)   /* code set 2 (unused?) */
 216                 {
 217                         from++;
 218                         *to = (SS2 << 16) | (*from++ << 8);
 219                         *to |= *from++;
 220                         len -= 3;
 221                 }
 222                 else if (*from == SS3 && len >= 3)      /* code set 3 (unused ?) */
 223                 {
 224                         from++;
 225                         *to = (SS3 << 16) | (*from++ << 8);
 226                         *to |= *from++;
 227                         len -= 3;
 228                 }
 229                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
 230                 {
 231                         *to = *from++ << 8;
 232                         *to |= *from++;
 233                         len -= 2;
 234                 }
 235                 else
 236                 {
 237                         *to = *from++;
 238                         len--;
 239                 }
 240                 to++;
 241                 cnt++;
 242         }
 243         *to = 0;
 244         return cnt;
 245 }
 246
 247 static int
 248 pg_euccn_mblen(const unsigned char *s)
 249 {
 250         int                     len;
 251
 252         if (IS_HIGHBIT_SET(*s))
 253                 len = 2;
 254         else
 255                 len = 1;
 256         return len;
 257 }
 258
 259 static int
 260 pg_euccn_dsplen(const unsigned char *s)
 261 {
 262         int                     len;
 263
 264         if (IS_HIGHBIT_SET(*s))
 265                 len = 2;
 266         else
 267                 len = pg_ascii_dsplen(s);
 268         return len;
 269 }
 270
 271 /*
 272  * EUC_TW
 273  *
 274  */
 275 static int
 276 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 277 {
 278         int                     cnt = 0;
 279
 280         while (len > 0 && *from)
 281         {
 282                 if (*from == SS2 && len >= 4)   /* code set 2 */
 283                 {
 284                         from++;
 285                         *to = (((uint32) SS2) << 24) | (*from++ << 16);
 286                         *to |= *from++ << 8;
 287                         *to |= *from++;
 288                         len -= 4;
 289                 }
 290                 else if (*from == SS3 && len >= 3)      /* code set 3 (unused?) */
 291                 {
 292                         from++;
 293                         *to = (SS3 << 16) | (*from++ << 8);
 294                         *to |= *from++;
 295                         len -= 3;
 296                 }
 297                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
 298                 {
 299                         *to = *from++ << 8;
 300                         *to |= *from++;
 301                         len -= 2;
 302                 }
 303                 else
 304                 {
 305                         *to = *from++;
 306                         len--;
 307                 }
 308                 to++;
 309                 cnt++;
 310         }
 311         *to = 0;
 312         return cnt;
 313 }
 314
 315 static int
 316 pg_euctw_mblen(const unsigned char *s)
 317 {
 318         int                     len;
 319
 320         if (*s == SS2)
 321                 len = 4;
 322         else if (*s == SS3)
 323                 len = 3;
 324         else if (IS_HIGHBIT_SET(*s))
 325                 len = 2;
 326         else
 327                 len = 1;
 328         return len;
 329 }
 330
 331 static int
 332 pg_euctw_dsplen(const unsigned char *s)
 333 {
 334         int                     len;
 335
 336         if (*s == SS2)
 337                 len = 2;
 338         else if (*s == SS3)
 339                 len = 2;
 340         else if (IS_HIGHBIT_SET(*s))
 341                 len = 2;
 342         else
 343                 len = pg_ascii_dsplen(s);
 344         return len;
 345 }
 346
 347 /*
 348  * Convert pg_wchar to EUC_* encoding.
 349  * caller must allocate enough space for "to", including a trailing zero!
 350  * len: length of from.
 351  * "from" not necessarily null terminated.
 352  */
 353 static int
 354 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
 355 {
 356         int                     cnt = 0;
 357
 358         while (len > 0 && *from)
 359         {
 360                 unsigned char c;
 361
 362                 if ((c = (*from >> 24)))
 363                 {
 364                         *to++ = c;
 365                         *to++ = (*from >> 16) & 0xff;
 366                         *to++ = (*from >> 8) & 0xff;
 367                         *to++ = *from & 0xff;
 368                         cnt += 4;
 369                 }
 370                 else if ((c = (*from >> 16)))
 371                 {
 372                         *to++ = c;
 373                         *to++ = (*from >> 8) & 0xff;
 374                         *to++ = *from & 0xff;
 375                         cnt += 3;
 376                 }
 377                 else if ((c = (*from >> 8)))
 378                 {
 379                         *to++ = c;
 380                         *to++ = *from & 0xff;
 381                         cnt += 2;
 382                 }
 383                 else
 384                 {
 385                         *to++ = *from;
 386                         cnt++;
 387                 }
 388                 from++;
 389                 len--;
 390         }
 391         *to = 0;
 392         return cnt;
 393 }
 394
 395
 396 /*
 397  * JOHAB
 398  */
 399 static int
 400 pg_johab_mblen(const unsigned char *s)
 401 {
 402         return pg_euc_mblen(s);
 403 }
 404
 405 static int
 406 pg_johab_dsplen(const unsigned char *s)
 407 {
 408         return pg_euc_dsplen(s);
 409 }
 410
 411 /*
 412  * convert UTF8 string to pg_wchar (UCS-4)
 413  * caller must allocate enough space for "to", including a trailing zero!
 414  * len: length of from.
 415  * "from" not necessarily null terminated.
 416  */
 417 static int
 418 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 419 {
 420         int                     cnt = 0;
 421         uint32          c1,
 422                                 c2,
 423                                 c3,
 424                                 c4;
 425
 426         while (len > 0 && *from)
 427         {
 428                 if ((*from & 0x80) == 0)
 429                 {
 430                         *to = *from++;
 431                         len--;
 432                 }
 433                 else if ((*from & 0xe0) == 0xc0)
 434                 {
 435                         if (len < 2)
 436                                 break;                  /* drop trailing incomplete char */
 437                         c1 = *from++ & 0x1f;
 438                         c2 = *from++ & 0x3f;
 439                         *to = (c1 << 6) | c2;
 440                         len -= 2;
 441                 }
 442                 else if ((*from & 0xf0) == 0xe0)
 443                 {
 444                         if (len < 3)
 445                                 break;                  /* drop trailing incomplete char */
 446                         c1 = *from++ & 0x0f;
 447                         c2 = *from++ & 0x3f;
 448                         c3 = *from++ & 0x3f;
 449                         *to = (c1 << 12) | (c2 << 6) | c3;
 450                         len -= 3;
 451                 }
 452                 else if ((*from & 0xf8) == 0xf0)
 453                 {
 454                         if (len < 4)
 455                                 break;                  /* drop trailing incomplete char */
 456                         c1 = *from++ & 0x07;
 457                         c2 = *from++ & 0x3f;
 458                         c3 = *from++ & 0x3f;
 459                         c4 = *from++ & 0x3f;
 460                         *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
 461                         len -= 4;
 462                 }
 463                 else
 464                 {
 465                         /* treat a bogus char as length 1; not ours to raise error */
 466                         *to = *from++;
 467                         len--;
 468                 }
 469                 to++;
 470                 cnt++;
 471         }
 472         *to = 0;
 473         return cnt;
 474 }
 475
 476
 477 /*
 478  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
 479  * space allocated.
 480  */
 481 unsigned char *
 482 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
 483 {
 484         if (c <= 0x7F)
 485         {
 486                 utf8string[0] = c;
 487         }
 488         else if (c <= 0x7FF)
 489         {
 490                 utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
 491                 utf8string[1] = 0x80 | (c & 0x3F);
 492         }
 493         else if (c <= 0xFFFF)
 494         {
 495                 utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
 496                 utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
 497                 utf8string[2] = 0x80 | (c & 0x3F);
 498         }
 499         else
 500         {
 501                 utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
 502                 utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
 503                 utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
 504                 utf8string[3] = 0x80 | (c & 0x3F);
 505         }
 506
 507         return utf8string;
 508 }
 509
 510 /*
 511  * Trivial conversion from pg_wchar to UTF-8.
 512  * caller should allocate enough space for "to"
 513  * len: length of from.
 514  * "from" not necessarily null terminated.
 515  */
 516 static int
 517 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
 518 {
 519         int                     cnt = 0;
 520
 521         while (len > 0 && *from)
 522         {
 523                 int                     char_len;
 524
 525                 unicode_to_utf8(*from, to);
 526                 char_len = pg_utf_mblen(to);
 527                 cnt += char_len;
 528                 to += char_len;
 529                 from++;
 530                 len--;
 531         }
 532         *to = 0;
 533         return cnt;
 534 }
 535
 536 /*
 537  * Return the byte length of a UTF8 character pointed to by s
 538  *
 539  * Note: in the current implementation we do not support UTF8 sequences
 540  * of more than 4 bytes; hence do NOT return a value larger than 4.
 541  * We return "1" for any leading byte that is either flat-out illegal or
 542  * indicates a length larger than we support.
 543  *
 544  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 545  * other places would need to be fixed to change this.
 546  */
 547 int
 548 pg_utf_mblen(const unsigned char *s)
 549 {
 550         int                     len;
 551
 552         if ((*s & 0x80) == 0)
 553                 len = 1;
 554         else if ((*s & 0xe0) == 0xc0)
 555                 len = 2;
 556         else if ((*s & 0xf0) == 0xe0)
 557                 len = 3;
 558         else if ((*s & 0xf8) == 0xf0)
 559                 len = 4;
 560 #ifdef NOT_USED
 561         else if ((*s & 0xfc) == 0xf8)
 562                 len = 5;
 563         else if ((*s & 0xfe) == 0xfc)
 564                 len = 6;
 565 #endif
 566         else
 567                 len = 1;
 568         return len;
 569 }
 570
 571 /*
 572  * This is an implementation of wcwidth() and wcswidth() as defined in
 573  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
 574  * <http://www.unix.org/online.html>
 575  *
 576  * Markus Kuhn -- 2001-09-08 -- public domain
 577  *
 578  * customised for PostgreSQL
 579  *
 580  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 581  */
 582
 583 struct mbinterval
 584 {
 585         unsigned short first;
 586         unsigned short last;
 587 };
 588
 589 /* auxiliary function for binary search in interval table */
 590 static int
 591 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
 592 {
 593         int                     min = 0;
 594         int                     mid;
 595
 596         if (ucs < table[0].first || ucs > table[max].last)
 597                 return 0;
 598         while (max >= min)
 599         {
 600                 mid = (min + max) / 2;
 601                 if (ucs > table[mid].last)
 602                         min = mid + 1;
 603                 else if (ucs < table[mid].first)
 604                         max = mid - 1;
 605                 else
 606                         return 1;
 607         }
 608
 609         return 0;
 610 }
 611
 612
 613 /* The following functions define the column width of an ISO 10646
 614  * character as follows:
 615  *
 616  *        - The null character (U+0000) has a column width of 0.
 617  *
 618  *        - Other C0/C1 control characters and DEL will lead to a return
 619  *              value of -1.
 620  *
 621  *        - Non-spacing and enclosing combining characters (general
 622  *              category code Mn or Me in the Unicode database) have a
 623  *              column width of 0.
 624  *
 625  *        - Other format characters (general category code Cf in the Unicode
 626  *              database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 627  *
 628  *        - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
 629  *              have a column width of 0.
 630  *
 631  *        - Spacing characters in the East Asian Wide (W) or East Asian
 632  *              FullWidth (F) category as defined in Unicode Technical
 633  *              Report #11 have a column width of 2.
 634  *
 635  *        - All remaining characters (including all printable
 636  *              ISO 8859-1 and WGL4 characters, Unicode control characters,
 637  *              etc.) have a column width of 1.
 638  *
 639  * This implementation assumes that wchar_t characters are encoded
 640  * in ISO 10646.
 641  */
 642
 643 static int
 644 ucs_wcwidth(pg_wchar ucs)
 645 {
 646 #include "common/unicode_combining_table.h"
 647
 648         /* test for 8-bit control characters */
 649         if (ucs == 0)
 650                 return 0;
 651
 652         if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
 653                 return -1;
 654
 655         /* binary search in table of non-spacing characters */
 656         if (mbbisearch(ucs, combining,
 657                                    sizeof(combining) / sizeof(struct mbinterval) - 1))
 658                 return 0;
 659
 660         /*
 661          * if we arrive here, ucs is not a combining or C0/C1 control character
 662          */
 663
 664         return 1 +
 665                 (ucs >= 0x1100 &&
 666                  (ucs <= 0x115f ||              /* Hangul Jamo init. consonants */
 667                   (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
 668                    ucs != 0x303f) ||    /* CJK ... Yi */
 669                   (ucs >= 0xac00 && ucs <= 0xd7a3) ||   /* Hangul Syllables */
 670                   (ucs >= 0xf900 && ucs <= 0xfaff) ||   /* CJK Compatibility
 671                                                                                                  * Ideographs */
 672                   (ucs >= 0xfe30 && ucs <= 0xfe6f) ||   /* CJK Compatibility Forms */
 673                   (ucs >= 0xff00 && ucs <= 0xff5f) ||   /* Fullwidth Forms */
 674                   (ucs >= 0xffe0 && ucs <= 0xffe6) ||
 675                   (ucs >= 0x20000 && ucs <= 0x2ffff)));
 676 }
 677
 678 /*
 679  * Convert a UTF-8 character to a Unicode code point.
 680  * This is a one-character version of pg_utf2wchar_with_len.
 681  *
 682  * No error checks here, c must point to a long-enough string.
 683  */
 684 pg_wchar
 685 utf8_to_unicode(const unsigned char *c)
 686 {
 687         if ((*c & 0x80) == 0)
 688                 return (pg_wchar) c[0];
 689         else if ((*c & 0xe0) == 0xc0)
 690                 return (pg_wchar) (((c[0] & 0x1f) << 6) |
 691                                                    (c[1] & 0x3f));
 692         else if ((*c & 0xf0) == 0xe0)
 693                 return (pg_wchar) (((c[0] & 0x0f) << 12) |
 694                                                    ((c[1] & 0x3f) << 6) |
 695                                                    (c[2] & 0x3f));
 696         else if ((*c & 0xf8) == 0xf0)
 697                 return (pg_wchar) (((c[0] & 0x07) << 18) |
 698                                                    ((c[1] & 0x3f) << 12) |
 699                                                    ((c[2] & 0x3f) << 6) |
 700                                                    (c[3] & 0x3f));
 701         else
 702                 /* that is an invalid code on purpose */
 703                 return 0xffffffff;
 704 }
 705
 706 static int
 707 pg_utf_dsplen(const unsigned char *s)
 708 {
 709         return ucs_wcwidth(utf8_to_unicode(s));
 710 }
 711
 712 /*
 713  * convert mule internal code to pg_wchar
 714  * caller should allocate enough space for "to"
 715  * len: length of from.
 716  * "from" not necessarily null terminated.
 717  */
 718 static int
 719 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 720 {
 721         int                     cnt = 0;
 722
 723         while (len > 0 && *from)
 724         {
 725                 if (IS_LC1(*from) && len >= 2)
 726                 {
 727                         *to = *from++ << 16;
 728                         *to |= *from++;
 729                         len -= 2;
 730                 }
 731                 else if (IS_LCPRV1(*from) && len >= 3)
 732                 {
 733                         from++;
 734                         *to = *from++ << 16;
 735                         *to |= *from++;
 736                         len -= 3;
 737                 }
 738                 else if (IS_LC2(*from) && len >= 3)
 739                 {
 740                         *to = *from++ << 16;
 741                         *to |= *from++ << 8;
 742                         *to |= *from++;
 743                         len -= 3;
 744                 }
 745                 else if (IS_LCPRV2(*from) && len >= 4)
 746                 {
 747                         from++;
 748                         *to = *from++ << 16;
 749                         *to |= *from++ << 8;
 750                         *to |= *from++;
 751                         len -= 4;
 752                 }
 753                 else
 754                 {                                               /* assume ASCII */
 755                         *to = (unsigned char) *from++;
 756                         len--;
 757                 }
 758                 to++;
 759                 cnt++;
 760         }
 761         *to = 0;
 762         return cnt;
 763 }
 764
 765 /*
 766  * convert pg_wchar to mule internal code
 767  * caller should allocate enough space for "to"
 768  * len: length of from.
 769  * "from" not necessarily null terminated.
 770  */
 771 static int
 772 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
 773 {
 774         int                     cnt = 0;
 775
 776         while (len > 0 && *from)
 777         {
 778                 unsigned char lb;
 779
 780                 lb = (*from >> 16) & 0xff;
 781                 if (IS_LC1(lb))
 782                 {
 783                         *to++ = lb;
 784                         *to++ = *from & 0xff;
 785                         cnt += 2;
 786                 }
 787                 else if (IS_LC2(lb))
 788                 {
 789                         *to++ = lb;
 790                         *to++ = (*from >> 8) & 0xff;
 791                         *to++ = *from & 0xff;
 792                         cnt += 3;
 793                 }
 794                 else if (IS_LCPRV1_A_RANGE(lb))
 795                 {
 796                         *to++ = LCPRV1_A;
 797                         *to++ = lb;
 798                         *to++ = *from & 0xff;
 799                         cnt += 3;
 800                 }
 801                 else if (IS_LCPRV1_B_RANGE(lb))
 802                 {
 803                         *to++ = LCPRV1_B;
 804                         *to++ = lb;
 805                         *to++ = *from & 0xff;
 806                         cnt += 3;
 807                 }
 808                 else if (IS_LCPRV2_A_RANGE(lb))
 809                 {
 810                         *to++ = LCPRV2_A;
 811                         *to++ = lb;
 812                         *to++ = (*from >> 8) & 0xff;
 813                         *to++ = *from & 0xff;
 814                         cnt += 4;
 815                 }
 816                 else if (IS_LCPRV2_B_RANGE(lb))
 817                 {
 818                         *to++ = LCPRV2_B;
 819                         *to++ = lb;
 820                         *to++ = (*from >> 8) & 0xff;
 821                         *to++ = *from & 0xff;
 822                         cnt += 4;
 823                 }
 824                 else
 825                 {
 826                         *to++ = *from & 0xff;
 827                         cnt += 1;
 828                 }
 829                 from++;
 830                 len--;
 831         }
 832         *to = 0;
 833         return cnt;
 834 }
 835
 836 int
 837 pg_mule_mblen(const unsigned char *s)
 838 {
 839         int                     len;
 840
 841         if (IS_LC1(*s))
 842                 len = 2;
 843         else if (IS_LCPRV1(*s))
 844                 len = 3;
 845         else if (IS_LC2(*s))
 846                 len = 3;
 847         else if (IS_LCPRV2(*s))
 848                 len = 4;
 849         else
 850                 len = 1;                                /* assume ASCII */
 851         return len;
 852 }
 853
 854 static int
 855 pg_mule_dsplen(const unsigned char *s)
 856 {
 857         int                     len;
 858
 859         /*
 860          * Note: it's not really appropriate to assume that all multibyte charsets
 861          * are double-wide on screen.  But this seems an okay approximation for
 862          * the MULE charsets we currently support.
 863          */
 864
 865         if (IS_LC1(*s))
 866                 len = 1;
 867         else if (IS_LCPRV1(*s))
 868                 len = 1;
 869         else if (IS_LC2(*s))
 870                 len = 2;
 871         else if (IS_LCPRV2(*s))
 872                 len = 2;
 873         else
 874                 len = 1;                                /* assume ASCII */
 875
 876         return len;
 877 }
 878
 879 /*
 880  * ISO8859-1
 881  */
 882 static int
 883 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 884 {
 885         int                     cnt = 0;
 886
 887         while (len > 0 && *from)
 888         {
 889                 *to++ = *from++;
 890                 len--;
 891                 cnt++;
 892         }
 893         *to = 0;
 894         return cnt;
 895 }
 896
 897 /*
 898  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
 899  * high bits.
 900  * caller should allocate enough space for "to"
 901  * len: length of from.
 902  * "from" not necessarily null terminated.
 903  */
 904 static int
 905 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
 906 {
 907         int                     cnt = 0;
 908
 909         while (len > 0 && *from)
 910         {
 911                 *to++ = *from++;
 912                 len--;
 913                 cnt++;
 914         }
 915         *to = 0;
 916         return cnt;
 917 }
 918
 919 static int
 920 pg_latin1_mblen(const unsigned char *s)
 921 {
 922         return 1;
 923 }
 924
 925 static int
 926 pg_latin1_dsplen(const unsigned char *s)
 927 {
 928         return pg_ascii_dsplen(s);
 929 }
 930
 931 /*
 932  * SJIS
 933  */
 934 static int
 935 pg_sjis_mblen(const unsigned char *s)
 936 {
 937         int                     len;
 938
 939         if (*s >= 0xa1 && *s <= 0xdf)
 940                 len = 1;                                /* 1 byte kana? */
 941         else if (IS_HIGHBIT_SET(*s))
 942                 len = 2;                                /* kanji? */
 943         else
 944                 len = 1;                                /* should be ASCII */
 945         return len;
 946 }
 947
 948 static int
 949 pg_sjis_dsplen(const unsigned char *s)
 950 {
 951         int                     len;
 952
 953         if (*s >= 0xa1 && *s <= 0xdf)
 954                 len = 1;                                /* 1 byte kana? */
 955         else if (IS_HIGHBIT_SET(*s))
 956                 len = 2;                                /* kanji? */
 957         else
 958                 len = pg_ascii_dsplen(s);       /* should be ASCII */
 959         return len;
 960 }
 961
 962 /*
 963  * Big5
 964  */
 965 static int
 966 pg_big5_mblen(const unsigned char *s)
 967 {
 968         int                     len;
 969
 970         if (IS_HIGHBIT_SET(*s))
 971                 len = 2;                                /* kanji? */
 972         else
 973                 len = 1;                                /* should be ASCII */
 974         return len;
 975 }
 976
 977 static int
 978 pg_big5_dsplen(const unsigned char *s)
 979 {
 980         int                     len;
 981
 982         if (IS_HIGHBIT_SET(*s))
 983                 len = 2;                                /* kanji? */
 984         else
 985                 len = pg_ascii_dsplen(s);       /* should be ASCII */
 986         return len;
 987 }
 988
 989 /*
 990  * GBK
 991  */
 992 static int
 993 pg_gbk_mblen(const unsigned char *s)
 994 {
 995         int                     len;
 996
 997         if (IS_HIGHBIT_SET(*s))
 998                 len = 2;                                /* kanji? */
 999         else
1000                 len = 1;                                /* should be ASCII */
1001         return len;
1002 }
1003
/*
 * Display width of the GBK character starting at *s: 2 when the first
 * byte has its high bit set (kanji?), otherwise whatever the ASCII routine
 * reports.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return pg_ascii_dsplen(s);	/* should be ASCII */

	return 2;
}
1015
1016 /*
1017  * UHC
1018  */
/*
 * Byte length of the UHC character starting at *s: 2 when the first byte
 * has its high bit set, otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1030
/*
 * Display width of the UHC character starting at *s: 2 when the first
 * byte has its high bit set, otherwise whatever the ASCII routine reports.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return pg_ascii_dsplen(s);	/* should be ASCII */

	return 2;
}
1042
1043 /*
1044  * GB18030
1045  *      Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1046  */
1047
1048 /*
1049  * Unlike all other mblen() functions, this also looks at the second byte of
1050  * the input.  However, if you only pass the first byte of a multi-byte
1051  * string, and \0 as the second byte, this still works in a predictable way:
1052  * a 4-byte character will be reported as two 2-byte characters.  That's
1053  * enough for all current uses, as a client-only encoding.  It works that
1054  * way, because in any valid 4-byte GB18030-encoded character, the third and
1055  * fourth byte look like a 2-byte encoded character, when looked at
1056  * separately.
1057  */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	/*
	 * The second byte is inspected only for non-ASCII lead bytes: a value in
	 * 0x30..0x39 there means a 4-byte character, anything else 2 bytes.
	 */
	return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1071
/*
 * Display width of the GB18030 character starting at *s: 2 when the first
 * byte has its high bit set, otherwise whatever the ASCII routine reports.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return pg_ascii_dsplen(s);	/* ASCII */

	return 2;
}
1083
1084 /*
1085  *-------------------------------------------------------------------
1086  * multibyte sequence validators
1087  *
1088  * These functions accept "s", a pointer to the first byte of a string,
1089  * and "len", the remaining length of the string.  If there is a validly
1090  * encoded character beginning at *s, return its length in bytes; else
1091  * return -1.
1092  *
1093  * The functions can assume that len > 0 and that *s != '\0', but they must
1094  * test for and reject zeroes in any additional bytes of a multibyte character.
1095  *
1096  * Note that this definition allows the function for a single-byte
1097  * encoding to be just "return 1".
1098  *-------------------------------------------------------------------
1099  */
1100
/*
 * Verifier for SQL_ASCII: accepts whatever byte is at *s and reports a
 * length of 1.  Neither argument is examined.
 */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1106
1107 #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
1108
1109 static int
1110 pg_eucjp_verifier(const unsigned char *s, int len)
1111 {
1112         int                     l;
1113         unsigned char c1,
1114                                 c2;
1115
1116         c1 = *s++;
1117
1118         switch (c1)
1119         {
1120                 case SS2:                               /* JIS X 0201 */
1121                         l = 2;
1122                         if (l > len)
1123                                 return -1;
1124                         c2 = *s++;
1125                         if (c2 < 0xa1 || c2 > 0xdf)
1126                                 return -1;
1127                         break;
1128
1129                 case SS3:                               /* JIS X 0212 */
1130                         l = 3;
1131                         if (l > len)
1132                                 return -1;
1133                         c2 = *s++;
1134                         if (!IS_EUC_RANGE_VALID(c2))
1135                                 return -1;
1136                         c2 = *s++;
1137                         if (!IS_EUC_RANGE_VALID(c2))
1138                                 return -1;
1139                         break;
1140
1141                 default:
1142                         if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1143                         {
1144                                 l = 2;
1145                                 if (l > len)
1146                                         return -1;
1147                                 if (!IS_EUC_RANGE_VALID(c1))
1148                                         return -1;
1149                                 c2 = *s++;
1150                                 if (!IS_EUC_RANGE_VALID(c2))
1151                                         return -1;
1152                         }
1153                         else
1154                                 /* must be ASCII */
1155                         {
1156                                 l = 1;
1157                         }
1158                         break;
1159         }
1160
1161         return l;
1162 }
1163
/*
 * Verify one EUC_KR character starting at *s, with len bytes available.
 * Returns 1 for a byte without the high bit set, 2 for a pair of bytes that
 * both lie in the EUC range, and -1 otherwise.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
	const unsigned char lead = s[0];

	/* must be ASCII */
	if (!IS_HIGHBIT_SET(lead))
		return 1;

	if (len < 2)
		return -1;

	if (!IS_EUC_RANGE_VALID(lead) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1192
/* EUC-CN byte sequences are exactly the same as EUC-KR */
1194 #define pg_euccn_verifier       pg_euckr_verifier
1195
1196 static int
1197 pg_euctw_verifier(const unsigned char *s, int len)
1198 {
1199         int                     l;
1200         unsigned char c1,
1201                                 c2;
1202
1203         c1 = *s++;
1204
1205         switch (c1)
1206         {
1207                 case SS2:                               /* CNS 11643 Plane 1-7 */
1208                         l = 4;
1209                         if (l > len)
1210                                 return -1;
1211                         c2 = *s++;
1212                         if (c2 < 0xa1 || c2 > 0xa7)
1213                                 return -1;
1214                         c2 = *s++;
1215                         if (!IS_EUC_RANGE_VALID(c2))
1216                                 return -1;
1217                         c2 = *s++;
1218                         if (!IS_EUC_RANGE_VALID(c2))
1219                                 return -1;
1220                         break;
1221
1222                 case SS3:                               /* unused */
1223                         return -1;
1224
1225                 default:
1226                         if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1227                         {
1228                                 l = 2;
1229                                 if (l > len)
1230                                         return -1;
1231                                 /* no further range check on c1? */
1232                                 c2 = *s++;
1233                                 if (!IS_EUC_RANGE_VALID(c2))
1234                                         return -1;
1235                         }
1236                         else
1237                                 /* must be ASCII */
1238                         {
1239                                 l = 1;
1240                         }
1241                         break;
1242         }
1243         return l;
1244 }
1245
/*
 * Verify one JOHAB character starting at *s, with len bytes available.
 * The length comes from pg_johab_mblen(); for a non-ASCII lead byte every
 * following byte of the character must lie in the EUC range.
 * Returns the byte length, or -1 on failure.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1269
/*
 * Verify one MULE_INTERNAL character starting at *s, with len bytes
 * available.  The length comes from pg_mule_mblen(); every byte after the
 * first must have its high bit set.  Returns the byte length, or -1.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1290
/*
 * Verifier for single-byte encodings: accepts whatever byte is at *s and
 * reports a length of 1.  Neither argument is examined.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1296
/*
 * Verify one SJIS character starting at *s, with len bytes available.
 * Single-byte results from pg_sjis_mblen() are accepted as-is; two-byte
 * characters must pass the ISSJISHEAD / ISSJISTAIL tests.
 * Returns the byte length, or -1 on failure.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head,
				tail;

	if (mbl > len)
		return -1;

	/* pg_sjis_mblen already verified it */
	if (mbl == 1)
		return mbl;

	head = s[0];
	tail = s[1];
	if (ISSJISHEAD(head) && ISSJISTAIL(tail))
		return mbl;

	return -1;
}
1319
/*
 * Verify one Big5 character starting at *s, with len bytes available.
 * The only check on trailing bytes is that none of them is a zero byte.
 * Returns the byte length from pg_big5_mblen(), or -1 on failure.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1339
/*
 * Verify one GBK character starting at *s, with len bytes available.
 * The only check on trailing bytes is that none of them is a zero byte.
 * Returns the byte length from pg_gbk_mblen(), or -1 on failure.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1359
/*
 * Verify one UHC character starting at *s, with len bytes available.
 * The only check on trailing bytes is that none of them is a zero byte.
 * Returns the byte length from pg_uhc_mblen(), or -1 on failure.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1379
/*
 * Verify one GB18030 character starting at *s, with len bytes available.
 * Returns 1, 2 or 4 for a well-formed character, or -1 otherwise.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] < 0x81 || s[0] > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate */
		const unsigned char second = s[1];

		if (second >= 0x40 && second <= 0x7e)
			return 2;
		if (second >= 0x80 && second <= 0xfe)
			return 2;
		return -1;
	}

	return -1;
}
1410
/*
 * Verify one UTF-8 character starting at *s, with len bytes available.
 * The length comes from pg_utf_mblen(); the bytes must then satisfy
 * pg_utf8_islegal().  Returns the byte length, or -1 on failure.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_utf_mblen(s);

	/* the length test must come first so islegal never reads past len */
	if (mbl > len || !pg_utf8_islegal(s, mbl))
		return -1;

	return mbl;
}
1424
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This follows the rules in RFC3629.  The second byte gets a tighter range
 * for certain lead bytes so that no Unicode code point has more than one
 * encoding: a longer-than-necessary sequence with high-order zero bits
 * is rejected.  Allowing such sequences would be a security hazard (eg, an
 * apparent non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 *
 * Returns true if the sequence is legal, false otherwise.  Any length
 * outside 1..4 is rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char lo;
	unsigned char hi;
	int			i;

	/* reject lengths 5 and 6 for now (and anything else out of range) */
	if (length < 1 || length > 4)
		return false;

	lead = source[0];

	/* third and fourth bytes, if present, are plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	/* second byte: the permitted range depends on the lead byte */
	if (length >= 2)
	{
		if (lead == 0xE0)
		{
			lo = 0xA0;
			hi = 0xBF;
		}
		else if (lead == 0xED)
		{
			lo = 0x80;
			hi = 0x9F;
		}
		else if (lead == 0xF0)
		{
			lo = 0x90;
			hi = 0xBF;
		}
		else if (lead == 0xF4)
		{
			lo = 0x80;
			hi = 0x8F;
		}
		else
		{
			lo = 0x80;
			hi = 0xBF;
		}

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* lead byte: 0x80..0xC1 and anything above 0xF4 are never legal */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
1495
1496 #ifndef FRONTEND
1497
1498 /*
1499  * Generic character incrementer function.
1500  *
1501  * Not knowing anything about the properties of the encoding in use, we just
1502  * keep incrementing the last byte until we get a validly-encoded result,
1503  * or we run out of values to try.  We don't bother to try incrementing
1504  * higher-order bytes, so there's no growth in runtime for wider characters.
1505  * (If we did try to do that, we'd need to consider the likelihood that 255
1506  * is not a valid final byte in the encoding.)
1507  */
1508 static bool
1509 pg_generic_charinc(unsigned char *charptr, int len)
1510 {
1511         unsigned char *lastbyte = charptr + len - 1;
1512         mbverifier      mbverify;
1513
1514         /* We can just invoke the character verifier directly. */
1515         mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
1516
1517         while (*lastbyte < (unsigned char) 255)
1518         {
1519                 (*lastbyte)++;
1520                 if ((*mbverify) (charptr, len) == len)
1521                         return true;
1522         }
1523
1524         return false;
1525 }
1526
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  Working from
 * the last byte towards the first, we increment the first byte found that
 * is not already at its maximum value.  If no byte is below its maximum, we
 * simply fail.  The second byte has a lower maximum after lead bytes 0xED
 * (to skip the surrogate-pair region, which should not occur in valid
 * UTF-8) and 0xF4.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 *
 * Returns true if the character was incremented in place, false otherwise.
 * Any length outside 1..4 fails.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char lead;
	unsigned char limit;
	int			pos;

	/* reject lengths 5 and 6 for now (and anything else out of range) */
	if (length < 1 || length > 4)
		return false;

	/* fourth and third bytes: plain continuation bytes, maximum 0xBF */
	for (pos = length - 1; pos >= 2; pos--)
	{
		if (charptr[pos] < 0xBF)
		{
			charptr[pos]++;
			return true;
		}
	}

	lead = charptr[0];

	/* second byte: its maximum depends on the lead byte */
	if (length >= 2)
	{
		if (lead == 0xED)
			limit = 0x9F;
		else if (lead == 0xF4)
			limit = 0x8F;
		else
			limit = 0xBF;

		if (charptr[1] < limit)
		{
			charptr[1]++;
			return true;
		}
	}

	/* lead byte: these values cannot be incremented */
	if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
		return false;

	charptr[0]++;
	return true;
}
1599
1600 /*
1601  * EUC-JP character incrementer function.
1602  *
1603  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1604  * representing JIS X 0201 characters with the second byte ranging between
1605  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
1606  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1607  *
1608  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1609  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
1610  * is incremented if possible, otherwise the second-to-last byte.
1611  *
1612  * If the sequence starts with a value other than the above and its MSB
1613  * is set, it must be a two-byte sequence representing JIS X 0208 characters
1614  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
1615  * incremented if possible, otherwise the second-to-last byte.
1616  *
1617  * Otherwise, the sequence is a single-byte ASCII character. It is
1618  * incremented up to 0x7f.
1619  */
1620 static bool
1621 pg_eucjp_increment(unsigned char *charptr, int length)
1622 {
1623         unsigned char c1,
1624                                 c2;
1625         int                     i;
1626
1627         c1 = *charptr;
1628
1629         switch (c1)
1630         {
1631                 case SS2:                               /* JIS X 0201 */
1632                         if (length != 2)
1633                                 return false;
1634
1635                         c2 = charptr[1];
1636
1637                         if (c2 >= 0xdf)
1638                                 charptr[0] = charptr[1] = 0xa1;
1639                         else if (c2 < 0xa1)
1640                                 charptr[1] = 0xa1;
1641                         else
1642                                 charptr[1]++;
1643                         break;
1644
1645                 case SS3:                               /* JIS X 0212 */
1646                         if (length != 3)
1647                                 return false;
1648
1649                         for (i = 2; i > 0; i--)
1650                         {
1651                                 c2 = charptr[i];
1652                                 if (c2 < 0xa1)
1653                                 {
1654                                         charptr[i] = 0xa1;
1655                                         return true;
1656                                 }
1657                                 else if (c2 < 0xfe)
1658                                 {
1659                                         charptr[i]++;
1660                                         return true;
1661                                 }
1662                         }
1663
1664                         /* Out of 3-byte code region */
1665                         return false;
1666
1667                 default:
1668                         if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1669                         {
1670                                 if (length != 2)
1671                                         return false;
1672
1673                                 for (i = 1; i >= 0; i--)
1674                                 {
1675                                         c2 = charptr[i];
1676                                         if (c2 < 0xa1)
1677                                         {
1678                                                 charptr[i] = 0xa1;
1679                                                 return true;
1680                                         }
1681                                         else if (c2 < 0xfe)
1682                                         {
1683                                                 charptr[i]++;
1684                                                 return true;
1685                                         }
1686                                 }
1687
1688                                 /* Out of 2 byte code region */
1689                                 return false;
1690                         }
1691                         else
1692                         {                                       /* ASCII, single byte */
1693                                 if (c1 > 0x7e)
1694                                         return false;
1695                                 (*charptr)++;
1696                         }
1697                         break;
1698         }
1699
1700         return true;
1701 }
1702 #endif                                                  /* !FRONTEND */
1703
1704
1705 /*
1706  *-------------------------------------------------------------------
1707  * encoding info table
1708  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
1709  *-------------------------------------------------------------------
1710  */
1711 const pg_wchar_tbl pg_wchar_table[] = {
1712         {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
1713         {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},        /* PG_EUC_JP */
1714         {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2},        /* PG_EUC_CN */
1715         {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},        /* PG_EUC_KR */
1716         {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4},        /* PG_EUC_TW */
1717         {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},        /* PG_EUC_JIS_2004 */
1718         {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},       /* PG_UTF8 */
1719         {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4},   /* PG_MULE_INTERNAL */
1720         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
1721         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
1722         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
1723         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
1724         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
1725         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
1726         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
1727         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
1728         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
1729         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
1730         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
1731         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
1732         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
1733         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
1734         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
1735         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
1736         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
1737         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
1738         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
1739         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
1740         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
1741         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
1742         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
1743         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
1744         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
1745         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
1746         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
1747         {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
1748         {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
1749         {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},        /* PG_GBK */
1750         {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},        /* PG_UHC */
1751         {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4},    /* PG_GB18030 */
1752         {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},  /* PG_JOHAB */
1753         {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}      /* PG_SHIFT_JIS_2004 */
1754 };
1755
/*
 * Returns the byte length of a character in mule internal code.
 * Exported wrapper around pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	const int	nbytes = pg_mule_mblen(mbstr);

	return nbytes;
}
1762
1763 /*
1764  * Returns the byte length of a multibyte character.
1765  */
1766 int
1767 pg_encoding_mblen(int encoding, const char *mbstr)
1768 {
1769         return (PG_VALID_ENCODING(encoding) ?
1770                         pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1771                         pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1772 }
1773
1774 /*
1775  * Returns the display length of a multibyte character.
1776  */
1777 int
1778 pg_encoding_dsplen(int encoding, const char *mbstr)
1779 {
1780         return (PG_VALID_ENCODING(encoding) ?
1781                         pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1782                         pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1783 }
1784
1785 /*
1786  * Verify the first multibyte character of the given string.
1787  * Return its byte length if good, -1 if bad.  (See comments above for
1788  * full details of the mbverify API.)
1789  */
1790 int
1791 pg_encoding_verifymb(int encoding, const char *mbstr, int len)
1792 {
1793         return (PG_VALID_ENCODING(encoding) ?
1794                         pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
1795                         pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
1796 }
1797
1798 /*
1799  * fetch maximum length of a given encoding
1800  */
1801 int
1802 pg_encoding_max_length(int encoding)
1803 {
1804         Assert(PG_VALID_ENCODING(encoding));
1805
1806         return pg_wchar_table[encoding].maxmblen;
1807 }
1808
1809 #ifndef FRONTEND
1810
1811 /*
1812  * fetch maximum length of the encoding for the current database
1813  */
1814 int
1815 pg_database_encoding_max_length(void)
1816 {
1817         return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1818 }
1819
1820 /*
1821  * get the character incrementer for the encoding for the current database
1822  */
1823 mbcharacter_incrementer
1824 pg_database_encoding_character_incrementer(void)
1825 {
1826         /*
1827          * Eventually it might be best to add a field to pg_wchar_table[], but for
1828          * now we just use a switch.
1829          */
1830         switch (GetDatabaseEncoding())
1831         {
1832                 case PG_UTF8:
1833                         return pg_utf8_increment;
1834
1835                 case PG_EUC_JP:
1836                         return pg_eucjp_increment;
1837
1838                 default:
1839                         return pg_generic_charinc;
1840         }
1841 }
1842
/*
 * pg_verifymbstr: check that mbstr (len bytes) is validly encoded in the
 * current database encoding.
 *
 * Apart from taking the encoding from GetDatabaseEncoding(), this behaves
 * like pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			nchars;

	nchars = pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError);

	/* pg_verify_mbstr_len() signals failure with a negative result */
	return nchars >= 0;
}
1853
/*
 * pg_verify_mbstr: check that mbstr (len bytes) is validly encoded in the
 * specified encoding.
 *
 * Returns true if pg_verify_mbstr_len() accepts the string, false if it
 * reports failure (which it only does by return value when noError is
 * true).
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	int			nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

	/* a negative result means the string was rejected */
	return nchars >= 0;
}
1863
1864 /*
1865  * Verify mbstr to make sure that it is validly encoded in the specified
1866  * encoding.
1867  *
1868  * mbstr is not necessarily zero terminated; length of mbstr is
1869  * specified by len.
1870  *
1871  * If OK, return length of string in the encoding.
1872  * If a problem is found, return -1 when noError is
1873  * true; when noError is false, ereport() a descriptive message.
1874  */
1875 int
1876 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1877 {
1878         mbverifier      mbverify;
1879         int                     mb_len;
1880
1881         Assert(PG_VALID_ENCODING(encoding));
1882
1883         /*
1884          * In single-byte encodings, we need only reject nulls (\0).
1885          */
1886         if (pg_encoding_max_length(encoding) <= 1)
1887         {
1888                 const char *nullpos = memchr(mbstr, 0, len);
1889
1890                 if (nullpos == NULL)
1891                         return len;
1892                 if (noError)
1893                         return -1;
1894                 report_invalid_encoding(encoding, nullpos, 1);
1895         }
1896
1897         /* fetch function pointer just once */
1898         mbverify = pg_wchar_table[encoding].mbverify;
1899
1900         mb_len = 0;
1901
1902         while (len > 0)
1903         {
1904                 int                     l;
1905
1906                 /* fast path for ASCII-subset characters */
1907                 if (!IS_HIGHBIT_SET(*mbstr))
1908                 {
1909                         if (*mbstr != '\0')
1910                         {
1911                                 mb_len++;
1912                                 mbstr++;
1913                                 len--;
1914                                 continue;
1915                         }
1916                         if (noError)
1917                                 return -1;
1918                         report_invalid_encoding(encoding, mbstr, len);
1919                 }
1920
1921                 l = (*mbverify) ((const unsigned char *) mbstr, len);
1922
1923                 if (l < 0)
1924                 {
1925                         if (noError)
1926                                 return -1;
1927                         report_invalid_encoding(encoding, mbstr, len);
1928                 }
1929
1930                 mbstr += l;
1931                 len -= l;
1932                 mb_len++;
1933         }
1934         return mb_len;
1935 }
1936
1937 /*
1938  * check_encoding_conversion_args: check arguments of a conversion function
1939  *
1940  * "expected" arguments can be either an encoding ID or -1 to indicate that
1941  * the caller will check whether it accepts the ID.
1942  *
1943  * Note: the errors here are not really user-facing, so elog instead of
1944  * ereport seems sufficient.  Also, we trust that the "expected" encoding
1945  * arguments are valid encoding IDs, but we don't trust the actuals.
1946  */
1947 void
1948 check_encoding_conversion_args(int src_encoding,
1949                                                            int dest_encoding,
1950                                                            int len,
1951                                                            int expected_src_encoding,
1952                                                            int expected_dest_encoding)
1953 {
1954         if (!PG_VALID_ENCODING(src_encoding))
1955                 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1956         if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1957                 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1958                          pg_enc2name_tbl[expected_src_encoding].name,
1959                          pg_enc2name_tbl[src_encoding].name);
1960         if (!PG_VALID_ENCODING(dest_encoding))
1961                 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1962         if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1963                 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1964                          pg_enc2name_tbl[expected_dest_encoding].name,
1965                          pg_enc2name_tbl[dest_encoding].name);
1966         if (len < 0)
1967                 elog(ERROR, "encoding conversion length must not be negative");
1968 }
1969
1970 /*
1971  * report_invalid_encoding: complain about invalid multibyte character
1972  *
1973  * note: len is remaining length of string, not length of character;
1974  * len must be greater than zero, as we always examine the first byte.
1975  */
1976 void
1977 report_invalid_encoding(int encoding, const char *mbstr, int len)
1978 {
1979         int                     l = pg_encoding_mblen(encoding, mbstr);
1980         char            buf[8 * 5 + 1];
1981         char       *p = buf;
1982         int                     j,
1983                                 jlimit;
1984
1985         jlimit = Min(l, len);
1986         jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
1987
1988         for (j = 0; j < jlimit; j++)
1989         {
1990                 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1991                 if (j < jlimit - 1)
1992                         p += sprintf(p, " ");
1993         }
1994
1995         ereport(ERROR,
1996                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1997                          errmsg("invalid byte sequence for encoding \"%s\": %s",
1998                                         pg_enc2name_tbl[encoding].name,
1999                                         buf)));
2000 }
2001
2002 /*
2003  * report_untranslatable_char: complain about untranslatable character
2004  *
2005  * note: len is remaining length of string, not length of character;
2006  * len must be greater than zero, as we always examine the first byte.
2007  */
2008 void
2009 report_untranslatable_char(int src_encoding, int dest_encoding,
2010                                                    const char *mbstr, int len)
2011 {
2012         int                     l = pg_encoding_mblen(src_encoding, mbstr);
2013         char            buf[8 * 5 + 1];
2014         char       *p = buf;
2015         int                     j,
2016                                 jlimit;
2017
2018         jlimit = Min(l, len);
2019         jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
2020
2021         for (j = 0; j < jlimit; j++)
2022         {
2023                 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2024                 if (j < jlimit - 1)
2025                         p += sprintf(p, " ");
2026         }
2027
2028         ereport(ERROR,
2029                         (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
2030                          errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
2031                                         buf,
2032                                         pg_enc2name_tbl[src_encoding].name,
2033                                         pg_enc2name_tbl[dest_encoding].name)));
2034 }
2035
2036 #endif                                                  /* !FRONTEND */