src/backend/utils/mb/conv.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  *        Utility functions for conversion procs.
   4  *
   5  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
   6  * Portions Copyright (c) 1994, Regents of the University of California
   7  *
   8  * IDENTIFICATION
   9  *        src/backend/utils/mb/conv.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #include "postgres.h"
  14 #include "mb/pg_wchar.h"
  15
  16
  17 /*
  18  * local2local: a generic single byte charset encoding
  19  * conversion between two ASCII-superset encodings.
  20  *
  21  * l points to the source string of length len
  22  * p is the output area (must be large enough!)
  23  * src_encoding is the PG identifier for the source encoding
  24  * dest_encoding is the PG identifier for the target encoding
  25  * tab holds conversion entries for the source charset
  26  * starting from 128 (0x80). each entry in the table holds the corresponding
  27  * code point for the target charset, or 0 if there is no equivalent code.
  28  *
  29  * Returns the number of input bytes consumed.  If noError is true, this can
  30  * be less than 'len'.
  31  */
  32 int
  33 local2local(const unsigned char *l,
  34                         unsigned char *p,
  35                         int len,
  36                         int src_encoding,
  37                         int dest_encoding,
  38                         const unsigned char *tab,
  39                         bool noError)
  40 {
  41         const unsigned char *start = l;
  42         unsigned char c1,
  43                                 c2;
  44
  45         while (len > 0)
  46         {
  47                 c1 = *l;
  48                 if (c1 == 0)
  49                 {
  50                         if (noError)
  51                                 break;
  52                         report_invalid_encoding(src_encoding, (const char *) l, len);
  53                 }
  54                 if (!IS_HIGHBIT_SET(c1))
  55                         *p++ = c1;
  56                 else
  57                 {
  58                         c2 = tab[c1 - HIGHBIT];
  59                         if (c2)
  60                                 *p++ = c2;
  61                         else
  62                         {
  63                                 if (noError)
  64                                         break;
  65                                 report_untranslatable_char(src_encoding, dest_encoding,
  66                                                                                    (const char *) l, len);
  67                         }
  68                 }
  69                 l++;
  70                 len--;
  71         }
  72         *p = '\0';
  73
  74         return l - start;
  75 }
  76
  77 /*
  78  * LATINn ---> MIC when the charset's local codes map directly to MIC
  79  *
  80  * l points to the source string of length len
  81  * p is the output area (must be large enough!)
  82  * lc is the mule character set id for the local encoding
  83  * encoding is the PG identifier for the local encoding
  84  *
  85  * Returns the number of input bytes consumed.  If noError is true, this can
  86  * be less than 'len'.
  87  */
  88 int
  89 latin2mic(const unsigned char *l, unsigned char *p, int len,
  90                   int lc, int encoding, bool noError)
  91 {
  92         const unsigned char *start = l;
  93         int                     c1;
  94
  95         while (len > 0)
  96         {
  97                 c1 = *l;
  98                 if (c1 == 0)
  99                 {
 100                         if (noError)
 101                                 break;
 102                         report_invalid_encoding(encoding, (const char *) l, len);
 103                 }
 104                 if (IS_HIGHBIT_SET(c1))
 105                         *p++ = lc;
 106                 *p++ = c1;
 107                 l++;
 108                 len--;
 109         }
 110         *p = '\0';
 111
 112         return l - start;
 113 }
 114
 115 /*
 116  * MIC ---> LATINn when the charset's local codes map directly to MIC
 117  *
 118  * mic points to the source string of length len
 119  * p is the output area (must be large enough!)
 120  * lc is the mule character set id for the local encoding
 121  * encoding is the PG identifier for the local encoding
 122  *
 123  * Returns the number of input bytes consumed.  If noError is true, this can
 124  * be less than 'len'.
 125  */
 126 int
 127 mic2latin(const unsigned char *mic, unsigned char *p, int len,
 128                   int lc, int encoding, bool noError)
 129 {
 130         const unsigned char *start = mic;
 131         int                     c1;
 132
 133         while (len > 0)
 134         {
 135                 c1 = *mic;
 136                 if (c1 == 0)
 137                 {
 138                         if (noError)
 139                                 break;
 140                         report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
 141                 }
 142                 if (!IS_HIGHBIT_SET(c1))
 143                 {
 144                         /* easy for ASCII */
 145                         *p++ = c1;
 146                         mic++;
 147                         len--;
 148                 }
 149                 else
 150                 {
 151                         int                     l = pg_mule_mblen(mic);
 152
 153                         if (len < l)
 154                         {
 155                                 if (noError)
 156                                         break;
 157                                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
 158                                                                                 len);
 159                         }
 160                         if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
 161                         {
 162                                 if (noError)
 163                                         break;
 164                                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
 165                                                                                    (const char *) mic, len);
 166                         }
 167                         *p++ = mic[1];
 168                         mic += 2;
 169                         len -= 2;
 170                 }
 171         }
 172         *p = '\0';
 173
 174         return mic - start;
 175 }
 176
 177
 178 /*
 179  * latin2mic_with_table: a generic single byte charset encoding
 180  * conversion from a local charset to the mule internal code.
 181  *
 182  * l points to the source string of length len
 183  * p is the output area (must be large enough!)
 184  * lc is the mule character set id for the local encoding
 185  * encoding is the PG identifier for the local encoding
 186  * tab holds conversion entries for the local charset
 187  * starting from 128 (0x80). each entry in the table holds the corresponding
 188  * code point for the mule encoding, or 0 if there is no equivalent code.
 189  *
 190  * Returns the number of input bytes consumed.  If noError is true, this can
 191  * be less than 'len'.
 192  */
 193 int
 194 latin2mic_with_table(const unsigned char *l,
 195                                          unsigned char *p,
 196                                          int len,
 197                                          int lc,
 198                                          int encoding,
 199                                          const unsigned char *tab,
 200                                          bool noError)
 201 {
 202         const unsigned char *start = l;
 203         unsigned char c1,
 204                                 c2;
 205
 206         while (len > 0)
 207         {
 208                 c1 = *l;
 209                 if (c1 == 0)
 210                 {
 211                         if (noError)
 212                                 break;
 213                         report_invalid_encoding(encoding, (const char *) l, len);
 214                 }
 215                 if (!IS_HIGHBIT_SET(c1))
 216                         *p++ = c1;
 217                 else
 218                 {
 219                         c2 = tab[c1 - HIGHBIT];
 220                         if (c2)
 221                         {
 222                                 *p++ = lc;
 223                                 *p++ = c2;
 224                         }
 225                         else
 226                         {
 227                                 if (noError)
 228                                         break;
 229                                 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
 230                                                                                    (const char *) l, len);
 231                         }
 232                 }
 233                 l++;
 234                 len--;
 235         }
 236         *p = '\0';
 237
 238         return l - start;
 239 }
 240
 241 /*
 242  * mic2latin_with_table: a generic single byte charset encoding
 243  * conversion from the mule internal code to a local charset.
 244  *
 245  * mic points to the source string of length len
 246  * p is the output area (must be large enough!)
 247  * lc is the mule character set id for the local encoding
 248  * encoding is the PG identifier for the local encoding
 249  * tab holds conversion entries for the mule internal code's second byte,
 250  * starting from 128 (0x80). each entry in the table holds the corresponding
 251  * code point for the local charset, or 0 if there is no equivalent code.
 252  *
 253  * Returns the number of input bytes consumed.  If noError is true, this can
 254  * be less than 'len'.
 255  */
 256 int
 257 mic2latin_with_table(const unsigned char *mic,
 258                                          unsigned char *p,
 259                                          int len,
 260                                          int lc,
 261                                          int encoding,
 262                                          const unsigned char *tab,
 263                                          bool noError)
 264 {
 265         const unsigned char *start = mic;
 266         unsigned char c1,
 267                                 c2;
 268
 269         while (len > 0)
 270         {
 271                 c1 = *mic;
 272                 if (c1 == 0)
 273                 {
 274                         if (noError)
 275                                 break;
 276                         report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
 277                 }
 278                 if (!IS_HIGHBIT_SET(c1))
 279                 {
 280                         /* easy for ASCII */
 281                         *p++ = c1;
 282                         mic++;
 283                         len--;
 284                 }
 285                 else
 286                 {
 287                         int                     l = pg_mule_mblen(mic);
 288
 289                         if (len < l)
 290                         {
 291                                 if (noError)
 292                                         break;
 293                                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
 294                                                                                 len);
 295                         }
 296                         if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
 297                                 (c2 = tab[mic[1] - HIGHBIT]) == 0)
 298                         {
 299                                 if (noError)
 300                                         break;
 301                                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
 302                                                                                    (const char *) mic, len);
 303                                 break;                  /* keep compiler quiet */
 304                         }
 305                         *p++ = c2;
 306                         mic += 2;
 307                         len -= 2;
 308                 }
 309         }
 310         *p = '\0';
 311
 312         return mic - start;
 313 }
 314
 315 /*
 316  * comparison routine for bsearch()
 317  * this routine is intended for combined UTF8 -> local code
 318  */
 319 static int
 320 compare3(const void *p1, const void *p2)
 321 {
 322         uint32          s1,
 323                                 s2,
 324                                 d1,
 325                                 d2;
 326
 327         s1 = *(const uint32 *) p1;
 328         s2 = *((const uint32 *) p1 + 1);
 329         d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
 330         d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
 331         return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
 332 }
 333
 334 /*
 335  * comparison routine for bsearch()
 336  * this routine is intended for local code -> combined UTF8
 337  */
 338 static int
 339 compare4(const void *p1, const void *p2)
 340 {
 341         uint32          v1,
 342                                 v2;
 343
 344         v1 = *(const uint32 *) p1;
 345         v2 = ((const pg_local_to_utf_combined *) p2)->code;
 346         return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
 347 }
 348
 349 /*
 350  * store 32bit character representation into multibyte stream
 351  */
 352 static inline unsigned char *
 353 store_coded_char(unsigned char *dest, uint32 code)
 354 {
 355         if (code & 0xff000000)
 356                 *dest++ = code >> 24;
 357         if (code & 0x00ff0000)
 358                 *dest++ = code >> 16;
 359         if (code & 0x0000ff00)
 360                 *dest++ = code >> 8;
 361         if (code & 0x000000ff)
 362                 *dest++ = code;
 363         return dest;
 364 }
 365
 366 /*
 367  * Convert a character using a conversion radix tree.
 368  *
 369  * 'l' is the length of the input character in bytes, and b1-b4 are
 370  * the input character's bytes.
 371  */
 372 static inline uint32
 373 pg_mb_radix_conv(const pg_mb_radix_tree *rt,
 374                                  int l,
 375                                  unsigned char b1,
 376                                  unsigned char b2,
 377                                  unsigned char b3,
 378                                  unsigned char b4)
 379 {
 380         if (l == 4)
 381         {
 382                 /* 4-byte code */
 383
 384                 /* check code validity */
 385                 if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
 386                         b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
 387                         b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
 388                         b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
 389                         return 0;
 390
 391                 /* perform lookup */
 392                 if (rt->chars32)
 393                 {
 394                         uint32          idx = rt->b4root;
 395
 396                         idx = rt->chars32[b1 + idx - rt->b4_1_lower];
 397                         idx = rt->chars32[b2 + idx - rt->b4_2_lower];
 398                         idx = rt->chars32[b3 + idx - rt->b4_3_lower];
 399                         return rt->chars32[b4 + idx - rt->b4_4_lower];
 400                 }
 401                 else
 402                 {
 403                         uint16          idx = rt->b4root;
 404
 405                         idx = rt->chars16[b1 + idx - rt->b4_1_lower];
 406                         idx = rt->chars16[b2 + idx - rt->b4_2_lower];
 407                         idx = rt->chars16[b3 + idx - rt->b4_3_lower];
 408                         return rt->chars16[b4 + idx - rt->b4_4_lower];
 409                 }
 410         }
 411         else if (l == 3)
 412         {
 413                 /* 3-byte code */
 414
 415                 /* check code validity */
 416                 if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
 417                         b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
 418                         b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
 419                         return 0;
 420
 421                 /* perform lookup */
 422                 if (rt->chars32)
 423                 {
 424                         uint32          idx = rt->b3root;
 425
 426                         idx = rt->chars32[b2 + idx - rt->b3_1_lower];
 427                         idx = rt->chars32[b3 + idx - rt->b3_2_lower];
 428                         return rt->chars32[b4 + idx - rt->b3_3_lower];
 429                 }
 430                 else
 431                 {
 432                         uint16          idx = rt->b3root;
 433
 434                         idx = rt->chars16[b2 + idx - rt->b3_1_lower];
 435                         idx = rt->chars16[b3 + idx - rt->b3_2_lower];
 436                         return rt->chars16[b4 + idx - rt->b3_3_lower];
 437                 }
 438         }
 439         else if (l == 2)
 440         {
 441                 /* 2-byte code */
 442
 443                 /* check code validity - first byte */
 444                 if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
 445                         b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
 446                         return 0;
 447
 448                 /* perform lookup */
 449                 if (rt->chars32)
 450                 {
 451                         uint32          idx = rt->b2root;
 452
 453                         idx = rt->chars32[b3 + idx - rt->b2_1_lower];
 454                         return rt->chars32[b4 + idx - rt->b2_2_lower];
 455                 }
 456                 else
 457                 {
 458                         uint16          idx = rt->b2root;
 459
 460                         idx = rt->chars16[b3 + idx - rt->b2_1_lower];
 461                         return rt->chars16[b4 + idx - rt->b2_2_lower];
 462                 }
 463         }
 464         else if (l == 1)
 465         {
 466                 /* 1-byte code */
 467
 468                 /* check code validity - first byte */
 469                 if (b4 < rt->b1_lower || b4 > rt->b1_upper)
 470                         return 0;
 471
 472                 /* perform lookup */
 473                 if (rt->chars32)
 474                         return rt->chars32[b4 + rt->b1root - rt->b1_lower];
 475                 else
 476                         return rt->chars16[b4 + rt->b1root - rt->b1_lower];
 477         }
 478         return 0;                                       /* shouldn't happen */
 479 }
 480
 481 /*
 482  * UTF8 ---> local code
 483  *
 484  * utf: input string in UTF8 encoding (need not be null-terminated)
 485  * len: length of input string (in bytes)
 486  * iso: pointer to the output area (must be large enough!)
 487                   (output string will be null-terminated)
 488  * map: conversion map for single characters
 489  * cmap: conversion map for combined characters
 490  *                (optional, pass NULL if none)
 491  * cmapsize: number of entries in the conversion map for combined characters
 492  *                (optional, pass 0 if none)
 493  * conv_func: algorithmic encoding conversion function
 494  *                (optional, pass NULL if none)
 495  * encoding: PG identifier for the local encoding
 496  *
 497  * For each character, the cmap (if provided) is consulted first; if no match,
 498  * the map is consulted next; if still no match, the conv_func (if provided)
 499  * is applied.  An error is raised if no match is found.
 500  *
 501  * See pg_wchar.h for more details about the data structures used here.
 502  *
 503  * Returns the number of input bytes consumed.  If noError is true, this can
 504  * be less than 'len'.
 505  */
 506 int
 507 UtfToLocal(const unsigned char *utf, int len,
 508                    unsigned char *iso,
 509                    const pg_mb_radix_tree *map,
 510                    const pg_utf_to_local_combined *cmap, int cmapsize,
 511                    utf_local_conversion_func conv_func,
 512                    int encoding, bool noError)
 513 {
 514         uint32          iutf;
 515         int                     l;
 516         const pg_utf_to_local_combined *cp;
 517         const unsigned char *start = utf;
 518
 519         if (!PG_VALID_ENCODING(encoding))
 520                 ereport(ERROR,
 521                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 522                                  errmsg("invalid encoding number: %d", encoding)));
 523
 524         for (; len > 0; len -= l)
 525         {
 526                 unsigned char b1 = 0;
 527                 unsigned char b2 = 0;
 528                 unsigned char b3 = 0;
 529                 unsigned char b4 = 0;
 530
 531                 /* "break" cases all represent errors */
 532                 if (*utf == '\0')
 533                         break;
 534
 535                 l = pg_utf_mblen(utf);
 536                 if (len < l)
 537                         break;
 538
 539                 if (!pg_utf8_islegal(utf, l))
 540                         break;
 541
 542                 if (l == 1)
 543                 {
 544                         /* ASCII case is easy, assume it's one-to-one conversion */
 545                         *iso++ = *utf++;
 546                         continue;
 547                 }
 548
 549                 /* collect coded char of length l */
 550                 if (l == 2)
 551                 {
 552                         b3 = *utf++;
 553                         b4 = *utf++;
 554                 }
 555                 else if (l == 3)
 556                 {
 557                         b2 = *utf++;
 558                         b3 = *utf++;
 559                         b4 = *utf++;
 560                 }
 561                 else if (l == 4)
 562                 {
 563                         b1 = *utf++;
 564                         b2 = *utf++;
 565                         b3 = *utf++;
 566                         b4 = *utf++;
 567                 }
 568                 else
 569                 {
 570                         elog(ERROR, "unsupported character length %d", l);
 571                         iutf = 0;                       /* keep compiler quiet */
 572                 }
 573                 iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
 574
 575                 /* First, try with combined map if possible */
 576                 if (cmap && len > l)
 577                 {
 578                         const unsigned char *utf_save = utf;
 579                         int                     len_save = len;
 580                         int                     l_save = l;
 581
 582                         /* collect next character, same as above */
 583                         len -= l;
 584
 585                         l = pg_utf_mblen(utf);
 586                         if (len < l)
 587                         {
 588                                 /* need more data to decide if this is a combined char */
 589                                 utf -= l_save;
 590                                 break;
 591                         }
 592
 593                         if (!pg_utf8_islegal(utf, l))
 594                         {
 595                                 if (!noError)
 596                                         report_invalid_encoding(PG_UTF8, (const char *) utf, len);
 597                                 utf -= l_save;
 598                                 break;
 599                         }
 600
 601                         /* We assume ASCII character cannot be in combined map */
 602                         if (l > 1)
 603                         {
 604                                 uint32          iutf2;
 605                                 uint32          cutf[2];
 606
 607                                 if (l == 2)
 608                                 {
 609                                         iutf2 = *utf++ << 8;
 610                                         iutf2 |= *utf++;
 611                                 }
 612                                 else if (l == 3)
 613                                 {
 614                                         iutf2 = *utf++ << 16;
 615                                         iutf2 |= *utf++ << 8;
 616                                         iutf2 |= *utf++;
 617                                 }
 618                                 else if (l == 4)
 619                                 {
 620                                         iutf2 = *utf++ << 24;
 621                                         iutf2 |= *utf++ << 16;
 622                                         iutf2 |= *utf++ << 8;
 623                                         iutf2 |= *utf++;
 624                                 }
 625                                 else
 626                                 {
 627                                         elog(ERROR, "unsupported character length %d", l);
 628                                         iutf2 = 0;      /* keep compiler quiet */
 629                                 }
 630
 631                                 cutf[0] = iutf;
 632                                 cutf[1] = iutf2;
 633
 634                                 cp = bsearch(cutf, cmap, cmapsize,
 635                                                          sizeof(pg_utf_to_local_combined), compare3);
 636
 637                                 if (cp)
 638                                 {
 639                                         iso = store_coded_char(iso, cp->code);
 640                                         continue;
 641                                 }
 642                         }
 643
 644                         /* fail, so back up to reprocess second character next time */
 645                         utf = utf_save;
 646                         len = len_save;
 647                         l = l_save;
 648                 }
 649
 650                 /* Now check ordinary map */
 651                 if (map)
 652                 {
 653                         uint32          converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
 654
 655                         if (converted)
 656                         {
 657                                 iso = store_coded_char(iso, converted);
 658                                 continue;
 659                         }
 660                 }
 661
 662                 /* if there's a conversion function, try that */
 663                 if (conv_func)
 664                 {
 665                         uint32          converted = (*conv_func) (iutf);
 666
 667                         if (converted)
 668                         {
 669                                 iso = store_coded_char(iso, converted);
 670                                 continue;
 671                         }
 672                 }
 673
 674                 /* failed to translate this character */
 675                 utf -= l;
 676                 if (noError)
 677                         break;
 678                 report_untranslatable_char(PG_UTF8, encoding,
 679                                                                    (const char *) utf, len);
 680         }
 681
 682         /* if we broke out of loop early, must be invalid input */
 683         if (len > 0 && !noError)
 684                 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
 685
 686         *iso = '\0';
 687
 688         return utf - start;
 689 }
 690
 691 /*
 692  * local code ---> UTF8
 693  *
 694  * iso: input string in local encoding (need not be null-terminated)
 695  * len: length of input string (in bytes)
 696  * utf: pointer to the output area (must be large enough!)
 697                   (output string will be null-terminated)
 698  * map: conversion map for single characters
 699  * cmap: conversion map for combined characters
 700  *                (optional, pass NULL if none)
 701  * cmapsize: number of entries in the conversion map for combined characters
 702  *                (optional, pass 0 if none)
 703  * conv_func: algorithmic encoding conversion function
 704  *                (optional, pass NULL if none)
 705  * encoding: PG identifier for the local encoding
 706  *
 707  * For each character, the map is consulted first; if no match, the cmap
 708  * (if provided) is consulted next; if still no match, the conv_func
 709  * (if provided) is applied.  An error is raised if no match is found.
 710  *
 711  * See pg_wchar.h for more details about the data structures used here.
 712  *
 713  * Returns the number of input bytes consumed.  If noError is true, this can
 714  * be less than 'len'.
 715  */
 716 int
 717 LocalToUtf(const unsigned char *iso, int len,
 718                    unsigned char *utf,
 719                    const pg_mb_radix_tree *map,
 720                    const pg_local_to_utf_combined *cmap, int cmapsize,
 721                    utf_local_conversion_func conv_func,
 722                    int encoding,
 723                    bool noError)
 724 {
 725         uint32          iiso;
 726         int                     l;
 727         const pg_local_to_utf_combined *cp;
 728         const unsigned char *start = iso;
 729
 730         if (!PG_VALID_ENCODING(encoding))
 731                 ereport(ERROR,
 732                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 733                                  errmsg("invalid encoding number: %d", encoding)));
 734
 735         for (; len > 0; len -= l)
 736         {
 737                 unsigned char b1 = 0;
 738                 unsigned char b2 = 0;
 739                 unsigned char b3 = 0;
 740                 unsigned char b4 = 0;
 741
 742                 /* "break" cases all represent errors */
 743                 if (*iso == '\0')
 744                         break;
 745
 746                 if (!IS_HIGHBIT_SET(*iso))
 747                 {
 748                         /* ASCII case is easy, assume it's one-to-one conversion */
 749                         *utf++ = *iso++;
 750                         l = 1;
 751                         continue;
 752                 }
 753
 754                 l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
 755                 if (l < 0)
 756                         break;
 757
 758                 /* collect coded char of length l */
 759                 if (l == 1)
 760                         b4 = *iso++;
 761                 else if (l == 2)
 762                 {
 763                         b3 = *iso++;
 764                         b4 = *iso++;
 765                 }
 766                 else if (l == 3)
 767                 {
 768                         b2 = *iso++;
 769                         b3 = *iso++;
 770                         b4 = *iso++;
 771                 }
 772                 else if (l == 4)
 773                 {
 774                         b1 = *iso++;
 775                         b2 = *iso++;
 776                         b3 = *iso++;
 777                         b4 = *iso++;
 778                 }
 779                 else
 780                 {
 781                         elog(ERROR, "unsupported character length %d", l);
 782                         iiso = 0;                       /* keep compiler quiet */
 783                 }
 784                 iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
 785
 786                 if (map)
 787                 {
 788                         uint32          converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
 789
 790                         if (converted)
 791                         {
 792                                 utf = store_coded_char(utf, converted);
 793                                 continue;
 794                         }
 795
 796                         /* If there's a combined character map, try that */
 797                         if (cmap)
 798                         {
 799                                 cp = bsearch(&iiso, cmap, cmapsize,
 800                                                          sizeof(pg_local_to_utf_combined), compare4);
 801
 802                                 if (cp)
 803                                 {
 804                                         utf = store_coded_char(utf, cp->utf1);
 805                                         utf = store_coded_char(utf, cp->utf2);
 806                                         continue;
 807                                 }
 808                         }
 809                 }
 810
 811                 /* if there's a conversion function, try that */
 812                 if (conv_func)
 813                 {
 814                         uint32          converted = (*conv_func) (iiso);
 815
 816                         if (converted)
 817                         {
 818                                 utf = store_coded_char(utf, converted);
 819                                 continue;
 820                         }
 821                 }
 822
 823                 /* failed to translate this character */
 824                 iso -= l;
 825                 if (noError)
 826                         break;
 827                 report_untranslatable_char(encoding, PG_UTF8,
 828                                                                    (const char *) iso, len);
 829         }
 830
 831         /* if we broke out of loop early, must be invalid input */
 832         if (len > 0 && !noError)
 833                 report_invalid_encoding(encoding, (const char *) iso, len);
 834
 835         *utf = '\0';
 836
 837         return iso - start;
 838 }