src/backend/utils/mb/mbutils.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * mbutils.c
   4  *        This file contains functions for encoding conversion.
   5  *
   6  * The string-conversion functions in this file share some API quirks.
   7  * Note the following:
   8  *
   9  * The functions return a palloc'd, null-terminated string if conversion
  10  * is required.  However, if no conversion is performed, the given source
  11  * string pointer is returned as-is.
  12  *
  13  * Although the presence of a length argument means that callers can pass
  14  * non-null-terminated strings, care is required because the same string
  15  * will be passed back if no conversion occurs.  Such callers *must* check
  16  * whether result == src and handle that case differently.
  17  *
  18  * If the source and destination encodings are the same, the source string
  19  * is returned without any verification; it's assumed to be valid data.
  20  * If that might not be the case, the caller is responsible for validating
  21  * the string using a separate call to pg_verify_mbstr().  Whenever the
  22  * source and destination encodings are different, the functions ensure that
  23  * the result is validly encoded according to the destination encoding.
  24  *
  25  *
  26  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  27  * Portions Copyright (c) 1994, Regents of the University of California
  28  *
  29  *
  30  * IDENTIFICATION
  31  *        src/backend/utils/mb/mbutils.c
  32  *
  33  *-------------------------------------------------------------------------
  34  */
  35 #include "postgres.h"
  36
  37 #include "access/xact.h"
  38 #include "catalog/namespace.h"
  39 #include "mb/pg_wchar.h"
  40 #include "utils/builtins.h"
  41 #include "utils/memutils.h"
  42 #include "utils/syscache.h"
  43 #include "varatt.h"
  44
  45 /*
  46  * We maintain a simple linked list caching the fmgr lookup info for the
  47  * currently selected conversion functions, as well as any that have been
  48  * selected previously in the current session.  (We remember previous
  49  * settings because we must be able to restore a previous setting during
  50  * transaction rollback, without doing any fresh catalog accesses.)
  51  *
  52  * Since we'll never release this data, we just keep it in TopMemoryContext.
  53  */
  54 typedef struct ConvProcInfo
  55 {
  56         int                     s_encoding;             /* server and client encoding IDs */
  57         int                     c_encoding;
  58         FmgrInfo        to_server_info; /* lookup info for conversion procs */
  59         FmgrInfo        to_client_info;
  60 } ConvProcInfo;
  61
  62 static List *ConvProcList = NIL;        /* List of ConvProcInfo */
  63
  64 /*
  65  * These variables point to the currently active conversion functions,
  66  * or are NULL when no conversion is needed.
  67  */
  68 static FmgrInfo *ToServerConvProc = NULL;
  69 static FmgrInfo *ToClientConvProc = NULL;
  70
  71 /*
  72  * This variable stores the conversion function to convert from UTF-8
  73  * to the server encoding.  It's NULL if the server encoding *is* UTF-8,
  74  * or if we lack a conversion function for this.
  75  */
  76 static FmgrInfo *Utf8ToServerConvProc = NULL;
  77
  78 /*
  79  * These variables track the currently-selected encodings.
  80  */
  81 static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  82 static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  83 static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  84
  85 /*
  86  * During backend startup we can't set client encoding because we (a)
  87  * can't look up the conversion functions, and (b) may not know the database
  88  * encoding yet either.  So SetClientEncoding() just accepts anything and
  89  * remembers it for InitializeClientEncoding() to apply later.
  90  */
  91 static bool backend_startup_complete = false;
  92 static int      pending_client_encoding = PG_SQL_ASCII;
  93
  94
  95 /* Internal functions */
  96 static char *perform_default_encoding_conversion(const char *src,
  97                                                                                                  int len, bool is_client_to_server);
  98 static int      cliplen(const char *str, int len, int limit);
  99
 100
 101 /*
 102  * Prepare for a future call to SetClientEncoding.  Success should mean
 103  * that SetClientEncoding is guaranteed to succeed for this encoding request.
 104  *
 105  * (But note that success before backend_startup_complete does not guarantee
 106  * success after ...)
 107  *
 108  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
 109  */
 110 int
 111 PrepareClientEncoding(int encoding)
 112 {
 113         int                     current_server_encoding;
 114         ListCell   *lc;
 115
 116         if (!PG_VALID_FE_ENCODING(encoding))
 117                 return -1;
 118
 119         /* Can't do anything during startup, per notes above */
 120         if (!backend_startup_complete)
 121                 return 0;
 122
 123         current_server_encoding = GetDatabaseEncoding();
 124
 125         /*
 126          * Check for cases that require no conversion function.
 127          */
 128         if (current_server_encoding == encoding ||
 129                 current_server_encoding == PG_SQL_ASCII ||
 130                 encoding == PG_SQL_ASCII)
 131                 return 0;
 132
 133         if (IsTransactionState())
 134         {
 135                 /*
 136                  * If we're in a live transaction, it's safe to access the catalogs,
 137                  * so look up the functions.  We repeat the lookup even if the info is
 138                  * already cached, so that we can react to changes in the contents of
 139                  * pg_conversion.
 140                  */
 141                 Oid                     to_server_proc,
 142                                         to_client_proc;
 143                 ConvProcInfo *convinfo;
 144                 MemoryContext oldcontext;
 145
 146                 to_server_proc = FindDefaultConversionProc(encoding,
 147                                                                                                    current_server_encoding);
 148                 if (!OidIsValid(to_server_proc))
 149                         return -1;
 150                 to_client_proc = FindDefaultConversionProc(current_server_encoding,
 151                                                                                                    encoding);
 152                 if (!OidIsValid(to_client_proc))
 153                         return -1;
 154
 155                 /*
 156                  * Load the fmgr info into TopMemoryContext (could still fail here)
 157                  */
 158                 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
 159                                                                                                            sizeof(ConvProcInfo));
 160                 convinfo->s_encoding = current_server_encoding;
 161                 convinfo->c_encoding = encoding;
 162                 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
 163                                           TopMemoryContext);
 164                 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
 165                                           TopMemoryContext);
 166
 167                 /* Attach new info to head of list */
 168                 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
 169                 ConvProcList = lcons(convinfo, ConvProcList);
 170                 MemoryContextSwitchTo(oldcontext);
 171
 172                 /*
 173                  * We cannot yet remove any older entry for the same encoding pair,
 174                  * since it could still be in use.  SetClientEncoding will clean up.
 175                  */
 176
 177                 return 0;                               /* success */
 178         }
 179         else
 180         {
 181                 /*
 182                  * If we're not in a live transaction, the only thing we can do is
 183                  * restore a previous setting using the cache.  This covers all
 184                  * transaction-rollback cases.  The only case it might not work for is
 185                  * trying to change client_encoding on the fly by editing
 186                  * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
 187                  * thing to do anyway.
 188                  */
 189                 foreach(lc, ConvProcList)
 190                 {
 191                         ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
 192
 193                         if (oldinfo->s_encoding == current_server_encoding &&
 194                                 oldinfo->c_encoding == encoding)
 195                                 return 0;
 196                 }
 197
 198                 return -1;                              /* it's not cached, so fail */
 199         }
 200 }
 201
 202 /*
 203  * Set the active client encoding and set up the conversion-function pointers.
 204  * PrepareClientEncoding should have been called previously for this encoding.
 205  *
 206  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
 207  */
 208 int
 209 SetClientEncoding(int encoding)
 210 {
 211         int                     current_server_encoding;
 212         bool            found;
 213         ListCell   *lc;
 214
 215         if (!PG_VALID_FE_ENCODING(encoding))
 216                 return -1;
 217
 218         /* Can't do anything during startup, per notes above */
 219         if (!backend_startup_complete)
 220         {
 221                 pending_client_encoding = encoding;
 222                 return 0;
 223         }
 224
 225         current_server_encoding = GetDatabaseEncoding();
 226
 227         /*
 228          * Check for cases that require no conversion function.
 229          */
 230         if (current_server_encoding == encoding ||
 231                 current_server_encoding == PG_SQL_ASCII ||
 232                 encoding == PG_SQL_ASCII)
 233         {
 234                 ClientEncoding = &pg_enc2name_tbl[encoding];
 235                 ToServerConvProc = NULL;
 236                 ToClientConvProc = NULL;
 237                 return 0;
 238         }
 239
 240         /*
 241          * Search the cache for the entry previously prepared by
 242          * PrepareClientEncoding; if there isn't one, we lose.  While at it,
 243          * release any duplicate entries so that repeated Prepare/Set cycles don't
 244          * leak memory.
 245          */
 246         found = false;
 247         foreach(lc, ConvProcList)
 248         {
 249                 ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
 250
 251                 if (convinfo->s_encoding == current_server_encoding &&
 252                         convinfo->c_encoding == encoding)
 253                 {
 254                         if (!found)
 255                         {
 256                                 /* Found newest entry, so set up */
 257                                 ClientEncoding = &pg_enc2name_tbl[encoding];
 258                                 ToServerConvProc = &convinfo->to_server_info;
 259                                 ToClientConvProc = &convinfo->to_client_info;
 260                                 found = true;
 261                         }
 262                         else
 263                         {
 264                                 /* Duplicate entry, release it */
 265                                 ConvProcList = foreach_delete_current(ConvProcList, lc);
 266                                 pfree(convinfo);
 267                         }
 268                 }
 269         }
 270
 271         if (found)
 272                 return 0;                               /* success */
 273         else
 274                 return -1;                              /* it's not cached, so fail */
 275 }
 276
 277 /*
 278  * Initialize client encoding conversions.
 279  *              Called from InitPostgres() once during backend startup.
 280  */
 281 void
 282 InitializeClientEncoding(void)
 283 {
 284         int                     current_server_encoding;
 285
 286         Assert(!backend_startup_complete);
 287         backend_startup_complete = true;
 288
 289         if (PrepareClientEncoding(pending_client_encoding) < 0 ||
 290                 SetClientEncoding(pending_client_encoding) < 0)
 291         {
 292                 /*
 293                  * Oops, the requested conversion is not available. We couldn't fail
 294                  * before, but we can now.
 295                  */
 296                 ereport(FATAL,
 297                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 298                                  errmsg("conversion between %s and %s is not supported",
 299                                                 pg_enc2name_tbl[pending_client_encoding].name,
 300                                                 GetDatabaseEncodingName())));
 301         }
 302
 303         /*
 304          * Also look up the UTF8-to-server conversion function if needed.  Since
 305          * the server encoding is fixed within any one backend process, we don't
 306          * have to do this more than once.
 307          */
 308         current_server_encoding = GetDatabaseEncoding();
 309         if (current_server_encoding != PG_UTF8 &&
 310                 current_server_encoding != PG_SQL_ASCII)
 311         {
 312                 Oid                     utf8_to_server_proc;
 313
 314                 Assert(IsTransactionState());
 315                 utf8_to_server_proc =
 316                         FindDefaultConversionProc(PG_UTF8,
 317                                                                           current_server_encoding);
 318                 /* If there's no such conversion, just leave the pointer as NULL */
 319                 if (OidIsValid(utf8_to_server_proc))
 320                 {
 321                         FmgrInfo   *finfo;
 322
 323                         finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
 324                                                                                                         sizeof(FmgrInfo));
 325                         fmgr_info_cxt(utf8_to_server_proc, finfo,
 326                                                   TopMemoryContext);
 327                         /* Set Utf8ToServerConvProc only after data is fully valid */
 328                         Utf8ToServerConvProc = finfo;
 329                 }
 330         }
 331 }
 332
 333 /*
 334  * returns the current client encoding
 335  */
 336 int
 337 pg_get_client_encoding(void)
 338 {
 339         return ClientEncoding->encoding;
 340 }
 341
 342 /*
 343  * returns the current client encoding name
 344  */
 345 const char *
 346 pg_get_client_encoding_name(void)
 347 {
 348         return ClientEncoding->name;
 349 }
 350
 351 /*
 352  * Convert src string to another encoding (general case).
 353  *
 354  * See the notes about string conversion functions at the top of this file.
 355  */
 356 unsigned char *
 357 pg_do_encoding_conversion(unsigned char *src, int len,
 358                                                   int src_encoding, int dest_encoding)
 359 {
 360         unsigned char *result;
 361         Oid                     proc;
 362
 363         if (len <= 0)
 364                 return src;                             /* empty string is always valid */
 365
 366         if (src_encoding == dest_encoding)
 367                 return src;                             /* no conversion required, assume valid */
 368
 369         if (dest_encoding == PG_SQL_ASCII)
 370                 return src;                             /* any string is valid in SQL_ASCII */
 371
 372         if (src_encoding == PG_SQL_ASCII)
 373         {
 374                 /* No conversion is possible, but we must validate the result */
 375                 (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
 376                 return src;
 377         }
 378
 379         if (!IsTransactionState())      /* shouldn't happen */
 380                 elog(ERROR, "cannot perform encoding conversion outside a transaction");
 381
 382         proc = FindDefaultConversionProc(src_encoding, dest_encoding);
 383         if (!OidIsValid(proc))
 384                 ereport(ERROR,
 385                                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
 386                                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
 387                                                 pg_encoding_to_char(src_encoding),
 388                                                 pg_encoding_to_char(dest_encoding))));
 389
 390         /*
 391          * Allocate space for conversion result, being wary of integer overflow.
 392          *
 393          * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
 394          * required space, so it might exceed MaxAllocSize even though the result
 395          * would actually fit.  We do not want to hand back a result string that
 396          * exceeds MaxAllocSize, because callers might not cope gracefully --- but
 397          * if we just allocate more than that, and don't use it, that's fine.
 398          */
 399         if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
 400                 ereport(ERROR,
 401                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 402                                  errmsg("out of memory"),
 403                                  errdetail("String of %d bytes is too long for encoding conversion.",
 404                                                    len)));
 405
 406         result = (unsigned char *)
 407                 MemoryContextAllocHuge(CurrentMemoryContext,
 408                                                            (Size) len * MAX_CONVERSION_GROWTH + 1);
 409
 410         (void) OidFunctionCall6(proc,
 411                                                         Int32GetDatum(src_encoding),
 412                                                         Int32GetDatum(dest_encoding),
 413                                                         CStringGetDatum((char *) src),
 414                                                         CStringGetDatum((char *) result),
 415                                                         Int32GetDatum(len),
 416                                                         BoolGetDatum(false));
 417
 418         /*
 419          * If the result is large, it's worth repalloc'ing to release any extra
 420          * space we asked for.  The cutoff here is somewhat arbitrary, but we
 421          * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
 422          */
 423         if (len > 1000000)
 424         {
 425                 Size            resultlen = strlen((char *) result);
 426
 427                 if (resultlen >= MaxAllocSize)
 428                         ereport(ERROR,
 429                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 430                                          errmsg("out of memory"),
 431                                          errdetail("String of %d bytes is too long for encoding conversion.",
 432                                                            len)));
 433
 434                 result = (unsigned char *) repalloc(result, resultlen + 1);
 435         }
 436
 437         return result;
 438 }
 439
 440 /*
 441  * Convert src string to another encoding.
 442  *
 443  * This function has a different API than the other conversion functions.
 444  * The caller should've looked up the conversion function using
 445  * FindDefaultConversionProc().  Unlike the other functions, the converted
 446  * result is not palloc'd.  It is written to the caller-supplied buffer
 447  * instead.
 448  *
 449  * src_encoding   - encoding to convert from
 450  * dest_encoding  - encoding to convert to
 451  * src, srclen    - input buffer and its length in bytes
 452  * dest, destlen  - destination buffer and its size in bytes
 453  *
 454  * The output is null-terminated.
 455  *
 456  * If destlen < srclen * MAX_CONVERSION_INPUT_LENGTH + 1, the converted output
 457  * wouldn't necessarily fit in the output buffer, and the function will not
 458  * convert the whole input.
 459  *
 460  * TODO: The conversion function interface is not great.  Firstly, it
 461  * would be nice to pass through the destination buffer size to the
 462  * conversion function, so that if you pass a shorter destination buffer, it
 463  * could still continue to fill up the whole buffer.  Currently, we have to
 464  * assume worst case expansion and stop the conversion short, even if there
 465  * is in fact space left in the destination buffer.  Secondly, it would be
 466  * nice to return the number of bytes written to the caller, to avoid a call
 467  * to strlen().
 468  */
 469 int
 470 pg_do_encoding_conversion_buf(Oid proc,
 471                                                           int src_encoding,
 472                                                           int dest_encoding,
 473                                                           unsigned char *src, int srclen,
 474                                                           unsigned char *dest, int destlen,
 475                                                           bool noError)
 476 {
 477         Datum           result;
 478
 479         /*
 480          * If the destination buffer is not large enough to hold the result in the
 481          * worst case, limit the input size passed to the conversion function.
 482          */
 483         if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
 484                 srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
 485
 486         result = OidFunctionCall6(proc,
 487                                                           Int32GetDatum(src_encoding),
 488                                                           Int32GetDatum(dest_encoding),
 489                                                           CStringGetDatum((char *) src),
 490                                                           CStringGetDatum((char *) dest),
 491                                                           Int32GetDatum(srclen),
 492                                                           BoolGetDatum(noError));
 493         return DatumGetInt32(result);
 494 }
 495
 496 /*
 497  * Convert string to encoding encoding_name. The source
 498  * encoding is the DB encoding.
 499  *
 500  * BYTEA convert_to(TEXT string, NAME encoding_name) */
 501 Datum
 502 pg_convert_to(PG_FUNCTION_ARGS)
 503 {
 504         Datum           string = PG_GETARG_DATUM(0);
 505         Datum           dest_encoding_name = PG_GETARG_DATUM(1);
 506         Datum           src_encoding_name = DirectFunctionCall1(namein,
 507                                                                                                                 CStringGetDatum(DatabaseEncoding->name));
 508         Datum           result;
 509
 510         /*
 511          * pg_convert expects a bytea as its first argument. We're passing it a
 512          * text argument here, relying on the fact that they are both in fact
 513          * varlena types, and thus structurally identical.
 514          */
 515         result = DirectFunctionCall3(pg_convert, string,
 516                                                                  src_encoding_name, dest_encoding_name);
 517
 518         PG_RETURN_DATUM(result);
 519 }
 520
 521 /*
 522  * Convert string from encoding encoding_name. The destination
 523  * encoding is the DB encoding.
 524  *
 525  * TEXT convert_from(BYTEA string, NAME encoding_name) */
 526 Datum
 527 pg_convert_from(PG_FUNCTION_ARGS)
 528 {
 529         Datum           string = PG_GETARG_DATUM(0);
 530         Datum           src_encoding_name = PG_GETARG_DATUM(1);
 531         Datum           dest_encoding_name = DirectFunctionCall1(namein,
 532                                                                                                                  CStringGetDatum(DatabaseEncoding->name));
 533         Datum           result;
 534
 535         result = DirectFunctionCall3(pg_convert, string,
 536                                                                  src_encoding_name, dest_encoding_name);
 537
 538         /*
 539          * pg_convert returns a bytea, which we in turn return as text, relying on
 540          * the fact that they are both in fact varlena types, and thus
 541          * structurally identical. Although not all bytea values are valid text,
 542          * in this case it will be because we've told pg_convert to return one
 543          * that is valid as text in the current database encoding.
 544          */
 545         PG_RETURN_DATUM(result);
 546 }
 547
 548 /*
 549  * Convert string between two arbitrary encodings.
 550  *
 551  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
 552  */
 553 Datum
 554 pg_convert(PG_FUNCTION_ARGS)
 555 {
 556         bytea      *string = PG_GETARG_BYTEA_PP(0);
 557         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 558         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 559         char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
 560         int                     dest_encoding = pg_char_to_encoding(dest_encoding_name);
 561         const char *src_str;
 562         char       *dest_str;
 563         bytea      *retval;
 564         int                     len;
 565
 566         if (src_encoding < 0)
 567                 ereport(ERROR,
 568                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 569                                  errmsg("invalid source encoding name \"%s\"",
 570                                                 src_encoding_name)));
 571         if (dest_encoding < 0)
 572                 ereport(ERROR,
 573                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 574                                  errmsg("invalid destination encoding name \"%s\"",
 575                                                 dest_encoding_name)));
 576
 577         /* make sure that source string is valid */
 578         len = VARSIZE_ANY_EXHDR(string);
 579         src_str = VARDATA_ANY(string);
 580         (void) pg_verify_mbstr(src_encoding, src_str, len, false);
 581
 582         /* perform conversion */
 583         dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
 584                                                                                                   len,
 585                                                                                                   src_encoding,
 586                                                                                                   dest_encoding);
 587
 588         /* update len if conversion actually happened */
 589         if (dest_str != src_str)
 590                 len = strlen(dest_str);
 591
 592         /*
 593          * build bytea data type structure.
 594          */
 595         retval = (bytea *) palloc(len + VARHDRSZ);
 596         SET_VARSIZE(retval, len + VARHDRSZ);
 597         memcpy(VARDATA(retval), dest_str, len);
 598
 599         if (dest_str != src_str)
 600                 pfree(dest_str);
 601
 602         /* free memory if allocated by the toaster */
 603         PG_FREE_IF_COPY(string, 0);
 604
 605         PG_RETURN_BYTEA_P(retval);
 606 }
 607
 608 /*
 609  * get the length of the string considered as text in the specified
 610  * encoding. Raises an error if the data is not valid in that
 611  * encoding.
 612  *
 613  * INT4 length (BYTEA string, NAME src_encoding_name)
 614  */
 615 Datum
 616 length_in_encoding(PG_FUNCTION_ARGS)
 617 {
 618         bytea      *string = PG_GETARG_BYTEA_PP(0);
 619         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 620         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 621         const char *src_str;
 622         int                     len;
 623         int                     retval;
 624
 625         if (src_encoding < 0)
 626                 ereport(ERROR,
 627                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 628                                  errmsg("invalid encoding name \"%s\"",
 629                                                 src_encoding_name)));
 630
 631         len = VARSIZE_ANY_EXHDR(string);
 632         src_str = VARDATA_ANY(string);
 633
 634         retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
 635
 636         PG_RETURN_INT32(retval);
 637 }
 638
 639 /*
 640  * Get maximum multibyte character length in the specified encoding.
 641  *
 642  * Note encoding is specified numerically, not by name as above.
 643  */
 644 Datum
 645 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
 646 {
 647         int                     encoding = PG_GETARG_INT32(0);
 648
 649         if (PG_VALID_ENCODING(encoding))
 650                 PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
 651         else
 652                 PG_RETURN_NULL();
 653 }
 654
 655 /*
 656  * Convert client encoding to server encoding.
 657  *
 658  * See the notes about string conversion functions at the top of this file.
 659  */
 660 char *
 661 pg_client_to_server(const char *s, int len)
 662 {
 663         return pg_any_to_server(s, len, ClientEncoding->encoding);
 664 }
 665
 666 /*
 667  * Convert any encoding to server encoding.
 668  *
 669  * See the notes about string conversion functions at the top of this file.
 670  *
 671  * Unlike the other string conversion functions, this will apply validation
 672  * even if encoding == DatabaseEncoding->encoding.  This is because this is
 673  * used to process data coming in from outside the database, and we never
 674  * want to just assume validity.
 675  */
 676 char *
 677 pg_any_to_server(const char *s, int len, int encoding)
 678 {
 679         if (len <= 0)
 680                 return unconstify(char *, s);   /* empty string is always valid */
 681
 682         if (encoding == DatabaseEncoding->encoding ||
 683                 encoding == PG_SQL_ASCII)
 684         {
 685                 /*
 686                  * No conversion is needed, but we must still validate the data.
 687                  */
 688                 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
 689                 return unconstify(char *, s);
 690         }
 691
 692         if (DatabaseEncoding->encoding == PG_SQL_ASCII)
 693         {
 694                 /*
 695                  * No conversion is possible, but we must still validate the data,
 696                  * because the client-side code might have done string escaping using
 697                  * the selected client_encoding.  If the client encoding is ASCII-safe
 698                  * then we just do a straight validation under that encoding.  For an
 699                  * ASCII-unsafe encoding we have a problem: we dare not pass such data
 700                  * to the parser but we have no way to convert it.  We compromise by
 701                  * rejecting the data if it contains any non-ASCII characters.
 702                  */
 703                 if (PG_VALID_BE_ENCODING(encoding))
 704                         (void) pg_verify_mbstr(encoding, s, len, false);
 705                 else
 706                 {
 707                         int                     i;
 708
 709                         for (i = 0; i < len; i++)
 710                         {
 711                                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
 712                                         ereport(ERROR,
 713                                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 714                                                          errmsg("invalid byte value for encoding \"%s\": 0x%02x",
 715                                                                         pg_enc2name_tbl[PG_SQL_ASCII].name,
 716                                                                         (unsigned char) s[i])));
 717                         }
 718                 }
 719                 return unconstify(char *, s);
 720         }
 721
 722         /* Fast path if we can use cached conversion function */
 723         if (encoding == ClientEncoding->encoding)
 724                 return perform_default_encoding_conversion(s, len, true);
 725
 726         /* General case ... will not work outside transactions */
 727         return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
 728                                                                                           len,
 729                                                                                           encoding,
 730                                                                                           DatabaseEncoding->encoding);
 731 }
 732
 733 /*
 734  * Convert server encoding to client encoding.
 735  *
 736  * See the notes about string conversion functions at the top of this file.
 737  */
 738 char *
 739 pg_server_to_client(const char *s, int len)
 740 {
 741         return pg_server_to_any(s, len, ClientEncoding->encoding);
 742 }
 743
 744 /*
 745  * Convert server encoding to any encoding.
 746  *
 747  * See the notes about string conversion functions at the top of this file.
 748  */
 749 char *
 750 pg_server_to_any(const char *s, int len, int encoding)
 751 {
 752         if (len <= 0)
 753                 return unconstify(char *, s);   /* empty string is always valid */
 754
 755         if (encoding == DatabaseEncoding->encoding ||
 756                 encoding == PG_SQL_ASCII)
 757                 return unconstify(char *, s);   /* assume data is valid */
 758
 759         if (DatabaseEncoding->encoding == PG_SQL_ASCII)
 760         {
 761                 /* No conversion is possible, but we must validate the result */
 762                 (void) pg_verify_mbstr(encoding, s, len, false);
 763                 return unconstify(char *, s);
 764         }
 765
 766         /* Fast path if we can use cached conversion function */
 767         if (encoding == ClientEncoding->encoding)
 768                 return perform_default_encoding_conversion(s, len, false);
 769
 770         /* General case ... will not work outside transactions */
 771         return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
 772                                                                                           len,
 773                                                                                           DatabaseEncoding->encoding,
 774                                                                                           encoding);
 775 }
 776
 777 /*
 778  *      Perform default encoding conversion using cached FmgrInfo. Since
 779  *      this function does not access database at all, it is safe to call
 780  *      outside transactions.  If the conversion has not been set up by
 781  *      SetClientEncoding(), no conversion is performed.
 782  */
 783 static char *
 784 perform_default_encoding_conversion(const char *src, int len,
 785                                                                         bool is_client_to_server)
 786 {
 787         char       *result;
 788         int                     src_encoding,
 789                                 dest_encoding;
 790         FmgrInfo   *flinfo;
 791
 792         if (is_client_to_server)
 793         {
 794                 src_encoding = ClientEncoding->encoding;
 795                 dest_encoding = DatabaseEncoding->encoding;
 796                 flinfo = ToServerConvProc;
 797         }
 798         else
 799         {
 800                 src_encoding = DatabaseEncoding->encoding;
 801                 dest_encoding = ClientEncoding->encoding;
 802                 flinfo = ToClientConvProc;
 803         }
 804
 805         if (flinfo == NULL)
 806                 return unconstify(char *, src);
 807
 808         /*
 809          * Allocate space for conversion result, being wary of integer overflow.
 810          * See comments in pg_do_encoding_conversion.
 811          */
 812         if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
 813                 ereport(ERROR,
 814                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 815                                  errmsg("out of memory"),
 816                                  errdetail("String of %d bytes is too long for encoding conversion.",
 817                                                    len)));
 818
 819         result = (char *)
 820                 MemoryContextAllocHuge(CurrentMemoryContext,
 821                                                            (Size) len * MAX_CONVERSION_GROWTH + 1);
 822
 823         FunctionCall6(flinfo,
 824                                   Int32GetDatum(src_encoding),
 825                                   Int32GetDatum(dest_encoding),
 826                                   CStringGetDatum(src),
 827                                   CStringGetDatum(result),
 828                                   Int32GetDatum(len),
 829                                   BoolGetDatum(false));
 830
 831         /*
 832          * Release extra space if there might be a lot --- see comments in
 833          * pg_do_encoding_conversion.
 834          */
 835         if (len > 1000000)
 836         {
 837                 Size            resultlen = strlen(result);
 838
 839                 if (resultlen >= MaxAllocSize)
 840                         ereport(ERROR,
 841                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 842                                          errmsg("out of memory"),
 843                                          errdetail("String of %d bytes is too long for encoding conversion.",
 844                                                            len)));
 845
 846                 result = (char *) repalloc(result, resultlen + 1);
 847         }
 848
 849         return result;
 850 }
 851
 852 /*
 853  * Convert a single Unicode code point into a string in the server encoding.
 854  *
 855  * The code point given by "c" is converted and stored at *s, which must
 856  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
 857  * The output will have a trailing '\0'.  Throws error if the conversion
 858  * cannot be performed.
 859  *
 860  * Note that this relies on having previously looked up any required
 861  * conversion function.  That's partly for speed but mostly because the parser
 862  * may call this outside any transaction, or in an aborted transaction.
 863  */
 864 void
 865 pg_unicode_to_server(pg_wchar c, unsigned char *s)
 866 {
 867         unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
 868         int                     c_as_utf8_len;
 869         int                     server_encoding;
 870
 871         /*
 872          * Complain if invalid Unicode code point.  The choice of errcode here is
 873          * debatable, but really our caller should have checked this anyway.
 874          */
 875         if (!is_valid_unicode_codepoint(c))
 876                 ereport(ERROR,
 877                                 (errcode(ERRCODE_SYNTAX_ERROR),
 878                                  errmsg("invalid Unicode code point")));
 879
 880         /* Otherwise, if it's in ASCII range, conversion is trivial */
 881         if (c <= 0x7F)
 882         {
 883                 s[0] = (unsigned char) c;
 884                 s[1] = '\0';
 885                 return;
 886         }
 887
 888         /* If the server encoding is UTF-8, we just need to reformat the code */
 889         server_encoding = GetDatabaseEncoding();
 890         if (server_encoding == PG_UTF8)
 891         {
 892                 unicode_to_utf8(c, s);
 893                 s[pg_utf_mblen(s)] = '\0';
 894                 return;
 895         }
 896
 897         /* For all other cases, we must have a conversion function available */
 898         if (Utf8ToServerConvProc == NULL)
 899                 ereport(ERROR,
 900                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 901                                  errmsg("conversion between %s and %s is not supported",
 902                                                 pg_enc2name_tbl[PG_UTF8].name,
 903                                                 GetDatabaseEncodingName())));
 904
 905         /* Construct UTF-8 source string */
 906         unicode_to_utf8(c, c_as_utf8);
 907         c_as_utf8_len = pg_utf_mblen(c_as_utf8);
 908         c_as_utf8[c_as_utf8_len] = '\0';
 909
 910         /* Convert, or throw error if we can't */
 911         FunctionCall6(Utf8ToServerConvProc,
 912                                   Int32GetDatum(PG_UTF8),
 913                                   Int32GetDatum(server_encoding),
 914                                   CStringGetDatum((char *) c_as_utf8),
 915                                   CStringGetDatum((char *) s),
 916                                   Int32GetDatum(c_as_utf8_len),
 917                                   BoolGetDatum(false));
 918 }
 919
 920 /*
 921  * Convert a single Unicode code point into a string in the server encoding.
 922  *
 923  * Same as pg_unicode_to_server(), except that we don't throw errors,
 924  * but simply return false on conversion failure.
 925  */
 926 bool
 927 pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
 928 {
 929         unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
 930         int                     c_as_utf8_len;
 931         int                     converted_len;
 932         int                     server_encoding;
 933
 934         /* Fail if invalid Unicode code point */
 935         if (!is_valid_unicode_codepoint(c))
 936                 return false;
 937
 938         /* Otherwise, if it's in ASCII range, conversion is trivial */
 939         if (c <= 0x7F)
 940         {
 941                 s[0] = (unsigned char) c;
 942                 s[1] = '\0';
 943                 return true;
 944         }
 945
 946         /* If the server encoding is UTF-8, we just need to reformat the code */
 947         server_encoding = GetDatabaseEncoding();
 948         if (server_encoding == PG_UTF8)
 949         {
 950                 unicode_to_utf8(c, s);
 951                 s[pg_utf_mblen(s)] = '\0';
 952                 return true;
 953         }
 954
 955         /* For all other cases, we must have a conversion function available */
 956         if (Utf8ToServerConvProc == NULL)
 957                 return false;
 958
 959         /* Construct UTF-8 source string */
 960         unicode_to_utf8(c, c_as_utf8);
 961         c_as_utf8_len = pg_utf_mblen(c_as_utf8);
 962         c_as_utf8[c_as_utf8_len] = '\0';
 963
 964         /* Convert, but without throwing error if we can't */
 965         converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
 966                                                                                                 Int32GetDatum(PG_UTF8),
 967                                                                                                 Int32GetDatum(server_encoding),
 968                                                                                                 CStringGetDatum((char *) c_as_utf8),
 969                                                                                                 CStringGetDatum((char *) s),
 970                                                                                                 Int32GetDatum(c_as_utf8_len),
 971                                                                                                 BoolGetDatum(true)));
 972
 973         /* Conversion was successful iff it consumed the whole input */
 974         return (converted_len == c_as_utf8_len);
 975 }
 976
 977
 978 /* convert a multibyte string to a wchar */
 979 int
 980 pg_mb2wchar(const char *from, pg_wchar *to)
 981 {
 982         return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
 983 }
 984
 985 /* convert a multibyte string to a wchar with a limited length */
 986 int
 987 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
 988 {
 989         return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
 990 }
 991
 992 /* same, with any encoding */
 993 int
 994 pg_encoding_mb2wchar_with_len(int encoding,
 995                                                           const char *from, pg_wchar *to, int len)
 996 {
 997         return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
 998 }
 999
1000 /* convert a wchar string to a multibyte */
1001 int
1002 pg_wchar2mb(const pg_wchar *from, char *to)
1003 {
1004         return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
1005 }
1006
1007 /* convert a wchar string to a multibyte with a limited length */
1008 int
1009 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
1010 {
1011         return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1012 }
1013
1014 /* same, with any encoding */
1015 int
1016 pg_encoding_wchar2mb_with_len(int encoding,
1017                                                           const pg_wchar *from, char *to, int len)
1018 {
1019         return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1020 }
1021
1022 /* returns the byte length of a multibyte character */
1023 int
1024 pg_mblen(const char *mbstr)
1025 {
1026         return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1027 }
1028
1029 /* returns the display length of a multibyte character */
1030 int
1031 pg_dsplen(const char *mbstr)
1032 {
1033         return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
1034 }
1035
/*
 * Return the number of characters in a NUL-terminated multibyte string
 * in the database encoding.
 */
int
pg_mbstrlen(const char *mbstr)
{
        const char *cur;
        int                     nchars;

        /* In a single-byte encoding, characters and bytes coincide */
        if (pg_database_encoding_max_length() == 1)
                return strlen(mbstr);

        /* Otherwise step over one character at a time, counting as we go */
        nchars = 0;
        for (cur = mbstr; *cur != '\0'; cur += pg_mblen(cur))
                nchars++;

        return nchars;
}
1053
/*
 * Return the number of characters in a multibyte string that is not
 * necessarily NUL-terminated; "limit" is the number of bytes to examine.
 *
 * For a single-byte database encoding the result is simply "limit".
 * Otherwise counting stops when the byte budget is used up or a NUL byte
 * is reached.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
        const char *cur = mbstr;
        int                     remaining = limit;
        int                     nchars = 0;

        /* In a single-byte encoding, characters and bytes coincide */
        if (pg_database_encoding_max_length() == 1)
                return limit;

        while (remaining > 0 && *cur != '\0')
        {
                int                     width = pg_mblen(cur);

                cur += width;
                remaining -= width;
                nchars++;
        }

        return nchars;
}
1076
1077 /*
1078  * returns the byte length of a multibyte string
1079  * (not necessarily NULL terminated)
1080  * that is no longer than limit.
1081  * this function does not break multibyte character boundary.
1082  */
1083 int
1084 pg_mbcliplen(const char *mbstr, int len, int limit)
1085 {
1086         return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
1087                                                                  len, limit);
1088 }
1089
1090 /*
1091  * pg_mbcliplen with specified encoding
1092  */
1093 int
1094 pg_encoding_mbcliplen(int encoding, const char *mbstr,
1095                                           int len, int limit)
1096 {
1097         mblen_converter mblen_fn;
1098         int                     clen = 0;
1099         int                     l;
1100
1101         /* optimization for single byte encoding */
1102         if (pg_encoding_max_length(encoding) == 1)
1103                 return cliplen(mbstr, len, limit);
1104
1105         mblen_fn = pg_wchar_table[encoding].mblen;
1106
1107         while (len > 0 && *mbstr)
1108         {
1109                 l = (*mblen_fn) ((const unsigned char *) mbstr);
1110                 if ((clen + l) > limit)
1111                         break;
1112                 clen += l;
1113                 if (clen == limit)
1114                         break;
1115                 len -= l;
1116                 mbstr += l;
1117         }
1118         return clen;
1119 }
1120
/*
 * Like pg_mbcliplen, except that "limit" is a count of characters rather
 * than a count of bytes.  The result is still a byte length.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
        const char *cur = mbstr;
        int                     remaining = len;
        int                     nbytes = 0;
        int                     nchars = 0;

        /* In a single-byte encoding, characters and bytes coincide */
        if (pg_database_encoding_max_length() == 1)
                return cliplen(mbstr, len, limit);

        while (remaining > 0 && *cur != '\0')
        {
                int                     width = pg_mblen(cur);

                /* Stop once accepting this character would exceed the limit */
                if (++nchars > limit)
                        break;

                nbytes += width;
                remaining -= width;
                cur += width;
        }

        return nbytes;
}
1148
/*
 * mbcliplen for any single-byte encoding.
 *
 * Returns the number of bytes before the first NUL byte, capped at the
 * smaller of "len" and "limit".
 */
static int
cliplen(const char *str, int len, int limit)
{
        int                     bound = (len < limit) ? len : limit;
        int                     n;

        for (n = 0; n < bound; n++)
        {
                if (str[n] == '\0')
                        break;
        }

        return n;
}
1160
1161 void
1162 SetDatabaseEncoding(int encoding)
1163 {
1164         if (!PG_VALID_BE_ENCODING(encoding))
1165                 elog(ERROR, "invalid database encoding: %d", encoding);
1166
1167         DatabaseEncoding = &pg_enc2name_tbl[encoding];
1168         Assert(DatabaseEncoding->encoding == encoding);
1169 }
1170
1171 void
1172 SetMessageEncoding(int encoding)
1173 {
1174         /* Some calls happen before we can elog()! */
1175         Assert(PG_VALID_ENCODING(encoding));
1176
1177         MessageEncoding = &pg_enc2name_tbl[encoding];
1178         Assert(MessageEncoding->encoding == encoding);
1179 }
1180
1181 #ifdef ENABLE_NLS
1182 /*
1183  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1184  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1185  * fail for gettext-internal causes like out-of-memory.
1186  */
1187 static bool
1188 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1189 {
1190         bool            elog_ok = (CurrentMemoryContext != NULL);
1191         int                     i;
1192
1193         for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
1194         {
1195                 if (pg_enc2gettext_tbl[i].encoding == encoding)
1196                 {
1197                         if (bind_textdomain_codeset(domainname,
1198                                                                                 pg_enc2gettext_tbl[i].name) != NULL)
1199                                 return true;
1200
1201                         if (elog_ok)
1202                                 elog(LOG, "bind_textdomain_codeset failed");
1203                         else
1204                                 write_stderr("bind_textdomain_codeset failed");
1205
1206                         break;
1207                 }
1208         }
1209
1210         return false;
1211 }
1212
/*
 * Bind a gettext message domain to the codeset corresponding to the database
 * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
 * Return the MessageEncoding implied by the new settings.
 *
 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
 * When that matches the database encoding, we don't need to do anything.  In
 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
 * database encoding, except for the C locale.  (On Windows, we also permit a
 * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
 * gettext to the right codeset.
 *
 * On Windows, gettext defaults to the Windows ANSI code page.  This is a
 * convenient departure for software that passes the strings to Windows ANSI
 * APIs, but we don't do that.  Compel gettext to use database encoding or,
 * failing that, the LC_CTYPE encoding as it would on other platforms.
 *
 * This function is called before elog() and palloc() are usable.
 *
 * domainname: the gettext message domain to bind.
 *
 * Returns an encoding ID: the database encoding when the explicit binding
 * succeeded; otherwise the encoding derived from the locale (SQL_ASCII if
 * that cannot be determined); or, on Windows only, the current message
 * encoding if binding to the locale-derived encoding failed.
 */
int
pg_bind_textdomain_codeset(const char *domainname)
{
        /* A NULL CurrentMemoryContext is taken to mean elog() is unusable */
        bool            elog_ok = (CurrentMemoryContext != NULL);
        int                     encoding = GetDatabaseEncoding();
        int                     new_msgenc;

        /*
         * Careful with the preprocessor structure here: on non-Windows builds
         * the C/POSIX locale test below guards the following "if" statement,
         * while on Windows that guard is compiled out and the explicit binding
         * is attempted unconditionally.
         */
#ifndef WIN32
        const char *ctype = setlocale(LC_CTYPE, NULL);

        if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
#endif
                if (encoding != PG_SQL_ASCII &&
                        raw_pg_bind_textdomain_codeset(domainname, encoding))
                        return encoding;

        /* Fall back to the locale's encoding; SQL_ASCII if it is unknown */
        new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
        if (new_msgenc < 0)
                new_msgenc = PG_SQL_ASCII;

#ifdef WIN32
        if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
                /* On failure, the old message encoding remains valid. */
                return GetMessageEncoding();
#endif

        return new_msgenc;
}
1260 #endif
1261
1262 /*
1263  * The database encoding, also called the server encoding, represents the
1264  * encoding of data stored in text-like data types.  Affected types include
1265  * cstring, text, varchar, name, xml, and json.
1266  */
1267 int
1268 GetDatabaseEncoding(void)
1269 {
1270         return DatabaseEncoding->encoding;
1271 }
1272
1273 const char *
1274 GetDatabaseEncodingName(void)
1275 {
1276         return DatabaseEncoding->name;
1277 }
1278
1279 Datum
1280 getdatabaseencoding(PG_FUNCTION_ARGS)
1281 {
1282         return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1283 }
1284
1285 Datum
1286 pg_client_encoding(PG_FUNCTION_ARGS)
1287 {
1288         return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1289 }
1290
1291 Datum
1292 PG_char_to_encoding(PG_FUNCTION_ARGS)
1293 {
1294         Name            s = PG_GETARG_NAME(0);
1295
1296         PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
1297 }
1298
1299 Datum
1300 PG_encoding_to_char(PG_FUNCTION_ARGS)
1301 {
1302         int32           encoding = PG_GETARG_INT32(0);
1303         const char *encoding_name = pg_encoding_to_char(encoding);
1304
1305         return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1306 }
1307
1308 /*
1309  * gettext() returns messages in this encoding.  This often matches the
1310  * database encoding, but it differs for SQL_ASCII databases, for processes
1311  * not attached to a database, and under a database encoding lacking iconv
1312  * support (MULE_INTERNAL).
1313  */
1314 int
1315 GetMessageEncoding(void)
1316 {
1317         return MessageEncoding->encoding;
1318 }
1319
1320
1321 /*
1322  * Generic character incrementer function.
1323  *
1324  * Not knowing anything about the properties of the encoding in use, we just
1325  * keep incrementing the last byte until we get a validly-encoded result,
1326  * or we run out of values to try.  We don't bother to try incrementing
1327  * higher-order bytes, so there's no growth in runtime for wider characters.
1328  * (If we did try to do that, we'd need to consider the likelihood that 255
1329  * is not a valid final byte in the encoding.)
1330  */
1331 static bool
1332 pg_generic_charinc(unsigned char *charptr, int len)
1333 {
1334         unsigned char *lastbyte = charptr + len - 1;
1335         mbchar_verifier mbverify;
1336
1337         /* We can just invoke the character verifier directly. */
1338         mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
1339
1340         while (*lastbyte < (unsigned char) 255)
1341         {
1342                 (*lastbyte)++;
1343                 if ((*mbverify) (charptr, len) == len)
1344                         return true;
1345         }
1346
1347         return false;
1348 }
1349
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  We also
 * need some special-case logic to skip regions used for surrogate pair
 * handling, as those should not occur in valid UTF-8.
 *
 * The second byte has a tighter upper bound under two lead bytes: 0x9F after
 * 0xED (anything above would encode a surrogate) and 0x8F after 0xF4
 * (anything above would exceed U+10FFFF).  That bound is honored both when
 * incrementing the second byte and when incrementing the lead byte *into*
 * 0xED or 0xF4; in the latter case the second byte is clamped down to the
 * bound, so that e.g. EC BF BF becomes ED 9F BF rather than the invalid
 * ED BF BF.  The result is still greater than the input, because the lead
 * byte has increased.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 *
 * charptr: the character to increment in place
 * length:  its length in bytes; anything outside 1..4 is rejected
 *
 * Returns true if the character was incremented, false if not (in which
 * case the bytes are left unmodified).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
        unsigned char a;
        unsigned char limit;

        switch (length)
        {
                default:
                        /* reject lengths 5 and 6 for now */
                        return false;
                case 4:
                        a = charptr[3];
                        if (a < 0xBF)
                        {
                                charptr[3]++;
                                break;
                        }
                        /* FALL THRU */
                case 3:
                        a = charptr[2];
                        if (a < 0xBF)
                        {
                                charptr[2]++;
                                break;
                        }
                        /* FALL THRU */
                case 2:
                        a = charptr[1];
                        switch (*charptr)
                        {
                                case 0xED:
                                        limit = 0x9F;
                                        break;
                                case 0xF4:
                                        limit = 0x8F;
                                        break;
                                default:
                                        limit = 0xBF;
                                        break;
                        }
                        if (a < limit)
                        {
                                charptr[1]++;
                                break;
                        }
                        /* FALL THRU */
                case 1:
                        a = *charptr;
                        if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
                                return false;
                        charptr[0]++;

                        /*
                         * The new lead byte may allow a narrower range for the
                         * second byte than the old one did.  Clamp the second byte
                         * so we never emit a surrogate or a code point beyond
                         * U+10FFFF.
                         */
                        if (length >= 2)
                        {
                                if (charptr[0] == 0xED && charptr[1] > 0x9F)
                                        charptr[1] = 0x9F;
                                else if (charptr[0] == 0xF4 && charptr[1] > 0x8F)
                                        charptr[1] = 0x8F;
                        }
                        break;
        }

        return true;
}
1422
1423 /*
1424  * EUC-JP character incrementer function.
1425  *
1426  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1427  * representing JIS X 0201 characters with the second byte ranging between
1428  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
1429  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1430  *
1431  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1432  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
1433  * is incremented if possible, otherwise the second-to-last byte.
1434  *
1435  * If the sequence starts with a value other than the above and its MSB
1436  * is set, it must be a two-byte sequence representing JIS X 0208 characters
1437  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
1438  * incremented if possible, otherwise the second-to-last byte.
1439  *
1440  * Otherwise, the sequence is a single-byte ASCII character. It is
1441  * incremented up to 0x7f.
1442  */
1443 static bool
1444 pg_eucjp_increment(unsigned char *charptr, int length)
1445 {
1446         unsigned char c1,
1447                                 c2;
1448         int                     i;
1449
1450         c1 = *charptr;
1451
1452         switch (c1)
1453         {
1454                 case SS2:                               /* JIS X 0201 */
1455                         if (length != 2)
1456                                 return false;
1457
1458                         c2 = charptr[1];
1459
1460                         if (c2 >= 0xdf)
1461                                 charptr[0] = charptr[1] = 0xa1;
1462                         else if (c2 < 0xa1)
1463                                 charptr[1] = 0xa1;
1464                         else
1465                                 charptr[1]++;
1466                         break;
1467
1468                 case SS3:                               /* JIS X 0212 */
1469                         if (length != 3)
1470                                 return false;
1471
1472                         for (i = 2; i > 0; i--)
1473                         {
1474                                 c2 = charptr[i];
1475                                 if (c2 < 0xa1)
1476                                 {
1477                                         charptr[i] = 0xa1;
1478                                         return true;
1479                                 }
1480                                 else if (c2 < 0xfe)
1481                                 {
1482                                         charptr[i]++;
1483                                         return true;
1484                                 }
1485                         }
1486
1487                         /* Out of 3-byte code region */
1488                         return false;
1489
1490                 default:
1491                         if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1492                         {
1493                                 if (length != 2)
1494                                         return false;
1495
1496                                 for (i = 1; i >= 0; i--)
1497                                 {
1498                                         c2 = charptr[i];
1499                                         if (c2 < 0xa1)
1500                                         {
1501                                                 charptr[i] = 0xa1;
1502                                                 return true;
1503                                         }
1504                                         else if (c2 < 0xfe)
1505                                         {
1506                                                 charptr[i]++;
1507                                                 return true;
1508                                         }
1509                                 }
1510
1511                                 /* Out of 2 byte code region */
1512                                 return false;
1513                         }
1514                         else
1515                         {                                       /* ASCII, single byte */
1516                                 if (c1 > 0x7e)
1517                                         return false;
1518                                 (*charptr)++;
1519                         }
1520                         break;
1521         }
1522
1523         return true;
1524 }
1525
1526 /*
1527  * get the character incrementer for the encoding for the current database
1528  */
1529 mbcharacter_incrementer
1530 pg_database_encoding_character_incrementer(void)
1531 {
1532         /*
1533          * Eventually it might be best to add a field to pg_wchar_table[], but for
1534          * now we just use a switch.
1535          */
1536         switch (GetDatabaseEncoding())
1537         {
1538                 case PG_UTF8:
1539                         return pg_utf8_increment;
1540
1541                 case PG_EUC_JP:
1542                         return pg_eucjp_increment;
1543
1544                 default:
1545                         return pg_generic_charinc;
1546         }
1547 }
1548
1549 /*
1550  * fetch maximum length of the encoding for the current database
1551  */
1552 int
1553 pg_database_encoding_max_length(void)
1554 {
1555         return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1556 }
1557
/*
 * Check that mbstr (len bytes) is validly encoded in the encoding of the
 * current database.
 *
 * This is pg_verify_mbstr() with the encoding argument filled in from
 * GetDatabaseEncoding(); the result and the meaning of noError are those of
 * pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			dbenc = GetDatabaseEncoding();

	return pg_verify_mbstr(dbenc, mbstr, len, noError);
}
1567
1568 /*
1569  * Verify mbstr to make sure that it is validly encoded in the specified
1570  * encoding.
1571  */
1572 bool
1573 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1574 {
1575         int                     oklen;
1576
1577         Assert(PG_VALID_ENCODING(encoding));
1578
1579         oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1580         if (oklen != len)
1581         {
1582                 if (noError)
1583                         return false;
1584                 report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1585         }
1586         return true;
1587 }
1588
1589 /*
1590  * Verify mbstr to make sure that it is validly encoded in the specified
1591  * encoding.
1592  *
1593  * mbstr is not necessarily zero terminated; length of mbstr is
1594  * specified by len.
1595  *
1596  * If OK, return length of string in the encoding.
1597  * If a problem is found, return -1 when noError is
1598  * true; when noError is false, ereport() a descriptive message.
1599  *
1600  * Note: We cannot use the faster encoding-specific mbverifystr() function
1601  * here, because we need to count the number of characters in the string.
1602  */
1603 int
1604 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1605 {
1606         mbchar_verifier mbverifychar;
1607         int                     mb_len;
1608
1609         Assert(PG_VALID_ENCODING(encoding));
1610
1611         /*
1612          * In single-byte encodings, we need only reject nulls (\0).
1613          */
1614         if (pg_encoding_max_length(encoding) <= 1)
1615         {
1616                 const char *nullpos = memchr(mbstr, 0, len);
1617
1618                 if (nullpos == NULL)
1619                         return len;
1620                 if (noError)
1621                         return -1;
1622                 report_invalid_encoding(encoding, nullpos, 1);
1623         }
1624
1625         /* fetch function pointer just once */
1626         mbverifychar = pg_wchar_table[encoding].mbverifychar;
1627
1628         mb_len = 0;
1629
1630         while (len > 0)
1631         {
1632                 int                     l;
1633
1634                 /* fast path for ASCII-subset characters */
1635                 if (!IS_HIGHBIT_SET(*mbstr))
1636                 {
1637                         if (*mbstr != '\0')
1638                         {
1639                                 mb_len++;
1640                                 mbstr++;
1641                                 len--;
1642                                 continue;
1643                         }
1644                         if (noError)
1645                                 return -1;
1646                         report_invalid_encoding(encoding, mbstr, len);
1647                 }
1648
1649                 l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1650
1651                 if (l < 0)
1652                 {
1653                         if (noError)
1654                                 return -1;
1655                         report_invalid_encoding(encoding, mbstr, len);
1656                 }
1657
1658                 mbstr += l;
1659                 len -= l;
1660                 mb_len++;
1661         }
1662         return mb_len;
1663 }
1664
1665 /*
1666  * check_encoding_conversion_args: check arguments of a conversion function
1667  *
1668  * "expected" arguments can be either an encoding ID or -1 to indicate that
1669  * the caller will check whether it accepts the ID.
1670  *
1671  * Note: the errors here are not really user-facing, so elog instead of
1672  * ereport seems sufficient.  Also, we trust that the "expected" encoding
1673  * arguments are valid encoding IDs, but we don't trust the actuals.
1674  */
1675 void
1676 check_encoding_conversion_args(int src_encoding,
1677                                                            int dest_encoding,
1678                                                            int len,
1679                                                            int expected_src_encoding,
1680                                                            int expected_dest_encoding)
1681 {
1682         if (!PG_VALID_ENCODING(src_encoding))
1683                 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1684         if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1685                 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1686                          pg_enc2name_tbl[expected_src_encoding].name,
1687                          pg_enc2name_tbl[src_encoding].name);
1688         if (!PG_VALID_ENCODING(dest_encoding))
1689                 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1690         if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1691                 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1692                          pg_enc2name_tbl[expected_dest_encoding].name,
1693                          pg_enc2name_tbl[dest_encoding].name);
1694         if (len < 0)
1695                 elog(ERROR, "encoding conversion length must not be negative");
1696 }
1697
1698 /*
1699  * report_invalid_encoding: complain about invalid multibyte character
1700  *
1701  * note: len is remaining length of string, not length of character;
1702  * len must be greater than zero, as we always examine the first byte.
1703  */
1704 void
1705 report_invalid_encoding(int encoding, const char *mbstr, int len)
1706 {
1707         int                     l = pg_encoding_mblen(encoding, mbstr);
1708         char            buf[8 * 5 + 1];
1709         char       *p = buf;
1710         int                     j,
1711                                 jlimit;
1712
1713         jlimit = Min(l, len);
1714         jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
1715
1716         for (j = 0; j < jlimit; j++)
1717         {
1718                 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1719                 if (j < jlimit - 1)
1720                         p += sprintf(p, " ");
1721         }
1722
1723         ereport(ERROR,
1724                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1725                          errmsg("invalid byte sequence for encoding \"%s\": %s",
1726                                         pg_enc2name_tbl[encoding].name,
1727                                         buf)));
1728 }
1729
1730 /*
1731  * report_untranslatable_char: complain about untranslatable character
1732  *
1733  * note: len is remaining length of string, not length of character;
1734  * len must be greater than zero, as we always examine the first byte.
1735  */
1736 void
1737 report_untranslatable_char(int src_encoding, int dest_encoding,
1738                                                    const char *mbstr, int len)
1739 {
1740         int                     l = pg_encoding_mblen(src_encoding, mbstr);
1741         char            buf[8 * 5 + 1];
1742         char       *p = buf;
1743         int                     j,
1744                                 jlimit;
1745
1746         jlimit = Min(l, len);
1747         jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
1748
1749         for (j = 0; j < jlimit; j++)
1750         {
1751                 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1752                 if (j < jlimit - 1)
1753                         p += sprintf(p, " ");
1754         }
1755
1756         ereport(ERROR,
1757                         (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1758                          errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1759                                         buf,
1760                                         pg_enc2name_tbl[src_encoding].name,
1761                                         pg_enc2name_tbl[dest_encoding].name)));
1762 }
1763
1764
1765 #ifdef WIN32
1766 /*
1767  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1768  * string. The character length is also passed to utf16len if not
1769  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1770  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1771  */
1772 WCHAR *
1773 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1774 {
1775         int                     msgenc = GetMessageEncoding();
1776         WCHAR      *utf16;
1777         int                     dstlen;
1778         UINT            codepage;
1779
1780         if (msgenc == PG_SQL_ASCII)
1781                 /* No conversion is possible, and SQL_ASCII is never utf16. */
1782                 return NULL;
1783
1784         codepage = pg_enc2name_tbl[msgenc].codepage;
1785
1786         /*
1787          * Use MultiByteToWideChar directly if there is a corresponding codepage,
1788          * or double conversion through UTF8 if not.  Double conversion is needed,
1789          * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1790          */
1791         if (codepage != 0)
1792         {
1793                 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1794                 dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1795                 utf16[dstlen] = (WCHAR) 0;
1796         }
1797         else
1798         {
1799                 char       *utf8;
1800
1801                 /*
1802                  * XXX pg_do_encoding_conversion() requires a transaction.  In the
1803                  * absence of one, hope for the input to be valid UTF8.
1804                  */
1805                 if (IsTransactionState())
1806                 {
1807                         utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1808                                                                                                           len,
1809                                                                                                           msgenc,
1810                                                                                                           PG_UTF8);
1811                         if (utf8 != str)
1812                                 len = strlen(utf8);
1813                 }
1814                 else
1815                         utf8 = (char *) str;
1816
1817                 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1818                 dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1819                 utf16[dstlen] = (WCHAR) 0;
1820
1821                 if (utf8 != str)
1822                         pfree(utf8);
1823         }
1824
1825         if (dstlen == 0 && len > 0)
1826         {
1827                 pfree(utf16);
1828                 return NULL;                    /* error */
1829         }
1830
1831         if (utf16len)
1832                 *utf16len = dstlen;
1833         return utf16;
1834 }
1835
1836 #endif                                                  /* WIN32 */