doc: 1-byte varlena headers can be used for user PLAIN storage
[pgsql.git] / src / backend / utils / mb / mbutils.c
blob67a1ab2ab23b7aefe2725e24c5f93db6411ee122
1 /*-------------------------------------------------------------------------
3 * mbutils.c
4 * This file contains functions for encoding conversion.
6 * The string-conversion functions in this file share some API quirks.
7 * Note the following:
9 * The functions return a palloc'd, null-terminated string if conversion
10 * is required. However, if no conversion is performed, the given source
11 * string pointer is returned as-is.
13 * Although the presence of a length argument means that callers can pass
14 * non-null-terminated strings, care is required because the same string
15 * will be passed back if no conversion occurs. Such callers *must* check
16 * whether result == src and handle that case differently.
18 * If the source and destination encodings are the same, the source string
19 * is returned without any verification; it's assumed to be valid data.
20 * If that might not be the case, the caller is responsible for validating
21 * the string using a separate call to pg_verify_mbstr(). Whenever the
22 * source and destination encodings are different, the functions ensure that
23 * the result is validly encoded according to the destination encoding.
26 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
30 * IDENTIFICATION
31 * src/backend/utils/mb/mbutils.c
33 *-------------------------------------------------------------------------
35 #include "postgres.h"
37 #include "access/xact.h"
38 #include "catalog/namespace.h"
39 #include "mb/pg_wchar.h"
40 #include "utils/builtins.h"
41 #include "utils/memutils.h"
42 #include "utils/syscache.h"
43 #include "varatt.h"
46 * We maintain a simple linked list caching the fmgr lookup info for the
47 * currently selected conversion functions, as well as any that have been
48 * selected previously in the current session. (We remember previous
49 * settings because we must be able to restore a previous setting during
50 * transaction rollback, without doing any fresh catalog accesses.)
52 * Since we'll never release this data, we just keep it in TopMemoryContext.
54 typedef struct ConvProcInfo
56 int s_encoding; /* server and client encoding IDs */
57 int c_encoding;
58 FmgrInfo to_server_info; /* lookup info for conversion procs */
59 FmgrInfo to_client_info;
60 } ConvProcInfo;
62 static List *ConvProcList = NIL; /* List of ConvProcInfo */
65 * These variables point to the currently active conversion functions,
66 * or are NULL when no conversion is needed.
68 static FmgrInfo *ToServerConvProc = NULL;
69 static FmgrInfo *ToClientConvProc = NULL;
72 * This variable stores the conversion function to convert from UTF-8
73 * to the server encoding. It's NULL if the server encoding *is* UTF-8,
74 * or if we lack a conversion function for this.
76 static FmgrInfo *Utf8ToServerConvProc = NULL;
79 * These variables track the currently-selected encodings.
81 static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
82 static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
83 static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
86 * During backend startup we can't set client encoding because we (a)
87 * can't look up the conversion functions, and (b) may not know the database
88 * encoding yet either. So SetClientEncoding() just accepts anything and
89 * remembers it for InitializeClientEncoding() to apply later.
91 static bool backend_startup_complete = false;
92 static int pending_client_encoding = PG_SQL_ASCII;
95 /* Internal functions */
96 static char *perform_default_encoding_conversion(const char *src,
97 int len, bool is_client_to_server);
98 static int cliplen(const char *str, int len, int limit);
102 * Prepare for a future call to SetClientEncoding. Success should mean
103 * that SetClientEncoding is guaranteed to succeed for this encoding request.
105 * (But note that success before backend_startup_complete does not guarantee
106 * success after ...)
108 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
111 PrepareClientEncoding(int encoding)
113 int current_server_encoding;
114 ListCell *lc;
116 if (!PG_VALID_FE_ENCODING(encoding))
117 return -1;
119 /* Can't do anything during startup, per notes above */
120 if (!backend_startup_complete)
121 return 0;
123 current_server_encoding = GetDatabaseEncoding();
126 * Check for cases that require no conversion function.
128 if (current_server_encoding == encoding ||
129 current_server_encoding == PG_SQL_ASCII ||
130 encoding == PG_SQL_ASCII)
131 return 0;
133 if (IsTransactionState())
136 * If we're in a live transaction, it's safe to access the catalogs,
137 * so look up the functions. We repeat the lookup even if the info is
138 * already cached, so that we can react to changes in the contents of
139 * pg_conversion.
141 Oid to_server_proc,
142 to_client_proc;
143 ConvProcInfo *convinfo;
144 MemoryContext oldcontext;
146 to_server_proc = FindDefaultConversionProc(encoding,
147 current_server_encoding);
148 if (!OidIsValid(to_server_proc))
149 return -1;
150 to_client_proc = FindDefaultConversionProc(current_server_encoding,
151 encoding);
152 if (!OidIsValid(to_client_proc))
153 return -1;
156 * Load the fmgr info into TopMemoryContext (could still fail here)
158 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
159 sizeof(ConvProcInfo));
160 convinfo->s_encoding = current_server_encoding;
161 convinfo->c_encoding = encoding;
162 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
163 TopMemoryContext);
164 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
165 TopMemoryContext);
167 /* Attach new info to head of list */
168 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
169 ConvProcList = lcons(convinfo, ConvProcList);
170 MemoryContextSwitchTo(oldcontext);
173 * We cannot yet remove any older entry for the same encoding pair,
174 * since it could still be in use. SetClientEncoding will clean up.
177 return 0; /* success */
179 else
182 * If we're not in a live transaction, the only thing we can do is
183 * restore a previous setting using the cache. This covers all
184 * transaction-rollback cases. The only case it might not work for is
185 * trying to change client_encoding on the fly by editing
186 * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
187 * thing to do anyway.
189 foreach(lc, ConvProcList)
191 ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
193 if (oldinfo->s_encoding == current_server_encoding &&
194 oldinfo->c_encoding == encoding)
195 return 0;
198 return -1; /* it's not cached, so fail */
203 * Set the active client encoding and set up the conversion-function pointers.
204 * PrepareClientEncoding should have been called previously for this encoding.
206 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
209 SetClientEncoding(int encoding)
211 int current_server_encoding;
212 bool found;
213 ListCell *lc;
215 if (!PG_VALID_FE_ENCODING(encoding))
216 return -1;
218 /* Can't do anything during startup, per notes above */
219 if (!backend_startup_complete)
221 pending_client_encoding = encoding;
222 return 0;
225 current_server_encoding = GetDatabaseEncoding();
228 * Check for cases that require no conversion function.
230 if (current_server_encoding == encoding ||
231 current_server_encoding == PG_SQL_ASCII ||
232 encoding == PG_SQL_ASCII)
234 ClientEncoding = &pg_enc2name_tbl[encoding];
235 ToServerConvProc = NULL;
236 ToClientConvProc = NULL;
237 return 0;
241 * Search the cache for the entry previously prepared by
242 * PrepareClientEncoding; if there isn't one, we lose. While at it,
243 * release any duplicate entries so that repeated Prepare/Set cycles don't
244 * leak memory.
246 found = false;
247 foreach(lc, ConvProcList)
249 ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
251 if (convinfo->s_encoding == current_server_encoding &&
252 convinfo->c_encoding == encoding)
254 if (!found)
256 /* Found newest entry, so set up */
257 ClientEncoding = &pg_enc2name_tbl[encoding];
258 ToServerConvProc = &convinfo->to_server_info;
259 ToClientConvProc = &convinfo->to_client_info;
260 found = true;
262 else
264 /* Duplicate entry, release it */
265 ConvProcList = foreach_delete_current(ConvProcList, lc);
266 pfree(convinfo);
271 if (found)
272 return 0; /* success */
273 else
274 return -1; /* it's not cached, so fail */
278 * Initialize client encoding conversions.
279 * Called from InitPostgres() once during backend startup.
281 void
282 InitializeClientEncoding(void)
284 int current_server_encoding;
286 Assert(!backend_startup_complete);
287 backend_startup_complete = true;
289 if (PrepareClientEncoding(pending_client_encoding) < 0 ||
290 SetClientEncoding(pending_client_encoding) < 0)
293 * Oops, the requested conversion is not available. We couldn't fail
294 * before, but we can now.
296 ereport(FATAL,
297 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
298 errmsg("conversion between %s and %s is not supported",
299 pg_enc2name_tbl[pending_client_encoding].name,
300 GetDatabaseEncodingName())));
304 * Also look up the UTF8-to-server conversion function if needed. Since
305 * the server encoding is fixed within any one backend process, we don't
306 * have to do this more than once.
308 current_server_encoding = GetDatabaseEncoding();
309 if (current_server_encoding != PG_UTF8 &&
310 current_server_encoding != PG_SQL_ASCII)
312 Oid utf8_to_server_proc;
314 Assert(IsTransactionState());
315 utf8_to_server_proc =
316 FindDefaultConversionProc(PG_UTF8,
317 current_server_encoding);
318 /* If there's no such conversion, just leave the pointer as NULL */
319 if (OidIsValid(utf8_to_server_proc))
321 FmgrInfo *finfo;
323 finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
324 sizeof(FmgrInfo));
325 fmgr_info_cxt(utf8_to_server_proc, finfo,
326 TopMemoryContext);
327 /* Set Utf8ToServerConvProc only after data is fully valid */
328 Utf8ToServerConvProc = finfo;
334 * returns the current client encoding
337 pg_get_client_encoding(void)
339 return ClientEncoding->encoding;
343 * returns the current client encoding name
345 const char *
346 pg_get_client_encoding_name(void)
348 return ClientEncoding->name;
352 * Convert src string to another encoding (general case).
354 * See the notes about string conversion functions at the top of this file.
356 unsigned char *
357 pg_do_encoding_conversion(unsigned char *src, int len,
358 int src_encoding, int dest_encoding)
360 unsigned char *result;
361 Oid proc;
363 if (len <= 0)
364 return src; /* empty string is always valid */
366 if (src_encoding == dest_encoding)
367 return src; /* no conversion required, assume valid */
369 if (dest_encoding == PG_SQL_ASCII)
370 return src; /* any string is valid in SQL_ASCII */
372 if (src_encoding == PG_SQL_ASCII)
374 /* No conversion is possible, but we must validate the result */
375 (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
376 return src;
379 if (!IsTransactionState()) /* shouldn't happen */
380 elog(ERROR, "cannot perform encoding conversion outside a transaction");
382 proc = FindDefaultConversionProc(src_encoding, dest_encoding);
383 if (!OidIsValid(proc))
384 ereport(ERROR,
385 (errcode(ERRCODE_UNDEFINED_FUNCTION),
386 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
387 pg_encoding_to_char(src_encoding),
388 pg_encoding_to_char(dest_encoding))));
391 * Allocate space for conversion result, being wary of integer overflow.
393 * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
394 * required space, so it might exceed MaxAllocSize even though the result
395 * would actually fit. We do not want to hand back a result string that
396 * exceeds MaxAllocSize, because callers might not cope gracefully --- but
397 * if we just allocate more than that, and don't use it, that's fine.
399 if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
400 ereport(ERROR,
401 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
402 errmsg("out of memory"),
403 errdetail("String of %d bytes is too long for encoding conversion.",
404 len)));
406 result = (unsigned char *)
407 MemoryContextAllocHuge(CurrentMemoryContext,
408 (Size) len * MAX_CONVERSION_GROWTH + 1);
410 (void) OidFunctionCall6(proc,
411 Int32GetDatum(src_encoding),
412 Int32GetDatum(dest_encoding),
413 CStringGetDatum((char *) src),
414 CStringGetDatum((char *) result),
415 Int32GetDatum(len),
416 BoolGetDatum(false));
419 * If the result is large, it's worth repalloc'ing to release any extra
420 * space we asked for. The cutoff here is somewhat arbitrary, but we
421 * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
423 if (len > 1000000)
425 Size resultlen = strlen((char *) result);
427 if (resultlen >= MaxAllocSize)
428 ereport(ERROR,
429 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
430 errmsg("out of memory"),
431 errdetail("String of %d bytes is too long for encoding conversion.",
432 len)));
434 result = (unsigned char *) repalloc(result, resultlen + 1);
437 return result;
441 * Convert src string to another encoding.
443 * This function has a different API than the other conversion functions.
444 * The caller should've looked up the conversion function using
445 * FindDefaultConversionProc(). Unlike the other functions, the converted
446 * result is not palloc'd. It is written to the caller-supplied buffer
447 * instead.
449 * src_encoding - encoding to convert from
450 * dest_encoding - encoding to convert to
451 * src, srclen - input buffer and its length in bytes
452 * dest, destlen - destination buffer and its size in bytes
454 * The output is null-terminated.
456 * If destlen < srclen * MAX_CONVERSION_INPUT_LENGTH + 1, the converted output
457 * wouldn't necessarily fit in the output buffer, and the function will not
458 * convert the whole input.
460 * TODO: The conversion function interface is not great. Firstly, it
461 * would be nice to pass through the destination buffer size to the
462 * conversion function, so that if you pass a shorter destination buffer, it
463 * could still continue to fill up the whole buffer. Currently, we have to
464 * assume worst case expansion and stop the conversion short, even if there
465 * is in fact space left in the destination buffer. Secondly, it would be
466 * nice to return the number of bytes written to the caller, to avoid a call
467 * to strlen().
470 pg_do_encoding_conversion_buf(Oid proc,
471 int src_encoding,
472 int dest_encoding,
473 unsigned char *src, int srclen,
474 unsigned char *dest, int destlen,
475 bool noError)
477 Datum result;
480 * If the destination buffer is not large enough to hold the result in the
481 * worst case, limit the input size passed to the conversion function.
483 if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
484 srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
486 result = OidFunctionCall6(proc,
487 Int32GetDatum(src_encoding),
488 Int32GetDatum(dest_encoding),
489 CStringGetDatum((char *) src),
490 CStringGetDatum((char *) dest),
491 Int32GetDatum(srclen),
492 BoolGetDatum(noError));
493 return DatumGetInt32(result);
497 * Convert string to encoding encoding_name. The source
498 * encoding is the DB encoding.
500 * BYTEA convert_to(TEXT string, NAME encoding_name) */
501 Datum
502 pg_convert_to(PG_FUNCTION_ARGS)
504 Datum string = PG_GETARG_DATUM(0);
505 Datum dest_encoding_name = PG_GETARG_DATUM(1);
506 Datum src_encoding_name = DirectFunctionCall1(namein,
507 CStringGetDatum(DatabaseEncoding->name));
508 Datum result;
511 * pg_convert expects a bytea as its first argument. We're passing it a
512 * text argument here, relying on the fact that they are both in fact
513 * varlena types, and thus structurally identical.
515 result = DirectFunctionCall3(pg_convert, string,
516 src_encoding_name, dest_encoding_name);
518 PG_RETURN_DATUM(result);
522 * Convert string from encoding encoding_name. The destination
523 * encoding is the DB encoding.
525 * TEXT convert_from(BYTEA string, NAME encoding_name) */
526 Datum
527 pg_convert_from(PG_FUNCTION_ARGS)
529 Datum string = PG_GETARG_DATUM(0);
530 Datum src_encoding_name = PG_GETARG_DATUM(1);
531 Datum dest_encoding_name = DirectFunctionCall1(namein,
532 CStringGetDatum(DatabaseEncoding->name));
533 Datum result;
535 result = DirectFunctionCall3(pg_convert, string,
536 src_encoding_name, dest_encoding_name);
539 * pg_convert returns a bytea, which we in turn return as text, relying on
540 * the fact that they are both in fact varlena types, and thus
541 * structurally identical. Although not all bytea values are valid text,
542 * in this case it will be because we've told pg_convert to return one
543 * that is valid as text in the current database encoding.
545 PG_RETURN_DATUM(result);
549 * Convert string between two arbitrary encodings.
551 * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
553 Datum
554 pg_convert(PG_FUNCTION_ARGS)
556 bytea *string = PG_GETARG_BYTEA_PP(0);
557 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
558 int src_encoding = pg_char_to_encoding(src_encoding_name);
559 char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
560 int dest_encoding = pg_char_to_encoding(dest_encoding_name);
561 const char *src_str;
562 char *dest_str;
563 bytea *retval;
564 int len;
566 if (src_encoding < 0)
567 ereport(ERROR,
568 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
569 errmsg("invalid source encoding name \"%s\"",
570 src_encoding_name)));
571 if (dest_encoding < 0)
572 ereport(ERROR,
573 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
574 errmsg("invalid destination encoding name \"%s\"",
575 dest_encoding_name)));
577 /* make sure that source string is valid */
578 len = VARSIZE_ANY_EXHDR(string);
579 src_str = VARDATA_ANY(string);
580 (void) pg_verify_mbstr(src_encoding, src_str, len, false);
582 /* perform conversion */
583 dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
584 len,
585 src_encoding,
586 dest_encoding);
588 /* update len if conversion actually happened */
589 if (dest_str != src_str)
590 len = strlen(dest_str);
593 * build bytea data type structure.
595 retval = (bytea *) palloc(len + VARHDRSZ);
596 SET_VARSIZE(retval, len + VARHDRSZ);
597 memcpy(VARDATA(retval), dest_str, len);
599 if (dest_str != src_str)
600 pfree(dest_str);
602 /* free memory if allocated by the toaster */
603 PG_FREE_IF_COPY(string, 0);
605 PG_RETURN_BYTEA_P(retval);
609 * get the length of the string considered as text in the specified
610 * encoding. Raises an error if the data is not valid in that
611 * encoding.
613 * INT4 length (BYTEA string, NAME src_encoding_name)
615 Datum
616 length_in_encoding(PG_FUNCTION_ARGS)
618 bytea *string = PG_GETARG_BYTEA_PP(0);
619 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
620 int src_encoding = pg_char_to_encoding(src_encoding_name);
621 const char *src_str;
622 int len;
623 int retval;
625 if (src_encoding < 0)
626 ereport(ERROR,
627 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
628 errmsg("invalid encoding name \"%s\"",
629 src_encoding_name)));
631 len = VARSIZE_ANY_EXHDR(string);
632 src_str = VARDATA_ANY(string);
634 retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
636 PG_RETURN_INT32(retval);
640 * Get maximum multibyte character length in the specified encoding.
642 * Note encoding is specified numerically, not by name as above.
644 Datum
645 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
647 int encoding = PG_GETARG_INT32(0);
649 if (PG_VALID_ENCODING(encoding))
650 PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
651 else
652 PG_RETURN_NULL();
656 * Convert client encoding to server encoding.
658 * See the notes about string conversion functions at the top of this file.
660 char *
661 pg_client_to_server(const char *s, int len)
663 return pg_any_to_server(s, len, ClientEncoding->encoding);
667 * Convert any encoding to server encoding.
669 * See the notes about string conversion functions at the top of this file.
671 * Unlike the other string conversion functions, this will apply validation
672 * even if encoding == DatabaseEncoding->encoding. This is because this is
673 * used to process data coming in from outside the database, and we never
674 * want to just assume validity.
676 char *
677 pg_any_to_server(const char *s, int len, int encoding)
679 if (len <= 0)
680 return unconstify(char *, s); /* empty string is always valid */
682 if (encoding == DatabaseEncoding->encoding ||
683 encoding == PG_SQL_ASCII)
686 * No conversion is needed, but we must still validate the data.
688 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
689 return unconstify(char *, s);
692 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
695 * No conversion is possible, but we must still validate the data,
696 * because the client-side code might have done string escaping using
697 * the selected client_encoding. If the client encoding is ASCII-safe
698 * then we just do a straight validation under that encoding. For an
699 * ASCII-unsafe encoding we have a problem: we dare not pass such data
700 * to the parser but we have no way to convert it. We compromise by
701 * rejecting the data if it contains any non-ASCII characters.
703 if (PG_VALID_BE_ENCODING(encoding))
704 (void) pg_verify_mbstr(encoding, s, len, false);
705 else
707 int i;
709 for (i = 0; i < len; i++)
711 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
712 ereport(ERROR,
713 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
714 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
715 pg_enc2name_tbl[PG_SQL_ASCII].name,
716 (unsigned char) s[i])));
719 return unconstify(char *, s);
722 /* Fast path if we can use cached conversion function */
723 if (encoding == ClientEncoding->encoding)
724 return perform_default_encoding_conversion(s, len, true);
726 /* General case ... will not work outside transactions */
727 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
728 len,
729 encoding,
730 DatabaseEncoding->encoding);
734 * Convert server encoding to client encoding.
736 * See the notes about string conversion functions at the top of this file.
738 char *
739 pg_server_to_client(const char *s, int len)
741 return pg_server_to_any(s, len, ClientEncoding->encoding);
745 * Convert server encoding to any encoding.
747 * See the notes about string conversion functions at the top of this file.
749 char *
750 pg_server_to_any(const char *s, int len, int encoding)
752 if (len <= 0)
753 return unconstify(char *, s); /* empty string is always valid */
755 if (encoding == DatabaseEncoding->encoding ||
756 encoding == PG_SQL_ASCII)
757 return unconstify(char *, s); /* assume data is valid */
759 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
761 /* No conversion is possible, but we must validate the result */
762 (void) pg_verify_mbstr(encoding, s, len, false);
763 return unconstify(char *, s);
766 /* Fast path if we can use cached conversion function */
767 if (encoding == ClientEncoding->encoding)
768 return perform_default_encoding_conversion(s, len, false);
770 /* General case ... will not work outside transactions */
771 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
772 len,
773 DatabaseEncoding->encoding,
774 encoding);
778 * Perform default encoding conversion using cached FmgrInfo. Since
779 * this function does not access database at all, it is safe to call
780 * outside transactions. If the conversion has not been set up by
781 * SetClientEncoding(), no conversion is performed.
783 static char *
784 perform_default_encoding_conversion(const char *src, int len,
785 bool is_client_to_server)
787 char *result;
788 int src_encoding,
789 dest_encoding;
790 FmgrInfo *flinfo;
792 if (is_client_to_server)
794 src_encoding = ClientEncoding->encoding;
795 dest_encoding = DatabaseEncoding->encoding;
796 flinfo = ToServerConvProc;
798 else
800 src_encoding = DatabaseEncoding->encoding;
801 dest_encoding = ClientEncoding->encoding;
802 flinfo = ToClientConvProc;
805 if (flinfo == NULL)
806 return unconstify(char *, src);
809 * Allocate space for conversion result, being wary of integer overflow.
810 * See comments in pg_do_encoding_conversion.
812 if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
813 ereport(ERROR,
814 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
815 errmsg("out of memory"),
816 errdetail("String of %d bytes is too long for encoding conversion.",
817 len)));
819 result = (char *)
820 MemoryContextAllocHuge(CurrentMemoryContext,
821 (Size) len * MAX_CONVERSION_GROWTH + 1);
823 FunctionCall6(flinfo,
824 Int32GetDatum(src_encoding),
825 Int32GetDatum(dest_encoding),
826 CStringGetDatum(src),
827 CStringGetDatum(result),
828 Int32GetDatum(len),
829 BoolGetDatum(false));
832 * Release extra space if there might be a lot --- see comments in
833 * pg_do_encoding_conversion.
835 if (len > 1000000)
837 Size resultlen = strlen(result);
839 if (resultlen >= MaxAllocSize)
840 ereport(ERROR,
841 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
842 errmsg("out of memory"),
843 errdetail("String of %d bytes is too long for encoding conversion.",
844 len)));
846 result = (char *) repalloc(result, resultlen + 1);
849 return result;
853 * Convert a single Unicode code point into a string in the server encoding.
855 * The code point given by "c" is converted and stored at *s, which must
856 * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
857 * The output will have a trailing '\0'. Throws error if the conversion
858 * cannot be performed.
860 * Note that this relies on having previously looked up any required
861 * conversion function. That's partly for speed but mostly because the parser
862 * may call this outside any transaction, or in an aborted transaction.
864 void
865 pg_unicode_to_server(pg_wchar c, unsigned char *s)
867 unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
868 int c_as_utf8_len;
869 int server_encoding;
872 * Complain if invalid Unicode code point. The choice of errcode here is
873 * debatable, but really our caller should have checked this anyway.
875 if (!is_valid_unicode_codepoint(c))
876 ereport(ERROR,
877 (errcode(ERRCODE_SYNTAX_ERROR),
878 errmsg("invalid Unicode code point")));
880 /* Otherwise, if it's in ASCII range, conversion is trivial */
881 if (c <= 0x7F)
883 s[0] = (unsigned char) c;
884 s[1] = '\0';
885 return;
888 /* If the server encoding is UTF-8, we just need to reformat the code */
889 server_encoding = GetDatabaseEncoding();
890 if (server_encoding == PG_UTF8)
892 unicode_to_utf8(c, s);
893 s[pg_utf_mblen(s)] = '\0';
894 return;
897 /* For all other cases, we must have a conversion function available */
898 if (Utf8ToServerConvProc == NULL)
899 ereport(ERROR,
900 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
901 errmsg("conversion between %s and %s is not supported",
902 pg_enc2name_tbl[PG_UTF8].name,
903 GetDatabaseEncodingName())));
905 /* Construct UTF-8 source string */
906 unicode_to_utf8(c, c_as_utf8);
907 c_as_utf8_len = pg_utf_mblen(c_as_utf8);
908 c_as_utf8[c_as_utf8_len] = '\0';
910 /* Convert, or throw error if we can't */
911 FunctionCall6(Utf8ToServerConvProc,
912 Int32GetDatum(PG_UTF8),
913 Int32GetDatum(server_encoding),
914 CStringGetDatum((char *) c_as_utf8),
915 CStringGetDatum((char *) s),
916 Int32GetDatum(c_as_utf8_len),
917 BoolGetDatum(false));
921 * Convert a single Unicode code point into a string in the server encoding.
923 * Same as pg_unicode_to_server(), except that we don't throw errors,
924 * but simply return false on conversion failure.
926 bool
927 pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
929 unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
930 int c_as_utf8_len;
931 int converted_len;
932 int server_encoding;
934 /* Fail if invalid Unicode code point */
935 if (!is_valid_unicode_codepoint(c))
936 return false;
938 /* Otherwise, if it's in ASCII range, conversion is trivial */
939 if (c <= 0x7F)
941 s[0] = (unsigned char) c;
942 s[1] = '\0';
943 return true;
946 /* If the server encoding is UTF-8, we just need to reformat the code */
947 server_encoding = GetDatabaseEncoding();
948 if (server_encoding == PG_UTF8)
950 unicode_to_utf8(c, s);
951 s[pg_utf_mblen(s)] = '\0';
952 return true;
955 /* For all other cases, we must have a conversion function available */
956 if (Utf8ToServerConvProc == NULL)
957 return false;
959 /* Construct UTF-8 source string */
960 unicode_to_utf8(c, c_as_utf8);
961 c_as_utf8_len = pg_utf_mblen(c_as_utf8);
962 c_as_utf8[c_as_utf8_len] = '\0';
964 /* Convert, but without throwing error if we can't */
965 converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
966 Int32GetDatum(PG_UTF8),
967 Int32GetDatum(server_encoding),
968 CStringGetDatum((char *) c_as_utf8),
969 CStringGetDatum((char *) s),
970 Int32GetDatum(c_as_utf8_len),
971 BoolGetDatum(true)));
973 /* Conversion was successful iff it consumed the whole input */
974 return (converted_len == c_as_utf8_len);
978 /* convert a multibyte string to a wchar */
980 pg_mb2wchar(const char *from, pg_wchar *to)
982 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
985 /* convert a multibyte string to a wchar with a limited length */
987 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
989 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
992 /* same, with any encoding */
994 pg_encoding_mb2wchar_with_len(int encoding,
995 const char *from, pg_wchar *to, int len)
997 return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
1000 /* convert a wchar string to a multibyte */
1002 pg_wchar2mb(const pg_wchar *from, char *to)
1004 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
1007 /* convert a wchar string to a multibyte with a limited length */
1009 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
1011 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1014 /* same, with any encoding */
1016 pg_encoding_wchar2mb_with_len(int encoding,
1017 const pg_wchar *from, char *to, int len)
1019 return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1022 /* returns the byte length of a multibyte character */
1024 pg_mblen(const char *mbstr)
1026 return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1029 /* returns the display length of a multibyte character */
1031 pg_dsplen(const char *mbstr)
1033 return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
/*
 * Count the characters (wchars) in a null-terminated multibyte string in
 * the database encoding.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			nchars;

	/* single-byte encoding: every byte is one character */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* otherwise step through the string one character at a time */
	for (nchars = 0; *mbstr; nchars++)
		mbstr += pg_mblen(mbstr);

	return nchars;
}
/*
 * Count the characters (wchars) in a multibyte string that is not
 * necessarily null-terminated; "limit" is the string's length in bytes.
 *
 * For a single-byte database encoding the answer is "limit" itself.
 * Otherwise counting stops once "limit" bytes are used up or a zero byte is
 * reached.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars;

	/* single-byte encoding: every byte is one character */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	for (nchars = 0; limit > 0 && *mbstr; nchars++)
	{
		int			step = pg_mblen(mbstr);

		mbstr += step;
		limit -= step;
	}

	return nchars;
}
1078 * returns the byte length of a multibyte string
1079 * (not necessarily NULL terminated)
1080 * that is no longer than limit.
1081 * this function does not break multibyte character boundary.
1084 pg_mbcliplen(const char *mbstr, int len, int limit)
1086 return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
1087 len, limit);
1091 * pg_mbcliplen with specified encoding
1094 pg_encoding_mbcliplen(int encoding, const char *mbstr,
1095 int len, int limit)
1097 mblen_converter mblen_fn;
1098 int clen = 0;
1099 int l;
1101 /* optimization for single byte encoding */
1102 if (pg_encoding_max_length(encoding) == 1)
1103 return cliplen(mbstr, len, limit);
1105 mblen_fn = pg_wchar_table[encoding].mblen;
1107 while (len > 0 && *mbstr)
1109 l = (*mblen_fn) ((const unsigned char *) mbstr);
1110 if ((clen + l) > limit)
1111 break;
1112 clen += l;
1113 if (clen == limit)
1114 break;
1115 len -= l;
1116 mbstr += l;
1118 return clen;
/*
 * Like pg_mbcliplen, except that "limit" is a number of characters rather
 * than a number of bytes.  The return value is still a byte length.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			nbytes = 0;
	int			nchars = 0;

	/* In a single-byte encoding, character and byte limits are the same. */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr != '\0')
	{
		int			chlen = pg_mblen(mbstr);

		/* Stop as soon as this character would exceed the limit. */
		if (++nchars > limit)
			break;

		nbytes += chlen;
		len -= chlen;
		mbstr += chlen;
	}

	return nbytes;
}
/*
 * mbcliplen for any single-byte encoding: count bytes of str up to the
 * smaller of len and limit, stopping early at a zero byte.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			maxbytes = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < maxbytes; n++)
	{
		if (str[n] == '\0')
			break;
	}

	return n;
}
1161 void
1162 SetDatabaseEncoding(int encoding)
1164 if (!PG_VALID_BE_ENCODING(encoding))
1165 elog(ERROR, "invalid database encoding: %d", encoding);
1167 DatabaseEncoding = &pg_enc2name_tbl[encoding];
1168 Assert(DatabaseEncoding->encoding == encoding);
1171 void
1172 SetMessageEncoding(int encoding)
1174 /* Some calls happen before we can elog()! */
1175 Assert(PG_VALID_ENCODING(encoding));
1177 MessageEncoding = &pg_enc2name_tbl[encoding];
1178 Assert(MessageEncoding->encoding == encoding);
1181 #ifdef ENABLE_NLS
1183 * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1184 * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1185 * fail for gettext-internal causes like out-of-memory.
1187 static bool
1188 raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1190 bool elog_ok = (CurrentMemoryContext != NULL);
1191 int i;
1193 for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
1195 if (pg_enc2gettext_tbl[i].encoding == encoding)
1197 if (bind_textdomain_codeset(domainname,
1198 pg_enc2gettext_tbl[i].name) != NULL)
1199 return true;
1201 if (elog_ok)
1202 elog(LOG, "bind_textdomain_codeset failed");
1203 else
1204 write_stderr("bind_textdomain_codeset failed");
1206 break;
1210 return false;
1214 * Bind a gettext message domain to the codeset corresponding to the database
1215 * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1216 * Return the MessageEncoding implied by the new settings.
1218 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1219 * When that matches the database encoding, we don't need to do anything. In
1220 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1221 * database encoding, except for the C locale. (On Windows, we also permit a
1222 * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
1223 * gettext to the right codeset.
1225 * On Windows, gettext defaults to the Windows ANSI code page. This is a
1226 * convenient departure for software that passes the strings to Windows ANSI
1227 * APIs, but we don't do that. Compel gettext to use database encoding or,
1228 * failing that, the LC_CTYPE encoding as it would on other platforms.
1230 * This function is called before elog() and palloc() are usable.
1233 pg_bind_textdomain_codeset(const char *domainname)
1235 bool elog_ok = (CurrentMemoryContext != NULL);
1236 int encoding = GetDatabaseEncoding();
1237 int new_msgenc;
1239 #ifndef WIN32
1240 const char *ctype = setlocale(LC_CTYPE, NULL);
1242 if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1243 #endif
1244 if (encoding != PG_SQL_ASCII &&
1245 raw_pg_bind_textdomain_codeset(domainname, encoding))
1246 return encoding;
1248 new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1249 if (new_msgenc < 0)
1250 new_msgenc = PG_SQL_ASCII;
1252 #ifdef WIN32
1253 if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1254 /* On failure, the old message encoding remains valid. */
1255 return GetMessageEncoding();
1256 #endif
1258 return new_msgenc;
1260 #endif
1263 * The database encoding, also called the server encoding, represents the
1264 * encoding of data stored in text-like data types. Affected types include
1265 * cstring, text, varchar, name, xml, and json.
1268 GetDatabaseEncoding(void)
1270 return DatabaseEncoding->encoding;
1273 const char *
1274 GetDatabaseEncodingName(void)
1276 return DatabaseEncoding->name;
1279 Datum
1280 getdatabaseencoding(PG_FUNCTION_ARGS)
1282 return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1285 Datum
1286 pg_client_encoding(PG_FUNCTION_ARGS)
1288 return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1291 Datum
1292 PG_char_to_encoding(PG_FUNCTION_ARGS)
1294 Name s = PG_GETARG_NAME(0);
1296 PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
1299 Datum
1300 PG_encoding_to_char(PG_FUNCTION_ARGS)
1302 int32 encoding = PG_GETARG_INT32(0);
1303 const char *encoding_name = pg_encoding_to_char(encoding);
1305 return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1309 * gettext() returns messages in this encoding. This often matches the
1310 * database encoding, but it differs for SQL_ASCII databases, for processes
1311 * not attached to a database, and under a database encoding lacking iconv
1312 * support (MULE_INTERNAL).
1315 GetMessageEncoding(void)
1317 return MessageEncoding->encoding;
1322 * Generic character incrementer function.
1324 * Not knowing anything about the properties of the encoding in use, we just
1325 * keep incrementing the last byte until we get a validly-encoded result,
1326 * or we run out of values to try. We don't bother to try incrementing
1327 * higher-order bytes, so there's no growth in runtime for wider characters.
1328 * (If we did try to do that, we'd need to consider the likelihood that 255
1329 * is not a valid final byte in the encoding.)
1331 static bool
1332 pg_generic_charinc(unsigned char *charptr, int len)
1334 unsigned char *lastbyte = charptr + len - 1;
1335 mbchar_verifier mbverify;
1337 /* We can just invoke the character verifier directly. */
1338 mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
1340 while (*lastbyte < (unsigned char) 255)
1342 (*lastbyte)++;
1343 if ((*mbverify) (charptr, len) == len)
1344 return true;
1347 return false;
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  We also
 * need some special-case logic to skip regions used for surrogate pair
 * handling, as those should not occur in valid UTF-8.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 *
 * Returns true if a byte was incremented in place, false otherwise.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char lead;

	/* Only lengths 1..4 are handled; reject lengths 5 and 6 for now. */
	if (length < 1 || length > 4)
		return false;

	/* Fourth byte, if present and not yet at its maximum. */
	if (length == 4 && charptr[3] < 0xBF)
	{
		charptr[3]++;
		return true;
	}

	/* Third byte, likewise. */
	if (length >= 3 && charptr[2] < 0xBF)
	{
		charptr[2]++;
		return true;
	}

	/* Second byte: its maximum depends on the lead byte. */
	if (length >= 2)
	{
		unsigned char second_max;

		lead = charptr[0];
		if (lead == 0xED)
			second_max = 0x9F;
		else if (lead == 0xF4)
			second_max = 0x8F;
		else
			second_max = 0xBF;

		if (charptr[1] < second_max)
		{
			charptr[1]++;
			return true;
		}
	}

	/* Lead byte: fail at the top of each length class. */
	lead = charptr[0];
	if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
		return false;

	charptr[0]++;
	return true;
}
1424 * EUC-JP character incrementer function.
1426 * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1427 * representing JIS X 0201 characters with the second byte ranging between
1428 * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1429 * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1431 * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1432 * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1433 * is incremented if possible, otherwise the second-to-last byte.
1435 * If the sequence starts with a value other than the above and its MSB
1436 * is set, it must be a two-byte sequence representing JIS X 0208 characters
1437 * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1438 * incremented if possible, otherwise the second-to-last byte.
1440 * Otherwise, the sequence is a single-byte ASCII character. It is
1441 * incremented up to 0x7f.
1443 static bool
1444 pg_eucjp_increment(unsigned char *charptr, int length)
1446 unsigned char c1,
1448 int i;
1450 c1 = *charptr;
1452 switch (c1)
1454 case SS2: /* JIS X 0201 */
1455 if (length != 2)
1456 return false;
1458 c2 = charptr[1];
1460 if (c2 >= 0xdf)
1461 charptr[0] = charptr[1] = 0xa1;
1462 else if (c2 < 0xa1)
1463 charptr[1] = 0xa1;
1464 else
1465 charptr[1]++;
1466 break;
1468 case SS3: /* JIS X 0212 */
1469 if (length != 3)
1470 return false;
1472 for (i = 2; i > 0; i--)
1474 c2 = charptr[i];
1475 if (c2 < 0xa1)
1477 charptr[i] = 0xa1;
1478 return true;
1480 else if (c2 < 0xfe)
1482 charptr[i]++;
1483 return true;
1487 /* Out of 3-byte code region */
1488 return false;
1490 default:
1491 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1493 if (length != 2)
1494 return false;
1496 for (i = 1; i >= 0; i--)
1498 c2 = charptr[i];
1499 if (c2 < 0xa1)
1501 charptr[i] = 0xa1;
1502 return true;
1504 else if (c2 < 0xfe)
1506 charptr[i]++;
1507 return true;
1511 /* Out of 2 byte code region */
1512 return false;
1514 else
1515 { /* ASCII, single byte */
1516 if (c1 > 0x7e)
1517 return false;
1518 (*charptr)++;
1520 break;
1523 return true;
1527 * get the character incrementer for the encoding for the current database
1529 mbcharacter_incrementer
1530 pg_database_encoding_character_incrementer(void)
1533 * Eventually it might be best to add a field to pg_wchar_table[], but for
1534 * now we just use a switch.
1536 switch (GetDatabaseEncoding())
1538 case PG_UTF8:
1539 return pg_utf8_increment;
1541 case PG_EUC_JP:
1542 return pg_eucjp_increment;
1544 default:
1545 return pg_generic_charinc;
1550 * fetch maximum length of the encoding for the current database
1553 pg_database_encoding_max_length(void)
1555 return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
/*
 * Verify that mbstr is validly encoded in the current database encoding.
 * Otherwise same as pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			enc = GetDatabaseEncoding();

	return pg_verify_mbstr(enc, mbstr, len, noError);
}
1569 * Verify mbstr to make sure that it is validly encoded in the specified
1570 * encoding.
1572 bool
1573 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1575 int oklen;
1577 Assert(PG_VALID_ENCODING(encoding));
1579 oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1580 if (oklen != len)
1582 if (noError)
1583 return false;
1584 report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1586 return true;
1590 * Verify mbstr to make sure that it is validly encoded in the specified
1591 * encoding.
1593 * mbstr is not necessarily zero terminated; length of mbstr is
1594 * specified by len.
1596 * If OK, return length of string in the encoding.
1597 * If a problem is found, return -1 when noError is
1598 * true; when noError is false, ereport() a descriptive message.
1600 * Note: We cannot use the faster encoding-specific mbverifystr() function
1601 * here, because we need to count the number of characters in the string.
1604 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1606 mbchar_verifier mbverifychar;
1607 int mb_len;
1609 Assert(PG_VALID_ENCODING(encoding));
1612 * In single-byte encodings, we need only reject nulls (\0).
1614 if (pg_encoding_max_length(encoding) <= 1)
1616 const char *nullpos = memchr(mbstr, 0, len);
1618 if (nullpos == NULL)
1619 return len;
1620 if (noError)
1621 return -1;
1622 report_invalid_encoding(encoding, nullpos, 1);
1625 /* fetch function pointer just once */
1626 mbverifychar = pg_wchar_table[encoding].mbverifychar;
1628 mb_len = 0;
1630 while (len > 0)
1632 int l;
1634 /* fast path for ASCII-subset characters */
1635 if (!IS_HIGHBIT_SET(*mbstr))
1637 if (*mbstr != '\0')
1639 mb_len++;
1640 mbstr++;
1641 len--;
1642 continue;
1644 if (noError)
1645 return -1;
1646 report_invalid_encoding(encoding, mbstr, len);
1649 l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1651 if (l < 0)
1653 if (noError)
1654 return -1;
1655 report_invalid_encoding(encoding, mbstr, len);
1658 mbstr += l;
1659 len -= l;
1660 mb_len++;
1662 return mb_len;
1666 * check_encoding_conversion_args: check arguments of a conversion function
1668 * "expected" arguments can be either an encoding ID or -1 to indicate that
1669 * the caller will check whether it accepts the ID.
1671 * Note: the errors here are not really user-facing, so elog instead of
1672 * ereport seems sufficient. Also, we trust that the "expected" encoding
1673 * arguments are valid encoding IDs, but we don't trust the actuals.
1675 void
1676 check_encoding_conversion_args(int src_encoding,
1677 int dest_encoding,
1678 int len,
1679 int expected_src_encoding,
1680 int expected_dest_encoding)
1682 if (!PG_VALID_ENCODING(src_encoding))
1683 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1684 if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1685 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1686 pg_enc2name_tbl[expected_src_encoding].name,
1687 pg_enc2name_tbl[src_encoding].name);
1688 if (!PG_VALID_ENCODING(dest_encoding))
1689 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1690 if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1691 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1692 pg_enc2name_tbl[expected_dest_encoding].name,
1693 pg_enc2name_tbl[dest_encoding].name);
1694 if (len < 0)
1695 elog(ERROR, "encoding conversion length must not be negative");
1699 * report_invalid_encoding: complain about invalid multibyte character
1701 * note: len is remaining length of string, not length of character;
1702 * len must be greater than zero, as we always examine the first byte.
1704 void
1705 report_invalid_encoding(int encoding, const char *mbstr, int len)
1707 int l = pg_encoding_mblen(encoding, mbstr);
1708 char buf[8 * 5 + 1];
1709 char *p = buf;
1710 int j,
1711 jlimit;
1713 jlimit = Min(l, len);
1714 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1716 for (j = 0; j < jlimit; j++)
1718 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1719 if (j < jlimit - 1)
1720 p += sprintf(p, " ");
1723 ereport(ERROR,
1724 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1725 errmsg("invalid byte sequence for encoding \"%s\": %s",
1726 pg_enc2name_tbl[encoding].name,
1727 buf)));
1731 * report_untranslatable_char: complain about untranslatable character
1733 * note: len is remaining length of string, not length of character;
1734 * len must be greater than zero, as we always examine the first byte.
1736 void
1737 report_untranslatable_char(int src_encoding, int dest_encoding,
1738 const char *mbstr, int len)
1740 int l = pg_encoding_mblen(src_encoding, mbstr);
1741 char buf[8 * 5 + 1];
1742 char *p = buf;
1743 int j,
1744 jlimit;
1746 jlimit = Min(l, len);
1747 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1749 for (j = 0; j < jlimit; j++)
1751 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1752 if (j < jlimit - 1)
1753 p += sprintf(p, " ");
1756 ereport(ERROR,
1757 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1758 errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1759 buf,
1760 pg_enc2name_tbl[src_encoding].name,
1761 pg_enc2name_tbl[dest_encoding].name)));
1765 #ifdef WIN32
1767 * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1768 * string. The character length is also passed to utf16len if not
1769 * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1770 * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1772 WCHAR *
1773 pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1775 int msgenc = GetMessageEncoding();
1776 WCHAR *utf16;
1777 int dstlen;
1778 UINT codepage;
1780 if (msgenc == PG_SQL_ASCII)
1781 /* No conversion is possible, and SQL_ASCII is never utf16. */
1782 return NULL;
1784 codepage = pg_enc2name_tbl[msgenc].codepage;
1787 * Use MultiByteToWideChar directly if there is a corresponding codepage,
1788 * or double conversion through UTF8 if not. Double conversion is needed,
1789 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1791 if (codepage != 0)
1793 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1794 dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1795 utf16[dstlen] = (WCHAR) 0;
1797 else
1799 char *utf8;
1802 * XXX pg_do_encoding_conversion() requires a transaction. In the
1803 * absence of one, hope for the input to be valid UTF8.
1805 if (IsTransactionState())
1807 utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1808 len,
1809 msgenc,
1810 PG_UTF8);
1811 if (utf8 != str)
1812 len = strlen(utf8);
1814 else
1815 utf8 = (char *) str;
1817 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1818 dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1819 utf16[dstlen] = (WCHAR) 0;
1821 if (utf8 != str)
1822 pfree(utf8);
1825 if (dstlen == 0 && len > 0)
1827 pfree(utf16);
1828 return NULL; /* error */
1831 if (utf16len)
1832 *utf16len = dstlen;
1833 return utf16;
1836 #endif /* WIN32 */