doc: 1-byte varlena headers can be used for user PLAIN storage
[pgsql.git] / src / backend / utils / mb / conv.c
blob82bc1ac6af37e125be0ce4ec396aefa74209bb7b
1 /*-------------------------------------------------------------------------
3 * Utility functions for conversion procs.
5 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
8 * IDENTIFICATION
9 * src/backend/utils/mb/conv.c
11 *-------------------------------------------------------------------------
13 #include "postgres.h"
14 #include "mb/pg_wchar.h"
18 * local2local: a generic single byte charset encoding
19 * conversion between two ASCII-superset encodings.
21 * l points to the source string of length len
22 * p is the output area (must be large enough!)
23 * src_encoding is the PG identifier for the source encoding
24 * dest_encoding is the PG identifier for the target encoding
25 * tab holds conversion entries for the source charset
26 * starting from 128 (0x80). each entry in the table holds the corresponding
27 * code point for the target charset, or 0 if there is no equivalent code.
29 * Returns the number of input bytes consumed. If noError is true, this can
30 * be less than 'len'.
32 int
33 local2local(const unsigned char *l,
34 unsigned char *p,
35 int len,
36 int src_encoding,
37 int dest_encoding,
38 const unsigned char *tab,
39 bool noError)
41 const unsigned char *start = l;
42 unsigned char c1,
43 c2;
45 while (len > 0)
47 c1 = *l;
48 if (c1 == 0)
50 if (noError)
51 break;
52 report_invalid_encoding(src_encoding, (const char *) l, len);
54 if (!IS_HIGHBIT_SET(c1))
55 *p++ = c1;
56 else
58 c2 = tab[c1 - HIGHBIT];
59 if (c2)
60 *p++ = c2;
61 else
63 if (noError)
64 break;
65 report_untranslatable_char(src_encoding, dest_encoding,
66 (const char *) l, len);
69 l++;
70 len--;
72 *p = '\0';
74 return l - start;
78 * LATINn ---> MIC when the charset's local codes map directly to MIC
80 * l points to the source string of length len
81 * p is the output area (must be large enough!)
82 * lc is the mule character set id for the local encoding
83 * encoding is the PG identifier for the local encoding
85 * Returns the number of input bytes consumed. If noError is true, this can
86 * be less than 'len'.
88 int
89 latin2mic(const unsigned char *l, unsigned char *p, int len,
90 int lc, int encoding, bool noError)
92 const unsigned char *start = l;
93 int c1;
95 while (len > 0)
97 c1 = *l;
98 if (c1 == 0)
100 if (noError)
101 break;
102 report_invalid_encoding(encoding, (const char *) l, len);
104 if (IS_HIGHBIT_SET(c1))
105 *p++ = lc;
106 *p++ = c1;
107 l++;
108 len--;
110 *p = '\0';
112 return l - start;
116 * MIC ---> LATINn when the charset's local codes map directly to MIC
118 * mic points to the source string of length len
119 * p is the output area (must be large enough!)
120 * lc is the mule character set id for the local encoding
121 * encoding is the PG identifier for the local encoding
123 * Returns the number of input bytes consumed. If noError is true, this can
124 * be less than 'len'.
127 mic2latin(const unsigned char *mic, unsigned char *p, int len,
128 int lc, int encoding, bool noError)
130 const unsigned char *start = mic;
131 int c1;
133 while (len > 0)
135 c1 = *mic;
136 if (c1 == 0)
138 if (noError)
139 break;
140 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
142 if (!IS_HIGHBIT_SET(c1))
144 /* easy for ASCII */
145 *p++ = c1;
146 mic++;
147 len--;
149 else
151 int l = pg_mule_mblen(mic);
153 if (len < l)
155 if (noError)
156 break;
157 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
158 len);
160 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
162 if (noError)
163 break;
164 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
165 (const char *) mic, len);
167 *p++ = mic[1];
168 mic += 2;
169 len -= 2;
172 *p = '\0';
174 return mic - start;
179 * latin2mic_with_table: a generic single byte charset encoding
180 * conversion from a local charset to the mule internal code.
182 * l points to the source string of length len
183 * p is the output area (must be large enough!)
184 * lc is the mule character set id for the local encoding
185 * encoding is the PG identifier for the local encoding
186 * tab holds conversion entries for the local charset
187 * starting from 128 (0x80). each entry in the table holds the corresponding
188 * code point for the mule encoding, or 0 if there is no equivalent code.
190 * Returns the number of input bytes consumed. If noError is true, this can
191 * be less than 'len'.
194 latin2mic_with_table(const unsigned char *l,
195 unsigned char *p,
196 int len,
197 int lc,
198 int encoding,
199 const unsigned char *tab,
200 bool noError)
202 const unsigned char *start = l;
203 unsigned char c1,
206 while (len > 0)
208 c1 = *l;
209 if (c1 == 0)
211 if (noError)
212 break;
213 report_invalid_encoding(encoding, (const char *) l, len);
215 if (!IS_HIGHBIT_SET(c1))
216 *p++ = c1;
217 else
219 c2 = tab[c1 - HIGHBIT];
220 if (c2)
222 *p++ = lc;
223 *p++ = c2;
225 else
227 if (noError)
228 break;
229 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
230 (const char *) l, len);
233 l++;
234 len--;
236 *p = '\0';
238 return l - start;
242 * mic2latin_with_table: a generic single byte charset encoding
243 * conversion from the mule internal code to a local charset.
245 * mic points to the source string of length len
246 * p is the output area (must be large enough!)
247 * lc is the mule character set id for the local encoding
248 * encoding is the PG identifier for the local encoding
249 * tab holds conversion entries for the mule internal code's second byte,
250 * starting from 128 (0x80). each entry in the table holds the corresponding
251 * code point for the local charset, or 0 if there is no equivalent code.
253 * Returns the number of input bytes consumed. If noError is true, this can
254 * be less than 'len'.
257 mic2latin_with_table(const unsigned char *mic,
258 unsigned char *p,
259 int len,
260 int lc,
261 int encoding,
262 const unsigned char *tab,
263 bool noError)
265 const unsigned char *start = mic;
266 unsigned char c1,
269 while (len > 0)
271 c1 = *mic;
272 if (c1 == 0)
274 if (noError)
275 break;
276 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
278 if (!IS_HIGHBIT_SET(c1))
280 /* easy for ASCII */
281 *p++ = c1;
282 mic++;
283 len--;
285 else
287 int l = pg_mule_mblen(mic);
289 if (len < l)
291 if (noError)
292 break;
293 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
294 len);
296 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
297 (c2 = tab[mic[1] - HIGHBIT]) == 0)
299 if (noError)
300 break;
301 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
302 (const char *) mic, len);
303 break; /* keep compiler quiet */
305 *p++ = c2;
306 mic += 2;
307 len -= 2;
310 *p = '\0';
312 return mic - start;
316 * comparison routine for bsearch()
317 * this routine is intended for combined UTF8 -> local code
319 static int
320 compare3(const void *p1, const void *p2)
322 uint32 s1,
327 s1 = *(const uint32 *) p1;
328 s2 = *((const uint32 *) p1 + 1);
329 d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330 d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331 return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
335 * comparison routine for bsearch()
336 * this routine is intended for local code -> combined UTF8
338 static int
339 compare4(const void *p1, const void *p2)
341 uint32 v1,
344 v1 = *(const uint32 *) p1;
345 v2 = ((const pg_local_to_utf_combined *) p2)->code;
346 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
350 * store 32bit character representation into multibyte stream
352 static inline unsigned char *
353 store_coded_char(unsigned char *dest, uint32 code)
355 if (code & 0xff000000)
356 *dest++ = code >> 24;
357 if (code & 0x00ff0000)
358 *dest++ = code >> 16;
359 if (code & 0x0000ff00)
360 *dest++ = code >> 8;
361 if (code & 0x000000ff)
362 *dest++ = code;
363 return dest;
367 * Convert a character using a conversion radix tree.
369 * 'l' is the length of the input character in bytes, and b1-b4 are
370 * the input character's bytes.
372 static inline uint32
373 pg_mb_radix_conv(const pg_mb_radix_tree *rt,
374 int l,
375 unsigned char b1,
376 unsigned char b2,
377 unsigned char b3,
378 unsigned char b4)
380 if (l == 4)
382 /* 4-byte code */
384 /* check code validity */
385 if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
386 b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
387 b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
388 b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
389 return 0;
391 /* perform lookup */
392 if (rt->chars32)
394 uint32 idx = rt->b4root;
396 idx = rt->chars32[b1 + idx - rt->b4_1_lower];
397 idx = rt->chars32[b2 + idx - rt->b4_2_lower];
398 idx = rt->chars32[b3 + idx - rt->b4_3_lower];
399 return rt->chars32[b4 + idx - rt->b4_4_lower];
401 else
403 uint16 idx = rt->b4root;
405 idx = rt->chars16[b1 + idx - rt->b4_1_lower];
406 idx = rt->chars16[b2 + idx - rt->b4_2_lower];
407 idx = rt->chars16[b3 + idx - rt->b4_3_lower];
408 return rt->chars16[b4 + idx - rt->b4_4_lower];
411 else if (l == 3)
413 /* 3-byte code */
415 /* check code validity */
416 if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
417 b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
418 b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
419 return 0;
421 /* perform lookup */
422 if (rt->chars32)
424 uint32 idx = rt->b3root;
426 idx = rt->chars32[b2 + idx - rt->b3_1_lower];
427 idx = rt->chars32[b3 + idx - rt->b3_2_lower];
428 return rt->chars32[b4 + idx - rt->b3_3_lower];
430 else
432 uint16 idx = rt->b3root;
434 idx = rt->chars16[b2 + idx - rt->b3_1_lower];
435 idx = rt->chars16[b3 + idx - rt->b3_2_lower];
436 return rt->chars16[b4 + idx - rt->b3_3_lower];
439 else if (l == 2)
441 /* 2-byte code */
443 /* check code validity - first byte */
444 if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
445 b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
446 return 0;
448 /* perform lookup */
449 if (rt->chars32)
451 uint32 idx = rt->b2root;
453 idx = rt->chars32[b3 + idx - rt->b2_1_lower];
454 return rt->chars32[b4 + idx - rt->b2_2_lower];
456 else
458 uint16 idx = rt->b2root;
460 idx = rt->chars16[b3 + idx - rt->b2_1_lower];
461 return rt->chars16[b4 + idx - rt->b2_2_lower];
464 else if (l == 1)
466 /* 1-byte code */
468 /* check code validity - first byte */
469 if (b4 < rt->b1_lower || b4 > rt->b1_upper)
470 return 0;
472 /* perform lookup */
473 if (rt->chars32)
474 return rt->chars32[b4 + rt->b1root - rt->b1_lower];
475 else
476 return rt->chars16[b4 + rt->b1root - rt->b1_lower];
478 return 0; /* shouldn't happen */
482 * UTF8 ---> local code
484 * utf: input string in UTF8 encoding (need not be null-terminated)
485 * len: length of input string (in bytes)
486 * iso: pointer to the output area (must be large enough!)
487 (output string will be null-terminated)
488 * map: conversion map for single characters
489 * cmap: conversion map for combined characters
490 * (optional, pass NULL if none)
491 * cmapsize: number of entries in the conversion map for combined characters
492 * (optional, pass 0 if none)
493 * conv_func: algorithmic encoding conversion function
494 * (optional, pass NULL if none)
495 * encoding: PG identifier for the local encoding
497 * For each character, the cmap (if provided) is consulted first; if no match,
498 * the map is consulted next; if still no match, the conv_func (if provided)
499 * is applied. An error is raised if no match is found.
501 * See pg_wchar.h for more details about the data structures used here.
503 * Returns the number of input bytes consumed. If noError is true, this can
504 * be less than 'len'.
507 UtfToLocal(const unsigned char *utf, int len,
508 unsigned char *iso,
509 const pg_mb_radix_tree *map,
510 const pg_utf_to_local_combined *cmap, int cmapsize,
511 utf_local_conversion_func conv_func,
512 int encoding, bool noError)
514 uint32 iutf;
515 int l;
516 const pg_utf_to_local_combined *cp;
517 const unsigned char *start = utf;
519 if (!PG_VALID_ENCODING(encoding))
520 ereport(ERROR,
521 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
522 errmsg("invalid encoding number: %d", encoding)));
524 for (; len > 0; len -= l)
526 unsigned char b1 = 0;
527 unsigned char b2 = 0;
528 unsigned char b3 = 0;
529 unsigned char b4 = 0;
531 /* "break" cases all represent errors */
532 if (*utf == '\0')
533 break;
535 l = pg_utf_mblen(utf);
536 if (len < l)
537 break;
539 if (!pg_utf8_islegal(utf, l))
540 break;
542 if (l == 1)
544 /* ASCII case is easy, assume it's one-to-one conversion */
545 *iso++ = *utf++;
546 continue;
549 /* collect coded char of length l */
550 if (l == 2)
552 b3 = *utf++;
553 b4 = *utf++;
555 else if (l == 3)
557 b2 = *utf++;
558 b3 = *utf++;
559 b4 = *utf++;
561 else if (l == 4)
563 b1 = *utf++;
564 b2 = *utf++;
565 b3 = *utf++;
566 b4 = *utf++;
568 else
570 elog(ERROR, "unsupported character length %d", l);
571 iutf = 0; /* keep compiler quiet */
573 iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
575 /* First, try with combined map if possible */
576 if (cmap && len > l)
578 const unsigned char *utf_save = utf;
579 int len_save = len;
580 int l_save = l;
582 /* collect next character, same as above */
583 len -= l;
585 l = pg_utf_mblen(utf);
586 if (len < l)
588 /* need more data to decide if this is a combined char */
589 utf -= l_save;
590 break;
593 if (!pg_utf8_islegal(utf, l))
595 if (!noError)
596 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
597 utf -= l_save;
598 break;
601 /* We assume ASCII character cannot be in combined map */
602 if (l > 1)
604 uint32 iutf2;
605 uint32 cutf[2];
607 if (l == 2)
609 iutf2 = *utf++ << 8;
610 iutf2 |= *utf++;
612 else if (l == 3)
614 iutf2 = *utf++ << 16;
615 iutf2 |= *utf++ << 8;
616 iutf2 |= *utf++;
618 else if (l == 4)
620 iutf2 = *utf++ << 24;
621 iutf2 |= *utf++ << 16;
622 iutf2 |= *utf++ << 8;
623 iutf2 |= *utf++;
625 else
627 elog(ERROR, "unsupported character length %d", l);
628 iutf2 = 0; /* keep compiler quiet */
631 cutf[0] = iutf;
632 cutf[1] = iutf2;
634 cp = bsearch(cutf, cmap, cmapsize,
635 sizeof(pg_utf_to_local_combined), compare3);
637 if (cp)
639 iso = store_coded_char(iso, cp->code);
640 continue;
644 /* fail, so back up to reprocess second character next time */
645 utf = utf_save;
646 len = len_save;
647 l = l_save;
650 /* Now check ordinary map */
651 if (map)
653 uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
655 if (converted)
657 iso = store_coded_char(iso, converted);
658 continue;
662 /* if there's a conversion function, try that */
663 if (conv_func)
665 uint32 converted = (*conv_func) (iutf);
667 if (converted)
669 iso = store_coded_char(iso, converted);
670 continue;
674 /* failed to translate this character */
675 utf -= l;
676 if (noError)
677 break;
678 report_untranslatable_char(PG_UTF8, encoding,
679 (const char *) utf, len);
682 /* if we broke out of loop early, must be invalid input */
683 if (len > 0 && !noError)
684 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
686 *iso = '\0';
688 return utf - start;
692 * local code ---> UTF8
694 * iso: input string in local encoding (need not be null-terminated)
695 * len: length of input string (in bytes)
696 * utf: pointer to the output area (must be large enough!)
697 (output string will be null-terminated)
698 * map: conversion map for single characters
699 * cmap: conversion map for combined characters
700 * (optional, pass NULL if none)
701 * cmapsize: number of entries in the conversion map for combined characters
702 * (optional, pass 0 if none)
703 * conv_func: algorithmic encoding conversion function
704 * (optional, pass NULL if none)
705 * encoding: PG identifier for the local encoding
707 * For each character, the map is consulted first; if no match, the cmap
708 * (if provided) is consulted next; if still no match, the conv_func
709 * (if provided) is applied. An error is raised if no match is found.
711 * See pg_wchar.h for more details about the data structures used here.
713 * Returns the number of input bytes consumed. If noError is true, this can
714 * be less than 'len'.
717 LocalToUtf(const unsigned char *iso, int len,
718 unsigned char *utf,
719 const pg_mb_radix_tree *map,
720 const pg_local_to_utf_combined *cmap, int cmapsize,
721 utf_local_conversion_func conv_func,
722 int encoding,
723 bool noError)
725 uint32 iiso;
726 int l;
727 const pg_local_to_utf_combined *cp;
728 const unsigned char *start = iso;
730 if (!PG_VALID_ENCODING(encoding))
731 ereport(ERROR,
732 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
733 errmsg("invalid encoding number: %d", encoding)));
735 for (; len > 0; len -= l)
737 unsigned char b1 = 0;
738 unsigned char b2 = 0;
739 unsigned char b3 = 0;
740 unsigned char b4 = 0;
742 /* "break" cases all represent errors */
743 if (*iso == '\0')
744 break;
746 if (!IS_HIGHBIT_SET(*iso))
748 /* ASCII case is easy, assume it's one-to-one conversion */
749 *utf++ = *iso++;
750 l = 1;
751 continue;
754 l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
755 if (l < 0)
756 break;
758 /* collect coded char of length l */
759 if (l == 1)
760 b4 = *iso++;
761 else if (l == 2)
763 b3 = *iso++;
764 b4 = *iso++;
766 else if (l == 3)
768 b2 = *iso++;
769 b3 = *iso++;
770 b4 = *iso++;
772 else if (l == 4)
774 b1 = *iso++;
775 b2 = *iso++;
776 b3 = *iso++;
777 b4 = *iso++;
779 else
781 elog(ERROR, "unsupported character length %d", l);
782 iiso = 0; /* keep compiler quiet */
784 iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
786 if (map)
788 uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
790 if (converted)
792 utf = store_coded_char(utf, converted);
793 continue;
796 /* If there's a combined character map, try that */
797 if (cmap)
799 cp = bsearch(&iiso, cmap, cmapsize,
800 sizeof(pg_local_to_utf_combined), compare4);
802 if (cp)
804 utf = store_coded_char(utf, cp->utf1);
805 utf = store_coded_char(utf, cp->utf2);
806 continue;
811 /* if there's a conversion function, try that */
812 if (conv_func)
814 uint32 converted = (*conv_func) (iiso);
816 if (converted)
818 utf = store_coded_char(utf, converted);
819 continue;
823 /* failed to translate this character */
824 iso -= l;
825 if (noError)
826 break;
827 report_untranslatable_char(encoding, PG_UTF8,
828 (const char *) iso, len);
831 /* if we broke out of loop early, must be invalid input */
832 if (len > 0 && !noError)
833 report_invalid_encoding(encoding, (const char *) iso, len);
835 *utf = '\0';
837 return iso - start;