1 /*-------------------------------------------------------------------------
3 * Utility functions for conversion procs.
5 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
9 * src/backend/utils/mb/conv.c
11 *-------------------------------------------------------------------------
14 #include "mb/pg_wchar.h"
18 * local2local: a generic single byte charset encoding
19 * conversion between two ASCII-superset encodings.
21 * l points to the source string of length len
22 * p is the output area (must be large enough!)
23 * src_encoding is the PG identifier for the source encoding
24 * dest_encoding is the PG identifier for the target encoding
25 * tab holds conversion entries for the source charset
26 * starting from 128 (0x80). Each entry in the table holds the corresponding
27 * code point for the target charset, or 0 if there is no equivalent code.
29 * Returns the number of input bytes consumed. If noError is true, this can be less than 'len'.
33 local2local(const unsigned char *l
,
38 const unsigned char *tab
,
41 const unsigned char *start
= l
;
52 report_invalid_encoding(src_encoding
, (const char *) l
, len
);
54 if (!IS_HIGHBIT_SET(c1
))
58 c2
= tab
[c1
- HIGHBIT
];
65 report_untranslatable_char(src_encoding
, dest_encoding
,
66 (const char *) l
, len
);
78 * LATINn ---> MIC when the charset's local codes map directly to MIC
80 * l points to the source string of length len
81 * p is the output area (must be large enough!)
82 * lc is the mule character set id for the local encoding
83 * encoding is the PG identifier for the local encoding
85 * Returns the number of input bytes consumed. If noError is true, this can be less than 'len'.
89 latin2mic(const unsigned char *l
, unsigned char *p
, int len
,
90 int lc
, int encoding
, bool noError
)
92 const unsigned char *start
= l
;
102 report_invalid_encoding(encoding
, (const char *) l
, len
);
104 if (IS_HIGHBIT_SET(c1
))
116 * MIC ---> LATINn when the charset's local codes map directly to MIC
118 * mic points to the source string of length len
119 * p is the output area (must be large enough!)
120 * lc is the mule character set id for the local encoding
121 * encoding is the PG identifier for the local encoding
123 * Returns the number of input bytes consumed. If noError is true, this can
124 * be less than 'len'.
127 mic2latin(const unsigned char *mic
, unsigned char *p
, int len
,
128 int lc
, int encoding
, bool noError
)
130 const unsigned char *start
= mic
;
140 report_invalid_encoding(PG_MULE_INTERNAL
, (const char *) mic
, len
);
142 if (!IS_HIGHBIT_SET(c1
))
151 int l
= pg_mule_mblen(mic
);
157 report_invalid_encoding(PG_MULE_INTERNAL
, (const char *) mic
,
160 if (l
!= 2 || c1
!= lc
|| !IS_HIGHBIT_SET(mic
[1]))
164 report_untranslatable_char(PG_MULE_INTERNAL
, encoding
,
165 (const char *) mic
, len
);
179 * latin2mic_with_table: a generic single byte charset encoding
180 * conversion from a local charset to the mule internal code.
182 * l points to the source string of length len
183 * p is the output area (must be large enough!)
184 * lc is the mule character set id for the local encoding
185 * encoding is the PG identifier for the local encoding
186 * tab holds conversion entries for the local charset
187 * starting from 128 (0x80). Each entry in the table holds the corresponding
188 * code point for the mule encoding, or 0 if there is no equivalent code.
190 * Returns the number of input bytes consumed. If noError is true, this can
191 * be less than 'len'.
194 latin2mic_with_table(const unsigned char *l
,
199 const unsigned char *tab
,
202 const unsigned char *start
= l
;
213 report_invalid_encoding(encoding
, (const char *) l
, len
);
215 if (!IS_HIGHBIT_SET(c1
))
219 c2
= tab
[c1
- HIGHBIT
];
229 report_untranslatable_char(encoding
, PG_MULE_INTERNAL
,
230 (const char *) l
, len
);
242 * mic2latin_with_table: a generic single byte charset encoding
243 * conversion from the mule internal code to a local charset.
245 * mic points to the source string of length len
246 * p is the output area (must be large enough!)
247 * lc is the mule character set id for the local encoding
248 * encoding is the PG identifier for the local encoding
249 * tab holds conversion entries for the mule internal code's second byte,
250 * starting from 128 (0x80). Each entry in the table holds the corresponding
251 * code point for the local charset, or 0 if there is no equivalent code.
253 * Returns the number of input bytes consumed. If noError is true, this can
254 * be less than 'len'.
257 mic2latin_with_table(const unsigned char *mic
,
262 const unsigned char *tab
,
265 const unsigned char *start
= mic
;
276 report_invalid_encoding(PG_MULE_INTERNAL
, (const char *) mic
, len
);
278 if (!IS_HIGHBIT_SET(c1
))
287 int l
= pg_mule_mblen(mic
);
293 report_invalid_encoding(PG_MULE_INTERNAL
, (const char *) mic
,
296 if (l
!= 2 || c1
!= lc
|| !IS_HIGHBIT_SET(mic
[1]) ||
297 (c2
= tab
[mic
[1] - HIGHBIT
]) == 0)
301 report_untranslatable_char(PG_MULE_INTERNAL
, encoding
,
302 (const char *) mic
, len
);
303 break; /* keep compiler quiet */
316 * comparison routine for bsearch()
317 * this routine is intended for combined UTF8 -> local code
320 compare3(const void *p1
, const void *p2
)
327 s1
= *(const uint32
*) p1
;
328 s2
= *((const uint32
*) p1
+ 1);
329 d1
= ((const pg_utf_to_local_combined
*) p2
)->utf1
;
330 d2
= ((const pg_utf_to_local_combined
*) p2
)->utf2
;
331 return (s1
> d1
|| (s1
== d1
&& s2
> d2
)) ? 1 : ((s1
== d1
&& s2
== d2
) ? 0 : -1);
335 * comparison routine for bsearch()
336 * this routine is intended for local code -> combined UTF8
339 compare4(const void *p1
, const void *p2
)
344 v1
= *(const uint32
*) p1
;
345 v2
= ((const pg_local_to_utf_combined
*) p2
)->code
;
346 return (v1
> v2
) ? 1 : ((v1
== v2
) ? 0 : -1);
/*
 * Store the 32-bit character representation 'code' into the multibyte
 * stream at 'dest'.
 *
 * The four bytes of 'code' are considered from most significant to least
 * significant, and each byte that is nonzero is appended to the output.
 * Zero bytes are skipped, so a code of 0 writes nothing.
 *
 * Returns a pointer just past the last byte written, so that calls can be
 * chained as "p = store_coded_char(p, code)".  The caller must make sure
 * that up to four bytes are available at 'dest'.
 */
static inline unsigned char *
store_coded_char(unsigned char *dest, uint32 code)
{
	if (code & 0xff000000)
		*dest++ = (unsigned char) (code >> 24);
	if (code & 0x00ff0000)
		*dest++ = (unsigned char) (code >> 16);
	if (code & 0x0000ff00)
		*dest++ = (unsigned char) (code >> 8);
	if (code & 0x000000ff)
		*dest++ = (unsigned char) code;
	return dest;
}
367 * Convert a character using a conversion radix tree.
369 * 'l' is the length of the input character in bytes, and b1-b4 are
370 * the input character's bytes.
373 pg_mb_radix_conv(const pg_mb_radix_tree
*rt
,
384 /* check code validity */
385 if (b1
< rt
->b4_1_lower
|| b1
> rt
->b4_1_upper
||
386 b2
< rt
->b4_2_lower
|| b2
> rt
->b4_2_upper
||
387 b3
< rt
->b4_3_lower
|| b3
> rt
->b4_3_upper
||
388 b4
< rt
->b4_4_lower
|| b4
> rt
->b4_4_upper
)
394 uint32 idx
= rt
->b4root
;
396 idx
= rt
->chars32
[b1
+ idx
- rt
->b4_1_lower
];
397 idx
= rt
->chars32
[b2
+ idx
- rt
->b4_2_lower
];
398 idx
= rt
->chars32
[b3
+ idx
- rt
->b4_3_lower
];
399 return rt
->chars32
[b4
+ idx
- rt
->b4_4_lower
];
403 uint16 idx
= rt
->b4root
;
405 idx
= rt
->chars16
[b1
+ idx
- rt
->b4_1_lower
];
406 idx
= rt
->chars16
[b2
+ idx
- rt
->b4_2_lower
];
407 idx
= rt
->chars16
[b3
+ idx
- rt
->b4_3_lower
];
408 return rt
->chars16
[b4
+ idx
- rt
->b4_4_lower
];
415 /* check code validity */
416 if (b2
< rt
->b3_1_lower
|| b2
> rt
->b3_1_upper
||
417 b3
< rt
->b3_2_lower
|| b3
> rt
->b3_2_upper
||
418 b4
< rt
->b3_3_lower
|| b4
> rt
->b3_3_upper
)
424 uint32 idx
= rt
->b3root
;
426 idx
= rt
->chars32
[b2
+ idx
- rt
->b3_1_lower
];
427 idx
= rt
->chars32
[b3
+ idx
- rt
->b3_2_lower
];
428 return rt
->chars32
[b4
+ idx
- rt
->b3_3_lower
];
432 uint16 idx
= rt
->b3root
;
434 idx
= rt
->chars16
[b2
+ idx
- rt
->b3_1_lower
];
435 idx
= rt
->chars16
[b3
+ idx
- rt
->b3_2_lower
];
436 return rt
->chars16
[b4
+ idx
- rt
->b3_3_lower
];
443 /* check code validity - first byte */
444 if (b3
< rt
->b2_1_lower
|| b3
> rt
->b2_1_upper
||
445 b4
< rt
->b2_2_lower
|| b4
> rt
->b2_2_upper
)
451 uint32 idx
= rt
->b2root
;
453 idx
= rt
->chars32
[b3
+ idx
- rt
->b2_1_lower
];
454 return rt
->chars32
[b4
+ idx
- rt
->b2_2_lower
];
458 uint16 idx
= rt
->b2root
;
460 idx
= rt
->chars16
[b3
+ idx
- rt
->b2_1_lower
];
461 return rt
->chars16
[b4
+ idx
- rt
->b2_2_lower
];
468 /* check code validity - first byte */
469 if (b4
< rt
->b1_lower
|| b4
> rt
->b1_upper
)
474 return rt
->chars32
[b4
+ rt
->b1root
- rt
->b1_lower
];
476 return rt
->chars16
[b4
+ rt
->b1root
- rt
->b1_lower
];
478 return 0; /* shouldn't happen */
482 * UTF8 ---> local code
484 * utf: input string in UTF8 encoding (need not be null-terminated)
485 * len: length of input string (in bytes)
486 * iso: pointer to the output area (must be large enough!)
487 (output string will be null-terminated)
488 * map: conversion map for single characters
489 * cmap: conversion map for combined characters
490 * (optional, pass NULL if none)
491 * cmapsize: number of entries in the conversion map for combined characters
492 * (optional, pass 0 if none)
493 * conv_func: algorithmic encoding conversion function
494 * (optional, pass NULL if none)
495 * encoding: PG identifier for the local encoding
497 * For each character, the cmap (if provided) is consulted first; if no match,
498 * the map is consulted next; if still no match, the conv_func (if provided)
499 * is applied. An error is raised if no match is found.
501 * See pg_wchar.h for more details about the data structures used here.
503 * Returns the number of input bytes consumed. If noError is true, this can
504 * be less than 'len'.
507 UtfToLocal(const unsigned char *utf
, int len
,
509 const pg_mb_radix_tree
*map
,
510 const pg_utf_to_local_combined
*cmap
, int cmapsize
,
511 utf_local_conversion_func conv_func
,
512 int encoding
, bool noError
)
516 const pg_utf_to_local_combined
*cp
;
517 const unsigned char *start
= utf
;
519 if (!PG_VALID_ENCODING(encoding
))
521 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
522 errmsg("invalid encoding number: %d", encoding
)));
524 for (; len
> 0; len
-= l
)
526 unsigned char b1
= 0;
527 unsigned char b2
= 0;
528 unsigned char b3
= 0;
529 unsigned char b4
= 0;
531 /* "break" cases all represent errors */
535 l
= pg_utf_mblen(utf
);
539 if (!pg_utf8_islegal(utf
, l
))
544 /* ASCII case is easy, assume it's one-to-one conversion */
549 /* collect coded char of length l */
570 elog(ERROR
, "unsupported character length %d", l
);
571 iutf
= 0; /* keep compiler quiet */
573 iutf
= (b1
<< 24 | b2
<< 16 | b3
<< 8 | b4
);
575 /* First, try with combined map if possible */
578 const unsigned char *utf_save
= utf
;
582 /* collect next character, same as above */
585 l
= pg_utf_mblen(utf
);
588 /* need more data to decide if this is a combined char */
593 if (!pg_utf8_islegal(utf
, l
))
596 report_invalid_encoding(PG_UTF8
, (const char *) utf
, len
);
601 /* We assume ASCII character cannot be in combined map */
614 iutf2
= *utf
++ << 16;
615 iutf2
|= *utf
++ << 8;
620 iutf2
= *utf
++ << 24;
621 iutf2
|= *utf
++ << 16;
622 iutf2
|= *utf
++ << 8;
627 elog(ERROR
, "unsupported character length %d", l
);
628 iutf2
= 0; /* keep compiler quiet */
634 cp
= bsearch(cutf
, cmap
, cmapsize
,
635 sizeof(pg_utf_to_local_combined
), compare3
);
639 iso
= store_coded_char(iso
, cp
->code
);
644 /* fail, so back up to reprocess second character next time */
650 /* Now check ordinary map */
653 uint32 converted
= pg_mb_radix_conv(map
, l
, b1
, b2
, b3
, b4
);
657 iso
= store_coded_char(iso
, converted
);
662 /* if there's a conversion function, try that */
665 uint32 converted
= (*conv_func
) (iutf
);
669 iso
= store_coded_char(iso
, converted
);
674 /* failed to translate this character */
678 report_untranslatable_char(PG_UTF8
, encoding
,
679 (const char *) utf
, len
);
682 /* if we broke out of loop early, must be invalid input */
683 if (len
> 0 && !noError
)
684 report_invalid_encoding(PG_UTF8
, (const char *) utf
, len
);
692 * local code ---> UTF8
694 * iso: input string in local encoding (need not be null-terminated)
695 * len: length of input string (in bytes)
696 * utf: pointer to the output area (must be large enough!)
697 (output string will be null-terminated)
698 * map: conversion map for single characters
699 * cmap: conversion map for combined characters
700 * (optional, pass NULL if none)
701 * cmapsize: number of entries in the conversion map for combined characters
702 * (optional, pass 0 if none)
703 * conv_func: algorithmic encoding conversion function
704 * (optional, pass NULL if none)
705 * encoding: PG identifier for the local encoding
707 * For each character, the map is consulted first; if no match, the cmap
708 * (if provided) is consulted next; if still no match, the conv_func
709 * (if provided) is applied. An error is raised if no match is found.
711 * See pg_wchar.h for more details about the data structures used here.
713 * Returns the number of input bytes consumed. If noError is true, this can
714 * be less than 'len'.
717 LocalToUtf(const unsigned char *iso
, int len
,
719 const pg_mb_radix_tree
*map
,
720 const pg_local_to_utf_combined
*cmap
, int cmapsize
,
721 utf_local_conversion_func conv_func
,
727 const pg_local_to_utf_combined
*cp
;
728 const unsigned char *start
= iso
;
730 if (!PG_VALID_ENCODING(encoding
))
732 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
733 errmsg("invalid encoding number: %d", encoding
)));
735 for (; len
> 0; len
-= l
)
737 unsigned char b1
= 0;
738 unsigned char b2
= 0;
739 unsigned char b3
= 0;
740 unsigned char b4
= 0;
742 /* "break" cases all represent errors */
746 if (!IS_HIGHBIT_SET(*iso
))
748 /* ASCII case is easy, assume it's one-to-one conversion */
754 l
= pg_encoding_verifymbchar(encoding
, (const char *) iso
, len
);
758 /* collect coded char of length l */
781 elog(ERROR
, "unsupported character length %d", l
);
782 iiso
= 0; /* keep compiler quiet */
784 iiso
= (b1
<< 24 | b2
<< 16 | b3
<< 8 | b4
);
788 uint32 converted
= pg_mb_radix_conv(map
, l
, b1
, b2
, b3
, b4
);
792 utf
= store_coded_char(utf
, converted
);
796 /* If there's a combined character map, try that */
799 cp
= bsearch(&iiso
, cmap
, cmapsize
,
800 sizeof(pg_local_to_utf_combined
), compare4
);
804 utf
= store_coded_char(utf
, cp
->utf1
);
805 utf
= store_coded_char(utf
, cp
->utf2
);
811 /* if there's a conversion function, try that */
814 uint32 converted
= (*conv_func
) (iiso
);
818 utf
= store_coded_char(utf
, converted
);
823 /* failed to translate this character */
827 report_untranslatable_char(encoding
, PG_UTF8
,
828 (const char *) iso
, len
);
831 /* if we broke out of loop early, must be invalid input */
832 if (len
> 0 && !noError
)
833 report_invalid_encoding(encoding
, (const char *) iso
, len
);