Add support for automatically updating Unicode derived files
[pgsql.git] / src / backend / utils / mb / wchar.c
blob02e2588ffe12d6dd3f89c0fcc395735d15653d8d
1 /*
2 * conversion functions between pg_wchar and multibyte streams.
3 * Tatsuo Ishii
4 * src/backend/utils/mb/wchar.c
6 */
7 /* can be used in either frontend or backend */
8 #ifdef FRONTEND
9 #include "postgres_fe.h"
10 #else
11 #include "postgres.h"
12 #endif
14 #include "mb/pg_wchar.h"
/*
 * Operations on multi-byte encodings are driven by a table of helper
 * functions.
 *
 * To add support for an encoding, define mblen(), dsplen() and verifier()
 * for the encoding.  For server encodings, also define mb2wchar() and
 * wchar2mb() conversion functions.
 *
 * These functions generally assume that their input is validly formed.
 * The "verifier" functions, further down in the file, have to be more
 * paranoid.
 *
 * We expect that mblen() does not need to examine more than the first byte
 * of the character to discover the correct length.  GB18030 is an exception
 * to that rule, though, as it also looks at the second byte.  But even that
 * behaves in a predictable way if you only pass the first byte: it will
 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
 * good enough for all current uses.
 *
 * Note: for the display output of psql to work properly, the return values
 * of the dsplen functions must conform to the Unicode standard.  In
 * particular the NUL character is zero width and control characters are
 * generally width -1.  It is recommended that non-ASCII encodings refer
 * their ASCII subset to the ASCII routines to ensure consistency.
 */
44 * SQL/ASCII
46 static int
47 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
49 int cnt = 0;
51 while (len > 0 && *from)
53 *to++ = *from++;
54 len--;
55 cnt++;
57 *to = 0;
58 return cnt;
/*
 * Byte length of the character at "s": always 1 for a single-byte encoding.
 * The pointed-to byte is not examined.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;
}
/*
 * Display width of the single-byte character at "s".
 *
 * Returns 0 for NUL, -1 for C0 control characters (below 0x20) and DEL
 * (0x7f), and 1 for every other byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	if (*s == '\0')
		return 0;
	if (*s < 0x20 || *s == 0x7f)
		return -1;

	return 1;
}
79 * EUC
81 static int
82 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
84 int cnt = 0;
86 while (len > 0 && *from)
88 if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
89 * KANA") */
91 from++;
92 *to = (SS2 << 8) | *from++;
93 len -= 2;
95 else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
97 from++;
98 *to = (SS3 << 16) | (*from++ << 8);
99 *to |= *from++;
100 len -= 3;
102 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
104 *to = *from++ << 8;
105 *to |= *from++;
106 len -= 2;
108 else /* must be ASCII */
110 *to = *from++;
111 len--;
113 to++;
114 cnt++;
116 *to = 0;
117 return cnt;
120 static inline int
121 pg_euc_mblen(const unsigned char *s)
123 int len;
125 if (*s == SS2)
126 len = 2;
127 else if (*s == SS3)
128 len = 3;
129 else if (IS_HIGHBIT_SET(*s))
130 len = 2;
131 else
132 len = 1;
133 return len;
136 static inline int
137 pg_euc_dsplen(const unsigned char *s)
139 int len;
141 if (*s == SS2)
142 len = 2;
143 else if (*s == SS3)
144 len = 2;
145 else if (IS_HIGHBIT_SET(*s))
146 len = 2;
147 else
148 len = pg_ascii_dsplen(s);
149 return len;
153 * EUC_JP
155 static int
156 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
158 return pg_euc2wchar_with_len(from, to, len);
/* Byte length of an EUC_JP character: same rule as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
167 static int
168 pg_eucjp_dsplen(const unsigned char *s)
170 int len;
172 if (*s == SS2)
173 len = 1;
174 else if (*s == SS3)
175 len = 2;
176 else if (IS_HIGHBIT_SET(*s))
177 len = 2;
178 else
179 len = pg_ascii_dsplen(s);
180 return len;
184 * EUC_KR
186 static int
187 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
189 return pg_euc2wchar_with_len(from, to, len);
/* Byte length of an EUC_KR character: same rule as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
/* Display width of an EUC_KR character: same rule as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
205 * EUC_CN
208 static int
209 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
211 int cnt = 0;
213 while (len > 0 && *from)
215 if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
217 from++;
218 *to = (SS2 << 16) | (*from++ << 8);
219 *to |= *from++;
220 len -= 3;
222 else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
224 from++;
225 *to = (SS3 << 16) | (*from++ << 8);
226 *to |= *from++;
227 len -= 3;
229 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
231 *to = *from++ << 8;
232 *to |= *from++;
233 len -= 2;
235 else
237 *to = *from++;
238 len--;
240 to++;
241 cnt++;
243 *to = 0;
244 return cnt;
/*
 * Byte length of the EUC_CN character starting at "s": 2 if the first byte
 * has its high bit set, else 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = 1;
	return len;
}
/*
 * Display width of the EUC_CN character starting at "s": 2 for a high-bit
 * lead byte; the ASCII subset is delegated to pg_ascii_dsplen().
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = pg_ascii_dsplen(s);
	return len;
}
272 * EUC_TW
275 static int
276 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
278 int cnt = 0;
280 while (len > 0 && *from)
282 if (*from == SS2 && len >= 4) /* code set 2 */
284 from++;
285 *to = (((uint32) SS2) << 24) | (*from++ << 16);
286 *to |= *from++ << 8;
287 *to |= *from++;
288 len -= 4;
290 else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
292 from++;
293 *to = (SS3 << 16) | (*from++ << 8);
294 *to |= *from++;
295 len -= 3;
297 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
299 *to = *from++ << 8;
300 *to |= *from++;
301 len -= 2;
303 else
305 *to = *from++;
306 len--;
308 to++;
309 cnt++;
311 *to = 0;
312 return cnt;
315 static int
316 pg_euctw_mblen(const unsigned char *s)
318 int len;
320 if (*s == SS2)
321 len = 4;
322 else if (*s == SS3)
323 len = 3;
324 else if (IS_HIGHBIT_SET(*s))
325 len = 2;
326 else
327 len = 1;
328 return len;
331 static int
332 pg_euctw_dsplen(const unsigned char *s)
334 int len;
336 if (*s == SS2)
337 len = 2;
338 else if (*s == SS3)
339 len = 2;
340 else if (IS_HIGHBIT_SET(*s))
341 len = 2;
342 else
343 len = pg_ascii_dsplen(s);
344 return len;
348 * Convert pg_wchar to EUC_* encoding.
349 * caller must allocate enough space for "to", including a trailing zero!
350 * len: length of from.
351 * "from" not necessarily null terminated.
353 static int
354 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
356 int cnt = 0;
358 while (len > 0 && *from)
360 unsigned char c;
362 if ((c = (*from >> 24)))
364 *to++ = c;
365 *to++ = (*from >> 16) & 0xff;
366 *to++ = (*from >> 8) & 0xff;
367 *to++ = *from & 0xff;
368 cnt += 4;
370 else if ((c = (*from >> 16)))
372 *to++ = c;
373 *to++ = (*from >> 8) & 0xff;
374 *to++ = *from & 0xff;
375 cnt += 3;
377 else if ((c = (*from >> 8)))
379 *to++ = c;
380 *to++ = *from & 0xff;
381 cnt += 2;
383 else
385 *to++ = *from;
386 cnt++;
388 from++;
389 len--;
391 *to = 0;
392 return cnt;
/*
 * JOHAB
 */

/* Byte length of a JOHAB character: uses the generic EUC rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
/* Display width of a JOHAB character: uses the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
412 * convert UTF8 string to pg_wchar (UCS-4)
413 * caller must allocate enough space for "to", including a trailing zero!
414 * len: length of from.
415 * "from" not necessarily null terminated.
417 static int
418 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
420 int cnt = 0;
421 uint32 c1,
426 while (len > 0 && *from)
428 if ((*from & 0x80) == 0)
430 *to = *from++;
431 len--;
433 else if ((*from & 0xe0) == 0xc0)
435 if (len < 2)
436 break; /* drop trailing incomplete char */
437 c1 = *from++ & 0x1f;
438 c2 = *from++ & 0x3f;
439 *to = (c1 << 6) | c2;
440 len -= 2;
442 else if ((*from & 0xf0) == 0xe0)
444 if (len < 3)
445 break; /* drop trailing incomplete char */
446 c1 = *from++ & 0x0f;
447 c2 = *from++ & 0x3f;
448 c3 = *from++ & 0x3f;
449 *to = (c1 << 12) | (c2 << 6) | c3;
450 len -= 3;
452 else if ((*from & 0xf8) == 0xf0)
454 if (len < 4)
455 break; /* drop trailing incomplete char */
456 c1 = *from++ & 0x07;
457 c2 = *from++ & 0x3f;
458 c3 = *from++ & 0x3f;
459 c4 = *from++ & 0x3f;
460 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
461 len -= 4;
463 else
465 /* treat a bogus char as length 1; not ours to raise error */
466 *to = *from++;
467 len--;
469 to++;
470 cnt++;
472 *to = 0;
473 return cnt;
478 * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
479 * space allocated.
481 unsigned char *
482 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
484 if (c <= 0x7F)
486 utf8string[0] = c;
488 else if (c <= 0x7FF)
490 utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
491 utf8string[1] = 0x80 | (c & 0x3F);
493 else if (c <= 0xFFFF)
495 utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
496 utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
497 utf8string[2] = 0x80 | (c & 0x3F);
499 else
501 utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
502 utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
503 utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
504 utf8string[3] = 0x80 | (c & 0x3F);
507 return utf8string;
511 * Trivial conversion from pg_wchar to UTF-8.
512 * caller should allocate enough space for "to"
513 * len: length of from.
514 * "from" not necessarily null terminated.
516 static int
517 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
519 int cnt = 0;
521 while (len > 0 && *from)
523 int char_len;
525 unicode_to_utf8(*from, to);
526 char_len = pg_utf_mblen(to);
527 cnt += char_len;
528 to += char_len;
529 from++;
530 len--;
532 *to = 0;
533 return cnt;
/*
 * Return the byte length of a UTF8 character pointed to by s
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * We return "1" for any leading byte that is either flat-out illegal or
 * indicates a length larger than we support.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	int			len;

	if ((*s & 0x80) == 0)
		len = 1;
	else if ((*s & 0xe0) == 0xc0)
		len = 2;
	else if ((*s & 0xf0) == 0xe0)
		len = 3;
	else if ((*s & 0xf8) == 0xf0)
		len = 4;
#ifdef NOT_USED
	else if ((*s & 0xfc) == 0xf8)
		len = 5;
	else if ((*s & 0xfe) == 0xfc)
		len = 6;
#endif
	else
		len = 1;
	return len;
}
/*
 * This is an implementation of wcwidth() and wcswidth() as defined in
 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
 * <http://www.unix.org/online.html>
 *
 * Markus Kuhn -- 2001-09-08 -- public domain
 *
 * customised for PostgreSQL
 *
 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 */
/*
 * One inclusive code point range [first, last] in a sorted interval table
 * searched by mbbisearch().  The bounds are unsigned short, so each bound
 * can hold only values that fit that type.
 */
struct mbinterval
{
	unsigned short first;
	unsigned short last;
};
589 /* auxiliary function for binary search in interval table */
590 static int
591 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
593 int min = 0;
594 int mid;
596 if (ucs < table[0].first || ucs > table[max].last)
597 return 0;
598 while (max >= min)
600 mid = (min + max) / 2;
601 if (ucs > table[mid].last)
602 min = mid + 1;
603 else if (ucs < table[mid].first)
604 max = mid - 1;
605 else
606 return 1;
609 return 0;
613 /* The following functions define the column width of an ISO 10646
614 * character as follows:
616 * - The null character (U+0000) has a column width of 0.
618 * - Other C0/C1 control characters and DEL will lead to a return
619 * value of -1.
621 * - Non-spacing and enclosing combining characters (general
622 * category code Mn or Me in the Unicode database) have a
623 * column width of 0.
625 * - Other format characters (general category code Cf in the Unicode
626 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
628 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
629 * have a column width of 0.
631 * - Spacing characters in the East Asian Wide (W) or East Asian
632 * FullWidth (F) category as defined in Unicode Technical
633 * Report #11 have a column width of 2.
635 * - All remaining characters (including all printable
636 * ISO 8859-1 and WGL4 characters, Unicode control characters,
637 * etc.) have a column width of 1.
639 * This implementation assumes that wchar_t characters are encoded
640 * in ISO 10646.
643 static int
644 ucs_wcwidth(pg_wchar ucs)
646 #include "common/unicode_combining_table.h"
648 /* test for 8-bit control characters */
649 if (ucs == 0)
650 return 0;
652 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
653 return -1;
655 /* binary search in table of non-spacing characters */
656 if (mbbisearch(ucs, combining,
657 sizeof(combining) / sizeof(struct mbinterval) - 1))
658 return 0;
661 * if we arrive here, ucs is not a combining or C0/C1 control character
664 return 1 +
665 (ucs >= 0x1100 &&
666 (ucs <= 0x115f || /* Hangul Jamo init. consonants */
667 (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
668 ucs != 0x303f) || /* CJK ... Yi */
669 (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
670 (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
671 * Ideographs */
672 (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
673 (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
674 (ucs >= 0xffe0 && ucs <= 0xffe6) ||
675 (ucs >= 0x20000 && ucs <= 0x2ffff)));
679 * Convert a UTF-8 character to a Unicode code point.
680 * This is a one-character version of pg_utf2wchar_with_len.
682 * No error checks here, c must point to a long-enough string.
684 pg_wchar
685 utf8_to_unicode(const unsigned char *c)
687 if ((*c & 0x80) == 0)
688 return (pg_wchar) c[0];
689 else if ((*c & 0xe0) == 0xc0)
690 return (pg_wchar) (((c[0] & 0x1f) << 6) |
691 (c[1] & 0x3f));
692 else if ((*c & 0xf0) == 0xe0)
693 return (pg_wchar) (((c[0] & 0x0f) << 12) |
694 ((c[1] & 0x3f) << 6) |
695 (c[2] & 0x3f));
696 else if ((*c & 0xf8) == 0xf0)
697 return (pg_wchar) (((c[0] & 0x07) << 18) |
698 ((c[1] & 0x3f) << 12) |
699 ((c[2] & 0x3f) << 6) |
700 (c[3] & 0x3f));
701 else
702 /* that is an invalid code on purpose */
703 return 0xffffffff;
/*
 * Display width of the UTF-8 character starting at "s": decode it with
 * utf8_to_unicode() and look the code point up with ucs_wcwidth().  The
 * decoder does no bounds checking, so "s" must point to a complete
 * character.
 */
static int
pg_utf_dsplen(const unsigned char *s)
{
	return ucs_wcwidth(utf8_to_unicode(s));
}
713 * convert mule internal code to pg_wchar
714 * caller should allocate enough space for "to"
715 * len: length of from.
716 * "from" not necessarily null terminated.
718 static int
719 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
721 int cnt = 0;
723 while (len > 0 && *from)
725 if (IS_LC1(*from) && len >= 2)
727 *to = *from++ << 16;
728 *to |= *from++;
729 len -= 2;
731 else if (IS_LCPRV1(*from) && len >= 3)
733 from++;
734 *to = *from++ << 16;
735 *to |= *from++;
736 len -= 3;
738 else if (IS_LC2(*from) && len >= 3)
740 *to = *from++ << 16;
741 *to |= *from++ << 8;
742 *to |= *from++;
743 len -= 3;
745 else if (IS_LCPRV2(*from) && len >= 4)
747 from++;
748 *to = *from++ << 16;
749 *to |= *from++ << 8;
750 *to |= *from++;
751 len -= 4;
753 else
754 { /* assume ASCII */
755 *to = (unsigned char) *from++;
756 len--;
758 to++;
759 cnt++;
761 *to = 0;
762 return cnt;
766 * convert pg_wchar to mule internal code
767 * caller should allocate enough space for "to"
768 * len: length of from.
769 * "from" not necessarily null terminated.
771 static int
772 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
774 int cnt = 0;
776 while (len > 0 && *from)
778 unsigned char lb;
780 lb = (*from >> 16) & 0xff;
781 if (IS_LC1(lb))
783 *to++ = lb;
784 *to++ = *from & 0xff;
785 cnt += 2;
787 else if (IS_LC2(lb))
789 *to++ = lb;
790 *to++ = (*from >> 8) & 0xff;
791 *to++ = *from & 0xff;
792 cnt += 3;
794 else if (IS_LCPRV1_A_RANGE(lb))
796 *to++ = LCPRV1_A;
797 *to++ = lb;
798 *to++ = *from & 0xff;
799 cnt += 3;
801 else if (IS_LCPRV1_B_RANGE(lb))
803 *to++ = LCPRV1_B;
804 *to++ = lb;
805 *to++ = *from & 0xff;
806 cnt += 3;
808 else if (IS_LCPRV2_A_RANGE(lb))
810 *to++ = LCPRV2_A;
811 *to++ = lb;
812 *to++ = (*from >> 8) & 0xff;
813 *to++ = *from & 0xff;
814 cnt += 4;
816 else if (IS_LCPRV2_B_RANGE(lb))
818 *to++ = LCPRV2_B;
819 *to++ = lb;
820 *to++ = (*from >> 8) & 0xff;
821 *to++ = *from & 0xff;
822 cnt += 4;
824 else
826 *to++ = *from & 0xff;
827 cnt += 1;
829 from++;
830 len--;
832 *to = 0;
833 return cnt;
/*
 * Byte length of the MULE internal code character starting at "s", decided
 * from the first byte only: LC1 -> 2, LCPRV1 -> 3, LC2 -> 3, LCPRV2 -> 4,
 * anything else -> 1.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	int			len;

	if (IS_LC1(*s))
		len = 2;
	else if (IS_LCPRV1(*s))
		len = 3;
	else if (IS_LC2(*s))
		len = 3;
	else if (IS_LCPRV2(*s))
		len = 4;
	else
		len = 1;				/* assume ASCII */
	return len;
}
/*
 * Display width of the MULE internal code character starting at "s":
 * 1 for LC1/LCPRV1 characters, 2 for LC2/LCPRV2 characters, 1 otherwise.
 * Unlike the other dsplen functions, the fallback branch does not consult
 * pg_ascii_dsplen().
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	int			len;

	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen.  But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */

	if (IS_LC1(*s))
		len = 1;
	else if (IS_LCPRV1(*s))
		len = 1;
	else if (IS_LC2(*s))
		len = 2;
	else if (IS_LCPRV2(*s))
		len = 2;
	else
		len = 1;				/* assume ASCII */

	return len;
}
880 * ISO8859-1
882 static int
883 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
885 int cnt = 0;
887 while (len > 0 && *from)
889 *to++ = *from++;
890 len--;
891 cnt++;
893 *to = 0;
894 return cnt;
898 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
899 * high bits.
900 * caller should allocate enough space for "to"
901 * len: length of from.
902 * "from" not necessarily null terminated.
904 static int
905 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
907 int cnt = 0;
909 while (len > 0 && *from)
911 *to++ = *from++;
912 len--;
913 cnt++;
915 *to = 0;
916 return cnt;
/*
 * Byte length of the character at "s": always 1 for a single-byte encoding.
 * The pointed-to byte is not examined.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;
}
/* Display width of a single-byte character: same rule as pg_ascii_dsplen(). */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);
}
/*
 * SJIS
 */

/*
 * Byte length of the SJIS character starting at "s": bytes 0xa1..0xdf are
 * single-byte kana, any other high-bit byte starts a 2-byte character, and
 * everything else is one byte.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	int			len;

	if (*s >= 0xa1 && *s <= 0xdf)
		len = 1;				/* 1 byte kana? */
	else if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
/*
 * Display width of the SJIS character starting at "s": 1 for single-byte
 * kana (0xa1..0xdf), 2 for other high-bit lead bytes; the ASCII subset is
 * delegated to pg_ascii_dsplen().
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	int			len;

	if (*s >= 0xa1 && *s <= 0xdf)
		len = 1;				/* 1 byte kana? */
	else if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
/*
 * Big5
 */

/*
 * Byte length of the Big5 character starting at "s": 2 if the first byte
 * has its high bit set, else 1.
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
/*
 * Display width of the Big5 character starting at "s": 2 for a high-bit
 * lead byte; the ASCII subset is delegated to pg_ascii_dsplen().
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
/*
 * GBK
 */

/*
 * Byte length of the GBK character starting at "s": 2 if the first byte
 * has its high bit set, else 1.
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
/*
 * Display width of the GBK character starting at "s": 2 for a high-bit
 * lead byte; the ASCII subset is delegated to pg_ascii_dsplen().
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
/*
 * UHC
 */

/*
 * Byte length of the UHC character starting at "s": 2 if the first byte
 * has its high bit set, else 1.
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* 2byte? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
/*
 * Display width of the UHC character starting at "s": 2 for a high-bit
 * lead byte; the ASCII subset is delegated to pg_ascii_dsplen().
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* 2byte? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
/*
 * GB18030
 *	Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 */

/*
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input.  However, if you only pass the first byte of a multi-byte
 * string, and \0 as the second byte, this still works in a predictable way:
 * a 4-byte character will be reported as two 2-byte characters.  That's
 * enough for all current uses, as a client-only encoding.  It works that
 * way, because in any valid 4-byte GB18030-encoded character, the third and
 * fourth byte look like a 2-byte encoded character, when looked at
 * separately.
 *
 * Note that s[1] is read whenever s[0] has its high bit set, so that byte
 * must be readable by the caller's contract.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	int			len;

	if (!IS_HIGHBIT_SET(*s))
		len = 1;				/* ASCII */
	else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
		len = 4;
	else
		len = 2;
	return len;
}
/*
 * Display width of the GB18030 character starting at "s": 2 for a high-bit
 * lead byte (both 2-byte and 4-byte forms); the ASCII subset is delegated
 * to pg_ascii_dsplen().
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = pg_ascii_dsplen(s);	/* ASCII */
	return len;
}
/*
 *-------------------------------------------------------------------
 * multibyte sequence validators
 *
 * These functions accept "s", a pointer to the first byte of a string,
 * and "len", the remaining length of the string.  If there is a validly
 * encoded character beginning at *s, return its length in bytes; else
 * return -1.
 *
 * The functions can assume that len > 0 and that *s != '\0', but they must
 * test for and reject zeroes in any additional bytes of a multibyte character.
 *
 * Note that this definition allows the function for a single-byte
 * encoding to be just "return 1".
 *-------------------------------------------------------------------
 */
/*
 * Validator for a single-byte encoding: always reports one valid byte.
 * Neither argument is examined.
 */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
	return 1;
}
/*
 * True if byte value c lies in 0xa1..0xfe, the range the EUC verifiers
 * accept for the bytes of a multibyte character.  The argument is
 * evaluated twice, so it must not have side effects.
 */
#define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
1109 static int
1110 pg_eucjp_verifier(const unsigned char *s, int len)
1112 int l;
1113 unsigned char c1,
1116 c1 = *s++;
1118 switch (c1)
1120 case SS2: /* JIS X 0201 */
1121 l = 2;
1122 if (l > len)
1123 return -1;
1124 c2 = *s++;
1125 if (c2 < 0xa1 || c2 > 0xdf)
1126 return -1;
1127 break;
1129 case SS3: /* JIS X 0212 */
1130 l = 3;
1131 if (l > len)
1132 return -1;
1133 c2 = *s++;
1134 if (!IS_EUC_RANGE_VALID(c2))
1135 return -1;
1136 c2 = *s++;
1137 if (!IS_EUC_RANGE_VALID(c2))
1138 return -1;
1139 break;
1141 default:
1142 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1144 l = 2;
1145 if (l > len)
1146 return -1;
1147 if (!IS_EUC_RANGE_VALID(c1))
1148 return -1;
1149 c2 = *s++;
1150 if (!IS_EUC_RANGE_VALID(c2))
1151 return -1;
1153 else
1154 /* must be ASCII */
1156 l = 1;
1158 break;
1161 return l;
/*
 * Validate one EUC_KR character at "s" with "len" bytes available.
 *
 * A high-bit lead byte must begin a 2-byte character whose bytes both lie
 * in 0xa1..0xfe; anything else is a 1-byte character.  Returns the byte
 * length, or -1 on a truncated or out-of-range sequence.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
	int			l;
	unsigned char c1,
				c2;

	c1 = *s++;

	if (IS_HIGHBIT_SET(c1))
	{
		l = 2;
		if (l > len)
			return -1;
		if (!IS_EUC_RANGE_VALID(c1))
			return -1;
		c2 = *s++;
		if (!IS_EUC_RANGE_VALID(c2))
			return -1;
	}
	else
		/* must be ASCII */
	{
		l = 1;
	}

	return l;
}
/* EUC-CN byte sequences are exactly same as EUC-KR */
#define pg_euccn_verifier	pg_euckr_verifier
1196 static int
1197 pg_euctw_verifier(const unsigned char *s, int len)
1199 int l;
1200 unsigned char c1,
1203 c1 = *s++;
1205 switch (c1)
1207 case SS2: /* CNS 11643 Plane 1-7 */
1208 l = 4;
1209 if (l > len)
1210 return -1;
1211 c2 = *s++;
1212 if (c2 < 0xa1 || c2 > 0xa7)
1213 return -1;
1214 c2 = *s++;
1215 if (!IS_EUC_RANGE_VALID(c2))
1216 return -1;
1217 c2 = *s++;
1218 if (!IS_EUC_RANGE_VALID(c2))
1219 return -1;
1220 break;
1222 case SS3: /* unused */
1223 return -1;
1225 default:
1226 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1228 l = 2;
1229 if (l > len)
1230 return -1;
1231 /* no further range check on c1? */
1232 c2 = *s++;
1233 if (!IS_EUC_RANGE_VALID(c2))
1234 return -1;
1236 else
1237 /* must be ASCII */
1239 l = 1;
1241 break;
1243 return l;
/*
 * Validate one JOHAB character at "s" with "len" bytes available.
 *
 * The length comes from pg_johab_mblen(); for a multibyte character every
 * byte after the first must lie in 0xa1..0xfe.  Returns the byte length,
 * or -1 if the sequence is truncated or a trailing byte is out of range.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c;

	l = mbl = pg_johab_mblen(s);

	if (len < l)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	while (--l > 0)
	{
		c = *++s;
		if (!IS_EUC_RANGE_VALID(c))
			return -1;
	}
	return mbl;
}
/*
 * Validate one MULE internal code character at "s" with "len" bytes
 * available.
 *
 * The length comes from pg_mule_mblen(); every byte after the first must
 * have its high bit set.  Returns the byte length, or -1 if the sequence
 * is truncated or a trailing byte fails that check.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c;

	l = mbl = pg_mule_mblen(s);

	if (len < l)
		return -1;

	while (--l > 0)
	{
		c = *++s;
		if (!IS_HIGHBIT_SET(c))
			return -1;
	}
	return mbl;
}
/*
 * Validator for a single-byte encoding: always reports one valid byte.
 * Neither argument is examined.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
	return 1;
}
/*
 * Validate one SJIS character at "s" with "len" bytes available.
 *
 * Single-byte characters are accepted as classified by pg_sjis_mblen().
 * For a 2-byte character the first byte must satisfy ISSJISHEAD and the
 * second ISSJISTAIL.  Returns the byte length, or -1 if invalid.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c1,
				c2;

	l = mbl = pg_sjis_mblen(s);

	if (len < l)
		return -1;

	if (l == 1)					/* pg_sjis_mblen already verified it */
		return mbl;

	c1 = *s++;
	c2 = *s;
	if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
		return -1;
	return mbl;
}
/*
 * Validate one Big5 character at "s" with "len" bytes available.
 *
 * Only truncation and a zero trailing byte are rejected; no range check is
 * applied to the trailing byte.  Returns the byte length from
 * pg_big5_mblen(), or -1 if invalid.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_big5_mblen(s);

	if (len < l)
		return -1;

	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Validate one GBK character at "s" with "len" bytes available.
 *
 * Only truncation and a zero trailing byte are rejected; no range check is
 * applied to the trailing byte.  Returns the byte length from
 * pg_gbk_mblen(), or -1 if invalid.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_gbk_mblen(s);

	if (len < l)
		return -1;

	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Validate one UHC character at "s" with "len" bytes available.
 *
 * Only truncation and a zero trailing byte are rejected; no range check is
 * applied to the trailing byte.  Returns the byte length from
 * pg_uhc_mblen(), or -1 if invalid.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_uhc_mblen(s);

	if (len < l)
		return -1;

	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Validate one GB18030 character at "s" with "len" bytes available.
 *
 * Returns 1 for a byte without the high bit, 4 or 2 for a well-formed
 * 4-byte or 2-byte character, and -1 otherwise.  The 4-byte form is tried
 * only when at least four bytes are available and the second byte is
 * 0x30..0x39; otherwise the 2-byte rules apply.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
	int			l;

	if (!IS_HIGHBIT_SET(*s))
		l = 1;					/* ASCII */
	else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (*s >= 0x81 && *s <= 0xfe &&
			*(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
			*(s + 3) >= 0x30 && *(s + 3) <= 0x39)
			l = 4;
		else
			l = -1;
	}
	else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
			(*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
			l = 2;
		else
			l = -1;
	}
	else
		l = -1;
	return l;
}
/*
 * Validate one UTF-8 character at "s" with "len" bytes available.
 *
 * The length is taken from the lead byte by pg_utf_mblen(); the sequence
 * is then checked with pg_utf8_islegal().  Returns the byte length, or -1
 * if the sequence is truncated or not legal.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	int			l = pg_utf_mblen(s);

	if (len < l)
		return -1;

	if (!pg_utf8_islegal(s, l))
		return -1;

	return l;
}
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This directly implements the rules in RFC3629.  The bizarre-looking
 * restrictions on the second byte are meant to ensure that there isn't
 * more than one encoding of a given Unicode character point; that is,
 * you may not use a longer-than-necessary byte sequence with high order
 * zero bits to represent a character that would fit in fewer bytes.
 * To do otherwise is to create security hazards (eg, create an apparent
 * non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 *
 * The switch checks bytes from last to first, deliberately falling through
 * from each case into the next shorter one.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char a;

	switch (length)
	{
		default:
			/* reject lengths 5 and 6 for now */
			return false;
		case 4:
			a = source[3];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */
		case 3:
			a = source[2];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */
		case 2:
			a = source[1];
			switch (*source)
			{
				case 0xE0:
					if (a < 0xA0 || a > 0xBF)
						return false;
					break;
				case 0xED:
					if (a < 0x80 || a > 0x9F)
						return false;
					break;
				case 0xF0:
					if (a < 0x90 || a > 0xBF)
						return false;
					break;
				case 0xF4:
					if (a < 0x80 || a > 0x8F)
						return false;
					break;
				default:
					if (a < 0x80 || a > 0xBF)
						return false;
					break;
			}
			/* FALL THRU */
		case 1:
			a = *source;
			if (a >= 0x80 && a < 0xC2)
				return false;
			if (a > 0xF4)
				return false;
			break;
	}
	return true;
}
1496 #ifndef FRONTEND
1499 * Generic character incrementer function.
1501 * Not knowing anything about the properties of the encoding in use, we just
1502 * keep incrementing the last byte until we get a validly-encoded result,
1503 * or we run out of values to try. We don't bother to try incrementing
1504 * higher-order bytes, so there's no growth in runtime for wider characters.
1505 * (If we did try to do that, we'd need to consider the likelihood that 255
1506 * is not a valid final byte in the encoding.)
1508 static bool
1509 pg_generic_charinc(unsigned char *charptr, int len)
1511 unsigned char *lastbyte = charptr + len - 1;
1512 mbverifier mbverify;
1514 /* We can just invoke the character verifier directly. */
1515 mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
1517 while (*lastbyte < (unsigned char) 255)
1519 (*lastbyte)++;
1520 if ((*mbverify) (charptr, len) == len)
1521 return true;
1524 return false;
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  We also
 * need some special-case logic to skip regions used for surrogate pair
 * handling, as those should not occur in valid UTF-8.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 *
 * Returns true if a byte was incremented in place, false if no byte could
 * be incremented (the character is left unmodified in that case).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char a;
	unsigned char limit;

	switch (length)
	{
		default:
			/* reject lengths 5 and 6 for now */
			return false;
		case 4:
			a = charptr[3];
			if (a < 0xBF)
			{
				charptr[3]++;
				break;
			}
			/* FALL THRU */
		case 3:
			a = charptr[2];
			if (a < 0xBF)
			{
				charptr[2]++;
				break;
			}
			/* FALL THRU */
		case 2:
			a = charptr[1];
			/* the second byte's ceiling depends on the lead byte */
			switch (*charptr)
			{
				case 0xED:
					limit = 0x9F;
					break;
				case 0xF4:
					limit = 0x8F;
					break;
				default:
					limit = 0xBF;
					break;
			}
			if (a < limit)
			{
				charptr[1]++;
				break;
			}
			/* FALL THRU */
		case 1:
			a = *charptr;
			/* these lead bytes are the last of their respective lengths */
			if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
				return false;
			charptr[0]++;
			break;
	}

	return true;
}
1601 * EUC-JP character incrementer function.
1603 * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1604 * representing JIS X 0201 characters with the second byte ranging between
1605 * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1606 * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1608 * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1609 * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1610 * is incremented if possible, otherwise the second-to-last byte.
1612 * If the sequence starts with a value other than the above and its MSB
1613 * is set, it must be a two-byte sequence representing JIS X 0208 characters
1614 * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1615 * incremented if possible, otherwise the second-to-last byte.
1617 * Otherwise, the sequence is a single-byte ASCII character. It is
1618 * incremented up to 0x7f.
1620 static bool
1621 pg_eucjp_increment(unsigned char *charptr, int length)
1623 unsigned char c1,
1625 int i;
1627 c1 = *charptr;
1629 switch (c1)
1631 case SS2: /* JIS X 0201 */
1632 if (length != 2)
1633 return false;
1635 c2 = charptr[1];
1637 if (c2 >= 0xdf)
1638 charptr[0] = charptr[1] = 0xa1;
1639 else if (c2 < 0xa1)
1640 charptr[1] = 0xa1;
1641 else
1642 charptr[1]++;
1643 break;
1645 case SS3: /* JIS X 0212 */
1646 if (length != 3)
1647 return false;
1649 for (i = 2; i > 0; i--)
1651 c2 = charptr[i];
1652 if (c2 < 0xa1)
1654 charptr[i] = 0xa1;
1655 return true;
1657 else if (c2 < 0xfe)
1659 charptr[i]++;
1660 return true;
1664 /* Out of 3-byte code region */
1665 return false;
1667 default:
1668 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1670 if (length != 2)
1671 return false;
1673 for (i = 1; i >= 0; i--)
1675 c2 = charptr[i];
1676 if (c2 < 0xa1)
1678 charptr[i] = 0xa1;
1679 return true;
1681 else if (c2 < 0xfe)
1683 charptr[i]++;
1684 return true;
1688 /* Out of 2 byte code region */
1689 return false;
1691 else
1692 { /* ASCII, single byte */
1693 if (c1 > 0x7e)
1694 return false;
1695 (*charptr)++;
1697 break;
1700 return true;
1702 #endif /* !FRONTEND */
1706 *-------------------------------------------------------------------
1707 * encoding info table
1708 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
1709 *-------------------------------------------------------------------
1711 const pg_wchar_tbl pg_wchar_table[] = {
1712 {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
1713 {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */
1714 {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */
1715 {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */
1716 {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */
1717 {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */
1718 {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */
1719 {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */
1720 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
1721 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
1722 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
1723 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
1724 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
1725 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
1726 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
1727 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
1728 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
1729 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
1730 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
1731 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
1732 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
1733 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
1734 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
1735 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
1736 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
1737 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
1738 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
1739 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
1740 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
1741 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
1742 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
1743 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
1744 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
1745 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
1746 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
1747 {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
1748 {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
1749 {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */
1750 {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */
1751 {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */
1752 {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
1753 {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
/*
 * Returns the byte length of a word for mule internal code.
 *
 * This simply forwards to pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	int			nbytes = pg_mule_mblen(mbstr);

	return nbytes;
}
1764 * Returns the byte length of a multibyte character.
1767 pg_encoding_mblen(int encoding, const char *mbstr)
1769 return (PG_VALID_ENCODING(encoding) ?
1770 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1771 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1775 * Returns the display length of a multibyte character.
1778 pg_encoding_dsplen(int encoding, const char *mbstr)
1780 return (PG_VALID_ENCODING(encoding) ?
1781 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1782 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1786 * Verify the first multibyte character of the given string.
1787 * Return its byte length if good, -1 if bad. (See comments above for
1788 * full details of the mbverify API.)
1791 pg_encoding_verifymb(int encoding, const char *mbstr, int len)
1793 return (PG_VALID_ENCODING(encoding) ?
1794 pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
1795 pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
1799 * fetch maximum length of a given encoding
1802 pg_encoding_max_length(int encoding)
1804 Assert(PG_VALID_ENCODING(encoding));
1806 return pg_wchar_table[encoding].maxmblen;
1809 #ifndef FRONTEND
1812 * fetch maximum length of the encoding for the current database
1815 pg_database_encoding_max_length(void)
1817 return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1821 * get the character incrementer for the encoding for the current database
1823 mbcharacter_incrementer
1824 pg_database_encoding_character_incrementer(void)
1827 * Eventually it might be best to add a field to pg_wchar_table[], but for
1828 * now we just use a switch.
1830 switch (GetDatabaseEncoding())
1832 case PG_UTF8:
1833 return pg_utf8_increment;
1835 case PG_EUC_JP:
1836 return pg_eucjp_increment;
1838 default:
1839 return pg_generic_charinc;
/*
 * Verify mbstr to make sure that it is validly encoded in the current
 * database encoding.  Otherwise same as pg_verify_mbstr().
 *
 * Returns true when pg_verify_mbstr_len() reports a non-negative length.
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			nchars;

	nchars = pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError);

	return nchars >= 0;
}
/*
 * Verify mbstr to make sure that it is validly encoded in the specified
 * encoding.
 *
 * Returns true when pg_verify_mbstr_len() reports a non-negative length.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	int			nchars;

	nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

	return nchars >= 0;
}
1865 * Verify mbstr to make sure that it is validly encoded in the specified
1866 * encoding.
1868 * mbstr is not necessarily zero terminated; length of mbstr is
1869 * specified by len.
1871 * If OK, return length of string in the encoding.
1872 * If a problem is found, return -1 when noError is
1873 * true; when noError is false, ereport() a descriptive message.
1876 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1878 mbverifier mbverify;
1879 int mb_len;
1881 Assert(PG_VALID_ENCODING(encoding));
1884 * In single-byte encodings, we need only reject nulls (\0).
1886 if (pg_encoding_max_length(encoding) <= 1)
1888 const char *nullpos = memchr(mbstr, 0, len);
1890 if (nullpos == NULL)
1891 return len;
1892 if (noError)
1893 return -1;
1894 report_invalid_encoding(encoding, nullpos, 1);
1897 /* fetch function pointer just once */
1898 mbverify = pg_wchar_table[encoding].mbverify;
1900 mb_len = 0;
1902 while (len > 0)
1904 int l;
1906 /* fast path for ASCII-subset characters */
1907 if (!IS_HIGHBIT_SET(*mbstr))
1909 if (*mbstr != '\0')
1911 mb_len++;
1912 mbstr++;
1913 len--;
1914 continue;
1916 if (noError)
1917 return -1;
1918 report_invalid_encoding(encoding, mbstr, len);
1921 l = (*mbverify) ((const unsigned char *) mbstr, len);
1923 if (l < 0)
1925 if (noError)
1926 return -1;
1927 report_invalid_encoding(encoding, mbstr, len);
1930 mbstr += l;
1931 len -= l;
1932 mb_len++;
1934 return mb_len;
1938 * check_encoding_conversion_args: check arguments of a conversion function
1940 * "expected" arguments can be either an encoding ID or -1 to indicate that
1941 * the caller will check whether it accepts the ID.
1943 * Note: the errors here are not really user-facing, so elog instead of
1944 * ereport seems sufficient. Also, we trust that the "expected" encoding
1945 * arguments are valid encoding IDs, but we don't trust the actuals.
1947 void
1948 check_encoding_conversion_args(int src_encoding,
1949 int dest_encoding,
1950 int len,
1951 int expected_src_encoding,
1952 int expected_dest_encoding)
1954 if (!PG_VALID_ENCODING(src_encoding))
1955 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1956 if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1957 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1958 pg_enc2name_tbl[expected_src_encoding].name,
1959 pg_enc2name_tbl[src_encoding].name);
1960 if (!PG_VALID_ENCODING(dest_encoding))
1961 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1962 if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1963 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1964 pg_enc2name_tbl[expected_dest_encoding].name,
1965 pg_enc2name_tbl[dest_encoding].name);
1966 if (len < 0)
1967 elog(ERROR, "encoding conversion length must not be negative");
1971 * report_invalid_encoding: complain about invalid multibyte character
1973 * note: len is remaining length of string, not length of character;
1974 * len must be greater than zero, as we always examine the first byte.
1976 void
1977 report_invalid_encoding(int encoding, const char *mbstr, int len)
1979 int l = pg_encoding_mblen(encoding, mbstr);
1980 char buf[8 * 5 + 1];
1981 char *p = buf;
1982 int j,
1983 jlimit;
1985 jlimit = Min(l, len);
1986 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1988 for (j = 0; j < jlimit; j++)
1990 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1991 if (j < jlimit - 1)
1992 p += sprintf(p, " ");
1995 ereport(ERROR,
1996 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1997 errmsg("invalid byte sequence for encoding \"%s\": %s",
1998 pg_enc2name_tbl[encoding].name,
1999 buf)));
2003 * report_untranslatable_char: complain about untranslatable character
2005 * note: len is remaining length of string, not length of character;
2006 * len must be greater than zero, as we always examine the first byte.
2008 void
2009 report_untranslatable_char(int src_encoding, int dest_encoding,
2010 const char *mbstr, int len)
2012 int l = pg_encoding_mblen(src_encoding, mbstr);
2013 char buf[8 * 5 + 1];
2014 char *p = buf;
2015 int j,
2016 jlimit;
2018 jlimit = Min(l, len);
2019 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
2021 for (j = 0; j < jlimit; j++)
2023 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2024 if (j < jlimit - 1)
2025 p += sprintf(p, " ");
2028 ereport(ERROR,
2029 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
2030 errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
2031 buf,
2032 pg_enc2name_tbl[src_encoding].name,
2033 pg_enc2name_tbl[dest_encoding].name)));
2036 #endif /* !FRONTEND */