1 /*-------------------------------------------------------------------------
4 * Functions for the variable-length built-in types.
6 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/utils/adt/varlena.c
13 *-------------------------------------------------------------------------
20 #include "access/detoast.h"
21 #include "access/toast_compression.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "lib/hyperloglog.h"
28 #include "libpq/pqformat.h"
29 #include "miscadmin.h"
30 #include "nodes/execnodes.h"
31 #include "parser/scansup.h"
32 #include "port/pg_bswap.h"
33 #include "regex/regex.h"
34 #include "utils/builtins.h"
35 #include "utils/bytea.h"
36 #include "utils/lsyscache.h"
37 #include "utils/memutils.h"
38 #include "utils/pg_locale.h"
39 #include "utils/sortsupport.h"
40 #include "utils/varlena.h"
44 int bytea_output
= BYTEA_OUTPUT_HEX
;
46 typedef struct varlena unknown
;
47 typedef struct varlena VarString
;
50 * State for text_position_* functions.
54 bool is_multibyte
; /* T if multibyte encoding */
55 bool is_multibyte_char_in_char
; /* need to check char boundaries? */
57 char *str1
; /* haystack string */
58 char *str2
; /* needle string */
59 int len1
; /* string lengths in bytes */
62 /* Skip table for Boyer-Moore-Horspool search algorithm: */
63 int skiptablemask
; /* mask for ANDing with skiptable subscripts */
64 int skiptable
[256]; /* skip distance for given mismatched char */
66 char *last_match
; /* pointer to last match in 'str1' */
69 * Sometimes we need to convert the byte position of a match to a
70 * character position. These store the last position that was converted,
71 * so that on the next call, we can continue from that point, rather than
72 * count characters from the very beginning.
74 char *refpoint
; /* pointer within original haystack string */
75 int refpos
; /* 0-based character offset of the same point */
80 char *buf1
; /* 1st string, or abbreviation original string
82 char *buf2
; /* 2nd string, or abbreviation strxfrm() buf */
85 int last_len1
; /* Length of last buf1 string/strxfrm() input */
86 int last_len2
; /* Length of last buf2 string/strxfrm() blob */
87 int last_returned
; /* Last comparison result (cache) */
88 bool cache_blob
; /* Does buf2 contain strxfrm() blob, etc? */
90 Oid typid
; /* Actual datatype (text/bpchar/bytea/name) */
91 hyperLogLogState abbr_card
; /* Abbreviated key cardinality state */
92 hyperLogLogState full_card
; /* Full key cardinality state */
93 double prop_card
; /* Required cardinality proportion */
95 } VarStringSortSupport
;
98 * Output data for split_text(): we output either to an array or a table.
99 * tupstore and tupdesc must be set up in advance to output to a table.
103 ArrayBuildState
*astate
;
104 Tuplestorestate
*tupstore
;
106 } SplitTextOutputData
;
109 * This should be large enough that most strings will fit, but small enough
110 * that we feel comfortable putting it on the stack
112 #define TEXTBUFLEN 1024
114 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
115 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
116 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
117 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
118 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
120 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
121 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
123 static int varstrfastcmp_c(Datum x
, Datum y
, SortSupport ssup
);
124 static int bpcharfastcmp_c(Datum x
, Datum y
, SortSupport ssup
);
125 static int namefastcmp_c(Datum x
, Datum y
, SortSupport ssup
);
126 static int varlenafastcmp_locale(Datum x
, Datum y
, SortSupport ssup
);
127 static int namefastcmp_locale(Datum x
, Datum y
, SortSupport ssup
);
128 static int varstrfastcmp_locale(char *a1p
, int len1
, char *a2p
, int len2
, SortSupport ssup
);
129 static int varstrcmp_abbrev(Datum x
, Datum y
, SortSupport ssup
);
130 static Datum
varstr_abbrev_convert(Datum original
, SortSupport ssup
);
131 static bool varstr_abbrev_abort(int memtupcount
, SortSupport ssup
);
132 static int32
text_length(Datum str
);
133 static text
*text_catenate(text
*t1
, text
*t2
);
134 static text
*text_substring(Datum str
,
137 bool length_not_specified
);
138 static text
*text_overlay(text
*t1
, text
*t2
, int sp
, int sl
);
139 static int text_position(text
*t1
, text
*t2
, Oid collid
);
140 static void text_position_setup(text
*t1
, text
*t2
, Oid collid
, TextPositionState
*state
);
141 static bool text_position_next(TextPositionState
*state
);
142 static char *text_position_next_internal(char *start_ptr
, TextPositionState
*state
);
143 static char *text_position_get_match_ptr(TextPositionState
*state
);
144 static int text_position_get_match_pos(TextPositionState
*state
);
145 static void text_position_cleanup(TextPositionState
*state
);
146 static void check_collation_set(Oid collid
);
147 static int text_cmp(text
*arg1
, text
*arg2
, Oid collid
);
148 static bytea
*bytea_catenate(bytea
*t1
, bytea
*t2
);
149 static bytea
*bytea_substring(Datum str
,
152 bool length_not_specified
);
153 static bytea
*bytea_overlay(bytea
*t1
, bytea
*t2
, int sp
, int sl
);
154 static void appendStringInfoText(StringInfo str
, const text
*t
);
155 static bool split_text(FunctionCallInfo fcinfo
, SplitTextOutputData
*tstate
);
156 static void split_text_accum_result(SplitTextOutputData
*tstate
,
160 static text
*array_to_text_internal(FunctionCallInfo fcinfo
, ArrayType
*v
,
161 const char *fldsep
, const char *null_string
);
162 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo
);
163 static bool text_format_parse_digits(const char **ptr
, const char *end_ptr
,
165 static const char *text_format_parse_format(const char *start_ptr
,
167 int *argpos
, int *widthpos
,
168 int *flags
, int *width
);
169 static void text_format_string_conversion(StringInfo buf
, char conversion
,
170 FmgrInfo
*typOutputInfo
,
171 Datum value
, bool isNull
,
172 int flags
, int width
);
173 static void text_format_append_string(StringInfo buf
, const char *str
,
174 int flags
, int width
);
177 /*****************************************************************************
178 * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
179 *****************************************************************************/
184 * Create a text value from a null-terminated C string.
186 * The new text value is freshly palloc'd with a full-size VARHDR.
189 cstring_to_text(const char *s
)
191 return cstring_to_text_with_len(s
, strlen(s
));
195 * cstring_to_text_with_len
197 * Same as cstring_to_text except the caller specifies the string length;
198 * the string need not be null-terminated.
201 cstring_to_text_with_len(const char *s
, int len
)
203 text
*result
= (text
*) palloc(len
+ VARHDRSZ
);
205 SET_VARSIZE(result
, len
+ VARHDRSZ
);
206 memcpy(VARDATA(result
), s
, len
);
214 * Create a palloc'd, null-terminated C string from a text value.
216 * We support being passed a compressed or toasted text value.
217 * This is a bit bogus since such values shouldn't really be referred to as
218 * "text *", but it seems useful for robustness. If we didn't handle that
219 * case here, we'd need another routine that did, anyway.
222 text_to_cstring(const text
*t
)
224 /* must cast away the const, unfortunately */
225 text
*tunpacked
= pg_detoast_datum_packed(unconstify(text
*, t
));
226 int len
= VARSIZE_ANY_EXHDR(tunpacked
);
229 result
= (char *) palloc(len
+ 1);
230 memcpy(result
, VARDATA_ANY(tunpacked
), len
);
240 * text_to_cstring_buffer
242 * Copy a text value into a caller-supplied buffer of size dst_len.
244 * The text string is truncated if necessary to fit. The result is
245 * guaranteed null-terminated (unless dst_len == 0).
247 * We support being passed a compressed or toasted text value.
248 * This is a bit bogus since such values shouldn't really be referred to as
249 * "text *", but it seems useful for robustness. If we didn't handle that
250 * case here, we'd need another routine that did, anyway.
253 text_to_cstring_buffer(const text
*src
, char *dst
, size_t dst_len
)
255 /* must cast away the const, unfortunately */
256 text
*srcunpacked
= pg_detoast_datum_packed(unconstify(text
*, src
));
257 size_t src_len
= VARSIZE_ANY_EXHDR(srcunpacked
);
262 if (dst_len
>= src_len
)
264 else /* ensure truncation is encoding-safe */
265 dst_len
= pg_mbcliplen(VARDATA_ANY(srcunpacked
), src_len
, dst_len
);
266 memcpy(dst
, VARDATA_ANY(srcunpacked
), dst_len
);
270 if (srcunpacked
!= src
)
275 /*****************************************************************************
276 * USER I/O ROUTINES *
277 *****************************************************************************/
280 #define VAL(CH) ((CH) - '0')
281 #define DIG(VAL) ((VAL) + '0')
284 * byteain - converts from printable representation of byte array
286 * Non-printable characters must be passed as '\nnn' (octal) and are
287 * converted to internal form. '\' must be passed as '\\'.
288 * ereport(ERROR, ...) if bad form.
291 * The input is scanned twice.
292 * The error checking of input is minimal.
295 byteain(PG_FUNCTION_ARGS
)
297 char *inputText
= PG_GETARG_CSTRING(0);
303 /* Recognize hex input */
304 if (inputText
[0] == '\\' && inputText
[1] == 'x')
306 size_t len
= strlen(inputText
);
308 bc
= (len
- 2) / 2 + VARHDRSZ
; /* maximum possible length */
310 bc
= hex_decode(inputText
+ 2, len
- 2, VARDATA(result
));
311 SET_VARSIZE(result
, bc
+ VARHDRSZ
); /* actual length */
313 PG_RETURN_BYTEA_P(result
);
316 /* Else, it's the traditional escaped style */
317 for (bc
= 0, tp
= inputText
; *tp
!= '\0'; bc
++)
321 else if ((tp
[0] == '\\') &&
322 (tp
[1] >= '0' && tp
[1] <= '3') &&
323 (tp
[2] >= '0' && tp
[2] <= '7') &&
324 (tp
[3] >= '0' && tp
[3] <= '7'))
326 else if ((tp
[0] == '\\') &&
332 * one backslash, not followed by another or ### valid octal
335 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION
),
336 errmsg("invalid input syntax for type %s", "bytea")));
342 result
= (bytea
*) palloc(bc
);
343 SET_VARSIZE(result
, bc
);
346 rp
= VARDATA(result
);
351 else if ((tp
[0] == '\\') &&
352 (tp
[1] >= '0' && tp
[1] <= '3') &&
353 (tp
[2] >= '0' && tp
[2] <= '7') &&
354 (tp
[3] >= '0' && tp
[3] <= '7'))
360 *rp
++ = bc
+ VAL(tp
[3]);
364 else if ((tp
[0] == '\\') &&
373 * We should never get here. The first pass should not allow it.
376 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION
),
377 errmsg("invalid input syntax for type %s", "bytea")));
381 PG_RETURN_BYTEA_P(result
);
385 * byteaout - converts to printable representation of byte array
387 * In the traditional escaped format, non-printable characters are
388 * printed as '\nnn' (octal) and '\' as '\\'.
391 byteaout(PG_FUNCTION_ARGS
)
393 bytea
*vlena
= PG_GETARG_BYTEA_PP(0);
397 if (bytea_output
== BYTEA_OUTPUT_HEX
)
399 /* Print hex format */
400 rp
= result
= palloc(VARSIZE_ANY_EXHDR(vlena
) * 2 + 2 + 1);
403 rp
+= hex_encode(VARDATA_ANY(vlena
), VARSIZE_ANY_EXHDR(vlena
), rp
);
405 else if (bytea_output
== BYTEA_OUTPUT_ESCAPE
)
407 /* Print traditional escaped format */
412 len
= 1; /* empty string has 1 char */
413 vp
= VARDATA_ANY(vlena
);
414 for (i
= VARSIZE_ANY_EXHDR(vlena
); i
!= 0; i
--, vp
++)
418 else if ((unsigned char) *vp
< 0x20 || (unsigned char) *vp
> 0x7e)
425 * In principle len can't overflow uint32 if the input fit in 1GB, but
426 * for safety let's check rather than relying on palloc's internal
429 if (len
> MaxAllocSize
)
431 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
432 errmsg_internal("result of bytea output conversion is too large")));
433 rp
= result
= (char *) palloc(len
);
435 vp
= VARDATA_ANY(vlena
);
436 for (i
= VARSIZE_ANY_EXHDR(vlena
); i
!= 0; i
--, vp
++)
443 else if ((unsigned char) *vp
< 0x20 || (unsigned char) *vp
> 0x7e)
445 int val
; /* holds unprintable chars */
449 rp
[3] = DIG(val
& 07);
451 rp
[2] = DIG(val
& 07);
453 rp
[1] = DIG(val
& 03);
462 elog(ERROR
, "unrecognized bytea_output setting: %d",
464 rp
= result
= NULL
; /* keep compiler quiet */
467 PG_RETURN_CSTRING(result
);
471 * bytearecv - converts external binary format to bytea
474 bytearecv(PG_FUNCTION_ARGS
)
476 StringInfo buf
= (StringInfo
) PG_GETARG_POINTER(0);
480 nbytes
= buf
->len
- buf
->cursor
;
481 result
= (bytea
*) palloc(nbytes
+ VARHDRSZ
);
482 SET_VARSIZE(result
, nbytes
+ VARHDRSZ
);
483 pq_copymsgbytes(buf
, VARDATA(result
), nbytes
);
484 PG_RETURN_BYTEA_P(result
);
488 * byteasend - converts bytea to binary format
490 * This is a special case: just copy the input...
493 byteasend(PG_FUNCTION_ARGS
)
495 bytea
*vlena
= PG_GETARG_BYTEA_P_COPY(0);
497 PG_RETURN_BYTEA_P(vlena
);
501 bytea_string_agg_transfn(PG_FUNCTION_ARGS
)
505 state
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
507 /* Append the value unless null. */
508 if (!PG_ARGISNULL(1))
510 bytea
*value
= PG_GETARG_BYTEA_PP(1);
512 /* On the first time through, we ignore the delimiter. */
514 state
= makeStringAggState(fcinfo
);
515 else if (!PG_ARGISNULL(2))
517 bytea
*delim
= PG_GETARG_BYTEA_PP(2);
519 appendBinaryStringInfo(state
, VARDATA_ANY(delim
), VARSIZE_ANY_EXHDR(delim
));
522 appendBinaryStringInfo(state
, VARDATA_ANY(value
), VARSIZE_ANY_EXHDR(value
));
526 * The transition type for string_agg() is declared to be "internal",
527 * which is a pass-by-value type the same size as a pointer.
529 PG_RETURN_POINTER(state
);
533 bytea_string_agg_finalfn(PG_FUNCTION_ARGS
)
537 /* cannot be called directly because of internal-type argument */
538 Assert(AggCheckCallContext(fcinfo
, NULL
));
540 state
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
546 result
= (bytea
*) palloc(state
->len
+ VARHDRSZ
);
547 SET_VARSIZE(result
, state
->len
+ VARHDRSZ
);
548 memcpy(VARDATA(result
), state
->data
, state
->len
);
549 PG_RETURN_BYTEA_P(result
);
556 * textin - converts "..." to internal representation
559 textin(PG_FUNCTION_ARGS
)
561 char *inputText
= PG_GETARG_CSTRING(0);
563 PG_RETURN_TEXT_P(cstring_to_text(inputText
));
567 * textout - converts internal representation to "..."
570 textout(PG_FUNCTION_ARGS
)
572 Datum txt
= PG_GETARG_DATUM(0);
574 PG_RETURN_CSTRING(TextDatumGetCString(txt
));
578 * textrecv - converts external binary format to text
581 textrecv(PG_FUNCTION_ARGS
)
583 StringInfo buf
= (StringInfo
) PG_GETARG_POINTER(0);
588 str
= pq_getmsgtext(buf
, buf
->len
- buf
->cursor
, &nbytes
);
590 result
= cstring_to_text_with_len(str
, nbytes
);
592 PG_RETURN_TEXT_P(result
);
596 * textsend - converts text to binary format
599 textsend(PG_FUNCTION_ARGS
)
601 text
*t
= PG_GETARG_TEXT_PP(0);
604 pq_begintypsend(&buf
);
605 pq_sendtext(&buf
, VARDATA_ANY(t
), VARSIZE_ANY_EXHDR(t
));
606 PG_RETURN_BYTEA_P(pq_endtypsend(&buf
));
611 * unknownin - converts "..." to internal representation
614 unknownin(PG_FUNCTION_ARGS
)
616 char *str
= PG_GETARG_CSTRING(0);
618 /* representation is same as cstring */
619 PG_RETURN_CSTRING(pstrdup(str
));
623 * unknownout - converts internal representation to "..."
626 unknownout(PG_FUNCTION_ARGS
)
628 /* representation is same as cstring */
629 char *str
= PG_GETARG_CSTRING(0);
631 PG_RETURN_CSTRING(pstrdup(str
));
635 * unknownrecv - converts external binary format to unknown
638 unknownrecv(PG_FUNCTION_ARGS
)
640 StringInfo buf
= (StringInfo
) PG_GETARG_POINTER(0);
644 str
= pq_getmsgtext(buf
, buf
->len
- buf
->cursor
, &nbytes
);
645 /* representation is same as cstring */
646 PG_RETURN_CSTRING(str
);
650 * unknownsend - converts unknown to binary format
653 unknownsend(PG_FUNCTION_ARGS
)
655 /* representation is same as cstring */
656 char *str
= PG_GETARG_CSTRING(0);
659 pq_begintypsend(&buf
);
660 pq_sendtext(&buf
, str
, strlen(str
));
661 PG_RETURN_BYTEA_P(pq_endtypsend(&buf
));
665 /* ========== PUBLIC ROUTINES ========== */
669 * returns the logical length of a text*
670 * (which is less than the VARSIZE of the text*)
673 textlen(PG_FUNCTION_ARGS
)
675 Datum str
= PG_GETARG_DATUM(0);
677 /* try to avoid decompressing argument */
678 PG_RETURN_INT32(text_length(str
));
683 * Does the real work for textlen()
685 * This is broken out so it can be called directly by other string processing
686 * functions. Note that the argument is passed as a Datum, to indicate that
687 * it may still be in compressed form. We can avoid decompressing it at all
691 text_length(Datum str
)
693 /* fastpath when max encoding length is one */
694 if (pg_database_encoding_max_length() == 1)
695 PG_RETURN_INT32(toast_raw_datum_size(str
) - VARHDRSZ
);
698 text
*t
= DatumGetTextPP(str
);
700 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t
),
701 VARSIZE_ANY_EXHDR(t
)));
707 * returns the physical length of a text*
708 * (which is less than the VARSIZE of the text*)
711 textoctetlen(PG_FUNCTION_ARGS
)
713 Datum str
= PG_GETARG_DATUM(0);
715 /* We need not detoast the input at all */
716 PG_RETURN_INT32(toast_raw_datum_size(str
) - VARHDRSZ
);
721 * takes two text* and returns a text* that is the concatenation of
724 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
725 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
726 * Allocate space for output in all cases.
727 * XXX - thomas 1997-07-10
730 textcat(PG_FUNCTION_ARGS
)
732 text
*t1
= PG_GETARG_TEXT_PP(0);
733 text
*t2
= PG_GETARG_TEXT_PP(1);
735 PG_RETURN_TEXT_P(text_catenate(t1
, t2
));
740 * Guts of textcat(), broken out so it can be used by other functions
742 * Arguments can be in short-header form, but not compressed or out-of-line
745 text_catenate(text
*t1
, text
*t2
)
753 len1
= VARSIZE_ANY_EXHDR(t1
);
754 len2
= VARSIZE_ANY_EXHDR(t2
);
756 /* paranoia ... probably should throw error instead? */
762 len
= len1
+ len2
+ VARHDRSZ
;
763 result
= (text
*) palloc(len
);
765 /* Set size of result string... */
766 SET_VARSIZE(result
, len
);
768 /* Fill data field of result string... */
769 ptr
= VARDATA(result
);
771 memcpy(ptr
, VARDATA_ANY(t1
), len1
);
773 memcpy(ptr
+ len1
, VARDATA_ANY(t2
), len2
);
779 * charlen_to_bytelen()
780 * Compute the number of bytes occupied by n characters starting at *p
782 * It is caller's responsibility that there actually are n characters;
783 * the string need not be null-terminated.
786 charlen_to_bytelen(const char *p
, int n
)
788 if (pg_database_encoding_max_length() == 1)
790 /* Optimization for single-byte encodings */
797 for (s
= p
; n
> 0; n
--)
806 * Return a substring starting at the specified position.
807 * - thomas 1997-12-31
811 * - starting position (is one-based)
814 * If the starting position is zero or less, then return from the start of the string
815 * adjusting the length to be consistent with the "negative start" per SQL.
816 * If the length is less than zero, return the remaining string.
818 * Added multibyte support.
819 * - Tatsuo Ishii 1998-4-21
820 * Changed behavior if starting position is less than one to conform to SQL behavior.
821 * Formerly returned the entire string; now returns a portion.
822 * - Thomas Lockhart 1998-12-10
823 * Now uses faster TOAST-slicing interface
824 * - John Gray 2002-02-22
825 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
826 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
827 * error; if E < 1, return '', not entire string). Fixed MB related bug when
828 * S > LC and < LC + 4 sometimes garbage characters are returned.
829 * - Joe Conway 2002-08-10
832 text_substr(PG_FUNCTION_ARGS
)
834 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
841 * text_substr_no_len -
842 * Wrapper to avoid opr_sanity failure due to
843 * one function accepting a different number of args.
846 text_substr_no_len(PG_FUNCTION_ARGS
)
848 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
855 * Does the real work for text_substr() and text_substr_no_len()
857 * This is broken out so it can be called directly by other string processing
858 * functions. Note that the argument is passed as a Datum, to indicate that
859 * it may still be in compressed/toasted form. We can avoid detoasting all
860 * of it in some cases.
862 * The result is always a freshly palloc'd datum.
865 text_substring(Datum str
, int32 start
, int32 length
, bool length_not_specified
)
867 int32 eml
= pg_database_encoding_max_length();
868 int32 S
= start
; /* start position */
869 int32 S1
; /* adjusted start position */
870 int32 L1
; /* adjusted substring length */
871 int32 E
; /* end position */
874 * SQL99 says S can be zero or negative, but we still must fetch from the
875 * start of the string.
879 /* life is easy if the encoding max length is 1 */
882 if (length_not_specified
) /* special case - get length to end of
887 /* SQL99 says to throw an error for E < S, i.e., negative length */
889 (errcode(ERRCODE_SUBSTRING_ERROR
),
890 errmsg("negative substring length not allowed")));
891 L1
= -1; /* silence stupider compilers */
893 else if (pg_add_s32_overflow(S
, length
, &E
))
896 * L could be large enough for S + L to overflow, in which case
897 * the substring must run to end of string.
904 * A zero or negative value for the end position can happen if the
905 * start was negative or one. SQL99 says to return a zero-length
909 return cstring_to_text("");
915 * If the start position is past the end of the string, SQL99 says to
916 * return a zero-length string -- DatumGetTextPSlice() will do that
917 * for us. We need only convert S1 to zero-based starting position.
919 return DatumGetTextPSlice(str
, S1
- 1, L1
);
924 * When encoding max length is > 1, we can't get LC without
925 * detoasting, so we'll grab a conservatively large slice now and go
926 * back later to do the right thing
939 * We need to start at position zero because there is no way to know
940 * in advance which byte offset corresponds to the supplied start
945 if (length_not_specified
) /* special case - get length to end of
947 slice_size
= L1
= -1;
950 /* SQL99 says to throw an error for E < S, i.e., negative length */
952 (errcode(ERRCODE_SUBSTRING_ERROR
),
953 errmsg("negative substring length not allowed")));
954 slice_size
= L1
= -1; /* silence stupider compilers */
956 else if (pg_add_s32_overflow(S
, length
, &E
))
959 * L could be large enough for S + L to overflow, in which case
960 * the substring must run to end of string.
962 slice_size
= L1
= -1;
967 * A zero or negative value for the end position can happen if the
968 * start was negative or one. SQL99 says to return a zero-length
972 return cstring_to_text("");
975 * if E is past the end of the string, the tuple toaster will
976 * truncate the length for us
981 * Total slice size in bytes can't be any longer than the start
982 * position plus substring length times the encoding max length.
983 * If that overflows, we can just use -1.
985 if (pg_mul_s32_overflow(E
, eml
, &slice_size
))
990 * If we're working with an untoasted source, no need to do an extra
993 if (VARATT_IS_COMPRESSED(DatumGetPointer(str
)) ||
994 VARATT_IS_EXTERNAL(DatumGetPointer(str
)))
995 slice
= DatumGetTextPSlice(str
, slice_start
, slice_size
);
997 slice
= (text
*) DatumGetPointer(str
);
999 /* see if we got back an empty string */
1000 if (VARSIZE_ANY_EXHDR(slice
) == 0)
1002 if (slice
!= (text
*) DatumGetPointer(str
))
1004 return cstring_to_text("");
1007 /* Now we can get the actual length of the slice in MB characters */
1008 slice_strlen
= pg_mbstrlen_with_len(VARDATA_ANY(slice
),
1009 VARSIZE_ANY_EXHDR(slice
));
1012 * Check that the start position wasn't > slice_strlen. If so, SQL99
1013 * says to return a zero-length string.
1015 if (S1
> slice_strlen
)
1017 if (slice
!= (text
*) DatumGetPointer(str
))
1019 return cstring_to_text("");
1023 * Adjust L1 and E1 now that we know the slice string length. Again
1024 * remember that S1 is one based, and slice_start is zero based.
1027 E1
= Min(S1
+ L1
, slice_start
+ 1 + slice_strlen
);
1029 E1
= slice_start
+ 1 + slice_strlen
;
1032 * Find the start position in the slice; remember S1 is not zero based
1034 p
= VARDATA_ANY(slice
);
1035 for (i
= 0; i
< S1
- 1; i
++)
1038 /* hang onto a pointer to our start position */
1042 * Count the actual bytes used by the substring of the requested
1045 for (i
= S1
; i
< E1
; i
++)
1048 ret
= (text
*) palloc(VARHDRSZ
+ (p
- s
));
1049 SET_VARSIZE(ret
, VARHDRSZ
+ (p
- s
));
1050 memcpy(VARDATA(ret
), s
, (p
- s
));
1052 if (slice
!= (text
*) DatumGetPointer(str
))
1058 elog(ERROR
, "invalid backend encoding: encoding max length < 1");
1060 /* not reached: suppress compiler warning */
1066 * Replace specified substring of first string with second
1068 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1069 * This code is a direct implementation of what the standard says.
1072 textoverlay(PG_FUNCTION_ARGS
)
1074 text
*t1
= PG_GETARG_TEXT_PP(0);
1075 text
*t2
= PG_GETARG_TEXT_PP(1);
1076 int sp
= PG_GETARG_INT32(2); /* substring start position */
1077 int sl
= PG_GETARG_INT32(3); /* substring length */
1079 PG_RETURN_TEXT_P(text_overlay(t1
, t2
, sp
, sl
));
1083 textoverlay_no_len(PG_FUNCTION_ARGS
)
1085 text
*t1
= PG_GETARG_TEXT_PP(0);
1086 text
*t2
= PG_GETARG_TEXT_PP(1);
1087 int sp
= PG_GETARG_INT32(2); /* substring start position */
1090 sl
= text_length(PointerGetDatum(t2
)); /* defaults to length(t2) */
1091 PG_RETURN_TEXT_P(text_overlay(t1
, t2
, sp
, sl
));
1095 text_overlay(text
*t1
, text
*t2
, int sp
, int sl
)
1103 * Check for possible integer-overflow cases. For negative sp, throw a
1104 * "substring length" error because that's what should be expected
1105 * according to the spec's definition of OVERLAY().
1109 (errcode(ERRCODE_SUBSTRING_ERROR
),
1110 errmsg("negative substring length not allowed")));
1111 if (pg_add_s32_overflow(sp
, sl
, &sp_pl_sl
))
1113 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
1114 errmsg("integer out of range")));
1116 s1
= text_substring(PointerGetDatum(t1
), 1, sp
- 1, false);
1117 s2
= text_substring(PointerGetDatum(t1
), sp_pl_sl
, -1, true);
1118 result
= text_catenate(s1
, t2
);
1119 result
= text_catenate(result
, s2
);
1126 * Return the position of the specified substring.
1127 * Implements the SQL POSITION() function.
1128 * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1129 * - thomas 1997-07-27
1132 textpos(PG_FUNCTION_ARGS
)
1134 text
*str
= PG_GETARG_TEXT_PP(0);
1135 text
*search_str
= PG_GETARG_TEXT_PP(1);
1137 PG_RETURN_INT32((int32
) text_position(str
, search_str
, PG_GET_COLLATION()));
1142 * Does the real work for textpos()
1145 * t1 - string to be searched
1146 * t2 - pattern to match within t1
1148 * Character index of the first matched char, starting from 1,
1151 * This is broken out so it can be called directly by other string processing
1155 text_position(text
*t1
, text
*t2
, Oid collid
)
1157 TextPositionState state
;
1160 /* Empty needle always matches at position 1 */
1161 if (VARSIZE_ANY_EXHDR(t2
) < 1)
1164 /* Otherwise, can't match if haystack is shorter than needle */
1165 if (VARSIZE_ANY_EXHDR(t1
) < VARSIZE_ANY_EXHDR(t2
))
1168 text_position_setup(t1
, t2
, collid
, &state
);
1169 if (!text_position_next(&state
))
1172 result
= text_position_get_match_pos(&state
);
1173 text_position_cleanup(&state
);
1179 * text_position_setup, text_position_next, text_position_cleanup -
1180 * Component steps of text_position()
1182 * These are broken out so that a string can be efficiently searched for
1183 * multiple occurrences of the same pattern. text_position_next may be
1184 * called multiple times, and it advances to the next match on each call.
1185 * text_position_get_match_ptr() and text_position_get_match_pos() return
1186 * a pointer or 1-based character position of the last match, respectively.
1188 * The "state" variable is normally just a local variable in the caller.
1190 * NOTE: text_position_next skips over the matched portion. For example,
1191 * searching for "xx" in "xxx" returns only one match, not two.
1195 text_position_setup(text
*t1
, text
*t2
, Oid collid
, TextPositionState
*state
)
1197 int len1
= VARSIZE_ANY_EXHDR(t1
);
1198 int len2
= VARSIZE_ANY_EXHDR(t2
);
1199 pg_locale_t mylocale
= 0;
1201 check_collation_set(collid
);
1203 if (!lc_collate_is_c(collid
) && collid
!= DEFAULT_COLLATION_OID
)
1204 mylocale
= pg_newlocale_from_collation(collid
);
1206 if (mylocale
&& !mylocale
->deterministic
)
1208 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED
),
1209 errmsg("nondeterministic collations are not supported for substring searches")));
1215 * Even with a multi-byte encoding, we perform the search using the raw
1216 * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1217 * because in UTF-8 the byte sequence of one character cannot contain
1218 * another character. For other multi-byte encodings, we do the search
1219 * initially as a simple byte search, ignoring multibyte issues, but
1220 * verify afterwards that the match we found is at a character boundary,
1221 * and continue the search if it was a false match.
1223 if (pg_database_encoding_max_length() == 1)
1225 state
->is_multibyte
= false;
1226 state
->is_multibyte_char_in_char
= false;
1228 else if (GetDatabaseEncoding() == PG_UTF8
)
1230 state
->is_multibyte
= true;
1231 state
->is_multibyte_char_in_char
= false;
1235 state
->is_multibyte
= true;
1236 state
->is_multibyte_char_in_char
= true;
1239 state
->str1
= VARDATA_ANY(t1
);
1240 state
->str2
= VARDATA_ANY(t2
);
1243 state
->last_match
= NULL
;
1244 state
->refpoint
= state
->str1
;
1248 * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1249 * notes we use the terminology that the "haystack" is the string to be
1250 * searched (t1) and the "needle" is the pattern being sought (t2).
1252 * If the needle is empty or bigger than the haystack then there is no
1253 * point in wasting cycles initializing the table. We also choose not to
1254 * use B-M-H for needles of length 1, since the skip table can't possibly
1255 * save anything in that case.
1257 if (len1
>= len2
&& len2
> 1)
1259 int searchlength
= len1
- len2
;
1263 const char *str2
= state
->str2
;
1266 * First we must determine how much of the skip table to use. The
1267 * declaration of TextPositionState allows up to 256 elements, but for
1268 * short search problems we don't really want to have to initialize so
1269 * many elements --- it would take too long in comparison to the
1270 * actual search time. So we choose a useful skip table size based on
1271 * the haystack length minus the needle length. The closer the needle
1272 * length is to the haystack length the less useful skipping becomes.
1274 * Note: since we use bit-masking to select table elements, the skip
1275 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1277 if (searchlength
< 16)
1279 else if (searchlength
< 64)
1281 else if (searchlength
< 128)
1283 else if (searchlength
< 512)
1285 else if (searchlength
< 2048)
1287 else if (searchlength
< 4096)
1288 skiptablemask
= 127;
1290 skiptablemask
= 255;
1291 state
->skiptablemask
= skiptablemask
;
1294 * Initialize the skip table. We set all elements to the needle
1295 * length, since this is the correct skip distance for any character
1296 * not found in the needle.
1298 for (i
= 0; i
<= skiptablemask
; i
++)
1299 state
->skiptable
[i
] = len2
;
1302 * Now examine the needle. For each character except the last one,
1303 * set the corresponding table element to the appropriate skip
1304 * distance. Note that when two characters share the same skip table
1305 * entry, the one later in the needle must determine the skip
1310 for (i
= 0; i
< last
; i
++)
1311 state
->skiptable
[(unsigned char) str2
[i
] & skiptablemask
] = last
- i
;
1316 * Advance to the next match, starting from the end of the previous match
1317 * (or the beginning of the string, on first call). Returns true if a match
1320 * Note that this refuses to match an empty-string needle. Most callers
1321 * will have handled that case specially and we'll never see it here.
1324 text_position_next(TextPositionState
*state
)
1326 int needle_len
= state
->len2
;
1330 if (needle_len
<= 0)
1331 return false; /* result for empty pattern */
1333 /* Start from the point right after the previous match. */
1334 if (state
->last_match
)
1335 start_ptr
= state
->last_match
+ needle_len
;
1337 start_ptr
= state
->str1
;
1340 matchptr
= text_position_next_internal(start_ptr
, state
);
1346 * Found a match for the byte sequence. If this is a multibyte encoding,
1347 * where one character's byte sequence can appear inside a longer
1348 * multi-byte character, we need to verify that the match was at a
1349 * character boundary, not in the middle of a multi-byte character.
1351 if (state
->is_multibyte_char_in_char
)
1353 /* Walk one character at a time, until we reach the match. */
1355 /* the search should never move backwards. */
1356 Assert(state
->refpoint
<= matchptr
);
1358 while (state
->refpoint
< matchptr
)
1360 /* step to next character. */
1361 state
->refpoint
+= pg_mblen(state
->refpoint
);
1365 * If we stepped over the match's start position, then it was a
1366 * false positive, where the byte sequence appeared in the middle
1367 * of a multi-byte character. Skip it, and continue the search at
1368 * the next character boundary.
1370 if (state
->refpoint
> matchptr
)
1372 start_ptr
= state
->refpoint
;
1378 state
->last_match
= matchptr
;
1383 * Subroutine of text_position_next(). This searches for the raw byte
1384 * sequence, ignoring any multi-byte encoding issues. Returns the first
1385 * match starting at 'start_ptr', or NULL if no match is found.
1388 text_position_next_internal(char *start_ptr
, TextPositionState
*state
)
1390 int haystack_len
= state
->len1
;
1391 int needle_len
= state
->len2
;
1392 int skiptablemask
= state
->skiptablemask
;
1393 const char *haystack
= state
->str1
;
1394 const char *needle
= state
->str2
;
1395 const char *haystack_end
= &haystack
[haystack_len
];
1398 Assert(start_ptr
>= haystack
&& start_ptr
<= haystack_end
);
1400 if (needle_len
== 1)
1402 /* No point in using B-M-H for a one-character needle */
1403 char nchar
= *needle
;
1406 while (hptr
< haystack_end
)
1409 return (char *) hptr
;
1415 const char *needle_last
= &needle
[needle_len
- 1];
1417 /* Start at startpos plus the length of the needle */
1418 hptr
= start_ptr
+ needle_len
- 1;
1419 while (hptr
< haystack_end
)
1421 /* Match the needle scanning *backward* */
1429 /* Matched it all? If so, return 1-based position */
1436 * No match, so use the haystack char at hptr to decide how far to
1437 * advance. If the needle had any occurrence of that character
1438 * (or more precisely, one sharing the same skiptable entry)
1439 * before its last character, then we advance far enough to align
1440 * the last such needle character with that haystack position.
1441 * Otherwise we can advance by the whole needle length.
1443 hptr
+= state
->skiptable
[(unsigned char) *hptr
& skiptablemask
];
1447 return 0; /* not found */
1451 * Return a pointer to the current match.
1453 * The returned pointer points into the original haystack string.
1456 text_position_get_match_ptr(TextPositionState
*state
)
1458 return state
->last_match
;
1462 * Return the offset of the current match.
1464 * The offset is in characters, 1-based.
1467 text_position_get_match_pos(TextPositionState
*state
)
1469 if (!state
->is_multibyte
)
1470 return state
->last_match
- state
->str1
+ 1;
1473 /* Convert the byte position to char position. */
1474 while (state
->refpoint
< state
->last_match
)
1476 state
->refpoint
+= pg_mblen(state
->refpoint
);
1479 Assert(state
->refpoint
== state
->last_match
);
1480 return state
->refpos
+ 1;
1485 * Reset search state to the initial state installed by text_position_setup.
1487 * The next call to text_position_next will search from the beginning
1491 text_position_reset(TextPositionState
*state
)
1493 state
->last_match
= NULL
;
1494 state
->refpoint
= state
->str1
;
1499 text_position_cleanup(TextPositionState
*state
)
1501 /* no cleanup needed */
1506 check_collation_set(Oid collid
)
1508 if (!OidIsValid(collid
))
1511 * This typically means that the parser could not resolve a conflict
1512 * of implicit collations, so report it that way.
1515 (errcode(ERRCODE_INDETERMINATE_COLLATION
),
1516 errmsg("could not determine which collation to use for string comparison"),
1517 errhint("Use the COLLATE clause to set the collation explicitly.")));
1522 * Comparison function for text strings with given lengths.
1523 * Includes locale support, but must copy strings to temporary memory
1524 * to allow null-termination for inputs to strcoll().
1525 * Returns an integer less than, equal to, or greater than zero, indicating
1526 * whether arg1 is less than, equal to, or greater than arg2.
1528 * Note: many functions that depend on this are marked leakproof; therefore,
1529 * avoid reporting the actual contents of the input when throwing errors.
1530 * All errors herein should be things that can't happen except on corrupt
1531 * data, anyway; otherwise we will have trouble with indexing strings that
1535 varstr_cmp(const char *arg1
, int len1
, const char *arg2
, int len2
, Oid collid
)
1539 check_collation_set(collid
);
1542 * Unfortunately, there is no strncoll(), so in the non-C locale case we
1543 * have to do some memory copying. This turns out to be significantly
1544 * slower, so we optimize the case where LC_COLLATE is C. We also try to
1545 * optimize relatively-short strings by avoiding palloc/pfree overhead.
1547 if (lc_collate_is_c(collid
))
1549 result
= memcmp(arg1
, arg2
, Min(len1
, len2
));
1550 if ((result
== 0) && (len1
!= len2
))
1551 result
= (len1
< len2
) ? -1 : 1;
1555 char a1buf
[TEXTBUFLEN
];
1556 char a2buf
[TEXTBUFLEN
];
1559 pg_locale_t mylocale
= 0;
1561 if (collid
!= DEFAULT_COLLATION_OID
)
1562 mylocale
= pg_newlocale_from_collation(collid
);
1565 * memcmp() can't tell us which of two unequal strings sorts first,
1566 * but it's a cheap way to tell if they're equal. Testing shows that
1567 * memcmp() followed by strcoll() is only trivially slower than
1568 * strcoll() by itself, so we don't lose much if this doesn't work out
1569 * very often, and if it does - for example, because there are many
1570 * equal strings in the input - then we win big by avoiding expensive
1571 * collation-aware comparisons.
1573 if (len1
== len2
&& memcmp(arg1
, arg2
, len1
) == 0)
1577 /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1578 if (GetDatabaseEncoding() == PG_UTF8
1579 && (!mylocale
|| mylocale
->provider
== COLLPROVIDER_LIBC
))
1585 if (len1
>= TEXTBUFLEN
/ 2)
1587 a1len
= len1
* 2 + 2;
1588 a1p
= palloc(a1len
);
1595 if (len2
>= TEXTBUFLEN
/ 2)
1597 a2len
= len2
* 2 + 2;
1598 a2p
= palloc(a2len
);
1606 /* stupid Microsloth API does not work for zero-length input */
1611 r
= MultiByteToWideChar(CP_UTF8
, 0, arg1
, len1
,
1612 (LPWSTR
) a1p
, a1len
/ 2);
1615 (errmsg("could not convert string to UTF-16: error code %lu",
1618 ((LPWSTR
) a1p
)[r
] = 0;
1624 r
= MultiByteToWideChar(CP_UTF8
, 0, arg2
, len2
,
1625 (LPWSTR
) a2p
, a2len
/ 2);
1628 (errmsg("could not convert string to UTF-16: error code %lu",
1631 ((LPWSTR
) a2p
)[r
] = 0;
1634 #ifdef HAVE_LOCALE_T
1636 result
= wcscoll_l((LPWSTR
) a1p
, (LPWSTR
) a2p
, mylocale
->info
.lt
);
1639 result
= wcscoll((LPWSTR
) a1p
, (LPWSTR
) a2p
);
1640 if (result
== 2147483647) /* _NLSCMPERROR; missing from mingw
1643 (errmsg("could not compare Unicode strings: %m")));
1645 /* Break tie if necessary. */
1647 (!mylocale
|| mylocale
->deterministic
))
1649 result
= memcmp(arg1
, arg2
, Min(len1
, len2
));
1650 if ((result
== 0) && (len1
!= len2
))
1651 result
= (len1
< len2
) ? -1 : 1;
1663 if (len1
>= TEXTBUFLEN
)
1664 a1p
= (char *) palloc(len1
+ 1);
1667 if (len2
>= TEXTBUFLEN
)
1668 a2p
= (char *) palloc(len2
+ 1);
1672 memcpy(a1p
, arg1
, len1
);
1674 memcpy(a2p
, arg2
, len2
);
1679 if (mylocale
->provider
== COLLPROVIDER_ICU
)
1682 #ifdef HAVE_UCOL_STRCOLLUTF8
1683 if (GetDatabaseEncoding() == PG_UTF8
)
1687 status
= U_ZERO_ERROR
;
1688 result
= ucol_strcollUTF8(mylocale
->info
.icu
.ucol
,
1692 if (U_FAILURE(status
))
1694 (errmsg("collation failed: %s", u_errorName(status
))));
1704 ulen1
= icu_to_uchar(&uchar1
, arg1
, len1
);
1705 ulen2
= icu_to_uchar(&uchar2
, arg2
, len2
);
1707 result
= ucol_strcoll(mylocale
->info
.icu
.ucol
,
1714 #else /* not USE_ICU */
1715 /* shouldn't happen */
1716 elog(ERROR
, "unsupported collprovider: %c", mylocale
->provider
);
1717 #endif /* not USE_ICU */
1721 #ifdef HAVE_LOCALE_T
1722 result
= strcoll_l(a1p
, a2p
, mylocale
->info
.lt
);
1724 /* shouldn't happen */
1725 elog(ERROR
, "unsupported collprovider: %c", mylocale
->provider
);
1730 result
= strcoll(a1p
, a2p
);
1732 /* Break tie if necessary. */
1734 (!mylocale
|| mylocale
->deterministic
))
1735 result
= strcmp(a1p
, a2p
);
1747 * Internal comparison function for text strings.
1748 * Returns -1, 0 or 1
1751 text_cmp(text
*arg1
, text
*arg2
, Oid collid
)
1758 a1p
= VARDATA_ANY(arg1
);
1759 a2p
= VARDATA_ANY(arg2
);
1761 len1
= VARSIZE_ANY_EXHDR(arg1
);
1762 len2
= VARSIZE_ANY_EXHDR(arg2
);
1764 return varstr_cmp(a1p
, len1
, a2p
, len2
, collid
);
1768 * Comparison functions for text strings.
1770 * Note: btree indexes need these routines not to leak memory; therefore,
1771 * be careful to free working copies of toasted datums. Most places don't
1772 * need to be so careful.
1776 texteq(PG_FUNCTION_ARGS
)
1778 Oid collid
= PG_GET_COLLATION();
1781 check_collation_set(collid
);
1783 if (lc_collate_is_c(collid
) ||
1784 collid
== DEFAULT_COLLATION_OID
||
1785 pg_newlocale_from_collation(collid
)->deterministic
)
1787 Datum arg1
= PG_GETARG_DATUM(0);
1788 Datum arg2
= PG_GETARG_DATUM(1);
1793 * Since we only care about equality or not-equality, we can avoid all
1794 * the expense of strcoll() here, and just do bitwise comparison. In
1795 * fact, we don't even have to do a bitwise comparison if we can show
1796 * the lengths of the strings are unequal; which might save us from
1797 * having to detoast one or both values.
1799 len1
= toast_raw_datum_size(arg1
);
1800 len2
= toast_raw_datum_size(arg2
);
1805 text
*targ1
= DatumGetTextPP(arg1
);
1806 text
*targ2
= DatumGetTextPP(arg2
);
1808 result
= (memcmp(VARDATA_ANY(targ1
), VARDATA_ANY(targ2
),
1809 len1
- VARHDRSZ
) == 0);
1811 PG_FREE_IF_COPY(targ1
, 0);
1812 PG_FREE_IF_COPY(targ2
, 1);
1817 text
*arg1
= PG_GETARG_TEXT_PP(0);
1818 text
*arg2
= PG_GETARG_TEXT_PP(1);
1820 result
= (text_cmp(arg1
, arg2
, collid
) == 0);
1822 PG_FREE_IF_COPY(arg1
, 0);
1823 PG_FREE_IF_COPY(arg2
, 1);
1826 PG_RETURN_BOOL(result
);
1830 textne(PG_FUNCTION_ARGS
)
1832 Oid collid
= PG_GET_COLLATION();
1835 check_collation_set(collid
);
1837 if (lc_collate_is_c(collid
) ||
1838 collid
== DEFAULT_COLLATION_OID
||
1839 pg_newlocale_from_collation(collid
)->deterministic
)
1841 Datum arg1
= PG_GETARG_DATUM(0);
1842 Datum arg2
= PG_GETARG_DATUM(1);
1846 /* See comment in texteq() */
1847 len1
= toast_raw_datum_size(arg1
);
1848 len2
= toast_raw_datum_size(arg2
);
1853 text
*targ1
= DatumGetTextPP(arg1
);
1854 text
*targ2
= DatumGetTextPP(arg2
);
1856 result
= (memcmp(VARDATA_ANY(targ1
), VARDATA_ANY(targ2
),
1857 len1
- VARHDRSZ
) != 0);
1859 PG_FREE_IF_COPY(targ1
, 0);
1860 PG_FREE_IF_COPY(targ2
, 1);
1865 text
*arg1
= PG_GETARG_TEXT_PP(0);
1866 text
*arg2
= PG_GETARG_TEXT_PP(1);
1868 result
= (text_cmp(arg1
, arg2
, collid
) != 0);
1870 PG_FREE_IF_COPY(arg1
, 0);
1871 PG_FREE_IF_COPY(arg2
, 1);
1874 PG_RETURN_BOOL(result
);
1878 text_lt(PG_FUNCTION_ARGS
)
1880 text
*arg1
= PG_GETARG_TEXT_PP(0);
1881 text
*arg2
= PG_GETARG_TEXT_PP(1);
1884 result
= (text_cmp(arg1
, arg2
, PG_GET_COLLATION()) < 0);
1886 PG_FREE_IF_COPY(arg1
, 0);
1887 PG_FREE_IF_COPY(arg2
, 1);
1889 PG_RETURN_BOOL(result
);
1893 text_le(PG_FUNCTION_ARGS
)
1895 text
*arg1
= PG_GETARG_TEXT_PP(0);
1896 text
*arg2
= PG_GETARG_TEXT_PP(1);
1899 result
= (text_cmp(arg1
, arg2
, PG_GET_COLLATION()) <= 0);
1901 PG_FREE_IF_COPY(arg1
, 0);
1902 PG_FREE_IF_COPY(arg2
, 1);
1904 PG_RETURN_BOOL(result
);
1908 text_gt(PG_FUNCTION_ARGS
)
1910 text
*arg1
= PG_GETARG_TEXT_PP(0);
1911 text
*arg2
= PG_GETARG_TEXT_PP(1);
1914 result
= (text_cmp(arg1
, arg2
, PG_GET_COLLATION()) > 0);
1916 PG_FREE_IF_COPY(arg1
, 0);
1917 PG_FREE_IF_COPY(arg2
, 1);
1919 PG_RETURN_BOOL(result
);
1923 text_ge(PG_FUNCTION_ARGS
)
1925 text
*arg1
= PG_GETARG_TEXT_PP(0);
1926 text
*arg2
= PG_GETARG_TEXT_PP(1);
1929 result
= (text_cmp(arg1
, arg2
, PG_GET_COLLATION()) >= 0);
1931 PG_FREE_IF_COPY(arg1
, 0);
1932 PG_FREE_IF_COPY(arg2
, 1);
1934 PG_RETURN_BOOL(result
);
1938 text_starts_with(PG_FUNCTION_ARGS
)
1940 Datum arg1
= PG_GETARG_DATUM(0);
1941 Datum arg2
= PG_GETARG_DATUM(1);
1942 Oid collid
= PG_GET_COLLATION();
1943 pg_locale_t mylocale
= 0;
1948 check_collation_set(collid
);
1950 if (!lc_collate_is_c(collid
) && collid
!= DEFAULT_COLLATION_OID
)
1951 mylocale
= pg_newlocale_from_collation(collid
);
1953 if (mylocale
&& !mylocale
->deterministic
)
1955 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED
),
1956 errmsg("nondeterministic collations are not supported for substring searches")));
1958 len1
= toast_raw_datum_size(arg1
);
1959 len2
= toast_raw_datum_size(arg2
);
1964 text
*targ1
= text_substring(arg1
, 1, len2
, false);
1965 text
*targ2
= DatumGetTextPP(arg2
);
1967 result
= (memcmp(VARDATA_ANY(targ1
), VARDATA_ANY(targ2
),
1968 VARSIZE_ANY_EXHDR(targ2
)) == 0);
1970 PG_FREE_IF_COPY(targ1
, 0);
1971 PG_FREE_IF_COPY(targ2
, 1);
1974 PG_RETURN_BOOL(result
);
1978 bttextcmp(PG_FUNCTION_ARGS
)
1980 text
*arg1
= PG_GETARG_TEXT_PP(0);
1981 text
*arg2
= PG_GETARG_TEXT_PP(1);
1984 result
= text_cmp(arg1
, arg2
, PG_GET_COLLATION());
1986 PG_FREE_IF_COPY(arg1
, 0);
1987 PG_FREE_IF_COPY(arg2
, 1);
1989 PG_RETURN_INT32(result
);
1993 bttextsortsupport(PG_FUNCTION_ARGS
)
1995 SortSupport ssup
= (SortSupport
) PG_GETARG_POINTER(0);
1996 Oid collid
= ssup
->ssup_collation
;
1997 MemoryContext oldcontext
;
1999 oldcontext
= MemoryContextSwitchTo(ssup
->ssup_cxt
);
2001 /* Use generic string SortSupport */
2002 varstr_sortsupport(ssup
, TEXTOID
, collid
);
2004 MemoryContextSwitchTo(oldcontext
);
2010 * Generic sortsupport interface for character type's operator classes.
2011 * Includes locale support, and support for BpChar semantics (i.e. removing
2012 * trailing spaces before comparison).
2014 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2015 * same representation. Callers that always use the C collation (e.g.
2016 * non-collatable type callers like bytea) may have NUL bytes in their strings;
2017 * this will not work with any other collation, though.
2020 varstr_sortsupport(SortSupport ssup
, Oid typid
, Oid collid
)
2022 bool abbreviate
= ssup
->abbreviate
;
2023 bool collate_c
= false;
2024 VarStringSortSupport
*sss
;
2025 pg_locale_t locale
= 0;
2027 check_collation_set(collid
);
2030 * If possible, set ssup->comparator to a function which can be used to
2031 * directly compare two datums. If we can do this, we'll avoid the
2032 * overhead of a trip through the fmgr layer for every comparison, which
2033 * can be substantial.
2035 * Most typically, we'll set the comparator to varlenafastcmp_locale,
2036 * which uses strcoll() to perform comparisons. We use that for the
2037 * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2038 * LC_COLLATE = C, we can make things quite a bit faster with
2039 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2040 * memcmp() rather than strcoll().
2042 if (lc_collate_is_c(collid
))
2044 if (typid
== BPCHAROID
)
2045 ssup
->comparator
= bpcharfastcmp_c
;
2046 else if (typid
== NAMEOID
)
2048 ssup
->comparator
= namefastcmp_c
;
2049 /* Not supporting abbreviation with type NAME, for now */
2053 ssup
->comparator
= varstrfastcmp_c
;
2060 * We need a collation-sensitive comparison. To make things faster,
2061 * we'll figure out the collation based on the locale id and cache the
2064 if (collid
!= DEFAULT_COLLATION_OID
)
2065 locale
= pg_newlocale_from_collation(collid
);
2068 * There is a further exception on Windows. When the database
2069 * encoding is UTF-8 and we are not using the C collation, complex
2070 * hacks are required. We don't currently have a comparator that
2071 * handles that case, so we fall back on the slow method of having the
2072 * sort code invoke bttextcmp() (in the case of text) via the fmgr
2073 * trampoline. ICU locales work just the same on Windows, however.
2076 if (GetDatabaseEncoding() == PG_UTF8
&&
2077 !(locale
&& locale
->provider
== COLLPROVIDER_ICU
))
2082 * We use varlenafastcmp_locale except for type NAME.
2084 if (typid
== NAMEOID
)
2086 ssup
->comparator
= namefastcmp_locale
;
2087 /* Not supporting abbreviation with type NAME, for now */
2091 ssup
->comparator
= varlenafastcmp_locale
;
2095 * Unfortunately, it seems that abbreviation for non-C collations is
2096 * broken on many common platforms; testing of multiple versions of glibc
2097 * reveals that, for many locales, strcoll() and strxfrm() do not return
2098 * consistent results, which is fatal to this optimization. While no
2099 * other libc other than Cygwin has so far been shown to have a problem,
2100 * we take the conservative course of action for right now and disable
2101 * this categorically. (Users who are certain this isn't a problem on
2102 * their system can define TRUST_STRXFRM.)
2104 * Even apart from the risk of broken locales, it's possible that there
2105 * are platforms where the use of abbreviated keys should be disabled at
2106 * compile time. Having only 4 byte datums could make worst-case
2107 * performance drastically more likely, for example. Moreover, macOS's
2108 * strxfrm() implementation is known to not effectively concentrate a
2109 * significant amount of entropy from the original string in earlier
2110 * transformed blobs. It's possible that other supported platforms are
2111 * similarly encumbered. So, if we ever get past disabling this
2112 * categorically, we may still want or need to disable it for particular
2115 #ifndef TRUST_STRXFRM
2116 if (!collate_c
&& !(locale
&& locale
->provider
== COLLPROVIDER_ICU
))
2121 * If we're using abbreviated keys, or if we're using a locale-aware
2122 * comparison, we need to initialize a VarStringSortSupport object. Both
2123 * cases will make use of the temporary buffers we initialize here for
2124 * scratch space (and to detect requirement for BpChar semantics from
2125 * caller), and the abbreviation case requires additional state.
2127 if (abbreviate
|| !collate_c
)
2129 sss
= palloc(sizeof(VarStringSortSupport
));
2130 sss
->buf1
= palloc(TEXTBUFLEN
);
2131 sss
->buflen1
= TEXTBUFLEN
;
2132 sss
->buf2
= palloc(TEXTBUFLEN
);
2133 sss
->buflen2
= TEXTBUFLEN
;
2134 /* Start with invalid values */
2135 sss
->last_len1
= -1;
2136 sss
->last_len2
= -1;
2138 sss
->last_returned
= 0;
2139 sss
->locale
= locale
;
2142 * To avoid somehow confusing a strxfrm() blob and an original string,
2143 * constantly keep track of the variety of data that buf1 and buf2
2144 * currently contain.
2146 * Comparisons may be interleaved with conversion calls. Frequently,
2147 * conversions and comparisons are batched into two distinct phases,
2148 * but the correctness of caching cannot hinge upon this. For
2149 * comparison caching, buffer state is only trusted if cache_blob is
2150 * found set to false, whereas strxfrm() caching only trusts the state
2151 * when cache_blob is found set to true.
2153 * Arbitrarily initialize cache_blob to true.
2155 sss
->cache_blob
= true;
2156 sss
->collate_c
= collate_c
;
2158 ssup
->ssup_extra
= sss
;
2161 * If possible, plan to use the abbreviated keys optimization. The
2162 * core code may switch back to authoritative comparator should
2163 * abbreviation be aborted.
2167 sss
->prop_card
= 0.20;
2168 initHyperLogLog(&sss
->abbr_card
, 10);
2169 initHyperLogLog(&sss
->full_card
, 10);
2170 ssup
->abbrev_full_comparator
= ssup
->comparator
;
2171 ssup
->comparator
= varstrcmp_abbrev
;
2172 ssup
->abbrev_converter
= varstr_abbrev_convert
;
2173 ssup
->abbrev_abort
= varstr_abbrev_abort
;
2179 * sortsupport comparison func (for C locale case)
2182 varstrfastcmp_c(Datum x
, Datum y
, SortSupport ssup
)
2184 VarString
*arg1
= DatumGetVarStringPP(x
);
2185 VarString
*arg2
= DatumGetVarStringPP(y
);
2192 a1p
= VARDATA_ANY(arg1
);
2193 a2p
= VARDATA_ANY(arg2
);
2195 len1
= VARSIZE_ANY_EXHDR(arg1
);
2196 len2
= VARSIZE_ANY_EXHDR(arg2
);
2198 result
= memcmp(a1p
, a2p
, Min(len1
, len2
));
2199 if ((result
== 0) && (len1
!= len2
))
2200 result
= (len1
< len2
) ? -1 : 1;
2202 /* We can't afford to leak memory here. */
2203 if (PointerGetDatum(arg1
) != x
)
2205 if (PointerGetDatum(arg2
) != y
)
2212 * sortsupport comparison func (for BpChar C locale case)
2214 * BpChar outsources its sortsupport to this module. Specialization for the
2215 * varstr_sortsupport BpChar case, modeled on
2216 * internal_bpchar_pattern_compare().
2219 bpcharfastcmp_c(Datum x
, Datum y
, SortSupport ssup
)
2221 BpChar
*arg1
= DatumGetBpCharPP(x
);
2222 BpChar
*arg2
= DatumGetBpCharPP(y
);
2229 a1p
= VARDATA_ANY(arg1
);
2230 a2p
= VARDATA_ANY(arg2
);
2232 len1
= bpchartruelen(a1p
, VARSIZE_ANY_EXHDR(arg1
));
2233 len2
= bpchartruelen(a2p
, VARSIZE_ANY_EXHDR(arg2
));
2235 result
= memcmp(a1p
, a2p
, Min(len1
, len2
));
2236 if ((result
== 0) && (len1
!= len2
))
2237 result
= (len1
< len2
) ? -1 : 1;
2239 /* We can't afford to leak memory here. */
2240 if (PointerGetDatum(arg1
) != x
)
2242 if (PointerGetDatum(arg2
) != y
)
2249 * sortsupport comparison func (for NAME C locale case)
2252 namefastcmp_c(Datum x
, Datum y
, SortSupport ssup
)
2254 Name arg1
= DatumGetName(x
);
2255 Name arg2
= DatumGetName(y
);
2257 return strncmp(NameStr(*arg1
), NameStr(*arg2
), NAMEDATALEN
);
2261 * sortsupport comparison func (for locale case with all varlena types)
2264 varlenafastcmp_locale(Datum x
, Datum y
, SortSupport ssup
)
2266 VarString
*arg1
= DatumGetVarStringPP(x
);
2267 VarString
*arg2
= DatumGetVarStringPP(y
);
2274 a1p
= VARDATA_ANY(arg1
);
2275 a2p
= VARDATA_ANY(arg2
);
2277 len1
= VARSIZE_ANY_EXHDR(arg1
);
2278 len2
= VARSIZE_ANY_EXHDR(arg2
);
2280 result
= varstrfastcmp_locale(a1p
, len1
, a2p
, len2
, ssup
);
2282 /* We can't afford to leak memory here. */
2283 if (PointerGetDatum(arg1
) != x
)
2285 if (PointerGetDatum(arg2
) != y
)
2292 * sortsupport comparison func (for locale case with NAME type)
2295 namefastcmp_locale(Datum x
, Datum y
, SortSupport ssup
)
2297 Name arg1
= DatumGetName(x
);
2298 Name arg2
= DatumGetName(y
);
2300 return varstrfastcmp_locale(NameStr(*arg1
), strlen(NameStr(*arg1
)),
2301 NameStr(*arg2
), strlen(NameStr(*arg2
)),
2306 * sortsupport comparison func for locale cases
2309 varstrfastcmp_locale(char *a1p
, int len1
, char *a2p
, int len2
, SortSupport ssup
)
2311 VarStringSortSupport
*sss
= (VarStringSortSupport
*) ssup
->ssup_extra
;
2315 /* Fast pre-check for equality, as discussed in varstr_cmp() */
2316 if (len1
== len2
&& memcmp(a1p
, a2p
, len1
) == 0)
2319 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2320 * last_len2. Existing contents of buffers might still be used by
2323 * It's fine to allow the comparison of BpChar padding bytes here,
2324 * even though that implies that the memcmp() will usually be
2325 * performed for BpChar callers (though multibyte characters could
2326 * still prevent that from occurring). The memcmp() is still very
2327 * cheap, and BpChar's funny semantics have us remove trailing spaces
2328 * (not limited to padding), so we need make no distinction between
2329 * padding space characters and "real" space characters.
2334 if (sss
->typid
== BPCHAROID
)
2336 /* Get true number of bytes, ignoring trailing spaces */
2337 len1
= bpchartruelen(a1p
, len1
);
2338 len2
= bpchartruelen(a2p
, len2
);
2341 if (len1
>= sss
->buflen1
)
2344 sss
->buflen1
= Max(len1
+ 1, Min(sss
->buflen1
* 2, MaxAllocSize
));
2345 sss
->buf1
= MemoryContextAlloc(ssup
->ssup_cxt
, sss
->buflen1
);
2347 if (len2
>= sss
->buflen2
)
2350 sss
->buflen2
= Max(len2
+ 1, Min(sss
->buflen2
* 2, MaxAllocSize
));
2351 sss
->buf2
= MemoryContextAlloc(ssup
->ssup_cxt
, sss
->buflen2
);
2355 * We're likely to be asked to compare the same strings repeatedly, and
2356 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2357 * comparisons, even though in general there is no reason to think that
2358 * that will work out (every string datum may be unique). Caching does
2359 * not slow things down measurably when it doesn't work out, and can speed
2360 * things up by rather a lot when it does. In part, this is because the
2361 * memcmp() compares data from cachelines that are needed in L1 cache even
2362 * when the last comparison's result cannot be reused.
2365 if (len1
!= sss
->last_len1
|| memcmp(sss
->buf1
, a1p
, len1
) != 0)
2368 memcpy(sss
->buf1
, a1p
, len1
);
2369 sss
->buf1
[len1
] = '\0';
2370 sss
->last_len1
= len1
;
2374 * If we're comparing the same two strings as last time, we can return the
2375 * same answer without calling strcoll() again. This is more likely than
2376 * it seems (at least with moderate to low cardinality sets), because
2377 * quicksort compares the same pivot against many values.
2379 if (len2
!= sss
->last_len2
|| memcmp(sss
->buf2
, a2p
, len2
) != 0)
2381 memcpy(sss
->buf2
, a2p
, len2
);
2382 sss
->buf2
[len2
] = '\0';
2383 sss
->last_len2
= len2
;
2385 else if (arg1_match
&& !sss
->cache_blob
)
2387 /* Use result cached following last actual strcoll() call */
2388 return sss
->last_returned
;
2393 if (sss
->locale
->provider
== COLLPROVIDER_ICU
)
2396 #ifdef HAVE_UCOL_STRCOLLUTF8
2397 if (GetDatabaseEncoding() == PG_UTF8
)
2401 status
= U_ZERO_ERROR
;
2402 result
= ucol_strcollUTF8(sss
->locale
->info
.icu
.ucol
,
2406 if (U_FAILURE(status
))
2408 (errmsg("collation failed: %s", u_errorName(status
))));
2418 ulen1
= icu_to_uchar(&uchar1
, a1p
, len1
);
2419 ulen2
= icu_to_uchar(&uchar2
, a2p
, len2
);
2421 result
= ucol_strcoll(sss
->locale
->info
.icu
.ucol
,
2428 #else /* not USE_ICU */
2429 /* shouldn't happen */
2430 elog(ERROR
, "unsupported collprovider: %c", sss
->locale
->provider
);
2431 #endif /* not USE_ICU */
2435 #ifdef HAVE_LOCALE_T
2436 result
= strcoll_l(sss
->buf1
, sss
->buf2
, sss
->locale
->info
.lt
);
2438 /* shouldn't happen */
2439 elog(ERROR
, "unsupported collprovider: %c", sss
->locale
->provider
);
2444 result
= strcoll(sss
->buf1
, sss
->buf2
);
2446 /* Break tie if necessary. */
2448 (!sss
->locale
|| sss
->locale
->deterministic
))
2449 result
= strcmp(sss
->buf1
, sss
->buf2
);
2451 /* Cache result, perhaps saving an expensive strcoll() call next time */
2452 sss
->cache_blob
= false;
2453 sss
->last_returned
= result
;
2458 * Abbreviated key comparison func
2461 varstrcmp_abbrev(Datum x
, Datum y
, SortSupport ssup
)
2464 * When 0 is returned, the core system will call varstrfastcmp_c()
2465 * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2466 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2467 * authoritatively, for the same reason that there is a strcoll()
2468 * tie-breaker call to strcmp() in varstr_cmp().
2479 * Conversion routine for sortsupport. Converts original to abbreviated key
2480 * representation. Our encoding strategy is simple -- pack the first 8 bytes
2481 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2482 * stored in reverse order), and treat it as an unsigned integer. When the "C"
2483 * locale is used, or in case of bytea, just memcpy() from original instead.
2486 varstr_abbrev_convert(Datum original
, SortSupport ssup
)
2488 VarStringSortSupport
*sss
= (VarStringSortSupport
*) ssup
->ssup_extra
;
2489 VarString
*authoritative
= DatumGetVarStringPP(original
);
2490 char *authoritative_data
= VARDATA_ANY(authoritative
);
2498 pres
= (char *) &res
;
2499 /* memset(), so any non-overwritten bytes are NUL */
2500 memset(pres
, 0, sizeof(Datum
));
2501 len
= VARSIZE_ANY_EXHDR(authoritative
);
2503 /* Get number of bytes, ignoring trailing spaces */
2504 if (sss
->typid
== BPCHAROID
)
2505 len
= bpchartruelen(authoritative_data
, len
);
2508 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2509 * abbreviate keys. The full comparator for the C locale is always
2510 * memcmp(). It would be incorrect to allow bytea callers (callers that
2511 * always force the C collation -- bytea isn't a collatable type, but this
2512 * approach is convenient) to use strxfrm(). This is because bytea
2513 * strings may contain NUL bytes. Besides, this should be faster, too.
2515 * More generally, it's okay that bytea callers can have NUL bytes in
2516 * strings because varstrcmp_abbrev() need not make a distinction between
2517 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2518 * authoritative representation. Hopefully a comparison at or past one
2519 * abbreviated key's terminating NUL byte will resolve the comparison
2520 * without consulting the authoritative representation; specifically, some
2521 * later non-NUL byte in the longer string can resolve the comparison
2522 * against a subsequent terminating NUL in the shorter string. There will
2523 * usually be what is effectively a "length-wise" resolution there and
2526 * If that doesn't work out -- if all bytes in the longer string
2527 * positioned at or past the offset of the smaller string's (first)
2528 * terminating NUL are actually representative of NUL bytes in the
2529 * authoritative binary string (perhaps with some *terminating* NUL bytes
2530 * towards the end of the longer string iff it happens to still be small)
2531 * -- then an authoritative tie-breaker will happen, and do the right
2532 * thing: explicitly consider string length.
2535 memcpy(pres
, authoritative_data
, Min(len
, sizeof(Datum
)));
2541 UChar
*uchar
= NULL
;
2545 * We're not using the C collation, so fall back on strxfrm or ICU
2549 /* By convention, we use buffer 1 to store and NUL-terminate */
2550 if (len
>= sss
->buflen1
)
2553 sss
->buflen1
= Max(len
+ 1, Min(sss
->buflen1
* 2, MaxAllocSize
));
2554 sss
->buf1
= palloc(sss
->buflen1
);
2557 /* Might be able to reuse strxfrm() blob from last call */
2558 if (sss
->last_len1
== len
&& sss
->cache_blob
&&
2559 memcmp(sss
->buf1
, authoritative_data
, len
) == 0)
2561 memcpy(pres
, sss
->buf2
, Min(sizeof(Datum
), sss
->last_len2
));
2562 /* No change affecting cardinality, so no hashing required */
2566 memcpy(sss
->buf1
, authoritative_data
, len
);
2569 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2570 * necessary for ICU, but doesn't hurt.
2572 sss
->buf1
[len
] = '\0';
2573 sss
->last_len1
= len
;
2576 /* When using ICU and not UTF8, convert string to UChar. */
2577 if (sss
->locale
&& sss
->locale
->provider
== COLLPROVIDER_ICU
&&
2578 GetDatabaseEncoding() != PG_UTF8
)
2579 ulen
= icu_to_uchar(&uchar
, sss
->buf1
, len
);
2583 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2584 * and try again. Both of these functions have the result buffer
2585 * content undefined if the result did not fit, so we need to retry
2586 * until everything fits, even though we only need the first few bytes
2587 * in the end. When using ucol_nextSortKeyPart(), however, we only
2588 * ask for as many bytes as we actually need.
2593 if (sss
->locale
&& sss
->locale
->provider
== COLLPROVIDER_ICU
)
2596 * When using UTF8, use the iteration interface so we only
2597 * need to produce as many bytes as we actually need.
2599 if (GetDatabaseEncoding() == PG_UTF8
)
2605 uiter_setUTF8(&iter
, sss
->buf1
, len
);
2606 state
[0] = state
[1] = 0; /* won't need that again */
2607 status
= U_ZERO_ERROR
;
2608 bsize
= ucol_nextSortKeyPart(sss
->locale
->info
.icu
.ucol
,
2611 (uint8_t *) sss
->buf2
,
2612 Min(sizeof(Datum
), sss
->buflen2
),
2614 if (U_FAILURE(status
))
2616 (errmsg("sort key generation failed: %s",
2617 u_errorName(status
))));
2620 bsize
= ucol_getSortKey(sss
->locale
->info
.icu
.ucol
,
2622 (uint8_t *) sss
->buf2
, sss
->buflen2
);
2626 #ifdef HAVE_LOCALE_T
2627 if (sss
->locale
&& sss
->locale
->provider
== COLLPROVIDER_LIBC
)
2628 bsize
= strxfrm_l(sss
->buf2
, sss
->buf1
,
2629 sss
->buflen2
, sss
->locale
->info
.lt
);
2632 bsize
= strxfrm(sss
->buf2
, sss
->buf1
, sss
->buflen2
);
2634 sss
->last_len2
= bsize
;
2635 if (bsize
< sss
->buflen2
)
2639 * Grow buffer and retry.
2642 sss
->buflen2
= Max(bsize
+ 1,
2643 Min(sss
->buflen2
* 2, MaxAllocSize
));
2644 sss
->buf2
= palloc(sss
->buflen2
);
2648 * Every Datum byte is always compared. This is safe because the
2649 * strxfrm() blob is itself NUL terminated, leaving no danger of
2650 * misinterpreting any NUL bytes not intended to be interpreted as
2651 * logically representing termination.
2653 * (Actually, even if there were NUL bytes in the blob it would be
2654 * okay. See remarks on bytea case above.)
2656 memcpy(pres
, sss
->buf2
, Min(sizeof(Datum
), bsize
));
2665 * Maintain approximate cardinality of both abbreviated keys and original,
2666 * authoritative keys using HyperLogLog. Used as cheap insurance against
2667 * the worst case, where we do many string transformations for no saving
2668 * in full strcoll()-based comparisons. These statistics are used by
2669 * varstr_abbrev_abort().
2671 * First, Hash key proper, or a significant fraction of it. Mix in length
2672 * in order to compensate for cases where differences are past
2673 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2675 hash
= DatumGetUInt32(hash_any((unsigned char *) authoritative_data
,
2676 Min(len
, PG_CACHE_LINE_SIZE
)));
2678 if (len
> PG_CACHE_LINE_SIZE
)
2679 hash
^= DatumGetUInt32(hash_uint32((uint32
) len
));
2681 addHyperLogLog(&sss
->full_card
, hash
);
2683 /* Hash abbreviated key */
2684 #if SIZEOF_DATUM == 8
2689 lohalf
= (uint32
) res
;
2690 hihalf
= (uint32
) (res
>> 32);
2691 hash
= DatumGetUInt32(hash_uint32(lohalf
^ hihalf
));
2693 #else /* SIZEOF_DATUM != 8 */
2694 hash
= DatumGetUInt32(hash_uint32((uint32
) res
));
2697 addHyperLogLog(&sss
->abbr_card
, hash
);
2699 /* Cache result, perhaps saving an expensive strxfrm() call next time */
2700 sss
->cache_blob
= true;
2704 * Byteswap on little-endian machines.
2706 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2707 * comparator) works correctly on all platforms. If we didn't do this,
2708 * the comparator would have to call memcmp() with a pair of pointers to
2709 * the first byte of each abbreviated key, which is slower.
2711 res
= DatumBigEndianToNative(res
);
2713 /* Don't leak memory here */
2714 if (PointerGetDatum(authoritative
) != original
)
2715 pfree(authoritative
);
2721 * Callback for estimating effectiveness of abbreviated key optimization, using
2722 * heuristic rules. Returns value indicating if the abbreviation optimization
2723 * should be aborted, based on its projected effectiveness.
2726 varstr_abbrev_abort(int memtupcount
, SortSupport ssup
)
2728 VarStringSortSupport
*sss
= (VarStringSortSupport
*) ssup
->ssup_extra
;
2729 double abbrev_distinct
,
2732 Assert(ssup
->abbreviate
);
2734 /* Have a little patience */
2735 if (memtupcount
< 100)
2738 abbrev_distinct
= estimateHyperLogLog(&sss
->abbr_card
);
2739 key_distinct
= estimateHyperLogLog(&sss
->full_card
);
2742 * Clamp cardinality estimates to at least one distinct value. While
2743 * NULLs are generally disregarded, if only NULL values were seen so far,
2744 * that might misrepresent costs if we failed to clamp.
2746 if (abbrev_distinct
<= 1.0)
2747 abbrev_distinct
= 1.0;
2749 if (key_distinct
<= 1.0)
2753 * In the worst case all abbreviated keys are identical, while at the same
2754 * time there are differences within full key strings not captured in
2760 double norm_abbrev_card
= abbrev_distinct
/ (double) memtupcount
;
2762 elog(LOG
, "varstr_abbrev: abbrev_distinct after %d: %f "
2763 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2764 memtupcount
, abbrev_distinct
, key_distinct
, norm_abbrev_card
,
2770 * If the number of distinct abbreviated keys approximately matches the
2771 * number of distinct authoritative original keys, that's reason enough to
2772 * proceed. We can win even with a very low cardinality set if most
2773 * tie-breakers only memcmp(). This is by far the most important
2776 * While comparisons that are resolved at the abbreviated key level are
2777 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2778 * those two outcomes are so much cheaper than a full strcoll() once
2779 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2780 * cardinality against the overall size of the set in order to more
2781 * accurately model costs. Assume that an abbreviated comparison, and an
2782 * abbreviated comparison with a cheap memcmp()-based authoritative
2783 * resolution are equivalent.
2785 if (abbrev_distinct
> key_distinct
* sss
->prop_card
)
2788 * When we have exceeded 10,000 tuples, decay required cardinality
2789 * aggressively for next call.
2791 * This is useful because the number of comparisons required on
2792 * average increases at a linearithmic rate, and at roughly 10,000
2793 * tuples that factor will start to dominate over the linear costs of
2794 * string transformation (this is a conservative estimate). The decay
2795 * rate is chosen to be a little less aggressive than halving -- which
2796 * (since we're called at points at which memtupcount has doubled)
2797 * would never see the cost model actually abort past the first call
2798 * following a decay. This decay rate is mostly a precaution against
2799 * a sudden, violent swing in how well abbreviated cardinality tracks
2800 * full key cardinality. The decay also serves to prevent a marginal
2801 * case from being aborted too late, when too much has already been
2802 * invested in string transformation.
2804 * It's possible for sets of several million distinct strings with
2805 * mere tens of thousands of distinct abbreviated keys to still
2806 * benefit very significantly. This will generally occur provided
2807 * each abbreviated key is a proxy for a roughly uniform number of the
2808 * set's full keys. If it isn't so, we hope to catch that early and
2809 * abort. If it isn't caught early, by the time the problem is
2810 * apparent it's probably not worth aborting.
2812 if (memtupcount
> 10000)
2813 sss
->prop_card
*= 0.65;
2819 * Abort abbreviation strategy.
2821 * The worst case, where all abbreviated keys are identical while all
2822 * original strings differ will typically only see a regression of about
2823 * 10% in execution time for small to medium sized lists of strings.
2824 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2825 * often expect very large improvements, particularly with sets of strings
2826 * of moderately high to high abbreviated cardinality. There is little to
2827 * lose but much to gain, which our strategy reflects.
2831 elog(LOG
, "varstr_abbrev: aborted abbreviation at %d "
2832 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2833 memtupcount
, abbrev_distinct
, key_distinct
, sss
->prop_card
);
2840 * Generic equalimage support function for character type's operator classes.
2841 * Disables the use of deduplication with nondeterministic collations.
2844 btvarstrequalimage(PG_FUNCTION_ARGS
)
2846 /* Oid opcintype = PG_GETARG_OID(0); */
2847 Oid collid
= PG_GET_COLLATION();
2849 check_collation_set(collid
);
2851 if (lc_collate_is_c(collid
) ||
2852 collid
== DEFAULT_COLLATION_OID
||
2853 get_collation_isdeterministic(collid
))
2854 PG_RETURN_BOOL(true);
2856 PG_RETURN_BOOL(false);
2860 text_larger(PG_FUNCTION_ARGS
)
2862 text
*arg1
= PG_GETARG_TEXT_PP(0);
2863 text
*arg2
= PG_GETARG_TEXT_PP(1);
2866 result
= ((text_cmp(arg1
, arg2
, PG_GET_COLLATION()) > 0) ? arg1
: arg2
);
2868 PG_RETURN_TEXT_P(result
);
2872 text_smaller(PG_FUNCTION_ARGS
)
2874 text
*arg1
= PG_GETARG_TEXT_PP(0);
2875 text
*arg2
= PG_GETARG_TEXT_PP(1);
2878 result
= ((text_cmp(arg1
, arg2
, PG_GET_COLLATION()) < 0) ? arg1
: arg2
);
2880 PG_RETURN_TEXT_P(result
);
2885 * Cross-type comparison functions for types text and name.
2889 nameeqtext(PG_FUNCTION_ARGS
)
2891 Name arg1
= PG_GETARG_NAME(0);
2892 text
*arg2
= PG_GETARG_TEXT_PP(1);
2893 size_t len1
= strlen(NameStr(*arg1
));
2894 size_t len2
= VARSIZE_ANY_EXHDR(arg2
);
2895 Oid collid
= PG_GET_COLLATION();
2898 check_collation_set(collid
);
2900 if (collid
== C_COLLATION_OID
)
2901 result
= (len1
== len2
&&
2902 memcmp(NameStr(*arg1
), VARDATA_ANY(arg2
), len1
) == 0);
2904 result
= (varstr_cmp(NameStr(*arg1
), len1
,
2905 VARDATA_ANY(arg2
), len2
,
2908 PG_FREE_IF_COPY(arg2
, 1);
2910 PG_RETURN_BOOL(result
);
2914 texteqname(PG_FUNCTION_ARGS
)
2916 text
*arg1
= PG_GETARG_TEXT_PP(0);
2917 Name arg2
= PG_GETARG_NAME(1);
2918 size_t len1
= VARSIZE_ANY_EXHDR(arg1
);
2919 size_t len2
= strlen(NameStr(*arg2
));
2920 Oid collid
= PG_GET_COLLATION();
2923 check_collation_set(collid
);
2925 if (collid
== C_COLLATION_OID
)
2926 result
= (len1
== len2
&&
2927 memcmp(VARDATA_ANY(arg1
), NameStr(*arg2
), len1
) == 0);
2929 result
= (varstr_cmp(VARDATA_ANY(arg1
), len1
,
2930 NameStr(*arg2
), len2
,
2933 PG_FREE_IF_COPY(arg1
, 0);
2935 PG_RETURN_BOOL(result
);
2939 namenetext(PG_FUNCTION_ARGS
)
2941 Name arg1
= PG_GETARG_NAME(0);
2942 text
*arg2
= PG_GETARG_TEXT_PP(1);
2943 size_t len1
= strlen(NameStr(*arg1
));
2944 size_t len2
= VARSIZE_ANY_EXHDR(arg2
);
2945 Oid collid
= PG_GET_COLLATION();
2948 check_collation_set(collid
);
2950 if (collid
== C_COLLATION_OID
)
2951 result
= !(len1
== len2
&&
2952 memcmp(NameStr(*arg1
), VARDATA_ANY(arg2
), len1
) == 0);
2954 result
= !(varstr_cmp(NameStr(*arg1
), len1
,
2955 VARDATA_ANY(arg2
), len2
,
2958 PG_FREE_IF_COPY(arg2
, 1);
2960 PG_RETURN_BOOL(result
);
2964 textnename(PG_FUNCTION_ARGS
)
2966 text
*arg1
= PG_GETARG_TEXT_PP(0);
2967 Name arg2
= PG_GETARG_NAME(1);
2968 size_t len1
= VARSIZE_ANY_EXHDR(arg1
);
2969 size_t len2
= strlen(NameStr(*arg2
));
2970 Oid collid
= PG_GET_COLLATION();
2973 check_collation_set(collid
);
2975 if (collid
== C_COLLATION_OID
)
2976 result
= !(len1
== len2
&&
2977 memcmp(VARDATA_ANY(arg1
), NameStr(*arg2
), len1
) == 0);
2979 result
= !(varstr_cmp(VARDATA_ANY(arg1
), len1
,
2980 NameStr(*arg2
), len2
,
2983 PG_FREE_IF_COPY(arg1
, 0);
2985 PG_RETURN_BOOL(result
);
2989 btnametextcmp(PG_FUNCTION_ARGS
)
2991 Name arg1
= PG_GETARG_NAME(0);
2992 text
*arg2
= PG_GETARG_TEXT_PP(1);
2995 result
= varstr_cmp(NameStr(*arg1
), strlen(NameStr(*arg1
)),
2996 VARDATA_ANY(arg2
), VARSIZE_ANY_EXHDR(arg2
),
2997 PG_GET_COLLATION());
2999 PG_FREE_IF_COPY(arg2
, 1);
3001 PG_RETURN_INT32(result
);
3005 bttextnamecmp(PG_FUNCTION_ARGS
)
3007 text
*arg1
= PG_GETARG_TEXT_PP(0);
3008 Name arg2
= PG_GETARG_NAME(1);
3011 result
= varstr_cmp(VARDATA_ANY(arg1
), VARSIZE_ANY_EXHDR(arg1
),
3012 NameStr(*arg2
), strlen(NameStr(*arg2
)),
3013 PG_GET_COLLATION());
3015 PG_FREE_IF_COPY(arg1
, 0);
3017 PG_RETURN_INT32(result
);
3020 #define CmpCall(cmpfunc) \
3021 DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
3022 PG_GET_COLLATION(), \
3023 PG_GETARG_DATUM(0), \
3024 PG_GETARG_DATUM(1)))
3027 namelttext(PG_FUNCTION_ARGS
)
3029 PG_RETURN_BOOL(CmpCall(btnametextcmp
) < 0);
3033 nameletext(PG_FUNCTION_ARGS
)
3035 PG_RETURN_BOOL(CmpCall(btnametextcmp
) <= 0);
3039 namegttext(PG_FUNCTION_ARGS
)
3041 PG_RETURN_BOOL(CmpCall(btnametextcmp
) > 0);
3045 namegetext(PG_FUNCTION_ARGS
)
3047 PG_RETURN_BOOL(CmpCall(btnametextcmp
) >= 0);
3051 textltname(PG_FUNCTION_ARGS
)
3053 PG_RETURN_BOOL(CmpCall(bttextnamecmp
) < 0);
3057 textlename(PG_FUNCTION_ARGS
)
3059 PG_RETURN_BOOL(CmpCall(bttextnamecmp
) <= 0);
3063 textgtname(PG_FUNCTION_ARGS
)
3065 PG_RETURN_BOOL(CmpCall(bttextnamecmp
) > 0);
3069 textgename(PG_FUNCTION_ARGS
)
3071 PG_RETURN_BOOL(CmpCall(bttextnamecmp
) >= 0);
3078 * The following operators support character-by-character comparison
3079 * of text datums, to allow building indexes suitable for LIKE clauses.
3080 * Note that the regular texteq/textne comparison operators, and regular
3081 * support functions 1 and 2 with "C" collation are assumed to be
3082 * compatible with these!
3086 internal_text_pattern_compare(text
*arg1
, text
*arg2
)
3092 len1
= VARSIZE_ANY_EXHDR(arg1
);
3093 len2
= VARSIZE_ANY_EXHDR(arg2
);
3095 result
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
3098 else if (len1
< len2
)
3100 else if (len1
> len2
)
3108 text_pattern_lt(PG_FUNCTION_ARGS
)
3110 text
*arg1
= PG_GETARG_TEXT_PP(0);
3111 text
*arg2
= PG_GETARG_TEXT_PP(1);
3114 result
= internal_text_pattern_compare(arg1
, arg2
);
3116 PG_FREE_IF_COPY(arg1
, 0);
3117 PG_FREE_IF_COPY(arg2
, 1);
3119 PG_RETURN_BOOL(result
< 0);
3124 text_pattern_le(PG_FUNCTION_ARGS
)
3126 text
*arg1
= PG_GETARG_TEXT_PP(0);
3127 text
*arg2
= PG_GETARG_TEXT_PP(1);
3130 result
= internal_text_pattern_compare(arg1
, arg2
);
3132 PG_FREE_IF_COPY(arg1
, 0);
3133 PG_FREE_IF_COPY(arg2
, 1);
3135 PG_RETURN_BOOL(result
<= 0);
3140 text_pattern_ge(PG_FUNCTION_ARGS
)
3142 text
*arg1
= PG_GETARG_TEXT_PP(0);
3143 text
*arg2
= PG_GETARG_TEXT_PP(1);
3146 result
= internal_text_pattern_compare(arg1
, arg2
);
3148 PG_FREE_IF_COPY(arg1
, 0);
3149 PG_FREE_IF_COPY(arg2
, 1);
3151 PG_RETURN_BOOL(result
>= 0);
3156 text_pattern_gt(PG_FUNCTION_ARGS
)
3158 text
*arg1
= PG_GETARG_TEXT_PP(0);
3159 text
*arg2
= PG_GETARG_TEXT_PP(1);
3162 result
= internal_text_pattern_compare(arg1
, arg2
);
3164 PG_FREE_IF_COPY(arg1
, 0);
3165 PG_FREE_IF_COPY(arg2
, 1);
3167 PG_RETURN_BOOL(result
> 0);
3172 bttext_pattern_cmp(PG_FUNCTION_ARGS
)
3174 text
*arg1
= PG_GETARG_TEXT_PP(0);
3175 text
*arg2
= PG_GETARG_TEXT_PP(1);
3178 result
= internal_text_pattern_compare(arg1
, arg2
);
3180 PG_FREE_IF_COPY(arg1
, 0);
3181 PG_FREE_IF_COPY(arg2
, 1);
3183 PG_RETURN_INT32(result
);
3188 bttext_pattern_sortsupport(PG_FUNCTION_ARGS
)
3190 SortSupport ssup
= (SortSupport
) PG_GETARG_POINTER(0);
3191 MemoryContext oldcontext
;
3193 oldcontext
= MemoryContextSwitchTo(ssup
->ssup_cxt
);
3195 /* Use generic string SortSupport, forcing "C" collation */
3196 varstr_sortsupport(ssup
, TEXTOID
, C_COLLATION_OID
);
3198 MemoryContextSwitchTo(oldcontext
);
3204 /*-------------------------------------------------------------
3207 * get the number of bytes contained in an instance of type 'bytea'
3208 *-------------------------------------------------------------
3211 byteaoctetlen(PG_FUNCTION_ARGS
)
3213 Datum str
= PG_GETARG_DATUM(0);
3215 /* We need not detoast the input at all */
3216 PG_RETURN_INT32(toast_raw_datum_size(str
) - VARHDRSZ
);
3221 * takes two bytea* and returns a bytea* that is the concatenation of
3224 * Cloned from textcat and modified as required.
3227 byteacat(PG_FUNCTION_ARGS
)
3229 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3230 bytea
*t2
= PG_GETARG_BYTEA_PP(1);
3232 PG_RETURN_BYTEA_P(bytea_catenate(t1
, t2
));
3237 * Guts of byteacat(), broken out so it can be used by other functions
3239 * Arguments can be in short-header form, but not compressed or out-of-line
3242 bytea_catenate(bytea
*t1
, bytea
*t2
)
3250 len1
= VARSIZE_ANY_EXHDR(t1
);
3251 len2
= VARSIZE_ANY_EXHDR(t2
);
3253 /* paranoia ... probably should throw error instead? */
3259 len
= len1
+ len2
+ VARHDRSZ
;
3260 result
= (bytea
*) palloc(len
);
3262 /* Set size of result string... */
3263 SET_VARSIZE(result
, len
);
3265 /* Fill data field of result string... */
3266 ptr
= VARDATA(result
);
3268 memcpy(ptr
, VARDATA_ANY(t1
), len1
);
3270 memcpy(ptr
+ len1
, VARDATA_ANY(t2
), len2
);
3275 #define PG_STR_GET_BYTEA(str_) \
3276 DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3280 * Return a substring starting at the specified position.
3281 * Cloned from text_substr and modified as required.
3285 * - starting position (is one-based)
3286 * - string length (optional)
3288 * If the starting position is zero or less, then return from the start of the string
3289 * adjusting the length to be consistent with the "negative start" per SQL.
3290 * If the length is less than zero, an ERROR is thrown. If no third argument
3291 * (length) is provided, the length to the end of the string is assumed.
3294 bytea_substr(PG_FUNCTION_ARGS
)
3296 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3303 * bytea_substr_no_len -
3304 * Wrapper to avoid opr_sanity failure due to
3305 * one function accepting a different number of args.
3308 bytea_substr_no_len(PG_FUNCTION_ARGS
)
3310 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3317 bytea_substring(Datum str
,
3320 bool length_not_specified
)
3322 int32 S1
; /* adjusted start position */
3323 int32 L1
; /* adjusted substring length */
3324 int32 E
; /* end position */
3327 * The logic here should generally match text_substring().
3331 if (length_not_specified
)
3334 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3335 * end of the string if we pass it a negative value for length.
3341 /* SQL99 says to throw an error for E < S, i.e., negative length */
3343 (errcode(ERRCODE_SUBSTRING_ERROR
),
3344 errmsg("negative substring length not allowed")));
3345 L1
= -1; /* silence stupider compilers */
3347 else if (pg_add_s32_overflow(S
, L
, &E
))
3350 * L could be large enough for S + L to overflow, in which case the
3351 * substring must run to end of string.
3358 * A zero or negative value for the end position can happen if the
3359 * start was negative or one. SQL99 says to return a zero-length
3363 return PG_STR_GET_BYTEA("");
3369 * If the start position is past the end of the string, SQL99 says to
3370 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3371 * us. We need only convert S1 to zero-based starting position.
3373 return DatumGetByteaPSlice(str
, S1
- 1, L1
);
3378 * Replace specified substring of first string with second
3380 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3381 * This code is a direct implementation of what the standard says.
3384 byteaoverlay(PG_FUNCTION_ARGS
)
3386 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3387 bytea
*t2
= PG_GETARG_BYTEA_PP(1);
3388 int sp
= PG_GETARG_INT32(2); /* substring start position */
3389 int sl
= PG_GETARG_INT32(3); /* substring length */
3391 PG_RETURN_BYTEA_P(bytea_overlay(t1
, t2
, sp
, sl
));
3395 byteaoverlay_no_len(PG_FUNCTION_ARGS
)
3397 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3398 bytea
*t2
= PG_GETARG_BYTEA_PP(1);
3399 int sp
= PG_GETARG_INT32(2); /* substring start position */
3402 sl
= VARSIZE_ANY_EXHDR(t2
); /* defaults to length(t2) */
3403 PG_RETURN_BYTEA_P(bytea_overlay(t1
, t2
, sp
, sl
));
3407 bytea_overlay(bytea
*t1
, bytea
*t2
, int sp
, int sl
)
3415 * Check for possible integer-overflow cases. For negative sp, throw a
3416 * "substring length" error because that's what should be expected
3417 * according to the spec's definition of OVERLAY().
3421 (errcode(ERRCODE_SUBSTRING_ERROR
),
3422 errmsg("negative substring length not allowed")));
3423 if (pg_add_s32_overflow(sp
, sl
, &sp_pl_sl
))
3425 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
3426 errmsg("integer out of range")));
3428 s1
= bytea_substring(PointerGetDatum(t1
), 1, sp
- 1, false);
3429 s2
= bytea_substring(PointerGetDatum(t1
), sp_pl_sl
, -1, true);
3430 result
= bytea_catenate(s1
, t2
);
3431 result
= bytea_catenate(result
, s2
);
3440 bytea_bit_count(PG_FUNCTION_ARGS
)
3442 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3444 PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1
), VARSIZE_ANY_EXHDR(t1
)));
3449 * Return the position of the specified substring.
3450 * Implements the SQL POSITION() function.
3451 * Cloned from textpos and modified as required.
3454 byteapos(PG_FUNCTION_ARGS
)
3456 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3457 bytea
*t2
= PG_GETARG_BYTEA_PP(1);
3466 len1
= VARSIZE_ANY_EXHDR(t1
);
3467 len2
= VARSIZE_ANY_EXHDR(t2
);
3470 PG_RETURN_INT32(1); /* result for empty pattern */
3472 p1
= VARDATA_ANY(t1
);
3473 p2
= VARDATA_ANY(t2
);
3477 for (p
= 0; p
<= px
; p
++)
3479 if ((*p2
== *p1
) && (memcmp(p1
, p2
, len2
) == 0))
3487 PG_RETURN_INT32(pos
);
3490 /*-------------------------------------------------------------
3493 * this routine treats "bytea" as an array of bytes.
3494 * It returns the Nth byte (a number between 0 and 255).
3495 *-------------------------------------------------------------
3498 byteaGetByte(PG_FUNCTION_ARGS
)
3500 bytea
*v
= PG_GETARG_BYTEA_PP(0);
3501 int32 n
= PG_GETARG_INT32(1);
3505 len
= VARSIZE_ANY_EXHDR(v
);
3507 if (n
< 0 || n
>= len
)
3509 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR
),
3510 errmsg("index %d out of valid range, 0..%d",
3513 byte
= ((unsigned char *) VARDATA_ANY(v
))[n
];
3515 PG_RETURN_INT32(byte
);
3518 /*-------------------------------------------------------------
3521 * This routine treats a "bytea" type like an array of bits.
3522 * It returns the value of the Nth bit (0 or 1).
3524 *-------------------------------------------------------------
3527 byteaGetBit(PG_FUNCTION_ARGS
)
3529 bytea
*v
= PG_GETARG_BYTEA_PP(0);
3530 int64 n
= PG_GETARG_INT64(1);
3536 len
= VARSIZE_ANY_EXHDR(v
);
3538 if (n
< 0 || n
>= (int64
) len
* 8)
3540 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR
),
3541 errmsg("index %lld out of valid range, 0..%lld",
3542 (long long) n
, (long long) len
* 8 - 1)));
3544 /* n/8 is now known < len, so safe to cast to int */
3545 byteNo
= (int) (n
/ 8);
3546 bitNo
= (int) (n
% 8);
3548 byte
= ((unsigned char *) VARDATA_ANY(v
))[byteNo
];
3550 if (byte
& (1 << bitNo
))
3556 /*-------------------------------------------------------------
3559 * Given an instance of type 'bytea' creates a new one with
3560 * the Nth byte set to the given value.
3562 *-------------------------------------------------------------
3565 byteaSetByte(PG_FUNCTION_ARGS
)
3567 bytea
*res
= PG_GETARG_BYTEA_P_COPY(0);
3568 int32 n
= PG_GETARG_INT32(1);
3569 int32 newByte
= PG_GETARG_INT32(2);
3572 len
= VARSIZE(res
) - VARHDRSZ
;
3574 if (n
< 0 || n
>= len
)
3576 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR
),
3577 errmsg("index %d out of valid range, 0..%d",
3583 ((unsigned char *) VARDATA(res
))[n
] = newByte
;
3585 PG_RETURN_BYTEA_P(res
);
3588 /*-------------------------------------------------------------
3591 * Given an instance of type 'bytea' creates a new one with
3592 * the Nth bit set to the given value.
3594 *-------------------------------------------------------------
3597 byteaSetBit(PG_FUNCTION_ARGS
)
3599 bytea
*res
= PG_GETARG_BYTEA_P_COPY(0);
3600 int64 n
= PG_GETARG_INT64(1);
3601 int32 newBit
= PG_GETARG_INT32(2);
3608 len
= VARSIZE(res
) - VARHDRSZ
;
3610 if (n
< 0 || n
>= (int64
) len
* 8)
3612 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR
),
3613 errmsg("index %lld out of valid range, 0..%lld",
3614 (long long) n
, (long long) len
* 8 - 1)));
3616 /* n/8 is now known < len, so safe to cast to int */
3617 byteNo
= (int) (n
/ 8);
3618 bitNo
= (int) (n
% 8);
3623 if (newBit
!= 0 && newBit
!= 1)
3625 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
3626 errmsg("new bit must be 0 or 1")));
3631 oldByte
= ((unsigned char *) VARDATA(res
))[byteNo
];
3634 newByte
= oldByte
& (~(1 << bitNo
));
3636 newByte
= oldByte
| (1 << bitNo
);
3638 ((unsigned char *) VARDATA(res
))[byteNo
] = newByte
;
3640 PG_RETURN_BYTEA_P(res
);
3645 * Converts a text type to a Name type.
3648 text_name(PG_FUNCTION_ARGS
)
3650 text
*s
= PG_GETARG_TEXT_PP(0);
3654 len
= VARSIZE_ANY_EXHDR(s
);
3656 /* Truncate oversize input */
3657 if (len
>= NAMEDATALEN
)
3658 len
= pg_mbcliplen(VARDATA_ANY(s
), len
, NAMEDATALEN
- 1);
3660 /* We use palloc0 here to ensure result is zero-padded */
3661 result
= (Name
) palloc0(NAMEDATALEN
);
3662 memcpy(NameStr(*result
), VARDATA_ANY(s
), len
);
3664 PG_RETURN_NAME(result
);
3668 * Converts a Name type to a text type.
3671 name_text(PG_FUNCTION_ARGS
)
3673 Name s
= PG_GETARG_NAME(0);
3675 PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s
)));
3680 * textToQualifiedNameList - convert a text object to list of names
3682 * This implements the input parsing needed by nextval() and other
3683 * functions that take a text parameter representing a qualified name.
3684 * We split the name at dots, downcase if not double-quoted, and
3685 * truncate names if they're too long.
3688 textToQualifiedNameList(text
*textval
)
3695 /* Convert to C string (handles possible detoasting). */
3696 /* Note we rely on being able to modify rawname below. */
3697 rawname
= text_to_cstring(textval
);
3699 if (!SplitIdentifierString(rawname
, '.', &namelist
))
3701 (errcode(ERRCODE_INVALID_NAME
),
3702 errmsg("invalid name syntax")));
3704 if (namelist
== NIL
)
3706 (errcode(ERRCODE_INVALID_NAME
),
3707 errmsg("invalid name syntax")));
3709 foreach(l
, namelist
)
3711 char *curname
= (char *) lfirst(l
);
3713 result
= lappend(result
, makeString(pstrdup(curname
)));
3717 list_free(namelist
);
3723 * SplitIdentifierString --- parse a string containing identifiers
3725 * This is the guts of textToQualifiedNameList, and is exported for use in
3726 * other situations such as parsing GUC variables. In the GUC case, it's
3727 * important to avoid memory leaks, so the API is designed to minimize the
3728 * amount of stuff that needs to be allocated and freed.
3731 * rawstring: the input string; must be overwritable! On return, it's
3732 * been modified to contain the separated identifiers.
3733 * separator: the separator punctuation expected between identifiers
3734 * (typically '.' or ','). Whitespace may also appear around
3737 * namelist: filled with a palloc'd list of pointers to identifiers within
3738 * rawstring. Caller should list_free() this even on error return.
3740 * Returns true if okay, false if there is a syntax error in the string.
3742 * Note that an empty string is considered okay here, though not in
3743 * textToQualifiedNameList.
3746 SplitIdentifierString(char *rawstring
, char separator
,
3749 char *nextp
= rawstring
;
3754 while (scanner_isspace(*nextp
))
3755 nextp
++; /* skip leading whitespace */
3758 return true; /* allow empty string */
3760 /* At the top of the loop, we are at start of a new identifier. */
3768 /* Quoted name --- collapse quote-quote pairs, no downcasing */
3769 curname
= nextp
+ 1;
3772 endp
= strchr(nextp
+ 1, '"');
3774 return false; /* mismatched quotes */
3776 break; /* found end of quoted name */
3777 /* Collapse adjacent quotes into one quote, and look again */
3778 memmove(endp
, endp
+ 1, strlen(endp
));
3781 /* endp now points at the terminating quote */
3786 /* Unquoted name --- extends to separator or whitespace */
3791 while (*nextp
&& *nextp
!= separator
&&
3792 !scanner_isspace(*nextp
))
3795 if (curname
== nextp
)
3796 return false; /* empty unquoted name not allowed */
3799 * Downcase the identifier, using same code as main lexer does.
3801 * XXX because we want to overwrite the input in-place, we cannot
3802 * support a downcasing transformation that increases the string
3803 * length. This is not a problem given the current implementation
3804 * of downcase_truncate_identifier, but we'll probably have to do
3805 * something about this someday.
3807 len
= endp
- curname
;
3808 downname
= downcase_truncate_identifier(curname
, len
, false);
3809 Assert(strlen(downname
) <= len
);
3810 strncpy(curname
, downname
, len
); /* strncpy is required here */
3814 while (scanner_isspace(*nextp
))
3815 nextp
++; /* skip trailing whitespace */
3817 if (*nextp
== separator
)
3820 while (scanner_isspace(*nextp
))
3821 nextp
++; /* skip leading whitespace for next */
3822 /* we expect another name, so done remains false */
3824 else if (*nextp
== '\0')
3827 return false; /* invalid syntax */
3829 /* Now safe to overwrite separator with a null */
3832 /* Truncate name if it's overlength */
3833 truncate_identifier(curname
, strlen(curname
), false);
3836 * Finished isolating current name --- add it to list
3838 *namelist
= lappend(*namelist
, curname
);
3840 /* Loop back if we didn't reach end of string */
3848 * SplitDirectoriesString --- parse a string containing file/directory names
3850 * This works fine on file names too; the function name is historical.
3852 * This is similar to SplitIdentifierString, except that the parsing
3853 * rules are meant to handle pathnames instead of identifiers: there is
3854 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3855 * and we apply canonicalize_path() to each extracted string. Because of the
3856 * last, the returned strings are separately palloc'd rather than being
3857 * pointers into rawstring --- but we still scribble on rawstring.
3860 * rawstring: the input string; must be modifiable!
3861 * separator: the separator punctuation expected between directories
3862 * (typically ',' or ';'). Whitespace may also appear around
3865 * namelist: filled with a palloc'd list of directory names.
3866 * Caller should list_free_deep() this even on error return.
3868 * Returns true if okay, false if there is a syntax error in the string.
3870 * Note that an empty string is considered okay here.
3873 SplitDirectoriesString(char *rawstring
, char separator
,
3876 char *nextp
= rawstring
;
3881 while (scanner_isspace(*nextp
))
3882 nextp
++; /* skip leading whitespace */
3885 return true; /* allow empty string */
3887 /* At the top of the loop, we are at start of a new directory. */
3895 /* Quoted name --- collapse quote-quote pairs */
3896 curname
= nextp
+ 1;
3899 endp
= strchr(nextp
+ 1, '"');
3901 return false; /* mismatched quotes */
3903 break; /* found end of quoted name */
3904 /* Collapse adjacent quotes into one quote, and look again */
3905 memmove(endp
, endp
+ 1, strlen(endp
));
3908 /* endp now points at the terminating quote */
3913 /* Unquoted name --- extends to separator or end of string */
3914 curname
= endp
= nextp
;
3915 while (*nextp
&& *nextp
!= separator
)
3917 /* trailing whitespace should not be included in name */
3918 if (!scanner_isspace(*nextp
))
3922 if (curname
== endp
)
3923 return false; /* empty unquoted name not allowed */
3926 while (scanner_isspace(*nextp
))
3927 nextp
++; /* skip trailing whitespace */
3929 if (*nextp
== separator
)
3932 while (scanner_isspace(*nextp
))
3933 nextp
++; /* skip leading whitespace for next */
3934 /* we expect another name, so done remains false */
3936 else if (*nextp
== '\0')
3939 return false; /* invalid syntax */
3941 /* Now safe to overwrite separator with a null */
3944 /* Truncate path if it's overlength */
3945 if (strlen(curname
) >= MAXPGPATH
)
3946 curname
[MAXPGPATH
- 1] = '\0';
3949 * Finished isolating current name --- add it to list
3951 curname
= pstrdup(curname
);
3952 canonicalize_path(curname
);
3953 *namelist
= lappend(*namelist
, curname
);
3955 /* Loop back if we didn't reach end of string */
3963 * SplitGUCList --- parse a string containing identifiers or file names
3965 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3966 * presuming whether the elements will be taken as identifiers or file names.
3967 * We assume the input has already been through flatten_set_variable_args(),
3968 * so that we need never downcase (if appropriate, that was done already).
3969 * Nor do we ever truncate, since we don't know the correct max length.
3970 * We disallow embedded whitespace for simplicity (it shouldn't matter,
3971 * because any embedded whitespace should have led to double-quoting).
3972 * Otherwise the API is identical to SplitIdentifierString.
3974 * XXX it's annoying to have so many copies of this string-splitting logic.
3975 * However, it's not clear that having one function with a bunch of option
3976 * flags would be much better.
3978 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3979 * Be sure to update that if you have to change this.
3982 * rawstring: the input string; must be overwritable! On return, it's
3983 * been modified to contain the separated identifiers.
3984 * separator: the separator punctuation expected between identifiers
3985 * (typically '.' or ','). Whitespace may also appear around
3988 * namelist: filled with a palloc'd list of pointers to identifiers within
3989 * rawstring. Caller should list_free() this even on error return.
3991 * Returns true if okay, false if there is a syntax error in the string.
3994 SplitGUCList(char *rawstring
, char separator
,
3997 char *nextp
= rawstring
;
4002 while (scanner_isspace(*nextp
))
4003 nextp
++; /* skip leading whitespace */
4006 return true; /* allow empty string */
4008 /* At the top of the loop, we are at start of a new identifier. */
4016 /* Quoted name --- collapse quote-quote pairs */
4017 curname
= nextp
+ 1;
4020 endp
= strchr(nextp
+ 1, '"');
4022 return false; /* mismatched quotes */
4024 break; /* found end of quoted name */
4025 /* Collapse adjacent quotes into one quote, and look again */
4026 memmove(endp
, endp
+ 1, strlen(endp
));
4029 /* endp now points at the terminating quote */
4034 /* Unquoted name --- extends to separator or whitespace */
4036 while (*nextp
&& *nextp
!= separator
&&
4037 !scanner_isspace(*nextp
))
4040 if (curname
== nextp
)
4041 return false; /* empty unquoted name not allowed */
4044 while (scanner_isspace(*nextp
))
4045 nextp
++; /* skip trailing whitespace */
4047 if (*nextp
== separator
)
4050 while (scanner_isspace(*nextp
))
4051 nextp
++; /* skip leading whitespace for next */
4052 /* we expect another name, so done remains false */
4054 else if (*nextp
== '\0')
4057 return false; /* invalid syntax */
4059 /* Now safe to overwrite separator with a null */
4063 * Finished isolating current name --- add it to list
4065 *namelist
= lappend(*namelist
, curname
);
4067 /* Loop back if we didn't reach end of string */
4074 /*****************************************************************************
4075 * Comparison Functions used for bytea
4077 * Note: btree indexes need these routines not to leak memory; therefore,
4078 * be careful to free working copies of toasted datums. Most places don't
4079 * need to be so careful.
4080 *****************************************************************************/
4083 byteaeq(PG_FUNCTION_ARGS
)
4085 Datum arg1
= PG_GETARG_DATUM(0);
4086 Datum arg2
= PG_GETARG_DATUM(1);
4092 * We can use a fast path for unequal lengths, which might save us from
4093 * having to detoast one or both values.
4095 len1
= toast_raw_datum_size(arg1
);
4096 len2
= toast_raw_datum_size(arg2
);
4101 bytea
*barg1
= DatumGetByteaPP(arg1
);
4102 bytea
*barg2
= DatumGetByteaPP(arg2
);
4104 result
= (memcmp(VARDATA_ANY(barg1
), VARDATA_ANY(barg2
),
4105 len1
- VARHDRSZ
) == 0);
4107 PG_FREE_IF_COPY(barg1
, 0);
4108 PG_FREE_IF_COPY(barg2
, 1);
4111 PG_RETURN_BOOL(result
);
4115 byteane(PG_FUNCTION_ARGS
)
4117 Datum arg1
= PG_GETARG_DATUM(0);
4118 Datum arg2
= PG_GETARG_DATUM(1);
4124 * We can use a fast path for unequal lengths, which might save us from
4125 * having to detoast one or both values.
4127 len1
= toast_raw_datum_size(arg1
);
4128 len2
= toast_raw_datum_size(arg2
);
4133 bytea
*barg1
= DatumGetByteaPP(arg1
);
4134 bytea
*barg2
= DatumGetByteaPP(arg2
);
4136 result
= (memcmp(VARDATA_ANY(barg1
), VARDATA_ANY(barg2
),
4137 len1
- VARHDRSZ
) != 0);
4139 PG_FREE_IF_COPY(barg1
, 0);
4140 PG_FREE_IF_COPY(barg2
, 1);
4143 PG_RETURN_BOOL(result
);
4147 bytealt(PG_FUNCTION_ARGS
)
4149 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
4150 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
4155 len1
= VARSIZE_ANY_EXHDR(arg1
);
4156 len2
= VARSIZE_ANY_EXHDR(arg2
);
4158 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
4160 PG_FREE_IF_COPY(arg1
, 0);
4161 PG_FREE_IF_COPY(arg2
, 1);
4163 PG_RETURN_BOOL((cmp
< 0) || ((cmp
== 0) && (len1
< len2
)));
4167 byteale(PG_FUNCTION_ARGS
)
4169 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
4170 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
4175 len1
= VARSIZE_ANY_EXHDR(arg1
);
4176 len2
= VARSIZE_ANY_EXHDR(arg2
);
4178 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
4180 PG_FREE_IF_COPY(arg1
, 0);
4181 PG_FREE_IF_COPY(arg2
, 1);
4183 PG_RETURN_BOOL((cmp
< 0) || ((cmp
== 0) && (len1
<= len2
)));
4187 byteagt(PG_FUNCTION_ARGS
)
4189 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
4190 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
4195 len1
= VARSIZE_ANY_EXHDR(arg1
);
4196 len2
= VARSIZE_ANY_EXHDR(arg2
);
4198 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
4200 PG_FREE_IF_COPY(arg1
, 0);
4201 PG_FREE_IF_COPY(arg2
, 1);
4203 PG_RETURN_BOOL((cmp
> 0) || ((cmp
== 0) && (len1
> len2
)));
4207 byteage(PG_FUNCTION_ARGS
)
4209 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
4210 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
4215 len1
= VARSIZE_ANY_EXHDR(arg1
);
4216 len2
= VARSIZE_ANY_EXHDR(arg2
);
4218 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
4220 PG_FREE_IF_COPY(arg1
, 0);
4221 PG_FREE_IF_COPY(arg2
, 1);
4223 PG_RETURN_BOOL((cmp
> 0) || ((cmp
== 0) && (len1
>= len2
)));
4227 byteacmp(PG_FUNCTION_ARGS
)
4229 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
4230 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
4235 len1
= VARSIZE_ANY_EXHDR(arg1
);
4236 len2
= VARSIZE_ANY_EXHDR(arg2
);
4238 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
4239 if ((cmp
== 0) && (len1
!= len2
))
4240 cmp
= (len1
< len2
) ? -1 : 1;
4242 PG_FREE_IF_COPY(arg1
, 0);
4243 PG_FREE_IF_COPY(arg2
, 1);
4245 PG_RETURN_INT32(cmp
);
4249 bytea_sortsupport(PG_FUNCTION_ARGS
)
4251 SortSupport ssup
= (SortSupport
) PG_GETARG_POINTER(0);
4252 MemoryContext oldcontext
;
4254 oldcontext
= MemoryContextSwitchTo(ssup
->ssup_cxt
);
4256 /* Use generic string SortSupport, forcing "C" collation */
4257 varstr_sortsupport(ssup
, BYTEAOID
, C_COLLATION_OID
);
4259 MemoryContextSwitchTo(oldcontext
);
4265 * appendStringInfoText
4267 * Append a text to str.
4268 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4271 appendStringInfoText(StringInfo str
, const text
*t
)
4273 appendBinaryStringInfo(str
, VARDATA_ANY(t
), VARSIZE_ANY_EXHDR(t
));
4278 * replace all occurrences of 'old_sub_str' in 'orig_str'
4279 * with 'new_sub_str' to form 'new_str'
4281 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4282 * otherwise returns 'new_str'
4285 replace_text(PG_FUNCTION_ARGS
)
4287 text
*src_text
= PG_GETARG_TEXT_PP(0);
4288 text
*from_sub_text
= PG_GETARG_TEXT_PP(1);
4289 text
*to_sub_text
= PG_GETARG_TEXT_PP(2);
4291 int from_sub_text_len
;
4292 TextPositionState state
;
4300 src_text_len
= VARSIZE_ANY_EXHDR(src_text
);
4301 from_sub_text_len
= VARSIZE_ANY_EXHDR(from_sub_text
);
4303 /* Return unmodified source string if empty source or pattern */
4304 if (src_text_len
< 1 || from_sub_text_len
< 1)
4306 PG_RETURN_TEXT_P(src_text
);
4309 text_position_setup(src_text
, from_sub_text
, PG_GET_COLLATION(), &state
);
4311 found
= text_position_next(&state
);
4313 /* When the from_sub_text is not found, there is nothing to do. */
4316 text_position_cleanup(&state
);
4317 PG_RETURN_TEXT_P(src_text
);
4319 curr_ptr
= text_position_get_match_ptr(&state
);
4320 start_ptr
= VARDATA_ANY(src_text
);
4322 initStringInfo(&str
);
4326 CHECK_FOR_INTERRUPTS();
4328 /* copy the data skipped over by last text_position_next() */
4329 chunk_len
= curr_ptr
- start_ptr
;
4330 appendBinaryStringInfo(&str
, start_ptr
, chunk_len
);
4332 appendStringInfoText(&str
, to_sub_text
);
4334 start_ptr
= curr_ptr
+ from_sub_text_len
;
4336 found
= text_position_next(&state
);
4338 curr_ptr
= text_position_get_match_ptr(&state
);
4342 /* copy trailing data */
4343 chunk_len
= ((char *) src_text
+ VARSIZE_ANY(src_text
)) - start_ptr
;
4344 appendBinaryStringInfo(&str
, start_ptr
, chunk_len
);
4346 text_position_cleanup(&state
);
4348 ret_text
= cstring_to_text_with_len(str
.data
, str
.len
);
4351 PG_RETURN_TEXT_P(ret_text
);
4355 * check_replace_text_has_escape
4357 * Returns 0 if text contains no backslashes that need processing.
4358 * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4359 * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4362 check_replace_text_has_escape(const text
*replace_text
)
4365 const char *p
= VARDATA_ANY(replace_text
);
4366 const char *p_end
= p
+ VARSIZE_ANY_EXHDR(replace_text
);
4370 /* Find next escape char, if any. */
4371 p
= memchr(p
, '\\', p_end
- p
);
4375 /* Note: a backslash at the end doesn't require extra processing. */
4378 if (*p
>= '1' && *p
<= '9')
4379 return 2; /* Found a submatch specifier, so done */
4380 result
= 1; /* Found some other sequence, keep looking */
4388 * appendStringInfoRegexpSubstr
4390 * Append replace_text to str, substituting regexp back references for
4391 * \n escapes. start_ptr is the start of the match in the source string,
4392 * at logical character position data_pos.
4395 appendStringInfoRegexpSubstr(StringInfo str
, text
*replace_text
,
4397 char *start_ptr
, int data_pos
)
4399 const char *p
= VARDATA_ANY(replace_text
);
4400 const char *p_end
= p
+ VARSIZE_ANY_EXHDR(replace_text
);
4404 const char *chunk_start
= p
;
4408 /* Find next escape char, if any. */
4409 p
= memchr(p
, '\\', p_end
- p
);
4413 /* Copy the text we just scanned over, if any. */
4414 if (p
> chunk_start
)
4415 appendBinaryStringInfo(str
, chunk_start
, p
- chunk_start
);
4417 /* Done if at end of string, else advance over escape char. */
4424 /* Escape at very end of input. Treat same as unexpected char */
4425 appendStringInfoChar(str
, '\\');
4429 if (*p
>= '1' && *p
<= '9')
4431 /* Use the back reference of regexp. */
4434 so
= pmatch
[idx
].rm_so
;
4435 eo
= pmatch
[idx
].rm_eo
;
4440 /* Use the entire matched string. */
4441 so
= pmatch
[0].rm_so
;
4442 eo
= pmatch
[0].rm_eo
;
4445 else if (*p
== '\\')
4447 /* \\ means transfer one \ to output. */
4448 appendStringInfoChar(str
, '\\');
4455 * If escape char is not followed by any expected char, just treat
4456 * it as ordinary data to copy. (XXX would it be better to throw
4459 appendStringInfoChar(str
, '\\');
4463 if (so
>= 0 && eo
>= 0)
4466 * Copy the text that is back reference of regexp. Note so and eo
4467 * are counted in characters not bytes.
4472 Assert(so
>= data_pos
);
4473 chunk_start
= start_ptr
;
4474 chunk_start
+= charlen_to_bytelen(chunk_start
, so
- data_pos
);
4475 chunk_len
= charlen_to_bytelen(chunk_start
, eo
- so
);
4476 appendBinaryStringInfo(str
, chunk_start
, chunk_len
);
4482 * replace_text_regexp
4484 * replace substring(s) in src_text that match pattern with replace_text.
4485 * The replace_text can contain backslash markers to substitute
4486 * (parts of) the matched text.
4488 * cflags: regexp compile flags.
4489 * collation: collation to use.
4490 * search_start: the character (not byte) offset in src_text at which to
4492 * n: if 0, replace all matches; if > 0, replace only the N'th match.
4495 replace_text_regexp(text
*src_text
, text
*pattern_text
,
4497 int cflags
, Oid collation
,
4498 int search_start
, int n
)
4502 int src_text_len
= VARSIZE_ANY_EXHDR(src_text
);
4505 regmatch_t pmatch
[10]; /* main match, plus \1 to \9 */
4506 int nmatch
= lengthof(pmatch
);
4513 initStringInfo(&buf
);
4515 /* Convert data string to wide characters. */
4516 data
= (pg_wchar
*) palloc((src_text_len
+ 1) * sizeof(pg_wchar
));
4517 data_len
= pg_mb2wchar_with_len(VARDATA_ANY(src_text
), data
, src_text_len
);
4519 /* Check whether replace_text has escapes, especially regexp submatches. */
4520 escape_status
= check_replace_text_has_escape(replace_text
);
4522 /* If no regexp submatches, we can use REG_NOSUB. */
4523 if (escape_status
< 2)
4525 cflags
|= REG_NOSUB
;
4526 /* Also tell pg_regexec we only want the whole-match location. */
4530 /* Prepare the regexp. */
4531 re
= RE_compile_and_cache(pattern_text
, cflags
, collation
);
4533 /* start_ptr points to the data_pos'th character of src_text */
4534 start_ptr
= (char *) VARDATA_ANY(src_text
);
4537 while (search_start
<= data_len
)
4541 CHECK_FOR_INTERRUPTS();
4543 regexec_result
= pg_regexec(re
,
4547 NULL
, /* no details */
4552 if (regexec_result
== REG_NOMATCH
)
4555 if (regexec_result
!= REG_OKAY
)
4559 CHECK_FOR_INTERRUPTS();
4560 pg_regerror(regexec_result
, re
, errMsg
, sizeof(errMsg
));
4562 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION
),
4563 errmsg("regular expression failed: %s", errMsg
)));
4567 * Count matches, and decide whether to replace this match.
4570 if (n
> 0 && nmatches
!= n
)
4573 * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4574 * we treat the matched text as if it weren't matched, and copy it
4575 * to the output later.)
4577 search_start
= pmatch
[0].rm_eo
;
4578 if (pmatch
[0].rm_so
== pmatch
[0].rm_eo
)
4584 * Copy the text to the left of the match position. Note we are given
4585 * character not byte indexes.
4587 if (pmatch
[0].rm_so
- data_pos
> 0)
4591 chunk_len
= charlen_to_bytelen(start_ptr
,
4592 pmatch
[0].rm_so
- data_pos
);
4593 appendBinaryStringInfo(&buf
, start_ptr
, chunk_len
);
4596 * Advance start_ptr over that text, to avoid multiple rescans of
4597 * it if the replace_text contains multiple back-references.
4599 start_ptr
+= chunk_len
;
4600 data_pos
= pmatch
[0].rm_so
;
4604 * Copy the replace_text, processing escapes if any are present.
4606 if (escape_status
> 0)
4607 appendStringInfoRegexpSubstr(&buf
, replace_text
, pmatch
,
4608 start_ptr
, data_pos
);
4610 appendStringInfoText(&buf
, replace_text
);
4612 /* Advance start_ptr and data_pos over the matched text. */
4613 start_ptr
+= charlen_to_bytelen(start_ptr
,
4614 pmatch
[0].rm_eo
- data_pos
);
4615 data_pos
= pmatch
[0].rm_eo
;
4618 * If we only want to replace one occurrence, we're done.
4624 * Advance search position. Normally we start the next search at the
4625 * end of the previous match; but if the match was of zero length, we
4626 * have to advance by one character, or we'd just find the same match
4629 search_start
= data_pos
;
4630 if (pmatch
[0].rm_so
== pmatch
[0].rm_eo
)
4635 * Copy the text to the right of the last match.
4637 if (data_pos
< data_len
)
4641 chunk_len
= ((char *) src_text
+ VARSIZE_ANY(src_text
)) - start_ptr
;
4642 appendBinaryStringInfo(&buf
, start_ptr
, chunk_len
);
4645 ret_text
= cstring_to_text_with_len(buf
.data
, buf
.len
);
4654 * parse input string based on provided field separator
4655 * return N'th item (1 based, negative counts from end)
4658 split_part(PG_FUNCTION_ARGS
)
4660 text
*inputstring
= PG_GETARG_TEXT_PP(0);
4661 text
*fldsep
= PG_GETARG_TEXT_PP(1);
4662 int fldnum
= PG_GETARG_INT32(2);
4663 int inputstring_len
;
4665 TextPositionState state
;
4671 /* field number is 1 based */
4674 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
4675 errmsg("field position must not be zero")));
4677 inputstring_len
= VARSIZE_ANY_EXHDR(inputstring
);
4678 fldsep_len
= VARSIZE_ANY_EXHDR(fldsep
);
4680 /* return empty string for empty input string */
4681 if (inputstring_len
< 1)
4682 PG_RETURN_TEXT_P(cstring_to_text(""));
4684 /* handle empty field separator */
4687 /* if first or last field, return input string, else empty string */
4688 if (fldnum
== 1 || fldnum
== -1)
4689 PG_RETURN_TEXT_P(inputstring
);
4691 PG_RETURN_TEXT_P(cstring_to_text(""));
4694 /* find the first field separator */
4695 text_position_setup(inputstring
, fldsep
, PG_GET_COLLATION(), &state
);
4697 found
= text_position_next(&state
);
4699 /* special case if fldsep not found at all */
4702 text_position_cleanup(&state
);
4703 /* if first or last field, return input string, else empty string */
4704 if (fldnum
== 1 || fldnum
== -1)
4705 PG_RETURN_TEXT_P(inputstring
);
4707 PG_RETURN_TEXT_P(cstring_to_text(""));
4711 * take care of a negative field number (i.e. count from the right) by
4712 * converting to a positive field number; we need total number of fields
4716 /* we found a fldsep, so there are at least two fields */
4719 while (text_position_next(&state
))
4722 /* special case of last field does not require an extra pass */
4725 start_ptr
= text_position_get_match_ptr(&state
) + fldsep_len
;
4726 end_ptr
= VARDATA_ANY(inputstring
) + inputstring_len
;
4727 text_position_cleanup(&state
);
4728 PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr
,
4729 end_ptr
- start_ptr
));
4732 /* else, convert fldnum to positive notation */
4733 fldnum
+= numfields
+ 1;
4735 /* if nonexistent field, return empty string */
4738 text_position_cleanup(&state
);
4739 PG_RETURN_TEXT_P(cstring_to_text(""));
4742 /* reset to pointing at first match, but now with positive fldnum */
4743 text_position_reset(&state
);
4744 found
= text_position_next(&state
);
4748 /* identify bounds of first field */
4749 start_ptr
= VARDATA_ANY(inputstring
);
4750 end_ptr
= text_position_get_match_ptr(&state
);
4752 while (found
&& --fldnum
> 0)
4754 /* identify bounds of next field */
4755 start_ptr
= end_ptr
+ fldsep_len
;
4756 found
= text_position_next(&state
);
4758 end_ptr
= text_position_get_match_ptr(&state
);
4761 text_position_cleanup(&state
);
4765 /* N'th field separator not found */
4766 /* if last field requested, return it, else empty string */
4769 int last_len
= start_ptr
- VARDATA_ANY(inputstring
);
4771 result_text
= cstring_to_text_with_len(start_ptr
,
4772 inputstring_len
- last_len
);
4775 result_text
= cstring_to_text("");
4779 /* non-last field requested */
4780 result_text
= cstring_to_text_with_len(start_ptr
, end_ptr
- start_ptr
);
4783 PG_RETURN_TEXT_P(result_text
);
4787 * Convenience function to return true when two text params are equal.
4790 text_isequal(text
*txt1
, text
*txt2
, Oid collid
)
4792 return DatumGetBool(DirectFunctionCall2Coll(texteq
,
4794 PointerGetDatum(txt1
),
4795 PointerGetDatum(txt2
)));
4800 * parse input string and return text array of elements,
4801 * based on provided field separator
4804 text_to_array(PG_FUNCTION_ARGS
)
4806 SplitTextOutputData tstate
;
4808 /* For array output, tstate should start as all zeroes */
4809 memset(&tstate
, 0, sizeof(tstate
));
4811 if (!split_text(fcinfo
, &tstate
))
4814 if (tstate
.astate
== NULL
)
4815 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID
));
4817 PG_RETURN_ARRAYTYPE_P(makeArrayResult(tstate
.astate
,
4818 CurrentMemoryContext
));
4822 * text_to_array_null
4823 * parse input string and return text array of elements,
4824 * based on provided field separator and null string
4826 * This is a separate entry point only to prevent the regression tests from
4827 * complaining about different argument sets for the same internal function.
4830 text_to_array_null(PG_FUNCTION_ARGS
)
4832 return text_to_array(fcinfo
);
4837 * parse input string and return table of elements,
4838 * based on provided field separator
4841 text_to_table(PG_FUNCTION_ARGS
)
4843 ReturnSetInfo
*rsi
= (ReturnSetInfo
*) fcinfo
->resultinfo
;
4844 SplitTextOutputData tstate
;
4845 MemoryContext old_cxt
;
4847 /* check to see if caller supports us returning a tuplestore */
4848 if (rsi
== NULL
|| !IsA(rsi
, ReturnSetInfo
))
4850 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED
),
4851 errmsg("set-valued function called in context that cannot accept a set")));
4852 if (!(rsi
->allowedModes
& SFRM_Materialize
))
4854 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED
),
4855 errmsg("materialize mode required, but it is not allowed in this context")));
4857 /* OK, prepare tuplestore in per-query memory */
4858 old_cxt
= MemoryContextSwitchTo(rsi
->econtext
->ecxt_per_query_memory
);
4860 tstate
.astate
= NULL
;
4861 tstate
.tupdesc
= CreateTupleDescCopy(rsi
->expectedDesc
);
4862 tstate
.tupstore
= tuplestore_begin_heap(true, false, work_mem
);
4864 MemoryContextSwitchTo(old_cxt
);
4866 (void) split_text(fcinfo
, &tstate
);
4868 tuplestore_donestoring(tstate
.tupstore
);
4870 rsi
->returnMode
= SFRM_Materialize
;
4871 rsi
->setResult
= tstate
.tupstore
;
4872 rsi
->setDesc
= tstate
.tupdesc
;
4878 * text_to_table_null
4879 * parse input string and return table of elements,
4880 * based on provided field separator and null string
4882 * This is a separate entry point only to prevent the regression tests from
4883 * complaining about different argument sets for the same internal function.
4886 text_to_table_null(PG_FUNCTION_ARGS
)
4888 return text_to_table(fcinfo
);
4892 * Common code for text_to_array, text_to_array_null, text_to_table
4893 * and text_to_table_null functions.
4895 * These are not strict so we have to test for null inputs explicitly.
4896 * Returns false if result is to be null, else returns true.
4898 * Note that if the result is valid but empty (zero elements), we return
4899 * without changing *tstate --- caller must handle that case, too.
4902 split_text(FunctionCallInfo fcinfo
, SplitTextOutputData
*tstate
)
4907 Oid collation
= PG_GET_COLLATION();
4908 int inputstring_len
;
4913 /* when input string is NULL, then result is NULL too */
4914 if (PG_ARGISNULL(0))
4917 inputstring
= PG_GETARG_TEXT_PP(0);
4919 /* fldsep can be NULL */
4920 if (!PG_ARGISNULL(1))
4921 fldsep
= PG_GETARG_TEXT_PP(1);
4925 /* null_string can be NULL or omitted */
4926 if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4927 null_string
= PG_GETARG_TEXT_PP(2);
4934 * Normal case with non-null fldsep. Use the text_position machinery
4935 * to search for occurrences of fldsep.
4937 TextPositionState state
;
4939 inputstring_len
= VARSIZE_ANY_EXHDR(inputstring
);
4940 fldsep_len
= VARSIZE_ANY_EXHDR(fldsep
);
4942 /* return empty set for empty input string */
4943 if (inputstring_len
< 1)
4946 /* empty field separator: return input string as a one-element set */
4949 split_text_accum_result(tstate
, inputstring
,
4950 null_string
, collation
);
4954 text_position_setup(inputstring
, fldsep
, collation
, &state
);
4956 start_ptr
= VARDATA_ANY(inputstring
);
4964 CHECK_FOR_INTERRUPTS();
4966 found
= text_position_next(&state
);
4969 /* fetch last field */
4970 chunk_len
= ((char *) inputstring
+ VARSIZE_ANY(inputstring
)) - start_ptr
;
4971 end_ptr
= NULL
; /* not used, but some compilers complain */
4975 /* fetch non-last field */
4976 end_ptr
= text_position_get_match_ptr(&state
);
4977 chunk_len
= end_ptr
- start_ptr
;
4980 /* build a temp text datum to pass to split_text_accum_result */
4981 result_text
= cstring_to_text_with_len(start_ptr
, chunk_len
);
4983 /* stash away this field */
4984 split_text_accum_result(tstate
, result_text
,
4985 null_string
, collation
);
4992 start_ptr
= end_ptr
+ fldsep_len
;
4995 text_position_cleanup(&state
);
5000 * When fldsep is NULL, each character in the input string becomes a
5001 * separate element in the result set. The separator is effectively
5002 * the space between characters.
5004 inputstring_len
= VARSIZE_ANY_EXHDR(inputstring
);
5006 start_ptr
= VARDATA_ANY(inputstring
);
5008 while (inputstring_len
> 0)
5010 int chunk_len
= pg_mblen(start_ptr
);
5012 CHECK_FOR_INTERRUPTS();
5014 /* build a temp text datum to pass to split_text_accum_result */
5015 result_text
= cstring_to_text_with_len(start_ptr
, chunk_len
);
5017 /* stash away this field */
5018 split_text_accum_result(tstate
, result_text
,
5019 null_string
, collation
);
5023 start_ptr
+= chunk_len
;
5024 inputstring_len
-= chunk_len
;
5032 * Add text item to result set (table or array).
5034 * This is also responsible for checking to see if the item matches
5035 * the null_string, in which case we should emit NULL instead.
5038 split_text_accum_result(SplitTextOutputData
*tstate
,
5043 bool is_null
= false;
5045 if (null_string
&& text_isequal(field_value
, null_string
, collation
))
5048 if (tstate
->tupstore
)
5053 values
[0] = PointerGetDatum(field_value
);
5056 tuplestore_putvalues(tstate
->tupstore
,
5063 tstate
->astate
= accumArrayResult(tstate
->astate
,
5064 PointerGetDatum(field_value
),
5067 CurrentMemoryContext
);
5073 * concatenate Cstring representation of input array elements
5074 * using provided field separator
5077 array_to_text(PG_FUNCTION_ARGS
)
5079 ArrayType
*v
= PG_GETARG_ARRAYTYPE_P(0);
5080 char *fldsep
= text_to_cstring(PG_GETARG_TEXT_PP(1));
5082 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo
, v
, fldsep
, NULL
));
5086 * array_to_text_null
5087 * concatenate Cstring representation of input array elements
5088 * using provided field separator and null string
5090 * This version is not strict so we have to test for null inputs explicitly.
5093 array_to_text_null(PG_FUNCTION_ARGS
)
5099 /* returns NULL when first or second parameter is NULL */
5100 if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5103 v
= PG_GETARG_ARRAYTYPE_P(0);
5104 fldsep
= text_to_cstring(PG_GETARG_TEXT_PP(1));
5106 /* NULL null string is passed through as a null pointer */
5107 if (!PG_ARGISNULL(2))
5108 null_string
= text_to_cstring(PG_GETARG_TEXT_PP(2));
5112 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo
, v
, fldsep
, null_string
));
5116 * common code for array_to_text and array_to_text_null functions
5119 array_to_text_internal(FunctionCallInfo fcinfo
, ArrayType
*v
,
5120 const char *fldsep
, const char *null_string
)
5131 bool printed
= false;
5136 ArrayMetaState
*my_extra
;
5138 ndims
= ARR_NDIM(v
);
5140 nitems
= ArrayGetNItems(ndims
, dims
);
5142 /* if there are no elements, return an empty string */
5144 return cstring_to_text_with_len("", 0);
5146 element_type
= ARR_ELEMTYPE(v
);
5147 initStringInfo(&buf
);
5150 * We arrange to look up info about element type, including its output
5151 * conversion proc, only once per series of calls, assuming the element
5152 * type doesn't change underneath us.
5154 my_extra
= (ArrayMetaState
*) fcinfo
->flinfo
->fn_extra
;
5155 if (my_extra
== NULL
)
5157 fcinfo
->flinfo
->fn_extra
= MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
5158 sizeof(ArrayMetaState
));
5159 my_extra
= (ArrayMetaState
*) fcinfo
->flinfo
->fn_extra
;
5160 my_extra
->element_type
= ~element_type
;
5163 if (my_extra
->element_type
!= element_type
)
5166 * Get info about element type, including its output conversion proc
5168 get_type_io_data(element_type
, IOFunc_output
,
5169 &my_extra
->typlen
, &my_extra
->typbyval
,
5170 &my_extra
->typalign
, &my_extra
->typdelim
,
5171 &my_extra
->typioparam
, &my_extra
->typiofunc
);
5172 fmgr_info_cxt(my_extra
->typiofunc
, &my_extra
->proc
,
5173 fcinfo
->flinfo
->fn_mcxt
);
5174 my_extra
->element_type
= element_type
;
5176 typlen
= my_extra
->typlen
;
5177 typbyval
= my_extra
->typbyval
;
5178 typalign
= my_extra
->typalign
;
5180 p
= ARR_DATA_PTR(v
);
5181 bitmap
= ARR_NULLBITMAP(v
);
5184 for (i
= 0; i
< nitems
; i
++)
5189 /* Get source element, checking for NULL */
5190 if (bitmap
&& (*bitmap
& bitmask
) == 0)
5192 /* if null_string is NULL, we just ignore null elements */
5193 if (null_string
!= NULL
)
5196 appendStringInfo(&buf
, "%s%s", fldsep
, null_string
);
5198 appendStringInfoString(&buf
, null_string
);
5204 itemvalue
= fetch_att(p
, typbyval
, typlen
);
5206 value
= OutputFunctionCall(&my_extra
->proc
, itemvalue
);
5209 appendStringInfo(&buf
, "%s%s", fldsep
, value
);
5211 appendStringInfoString(&buf
, value
);
5214 p
= att_addlength_pointer(p
, typlen
, p
);
5215 p
= (char *) att_align_nominal(p
, typalign
);
5218 /* advance bitmap pointer if any */
5222 if (bitmask
== 0x100)
5230 result
= cstring_to_text_with_len(buf
.data
, buf
.len
);
5238 * Convert an int32 to a string containing a base 16 (hex) representation of
5242 to_hex32(PG_FUNCTION_ARGS
)
5244 uint32 value
= (uint32
) PG_GETARG_INT32(0);
5246 const char *digits
= "0123456789abcdef";
5247 char buf
[32]; /* bigger than needed, but reasonable */
5249 ptr
= buf
+ sizeof(buf
) - 1;
5254 *--ptr
= digits
[value
% HEXBASE
];
5256 } while (ptr
> buf
&& value
);
5258 PG_RETURN_TEXT_P(cstring_to_text(ptr
));
5262 * Convert an int64 to a string containing a base 16 (hex) representation of
5266 to_hex64(PG_FUNCTION_ARGS
)
5268 uint64 value
= (uint64
) PG_GETARG_INT64(0);
5270 const char *digits
= "0123456789abcdef";
5271 char buf
[32]; /* bigger than needed, but reasonable */
5273 ptr
= buf
+ sizeof(buf
) - 1;
5278 *--ptr
= digits
[value
% HEXBASE
];
5280 } while (ptr
> buf
&& value
);
5282 PG_RETURN_TEXT_P(cstring_to_text(ptr
));
5286 * Return the size of a datum, possibly compressed
5288 * Works on any data type
5291 pg_column_size(PG_FUNCTION_ARGS
)
5293 Datum value
= PG_GETARG_DATUM(0);
5297 /* On first call, get the input type's typlen, and save at *fn_extra */
5298 if (fcinfo
->flinfo
->fn_extra
== NULL
)
5300 /* Lookup the datatype of the supplied argument */
5301 Oid argtypeid
= get_fn_expr_argtype(fcinfo
->flinfo
, 0);
5303 typlen
= get_typlen(argtypeid
);
5304 if (typlen
== 0) /* should not happen */
5305 elog(ERROR
, "cache lookup failed for type %u", argtypeid
);
5307 fcinfo
->flinfo
->fn_extra
= MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
5309 *((int *) fcinfo
->flinfo
->fn_extra
) = typlen
;
5312 typlen
= *((int *) fcinfo
->flinfo
->fn_extra
);
5316 /* varlena type, possibly toasted */
5317 result
= toast_datum_size(value
);
5319 else if (typlen
== -2)
5322 result
= strlen(DatumGetCString(value
)) + 1;
5326 /* ordinary fixed-width type */
5330 PG_RETURN_INT32(result
);
5334 * Return the compression method stored in the compressed attribute. Return
5335 * NULL for non varlena type or uncompressed data.
5338 pg_column_compression(PG_FUNCTION_ARGS
)
5342 ToastCompressionId cmid
;
5344 /* On first call, get the input type's typlen, and save at *fn_extra */
5345 if (fcinfo
->flinfo
->fn_extra
== NULL
)
5347 /* Lookup the datatype of the supplied argument */
5348 Oid argtypeid
= get_fn_expr_argtype(fcinfo
->flinfo
, 0);
5350 typlen
= get_typlen(argtypeid
);
5351 if (typlen
== 0) /* should not happen */
5352 elog(ERROR
, "cache lookup failed for type %u", argtypeid
);
5354 fcinfo
->flinfo
->fn_extra
= MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
5356 *((int *) fcinfo
->flinfo
->fn_extra
) = typlen
;
5359 typlen
= *((int *) fcinfo
->flinfo
->fn_extra
);
5364 /* get the compression method id stored in the compressed varlena */
5365 cmid
= toast_get_compression_id((struct varlena
*)
5366 DatumGetPointer(PG_GETARG_DATUM(0)));
5367 if (cmid
== TOAST_INVALID_COMPRESSION_ID
)
5370 /* convert compression method id to compression method name */
5373 case TOAST_PGLZ_COMPRESSION_ID
:
5376 case TOAST_LZ4_COMPRESSION_ID
:
5380 elog(ERROR
, "invalid compression method id %d", cmid
);
5383 PG_RETURN_TEXT_P(cstring_to_text(result
));
5387 * string_agg - Concatenates values and returns string.
5389 * Syntax: string_agg(value text, delimiter text) RETURNS text
5391 * Note: Any NULL values are ignored. The first-call delimiter isn't
5392 * actually used at all, and on subsequent calls the delimiter precedes
5393 * the associated value.
5396 /* subroutine to initialize state */
5398 makeStringAggState(FunctionCallInfo fcinfo
)
5401 MemoryContext aggcontext
;
5402 MemoryContext oldcontext
;
5404 if (!AggCheckCallContext(fcinfo
, &aggcontext
))
5406 /* cannot be called directly because of internal-type argument */
5407 elog(ERROR
, "string_agg_transfn called in non-aggregate context");
5411 * Create state in aggregate context. It'll stay there across subsequent
5414 oldcontext
= MemoryContextSwitchTo(aggcontext
);
5415 state
= makeStringInfo();
5416 MemoryContextSwitchTo(oldcontext
);
5422 string_agg_transfn(PG_FUNCTION_ARGS
)
5426 state
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
5428 /* Append the value unless null. */
5429 if (!PG_ARGISNULL(1))
5431 /* On the first time through, we ignore the delimiter. */
5433 state
= makeStringAggState(fcinfo
);
5434 else if (!PG_ARGISNULL(2))
5435 appendStringInfoText(state
, PG_GETARG_TEXT_PP(2)); /* delimiter */
5437 appendStringInfoText(state
, PG_GETARG_TEXT_PP(1)); /* value */
5441 * The transition type for string_agg() is declared to be "internal",
5442 * which is a pass-by-value type the same size as a pointer.
5444 PG_RETURN_POINTER(state
);
5448 string_agg_finalfn(PG_FUNCTION_ARGS
)
5452 /* cannot be called directly because of internal-type argument */
5453 Assert(AggCheckCallContext(fcinfo
, NULL
));
5455 state
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
5458 PG_RETURN_TEXT_P(cstring_to_text_with_len(state
->data
, state
->len
));
5464 * Prepare cache with fmgr info for the output functions of the datatypes of
5465 * the arguments of a concat-like function, beginning with argument "argidx".
5466 * (Arguments before that will have corresponding slots in the resulting
5467 * FmgrInfo array, but we don't fill those slots.)
5470 build_concat_foutcache(FunctionCallInfo fcinfo
, int argidx
)
5472 FmgrInfo
*foutcache
;
5475 /* We keep the info in fn_mcxt so it survives across calls */
5476 foutcache
= (FmgrInfo
*) MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
5477 PG_NARGS() * sizeof(FmgrInfo
));
5479 for (i
= argidx
; i
< PG_NARGS(); i
++)
5485 valtype
= get_fn_expr_argtype(fcinfo
->flinfo
, i
);
5486 if (!OidIsValid(valtype
))
5487 elog(ERROR
, "could not determine data type of concat() input");
5489 getTypeOutputInfo(valtype
, &typOutput
, &typIsVarlena
);
5490 fmgr_info_cxt(typOutput
, &foutcache
[i
], fcinfo
->flinfo
->fn_mcxt
);
5493 fcinfo
->flinfo
->fn_extra
= foutcache
;
5499 * Implementation of both concat() and concat_ws().
5501 * sepstr is the separator string to place between values.
5502 * argidx identifies the first argument to concatenate (counting from zero);
5503 * note that this must be constant across any one series of calls.
5505 * Returns NULL if result should be NULL, else text value.
5508 concat_internal(const char *sepstr
, int argidx
,
5509 FunctionCallInfo fcinfo
)
5513 FmgrInfo
*foutcache
;
5514 bool first_arg
= true;
5518 * concat(VARIADIC some-array) is essentially equivalent to
5519 * array_to_text(), ie concat the array elements with the given separator.
5520 * So we just pass the case off to that code.
5522 if (get_fn_expr_variadic(fcinfo
->flinfo
))
5526 /* Should have just the one argument */
5527 Assert(argidx
== PG_NARGS() - 1);
5529 /* concat(VARIADIC NULL) is defined as NULL */
5530 if (PG_ARGISNULL(argidx
))
5534 * Non-null argument had better be an array. We assume that any call
5535 * context that could let get_fn_expr_variadic return true will have
5536 * checked that a VARIADIC-labeled parameter actually is an array. So
5537 * it should be okay to just Assert that it's an array rather than
5538 * doing a full-fledged error check.
5540 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo
->flinfo
, argidx
))));
5542 /* OK, safe to fetch the array value */
5543 arr
= PG_GETARG_ARRAYTYPE_P(argidx
);
5546 * And serialize the array. We tell array_to_text to ignore null
5547 * elements, which matches the behavior of the loop below.
5549 return array_to_text_internal(fcinfo
, arr
, sepstr
, NULL
);
5552 /* Normal case without explicit VARIADIC marker */
5553 initStringInfo(&str
);
5555 /* Get output function info, building it if first time through */
5556 foutcache
= (FmgrInfo
*) fcinfo
->flinfo
->fn_extra
;
5557 if (foutcache
== NULL
)
5558 foutcache
= build_concat_foutcache(fcinfo
, argidx
);
5560 for (i
= argidx
; i
< PG_NARGS(); i
++)
5562 if (!PG_ARGISNULL(i
))
5564 Datum value
= PG_GETARG_DATUM(i
);
5566 /* add separator if appropriate */
5570 appendStringInfoString(&str
, sepstr
);
5572 /* call the appropriate type output function, append the result */
5573 appendStringInfoString(&str
,
5574 OutputFunctionCall(&foutcache
[i
], value
));
5578 result
= cstring_to_text_with_len(str
.data
, str
.len
);
5585 * Concatenate all arguments. NULL arguments are ignored.
5588 text_concat(PG_FUNCTION_ARGS
)
5592 result
= concat_internal("", 0, fcinfo
);
5595 PG_RETURN_TEXT_P(result
);
5599 * Concatenate all but first argument value with separators. The first
5600 * parameter is used as the separator. NULL arguments are ignored.
5603 text_concat_ws(PG_FUNCTION_ARGS
)
5608 /* return NULL when separator is NULL */
5609 if (PG_ARGISNULL(0))
5611 sep
= text_to_cstring(PG_GETARG_TEXT_PP(0));
5613 result
= concat_internal(sep
, 1, fcinfo
);
5616 PG_RETURN_TEXT_P(result
);
5620 * Return first n characters in the string. When n is negative,
5621 * return all but last |n| characters.
5624 text_left(PG_FUNCTION_ARGS
)
5626 int n
= PG_GETARG_INT32(1);
5630 text
*str
= PG_GETARG_TEXT_PP(0);
5631 const char *p
= VARDATA_ANY(str
);
5632 int len
= VARSIZE_ANY_EXHDR(str
);
5635 n
= pg_mbstrlen_with_len(p
, len
) + n
;
5636 rlen
= pg_mbcharcliplen(p
, len
, n
);
5637 PG_RETURN_TEXT_P(cstring_to_text_with_len(p
, rlen
));
5640 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n
, false));
5644 * Return last n characters in the string. When n is negative,
5645 * return all but first |n| characters.
5648 text_right(PG_FUNCTION_ARGS
)
5650 text
*str
= PG_GETARG_TEXT_PP(0);
5651 const char *p
= VARDATA_ANY(str
);
5652 int len
= VARSIZE_ANY_EXHDR(str
);
5653 int n
= PG_GETARG_INT32(1);
5659 n
= pg_mbstrlen_with_len(p
, len
) - n
;
5660 off
= pg_mbcharcliplen(p
, len
, n
);
5662 PG_RETURN_TEXT_P(cstring_to_text_with_len(p
+ off
, len
- off
));
5666 * Return reversed string
5669 text_reverse(PG_FUNCTION_ARGS
)
5671 text
*str
= PG_GETARG_TEXT_PP(0);
5672 const char *p
= VARDATA_ANY(str
);
5673 int len
= VARSIZE_ANY_EXHDR(str
);
5674 const char *endp
= p
+ len
;
5678 result
= palloc(len
+ VARHDRSZ
);
5679 dst
= (char *) VARDATA(result
) + len
;
5680 SET_VARSIZE(result
, len
+ VARHDRSZ
);
5682 if (pg_database_encoding_max_length() > 1)
5684 /* multibyte version */
5697 /* single byte version */
5702 PG_RETURN_TEXT_P(result
);
5707 * Support macros for text_format()
5709 #define TEXT_FORMAT_FLAG_MINUS 0x0001 /* is minus flag present? */
5711 #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
5713 if (++(ptr) >= (end_ptr)) \
5715 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
5716 errmsg("unterminated format() type specifier"), \
5717 errhint("For a single \"%%\" use \"%%%%\"."))); \
5721 * Returns a formatted string
5724 text_format(PG_FUNCTION_ARGS
)
5729 const char *start_ptr
;
5730 const char *end_ptr
;
5735 Datum
*elements
= NULL
;
5737 Oid element_type
= InvalidOid
;
5738 Oid prev_type
= InvalidOid
;
5739 Oid prev_width_type
= InvalidOid
;
5740 FmgrInfo typoutputfinfo
;
5741 FmgrInfo typoutputinfo_width
;
5743 /* When format string is null, immediately return null */
5744 if (PG_ARGISNULL(0))
5747 /* If argument is marked VARIADIC, expand array into elements */
5748 if (get_fn_expr_variadic(fcinfo
->flinfo
))
5756 /* Should have just the one argument */
5757 Assert(PG_NARGS() == 2);
5759 /* If argument is NULL, we treat it as zero-length array */
5760 if (PG_ARGISNULL(1))
5765 * Non-null argument had better be an array. We assume that any
5766 * call context that could let get_fn_expr_variadic return true
5767 * will have checked that a VARIADIC-labeled parameter actually is
5768 * an array. So it should be okay to just Assert that it's an
5769 * array rather than doing a full-fledged error check.
5771 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo
->flinfo
, 1))));
5773 /* OK, safe to fetch the array value */
5774 arr
= PG_GETARG_ARRAYTYPE_P(1);
5776 /* Get info about array element type */
5777 element_type
= ARR_ELEMTYPE(arr
);
5778 get_typlenbyvalalign(element_type
,
5779 &elmlen
, &elmbyval
, &elmalign
);
5781 /* Extract all array elements */
5782 deconstruct_array(arr
, element_type
, elmlen
, elmbyval
, elmalign
,
5783 &elements
, &nulls
, &nitems
);
5787 funcvariadic
= true;
5791 /* Non-variadic case, we'll process the arguments individually */
5793 funcvariadic
= false;
5796 /* Setup for main loop. */
5797 fmt
= PG_GETARG_TEXT_PP(0);
5798 start_ptr
= VARDATA_ANY(fmt
);
5799 end_ptr
= start_ptr
+ VARSIZE_ANY_EXHDR(fmt
);
5800 initStringInfo(&str
);
5801 arg
= 1; /* next argument position to print */
5803 /* Scan format string, looking for conversion specifiers. */
5804 for (cp
= start_ptr
; cp
< end_ptr
; cp
++)
5815 * If it's not the start of a conversion specifier, just copy it to
5816 * the output buffer.
5820 appendStringInfoCharMacro(&str
, *cp
);
5824 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
5826 /* Easy case: %% outputs a single % */
5829 appendStringInfoCharMacro(&str
, *cp
);
5833 /* Parse the optional portions of the format specifier */
5834 cp
= text_format_parse_format(cp
, end_ptr
,
5839 * Next we should see the main conversion specifier. Whether or not
5840 * an argument position was present, it's known that at least one
5841 * character remains in the string at this point. Experience suggests
5842 * that it's worth checking that that character is one of the expected
5843 * ones before we try to fetch arguments, so as to produce the least
5844 * confusing response to a mis-formatted specifier.
5846 if (strchr("sIL", *cp
) == NULL
)
5848 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
5849 errmsg("unrecognized format() type specifier \"%.*s\"",
5851 errhint("For a single \"%%\" use \"%%%%\".")));
5853 /* If indirect width was specified, get its value */
5856 /* Collect the specified or next argument position */
5861 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
5862 errmsg("too few arguments for format()")));
5864 /* Get the value and type of the selected argument */
5867 value
= PG_GETARG_DATUM(arg
);
5868 isNull
= PG_ARGISNULL(arg
);
5869 typid
= get_fn_expr_argtype(fcinfo
->flinfo
, arg
);
5873 value
= elements
[arg
- 1];
5874 isNull
= nulls
[arg
- 1];
5875 typid
= element_type
;
5877 if (!OidIsValid(typid
))
5878 elog(ERROR
, "could not determine data type of format() input");
5882 /* We can treat NULL width the same as zero */
5885 else if (typid
== INT4OID
)
5886 width
= DatumGetInt32(value
);
5887 else if (typid
== INT2OID
)
5888 width
= DatumGetInt16(value
);
5891 /* For less-usual datatypes, convert to text then to int */
5894 if (typid
!= prev_width_type
)
5899 getTypeOutputInfo(typid
, &typoutputfunc
, &typIsVarlena
);
5900 fmgr_info(typoutputfunc
, &typoutputinfo_width
);
5901 prev_width_type
= typid
;
5904 str
= OutputFunctionCall(&typoutputinfo_width
, value
);
5906 /* pg_strtoint32 will complain about bad data or overflow */
5907 width
= pg_strtoint32(str
);
5913 /* Collect the specified or next argument position */
5918 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
5919 errmsg("too few arguments for format()")));
5921 /* Get the value and type of the selected argument */
5924 value
= PG_GETARG_DATUM(arg
);
5925 isNull
= PG_ARGISNULL(arg
);
5926 typid
= get_fn_expr_argtype(fcinfo
->flinfo
, arg
);
5930 value
= elements
[arg
- 1];
5931 isNull
= nulls
[arg
- 1];
5932 typid
= element_type
;
5934 if (!OidIsValid(typid
))
5935 elog(ERROR
, "could not determine data type of format() input");
5940 * Get the appropriate typOutput function, reusing previous one if
5941 * same type as previous argument. That's particularly useful in the
5942 * variadic-array case, but often saves work even for ordinary calls.
5944 if (typid
!= prev_type
)
5949 getTypeOutputInfo(typid
, &typoutputfunc
, &typIsVarlena
);
5950 fmgr_info(typoutputfunc
, &typoutputfinfo
);
5955 * And now we can format the value.
5962 text_format_string_conversion(&str
, *cp
, &typoutputfinfo
,
5967 /* should not get here, because of previous check */
5969 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
5970 errmsg("unrecognized format() type specifier \"%.*s\"",
5972 errhint("For a single \"%%\" use \"%%%%\".")));
5977 /* Don't need deconstruct_array results anymore. */
5978 if (elements
!= NULL
)
5983 /* Generate results. */
5984 result
= cstring_to_text_with_len(str
.data
, str
.len
);
5987 PG_RETURN_TEXT_P(result
);
5991 * Parse contiguous digits as a decimal number.
5993 * Returns true if some digits could be parsed.
5994 * The value is returned into *value, and *ptr is advanced to the next
5995 * character to be parsed.
5997 * Note parsing invariant: at least one character is known available before
5998 * string end (end_ptr) at entry, and this is still true at exit.
6001 text_format_parse_digits(const char **ptr
, const char *end_ptr
, int *value
)
6004 const char *cp
= *ptr
;
6007 while (*cp
>= '0' && *cp
<= '9')
6009 int8 digit
= (*cp
- '0');
6011 if (unlikely(pg_mul_s32_overflow(val
, 10, &val
)) ||
6012 unlikely(pg_add_s32_overflow(val
, digit
, &val
)))
6014 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
6015 errmsg("number is out of range")));
6016 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6027 * Parse a format specifier (generally following the SUS printf spec).
6029 * We have already advanced over the initial '%', and we are looking for
6030 * [argpos][flags][width]type (but the type character is not consumed here).
6032 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6033 * Output parameters:
6034 * argpos: argument position for value to be printed. -1 means unspecified.
6035 * widthpos: argument position for width. Zero means the argument position
6036 * was unspecified (ie, take the next arg) and -1 means no width
6037 * argument (width was omitted or specified as a constant).
6038 * flags: bitmask of flags.
6039 * width: directly-specified width value. Zero means the width was omitted
6040 * (note it's not necessary to distinguish this case from an explicit
6041 * zero width value).
6043 * The function result is the next character position to be parsed, ie, the
6044 * location where the type character is/should be.
6046 * Note parsing invariant: at least one character is known available before
6047 * string end (end_ptr) at entry, and this is still true at exit.
6050 text_format_parse_format(const char *start_ptr
, const char *end_ptr
,
6051 int *argpos
, int *widthpos
,
6052 int *flags
, int *width
)
6054 const char *cp
= start_ptr
;
6057 /* set defaults for output parameters */
6063 /* try to identify first number */
6064 if (text_format_parse_digits(&cp
, end_ptr
, &n
))
6068 /* Must be just a width and a type, so we're done */
6072 /* The number was argument position */
6074 /* Explicit 0 for argument index is immediately refused */
6077 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6078 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6079 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6082 /* Handle flags (only minus is supported now) */
6085 *flags
|= TEXT_FORMAT_FLAG_MINUS
;
6086 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6091 /* Handle indirect width */
6092 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6093 if (text_format_parse_digits(&cp
, end_ptr
, &n
))
6095 /* number in this position must be closed by $ */
6098 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6099 errmsg("width argument position must be ended by \"$\"")));
6100 /* The number was width argument position */
6102 /* Explicit 0 for argument index is immediately refused */
6105 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6106 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6107 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6110 *widthpos
= 0; /* width's argument position is unspecified */
6114 /* Check for direct width specification */
6115 if (text_format_parse_digits(&cp
, end_ptr
, &n
))
6119 /* cp should now be pointing at type character */
6124 * Format a %s, %I, or %L conversion
6127 text_format_string_conversion(StringInfo buf
, char conversion
,
6128 FmgrInfo
*typOutputInfo
,
6129 Datum value
, bool isNull
,
6130 int flags
, int width
)
6134 /* Handle NULL arguments before trying to stringify the value. */
6137 if (conversion
== 's')
6138 text_format_append_string(buf
, "", flags
, width
);
6139 else if (conversion
== 'L')
6140 text_format_append_string(buf
, "NULL", flags
, width
);
6141 else if (conversion
== 'I')
6143 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED
),
6144 errmsg("null values cannot be formatted as an SQL identifier")));
6149 str
= OutputFunctionCall(typOutputInfo
, value
);
6152 if (conversion
== 'I')
6154 /* quote_identifier may or may not allocate a new string. */
6155 text_format_append_string(buf
, quote_identifier(str
), flags
, width
);
6157 else if (conversion
== 'L')
6159 char *qstr
= quote_literal_cstr(str
);
6161 text_format_append_string(buf
, qstr
, flags
, width
);
6162 /* quote_literal_cstr() always allocates a new string */
6166 text_format_append_string(buf
, str
, flags
, width
);
6173 * Append str to buf, padding as directed by flags/width
6176 text_format_append_string(StringInfo buf
, const char *str
,
6177 int flags
, int width
)
6179 bool align_to_left
= false;
6182 /* fast path for typical easy case */
6185 appendStringInfoString(buf
, str
);
6191 /* Negative width: implicit '-' flag, then take absolute value */
6192 align_to_left
= true;
6193 /* -INT_MIN is undefined */
6194 if (width
<= INT_MIN
)
6196 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
6197 errmsg("number is out of range")));
6200 else if (flags
& TEXT_FORMAT_FLAG_MINUS
)
6201 align_to_left
= true;
6203 len
= pg_mbstrlen(str
);
6207 appendStringInfoString(buf
, str
);
6209 appendStringInfoSpaces(buf
, width
- len
);
6215 appendStringInfoSpaces(buf
, width
- len
);
6216 appendStringInfoString(buf
, str
);
6221 * text_format_nv - nonvariadic wrapper for text_format function.
6223 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6224 * which checks that all built-in functions that share the implementing C
6225 * function take the same number of arguments.
6228 text_format_nv(PG_FUNCTION_ARGS
)
6230 return text_format(fcinfo
);
6234 * Helper function for Levenshtein distance functions. Faster than memcmp(),
6235 * for this use case.
6238 rest_of_char_same(const char *s1
, const char *s2
, int len
)
6243 if (s1
[len
] != s2
[len
])
6249 /* Expand each Levenshtein distance variant */
6250 #include "levenshtein.c"
6251 #define LEVENSHTEIN_LESS_EQUAL
6252 #include "levenshtein.c"
6259 static UnicodeNormalizationForm
6260 unicode_norm_form_from_string(const char *formstr
)
6262 UnicodeNormalizationForm form
= -1;
6265 * Might as well check this while we're here.
6267 if (GetDatabaseEncoding() != PG_UTF8
)
6269 (errcode(ERRCODE_SYNTAX_ERROR
),
6270 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6272 if (pg_strcasecmp(formstr
, "NFC") == 0)
6274 else if (pg_strcasecmp(formstr
, "NFD") == 0)
6276 else if (pg_strcasecmp(formstr
, "NFKC") == 0)
6277 form
= UNICODE_NFKC
;
6278 else if (pg_strcasecmp(formstr
, "NFKD") == 0)
6279 form
= UNICODE_NFKD
;
6282 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6283 errmsg("invalid normalization form: %s", formstr
)));
6289 unicode_normalize_func(PG_FUNCTION_ARGS
)
6291 text
*input
= PG_GETARG_TEXT_PP(0);
6292 char *formstr
= text_to_cstring(PG_GETARG_TEXT_PP(1));
6293 UnicodeNormalizationForm form
;
6295 pg_wchar
*input_chars
;
6296 pg_wchar
*output_chars
;
6301 form
= unicode_norm_form_from_string(formstr
);
6303 /* convert to pg_wchar */
6304 size
= pg_mbstrlen_with_len(VARDATA_ANY(input
), VARSIZE_ANY_EXHDR(input
));
6305 input_chars
= palloc((size
+ 1) * sizeof(pg_wchar
));
6306 p
= (unsigned char *) VARDATA_ANY(input
);
6307 for (i
= 0; i
< size
; i
++)
6309 input_chars
[i
] = utf8_to_unicode(p
);
6310 p
+= pg_utf_mblen(p
);
6312 input_chars
[i
] = (pg_wchar
) '\0';
6313 Assert((char *) p
== VARDATA_ANY(input
) + VARSIZE_ANY_EXHDR(input
));
6316 output_chars
= unicode_normalize(form
, input_chars
);
6318 /* convert back to UTF-8 string */
6320 for (pg_wchar
*wp
= output_chars
; *wp
; wp
++)
6322 unsigned char buf
[4];
6324 unicode_to_utf8(*wp
, buf
);
6325 size
+= pg_utf_mblen(buf
);
6328 result
= palloc(size
+ VARHDRSZ
);
6329 SET_VARSIZE(result
, size
+ VARHDRSZ
);
6331 p
= (unsigned char *) VARDATA_ANY(result
);
6332 for (pg_wchar
*wp
= output_chars
; *wp
; wp
++)
6334 unicode_to_utf8(*wp
, p
);
6335 p
+= pg_utf_mblen(p
);
6337 Assert((char *) p
== (char *) result
+ size
+ VARHDRSZ
);
6339 PG_RETURN_TEXT_P(result
);
6343 * Check whether the string is in the specified Unicode normalization form.
6345 * This is done by converting the string to the specified normal form and then
6346 * comparing that to the original string. To speed that up, we also apply the
6347 * "quick check" algorithm specified in UAX #15, which can give a yes or no
6348 * answer for many strings by just scanning the string once.
6350 * This function should generally be optimized for the case where the string
6351 * is in fact normalized. In that case, we'll end up looking at the entire
6352 * string, so it's probably not worth doing any incremental conversion etc.
6355 unicode_is_normalized(PG_FUNCTION_ARGS
)
6357 text
*input
= PG_GETARG_TEXT_PP(0);
6358 char *formstr
= text_to_cstring(PG_GETARG_TEXT_PP(1));
6359 UnicodeNormalizationForm form
;
6361 pg_wchar
*input_chars
;
6362 pg_wchar
*output_chars
;
6365 UnicodeNormalizationQC quickcheck
;
6369 form
= unicode_norm_form_from_string(formstr
);
6371 /* convert to pg_wchar */
6372 size
= pg_mbstrlen_with_len(VARDATA_ANY(input
), VARSIZE_ANY_EXHDR(input
));
6373 input_chars
= palloc((size
+ 1) * sizeof(pg_wchar
));
6374 p
= (unsigned char *) VARDATA_ANY(input
);
6375 for (i
= 0; i
< size
; i
++)
6377 input_chars
[i
] = utf8_to_unicode(p
);
6378 p
+= pg_utf_mblen(p
);
6380 input_chars
[i
] = (pg_wchar
) '\0';
6381 Assert((char *) p
== VARDATA_ANY(input
) + VARSIZE_ANY_EXHDR(input
));
6383 /* quick check (see UAX #15) */
6384 quickcheck
= unicode_is_normalized_quickcheck(form
, input_chars
);
6385 if (quickcheck
== UNICODE_NORM_QC_YES
)
6386 PG_RETURN_BOOL(true);
6387 else if (quickcheck
== UNICODE_NORM_QC_NO
)
6388 PG_RETURN_BOOL(false);
6390 /* normalize and compare with original */
6391 output_chars
= unicode_normalize(form
, input_chars
);
6394 for (pg_wchar
*wp
= output_chars
; *wp
; wp
++)
6397 result
= (size
== output_size
) &&
6398 (memcmp(input_chars
, output_chars
, size
* sizeof(pg_wchar
)) == 0);
6400 PG_RETURN_BOOL(result
);
6404 * Check if first n chars are hexadecimal digits
6407 isxdigits_n(const char *instr
, size_t n
)
6409 for (size_t i
= 0; i
< n
; i
++)
6410 if (!isxdigit((unsigned char) instr
[i
]))
6417 hexval(unsigned char c
)
6419 if (c
>= '0' && c
<= '9')
6421 if (c
>= 'a' && c
<= 'f')
6422 return c
- 'a' + 0xA;
6423 if (c
>= 'A' && c
<= 'F')
6424 return c
- 'A' + 0xA;
6425 elog(ERROR
, "invalid hexadecimal digit");
6426 return 0; /* not reached */
6430 * Translate string with hexadecimal digits to number
6433 hexval_n(const char *instr
, size_t n
)
6435 unsigned int result
= 0;
6437 for (size_t i
= 0; i
< n
; i
++)
6438 result
+= hexval(instr
[i
]) << (4 * (n
- i
- 1));
6444 * Replaces Unicode escape sequences by Unicode characters
6447 unistr(PG_FUNCTION_ARGS
)
6449 text
*input_text
= PG_GETARG_TEXT_PP(0);
6454 pg_wchar pair_first
= 0;
6455 char cbuf
[MAX_UNICODE_EQUIVALENT_STRING
+ 1];
6457 instr
= VARDATA_ANY(input_text
);
6458 len
= VARSIZE_ANY_EXHDR(input_text
);
6460 initStringInfo(&str
);
6464 if (instr
[0] == '\\')
6471 appendStringInfoChar(&str
, '\\');
6475 else if ((len
>= 5 && isxdigits_n(instr
+ 1, 4)) ||
6476 (len
>= 6 && instr
[1] == 'u' && isxdigits_n(instr
+ 2, 4)))
6479 int offset
= instr
[1] == 'u' ? 2 : 1;
6481 unicode
= hexval_n(instr
+ offset
, 4);
6483 if (!is_valid_unicode_codepoint(unicode
))
6485 errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6486 errmsg("invalid Unicode code point: %04X", unicode
));
6490 if (is_utf16_surrogate_second(unicode
))
6492 unicode
= surrogate_pair_to_codepoint(pair_first
, unicode
);
6498 else if (is_utf16_surrogate_second(unicode
))
6501 if (is_utf16_surrogate_first(unicode
))
6502 pair_first
= unicode
;
6505 pg_unicode_to_server(unicode
, (unsigned char *) cbuf
);
6506 appendStringInfoString(&str
, cbuf
);
6509 instr
+= 4 + offset
;
6512 else if (len
>= 8 && instr
[1] == '+' && isxdigits_n(instr
+ 2, 6))
6516 unicode
= hexval_n(instr
+ 2, 6);
6518 if (!is_valid_unicode_codepoint(unicode
))
6520 errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6521 errmsg("invalid Unicode code point: %04X", unicode
));
6525 if (is_utf16_surrogate_second(unicode
))
6527 unicode
= surrogate_pair_to_codepoint(pair_first
, unicode
);
6533 else if (is_utf16_surrogate_second(unicode
))
6536 if (is_utf16_surrogate_first(unicode
))
6537 pair_first
= unicode
;
6540 pg_unicode_to_server(unicode
, (unsigned char *) cbuf
);
6541 appendStringInfoString(&str
, cbuf
);
6547 else if (len
>= 10 && instr
[1] == 'U' && isxdigits_n(instr
+ 2, 8))
6551 unicode
= hexval_n(instr
+ 2, 8);
6553 if (!is_valid_unicode_codepoint(unicode
))
6555 errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6556 errmsg("invalid Unicode code point: %04X", unicode
));
6560 if (is_utf16_surrogate_second(unicode
))
6562 unicode
= surrogate_pair_to_codepoint(pair_first
, unicode
);
6568 else if (is_utf16_surrogate_second(unicode
))
6571 if (is_utf16_surrogate_first(unicode
))
6572 pair_first
= unicode
;
6575 pg_unicode_to_server(unicode
, (unsigned char *) cbuf
);
6576 appendStringInfoString(&str
, cbuf
);
6584 (errcode(ERRCODE_SYNTAX_ERROR
),
6585 errmsg("invalid Unicode escape"),
6586 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6593 appendStringInfoChar(&str
, *instr
++);
6598 /* unfinished surrogate pair? */
6602 result
= cstring_to_text_with_len(str
.data
, str
.len
);
6605 PG_RETURN_TEXT_P(result
);
6609 (errcode(ERRCODE_SYNTAX_ERROR
),
6610 errmsg("invalid Unicode surrogate pair")));
6611 PG_RETURN_NULL(); /* keep compiler quiet */