Convert a few more datatype input functions to report errors softly.
[pgsql.git] / src / backend / utils / adt / varlena.c
blob1c52deec556aab2f8a9f3e3aafc73219c08a36d8
/*-------------------------------------------------------------------------
 *
 * varlena.c
 *	  Functions for the variable-length built-in types.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/varlena.c
 *
 *-------------------------------------------------------------------------
 */
15 #include "postgres.h"
17 #include <ctype.h>
18 #include <limits.h>
20 #include "access/detoast.h"
21 #include "access/toast_compression.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "funcapi.h"
28 #include "lib/hyperloglog.h"
29 #include "libpq/pqformat.h"
30 #include "miscadmin.h"
31 #include "nodes/execnodes.h"
32 #include "parser/scansup.h"
33 #include "port/pg_bswap.h"
34 #include "regex/regex.h"
35 #include "utils/builtins.h"
36 #include "utils/bytea.h"
37 #include "utils/guc.h"
38 #include "utils/lsyscache.h"
39 #include "utils/memutils.h"
40 #include "utils/pg_locale.h"
41 #include "utils/sortsupport.h"
42 #include "utils/varlena.h"
45 /* GUC variable */
46 int bytea_output = BYTEA_OUTPUT_HEX;
48 typedef struct varlena unknown;
49 typedef struct varlena VarString;
/*
 * State for text_position_* functions.
 */
typedef struct
{
	bool		is_multibyte_char_in_char;	/* need to check char boundaries? */

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
79 typedef struct
81 char *buf1; /* 1st string, or abbreviation original string
82 * buf */
83 char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
84 int buflen1; /* Allocated length of buf1 */
85 int buflen2; /* Allocated length of buf2 */
86 int last_len1; /* Length of last buf1 string/strxfrm() input */
87 int last_len2; /* Length of last buf2 string/strxfrm() blob */
88 int last_returned; /* Last comparison result (cache) */
89 bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
90 bool collate_c;
91 Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
92 hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
93 hyperLogLogState full_card; /* Full key cardinality state */
94 double prop_card; /* Required cardinality proportion */
95 pg_locale_t locale;
96 } VarStringSortSupport;
99 * Output data for split_text(): we output either to an array or a table.
100 * tupstore and tupdesc must be set up in advance to output to a table.
102 typedef struct
104 ArrayBuildState *astate;
105 Tuplestorestate *tupstore;
106 TupleDesc tupdesc;
107 } SplitTextOutputData;
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024
/* Datum access macros for the "unknown" pseudo-type, modeled on varlena */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/* Datum access macros for VarString; the PP form uses the packed detoaster */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
124 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
126 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
127 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
128 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
129 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
130 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
131 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
132 static int32 text_length(Datum str);
133 static text *text_catenate(text *t1, text *t2);
134 static text *text_substring(Datum str,
135 int32 start,
136 int32 length,
137 bool length_not_specified);
138 static text *text_overlay(text *t1, text *t2, int sp, int sl);
139 static int text_position(text *t1, text *t2, Oid collid);
140 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
141 static bool text_position_next(TextPositionState *state);
142 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
143 static char *text_position_get_match_ptr(TextPositionState *state);
144 static int text_position_get_match_pos(TextPositionState *state);
145 static void text_position_cleanup(TextPositionState *state);
146 static void check_collation_set(Oid collid);
147 static int text_cmp(text *arg1, text *arg2, Oid collid);
148 static bytea *bytea_catenate(bytea *t1, bytea *t2);
149 static bytea *bytea_substring(Datum str,
150 int S,
151 int L,
152 bool length_not_specified);
153 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
154 static void appendStringInfoText(StringInfo str, const text *t);
155 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
156 static void split_text_accum_result(SplitTextOutputData *tstate,
157 text *field_value,
158 text *null_string,
159 Oid collation);
160 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
161 const char *fldsep, const char *null_string);
162 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
163 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
164 int *value);
165 static const char *text_format_parse_format(const char *start_ptr,
166 const char *end_ptr,
167 int *argpos, int *widthpos,
168 int *flags, int *width);
169 static void text_format_string_conversion(StringInfo buf, char conversion,
170 FmgrInfo *typOutputInfo,
171 Datum value, bool isNull,
172 int flags, int width);
173 static void text_format_append_string(StringInfo buf, const char *str,
174 int flags, int width);
/*****************************************************************************
 *	 CONVERSION ROUTINES EXPORTED FOR USE BY C CODE							 *
 *****************************************************************************/
182 * cstring_to_text
184 * Create a text value from a null-terminated C string.
186 * The new text value is freshly palloc'd with a full-size VARHDR.
188 text *
189 cstring_to_text(const char *s)
191 return cstring_to_text_with_len(s, strlen(s));
195 * cstring_to_text_with_len
197 * Same as cstring_to_text except the caller specifies the string length;
198 * the string need not be null_terminated.
200 text *
201 cstring_to_text_with_len(const char *s, int len)
203 text *result = (text *) palloc(len + VARHDRSZ);
205 SET_VARSIZE(result, len + VARHDRSZ);
206 memcpy(VARDATA(result), s, len);
208 return result;
212 * text_to_cstring
214 * Create a palloc'd, null-terminated C string from a text value.
216 * We support being passed a compressed or toasted text value.
217 * This is a bit bogus since such values shouldn't really be referred to as
218 * "text *", but it seems useful for robustness. If we didn't handle that
219 * case here, we'd need another routine that did, anyway.
221 char *
222 text_to_cstring(const text *t)
224 /* must cast away the const, unfortunately */
225 text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
226 int len = VARSIZE_ANY_EXHDR(tunpacked);
227 char *result;
229 result = (char *) palloc(len + 1);
230 memcpy(result, VARDATA_ANY(tunpacked), len);
231 result[len] = '\0';
233 if (tunpacked != t)
234 pfree(tunpacked);
236 return result;
240 * text_to_cstring_buffer
242 * Copy a text value into a caller-supplied buffer of size dst_len.
244 * The text string is truncated if necessary to fit. The result is
245 * guaranteed null-terminated (unless dst_len == 0).
247 * We support being passed a compressed or toasted text value.
248 * This is a bit bogus since such values shouldn't really be referred to as
249 * "text *", but it seems useful for robustness. If we didn't handle that
250 * case here, we'd need another routine that did, anyway.
252 void
253 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
255 /* must cast away the const, unfortunately */
256 text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
257 size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
259 if (dst_len > 0)
261 dst_len--;
262 if (dst_len >= src_len)
263 dst_len = src_len;
264 else /* ensure truncation is encoding-safe */
265 dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
266 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
267 dst[dst_len] = '\0';
270 if (srcunpacked != src)
271 pfree(srcunpacked);
/*****************************************************************************
 *	 USER I/O ROUTINES														 *
 *****************************************************************************/
/* Convert between a digit character and its numeric value */
#define VAL(CH)			((CH) - '0')
#define DIG(VAL)		((VAL) + '0')
284 * byteain - converts from printable representation of byte array
286 * Non-printable characters must be passed as '\nnn' (octal) and are
287 * converted to internal form. '\' must be passed as '\\'.
288 * ereport(ERROR, ...) if bad form.
290 * BUGS:
291 * The input is scanned twice.
292 * The error checking of input is minimal.
294 Datum
295 byteain(PG_FUNCTION_ARGS)
297 char *inputText = PG_GETARG_CSTRING(0);
298 Node *escontext = fcinfo->context;
299 char *tp;
300 char *rp;
301 int bc;
302 bytea *result;
304 /* Recognize hex input */
305 if (inputText[0] == '\\' && inputText[1] == 'x')
307 size_t len = strlen(inputText);
309 bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
310 result = palloc(bc);
311 bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
312 escontext);
313 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
315 PG_RETURN_BYTEA_P(result);
318 /* Else, it's the traditional escaped style */
319 for (bc = 0, tp = inputText; *tp != '\0'; bc++)
321 if (tp[0] != '\\')
322 tp++;
323 else if ((tp[0] == '\\') &&
324 (tp[1] >= '0' && tp[1] <= '3') &&
325 (tp[2] >= '0' && tp[2] <= '7') &&
326 (tp[3] >= '0' && tp[3] <= '7'))
327 tp += 4;
328 else if ((tp[0] == '\\') &&
329 (tp[1] == '\\'))
330 tp += 2;
331 else
334 * one backslash, not followed by another or ### valid octal
336 ereturn(escontext, (Datum) 0,
337 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
338 errmsg("invalid input syntax for type %s", "bytea")));
342 bc += VARHDRSZ;
344 result = (bytea *) palloc(bc);
345 SET_VARSIZE(result, bc);
347 tp = inputText;
348 rp = VARDATA(result);
349 while (*tp != '\0')
351 if (tp[0] != '\\')
352 *rp++ = *tp++;
353 else if ((tp[0] == '\\') &&
354 (tp[1] >= '0' && tp[1] <= '3') &&
355 (tp[2] >= '0' && tp[2] <= '7') &&
356 (tp[3] >= '0' && tp[3] <= '7'))
358 bc = VAL(tp[1]);
359 bc <<= 3;
360 bc += VAL(tp[2]);
361 bc <<= 3;
362 *rp++ = bc + VAL(tp[3]);
364 tp += 4;
366 else if ((tp[0] == '\\') &&
367 (tp[1] == '\\'))
369 *rp++ = '\\';
370 tp += 2;
372 else
375 * We should never get here. The first pass should not allow it.
377 ereturn(escontext, (Datum) 0,
378 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
379 errmsg("invalid input syntax for type %s", "bytea")));
383 PG_RETURN_BYTEA_P(result);
387 * byteaout - converts to printable representation of byte array
389 * In the traditional escaped format, non-printable characters are
390 * printed as '\nnn' (octal) and '\' as '\\'.
392 Datum
393 byteaout(PG_FUNCTION_ARGS)
395 bytea *vlena = PG_GETARG_BYTEA_PP(0);
396 char *result;
397 char *rp;
399 if (bytea_output == BYTEA_OUTPUT_HEX)
401 /* Print hex format */
402 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
403 *rp++ = '\\';
404 *rp++ = 'x';
405 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
407 else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
409 /* Print traditional escaped format */
410 char *vp;
411 uint64 len;
412 int i;
414 len = 1; /* empty string has 1 char */
415 vp = VARDATA_ANY(vlena);
416 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
418 if (*vp == '\\')
419 len += 2;
420 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
421 len += 4;
422 else
423 len++;
427 * In principle len can't overflow uint32 if the input fit in 1GB, but
428 * for safety let's check rather than relying on palloc's internal
429 * check.
431 if (len > MaxAllocSize)
432 ereport(ERROR,
433 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
434 errmsg_internal("result of bytea output conversion is too large")));
435 rp = result = (char *) palloc(len);
437 vp = VARDATA_ANY(vlena);
438 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
440 if (*vp == '\\')
442 *rp++ = '\\';
443 *rp++ = '\\';
445 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
447 int val; /* holds unprintable chars */
449 val = *vp;
450 rp[0] = '\\';
451 rp[3] = DIG(val & 07);
452 val >>= 3;
453 rp[2] = DIG(val & 07);
454 val >>= 3;
455 rp[1] = DIG(val & 03);
456 rp += 4;
458 else
459 *rp++ = *vp;
462 else
464 elog(ERROR, "unrecognized bytea_output setting: %d",
465 bytea_output);
466 rp = result = NULL; /* keep compiler quiet */
468 *rp = '\0';
469 PG_RETURN_CSTRING(result);
473 * bytearecv - converts external binary format to bytea
475 Datum
476 bytearecv(PG_FUNCTION_ARGS)
478 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
479 bytea *result;
480 int nbytes;
482 nbytes = buf->len - buf->cursor;
483 result = (bytea *) palloc(nbytes + VARHDRSZ);
484 SET_VARSIZE(result, nbytes + VARHDRSZ);
485 pq_copymsgbytes(buf, VARDATA(result), nbytes);
486 PG_RETURN_BYTEA_P(result);
490 * byteasend - converts bytea to binary format
492 * This is a special case: just copy the input...
494 Datum
495 byteasend(PG_FUNCTION_ARGS)
497 bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
499 PG_RETURN_BYTEA_P(vlena);
502 Datum
503 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
505 StringInfo state;
507 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
509 /* Append the value unless null. */
510 if (!PG_ARGISNULL(1))
512 bytea *value = PG_GETARG_BYTEA_PP(1);
514 /* On the first time through, we ignore the delimiter. */
515 if (state == NULL)
516 state = makeStringAggState(fcinfo);
517 else if (!PG_ARGISNULL(2))
519 bytea *delim = PG_GETARG_BYTEA_PP(2);
521 appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
524 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
528 * The transition type for string_agg() is declared to be "internal",
529 * which is a pass-by-value type the same size as a pointer.
531 PG_RETURN_POINTER(state);
534 Datum
535 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
537 StringInfo state;
539 /* cannot be called directly because of internal-type argument */
540 Assert(AggCheckCallContext(fcinfo, NULL));
542 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
544 if (state != NULL)
546 bytea *result;
548 result = (bytea *) palloc(state->len + VARHDRSZ);
549 SET_VARSIZE(result, state->len + VARHDRSZ);
550 memcpy(VARDATA(result), state->data, state->len);
551 PG_RETURN_BYTEA_P(result);
553 else
554 PG_RETURN_NULL();
558 * textin - converts "..." to internal representation
560 Datum
561 textin(PG_FUNCTION_ARGS)
563 char *inputText = PG_GETARG_CSTRING(0);
565 PG_RETURN_TEXT_P(cstring_to_text(inputText));
569 * textout - converts internal representation to "..."
571 Datum
572 textout(PG_FUNCTION_ARGS)
574 Datum txt = PG_GETARG_DATUM(0);
576 PG_RETURN_CSTRING(TextDatumGetCString(txt));
580 * textrecv - converts external binary format to text
582 Datum
583 textrecv(PG_FUNCTION_ARGS)
585 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
586 text *result;
587 char *str;
588 int nbytes;
590 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
592 result = cstring_to_text_with_len(str, nbytes);
593 pfree(str);
594 PG_RETURN_TEXT_P(result);
598 * textsend - converts text to binary format
600 Datum
601 textsend(PG_FUNCTION_ARGS)
603 text *t = PG_GETARG_TEXT_PP(0);
604 StringInfoData buf;
606 pq_begintypsend(&buf);
607 pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
608 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
613 * unknownin - converts "..." to internal representation
615 Datum
616 unknownin(PG_FUNCTION_ARGS)
618 char *str = PG_GETARG_CSTRING(0);
620 /* representation is same as cstring */
621 PG_RETURN_CSTRING(pstrdup(str));
625 * unknownout - converts internal representation to "..."
627 Datum
628 unknownout(PG_FUNCTION_ARGS)
630 /* representation is same as cstring */
631 char *str = PG_GETARG_CSTRING(0);
633 PG_RETURN_CSTRING(pstrdup(str));
637 * unknownrecv - converts external binary format to unknown
639 Datum
640 unknownrecv(PG_FUNCTION_ARGS)
642 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
643 char *str;
644 int nbytes;
646 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
647 /* representation is same as cstring */
648 PG_RETURN_CSTRING(str);
652 * unknownsend - converts unknown to binary format
654 Datum
655 unknownsend(PG_FUNCTION_ARGS)
657 /* representation is same as cstring */
658 char *str = PG_GETARG_CSTRING(0);
659 StringInfoData buf;
661 pq_begintypsend(&buf);
662 pq_sendtext(&buf, str, strlen(str));
663 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
/* ========== PUBLIC ROUTINES ========== */
670 * textlen -
671 * returns the logical length of a text*
672 * (which is less than the VARSIZE of the text*)
674 Datum
675 textlen(PG_FUNCTION_ARGS)
677 Datum str = PG_GETARG_DATUM(0);
679 /* try to avoid decompressing argument */
680 PG_RETURN_INT32(text_length(str));
684 * text_length -
685 * Does the real work for textlen()
687 * This is broken out so it can be called directly by other string processing
688 * functions. Note that the argument is passed as a Datum, to indicate that
689 * it may still be in compressed form. We can avoid decompressing it at all
690 * in some cases.
692 static int32
693 text_length(Datum str)
695 /* fastpath when max encoding length is one */
696 if (pg_database_encoding_max_length() == 1)
697 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
698 else
700 text *t = DatumGetTextPP(str);
702 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
703 VARSIZE_ANY_EXHDR(t)));
708 * textoctetlen -
709 * returns the physical length of a text*
710 * (which is less than the VARSIZE of the text*)
712 Datum
713 textoctetlen(PG_FUNCTION_ARGS)
715 Datum str = PG_GETARG_DATUM(0);
717 /* We need not detoast the input at all */
718 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
722 * textcat -
723 * takes two text* and returns a text* that is the concatenation of
724 * the two.
726 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
727 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
728 * Allocate space for output in all cases.
729 * XXX - thomas 1997-07-10
731 Datum
732 textcat(PG_FUNCTION_ARGS)
734 text *t1 = PG_GETARG_TEXT_PP(0);
735 text *t2 = PG_GETARG_TEXT_PP(1);
737 PG_RETURN_TEXT_P(text_catenate(t1, t2));
741 * text_catenate
742 * Guts of textcat(), broken out so it can be used by other functions
744 * Arguments can be in short-header form, but not compressed or out-of-line
746 static text *
747 text_catenate(text *t1, text *t2)
749 text *result;
750 int len1,
751 len2,
752 len;
753 char *ptr;
755 len1 = VARSIZE_ANY_EXHDR(t1);
756 len2 = VARSIZE_ANY_EXHDR(t2);
758 /* paranoia ... probably should throw error instead? */
759 if (len1 < 0)
760 len1 = 0;
761 if (len2 < 0)
762 len2 = 0;
764 len = len1 + len2 + VARHDRSZ;
765 result = (text *) palloc(len);
767 /* Set size of result string... */
768 SET_VARSIZE(result, len);
770 /* Fill data field of result string... */
771 ptr = VARDATA(result);
772 if (len1 > 0)
773 memcpy(ptr, VARDATA_ANY(t1), len1);
774 if (len2 > 0)
775 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
777 return result;
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * It is caller's responsibility that there actually are n characters;
 * the string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	if (pg_database_encoding_max_length() == 1)
	{
		/* Optimization for single-byte encodings */
		return n;
	}
	else
	{
		const char *s;

		/* step over n characters, one pg_mblen() at a time */
		for (s = p; n > 0; n--)
			s += pg_mblen(s);

		return s - p;
	}
}
807 * text_substr()
808 * Return a substring starting at the specified position.
809 * - thomas 1997-12-31
811 * Input:
812 * - string
813 * - starting position (is one-based)
814 * - string length
816 * If the starting position is zero or less, then return from the start of the string
817 * adjusting the length to be consistent with the "negative start" per SQL.
818 * If the length is less than zero, return the remaining string.
820 * Added multibyte support.
821 * - Tatsuo Ishii 1998-4-21
822 * Changed behavior if starting position is less than one to conform to SQL behavior.
823 * Formerly returned the entire string; now returns a portion.
824 * - Thomas Lockhart 1998-12-10
825 * Now uses faster TOAST-slicing interface
826 * - John Gray 2002-02-22
827 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
828 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
829 * error; if E < 1, return '', not entire string). Fixed MB related bug when
830 * S > LC and < LC + 4 sometimes garbage characters are returned.
831 * - Joe Conway 2002-08-10
833 Datum
834 text_substr(PG_FUNCTION_ARGS)
836 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
837 PG_GETARG_INT32(1),
838 PG_GETARG_INT32(2),
839 false));
843 * text_substr_no_len -
844 * Wrapper to avoid opr_sanity failure due to
845 * one function accepting a different number of args.
847 Datum
848 text_substr_no_len(PG_FUNCTION_ARGS)
850 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
851 PG_GETARG_INT32(1),
852 -1, true));
856 * text_substring -
857 * Does the real work for text_substr() and text_substr_no_len()
859 * This is broken out so it can be called directly by other string processing
860 * functions. Note that the argument is passed as a Datum, to indicate that
861 * it may still be in compressed/toasted form. We can avoid detoasting all
862 * of it in some cases.
864 * The result is always a freshly palloc'd datum.
866 static text *
867 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
869 int32 eml = pg_database_encoding_max_length();
870 int32 S = start; /* start position */
871 int32 S1; /* adjusted start position */
872 int32 L1; /* adjusted substring length */
873 int32 E; /* end position */
876 * SQL99 says S can be zero or negative, but we still must fetch from the
877 * start of the string.
879 S1 = Max(S, 1);
881 /* life is easy if the encoding max length is 1 */
882 if (eml == 1)
884 if (length_not_specified) /* special case - get length to end of
885 * string */
886 L1 = -1;
887 else if (length < 0)
889 /* SQL99 says to throw an error for E < S, i.e., negative length */
890 ereport(ERROR,
891 (errcode(ERRCODE_SUBSTRING_ERROR),
892 errmsg("negative substring length not allowed")));
893 L1 = -1; /* silence stupider compilers */
895 else if (pg_add_s32_overflow(S, length, &E))
898 * L could be large enough for S + L to overflow, in which case
899 * the substring must run to end of string.
901 L1 = -1;
903 else
906 * A zero or negative value for the end position can happen if the
907 * start was negative or one. SQL99 says to return a zero-length
908 * string.
910 if (E < 1)
911 return cstring_to_text("");
913 L1 = E - S1;
917 * If the start position is past the end of the string, SQL99 says to
918 * return a zero-length string -- DatumGetTextPSlice() will do that
919 * for us. We need only convert S1 to zero-based starting position.
921 return DatumGetTextPSlice(str, S1 - 1, L1);
923 else if (eml > 1)
926 * When encoding max length is > 1, we can't get LC without
927 * detoasting, so we'll grab a conservatively large slice now and go
928 * back later to do the right thing
930 int32 slice_start;
931 int32 slice_size;
932 int32 slice_strlen;
933 text *slice;
934 int32 E1;
935 int32 i;
936 char *p;
937 char *s;
938 text *ret;
941 * We need to start at position zero because there is no way to know
942 * in advance which byte offset corresponds to the supplied start
943 * position.
945 slice_start = 0;
947 if (length_not_specified) /* special case - get length to end of
948 * string */
949 slice_size = L1 = -1;
950 else if (length < 0)
952 /* SQL99 says to throw an error for E < S, i.e., negative length */
953 ereport(ERROR,
954 (errcode(ERRCODE_SUBSTRING_ERROR),
955 errmsg("negative substring length not allowed")));
956 slice_size = L1 = -1; /* silence stupider compilers */
958 else if (pg_add_s32_overflow(S, length, &E))
961 * L could be large enough for S + L to overflow, in which case
962 * the substring must run to end of string.
964 slice_size = L1 = -1;
966 else
969 * A zero or negative value for the end position can happen if the
970 * start was negative or one. SQL99 says to return a zero-length
971 * string.
973 if (E < 1)
974 return cstring_to_text("");
977 * if E is past the end of the string, the tuple toaster will
978 * truncate the length for us
980 L1 = E - S1;
983 * Total slice size in bytes can't be any longer than the start
984 * position plus substring length times the encoding max length.
985 * If that overflows, we can just use -1.
987 if (pg_mul_s32_overflow(E, eml, &slice_size))
988 slice_size = -1;
992 * If we're working with an untoasted source, no need to do an extra
993 * copying step.
995 if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
996 VARATT_IS_EXTERNAL(DatumGetPointer(str)))
997 slice = DatumGetTextPSlice(str, slice_start, slice_size);
998 else
999 slice = (text *) DatumGetPointer(str);
1001 /* see if we got back an empty string */
1002 if (VARSIZE_ANY_EXHDR(slice) == 0)
1004 if (slice != (text *) DatumGetPointer(str))
1005 pfree(slice);
1006 return cstring_to_text("");
1009 /* Now we can get the actual length of the slice in MB characters */
1010 slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1011 VARSIZE_ANY_EXHDR(slice));
1014 * Check that the start position wasn't > slice_strlen. If so, SQL99
1015 * says to return a zero-length string.
1017 if (S1 > slice_strlen)
1019 if (slice != (text *) DatumGetPointer(str))
1020 pfree(slice);
1021 return cstring_to_text("");
1025 * Adjust L1 and E1 now that we know the slice string length. Again
1026 * remember that S1 is one based, and slice_start is zero based.
1028 if (L1 > -1)
1029 E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1030 else
1031 E1 = slice_start + 1 + slice_strlen;
1034 * Find the start position in the slice; remember S1 is not zero based
1036 p = VARDATA_ANY(slice);
1037 for (i = 0; i < S1 - 1; i++)
1038 p += pg_mblen(p);
1040 /* hang onto a pointer to our start position */
1041 s = p;
1044 * Count the actual bytes used by the substring of the requested
1045 * length.
1047 for (i = S1; i < E1; i++)
1048 p += pg_mblen(p);
1050 ret = (text *) palloc(VARHDRSZ + (p - s));
1051 SET_VARSIZE(ret, VARHDRSZ + (p - s));
1052 memcpy(VARDATA(ret), s, (p - s));
1054 if (slice != (text *) DatumGetPointer(str))
1055 pfree(slice);
1057 return ret;
1059 else
1060 elog(ERROR, "invalid backend encoding: encoding max length < 1");
1062 /* not reached: suppress compiler warning */
1063 return NULL;
1067 * textoverlay
1068 * Replace specified substring of first string with second
1070 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1071 * This code is a direct implementation of what the standard says.
1073 Datum
1074 textoverlay(PG_FUNCTION_ARGS)
1076 text *t1 = PG_GETARG_TEXT_PP(0);
1077 text *t2 = PG_GETARG_TEXT_PP(1);
1078 int sp = PG_GETARG_INT32(2); /* substring start position */
1079 int sl = PG_GETARG_INT32(3); /* substring length */
1081 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1084 Datum
1085 textoverlay_no_len(PG_FUNCTION_ARGS)
1087 text *t1 = PG_GETARG_TEXT_PP(0);
1088 text *t2 = PG_GETARG_TEXT_PP(1);
1089 int sp = PG_GETARG_INT32(2); /* substring start position */
1090 int sl;
1092 sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1093 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1096 static text *
1097 text_overlay(text *t1, text *t2, int sp, int sl)
1099 text *result;
1100 text *s1;
1101 text *s2;
1102 int sp_pl_sl;
1105 * Check for possible integer-overflow cases. For negative sp, throw a
1106 * "substring length" error because that's what should be expected
1107 * according to the spec's definition of OVERLAY().
1109 if (sp <= 0)
1110 ereport(ERROR,
1111 (errcode(ERRCODE_SUBSTRING_ERROR),
1112 errmsg("negative substring length not allowed")));
1113 if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1114 ereport(ERROR,
1115 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1116 errmsg("integer out of range")));
1118 s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1119 s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1120 result = text_catenate(s1, t2);
1121 result = text_catenate(result, s2);
1123 return result;
1127 * textpos -
1128 * Return the position of the specified substring.
1129 * Implements the SQL POSITION() function.
1130 * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1131 * - thomas 1997-07-27
1133 Datum
1134 textpos(PG_FUNCTION_ARGS)
1136 text *str = PG_GETARG_TEXT_PP(0);
1137 text *search_str = PG_GETARG_TEXT_PP(1);
1139 PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1143 * text_position -
1144 * Does the real work for textpos()
1146 * Inputs:
1147 * t1 - string to be searched
1148 * t2 - pattern to match within t1
1149 * Result:
1150 * Character index of the first matched char, starting from 1,
1151 * or 0 if no match.
1153 * This is broken out so it can be called directly by other string processing
1154 * functions.
1156 static int
1157 text_position(text *t1, text *t2, Oid collid)
1159 TextPositionState state;
1160 int result;
1162 /* Empty needle always matches at position 1 */
1163 if (VARSIZE_ANY_EXHDR(t2) < 1)
1164 return 1;
1166 /* Otherwise, can't match if haystack is shorter than needle */
1167 if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1168 return 0;
1170 text_position_setup(t1, t2, collid, &state);
1171 if (!text_position_next(&state))
1172 result = 0;
1173 else
1174 result = text_position_get_match_pos(&state);
1175 text_position_cleanup(&state);
1176 return result;
1181 * text_position_setup, text_position_next, text_position_cleanup -
1182 * Component steps of text_position()
1184 * These are broken out so that a string can be efficiently searched for
1185 * multiple occurrences of the same pattern. text_position_next may be
1186 * called multiple times, and it advances to the next match on each call.
1187 * text_position_get_match_ptr() and text_position_get_match_pos() return
1188 * a pointer or 1-based character position of the last match, respectively.
1190 * The "state" variable is normally just a local variable in the caller.
1192 * NOTE: text_position_next skips over the matched portion. For example,
1193 * searching for "xx" in "xxx" returns only one match, not two.
1196 static void
1197 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1199 int len1 = VARSIZE_ANY_EXHDR(t1);
1200 int len2 = VARSIZE_ANY_EXHDR(t2);
1201 pg_locale_t mylocale = 0;
1203 check_collation_set(collid);
1205 if (!lc_collate_is_c(collid))
1206 mylocale = pg_newlocale_from_collation(collid);
1208 if (mylocale && !mylocale->deterministic)
1209 ereport(ERROR,
1210 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1211 errmsg("nondeterministic collations are not supported for substring searches")));
1213 Assert(len1 > 0);
1214 Assert(len2 > 0);
1217 * Even with a multi-byte encoding, we perform the search using the raw
1218 * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1219 * because in UTF-8 the byte sequence of one character cannot contain
1220 * another character. For other multi-byte encodings, we do the search
1221 * initially as a simple byte search, ignoring multibyte issues, but
1222 * verify afterwards that the match we found is at a character boundary,
1223 * and continue the search if it was a false match.
1225 if (pg_database_encoding_max_length() == 1)
1226 state->is_multibyte_char_in_char = false;
1227 else if (GetDatabaseEncoding() == PG_UTF8)
1228 state->is_multibyte_char_in_char = false;
1229 else
1230 state->is_multibyte_char_in_char = true;
1232 state->str1 = VARDATA_ANY(t1);
1233 state->str2 = VARDATA_ANY(t2);
1234 state->len1 = len1;
1235 state->len2 = len2;
1236 state->last_match = NULL;
1237 state->refpoint = state->str1;
1238 state->refpos = 0;
1241 * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1242 * notes we use the terminology that the "haystack" is the string to be
1243 * searched (t1) and the "needle" is the pattern being sought (t2).
1245 * If the needle is empty or bigger than the haystack then there is no
1246 * point in wasting cycles initializing the table. We also choose not to
1247 * use B-M-H for needles of length 1, since the skip table can't possibly
1248 * save anything in that case.
1250 if (len1 >= len2 && len2 > 1)
1252 int searchlength = len1 - len2;
1253 int skiptablemask;
1254 int last;
1255 int i;
1256 const char *str2 = state->str2;
1259 * First we must determine how much of the skip table to use. The
1260 * declaration of TextPositionState allows up to 256 elements, but for
1261 * short search problems we don't really want to have to initialize so
1262 * many elements --- it would take too long in comparison to the
1263 * actual search time. So we choose a useful skip table size based on
1264 * the haystack length minus the needle length. The closer the needle
1265 * length is to the haystack length the less useful skipping becomes.
1267 * Note: since we use bit-masking to select table elements, the skip
1268 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1270 if (searchlength < 16)
1271 skiptablemask = 3;
1272 else if (searchlength < 64)
1273 skiptablemask = 7;
1274 else if (searchlength < 128)
1275 skiptablemask = 15;
1276 else if (searchlength < 512)
1277 skiptablemask = 31;
1278 else if (searchlength < 2048)
1279 skiptablemask = 63;
1280 else if (searchlength < 4096)
1281 skiptablemask = 127;
1282 else
1283 skiptablemask = 255;
1284 state->skiptablemask = skiptablemask;
1287 * Initialize the skip table. We set all elements to the needle
1288 * length, since this is the correct skip distance for any character
1289 * not found in the needle.
1291 for (i = 0; i <= skiptablemask; i++)
1292 state->skiptable[i] = len2;
1295 * Now examine the needle. For each character except the last one,
1296 * set the corresponding table element to the appropriate skip
1297 * distance. Note that when two characters share the same skip table
1298 * entry, the one later in the needle must determine the skip
1299 * distance.
1301 last = len2 - 1;
1303 for (i = 0; i < last; i++)
1304 state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1309 * Advance to the next match, starting from the end of the previous match
1310 * (or the beginning of the string, on first call). Returns true if a match
1311 * is found.
1313 * Note that this refuses to match an empty-string needle. Most callers
1314 * will have handled that case specially and we'll never see it here.
1316 static bool
1317 text_position_next(TextPositionState *state)
1319 int needle_len = state->len2;
1320 char *start_ptr;
1321 char *matchptr;
1323 if (needle_len <= 0)
1324 return false; /* result for empty pattern */
1326 /* Start from the point right after the previous match. */
1327 if (state->last_match)
1328 start_ptr = state->last_match + needle_len;
1329 else
1330 start_ptr = state->str1;
1332 retry:
1333 matchptr = text_position_next_internal(start_ptr, state);
1335 if (!matchptr)
1336 return false;
1339 * Found a match for the byte sequence. If this is a multibyte encoding,
1340 * where one character's byte sequence can appear inside a longer
1341 * multi-byte character, we need to verify that the match was at a
1342 * character boundary, not in the middle of a multi-byte character.
1344 if (state->is_multibyte_char_in_char)
1346 /* Walk one character at a time, until we reach the match. */
1348 /* the search should never move backwards. */
1349 Assert(state->refpoint <= matchptr);
1351 while (state->refpoint < matchptr)
1353 /* step to next character. */
1354 state->refpoint += pg_mblen(state->refpoint);
1355 state->refpos++;
1358 * If we stepped over the match's start position, then it was a
1359 * false positive, where the byte sequence appeared in the middle
1360 * of a multi-byte character. Skip it, and continue the search at
1361 * the next character boundary.
1363 if (state->refpoint > matchptr)
1365 start_ptr = state->refpoint;
1366 goto retry;
1371 state->last_match = matchptr;
1372 return true;
1376 * Subroutine of text_position_next(). This searches for the raw byte
1377 * sequence, ignoring any multi-byte encoding issues. Returns the first
1378 * match starting at 'start_ptr', or NULL if no match is found.
1380 static char *
1381 text_position_next_internal(char *start_ptr, TextPositionState *state)
1383 int haystack_len = state->len1;
1384 int needle_len = state->len2;
1385 int skiptablemask = state->skiptablemask;
1386 const char *haystack = state->str1;
1387 const char *needle = state->str2;
1388 const char *haystack_end = &haystack[haystack_len];
1389 const char *hptr;
1391 Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1393 if (needle_len == 1)
1395 /* No point in using B-M-H for a one-character needle */
1396 char nchar = *needle;
1398 hptr = start_ptr;
1399 while (hptr < haystack_end)
1401 if (*hptr == nchar)
1402 return (char *) hptr;
1403 hptr++;
1406 else
1408 const char *needle_last = &needle[needle_len - 1];
1410 /* Start at startpos plus the length of the needle */
1411 hptr = start_ptr + needle_len - 1;
1412 while (hptr < haystack_end)
1414 /* Match the needle scanning *backward* */
1415 const char *nptr;
1416 const char *p;
1418 nptr = needle_last;
1419 p = hptr;
1420 while (*nptr == *p)
1422 /* Matched it all? If so, return 1-based position */
1423 if (nptr == needle)
1424 return (char *) p;
1425 nptr--, p--;
1429 * No match, so use the haystack char at hptr to decide how far to
1430 * advance. If the needle had any occurrence of that character
1431 * (or more precisely, one sharing the same skiptable entry)
1432 * before its last character, then we advance far enough to align
1433 * the last such needle character with that haystack position.
1434 * Otherwise we can advance by the whole needle length.
1436 hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1440 return 0; /* not found */
1444 * Return a pointer to the current match.
1446 * The returned pointer points into the original haystack string.
1448 static char *
1449 text_position_get_match_ptr(TextPositionState *state)
1451 return state->last_match;
1455 * Return the offset of the current match.
1457 * The offset is in characters, 1-based.
1459 static int
1460 text_position_get_match_pos(TextPositionState *state)
1462 /* Convert the byte position to char position. */
1463 state->refpos += pg_mbstrlen_with_len(state->refpoint,
1464 state->last_match - state->refpoint);
1465 state->refpoint = state->last_match;
1466 return state->refpos + 1;
1470 * Reset search state to the initial state installed by text_position_setup.
1472 * The next call to text_position_next will search from the beginning
1473 * of the string.
1475 static void
1476 text_position_reset(TextPositionState *state)
1478 state->last_match = NULL;
1479 state->refpoint = state->str1;
1480 state->refpos = 0;
1483 static void
1484 text_position_cleanup(TextPositionState *state)
1486 /* no cleanup needed */
1490 static void
1491 check_collation_set(Oid collid)
1493 if (!OidIsValid(collid))
1496 * This typically means that the parser could not resolve a conflict
1497 * of implicit collations, so report it that way.
1499 ereport(ERROR,
1500 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1501 errmsg("could not determine which collation to use for string comparison"),
1502 errhint("Use the COLLATE clause to set the collation explicitly.")));
1506 /* varstr_cmp()
1507 * Comparison function for text strings with given lengths.
1508 * Includes locale support, but must copy strings to temporary memory
1509 * to allow null-termination for inputs to strcoll().
1510 * Returns an integer less than, equal to, or greater than zero, indicating
1511 * whether arg1 is less than, equal to, or greater than arg2.
1513 * Note: many functions that depend on this are marked leakproof; therefore,
1514 * avoid reporting the actual contents of the input when throwing errors.
1515 * All errors herein should be things that can't happen except on corrupt
1516 * data, anyway; otherwise we will have trouble with indexing strings that
1517 * would cause them.
1520 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1522 int result;
1524 check_collation_set(collid);
1527 * Unfortunately, there is no strncoll(), so in the non-C locale case we
1528 * have to do some memory copying. This turns out to be significantly
1529 * slower, so we optimize the case where LC_COLLATE is C. We also try to
1530 * optimize relatively-short strings by avoiding palloc/pfree overhead.
1532 if (lc_collate_is_c(collid))
1534 result = memcmp(arg1, arg2, Min(len1, len2));
1535 if ((result == 0) && (len1 != len2))
1536 result = (len1 < len2) ? -1 : 1;
1538 else
1540 char a1buf[TEXTBUFLEN];
1541 char a2buf[TEXTBUFLEN];
1542 char *a1p,
1543 *a2p;
1544 pg_locale_t mylocale;
1546 mylocale = pg_newlocale_from_collation(collid);
1549 * memcmp() can't tell us which of two unequal strings sorts first,
1550 * but it's a cheap way to tell if they're equal. Testing shows that
1551 * memcmp() followed by strcoll() is only trivially slower than
1552 * strcoll() by itself, so we don't lose much if this doesn't work out
1553 * very often, and if it does - for example, because there are many
1554 * equal strings in the input - then we win big by avoiding expensive
1555 * collation-aware comparisons.
1557 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1558 return 0;
1560 #ifdef WIN32
1561 /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1562 if (GetDatabaseEncoding() == PG_UTF8
1563 && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1565 int a1len;
1566 int a2len;
1567 int r;
1569 if (len1 >= TEXTBUFLEN / 2)
1571 a1len = len1 * 2 + 2;
1572 a1p = palloc(a1len);
1574 else
1576 a1len = TEXTBUFLEN;
1577 a1p = a1buf;
1579 if (len2 >= TEXTBUFLEN / 2)
1581 a2len = len2 * 2 + 2;
1582 a2p = palloc(a2len);
1584 else
1586 a2len = TEXTBUFLEN;
1587 a2p = a2buf;
1590 /* stupid Microsloth API does not work for zero-length input */
1591 if (len1 == 0)
1592 r = 0;
1593 else
1595 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1596 (LPWSTR) a1p, a1len / 2);
1597 if (!r)
1598 ereport(ERROR,
1599 (errmsg("could not convert string to UTF-16: error code %lu",
1600 GetLastError())));
1602 ((LPWSTR) a1p)[r] = 0;
1604 if (len2 == 0)
1605 r = 0;
1606 else
1608 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1609 (LPWSTR) a2p, a2len / 2);
1610 if (!r)
1611 ereport(ERROR,
1612 (errmsg("could not convert string to UTF-16: error code %lu",
1613 GetLastError())));
1615 ((LPWSTR) a2p)[r] = 0;
1617 errno = 0;
1618 #ifdef HAVE_LOCALE_T
1619 if (mylocale)
1620 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1621 else
1622 #endif
1623 result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1624 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1625 * headers */
1626 ereport(ERROR,
1627 (errmsg("could not compare Unicode strings: %m")));
1629 /* Break tie if necessary. */
1630 if (result == 0 &&
1631 (!mylocale || mylocale->deterministic))
1633 result = memcmp(arg1, arg2, Min(len1, len2));
1634 if ((result == 0) && (len1 != len2))
1635 result = (len1 < len2) ? -1 : 1;
1638 if (a1p != a1buf)
1639 pfree(a1p);
1640 if (a2p != a2buf)
1641 pfree(a2p);
1643 return result;
1645 #endif /* WIN32 */
1647 if (len1 >= TEXTBUFLEN)
1648 a1p = (char *) palloc(len1 + 1);
1649 else
1650 a1p = a1buf;
1651 if (len2 >= TEXTBUFLEN)
1652 a2p = (char *) palloc(len2 + 1);
1653 else
1654 a2p = a2buf;
1656 memcpy(a1p, arg1, len1);
1657 a1p[len1] = '\0';
1658 memcpy(a2p, arg2, len2);
1659 a2p[len2] = '\0';
1661 if (mylocale)
1663 if (mylocale->provider == COLLPROVIDER_ICU)
1665 #ifdef USE_ICU
1666 #ifdef HAVE_UCOL_STRCOLLUTF8
1667 if (GetDatabaseEncoding() == PG_UTF8)
1669 UErrorCode status;
1671 status = U_ZERO_ERROR;
1672 result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1673 arg1, len1,
1674 arg2, len2,
1675 &status);
1676 if (U_FAILURE(status))
1677 ereport(ERROR,
1678 (errmsg("collation failed: %s", u_errorName(status))));
1680 else
1681 #endif
1683 int32_t ulen1,
1684 ulen2;
1685 UChar *uchar1,
1686 *uchar2;
1688 ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1689 ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1691 result = ucol_strcoll(mylocale->info.icu.ucol,
1692 uchar1, ulen1,
1693 uchar2, ulen2);
1695 pfree(uchar1);
1696 pfree(uchar2);
1698 #else /* not USE_ICU */
1699 /* shouldn't happen */
1700 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1701 #endif /* not USE_ICU */
1703 else
1705 #ifdef HAVE_LOCALE_T
1706 result = strcoll_l(a1p, a2p, mylocale->info.lt);
1707 #else
1708 /* shouldn't happen */
1709 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1710 #endif
1713 else
1714 result = strcoll(a1p, a2p);
1716 /* Break tie if necessary. */
1717 if (result == 0 &&
1718 (!mylocale || mylocale->deterministic))
1719 result = strcmp(a1p, a2p);
1721 if (a1p != a1buf)
1722 pfree(a1p);
1723 if (a2p != a2buf)
1724 pfree(a2p);
1727 return result;
1730 /* text_cmp()
1731 * Internal comparison function for text strings.
1732 * Returns -1, 0 or 1
1734 static int
1735 text_cmp(text *arg1, text *arg2, Oid collid)
1737 char *a1p,
1738 *a2p;
1739 int len1,
1740 len2;
1742 a1p = VARDATA_ANY(arg1);
1743 a2p = VARDATA_ANY(arg2);
1745 len1 = VARSIZE_ANY_EXHDR(arg1);
1746 len2 = VARSIZE_ANY_EXHDR(arg2);
1748 return varstr_cmp(a1p, len1, a2p, len2, collid);
1752 * Comparison functions for text strings.
1754 * Note: btree indexes need these routines not to leak memory; therefore,
1755 * be careful to free working copies of toasted datums. Most places don't
1756 * need to be so careful.
1759 Datum
1760 texteq(PG_FUNCTION_ARGS)
1762 Oid collid = PG_GET_COLLATION();
1763 bool locale_is_c = false;
1764 pg_locale_t mylocale = 0;
1765 bool result;
1767 check_collation_set(collid);
1769 if (lc_collate_is_c(collid))
1770 locale_is_c = true;
1771 else
1772 mylocale = pg_newlocale_from_collation(collid);
1774 if (locale_is_c || !mylocale || mylocale->deterministic)
1776 Datum arg1 = PG_GETARG_DATUM(0);
1777 Datum arg2 = PG_GETARG_DATUM(1);
1778 Size len1,
1779 len2;
1782 * Since we only care about equality or not-equality, we can avoid all
1783 * the expense of strcoll() here, and just do bitwise comparison. In
1784 * fact, we don't even have to do a bitwise comparison if we can show
1785 * the lengths of the strings are unequal; which might save us from
1786 * having to detoast one or both values.
1788 len1 = toast_raw_datum_size(arg1);
1789 len2 = toast_raw_datum_size(arg2);
1790 if (len1 != len2)
1791 result = false;
1792 else
1794 text *targ1 = DatumGetTextPP(arg1);
1795 text *targ2 = DatumGetTextPP(arg2);
1797 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1798 len1 - VARHDRSZ) == 0);
1800 PG_FREE_IF_COPY(targ1, 0);
1801 PG_FREE_IF_COPY(targ2, 1);
1804 else
1806 text *arg1 = PG_GETARG_TEXT_PP(0);
1807 text *arg2 = PG_GETARG_TEXT_PP(1);
1809 result = (text_cmp(arg1, arg2, collid) == 0);
1811 PG_FREE_IF_COPY(arg1, 0);
1812 PG_FREE_IF_COPY(arg2, 1);
1815 PG_RETURN_BOOL(result);
1818 Datum
1819 textne(PG_FUNCTION_ARGS)
1821 Oid collid = PG_GET_COLLATION();
1822 bool locale_is_c = false;
1823 pg_locale_t mylocale = 0;
1824 bool result;
1826 check_collation_set(collid);
1828 if (lc_collate_is_c(collid))
1829 locale_is_c = true;
1830 else
1831 mylocale = pg_newlocale_from_collation(collid);
1833 if (locale_is_c || !mylocale || mylocale->deterministic)
1835 Datum arg1 = PG_GETARG_DATUM(0);
1836 Datum arg2 = PG_GETARG_DATUM(1);
1837 Size len1,
1838 len2;
1840 /* See comment in texteq() */
1841 len1 = toast_raw_datum_size(arg1);
1842 len2 = toast_raw_datum_size(arg2);
1843 if (len1 != len2)
1844 result = true;
1845 else
1847 text *targ1 = DatumGetTextPP(arg1);
1848 text *targ2 = DatumGetTextPP(arg2);
1850 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1851 len1 - VARHDRSZ) != 0);
1853 PG_FREE_IF_COPY(targ1, 0);
1854 PG_FREE_IF_COPY(targ2, 1);
1857 else
1859 text *arg1 = PG_GETARG_TEXT_PP(0);
1860 text *arg2 = PG_GETARG_TEXT_PP(1);
1862 result = (text_cmp(arg1, arg2, collid) != 0);
1864 PG_FREE_IF_COPY(arg1, 0);
1865 PG_FREE_IF_COPY(arg2, 1);
1868 PG_RETURN_BOOL(result);
1871 Datum
1872 text_lt(PG_FUNCTION_ARGS)
1874 text *arg1 = PG_GETARG_TEXT_PP(0);
1875 text *arg2 = PG_GETARG_TEXT_PP(1);
1876 bool result;
1878 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1880 PG_FREE_IF_COPY(arg1, 0);
1881 PG_FREE_IF_COPY(arg2, 1);
1883 PG_RETURN_BOOL(result);
1886 Datum
1887 text_le(PG_FUNCTION_ARGS)
1889 text *arg1 = PG_GETARG_TEXT_PP(0);
1890 text *arg2 = PG_GETARG_TEXT_PP(1);
1891 bool result;
1893 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1895 PG_FREE_IF_COPY(arg1, 0);
1896 PG_FREE_IF_COPY(arg2, 1);
1898 PG_RETURN_BOOL(result);
1901 Datum
1902 text_gt(PG_FUNCTION_ARGS)
1904 text *arg1 = PG_GETARG_TEXT_PP(0);
1905 text *arg2 = PG_GETARG_TEXT_PP(1);
1906 bool result;
1908 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1910 PG_FREE_IF_COPY(arg1, 0);
1911 PG_FREE_IF_COPY(arg2, 1);
1913 PG_RETURN_BOOL(result);
1916 Datum
1917 text_ge(PG_FUNCTION_ARGS)
1919 text *arg1 = PG_GETARG_TEXT_PP(0);
1920 text *arg2 = PG_GETARG_TEXT_PP(1);
1921 bool result;
1923 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1925 PG_FREE_IF_COPY(arg1, 0);
1926 PG_FREE_IF_COPY(arg2, 1);
1928 PG_RETURN_BOOL(result);
1931 Datum
1932 text_starts_with(PG_FUNCTION_ARGS)
1934 Datum arg1 = PG_GETARG_DATUM(0);
1935 Datum arg2 = PG_GETARG_DATUM(1);
1936 Oid collid = PG_GET_COLLATION();
1937 pg_locale_t mylocale = 0;
1938 bool result;
1939 Size len1,
1940 len2;
1942 check_collation_set(collid);
1944 if (!lc_collate_is_c(collid))
1945 mylocale = pg_newlocale_from_collation(collid);
1947 if (mylocale && !mylocale->deterministic)
1948 ereport(ERROR,
1949 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1950 errmsg("nondeterministic collations are not supported for substring searches")));
1952 len1 = toast_raw_datum_size(arg1);
1953 len2 = toast_raw_datum_size(arg2);
1954 if (len2 > len1)
1955 result = false;
1956 else
1958 text *targ1 = text_substring(arg1, 1, len2, false);
1959 text *targ2 = DatumGetTextPP(arg2);
1961 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1962 VARSIZE_ANY_EXHDR(targ2)) == 0);
1964 PG_FREE_IF_COPY(targ1, 0);
1965 PG_FREE_IF_COPY(targ2, 1);
1968 PG_RETURN_BOOL(result);
1971 Datum
1972 bttextcmp(PG_FUNCTION_ARGS)
1974 text *arg1 = PG_GETARG_TEXT_PP(0);
1975 text *arg2 = PG_GETARG_TEXT_PP(1);
1976 int32 result;
1978 result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1980 PG_FREE_IF_COPY(arg1, 0);
1981 PG_FREE_IF_COPY(arg2, 1);
1983 PG_RETURN_INT32(result);
1986 Datum
1987 bttextsortsupport(PG_FUNCTION_ARGS)
1989 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1990 Oid collid = ssup->ssup_collation;
1991 MemoryContext oldcontext;
1993 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1995 /* Use generic string SortSupport */
1996 varstr_sortsupport(ssup, TEXTOID, collid);
1998 MemoryContextSwitchTo(oldcontext);
2000 PG_RETURN_VOID();
2004 * Generic sortsupport interface for character type's operator classes.
2005 * Includes locale support, and support for BpChar semantics (i.e. removing
2006 * trailing spaces before comparison).
2008 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2009 * same representation. Callers that always use the C collation (e.g.
2010 * non-collatable type callers like bytea) may have NUL bytes in their strings;
2011 * this will not work with any other collation, though.
2013 void
2014 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
2016 bool abbreviate = ssup->abbreviate;
2017 bool collate_c = false;
2018 VarStringSortSupport *sss;
2019 pg_locale_t locale = 0;
2021 check_collation_set(collid);
2024 * If possible, set ssup->comparator to a function which can be used to
2025 * directly compare two datums. If we can do this, we'll avoid the
2026 * overhead of a trip through the fmgr layer for every comparison, which
2027 * can be substantial.
2029 * Most typically, we'll set the comparator to varlenafastcmp_locale,
2030 * which uses strcoll() to perform comparisons. We use that for the
2031 * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2032 * LC_COLLATE = C, we can make things quite a bit faster with
2033 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2034 * memcmp() rather than strcoll().
2036 if (lc_collate_is_c(collid))
2038 if (typid == BPCHAROID)
2039 ssup->comparator = bpcharfastcmp_c;
2040 else if (typid == NAMEOID)
2042 ssup->comparator = namefastcmp_c;
2043 /* Not supporting abbreviation with type NAME, for now */
2044 abbreviate = false;
2046 else
2047 ssup->comparator = varstrfastcmp_c;
2049 collate_c = true;
2051 else
2054 * We need a collation-sensitive comparison. To make things faster,
2055 * we'll figure out the collation based on the locale id and cache the
2056 * result.
2058 locale = pg_newlocale_from_collation(collid);
2061 * There is a further exception on Windows. When the database
2062 * encoding is UTF-8 and we are not using the C collation, complex
2063 * hacks are required. We don't currently have a comparator that
2064 * handles that case, so we fall back on the slow method of having the
2065 * sort code invoke bttextcmp() (in the case of text) via the fmgr
2066 * trampoline. ICU locales work just the same on Windows, however.
2068 #ifdef WIN32
2069 if (GetDatabaseEncoding() == PG_UTF8 &&
2070 !(locale && locale->provider == COLLPROVIDER_ICU))
2071 return;
2072 #endif
2075 * We use varlenafastcmp_locale except for type NAME.
2077 if (typid == NAMEOID)
2079 ssup->comparator = namefastcmp_locale;
2080 /* Not supporting abbreviation with type NAME, for now */
2081 abbreviate = false;
2083 else
2084 ssup->comparator = varlenafastcmp_locale;
2088 * Unfortunately, it seems that abbreviation for non-C collations is
2089 * broken on many common platforms; testing of multiple versions of glibc
2090 * reveals that, for many locales, strcoll() and strxfrm() do not return
2091 * consistent results, which is fatal to this optimization. While no
2092 * other libc other than Cygwin has so far been shown to have a problem,
2093 * we take the conservative course of action for right now and disable
2094 * this categorically. (Users who are certain this isn't a problem on
2095 * their system can define TRUST_STRXFRM.)
2097 * Even apart from the risk of broken locales, it's possible that there
2098 * are platforms where the use of abbreviated keys should be disabled at
2099 * compile time. Having only 4 byte datums could make worst-case
2100 * performance drastically more likely, for example. Moreover, macOS's
2101 * strxfrm() implementation is known to not effectively concentrate a
2102 * significant amount of entropy from the original string in earlier
2103 * transformed blobs. It's possible that other supported platforms are
2104 * similarly encumbered. So, if we ever get past disabling this
2105 * categorically, we may still want or need to disable it for particular
2106 * platforms.
2108 #ifndef TRUST_STRXFRM
2109 if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2110 abbreviate = false;
2111 #endif
2114 * If we're using abbreviated keys, or if we're using a locale-aware
2115 * comparison, we need to initialize a VarStringSortSupport object. Both
2116 * cases will make use of the temporary buffers we initialize here for
2117 * scratch space (and to detect requirement for BpChar semantics from
2118 * caller), and the abbreviation case requires additional state.
2120 if (abbreviate || !collate_c)
2122 sss = palloc(sizeof(VarStringSortSupport));
2123 sss->buf1 = palloc(TEXTBUFLEN);
2124 sss->buflen1 = TEXTBUFLEN;
2125 sss->buf2 = palloc(TEXTBUFLEN);
2126 sss->buflen2 = TEXTBUFLEN;
2127 /* Start with invalid values */
2128 sss->last_len1 = -1;
2129 sss->last_len2 = -1;
2130 /* Initialize */
2131 sss->last_returned = 0;
2132 sss->locale = locale;
2135 * To avoid somehow confusing a strxfrm() blob and an original string,
2136 * constantly keep track of the variety of data that buf1 and buf2
2137 * currently contain.
2139 * Comparisons may be interleaved with conversion calls. Frequently,
2140 * conversions and comparisons are batched into two distinct phases,
2141 * but the correctness of caching cannot hinge upon this. For
2142 * comparison caching, buffer state is only trusted if cache_blob is
2143 * found set to false, whereas strxfrm() caching only trusts the state
2144 * when cache_blob is found set to true.
2146 * Arbitrarily initialize cache_blob to true.
2148 sss->cache_blob = true;
2149 sss->collate_c = collate_c;
2150 sss->typid = typid;
2151 ssup->ssup_extra = sss;
2154 * If possible, plan to use the abbreviated keys optimization. The
2155 * core code may switch back to authoritative comparator should
2156 * abbreviation be aborted.
2158 if (abbreviate)
2160 sss->prop_card = 0.20;
2161 initHyperLogLog(&sss->abbr_card, 10);
2162 initHyperLogLog(&sss->full_card, 10);
2163 ssup->abbrev_full_comparator = ssup->comparator;
2164 ssup->comparator = ssup_datum_unsigned_cmp;
2165 ssup->abbrev_converter = varstr_abbrev_convert;
2166 ssup->abbrev_abort = varstr_abbrev_abort;
2172 * sortsupport comparison func (for C locale case)
2174 static int
2175 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2177 VarString *arg1 = DatumGetVarStringPP(x);
2178 VarString *arg2 = DatumGetVarStringPP(y);
2179 char *a1p,
2180 *a2p;
2181 int len1,
2182 len2,
2183 result;
2185 a1p = VARDATA_ANY(arg1);
2186 a2p = VARDATA_ANY(arg2);
2188 len1 = VARSIZE_ANY_EXHDR(arg1);
2189 len2 = VARSIZE_ANY_EXHDR(arg2);
2191 result = memcmp(a1p, a2p, Min(len1, len2));
2192 if ((result == 0) && (len1 != len2))
2193 result = (len1 < len2) ? -1 : 1;
2195 /* We can't afford to leak memory here. */
2196 if (PointerGetDatum(arg1) != x)
2197 pfree(arg1);
2198 if (PointerGetDatum(arg2) != y)
2199 pfree(arg2);
2201 return result;
2205 * sortsupport comparison func (for BpChar C locale case)
2207 * BpChar outsources its sortsupport to this module. Specialization for the
2208 * varstr_sortsupport BpChar case, modeled on
2209 * internal_bpchar_pattern_compare().
2211 static int
2212 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2214 BpChar *arg1 = DatumGetBpCharPP(x);
2215 BpChar *arg2 = DatumGetBpCharPP(y);
2216 char *a1p,
2217 *a2p;
2218 int len1,
2219 len2,
2220 result;
2222 a1p = VARDATA_ANY(arg1);
2223 a2p = VARDATA_ANY(arg2);
2225 len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2226 len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2228 result = memcmp(a1p, a2p, Min(len1, len2));
2229 if ((result == 0) && (len1 != len2))
2230 result = (len1 < len2) ? -1 : 1;
2232 /* We can't afford to leak memory here. */
2233 if (PointerGetDatum(arg1) != x)
2234 pfree(arg1);
2235 if (PointerGetDatum(arg2) != y)
2236 pfree(arg2);
2238 return result;
2242 * sortsupport comparison func (for NAME C locale case)
2244 static int
2245 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2247 Name arg1 = DatumGetName(x);
2248 Name arg2 = DatumGetName(y);
2250 return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2254 * sortsupport comparison func (for locale case with all varlena types)
2256 static int
2257 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2259 VarString *arg1 = DatumGetVarStringPP(x);
2260 VarString *arg2 = DatumGetVarStringPP(y);
2261 char *a1p,
2262 *a2p;
2263 int len1,
2264 len2,
2265 result;
2267 a1p = VARDATA_ANY(arg1);
2268 a2p = VARDATA_ANY(arg2);
2270 len1 = VARSIZE_ANY_EXHDR(arg1);
2271 len2 = VARSIZE_ANY_EXHDR(arg2);
2273 result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2275 /* We can't afford to leak memory here. */
2276 if (PointerGetDatum(arg1) != x)
2277 pfree(arg1);
2278 if (PointerGetDatum(arg2) != y)
2279 pfree(arg2);
2281 return result;
2285 * sortsupport comparison func (for locale case with NAME type)
2287 static int
2288 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2290 Name arg1 = DatumGetName(x);
2291 Name arg2 = DatumGetName(y);
2293 return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2294 NameStr(*arg2), strlen(NameStr(*arg2)),
2295 ssup);
/*
 * sortsupport comparison func for locale cases
 *
 * Compares the len1 bytes at a1p with the len2 bytes at a2p using the
 * collation state kept in the VarStringSortSupport struct reachable through
 * ssup->ssup_extra.
 *
 * Returns 0 immediately when the two inputs are bytewise identical.
 * Otherwise returns the result of the ICU or libc collation routine; when
 * that routine reports a tie and the collation is deterministic (or no
 * locale object is set), strcmp() on the NUL-terminated buffer copies decides.
 *
 * Side effects: buf1/buf2 may be enlarged with repalloc() and overwritten
 * with NUL-terminated copies of the inputs; last_len1, last_len2, cache_blob
 * and last_returned are updated so that a following call with the same pair
 * of strings can return the cached result.
 */
static int
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	int			result;
	bool		arg1_match;

	/* Fast pre-check for equality, as discussed in varstr_cmp() */
	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
	{
		/*
		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
		 * last_len2.  Existing contents of buffers might still be used by
		 * next call.
		 *
		 * It's fine to allow the comparison of BpChar padding bytes here,
		 * even though that implies that the memcmp() will usually be
		 * performed for BpChar callers (though multibyte characters could
		 * still prevent that from occurring).  The memcmp() is still very
		 * cheap, and BpChar's funny semantics have us remove trailing spaces
		 * (not limited to padding), so we need make no distinction between
		 * padding space characters and "real" space characters.
		 */
		return 0;
	}

	if (sss->typid == BPCHAROID)
	{
		/* Get true number of bytes, ignoring trailing spaces */
		len1 = bpchartruelen(a1p, len1);
		len2 = bpchartruelen(a2p, len2);
	}

	/*
	 * Make sure each buffer can hold its string plus a terminating NUL:
	 * grow to at least len + 1, normally doubling, with the doubling capped
	 * at MaxAllocSize.
	 */
	if (len1 >= sss->buflen1)
	{
		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
		sss->buf1 = repalloc(sss->buf1, sss->buflen1);
	}
	if (len2 >= sss->buflen2)
	{
		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
		sss->buf2 = repalloc(sss->buf2, sss->buflen2);
	}

	/*
	 * We're likely to be asked to compare the same strings repeatedly, and
	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
	 * comparisons, even though in general there is no reason to think that
	 * that will work out (every string datum may be unique).  Caching does
	 * not slow things down measurably when it doesn't work out, and can speed
	 * things up by rather a lot when it does.  In part, this is because the
	 * memcmp() compares data from cachelines that are needed in L1 cache even
	 * when the last comparison's result cannot be reused.
	 */
	arg1_match = true;
	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
	{
		arg1_match = false;
		memcpy(sss->buf1, a1p, len1);
		sss->buf1[len1] = '\0';
		sss->last_len1 = len1;
	}

	/*
	 * If we're comparing the same two strings as last time, we can return the
	 * same answer without calling strcoll() again.  This is more likely than
	 * it seems (at least with moderate to low cardinality sets), because
	 * quicksort compares the same pivot against many values.
	 */
	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
	{
		memcpy(sss->buf2, a2p, len2);
		sss->buf2[len2] = '\0';
		sss->last_len2 = len2;
	}
	else if (arg1_match && !sss->cache_blob)
	{
		/* Use result cached following last actual strcoll() call */
		return sss->last_returned;
	}

	if (sss->locale)
	{
		if (sss->locale->provider == COLLPROVIDER_ICU)
		{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
			/* With a UTF8 database, hand the original bytes straight to ICU */
			if (GetDatabaseEncoding() == PG_UTF8)
			{
				UErrorCode	status;

				status = U_ZERO_ERROR;
				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
										  a1p, len1,
										  a2p, len2,
										  &status);
				if (U_FAILURE(status))
					ereport(ERROR,
							(errmsg("collation failed: %s", u_errorName(status))));
			}
			else
#endif
			{
				/* Otherwise convert both inputs to UChar and compare those */
				int32_t		ulen1,
							ulen2;
				UChar	   *uchar1,
						   *uchar2;

				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
				ulen2 = icu_to_uchar(&uchar2, a2p, len2);

				result = ucol_strcoll(sss->locale->info.icu.ucol,
									  uchar1, ulen1,
									  uchar2, ulen2);

				pfree(uchar1);
				pfree(uchar2);
			}
#else							/* not USE_ICU */
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif							/* not USE_ICU */
		}
		else
		{
#ifdef HAVE_LOCALE_T
			/* the libc routines need the NUL-terminated buffer copies */
			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
#else
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
		}
	}
	else
		result = strcoll(sss->buf1, sss->buf2);

	/* Break tie if necessary. */
	if (result == 0 &&
		(!sss->locale || sss->locale->deterministic))
		result = strcmp(sss->buf1, sss->buf2);

	/* Cache result, perhaps saving an expensive strcoll() call next time */
	sss->cache_blob = false;
	sss->last_returned = result;
	return result;
}
/*
 * Conversion routine for sortsupport.  Converts original to abbreviated key
 * representation.  Our encoding strategy is simple -- pack the first 8 bytes
 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
 * stored in reverse order), and treat it as an unsigned integer.  When the "C"
 * locale is used, or in case of bytea, just memcpy() from original instead.
 *
 * "original" is the full varlena datum; "ssup" carries the
 * VarStringSortSupport working state in ssup_extra.  The return value is the
 * abbreviated key, passed through DatumBigEndianToNative().
 *
 * Side effects: buf1/buf2 and their bookkeeping fields are reused as scratch
 * space for the source string and its transformed blob, cache_blob is set,
 * and both HyperLogLog counters (full_card, abbr_card) are fed unless the
 * cached blob from the previous call could be reused.  The detoasted copy is
 * freed when it is not the same pointer as "original".
 */
static Datum
varstr_abbrev_convert(Datum original, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	VarString  *authoritative = DatumGetVarStringPP(original);
	char	   *authoritative_data = VARDATA_ANY(authoritative);

	/* working state */
	Datum		res;
	char	   *pres;
	int			len;
	uint32		hash;

	pres = (char *) &res;
	/* memset(), so any non-overwritten bytes are NUL */
	memset(pres, 0, sizeof(Datum));
	len = VARSIZE_ANY_EXHDR(authoritative);

	/* Get number of bytes, ignoring trailing spaces */
	if (sss->typid == BPCHAROID)
		len = bpchartruelen(authoritative_data, len);

	/*
	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
	 * abbreviate keys.  The full comparator for the C locale is always
	 * memcmp().  It would be incorrect to allow bytea callers (callers that
	 * always force the C collation -- bytea isn't a collatable type, but this
	 * approach is convenient) to use strxfrm().  This is because bytea
	 * strings may contain NUL bytes.  Besides, this should be faster, too.
	 *
	 * More generally, it's okay that bytea callers can have NUL bytes in
	 * strings because abbreviated cmp need not make a distinction between
	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
	 * authoritative representation.  Hopefully a comparison at or past one
	 * abbreviated key's terminating NUL byte will resolve the comparison
	 * without consulting the authoritative representation; specifically, some
	 * later non-NUL byte in the longer string can resolve the comparison
	 * against a subsequent terminating NUL in the shorter string.  There will
	 * usually be what is effectively a "length-wise" resolution there and
	 * then.
	 *
	 * If that doesn't work out -- if all bytes in the longer string
	 * positioned at or past the offset of the smaller string's (first)
	 * terminating NUL are actually representative of NUL bytes in the
	 * authoritative binary string (perhaps with some *terminating* NUL bytes
	 * towards the end of the longer string iff it happens to still be small)
	 * -- then an authoritative tie-breaker will happen, and do the right
	 * thing: explicitly consider string length.
	 */
	if (sss->collate_c)
		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
	else
	{
		Size		bsize;
#ifdef USE_ICU
		int32_t		ulen = -1;
		UChar	   *uchar = NULL;
#endif

		/*
		 * We're not using the C collation, so fall back on strxfrm or ICU
		 * analogs.
		 */

		/* By convention, we use buffer 1 to store and NUL-terminate */
		if (len >= sss->buflen1)
		{
			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
			sss->buf1 = repalloc(sss->buf1, sss->buflen1);
		}

		/* Might be able to reuse strxfrm() blob from last call */
		if (sss->last_len1 == len && sss->cache_blob &&
			memcmp(sss->buf1, authoritative_data, len) == 0)
		{
			/* buf2 still holds the blob; last_len2 is its recorded length */
			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
			/* No change affecting cardinality, so no hashing required */
			goto done;
		}

		memcpy(sss->buf1, authoritative_data, len);

		/*
		 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
		 * necessary for ICU, but doesn't hurt.
		 */
		sss->buf1[len] = '\0';
		sss->last_len1 = len;

#ifdef USE_ICU
		/* When using ICU and not UTF8, convert string to UChar. */
		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
			GetDatabaseEncoding() != PG_UTF8)
			ulen = icu_to_uchar(&uchar, sss->buf1, len);
#endif

		/*
		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
		 * and try again.  Both of these functions have the result buffer
		 * content undefined if the result did not fit, so we need to retry
		 * until everything fits, even though we only need the first few bytes
		 * in the end.  When using ucol_nextSortKeyPart(), however, we only
		 * ask for as many bytes as we actually need.
		 */
		for (;;)
		{
#ifdef USE_ICU
			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
			{
				/*
				 * When using UTF8, use the iteration interface so we only
				 * need to produce as many bytes as we actually need.
				 */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UCharIterator iter;
					uint32_t	state[2];
					UErrorCode	status;

					uiter_setUTF8(&iter, sss->buf1, len);
					state[0] = state[1] = 0;	/* won't need that again */
					status = U_ZERO_ERROR;
					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
												 &iter,
												 state,
												 (uint8_t *) sss->buf2,
												 Min(sizeof(Datum), sss->buflen2),
												 &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("sort key generation failed: %s",
										u_errorName(status))));
				}
				else
					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
											uchar, ulen,
											(uint8_t *) sss->buf2, sss->buflen2);
			}
			else
#endif
#ifdef HAVE_LOCALE_T
			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
				bsize = strxfrm_l(sss->buf2, sss->buf1,
								  sss->buflen2, sss->locale->info.lt);
			else
#endif
				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);

			/* remember the blob length for the reuse path above */
			sss->last_len2 = bsize;
			if (bsize < sss->buflen2)
				break;

			/*
			 * Grow buffer and retry.
			 */
			sss->buflen2 = Max(bsize + 1,
							   Min(sss->buflen2 * 2, MaxAllocSize));
			sss->buf2 = repalloc(sss->buf2, sss->buflen2);
		}

		/*
		 * Every Datum byte is always compared.  This is safe because the
		 * strxfrm() blob is itself NUL terminated, leaving no danger of
		 * misinterpreting any NUL bytes not intended to be interpreted as
		 * logically representing termination.
		 *
		 * (Actually, even if there were NUL bytes in the blob it would be
		 * okay.  See remarks on bytea case above.)
		 */
		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));

#ifdef USE_ICU
		if (uchar)
			pfree(uchar);
#endif
	}

	/*
	 * Maintain approximate cardinality of both abbreviated keys and original,
	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
	 * the worst case, where we do many string transformations for no saving
	 * in full strcoll()-based comparisons.  These statistics are used by
	 * varstr_abbrev_abort().
	 *
	 * First, Hash key proper, or a significant fraction of it.  Mix in length
	 * in order to compensate for cases where differences are past
	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
	 */
	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
								   Min(len, PG_CACHE_LINE_SIZE)));

	if (len > PG_CACHE_LINE_SIZE)
		hash ^= DatumGetUInt32(hash_uint32((uint32) len));

	addHyperLogLog(&sss->full_card, hash);

	/* Hash abbreviated key */
#if SIZEOF_DATUM == 8
	{
		uint32		lohalf,
					hihalf;

		/* fold the 64-bit key down to 32 bits before hashing */
		lohalf = (uint32) res;
		hihalf = (uint32) (res >> 32);
		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
	}
#else							/* SIZEOF_DATUM != 8 */
	hash = DatumGetUInt32(hash_uint32((uint32) res));
#endif

	addHyperLogLog(&sss->abbr_card, hash);

	/* Cache result, perhaps saving an expensive strxfrm() call next time */
	sss->cache_blob = true;
done:

	/*
	 * Byteswap on little-endian machines.
	 *
	 * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
	 * 3-way comparator) works correctly on all platforms.  If we didn't do
	 * this, the comparator would have to call memcmp() with a pair of
	 * pointers to the first byte of each abbreviated key, which is slower.
	 */
	res = DatumBigEndianToNative(res);

	/* Don't leak memory here */
	if (PointerGetDatum(authoritative) != original)
		pfree(authoritative);

	return res;
}
2689 * Callback for estimating effectiveness of abbreviated key optimization, using
2690 * heuristic rules. Returns value indicating if the abbreviation optimization
2691 * should be aborted, based on its projected effectiveness.
2693 static bool
2694 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2696 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2697 double abbrev_distinct,
2698 key_distinct;
2700 Assert(ssup->abbreviate);
2702 /* Have a little patience */
2703 if (memtupcount < 100)
2704 return false;
2706 abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2707 key_distinct = estimateHyperLogLog(&sss->full_card);
2710 * Clamp cardinality estimates to at least one distinct value. While
2711 * NULLs are generally disregarded, if only NULL values were seen so far,
2712 * that might misrepresent costs if we failed to clamp.
2714 if (abbrev_distinct <= 1.0)
2715 abbrev_distinct = 1.0;
2717 if (key_distinct <= 1.0)
2718 key_distinct = 1.0;
2721 * In the worst case all abbreviated keys are identical, while at the same
2722 * time there are differences within full key strings not captured in
2723 * abbreviations.
2725 #ifdef TRACE_SORT
2726 if (trace_sort)
2728 double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2730 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2731 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2732 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2733 sss->prop_card);
2735 #endif
2738 * If the number of distinct abbreviated keys approximately matches the
2739 * number of distinct authoritative original keys, that's reason enough to
2740 * proceed. We can win even with a very low cardinality set if most
2741 * tie-breakers only memcmp(). This is by far the most important
2742 * consideration.
2744 * While comparisons that are resolved at the abbreviated key level are
2745 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2746 * those two outcomes are so much cheaper than a full strcoll() once
2747 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2748 * cardinality against the overall size of the set in order to more
2749 * accurately model costs. Assume that an abbreviated comparison, and an
2750 * abbreviated comparison with a cheap memcmp()-based authoritative
2751 * resolution are equivalent.
2753 if (abbrev_distinct > key_distinct * sss->prop_card)
2756 * When we have exceeded 10,000 tuples, decay required cardinality
2757 * aggressively for next call.
2759 * This is useful because the number of comparisons required on
2760 * average increases at a linearithmic rate, and at roughly 10,000
2761 * tuples that factor will start to dominate over the linear costs of
2762 * string transformation (this is a conservative estimate). The decay
2763 * rate is chosen to be a little less aggressive than halving -- which
2764 * (since we're called at points at which memtupcount has doubled)
2765 * would never see the cost model actually abort past the first call
2766 * following a decay. This decay rate is mostly a precaution against
2767 * a sudden, violent swing in how well abbreviated cardinality tracks
2768 * full key cardinality. The decay also serves to prevent a marginal
2769 * case from being aborted too late, when too much has already been
2770 * invested in string transformation.
2772 * It's possible for sets of several million distinct strings with
2773 * mere tens of thousands of distinct abbreviated keys to still
2774 * benefit very significantly. This will generally occur provided
2775 * each abbreviated key is a proxy for a roughly uniform number of the
2776 * set's full keys. If it isn't so, we hope to catch that early and
2777 * abort. If it isn't caught early, by the time the problem is
2778 * apparent it's probably not worth aborting.
2780 if (memtupcount > 10000)
2781 sss->prop_card *= 0.65;
2783 return false;
2787 * Abort abbreviation strategy.
2789 * The worst case, where all abbreviated keys are identical while all
2790 * original strings differ will typically only see a regression of about
2791 * 10% in execution time for small to medium sized lists of strings.
2792 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2793 * often expect very large improvements, particularly with sets of strings
2794 * of moderately high to high abbreviated cardinality. There is little to
2795 * lose but much to gain, which our strategy reflects.
2797 #ifdef TRACE_SORT
2798 if (trace_sort)
2799 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2800 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2801 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2802 #endif
2804 return true;
2808 * Generic equalimage support function for character type's operator classes.
2809 * Disables the use of deduplication with nondeterministic collations.
2811 Datum
2812 btvarstrequalimage(PG_FUNCTION_ARGS)
2814 /* Oid opcintype = PG_GETARG_OID(0); */
2815 Oid collid = PG_GET_COLLATION();
2817 check_collation_set(collid);
2819 if (lc_collate_is_c(collid) ||
2820 collid == DEFAULT_COLLATION_OID ||
2821 get_collation_isdeterministic(collid))
2822 PG_RETURN_BOOL(true);
2823 else
2824 PG_RETURN_BOOL(false);
2827 Datum
2828 text_larger(PG_FUNCTION_ARGS)
2830 text *arg1 = PG_GETARG_TEXT_PP(0);
2831 text *arg2 = PG_GETARG_TEXT_PP(1);
2832 text *result;
2834 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2836 PG_RETURN_TEXT_P(result);
2839 Datum
2840 text_smaller(PG_FUNCTION_ARGS)
2842 text *arg1 = PG_GETARG_TEXT_PP(0);
2843 text *arg2 = PG_GETARG_TEXT_PP(1);
2844 text *result;
2846 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2848 PG_RETURN_TEXT_P(result);
/*
 * Cross-type comparison functions for types text and name.
 */
2856 Datum
2857 nameeqtext(PG_FUNCTION_ARGS)
2859 Name arg1 = PG_GETARG_NAME(0);
2860 text *arg2 = PG_GETARG_TEXT_PP(1);
2861 size_t len1 = strlen(NameStr(*arg1));
2862 size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2863 Oid collid = PG_GET_COLLATION();
2864 bool result;
2866 check_collation_set(collid);
2868 if (collid == C_COLLATION_OID)
2869 result = (len1 == len2 &&
2870 memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2871 else
2872 result = (varstr_cmp(NameStr(*arg1), len1,
2873 VARDATA_ANY(arg2), len2,
2874 collid) == 0);
2876 PG_FREE_IF_COPY(arg2, 1);
2878 PG_RETURN_BOOL(result);
2881 Datum
2882 texteqname(PG_FUNCTION_ARGS)
2884 text *arg1 = PG_GETARG_TEXT_PP(0);
2885 Name arg2 = PG_GETARG_NAME(1);
2886 size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2887 size_t len2 = strlen(NameStr(*arg2));
2888 Oid collid = PG_GET_COLLATION();
2889 bool result;
2891 check_collation_set(collid);
2893 if (collid == C_COLLATION_OID)
2894 result = (len1 == len2 &&
2895 memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2896 else
2897 result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2898 NameStr(*arg2), len2,
2899 collid) == 0);
2901 PG_FREE_IF_COPY(arg1, 0);
2903 PG_RETURN_BOOL(result);
2906 Datum
2907 namenetext(PG_FUNCTION_ARGS)
2909 Name arg1 = PG_GETARG_NAME(0);
2910 text *arg2 = PG_GETARG_TEXT_PP(1);
2911 size_t len1 = strlen(NameStr(*arg1));
2912 size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2913 Oid collid = PG_GET_COLLATION();
2914 bool result;
2916 check_collation_set(collid);
2918 if (collid == C_COLLATION_OID)
2919 result = !(len1 == len2 &&
2920 memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2921 else
2922 result = !(varstr_cmp(NameStr(*arg1), len1,
2923 VARDATA_ANY(arg2), len2,
2924 collid) == 0);
2926 PG_FREE_IF_COPY(arg2, 1);
2928 PG_RETURN_BOOL(result);
2931 Datum
2932 textnename(PG_FUNCTION_ARGS)
2934 text *arg1 = PG_GETARG_TEXT_PP(0);
2935 Name arg2 = PG_GETARG_NAME(1);
2936 size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2937 size_t len2 = strlen(NameStr(*arg2));
2938 Oid collid = PG_GET_COLLATION();
2939 bool result;
2941 check_collation_set(collid);
2943 if (collid == C_COLLATION_OID)
2944 result = !(len1 == len2 &&
2945 memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2946 else
2947 result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2948 NameStr(*arg2), len2,
2949 collid) == 0);
2951 PG_FREE_IF_COPY(arg1, 0);
2953 PG_RETURN_BOOL(result);
2956 Datum
2957 btnametextcmp(PG_FUNCTION_ARGS)
2959 Name arg1 = PG_GETARG_NAME(0);
2960 text *arg2 = PG_GETARG_TEXT_PP(1);
2961 int32 result;
2963 result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2964 VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2965 PG_GET_COLLATION());
2967 PG_FREE_IF_COPY(arg2, 1);
2969 PG_RETURN_INT32(result);
2972 Datum
2973 bttextnamecmp(PG_FUNCTION_ARGS)
2975 text *arg1 = PG_GETARG_TEXT_PP(0);
2976 Name arg2 = PG_GETARG_NAME(1);
2977 int32 result;
2979 result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2980 NameStr(*arg2), strlen(NameStr(*arg2)),
2981 PG_GET_COLLATION());
2983 PG_FREE_IF_COPY(arg1, 0);
2985 PG_RETURN_INT32(result);
/*
 * CmpCall(fn): invoke the two-argument comparison function "fn" on this
 * call's arguments 0 and 1, under this call's collation, and yield its
 * int32 result.  Only usable inside a PG_FUNCTION_ARGS function.
 */
#define CmpCall(fn) \
	DatumGetInt32(DirectFunctionCall2Coll((fn), \
										  PG_GET_COLLATION(), \
										  PG_GETARG_DATUM(0), \
										  PG_GETARG_DATUM(1)))
2994 Datum
2995 namelttext(PG_FUNCTION_ARGS)
2997 PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
3000 Datum
3001 nameletext(PG_FUNCTION_ARGS)
3003 PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
3006 Datum
3007 namegttext(PG_FUNCTION_ARGS)
3009 PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
3012 Datum
3013 namegetext(PG_FUNCTION_ARGS)
3015 PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
3018 Datum
3019 textltname(PG_FUNCTION_ARGS)
3021 PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
3024 Datum
3025 textlename(PG_FUNCTION_ARGS)
3027 PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
3030 Datum
3031 textgtname(PG_FUNCTION_ARGS)
3033 PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3036 Datum
3037 textgename(PG_FUNCTION_ARGS)
3039 PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3042 #undef CmpCall
/*
 * The following operators support character-by-character comparison
 * of text datums, to allow building indexes suitable for LIKE clauses.
 * Note that the regular texteq/textne comparison operators, and regular
 * support functions 1 and 2 with "C" collation are assumed to be
 * compatible with these!
 */
3053 static int
3054 internal_text_pattern_compare(text *arg1, text *arg2)
3056 int result;
3057 int len1,
3058 len2;
3060 len1 = VARSIZE_ANY_EXHDR(arg1);
3061 len2 = VARSIZE_ANY_EXHDR(arg2);
3063 result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3064 if (result != 0)
3065 return result;
3066 else if (len1 < len2)
3067 return -1;
3068 else if (len1 > len2)
3069 return 1;
3070 else
3071 return 0;
3075 Datum
3076 text_pattern_lt(PG_FUNCTION_ARGS)
3078 text *arg1 = PG_GETARG_TEXT_PP(0);
3079 text *arg2 = PG_GETARG_TEXT_PP(1);
3080 int result;
3082 result = internal_text_pattern_compare(arg1, arg2);
3084 PG_FREE_IF_COPY(arg1, 0);
3085 PG_FREE_IF_COPY(arg2, 1);
3087 PG_RETURN_BOOL(result < 0);
3091 Datum
3092 text_pattern_le(PG_FUNCTION_ARGS)
3094 text *arg1 = PG_GETARG_TEXT_PP(0);
3095 text *arg2 = PG_GETARG_TEXT_PP(1);
3096 int result;
3098 result = internal_text_pattern_compare(arg1, arg2);
3100 PG_FREE_IF_COPY(arg1, 0);
3101 PG_FREE_IF_COPY(arg2, 1);
3103 PG_RETURN_BOOL(result <= 0);
3107 Datum
3108 text_pattern_ge(PG_FUNCTION_ARGS)
3110 text *arg1 = PG_GETARG_TEXT_PP(0);
3111 text *arg2 = PG_GETARG_TEXT_PP(1);
3112 int result;
3114 result = internal_text_pattern_compare(arg1, arg2);
3116 PG_FREE_IF_COPY(arg1, 0);
3117 PG_FREE_IF_COPY(arg2, 1);
3119 PG_RETURN_BOOL(result >= 0);
3123 Datum
3124 text_pattern_gt(PG_FUNCTION_ARGS)
3126 text *arg1 = PG_GETARG_TEXT_PP(0);
3127 text *arg2 = PG_GETARG_TEXT_PP(1);
3128 int result;
3130 result = internal_text_pattern_compare(arg1, arg2);
3132 PG_FREE_IF_COPY(arg1, 0);
3133 PG_FREE_IF_COPY(arg2, 1);
3135 PG_RETURN_BOOL(result > 0);
3139 Datum
3140 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3142 text *arg1 = PG_GETARG_TEXT_PP(0);
3143 text *arg2 = PG_GETARG_TEXT_PP(1);
3144 int result;
3146 result = internal_text_pattern_compare(arg1, arg2);
3148 PG_FREE_IF_COPY(arg1, 0);
3149 PG_FREE_IF_COPY(arg2, 1);
3151 PG_RETURN_INT32(result);
3155 Datum
3156 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3158 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3159 MemoryContext oldcontext;
3161 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3163 /* Use generic string SortSupport, forcing "C" collation */
3164 varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3166 MemoryContextSwitchTo(oldcontext);
3168 PG_RETURN_VOID();
3172 /*-------------------------------------------------------------
3173 * byteaoctetlen
3175 * get the number of bytes contained in an instance of type 'bytea'
3176 *-------------------------------------------------------------
3178 Datum
3179 byteaoctetlen(PG_FUNCTION_ARGS)
3181 Datum str = PG_GETARG_DATUM(0);
3183 /* We need not detoast the input at all */
3184 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3188 * byteacat -
3189 * takes two bytea* and returns a bytea* that is the concatenation of
3190 * the two.
3192 * Cloned from textcat and modified as required.
3194 Datum
3195 byteacat(PG_FUNCTION_ARGS)
3197 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3198 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3200 PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3204 * bytea_catenate
3205 * Guts of byteacat(), broken out so it can be used by other functions
3207 * Arguments can be in short-header form, but not compressed or out-of-line
3209 static bytea *
3210 bytea_catenate(bytea *t1, bytea *t2)
3212 bytea *result;
3213 int len1,
3214 len2,
3215 len;
3216 char *ptr;
3218 len1 = VARSIZE_ANY_EXHDR(t1);
3219 len2 = VARSIZE_ANY_EXHDR(t2);
3221 /* paranoia ... probably should throw error instead? */
3222 if (len1 < 0)
3223 len1 = 0;
3224 if (len2 < 0)
3225 len2 = 0;
3227 len = len1 + len2 + VARHDRSZ;
3228 result = (bytea *) palloc(len);
3230 /* Set size of result string... */
3231 SET_VARSIZE(result, len);
3233 /* Fill data field of result string... */
3234 ptr = VARDATA(result);
3235 if (len1 > 0)
3236 memcpy(ptr, VARDATA_ANY(t1), len1);
3237 if (len2 > 0)
3238 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3240 return result;
/* Build a bytea from a C string by running it through byteain() */
#define PG_STR_GET_BYTEA(cstr) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr)))
3247 * bytea_substr()
3248 * Return a substring starting at the specified position.
3249 * Cloned from text_substr and modified as required.
3251 * Input:
3252 * - string
3253 * - starting position (is one-based)
3254 * - string length (optional)
3256 * If the starting position is zero or less, then return from the start of the string
3257 * adjusting the length to be consistent with the "negative start" per SQL.
3258 * If the length is less than zero, an ERROR is thrown. If no third argument
3259 * (length) is provided, the length to the end of the string is assumed.
3261 Datum
3262 bytea_substr(PG_FUNCTION_ARGS)
3264 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3265 PG_GETARG_INT32(1),
3266 PG_GETARG_INT32(2),
3267 false));
3271 * bytea_substr_no_len -
3272 * Wrapper to avoid opr_sanity failure due to
3273 * one function accepting a different number of args.
3275 Datum
3276 bytea_substr_no_len(PG_FUNCTION_ARGS)
3278 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3279 PG_GETARG_INT32(1),
3281 true));
3284 static bytea *
3285 bytea_substring(Datum str,
3286 int S,
3287 int L,
3288 bool length_not_specified)
3290 int32 S1; /* adjusted start position */
3291 int32 L1; /* adjusted substring length */
3292 int32 E; /* end position */
3295 * The logic here should generally match text_substring().
3297 S1 = Max(S, 1);
3299 if (length_not_specified)
3302 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3303 * end of the string if we pass it a negative value for length.
3305 L1 = -1;
3307 else if (L < 0)
3309 /* SQL99 says to throw an error for E < S, i.e., negative length */
3310 ereport(ERROR,
3311 (errcode(ERRCODE_SUBSTRING_ERROR),
3312 errmsg("negative substring length not allowed")));
3313 L1 = -1; /* silence stupider compilers */
3315 else if (pg_add_s32_overflow(S, L, &E))
3318 * L could be large enough for S + L to overflow, in which case the
3319 * substring must run to end of string.
3321 L1 = -1;
3323 else
3326 * A zero or negative value for the end position can happen if the
3327 * start was negative or one. SQL99 says to return a zero-length
3328 * string.
3330 if (E < 1)
3331 return PG_STR_GET_BYTEA("");
3333 L1 = E - S1;
3337 * If the start position is past the end of the string, SQL99 says to
3338 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3339 * us. We need only convert S1 to zero-based starting position.
3341 return DatumGetByteaPSlice(str, S1 - 1, L1);
3345 * byteaoverlay
3346 * Replace specified substring of first string with second
3348 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3349 * This code is a direct implementation of what the standard says.
3351 Datum
3352 byteaoverlay(PG_FUNCTION_ARGS)
3354 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3355 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3356 int sp = PG_GETARG_INT32(2); /* substring start position */
3357 int sl = PG_GETARG_INT32(3); /* substring length */
3359 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3362 Datum
3363 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3365 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3366 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3367 int sp = PG_GETARG_INT32(2); /* substring start position */
3368 int sl;
3370 sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3371 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3374 static bytea *
3375 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3377 bytea *result;
3378 bytea *s1;
3379 bytea *s2;
3380 int sp_pl_sl;
3383 * Check for possible integer-overflow cases. For negative sp, throw a
3384 * "substring length" error because that's what should be expected
3385 * according to the spec's definition of OVERLAY().
3387 if (sp <= 0)
3388 ereport(ERROR,
3389 (errcode(ERRCODE_SUBSTRING_ERROR),
3390 errmsg("negative substring length not allowed")));
3391 if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3392 ereport(ERROR,
3393 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3394 errmsg("integer out of range")));
3396 s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3397 s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3398 result = bytea_catenate(s1, t2);
3399 result = bytea_catenate(result, s2);
3401 return result;
3405 * bit_count
3407 Datum
3408 bytea_bit_count(PG_FUNCTION_ARGS)
3410 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3412 PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3416 * byteapos -
3417 * Return the position of the specified substring.
3418 * Implements the SQL POSITION() function.
3419 * Cloned from textpos and modified as required.
3421 Datum
3422 byteapos(PG_FUNCTION_ARGS)
3424 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3425 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3426 int pos;
3427 int px,
3429 int len1,
3430 len2;
3431 char *p1,
3432 *p2;
3434 len1 = VARSIZE_ANY_EXHDR(t1);
3435 len2 = VARSIZE_ANY_EXHDR(t2);
3437 if (len2 <= 0)
3438 PG_RETURN_INT32(1); /* result for empty pattern */
3440 p1 = VARDATA_ANY(t1);
3441 p2 = VARDATA_ANY(t2);
3443 pos = 0;
3444 px = (len1 - len2);
3445 for (p = 0; p <= px; p++)
3447 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3449 pos = p + 1;
3450 break;
3452 p1++;
3455 PG_RETURN_INT32(pos);
3458 /*-------------------------------------------------------------
3459 * byteaGetByte
3461 * this routine treats "bytea" as an array of bytes.
3462 * It returns the Nth byte (a number between 0 and 255).
3463 *-------------------------------------------------------------
3465 Datum
3466 byteaGetByte(PG_FUNCTION_ARGS)
3468 bytea *v = PG_GETARG_BYTEA_PP(0);
3469 int32 n = PG_GETARG_INT32(1);
3470 int len;
3471 int byte;
3473 len = VARSIZE_ANY_EXHDR(v);
3475 if (n < 0 || n >= len)
3476 ereport(ERROR,
3477 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3478 errmsg("index %d out of valid range, 0..%d",
3479 n, len - 1)));
3481 byte = ((unsigned char *) VARDATA_ANY(v))[n];
3483 PG_RETURN_INT32(byte);
3486 /*-------------------------------------------------------------
3487 * byteaGetBit
3489 * This routine treats a "bytea" type like an array of bits.
3490 * It returns the value of the Nth bit (0 or 1).
3492 *-------------------------------------------------------------
3494 Datum
3495 byteaGetBit(PG_FUNCTION_ARGS)
3497 bytea *v = PG_GETARG_BYTEA_PP(0);
3498 int64 n = PG_GETARG_INT64(1);
3499 int byteNo,
3500 bitNo;
3501 int len;
3502 int byte;
3504 len = VARSIZE_ANY_EXHDR(v);
3506 if (n < 0 || n >= (int64) len * 8)
3507 ereport(ERROR,
3508 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3509 errmsg("index %lld out of valid range, 0..%lld",
3510 (long long) n, (long long) len * 8 - 1)));
3512 /* n/8 is now known < len, so safe to cast to int */
3513 byteNo = (int) (n / 8);
3514 bitNo = (int) (n % 8);
3516 byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3518 if (byte & (1 << bitNo))
3519 PG_RETURN_INT32(1);
3520 else
3521 PG_RETURN_INT32(0);
3524 /*-------------------------------------------------------------
3525 * byteaSetByte
3527 * Given an instance of type 'bytea' creates a new one with
3528 * the Nth byte set to the given value.
3530 *-------------------------------------------------------------
3532 Datum
3533 byteaSetByte(PG_FUNCTION_ARGS)
3535 bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3536 int32 n = PG_GETARG_INT32(1);
3537 int32 newByte = PG_GETARG_INT32(2);
3538 int len;
3540 len = VARSIZE(res) - VARHDRSZ;
3542 if (n < 0 || n >= len)
3543 ereport(ERROR,
3544 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3545 errmsg("index %d out of valid range, 0..%d",
3546 n, len - 1)));
3549 * Now set the byte.
3551 ((unsigned char *) VARDATA(res))[n] = newByte;
3553 PG_RETURN_BYTEA_P(res);
3556 /*-------------------------------------------------------------
3557 * byteaSetBit
3559 * Given an instance of type 'bytea' creates a new one with
3560 * the Nth bit set to the given value.
3562 *-------------------------------------------------------------
3564 Datum
3565 byteaSetBit(PG_FUNCTION_ARGS)
3567 bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3568 int64 n = PG_GETARG_INT64(1);
3569 int32 newBit = PG_GETARG_INT32(2);
3570 int len;
3571 int oldByte,
3572 newByte;
3573 int byteNo,
3574 bitNo;
3576 len = VARSIZE(res) - VARHDRSZ;
3578 if (n < 0 || n >= (int64) len * 8)
3579 ereport(ERROR,
3580 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3581 errmsg("index %lld out of valid range, 0..%lld",
3582 (long long) n, (long long) len * 8 - 1)));
3584 /* n/8 is now known < len, so safe to cast to int */
3585 byteNo = (int) (n / 8);
3586 bitNo = (int) (n % 8);
3589 * sanity check!
3591 if (newBit != 0 && newBit != 1)
3592 ereport(ERROR,
3593 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3594 errmsg("new bit must be 0 or 1")));
3597 * Update the byte.
3599 oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3601 if (newBit == 0)
3602 newByte = oldByte & (~(1 << bitNo));
3603 else
3604 newByte = oldByte | (1 << bitNo);
3606 ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3608 PG_RETURN_BYTEA_P(res);
3612 /* text_name()
3613 * Converts a text type to a Name type.
3615 Datum
3616 text_name(PG_FUNCTION_ARGS)
3618 text *s = PG_GETARG_TEXT_PP(0);
3619 Name result;
3620 int len;
3622 len = VARSIZE_ANY_EXHDR(s);
3624 /* Truncate oversize input */
3625 if (len >= NAMEDATALEN)
3626 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3628 /* We use palloc0 here to ensure result is zero-padded */
3629 result = (Name) palloc0(NAMEDATALEN);
3630 memcpy(NameStr(*result), VARDATA_ANY(s), len);
3632 PG_RETURN_NAME(result);
3635 /* name_text()
3636 * Converts a Name type to a text type.
3638 Datum
3639 name_text(PG_FUNCTION_ARGS)
3641 Name s = PG_GETARG_NAME(0);
3643 PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3648 * textToQualifiedNameList - convert a text object to list of names
3650 * This implements the input parsing needed by nextval() and other
3651 * functions that take a text parameter representing a qualified name.
3652 * We split the name at dots, downcase if not double-quoted, and
3653 * truncate names if they're too long.
3655 List *
3656 textToQualifiedNameList(text *textval)
3658 char *rawname;
3659 List *result = NIL;
3660 List *namelist;
3661 ListCell *l;
3663 /* Convert to C string (handles possible detoasting). */
3664 /* Note we rely on being able to modify rawname below. */
3665 rawname = text_to_cstring(textval);
3667 if (!SplitIdentifierString(rawname, '.', &namelist))
3668 ereport(ERROR,
3669 (errcode(ERRCODE_INVALID_NAME),
3670 errmsg("invalid name syntax")));
3672 if (namelist == NIL)
3673 ereport(ERROR,
3674 (errcode(ERRCODE_INVALID_NAME),
3675 errmsg("invalid name syntax")));
3677 foreach(l, namelist)
3679 char *curname = (char *) lfirst(l);
3681 result = lappend(result, makeString(pstrdup(curname)));
3684 pfree(rawname);
3685 list_free(namelist);
3687 return result;
3691 * SplitIdentifierString --- parse a string containing identifiers
3693 * This is the guts of textToQualifiedNameList, and is exported for use in
3694 * other situations such as parsing GUC variables. In the GUC case, it's
3695 * important to avoid memory leaks, so the API is designed to minimize the
3696 * amount of stuff that needs to be allocated and freed.
3698 * Inputs:
3699 * rawstring: the input string; must be overwritable! On return, it's
3700 * been modified to contain the separated identifiers.
3701 * separator: the separator punctuation expected between identifiers
3702 * (typically '.' or ','). Whitespace may also appear around
3703 * identifiers.
3704 * Outputs:
3705 * namelist: filled with a palloc'd list of pointers to identifiers within
3706 * rawstring. Caller should list_free() this even on error return.
3708 * Returns true if okay, false if there is a syntax error in the string.
3710 * Note that an empty string is considered okay here, though not in
3711 * textToQualifiedNameList.
3713 bool
3714 SplitIdentifierString(char *rawstring, char separator,
3715 List **namelist)
3717 char *nextp = rawstring;
3718 bool done = false;
3720 *namelist = NIL;
3722 while (scanner_isspace(*nextp))
3723 nextp++; /* skip leading whitespace */
3725 if (*nextp == '\0')
3726 return true; /* allow empty string */
3728 /* At the top of the loop, we are at start of a new identifier. */
3731 char *curname;
3732 char *endp;
3734 if (*nextp == '"')
3736 /* Quoted name --- collapse quote-quote pairs, no downcasing */
3737 curname = nextp + 1;
3738 for (;;)
3740 endp = strchr(nextp + 1, '"');
3741 if (endp == NULL)
3742 return false; /* mismatched quotes */
3743 if (endp[1] != '"')
3744 break; /* found end of quoted name */
3745 /* Collapse adjacent quotes into one quote, and look again */
3746 memmove(endp, endp + 1, strlen(endp));
3747 nextp = endp;
3749 /* endp now points at the terminating quote */
3750 nextp = endp + 1;
3752 else
3754 /* Unquoted name --- extends to separator or whitespace */
3755 char *downname;
3756 int len;
3758 curname = nextp;
3759 while (*nextp && *nextp != separator &&
3760 !scanner_isspace(*nextp))
3761 nextp++;
3762 endp = nextp;
3763 if (curname == nextp)
3764 return false; /* empty unquoted name not allowed */
3767 * Downcase the identifier, using same code as main lexer does.
3769 * XXX because we want to overwrite the input in-place, we cannot
3770 * support a downcasing transformation that increases the string
3771 * length. This is not a problem given the current implementation
3772 * of downcase_truncate_identifier, but we'll probably have to do
3773 * something about this someday.
3775 len = endp - curname;
3776 downname = downcase_truncate_identifier(curname, len, false);
3777 Assert(strlen(downname) <= len);
3778 strncpy(curname, downname, len); /* strncpy is required here */
3779 pfree(downname);
3782 while (scanner_isspace(*nextp))
3783 nextp++; /* skip trailing whitespace */
3785 if (*nextp == separator)
3787 nextp++;
3788 while (scanner_isspace(*nextp))
3789 nextp++; /* skip leading whitespace for next */
3790 /* we expect another name, so done remains false */
3792 else if (*nextp == '\0')
3793 done = true;
3794 else
3795 return false; /* invalid syntax */
3797 /* Now safe to overwrite separator with a null */
3798 *endp = '\0';
3800 /* Truncate name if it's overlength */
3801 truncate_identifier(curname, strlen(curname), false);
3804 * Finished isolating current name --- add it to list
3806 *namelist = lappend(*namelist, curname);
3808 /* Loop back if we didn't reach end of string */
3809 } while (!done);
3811 return true;
3816 * SplitDirectoriesString --- parse a string containing file/directory names
3818 * This works fine on file names too; the function name is historical.
3820 * This is similar to SplitIdentifierString, except that the parsing
3821 * rules are meant to handle pathnames instead of identifiers: there is
3822 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3823 * and we apply canonicalize_path() to each extracted string. Because of the
3824 * last, the returned strings are separately palloc'd rather than being
3825 * pointers into rawstring --- but we still scribble on rawstring.
3827 * Inputs:
3828 * rawstring: the input string; must be modifiable!
3829 * separator: the separator punctuation expected between directories
3830 * (typically ',' or ';'). Whitespace may also appear around
3831 * directories.
3832 * Outputs:
3833 * namelist: filled with a palloc'd list of directory names.
3834 * Caller should list_free_deep() this even on error return.
3836 * Returns true if okay, false if there is a syntax error in the string.
3838 * Note that an empty string is considered okay here.
3840 bool
3841 SplitDirectoriesString(char *rawstring, char separator,
3842 List **namelist)
3844 char *nextp = rawstring;
3845 bool done = false;
3847 *namelist = NIL;
3849 while (scanner_isspace(*nextp))
3850 nextp++; /* skip leading whitespace */
3852 if (*nextp == '\0')
3853 return true; /* allow empty string */
3855 /* At the top of the loop, we are at start of a new directory. */
3858 char *curname;
3859 char *endp;
3861 if (*nextp == '"')
3863 /* Quoted name --- collapse quote-quote pairs */
3864 curname = nextp + 1;
3865 for (;;)
3867 endp = strchr(nextp + 1, '"');
3868 if (endp == NULL)
3869 return false; /* mismatched quotes */
3870 if (endp[1] != '"')
3871 break; /* found end of quoted name */
3872 /* Collapse adjacent quotes into one quote, and look again */
3873 memmove(endp, endp + 1, strlen(endp));
3874 nextp = endp;
3876 /* endp now points at the terminating quote */
3877 nextp = endp + 1;
3879 else
3881 /* Unquoted name --- extends to separator or end of string */
3882 curname = endp = nextp;
3883 while (*nextp && *nextp != separator)
3885 /* trailing whitespace should not be included in name */
3886 if (!scanner_isspace(*nextp))
3887 endp = nextp + 1;
3888 nextp++;
3890 if (curname == endp)
3891 return false; /* empty unquoted name not allowed */
3894 while (scanner_isspace(*nextp))
3895 nextp++; /* skip trailing whitespace */
3897 if (*nextp == separator)
3899 nextp++;
3900 while (scanner_isspace(*nextp))
3901 nextp++; /* skip leading whitespace for next */
3902 /* we expect another name, so done remains false */
3904 else if (*nextp == '\0')
3905 done = true;
3906 else
3907 return false; /* invalid syntax */
3909 /* Now safe to overwrite separator with a null */
3910 *endp = '\0';
3912 /* Truncate path if it's overlength */
3913 if (strlen(curname) >= MAXPGPATH)
3914 curname[MAXPGPATH - 1] = '\0';
3917 * Finished isolating current name --- add it to list
3919 curname = pstrdup(curname);
3920 canonicalize_path(curname);
3921 *namelist = lappend(*namelist, curname);
3923 /* Loop back if we didn't reach end of string */
3924 } while (!done);
3926 return true;
3931 * SplitGUCList --- parse a string containing identifiers or file names
3933 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3934 * presuming whether the elements will be taken as identifiers or file names.
3935 * We assume the input has already been through flatten_set_variable_args(),
3936 * so that we need never downcase (if appropriate, that was done already).
3937 * Nor do we ever truncate, since we don't know the correct max length.
3938 * We disallow embedded whitespace for simplicity (it shouldn't matter,
3939 * because any embedded whitespace should have led to double-quoting).
3940 * Otherwise the API is identical to SplitIdentifierString.
3942 * XXX it's annoying to have so many copies of this string-splitting logic.
3943 * However, it's not clear that having one function with a bunch of option
3944 * flags would be much better.
3946 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3947 * Be sure to update that if you have to change this.
3949 * Inputs:
3950 * rawstring: the input string; must be overwritable! On return, it's
3951 * been modified to contain the separated identifiers.
3952 * separator: the separator punctuation expected between identifiers
3953 * (typically '.' or ','). Whitespace may also appear around
3954 * identifiers.
3955 * Outputs:
3956 * namelist: filled with a palloc'd list of pointers to identifiers within
3957 * rawstring. Caller should list_free() this even on error return.
3959 * Returns true if okay, false if there is a syntax error in the string.
3961 bool
3962 SplitGUCList(char *rawstring, char separator,
3963 List **namelist)
3965 char *nextp = rawstring;
3966 bool done = false;
3968 *namelist = NIL;
3970 while (scanner_isspace(*nextp))
3971 nextp++; /* skip leading whitespace */
3973 if (*nextp == '\0')
3974 return true; /* allow empty string */
3976 /* At the top of the loop, we are at start of a new identifier. */
3979 char *curname;
3980 char *endp;
3982 if (*nextp == '"')
3984 /* Quoted name --- collapse quote-quote pairs */
3985 curname = nextp + 1;
3986 for (;;)
3988 endp = strchr(nextp + 1, '"');
3989 if (endp == NULL)
3990 return false; /* mismatched quotes */
3991 if (endp[1] != '"')
3992 break; /* found end of quoted name */
3993 /* Collapse adjacent quotes into one quote, and look again */
3994 memmove(endp, endp + 1, strlen(endp));
3995 nextp = endp;
3997 /* endp now points at the terminating quote */
3998 nextp = endp + 1;
4000 else
4002 /* Unquoted name --- extends to separator or whitespace */
4003 curname = nextp;
4004 while (*nextp && *nextp != separator &&
4005 !scanner_isspace(*nextp))
4006 nextp++;
4007 endp = nextp;
4008 if (curname == nextp)
4009 return false; /* empty unquoted name not allowed */
4012 while (scanner_isspace(*nextp))
4013 nextp++; /* skip trailing whitespace */
4015 if (*nextp == separator)
4017 nextp++;
4018 while (scanner_isspace(*nextp))
4019 nextp++; /* skip leading whitespace for next */
4020 /* we expect another name, so done remains false */
4022 else if (*nextp == '\0')
4023 done = true;
4024 else
4025 return false; /* invalid syntax */
4027 /* Now safe to overwrite separator with a null */
4028 *endp = '\0';
4031 * Finished isolating current name --- add it to list
4033 *namelist = lappend(*namelist, curname);
4035 /* Loop back if we didn't reach end of string */
4036 } while (!done);
4038 return true;
4042 /*****************************************************************************
4043 * Comparison Functions used for bytea
4045 * Note: btree indexes need these routines not to leak memory; therefore,
4046 * be careful to free working copies of toasted datums. Most places don't
4047 * need to be so careful.
4048 *****************************************************************************/
4050 Datum
4051 byteaeq(PG_FUNCTION_ARGS)
4053 Datum arg1 = PG_GETARG_DATUM(0);
4054 Datum arg2 = PG_GETARG_DATUM(1);
4055 bool result;
4056 Size len1,
4057 len2;
4060 * We can use a fast path for unequal lengths, which might save us from
4061 * having to detoast one or both values.
4063 len1 = toast_raw_datum_size(arg1);
4064 len2 = toast_raw_datum_size(arg2);
4065 if (len1 != len2)
4066 result = false;
4067 else
4069 bytea *barg1 = DatumGetByteaPP(arg1);
4070 bytea *barg2 = DatumGetByteaPP(arg2);
4072 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4073 len1 - VARHDRSZ) == 0);
4075 PG_FREE_IF_COPY(barg1, 0);
4076 PG_FREE_IF_COPY(barg2, 1);
4079 PG_RETURN_BOOL(result);
4082 Datum
4083 byteane(PG_FUNCTION_ARGS)
4085 Datum arg1 = PG_GETARG_DATUM(0);
4086 Datum arg2 = PG_GETARG_DATUM(1);
4087 bool result;
4088 Size len1,
4089 len2;
4092 * We can use a fast path for unequal lengths, which might save us from
4093 * having to detoast one or both values.
4095 len1 = toast_raw_datum_size(arg1);
4096 len2 = toast_raw_datum_size(arg2);
4097 if (len1 != len2)
4098 result = true;
4099 else
4101 bytea *barg1 = DatumGetByteaPP(arg1);
4102 bytea *barg2 = DatumGetByteaPP(arg2);
4104 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4105 len1 - VARHDRSZ) != 0);
4107 PG_FREE_IF_COPY(barg1, 0);
4108 PG_FREE_IF_COPY(barg2, 1);
4111 PG_RETURN_BOOL(result);
4114 Datum
4115 bytealt(PG_FUNCTION_ARGS)
4117 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4118 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4119 int len1,
4120 len2;
4121 int cmp;
4123 len1 = VARSIZE_ANY_EXHDR(arg1);
4124 len2 = VARSIZE_ANY_EXHDR(arg2);
4126 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4128 PG_FREE_IF_COPY(arg1, 0);
4129 PG_FREE_IF_COPY(arg2, 1);
4131 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4134 Datum
4135 byteale(PG_FUNCTION_ARGS)
4137 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4138 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4139 int len1,
4140 len2;
4141 int cmp;
4143 len1 = VARSIZE_ANY_EXHDR(arg1);
4144 len2 = VARSIZE_ANY_EXHDR(arg2);
4146 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4148 PG_FREE_IF_COPY(arg1, 0);
4149 PG_FREE_IF_COPY(arg2, 1);
4151 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4154 Datum
4155 byteagt(PG_FUNCTION_ARGS)
4157 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4158 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4159 int len1,
4160 len2;
4161 int cmp;
4163 len1 = VARSIZE_ANY_EXHDR(arg1);
4164 len2 = VARSIZE_ANY_EXHDR(arg2);
4166 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4168 PG_FREE_IF_COPY(arg1, 0);
4169 PG_FREE_IF_COPY(arg2, 1);
4171 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4174 Datum
4175 byteage(PG_FUNCTION_ARGS)
4177 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4178 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4179 int len1,
4180 len2;
4181 int cmp;
4183 len1 = VARSIZE_ANY_EXHDR(arg1);
4184 len2 = VARSIZE_ANY_EXHDR(arg2);
4186 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4188 PG_FREE_IF_COPY(arg1, 0);
4189 PG_FREE_IF_COPY(arg2, 1);
4191 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4194 Datum
4195 byteacmp(PG_FUNCTION_ARGS)
4197 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4198 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4199 int len1,
4200 len2;
4201 int cmp;
4203 len1 = VARSIZE_ANY_EXHDR(arg1);
4204 len2 = VARSIZE_ANY_EXHDR(arg2);
4206 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4207 if ((cmp == 0) && (len1 != len2))
4208 cmp = (len1 < len2) ? -1 : 1;
4210 PG_FREE_IF_COPY(arg1, 0);
4211 PG_FREE_IF_COPY(arg2, 1);
4213 PG_RETURN_INT32(cmp);
4216 Datum
4217 bytea_sortsupport(PG_FUNCTION_ARGS)
4219 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4220 MemoryContext oldcontext;
4222 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4224 /* Use generic string SortSupport, forcing "C" collation */
4225 varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4227 MemoryContextSwitchTo(oldcontext);
4229 PG_RETURN_VOID();
4233 * appendStringInfoText
4235 * Append a text to str.
4236 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4238 static void
4239 appendStringInfoText(StringInfo str, const text *t)
4241 appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4245 * replace_text
4246 * replace all occurrences of 'old_sub_str' in 'orig_str'
4247 * with 'new_sub_str' to form 'new_str'
4249 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4250 * otherwise returns 'new_str'
4252 Datum
4253 replace_text(PG_FUNCTION_ARGS)
4255 text *src_text = PG_GETARG_TEXT_PP(0);
4256 text *from_sub_text = PG_GETARG_TEXT_PP(1);
4257 text *to_sub_text = PG_GETARG_TEXT_PP(2);
4258 int src_text_len;
4259 int from_sub_text_len;
4260 TextPositionState state;
4261 text *ret_text;
4262 int chunk_len;
4263 char *curr_ptr;
4264 char *start_ptr;
4265 StringInfoData str;
4266 bool found;
4268 src_text_len = VARSIZE_ANY_EXHDR(src_text);
4269 from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4271 /* Return unmodified source string if empty source or pattern */
4272 if (src_text_len < 1 || from_sub_text_len < 1)
4274 PG_RETURN_TEXT_P(src_text);
4277 text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4279 found = text_position_next(&state);
4281 /* When the from_sub_text is not found, there is nothing to do. */
4282 if (!found)
4284 text_position_cleanup(&state);
4285 PG_RETURN_TEXT_P(src_text);
4287 curr_ptr = text_position_get_match_ptr(&state);
4288 start_ptr = VARDATA_ANY(src_text);
4290 initStringInfo(&str);
4294 CHECK_FOR_INTERRUPTS();
4296 /* copy the data skipped over by last text_position_next() */
4297 chunk_len = curr_ptr - start_ptr;
4298 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4300 appendStringInfoText(&str, to_sub_text);
4302 start_ptr = curr_ptr + from_sub_text_len;
4304 found = text_position_next(&state);
4305 if (found)
4306 curr_ptr = text_position_get_match_ptr(&state);
4308 while (found);
4310 /* copy trailing data */
4311 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4312 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4314 text_position_cleanup(&state);
4316 ret_text = cstring_to_text_with_len(str.data, str.len);
4317 pfree(str.data);
4319 PG_RETURN_TEXT_P(ret_text);
4323 * check_replace_text_has_escape
4325 * Returns 0 if text contains no backslashes that need processing.
4326 * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4327 * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4329 static int
4330 check_replace_text_has_escape(const text *replace_text)
4332 int result = 0;
4333 const char *p = VARDATA_ANY(replace_text);
4334 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4336 while (p < p_end)
4338 /* Find next escape char, if any. */
4339 p = memchr(p, '\\', p_end - p);
4340 if (p == NULL)
4341 break;
4342 p++;
4343 /* Note: a backslash at the end doesn't require extra processing. */
4344 if (p < p_end)
4346 if (*p >= '1' && *p <= '9')
4347 return 2; /* Found a submatch specifier, so done */
4348 result = 1; /* Found some other sequence, keep looking */
4349 p++;
4352 return result;
4356 * appendStringInfoRegexpSubstr
4358 * Append replace_text to str, substituting regexp back references for
4359 * \n escapes. start_ptr is the start of the match in the source string,
4360 * at logical character position data_pos.
4362 static void
4363 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4364 regmatch_t *pmatch,
4365 char *start_ptr, int data_pos)
4367 const char *p = VARDATA_ANY(replace_text);
4368 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4370 while (p < p_end)
4372 const char *chunk_start = p;
4373 int so;
4374 int eo;
4376 /* Find next escape char, if any. */
4377 p = memchr(p, '\\', p_end - p);
4378 if (p == NULL)
4379 p = p_end;
4381 /* Copy the text we just scanned over, if any. */
4382 if (p > chunk_start)
4383 appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4385 /* Done if at end of string, else advance over escape char. */
4386 if (p >= p_end)
4387 break;
4388 p++;
4390 if (p >= p_end)
4392 /* Escape at very end of input. Treat same as unexpected char */
4393 appendStringInfoChar(str, '\\');
4394 break;
4397 if (*p >= '1' && *p <= '9')
4399 /* Use the back reference of regexp. */
4400 int idx = *p - '0';
4402 so = pmatch[idx].rm_so;
4403 eo = pmatch[idx].rm_eo;
4404 p++;
4406 else if (*p == '&')
4408 /* Use the entire matched string. */
4409 so = pmatch[0].rm_so;
4410 eo = pmatch[0].rm_eo;
4411 p++;
4413 else if (*p == '\\')
4415 /* \\ means transfer one \ to output. */
4416 appendStringInfoChar(str, '\\');
4417 p++;
4418 continue;
4420 else
4423 * If escape char is not followed by any expected char, just treat
4424 * it as ordinary data to copy. (XXX would it be better to throw
4425 * an error?)
4427 appendStringInfoChar(str, '\\');
4428 continue;
4431 if (so >= 0 && eo >= 0)
4434 * Copy the text that is back reference of regexp. Note so and eo
4435 * are counted in characters not bytes.
4437 char *chunk_start;
4438 int chunk_len;
4440 Assert(so >= data_pos);
4441 chunk_start = start_ptr;
4442 chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4443 chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4444 appendBinaryStringInfo(str, chunk_start, chunk_len);
4450 * replace_text_regexp
4452 * replace substring(s) in src_text that match pattern with replace_text.
4453 * The replace_text can contain backslash markers to substitute
4454 * (parts of) the matched text.
4456 * cflags: regexp compile flags.
4457 * collation: collation to use.
4458 * search_start: the character (not byte) offset in src_text at which to
4459 * begin searching.
4460 * n: if 0, replace all matches; if > 0, replace only the N'th match.
4462 text *
4463 replace_text_regexp(text *src_text, text *pattern_text,
4464 text *replace_text,
4465 int cflags, Oid collation,
4466 int search_start, int n)
4468 text *ret_text;
4469 regex_t *re;
4470 int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4471 int nmatches = 0;
4472 StringInfoData buf;
4473 regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4474 int nmatch = lengthof(pmatch);
4475 pg_wchar *data;
4476 size_t data_len;
4477 int data_pos;
4478 char *start_ptr;
4479 int escape_status;
4481 initStringInfo(&buf);
4483 /* Convert data string to wide characters. */
4484 data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4485 data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4487 /* Check whether replace_text has escapes, especially regexp submatches. */
4488 escape_status = check_replace_text_has_escape(replace_text);
4490 /* If no regexp submatches, we can use REG_NOSUB. */
4491 if (escape_status < 2)
4493 cflags |= REG_NOSUB;
4494 /* Also tell pg_regexec we only want the whole-match location. */
4495 nmatch = 1;
4498 /* Prepare the regexp. */
4499 re = RE_compile_and_cache(pattern_text, cflags, collation);
4501 /* start_ptr points to the data_pos'th character of src_text */
4502 start_ptr = (char *) VARDATA_ANY(src_text);
4503 data_pos = 0;
4505 while (search_start <= data_len)
4507 int regexec_result;
4509 CHECK_FOR_INTERRUPTS();
4511 regexec_result = pg_regexec(re,
4512 data,
4513 data_len,
4514 search_start,
4515 NULL, /* no details */
4516 nmatch,
4517 pmatch,
4520 if (regexec_result == REG_NOMATCH)
4521 break;
4523 if (regexec_result != REG_OKAY)
4525 char errMsg[100];
4527 CHECK_FOR_INTERRUPTS();
4528 pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4529 ereport(ERROR,
4530 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4531 errmsg("regular expression failed: %s", errMsg)));
4535 * Count matches, and decide whether to replace this match.
4537 nmatches++;
4538 if (n > 0 && nmatches != n)
4541 * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4542 * we treat the matched text as if it weren't matched, and copy it
4543 * to the output later.)
4545 search_start = pmatch[0].rm_eo;
4546 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4547 search_start++;
4548 continue;
4552 * Copy the text to the left of the match position. Note we are given
4553 * character not byte indexes.
4555 if (pmatch[0].rm_so - data_pos > 0)
4557 int chunk_len;
4559 chunk_len = charlen_to_bytelen(start_ptr,
4560 pmatch[0].rm_so - data_pos);
4561 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4564 * Advance start_ptr over that text, to avoid multiple rescans of
4565 * it if the replace_text contains multiple back-references.
4567 start_ptr += chunk_len;
4568 data_pos = pmatch[0].rm_so;
4572 * Copy the replace_text, processing escapes if any are present.
4574 if (escape_status > 0)
4575 appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4576 start_ptr, data_pos);
4577 else
4578 appendStringInfoText(&buf, replace_text);
4580 /* Advance start_ptr and data_pos over the matched text. */
4581 start_ptr += charlen_to_bytelen(start_ptr,
4582 pmatch[0].rm_eo - data_pos);
4583 data_pos = pmatch[0].rm_eo;
4586 * If we only want to replace one occurrence, we're done.
4588 if (n > 0)
4589 break;
4592 * Advance search position. Normally we start the next search at the
4593 * end of the previous match; but if the match was of zero length, we
4594 * have to advance by one character, or we'd just find the same match
4595 * again.
4597 search_start = data_pos;
4598 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4599 search_start++;
4603 * Copy the text to the right of the last match.
4605 if (data_pos < data_len)
4607 int chunk_len;
4609 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4610 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4613 ret_text = cstring_to_text_with_len(buf.data, buf.len);
4614 pfree(buf.data);
4615 pfree(data);
4617 return ret_text;
4621 * split_part
4622 * parse input string based on provided field separator
4623 * return N'th item (1 based, negative counts from end)
4625 Datum
4626 split_part(PG_FUNCTION_ARGS)
4628 text *inputstring = PG_GETARG_TEXT_PP(0);
4629 text *fldsep = PG_GETARG_TEXT_PP(1);
4630 int fldnum = PG_GETARG_INT32(2);
4631 int inputstring_len;
4632 int fldsep_len;
4633 TextPositionState state;
4634 char *start_ptr;
4635 char *end_ptr;
4636 text *result_text;
4637 bool found;
4639 /* field number is 1 based */
4640 if (fldnum == 0)
4641 ereport(ERROR,
4642 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4643 errmsg("field position must not be zero")));
4645 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4646 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4648 /* return empty string for empty input string */
4649 if (inputstring_len < 1)
4650 PG_RETURN_TEXT_P(cstring_to_text(""));
4652 /* handle empty field separator */
4653 if (fldsep_len < 1)
4655 /* if first or last field, return input string, else empty string */
4656 if (fldnum == 1 || fldnum == -1)
4657 PG_RETURN_TEXT_P(inputstring);
4658 else
4659 PG_RETURN_TEXT_P(cstring_to_text(""));
4662 /* find the first field separator */
4663 text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4665 found = text_position_next(&state);
4667 /* special case if fldsep not found at all */
4668 if (!found)
4670 text_position_cleanup(&state);
4671 /* if first or last field, return input string, else empty string */
4672 if (fldnum == 1 || fldnum == -1)
4673 PG_RETURN_TEXT_P(inputstring);
4674 else
4675 PG_RETURN_TEXT_P(cstring_to_text(""));
4679 * take care of a negative field number (i.e. count from the right) by
4680 * converting to a positive field number; we need total number of fields
4682 if (fldnum < 0)
4684 /* we found a fldsep, so there are at least two fields */
4685 int numfields = 2;
4687 while (text_position_next(&state))
4688 numfields++;
4690 /* special case of last field does not require an extra pass */
4691 if (fldnum == -1)
4693 start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4694 end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4695 text_position_cleanup(&state);
4696 PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4697 end_ptr - start_ptr));
4700 /* else, convert fldnum to positive notation */
4701 fldnum += numfields + 1;
4703 /* if nonexistent field, return empty string */
4704 if (fldnum <= 0)
4706 text_position_cleanup(&state);
4707 PG_RETURN_TEXT_P(cstring_to_text(""));
4710 /* reset to pointing at first match, but now with positive fldnum */
4711 text_position_reset(&state);
4712 found = text_position_next(&state);
4713 Assert(found);
4716 /* identify bounds of first field */
4717 start_ptr = VARDATA_ANY(inputstring);
4718 end_ptr = text_position_get_match_ptr(&state);
4720 while (found && --fldnum > 0)
4722 /* identify bounds of next field */
4723 start_ptr = end_ptr + fldsep_len;
4724 found = text_position_next(&state);
4725 if (found)
4726 end_ptr = text_position_get_match_ptr(&state);
4729 text_position_cleanup(&state);
4731 if (fldnum > 0)
4733 /* N'th field separator not found */
4734 /* if last field requested, return it, else empty string */
4735 if (fldnum == 1)
4737 int last_len = start_ptr - VARDATA_ANY(inputstring);
4739 result_text = cstring_to_text_with_len(start_ptr,
4740 inputstring_len - last_len);
4742 else
4743 result_text = cstring_to_text("");
4745 else
4747 /* non-last field requested */
4748 result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4751 PG_RETURN_TEXT_P(result_text);
4755 * Convenience function to return true when two text params are equal.
4757 static bool
4758 text_isequal(text *txt1, text *txt2, Oid collid)
4760 return DatumGetBool(DirectFunctionCall2Coll(texteq,
4761 collid,
4762 PointerGetDatum(txt1),
4763 PointerGetDatum(txt2)));
4767 * text_to_array
4768 * parse input string and return text array of elements,
4769 * based on provided field separator
4771 Datum
4772 text_to_array(PG_FUNCTION_ARGS)
4774 SplitTextOutputData tstate;
4776 /* For array output, tstate should start as all zeroes */
4777 memset(&tstate, 0, sizeof(tstate));
4779 if (!split_text(fcinfo, &tstate))
4780 PG_RETURN_NULL();
4782 if (tstate.astate == NULL)
4783 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4785 PG_RETURN_DATUM(makeArrayResult(tstate.astate,
4786 CurrentMemoryContext));
4790 * text_to_array_null
4791 * parse input string and return text array of elements,
4792 * based on provided field separator and null string
4794 * This is a separate entry point only to prevent the regression tests from
4795 * complaining about different argument sets for the same internal function.
4797 Datum
4798 text_to_array_null(PG_FUNCTION_ARGS)
4800 return text_to_array(fcinfo);
4804 * text_to_table
4805 * parse input string and return table of elements,
4806 * based on provided field separator
4808 Datum
4809 text_to_table(PG_FUNCTION_ARGS)
4811 ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4812 SplitTextOutputData tstate;
4814 tstate.astate = NULL;
4815 InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
4816 tstate.tupstore = rsi->setResult;
4817 tstate.tupdesc = rsi->setDesc;
4819 (void) split_text(fcinfo, &tstate);
4821 return (Datum) 0;
4825 * text_to_table_null
4826 * parse input string and return table of elements,
4827 * based on provided field separator and null string
4829 * This is a separate entry point only to prevent the regression tests from
4830 * complaining about different argument sets for the same internal function.
4832 Datum
4833 text_to_table_null(PG_FUNCTION_ARGS)
4835 return text_to_table(fcinfo);
4839 * Common code for text_to_array, text_to_array_null, text_to_table
4840 * and text_to_table_null functions.
4842 * These are not strict so we have to test for null inputs explicitly.
4843 * Returns false if result is to be null, else returns true.
4845 * Note that if the result is valid but empty (zero elements), we return
4846 * without changing *tstate --- caller must handle that case, too.
4848 static bool
4849 split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4851 text *inputstring;
4852 text *fldsep;
4853 text *null_string;
4854 Oid collation = PG_GET_COLLATION();
4855 int inputstring_len;
4856 int fldsep_len;
4857 char *start_ptr;
4858 text *result_text;
4860 /* when input string is NULL, then result is NULL too */
4861 if (PG_ARGISNULL(0))
4862 return false;
4864 inputstring = PG_GETARG_TEXT_PP(0);
4866 /* fldsep can be NULL */
4867 if (!PG_ARGISNULL(1))
4868 fldsep = PG_GETARG_TEXT_PP(1);
4869 else
4870 fldsep = NULL;
4872 /* null_string can be NULL or omitted */
4873 if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4874 null_string = PG_GETARG_TEXT_PP(2);
4875 else
4876 null_string = NULL;
4878 if (fldsep != NULL)
4881 * Normal case with non-null fldsep. Use the text_position machinery
4882 * to search for occurrences of fldsep.
4884 TextPositionState state;
4886 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4887 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4889 /* return empty set for empty input string */
4890 if (inputstring_len < 1)
4891 return true;
4893 /* empty field separator: return input string as a one-element set */
4894 if (fldsep_len < 1)
4896 split_text_accum_result(tstate, inputstring,
4897 null_string, collation);
4898 return true;
4901 text_position_setup(inputstring, fldsep, collation, &state);
4903 start_ptr = VARDATA_ANY(inputstring);
4905 for (;;)
4907 bool found;
4908 char *end_ptr;
4909 int chunk_len;
4911 CHECK_FOR_INTERRUPTS();
4913 found = text_position_next(&state);
4914 if (!found)
4916 /* fetch last field */
4917 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4918 end_ptr = NULL; /* not used, but some compilers complain */
4920 else
4922 /* fetch non-last field */
4923 end_ptr = text_position_get_match_ptr(&state);
4924 chunk_len = end_ptr - start_ptr;
4927 /* build a temp text datum to pass to split_text_accum_result */
4928 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4930 /* stash away this field */
4931 split_text_accum_result(tstate, result_text,
4932 null_string, collation);
4934 pfree(result_text);
4936 if (!found)
4937 break;
4939 start_ptr = end_ptr + fldsep_len;
4942 text_position_cleanup(&state);
4944 else
4947 * When fldsep is NULL, each character in the input string becomes a
4948 * separate element in the result set. The separator is effectively
4949 * the space between characters.
4951 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4953 start_ptr = VARDATA_ANY(inputstring);
4955 while (inputstring_len > 0)
4957 int chunk_len = pg_mblen(start_ptr);
4959 CHECK_FOR_INTERRUPTS();
4961 /* build a temp text datum to pass to split_text_accum_result */
4962 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4964 /* stash away this field */
4965 split_text_accum_result(tstate, result_text,
4966 null_string, collation);
4968 pfree(result_text);
4970 start_ptr += chunk_len;
4971 inputstring_len -= chunk_len;
4975 return true;
4979 * Add text item to result set (table or array).
4981 * This is also responsible for checking to see if the item matches
4982 * the null_string, in which case we should emit NULL instead.
4984 static void
4985 split_text_accum_result(SplitTextOutputData *tstate,
4986 text *field_value,
4987 text *null_string,
4988 Oid collation)
4990 bool is_null = false;
4992 if (null_string && text_isequal(field_value, null_string, collation))
4993 is_null = true;
4995 if (tstate->tupstore)
4997 Datum values[1];
4998 bool nulls[1];
5000 values[0] = PointerGetDatum(field_value);
5001 nulls[0] = is_null;
5003 tuplestore_putvalues(tstate->tupstore,
5004 tstate->tupdesc,
5005 values,
5006 nulls);
5008 else
5010 tstate->astate = accumArrayResult(tstate->astate,
5011 PointerGetDatum(field_value),
5012 is_null,
5013 TEXTOID,
5014 CurrentMemoryContext);
5019 * array_to_text
5020 * concatenate Cstring representation of input array elements
5021 * using provided field separator
5023 Datum
5024 array_to_text(PG_FUNCTION_ARGS)
5026 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
5027 char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5029 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5033 * array_to_text_null
5034 * concatenate Cstring representation of input array elements
5035 * using provided field separator and null string
5037 * This version is not strict so we have to test for null inputs explicitly.
5039 Datum
5040 array_to_text_null(PG_FUNCTION_ARGS)
5042 ArrayType *v;
5043 char *fldsep;
5044 char *null_string;
5046 /* returns NULL when first or second parameter is NULL */
5047 if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5048 PG_RETURN_NULL();
5050 v = PG_GETARG_ARRAYTYPE_P(0);
5051 fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5053 /* NULL null string is passed through as a null pointer */
5054 if (!PG_ARGISNULL(2))
5055 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5056 else
5057 null_string = NULL;
5059 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5063 * common code for array_to_text and array_to_text_null functions
5065 static text *
5066 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
5067 const char *fldsep, const char *null_string)
5069 text *result;
5070 int nitems,
5071 *dims,
5072 ndims;
5073 Oid element_type;
5074 int typlen;
5075 bool typbyval;
5076 char typalign;
5077 StringInfoData buf;
5078 bool printed = false;
5079 char *p;
5080 bits8 *bitmap;
5081 int bitmask;
5082 int i;
5083 ArrayMetaState *my_extra;
5085 ndims = ARR_NDIM(v);
5086 dims = ARR_DIMS(v);
5087 nitems = ArrayGetNItems(ndims, dims);
5089 /* if there are no elements, return an empty string */
5090 if (nitems == 0)
5091 return cstring_to_text_with_len("", 0);
5093 element_type = ARR_ELEMTYPE(v);
5094 initStringInfo(&buf);
5097 * We arrange to look up info about element type, including its output
5098 * conversion proc, only once per series of calls, assuming the element
5099 * type doesn't change underneath us.
5101 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5102 if (my_extra == NULL)
5104 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5105 sizeof(ArrayMetaState));
5106 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5107 my_extra->element_type = ~element_type;
5110 if (my_extra->element_type != element_type)
5113 * Get info about element type, including its output conversion proc
5115 get_type_io_data(element_type, IOFunc_output,
5116 &my_extra->typlen, &my_extra->typbyval,
5117 &my_extra->typalign, &my_extra->typdelim,
5118 &my_extra->typioparam, &my_extra->typiofunc);
5119 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5120 fcinfo->flinfo->fn_mcxt);
5121 my_extra->element_type = element_type;
5123 typlen = my_extra->typlen;
5124 typbyval = my_extra->typbyval;
5125 typalign = my_extra->typalign;
5127 p = ARR_DATA_PTR(v);
5128 bitmap = ARR_NULLBITMAP(v);
5129 bitmask = 1;
5131 for (i = 0; i < nitems; i++)
5133 Datum itemvalue;
5134 char *value;
5136 /* Get source element, checking for NULL */
5137 if (bitmap && (*bitmap & bitmask) == 0)
5139 /* if null_string is NULL, we just ignore null elements */
5140 if (null_string != NULL)
5142 if (printed)
5143 appendStringInfo(&buf, "%s%s", fldsep, null_string);
5144 else
5145 appendStringInfoString(&buf, null_string);
5146 printed = true;
5149 else
5151 itemvalue = fetch_att(p, typbyval, typlen);
5153 value = OutputFunctionCall(&my_extra->proc, itemvalue);
5155 if (printed)
5156 appendStringInfo(&buf, "%s%s", fldsep, value);
5157 else
5158 appendStringInfoString(&buf, value);
5159 printed = true;
5161 p = att_addlength_pointer(p, typlen, p);
5162 p = (char *) att_align_nominal(p, typalign);
5165 /* advance bitmap pointer if any */
5166 if (bitmap)
5168 bitmask <<= 1;
5169 if (bitmask == 0x100)
5171 bitmap++;
5172 bitmask = 1;
5177 result = cstring_to_text_with_len(buf.data, buf.len);
5178 pfree(buf.data);
5180 return result;
5183 #define HEXBASE 16
5185 * Convert an int32 to a string containing a base 16 (hex) representation of
5186 * the number.
5188 Datum
5189 to_hex32(PG_FUNCTION_ARGS)
5191 uint32 value = (uint32) PG_GETARG_INT32(0);
5192 char *ptr;
5193 const char *digits = "0123456789abcdef";
5194 char buf[32]; /* bigger than needed, but reasonable */
5196 ptr = buf + sizeof(buf) - 1;
5197 *ptr = '\0';
5201 *--ptr = digits[value % HEXBASE];
5202 value /= HEXBASE;
5203 } while (ptr > buf && value);
5205 PG_RETURN_TEXT_P(cstring_to_text(ptr));
5209 * Convert an int64 to a string containing a base 16 (hex) representation of
5210 * the number.
5212 Datum
5213 to_hex64(PG_FUNCTION_ARGS)
5215 uint64 value = (uint64) PG_GETARG_INT64(0);
5216 char *ptr;
5217 const char *digits = "0123456789abcdef";
5218 char buf[32]; /* bigger than needed, but reasonable */
5220 ptr = buf + sizeof(buf) - 1;
5221 *ptr = '\0';
5225 *--ptr = digits[value % HEXBASE];
5226 value /= HEXBASE;
5227 } while (ptr > buf && value);
5229 PG_RETURN_TEXT_P(cstring_to_text(ptr));
5233 * Return the size of a datum, possibly compressed
5235 * Works on any data type
5237 Datum
5238 pg_column_size(PG_FUNCTION_ARGS)
5240 Datum value = PG_GETARG_DATUM(0);
5241 int32 result;
5242 int typlen;
5244 /* On first call, get the input type's typlen, and save at *fn_extra */
5245 if (fcinfo->flinfo->fn_extra == NULL)
5247 /* Lookup the datatype of the supplied argument */
5248 Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5250 typlen = get_typlen(argtypeid);
5251 if (typlen == 0) /* should not happen */
5252 elog(ERROR, "cache lookup failed for type %u", argtypeid);
5254 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5255 sizeof(int));
5256 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5258 else
5259 typlen = *((int *) fcinfo->flinfo->fn_extra);
5261 if (typlen == -1)
5263 /* varlena type, possibly toasted */
5264 result = toast_datum_size(value);
5266 else if (typlen == -2)
5268 /* cstring */
5269 result = strlen(DatumGetCString(value)) + 1;
5271 else
5273 /* ordinary fixed-width type */
5274 result = typlen;
5277 PG_RETURN_INT32(result);
5281 * Return the compression method stored in the compressed attribute. Return
5282 * NULL for non varlena type or uncompressed data.
5284 Datum
5285 pg_column_compression(PG_FUNCTION_ARGS)
5287 int typlen;
5288 char *result;
5289 ToastCompressionId cmid;
5291 /* On first call, get the input type's typlen, and save at *fn_extra */
5292 if (fcinfo->flinfo->fn_extra == NULL)
5294 /* Lookup the datatype of the supplied argument */
5295 Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5297 typlen = get_typlen(argtypeid);
5298 if (typlen == 0) /* should not happen */
5299 elog(ERROR, "cache lookup failed for type %u", argtypeid);
5301 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5302 sizeof(int));
5303 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5305 else
5306 typlen = *((int *) fcinfo->flinfo->fn_extra);
5308 if (typlen != -1)
5309 PG_RETURN_NULL();
5311 /* get the compression method id stored in the compressed varlena */
5312 cmid = toast_get_compression_id((struct varlena *)
5313 DatumGetPointer(PG_GETARG_DATUM(0)));
5314 if (cmid == TOAST_INVALID_COMPRESSION_ID)
5315 PG_RETURN_NULL();
5317 /* convert compression method id to compression method name */
5318 switch (cmid)
5320 case TOAST_PGLZ_COMPRESSION_ID:
5321 result = "pglz";
5322 break;
5323 case TOAST_LZ4_COMPRESSION_ID:
5324 result = "lz4";
5325 break;
5326 default:
5327 elog(ERROR, "invalid compression method id %d", cmid);
5330 PG_RETURN_TEXT_P(cstring_to_text(result));
5334 * string_agg - Concatenates values and returns string.
5336 * Syntax: string_agg(value text, delimiter text) RETURNS text
5338 * Note: Any NULL values are ignored. The first-call delimiter isn't
5339 * actually used at all, and on subsequent calls the delimiter precedes
5340 * the associated value.
5343 /* subroutine to initialize state */
5344 static StringInfo
5345 makeStringAggState(FunctionCallInfo fcinfo)
5347 StringInfo state;
5348 MemoryContext aggcontext;
5349 MemoryContext oldcontext;
5351 if (!AggCheckCallContext(fcinfo, &aggcontext))
5353 /* cannot be called directly because of internal-type argument */
5354 elog(ERROR, "string_agg_transfn called in non-aggregate context");
5358 * Create state in aggregate context. It'll stay there across subsequent
5359 * calls.
5361 oldcontext = MemoryContextSwitchTo(aggcontext);
5362 state = makeStringInfo();
5363 MemoryContextSwitchTo(oldcontext);
5365 return state;
5368 Datum
5369 string_agg_transfn(PG_FUNCTION_ARGS)
5371 StringInfo state;
5373 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5375 /* Append the value unless null. */
5376 if (!PG_ARGISNULL(1))
5378 /* On the first time through, we ignore the delimiter. */
5379 if (state == NULL)
5380 state = makeStringAggState(fcinfo);
5381 else if (!PG_ARGISNULL(2))
5382 appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5384 appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5388 * The transition type for string_agg() is declared to be "internal",
5389 * which is a pass-by-value type the same size as a pointer.
5391 PG_RETURN_POINTER(state);
5394 Datum
5395 string_agg_finalfn(PG_FUNCTION_ARGS)
5397 StringInfo state;
5399 /* cannot be called directly because of internal-type argument */
5400 Assert(AggCheckCallContext(fcinfo, NULL));
5402 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5404 if (state != NULL)
5405 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5406 else
5407 PG_RETURN_NULL();
5411 * Prepare cache with fmgr info for the output functions of the datatypes of
5412 * the arguments of a concat-like function, beginning with argument "argidx".
5413 * (Arguments before that will have corresponding slots in the resulting
5414 * FmgrInfo array, but we don't fill those slots.)
5416 static FmgrInfo *
5417 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5419 FmgrInfo *foutcache;
5420 int i;
5422 /* We keep the info in fn_mcxt so it survives across calls */
5423 foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5424 PG_NARGS() * sizeof(FmgrInfo));
5426 for (i = argidx; i < PG_NARGS(); i++)
5428 Oid valtype;
5429 Oid typOutput;
5430 bool typIsVarlena;
5432 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5433 if (!OidIsValid(valtype))
5434 elog(ERROR, "could not determine data type of concat() input");
5436 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5437 fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5440 fcinfo->flinfo->fn_extra = foutcache;
5442 return foutcache;
5446 * Implementation of both concat() and concat_ws().
5448 * sepstr is the separator string to place between values.
5449 * argidx identifies the first argument to concatenate (counting from zero);
5450 * note that this must be constant across any one series of calls.
5452 * Returns NULL if result should be NULL, else text value.
5454 static text *
5455 concat_internal(const char *sepstr, int argidx,
5456 FunctionCallInfo fcinfo)
5458 text *result;
5459 StringInfoData str;
5460 FmgrInfo *foutcache;
5461 bool first_arg = true;
5462 int i;
5465 * concat(VARIADIC some-array) is essentially equivalent to
5466 * array_to_text(), ie concat the array elements with the given separator.
5467 * So we just pass the case off to that code.
5469 if (get_fn_expr_variadic(fcinfo->flinfo))
5471 ArrayType *arr;
5473 /* Should have just the one argument */
5474 Assert(argidx == PG_NARGS() - 1);
5476 /* concat(VARIADIC NULL) is defined as NULL */
5477 if (PG_ARGISNULL(argidx))
5478 return NULL;
5481 * Non-null argument had better be an array. We assume that any call
5482 * context that could let get_fn_expr_variadic return true will have
5483 * checked that a VARIADIC-labeled parameter actually is an array. So
5484 * it should be okay to just Assert that it's an array rather than
5485 * doing a full-fledged error check.
5487 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5489 /* OK, safe to fetch the array value */
5490 arr = PG_GETARG_ARRAYTYPE_P(argidx);
5493 * And serialize the array. We tell array_to_text to ignore null
5494 * elements, which matches the behavior of the loop below.
5496 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5499 /* Normal case without explicit VARIADIC marker */
5500 initStringInfo(&str);
5502 /* Get output function info, building it if first time through */
5503 foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5504 if (foutcache == NULL)
5505 foutcache = build_concat_foutcache(fcinfo, argidx);
5507 for (i = argidx; i < PG_NARGS(); i++)
5509 if (!PG_ARGISNULL(i))
5511 Datum value = PG_GETARG_DATUM(i);
5513 /* add separator if appropriate */
5514 if (first_arg)
5515 first_arg = false;
5516 else
5517 appendStringInfoString(&str, sepstr);
5519 /* call the appropriate type output function, append the result */
5520 appendStringInfoString(&str,
5521 OutputFunctionCall(&foutcache[i], value));
5525 result = cstring_to_text_with_len(str.data, str.len);
5526 pfree(str.data);
5528 return result;
5532 * Concatenate all arguments. NULL arguments are ignored.
5534 Datum
5535 text_concat(PG_FUNCTION_ARGS)
5537 text *result;
5539 result = concat_internal("", 0, fcinfo);
5540 if (result == NULL)
5541 PG_RETURN_NULL();
5542 PG_RETURN_TEXT_P(result);
5546 * Concatenate all but first argument value with separators. The first
5547 * parameter is used as the separator. NULL arguments are ignored.
5549 Datum
5550 text_concat_ws(PG_FUNCTION_ARGS)
5552 char *sep;
5553 text *result;
5555 /* return NULL when separator is NULL */
5556 if (PG_ARGISNULL(0))
5557 PG_RETURN_NULL();
5558 sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5560 result = concat_internal(sep, 1, fcinfo);
5561 if (result == NULL)
5562 PG_RETURN_NULL();
5563 PG_RETURN_TEXT_P(result);
5567 * Return first n characters in the string. When n is negative,
5568 * return all but last |n| characters.
5570 Datum
5571 text_left(PG_FUNCTION_ARGS)
5573 int n = PG_GETARG_INT32(1);
5575 if (n < 0)
5577 text *str = PG_GETARG_TEXT_PP(0);
5578 const char *p = VARDATA_ANY(str);
5579 int len = VARSIZE_ANY_EXHDR(str);
5580 int rlen;
5582 n = pg_mbstrlen_with_len(p, len) + n;
5583 rlen = pg_mbcharcliplen(p, len, n);
5584 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5586 else
5587 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5591 * Return last n characters in the string. When n is negative,
5592 * return all but first |n| characters.
5594 Datum
5595 text_right(PG_FUNCTION_ARGS)
5597 text *str = PG_GETARG_TEXT_PP(0);
5598 const char *p = VARDATA_ANY(str);
5599 int len = VARSIZE_ANY_EXHDR(str);
5600 int n = PG_GETARG_INT32(1);
5601 int off;
5603 if (n < 0)
5604 n = -n;
5605 else
5606 n = pg_mbstrlen_with_len(p, len) - n;
5607 off = pg_mbcharcliplen(p, len, n);
5609 PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5613 * Return reversed string
5615 Datum
5616 text_reverse(PG_FUNCTION_ARGS)
5618 text *str = PG_GETARG_TEXT_PP(0);
5619 const char *p = VARDATA_ANY(str);
5620 int len = VARSIZE_ANY_EXHDR(str);
5621 const char *endp = p + len;
5622 text *result;
5623 char *dst;
5625 result = palloc(len + VARHDRSZ);
5626 dst = (char *) VARDATA(result) + len;
5627 SET_VARSIZE(result, len + VARHDRSZ);
5629 if (pg_database_encoding_max_length() > 1)
5631 /* multibyte version */
5632 while (p < endp)
5634 int sz;
5636 sz = pg_mblen(p);
5637 dst -= sz;
5638 memcpy(dst, p, sz);
5639 p += sz;
5642 else
5644 /* single byte version */
5645 while (p < endp)
5646 *(--dst) = *p++;
5649 PG_RETURN_TEXT_P(result);
/*
 * Support macros for text_format()
 */

/* flag bit: the "-" flag was given in the format specifier */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Step "cur" forward by one byte, and raise an error if that reaches (or
 * passes) "limit", i.e. the format string ended in the middle of a
 * specifier.
 */
#define ADVANCE_PARSE_POINTER(cur,limit) \
	do { \
		if (++(cur) >= (limit)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5668 * Returns a formatted string
5670 Datum
5671 text_format(PG_FUNCTION_ARGS)
5673 text *fmt;
5674 StringInfoData str;
5675 const char *cp;
5676 const char *start_ptr;
5677 const char *end_ptr;
5678 text *result;
5679 int arg;
5680 bool funcvariadic;
5681 int nargs;
5682 Datum *elements = NULL;
5683 bool *nulls = NULL;
5684 Oid element_type = InvalidOid;
5685 Oid prev_type = InvalidOid;
5686 Oid prev_width_type = InvalidOid;
5687 FmgrInfo typoutputfinfo;
5688 FmgrInfo typoutputinfo_width;
5690 /* When format string is null, immediately return null */
5691 if (PG_ARGISNULL(0))
5692 PG_RETURN_NULL();
5694 /* If argument is marked VARIADIC, expand array into elements */
5695 if (get_fn_expr_variadic(fcinfo->flinfo))
5697 ArrayType *arr;
5698 int16 elmlen;
5699 bool elmbyval;
5700 char elmalign;
5701 int nitems;
5703 /* Should have just the one argument */
5704 Assert(PG_NARGS() == 2);
5706 /* If argument is NULL, we treat it as zero-length array */
5707 if (PG_ARGISNULL(1))
5708 nitems = 0;
5709 else
5712 * Non-null argument had better be an array. We assume that any
5713 * call context that could let get_fn_expr_variadic return true
5714 * will have checked that a VARIADIC-labeled parameter actually is
5715 * an array. So it should be okay to just Assert that it's an
5716 * array rather than doing a full-fledged error check.
5718 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5720 /* OK, safe to fetch the array value */
5721 arr = PG_GETARG_ARRAYTYPE_P(1);
5723 /* Get info about array element type */
5724 element_type = ARR_ELEMTYPE(arr);
5725 get_typlenbyvalalign(element_type,
5726 &elmlen, &elmbyval, &elmalign);
5728 /* Extract all array elements */
5729 deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5730 &elements, &nulls, &nitems);
5733 nargs = nitems + 1;
5734 funcvariadic = true;
5736 else
5738 /* Non-variadic case, we'll process the arguments individually */
5739 nargs = PG_NARGS();
5740 funcvariadic = false;
5743 /* Setup for main loop. */
5744 fmt = PG_GETARG_TEXT_PP(0);
5745 start_ptr = VARDATA_ANY(fmt);
5746 end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5747 initStringInfo(&str);
5748 arg = 1; /* next argument position to print */
5750 /* Scan format string, looking for conversion specifiers. */
5751 for (cp = start_ptr; cp < end_ptr; cp++)
5753 int argpos;
5754 int widthpos;
5755 int flags;
5756 int width;
5757 Datum value;
5758 bool isNull;
5759 Oid typid;
5762 * If it's not the start of a conversion specifier, just copy it to
5763 * the output buffer.
5765 if (*cp != '%')
5767 appendStringInfoCharMacro(&str, *cp);
5768 continue;
5771 ADVANCE_PARSE_POINTER(cp, end_ptr);
5773 /* Easy case: %% outputs a single % */
5774 if (*cp == '%')
5776 appendStringInfoCharMacro(&str, *cp);
5777 continue;
5780 /* Parse the optional portions of the format specifier */
5781 cp = text_format_parse_format(cp, end_ptr,
5782 &argpos, &widthpos,
5783 &flags, &width);
5786 * Next we should see the main conversion specifier. Whether or not
5787 * an argument position was present, it's known that at least one
5788 * character remains in the string at this point. Experience suggests
5789 * that it's worth checking that that character is one of the expected
5790 * ones before we try to fetch arguments, so as to produce the least
5791 * confusing response to a mis-formatted specifier.
5793 if (strchr("sIL", *cp) == NULL)
5794 ereport(ERROR,
5795 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5796 errmsg("unrecognized format() type specifier \"%.*s\"",
5797 pg_mblen(cp), cp),
5798 errhint("For a single \"%%\" use \"%%%%\".")));
5800 /* If indirect width was specified, get its value */
5801 if (widthpos >= 0)
5803 /* Collect the specified or next argument position */
5804 if (widthpos > 0)
5805 arg = widthpos;
5806 if (arg >= nargs)
5807 ereport(ERROR,
5808 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5809 errmsg("too few arguments for format()")));
5811 /* Get the value and type of the selected argument */
5812 if (!funcvariadic)
5814 value = PG_GETARG_DATUM(arg);
5815 isNull = PG_ARGISNULL(arg);
5816 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5818 else
5820 value = elements[arg - 1];
5821 isNull = nulls[arg - 1];
5822 typid = element_type;
5824 if (!OidIsValid(typid))
5825 elog(ERROR, "could not determine data type of format() input");
5827 arg++;
5829 /* We can treat NULL width the same as zero */
5830 if (isNull)
5831 width = 0;
5832 else if (typid == INT4OID)
5833 width = DatumGetInt32(value);
5834 else if (typid == INT2OID)
5835 width = DatumGetInt16(value);
5836 else
5838 /* For less-usual datatypes, convert to text then to int */
5839 char *str;
5841 if (typid != prev_width_type)
5843 Oid typoutputfunc;
5844 bool typIsVarlena;
5846 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5847 fmgr_info(typoutputfunc, &typoutputinfo_width);
5848 prev_width_type = typid;
5851 str = OutputFunctionCall(&typoutputinfo_width, value);
5853 /* pg_strtoint32 will complain about bad data or overflow */
5854 width = pg_strtoint32(str);
5856 pfree(str);
5860 /* Collect the specified or next argument position */
5861 if (argpos > 0)
5862 arg = argpos;
5863 if (arg >= nargs)
5864 ereport(ERROR,
5865 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5866 errmsg("too few arguments for format()")));
5868 /* Get the value and type of the selected argument */
5869 if (!funcvariadic)
5871 value = PG_GETARG_DATUM(arg);
5872 isNull = PG_ARGISNULL(arg);
5873 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5875 else
5877 value = elements[arg - 1];
5878 isNull = nulls[arg - 1];
5879 typid = element_type;
5881 if (!OidIsValid(typid))
5882 elog(ERROR, "could not determine data type of format() input");
5884 arg++;
5887 * Get the appropriate typOutput function, reusing previous one if
5888 * same type as previous argument. That's particularly useful in the
5889 * variadic-array case, but often saves work even for ordinary calls.
5891 if (typid != prev_type)
5893 Oid typoutputfunc;
5894 bool typIsVarlena;
5896 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5897 fmgr_info(typoutputfunc, &typoutputfinfo);
5898 prev_type = typid;
5902 * And now we can format the value.
5904 switch (*cp)
5906 case 's':
5907 case 'I':
5908 case 'L':
5909 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5910 value, isNull,
5911 flags, width);
5912 break;
5913 default:
5914 /* should not get here, because of previous check */
5915 ereport(ERROR,
5916 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5917 errmsg("unrecognized format() type specifier \"%.*s\"",
5918 pg_mblen(cp), cp),
5919 errhint("For a single \"%%\" use \"%%%%\".")));
5920 break;
5924 /* Don't need deconstruct_array results anymore. */
5925 if (elements != NULL)
5926 pfree(elements);
5927 if (nulls != NULL)
5928 pfree(nulls);
5930 /* Generate results. */
5931 result = cstring_to_text_with_len(str.data, str.len);
5932 pfree(str.data);
5934 PG_RETURN_TEXT_P(result);
5938 * Parse contiguous digits as a decimal number.
5940 * Returns true if some digits could be parsed.
5941 * The value is returned into *value, and *ptr is advanced to the next
5942 * character to be parsed.
5944 * Note parsing invariant: at least one character is known available before
5945 * string end (end_ptr) at entry, and this is still true at exit.
5947 static bool
5948 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5950 bool found = false;
5951 const char *cp = *ptr;
5952 int val = 0;
5954 while (*cp >= '0' && *cp <= '9')
5956 int8 digit = (*cp - '0');
5958 if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5959 unlikely(pg_add_s32_overflow(val, digit, &val)))
5960 ereport(ERROR,
5961 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5962 errmsg("number is out of range")));
5963 ADVANCE_PARSE_POINTER(cp, end_ptr);
5964 found = true;
5967 *ptr = cp;
5968 *value = val;
5970 return found;
5974 * Parse a format specifier (generally following the SUS printf spec).
5976 * We have already advanced over the initial '%', and we are looking for
5977 * [argpos][flags][width]type (but the type character is not consumed here).
5979 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5980 * Output parameters:
5981 * argpos: argument position for value to be printed. -1 means unspecified.
5982 * widthpos: argument position for width. Zero means the argument position
5983 * was unspecified (ie, take the next arg) and -1 means no width
5984 * argument (width was omitted or specified as a constant).
5985 * flags: bitmask of flags.
5986 * width: directly-specified width value. Zero means the width was omitted
5987 * (note it's not necessary to distinguish this case from an explicit
5988 * zero width value).
5990 * The function result is the next character position to be parsed, ie, the
5991 * location where the type character is/should be.
5993 * Note parsing invariant: at least one character is known available before
5994 * string end (end_ptr) at entry, and this is still true at exit.
5996 static const char *
5997 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5998 int *argpos, int *widthpos,
5999 int *flags, int *width)
6001 const char *cp = start_ptr;
6002 int n;
6004 /* set defaults for output parameters */
6005 *argpos = -1;
6006 *widthpos = -1;
6007 *flags = 0;
6008 *width = 0;
6010 /* try to identify first number */
6011 if (text_format_parse_digits(&cp, end_ptr, &n))
6013 if (*cp != '$')
6015 /* Must be just a width and a type, so we're done */
6016 *width = n;
6017 return cp;
6019 /* The number was argument position */
6020 *argpos = n;
6021 /* Explicit 0 for argument index is immediately refused */
6022 if (n == 0)
6023 ereport(ERROR,
6024 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6025 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6026 ADVANCE_PARSE_POINTER(cp, end_ptr);
6029 /* Handle flags (only minus is supported now) */
6030 while (*cp == '-')
6032 *flags |= TEXT_FORMAT_FLAG_MINUS;
6033 ADVANCE_PARSE_POINTER(cp, end_ptr);
6036 if (*cp == '*')
6038 /* Handle indirect width */
6039 ADVANCE_PARSE_POINTER(cp, end_ptr);
6040 if (text_format_parse_digits(&cp, end_ptr, &n))
6042 /* number in this position must be closed by $ */
6043 if (*cp != '$')
6044 ereport(ERROR,
6045 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6046 errmsg("width argument position must be ended by \"$\"")));
6047 /* The number was width argument position */
6048 *widthpos = n;
6049 /* Explicit 0 for argument index is immediately refused */
6050 if (n == 0)
6051 ereport(ERROR,
6052 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6053 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6054 ADVANCE_PARSE_POINTER(cp, end_ptr);
6056 else
6057 *widthpos = 0; /* width's argument position is unspecified */
6059 else
6061 /* Check for direct width specification */
6062 if (text_format_parse_digits(&cp, end_ptr, &n))
6063 *width = n;
6066 /* cp should now be pointing at type character */
6067 return cp;
6071 * Format a %s, %I, or %L conversion
6073 static void
6074 text_format_string_conversion(StringInfo buf, char conversion,
6075 FmgrInfo *typOutputInfo,
6076 Datum value, bool isNull,
6077 int flags, int width)
6079 char *str;
6081 /* Handle NULL arguments before trying to stringify the value. */
6082 if (isNull)
6084 if (conversion == 's')
6085 text_format_append_string(buf, "", flags, width);
6086 else if (conversion == 'L')
6087 text_format_append_string(buf, "NULL", flags, width);
6088 else if (conversion == 'I')
6089 ereport(ERROR,
6090 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6091 errmsg("null values cannot be formatted as an SQL identifier")));
6092 return;
6095 /* Stringify. */
6096 str = OutputFunctionCall(typOutputInfo, value);
6098 /* Escape. */
6099 if (conversion == 'I')
6101 /* quote_identifier may or may not allocate a new string. */
6102 text_format_append_string(buf, quote_identifier(str), flags, width);
6104 else if (conversion == 'L')
6106 char *qstr = quote_literal_cstr(str);
6108 text_format_append_string(buf, qstr, flags, width);
6109 /* quote_literal_cstr() always allocates a new string */
6110 pfree(qstr);
6112 else
6113 text_format_append_string(buf, str, flags, width);
6115 /* Cleanup. */
6116 pfree(str);
6120 * Append str to buf, padding as directed by flags/width
6122 static void
6123 text_format_append_string(StringInfo buf, const char *str,
6124 int flags, int width)
6126 bool align_to_left = false;
6127 int len;
6129 /* fast path for typical easy case */
6130 if (width == 0)
6132 appendStringInfoString(buf, str);
6133 return;
6136 if (width < 0)
6138 /* Negative width: implicit '-' flag, then take absolute value */
6139 align_to_left = true;
6140 /* -INT_MIN is undefined */
6141 if (width <= INT_MIN)
6142 ereport(ERROR,
6143 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6144 errmsg("number is out of range")));
6145 width = -width;
6147 else if (flags & TEXT_FORMAT_FLAG_MINUS)
6148 align_to_left = true;
6150 len = pg_mbstrlen(str);
6151 if (align_to_left)
6153 /* left justify */
6154 appendStringInfoString(buf, str);
6155 if (len < width)
6156 appendStringInfoSpaces(buf, width - len);
6158 else
6160 /* right justify */
6161 if (len < width)
6162 appendStringInfoSpaces(buf, width - len);
6163 appendStringInfoString(buf, str);
6168 * text_format_nv - nonvariadic wrapper for text_format function.
6170 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6171 * which checks that all built-in functions that share the implementing C
6172 * function take the same number of arguments.
6174 Datum
6175 text_format_nv(PG_FUNCTION_ARGS)
6177 return text_format(fcinfo);
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * len bytes of s1 and s2 are equal, comparing from the last byte backwards.
 * For this use case it is faster than memcmp().
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int pos = len; pos > 0; pos--)
	{
		if (s1[pos - 1] != s2[pos - 1])
			return false;
	}

	return true;
}
6196 /* Expand each Levenshtein distance variant */
6197 #include "levenshtein.c"
6198 #define LEVENSHTEIN_LESS_EQUAL
6199 #include "levenshtein.c"
6203 * The following *ClosestMatch() functions can be used to determine whether a
6204 * user-provided string resembles any known valid values, which is useful for
6205 * providing hints in log messages, among other things. Use these functions
6206 * like so:
6208 * initClosestMatch(&state, source_string, max_distance);
6210 * for (int i = 0; i < num_valid_strings; i++)
6211 * updateClosestMatch(&state, valid_strings[i]);
6213 * closestMatch = getClosestMatch(&state);
6217 * Initialize the given state with the source string and maximum Levenshtein
6218 * distance to consider.
6220 void
6221 initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
6223 Assert(state);
6224 Assert(max_d >= 0);
6226 state->source = source;
6227 state->min_d = -1;
6228 state->max_d = max_d;
6229 state->match = NULL;
6233 * If the candidate string is a closer match than the current one saved (or
6234 * there is no match saved), save it as the closest match.
6236 * If the source or candidate string is NULL, empty, or too long, this function
6237 * takes no action. Likewise, if the Levenshtein distance exceeds the maximum
6238 * allowed or more than half the characters are different, no action is taken.
6240 void
6241 updateClosestMatch(ClosestMatchState *state, const char *candidate)
6243 int dist;
6245 Assert(state);
6247 if (state->source == NULL || state->source[0] == '\0' ||
6248 candidate == NULL || candidate[0] == '\0')
6249 return;
6252 * To avoid ERROR-ing, we check the lengths here instead of setting
6253 * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6255 if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
6256 strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
6257 return;
6259 dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
6260 candidate, strlen(candidate), 1, 1, 1,
6261 state->max_d, true);
6262 if (dist <= state->max_d &&
6263 dist <= strlen(state->source) / 2 &&
6264 (state->min_d == -1 || dist < state->min_d))
6266 state->min_d = dist;
6267 state->match = candidate;
6272 * Return the closest match. If no suitable candidates were provided via
6273 * updateClosestMatch(), return NULL.
6275 const char *
6276 getClosestMatch(ClosestMatchState *state)
6278 Assert(state);
6280 return state->match;
6285 * Unicode support
6288 static UnicodeNormalizationForm
6289 unicode_norm_form_from_string(const char *formstr)
6291 UnicodeNormalizationForm form = -1;
6294 * Might as well check this while we're here.
6296 if (GetDatabaseEncoding() != PG_UTF8)
6297 ereport(ERROR,
6298 (errcode(ERRCODE_SYNTAX_ERROR),
6299 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6301 if (pg_strcasecmp(formstr, "NFC") == 0)
6302 form = UNICODE_NFC;
6303 else if (pg_strcasecmp(formstr, "NFD") == 0)
6304 form = UNICODE_NFD;
6305 else if (pg_strcasecmp(formstr, "NFKC") == 0)
6306 form = UNICODE_NFKC;
6307 else if (pg_strcasecmp(formstr, "NFKD") == 0)
6308 form = UNICODE_NFKD;
6309 else
6310 ereport(ERROR,
6311 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6312 errmsg("invalid normalization form: %s", formstr)));
6314 return form;
6317 Datum
6318 unicode_normalize_func(PG_FUNCTION_ARGS)
6320 text *input = PG_GETARG_TEXT_PP(0);
6321 char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6322 UnicodeNormalizationForm form;
6323 int size;
6324 pg_wchar *input_chars;
6325 pg_wchar *output_chars;
6326 unsigned char *p;
6327 text *result;
6328 int i;
6330 form = unicode_norm_form_from_string(formstr);
6332 /* convert to pg_wchar */
6333 size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6334 input_chars = palloc((size + 1) * sizeof(pg_wchar));
6335 p = (unsigned char *) VARDATA_ANY(input);
6336 for (i = 0; i < size; i++)
6338 input_chars[i] = utf8_to_unicode(p);
6339 p += pg_utf_mblen(p);
6341 input_chars[i] = (pg_wchar) '\0';
6342 Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6344 /* action */
6345 output_chars = unicode_normalize(form, input_chars);
6347 /* convert back to UTF-8 string */
6348 size = 0;
6349 for (pg_wchar *wp = output_chars; *wp; wp++)
6351 unsigned char buf[4];
6353 unicode_to_utf8(*wp, buf);
6354 size += pg_utf_mblen(buf);
6357 result = palloc(size + VARHDRSZ);
6358 SET_VARSIZE(result, size + VARHDRSZ);
6360 p = (unsigned char *) VARDATA_ANY(result);
6361 for (pg_wchar *wp = output_chars; *wp; wp++)
6363 unicode_to_utf8(*wp, p);
6364 p += pg_utf_mblen(p);
6366 Assert((char *) p == (char *) result + size + VARHDRSZ);
6368 PG_RETURN_TEXT_P(result);
6372 * Check whether the string is in the specified Unicode normalization form.
6374 * This is done by converting the string to the specified normal form and then
6375 * comparing that to the original string. To speed that up, we also apply the
6376 * "quick check" algorithm specified in UAX #15, which can give a yes or no
6377 * answer for many strings by just scanning the string once.
6379 * This function should generally be optimized for the case where the string
6380 * is in fact normalized. In that case, we'll end up looking at the entire
6381 * string, so it's probably not worth doing any incremental conversion etc.
6383 Datum
6384 unicode_is_normalized(PG_FUNCTION_ARGS)
6386 text *input = PG_GETARG_TEXT_PP(0);
6387 char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6388 UnicodeNormalizationForm form;
6389 int size;
6390 pg_wchar *input_chars;
6391 pg_wchar *output_chars;
6392 unsigned char *p;
6393 int i;
6394 UnicodeNormalizationQC quickcheck;
6395 int output_size;
6396 bool result;
6398 form = unicode_norm_form_from_string(formstr);
6400 /* convert to pg_wchar */
6401 size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6402 input_chars = palloc((size + 1) * sizeof(pg_wchar));
6403 p = (unsigned char *) VARDATA_ANY(input);
6404 for (i = 0; i < size; i++)
6406 input_chars[i] = utf8_to_unicode(p);
6407 p += pg_utf_mblen(p);
6409 input_chars[i] = (pg_wchar) '\0';
6410 Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6412 /* quick check (see UAX #15) */
6413 quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6414 if (quickcheck == UNICODE_NORM_QC_YES)
6415 PG_RETURN_BOOL(true);
6416 else if (quickcheck == UNICODE_NORM_QC_NO)
6417 PG_RETURN_BOOL(false);
6419 /* normalize and compare with original */
6420 output_chars = unicode_normalize(form, input_chars);
6422 output_size = 0;
6423 for (pg_wchar *wp = output_chars; *wp; wp++)
6424 output_size++;
6426 result = (size == output_size) &&
6427 (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6429 PG_RETURN_BOOL(result);
/*
 * Report whether each of the first n characters of instr is a hexadecimal
 * digit.  The result is true when n is zero.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *cur = instr;
	const char *stop = instr + n;

	while (cur < stop)
	{
		if (!isxdigit((unsigned char) *cur))
			return false;
		cur++;
	}

	return true;
}
6445 static unsigned int
6446 hexval(unsigned char c)
6448 if (c >= '0' && c <= '9')
6449 return c - '0';
6450 if (c >= 'a' && c <= 'f')
6451 return c - 'a' + 0xA;
6452 if (c >= 'A' && c <= 'F')
6453 return c - 'A' + 0xA;
6454 elog(ERROR, "invalid hexadecimal digit");
6455 return 0; /* not reached */
/*
 * Convert the first n characters of instr, taken as hexadecimal digits with
 * the most significant digit first, into a number.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	const char *cur = instr;
	size_t		remaining = n;

	while (remaining > 0)
	{
		/* 'remaining' is now the count of digits to the right of this one */
		remaining--;
		value += hexval(*cur) << (4 * remaining);
		cur++;
	}

	return value;
}
6473 * Replaces Unicode escape sequences by Unicode characters
6475 Datum
6476 unistr(PG_FUNCTION_ARGS)
6478 text *input_text = PG_GETARG_TEXT_PP(0);
6479 char *instr;
6480 int len;
6481 StringInfoData str;
6482 text *result;
6483 pg_wchar pair_first = 0;
6484 char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6486 instr = VARDATA_ANY(input_text);
6487 len = VARSIZE_ANY_EXHDR(input_text);
6489 initStringInfo(&str);
6491 while (len > 0)
6493 if (instr[0] == '\\')
6495 if (len >= 2 &&
6496 instr[1] == '\\')
6498 if (pair_first)
6499 goto invalid_pair;
6500 appendStringInfoChar(&str, '\\');
6501 instr += 2;
6502 len -= 2;
6504 else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6505 (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6507 pg_wchar unicode;
6508 int offset = instr[1] == 'u' ? 2 : 1;
6510 unicode = hexval_n(instr + offset, 4);
6512 if (!is_valid_unicode_codepoint(unicode))
6513 ereport(ERROR,
6514 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6515 errmsg("invalid Unicode code point: %04X", unicode));
6517 if (pair_first)
6519 if (is_utf16_surrogate_second(unicode))
6521 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6522 pair_first = 0;
6524 else
6525 goto invalid_pair;
6527 else if (is_utf16_surrogate_second(unicode))
6528 goto invalid_pair;
6530 if (is_utf16_surrogate_first(unicode))
6531 pair_first = unicode;
6532 else
6534 pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6535 appendStringInfoString(&str, cbuf);
6538 instr += 4 + offset;
6539 len -= 4 + offset;
6541 else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6543 pg_wchar unicode;
6545 unicode = hexval_n(instr + 2, 6);
6547 if (!is_valid_unicode_codepoint(unicode))
6548 ereport(ERROR,
6549 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6550 errmsg("invalid Unicode code point: %04X", unicode));
6552 if (pair_first)
6554 if (is_utf16_surrogate_second(unicode))
6556 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6557 pair_first = 0;
6559 else
6560 goto invalid_pair;
6562 else if (is_utf16_surrogate_second(unicode))
6563 goto invalid_pair;
6565 if (is_utf16_surrogate_first(unicode))
6566 pair_first = unicode;
6567 else
6569 pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6570 appendStringInfoString(&str, cbuf);
6573 instr += 8;
6574 len -= 8;
6576 else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6578 pg_wchar unicode;
6580 unicode = hexval_n(instr + 2, 8);
6582 if (!is_valid_unicode_codepoint(unicode))
6583 ereport(ERROR,
6584 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6585 errmsg("invalid Unicode code point: %04X", unicode));
6587 if (pair_first)
6589 if (is_utf16_surrogate_second(unicode))
6591 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6592 pair_first = 0;
6594 else
6595 goto invalid_pair;
6597 else if (is_utf16_surrogate_second(unicode))
6598 goto invalid_pair;
6600 if (is_utf16_surrogate_first(unicode))
6601 pair_first = unicode;
6602 else
6604 pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6605 appendStringInfoString(&str, cbuf);
6608 instr += 10;
6609 len -= 10;
6611 else
6612 ereport(ERROR,
6613 (errcode(ERRCODE_SYNTAX_ERROR),
6614 errmsg("invalid Unicode escape"),
6615 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6617 else
6619 if (pair_first)
6620 goto invalid_pair;
6622 appendStringInfoChar(&str, *instr++);
6623 len--;
6627 /* unfinished surrogate pair? */
6628 if (pair_first)
6629 goto invalid_pair;
6631 result = cstring_to_text_with_len(str.data, str.len);
6632 pfree(str.data);
6634 PG_RETURN_TEXT_P(result);
6636 invalid_pair:
6637 ereport(ERROR,
6638 (errcode(ERRCODE_SYNTAX_ERROR),
6639 errmsg("invalid Unicode surrogate pair")));
6640 PG_RETURN_NULL(); /* keep compiler quiet */