Update copyright for 2022
[pgsql.git] / src / backend / utils / adt / varlena.c
blobb3eb39761d378cfa55b1b6ddf6f53d4c5b84fdb8
1 /*-------------------------------------------------------------------------
3 * varlena.c
4 * Functions for the variable-length built-in types.
6 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * src/backend/utils/adt/varlena.c
13 *-------------------------------------------------------------------------
15 #include "postgres.h"
17 #include <ctype.h>
18 #include <limits.h>
20 #include "access/detoast.h"
21 #include "access/toast_compression.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "lib/hyperloglog.h"
28 #include "libpq/pqformat.h"
29 #include "miscadmin.h"
30 #include "nodes/execnodes.h"
31 #include "parser/scansup.h"
32 #include "port/pg_bswap.h"
33 #include "regex/regex.h"
34 #include "utils/builtins.h"
35 #include "utils/bytea.h"
36 #include "utils/lsyscache.h"
37 #include "utils/memutils.h"
38 #include "utils/pg_locale.h"
39 #include "utils/sortsupport.h"
40 #include "utils/varlena.h"
43 /* GUC variable */
44 int bytea_output = BYTEA_OUTPUT_HEX;
46 typedef struct varlena unknown;
47 typedef struct varlena VarString;
/*
 * Working state shared by the text_position_* functions.
 */
typedef struct
{
	bool		is_multibyte;	/* true when the encoding is multibyte */
	bool		is_multibyte_char_in_char;	/* must matches be verified
											 * against char boundaries? */

	char	   *str1;			/* string being searched (haystack) */
	char	   *str2;			/* string being sought (needle) */
	int			len1;			/* haystack length, in bytes */
	int			len2;			/* needle length, in bytes */

	/* Boyer-Moore-Horspool skip table */
	int			skiptablemask;	/* AND-mask applied to skiptable subscripts */
	int			skiptable[256]; /* skip distance per mismatched char */

	char	   *last_match;		/* most recent match location in 'str1' */

	/*
	 * A match's byte position sometimes has to be turned into a character
	 * position.  The last converted position is remembered here so that the
	 * next conversion can resume from it instead of recounting characters
	 * from the start of the string.
	 */
	char	   *refpoint;		/* pointer into the original haystack */
	int			refpos;			/* 0-based character offset of refpoint */
} TextPositionState;
78 typedef struct
80 char *buf1; /* 1st string, or abbreviation original string
81 * buf */
82 char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
83 int buflen1;
84 int buflen2;
85 int last_len1; /* Length of last buf1 string/strxfrm() input */
86 int last_len2; /* Length of last buf2 string/strxfrm() blob */
87 int last_returned; /* Last comparison result (cache) */
88 bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
89 bool collate_c;
90 Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
91 hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
92 hyperLogLogState full_card; /* Full key cardinality state */
93 double prop_card; /* Required cardinality proportion */
94 pg_locale_t locale;
95 } VarStringSortSupport;
98 * Output data for split_text(): we output either to an array or a table.
99 * tupstore and tupdesc must be set up in advance to output to a table.
101 typedef struct
103 ArrayBuildState *astate;
104 Tuplestorestate *tupstore;
105 TupleDesc tupdesc;
106 } SplitTextOutputData;
/*
 * Size of on-stack scratch buffers: big enough that most strings fit, yet
 * small enough to be comfortable as a stack allocation.
 */
#define TEXTBUFLEN		1024

/* Datum accessors for the "unknown" pseudo-type */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/* Datum accessors for generic variable-length strings */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
123 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
124 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
126 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
127 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
128 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
129 static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
130 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
131 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
132 static int32 text_length(Datum str);
133 static text *text_catenate(text *t1, text *t2);
134 static text *text_substring(Datum str,
135 int32 start,
136 int32 length,
137 bool length_not_specified);
138 static text *text_overlay(text *t1, text *t2, int sp, int sl);
139 static int text_position(text *t1, text *t2, Oid collid);
140 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
141 static bool text_position_next(TextPositionState *state);
142 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
143 static char *text_position_get_match_ptr(TextPositionState *state);
144 static int text_position_get_match_pos(TextPositionState *state);
145 static void text_position_cleanup(TextPositionState *state);
146 static void check_collation_set(Oid collid);
147 static int text_cmp(text *arg1, text *arg2, Oid collid);
148 static bytea *bytea_catenate(bytea *t1, bytea *t2);
149 static bytea *bytea_substring(Datum str,
150 int S,
151 int L,
152 bool length_not_specified);
153 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
154 static void appendStringInfoText(StringInfo str, const text *t);
155 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
156 static void split_text_accum_result(SplitTextOutputData *tstate,
157 text *field_value,
158 text *null_string,
159 Oid collation);
160 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
161 const char *fldsep, const char *null_string);
162 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
163 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
164 int *value);
165 static const char *text_format_parse_format(const char *start_ptr,
166 const char *end_ptr,
167 int *argpos, int *widthpos,
168 int *flags, int *width);
169 static void text_format_string_conversion(StringInfo buf, char conversion,
170 FmgrInfo *typOutputInfo,
171 Datum value, bool isNull,
172 int flags, int width);
173 static void text_format_append_string(StringInfo buf, const char *str,
174 int flags, int width);
177 /*****************************************************************************
178 * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
179 *****************************************************************************/
182 * cstring_to_text
184 * Create a text value from a null-terminated C string.
186 * The new text value is freshly palloc'd with a full-size VARHDR.
188 text *
189 cstring_to_text(const char *s)
191 return cstring_to_text_with_len(s, strlen(s));
195 * cstring_to_text_with_len
197 * Same as cstring_to_text except the caller specifies the string length;
198 * the string need not be null_terminated.
200 text *
201 cstring_to_text_with_len(const char *s, int len)
203 text *result = (text *) palloc(len + VARHDRSZ);
205 SET_VARSIZE(result, len + VARHDRSZ);
206 memcpy(VARDATA(result), s, len);
208 return result;
212 * text_to_cstring
214 * Create a palloc'd, null-terminated C string from a text value.
216 * We support being passed a compressed or toasted text value.
217 * This is a bit bogus since such values shouldn't really be referred to as
218 * "text *", but it seems useful for robustness. If we didn't handle that
219 * case here, we'd need another routine that did, anyway.
221 char *
222 text_to_cstring(const text *t)
224 /* must cast away the const, unfortunately */
225 text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
226 int len = VARSIZE_ANY_EXHDR(tunpacked);
227 char *result;
229 result = (char *) palloc(len + 1);
230 memcpy(result, VARDATA_ANY(tunpacked), len);
231 result[len] = '\0';
233 if (tunpacked != t)
234 pfree(tunpacked);
236 return result;
240 * text_to_cstring_buffer
242 * Copy a text value into a caller-supplied buffer of size dst_len.
244 * The text string is truncated if necessary to fit. The result is
245 * guaranteed null-terminated (unless dst_len == 0).
247 * We support being passed a compressed or toasted text value.
248 * This is a bit bogus since such values shouldn't really be referred to as
249 * "text *", but it seems useful for robustness. If we didn't handle that
250 * case here, we'd need another routine that did, anyway.
252 void
253 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
255 /* must cast away the const, unfortunately */
256 text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
257 size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
259 if (dst_len > 0)
261 dst_len--;
262 if (dst_len >= src_len)
263 dst_len = src_len;
264 else /* ensure truncation is encoding-safe */
265 dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
266 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
267 dst[dst_len] = '\0';
270 if (srcunpacked != src)
271 pfree(srcunpacked);
275 /*****************************************************************************
276 * USER I/O ROUTINES *
277 *****************************************************************************/
/* Map an ASCII digit character to its numeric value, and a value to its digit */
#define VAL(CH)			((CH) - '0')
#define DIG(VAL)		('0' + (VAL))
284 * byteain - converts from printable representation of byte array
286 * Non-printable characters must be passed as '\nnn' (octal) and are
287 * converted to internal form. '\' must be passed as '\\'.
288 * ereport(ERROR, ...) if bad form.
290 * BUGS:
291 * The input is scanned twice.
292 * The error checking of input is minimal.
294 Datum
295 byteain(PG_FUNCTION_ARGS)
297 char *inputText = PG_GETARG_CSTRING(0);
298 char *tp;
299 char *rp;
300 int bc;
301 bytea *result;
303 /* Recognize hex input */
304 if (inputText[0] == '\\' && inputText[1] == 'x')
306 size_t len = strlen(inputText);
308 bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
309 result = palloc(bc);
310 bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
311 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
313 PG_RETURN_BYTEA_P(result);
316 /* Else, it's the traditional escaped style */
317 for (bc = 0, tp = inputText; *tp != '\0'; bc++)
319 if (tp[0] != '\\')
320 tp++;
321 else if ((tp[0] == '\\') &&
322 (tp[1] >= '0' && tp[1] <= '3') &&
323 (tp[2] >= '0' && tp[2] <= '7') &&
324 (tp[3] >= '0' && tp[3] <= '7'))
325 tp += 4;
326 else if ((tp[0] == '\\') &&
327 (tp[1] == '\\'))
328 tp += 2;
329 else
332 * one backslash, not followed by another or ### valid octal
334 ereport(ERROR,
335 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
336 errmsg("invalid input syntax for type %s", "bytea")));
340 bc += VARHDRSZ;
342 result = (bytea *) palloc(bc);
343 SET_VARSIZE(result, bc);
345 tp = inputText;
346 rp = VARDATA(result);
347 while (*tp != '\0')
349 if (tp[0] != '\\')
350 *rp++ = *tp++;
351 else if ((tp[0] == '\\') &&
352 (tp[1] >= '0' && tp[1] <= '3') &&
353 (tp[2] >= '0' && tp[2] <= '7') &&
354 (tp[3] >= '0' && tp[3] <= '7'))
356 bc = VAL(tp[1]);
357 bc <<= 3;
358 bc += VAL(tp[2]);
359 bc <<= 3;
360 *rp++ = bc + VAL(tp[3]);
362 tp += 4;
364 else if ((tp[0] == '\\') &&
365 (tp[1] == '\\'))
367 *rp++ = '\\';
368 tp += 2;
370 else
373 * We should never get here. The first pass should not allow it.
375 ereport(ERROR,
376 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
377 errmsg("invalid input syntax for type %s", "bytea")));
381 PG_RETURN_BYTEA_P(result);
385 * byteaout - converts to printable representation of byte array
387 * In the traditional escaped format, non-printable characters are
388 * printed as '\nnn' (octal) and '\' as '\\'.
390 Datum
391 byteaout(PG_FUNCTION_ARGS)
393 bytea *vlena = PG_GETARG_BYTEA_PP(0);
394 char *result;
395 char *rp;
397 if (bytea_output == BYTEA_OUTPUT_HEX)
399 /* Print hex format */
400 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
401 *rp++ = '\\';
402 *rp++ = 'x';
403 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
405 else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
407 /* Print traditional escaped format */
408 char *vp;
409 uint64 len;
410 int i;
412 len = 1; /* empty string has 1 char */
413 vp = VARDATA_ANY(vlena);
414 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
416 if (*vp == '\\')
417 len += 2;
418 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
419 len += 4;
420 else
421 len++;
425 * In principle len can't overflow uint32 if the input fit in 1GB, but
426 * for safety let's check rather than relying on palloc's internal
427 * check.
429 if (len > MaxAllocSize)
430 ereport(ERROR,
431 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
432 errmsg_internal("result of bytea output conversion is too large")));
433 rp = result = (char *) palloc(len);
435 vp = VARDATA_ANY(vlena);
436 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
438 if (*vp == '\\')
440 *rp++ = '\\';
441 *rp++ = '\\';
443 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
445 int val; /* holds unprintable chars */
447 val = *vp;
448 rp[0] = '\\';
449 rp[3] = DIG(val & 07);
450 val >>= 3;
451 rp[2] = DIG(val & 07);
452 val >>= 3;
453 rp[1] = DIG(val & 03);
454 rp += 4;
456 else
457 *rp++ = *vp;
460 else
462 elog(ERROR, "unrecognized bytea_output setting: %d",
463 bytea_output);
464 rp = result = NULL; /* keep compiler quiet */
466 *rp = '\0';
467 PG_RETURN_CSTRING(result);
471 * bytearecv - converts external binary format to bytea
473 Datum
474 bytearecv(PG_FUNCTION_ARGS)
476 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
477 bytea *result;
478 int nbytes;
480 nbytes = buf->len - buf->cursor;
481 result = (bytea *) palloc(nbytes + VARHDRSZ);
482 SET_VARSIZE(result, nbytes + VARHDRSZ);
483 pq_copymsgbytes(buf, VARDATA(result), nbytes);
484 PG_RETURN_BYTEA_P(result);
488 * byteasend - converts bytea to binary format
490 * This is a special case: just copy the input...
492 Datum
493 byteasend(PG_FUNCTION_ARGS)
495 bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
497 PG_RETURN_BYTEA_P(vlena);
500 Datum
501 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
503 StringInfo state;
505 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
507 /* Append the value unless null. */
508 if (!PG_ARGISNULL(1))
510 bytea *value = PG_GETARG_BYTEA_PP(1);
512 /* On the first time through, we ignore the delimiter. */
513 if (state == NULL)
514 state = makeStringAggState(fcinfo);
515 else if (!PG_ARGISNULL(2))
517 bytea *delim = PG_GETARG_BYTEA_PP(2);
519 appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
522 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
526 * The transition type for string_agg() is declared to be "internal",
527 * which is a pass-by-value type the same size as a pointer.
529 PG_RETURN_POINTER(state);
532 Datum
533 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
535 StringInfo state;
537 /* cannot be called directly because of internal-type argument */
538 Assert(AggCheckCallContext(fcinfo, NULL));
540 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
542 if (state != NULL)
544 bytea *result;
546 result = (bytea *) palloc(state->len + VARHDRSZ);
547 SET_VARSIZE(result, state->len + VARHDRSZ);
548 memcpy(VARDATA(result), state->data, state->len);
549 PG_RETURN_BYTEA_P(result);
551 else
552 PG_RETURN_NULL();
556 * textin - converts "..." to internal representation
558 Datum
559 textin(PG_FUNCTION_ARGS)
561 char *inputText = PG_GETARG_CSTRING(0);
563 PG_RETURN_TEXT_P(cstring_to_text(inputText));
567 * textout - converts internal representation to "..."
569 Datum
570 textout(PG_FUNCTION_ARGS)
572 Datum txt = PG_GETARG_DATUM(0);
574 PG_RETURN_CSTRING(TextDatumGetCString(txt));
578 * textrecv - converts external binary format to text
580 Datum
581 textrecv(PG_FUNCTION_ARGS)
583 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
584 text *result;
585 char *str;
586 int nbytes;
588 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
590 result = cstring_to_text_with_len(str, nbytes);
591 pfree(str);
592 PG_RETURN_TEXT_P(result);
596 * textsend - converts text to binary format
598 Datum
599 textsend(PG_FUNCTION_ARGS)
601 text *t = PG_GETARG_TEXT_PP(0);
602 StringInfoData buf;
604 pq_begintypsend(&buf);
605 pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
606 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
611 * unknownin - converts "..." to internal representation
613 Datum
614 unknownin(PG_FUNCTION_ARGS)
616 char *str = PG_GETARG_CSTRING(0);
618 /* representation is same as cstring */
619 PG_RETURN_CSTRING(pstrdup(str));
623 * unknownout - converts internal representation to "..."
625 Datum
626 unknownout(PG_FUNCTION_ARGS)
628 /* representation is same as cstring */
629 char *str = PG_GETARG_CSTRING(0);
631 PG_RETURN_CSTRING(pstrdup(str));
635 * unknownrecv - converts external binary format to unknown
637 Datum
638 unknownrecv(PG_FUNCTION_ARGS)
640 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
641 char *str;
642 int nbytes;
644 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
645 /* representation is same as cstring */
646 PG_RETURN_CSTRING(str);
650 * unknownsend - converts unknown to binary format
652 Datum
653 unknownsend(PG_FUNCTION_ARGS)
655 /* representation is same as cstring */
656 char *str = PG_GETARG_CSTRING(0);
657 StringInfoData buf;
659 pq_begintypsend(&buf);
660 pq_sendtext(&buf, str, strlen(str));
661 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
665 /* ========== PUBLIC ROUTINES ========== */
668 * textlen -
669 * returns the logical length of a text*
670 * (which is less than the VARSIZE of the text*)
672 Datum
673 textlen(PG_FUNCTION_ARGS)
675 Datum str = PG_GETARG_DATUM(0);
677 /* try to avoid decompressing argument */
678 PG_RETURN_INT32(text_length(str));
682 * text_length -
683 * Does the real work for textlen()
685 * This is broken out so it can be called directly by other string processing
686 * functions. Note that the argument is passed as a Datum, to indicate that
687 * it may still be in compressed form. We can avoid decompressing it at all
688 * in some cases.
690 static int32
691 text_length(Datum str)
693 /* fastpath when max encoding length is one */
694 if (pg_database_encoding_max_length() == 1)
695 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
696 else
698 text *t = DatumGetTextPP(str);
700 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
701 VARSIZE_ANY_EXHDR(t)));
706 * textoctetlen -
707 * returns the physical length of a text*
708 * (which is less than the VARSIZE of the text*)
710 Datum
711 textoctetlen(PG_FUNCTION_ARGS)
713 Datum str = PG_GETARG_DATUM(0);
715 /* We need not detoast the input at all */
716 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
720 * textcat -
721 * takes two text* and returns a text* that is the concatenation of
722 * the two.
724 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
725 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
726 * Allocate space for output in all cases.
727 * XXX - thomas 1997-07-10
729 Datum
730 textcat(PG_FUNCTION_ARGS)
732 text *t1 = PG_GETARG_TEXT_PP(0);
733 text *t2 = PG_GETARG_TEXT_PP(1);
735 PG_RETURN_TEXT_P(text_catenate(t1, t2));
739 * text_catenate
740 * Guts of textcat(), broken out so it can be used by other functions
742 * Arguments can be in short-header form, but not compressed or out-of-line
744 static text *
745 text_catenate(text *t1, text *t2)
747 text *result;
748 int len1,
749 len2,
750 len;
751 char *ptr;
753 len1 = VARSIZE_ANY_EXHDR(t1);
754 len2 = VARSIZE_ANY_EXHDR(t2);
756 /* paranoia ... probably should throw error instead? */
757 if (len1 < 0)
758 len1 = 0;
759 if (len2 < 0)
760 len2 = 0;
762 len = len1 + len2 + VARHDRSZ;
763 result = (text *) palloc(len);
765 /* Set size of result string... */
766 SET_VARSIZE(result, len);
768 /* Fill data field of result string... */
769 ptr = VARDATA(result);
770 if (len1 > 0)
771 memcpy(ptr, VARDATA_ANY(t1), len1);
772 if (len2 > 0)
773 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
775 return result;
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller is responsible for ensuring that n characters really are
 * present; the string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cur = p;

	/* In a single-byte encoding, characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Otherwise step over the characters one at a time */
	while (n > 0)
	{
		cur += pg_mblen(cur);
		n--;
	}

	return cur - p;
}
805 * text_substr()
806 * Return a substring starting at the specified position.
807 * - thomas 1997-12-31
809 * Input:
810 * - string
811 * - starting position (is one-based)
812 * - string length
814 * If the starting position is zero or less, then return from the start of the string
815 * adjusting the length to be consistent with the "negative start" per SQL.
816 * If the length is less than zero, return the remaining string.
818 * Added multibyte support.
819 * - Tatsuo Ishii 1998-4-21
820 * Changed behavior if starting position is less than one to conform to SQL behavior.
821 * Formerly returned the entire string; now returns a portion.
822 * - Thomas Lockhart 1998-12-10
823 * Now uses faster TOAST-slicing interface
824 * - John Gray 2002-02-22
825 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
826 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
827 * error; if E < 1, return '', not entire string). Fixed MB related bug when
828 * S > LC and < LC + 4 sometimes garbage characters are returned.
829 * - Joe Conway 2002-08-10
831 Datum
832 text_substr(PG_FUNCTION_ARGS)
834 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
835 PG_GETARG_INT32(1),
836 PG_GETARG_INT32(2),
837 false));
841 * text_substr_no_len -
842 * Wrapper to avoid opr_sanity failure due to
843 * one function accepting a different number of args.
845 Datum
846 text_substr_no_len(PG_FUNCTION_ARGS)
848 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
849 PG_GETARG_INT32(1),
850 -1, true));
854 * text_substring -
855 * Does the real work for text_substr() and text_substr_no_len()
857 * This is broken out so it can be called directly by other string processing
858 * functions. Note that the argument is passed as a Datum, to indicate that
859 * it may still be in compressed/toasted form. We can avoid detoasting all
860 * of it in some cases.
862 * The result is always a freshly palloc'd datum.
864 static text *
865 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
867 int32 eml = pg_database_encoding_max_length();
868 int32 S = start; /* start position */
869 int32 S1; /* adjusted start position */
870 int32 L1; /* adjusted substring length */
871 int32 E; /* end position */
874 * SQL99 says S can be zero or negative, but we still must fetch from the
875 * start of the string.
877 S1 = Max(S, 1);
879 /* life is easy if the encoding max length is 1 */
880 if (eml == 1)
882 if (length_not_specified) /* special case - get length to end of
883 * string */
884 L1 = -1;
885 else if (length < 0)
887 /* SQL99 says to throw an error for E < S, i.e., negative length */
888 ereport(ERROR,
889 (errcode(ERRCODE_SUBSTRING_ERROR),
890 errmsg("negative substring length not allowed")));
891 L1 = -1; /* silence stupider compilers */
893 else if (pg_add_s32_overflow(S, length, &E))
896 * L could be large enough for S + L to overflow, in which case
897 * the substring must run to end of string.
899 L1 = -1;
901 else
904 * A zero or negative value for the end position can happen if the
905 * start was negative or one. SQL99 says to return a zero-length
906 * string.
908 if (E < 1)
909 return cstring_to_text("");
911 L1 = E - S1;
915 * If the start position is past the end of the string, SQL99 says to
916 * return a zero-length string -- DatumGetTextPSlice() will do that
917 * for us. We need only convert S1 to zero-based starting position.
919 return DatumGetTextPSlice(str, S1 - 1, L1);
921 else if (eml > 1)
924 * When encoding max length is > 1, we can't get LC without
925 * detoasting, so we'll grab a conservatively large slice now and go
926 * back later to do the right thing
928 int32 slice_start;
929 int32 slice_size;
930 int32 slice_strlen;
931 text *slice;
932 int32 E1;
933 int32 i;
934 char *p;
935 char *s;
936 text *ret;
939 * We need to start at position zero because there is no way to know
940 * in advance which byte offset corresponds to the supplied start
941 * position.
943 slice_start = 0;
945 if (length_not_specified) /* special case - get length to end of
946 * string */
947 slice_size = L1 = -1;
948 else if (length < 0)
950 /* SQL99 says to throw an error for E < S, i.e., negative length */
951 ereport(ERROR,
952 (errcode(ERRCODE_SUBSTRING_ERROR),
953 errmsg("negative substring length not allowed")));
954 slice_size = L1 = -1; /* silence stupider compilers */
956 else if (pg_add_s32_overflow(S, length, &E))
959 * L could be large enough for S + L to overflow, in which case
960 * the substring must run to end of string.
962 slice_size = L1 = -1;
964 else
967 * A zero or negative value for the end position can happen if the
968 * start was negative or one. SQL99 says to return a zero-length
969 * string.
971 if (E < 1)
972 return cstring_to_text("");
975 * if E is past the end of the string, the tuple toaster will
976 * truncate the length for us
978 L1 = E - S1;
981 * Total slice size in bytes can't be any longer than the start
982 * position plus substring length times the encoding max length.
983 * If that overflows, we can just use -1.
985 if (pg_mul_s32_overflow(E, eml, &slice_size))
986 slice_size = -1;
990 * If we're working with an untoasted source, no need to do an extra
991 * copying step.
993 if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
994 VARATT_IS_EXTERNAL(DatumGetPointer(str)))
995 slice = DatumGetTextPSlice(str, slice_start, slice_size);
996 else
997 slice = (text *) DatumGetPointer(str);
999 /* see if we got back an empty string */
1000 if (VARSIZE_ANY_EXHDR(slice) == 0)
1002 if (slice != (text *) DatumGetPointer(str))
1003 pfree(slice);
1004 return cstring_to_text("");
1007 /* Now we can get the actual length of the slice in MB characters */
1008 slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1009 VARSIZE_ANY_EXHDR(slice));
1012 * Check that the start position wasn't > slice_strlen. If so, SQL99
1013 * says to return a zero-length string.
1015 if (S1 > slice_strlen)
1017 if (slice != (text *) DatumGetPointer(str))
1018 pfree(slice);
1019 return cstring_to_text("");
1023 * Adjust L1 and E1 now that we know the slice string length. Again
1024 * remember that S1 is one based, and slice_start is zero based.
1026 if (L1 > -1)
1027 E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1028 else
1029 E1 = slice_start + 1 + slice_strlen;
1032 * Find the start position in the slice; remember S1 is not zero based
1034 p = VARDATA_ANY(slice);
1035 for (i = 0; i < S1 - 1; i++)
1036 p += pg_mblen(p);
1038 /* hang onto a pointer to our start position */
1039 s = p;
1042 * Count the actual bytes used by the substring of the requested
1043 * length.
1045 for (i = S1; i < E1; i++)
1046 p += pg_mblen(p);
1048 ret = (text *) palloc(VARHDRSZ + (p - s));
1049 SET_VARSIZE(ret, VARHDRSZ + (p - s));
1050 memcpy(VARDATA(ret), s, (p - s));
1052 if (slice != (text *) DatumGetPointer(str))
1053 pfree(slice);
1055 return ret;
1057 else
1058 elog(ERROR, "invalid backend encoding: encoding max length < 1");
1060 /* not reached: suppress compiler warning */
1061 return NULL;
1065 * textoverlay
1066 * Replace specified substring of first string with second
1068 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1069 * This code is a direct implementation of what the standard says.
1071 Datum
1072 textoverlay(PG_FUNCTION_ARGS)
1074 text *t1 = PG_GETARG_TEXT_PP(0);
1075 text *t2 = PG_GETARG_TEXT_PP(1);
1076 int sp = PG_GETARG_INT32(2); /* substring start position */
1077 int sl = PG_GETARG_INT32(3); /* substring length */
1079 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1082 Datum
1083 textoverlay_no_len(PG_FUNCTION_ARGS)
1085 text *t1 = PG_GETARG_TEXT_PP(0);
1086 text *t2 = PG_GETARG_TEXT_PP(1);
1087 int sp = PG_GETARG_INT32(2); /* substring start position */
1088 int sl;
1090 sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1091 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1094 static text *
1095 text_overlay(text *t1, text *t2, int sp, int sl)
1097 text *result;
1098 text *s1;
1099 text *s2;
1100 int sp_pl_sl;
1103 * Check for possible integer-overflow cases. For negative sp, throw a
1104 * "substring length" error because that's what should be expected
1105 * according to the spec's definition of OVERLAY().
1107 if (sp <= 0)
1108 ereport(ERROR,
1109 (errcode(ERRCODE_SUBSTRING_ERROR),
1110 errmsg("negative substring length not allowed")));
1111 if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1112 ereport(ERROR,
1113 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1114 errmsg("integer out of range")));
1116 s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1117 s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1118 result = text_catenate(s1, t2);
1119 result = text_catenate(result, s2);
1121 return result;
1125 * textpos -
1126 * Return the position of the specified substring.
1127 * Implements the SQL POSITION() function.
1128 * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1129 * - thomas 1997-07-27
1131 Datum
1132 textpos(PG_FUNCTION_ARGS)
1134 text *str = PG_GETARG_TEXT_PP(0);
1135 text *search_str = PG_GETARG_TEXT_PP(1);
1137 PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1141 * text_position -
1142 * Does the real work for textpos()
1144 * Inputs:
1145 * t1 - string to be searched
1146 * t2 - pattern to match within t1
1147 * Result:
1148 * Character index of the first matched char, starting from 1,
1149 * or 0 if no match.
1151 * This is broken out so it can be called directly by other string processing
1152 * functions.
1154 static int
1155 text_position(text *t1, text *t2, Oid collid)
1157 TextPositionState state;
1158 int result;
1160 /* Empty needle always matches at position 1 */
1161 if (VARSIZE_ANY_EXHDR(t2) < 1)
1162 return 1;
1164 /* Otherwise, can't match if haystack is shorter than needle */
1165 if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1166 return 0;
1168 text_position_setup(t1, t2, collid, &state);
1169 if (!text_position_next(&state))
1170 result = 0;
1171 else
1172 result = text_position_get_match_pos(&state);
1173 text_position_cleanup(&state);
1174 return result;
1179 * text_position_setup, text_position_next, text_position_cleanup -
1180 * Component steps of text_position()
1182 * These are broken out so that a string can be efficiently searched for
1183 * multiple occurrences of the same pattern. text_position_next may be
1184 * called multiple times, and it advances to the next match on each call.
1185 * text_position_get_match_ptr() and text_position_get_match_pos() return
1186 * a pointer or 1-based character position of the last match, respectively.
1188 * The "state" variable is normally just a local variable in the caller.
1190 * NOTE: text_position_next skips over the matched portion. For example,
1191 * searching for "xx" in "xxx" returns only one match, not two.
1194 static void
1195 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1197 int len1 = VARSIZE_ANY_EXHDR(t1);
1198 int len2 = VARSIZE_ANY_EXHDR(t2);
1199 pg_locale_t mylocale = 0;
1201 check_collation_set(collid);
1203 if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1204 mylocale = pg_newlocale_from_collation(collid);
1206 if (mylocale && !mylocale->deterministic)
1207 ereport(ERROR,
1208 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1209 errmsg("nondeterministic collations are not supported for substring searches")));
1211 Assert(len1 > 0);
1212 Assert(len2 > 0);
1215 * Even with a multi-byte encoding, we perform the search using the raw
1216 * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1217 * because in UTF-8 the byte sequence of one character cannot contain
1218 * another character. For other multi-byte encodings, we do the search
1219 * initially as a simple byte search, ignoring multibyte issues, but
1220 * verify afterwards that the match we found is at a character boundary,
1221 * and continue the search if it was a false match.
1223 if (pg_database_encoding_max_length() == 1)
1225 state->is_multibyte = false;
1226 state->is_multibyte_char_in_char = false;
1228 else if (GetDatabaseEncoding() == PG_UTF8)
1230 state->is_multibyte = true;
1231 state->is_multibyte_char_in_char = false;
1233 else
1235 state->is_multibyte = true;
1236 state->is_multibyte_char_in_char = true;
1239 state->str1 = VARDATA_ANY(t1);
1240 state->str2 = VARDATA_ANY(t2);
1241 state->len1 = len1;
1242 state->len2 = len2;
1243 state->last_match = NULL;
1244 state->refpoint = state->str1;
1245 state->refpos = 0;
1248 * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1249 * notes we use the terminology that the "haystack" is the string to be
1250 * searched (t1) and the "needle" is the pattern being sought (t2).
1252 * If the needle is empty or bigger than the haystack then there is no
1253 * point in wasting cycles initializing the table. We also choose not to
1254 * use B-M-H for needles of length 1, since the skip table can't possibly
1255 * save anything in that case.
1257 if (len1 >= len2 && len2 > 1)
1259 int searchlength = len1 - len2;
1260 int skiptablemask;
1261 int last;
1262 int i;
1263 const char *str2 = state->str2;
1266 * First we must determine how much of the skip table to use. The
1267 * declaration of TextPositionState allows up to 256 elements, but for
1268 * short search problems we don't really want to have to initialize so
1269 * many elements --- it would take too long in comparison to the
1270 * actual search time. So we choose a useful skip table size based on
1271 * the haystack length minus the needle length. The closer the needle
1272 * length is to the haystack length the less useful skipping becomes.
1274 * Note: since we use bit-masking to select table elements, the skip
1275 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1277 if (searchlength < 16)
1278 skiptablemask = 3;
1279 else if (searchlength < 64)
1280 skiptablemask = 7;
1281 else if (searchlength < 128)
1282 skiptablemask = 15;
1283 else if (searchlength < 512)
1284 skiptablemask = 31;
1285 else if (searchlength < 2048)
1286 skiptablemask = 63;
1287 else if (searchlength < 4096)
1288 skiptablemask = 127;
1289 else
1290 skiptablemask = 255;
1291 state->skiptablemask = skiptablemask;
1294 * Initialize the skip table. We set all elements to the needle
1295 * length, since this is the correct skip distance for any character
1296 * not found in the needle.
1298 for (i = 0; i <= skiptablemask; i++)
1299 state->skiptable[i] = len2;
1302 * Now examine the needle. For each character except the last one,
1303 * set the corresponding table element to the appropriate skip
1304 * distance. Note that when two characters share the same skip table
1305 * entry, the one later in the needle must determine the skip
1306 * distance.
1308 last = len2 - 1;
1310 for (i = 0; i < last; i++)
1311 state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1316 * Advance to the next match, starting from the end of the previous match
1317 * (or the beginning of the string, on first call). Returns true if a match
1318 * is found.
1320 * Note that this refuses to match an empty-string needle. Most callers
1321 * will have handled that case specially and we'll never see it here.
1323 static bool
1324 text_position_next(TextPositionState *state)
1326 int needle_len = state->len2;
1327 char *start_ptr;
1328 char *matchptr;
1330 if (needle_len <= 0)
1331 return false; /* result for empty pattern */
1333 /* Start from the point right after the previous match. */
1334 if (state->last_match)
1335 start_ptr = state->last_match + needle_len;
1336 else
1337 start_ptr = state->str1;
1339 retry:
1340 matchptr = text_position_next_internal(start_ptr, state);
1342 if (!matchptr)
1343 return false;
1346 * Found a match for the byte sequence. If this is a multibyte encoding,
1347 * where one character's byte sequence can appear inside a longer
1348 * multi-byte character, we need to verify that the match was at a
1349 * character boundary, not in the middle of a multi-byte character.
1351 if (state->is_multibyte_char_in_char)
1353 /* Walk one character at a time, until we reach the match. */
1355 /* the search should never move backwards. */
1356 Assert(state->refpoint <= matchptr);
1358 while (state->refpoint < matchptr)
1360 /* step to next character. */
1361 state->refpoint += pg_mblen(state->refpoint);
1362 state->refpos++;
1365 * If we stepped over the match's start position, then it was a
1366 * false positive, where the byte sequence appeared in the middle
1367 * of a multi-byte character. Skip it, and continue the search at
1368 * the next character boundary.
1370 if (state->refpoint > matchptr)
1372 start_ptr = state->refpoint;
1373 goto retry;
1378 state->last_match = matchptr;
1379 return true;
1383 * Subroutine of text_position_next(). This searches for the raw byte
1384 * sequence, ignoring any multi-byte encoding issues. Returns the first
1385 * match starting at 'start_ptr', or NULL if no match is found.
1387 static char *
1388 text_position_next_internal(char *start_ptr, TextPositionState *state)
1390 int haystack_len = state->len1;
1391 int needle_len = state->len2;
1392 int skiptablemask = state->skiptablemask;
1393 const char *haystack = state->str1;
1394 const char *needle = state->str2;
1395 const char *haystack_end = &haystack[haystack_len];
1396 const char *hptr;
1398 Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1400 if (needle_len == 1)
1402 /* No point in using B-M-H for a one-character needle */
1403 char nchar = *needle;
1405 hptr = start_ptr;
1406 while (hptr < haystack_end)
1408 if (*hptr == nchar)
1409 return (char *) hptr;
1410 hptr++;
1413 else
1415 const char *needle_last = &needle[needle_len - 1];
1417 /* Start at startpos plus the length of the needle */
1418 hptr = start_ptr + needle_len - 1;
1419 while (hptr < haystack_end)
1421 /* Match the needle scanning *backward* */
1422 const char *nptr;
1423 const char *p;
1425 nptr = needle_last;
1426 p = hptr;
1427 while (*nptr == *p)
1429 /* Matched it all? If so, return 1-based position */
1430 if (nptr == needle)
1431 return (char *) p;
1432 nptr--, p--;
1436 * No match, so use the haystack char at hptr to decide how far to
1437 * advance. If the needle had any occurrence of that character
1438 * (or more precisely, one sharing the same skiptable entry)
1439 * before its last character, then we advance far enough to align
1440 * the last such needle character with that haystack position.
1441 * Otherwise we can advance by the whole needle length.
1443 hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1447 return 0; /* not found */
1451 * Return a pointer to the current match.
1453 * The returned pointer points into the original haystack string.
1455 static char *
1456 text_position_get_match_ptr(TextPositionState *state)
1458 return state->last_match;
1462 * Return the offset of the current match.
1464 * The offset is in characters, 1-based.
1466 static int
1467 text_position_get_match_pos(TextPositionState *state)
1469 if (!state->is_multibyte)
1470 return state->last_match - state->str1 + 1;
1471 else
1473 /* Convert the byte position to char position. */
1474 while (state->refpoint < state->last_match)
1476 state->refpoint += pg_mblen(state->refpoint);
1477 state->refpos++;
1479 Assert(state->refpoint == state->last_match);
1480 return state->refpos + 1;
1485 * Reset search state to the initial state installed by text_position_setup.
1487 * The next call to text_position_next will search from the beginning
1488 * of the string.
1490 static void
1491 text_position_reset(TextPositionState *state)
1493 state->last_match = NULL;
1494 state->refpoint = state->str1;
1495 state->refpos = 0;
1498 static void
1499 text_position_cleanup(TextPositionState *state)
1501 /* no cleanup needed */
1505 static void
1506 check_collation_set(Oid collid)
1508 if (!OidIsValid(collid))
1511 * This typically means that the parser could not resolve a conflict
1512 * of implicit collations, so report it that way.
1514 ereport(ERROR,
1515 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1516 errmsg("could not determine which collation to use for string comparison"),
1517 errhint("Use the COLLATE clause to set the collation explicitly.")));
/*
 * varstr_cmp: compare arg1[0..len1) with arg2[0..len2) under collation collid.
 *
 * Order of operations, as visible in the code below:
 *	1. check_collation_set() errors out on an invalid collation OID.
 *	2. If lc_collate_is_c(collid): memcmp() over the common prefix, with a
 *	   length comparison deciding ties.
 *	3. Otherwise, byte-identical inputs return 0 before any collation
 *	   routine is called.
 *	4. WIN32 with a UTF-8 database and a libc (or default) locale: both
 *	   inputs are converted with MultiByteToWideChar() and compared with
 *	   wcscoll_l() or wcscoll().
 *	5. All remaining cases: NUL-terminated copies are made and compared via
 *	   ICU (ucol_strcollUTF8 or ucol_strcoll), strcoll_l(), or strcoll().
 *	6. If the collation routine reports equality and the locale is the
 *	   default or a deterministic one, the tie is broken bytewise.
 *
 * NOTE(review): brace-only lines are not present in this copy of the file,
 * so the exact nesting of the branches should be confirmed against the
 * complete source.
 */
1521 /* varstr_cmp()
1522 * Comparison function for text strings with given lengths.
1523 * Includes locale support, but must copy strings to temporary memory
1524 * to allow null-termination for inputs to strcoll().
1525 * Returns an integer less than, equal to, or greater than zero, indicating
1526 * whether arg1 is less than, equal to, or greater than arg2.
1528 * Note: many functions that depend on this are marked leakproof; therefore,
1529 * avoid reporting the actual contents of the input when throwing errors.
1530 * All errors herein should be things that can't happen except on corrupt
1531 * data, anyway; otherwise we will have trouble with indexing strings that
1532 * would cause them.
1535 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1537 int result;
1539 check_collation_set(collid);
1542 * Unfortunately, there is no strncoll(), so in the non-C locale case we
1543 * have to do some memory copying. This turns out to be significantly
1544 * slower, so we optimize the case where LC_COLLATE is C. We also try to
1545 * optimize relatively-short strings by avoiding palloc/pfree overhead.
1547 if (lc_collate_is_c(collid))
1549 result = memcmp(arg1, arg2, Min(len1, len2));
1550 if ((result == 0) && (len1 != len2))
1551 result = (len1 < len2) ? -1 : 1;
1553 else
1555 char a1buf[TEXTBUFLEN];
1556 char a2buf[TEXTBUFLEN];
1557 char *a1p,
1558 *a2p;
1559 pg_locale_t mylocale = 0;
1561 if (collid != DEFAULT_COLLATION_OID)
1562 mylocale = pg_newlocale_from_collation(collid);
1565 * memcmp() can't tell us which of two unequal strings sorts first,
1566 * but it's a cheap way to tell if they're equal. Testing shows that
1567 * memcmp() followed by strcoll() is only trivially slower than
1568 * strcoll() by itself, so we don't lose much if this doesn't work out
1569 * very often, and if it does - for example, because there are many
1570 * equal strings in the input - then we win big by avoiding expensive
1571 * collation-aware comparisons.
1573 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1574 return 0;
1576 #ifdef WIN32
1577 /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1578 if (GetDatabaseEncoding() == PG_UTF8
1579 && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1581 int a1len;
1582 int a2len;
1583 int r;
/* Use the on-stack buffer unless the input needs len * 2 + 2 bytes or more than fits. */
1585 if (len1 >= TEXTBUFLEN / 2)
1587 a1len = len1 * 2 + 2;
1588 a1p = palloc(a1len);
1590 else
1592 a1len = TEXTBUFLEN;
1593 a1p = a1buf;
1595 if (len2 >= TEXTBUFLEN / 2)
1597 a2len = len2 * 2 + 2;
1598 a2p = palloc(a2len);
1600 else
1602 a2len = TEXTBUFLEN;
1603 a2p = a2buf;
1606 /* stupid Microsloth API does not work for zero-length input */
1607 if (len1 == 0)
1608 r = 0;
1609 else
1611 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1612 (LPWSTR) a1p, a1len / 2);
1613 if (!r)
1614 ereport(ERROR,
1615 (errmsg("could not convert string to UTF-16: error code %lu",
1616 GetLastError())));
/* r is the count returned by the conversion (or 0); terminate the wide string there. */
1618 ((LPWSTR) a1p)[r] = 0;
1620 if (len2 == 0)
1621 r = 0;
1622 else
1624 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1625 (LPWSTR) a2p, a2len / 2);
1626 if (!r)
1627 ereport(ERROR,
1628 (errmsg("could not convert string to UTF-16: error code %lu",
1629 GetLastError())));
1631 ((LPWSTR) a2p)[r] = 0;
/* errno is cleared first because the failure report below uses %m. */
1633 errno = 0;
1634 #ifdef HAVE_LOCALE_T
1635 if (mylocale)
1636 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1637 else
1638 #endif
1639 result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1640 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1641 * headers */
1642 ereport(ERROR,
1643 (errmsg("could not compare Unicode strings: %m")));
1645 /* Break tie if necessary. */
1646 if (result == 0 &&
1647 (!mylocale || mylocale->deterministic))
1649 result = memcmp(arg1, arg2, Min(len1, len2));
1650 if ((result == 0) && (len1 != len2))
1651 result = (len1 < len2) ? -1 : 1;
1654 if (a1p != a1buf)
1655 pfree(a1p);
1656 if (a2p != a2buf)
1657 pfree(a2p);
1659 return result;
1661 #endif /* WIN32 */
/* General path: NUL-terminated copies, on the stack when the input is shorter than TEXTBUFLEN. */
1663 if (len1 >= TEXTBUFLEN)
1664 a1p = (char *) palloc(len1 + 1);
1665 else
1666 a1p = a1buf;
1667 if (len2 >= TEXTBUFLEN)
1668 a2p = (char *) palloc(len2 + 1);
1669 else
1670 a2p = a2buf;
1672 memcpy(a1p, arg1, len1);
1673 a1p[len1] = '\0';
1674 memcpy(a2p, arg2, len2);
1675 a2p[len2] = '\0';
1677 if (mylocale)
1679 if (mylocale->provider == COLLPROVIDER_ICU)
1681 #ifdef USE_ICU
1682 #ifdef HAVE_UCOL_STRCOLLUTF8
1683 if (GetDatabaseEncoding() == PG_UTF8)
1685 UErrorCode status;
1687 status = U_ZERO_ERROR;
1688 result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1689 arg1, len1,
1690 arg2, len2,
1691 &status);
1692 if (U_FAILURE(status))
1693 ereport(ERROR,
1694 (errmsg("collation failed: %s", u_errorName(status))));
1696 else
1697 #endif
1699 int32_t ulen1,
1700 ulen2;
1701 UChar *uchar1,
1702 *uchar2;
1704 ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1705 ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1707 result = ucol_strcoll(mylocale->info.icu.ucol,
1708 uchar1, ulen1,
1709 uchar2, ulen2);
1711 pfree(uchar1);
1712 pfree(uchar2);
1714 #else /* not USE_ICU */
1715 /* shouldn't happen */
1716 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1717 #endif /* not USE_ICU */
1719 else
1721 #ifdef HAVE_LOCALE_T
1722 result = strcoll_l(a1p, a2p, mylocale->info.lt);
1723 #else
1724 /* shouldn't happen */
1725 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1726 #endif
1729 else
1730 result = strcoll(a1p, a2p);
1732 /* Break tie if necessary. */
1733 if (result == 0 &&
1734 (!mylocale || mylocale->deterministic))
1735 result = strcmp(a1p, a2p);
/* Free only the copies that were palloc'd rather than placed in the stack buffers. */
1737 if (a1p != a1buf)
1738 pfree(a1p);
1739 if (a2p != a2buf)
1740 pfree(a2p);
1743 return result;
1746 /* text_cmp()
1747 * Internal comparison function for text strings.
1748 * Returns -1, 0 or 1
1750 static int
1751 text_cmp(text *arg1, text *arg2, Oid collid)
1753 char *a1p,
1754 *a2p;
1755 int len1,
1756 len2;
1758 a1p = VARDATA_ANY(arg1);
1759 a2p = VARDATA_ANY(arg2);
1761 len1 = VARSIZE_ANY_EXHDR(arg1);
1762 len2 = VARSIZE_ANY_EXHDR(arg2);
1764 return varstr_cmp(a1p, len1, a2p, len2, collid);
1768 * Comparison functions for text strings.
1770 * Note: btree indexes need these routines not to leak memory; therefore,
1771 * be careful to free working copies of toasted datums. Most places don't
1772 * need to be so careful.
1775 Datum
1776 texteq(PG_FUNCTION_ARGS)
1778 Oid collid = PG_GET_COLLATION();
1779 bool result;
1781 check_collation_set(collid);
1783 if (lc_collate_is_c(collid) ||
1784 collid == DEFAULT_COLLATION_OID ||
1785 pg_newlocale_from_collation(collid)->deterministic)
1787 Datum arg1 = PG_GETARG_DATUM(0);
1788 Datum arg2 = PG_GETARG_DATUM(1);
1789 Size len1,
1790 len2;
1793 * Since we only care about equality or not-equality, we can avoid all
1794 * the expense of strcoll() here, and just do bitwise comparison. In
1795 * fact, we don't even have to do a bitwise comparison if we can show
1796 * the lengths of the strings are unequal; which might save us from
1797 * having to detoast one or both values.
1799 len1 = toast_raw_datum_size(arg1);
1800 len2 = toast_raw_datum_size(arg2);
1801 if (len1 != len2)
1802 result = false;
1803 else
1805 text *targ1 = DatumGetTextPP(arg1);
1806 text *targ2 = DatumGetTextPP(arg2);
1808 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1809 len1 - VARHDRSZ) == 0);
1811 PG_FREE_IF_COPY(targ1, 0);
1812 PG_FREE_IF_COPY(targ2, 1);
1815 else
1817 text *arg1 = PG_GETARG_TEXT_PP(0);
1818 text *arg2 = PG_GETARG_TEXT_PP(1);
1820 result = (text_cmp(arg1, arg2, collid) == 0);
1822 PG_FREE_IF_COPY(arg1, 0);
1823 PG_FREE_IF_COPY(arg2, 1);
1826 PG_RETURN_BOOL(result);
1829 Datum
1830 textne(PG_FUNCTION_ARGS)
1832 Oid collid = PG_GET_COLLATION();
1833 bool result;
1835 check_collation_set(collid);
1837 if (lc_collate_is_c(collid) ||
1838 collid == DEFAULT_COLLATION_OID ||
1839 pg_newlocale_from_collation(collid)->deterministic)
1841 Datum arg1 = PG_GETARG_DATUM(0);
1842 Datum arg2 = PG_GETARG_DATUM(1);
1843 Size len1,
1844 len2;
1846 /* See comment in texteq() */
1847 len1 = toast_raw_datum_size(arg1);
1848 len2 = toast_raw_datum_size(arg2);
1849 if (len1 != len2)
1850 result = true;
1851 else
1853 text *targ1 = DatumGetTextPP(arg1);
1854 text *targ2 = DatumGetTextPP(arg2);
1856 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1857 len1 - VARHDRSZ) != 0);
1859 PG_FREE_IF_COPY(targ1, 0);
1860 PG_FREE_IF_COPY(targ2, 1);
1863 else
1865 text *arg1 = PG_GETARG_TEXT_PP(0);
1866 text *arg2 = PG_GETARG_TEXT_PP(1);
1868 result = (text_cmp(arg1, arg2, collid) != 0);
1870 PG_FREE_IF_COPY(arg1, 0);
1871 PG_FREE_IF_COPY(arg2, 1);
1874 PG_RETURN_BOOL(result);
1877 Datum
1878 text_lt(PG_FUNCTION_ARGS)
1880 text *arg1 = PG_GETARG_TEXT_PP(0);
1881 text *arg2 = PG_GETARG_TEXT_PP(1);
1882 bool result;
1884 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1886 PG_FREE_IF_COPY(arg1, 0);
1887 PG_FREE_IF_COPY(arg2, 1);
1889 PG_RETURN_BOOL(result);
1892 Datum
1893 text_le(PG_FUNCTION_ARGS)
1895 text *arg1 = PG_GETARG_TEXT_PP(0);
1896 text *arg2 = PG_GETARG_TEXT_PP(1);
1897 bool result;
1899 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1901 PG_FREE_IF_COPY(arg1, 0);
1902 PG_FREE_IF_COPY(arg2, 1);
1904 PG_RETURN_BOOL(result);
1907 Datum
1908 text_gt(PG_FUNCTION_ARGS)
1910 text *arg1 = PG_GETARG_TEXT_PP(0);
1911 text *arg2 = PG_GETARG_TEXT_PP(1);
1912 bool result;
1914 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1916 PG_FREE_IF_COPY(arg1, 0);
1917 PG_FREE_IF_COPY(arg2, 1);
1919 PG_RETURN_BOOL(result);
1922 Datum
1923 text_ge(PG_FUNCTION_ARGS)
1925 text *arg1 = PG_GETARG_TEXT_PP(0);
1926 text *arg2 = PG_GETARG_TEXT_PP(1);
1927 bool result;
1929 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1931 PG_FREE_IF_COPY(arg1, 0);
1932 PG_FREE_IF_COPY(arg2, 1);
1934 PG_RETURN_BOOL(result);
1937 Datum
1938 text_starts_with(PG_FUNCTION_ARGS)
1940 Datum arg1 = PG_GETARG_DATUM(0);
1941 Datum arg2 = PG_GETARG_DATUM(1);
1942 Oid collid = PG_GET_COLLATION();
1943 pg_locale_t mylocale = 0;
1944 bool result;
1945 Size len1,
1946 len2;
1948 check_collation_set(collid);
1950 if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1951 mylocale = pg_newlocale_from_collation(collid);
1953 if (mylocale && !mylocale->deterministic)
1954 ereport(ERROR,
1955 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1956 errmsg("nondeterministic collations are not supported for substring searches")));
1958 len1 = toast_raw_datum_size(arg1);
1959 len2 = toast_raw_datum_size(arg2);
1960 if (len2 > len1)
1961 result = false;
1962 else
1964 text *targ1 = text_substring(arg1, 1, len2, false);
1965 text *targ2 = DatumGetTextPP(arg2);
1967 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1968 VARSIZE_ANY_EXHDR(targ2)) == 0);
1970 PG_FREE_IF_COPY(targ1, 0);
1971 PG_FREE_IF_COPY(targ2, 1);
1974 PG_RETURN_BOOL(result);
1977 Datum
1978 bttextcmp(PG_FUNCTION_ARGS)
1980 text *arg1 = PG_GETARG_TEXT_PP(0);
1981 text *arg2 = PG_GETARG_TEXT_PP(1);
1982 int32 result;
1984 result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1986 PG_FREE_IF_COPY(arg1, 0);
1987 PG_FREE_IF_COPY(arg2, 1);
1989 PG_RETURN_INT32(result);
1992 Datum
1993 bttextsortsupport(PG_FUNCTION_ARGS)
1995 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1996 Oid collid = ssup->ssup_collation;
1997 MemoryContext oldcontext;
1999 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2001 /* Use generic string SortSupport */
2002 varstr_sortsupport(ssup, TEXTOID, collid);
2004 MemoryContextSwitchTo(oldcontext);
2006 PG_RETURN_VOID();
/*
 * varstr_sortsupport: fill in *ssup for sorting values of type typid under
 * collation collid.
 *
 * Summary of what the code below does:
 *	- check_collation_set() rejects an invalid collation OID.
 *	- C collation: ssup->comparator becomes bpcharfastcmp_c (BPCHAROID),
 *	  namefastcmp_c (NAMEOID) or varstrfastcmp_c (anything else).
 *	- Other collations: namefastcmp_locale for NAMEOID, otherwise
 *	  varlenafastcmp_locale.  On WIN32 with a UTF-8 database and a non-ICU
 *	  locale the function returns before installing any comparator.
 *	- Abbreviation is switched off for NAMEOID and, unless TRUST_STRXFRM is
 *	  defined, for every non-C collation that is not ICU-provided.
 *	- If abbreviation is still on, or the collation is not C, a
 *	  VarStringSortSupport with two TEXTBUFLEN scratch buffers is allocated
 *	  and stored in ssup->ssup_extra.
 *	- With abbreviation on, the chosen comparator is moved to
 *	  ssup->abbrev_full_comparator and the abbreviated-key callbacks are
 *	  installed.
 *
 * NOTE(review): brace-only lines are not present in this copy of the file.
 * The final "if (abbreviate)" section dereferences sss, so it presumably
 * sits inside the "if (abbreviate || !collate_c)" body -- confirm against
 * the complete source.
 */
2010 * Generic sortsupport interface for character type's operator classes.
2011 * Includes locale support, and support for BpChar semantics (i.e. removing
2012 * trailing spaces before comparison).
2014 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2015 * same representation. Callers that always use the C collation (e.g.
2016 * non-collatable type callers like bytea) may have NUL bytes in their strings;
2017 * this will not work with any other collation, though.
2019 void
2020 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
2022 bool abbreviate = ssup->abbreviate;
2023 bool collate_c = false;
2024 VarStringSortSupport *sss;
2025 pg_locale_t locale = 0;
2027 check_collation_set(collid);
2030 * If possible, set ssup->comparator to a function which can be used to
2031 * directly compare two datums. If we can do this, we'll avoid the
2032 * overhead of a trip through the fmgr layer for every comparison, which
2033 * can be substantial.
2035 * Most typically, we'll set the comparator to varlenafastcmp_locale,
2036 * which uses strcoll() to perform comparisons. We use that for the
2037 * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2038 * LC_COLLATE = C, we can make things quite a bit faster with
2039 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2040 * memcmp() rather than strcoll().
2042 if (lc_collate_is_c(collid))
2044 if (typid == BPCHAROID)
2045 ssup->comparator = bpcharfastcmp_c;
2046 else if (typid == NAMEOID)
2048 ssup->comparator = namefastcmp_c;
2049 /* Not supporting abbreviation with type NAME, for now */
2050 abbreviate = false;
2052 else
2053 ssup->comparator = varstrfastcmp_c;
/* Remembered so the later steps can tell the C-collation case apart. */
2055 collate_c = true;
2057 else
2060 * We need a collation-sensitive comparison. To make things faster,
2061 * we'll figure out the collation based on the locale id and cache the
2062 * result.
/* For the default collation, locale stays 0. */
2064 if (collid != DEFAULT_COLLATION_OID)
2065 locale = pg_newlocale_from_collation(collid);
2068 * There is a further exception on Windows. When the database
2069 * encoding is UTF-8 and we are not using the C collation, complex
2070 * hacks are required. We don't currently have a comparator that
2071 * handles that case, so we fall back on the slow method of having the
2072 * sort code invoke bttextcmp() (in the case of text) via the fmgr
2073 * trampoline. ICU locales work just the same on Windows, however.
2075 #ifdef WIN32
2076 if (GetDatabaseEncoding() == PG_UTF8 &&
2077 !(locale && locale->provider == COLLPROVIDER_ICU))
2078 return;
2079 #endif
2082 * We use varlenafastcmp_locale except for type NAME.
2084 if (typid == NAMEOID)
2086 ssup->comparator = namefastcmp_locale;
2087 /* Not supporting abbreviation with type NAME, for now */
2088 abbreviate = false;
2090 else
2091 ssup->comparator = varlenafastcmp_locale;
2095 * Unfortunately, it seems that abbreviation for non-C collations is
2096 * broken on many common platforms; testing of multiple versions of glibc
2097 * reveals that, for many locales, strcoll() and strxfrm() do not return
2098 * consistent results, which is fatal to this optimization. While no
2099 * other libc other than Cygwin has so far been shown to have a problem,
2100 * we take the conservative course of action for right now and disable
2101 * this categorically. (Users who are certain this isn't a problem on
2102 * their system can define TRUST_STRXFRM.)
2104 * Even apart from the risk of broken locales, it's possible that there
2105 * are platforms where the use of abbreviated keys should be disabled at
2106 * compile time. Having only 4 byte datums could make worst-case
2107 * performance drastically more likely, for example. Moreover, macOS's
2108 * strxfrm() implementation is known to not effectively concentrate a
2109 * significant amount of entropy from the original string in earlier
2110 * transformed blobs. It's possible that other supported platforms are
2111 * similarly encumbered. So, if we ever get past disabling this
2112 * categorically, we may still want or need to disable it for particular
2113 * platforms.
2115 #ifndef TRUST_STRXFRM
2116 if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2117 abbreviate = false;
2118 #endif
2121 * If we're using abbreviated keys, or if we're using a locale-aware
2122 * comparison, we need to initialize a VarStringSortSupport object. Both
2123 * cases will make use of the temporary buffers we initialize here for
2124 * scratch space (and to detect requirement for BpChar semantics from
2125 * caller), and the abbreviation case requires additional state.
2127 if (abbreviate || !collate_c)
2129 sss = palloc(sizeof(VarStringSortSupport));
2130 sss->buf1 = palloc(TEXTBUFLEN);
2131 sss->buflen1 = TEXTBUFLEN;
2132 sss->buf2 = palloc(TEXTBUFLEN);
2133 sss->buflen2 = TEXTBUFLEN;
2134 /* Start with invalid values */
2135 sss->last_len1 = -1;
2136 sss->last_len2 = -1;
2137 /* Initialize */
2138 sss->last_returned = 0;
2139 sss->locale = locale;
2142 * To avoid somehow confusing a strxfrm() blob and an original string,
2143 * constantly keep track of the variety of data that buf1 and buf2
2144 * currently contain.
2146 * Comparisons may be interleaved with conversion calls. Frequently,
2147 * conversions and comparisons are batched into two distinct phases,
2148 * but the correctness of caching cannot hinge upon this. For
2149 * comparison caching, buffer state is only trusted if cache_blob is
2150 * found set to false, whereas strxfrm() caching only trusts the state
2151 * when cache_blob is found set to true.
2153 * Arbitrarily initialize cache_blob to true.
2155 sss->cache_blob = true;
2156 sss->collate_c = collate_c;
2157 sss->typid = typid;
2158 ssup->ssup_extra = sss;
2161 * If possible, plan to use the abbreviated keys optimization. The
2162 * core code may switch back to authoritative comparator should
2163 * abbreviation be aborted.
2165 if (abbreviate)
2167 sss->prop_card = 0.20;
2168 initHyperLogLog(&sss->abbr_card, 10);
2169 initHyperLogLog(&sss->full_card, 10);
/* Keep the comparator chosen above as the full comparator before replacing it. */
2170 ssup->abbrev_full_comparator = ssup->comparator;
2171 ssup->comparator = varstrcmp_abbrev;
2172 ssup->abbrev_converter = varstr_abbrev_convert;
2173 ssup->abbrev_abort = varstr_abbrev_abort;
2179 * sortsupport comparison func (for C locale case)
2181 static int
2182 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2184 VarString *arg1 = DatumGetVarStringPP(x);
2185 VarString *arg2 = DatumGetVarStringPP(y);
2186 char *a1p,
2187 *a2p;
2188 int len1,
2189 len2,
2190 result;
2192 a1p = VARDATA_ANY(arg1);
2193 a2p = VARDATA_ANY(arg2);
2195 len1 = VARSIZE_ANY_EXHDR(arg1);
2196 len2 = VARSIZE_ANY_EXHDR(arg2);
2198 result = memcmp(a1p, a2p, Min(len1, len2));
2199 if ((result == 0) && (len1 != len2))
2200 result = (len1 < len2) ? -1 : 1;
2202 /* We can't afford to leak memory here. */
2203 if (PointerGetDatum(arg1) != x)
2204 pfree(arg1);
2205 if (PointerGetDatum(arg2) != y)
2206 pfree(arg2);
2208 return result;
2212 * sortsupport comparison func (for BpChar C locale case)
2214 * BpChar outsources its sortsupport to this module. Specialization for the
2215 * varstr_sortsupport BpChar case, modeled on
2216 * internal_bpchar_pattern_compare().
2218 static int
2219 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2221 BpChar *arg1 = DatumGetBpCharPP(x);
2222 BpChar *arg2 = DatumGetBpCharPP(y);
2223 char *a1p,
2224 *a2p;
2225 int len1,
2226 len2,
2227 result;
2229 a1p = VARDATA_ANY(arg1);
2230 a2p = VARDATA_ANY(arg2);
2232 len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2233 len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2235 result = memcmp(a1p, a2p, Min(len1, len2));
2236 if ((result == 0) && (len1 != len2))
2237 result = (len1 < len2) ? -1 : 1;
2239 /* We can't afford to leak memory here. */
2240 if (PointerGetDatum(arg1) != x)
2241 pfree(arg1);
2242 if (PointerGetDatum(arg2) != y)
2243 pfree(arg2);
2245 return result;
2249 * sortsupport comparison func (for NAME C locale case)
2251 static int
2252 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2254 Name arg1 = DatumGetName(x);
2255 Name arg2 = DatumGetName(y);
2257 return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2261 * sortsupport comparison func (for locale case with all varlena types)
2263 static int
2264 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2266 VarString *arg1 = DatumGetVarStringPP(x);
2267 VarString *arg2 = DatumGetVarStringPP(y);
2268 char *a1p,
2269 *a2p;
2270 int len1,
2271 len2,
2272 result;
2274 a1p = VARDATA_ANY(arg1);
2275 a2p = VARDATA_ANY(arg2);
2277 len1 = VARSIZE_ANY_EXHDR(arg1);
2278 len2 = VARSIZE_ANY_EXHDR(arg2);
2280 result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2282 /* We can't afford to leak memory here. */
2283 if (PointerGetDatum(arg1) != x)
2284 pfree(arg1);
2285 if (PointerGetDatum(arg2) != y)
2286 pfree(arg2);
2288 return result;
2292 * sortsupport comparison func (for locale case with NAME type)
2294 static int
2295 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2297 Name arg1 = DatumGetName(x);
2298 Name arg2 = DatumGetName(y);
2300 return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2301 NameStr(*arg2), strlen(NameStr(*arg2)),
2302 ssup);
2306 * sortsupport comparison func for locale cases
2308 static int
2309 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2311 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2312 int result;
2313 bool arg1_match;
2315 /* Fast pre-check for equality, as discussed in varstr_cmp() */
2316 if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2319 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2320 * last_len2. Existing contents of buffers might still be used by
2321 * next call.
2323 * It's fine to allow the comparison of BpChar padding bytes here,
2324 * even though that implies that the memcmp() will usually be
2325 * performed for BpChar callers (though multibyte characters could
2326 * still prevent that from occurring). The memcmp() is still very
2327 * cheap, and BpChar's funny semantics have us remove trailing spaces
2328 * (not limited to padding), so we need make no distinction between
2329 * padding space characters and "real" space characters.
2331 return 0;
2334 if (sss->typid == BPCHAROID)
2336 /* Get true number of bytes, ignoring trailing spaces */
2337 len1 = bpchartruelen(a1p, len1);
2338 len2 = bpchartruelen(a2p, len2);
2341 if (len1 >= sss->buflen1)
2343 pfree(sss->buf1);
2344 sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2345 sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2347 if (len2 >= sss->buflen2)
2349 pfree(sss->buf2);
2350 sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2351 sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2355 * We're likely to be asked to compare the same strings repeatedly, and
2356 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2357 * comparisons, even though in general there is no reason to think that
2358 * that will work out (every string datum may be unique). Caching does
2359 * not slow things down measurably when it doesn't work out, and can speed
2360 * things up by rather a lot when it does. In part, this is because the
2361 * memcmp() compares data from cachelines that are needed in L1 cache even
2362 * when the last comparison's result cannot be reused.
2364 arg1_match = true;
2365 if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2367 arg1_match = false;
2368 memcpy(sss->buf1, a1p, len1);
2369 sss->buf1[len1] = '\0';
2370 sss->last_len1 = len1;
2374 * If we're comparing the same two strings as last time, we can return the
2375 * same answer without calling strcoll() again. This is more likely than
2376 * it seems (at least with moderate to low cardinality sets), because
2377 * quicksort compares the same pivot against many values.
2379 if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2381 memcpy(sss->buf2, a2p, len2);
2382 sss->buf2[len2] = '\0';
2383 sss->last_len2 = len2;
2385 else if (arg1_match && !sss->cache_blob)
2387 /* Use result cached following last actual strcoll() call */
2388 return sss->last_returned;
2391 if (sss->locale)
2393 if (sss->locale->provider == COLLPROVIDER_ICU)
2395 #ifdef USE_ICU
2396 #ifdef HAVE_UCOL_STRCOLLUTF8
2397 if (GetDatabaseEncoding() == PG_UTF8)
2399 UErrorCode status;
2401 status = U_ZERO_ERROR;
2402 result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2403 a1p, len1,
2404 a2p, len2,
2405 &status);
2406 if (U_FAILURE(status))
2407 ereport(ERROR,
2408 (errmsg("collation failed: %s", u_errorName(status))));
2410 else
2411 #endif
2413 int32_t ulen1,
2414 ulen2;
2415 UChar *uchar1,
2416 *uchar2;
2418 ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2419 ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2421 result = ucol_strcoll(sss->locale->info.icu.ucol,
2422 uchar1, ulen1,
2423 uchar2, ulen2);
2425 pfree(uchar1);
2426 pfree(uchar2);
2428 #else /* not USE_ICU */
2429 /* shouldn't happen */
2430 elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2431 #endif /* not USE_ICU */
2433 else
2435 #ifdef HAVE_LOCALE_T
2436 result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2437 #else
2438 /* shouldn't happen */
2439 elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2440 #endif
2443 else
2444 result = strcoll(sss->buf1, sss->buf2);
2446 /* Break tie if necessary. */
2447 if (result == 0 &&
2448 (!sss->locale || sss->locale->deterministic))
2449 result = strcmp(sss->buf1, sss->buf2);
2451 /* Cache result, perhaps saving an expensive strcoll() call next time */
2452 sss->cache_blob = false;
2453 sss->last_returned = result;
2454 return result;
2458 * Abbreviated key comparison func
2460 static int
2461 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2464 * When 0 is returned, the core system will call varstrfastcmp_c()
2465 * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2466 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2467 * authoritatively, for the same reason that there is a strcoll()
2468 * tie-breaker call to strcmp() in varstr_cmp().
2470 if (x > y)
2471 return 1;
2472 else if (x == y)
2473 return 0;
2474 else
2475 return -1;
2479 * Conversion routine for sortsupport. Converts original to abbreviated key
2480 * representation. Our encoding strategy is simple -- pack the first 8 bytes
2481 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2482 * stored in reverse order), and treat it as an unsigned integer. When the "C"
2483 * locale is used, or in case of bytea, just memcpy() from original instead.
2485 static Datum
2486 varstr_abbrev_convert(Datum original, SortSupport ssup)
2488 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2489 VarString *authoritative = DatumGetVarStringPP(original);
2490 char *authoritative_data = VARDATA_ANY(authoritative);
2492 /* working state */
2493 Datum res;
2494 char *pres;
2495 int len;
2496 uint32 hash;
2498 pres = (char *) &res;
2499 /* memset(), so any non-overwritten bytes are NUL */
2500 memset(pres, 0, sizeof(Datum));
2501 len = VARSIZE_ANY_EXHDR(authoritative);
2503 /* Get number of bytes, ignoring trailing spaces */
2504 if (sss->typid == BPCHAROID)
2505 len = bpchartruelen(authoritative_data, len);
2508 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2509 * abbreviate keys. The full comparator for the C locale is always
2510 * memcmp(). It would be incorrect to allow bytea callers (callers that
2511 * always force the C collation -- bytea isn't a collatable type, but this
2512 * approach is convenient) to use strxfrm(). This is because bytea
2513 * strings may contain NUL bytes. Besides, this should be faster, too.
2515 * More generally, it's okay that bytea callers can have NUL bytes in
2516 * strings because varstrcmp_abbrev() need not make a distinction between
2517 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2518 * authoritative representation. Hopefully a comparison at or past one
2519 * abbreviated key's terminating NUL byte will resolve the comparison
2520 * without consulting the authoritative representation; specifically, some
2521 * later non-NUL byte in the longer string can resolve the comparison
2522 * against a subsequent terminating NUL in the shorter string. There will
2523 * usually be what is effectively a "length-wise" resolution there and
2524 * then.
2526 * If that doesn't work out -- if all bytes in the longer string
2527 * positioned at or past the offset of the smaller string's (first)
2528 * terminating NUL are actually representative of NUL bytes in the
2529 * authoritative binary string (perhaps with some *terminating* NUL bytes
2530 * towards the end of the longer string iff it happens to still be small)
2531 * -- then an authoritative tie-breaker will happen, and do the right
2532 * thing: explicitly consider string length.
2534 if (sss->collate_c)
2535 memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2536 else
2538 Size bsize;
2539 #ifdef USE_ICU
2540 int32_t ulen = -1;
2541 UChar *uchar = NULL;
2542 #endif
2545 * We're not using the C collation, so fall back on strxfrm or ICU
2546 * analogs.
2549 /* By convention, we use buffer 1 to store and NUL-terminate */
2550 if (len >= sss->buflen1)
2552 pfree(sss->buf1);
2553 sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2554 sss->buf1 = palloc(sss->buflen1);
2557 /* Might be able to reuse strxfrm() blob from last call */
2558 if (sss->last_len1 == len && sss->cache_blob &&
2559 memcmp(sss->buf1, authoritative_data, len) == 0)
2561 memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2562 /* No change affecting cardinality, so no hashing required */
2563 goto done;
2566 memcpy(sss->buf1, authoritative_data, len);
2569 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2570 * necessary for ICU, but doesn't hurt.
2572 sss->buf1[len] = '\0';
2573 sss->last_len1 = len;
2575 #ifdef USE_ICU
2576 /* When using ICU and not UTF8, convert string to UChar. */
2577 if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2578 GetDatabaseEncoding() != PG_UTF8)
2579 ulen = icu_to_uchar(&uchar, sss->buf1, len);
2580 #endif
2583 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2584 * and try again. Both of these functions have the result buffer
2585 * content undefined if the result did not fit, so we need to retry
2586 * until everything fits, even though we only need the first few bytes
2587 * in the end. When using ucol_nextSortKeyPart(), however, we only
2588 * ask for as many bytes as we actually need.
2590 for (;;)
2592 #ifdef USE_ICU
2593 if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2596 * When using UTF8, use the iteration interface so we only
2597 * need to produce as many bytes as we actually need.
2599 if (GetDatabaseEncoding() == PG_UTF8)
2601 UCharIterator iter;
2602 uint32_t state[2];
2603 UErrorCode status;
2605 uiter_setUTF8(&iter, sss->buf1, len);
2606 state[0] = state[1] = 0; /* won't need that again */
2607 status = U_ZERO_ERROR;
2608 bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2609 &iter,
2610 state,
2611 (uint8_t *) sss->buf2,
2612 Min(sizeof(Datum), sss->buflen2),
2613 &status);
2614 if (U_FAILURE(status))
2615 ereport(ERROR,
2616 (errmsg("sort key generation failed: %s",
2617 u_errorName(status))));
2619 else
2620 bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2621 uchar, ulen,
2622 (uint8_t *) sss->buf2, sss->buflen2);
2624 else
2625 #endif
2626 #ifdef HAVE_LOCALE_T
2627 if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2628 bsize = strxfrm_l(sss->buf2, sss->buf1,
2629 sss->buflen2, sss->locale->info.lt);
2630 else
2631 #endif
2632 bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2634 sss->last_len2 = bsize;
2635 if (bsize < sss->buflen2)
2636 break;
2639 * Grow buffer and retry.
2641 pfree(sss->buf2);
2642 sss->buflen2 = Max(bsize + 1,
2643 Min(sss->buflen2 * 2, MaxAllocSize));
2644 sss->buf2 = palloc(sss->buflen2);
2648 * Every Datum byte is always compared. This is safe because the
2649 * strxfrm() blob is itself NUL terminated, leaving no danger of
2650 * misinterpreting any NUL bytes not intended to be interpreted as
2651 * logically representing termination.
2653 * (Actually, even if there were NUL bytes in the blob it would be
2654 * okay. See remarks on bytea case above.)
2656 memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2658 #ifdef USE_ICU
2659 if (uchar)
2660 pfree(uchar);
2661 #endif
2665 * Maintain approximate cardinality of both abbreviated keys and original,
2666 * authoritative keys using HyperLogLog. Used as cheap insurance against
2667 * the worst case, where we do many string transformations for no saving
2668 * in full strcoll()-based comparisons. These statistics are used by
2669 * varstr_abbrev_abort().
2671 * First, Hash key proper, or a significant fraction of it. Mix in length
2672 * in order to compensate for cases where differences are past
2673 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2675 hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2676 Min(len, PG_CACHE_LINE_SIZE)));
2678 if (len > PG_CACHE_LINE_SIZE)
2679 hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2681 addHyperLogLog(&sss->full_card, hash);
2683 /* Hash abbreviated key */
2684 #if SIZEOF_DATUM == 8
2686 uint32 lohalf,
2687 hihalf;
2689 lohalf = (uint32) res;
2690 hihalf = (uint32) (res >> 32);
2691 hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2693 #else /* SIZEOF_DATUM != 8 */
2694 hash = DatumGetUInt32(hash_uint32((uint32) res));
2695 #endif
2697 addHyperLogLog(&sss->abbr_card, hash);
2699 /* Cache result, perhaps saving an expensive strxfrm() call next time */
2700 sss->cache_blob = true;
2701 done:
2704 * Byteswap on little-endian machines.
2706 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2707 * comparator) works correctly on all platforms. If we didn't do this,
2708 * the comparator would have to call memcmp() with a pair of pointers to
2709 * the first byte of each abbreviated key, which is slower.
2711 res = DatumBigEndianToNative(res);
2713 /* Don't leak memory here */
2714 if (PointerGetDatum(authoritative) != original)
2715 pfree(authoritative);
2717 return res;
2721 * Callback for estimating effectiveness of abbreviated key optimization, using
2722 * heuristic rules. Returns value indicating if the abbreviation optimization
2723 * should be aborted, based on its projected effectiveness.
2725 static bool
2726 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2728 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2729 double abbrev_distinct,
2730 key_distinct;
2732 Assert(ssup->abbreviate);
2734 /* Have a little patience */
2735 if (memtupcount < 100)
2736 return false;
2738 abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2739 key_distinct = estimateHyperLogLog(&sss->full_card);
2742 * Clamp cardinality estimates to at least one distinct value. While
2743 * NULLs are generally disregarded, if only NULL values were seen so far,
2744 * that might misrepresent costs if we failed to clamp.
2746 if (abbrev_distinct <= 1.0)
2747 abbrev_distinct = 1.0;
2749 if (key_distinct <= 1.0)
2750 key_distinct = 1.0;
2753 * In the worst case all abbreviated keys are identical, while at the same
2754 * time there are differences within full key strings not captured in
2755 * abbreviations.
2757 #ifdef TRACE_SORT
2758 if (trace_sort)
2760 double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2762 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2763 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2764 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2765 sss->prop_card);
2767 #endif
2770 * If the number of distinct abbreviated keys approximately matches the
2771 * number of distinct authoritative original keys, that's reason enough to
2772 * proceed. We can win even with a very low cardinality set if most
2773 * tie-breakers only memcmp(). This is by far the most important
2774 * consideration.
2776 * While comparisons that are resolved at the abbreviated key level are
2777 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2778 * those two outcomes are so much cheaper than a full strcoll() once
2779 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2780 * cardinality against the overall size of the set in order to more
2781 * accurately model costs. Assume that an abbreviated comparison, and an
2782 * abbreviated comparison with a cheap memcmp()-based authoritative
2783 * resolution are equivalent.
2785 if (abbrev_distinct > key_distinct * sss->prop_card)
2788 * When we have exceeded 10,000 tuples, decay required cardinality
2789 * aggressively for next call.
2791 * This is useful because the number of comparisons required on
2792 * average increases at a linearithmic rate, and at roughly 10,000
2793 * tuples that factor will start to dominate over the linear costs of
2794 * string transformation (this is a conservative estimate). The decay
2795 * rate is chosen to be a little less aggressive than halving -- which
2796 * (since we're called at points at which memtupcount has doubled)
2797 * would never see the cost model actually abort past the first call
2798 * following a decay. This decay rate is mostly a precaution against
2799 * a sudden, violent swing in how well abbreviated cardinality tracks
2800 * full key cardinality. The decay also serves to prevent a marginal
2801 * case from being aborted too late, when too much has already been
2802 * invested in string transformation.
2804 * It's possible for sets of several million distinct strings with
2805 * mere tens of thousands of distinct abbreviated keys to still
2806 * benefit very significantly. This will generally occur provided
2807 * each abbreviated key is a proxy for a roughly uniform number of the
2808 * set's full keys. If it isn't so, we hope to catch that early and
2809 * abort. If it isn't caught early, by the time the problem is
2810 * apparent it's probably not worth aborting.
2812 if (memtupcount > 10000)
2813 sss->prop_card *= 0.65;
2815 return false;
2819 * Abort abbreviation strategy.
2821 * The worst case, where all abbreviated keys are identical while all
2822 * original strings differ will typically only see a regression of about
2823 * 10% in execution time for small to medium sized lists of strings.
2824 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2825 * often expect very large improvements, particularly with sets of strings
2826 * of moderately high to high abbreviated cardinality. There is little to
2827 * lose but much to gain, which our strategy reflects.
2829 #ifdef TRACE_SORT
2830 if (trace_sort)
2831 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2832 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2833 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2834 #endif
2836 return true;
2840 * Generic equalimage support function for character type's operator classes.
2841 * Disables the use of deduplication with nondeterministic collations.
2843 Datum
2844 btvarstrequalimage(PG_FUNCTION_ARGS)
2846 /* Oid opcintype = PG_GETARG_OID(0); */
2847 Oid collid = PG_GET_COLLATION();
2849 check_collation_set(collid);
2851 if (lc_collate_is_c(collid) ||
2852 collid == DEFAULT_COLLATION_OID ||
2853 get_collation_isdeterministic(collid))
2854 PG_RETURN_BOOL(true);
2855 else
2856 PG_RETURN_BOOL(false);
2859 Datum
2860 text_larger(PG_FUNCTION_ARGS)
2862 text *arg1 = PG_GETARG_TEXT_PP(0);
2863 text *arg2 = PG_GETARG_TEXT_PP(1);
2864 text *result;
2866 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2868 PG_RETURN_TEXT_P(result);
2871 Datum
2872 text_smaller(PG_FUNCTION_ARGS)
2874 text *arg1 = PG_GETARG_TEXT_PP(0);
2875 text *arg2 = PG_GETARG_TEXT_PP(1);
2876 text *result;
2878 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2880 PG_RETURN_TEXT_P(result);
/*
 * Cross-type comparison functions for types text and name.
 */
2888 Datum
2889 nameeqtext(PG_FUNCTION_ARGS)
2891 Name arg1 = PG_GETARG_NAME(0);
2892 text *arg2 = PG_GETARG_TEXT_PP(1);
2893 size_t len1 = strlen(NameStr(*arg1));
2894 size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2895 Oid collid = PG_GET_COLLATION();
2896 bool result;
2898 check_collation_set(collid);
2900 if (collid == C_COLLATION_OID)
2901 result = (len1 == len2 &&
2902 memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2903 else
2904 result = (varstr_cmp(NameStr(*arg1), len1,
2905 VARDATA_ANY(arg2), len2,
2906 collid) == 0);
2908 PG_FREE_IF_COPY(arg2, 1);
2910 PG_RETURN_BOOL(result);
2913 Datum
2914 texteqname(PG_FUNCTION_ARGS)
2916 text *arg1 = PG_GETARG_TEXT_PP(0);
2917 Name arg2 = PG_GETARG_NAME(1);
2918 size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2919 size_t len2 = strlen(NameStr(*arg2));
2920 Oid collid = PG_GET_COLLATION();
2921 bool result;
2923 check_collation_set(collid);
2925 if (collid == C_COLLATION_OID)
2926 result = (len1 == len2 &&
2927 memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2928 else
2929 result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2930 NameStr(*arg2), len2,
2931 collid) == 0);
2933 PG_FREE_IF_COPY(arg1, 0);
2935 PG_RETURN_BOOL(result);
2938 Datum
2939 namenetext(PG_FUNCTION_ARGS)
2941 Name arg1 = PG_GETARG_NAME(0);
2942 text *arg2 = PG_GETARG_TEXT_PP(1);
2943 size_t len1 = strlen(NameStr(*arg1));
2944 size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2945 Oid collid = PG_GET_COLLATION();
2946 bool result;
2948 check_collation_set(collid);
2950 if (collid == C_COLLATION_OID)
2951 result = !(len1 == len2 &&
2952 memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2953 else
2954 result = !(varstr_cmp(NameStr(*arg1), len1,
2955 VARDATA_ANY(arg2), len2,
2956 collid) == 0);
2958 PG_FREE_IF_COPY(arg2, 1);
2960 PG_RETURN_BOOL(result);
2963 Datum
2964 textnename(PG_FUNCTION_ARGS)
2966 text *arg1 = PG_GETARG_TEXT_PP(0);
2967 Name arg2 = PG_GETARG_NAME(1);
2968 size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2969 size_t len2 = strlen(NameStr(*arg2));
2970 Oid collid = PG_GET_COLLATION();
2971 bool result;
2973 check_collation_set(collid);
2975 if (collid == C_COLLATION_OID)
2976 result = !(len1 == len2 &&
2977 memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2978 else
2979 result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2980 NameStr(*arg2), len2,
2981 collid) == 0);
2983 PG_FREE_IF_COPY(arg1, 0);
2985 PG_RETURN_BOOL(result);
2988 Datum
2989 btnametextcmp(PG_FUNCTION_ARGS)
2991 Name arg1 = PG_GETARG_NAME(0);
2992 text *arg2 = PG_GETARG_TEXT_PP(1);
2993 int32 result;
2995 result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2996 VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2997 PG_GET_COLLATION());
2999 PG_FREE_IF_COPY(arg2, 1);
3001 PG_RETURN_INT32(result);
3004 Datum
3005 bttextnamecmp(PG_FUNCTION_ARGS)
3007 text *arg1 = PG_GETARG_TEXT_PP(0);
3008 Name arg2 = PG_GETARG_NAME(1);
3009 int32 result;
3011 result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
3012 NameStr(*arg2), strlen(NameStr(*arg2)),
3013 PG_GET_COLLATION());
3015 PG_FREE_IF_COPY(arg1, 0);
3017 PG_RETURN_INT32(result);
3020 #define CmpCall(cmpfunc) \
3021 DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
3022 PG_GET_COLLATION(), \
3023 PG_GETARG_DATUM(0), \
3024 PG_GETARG_DATUM(1)))
3026 Datum
3027 namelttext(PG_FUNCTION_ARGS)
3029 PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
3032 Datum
3033 nameletext(PG_FUNCTION_ARGS)
3035 PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
3038 Datum
3039 namegttext(PG_FUNCTION_ARGS)
3041 PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
3044 Datum
3045 namegetext(PG_FUNCTION_ARGS)
3047 PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
3050 Datum
3051 textltname(PG_FUNCTION_ARGS)
3053 PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
3056 Datum
3057 textlename(PG_FUNCTION_ARGS)
3059 PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
3062 Datum
3063 textgtname(PG_FUNCTION_ARGS)
3065 PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3068 Datum
3069 textgename(PG_FUNCTION_ARGS)
3071 PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3074 #undef CmpCall
/*
 * The following operators support character-by-character comparison
 * of text datums, to allow building indexes suitable for LIKE clauses.
 * Note that the regular texteq/textne comparison operators, and regular
 * support functions 1 and 2 with "C" collation are assumed to be
 * compatible with these!
 */
3085 static int
3086 internal_text_pattern_compare(text *arg1, text *arg2)
3088 int result;
3089 int len1,
3090 len2;
3092 len1 = VARSIZE_ANY_EXHDR(arg1);
3093 len2 = VARSIZE_ANY_EXHDR(arg2);
3095 result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3096 if (result != 0)
3097 return result;
3098 else if (len1 < len2)
3099 return -1;
3100 else if (len1 > len2)
3101 return 1;
3102 else
3103 return 0;
3107 Datum
3108 text_pattern_lt(PG_FUNCTION_ARGS)
3110 text *arg1 = PG_GETARG_TEXT_PP(0);
3111 text *arg2 = PG_GETARG_TEXT_PP(1);
3112 int result;
3114 result = internal_text_pattern_compare(arg1, arg2);
3116 PG_FREE_IF_COPY(arg1, 0);
3117 PG_FREE_IF_COPY(arg2, 1);
3119 PG_RETURN_BOOL(result < 0);
3123 Datum
3124 text_pattern_le(PG_FUNCTION_ARGS)
3126 text *arg1 = PG_GETARG_TEXT_PP(0);
3127 text *arg2 = PG_GETARG_TEXT_PP(1);
3128 int result;
3130 result = internal_text_pattern_compare(arg1, arg2);
3132 PG_FREE_IF_COPY(arg1, 0);
3133 PG_FREE_IF_COPY(arg2, 1);
3135 PG_RETURN_BOOL(result <= 0);
3139 Datum
3140 text_pattern_ge(PG_FUNCTION_ARGS)
3142 text *arg1 = PG_GETARG_TEXT_PP(0);
3143 text *arg2 = PG_GETARG_TEXT_PP(1);
3144 int result;
3146 result = internal_text_pattern_compare(arg1, arg2);
3148 PG_FREE_IF_COPY(arg1, 0);
3149 PG_FREE_IF_COPY(arg2, 1);
3151 PG_RETURN_BOOL(result >= 0);
3155 Datum
3156 text_pattern_gt(PG_FUNCTION_ARGS)
3158 text *arg1 = PG_GETARG_TEXT_PP(0);
3159 text *arg2 = PG_GETARG_TEXT_PP(1);
3160 int result;
3162 result = internal_text_pattern_compare(arg1, arg2);
3164 PG_FREE_IF_COPY(arg1, 0);
3165 PG_FREE_IF_COPY(arg2, 1);
3167 PG_RETURN_BOOL(result > 0);
3171 Datum
3172 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3174 text *arg1 = PG_GETARG_TEXT_PP(0);
3175 text *arg2 = PG_GETARG_TEXT_PP(1);
3176 int result;
3178 result = internal_text_pattern_compare(arg1, arg2);
3180 PG_FREE_IF_COPY(arg1, 0);
3181 PG_FREE_IF_COPY(arg2, 1);
3183 PG_RETURN_INT32(result);
3187 Datum
3188 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3190 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3191 MemoryContext oldcontext;
3193 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3195 /* Use generic string SortSupport, forcing "C" collation */
3196 varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3198 MemoryContextSwitchTo(oldcontext);
3200 PG_RETURN_VOID();
3204 /*-------------------------------------------------------------
3205 * byteaoctetlen
3207 * get the number of bytes contained in an instance of type 'bytea'
3208 *-------------------------------------------------------------
3210 Datum
3211 byteaoctetlen(PG_FUNCTION_ARGS)
3213 Datum str = PG_GETARG_DATUM(0);
3215 /* We need not detoast the input at all */
3216 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3220 * byteacat -
3221 * takes two bytea* and returns a bytea* that is the concatenation of
3222 * the two.
3224 * Cloned from textcat and modified as required.
3226 Datum
3227 byteacat(PG_FUNCTION_ARGS)
3229 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3230 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3232 PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3236 * bytea_catenate
3237 * Guts of byteacat(), broken out so it can be used by other functions
3239 * Arguments can be in short-header form, but not compressed or out-of-line
3241 static bytea *
3242 bytea_catenate(bytea *t1, bytea *t2)
3244 bytea *result;
3245 int len1,
3246 len2,
3247 len;
3248 char *ptr;
3250 len1 = VARSIZE_ANY_EXHDR(t1);
3251 len2 = VARSIZE_ANY_EXHDR(t2);
3253 /* paranoia ... probably should throw error instead? */
3254 if (len1 < 0)
3255 len1 = 0;
3256 if (len2 < 0)
3257 len2 = 0;
3259 len = len1 + len2 + VARHDRSZ;
3260 result = (bytea *) palloc(len);
3262 /* Set size of result string... */
3263 SET_VARSIZE(result, len);
3265 /* Fill data field of result string... */
3266 ptr = VARDATA(result);
3267 if (len1 > 0)
3268 memcpy(ptr, VARDATA_ANY(t1), len1);
3269 if (len2 > 0)
3270 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3272 return result;
/* Convert a C string to a bytea by running it through byteain(). */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3279 * bytea_substr()
3280 * Return a substring starting at the specified position.
3281 * Cloned from text_substr and modified as required.
3283 * Input:
3284 * - string
3285 * - starting position (is one-based)
3286 * - string length (optional)
3288 * If the starting position is zero or less, then return from the start of the string
3289 * adjusting the length to be consistent with the "negative start" per SQL.
3290 * If the length is less than zero, an ERROR is thrown. If no third argument
3291 * (length) is provided, the length to the end of the string is assumed.
3293 Datum
3294 bytea_substr(PG_FUNCTION_ARGS)
3296 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3297 PG_GETARG_INT32(1),
3298 PG_GETARG_INT32(2),
3299 false));
3303 * bytea_substr_no_len -
3304 * Wrapper to avoid opr_sanity failure due to
3305 * one function accepting a different number of args.
3307 Datum
3308 bytea_substr_no_len(PG_FUNCTION_ARGS)
3310 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3311 PG_GETARG_INT32(1),
3313 true));
3316 static bytea *
3317 bytea_substring(Datum str,
3318 int S,
3319 int L,
3320 bool length_not_specified)
3322 int32 S1; /* adjusted start position */
3323 int32 L1; /* adjusted substring length */
3324 int32 E; /* end position */
3327 * The logic here should generally match text_substring().
3329 S1 = Max(S, 1);
3331 if (length_not_specified)
3334 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3335 * end of the string if we pass it a negative value for length.
3337 L1 = -1;
3339 else if (L < 0)
3341 /* SQL99 says to throw an error for E < S, i.e., negative length */
3342 ereport(ERROR,
3343 (errcode(ERRCODE_SUBSTRING_ERROR),
3344 errmsg("negative substring length not allowed")));
3345 L1 = -1; /* silence stupider compilers */
3347 else if (pg_add_s32_overflow(S, L, &E))
3350 * L could be large enough for S + L to overflow, in which case the
3351 * substring must run to end of string.
3353 L1 = -1;
3355 else
3358 * A zero or negative value for the end position can happen if the
3359 * start was negative or one. SQL99 says to return a zero-length
3360 * string.
3362 if (E < 1)
3363 return PG_STR_GET_BYTEA("");
3365 L1 = E - S1;
3369 * If the start position is past the end of the string, SQL99 says to
3370 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3371 * us. We need only convert S1 to zero-based starting position.
3373 return DatumGetByteaPSlice(str, S1 - 1, L1);
3377 * byteaoverlay
3378 * Replace specified substring of first string with second
3380 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3381 * This code is a direct implementation of what the standard says.
3383 Datum
3384 byteaoverlay(PG_FUNCTION_ARGS)
3386 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3387 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3388 int sp = PG_GETARG_INT32(2); /* substring start position */
3389 int sl = PG_GETARG_INT32(3); /* substring length */
3391 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3394 Datum
3395 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3397 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3398 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3399 int sp = PG_GETARG_INT32(2); /* substring start position */
3400 int sl;
3402 sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3403 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3406 static bytea *
3407 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3409 bytea *result;
3410 bytea *s1;
3411 bytea *s2;
3412 int sp_pl_sl;
3415 * Check for possible integer-overflow cases. For negative sp, throw a
3416 * "substring length" error because that's what should be expected
3417 * according to the spec's definition of OVERLAY().
3419 if (sp <= 0)
3420 ereport(ERROR,
3421 (errcode(ERRCODE_SUBSTRING_ERROR),
3422 errmsg("negative substring length not allowed")));
3423 if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3424 ereport(ERROR,
3425 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3426 errmsg("integer out of range")));
3428 s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3429 s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3430 result = bytea_catenate(s1, t2);
3431 result = bytea_catenate(result, s2);
3433 return result;
3437 * bit_count
3439 Datum
3440 bytea_bit_count(PG_FUNCTION_ARGS)
3442 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3444 PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3448 * byteapos -
3449 * Return the position of the specified substring.
3450 * Implements the SQL POSITION() function.
3451 * Cloned from textpos and modified as required.
3453 Datum
3454 byteapos(PG_FUNCTION_ARGS)
3456 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3457 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3458 int pos;
3459 int px,
3461 int len1,
3462 len2;
3463 char *p1,
3464 *p2;
3466 len1 = VARSIZE_ANY_EXHDR(t1);
3467 len2 = VARSIZE_ANY_EXHDR(t2);
3469 if (len2 <= 0)
3470 PG_RETURN_INT32(1); /* result for empty pattern */
3472 p1 = VARDATA_ANY(t1);
3473 p2 = VARDATA_ANY(t2);
3475 pos = 0;
3476 px = (len1 - len2);
3477 for (p = 0; p <= px; p++)
3479 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3481 pos = p + 1;
3482 break;
3484 p1++;
3487 PG_RETURN_INT32(pos);
3490 /*-------------------------------------------------------------
3491 * byteaGetByte
3493 * this routine treats "bytea" as an array of bytes.
3494 * It returns the Nth byte (a number between 0 and 255).
3495 *-------------------------------------------------------------
3497 Datum
3498 byteaGetByte(PG_FUNCTION_ARGS)
3500 bytea *v = PG_GETARG_BYTEA_PP(0);
3501 int32 n = PG_GETARG_INT32(1);
3502 int len;
3503 int byte;
3505 len = VARSIZE_ANY_EXHDR(v);
3507 if (n < 0 || n >= len)
3508 ereport(ERROR,
3509 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3510 errmsg("index %d out of valid range, 0..%d",
3511 n, len - 1)));
3513 byte = ((unsigned char *) VARDATA_ANY(v))[n];
3515 PG_RETURN_INT32(byte);
3518 /*-------------------------------------------------------------
3519 * byteaGetBit
3521 * This routine treats a "bytea" type like an array of bits.
3522 * It returns the value of the Nth bit (0 or 1).
3524 *-------------------------------------------------------------
3526 Datum
3527 byteaGetBit(PG_FUNCTION_ARGS)
3529 bytea *v = PG_GETARG_BYTEA_PP(0);
3530 int64 n = PG_GETARG_INT64(1);
3531 int byteNo,
3532 bitNo;
3533 int len;
3534 int byte;
3536 len = VARSIZE_ANY_EXHDR(v);
3538 if (n < 0 || n >= (int64) len * 8)
3539 ereport(ERROR,
3540 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3541 errmsg("index %lld out of valid range, 0..%lld",
3542 (long long) n, (long long) len * 8 - 1)));
3544 /* n/8 is now known < len, so safe to cast to int */
3545 byteNo = (int) (n / 8);
3546 bitNo = (int) (n % 8);
3548 byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3550 if (byte & (1 << bitNo))
3551 PG_RETURN_INT32(1);
3552 else
3553 PG_RETURN_INT32(0);
3556 /*-------------------------------------------------------------
3557 * byteaSetByte
3559 * Given an instance of type 'bytea' creates a new one with
3560 * the Nth byte set to the given value.
3562 *-------------------------------------------------------------
3564 Datum
3565 byteaSetByte(PG_FUNCTION_ARGS)
3567 bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3568 int32 n = PG_GETARG_INT32(1);
3569 int32 newByte = PG_GETARG_INT32(2);
3570 int len;
3572 len = VARSIZE(res) - VARHDRSZ;
3574 if (n < 0 || n >= len)
3575 ereport(ERROR,
3576 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3577 errmsg("index %d out of valid range, 0..%d",
3578 n, len - 1)));
3581 * Now set the byte.
3583 ((unsigned char *) VARDATA(res))[n] = newByte;
3585 PG_RETURN_BYTEA_P(res);
3588 /*-------------------------------------------------------------
3589 * byteaSetBit
3591 * Given an instance of type 'bytea' creates a new one with
3592 * the Nth bit set to the given value.
3594 *-------------------------------------------------------------
3596 Datum
3597 byteaSetBit(PG_FUNCTION_ARGS)
3599 bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3600 int64 n = PG_GETARG_INT64(1);
3601 int32 newBit = PG_GETARG_INT32(2);
3602 int len;
3603 int oldByte,
3604 newByte;
3605 int byteNo,
3606 bitNo;
3608 len = VARSIZE(res) - VARHDRSZ;
3610 if (n < 0 || n >= (int64) len * 8)
3611 ereport(ERROR,
3612 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3613 errmsg("index %lld out of valid range, 0..%lld",
3614 (long long) n, (long long) len * 8 - 1)));
3616 /* n/8 is now known < len, so safe to cast to int */
3617 byteNo = (int) (n / 8);
3618 bitNo = (int) (n % 8);
3621 * sanity check!
3623 if (newBit != 0 && newBit != 1)
3624 ereport(ERROR,
3625 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3626 errmsg("new bit must be 0 or 1")));
3629 * Update the byte.
3631 oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3633 if (newBit == 0)
3634 newByte = oldByte & (~(1 << bitNo));
3635 else
3636 newByte = oldByte | (1 << bitNo);
3638 ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3640 PG_RETURN_BYTEA_P(res);
3644 /* text_name()
3645 * Converts a text type to a Name type.
3647 Datum
3648 text_name(PG_FUNCTION_ARGS)
3650 text *s = PG_GETARG_TEXT_PP(0);
3651 Name result;
3652 int len;
3654 len = VARSIZE_ANY_EXHDR(s);
3656 /* Truncate oversize input */
3657 if (len >= NAMEDATALEN)
3658 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3660 /* We use palloc0 here to ensure result is zero-padded */
3661 result = (Name) palloc0(NAMEDATALEN);
3662 memcpy(NameStr(*result), VARDATA_ANY(s), len);
3664 PG_RETURN_NAME(result);
3667 /* name_text()
3668 * Converts a Name type to a text type.
3670 Datum
3671 name_text(PG_FUNCTION_ARGS)
3673 Name s = PG_GETARG_NAME(0);
3675 PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3680 * textToQualifiedNameList - convert a text object to list of names
3682 * This implements the input parsing needed by nextval() and other
3683 * functions that take a text parameter representing a qualified name.
3684 * We split the name at dots, downcase if not double-quoted, and
3685 * truncate names if they're too long.
3687 List *
3688 textToQualifiedNameList(text *textval)
3690 char *rawname;
3691 List *result = NIL;
3692 List *namelist;
3693 ListCell *l;
3695 /* Convert to C string (handles possible detoasting). */
3696 /* Note we rely on being able to modify rawname below. */
3697 rawname = text_to_cstring(textval);
3699 if (!SplitIdentifierString(rawname, '.', &namelist))
3700 ereport(ERROR,
3701 (errcode(ERRCODE_INVALID_NAME),
3702 errmsg("invalid name syntax")));
3704 if (namelist == NIL)
3705 ereport(ERROR,
3706 (errcode(ERRCODE_INVALID_NAME),
3707 errmsg("invalid name syntax")));
3709 foreach(l, namelist)
3711 char *curname = (char *) lfirst(l);
3713 result = lappend(result, makeString(pstrdup(curname)));
3716 pfree(rawname);
3717 list_free(namelist);
3719 return result;
3723 * SplitIdentifierString --- parse a string containing identifiers
3725 * This is the guts of textToQualifiedNameList, and is exported for use in
3726 * other situations such as parsing GUC variables. In the GUC case, it's
3727 * important to avoid memory leaks, so the API is designed to minimize the
3728 * amount of stuff that needs to be allocated and freed.
3730 * Inputs:
3731 * rawstring: the input string; must be overwritable! On return, it's
3732 * been modified to contain the separated identifiers.
3733 * separator: the separator punctuation expected between identifiers
3734 * (typically '.' or ','). Whitespace may also appear around
3735 * identifiers.
3736 * Outputs:
3737 * namelist: filled with a palloc'd list of pointers to identifiers within
3738 * rawstring. Caller should list_free() this even on error return.
3740 * Returns true if okay, false if there is a syntax error in the string.
3742 * Note that an empty string is considered okay here, though not in
3743 * textToQualifiedNameList.
3745 bool
3746 SplitIdentifierString(char *rawstring, char separator,
3747 List **namelist)
3749 char *nextp = rawstring;
3750 bool done = false;
3752 *namelist = NIL;
3754 while (scanner_isspace(*nextp))
3755 nextp++; /* skip leading whitespace */
3757 if (*nextp == '\0')
3758 return true; /* allow empty string */
3760 /* At the top of the loop, we are at start of a new identifier. */
3763 char *curname;
3764 char *endp;
3766 if (*nextp == '"')
3768 /* Quoted name --- collapse quote-quote pairs, no downcasing */
3769 curname = nextp + 1;
3770 for (;;)
3772 endp = strchr(nextp + 1, '"');
3773 if (endp == NULL)
3774 return false; /* mismatched quotes */
3775 if (endp[1] != '"')
3776 break; /* found end of quoted name */
3777 /* Collapse adjacent quotes into one quote, and look again */
3778 memmove(endp, endp + 1, strlen(endp));
3779 nextp = endp;
3781 /* endp now points at the terminating quote */
3782 nextp = endp + 1;
3784 else
3786 /* Unquoted name --- extends to separator or whitespace */
3787 char *downname;
3788 int len;
3790 curname = nextp;
3791 while (*nextp && *nextp != separator &&
3792 !scanner_isspace(*nextp))
3793 nextp++;
3794 endp = nextp;
3795 if (curname == nextp)
3796 return false; /* empty unquoted name not allowed */
3799 * Downcase the identifier, using same code as main lexer does.
3801 * XXX because we want to overwrite the input in-place, we cannot
3802 * support a downcasing transformation that increases the string
3803 * length. This is not a problem given the current implementation
3804 * of downcase_truncate_identifier, but we'll probably have to do
3805 * something about this someday.
3807 len = endp - curname;
3808 downname = downcase_truncate_identifier(curname, len, false);
3809 Assert(strlen(downname) <= len);
3810 strncpy(curname, downname, len); /* strncpy is required here */
3811 pfree(downname);
3814 while (scanner_isspace(*nextp))
3815 nextp++; /* skip trailing whitespace */
3817 if (*nextp == separator)
3819 nextp++;
3820 while (scanner_isspace(*nextp))
3821 nextp++; /* skip leading whitespace for next */
3822 /* we expect another name, so done remains false */
3824 else if (*nextp == '\0')
3825 done = true;
3826 else
3827 return false; /* invalid syntax */
3829 /* Now safe to overwrite separator with a null */
3830 *endp = '\0';
3832 /* Truncate name if it's overlength */
3833 truncate_identifier(curname, strlen(curname), false);
3836 * Finished isolating current name --- add it to list
3838 *namelist = lappend(*namelist, curname);
3840 /* Loop back if we didn't reach end of string */
3841 } while (!done);
3843 return true;
3848 * SplitDirectoriesString --- parse a string containing file/directory names
3850 * This works fine on file names too; the function name is historical.
3852 * This is similar to SplitIdentifierString, except that the parsing
3853 * rules are meant to handle pathnames instead of identifiers: there is
3854 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3855 * and we apply canonicalize_path() to each extracted string. Because of the
3856 * last, the returned strings are separately palloc'd rather than being
3857 * pointers into rawstring --- but we still scribble on rawstring.
3859 * Inputs:
3860 * rawstring: the input string; must be modifiable!
3861 * separator: the separator punctuation expected between directories
3862 * (typically ',' or ';'). Whitespace may also appear around
3863 * directories.
3864 * Outputs:
3865 * namelist: filled with a palloc'd list of directory names.
3866 * Caller should list_free_deep() this even on error return.
3868 * Returns true if okay, false if there is a syntax error in the string.
3870 * Note that an empty string is considered okay here.
3872 bool
3873 SplitDirectoriesString(char *rawstring, char separator,
3874 List **namelist)
3876 char *nextp = rawstring;
3877 bool done = false;
3879 *namelist = NIL;
3881 while (scanner_isspace(*nextp))
3882 nextp++; /* skip leading whitespace */
3884 if (*nextp == '\0')
3885 return true; /* allow empty string */
3887 /* At the top of the loop, we are at start of a new directory. */
3890 char *curname;
3891 char *endp;
3893 if (*nextp == '"')
3895 /* Quoted name --- collapse quote-quote pairs */
3896 curname = nextp + 1;
3897 for (;;)
3899 endp = strchr(nextp + 1, '"');
3900 if (endp == NULL)
3901 return false; /* mismatched quotes */
3902 if (endp[1] != '"')
3903 break; /* found end of quoted name */
3904 /* Collapse adjacent quotes into one quote, and look again */
3905 memmove(endp, endp + 1, strlen(endp));
3906 nextp = endp;
3908 /* endp now points at the terminating quote */
3909 nextp = endp + 1;
3911 else
3913 /* Unquoted name --- extends to separator or end of string */
3914 curname = endp = nextp;
3915 while (*nextp && *nextp != separator)
3917 /* trailing whitespace should not be included in name */
3918 if (!scanner_isspace(*nextp))
3919 endp = nextp + 1;
3920 nextp++;
3922 if (curname == endp)
3923 return false; /* empty unquoted name not allowed */
3926 while (scanner_isspace(*nextp))
3927 nextp++; /* skip trailing whitespace */
3929 if (*nextp == separator)
3931 nextp++;
3932 while (scanner_isspace(*nextp))
3933 nextp++; /* skip leading whitespace for next */
3934 /* we expect another name, so done remains false */
3936 else if (*nextp == '\0')
3937 done = true;
3938 else
3939 return false; /* invalid syntax */
3941 /* Now safe to overwrite separator with a null */
3942 *endp = '\0';
3944 /* Truncate path if it's overlength */
3945 if (strlen(curname) >= MAXPGPATH)
3946 curname[MAXPGPATH - 1] = '\0';
3949 * Finished isolating current name --- add it to list
3951 curname = pstrdup(curname);
3952 canonicalize_path(curname);
3953 *namelist = lappend(*namelist, curname);
3955 /* Loop back if we didn't reach end of string */
3956 } while (!done);
3958 return true;
3963 * SplitGUCList --- parse a string containing identifiers or file names
3965 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3966 * presuming whether the elements will be taken as identifiers or file names.
3967 * We assume the input has already been through flatten_set_variable_args(),
3968 * so that we need never downcase (if appropriate, that was done already).
3969 * Nor do we ever truncate, since we don't know the correct max length.
3970 * We disallow embedded whitespace for simplicity (it shouldn't matter,
3971 * because any embedded whitespace should have led to double-quoting).
3972 * Otherwise the API is identical to SplitIdentifierString.
3974 * XXX it's annoying to have so many copies of this string-splitting logic.
3975 * However, it's not clear that having one function with a bunch of option
3976 * flags would be much better.
3978 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3979 * Be sure to update that if you have to change this.
3981 * Inputs:
3982 * rawstring: the input string; must be overwritable! On return, it's
3983 * been modified to contain the separated identifiers.
3984 * separator: the separator punctuation expected between identifiers
3985 * (typically '.' or ','). Whitespace may also appear around
3986 * identifiers.
3987 * Outputs:
3988 * namelist: filled with a palloc'd list of pointers to identifiers within
3989 * rawstring. Caller should list_free() this even on error return.
3991 * Returns true if okay, false if there is a syntax error in the string.
3993 bool
3994 SplitGUCList(char *rawstring, char separator,
3995 List **namelist)
3997 char *nextp = rawstring;
3998 bool done = false;
4000 *namelist = NIL;
4002 while (scanner_isspace(*nextp))
4003 nextp++; /* skip leading whitespace */
4005 if (*nextp == '\0')
4006 return true; /* allow empty string */
4008 /* At the top of the loop, we are at start of a new identifier. */
4011 char *curname;
4012 char *endp;
4014 if (*nextp == '"')
4016 /* Quoted name --- collapse quote-quote pairs */
4017 curname = nextp + 1;
4018 for (;;)
4020 endp = strchr(nextp + 1, '"');
4021 if (endp == NULL)
4022 return false; /* mismatched quotes */
4023 if (endp[1] != '"')
4024 break; /* found end of quoted name */
4025 /* Collapse adjacent quotes into one quote, and look again */
4026 memmove(endp, endp + 1, strlen(endp));
4027 nextp = endp;
4029 /* endp now points at the terminating quote */
4030 nextp = endp + 1;
4032 else
4034 /* Unquoted name --- extends to separator or whitespace */
4035 curname = nextp;
4036 while (*nextp && *nextp != separator &&
4037 !scanner_isspace(*nextp))
4038 nextp++;
4039 endp = nextp;
4040 if (curname == nextp)
4041 return false; /* empty unquoted name not allowed */
4044 while (scanner_isspace(*nextp))
4045 nextp++; /* skip trailing whitespace */
4047 if (*nextp == separator)
4049 nextp++;
4050 while (scanner_isspace(*nextp))
4051 nextp++; /* skip leading whitespace for next */
4052 /* we expect another name, so done remains false */
4054 else if (*nextp == '\0')
4055 done = true;
4056 else
4057 return false; /* invalid syntax */
4059 /* Now safe to overwrite separator with a null */
4060 *endp = '\0';
4063 * Finished isolating current name --- add it to list
4065 *namelist = lappend(*namelist, curname);
4067 /* Loop back if we didn't reach end of string */
4068 } while (!done);
4070 return true;
4074 /*****************************************************************************
4075 * Comparison Functions used for bytea
4077 * Note: btree indexes need these routines not to leak memory; therefore,
4078 * be careful to free working copies of toasted datums. Most places don't
4079 * need to be so careful.
4080 *****************************************************************************/
4082 Datum
4083 byteaeq(PG_FUNCTION_ARGS)
4085 Datum arg1 = PG_GETARG_DATUM(0);
4086 Datum arg2 = PG_GETARG_DATUM(1);
4087 bool result;
4088 Size len1,
4089 len2;
4092 * We can use a fast path for unequal lengths, which might save us from
4093 * having to detoast one or both values.
4095 len1 = toast_raw_datum_size(arg1);
4096 len2 = toast_raw_datum_size(arg2);
4097 if (len1 != len2)
4098 result = false;
4099 else
4101 bytea *barg1 = DatumGetByteaPP(arg1);
4102 bytea *barg2 = DatumGetByteaPP(arg2);
4104 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4105 len1 - VARHDRSZ) == 0);
4107 PG_FREE_IF_COPY(barg1, 0);
4108 PG_FREE_IF_COPY(barg2, 1);
4111 PG_RETURN_BOOL(result);
4114 Datum
4115 byteane(PG_FUNCTION_ARGS)
4117 Datum arg1 = PG_GETARG_DATUM(0);
4118 Datum arg2 = PG_GETARG_DATUM(1);
4119 bool result;
4120 Size len1,
4121 len2;
4124 * We can use a fast path for unequal lengths, which might save us from
4125 * having to detoast one or both values.
4127 len1 = toast_raw_datum_size(arg1);
4128 len2 = toast_raw_datum_size(arg2);
4129 if (len1 != len2)
4130 result = true;
4131 else
4133 bytea *barg1 = DatumGetByteaPP(arg1);
4134 bytea *barg2 = DatumGetByteaPP(arg2);
4136 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4137 len1 - VARHDRSZ) != 0);
4139 PG_FREE_IF_COPY(barg1, 0);
4140 PG_FREE_IF_COPY(barg2, 1);
4143 PG_RETURN_BOOL(result);
4146 Datum
4147 bytealt(PG_FUNCTION_ARGS)
4149 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4150 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4151 int len1,
4152 len2;
4153 int cmp;
4155 len1 = VARSIZE_ANY_EXHDR(arg1);
4156 len2 = VARSIZE_ANY_EXHDR(arg2);
4158 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4160 PG_FREE_IF_COPY(arg1, 0);
4161 PG_FREE_IF_COPY(arg2, 1);
4163 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4166 Datum
4167 byteale(PG_FUNCTION_ARGS)
4169 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4170 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4171 int len1,
4172 len2;
4173 int cmp;
4175 len1 = VARSIZE_ANY_EXHDR(arg1);
4176 len2 = VARSIZE_ANY_EXHDR(arg2);
4178 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4180 PG_FREE_IF_COPY(arg1, 0);
4181 PG_FREE_IF_COPY(arg2, 1);
4183 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4186 Datum
4187 byteagt(PG_FUNCTION_ARGS)
4189 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4190 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4191 int len1,
4192 len2;
4193 int cmp;
4195 len1 = VARSIZE_ANY_EXHDR(arg1);
4196 len2 = VARSIZE_ANY_EXHDR(arg2);
4198 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4200 PG_FREE_IF_COPY(arg1, 0);
4201 PG_FREE_IF_COPY(arg2, 1);
4203 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4206 Datum
4207 byteage(PG_FUNCTION_ARGS)
4209 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4210 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4211 int len1,
4212 len2;
4213 int cmp;
4215 len1 = VARSIZE_ANY_EXHDR(arg1);
4216 len2 = VARSIZE_ANY_EXHDR(arg2);
4218 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4220 PG_FREE_IF_COPY(arg1, 0);
4221 PG_FREE_IF_COPY(arg2, 1);
4223 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4226 Datum
4227 byteacmp(PG_FUNCTION_ARGS)
4229 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4230 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4231 int len1,
4232 len2;
4233 int cmp;
4235 len1 = VARSIZE_ANY_EXHDR(arg1);
4236 len2 = VARSIZE_ANY_EXHDR(arg2);
4238 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4239 if ((cmp == 0) && (len1 != len2))
4240 cmp = (len1 < len2) ? -1 : 1;
4242 PG_FREE_IF_COPY(arg1, 0);
4243 PG_FREE_IF_COPY(arg2, 1);
4245 PG_RETURN_INT32(cmp);
4248 Datum
4249 bytea_sortsupport(PG_FUNCTION_ARGS)
4251 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4252 MemoryContext oldcontext;
4254 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4256 /* Use generic string SortSupport, forcing "C" collation */
4257 varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4259 MemoryContextSwitchTo(oldcontext);
4261 PG_RETURN_VOID();
4265 * appendStringInfoText
4267 * Append a text to str.
4268 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4270 static void
4271 appendStringInfoText(StringInfo str, const text *t)
4273 appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4277 * replace_text
4278 * replace all occurrences of 'old_sub_str' in 'orig_str'
4279 * with 'new_sub_str' to form 'new_str'
4281 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4282 * otherwise returns 'new_str'
4284 Datum
4285 replace_text(PG_FUNCTION_ARGS)
4287 text *src_text = PG_GETARG_TEXT_PP(0);
4288 text *from_sub_text = PG_GETARG_TEXT_PP(1);
4289 text *to_sub_text = PG_GETARG_TEXT_PP(2);
4290 int src_text_len;
4291 int from_sub_text_len;
4292 TextPositionState state;
4293 text *ret_text;
4294 int chunk_len;
4295 char *curr_ptr;
4296 char *start_ptr;
4297 StringInfoData str;
4298 bool found;
4300 src_text_len = VARSIZE_ANY_EXHDR(src_text);
4301 from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4303 /* Return unmodified source string if empty source or pattern */
4304 if (src_text_len < 1 || from_sub_text_len < 1)
4306 PG_RETURN_TEXT_P(src_text);
4309 text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4311 found = text_position_next(&state);
4313 /* When the from_sub_text is not found, there is nothing to do. */
4314 if (!found)
4316 text_position_cleanup(&state);
4317 PG_RETURN_TEXT_P(src_text);
4319 curr_ptr = text_position_get_match_ptr(&state);
4320 start_ptr = VARDATA_ANY(src_text);
4322 initStringInfo(&str);
4326 CHECK_FOR_INTERRUPTS();
4328 /* copy the data skipped over by last text_position_next() */
4329 chunk_len = curr_ptr - start_ptr;
4330 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4332 appendStringInfoText(&str, to_sub_text);
4334 start_ptr = curr_ptr + from_sub_text_len;
4336 found = text_position_next(&state);
4337 if (found)
4338 curr_ptr = text_position_get_match_ptr(&state);
4340 while (found);
4342 /* copy trailing data */
4343 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4344 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4346 text_position_cleanup(&state);
4348 ret_text = cstring_to_text_with_len(str.data, str.len);
4349 pfree(str.data);
4351 PG_RETURN_TEXT_P(ret_text);
4355 * check_replace_text_has_escape
4357 * Returns 0 if text contains no backslashes that need processing.
4358 * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4359 * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4361 static int
4362 check_replace_text_has_escape(const text *replace_text)
4364 int result = 0;
4365 const char *p = VARDATA_ANY(replace_text);
4366 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4368 while (p < p_end)
4370 /* Find next escape char, if any. */
4371 p = memchr(p, '\\', p_end - p);
4372 if (p == NULL)
4373 break;
4374 p++;
4375 /* Note: a backslash at the end doesn't require extra processing. */
4376 if (p < p_end)
4378 if (*p >= '1' && *p <= '9')
4379 return 2; /* Found a submatch specifier, so done */
4380 result = 1; /* Found some other sequence, keep looking */
4381 p++;
4384 return result;
4388 * appendStringInfoRegexpSubstr
4390 * Append replace_text to str, substituting regexp back references for
4391 * \n escapes. start_ptr is the start of the match in the source string,
4392 * at logical character position data_pos.
4394 static void
4395 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4396 regmatch_t *pmatch,
4397 char *start_ptr, int data_pos)
4399 const char *p = VARDATA_ANY(replace_text);
4400 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4402 while (p < p_end)
4404 const char *chunk_start = p;
4405 int so;
4406 int eo;
4408 /* Find next escape char, if any. */
4409 p = memchr(p, '\\', p_end - p);
4410 if (p == NULL)
4411 p = p_end;
4413 /* Copy the text we just scanned over, if any. */
4414 if (p > chunk_start)
4415 appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4417 /* Done if at end of string, else advance over escape char. */
4418 if (p >= p_end)
4419 break;
4420 p++;
4422 if (p >= p_end)
4424 /* Escape at very end of input. Treat same as unexpected char */
4425 appendStringInfoChar(str, '\\');
4426 break;
4429 if (*p >= '1' && *p <= '9')
4431 /* Use the back reference of regexp. */
4432 int idx = *p - '0';
4434 so = pmatch[idx].rm_so;
4435 eo = pmatch[idx].rm_eo;
4436 p++;
4438 else if (*p == '&')
4440 /* Use the entire matched string. */
4441 so = pmatch[0].rm_so;
4442 eo = pmatch[0].rm_eo;
4443 p++;
4445 else if (*p == '\\')
4447 /* \\ means transfer one \ to output. */
4448 appendStringInfoChar(str, '\\');
4449 p++;
4450 continue;
4452 else
4455 * If escape char is not followed by any expected char, just treat
4456 * it as ordinary data to copy. (XXX would it be better to throw
4457 * an error?)
4459 appendStringInfoChar(str, '\\');
4460 continue;
4463 if (so >= 0 && eo >= 0)
4466 * Copy the text that is back reference of regexp. Note so and eo
4467 * are counted in characters not bytes.
4469 char *chunk_start;
4470 int chunk_len;
4472 Assert(so >= data_pos);
4473 chunk_start = start_ptr;
4474 chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4475 chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4476 appendBinaryStringInfo(str, chunk_start, chunk_len);
4482 * replace_text_regexp
4484 * replace substring(s) in src_text that match pattern with replace_text.
4485 * The replace_text can contain backslash markers to substitute
4486 * (parts of) the matched text.
4488 * cflags: regexp compile flags.
4489 * collation: collation to use.
4490 * search_start: the character (not byte) offset in src_text at which to
4491 * begin searching.
4492 * n: if 0, replace all matches; if > 0, replace only the N'th match.
4494 text *
4495 replace_text_regexp(text *src_text, text *pattern_text,
4496 text *replace_text,
4497 int cflags, Oid collation,
4498 int search_start, int n)
4500 text *ret_text;
4501 regex_t *re;
4502 int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4503 int nmatches = 0;
4504 StringInfoData buf;
4505 regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4506 int nmatch = lengthof(pmatch);
4507 pg_wchar *data;
4508 size_t data_len;
4509 int data_pos;
4510 char *start_ptr;
4511 int escape_status;
4513 initStringInfo(&buf);
4515 /* Convert data string to wide characters. */
4516 data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4517 data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4519 /* Check whether replace_text has escapes, especially regexp submatches. */
4520 escape_status = check_replace_text_has_escape(replace_text);
4522 /* If no regexp submatches, we can use REG_NOSUB. */
4523 if (escape_status < 2)
4525 cflags |= REG_NOSUB;
4526 /* Also tell pg_regexec we only want the whole-match location. */
4527 nmatch = 1;
4530 /* Prepare the regexp. */
4531 re = RE_compile_and_cache(pattern_text, cflags, collation);
4533 /* start_ptr points to the data_pos'th character of src_text */
4534 start_ptr = (char *) VARDATA_ANY(src_text);
4535 data_pos = 0;
4537 while (search_start <= data_len)
4539 int regexec_result;
4541 CHECK_FOR_INTERRUPTS();
4543 regexec_result = pg_regexec(re,
4544 data,
4545 data_len,
4546 search_start,
4547 NULL, /* no details */
4548 nmatch,
4549 pmatch,
4552 if (regexec_result == REG_NOMATCH)
4553 break;
4555 if (regexec_result != REG_OKAY)
4557 char errMsg[100];
4559 CHECK_FOR_INTERRUPTS();
4560 pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4561 ereport(ERROR,
4562 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4563 errmsg("regular expression failed: %s", errMsg)));
4567 * Count matches, and decide whether to replace this match.
4569 nmatches++;
4570 if (n > 0 && nmatches != n)
4573 * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4574 * we treat the matched text as if it weren't matched, and copy it
4575 * to the output later.)
4577 search_start = pmatch[0].rm_eo;
4578 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4579 search_start++;
4580 continue;
4584 * Copy the text to the left of the match position. Note we are given
4585 * character not byte indexes.
4587 if (pmatch[0].rm_so - data_pos > 0)
4589 int chunk_len;
4591 chunk_len = charlen_to_bytelen(start_ptr,
4592 pmatch[0].rm_so - data_pos);
4593 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4596 * Advance start_ptr over that text, to avoid multiple rescans of
4597 * it if the replace_text contains multiple back-references.
4599 start_ptr += chunk_len;
4600 data_pos = pmatch[0].rm_so;
4604 * Copy the replace_text, processing escapes if any are present.
4606 if (escape_status > 0)
4607 appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4608 start_ptr, data_pos);
4609 else
4610 appendStringInfoText(&buf, replace_text);
4612 /* Advance start_ptr and data_pos over the matched text. */
4613 start_ptr += charlen_to_bytelen(start_ptr,
4614 pmatch[0].rm_eo - data_pos);
4615 data_pos = pmatch[0].rm_eo;
4618 * If we only want to replace one occurrence, we're done.
4620 if (n > 0)
4621 break;
4624 * Advance search position. Normally we start the next search at the
4625 * end of the previous match; but if the match was of zero length, we
4626 * have to advance by one character, or we'd just find the same match
4627 * again.
4629 search_start = data_pos;
4630 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4631 search_start++;
4635 * Copy the text to the right of the last match.
4637 if (data_pos < data_len)
4639 int chunk_len;
4641 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4642 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4645 ret_text = cstring_to_text_with_len(buf.data, buf.len);
4646 pfree(buf.data);
4647 pfree(data);
4649 return ret_text;
4653 * split_part
4654 * parse input string based on provided field separator
4655 * return N'th item (1 based, negative counts from end)
4657 Datum
4658 split_part(PG_FUNCTION_ARGS)
4660 text *inputstring = PG_GETARG_TEXT_PP(0);
4661 text *fldsep = PG_GETARG_TEXT_PP(1);
4662 int fldnum = PG_GETARG_INT32(2);
4663 int inputstring_len;
4664 int fldsep_len;
4665 TextPositionState state;
4666 char *start_ptr;
4667 char *end_ptr;
4668 text *result_text;
4669 bool found;
4671 /* field number is 1 based */
4672 if (fldnum == 0)
4673 ereport(ERROR,
4674 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4675 errmsg("field position must not be zero")));
4677 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4678 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4680 /* return empty string for empty input string */
4681 if (inputstring_len < 1)
4682 PG_RETURN_TEXT_P(cstring_to_text(""));
4684 /* handle empty field separator */
4685 if (fldsep_len < 1)
4687 /* if first or last field, return input string, else empty string */
4688 if (fldnum == 1 || fldnum == -1)
4689 PG_RETURN_TEXT_P(inputstring);
4690 else
4691 PG_RETURN_TEXT_P(cstring_to_text(""));
4694 /* find the first field separator */
4695 text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4697 found = text_position_next(&state);
4699 /* special case if fldsep not found at all */
4700 if (!found)
4702 text_position_cleanup(&state);
4703 /* if first or last field, return input string, else empty string */
4704 if (fldnum == 1 || fldnum == -1)
4705 PG_RETURN_TEXT_P(inputstring);
4706 else
4707 PG_RETURN_TEXT_P(cstring_to_text(""));
4711 * take care of a negative field number (i.e. count from the right) by
4712 * converting to a positive field number; we need total number of fields
4714 if (fldnum < 0)
4716 /* we found a fldsep, so there are at least two fields */
4717 int numfields = 2;
4719 while (text_position_next(&state))
4720 numfields++;
4722 /* special case of last field does not require an extra pass */
4723 if (fldnum == -1)
4725 start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4726 end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4727 text_position_cleanup(&state);
4728 PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4729 end_ptr - start_ptr));
4732 /* else, convert fldnum to positive notation */
4733 fldnum += numfields + 1;
4735 /* if nonexistent field, return empty string */
4736 if (fldnum <= 0)
4738 text_position_cleanup(&state);
4739 PG_RETURN_TEXT_P(cstring_to_text(""));
4742 /* reset to pointing at first match, but now with positive fldnum */
4743 text_position_reset(&state);
4744 found = text_position_next(&state);
4745 Assert(found);
4748 /* identify bounds of first field */
4749 start_ptr = VARDATA_ANY(inputstring);
4750 end_ptr = text_position_get_match_ptr(&state);
4752 while (found && --fldnum > 0)
4754 /* identify bounds of next field */
4755 start_ptr = end_ptr + fldsep_len;
4756 found = text_position_next(&state);
4757 if (found)
4758 end_ptr = text_position_get_match_ptr(&state);
4761 text_position_cleanup(&state);
4763 if (fldnum > 0)
4765 /* N'th field separator not found */
4766 /* if last field requested, return it, else empty string */
4767 if (fldnum == 1)
4769 int last_len = start_ptr - VARDATA_ANY(inputstring);
4771 result_text = cstring_to_text_with_len(start_ptr,
4772 inputstring_len - last_len);
4774 else
4775 result_text = cstring_to_text("");
4777 else
4779 /* non-last field requested */
4780 result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4783 PG_RETURN_TEXT_P(result_text);
4787 * Convenience function to return true when two text params are equal.
4789 static bool
4790 text_isequal(text *txt1, text *txt2, Oid collid)
4792 return DatumGetBool(DirectFunctionCall2Coll(texteq,
4793 collid,
4794 PointerGetDatum(txt1),
4795 PointerGetDatum(txt2)));
4799 * text_to_array
4800 * parse input string and return text array of elements,
4801 * based on provided field separator
4803 Datum
4804 text_to_array(PG_FUNCTION_ARGS)
4806 SplitTextOutputData tstate;
4808 /* For array output, tstate should start as all zeroes */
4809 memset(&tstate, 0, sizeof(tstate));
4811 if (!split_text(fcinfo, &tstate))
4812 PG_RETURN_NULL();
4814 if (tstate.astate == NULL)
4815 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4817 PG_RETURN_ARRAYTYPE_P(makeArrayResult(tstate.astate,
4818 CurrentMemoryContext));
4822 * text_to_array_null
4823 * parse input string and return text array of elements,
4824 * based on provided field separator and null string
4826 * This is a separate entry point only to prevent the regression tests from
4827 * complaining about different argument sets for the same internal function.
4829 Datum
4830 text_to_array_null(PG_FUNCTION_ARGS)
4832 return text_to_array(fcinfo);
4836 * text_to_table
4837 * parse input string and return table of elements,
4838 * based on provided field separator
4840 Datum
4841 text_to_table(PG_FUNCTION_ARGS)
4843 ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4844 SplitTextOutputData tstate;
4845 MemoryContext old_cxt;
4847 /* check to see if caller supports us returning a tuplestore */
4848 if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
4849 ereport(ERROR,
4850 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4851 errmsg("set-valued function called in context that cannot accept a set")));
4852 if (!(rsi->allowedModes & SFRM_Materialize))
4853 ereport(ERROR,
4854 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4855 errmsg("materialize mode required, but it is not allowed in this context")));
4857 /* OK, prepare tuplestore in per-query memory */
4858 old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory);
4860 tstate.astate = NULL;
4861 tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
4862 tstate.tupstore = tuplestore_begin_heap(true, false, work_mem);
4864 MemoryContextSwitchTo(old_cxt);
4866 (void) split_text(fcinfo, &tstate);
4868 tuplestore_donestoring(tstate.tupstore);
4870 rsi->returnMode = SFRM_Materialize;
4871 rsi->setResult = tstate.tupstore;
4872 rsi->setDesc = tstate.tupdesc;
4874 return (Datum) 0;
4878 * text_to_table_null
4879 * parse input string and return table of elements,
4880 * based on provided field separator and null string
4882 * This is a separate entry point only to prevent the regression tests from
4883 * complaining about different argument sets for the same internal function.
4885 Datum
4886 text_to_table_null(PG_FUNCTION_ARGS)
4888 return text_to_table(fcinfo);
4892 * Common code for text_to_array, text_to_array_null, text_to_table
4893 * and text_to_table_null functions.
4895 * These are not strict so we have to test for null inputs explicitly.
4896 * Returns false if result is to be null, else returns true.
4898 * Note that if the result is valid but empty (zero elements), we return
4899 * without changing *tstate --- caller must handle that case, too.
4901 static bool
4902 split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4904 text *inputstring;
4905 text *fldsep;
4906 text *null_string;
4907 Oid collation = PG_GET_COLLATION();
4908 int inputstring_len;
4909 int fldsep_len;
4910 char *start_ptr;
4911 text *result_text;
4913 /* when input string is NULL, then result is NULL too */
4914 if (PG_ARGISNULL(0))
4915 return false;
4917 inputstring = PG_GETARG_TEXT_PP(0);
4919 /* fldsep can be NULL */
4920 if (!PG_ARGISNULL(1))
4921 fldsep = PG_GETARG_TEXT_PP(1);
4922 else
4923 fldsep = NULL;
4925 /* null_string can be NULL or omitted */
4926 if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4927 null_string = PG_GETARG_TEXT_PP(2);
4928 else
4929 null_string = NULL;
4931 if (fldsep != NULL)
4934 * Normal case with non-null fldsep. Use the text_position machinery
4935 * to search for occurrences of fldsep.
4937 TextPositionState state;
4939 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4940 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4942 /* return empty set for empty input string */
4943 if (inputstring_len < 1)
4944 return true;
4946 /* empty field separator: return input string as a one-element set */
4947 if (fldsep_len < 1)
4949 split_text_accum_result(tstate, inputstring,
4950 null_string, collation);
4951 return true;
4954 text_position_setup(inputstring, fldsep, collation, &state);
4956 start_ptr = VARDATA_ANY(inputstring);
4958 for (;;)
4960 bool found;
4961 char *end_ptr;
4962 int chunk_len;
4964 CHECK_FOR_INTERRUPTS();
4966 found = text_position_next(&state);
4967 if (!found)
4969 /* fetch last field */
4970 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4971 end_ptr = NULL; /* not used, but some compilers complain */
4973 else
4975 /* fetch non-last field */
4976 end_ptr = text_position_get_match_ptr(&state);
4977 chunk_len = end_ptr - start_ptr;
4980 /* build a temp text datum to pass to split_text_accum_result */
4981 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4983 /* stash away this field */
4984 split_text_accum_result(tstate, result_text,
4985 null_string, collation);
4987 pfree(result_text);
4989 if (!found)
4990 break;
4992 start_ptr = end_ptr + fldsep_len;
4995 text_position_cleanup(&state);
4997 else
5000 * When fldsep is NULL, each character in the input string becomes a
5001 * separate element in the result set. The separator is effectively
5002 * the space between characters.
5004 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
5006 start_ptr = VARDATA_ANY(inputstring);
5008 while (inputstring_len > 0)
5010 int chunk_len = pg_mblen(start_ptr);
5012 CHECK_FOR_INTERRUPTS();
5014 /* build a temp text datum to pass to split_text_accum_result */
5015 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
5017 /* stash away this field */
5018 split_text_accum_result(tstate, result_text,
5019 null_string, collation);
5021 pfree(result_text);
5023 start_ptr += chunk_len;
5024 inputstring_len -= chunk_len;
5028 return true;
5032 * Add text item to result set (table or array).
5034 * This is also responsible for checking to see if the item matches
5035 * the null_string, in which case we should emit NULL instead.
5037 static void
5038 split_text_accum_result(SplitTextOutputData *tstate,
5039 text *field_value,
5040 text *null_string,
5041 Oid collation)
5043 bool is_null = false;
5045 if (null_string && text_isequal(field_value, null_string, collation))
5046 is_null = true;
5048 if (tstate->tupstore)
5050 Datum values[1];
5051 bool nulls[1];
5053 values[0] = PointerGetDatum(field_value);
5054 nulls[0] = is_null;
5056 tuplestore_putvalues(tstate->tupstore,
5057 tstate->tupdesc,
5058 values,
5059 nulls);
5061 else
5063 tstate->astate = accumArrayResult(tstate->astate,
5064 PointerGetDatum(field_value),
5065 is_null,
5066 TEXTOID,
5067 CurrentMemoryContext);
5072 * array_to_text
5073 * concatenate Cstring representation of input array elements
5074 * using provided field separator
5076 Datum
5077 array_to_text(PG_FUNCTION_ARGS)
5079 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
5080 char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5082 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5086 * array_to_text_null
5087 * concatenate Cstring representation of input array elements
5088 * using provided field separator and null string
5090 * This version is not strict so we have to test for null inputs explicitly.
5092 Datum
5093 array_to_text_null(PG_FUNCTION_ARGS)
5095 ArrayType *v;
5096 char *fldsep;
5097 char *null_string;
5099 /* returns NULL when first or second parameter is NULL */
5100 if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5101 PG_RETURN_NULL();
5103 v = PG_GETARG_ARRAYTYPE_P(0);
5104 fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5106 /* NULL null string is passed through as a null pointer */
5107 if (!PG_ARGISNULL(2))
5108 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5109 else
5110 null_string = NULL;
5112 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5116 * common code for array_to_text and array_to_text_null functions
5118 static text *
5119 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
5120 const char *fldsep, const char *null_string)
5122 text *result;
5123 int nitems,
5124 *dims,
5125 ndims;
5126 Oid element_type;
5127 int typlen;
5128 bool typbyval;
5129 char typalign;
5130 StringInfoData buf;
5131 bool printed = false;
5132 char *p;
5133 bits8 *bitmap;
5134 int bitmask;
5135 int i;
5136 ArrayMetaState *my_extra;
5138 ndims = ARR_NDIM(v);
5139 dims = ARR_DIMS(v);
5140 nitems = ArrayGetNItems(ndims, dims);
5142 /* if there are no elements, return an empty string */
5143 if (nitems == 0)
5144 return cstring_to_text_with_len("", 0);
5146 element_type = ARR_ELEMTYPE(v);
5147 initStringInfo(&buf);
5150 * We arrange to look up info about element type, including its output
5151 * conversion proc, only once per series of calls, assuming the element
5152 * type doesn't change underneath us.
5154 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5155 if (my_extra == NULL)
5157 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5158 sizeof(ArrayMetaState));
5159 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5160 my_extra->element_type = ~element_type;
5163 if (my_extra->element_type != element_type)
5166 * Get info about element type, including its output conversion proc
5168 get_type_io_data(element_type, IOFunc_output,
5169 &my_extra->typlen, &my_extra->typbyval,
5170 &my_extra->typalign, &my_extra->typdelim,
5171 &my_extra->typioparam, &my_extra->typiofunc);
5172 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5173 fcinfo->flinfo->fn_mcxt);
5174 my_extra->element_type = element_type;
5176 typlen = my_extra->typlen;
5177 typbyval = my_extra->typbyval;
5178 typalign = my_extra->typalign;
5180 p = ARR_DATA_PTR(v);
5181 bitmap = ARR_NULLBITMAP(v);
5182 bitmask = 1;
5184 for (i = 0; i < nitems; i++)
5186 Datum itemvalue;
5187 char *value;
5189 /* Get source element, checking for NULL */
5190 if (bitmap && (*bitmap & bitmask) == 0)
5192 /* if null_string is NULL, we just ignore null elements */
5193 if (null_string != NULL)
5195 if (printed)
5196 appendStringInfo(&buf, "%s%s", fldsep, null_string);
5197 else
5198 appendStringInfoString(&buf, null_string);
5199 printed = true;
5202 else
5204 itemvalue = fetch_att(p, typbyval, typlen);
5206 value = OutputFunctionCall(&my_extra->proc, itemvalue);
5208 if (printed)
5209 appendStringInfo(&buf, "%s%s", fldsep, value);
5210 else
5211 appendStringInfoString(&buf, value);
5212 printed = true;
5214 p = att_addlength_pointer(p, typlen, p);
5215 p = (char *) att_align_nominal(p, typalign);
5218 /* advance bitmap pointer if any */
5219 if (bitmap)
5221 bitmask <<= 1;
5222 if (bitmask == 0x100)
5224 bitmap++;
5225 bitmask = 1;
5230 result = cstring_to_text_with_len(buf.data, buf.len);
5231 pfree(buf.data);
5233 return result;
5236 #define HEXBASE 16
5238 * Convert an int32 to a string containing a base 16 (hex) representation of
5239 * the number.
5241 Datum
5242 to_hex32(PG_FUNCTION_ARGS)
5244 uint32 value = (uint32) PG_GETARG_INT32(0);
5245 char *ptr;
5246 const char *digits = "0123456789abcdef";
5247 char buf[32]; /* bigger than needed, but reasonable */
5249 ptr = buf + sizeof(buf) - 1;
5250 *ptr = '\0';
5254 *--ptr = digits[value % HEXBASE];
5255 value /= HEXBASE;
5256 } while (ptr > buf && value);
5258 PG_RETURN_TEXT_P(cstring_to_text(ptr));
5262 * Convert an int64 to a string containing a base 16 (hex) representation of
5263 * the number.
5265 Datum
5266 to_hex64(PG_FUNCTION_ARGS)
5268 uint64 value = (uint64) PG_GETARG_INT64(0);
5269 char *ptr;
5270 const char *digits = "0123456789abcdef";
5271 char buf[32]; /* bigger than needed, but reasonable */
5273 ptr = buf + sizeof(buf) - 1;
5274 *ptr = '\0';
5278 *--ptr = digits[value % HEXBASE];
5279 value /= HEXBASE;
5280 } while (ptr > buf && value);
5282 PG_RETURN_TEXT_P(cstring_to_text(ptr));
5286 * Return the size of a datum, possibly compressed
5288 * Works on any data type
5290 Datum
5291 pg_column_size(PG_FUNCTION_ARGS)
5293 Datum value = PG_GETARG_DATUM(0);
5294 int32 result;
5295 int typlen;
5297 /* On first call, get the input type's typlen, and save at *fn_extra */
5298 if (fcinfo->flinfo->fn_extra == NULL)
5300 /* Lookup the datatype of the supplied argument */
5301 Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5303 typlen = get_typlen(argtypeid);
5304 if (typlen == 0) /* should not happen */
5305 elog(ERROR, "cache lookup failed for type %u", argtypeid);
5307 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5308 sizeof(int));
5309 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5311 else
5312 typlen = *((int *) fcinfo->flinfo->fn_extra);
5314 if (typlen == -1)
5316 /* varlena type, possibly toasted */
5317 result = toast_datum_size(value);
5319 else if (typlen == -2)
5321 /* cstring */
5322 result = strlen(DatumGetCString(value)) + 1;
5324 else
5326 /* ordinary fixed-width type */
5327 result = typlen;
5330 PG_RETURN_INT32(result);
5334 * Return the compression method stored in the compressed attribute. Return
5335 * NULL for non varlena type or uncompressed data.
5337 Datum
5338 pg_column_compression(PG_FUNCTION_ARGS)
5340 int typlen;
5341 char *result;
5342 ToastCompressionId cmid;
5344 /* On first call, get the input type's typlen, and save at *fn_extra */
5345 if (fcinfo->flinfo->fn_extra == NULL)
5347 /* Lookup the datatype of the supplied argument */
5348 Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5350 typlen = get_typlen(argtypeid);
5351 if (typlen == 0) /* should not happen */
5352 elog(ERROR, "cache lookup failed for type %u", argtypeid);
5354 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5355 sizeof(int));
5356 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5358 else
5359 typlen = *((int *) fcinfo->flinfo->fn_extra);
5361 if (typlen != -1)
5362 PG_RETURN_NULL();
5364 /* get the compression method id stored in the compressed varlena */
5365 cmid = toast_get_compression_id((struct varlena *)
5366 DatumGetPointer(PG_GETARG_DATUM(0)));
5367 if (cmid == TOAST_INVALID_COMPRESSION_ID)
5368 PG_RETURN_NULL();
5370 /* convert compression method id to compression method name */
5371 switch (cmid)
5373 case TOAST_PGLZ_COMPRESSION_ID:
5374 result = "pglz";
5375 break;
5376 case TOAST_LZ4_COMPRESSION_ID:
5377 result = "lz4";
5378 break;
5379 default:
5380 elog(ERROR, "invalid compression method id %d", cmid);
5383 PG_RETURN_TEXT_P(cstring_to_text(result));
5387 * string_agg - Concatenates values and returns string.
5389 * Syntax: string_agg(value text, delimiter text) RETURNS text
5391 * Note: Any NULL values are ignored. The first-call delimiter isn't
5392 * actually used at all, and on subsequent calls the delimiter precedes
5393 * the associated value.
5396 /* subroutine to initialize state */
5397 static StringInfo
5398 makeStringAggState(FunctionCallInfo fcinfo)
5400 StringInfo state;
5401 MemoryContext aggcontext;
5402 MemoryContext oldcontext;
5404 if (!AggCheckCallContext(fcinfo, &aggcontext))
5406 /* cannot be called directly because of internal-type argument */
5407 elog(ERROR, "string_agg_transfn called in non-aggregate context");
5411 * Create state in aggregate context. It'll stay there across subsequent
5412 * calls.
5414 oldcontext = MemoryContextSwitchTo(aggcontext);
5415 state = makeStringInfo();
5416 MemoryContextSwitchTo(oldcontext);
5418 return state;
5421 Datum
5422 string_agg_transfn(PG_FUNCTION_ARGS)
5424 StringInfo state;
5426 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5428 /* Append the value unless null. */
5429 if (!PG_ARGISNULL(1))
5431 /* On the first time through, we ignore the delimiter. */
5432 if (state == NULL)
5433 state = makeStringAggState(fcinfo);
5434 else if (!PG_ARGISNULL(2))
5435 appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5437 appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5441 * The transition type for string_agg() is declared to be "internal",
5442 * which is a pass-by-value type the same size as a pointer.
5444 PG_RETURN_POINTER(state);
5447 Datum
5448 string_agg_finalfn(PG_FUNCTION_ARGS)
5450 StringInfo state;
5452 /* cannot be called directly because of internal-type argument */
5453 Assert(AggCheckCallContext(fcinfo, NULL));
5455 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5457 if (state != NULL)
5458 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5459 else
5460 PG_RETURN_NULL();
5464 * Prepare cache with fmgr info for the output functions of the datatypes of
5465 * the arguments of a concat-like function, beginning with argument "argidx".
5466 * (Arguments before that will have corresponding slots in the resulting
5467 * FmgrInfo array, but we don't fill those slots.)
5469 static FmgrInfo *
5470 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5472 FmgrInfo *foutcache;
5473 int i;
5475 /* We keep the info in fn_mcxt so it survives across calls */
5476 foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5477 PG_NARGS() * sizeof(FmgrInfo));
5479 for (i = argidx; i < PG_NARGS(); i++)
5481 Oid valtype;
5482 Oid typOutput;
5483 bool typIsVarlena;
5485 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5486 if (!OidIsValid(valtype))
5487 elog(ERROR, "could not determine data type of concat() input");
5489 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5490 fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5493 fcinfo->flinfo->fn_extra = foutcache;
5495 return foutcache;
5499 * Implementation of both concat() and concat_ws().
5501 * sepstr is the separator string to place between values.
5502 * argidx identifies the first argument to concatenate (counting from zero);
5503 * note that this must be constant across any one series of calls.
5505 * Returns NULL if result should be NULL, else text value.
5507 static text *
5508 concat_internal(const char *sepstr, int argidx,
5509 FunctionCallInfo fcinfo)
5511 text *result;
5512 StringInfoData str;
5513 FmgrInfo *foutcache;
5514 bool first_arg = true;
5515 int i;
5518 * concat(VARIADIC some-array) is essentially equivalent to
5519 * array_to_text(), ie concat the array elements with the given separator.
5520 * So we just pass the case off to that code.
5522 if (get_fn_expr_variadic(fcinfo->flinfo))
5524 ArrayType *arr;
5526 /* Should have just the one argument */
5527 Assert(argidx == PG_NARGS() - 1);
5529 /* concat(VARIADIC NULL) is defined as NULL */
5530 if (PG_ARGISNULL(argidx))
5531 return NULL;
5534 * Non-null argument had better be an array. We assume that any call
5535 * context that could let get_fn_expr_variadic return true will have
5536 * checked that a VARIADIC-labeled parameter actually is an array. So
5537 * it should be okay to just Assert that it's an array rather than
5538 * doing a full-fledged error check.
5540 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5542 /* OK, safe to fetch the array value */
5543 arr = PG_GETARG_ARRAYTYPE_P(argidx);
5546 * And serialize the array. We tell array_to_text to ignore null
5547 * elements, which matches the behavior of the loop below.
5549 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5552 /* Normal case without explicit VARIADIC marker */
5553 initStringInfo(&str);
5555 /* Get output function info, building it if first time through */
5556 foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5557 if (foutcache == NULL)
5558 foutcache = build_concat_foutcache(fcinfo, argidx);
5560 for (i = argidx; i < PG_NARGS(); i++)
5562 if (!PG_ARGISNULL(i))
5564 Datum value = PG_GETARG_DATUM(i);
5566 /* add separator if appropriate */
5567 if (first_arg)
5568 first_arg = false;
5569 else
5570 appendStringInfoString(&str, sepstr);
5572 /* call the appropriate type output function, append the result */
5573 appendStringInfoString(&str,
5574 OutputFunctionCall(&foutcache[i], value));
5578 result = cstring_to_text_with_len(str.data, str.len);
5579 pfree(str.data);
5581 return result;
5585 * Concatenate all arguments. NULL arguments are ignored.
5587 Datum
5588 text_concat(PG_FUNCTION_ARGS)
5590 text *result;
5592 result = concat_internal("", 0, fcinfo);
5593 if (result == NULL)
5594 PG_RETURN_NULL();
5595 PG_RETURN_TEXT_P(result);
5599 * Concatenate all but first argument value with separators. The first
5600 * parameter is used as the separator. NULL arguments are ignored.
5602 Datum
5603 text_concat_ws(PG_FUNCTION_ARGS)
5605 char *sep;
5606 text *result;
5608 /* return NULL when separator is NULL */
5609 if (PG_ARGISNULL(0))
5610 PG_RETURN_NULL();
5611 sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5613 result = concat_internal(sep, 1, fcinfo);
5614 if (result == NULL)
5615 PG_RETURN_NULL();
5616 PG_RETURN_TEXT_P(result);
5620 * Return first n characters in the string. When n is negative,
5621 * return all but last |n| characters.
5623 Datum
5624 text_left(PG_FUNCTION_ARGS)
5626 int n = PG_GETARG_INT32(1);
5628 if (n < 0)
5630 text *str = PG_GETARG_TEXT_PP(0);
5631 const char *p = VARDATA_ANY(str);
5632 int len = VARSIZE_ANY_EXHDR(str);
5633 int rlen;
5635 n = pg_mbstrlen_with_len(p, len) + n;
5636 rlen = pg_mbcharcliplen(p, len, n);
5637 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5639 else
5640 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5644 * Return last n characters in the string. When n is negative,
5645 * return all but first |n| characters.
5647 Datum
5648 text_right(PG_FUNCTION_ARGS)
5650 text *str = PG_GETARG_TEXT_PP(0);
5651 const char *p = VARDATA_ANY(str);
5652 int len = VARSIZE_ANY_EXHDR(str);
5653 int n = PG_GETARG_INT32(1);
5654 int off;
5656 if (n < 0)
5657 n = -n;
5658 else
5659 n = pg_mbstrlen_with_len(p, len) - n;
5660 off = pg_mbcharcliplen(p, len, n);
5662 PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5666 * Return reversed string
5668 Datum
5669 text_reverse(PG_FUNCTION_ARGS)
5671 text *str = PG_GETARG_TEXT_PP(0);
5672 const char *p = VARDATA_ANY(str);
5673 int len = VARSIZE_ANY_EXHDR(str);
5674 const char *endp = p + len;
5675 text *result;
5676 char *dst;
5678 result = palloc(len + VARHDRSZ);
5679 dst = (char *) VARDATA(result) + len;
5680 SET_VARSIZE(result, len + VARHDRSZ);
5682 if (pg_database_encoding_max_length() > 1)
5684 /* multibyte version */
5685 while (p < endp)
5687 int sz;
5689 sz = pg_mblen(p);
5690 dst -= sz;
5691 memcpy(dst, p, sz);
5692 p += sz;
5695 else
5697 /* single byte version */
5698 while (p < endp)
5699 *(--dst) = *p++;
5702 PG_RETURN_TEXT_P(result);
/*
 * Support macros for text_format()
 *
 * TEXT_FORMAT_FLAG_MINUS is the bit set in the flags word when the minus
 * flag is present in a format specifier.
 *
 * ADVANCE_PARSE_POINTER moves "ptr" forward by one position and raises an
 * error if that brings it to or past "end_ptr", ie if the format string
 * ends in the middle of a specifier.  "ptr" must be a modifiable lvalue.
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5721 * Returns a formatted string
5723 Datum
5724 text_format(PG_FUNCTION_ARGS)
5726 text *fmt;
5727 StringInfoData str;
5728 const char *cp;
5729 const char *start_ptr;
5730 const char *end_ptr;
5731 text *result;
5732 int arg;
5733 bool funcvariadic;
5734 int nargs;
5735 Datum *elements = NULL;
5736 bool *nulls = NULL;
5737 Oid element_type = InvalidOid;
5738 Oid prev_type = InvalidOid;
5739 Oid prev_width_type = InvalidOid;
5740 FmgrInfo typoutputfinfo;
5741 FmgrInfo typoutputinfo_width;
5743 /* When format string is null, immediately return null */
5744 if (PG_ARGISNULL(0))
5745 PG_RETURN_NULL();
5747 /* If argument is marked VARIADIC, expand array into elements */
5748 if (get_fn_expr_variadic(fcinfo->flinfo))
5750 ArrayType *arr;
5751 int16 elmlen;
5752 bool elmbyval;
5753 char elmalign;
5754 int nitems;
5756 /* Should have just the one argument */
5757 Assert(PG_NARGS() == 2);
5759 /* If argument is NULL, we treat it as zero-length array */
5760 if (PG_ARGISNULL(1))
5761 nitems = 0;
5762 else
5765 * Non-null argument had better be an array. We assume that any
5766 * call context that could let get_fn_expr_variadic return true
5767 * will have checked that a VARIADIC-labeled parameter actually is
5768 * an array. So it should be okay to just Assert that it's an
5769 * array rather than doing a full-fledged error check.
5771 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5773 /* OK, safe to fetch the array value */
5774 arr = PG_GETARG_ARRAYTYPE_P(1);
5776 /* Get info about array element type */
5777 element_type = ARR_ELEMTYPE(arr);
5778 get_typlenbyvalalign(element_type,
5779 &elmlen, &elmbyval, &elmalign);
5781 /* Extract all array elements */
5782 deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5783 &elements, &nulls, &nitems);
5786 nargs = nitems + 1;
5787 funcvariadic = true;
5789 else
5791 /* Non-variadic case, we'll process the arguments individually */
5792 nargs = PG_NARGS();
5793 funcvariadic = false;
5796 /* Setup for main loop. */
5797 fmt = PG_GETARG_TEXT_PP(0);
5798 start_ptr = VARDATA_ANY(fmt);
5799 end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5800 initStringInfo(&str);
5801 arg = 1; /* next argument position to print */
5803 /* Scan format string, looking for conversion specifiers. */
5804 for (cp = start_ptr; cp < end_ptr; cp++)
5806 int argpos;
5807 int widthpos;
5808 int flags;
5809 int width;
5810 Datum value;
5811 bool isNull;
5812 Oid typid;
5815 * If it's not the start of a conversion specifier, just copy it to
5816 * the output buffer.
5818 if (*cp != '%')
5820 appendStringInfoCharMacro(&str, *cp);
5821 continue;
5824 ADVANCE_PARSE_POINTER(cp, end_ptr);
5826 /* Easy case: %% outputs a single % */
5827 if (*cp == '%')
5829 appendStringInfoCharMacro(&str, *cp);
5830 continue;
5833 /* Parse the optional portions of the format specifier */
5834 cp = text_format_parse_format(cp, end_ptr,
5835 &argpos, &widthpos,
5836 &flags, &width);
5839 * Next we should see the main conversion specifier. Whether or not
5840 * an argument position was present, it's known that at least one
5841 * character remains in the string at this point. Experience suggests
5842 * that it's worth checking that that character is one of the expected
5843 * ones before we try to fetch arguments, so as to produce the least
5844 * confusing response to a mis-formatted specifier.
5846 if (strchr("sIL", *cp) == NULL)
5847 ereport(ERROR,
5848 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5849 errmsg("unrecognized format() type specifier \"%.*s\"",
5850 pg_mblen(cp), cp),
5851 errhint("For a single \"%%\" use \"%%%%\".")));
5853 /* If indirect width was specified, get its value */
5854 if (widthpos >= 0)
5856 /* Collect the specified or next argument position */
5857 if (widthpos > 0)
5858 arg = widthpos;
5859 if (arg >= nargs)
5860 ereport(ERROR,
5861 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5862 errmsg("too few arguments for format()")));
5864 /* Get the value and type of the selected argument */
5865 if (!funcvariadic)
5867 value = PG_GETARG_DATUM(arg);
5868 isNull = PG_ARGISNULL(arg);
5869 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5871 else
5873 value = elements[arg - 1];
5874 isNull = nulls[arg - 1];
5875 typid = element_type;
5877 if (!OidIsValid(typid))
5878 elog(ERROR, "could not determine data type of format() input");
5880 arg++;
5882 /* We can treat NULL width the same as zero */
5883 if (isNull)
5884 width = 0;
5885 else if (typid == INT4OID)
5886 width = DatumGetInt32(value);
5887 else if (typid == INT2OID)
5888 width = DatumGetInt16(value);
5889 else
5891 /* For less-usual datatypes, convert to text then to int */
5892 char *str;
5894 if (typid != prev_width_type)
5896 Oid typoutputfunc;
5897 bool typIsVarlena;
5899 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5900 fmgr_info(typoutputfunc, &typoutputinfo_width);
5901 prev_width_type = typid;
5904 str = OutputFunctionCall(&typoutputinfo_width, value);
5906 /* pg_strtoint32 will complain about bad data or overflow */
5907 width = pg_strtoint32(str);
5909 pfree(str);
5913 /* Collect the specified or next argument position */
5914 if (argpos > 0)
5915 arg = argpos;
5916 if (arg >= nargs)
5917 ereport(ERROR,
5918 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5919 errmsg("too few arguments for format()")));
5921 /* Get the value and type of the selected argument */
5922 if (!funcvariadic)
5924 value = PG_GETARG_DATUM(arg);
5925 isNull = PG_ARGISNULL(arg);
5926 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5928 else
5930 value = elements[arg - 1];
5931 isNull = nulls[arg - 1];
5932 typid = element_type;
5934 if (!OidIsValid(typid))
5935 elog(ERROR, "could not determine data type of format() input");
5937 arg++;
5940 * Get the appropriate typOutput function, reusing previous one if
5941 * same type as previous argument. That's particularly useful in the
5942 * variadic-array case, but often saves work even for ordinary calls.
5944 if (typid != prev_type)
5946 Oid typoutputfunc;
5947 bool typIsVarlena;
5949 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5950 fmgr_info(typoutputfunc, &typoutputfinfo);
5951 prev_type = typid;
5955 * And now we can format the value.
5957 switch (*cp)
5959 case 's':
5960 case 'I':
5961 case 'L':
5962 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5963 value, isNull,
5964 flags, width);
5965 break;
5966 default:
5967 /* should not get here, because of previous check */
5968 ereport(ERROR,
5969 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5970 errmsg("unrecognized format() type specifier \"%.*s\"",
5971 pg_mblen(cp), cp),
5972 errhint("For a single \"%%\" use \"%%%%\".")));
5973 break;
5977 /* Don't need deconstruct_array results anymore. */
5978 if (elements != NULL)
5979 pfree(elements);
5980 if (nulls != NULL)
5981 pfree(nulls);
5983 /* Generate results. */
5984 result = cstring_to_text_with_len(str.data, str.len);
5985 pfree(str.data);
5987 PG_RETURN_TEXT_P(result);
5991 * Parse contiguous digits as a decimal number.
5993 * Returns true if some digits could be parsed.
5994 * The value is returned into *value, and *ptr is advanced to the next
5995 * character to be parsed.
5997 * Note parsing invariant: at least one character is known available before
5998 * string end (end_ptr) at entry, and this is still true at exit.
6000 static bool
6001 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
6003 bool found = false;
6004 const char *cp = *ptr;
6005 int val = 0;
6007 while (*cp >= '0' && *cp <= '9')
6009 int8 digit = (*cp - '0');
6011 if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
6012 unlikely(pg_add_s32_overflow(val, digit, &val)))
6013 ereport(ERROR,
6014 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6015 errmsg("number is out of range")));
6016 ADVANCE_PARSE_POINTER(cp, end_ptr);
6017 found = true;
6020 *ptr = cp;
6021 *value = val;
6023 return found;
6027 * Parse a format specifier (generally following the SUS printf spec).
6029 * We have already advanced over the initial '%', and we are looking for
6030 * [argpos][flags][width]type (but the type character is not consumed here).
6032 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6033 * Output parameters:
6034 * argpos: argument position for value to be printed. -1 means unspecified.
6035 * widthpos: argument position for width. Zero means the argument position
6036 * was unspecified (ie, take the next arg) and -1 means no width
6037 * argument (width was omitted or specified as a constant).
6038 * flags: bitmask of flags.
6039 * width: directly-specified width value. Zero means the width was omitted
6040 * (note it's not necessary to distinguish this case from an explicit
6041 * zero width value).
6043 * The function result is the next character position to be parsed, ie, the
6044 * location where the type character is/should be.
6046 * Note parsing invariant: at least one character is known available before
6047 * string end (end_ptr) at entry, and this is still true at exit.
6049 static const char *
6050 text_format_parse_format(const char *start_ptr, const char *end_ptr,
6051 int *argpos, int *widthpos,
6052 int *flags, int *width)
6054 const char *cp = start_ptr;
6055 int n;
6057 /* set defaults for output parameters */
6058 *argpos = -1;
6059 *widthpos = -1;
6060 *flags = 0;
6061 *width = 0;
6063 /* try to identify first number */
6064 if (text_format_parse_digits(&cp, end_ptr, &n))
6066 if (*cp != '$')
6068 /* Must be just a width and a type, so we're done */
6069 *width = n;
6070 return cp;
6072 /* The number was argument position */
6073 *argpos = n;
6074 /* Explicit 0 for argument index is immediately refused */
6075 if (n == 0)
6076 ereport(ERROR,
6077 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6078 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6079 ADVANCE_PARSE_POINTER(cp, end_ptr);
6082 /* Handle flags (only minus is supported now) */
6083 while (*cp == '-')
6085 *flags |= TEXT_FORMAT_FLAG_MINUS;
6086 ADVANCE_PARSE_POINTER(cp, end_ptr);
6089 if (*cp == '*')
6091 /* Handle indirect width */
6092 ADVANCE_PARSE_POINTER(cp, end_ptr);
6093 if (text_format_parse_digits(&cp, end_ptr, &n))
6095 /* number in this position must be closed by $ */
6096 if (*cp != '$')
6097 ereport(ERROR,
6098 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6099 errmsg("width argument position must be ended by \"$\"")));
6100 /* The number was width argument position */
6101 *widthpos = n;
6102 /* Explicit 0 for argument index is immediately refused */
6103 if (n == 0)
6104 ereport(ERROR,
6105 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6106 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6107 ADVANCE_PARSE_POINTER(cp, end_ptr);
6109 else
6110 *widthpos = 0; /* width's argument position is unspecified */
6112 else
6114 /* Check for direct width specification */
6115 if (text_format_parse_digits(&cp, end_ptr, &n))
6116 *width = n;
6119 /* cp should now be pointing at type character */
6120 return cp;
6124 * Format a %s, %I, or %L conversion
6126 static void
6127 text_format_string_conversion(StringInfo buf, char conversion,
6128 FmgrInfo *typOutputInfo,
6129 Datum value, bool isNull,
6130 int flags, int width)
6132 char *str;
6134 /* Handle NULL arguments before trying to stringify the value. */
6135 if (isNull)
6137 if (conversion == 's')
6138 text_format_append_string(buf, "", flags, width);
6139 else if (conversion == 'L')
6140 text_format_append_string(buf, "NULL", flags, width);
6141 else if (conversion == 'I')
6142 ereport(ERROR,
6143 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6144 errmsg("null values cannot be formatted as an SQL identifier")));
6145 return;
6148 /* Stringify. */
6149 str = OutputFunctionCall(typOutputInfo, value);
6151 /* Escape. */
6152 if (conversion == 'I')
6154 /* quote_identifier may or may not allocate a new string. */
6155 text_format_append_string(buf, quote_identifier(str), flags, width);
6157 else if (conversion == 'L')
6159 char *qstr = quote_literal_cstr(str);
6161 text_format_append_string(buf, qstr, flags, width);
6162 /* quote_literal_cstr() always allocates a new string */
6163 pfree(qstr);
6165 else
6166 text_format_append_string(buf, str, flags, width);
6168 /* Cleanup. */
6169 pfree(str);
6173 * Append str to buf, padding as directed by flags/width
6175 static void
6176 text_format_append_string(StringInfo buf, const char *str,
6177 int flags, int width)
6179 bool align_to_left = false;
6180 int len;
6182 /* fast path for typical easy case */
6183 if (width == 0)
6185 appendStringInfoString(buf, str);
6186 return;
6189 if (width < 0)
6191 /* Negative width: implicit '-' flag, then take absolute value */
6192 align_to_left = true;
6193 /* -INT_MIN is undefined */
6194 if (width <= INT_MIN)
6195 ereport(ERROR,
6196 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6197 errmsg("number is out of range")));
6198 width = -width;
6200 else if (flags & TEXT_FORMAT_FLAG_MINUS)
6201 align_to_left = true;
6203 len = pg_mbstrlen(str);
6204 if (align_to_left)
6206 /* left justify */
6207 appendStringInfoString(buf, str);
6208 if (len < width)
6209 appendStringInfoSpaces(buf, width - len);
6211 else
6213 /* right justify */
6214 if (len < width)
6215 appendStringInfoSpaces(buf, width - len);
6216 appendStringInfoString(buf, str);
6221 * text_format_nv - nonvariadic wrapper for text_format function.
6223 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6224 * which checks that all built-in functions that share the implementing C
6225 * function take the same number of arguments.
6227 Datum
6228 text_format_nv(PG_FUNCTION_ARGS)
6230 return text_format(fcinfo);
/*
 * rest_of_char_same
 *		Report whether the first len bytes of s1 and s2 are identical.
 *
 * Helper for the Levenshtein distance functions; for that use case it is
 * faster than memcmp().  Bytes are compared from the last one backwards, and
 * a len of zero or less trivially yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			i;

	for (i = len - 1; i >= 0; i--)
	{
		if (s1[i] != s2[i])
			return false;
	}

	return true;
}
6249 /* Expand each Levenshtein distance variant */
6250 #include "levenshtein.c"
6251 #define LEVENSHTEIN_LESS_EQUAL
6252 #include "levenshtein.c"
6256 * Unicode support
6259 static UnicodeNormalizationForm
6260 unicode_norm_form_from_string(const char *formstr)
6262 UnicodeNormalizationForm form = -1;
6265 * Might as well check this while we're here.
6267 if (GetDatabaseEncoding() != PG_UTF8)
6268 ereport(ERROR,
6269 (errcode(ERRCODE_SYNTAX_ERROR),
6270 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6272 if (pg_strcasecmp(formstr, "NFC") == 0)
6273 form = UNICODE_NFC;
6274 else if (pg_strcasecmp(formstr, "NFD") == 0)
6275 form = UNICODE_NFD;
6276 else if (pg_strcasecmp(formstr, "NFKC") == 0)
6277 form = UNICODE_NFKC;
6278 else if (pg_strcasecmp(formstr, "NFKD") == 0)
6279 form = UNICODE_NFKD;
6280 else
6281 ereport(ERROR,
6282 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6283 errmsg("invalid normalization form: %s", formstr)));
6285 return form;
6288 Datum
6289 unicode_normalize_func(PG_FUNCTION_ARGS)
6291 text *input = PG_GETARG_TEXT_PP(0);
6292 char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6293 UnicodeNormalizationForm form;
6294 int size;
6295 pg_wchar *input_chars;
6296 pg_wchar *output_chars;
6297 unsigned char *p;
6298 text *result;
6299 int i;
6301 form = unicode_norm_form_from_string(formstr);
6303 /* convert to pg_wchar */
6304 size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6305 input_chars = palloc((size + 1) * sizeof(pg_wchar));
6306 p = (unsigned char *) VARDATA_ANY(input);
6307 for (i = 0; i < size; i++)
6309 input_chars[i] = utf8_to_unicode(p);
6310 p += pg_utf_mblen(p);
6312 input_chars[i] = (pg_wchar) '\0';
6313 Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6315 /* action */
6316 output_chars = unicode_normalize(form, input_chars);
6318 /* convert back to UTF-8 string */
6319 size = 0;
6320 for (pg_wchar *wp = output_chars; *wp; wp++)
6322 unsigned char buf[4];
6324 unicode_to_utf8(*wp, buf);
6325 size += pg_utf_mblen(buf);
6328 result = palloc(size + VARHDRSZ);
6329 SET_VARSIZE(result, size + VARHDRSZ);
6331 p = (unsigned char *) VARDATA_ANY(result);
6332 for (pg_wchar *wp = output_chars; *wp; wp++)
6334 unicode_to_utf8(*wp, p);
6335 p += pg_utf_mblen(p);
6337 Assert((char *) p == (char *) result + size + VARHDRSZ);
6339 PG_RETURN_TEXT_P(result);
6343 * Check whether the string is in the specified Unicode normalization form.
6345 * This is done by converting the string to the specified normal form and then
6346 * comparing that to the original string. To speed that up, we also apply the
6347 * "quick check" algorithm specified in UAX #15, which can give a yes or no
6348 * answer for many strings by just scanning the string once.
6350 * This function should generally be optimized for the case where the string
6351 * is in fact normalized. In that case, we'll end up looking at the entire
6352 * string, so it's probably not worth doing any incremental conversion etc.
6354 Datum
6355 unicode_is_normalized(PG_FUNCTION_ARGS)
6357 text *input = PG_GETARG_TEXT_PP(0);
6358 char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6359 UnicodeNormalizationForm form;
6360 int size;
6361 pg_wchar *input_chars;
6362 pg_wchar *output_chars;
6363 unsigned char *p;
6364 int i;
6365 UnicodeNormalizationQC quickcheck;
6366 int output_size;
6367 bool result;
6369 form = unicode_norm_form_from_string(formstr);
6371 /* convert to pg_wchar */
6372 size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6373 input_chars = palloc((size + 1) * sizeof(pg_wchar));
6374 p = (unsigned char *) VARDATA_ANY(input);
6375 for (i = 0; i < size; i++)
6377 input_chars[i] = utf8_to_unicode(p);
6378 p += pg_utf_mblen(p);
6380 input_chars[i] = (pg_wchar) '\0';
6381 Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6383 /* quick check (see UAX #15) */
6384 quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6385 if (quickcheck == UNICODE_NORM_QC_YES)
6386 PG_RETURN_BOOL(true);
6387 else if (quickcheck == UNICODE_NORM_QC_NO)
6388 PG_RETURN_BOOL(false);
6390 /* normalize and compare with original */
6391 output_chars = unicode_normalize(form, input_chars);
6393 output_size = 0;
6394 for (pg_wchar *wp = output_chars; *wp; wp++)
6395 output_size++;
6397 result = (size == output_size) &&
6398 (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6400 PG_RETURN_BOOL(result);
/*
 * isxdigits_n
 *		Report whether the first n chars of instr are all hexadecimal digits.
 *
 * Exactly n bytes are examined (fewer if a non-digit is found first), so the
 * caller must make sure that many are readable.  With n == 0 the result is
 * true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;
	const char *c;

	for (c = instr; c < stop; c++)
	{
		/* cast keeps a negative char value away from isxdigit() */
		if (isxdigit((unsigned char) *c) == 0)
			return false;
	}

	return true;
}
6416 static unsigned int
6417 hexval(unsigned char c)
6419 if (c >= '0' && c <= '9')
6420 return c - '0';
6421 if (c >= 'a' && c <= 'f')
6422 return c - 'a' + 0xA;
6423 if (c >= 'A' && c <= 'F')
6424 return c - 'A' + 0xA;
6425 elog(ERROR, "invalid hexadecimal digit");
6426 return 0; /* not reached */
/*
 * hexval_n
 *		Translate the first n hexadecimal digits of instr into a number.
 *
 * The first digit is the most significant one.  Each character is converted
 * by hexval(), which raises an error for a non-hex character.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	size_t		pos = 0;

	while (pos < n)
	{
		/* number of digits still to the right of this one */
		size_t		lower_digits = n - pos - 1;

		value += hexval(instr[pos]) << (4 * lower_digits);
		pos++;
	}

	return value;
}
6444 * Replaces Unicode escape sequences by Unicode characters
6446 Datum
6447 unistr(PG_FUNCTION_ARGS)
6449 text *input_text = PG_GETARG_TEXT_PP(0);
6450 char *instr;
6451 int len;
6452 StringInfoData str;
6453 text *result;
6454 pg_wchar pair_first = 0;
6455 char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6457 instr = VARDATA_ANY(input_text);
6458 len = VARSIZE_ANY_EXHDR(input_text);
6460 initStringInfo(&str);
6462 while (len > 0)
6464 if (instr[0] == '\\')
6466 if (len >= 2 &&
6467 instr[1] == '\\')
6469 if (pair_first)
6470 goto invalid_pair;
6471 appendStringInfoChar(&str, '\\');
6472 instr += 2;
6473 len -= 2;
6475 else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6476 (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6478 pg_wchar unicode;
6479 int offset = instr[1] == 'u' ? 2 : 1;
6481 unicode = hexval_n(instr + offset, 4);
6483 if (!is_valid_unicode_codepoint(unicode))
6484 ereport(ERROR,
6485 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6486 errmsg("invalid Unicode code point: %04X", unicode));
6488 if (pair_first)
6490 if (is_utf16_surrogate_second(unicode))
6492 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6493 pair_first = 0;
6495 else
6496 goto invalid_pair;
6498 else if (is_utf16_surrogate_second(unicode))
6499 goto invalid_pair;
6501 if (is_utf16_surrogate_first(unicode))
6502 pair_first = unicode;
6503 else
6505 pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6506 appendStringInfoString(&str, cbuf);
6509 instr += 4 + offset;
6510 len -= 4 + offset;
6512 else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6514 pg_wchar unicode;
6516 unicode = hexval_n(instr + 2, 6);
6518 if (!is_valid_unicode_codepoint(unicode))
6519 ereport(ERROR,
6520 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6521 errmsg("invalid Unicode code point: %04X", unicode));
6523 if (pair_first)
6525 if (is_utf16_surrogate_second(unicode))
6527 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6528 pair_first = 0;
6530 else
6531 goto invalid_pair;
6533 else if (is_utf16_surrogate_second(unicode))
6534 goto invalid_pair;
6536 if (is_utf16_surrogate_first(unicode))
6537 pair_first = unicode;
6538 else
6540 pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6541 appendStringInfoString(&str, cbuf);
6544 instr += 8;
6545 len -= 8;
6547 else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6549 pg_wchar unicode;
6551 unicode = hexval_n(instr + 2, 8);
6553 if (!is_valid_unicode_codepoint(unicode))
6554 ereport(ERROR,
6555 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6556 errmsg("invalid Unicode code point: %04X", unicode));
6558 if (pair_first)
6560 if (is_utf16_surrogate_second(unicode))
6562 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6563 pair_first = 0;
6565 else
6566 goto invalid_pair;
6568 else if (is_utf16_surrogate_second(unicode))
6569 goto invalid_pair;
6571 if (is_utf16_surrogate_first(unicode))
6572 pair_first = unicode;
6573 else
6575 pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6576 appendStringInfoString(&str, cbuf);
6579 instr += 10;
6580 len -= 10;
6582 else
6583 ereport(ERROR,
6584 (errcode(ERRCODE_SYNTAX_ERROR),
6585 errmsg("invalid Unicode escape"),
6586 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6588 else
6590 if (pair_first)
6591 goto invalid_pair;
6593 appendStringInfoChar(&str, *instr++);
6594 len--;
6598 /* unfinished surrogate pair? */
6599 if (pair_first)
6600 goto invalid_pair;
6602 result = cstring_to_text_with_len(str.data, str.len);
6603 pfree(str.data);
6605 PG_RETURN_TEXT_P(result);
6607 invalid_pair:
6608 ereport(ERROR,
6609 (errcode(ERRCODE_SYNTAX_ERROR),
6610 errmsg("invalid Unicode surrogate pair")));
6611 PG_RETURN_NULL(); /* keep compiler quiet */