src/backend/utils/adt/varlena.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * varlena.c
   4  *        Functions for the variable-length built-in types.
   5  *
   6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/utils/adt/varlena.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <limits.h>
  19
  20 #include "access/detoast.h"
  21 #include "access/toast_compression.h"
  22 #include "catalog/pg_collation.h"
  23 #include "catalog/pg_type.h"
  24 #include "common/hashfn.h"
  25 #include "common/int.h"
  26 #include "common/unicode_norm.h"
  27 #include "lib/hyperloglog.h"
  28 #include "libpq/pqformat.h"
  29 #include "miscadmin.h"
  30 #include "nodes/execnodes.h"
  31 #include "parser/scansup.h"
  32 #include "port/pg_bswap.h"
  33 #include "regex/regex.h"
  34 #include "utils/builtins.h"
  35 #include "utils/bytea.h"
  36 #include "utils/lsyscache.h"
  37 #include "utils/memutils.h"
  38 #include "utils/pg_locale.h"
  39 #include "utils/sortsupport.h"
  40 #include "utils/varlena.h"
  41
  42
/*
 * GUC variable: selects which textual format byteaout() produces
 * (BYTEA_OUTPUT_HEX or BYTEA_OUTPUT_ESCAPE).  Initialized to hex.
 */
int                     bytea_output = BYTEA_OUTPUT_HEX;

/*
 * Aliases for struct varlena.  They serve as the pointer target types of
 * the DatumGetUnknownP() and DatumGetVarStringP() macro families defined
 * further down in this file.
 */
typedef struct varlena unknown;
typedef struct varlena VarString;
  48
/*
 * State for text_position_* functions.
 *
 * A pointer to this struct is taken by text_position_setup(),
 * text_position_next(), text_position_next_internal(),
 * text_position_get_match_ptr(), text_position_get_match_pos() and
 * text_position_cleanup() (see the prototypes below).
 */
typedef struct
{
        bool            is_multibyte;   /* T if multibyte encoding */
        bool            is_multibyte_char_in_char;      /* need to check char boundaries? */

        char       *str1;                       /* haystack string */
        char       *str2;                       /* needle string */
        int                     len1;                   /* string lengths in bytes */
        int                     len2;                   /* (len1 is for str1, len2 for str2) */

        /* Skip table for Boyer-Moore-Horspool search algorithm: */
        int                     skiptablemask;  /* mask for ANDing with skiptable subscripts */
        int                     skiptable[256]; /* skip distance for given mismatched char */

        char       *last_match;         /* pointer to last match in 'str1' */

        /*
         * Sometimes we need to convert the byte position of a match to a
         * character position.  These store the last position that was converted,
         * so that on the next call, we can continue from that point, rather than
         * count characters from the very beginning.
         */
        char       *refpoint;           /* pointer within original haystack string */
        int                     refpos;                 /* 0-based character offset of the same point */
} TextPositionState;
  77
/*
 * Working state for the sort-support comparators and abbreviated-key
 * routines declared below (varstrfastcmp_*, varstr_abbrev_*).
 *
 * NOTE(review): the code that allocates and fills in this struct lies
 * outside the part of the file shown here; the per-field notes marked
 * "presumably" should be confirmed against it.
 */
typedef struct
{
        char       *buf1;                       /* 1st string, or abbreviation original string
                                                                 * buf */
        char       *buf2;                       /* 2nd string, or abbreviation strxfrm() buf */
        int                     buflen1;                /* presumably allocated size of buf1 */
        int                     buflen2;                /* presumably allocated size of buf2 */
        int                     last_len1;              /* Length of last buf1 string/strxfrm() input */
        int                     last_len2;              /* Length of last buf2 string/strxfrm() blob */
        int                     last_returned;  /* Last comparison result (cache) */
        bool            cache_blob;             /* Does buf2 contain strxfrm() blob, etc? */
        bool            collate_c;              /* presumably: is the collation "C"? */
        Oid                     typid;                  /* Actual datatype (text/bpchar/bytea/name) */
        hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
        hyperLogLogState full_card; /* Full key cardinality state */
        double          prop_card;              /* Required cardinality proportion */
        pg_locale_t locale;                     /* presumably the locale used for comparisons */
} VarStringSortSupport;
  96
/*
 * Output data for split_text(): we output either to an array or a table.
 * tupstore and tupdesc must be set up in advance to output to a table.
 *
 * A pointer to this struct is passed to split_text() and
 * split_text_accum_result() (see the prototypes below).
 */
typedef struct
{
        ArrayBuildState *astate;        /* accumulator for the array-output case */
        Tuplestorestate *tupstore;      /* destination for the table-output case */
        TupleDesc       tupdesc;                /* row descriptor for the table-output case */
} SplitTextOutputData;
 107
/*
 * Size, in bytes, of on-stack string buffers.
 *
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN              1024

/*
 * Argument/return helpers for the "unknown" typedef: apply
 * PG_DETOAST_DATUM (or its _COPY variant) to a Datum and cast the result to
 * unknown *.  The PG_GETARG_ forms fetch argument number n first.
 */
#define DatumGetUnknownP(X)                     ((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)         ((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)          DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)          PG_RETURN_POINTER(x)

/*
 * Same idea for VarString: the P form uses PG_DETOAST_DATUM, the PP form
 * uses PG_DETOAST_DATUM_PACKED.
 */
#define DatumGetVarStringP(X)           ((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)          ((VarString *) PG_DETOAST_DATUM_PACKED(X))
 122
/*
 * Forward declarations of the file-local (static) helpers.
 */

/* Sort-support comparators and abbreviated-key routines */
static int      varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int      bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int      namefastcmp_c(Datum x, Datum y, SortSupport ssup);
static int      varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int      namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int      varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
static int      varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);

/* Workers behind the SQL-callable text functions */
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
                                                        int32 start,
                                                        int32 length,
                                                        bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);

/* Substring search; all of these share a TextPositionState */
static int      text_position(text *t1, text *t2, Oid collid);
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
static bool text_position_next(TextPositionState *state);
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
static char *text_position_get_match_ptr(TextPositionState *state);
static int      text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);

/* Collation check and comparison */
static void check_collation_set(Oid collid);
static int      text_cmp(text *arg1, text *arg2, Oid collid);

/* bytea counterparts of the text workers */
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
                                                          int S,
                                                          int L,
                                                          bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);

/* StringInfo, splitting and aggregation helpers */
static void appendStringInfoText(StringInfo str, const text *t);
static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
static void split_text_accum_result(SplitTextOutputData *tstate,
                                                                        text *field_value,
                                                                        text *null_string,
                                                                        Oid collation);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
                                                                        const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);

/* text_format_* helpers */
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
                                                                         int *value);
static const char *text_format_parse_format(const char *start_ptr,
                                                                                        const char *end_ptr,
                                                                                        int *argpos, int *widthpos,
                                                                                        int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
                                                                                  FmgrInfo *typOutputInfo,
                                                                                  Datum value, bool isNull,
                                                                                  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
                                                                          int flags, int width);
 175
 176
 177 /*****************************************************************************
 178  *       CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                                                  *
 179  *****************************************************************************/
 180
 181 /*
 182  * cstring_to_text
 183  *
 184  * Create a text value from a null-terminated C string.
 185  *
 186  * The new text value is freshly palloc'd with a full-size VARHDR.
 187  */
 188 text *
 189 cstring_to_text(const char *s)
 190 {
 191         return cstring_to_text_with_len(s, strlen(s));
 192 }
 193
 194 /*
 195  * cstring_to_text_with_len
 196  *
 197  * Same as cstring_to_text except the caller specifies the string length;
 198  * the string need not be null_terminated.
 199  */
 200 text *
 201 cstring_to_text_with_len(const char *s, int len)
 202 {
 203         text       *result = (text *) palloc(len + VARHDRSZ);
 204
 205         SET_VARSIZE(result, len + VARHDRSZ);
 206         memcpy(VARDATA(result), s, len);
 207
 208         return result;
 209 }
 210
 211 /*
 212  * text_to_cstring
 213  *
 214  * Create a palloc'd, null-terminated C string from a text value.
 215  *
 216  * We support being passed a compressed or toasted text value.
 217  * This is a bit bogus since such values shouldn't really be referred to as
 218  * "text *", but it seems useful for robustness.  If we didn't handle that
 219  * case here, we'd need another routine that did, anyway.
 220  */
 221 char *
 222 text_to_cstring(const text *t)
 223 {
 224         /* must cast away the const, unfortunately */
 225         text       *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
 226         int                     len = VARSIZE_ANY_EXHDR(tunpacked);
 227         char       *result;
 228
 229         result = (char *) palloc(len + 1);
 230         memcpy(result, VARDATA_ANY(tunpacked), len);
 231         result[len] = '\0';
 232
 233         if (tunpacked != t)
 234                 pfree(tunpacked);
 235
 236         return result;
 237 }
 238
 239 /*
 240  * text_to_cstring_buffer
 241  *
 242  * Copy a text value into a caller-supplied buffer of size dst_len.
 243  *
 244  * The text string is truncated if necessary to fit.  The result is
 245  * guaranteed null-terminated (unless dst_len == 0).
 246  *
 247  * We support being passed a compressed or toasted text value.
 248  * This is a bit bogus since such values shouldn't really be referred to as
 249  * "text *", but it seems useful for robustness.  If we didn't handle that
 250  * case here, we'd need another routine that did, anyway.
 251  */
 252 void
 253 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
 254 {
 255         /* must cast away the const, unfortunately */
 256         text       *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
 257         size_t          src_len = VARSIZE_ANY_EXHDR(srcunpacked);
 258
 259         if (dst_len > 0)
 260         {
 261                 dst_len--;
 262                 if (dst_len >= src_len)
 263                         dst_len = src_len;
 264                 else                                    /* ensure truncation is encoding-safe */
 265                         dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
 266                 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
 267                 dst[dst_len] = '\0';
 268         }
 269
 270         if (srcunpacked != src)
 271                 pfree(srcunpacked);
 272 }
 273
 274
 275 /*****************************************************************************
 276  *       USER I/O ROUTINES                                                                                                               *
 277  *****************************************************************************/
 278
 279
/* VAL: numeric value of the digit character CH (offset from '0') */
#define VAL(CH)                 ((CH) - '0')
/* DIG: digit character for the numeric value VAL (offset from '0') */
#define DIG(VAL)                ((VAL) + '0')
 282
 283 /*
 284  *              byteain                 - converts from printable representation of byte array
 285  *
 286  *              Non-printable characters must be passed as '\nnn' (octal) and are
 287  *              converted to internal form.  '\' must be passed as '\\'.
 288  *              ereport(ERROR, ...) if bad form.
 289  *
 290  *              BUGS:
 291  *                              The input is scanned twice.
 292  *                              The error checking of input is minimal.
 293  */
 294 Datum
 295 byteain(PG_FUNCTION_ARGS)
 296 {
 297         char       *inputText = PG_GETARG_CSTRING(0);
 298         char       *tp;
 299         char       *rp;
 300         int                     bc;
 301         bytea      *result;
 302
 303         /* Recognize hex input */
 304         if (inputText[0] == '\\' && inputText[1] == 'x')
 305         {
 306                 size_t          len = strlen(inputText);
 307
 308                 bc = (len - 2) / 2 + VARHDRSZ;  /* maximum possible length */
 309                 result = palloc(bc);
 310                 bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
 311                 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
 312
 313                 PG_RETURN_BYTEA_P(result);
 314         }
 315
 316         /* Else, it's the traditional escaped style */
 317         for (bc = 0, tp = inputText; *tp != '\0'; bc++)
 318         {
 319                 if (tp[0] != '\\')
 320                         tp++;
 321                 else if ((tp[0] == '\\') &&
 322                                  (tp[1] >= '0' && tp[1] <= '3') &&
 323                                  (tp[2] >= '0' && tp[2] <= '7') &&
 324                                  (tp[3] >= '0' && tp[3] <= '7'))
 325                         tp += 4;
 326                 else if ((tp[0] == '\\') &&
 327                                  (tp[1] == '\\'))
 328                         tp += 2;
 329                 else
 330                 {
 331                         /*
 332                          * one backslash, not followed by another or ### valid octal
 333                          */
 334                         ereport(ERROR,
 335                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 336                                          errmsg("invalid input syntax for type %s", "bytea")));
 337                 }
 338         }
 339
 340         bc += VARHDRSZ;
 341
 342         result = (bytea *) palloc(bc);
 343         SET_VARSIZE(result, bc);
 344
 345         tp = inputText;
 346         rp = VARDATA(result);
 347         while (*tp != '\0')
 348         {
 349                 if (tp[0] != '\\')
 350                         *rp++ = *tp++;
 351                 else if ((tp[0] == '\\') &&
 352                                  (tp[1] >= '0' && tp[1] <= '3') &&
 353                                  (tp[2] >= '0' && tp[2] <= '7') &&
 354                                  (tp[3] >= '0' && tp[3] <= '7'))
 355                 {
 356                         bc = VAL(tp[1]);
 357                         bc <<= 3;
 358                         bc += VAL(tp[2]);
 359                         bc <<= 3;
 360                         *rp++ = bc + VAL(tp[3]);
 361
 362                         tp += 4;
 363                 }
 364                 else if ((tp[0] == '\\') &&
 365                                  (tp[1] == '\\'))
 366                 {
 367                         *rp++ = '\\';
 368                         tp += 2;
 369                 }
 370                 else
 371                 {
 372                         /*
 373                          * We should never get here. The first pass should not allow it.
 374                          */
 375                         ereport(ERROR,
 376                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 377                                          errmsg("invalid input syntax for type %s", "bytea")));
 378                 }
 379         }
 380
 381         PG_RETURN_BYTEA_P(result);
 382 }
 383
 384 /*
 385  *              byteaout                - converts to printable representation of byte array
 386  *
 387  *              In the traditional escaped format, non-printable characters are
 388  *              printed as '\nnn' (octal) and '\' as '\\'.
 389  */
 390 Datum
 391 byteaout(PG_FUNCTION_ARGS)
 392 {
 393         bytea      *vlena = PG_GETARG_BYTEA_PP(0);
 394         char       *result;
 395         char       *rp;
 396
 397         if (bytea_output == BYTEA_OUTPUT_HEX)
 398         {
 399                 /* Print hex format */
 400                 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
 401                 *rp++ = '\\';
 402                 *rp++ = 'x';
 403                 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
 404         }
 405         else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
 406         {
 407                 /* Print traditional escaped format */
 408                 char       *vp;
 409                 uint64          len;
 410                 int                     i;
 411
 412                 len = 1;                                /* empty string has 1 char */
 413                 vp = VARDATA_ANY(vlena);
 414                 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
 415                 {
 416                         if (*vp == '\\')
 417                                 len += 2;
 418                         else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 419                                 len += 4;
 420                         else
 421                                 len++;
 422                 }
 423
 424                 /*
 425                  * In principle len can't overflow uint32 if the input fit in 1GB, but
 426                  * for safety let's check rather than relying on palloc's internal
 427                  * check.
 428                  */
 429                 if (len > MaxAllocSize)
 430                         ereport(ERROR,
 431                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 432                                          errmsg_internal("result of bytea output conversion is too large")));
 433                 rp = result = (char *) palloc(len);
 434
 435                 vp = VARDATA_ANY(vlena);
 436                 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
 437                 {
 438                         if (*vp == '\\')
 439                         {
 440                                 *rp++ = '\\';
 441                                 *rp++ = '\\';
 442                         }
 443                         else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 444                         {
 445                                 int                     val;    /* holds unprintable chars */
 446
 447                                 val = *vp;
 448                                 rp[0] = '\\';
 449                                 rp[3] = DIG(val & 07);
 450                                 val >>= 3;
 451                                 rp[2] = DIG(val & 07);
 452                                 val >>= 3;
 453                                 rp[1] = DIG(val & 03);
 454                                 rp += 4;
 455                         }
 456                         else
 457                                 *rp++ = *vp;
 458                 }
 459         }
 460         else
 461         {
 462                 elog(ERROR, "unrecognized bytea_output setting: %d",
 463                          bytea_output);
 464                 rp = result = NULL;             /* keep compiler quiet */
 465         }
 466         *rp = '\0';
 467         PG_RETURN_CSTRING(result);
 468 }
 469
 470 /*
 471  *              bytearecv                       - converts external binary format to bytea
 472  */
 473 Datum
 474 bytearecv(PG_FUNCTION_ARGS)
 475 {
 476         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 477         bytea      *result;
 478         int                     nbytes;
 479
 480         nbytes = buf->len - buf->cursor;
 481         result = (bytea *) palloc(nbytes + VARHDRSZ);
 482         SET_VARSIZE(result, nbytes + VARHDRSZ);
 483         pq_copymsgbytes(buf, VARDATA(result), nbytes);
 484         PG_RETURN_BYTEA_P(result);
 485 }
 486
 487 /*
 488  *              byteasend                       - converts bytea to binary format
 489  *
 490  * This is a special case: just copy the input...
 491  */
 492 Datum
 493 byteasend(PG_FUNCTION_ARGS)
 494 {
 495         bytea      *vlena = PG_GETARG_BYTEA_P_COPY(0);
 496
 497         PG_RETURN_BYTEA_P(vlena);
 498 }
 499
 500 Datum
 501 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
 502 {
 503         StringInfo      state;
 504
 505         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
 506
 507         /* Append the value unless null. */
 508         if (!PG_ARGISNULL(1))
 509         {
 510                 bytea      *value = PG_GETARG_BYTEA_PP(1);
 511
 512                 /* On the first time through, we ignore the delimiter. */
 513                 if (state == NULL)
 514                         state = makeStringAggState(fcinfo);
 515                 else if (!PG_ARGISNULL(2))
 516                 {
 517                         bytea      *delim = PG_GETARG_BYTEA_PP(2);
 518
 519                         appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
 520                 }
 521
 522                 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
 523         }
 524
 525         /*
 526          * The transition type for string_agg() is declared to be "internal",
 527          * which is a pass-by-value type the same size as a pointer.
 528          */
 529         PG_RETURN_POINTER(state);
 530 }
 531
 532 Datum
 533 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
 534 {
 535         StringInfo      state;
 536
 537         /* cannot be called directly because of internal-type argument */
 538         Assert(AggCheckCallContext(fcinfo, NULL));
 539
 540         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
 541
 542         if (state != NULL)
 543         {
 544                 bytea      *result;
 545
 546                 result = (bytea *) palloc(state->len + VARHDRSZ);
 547                 SET_VARSIZE(result, state->len + VARHDRSZ);
 548                 memcpy(VARDATA(result), state->data, state->len);
 549                 PG_RETURN_BYTEA_P(result);
 550         }
 551         else
 552                 PG_RETURN_NULL();
 553 }
 554
 555 /*
 556  *              textin                  - converts "..." to internal representation
 557  */
 558 Datum
 559 textin(PG_FUNCTION_ARGS)
 560 {
 561         char       *inputText = PG_GETARG_CSTRING(0);
 562
 563         PG_RETURN_TEXT_P(cstring_to_text(inputText));
 564 }
 565
 566 /*
 567  *              textout                 - converts internal representation to "..."
 568  */
 569 Datum
 570 textout(PG_FUNCTION_ARGS)
 571 {
 572         Datum           txt = PG_GETARG_DATUM(0);
 573
 574         PG_RETURN_CSTRING(TextDatumGetCString(txt));
 575 }
 576
 577 /*
 578  *              textrecv                        - converts external binary format to text
 579  */
 580 Datum
 581 textrecv(PG_FUNCTION_ARGS)
 582 {
 583         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 584         text       *result;
 585         char       *str;
 586         int                     nbytes;
 587
 588         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 589
 590         result = cstring_to_text_with_len(str, nbytes);
 591         pfree(str);
 592         PG_RETURN_TEXT_P(result);
 593 }
 594
 595 /*
 596  *              textsend                        - converts text to binary format
 597  */
 598 Datum
 599 textsend(PG_FUNCTION_ARGS)
 600 {
 601         text       *t = PG_GETARG_TEXT_PP(0);
 602         StringInfoData buf;
 603
 604         pq_begintypsend(&buf);
 605         pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
 606         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 607 }
 608
 609
 610 /*
 611  *              unknownin                       - converts "..." to internal representation
 612  */
 613 Datum
 614 unknownin(PG_FUNCTION_ARGS)
 615 {
 616         char       *str = PG_GETARG_CSTRING(0);
 617
 618         /* representation is same as cstring */
 619         PG_RETURN_CSTRING(pstrdup(str));
 620 }
 621
 622 /*
 623  *              unknownout                      - converts internal representation to "..."
 624  */
 625 Datum
 626 unknownout(PG_FUNCTION_ARGS)
 627 {
 628         /* representation is same as cstring */
 629         char       *str = PG_GETARG_CSTRING(0);
 630
 631         PG_RETURN_CSTRING(pstrdup(str));
 632 }
 633
 634 /*
 635  *              unknownrecv                     - converts external binary format to unknown
 636  */
 637 Datum
 638 unknownrecv(PG_FUNCTION_ARGS)
 639 {
 640         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 641         char       *str;
 642         int                     nbytes;
 643
 644         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 645         /* representation is same as cstring */
 646         PG_RETURN_CSTRING(str);
 647 }
 648
 649 /*
 650  *              unknownsend                     - converts unknown to binary format
 651  */
 652 Datum
 653 unknownsend(PG_FUNCTION_ARGS)
 654 {
 655         /* representation is same as cstring */
 656         char       *str = PG_GETARG_CSTRING(0);
 657         StringInfoData buf;
 658
 659         pq_begintypsend(&buf);
 660         pq_sendtext(&buf, str, strlen(str));
 661         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 662 }
 663
 664
 665 /* ========== PUBLIC ROUTINES ========== */
 666
 667 /*
 668  * textlen -
 669  *        returns the logical length of a text*
 670  *         (which is less than the VARSIZE of the text*)
 671  */
 672 Datum
 673 textlen(PG_FUNCTION_ARGS)
 674 {
 675         Datum           str = PG_GETARG_DATUM(0);
 676
 677         /* try to avoid decompressing argument */
 678         PG_RETURN_INT32(text_length(str));
 679 }
 680
 681 /*
 682  * text_length -
 683  *      Does the real work for textlen()
 684  *
 685  *      This is broken out so it can be called directly by other string processing
 686  *      functions.  Note that the argument is passed as a Datum, to indicate that
 687  *      it may still be in compressed form.  We can avoid decompressing it at all
 688  *      in some cases.
 689  */
 690 static int32
 691 text_length(Datum str)
 692 {
 693         /* fastpath when max encoding length is one */
 694         if (pg_database_encoding_max_length() == 1)
 695                 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 696         else
 697         {
 698                 text       *t = DatumGetTextPP(str);
 699
 700                 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
 701                                                                                          VARSIZE_ANY_EXHDR(t)));
 702         }
 703 }
 704
 705 /*
 706  * textoctetlen -
 707  *        returns the physical length of a text*
 708  *         (which is less than the VARSIZE of the text*)
 709  */
 710 Datum
 711 textoctetlen(PG_FUNCTION_ARGS)
 712 {
 713         Datum           str = PG_GETARG_DATUM(0);
 714
 715         /* We need not detoast the input at all */
 716         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 717 }
 718
 719 /*
 720  * textcat -
 721  *        takes two text* and returns a text* that is the concatenation of
 722  *        the two.
 723  *
 724  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
 725  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
 726  * Allocate space for output in all cases.
 727  * XXX - thomas 1997-07-10
 728  */
 729 Datum
 730 textcat(PG_FUNCTION_ARGS)
 731 {
 732         text       *t1 = PG_GETARG_TEXT_PP(0);
 733         text       *t2 = PG_GETARG_TEXT_PP(1);
 734
 735         PG_RETURN_TEXT_P(text_catenate(t1, t2));
 736 }
 737
 738 /*
 739  * text_catenate
 740  *      Guts of textcat(), broken out so it can be used by other functions
 741  *
 742  * Arguments can be in short-header form, but not compressed or out-of-line
 743  */
 744 static text *
 745 text_catenate(text *t1, text *t2)
 746 {
 747         text       *result;
 748         int                     len1,
 749                                 len2,
 750                                 len;
 751         char       *ptr;
 752
 753         len1 = VARSIZE_ANY_EXHDR(t1);
 754         len2 = VARSIZE_ANY_EXHDR(t2);
 755
 756         /* paranoia ... probably should throw error instead? */
 757         if (len1 < 0)
 758                 len1 = 0;
 759         if (len2 < 0)
 760                 len2 = 0;
 761
 762         len = len1 + len2 + VARHDRSZ;
 763         result = (text *) palloc(len);
 764
 765         /* Set size of result string... */
 766         SET_VARSIZE(result, len);
 767
 768         /* Fill data field of result string... */
 769         ptr = VARDATA(result);
 770         if (len1 > 0)
 771                 memcpy(ptr, VARDATA_ANY(t1), len1);
 772         if (len2 > 0)
 773                 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
 774
 775         return result;
 776 }
 777
 778 /*
 779  * charlen_to_bytelen()
 780  *      Compute the number of bytes occupied by n characters starting at *p
 781  *
 782  * It is caller's responsibility that there actually are n characters;
 783  * the string need not be null-terminated.
 784  */
 785 static int
 786 charlen_to_bytelen(const char *p, int n)
 787 {
 788         if (pg_database_encoding_max_length() == 1)
 789         {
 790                 /* Optimization for single-byte encodings */
 791                 return n;
 792         }
 793         else
 794         {
 795                 const char *s;
 796
 797                 for (s = p; n > 0; n--)
 798                         s += pg_mblen(s);
 799
 800                 return s - p;
 801         }
 802 }
 803
 804 /*
 805  * text_substr()
 806  * Return a substring starting at the specified position.
 807  * - thomas 1997-12-31
 808  *
 809  * Input:
 810  *      - string
 811  *      - starting position (is one-based)
 812  *      - string length
 813  *
 814  * If the starting position is zero or less, then return from the start of the string
 815  *      adjusting the length to be consistent with the "negative start" per SQL.
 816  * If the length is less than zero, return the remaining string.
 817  *
 818  * Added multibyte support.
 819  * - Tatsuo Ishii 1998-4-21
 820  * Changed behavior if starting position is less than one to conform to SQL behavior.
 821  * Formerly returned the entire string; now returns a portion.
 822  * - Thomas Lockhart 1998-12-10
 823  * Now uses faster TOAST-slicing interface
 824  * - John Gray 2002-02-22
 825  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
 826  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
 827  * error; if E < 1, return '', not entire string). Fixed MB related bug when
 828  * S > LC and < LC + 4 sometimes garbage characters are returned.
 829  * - Joe Conway 2002-08-10
 830  */
 831 Datum
 832 text_substr(PG_FUNCTION_ARGS)
 833 {
 834         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
 835                                                                         PG_GETARG_INT32(1),
 836                                                                         PG_GETARG_INT32(2),
 837                                                                         false));
 838 }
 839
 840 /*
 841  * text_substr_no_len -
 842  *        Wrapper to avoid opr_sanity failure due to
 843  *        one function accepting a different number of args.
 844  */
 845 Datum
 846 text_substr_no_len(PG_FUNCTION_ARGS)
 847 {
 848         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
 849                                                                         PG_GETARG_INT32(1),
 850                                                                         -1, true));
 851 }
 852
 853 /*
 854  * text_substring -
 855  *      Does the real work for text_substr() and text_substr_no_len()
 856  *
 857  *      This is broken out so it can be called directly by other string processing
 858  *      functions.  Note that the argument is passed as a Datum, to indicate that
 859  *      it may still be in compressed/toasted form.  We can avoid detoasting all
 860  *      of it in some cases.
 861  *
 862  *      The result is always a freshly palloc'd datum.
 863  */
 864 static text *
 865 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 866 {
 867         int32           eml = pg_database_encoding_max_length();
 868         int32           S = start;              /* start position */
 869         int32           S1;                             /* adjusted start position */
 870         int32           L1;                             /* adjusted substring length */
 871         int32           E;                              /* end position */
 872
 873         /*
 874          * SQL99 says S can be zero or negative, but we still must fetch from the
 875          * start of the string.
 876          */
 877         S1 = Max(S, 1);
 878
 879         /* life is easy if the encoding max length is 1 */
 880         if (eml == 1)
 881         {
 882                 if (length_not_specified)       /* special case - get length to end of
 883                                                                          * string */
 884                         L1 = -1;
 885                 else if (length < 0)
 886                 {
 887                         /* SQL99 says to throw an error for E < S, i.e., negative length */
 888                         ereport(ERROR,
 889                                         (errcode(ERRCODE_SUBSTRING_ERROR),
 890                                          errmsg("negative substring length not allowed")));
 891                         L1 = -1;                        /* silence stupider compilers */
 892                 }
 893                 else if (pg_add_s32_overflow(S, length, &E))
 894                 {
 895                         /*
 896                          * L could be large enough for S + L to overflow, in which case
 897                          * the substring must run to end of string.
 898                          */
 899                         L1 = -1;
 900                 }
 901                 else
 902                 {
 903                         /*
 904                          * A zero or negative value for the end position can happen if the
 905                          * start was negative or one. SQL99 says to return a zero-length
 906                          * string.
 907                          */
 908                         if (E < 1)
 909                                 return cstring_to_text("");
 910
 911                         L1 = E - S1;
 912                 }
 913
 914                 /*
 915                  * If the start position is past the end of the string, SQL99 says to
 916                  * return a zero-length string -- DatumGetTextPSlice() will do that
 917                  * for us.  We need only convert S1 to zero-based starting position.
 918                  */
 919                 return DatumGetTextPSlice(str, S1 - 1, L1);
 920         }
 921         else if (eml > 1)
 922         {
 923                 /*
 924                  * When encoding max length is > 1, we can't get LC without
 925                  * detoasting, so we'll grab a conservatively large slice now and go
 926                  * back later to do the right thing
 927                  */
 928                 int32           slice_start;
 929                 int32           slice_size;
 930                 int32           slice_strlen;
 931                 text       *slice;
 932                 int32           E1;
 933                 int32           i;
 934                 char       *p;
 935                 char       *s;
 936                 text       *ret;
 937
 938                 /*
 939                  * We need to start at position zero because there is no way to know
 940                  * in advance which byte offset corresponds to the supplied start
 941                  * position.
 942                  */
 943                 slice_start = 0;
 944
 945                 if (length_not_specified)       /* special case - get length to end of
 946                                                                          * string */
 947                         slice_size = L1 = -1;
 948                 else if (length < 0)
 949                 {
 950                         /* SQL99 says to throw an error for E < S, i.e., negative length */
 951                         ereport(ERROR,
 952                                         (errcode(ERRCODE_SUBSTRING_ERROR),
 953                                          errmsg("negative substring length not allowed")));
 954                         slice_size = L1 = -1;   /* silence stupider compilers */
 955                 }
 956                 else if (pg_add_s32_overflow(S, length, &E))
 957                 {
 958                         /*
 959                          * L could be large enough for S + L to overflow, in which case
 960                          * the substring must run to end of string.
 961                          */
 962                         slice_size = L1 = -1;
 963                 }
 964                 else
 965                 {
 966                         /*
 967                          * A zero or negative value for the end position can happen if the
 968                          * start was negative or one. SQL99 says to return a zero-length
 969                          * string.
 970                          */
 971                         if (E < 1)
 972                                 return cstring_to_text("");
 973
 974                         /*
 975                          * if E is past the end of the string, the tuple toaster will
 976                          * truncate the length for us
 977                          */
 978                         L1 = E - S1;
 979
 980                         /*
 981                          * Total slice size in bytes can't be any longer than the start
 982                          * position plus substring length times the encoding max length.
 983                          * If that overflows, we can just use -1.
 984                          */
 985                         if (pg_mul_s32_overflow(E, eml, &slice_size))
 986                                 slice_size = -1;
 987                 }
 988
 989                 /*
 990                  * If we're working with an untoasted source, no need to do an extra
 991                  * copying step.
 992                  */
 993                 if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
 994                         VARATT_IS_EXTERNAL(DatumGetPointer(str)))
 995                         slice = DatumGetTextPSlice(str, slice_start, slice_size);
 996                 else
 997                         slice = (text *) DatumGetPointer(str);
 998
 999                 /* see if we got back an empty string */
1000                 if (VARSIZE_ANY_EXHDR(slice) == 0)
1001                 {
1002                         if (slice != (text *) DatumGetPointer(str))
1003                                 pfree(slice);
1004                         return cstring_to_text("");
1005                 }
1006
1007                 /* Now we can get the actual length of the slice in MB characters */
1008                 slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1009                                                                                         VARSIZE_ANY_EXHDR(slice));
1010
1011                 /*
1012                  * Check that the start position wasn't > slice_strlen. If so, SQL99
1013                  * says to return a zero-length string.
1014                  */
1015                 if (S1 > slice_strlen)
1016                 {
1017                         if (slice != (text *) DatumGetPointer(str))
1018                                 pfree(slice);
1019                         return cstring_to_text("");
1020                 }
1021
1022                 /*
1023                  * Adjust L1 and E1 now that we know the slice string length. Again
1024                  * remember that S1 is one based, and slice_start is zero based.
1025                  */
1026                 if (L1 > -1)
1027                         E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1028                 else
1029                         E1 = slice_start + 1 + slice_strlen;
1030
1031                 /*
1032                  * Find the start position in the slice; remember S1 is not zero based
1033                  */
1034                 p = VARDATA_ANY(slice);
1035                 for (i = 0; i < S1 - 1; i++)
1036                         p += pg_mblen(p);
1037
1038                 /* hang onto a pointer to our start position */
1039                 s = p;
1040
1041                 /*
1042                  * Count the actual bytes used by the substring of the requested
1043                  * length.
1044                  */
1045                 for (i = S1; i < E1; i++)
1046                         p += pg_mblen(p);
1047
1048                 ret = (text *) palloc(VARHDRSZ + (p - s));
1049                 SET_VARSIZE(ret, VARHDRSZ + (p - s));
1050                 memcpy(VARDATA(ret), s, (p - s));
1051
1052                 if (slice != (text *) DatumGetPointer(str))
1053                         pfree(slice);
1054
1055                 return ret;
1056         }
1057         else
1058                 elog(ERROR, "invalid backend encoding: encoding max length < 1");
1059
1060         /* not reached: suppress compiler warning */
1061         return NULL;
1062 }
1063
1064 /*
1065  * textoverlay
1066  *      Replace specified substring of first string with second
1067  *
1068  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1069  * This code is a direct implementation of what the standard says.
1070  */
1071 Datum
1072 textoverlay(PG_FUNCTION_ARGS)
1073 {
1074         text       *t1 = PG_GETARG_TEXT_PP(0);
1075         text       *t2 = PG_GETARG_TEXT_PP(1);
1076         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
1077         int                     sl = PG_GETARG_INT32(3);        /* substring length */
1078
1079         PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1080 }
1081
1082 Datum
1083 textoverlay_no_len(PG_FUNCTION_ARGS)
1084 {
1085         text       *t1 = PG_GETARG_TEXT_PP(0);
1086         text       *t2 = PG_GETARG_TEXT_PP(1);
1087         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
1088         int                     sl;
1089
1090         sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
1091         PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1092 }
1093
1094 static text *
1095 text_overlay(text *t1, text *t2, int sp, int sl)
1096 {
1097         text       *result;
1098         text       *s1;
1099         text       *s2;
1100         int                     sp_pl_sl;
1101
1102         /*
1103          * Check for possible integer-overflow cases.  For negative sp, throw a
1104          * "substring length" error because that's what should be expected
1105          * according to the spec's definition of OVERLAY().
1106          */
1107         if (sp <= 0)
1108                 ereport(ERROR,
1109                                 (errcode(ERRCODE_SUBSTRING_ERROR),
1110                                  errmsg("negative substring length not allowed")));
1111         if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1112                 ereport(ERROR,
1113                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1114                                  errmsg("integer out of range")));
1115
1116         s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1117         s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1118         result = text_catenate(s1, t2);
1119         result = text_catenate(result, s2);
1120
1121         return result;
1122 }
1123
1124 /*
1125  * textpos -
1126  *        Return the position of the specified substring.
1127  *        Implements the SQL POSITION() function.
1128  *        Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1129  * - thomas 1997-07-27
1130  */
1131 Datum
1132 textpos(PG_FUNCTION_ARGS)
1133 {
1134         text       *str = PG_GETARG_TEXT_PP(0);
1135         text       *search_str = PG_GETARG_TEXT_PP(1);
1136
1137         PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1138 }
1139
1140 /*
1141  * text_position -
1142  *      Does the real work for textpos()
1143  *
1144  * Inputs:
1145  *              t1 - string to be searched
1146  *              t2 - pattern to match within t1
1147  * Result:
1148  *              Character index of the first matched char, starting from 1,
1149  *              or 0 if no match.
1150  *
1151  *      This is broken out so it can be called directly by other string processing
1152  *      functions.
1153  */
1154 static int
1155 text_position(text *t1, text *t2, Oid collid)
1156 {
1157         TextPositionState state;
1158         int                     result;
1159
1160         /* Empty needle always matches at position 1 */
1161         if (VARSIZE_ANY_EXHDR(t2) < 1)
1162                 return 1;
1163
1164         /* Otherwise, can't match if haystack is shorter than needle */
1165         if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1166                 return 0;
1167
1168         text_position_setup(t1, t2, collid, &state);
1169         if (!text_position_next(&state))
1170                 result = 0;
1171         else
1172                 result = text_position_get_match_pos(&state);
1173         text_position_cleanup(&state);
1174         return result;
1175 }
1176
1177
1178 /*
1179  * text_position_setup, text_position_next, text_position_cleanup -
1180  *      Component steps of text_position()
1181  *
1182  * These are broken out so that a string can be efficiently searched for
1183  * multiple occurrences of the same pattern.  text_position_next may be
1184  * called multiple times, and it advances to the next match on each call.
1185  * text_position_get_match_ptr() and text_position_get_match_pos() return
1186  * a pointer or 1-based character position of the last match, respectively.
1187  *
1188  * The "state" variable is normally just a local variable in the caller.
1189  *
1190  * NOTE: text_position_next skips over the matched portion.  For example,
1191  * searching for "xx" in "xxx" returns only one match, not two.
1192  */
1193
1194 static void
1195 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1196 {
1197         int                     len1 = VARSIZE_ANY_EXHDR(t1);
1198         int                     len2 = VARSIZE_ANY_EXHDR(t2);
1199         pg_locale_t mylocale = 0;
1200
1201         check_collation_set(collid);
1202
1203         if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1204                 mylocale = pg_newlocale_from_collation(collid);
1205
1206         if (mylocale && !mylocale->deterministic)
1207                 ereport(ERROR,
1208                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1209                                  errmsg("nondeterministic collations are not supported for substring searches")));
1210
1211         Assert(len1 > 0);
1212         Assert(len2 > 0);
1213
1214         /*
1215          * Even with a multi-byte encoding, we perform the search using the raw
1216          * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
1217          * because in UTF-8 the byte sequence of one character cannot contain
1218          * another character.  For other multi-byte encodings, we do the search
1219          * initially as a simple byte search, ignoring multibyte issues, but
1220          * verify afterwards that the match we found is at a character boundary,
1221          * and continue the search if it was a false match.
1222          */
1223         if (pg_database_encoding_max_length() == 1)
1224         {
1225                 state->is_multibyte = false;
1226                 state->is_multibyte_char_in_char = false;
1227         }
1228         else if (GetDatabaseEncoding() == PG_UTF8)
1229         {
1230                 state->is_multibyte = true;
1231                 state->is_multibyte_char_in_char = false;
1232         }
1233         else
1234         {
1235                 state->is_multibyte = true;
1236                 state->is_multibyte_char_in_char = true;
1237         }
1238
1239         state->str1 = VARDATA_ANY(t1);
1240         state->str2 = VARDATA_ANY(t2);
1241         state->len1 = len1;
1242         state->len2 = len2;
1243         state->last_match = NULL;
1244         state->refpoint = state->str1;
1245         state->refpos = 0;
1246
1247         /*
1248          * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1249          * notes we use the terminology that the "haystack" is the string to be
1250          * searched (t1) and the "needle" is the pattern being sought (t2).
1251          *
1252          * If the needle is empty or bigger than the haystack then there is no
1253          * point in wasting cycles initializing the table.  We also choose not to
1254          * use B-M-H for needles of length 1, since the skip table can't possibly
1255          * save anything in that case.
1256          */
1257         if (len1 >= len2 && len2 > 1)
1258         {
1259                 int                     searchlength = len1 - len2;
1260                 int                     skiptablemask;
1261                 int                     last;
1262                 int                     i;
1263                 const char *str2 = state->str2;
1264
1265                 /*
1266                  * First we must determine how much of the skip table to use.  The
1267                  * declaration of TextPositionState allows up to 256 elements, but for
1268                  * short search problems we don't really want to have to initialize so
1269                  * many elements --- it would take too long in comparison to the
1270                  * actual search time.  So we choose a useful skip table size based on
1271                  * the haystack length minus the needle length.  The closer the needle
1272                  * length is to the haystack length the less useful skipping becomes.
1273                  *
1274                  * Note: since we use bit-masking to select table elements, the skip
1275                  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1276                  */
1277                 if (searchlength < 16)
1278                         skiptablemask = 3;
1279                 else if (searchlength < 64)
1280                         skiptablemask = 7;
1281                 else if (searchlength < 128)
1282                         skiptablemask = 15;
1283                 else if (searchlength < 512)
1284                         skiptablemask = 31;
1285                 else if (searchlength < 2048)
1286                         skiptablemask = 63;
1287                 else if (searchlength < 4096)
1288                         skiptablemask = 127;
1289                 else
1290                         skiptablemask = 255;
1291                 state->skiptablemask = skiptablemask;
1292
1293                 /*
1294                  * Initialize the skip table.  We set all elements to the needle
1295                  * length, since this is the correct skip distance for any character
1296                  * not found in the needle.
1297                  */
1298                 for (i = 0; i <= skiptablemask; i++)
1299                         state->skiptable[i] = len2;
1300
1301                 /*
1302                  * Now examine the needle.  For each character except the last one,
1303                  * set the corresponding table element to the appropriate skip
1304                  * distance.  Note that when two characters share the same skip table
1305                  * entry, the one later in the needle must determine the skip
1306                  * distance.
1307                  */
1308                 last = len2 - 1;
1309
1310                 for (i = 0; i < last; i++)
1311                         state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1312         }
1313 }
1314
1315 /*
1316  * Advance to the next match, starting from the end of the previous match
1317  * (or the beginning of the string, on first call).  Returns true if a match
1318  * is found.
1319  *
1320  * Note that this refuses to match an empty-string needle.  Most callers
1321  * will have handled that case specially and we'll never see it here.
1322  */
1323 static bool
1324 text_position_next(TextPositionState *state)
1325 {
1326         int                     needle_len = state->len2;
1327         char       *start_ptr;
1328         char       *matchptr;
1329
1330         if (needle_len <= 0)
1331                 return false;                   /* result for empty pattern */
1332
1333         /* Start from the point right after the previous match. */
1334         if (state->last_match)
1335                 start_ptr = state->last_match + needle_len;
1336         else
1337                 start_ptr = state->str1;
1338
1339 retry:
1340         matchptr = text_position_next_internal(start_ptr, state);
1341
1342         if (!matchptr)
1343                 return false;
1344
1345         /*
1346          * Found a match for the byte sequence.  If this is a multibyte encoding,
1347          * where one character's byte sequence can appear inside a longer
1348          * multi-byte character, we need to verify that the match was at a
1349          * character boundary, not in the middle of a multi-byte character.
1350          */
1351         if (state->is_multibyte_char_in_char)
1352         {
1353                 /* Walk one character at a time, until we reach the match. */
1354
1355                 /* the search should never move backwards. */
1356                 Assert(state->refpoint <= matchptr);
1357
1358                 while (state->refpoint < matchptr)
1359                 {
1360                         /* step to next character. */
1361                         state->refpoint += pg_mblen(state->refpoint);
1362                         state->refpos++;
1363
1364                         /*
1365                          * If we stepped over the match's start position, then it was a
1366                          * false positive, where the byte sequence appeared in the middle
1367                          * of a multi-byte character.  Skip it, and continue the search at
1368                          * the next character boundary.
1369                          */
1370                         if (state->refpoint > matchptr)
1371                         {
1372                                 start_ptr = state->refpoint;
1373                                 goto retry;
1374                         }
1375                 }
1376         }
1377
1378         state->last_match = matchptr;
1379         return true;
1380 }
1381
1382 /*
1383  * Subroutine of text_position_next().  This searches for the raw byte
1384  * sequence, ignoring any multi-byte encoding issues.  Returns the first
1385  * match starting at 'start_ptr', or NULL if no match is found.
1386  */
1387 static char *
1388 text_position_next_internal(char *start_ptr, TextPositionState *state)
1389 {
1390         int                     haystack_len = state->len1;
1391         int                     needle_len = state->len2;
1392         int                     skiptablemask = state->skiptablemask;
1393         const char *haystack = state->str1;
1394         const char *needle = state->str2;
1395         const char *haystack_end = &haystack[haystack_len];
1396         const char *hptr;
1397
1398         Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1399
1400         if (needle_len == 1)
1401         {
1402                 /* No point in using B-M-H for a one-character needle */
1403                 char            nchar = *needle;
1404
1405                 hptr = start_ptr;
1406                 while (hptr < haystack_end)
1407                 {
1408                         if (*hptr == nchar)
1409                                 return (char *) hptr;
1410                         hptr++;
1411                 }
1412         }
1413         else
1414         {
1415                 const char *needle_last = &needle[needle_len - 1];
1416
1417                 /* Start at startpos plus the length of the needle */
1418                 hptr = start_ptr + needle_len - 1;
1419                 while (hptr < haystack_end)
1420                 {
1421                         /* Match the needle scanning *backward* */
1422                         const char *nptr;
1423                         const char *p;
1424
1425                         nptr = needle_last;
1426                         p = hptr;
1427                         while (*nptr == *p)
1428                         {
1429                                 /* Matched it all?      If so, return 1-based position */
1430                                 if (nptr == needle)
1431                                         return (char *) p;
1432                                 nptr--, p--;
1433                         }
1434
1435                         /*
1436                          * No match, so use the haystack char at hptr to decide how far to
1437                          * advance.  If the needle had any occurrence of that character
1438                          * (or more precisely, one sharing the same skiptable entry)
1439                          * before its last character, then we advance far enough to align
1440                          * the last such needle character with that haystack position.
1441                          * Otherwise we can advance by the whole needle length.
1442                          */
1443                         hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1444                 }
1445         }
1446
1447         return 0;                                       /* not found */
1448 }
1449
1450 /*
1451  * Return a pointer to the current match.
1452  *
1453  * The returned pointer points into the original haystack string.
1454  */
1455 static char *
1456 text_position_get_match_ptr(TextPositionState *state)
1457 {
1458         return state->last_match;
1459 }
1460
1461 /*
1462  * Return the offset of the current match.
1463  *
1464  * The offset is in characters, 1-based.
1465  */
1466 static int
1467 text_position_get_match_pos(TextPositionState *state)
1468 {
1469         if (!state->is_multibyte)
1470                 return state->last_match - state->str1 + 1;
1471         else
1472         {
1473                 /* Convert the byte position to char position. */
1474                 while (state->refpoint < state->last_match)
1475                 {
1476                         state->refpoint += pg_mblen(state->refpoint);
1477                         state->refpos++;
1478                 }
1479                 Assert(state->refpoint == state->last_match);
1480                 return state->refpos + 1;
1481         }
1482 }
1483
1484 /*
1485  * Reset search state to the initial state installed by text_position_setup.
1486  *
1487  * The next call to text_position_next will search from the beginning
1488  * of the string.
1489  */
1490 static void
1491 text_position_reset(TextPositionState *state)
1492 {
1493         state->last_match = NULL;
1494         state->refpoint = state->str1;
1495         state->refpos = 0;
1496 }
1497
/*
 * Finish a search begun with text_position_setup.
 *
 * There is currently nothing to release, so this does nothing.
 * text_position() still calls it once it is done with the search state.
 */
static void
text_position_cleanup(TextPositionState *state)
{
	/* no cleanup needed */
}
1503
1504
1505 static void
1506 check_collation_set(Oid collid)
1507 {
1508         if (!OidIsValid(collid))
1509         {
1510                 /*
1511                  * This typically means that the parser could not resolve a conflict
1512                  * of implicit collations, so report it that way.
1513                  */
1514                 ereport(ERROR,
1515                                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1516                                  errmsg("could not determine which collation to use for string comparison"),
1517                                  errhint("Use the COLLATE clause to set the collation explicitly.")));
1518         }
1519 }
1520
1521 /* varstr_cmp()
1522  * Comparison function for text strings with given lengths.
1523  * Includes locale support, but must copy strings to temporary memory
1524  *      to allow null-termination for inputs to strcoll().
1525  * Returns an integer less than, equal to, or greater than zero, indicating
1526  * whether arg1 is less than, equal to, or greater than arg2.
1527  *
1528  * Note: many functions that depend on this are marked leakproof; therefore,
1529  * avoid reporting the actual contents of the input when throwing errors.
1530  * All errors herein should be things that can't happen except on corrupt
1531  * data, anyway; otherwise we will have trouble with indexing strings that
1532  * would cause them.
1533  */
1534 int
1535 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1536 {
1537         int                     result;
1538
1539         check_collation_set(collid);
1540
1541         /*
1542          * Unfortunately, there is no strncoll(), so in the non-C locale case we
1543          * have to do some memory copying.  This turns out to be significantly
1544          * slower, so we optimize the case where LC_COLLATE is C.  We also try to
1545          * optimize relatively-short strings by avoiding palloc/pfree overhead.
1546          */
1547         if (lc_collate_is_c(collid))
1548         {
1549                 result = memcmp(arg1, arg2, Min(len1, len2));
1550                 if ((result == 0) && (len1 != len2))
1551                         result = (len1 < len2) ? -1 : 1;
1552         }
1553         else
1554         {
1555                 char            a1buf[TEXTBUFLEN];
1556                 char            a2buf[TEXTBUFLEN];
1557                 char       *a1p,
1558                                    *a2p;
1559                 pg_locale_t mylocale = 0;
1560
1561                 if (collid != DEFAULT_COLLATION_OID)
1562                         mylocale = pg_newlocale_from_collation(collid);
1563
1564                 /*
1565                  * memcmp() can't tell us which of two unequal strings sorts first,
1566                  * but it's a cheap way to tell if they're equal.  Testing shows that
1567                  * memcmp() followed by strcoll() is only trivially slower than
1568                  * strcoll() by itself, so we don't lose much if this doesn't work out
1569                  * very often, and if it does - for example, because there are many
1570                  * equal strings in the input - then we win big by avoiding expensive
1571                  * collation-aware comparisons.
1572                  */
1573                 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1574                         return 0;
1575
1576 #ifdef WIN32
1577                 /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1578                 if (GetDatabaseEncoding() == PG_UTF8
1579                         && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1580                 {
1581                         int                     a1len;
1582                         int                     a2len;
1583                         int                     r;
1584
1585                         if (len1 >= TEXTBUFLEN / 2)
1586                         {
1587                                 a1len = len1 * 2 + 2;
1588                                 a1p = palloc(a1len);
1589                         }
1590                         else
1591                         {
1592                                 a1len = TEXTBUFLEN;
1593                                 a1p = a1buf;
1594                         }
1595                         if (len2 >= TEXTBUFLEN / 2)
1596                         {
1597                                 a2len = len2 * 2 + 2;
1598                                 a2p = palloc(a2len);
1599                         }
1600                         else
1601                         {
1602                                 a2len = TEXTBUFLEN;
1603                                 a2p = a2buf;
1604                         }
1605
1606                         /* stupid Microsloth API does not work for zero-length input */
1607                         if (len1 == 0)
1608                                 r = 0;
1609                         else
1610                         {
1611                                 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1612                                                                                 (LPWSTR) a1p, a1len / 2);
1613                                 if (!r)
1614                                         ereport(ERROR,
1615                                                         (errmsg("could not convert string to UTF-16: error code %lu",
1616                                                                         GetLastError())));
1617                         }
1618                         ((LPWSTR) a1p)[r] = 0;
1619
1620                         if (len2 == 0)
1621                                 r = 0;
1622                         else
1623                         {
1624                                 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1625                                                                                 (LPWSTR) a2p, a2len / 2);
1626                                 if (!r)
1627                                         ereport(ERROR,
1628                                                         (errmsg("could not convert string to UTF-16: error code %lu",
1629                                                                         GetLastError())));
1630                         }
1631                         ((LPWSTR) a2p)[r] = 0;
1632
1633                         errno = 0;
1634 #ifdef HAVE_LOCALE_T
1635                         if (mylocale)
1636                                 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1637                         else
1638 #endif
1639                                 result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1640                         if (result == 2147483647)       /* _NLSCMPERROR; missing from mingw
1641                                                                                  * headers */
1642                                 ereport(ERROR,
1643                                                 (errmsg("could not compare Unicode strings: %m")));
1644
1645                         /* Break tie if necessary. */
1646                         if (result == 0 &&
1647                                 (!mylocale || mylocale->deterministic))
1648                         {
1649                                 result = memcmp(arg1, arg2, Min(len1, len2));
1650                                 if ((result == 0) && (len1 != len2))
1651                                         result = (len1 < len2) ? -1 : 1;
1652                         }
1653
1654                         if (a1p != a1buf)
1655                                 pfree(a1p);
1656                         if (a2p != a2buf)
1657                                 pfree(a2p);
1658
1659                         return result;
1660                 }
1661 #endif                                                  /* WIN32 */
1662
1663                 if (len1 >= TEXTBUFLEN)
1664                         a1p = (char *) palloc(len1 + 1);
1665                 else
1666                         a1p = a1buf;
1667                 if (len2 >= TEXTBUFLEN)
1668                         a2p = (char *) palloc(len2 + 1);
1669                 else
1670                         a2p = a2buf;
1671
1672                 memcpy(a1p, arg1, len1);
1673                 a1p[len1] = '\0';
1674                 memcpy(a2p, arg2, len2);
1675                 a2p[len2] = '\0';
1676
1677                 if (mylocale)
1678                 {
1679                         if (mylocale->provider == COLLPROVIDER_ICU)
1680                         {
1681 #ifdef USE_ICU
1682 #ifdef HAVE_UCOL_STRCOLLUTF8
1683                                 if (GetDatabaseEncoding() == PG_UTF8)
1684                                 {
1685                                         UErrorCode      status;
1686
1687                                         status = U_ZERO_ERROR;
1688                                         result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1689                                                                                           arg1, len1,
1690                                                                                           arg2, len2,
1691                                                                                           &status);
1692                                         if (U_FAILURE(status))
1693                                                 ereport(ERROR,
1694                                                                 (errmsg("collation failed: %s", u_errorName(status))));
1695                                 }
1696                                 else
1697 #endif
1698                                 {
1699                                         int32_t         ulen1,
1700                                                                 ulen2;
1701                                         UChar      *uchar1,
1702                                                            *uchar2;
1703
1704                                         ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1705                                         ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1706
1707                                         result = ucol_strcoll(mylocale->info.icu.ucol,
1708                                                                                   uchar1, ulen1,
1709                                                                                   uchar2, ulen2);
1710
1711                                         pfree(uchar1);
1712                                         pfree(uchar2);
1713                                 }
1714 #else                                                   /* not USE_ICU */
1715                                 /* shouldn't happen */
1716                                 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1717 #endif                                                  /* not USE_ICU */
1718                         }
1719                         else
1720                         {
1721 #ifdef HAVE_LOCALE_T
1722                                 result = strcoll_l(a1p, a2p, mylocale->info.lt);
1723 #else
1724                                 /* shouldn't happen */
1725                                 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1726 #endif
1727                         }
1728                 }
1729                 else
1730                         result = strcoll(a1p, a2p);
1731
1732                 /* Break tie if necessary. */
1733                 if (result == 0 &&
1734                         (!mylocale || mylocale->deterministic))
1735                         result = strcmp(a1p, a2p);
1736
1737                 if (a1p != a1buf)
1738                         pfree(a1p);
1739                 if (a2p != a2buf)
1740                         pfree(a2p);
1741         }
1742
1743         return result;
1744 }
1745
1746 /* text_cmp()
1747  * Internal comparison function for text strings.
1748  * Returns -1, 0 or 1
1749  */
1750 static int
1751 text_cmp(text *arg1, text *arg2, Oid collid)
1752 {
1753         char       *a1p,
1754                            *a2p;
1755         int                     len1,
1756                                 len2;
1757
1758         a1p = VARDATA_ANY(arg1);
1759         a2p = VARDATA_ANY(arg2);
1760
1761         len1 = VARSIZE_ANY_EXHDR(arg1);
1762         len2 = VARSIZE_ANY_EXHDR(arg2);
1763
1764         return varstr_cmp(a1p, len1, a2p, len2, collid);
1765 }
1766
1767 /*
1768  * Comparison functions for text strings.
1769  *
1770  * Note: btree indexes need these routines not to leak memory; therefore,
1771  * be careful to free working copies of toasted datums.  Most places don't
1772  * need to be so careful.
1773  */
1774
1775 Datum
1776 texteq(PG_FUNCTION_ARGS)
1777 {
1778         Oid                     collid = PG_GET_COLLATION();
1779         bool            result;
1780
1781         check_collation_set(collid);
1782
1783         if (lc_collate_is_c(collid) ||
1784                 collid == DEFAULT_COLLATION_OID ||
1785                 pg_newlocale_from_collation(collid)->deterministic)
1786         {
1787                 Datum           arg1 = PG_GETARG_DATUM(0);
1788                 Datum           arg2 = PG_GETARG_DATUM(1);
1789                 Size            len1,
1790                                         len2;
1791
1792                 /*
1793                  * Since we only care about equality or not-equality, we can avoid all
1794                  * the expense of strcoll() here, and just do bitwise comparison.  In
1795                  * fact, we don't even have to do a bitwise comparison if we can show
1796                  * the lengths of the strings are unequal; which might save us from
1797                  * having to detoast one or both values.
1798                  */
1799                 len1 = toast_raw_datum_size(arg1);
1800                 len2 = toast_raw_datum_size(arg2);
1801                 if (len1 != len2)
1802                         result = false;
1803                 else
1804                 {
1805                         text       *targ1 = DatumGetTextPP(arg1);
1806                         text       *targ2 = DatumGetTextPP(arg2);
1807
1808                         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1809                                                          len1 - VARHDRSZ) == 0);
1810
1811                         PG_FREE_IF_COPY(targ1, 0);
1812                         PG_FREE_IF_COPY(targ2, 1);
1813                 }
1814         }
1815         else
1816         {
1817                 text       *arg1 = PG_GETARG_TEXT_PP(0);
1818                 text       *arg2 = PG_GETARG_TEXT_PP(1);
1819
1820                 result = (text_cmp(arg1, arg2, collid) == 0);
1821
1822                 PG_FREE_IF_COPY(arg1, 0);
1823                 PG_FREE_IF_COPY(arg2, 1);
1824         }
1825
1826         PG_RETURN_BOOL(result);
1827 }
1828
1829 Datum
1830 textne(PG_FUNCTION_ARGS)
1831 {
1832         Oid                     collid = PG_GET_COLLATION();
1833         bool            result;
1834
1835         check_collation_set(collid);
1836
1837         if (lc_collate_is_c(collid) ||
1838                 collid == DEFAULT_COLLATION_OID ||
1839                 pg_newlocale_from_collation(collid)->deterministic)
1840         {
1841                 Datum           arg1 = PG_GETARG_DATUM(0);
1842                 Datum           arg2 = PG_GETARG_DATUM(1);
1843                 Size            len1,
1844                                         len2;
1845
1846                 /* See comment in texteq() */
1847                 len1 = toast_raw_datum_size(arg1);
1848                 len2 = toast_raw_datum_size(arg2);
1849                 if (len1 != len2)
1850                         result = true;
1851                 else
1852                 {
1853                         text       *targ1 = DatumGetTextPP(arg1);
1854                         text       *targ2 = DatumGetTextPP(arg2);
1855
1856                         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1857                                                          len1 - VARHDRSZ) != 0);
1858
1859                         PG_FREE_IF_COPY(targ1, 0);
1860                         PG_FREE_IF_COPY(targ2, 1);
1861                 }
1862         }
1863         else
1864         {
1865                 text       *arg1 = PG_GETARG_TEXT_PP(0);
1866                 text       *arg2 = PG_GETARG_TEXT_PP(1);
1867
1868                 result = (text_cmp(arg1, arg2, collid) != 0);
1869
1870                 PG_FREE_IF_COPY(arg1, 0);
1871                 PG_FREE_IF_COPY(arg2, 1);
1872         }
1873
1874         PG_RETURN_BOOL(result);
1875 }
1876
1877 Datum
1878 text_lt(PG_FUNCTION_ARGS)
1879 {
1880         text       *arg1 = PG_GETARG_TEXT_PP(0);
1881         text       *arg2 = PG_GETARG_TEXT_PP(1);
1882         bool            result;
1883
1884         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1885
1886         PG_FREE_IF_COPY(arg1, 0);
1887         PG_FREE_IF_COPY(arg2, 1);
1888
1889         PG_RETURN_BOOL(result);
1890 }
1891
1892 Datum
1893 text_le(PG_FUNCTION_ARGS)
1894 {
1895         text       *arg1 = PG_GETARG_TEXT_PP(0);
1896         text       *arg2 = PG_GETARG_TEXT_PP(1);
1897         bool            result;
1898
1899         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1900
1901         PG_FREE_IF_COPY(arg1, 0);
1902         PG_FREE_IF_COPY(arg2, 1);
1903
1904         PG_RETURN_BOOL(result);
1905 }
1906
1907 Datum
1908 text_gt(PG_FUNCTION_ARGS)
1909 {
1910         text       *arg1 = PG_GETARG_TEXT_PP(0);
1911         text       *arg2 = PG_GETARG_TEXT_PP(1);
1912         bool            result;
1913
1914         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1915
1916         PG_FREE_IF_COPY(arg1, 0);
1917         PG_FREE_IF_COPY(arg2, 1);
1918
1919         PG_RETURN_BOOL(result);
1920 }
1921
1922 Datum
1923 text_ge(PG_FUNCTION_ARGS)
1924 {
1925         text       *arg1 = PG_GETARG_TEXT_PP(0);
1926         text       *arg2 = PG_GETARG_TEXT_PP(1);
1927         bool            result;
1928
1929         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1930
1931         PG_FREE_IF_COPY(arg1, 0);
1932         PG_FREE_IF_COPY(arg2, 1);
1933
1934         PG_RETURN_BOOL(result);
1935 }
1936
1937 Datum
1938 text_starts_with(PG_FUNCTION_ARGS)
1939 {
1940         Datum           arg1 = PG_GETARG_DATUM(0);
1941         Datum           arg2 = PG_GETARG_DATUM(1);
1942         Oid                     collid = PG_GET_COLLATION();
1943         pg_locale_t mylocale = 0;
1944         bool            result;
1945         Size            len1,
1946                                 len2;
1947
1948         check_collation_set(collid);
1949
1950         if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1951                 mylocale = pg_newlocale_from_collation(collid);
1952
1953         if (mylocale && !mylocale->deterministic)
1954                 ereport(ERROR,
1955                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1956                                  errmsg("nondeterministic collations are not supported for substring searches")));
1957
1958         len1 = toast_raw_datum_size(arg1);
1959         len2 = toast_raw_datum_size(arg2);
1960         if (len2 > len1)
1961                 result = false;
1962         else
1963         {
1964                 text       *targ1 = text_substring(arg1, 1, len2, false);
1965                 text       *targ2 = DatumGetTextPP(arg2);
1966
1967                 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1968                                                  VARSIZE_ANY_EXHDR(targ2)) == 0);
1969
1970                 PG_FREE_IF_COPY(targ1, 0);
1971                 PG_FREE_IF_COPY(targ2, 1);
1972         }
1973
1974         PG_RETURN_BOOL(result);
1975 }
1976
1977 Datum
1978 bttextcmp(PG_FUNCTION_ARGS)
1979 {
1980         text       *arg1 = PG_GETARG_TEXT_PP(0);
1981         text       *arg2 = PG_GETARG_TEXT_PP(1);
1982         int32           result;
1983
1984         result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1985
1986         PG_FREE_IF_COPY(arg1, 0);
1987         PG_FREE_IF_COPY(arg2, 1);
1988
1989         PG_RETURN_INT32(result);
1990 }
1991
1992 Datum
1993 bttextsortsupport(PG_FUNCTION_ARGS)
1994 {
1995         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1996         Oid                     collid = ssup->ssup_collation;
1997         MemoryContext oldcontext;
1998
1999         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2000
2001         /* Use generic string SortSupport */
2002         varstr_sortsupport(ssup, TEXTOID, collid);
2003
2004         MemoryContextSwitchTo(oldcontext);
2005
2006         PG_RETURN_VOID();
2007 }
2008
2009 /*
2010  * Generic sortsupport interface for character type's operator classes.
2011  * Includes locale support, and support for BpChar semantics (i.e. removing
2012  * trailing spaces before comparison).
2013  *
2014  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2015  * same representation.  Callers that always use the C collation (e.g.
2016  * non-collatable type callers like bytea) may have NUL bytes in their strings;
2017  * this will not work with any other collation, though.
2018  */
2019 void
2020 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
2021 {
2022         bool            abbreviate = ssup->abbreviate;
2023         bool            collate_c = false;
2024         VarStringSortSupport *sss;
2025         pg_locale_t locale = 0;
2026
2027         check_collation_set(collid);
2028
2029         /*
2030          * If possible, set ssup->comparator to a function which can be used to
2031          * directly compare two datums.  If we can do this, we'll avoid the
2032          * overhead of a trip through the fmgr layer for every comparison, which
2033          * can be substantial.
2034          *
2035          * Most typically, we'll set the comparator to varlenafastcmp_locale,
2036          * which uses strcoll() to perform comparisons.  We use that for the
2037          * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2038          * LC_COLLATE = C, we can make things quite a bit faster with
2039          * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2040          * memcmp() rather than strcoll().
2041          */
2042         if (lc_collate_is_c(collid))
2043         {
2044                 if (typid == BPCHAROID)
2045                         ssup->comparator = bpcharfastcmp_c;
2046                 else if (typid == NAMEOID)
2047                 {
2048                         ssup->comparator = namefastcmp_c;
2049                         /* Not supporting abbreviation with type NAME, for now */
2050                         abbreviate = false;
2051                 }
2052                 else
2053                         ssup->comparator = varstrfastcmp_c;
2054
2055                 collate_c = true;
2056         }
2057         else
2058         {
2059                 /*
2060                  * We need a collation-sensitive comparison.  To make things faster,
2061                  * we'll figure out the collation based on the locale id and cache the
2062                  * result.
2063                  */
2064                 if (collid != DEFAULT_COLLATION_OID)
2065                         locale = pg_newlocale_from_collation(collid);
2066
2067                 /*
2068                  * There is a further exception on Windows.  When the database
2069                  * encoding is UTF-8 and we are not using the C collation, complex
2070                  * hacks are required.  We don't currently have a comparator that
2071                  * handles that case, so we fall back on the slow method of having the
2072                  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2073                  * trampoline.  ICU locales work just the same on Windows, however.
2074                  */
2075 #ifdef WIN32
2076                 if (GetDatabaseEncoding() == PG_UTF8 &&
2077                         !(locale && locale->provider == COLLPROVIDER_ICU))
2078                         return;
2079 #endif
2080
2081                 /*
2082                  * We use varlenafastcmp_locale except for type NAME.
2083                  */
2084                 if (typid == NAMEOID)
2085                 {
2086                         ssup->comparator = namefastcmp_locale;
2087                         /* Not supporting abbreviation with type NAME, for now */
2088                         abbreviate = false;
2089                 }
2090                 else
2091                         ssup->comparator = varlenafastcmp_locale;
2092         }
2093
2094         /*
2095          * Unfortunately, it seems that abbreviation for non-C collations is
2096          * broken on many common platforms; testing of multiple versions of glibc
2097          * reveals that, for many locales, strcoll() and strxfrm() do not return
2098          * consistent results, which is fatal to this optimization.  While no
2099          * other libc other than Cygwin has so far been shown to have a problem,
2100          * we take the conservative course of action for right now and disable
2101          * this categorically.  (Users who are certain this isn't a problem on
2102          * their system can define TRUST_STRXFRM.)
2103          *
2104          * Even apart from the risk of broken locales, it's possible that there
2105          * are platforms where the use of abbreviated keys should be disabled at
2106          * compile time.  Having only 4 byte datums could make worst-case
2107          * performance drastically more likely, for example.  Moreover, macOS's
2108          * strxfrm() implementation is known to not effectively concentrate a
2109          * significant amount of entropy from the original string in earlier
2110          * transformed blobs.  It's possible that other supported platforms are
2111          * similarly encumbered.  So, if we ever get past disabling this
2112          * categorically, we may still want or need to disable it for particular
2113          * platforms.
2114          */
2115 #ifndef TRUST_STRXFRM
2116         if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2117                 abbreviate = false;
2118 #endif
2119
2120         /*
2121          * If we're using abbreviated keys, or if we're using a locale-aware
2122          * comparison, we need to initialize a VarStringSortSupport object. Both
2123          * cases will make use of the temporary buffers we initialize here for
2124          * scratch space (and to detect requirement for BpChar semantics from
2125          * caller), and the abbreviation case requires additional state.
2126          */
2127         if (abbreviate || !collate_c)
2128         {
2129                 sss = palloc(sizeof(VarStringSortSupport));
2130                 sss->buf1 = palloc(TEXTBUFLEN);
2131                 sss->buflen1 = TEXTBUFLEN;
2132                 sss->buf2 = palloc(TEXTBUFLEN);
2133                 sss->buflen2 = TEXTBUFLEN;
2134                 /* Start with invalid values */
2135                 sss->last_len1 = -1;
2136                 sss->last_len2 = -1;
2137                 /* Initialize */
2138                 sss->last_returned = 0;
2139                 sss->locale = locale;
2140
2141                 /*
2142                  * To avoid somehow confusing a strxfrm() blob and an original string,
2143                  * constantly keep track of the variety of data that buf1 and buf2
2144                  * currently contain.
2145                  *
2146                  * Comparisons may be interleaved with conversion calls.  Frequently,
2147                  * conversions and comparisons are batched into two distinct phases,
2148                  * but the correctness of caching cannot hinge upon this.  For
2149                  * comparison caching, buffer state is only trusted if cache_blob is
2150                  * found set to false, whereas strxfrm() caching only trusts the state
2151                  * when cache_blob is found set to true.
2152                  *
2153                  * Arbitrarily initialize cache_blob to true.
2154                  */
2155                 sss->cache_blob = true;
2156                 sss->collate_c = collate_c;
2157                 sss->typid = typid;
2158                 ssup->ssup_extra = sss;
2159
2160                 /*
2161                  * If possible, plan to use the abbreviated keys optimization.  The
2162                  * core code may switch back to authoritative comparator should
2163                  * abbreviation be aborted.
2164                  */
2165                 if (abbreviate)
2166                 {
2167                         sss->prop_card = 0.20;
2168                         initHyperLogLog(&sss->abbr_card, 10);
2169                         initHyperLogLog(&sss->full_card, 10);
2170                         ssup->abbrev_full_comparator = ssup->comparator;
2171                         ssup->comparator = varstrcmp_abbrev;
2172                         ssup->abbrev_converter = varstr_abbrev_convert;
2173                         ssup->abbrev_abort = varstr_abbrev_abort;
2174                 }
2175         }
2176 }
2177
2178 /*
2179  * sortsupport comparison func (for C locale case)
2180  */
2181 static int
2182 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2183 {
2184         VarString  *arg1 = DatumGetVarStringPP(x);
2185         VarString  *arg2 = DatumGetVarStringPP(y);
2186         char       *a1p,
2187                            *a2p;
2188         int                     len1,
2189                                 len2,
2190                                 result;
2191
2192         a1p = VARDATA_ANY(arg1);
2193         a2p = VARDATA_ANY(arg2);
2194
2195         len1 = VARSIZE_ANY_EXHDR(arg1);
2196         len2 = VARSIZE_ANY_EXHDR(arg2);
2197
2198         result = memcmp(a1p, a2p, Min(len1, len2));
2199         if ((result == 0) && (len1 != len2))
2200                 result = (len1 < len2) ? -1 : 1;
2201
2202         /* We can't afford to leak memory here. */
2203         if (PointerGetDatum(arg1) != x)
2204                 pfree(arg1);
2205         if (PointerGetDatum(arg2) != y)
2206                 pfree(arg2);
2207
2208         return result;
2209 }
2210
2211 /*
2212  * sortsupport comparison func (for BpChar C locale case)
2213  *
2214  * BpChar outsources its sortsupport to this module.  Specialization for the
2215  * varstr_sortsupport BpChar case, modeled on
2216  * internal_bpchar_pattern_compare().
2217  */
2218 static int
2219 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2220 {
2221         BpChar     *arg1 = DatumGetBpCharPP(x);
2222         BpChar     *arg2 = DatumGetBpCharPP(y);
2223         char       *a1p,
2224                            *a2p;
2225         int                     len1,
2226                                 len2,
2227                                 result;
2228
2229         a1p = VARDATA_ANY(arg1);
2230         a2p = VARDATA_ANY(arg2);
2231
2232         len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2233         len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2234
2235         result = memcmp(a1p, a2p, Min(len1, len2));
2236         if ((result == 0) && (len1 != len2))
2237                 result = (len1 < len2) ? -1 : 1;
2238
2239         /* We can't afford to leak memory here. */
2240         if (PointerGetDatum(arg1) != x)
2241                 pfree(arg1);
2242         if (PointerGetDatum(arg2) != y)
2243                 pfree(arg2);
2244
2245         return result;
2246 }
2247
2248 /*
2249  * sortsupport comparison func (for NAME C locale case)
2250  */
2251 static int
2252 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2253 {
2254         Name            arg1 = DatumGetName(x);
2255         Name            arg2 = DatumGetName(y);
2256
2257         return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2258 }
2259
2260 /*
2261  * sortsupport comparison func (for locale case with all varlena types)
2262  */
2263 static int
2264 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2265 {
2266         VarString  *arg1 = DatumGetVarStringPP(x);
2267         VarString  *arg2 = DatumGetVarStringPP(y);
2268         char       *a1p,
2269                            *a2p;
2270         int                     len1,
2271                                 len2,
2272                                 result;
2273
2274         a1p = VARDATA_ANY(arg1);
2275         a2p = VARDATA_ANY(arg2);
2276
2277         len1 = VARSIZE_ANY_EXHDR(arg1);
2278         len2 = VARSIZE_ANY_EXHDR(arg2);
2279
2280         result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2281
2282         /* We can't afford to leak memory here. */
2283         if (PointerGetDatum(arg1) != x)
2284                 pfree(arg1);
2285         if (PointerGetDatum(arg2) != y)
2286                 pfree(arg2);
2287
2288         return result;
2289 }
2290
2291 /*
2292  * sortsupport comparison func (for locale case with NAME type)
2293  */
2294 static int
2295 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2296 {
2297         Name            arg1 = DatumGetName(x);
2298         Name            arg2 = DatumGetName(y);
2299
2300         return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2301                                                                 NameStr(*arg2), strlen(NameStr(*arg2)),
2302                                                                 ssup);
2303 }
2304
2305 /*
2306  * sortsupport comparison func for locale cases
2307  */
2308 static int
2309 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2310 {
2311         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2312         int                     result;
2313         bool            arg1_match;
2314
2315         /* Fast pre-check for equality, as discussed in varstr_cmp() */
2316         if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2317         {
2318                 /*
2319                  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2320                  * last_len2.  Existing contents of buffers might still be used by
2321                  * next call.
2322                  *
2323                  * It's fine to allow the comparison of BpChar padding bytes here,
2324                  * even though that implies that the memcmp() will usually be
2325                  * performed for BpChar callers (though multibyte characters could
2326                  * still prevent that from occurring).  The memcmp() is still very
2327                  * cheap, and BpChar's funny semantics have us remove trailing spaces
2328                  * (not limited to padding), so we need make no distinction between
2329                  * padding space characters and "real" space characters.
2330                  */
2331                 return 0;
2332         }
2333
2334         if (sss->typid == BPCHAROID)
2335         {
2336                 /* Get true number of bytes, ignoring trailing spaces */
2337                 len1 = bpchartruelen(a1p, len1);
2338                 len2 = bpchartruelen(a2p, len2);
2339         }
2340
2341         if (len1 >= sss->buflen1)
2342         {
2343                 pfree(sss->buf1);
2344                 sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2345                 sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2346         }
2347         if (len2 >= sss->buflen2)
2348         {
2349                 pfree(sss->buf2);
2350                 sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2351                 sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2352         }
2353
2354         /*
2355          * We're likely to be asked to compare the same strings repeatedly, and
2356          * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2357          * comparisons, even though in general there is no reason to think that
2358          * that will work out (every string datum may be unique).  Caching does
2359          * not slow things down measurably when it doesn't work out, and can speed
2360          * things up by rather a lot when it does.  In part, this is because the
2361          * memcmp() compares data from cachelines that are needed in L1 cache even
2362          * when the last comparison's result cannot be reused.
2363          */
2364         arg1_match = true;
2365         if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2366         {
2367                 arg1_match = false;
2368                 memcpy(sss->buf1, a1p, len1);
2369                 sss->buf1[len1] = '\0';
2370                 sss->last_len1 = len1;
2371         }
2372
2373         /*
2374          * If we're comparing the same two strings as last time, we can return the
2375          * same answer without calling strcoll() again.  This is more likely than
2376          * it seems (at least with moderate to low cardinality sets), because
2377          * quicksort compares the same pivot against many values.
2378          */
2379         if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2380         {
2381                 memcpy(sss->buf2, a2p, len2);
2382                 sss->buf2[len2] = '\0';
2383                 sss->last_len2 = len2;
2384         }
2385         else if (arg1_match && !sss->cache_blob)
2386         {
2387                 /* Use result cached following last actual strcoll() call */
2388                 return sss->last_returned;
2389         }
2390
2391         if (sss->locale)
2392         {
2393                 if (sss->locale->provider == COLLPROVIDER_ICU)
2394                 {
2395 #ifdef USE_ICU
2396 #ifdef HAVE_UCOL_STRCOLLUTF8
2397                         if (GetDatabaseEncoding() == PG_UTF8)
2398                         {
2399                                 UErrorCode      status;
2400
2401                                 status = U_ZERO_ERROR;
2402                                 result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2403                                                                                   a1p, len1,
2404                                                                                   a2p, len2,
2405                                                                                   &status);
2406                                 if (U_FAILURE(status))
2407                                         ereport(ERROR,
2408                                                         (errmsg("collation failed: %s", u_errorName(status))));
2409                         }
2410                         else
2411 #endif
2412                         {
2413                                 int32_t         ulen1,
2414                                                         ulen2;
2415                                 UChar      *uchar1,
2416                                                    *uchar2;
2417
2418                                 ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2419                                 ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2420
2421                                 result = ucol_strcoll(sss->locale->info.icu.ucol,
2422                                                                           uchar1, ulen1,
2423                                                                           uchar2, ulen2);
2424
2425                                 pfree(uchar1);
2426                                 pfree(uchar2);
2427                         }
2428 #else                                                   /* not USE_ICU */
2429                         /* shouldn't happen */
2430                         elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2431 #endif                                                  /* not USE_ICU */
2432                 }
2433                 else
2434                 {
2435 #ifdef HAVE_LOCALE_T
2436                         result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2437 #else
2438                         /* shouldn't happen */
2439                         elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2440 #endif
2441                 }
2442         }
2443         else
2444                 result = strcoll(sss->buf1, sss->buf2);
2445
2446         /* Break tie if necessary. */
2447         if (result == 0 &&
2448                 (!sss->locale || sss->locale->deterministic))
2449                 result = strcmp(sss->buf1, sss->buf2);
2450
2451         /* Cache result, perhaps saving an expensive strcoll() call next time */
2452         sss->cache_blob = false;
2453         sss->last_returned = result;
2454         return result;
2455 }
2456
2457 /*
2458  * Abbreviated key comparison func
2459  */
2460 static int
2461 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2462 {
2463         /*
2464          * When 0 is returned, the core system will call varstrfastcmp_c()
2465          * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale().  Even a
2466          * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2467          * authoritatively, for the same reason that there is a strcoll()
2468          * tie-breaker call to strcmp() in varstr_cmp().
2469          */
2470         if (x > y)
2471                 return 1;
2472         else if (x == y)
2473                 return 0;
2474         else
2475                 return -1;
2476 }
2477
2478 /*
2479  * Conversion routine for sortsupport.  Converts original to abbreviated key
2480  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2481  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2482  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2483  * locale is used, or in case of bytea, just memcpy() from original instead.
2484  */
2485 static Datum
2486 varstr_abbrev_convert(Datum original, SortSupport ssup)
2487 {
2488         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2489         VarString  *authoritative = DatumGetVarStringPP(original);
2490         char       *authoritative_data = VARDATA_ANY(authoritative);
2491
2492         /* working state */
2493         Datum           res;
2494         char       *pres;
2495         int                     len;
2496         uint32          hash;
2497
2498         pres = (char *) &res;
2499         /* memset(), so any non-overwritten bytes are NUL */
2500         memset(pres, 0, sizeof(Datum));
2501         len = VARSIZE_ANY_EXHDR(authoritative);
2502
2503         /* Get number of bytes, ignoring trailing spaces */
2504         if (sss->typid == BPCHAROID)
2505                 len = bpchartruelen(authoritative_data, len);
2506
2507         /*
2508          * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2509          * abbreviate keys.  The full comparator for the C locale is always
2510          * memcmp().  It would be incorrect to allow bytea callers (callers that
2511          * always force the C collation -- bytea isn't a collatable type, but this
2512          * approach is convenient) to use strxfrm().  This is because bytea
2513          * strings may contain NUL bytes.  Besides, this should be faster, too.
2514          *
2515          * More generally, it's okay that bytea callers can have NUL bytes in
2516          * strings because varstrcmp_abbrev() need not make a distinction between
2517          * terminating NUL bytes, and NUL bytes representing actual NULs in the
2518          * authoritative representation.  Hopefully a comparison at or past one
2519          * abbreviated key's terminating NUL byte will resolve the comparison
2520          * without consulting the authoritative representation; specifically, some
2521          * later non-NUL byte in the longer string can resolve the comparison
2522          * against a subsequent terminating NUL in the shorter string.  There will
2523          * usually be what is effectively a "length-wise" resolution there and
2524          * then.
2525          *
2526          * If that doesn't work out -- if all bytes in the longer string
2527          * positioned at or past the offset of the smaller string's (first)
2528          * terminating NUL are actually representative of NUL bytes in the
2529          * authoritative binary string (perhaps with some *terminating* NUL bytes
2530          * towards the end of the longer string iff it happens to still be small)
2531          * -- then an authoritative tie-breaker will happen, and do the right
2532          * thing: explicitly consider string length.
2533          */
2534         if (sss->collate_c)
2535                 memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2536         else
2537         {
2538                 Size            bsize;
2539 #ifdef USE_ICU
2540                 int32_t         ulen = -1;
2541                 UChar      *uchar = NULL;
2542 #endif
2543
2544                 /*
2545                  * We're not using the C collation, so fall back on strxfrm or ICU
2546                  * analogs.
2547                  */
2548
2549                 /* By convention, we use buffer 1 to store and NUL-terminate */
2550                 if (len >= sss->buflen1)
2551                 {
2552                         pfree(sss->buf1);
2553                         sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2554                         sss->buf1 = palloc(sss->buflen1);
2555                 }
2556
2557                 /* Might be able to reuse strxfrm() blob from last call */
2558                 if (sss->last_len1 == len && sss->cache_blob &&
2559                         memcmp(sss->buf1, authoritative_data, len) == 0)
2560                 {
2561                         memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2562                         /* No change affecting cardinality, so no hashing required */
2563                         goto done;
2564                 }
2565
2566                 memcpy(sss->buf1, authoritative_data, len);
2567
2568                 /*
2569                  * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2570                  * necessary for ICU, but doesn't hurt.
2571                  */
2572                 sss->buf1[len] = '\0';
2573                 sss->last_len1 = len;
2574
2575 #ifdef USE_ICU
2576                 /* When using ICU and not UTF8, convert string to UChar. */
2577                 if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2578                         GetDatabaseEncoding() != PG_UTF8)
2579                         ulen = icu_to_uchar(&uchar, sss->buf1, len);
2580 #endif
2581
2582                 /*
2583                  * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2584                  * and try again.  Both of these functions have the result buffer
2585                  * content undefined if the result did not fit, so we need to retry
2586                  * until everything fits, even though we only need the first few bytes
2587                  * in the end.  When using ucol_nextSortKeyPart(), however, we only
2588                  * ask for as many bytes as we actually need.
2589                  */
2590                 for (;;)
2591                 {
2592 #ifdef USE_ICU
2593                         if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2594                         {
2595                                 /*
2596                                  * When using UTF8, use the iteration interface so we only
2597                                  * need to produce as many bytes as we actually need.
2598                                  */
2599                                 if (GetDatabaseEncoding() == PG_UTF8)
2600                                 {
2601                                         UCharIterator iter;
2602                                         uint32_t        state[2];
2603                                         UErrorCode      status;
2604
2605                                         uiter_setUTF8(&iter, sss->buf1, len);
2606                                         state[0] = state[1] = 0;        /* won't need that again */
2607                                         status = U_ZERO_ERROR;
2608                                         bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2609                                                                                                  &iter,
2610                                                                                                  state,
2611                                                                                                  (uint8_t *) sss->buf2,
2612                                                                                                  Min(sizeof(Datum), sss->buflen2),
2613                                                                                                  &status);
2614                                         if (U_FAILURE(status))
2615                                                 ereport(ERROR,
2616                                                                 (errmsg("sort key generation failed: %s",
2617                                                                                 u_errorName(status))));
2618                                 }
2619                                 else
2620                                         bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2621                                                                                         uchar, ulen,
2622                                                                                         (uint8_t *) sss->buf2, sss->buflen2);
2623                         }
2624                         else
2625 #endif
2626 #ifdef HAVE_LOCALE_T
2627                         if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2628                                 bsize = strxfrm_l(sss->buf2, sss->buf1,
2629                                                                   sss->buflen2, sss->locale->info.lt);
2630                         else
2631 #endif
2632                                 bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2633
2634                         sss->last_len2 = bsize;
2635                         if (bsize < sss->buflen2)
2636                                 break;
2637
2638                         /*
2639                          * Grow buffer and retry.
2640                          */
2641                         pfree(sss->buf2);
2642                         sss->buflen2 = Max(bsize + 1,
2643                                                            Min(sss->buflen2 * 2, MaxAllocSize));
2644                         sss->buf2 = palloc(sss->buflen2);
2645                 }
2646
2647                 /*
2648                  * Every Datum byte is always compared.  This is safe because the
2649                  * strxfrm() blob is itself NUL terminated, leaving no danger of
2650                  * misinterpreting any NUL bytes not intended to be interpreted as
2651                  * logically representing termination.
2652                  *
2653                  * (Actually, even if there were NUL bytes in the blob it would be
2654                  * okay.  See remarks on bytea case above.)
2655                  */
2656                 memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2657
2658 #ifdef USE_ICU
2659                 if (uchar)
2660                         pfree(uchar);
2661 #endif
2662         }
2663
2664         /*
2665          * Maintain approximate cardinality of both abbreviated keys and original,
2666          * authoritative keys using HyperLogLog.  Used as cheap insurance against
2667          * the worst case, where we do many string transformations for no saving
2668          * in full strcoll()-based comparisons.  These statistics are used by
2669          * varstr_abbrev_abort().
2670          *
2671          * First, Hash key proper, or a significant fraction of it.  Mix in length
2672          * in order to compensate for cases where differences are past
2673          * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2674          */
2675         hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2676                                                                    Min(len, PG_CACHE_LINE_SIZE)));
2677
2678         if (len > PG_CACHE_LINE_SIZE)
2679                 hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2680
2681         addHyperLogLog(&sss->full_card, hash);
2682
2683         /* Hash abbreviated key */
2684 #if SIZEOF_DATUM == 8
2685         {
2686                 uint32          lohalf,
2687                                         hihalf;
2688
2689                 lohalf = (uint32) res;
2690                 hihalf = (uint32) (res >> 32);
2691                 hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2692         }
2693 #else                                                   /* SIZEOF_DATUM != 8 */
2694         hash = DatumGetUInt32(hash_uint32((uint32) res));
2695 #endif
2696
2697         addHyperLogLog(&sss->abbr_card, hash);
2698
2699         /* Cache result, perhaps saving an expensive strxfrm() call next time */
2700         sss->cache_blob = true;
2701 done:
2702
2703         /*
2704          * Byteswap on little-endian machines.
2705          *
2706          * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2707          * comparator) works correctly on all platforms.  If we didn't do this,
2708          * the comparator would have to call memcmp() with a pair of pointers to
2709          * the first byte of each abbreviated key, which is slower.
2710          */
2711         res = DatumBigEndianToNative(res);
2712
2713         /* Don't leak memory here */
2714         if (PointerGetDatum(authoritative) != original)
2715                 pfree(authoritative);
2716
2717         return res;
2718 }
2719
2720 /*
2721  * Callback for estimating effectiveness of abbreviated key optimization, using
2722  * heuristic rules.  Returns value indicating if the abbreviation optimization
2723  * should be aborted, based on its projected effectiveness.
2724  */
2725 static bool
2726 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2727 {
2728         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2729         double          abbrev_distinct,
2730                                 key_distinct;
2731
2732         Assert(ssup->abbreviate);
2733
2734         /* Have a little patience */
2735         if (memtupcount < 100)
2736                 return false;
2737
2738         abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2739         key_distinct = estimateHyperLogLog(&sss->full_card);
2740
2741         /*
2742          * Clamp cardinality estimates to at least one distinct value.  While
2743          * NULLs are generally disregarded, if only NULL values were seen so far,
2744          * that might misrepresent costs if we failed to clamp.
2745          */
2746         if (abbrev_distinct <= 1.0)
2747                 abbrev_distinct = 1.0;
2748
2749         if (key_distinct <= 1.0)
2750                 key_distinct = 1.0;
2751
2752         /*
2753          * In the worst case all abbreviated keys are identical, while at the same
2754          * time there are differences within full key strings not captured in
2755          * abbreviations.
2756          */
2757 #ifdef TRACE_SORT
2758         if (trace_sort)
2759         {
2760                 double          norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2761
2762                 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2763                          "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2764                          memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2765                          sss->prop_card);
2766         }
2767 #endif
2768
2769         /*
2770          * If the number of distinct abbreviated keys approximately matches the
2771          * number of distinct authoritative original keys, that's reason enough to
2772          * proceed.  We can win even with a very low cardinality set if most
2773          * tie-breakers only memcmp().  This is by far the most important
2774          * consideration.
2775          *
2776          * While comparisons that are resolved at the abbreviated key level are
2777          * considerably cheaper than tie-breakers resolved with memcmp(), both of
2778          * those two outcomes are so much cheaper than a full strcoll() once
2779          * sorting is underway that it doesn't seem worth it to weigh abbreviated
2780          * cardinality against the overall size of the set in order to more
2781          * accurately model costs.  Assume that an abbreviated comparison, and an
2782          * abbreviated comparison with a cheap memcmp()-based authoritative
2783          * resolution are equivalent.
2784          */
2785         if (abbrev_distinct > key_distinct * sss->prop_card)
2786         {
2787                 /*
2788                  * When we have exceeded 10,000 tuples, decay required cardinality
2789                  * aggressively for next call.
2790                  *
2791                  * This is useful because the number of comparisons required on
2792                  * average increases at a linearithmic rate, and at roughly 10,000
2793                  * tuples that factor will start to dominate over the linear costs of
2794                  * string transformation (this is a conservative estimate).  The decay
2795                  * rate is chosen to be a little less aggressive than halving -- which
2796                  * (since we're called at points at which memtupcount has doubled)
2797                  * would never see the cost model actually abort past the first call
2798                  * following a decay.  This decay rate is mostly a precaution against
2799                  * a sudden, violent swing in how well abbreviated cardinality tracks
2800                  * full key cardinality.  The decay also serves to prevent a marginal
2801                  * case from being aborted too late, when too much has already been
2802                  * invested in string transformation.
2803                  *
2804                  * It's possible for sets of several million distinct strings with
2805                  * mere tens of thousands of distinct abbreviated keys to still
2806                  * benefit very significantly.  This will generally occur provided
2807                  * each abbreviated key is a proxy for a roughly uniform number of the
2808                  * set's full keys. If it isn't so, we hope to catch that early and
2809                  * abort.  If it isn't caught early, by the time the problem is
2810                  * apparent it's probably not worth aborting.
2811                  */
2812                 if (memtupcount > 10000)
2813                         sss->prop_card *= 0.65;
2814
2815                 return false;
2816         }
2817
2818         /*
2819          * Abort abbreviation strategy.
2820          *
2821          * The worst case, where all abbreviated keys are identical while all
2822          * original strings differ will typically only see a regression of about
2823          * 10% in execution time for small to medium sized lists of strings.
2824          * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2825          * often expect very large improvements, particularly with sets of strings
2826          * of moderately high to high abbreviated cardinality.  There is little to
2827          * lose but much to gain, which our strategy reflects.
2828          */
2829 #ifdef TRACE_SORT
2830         if (trace_sort)
2831                 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2832                          "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2833                          memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2834 #endif
2835
2836         return true;
2837 }
2838
2839 /*
2840  * Generic equalimage support function for character type's operator classes.
2841  * Disables the use of deduplication with nondeterministic collations.
2842  */
2843 Datum
2844 btvarstrequalimage(PG_FUNCTION_ARGS)
2845 {
2846         /* Oid          opcintype = PG_GETARG_OID(0); */
2847         Oid                     collid = PG_GET_COLLATION();
2848
2849         check_collation_set(collid);
2850
2851         if (lc_collate_is_c(collid) ||
2852                 collid == DEFAULT_COLLATION_OID ||
2853                 get_collation_isdeterministic(collid))
2854                 PG_RETURN_BOOL(true);
2855         else
2856                 PG_RETURN_BOOL(false);
2857 }
2858
2859 Datum
2860 text_larger(PG_FUNCTION_ARGS)
2861 {
2862         text       *arg1 = PG_GETARG_TEXT_PP(0);
2863         text       *arg2 = PG_GETARG_TEXT_PP(1);
2864         text       *result;
2865
2866         result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2867
2868         PG_RETURN_TEXT_P(result);
2869 }
2870
2871 Datum
2872 text_smaller(PG_FUNCTION_ARGS)
2873 {
2874         text       *arg1 = PG_GETARG_TEXT_PP(0);
2875         text       *arg2 = PG_GETARG_TEXT_PP(1);
2876         text       *result;
2877
2878         result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2879
2880         PG_RETURN_TEXT_P(result);
2881 }
2882
2883
2884 /*
2885  * Cross-type comparison functions for types text and name.
2886  */
2887
2888 Datum
2889 nameeqtext(PG_FUNCTION_ARGS)
2890 {
2891         Name            arg1 = PG_GETARG_NAME(0);
2892         text       *arg2 = PG_GETARG_TEXT_PP(1);
2893         size_t          len1 = strlen(NameStr(*arg1));
2894         size_t          len2 = VARSIZE_ANY_EXHDR(arg2);
2895         Oid                     collid = PG_GET_COLLATION();
2896         bool            result;
2897
2898         check_collation_set(collid);
2899
2900         if (collid == C_COLLATION_OID)
2901                 result = (len1 == len2 &&
2902                                   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2903         else
2904                 result = (varstr_cmp(NameStr(*arg1), len1,
2905                                                          VARDATA_ANY(arg2), len2,
2906                                                          collid) == 0);
2907
2908         PG_FREE_IF_COPY(arg2, 1);
2909
2910         PG_RETURN_BOOL(result);
2911 }
2912
2913 Datum
2914 texteqname(PG_FUNCTION_ARGS)
2915 {
2916         text       *arg1 = PG_GETARG_TEXT_PP(0);
2917         Name            arg2 = PG_GETARG_NAME(1);
2918         size_t          len1 = VARSIZE_ANY_EXHDR(arg1);
2919         size_t          len2 = strlen(NameStr(*arg2));
2920         Oid                     collid = PG_GET_COLLATION();
2921         bool            result;
2922
2923         check_collation_set(collid);
2924
2925         if (collid == C_COLLATION_OID)
2926                 result = (len1 == len2 &&
2927                                   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2928         else
2929                 result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2930                                                          NameStr(*arg2), len2,
2931                                                          collid) == 0);
2932
2933         PG_FREE_IF_COPY(arg1, 0);
2934
2935         PG_RETURN_BOOL(result);
2936 }
2937
2938 Datum
2939 namenetext(PG_FUNCTION_ARGS)
2940 {
2941         Name            arg1 = PG_GETARG_NAME(0);
2942         text       *arg2 = PG_GETARG_TEXT_PP(1);
2943         size_t          len1 = strlen(NameStr(*arg1));
2944         size_t          len2 = VARSIZE_ANY_EXHDR(arg2);
2945         Oid                     collid = PG_GET_COLLATION();
2946         bool            result;
2947
2948         check_collation_set(collid);
2949
2950         if (collid == C_COLLATION_OID)
2951                 result = !(len1 == len2 &&
2952                                    memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2953         else
2954                 result = !(varstr_cmp(NameStr(*arg1), len1,
2955                                                           VARDATA_ANY(arg2), len2,
2956                                                           collid) == 0);
2957
2958         PG_FREE_IF_COPY(arg2, 1);
2959
2960         PG_RETURN_BOOL(result);
2961 }
2962
2963 Datum
2964 textnename(PG_FUNCTION_ARGS)
2965 {
2966         text       *arg1 = PG_GETARG_TEXT_PP(0);
2967         Name            arg2 = PG_GETARG_NAME(1);
2968         size_t          len1 = VARSIZE_ANY_EXHDR(arg1);
2969         size_t          len2 = strlen(NameStr(*arg2));
2970         Oid                     collid = PG_GET_COLLATION();
2971         bool            result;
2972
2973         check_collation_set(collid);
2974
2975         if (collid == C_COLLATION_OID)
2976                 result = !(len1 == len2 &&
2977                                    memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2978         else
2979                 result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2980                                                           NameStr(*arg2), len2,
2981                                                           collid) == 0);
2982
2983         PG_FREE_IF_COPY(arg1, 0);
2984
2985         PG_RETURN_BOOL(result);
2986 }
2987
2988 Datum
2989 btnametextcmp(PG_FUNCTION_ARGS)
2990 {
2991         Name            arg1 = PG_GETARG_NAME(0);
2992         text       *arg2 = PG_GETARG_TEXT_PP(1);
2993         int32           result;
2994
2995         result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2996                                                 VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2997                                                 PG_GET_COLLATION());
2998
2999         PG_FREE_IF_COPY(arg2, 1);
3000
3001         PG_RETURN_INT32(result);
3002 }
3003
3004 Datum
3005 bttextnamecmp(PG_FUNCTION_ARGS)
3006 {
3007         text       *arg1 = PG_GETARG_TEXT_PP(0);
3008         Name            arg2 = PG_GETARG_NAME(1);
3009         int32           result;
3010
3011         result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
3012                                                 NameStr(*arg2), strlen(NameStr(*arg2)),
3013                                                 PG_GET_COLLATION());
3014
3015         PG_FREE_IF_COPY(arg1, 0);
3016
3017         PG_RETURN_INT32(result);
3018 }
3019
3020 #define CmpCall(cmpfunc) \
3021         DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
3022                                                                                   PG_GET_COLLATION(), \
3023                                                                                   PG_GETARG_DATUM(0), \
3024                                                                                   PG_GETARG_DATUM(1)))
3025
3026 Datum
3027 namelttext(PG_FUNCTION_ARGS)
3028 {
3029         PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
3030 }
3031
3032 Datum
3033 nameletext(PG_FUNCTION_ARGS)
3034 {
3035         PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
3036 }
3037
3038 Datum
3039 namegttext(PG_FUNCTION_ARGS)
3040 {
3041         PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
3042 }
3043
3044 Datum
3045 namegetext(PG_FUNCTION_ARGS)
3046 {
3047         PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
3048 }
3049
3050 Datum
3051 textltname(PG_FUNCTION_ARGS)
3052 {
3053         PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
3054 }
3055
3056 Datum
3057 textlename(PG_FUNCTION_ARGS)
3058 {
3059         PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
3060 }
3061
3062 Datum
3063 textgtname(PG_FUNCTION_ARGS)
3064 {
3065         PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3066 }
3067
3068 Datum
3069 textgename(PG_FUNCTION_ARGS)
3070 {
3071         PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3072 }
3073
3074 #undef CmpCall
3075
3076
3077 /*
3078  * The following operators support character-by-character comparison
3079  * of text datums, to allow building indexes suitable for LIKE clauses.
3080  * Note that the regular texteq/textne comparison operators, and regular
3081  * support functions 1 and 2 with "C" collation are assumed to be
3082  * compatible with these!
3083  */
3084
3085 static int
3086 internal_text_pattern_compare(text *arg1, text *arg2)
3087 {
3088         int                     result;
3089         int                     len1,
3090                                 len2;
3091
3092         len1 = VARSIZE_ANY_EXHDR(arg1);
3093         len2 = VARSIZE_ANY_EXHDR(arg2);
3094
3095         result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3096         if (result != 0)
3097                 return result;
3098         else if (len1 < len2)
3099                 return -1;
3100         else if (len1 > len2)
3101                 return 1;
3102         else
3103                 return 0;
3104 }
3105
3106
3107 Datum
3108 text_pattern_lt(PG_FUNCTION_ARGS)
3109 {
3110         text       *arg1 = PG_GETARG_TEXT_PP(0);
3111         text       *arg2 = PG_GETARG_TEXT_PP(1);
3112         int                     result;
3113
3114         result = internal_text_pattern_compare(arg1, arg2);
3115
3116         PG_FREE_IF_COPY(arg1, 0);
3117         PG_FREE_IF_COPY(arg2, 1);
3118
3119         PG_RETURN_BOOL(result < 0);
3120 }
3121
3122
3123 Datum
3124 text_pattern_le(PG_FUNCTION_ARGS)
3125 {
3126         text       *arg1 = PG_GETARG_TEXT_PP(0);
3127         text       *arg2 = PG_GETARG_TEXT_PP(1);
3128         int                     result;
3129
3130         result = internal_text_pattern_compare(arg1, arg2);
3131
3132         PG_FREE_IF_COPY(arg1, 0);
3133         PG_FREE_IF_COPY(arg2, 1);
3134
3135         PG_RETURN_BOOL(result <= 0);
3136 }
3137
3138
3139 Datum
3140 text_pattern_ge(PG_FUNCTION_ARGS)
3141 {
3142         text       *arg1 = PG_GETARG_TEXT_PP(0);
3143         text       *arg2 = PG_GETARG_TEXT_PP(1);
3144         int                     result;
3145
3146         result = internal_text_pattern_compare(arg1, arg2);
3147
3148         PG_FREE_IF_COPY(arg1, 0);
3149         PG_FREE_IF_COPY(arg2, 1);
3150
3151         PG_RETURN_BOOL(result >= 0);
3152 }
3153
3154
3155 Datum
3156 text_pattern_gt(PG_FUNCTION_ARGS)
3157 {
3158         text       *arg1 = PG_GETARG_TEXT_PP(0);
3159         text       *arg2 = PG_GETARG_TEXT_PP(1);
3160         int                     result;
3161
3162         result = internal_text_pattern_compare(arg1, arg2);
3163
3164         PG_FREE_IF_COPY(arg1, 0);
3165         PG_FREE_IF_COPY(arg2, 1);
3166
3167         PG_RETURN_BOOL(result > 0);
3168 }
3169
3170
3171 Datum
3172 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3173 {
3174         text       *arg1 = PG_GETARG_TEXT_PP(0);
3175         text       *arg2 = PG_GETARG_TEXT_PP(1);
3176         int                     result;
3177
3178         result = internal_text_pattern_compare(arg1, arg2);
3179
3180         PG_FREE_IF_COPY(arg1, 0);
3181         PG_FREE_IF_COPY(arg2, 1);
3182
3183         PG_RETURN_INT32(result);
3184 }
3185
3186
3187 Datum
3188 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3189 {
3190         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3191         MemoryContext oldcontext;
3192
3193         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3194
3195         /* Use generic string SortSupport, forcing "C" collation */
3196         varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3197
3198         MemoryContextSwitchTo(oldcontext);
3199
3200         PG_RETURN_VOID();
3201 }
3202
3203
3204 /*-------------------------------------------------------------
3205  * byteaoctetlen
3206  *
3207  * get the number of bytes contained in an instance of type 'bytea'
3208  *-------------------------------------------------------------
3209  */
3210 Datum
3211 byteaoctetlen(PG_FUNCTION_ARGS)
3212 {
3213         Datum           str = PG_GETARG_DATUM(0);
3214
3215         /* We need not detoast the input at all */
3216         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3217 }
3218
3219 /*
3220  * byteacat -
3221  *        takes two bytea* and returns a bytea* that is the concatenation of
3222  *        the two.
3223  *
3224  * Cloned from textcat and modified as required.
3225  */
3226 Datum
3227 byteacat(PG_FUNCTION_ARGS)
3228 {
3229         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3230         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3231
3232         PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3233 }
3234
3235 /*
3236  * bytea_catenate
3237  *      Guts of byteacat(), broken out so it can be used by other functions
3238  *
3239  * Arguments can be in short-header form, but not compressed or out-of-line
3240  */
3241 static bytea *
3242 bytea_catenate(bytea *t1, bytea *t2)
3243 {
3244         bytea      *result;
3245         int                     len1,
3246                                 len2,
3247                                 len;
3248         char       *ptr;
3249
3250         len1 = VARSIZE_ANY_EXHDR(t1);
3251         len2 = VARSIZE_ANY_EXHDR(t2);
3252
3253         /* paranoia ... probably should throw error instead? */
3254         if (len1 < 0)
3255                 len1 = 0;
3256         if (len2 < 0)
3257                 len2 = 0;
3258
3259         len = len1 + len2 + VARHDRSZ;
3260         result = (bytea *) palloc(len);
3261
3262         /* Set size of result string... */
3263         SET_VARSIZE(result, len);
3264
3265         /* Fill data field of result string... */
3266         ptr = VARDATA(result);
3267         if (len1 > 0)
3268                 memcpy(ptr, VARDATA_ANY(t1), len1);
3269         if (len2 > 0)
3270                 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3271
3272         return result;
3273 }
3274
/* Turn a C string into a bytea by passing it through byteain() */
#define PG_STR_GET_BYTEA(cstr) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr)))
3277
3278 /*
3279  * bytea_substr()
3280  * Return a substring starting at the specified position.
3281  * Cloned from text_substr and modified as required.
3282  *
3283  * Input:
3284  *      - string
3285  *      - starting position (is one-based)
3286  *      - string length (optional)
3287  *
3288  * If the starting position is zero or less, then return from the start of the string
3289  * adjusting the length to be consistent with the "negative start" per SQL.
3290  * If the length is less than zero, an ERROR is thrown. If no third argument
3291  * (length) is provided, the length to the end of the string is assumed.
3292  */
3293 Datum
3294 bytea_substr(PG_FUNCTION_ARGS)
3295 {
3296         PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3297                                                                           PG_GETARG_INT32(1),
3298                                                                           PG_GETARG_INT32(2),
3299                                                                           false));
3300 }
3301
3302 /*
3303  * bytea_substr_no_len -
3304  *        Wrapper to avoid opr_sanity failure due to
3305  *        one function accepting a different number of args.
3306  */
3307 Datum
3308 bytea_substr_no_len(PG_FUNCTION_ARGS)
3309 {
3310         PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3311                                                                           PG_GETARG_INT32(1),
3312                                                                           -1,
3313                                                                           true));
3314 }
3315
3316 static bytea *
3317 bytea_substring(Datum str,
3318                                 int S,
3319                                 int L,
3320                                 bool length_not_specified)
3321 {
3322         int32           S1;                             /* adjusted start position */
3323         int32           L1;                             /* adjusted substring length */
3324         int32           E;                              /* end position */
3325
3326         /*
3327          * The logic here should generally match text_substring().
3328          */
3329         S1 = Max(S, 1);
3330
3331         if (length_not_specified)
3332         {
3333                 /*
3334                  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3335                  * end of the string if we pass it a negative value for length.
3336                  */
3337                 L1 = -1;
3338         }
3339         else if (L < 0)
3340         {
3341                 /* SQL99 says to throw an error for E < S, i.e., negative length */
3342                 ereport(ERROR,
3343                                 (errcode(ERRCODE_SUBSTRING_ERROR),
3344                                  errmsg("negative substring length not allowed")));
3345                 L1 = -1;                                /* silence stupider compilers */
3346         }
3347         else if (pg_add_s32_overflow(S, L, &E))
3348         {
3349                 /*
3350                  * L could be large enough for S + L to overflow, in which case the
3351                  * substring must run to end of string.
3352                  */
3353                 L1 = -1;
3354         }
3355         else
3356         {
3357                 /*
3358                  * A zero or negative value for the end position can happen if the
3359                  * start was negative or one. SQL99 says to return a zero-length
3360                  * string.
3361                  */
3362                 if (E < 1)
3363                         return PG_STR_GET_BYTEA("");
3364
3365                 L1 = E - S1;
3366         }
3367
3368         /*
3369          * If the start position is past the end of the string, SQL99 says to
3370          * return a zero-length string -- DatumGetByteaPSlice() will do that for
3371          * us.  We need only convert S1 to zero-based starting position.
3372          */
3373         return DatumGetByteaPSlice(str, S1 - 1, L1);
3374 }
3375
3376 /*
3377  * byteaoverlay
3378  *      Replace specified substring of first string with second
3379  *
3380  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3381  * This code is a direct implementation of what the standard says.
3382  */
3383 Datum
3384 byteaoverlay(PG_FUNCTION_ARGS)
3385 {
3386         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3387         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3388         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
3389         int                     sl = PG_GETARG_INT32(3);        /* substring length */
3390
3391         PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3392 }
3393
3394 Datum
3395 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3396 {
3397         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3398         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3399         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
3400         int                     sl;
3401
3402         sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3403         PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3404 }
3405
3406 static bytea *
3407 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3408 {
3409         bytea      *result;
3410         bytea      *s1;
3411         bytea      *s2;
3412         int                     sp_pl_sl;
3413
3414         /*
3415          * Check for possible integer-overflow cases.  For negative sp, throw a
3416          * "substring length" error because that's what should be expected
3417          * according to the spec's definition of OVERLAY().
3418          */
3419         if (sp <= 0)
3420                 ereport(ERROR,
3421                                 (errcode(ERRCODE_SUBSTRING_ERROR),
3422                                  errmsg("negative substring length not allowed")));
3423         if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3424                 ereport(ERROR,
3425                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3426                                  errmsg("integer out of range")));
3427
3428         s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3429         s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3430         result = bytea_catenate(s1, t2);
3431         result = bytea_catenate(result, s2);
3432
3433         return result;
3434 }
3435
3436 /*
3437  * bit_count
3438  */
3439 Datum
3440 bytea_bit_count(PG_FUNCTION_ARGS)
3441 {
3442         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3443
3444         PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3445 }
3446
3447 /*
3448  * byteapos -
3449  *        Return the position of the specified substring.
3450  *        Implements the SQL POSITION() function.
3451  * Cloned from textpos and modified as required.
3452  */
3453 Datum
3454 byteapos(PG_FUNCTION_ARGS)
3455 {
3456         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3457         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3458         int                     pos;
3459         int                     px,
3460                                 p;
3461         int                     len1,
3462                                 len2;
3463         char       *p1,
3464                            *p2;
3465
3466         len1 = VARSIZE_ANY_EXHDR(t1);
3467         len2 = VARSIZE_ANY_EXHDR(t2);
3468
3469         if (len2 <= 0)
3470                 PG_RETURN_INT32(1);             /* result for empty pattern */
3471
3472         p1 = VARDATA_ANY(t1);
3473         p2 = VARDATA_ANY(t2);
3474
3475         pos = 0;
3476         px = (len1 - len2);
3477         for (p = 0; p <= px; p++)
3478         {
3479                 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3480                 {
3481                         pos = p + 1;
3482                         break;
3483                 };
3484                 p1++;
3485         };
3486
3487         PG_RETURN_INT32(pos);
3488 }
3489
3490 /*-------------------------------------------------------------
3491  * byteaGetByte
3492  *
3493  * this routine treats "bytea" as an array of bytes.
3494  * It returns the Nth byte (a number between 0 and 255).
3495  *-------------------------------------------------------------
3496  */
3497 Datum
3498 byteaGetByte(PG_FUNCTION_ARGS)
3499 {
3500         bytea      *v = PG_GETARG_BYTEA_PP(0);
3501         int32           n = PG_GETARG_INT32(1);
3502         int                     len;
3503         int                     byte;
3504
3505         len = VARSIZE_ANY_EXHDR(v);
3506
3507         if (n < 0 || n >= len)
3508                 ereport(ERROR,
3509                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3510                                  errmsg("index %d out of valid range, 0..%d",
3511                                                 n, len - 1)));
3512
3513         byte = ((unsigned char *) VARDATA_ANY(v))[n];
3514
3515         PG_RETURN_INT32(byte);
3516 }
3517
3518 /*-------------------------------------------------------------
3519  * byteaGetBit
3520  *
3521  * This routine treats a "bytea" type like an array of bits.
3522  * It returns the value of the Nth bit (0 or 1).
3523  *
3524  *-------------------------------------------------------------
3525  */
3526 Datum
3527 byteaGetBit(PG_FUNCTION_ARGS)
3528 {
3529         bytea      *v = PG_GETARG_BYTEA_PP(0);
3530         int64           n = PG_GETARG_INT64(1);
3531         int                     byteNo,
3532                                 bitNo;
3533         int                     len;
3534         int                     byte;
3535
3536         len = VARSIZE_ANY_EXHDR(v);
3537
3538         if (n < 0 || n >= (int64) len * 8)
3539                 ereport(ERROR,
3540                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3541                                  errmsg("index %lld out of valid range, 0..%lld",
3542                                                 (long long) n, (long long) len * 8 - 1)));
3543
3544         /* n/8 is now known < len, so safe to cast to int */
3545         byteNo = (int) (n / 8);
3546         bitNo = (int) (n % 8);
3547
3548         byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3549
3550         if (byte & (1 << bitNo))
3551                 PG_RETURN_INT32(1);
3552         else
3553                 PG_RETURN_INT32(0);
3554 }
3555
3556 /*-------------------------------------------------------------
3557  * byteaSetByte
3558  *
3559  * Given an instance of type 'bytea' creates a new one with
3560  * the Nth byte set to the given value.
3561  *
3562  *-------------------------------------------------------------
3563  */
3564 Datum
3565 byteaSetByte(PG_FUNCTION_ARGS)
3566 {
3567         bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
3568         int32           n = PG_GETARG_INT32(1);
3569         int32           newByte = PG_GETARG_INT32(2);
3570         int                     len;
3571
3572         len = VARSIZE(res) - VARHDRSZ;
3573
3574         if (n < 0 || n >= len)
3575                 ereport(ERROR,
3576                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3577                                  errmsg("index %d out of valid range, 0..%d",
3578                                                 n, len - 1)));
3579
3580         /*
3581          * Now set the byte.
3582          */
3583         ((unsigned char *) VARDATA(res))[n] = newByte;
3584
3585         PG_RETURN_BYTEA_P(res);
3586 }
3587
3588 /*-------------------------------------------------------------
3589  * byteaSetBit
3590  *
3591  * Given an instance of type 'bytea' creates a new one with
3592  * the Nth bit set to the given value.
3593  *
3594  *-------------------------------------------------------------
3595  */
3596 Datum
3597 byteaSetBit(PG_FUNCTION_ARGS)
3598 {
3599         bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
3600         int64           n = PG_GETARG_INT64(1);
3601         int32           newBit = PG_GETARG_INT32(2);
3602         int                     len;
3603         int                     oldByte,
3604                                 newByte;
3605         int                     byteNo,
3606                                 bitNo;
3607
3608         len = VARSIZE(res) - VARHDRSZ;
3609
3610         if (n < 0 || n >= (int64) len * 8)
3611                 ereport(ERROR,
3612                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3613                                  errmsg("index %lld out of valid range, 0..%lld",
3614                                                 (long long) n, (long long) len * 8 - 1)));
3615
3616         /* n/8 is now known < len, so safe to cast to int */
3617         byteNo = (int) (n / 8);
3618         bitNo = (int) (n % 8);
3619
3620         /*
3621          * sanity check!
3622          */
3623         if (newBit != 0 && newBit != 1)
3624                 ereport(ERROR,
3625                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3626                                  errmsg("new bit must be 0 or 1")));
3627
3628         /*
3629          * Update the byte.
3630          */
3631         oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3632
3633         if (newBit == 0)
3634                 newByte = oldByte & (~(1 << bitNo));
3635         else
3636                 newByte = oldByte | (1 << bitNo);
3637
3638         ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3639
3640         PG_RETURN_BYTEA_P(res);
3641 }
3642
3643
3644 /* text_name()
3645  * Converts a text type to a Name type.
3646  */
3647 Datum
3648 text_name(PG_FUNCTION_ARGS)
3649 {
3650         text       *s = PG_GETARG_TEXT_PP(0);
3651         Name            result;
3652         int                     len;
3653
3654         len = VARSIZE_ANY_EXHDR(s);
3655
3656         /* Truncate oversize input */
3657         if (len >= NAMEDATALEN)
3658                 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3659
3660         /* We use palloc0 here to ensure result is zero-padded */
3661         result = (Name) palloc0(NAMEDATALEN);
3662         memcpy(NameStr(*result), VARDATA_ANY(s), len);
3663
3664         PG_RETURN_NAME(result);
3665 }
3666
3667 /* name_text()
3668  * Converts a Name type to a text type.
3669  */
3670 Datum
3671 name_text(PG_FUNCTION_ARGS)
3672 {
3673         Name            s = PG_GETARG_NAME(0);
3674
3675         PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3676 }
3677
3678
3679 /*
3680  * textToQualifiedNameList - convert a text object to list of names
3681  *
3682  * This implements the input parsing needed by nextval() and other
3683  * functions that take a text parameter representing a qualified name.
3684  * We split the name at dots, downcase if not double-quoted, and
3685  * truncate names if they're too long.
3686  */
3687 List *
3688 textToQualifiedNameList(text *textval)
3689 {
3690         char       *rawname;
3691         List       *result = NIL;
3692         List       *namelist;
3693         ListCell   *l;
3694
3695         /* Convert to C string (handles possible detoasting). */
3696         /* Note we rely on being able to modify rawname below. */
3697         rawname = text_to_cstring(textval);
3698
3699         if (!SplitIdentifierString(rawname, '.', &namelist))
3700                 ereport(ERROR,
3701                                 (errcode(ERRCODE_INVALID_NAME),
3702                                  errmsg("invalid name syntax")));
3703
3704         if (namelist == NIL)
3705                 ereport(ERROR,
3706                                 (errcode(ERRCODE_INVALID_NAME),
3707                                  errmsg("invalid name syntax")));
3708
3709         foreach(l, namelist)
3710         {
3711                 char       *curname = (char *) lfirst(l);
3712
3713                 result = lappend(result, makeString(pstrdup(curname)));
3714         }
3715
3716         pfree(rawname);
3717         list_free(namelist);
3718
3719         return result;
3720 }
3721
3722 /*
3723  * SplitIdentifierString --- parse a string containing identifiers
3724  *
3725  * This is the guts of textToQualifiedNameList, and is exported for use in
3726  * other situations such as parsing GUC variables.  In the GUC case, it's
3727  * important to avoid memory leaks, so the API is designed to minimize the
3728  * amount of stuff that needs to be allocated and freed.
3729  *
3730  * Inputs:
3731  *      rawstring: the input string; must be overwritable!      On return, it's
3732  *                         been modified to contain the separated identifiers.
3733  *      separator: the separator punctuation expected between identifiers
3734  *                         (typically '.' or ',').  Whitespace may also appear around
3735  *                         identifiers.
3736  * Outputs:
3737  *      namelist: filled with a palloc'd list of pointers to identifiers within
3738  *                        rawstring.  Caller should list_free() this even on error return.
3739  *
3740  * Returns true if okay, false if there is a syntax error in the string.
3741  *
3742  * Note that an empty string is considered okay here, though not in
3743  * textToQualifiedNameList.
3744  */
3745 bool
3746 SplitIdentifierString(char *rawstring, char separator,
3747                                           List **namelist)
3748 {
3749         char       *nextp = rawstring;
3750         bool            done = false;
3751
3752         *namelist = NIL;
3753
3754         while (scanner_isspace(*nextp))
3755                 nextp++;                                /* skip leading whitespace */
3756
3757         if (*nextp == '\0')
3758                 return true;                    /* allow empty string */
3759
3760         /* At the top of the loop, we are at start of a new identifier. */
3761         do
3762         {
3763                 char       *curname;
3764                 char       *endp;
3765
3766                 if (*nextp == '"')
3767                 {
3768                         /* Quoted name --- collapse quote-quote pairs, no downcasing */
3769                         curname = nextp + 1;
3770                         for (;;)
3771                         {
3772                                 endp = strchr(nextp + 1, '"');
3773                                 if (endp == NULL)
3774                                         return false;   /* mismatched quotes */
3775                                 if (endp[1] != '"')
3776                                         break;          /* found end of quoted name */
3777                                 /* Collapse adjacent quotes into one quote, and look again */
3778                                 memmove(endp, endp + 1, strlen(endp));
3779                                 nextp = endp;
3780                         }
3781                         /* endp now points at the terminating quote */
3782                         nextp = endp + 1;
3783                 }
3784                 else
3785                 {
3786                         /* Unquoted name --- extends to separator or whitespace */
3787                         char       *downname;
3788                         int                     len;
3789
3790                         curname = nextp;
3791                         while (*nextp && *nextp != separator &&
3792                                    !scanner_isspace(*nextp))
3793                                 nextp++;
3794                         endp = nextp;
3795                         if (curname == nextp)
3796                                 return false;   /* empty unquoted name not allowed */
3797
3798                         /*
3799                          * Downcase the identifier, using same code as main lexer does.
3800                          *
3801                          * XXX because we want to overwrite the input in-place, we cannot
3802                          * support a downcasing transformation that increases the string
3803                          * length.  This is not a problem given the current implementation
3804                          * of downcase_truncate_identifier, but we'll probably have to do
3805                          * something about this someday.
3806                          */
3807                         len = endp - curname;
3808                         downname = downcase_truncate_identifier(curname, len, false);
3809                         Assert(strlen(downname) <= len);
3810                         strncpy(curname, downname, len);        /* strncpy is required here */
3811                         pfree(downname);
3812                 }
3813
3814                 while (scanner_isspace(*nextp))
3815                         nextp++;                        /* skip trailing whitespace */
3816
3817                 if (*nextp == separator)
3818                 {
3819                         nextp++;
3820                         while (scanner_isspace(*nextp))
3821                                 nextp++;                /* skip leading whitespace for next */
3822                         /* we expect another name, so done remains false */
3823                 }
3824                 else if (*nextp == '\0')
3825                         done = true;
3826                 else
3827                         return false;           /* invalid syntax */
3828
3829                 /* Now safe to overwrite separator with a null */
3830                 *endp = '\0';
3831
3832                 /* Truncate name if it's overlength */
3833                 truncate_identifier(curname, strlen(curname), false);
3834
3835                 /*
3836                  * Finished isolating current name --- add it to list
3837                  */
3838                 *namelist = lappend(*namelist, curname);
3839
3840                 /* Loop back if we didn't reach end of string */
3841         } while (!done);
3842
3843         return true;
3844 }
3845
3846
3847 /*
3848  * SplitDirectoriesString --- parse a string containing file/directory names
3849  *
3850  * This works fine on file names too; the function name is historical.
3851  *
3852  * This is similar to SplitIdentifierString, except that the parsing
3853  * rules are meant to handle pathnames instead of identifiers: there is
3854  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3855  * and we apply canonicalize_path() to each extracted string.  Because of the
3856  * last, the returned strings are separately palloc'd rather than being
3857  * pointers into rawstring --- but we still scribble on rawstring.
3858  *
3859  * Inputs:
3860  *      rawstring: the input string; must be modifiable!
3861  *      separator: the separator punctuation expected between directories
3862  *                         (typically ',' or ';').  Whitespace may also appear around
3863  *                         directories.
3864  * Outputs:
3865  *      namelist: filled with a palloc'd list of directory names.
3866  *                        Caller should list_free_deep() this even on error return.
3867  *
3868  * Returns true if okay, false if there is a syntax error in the string.
3869  *
3870  * Note that an empty string is considered okay here.
3871  */
3872 bool
3873 SplitDirectoriesString(char *rawstring, char separator,
3874                                            List **namelist)
3875 {
3876         char       *nextp = rawstring;
3877         bool            done = false;
3878
3879         *namelist = NIL;
3880
3881         while (scanner_isspace(*nextp))
3882                 nextp++;                                /* skip leading whitespace */
3883
3884         if (*nextp == '\0')
3885                 return true;                    /* allow empty string */
3886
3887         /* At the top of the loop, we are at start of a new directory. */
3888         do
3889         {
3890                 char       *curname;
3891                 char       *endp;
3892
3893                 if (*nextp == '"')
3894                 {
3895                         /* Quoted name --- collapse quote-quote pairs */
3896                         curname = nextp + 1;
3897                         for (;;)
3898                         {
3899                                 endp = strchr(nextp + 1, '"');
3900                                 if (endp == NULL)
3901                                         return false;   /* mismatched quotes */
3902                                 if (endp[1] != '"')
3903                                         break;          /* found end of quoted name */
3904                                 /* Collapse adjacent quotes into one quote, and look again */
3905                                 memmove(endp, endp + 1, strlen(endp));
3906                                 nextp = endp;
3907                         }
3908                         /* endp now points at the terminating quote */
3909                         nextp = endp + 1;
3910                 }
3911                 else
3912                 {
3913                         /* Unquoted name --- extends to separator or end of string */
3914                         curname = endp = nextp;
3915                         while (*nextp && *nextp != separator)
3916                         {
3917                                 /* trailing whitespace should not be included in name */
3918                                 if (!scanner_isspace(*nextp))
3919                                         endp = nextp + 1;
3920                                 nextp++;
3921                         }
3922                         if (curname == endp)
3923                                 return false;   /* empty unquoted name not allowed */
3924                 }
3925
3926                 while (scanner_isspace(*nextp))
3927                         nextp++;                        /* skip trailing whitespace */
3928
3929                 if (*nextp == separator)
3930                 {
3931                         nextp++;
3932                         while (scanner_isspace(*nextp))
3933                                 nextp++;                /* skip leading whitespace for next */
3934                         /* we expect another name, so done remains false */
3935                 }
3936                 else if (*nextp == '\0')
3937                         done = true;
3938                 else
3939                         return false;           /* invalid syntax */
3940
3941                 /* Now safe to overwrite separator with a null */
3942                 *endp = '\0';
3943
3944                 /* Truncate path if it's overlength */
3945                 if (strlen(curname) >= MAXPGPATH)
3946                         curname[MAXPGPATH - 1] = '\0';
3947
3948                 /*
3949                  * Finished isolating current name --- add it to list
3950                  */
3951                 curname = pstrdup(curname);
3952                 canonicalize_path(curname);
3953                 *namelist = lappend(*namelist, curname);
3954
3955                 /* Loop back if we didn't reach end of string */
3956         } while (!done);
3957
3958         return true;
3959 }
3960
3961
3962 /*
3963  * SplitGUCList --- parse a string containing identifiers or file names
3964  *
3965  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3966  * presuming whether the elements will be taken as identifiers or file names.
3967  * We assume the input has already been through flatten_set_variable_args(),
3968  * so that we need never downcase (if appropriate, that was done already).
3969  * Nor do we ever truncate, since we don't know the correct max length.
3970  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3971  * because any embedded whitespace should have led to double-quoting).
3972  * Otherwise the API is identical to SplitIdentifierString.
3973  *
3974  * XXX it's annoying to have so many copies of this string-splitting logic.
3975  * However, it's not clear that having one function with a bunch of option
3976  * flags would be much better.
3977  *
3978  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3979  * Be sure to update that if you have to change this.
3980  *
3981  * Inputs:
3982  *      rawstring: the input string; must be overwritable!      On return, it's
3983  *                         been modified to contain the separated identifiers.
3984  *      separator: the separator punctuation expected between identifiers
3985  *                         (typically '.' or ',').  Whitespace may also appear around
3986  *                         identifiers.
3987  * Outputs:
3988  *      namelist: filled with a palloc'd list of pointers to identifiers within
3989  *                        rawstring.  Caller should list_free() this even on error return.
3990  *
3991  * Returns true if okay, false if there is a syntax error in the string.
3992  */
3993 bool
3994 SplitGUCList(char *rawstring, char separator,
3995                          List **namelist)
3996 {
3997         char       *nextp = rawstring;
3998         bool            done = false;
3999
4000         *namelist = NIL;
4001
4002         while (scanner_isspace(*nextp))
4003                 nextp++;                                /* skip leading whitespace */
4004
4005         if (*nextp == '\0')
4006                 return true;                    /* allow empty string */
4007
4008         /* At the top of the loop, we are at start of a new identifier. */
4009         do
4010         {
4011                 char       *curname;
4012                 char       *endp;
4013
4014                 if (*nextp == '"')
4015                 {
4016                         /* Quoted name --- collapse quote-quote pairs */
4017                         curname = nextp + 1;
4018                         for (;;)
4019                         {
4020                                 endp = strchr(nextp + 1, '"');
4021                                 if (endp == NULL)
4022                                         return false;   /* mismatched quotes */
4023                                 if (endp[1] != '"')
4024                                         break;          /* found end of quoted name */
4025                                 /* Collapse adjacent quotes into one quote, and look again */
4026                                 memmove(endp, endp + 1, strlen(endp));
4027                                 nextp = endp;
4028                         }
4029                         /* endp now points at the terminating quote */
4030                         nextp = endp + 1;
4031                 }
4032                 else
4033                 {
4034                         /* Unquoted name --- extends to separator or whitespace */
4035                         curname = nextp;
4036                         while (*nextp && *nextp != separator &&
4037                                    !scanner_isspace(*nextp))
4038                                 nextp++;
4039                         endp = nextp;
4040                         if (curname == nextp)
4041                                 return false;   /* empty unquoted name not allowed */
4042                 }
4043
4044                 while (scanner_isspace(*nextp))
4045                         nextp++;                        /* skip trailing whitespace */
4046
4047                 if (*nextp == separator)
4048                 {
4049                         nextp++;
4050                         while (scanner_isspace(*nextp))
4051                                 nextp++;                /* skip leading whitespace for next */
4052                         /* we expect another name, so done remains false */
4053                 }
4054                 else if (*nextp == '\0')
4055                         done = true;
4056                 else
4057                         return false;           /* invalid syntax */
4058
4059                 /* Now safe to overwrite separator with a null */
4060                 *endp = '\0';
4061
4062                 /*
4063                  * Finished isolating current name --- add it to list
4064                  */
4065                 *namelist = lappend(*namelist, curname);
4066
4067                 /* Loop back if we didn't reach end of string */
4068         } while (!done);
4069
4070         return true;
4071 }
4072
4073
4074 /*****************************************************************************
4075  *      Comparison Functions used for bytea
4076  *
4077  * Note: btree indexes need these routines not to leak memory; therefore,
4078  * be careful to free working copies of toasted datums.  Most places don't
4079  * need to be so careful.
4080  *****************************************************************************/
4081
4082 Datum
4083 byteaeq(PG_FUNCTION_ARGS)
4084 {
4085         Datum           arg1 = PG_GETARG_DATUM(0);
4086         Datum           arg2 = PG_GETARG_DATUM(1);
4087         bool            result;
4088         Size            len1,
4089                                 len2;
4090
4091         /*
4092          * We can use a fast path for unequal lengths, which might save us from
4093          * having to detoast one or both values.
4094          */
4095         len1 = toast_raw_datum_size(arg1);
4096         len2 = toast_raw_datum_size(arg2);
4097         if (len1 != len2)
4098                 result = false;
4099         else
4100         {
4101                 bytea      *barg1 = DatumGetByteaPP(arg1);
4102                 bytea      *barg2 = DatumGetByteaPP(arg2);
4103
4104                 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4105                                                  len1 - VARHDRSZ) == 0);
4106
4107                 PG_FREE_IF_COPY(barg1, 0);
4108                 PG_FREE_IF_COPY(barg2, 1);
4109         }
4110
4111         PG_RETURN_BOOL(result);
4112 }
4113
4114 Datum
4115 byteane(PG_FUNCTION_ARGS)
4116 {
4117         Datum           arg1 = PG_GETARG_DATUM(0);
4118         Datum           arg2 = PG_GETARG_DATUM(1);
4119         bool            result;
4120         Size            len1,
4121                                 len2;
4122
4123         /*
4124          * We can use a fast path for unequal lengths, which might save us from
4125          * having to detoast one or both values.
4126          */
4127         len1 = toast_raw_datum_size(arg1);
4128         len2 = toast_raw_datum_size(arg2);
4129         if (len1 != len2)
4130                 result = true;
4131         else
4132         {
4133                 bytea      *barg1 = DatumGetByteaPP(arg1);
4134                 bytea      *barg2 = DatumGetByteaPP(arg2);
4135
4136                 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4137                                                  len1 - VARHDRSZ) != 0);
4138
4139                 PG_FREE_IF_COPY(barg1, 0);
4140                 PG_FREE_IF_COPY(barg2, 1);
4141         }
4142
4143         PG_RETURN_BOOL(result);
4144 }
4145
4146 Datum
4147 bytealt(PG_FUNCTION_ARGS)
4148 {
4149         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4150         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4151         int                     len1,
4152                                 len2;
4153         int                     cmp;
4154
4155         len1 = VARSIZE_ANY_EXHDR(arg1);
4156         len2 = VARSIZE_ANY_EXHDR(arg2);
4157
4158         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4159
4160         PG_FREE_IF_COPY(arg1, 0);
4161         PG_FREE_IF_COPY(arg2, 1);
4162
4163         PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4164 }
4165
4166 Datum
4167 byteale(PG_FUNCTION_ARGS)
4168 {
4169         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4170         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4171         int                     len1,
4172                                 len2;
4173         int                     cmp;
4174
4175         len1 = VARSIZE_ANY_EXHDR(arg1);
4176         len2 = VARSIZE_ANY_EXHDR(arg2);
4177
4178         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4179
4180         PG_FREE_IF_COPY(arg1, 0);
4181         PG_FREE_IF_COPY(arg2, 1);
4182
4183         PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4184 }
4185
4186 Datum
4187 byteagt(PG_FUNCTION_ARGS)
4188 {
4189         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4190         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4191         int                     len1,
4192                                 len2;
4193         int                     cmp;
4194
4195         len1 = VARSIZE_ANY_EXHDR(arg1);
4196         len2 = VARSIZE_ANY_EXHDR(arg2);
4197
4198         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4199
4200         PG_FREE_IF_COPY(arg1, 0);
4201         PG_FREE_IF_COPY(arg2, 1);
4202
4203         PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4204 }
4205
4206 Datum
4207 byteage(PG_FUNCTION_ARGS)
4208 {
4209         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4210         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4211         int                     len1,
4212                                 len2;
4213         int                     cmp;
4214
4215         len1 = VARSIZE_ANY_EXHDR(arg1);
4216         len2 = VARSIZE_ANY_EXHDR(arg2);
4217
4218         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4219
4220         PG_FREE_IF_COPY(arg1, 0);
4221         PG_FREE_IF_COPY(arg2, 1);
4222
4223         PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4224 }
4225
4226 Datum
4227 byteacmp(PG_FUNCTION_ARGS)
4228 {
4229         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4230         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4231         int                     len1,
4232                                 len2;
4233         int                     cmp;
4234
4235         len1 = VARSIZE_ANY_EXHDR(arg1);
4236         len2 = VARSIZE_ANY_EXHDR(arg2);
4237
4238         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4239         if ((cmp == 0) && (len1 != len2))
4240                 cmp = (len1 < len2) ? -1 : 1;
4241
4242         PG_FREE_IF_COPY(arg1, 0);
4243         PG_FREE_IF_COPY(arg2, 1);
4244
4245         PG_RETURN_INT32(cmp);
4246 }
4247
4248 Datum
4249 bytea_sortsupport(PG_FUNCTION_ARGS)
4250 {
4251         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4252         MemoryContext oldcontext;
4253
4254         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4255
4256         /* Use generic string SortSupport, forcing "C" collation */
4257         varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4258
4259         MemoryContextSwitchTo(oldcontext);
4260
4261         PG_RETURN_VOID();
4262 }
4263
4264 /*
4265  * appendStringInfoText
4266  *
4267  * Append a text to str.
4268  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4269  */
4270 static void
4271 appendStringInfoText(StringInfo str, const text *t)
4272 {
4273         appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4274 }
4275
4276 /*
4277  * replace_text
4278  * replace all occurrences of 'old_sub_str' in 'orig_str'
4279  * with 'new_sub_str' to form 'new_str'
4280  *
4281  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4282  * otherwise returns 'new_str'
4283  */
4284 Datum
4285 replace_text(PG_FUNCTION_ARGS)
4286 {
4287         text       *src_text = PG_GETARG_TEXT_PP(0);
4288         text       *from_sub_text = PG_GETARG_TEXT_PP(1);
4289         text       *to_sub_text = PG_GETARG_TEXT_PP(2);
4290         int                     src_text_len;
4291         int                     from_sub_text_len;
4292         TextPositionState state;
4293         text       *ret_text;
4294         int                     chunk_len;
4295         char       *curr_ptr;
4296         char       *start_ptr;
4297         StringInfoData str;
4298         bool            found;
4299
4300         src_text_len = VARSIZE_ANY_EXHDR(src_text);
4301         from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4302
4303         /* Return unmodified source string if empty source or pattern */
4304         if (src_text_len < 1 || from_sub_text_len < 1)
4305         {
4306                 PG_RETURN_TEXT_P(src_text);
4307         }
4308
4309         text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4310
4311         found = text_position_next(&state);
4312
4313         /* When the from_sub_text is not found, there is nothing to do. */
4314         if (!found)
4315         {
4316                 text_position_cleanup(&state);
4317                 PG_RETURN_TEXT_P(src_text);
4318         }
4319         curr_ptr = text_position_get_match_ptr(&state);
4320         start_ptr = VARDATA_ANY(src_text);
4321
4322         initStringInfo(&str);
4323
4324         do
4325         {
4326                 CHECK_FOR_INTERRUPTS();
4327
4328                 /* copy the data skipped over by last text_position_next() */
4329                 chunk_len = curr_ptr - start_ptr;
4330                 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4331
4332                 appendStringInfoText(&str, to_sub_text);
4333
4334                 start_ptr = curr_ptr + from_sub_text_len;
4335
4336                 found = text_position_next(&state);
4337                 if (found)
4338                         curr_ptr = text_position_get_match_ptr(&state);
4339         }
4340         while (found);
4341
4342         /* copy trailing data */
4343         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4344         appendBinaryStringInfo(&str, start_ptr, chunk_len);
4345
4346         text_position_cleanup(&state);
4347
4348         ret_text = cstring_to_text_with_len(str.data, str.len);
4349         pfree(str.data);
4350
4351         PG_RETURN_TEXT_P(ret_text);
4352 }
4353
4354 /*
4355  * check_replace_text_has_escape
4356  *
4357  * Returns 0 if text contains no backslashes that need processing.
4358  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4359  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4360  */
4361 static int
4362 check_replace_text_has_escape(const text *replace_text)
4363 {
4364         int                     result = 0;
4365         const char *p = VARDATA_ANY(replace_text);
4366         const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4367
4368         while (p < p_end)
4369         {
4370                 /* Find next escape char, if any. */
4371                 p = memchr(p, '\\', p_end - p);
4372                 if (p == NULL)
4373                         break;
4374                 p++;
4375                 /* Note: a backslash at the end doesn't require extra processing. */
4376                 if (p < p_end)
4377                 {
4378                         if (*p >= '1' && *p <= '9')
4379                                 return 2;               /* Found a submatch specifier, so done */
4380                         result = 1;                     /* Found some other sequence, keep looking */
4381                         p++;
4382                 }
4383         }
4384         return result;
4385 }
4386
4387 /*
4388  * appendStringInfoRegexpSubstr
4389  *
4390  * Append replace_text to str, substituting regexp back references for
4391  * \n escapes.  start_ptr is the start of the match in the source string,
4392  * at logical character position data_pos.
4393  */
4394 static void
4395 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4396                                                          regmatch_t *pmatch,
4397                                                          char *start_ptr, int data_pos)
4398 {
4399         const char *p = VARDATA_ANY(replace_text);
4400         const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4401
4402         while (p < p_end)
4403         {
4404                 const char *chunk_start = p;
4405                 int                     so;
4406                 int                     eo;
4407
4408                 /* Find next escape char, if any. */
4409                 p = memchr(p, '\\', p_end - p);
4410                 if (p == NULL)
4411                         p = p_end;
4412
4413                 /* Copy the text we just scanned over, if any. */
4414                 if (p > chunk_start)
4415                         appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4416
4417                 /* Done if at end of string, else advance over escape char. */
4418                 if (p >= p_end)
4419                         break;
4420                 p++;
4421
4422                 if (p >= p_end)
4423                 {
4424                         /* Escape at very end of input.  Treat same as unexpected char */
4425                         appendStringInfoChar(str, '\\');
4426                         break;
4427                 }
4428
4429                 if (*p >= '1' && *p <= '9')
4430                 {
4431                         /* Use the back reference of regexp. */
4432                         int                     idx = *p - '0';
4433
4434                         so = pmatch[idx].rm_so;
4435                         eo = pmatch[idx].rm_eo;
4436                         p++;
4437                 }
4438                 else if (*p == '&')
4439                 {
4440                         /* Use the entire matched string. */
4441                         so = pmatch[0].rm_so;
4442                         eo = pmatch[0].rm_eo;
4443                         p++;
4444                 }
4445                 else if (*p == '\\')
4446                 {
4447                         /* \\ means transfer one \ to output. */
4448                         appendStringInfoChar(str, '\\');
4449                         p++;
4450                         continue;
4451                 }
4452                 else
4453                 {
4454                         /*
4455                          * If escape char is not followed by any expected char, just treat
4456                          * it as ordinary data to copy.  (XXX would it be better to throw
4457                          * an error?)
4458                          */
4459                         appendStringInfoChar(str, '\\');
4460                         continue;
4461                 }
4462
4463                 if (so >= 0 && eo >= 0)
4464                 {
4465                         /*
4466                          * Copy the text that is back reference of regexp.  Note so and eo
4467                          * are counted in characters not bytes.
4468                          */
4469                         char       *chunk_start;
4470                         int                     chunk_len;
4471
4472                         Assert(so >= data_pos);
4473                         chunk_start = start_ptr;
4474                         chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4475                         chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4476                         appendBinaryStringInfo(str, chunk_start, chunk_len);
4477                 }
4478         }
4479 }
4480
4481 /*
4482  * replace_text_regexp
4483  *
4484  * replace substring(s) in src_text that match pattern with replace_text.
4485  * The replace_text can contain backslash markers to substitute
4486  * (parts of) the matched text.
4487  *
4488  * cflags: regexp compile flags.
4489  * collation: collation to use.
4490  * search_start: the character (not byte) offset in src_text at which to
4491  * begin searching.
4492  * n: if 0, replace all matches; if > 0, replace only the N'th match.
4493  */
4494 text *
4495 replace_text_regexp(text *src_text, text *pattern_text,
4496                                         text *replace_text,
4497                                         int cflags, Oid collation,
4498                                         int search_start, int n)
4499 {
4500         text       *ret_text;
4501         regex_t    *re;
4502         int                     src_text_len = VARSIZE_ANY_EXHDR(src_text);
4503         int                     nmatches = 0;
4504         StringInfoData buf;
4505         regmatch_t      pmatch[10];             /* main match, plus \1 to \9 */
4506         int                     nmatch = lengthof(pmatch);
4507         pg_wchar   *data;
4508         size_t          data_len;
4509         int                     data_pos;
4510         char       *start_ptr;
4511         int                     escape_status;
4512
4513         initStringInfo(&buf);
4514
4515         /* Convert data string to wide characters. */
4516         data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4517         data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4518
4519         /* Check whether replace_text has escapes, especially regexp submatches. */
4520         escape_status = check_replace_text_has_escape(replace_text);
4521
4522         /* If no regexp submatches, we can use REG_NOSUB. */
4523         if (escape_status < 2)
4524         {
4525                 cflags |= REG_NOSUB;
4526                 /* Also tell pg_regexec we only want the whole-match location. */
4527                 nmatch = 1;
4528         }
4529
4530         /* Prepare the regexp. */
4531         re = RE_compile_and_cache(pattern_text, cflags, collation);
4532
4533         /* start_ptr points to the data_pos'th character of src_text */
4534         start_ptr = (char *) VARDATA_ANY(src_text);
4535         data_pos = 0;
4536
4537         while (search_start <= data_len)
4538         {
4539                 int                     regexec_result;
4540
4541                 CHECK_FOR_INTERRUPTS();
4542
4543                 regexec_result = pg_regexec(re,
4544                                                                         data,
4545                                                                         data_len,
4546                                                                         search_start,
4547                                                                         NULL,   /* no details */
4548                                                                         nmatch,
4549                                                                         pmatch,
4550                                                                         0);
4551
4552                 if (regexec_result == REG_NOMATCH)
4553                         break;
4554
4555                 if (regexec_result != REG_OKAY)
4556                 {
4557                         char            errMsg[100];
4558
4559                         CHECK_FOR_INTERRUPTS();
4560                         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4561                         ereport(ERROR,
4562                                         (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4563                                          errmsg("regular expression failed: %s", errMsg)));
4564                 }
4565
4566                 /*
4567                  * Count matches, and decide whether to replace this match.
4568                  */
4569                 nmatches++;
4570                 if (n > 0 && nmatches != n)
4571                 {
4572                         /*
4573                          * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4574                          * we treat the matched text as if it weren't matched, and copy it
4575                          * to the output later.)
4576                          */
4577                         search_start = pmatch[0].rm_eo;
4578                         if (pmatch[0].rm_so == pmatch[0].rm_eo)
4579                                 search_start++;
4580                         continue;
4581                 }
4582
4583                 /*
4584                  * Copy the text to the left of the match position.  Note we are given
4585                  * character not byte indexes.
4586                  */
4587                 if (pmatch[0].rm_so - data_pos > 0)
4588                 {
4589                         int                     chunk_len;
4590
4591                         chunk_len = charlen_to_bytelen(start_ptr,
4592                                                                                    pmatch[0].rm_so - data_pos);
4593                         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4594
4595                         /*
4596                          * Advance start_ptr over that text, to avoid multiple rescans of
4597                          * it if the replace_text contains multiple back-references.
4598                          */
4599                         start_ptr += chunk_len;
4600                         data_pos = pmatch[0].rm_so;
4601                 }
4602
4603                 /*
4604                  * Copy the replace_text, processing escapes if any are present.
4605                  */
4606                 if (escape_status > 0)
4607                         appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4608                                                                                  start_ptr, data_pos);
4609                 else
4610                         appendStringInfoText(&buf, replace_text);
4611
4612                 /* Advance start_ptr and data_pos over the matched text. */
4613                 start_ptr += charlen_to_bytelen(start_ptr,
4614                                                                                 pmatch[0].rm_eo - data_pos);
4615                 data_pos = pmatch[0].rm_eo;
4616
4617                 /*
4618                  * If we only want to replace one occurrence, we're done.
4619                  */
4620                 if (n > 0)
4621                         break;
4622
4623                 /*
4624                  * Advance search position.  Normally we start the next search at the
4625                  * end of the previous match; but if the match was of zero length, we
4626                  * have to advance by one character, or we'd just find the same match
4627                  * again.
4628                  */
4629                 search_start = data_pos;
4630                 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4631                         search_start++;
4632         }
4633
4634         /*
4635          * Copy the text to the right of the last match.
4636          */
4637         if (data_pos < data_len)
4638         {
4639                 int                     chunk_len;
4640
4641                 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4642                 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4643         }
4644
4645         ret_text = cstring_to_text_with_len(buf.data, buf.len);
4646         pfree(buf.data);
4647         pfree(data);
4648
4649         return ret_text;
4650 }
4651
4652 /*
4653  * split_part
4654  * parse input string based on provided field separator
4655  * return N'th item (1 based, negative counts from end)
4656  */
4657 Datum
4658 split_part(PG_FUNCTION_ARGS)
4659 {
4660         text       *inputstring = PG_GETARG_TEXT_PP(0);
4661         text       *fldsep = PG_GETARG_TEXT_PP(1);
4662         int                     fldnum = PG_GETARG_INT32(2);
4663         int                     inputstring_len;
4664         int                     fldsep_len;
4665         TextPositionState state;
4666         char       *start_ptr;
4667         char       *end_ptr;
4668         text       *result_text;
4669         bool            found;
4670
4671         /* field number is 1 based */
4672         if (fldnum == 0)
4673                 ereport(ERROR,
4674                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4675                                  errmsg("field position must not be zero")));
4676
4677         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4678         fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4679
4680         /* return empty string for empty input string */
4681         if (inputstring_len < 1)
4682                 PG_RETURN_TEXT_P(cstring_to_text(""));
4683
4684         /* handle empty field separator */
4685         if (fldsep_len < 1)
4686         {
4687                 /* if first or last field, return input string, else empty string */
4688                 if (fldnum == 1 || fldnum == -1)
4689                         PG_RETURN_TEXT_P(inputstring);
4690                 else
4691                         PG_RETURN_TEXT_P(cstring_to_text(""));
4692         }
4693
4694         /* find the first field separator */
4695         text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4696
4697         found = text_position_next(&state);
4698
4699         /* special case if fldsep not found at all */
4700         if (!found)
4701         {
4702                 text_position_cleanup(&state);
4703                 /* if first or last field, return input string, else empty string */
4704                 if (fldnum == 1 || fldnum == -1)
4705                         PG_RETURN_TEXT_P(inputstring);
4706                 else
4707                         PG_RETURN_TEXT_P(cstring_to_text(""));
4708         }
4709
4710         /*
4711          * take care of a negative field number (i.e. count from the right) by
4712          * converting to a positive field number; we need total number of fields
4713          */
4714         if (fldnum < 0)
4715         {
4716                 /* we found a fldsep, so there are at least two fields */
4717                 int                     numfields = 2;
4718
4719                 while (text_position_next(&state))
4720                         numfields++;
4721
4722                 /* special case of last field does not require an extra pass */
4723                 if (fldnum == -1)
4724                 {
4725                         start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4726                         end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4727                         text_position_cleanup(&state);
4728                         PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4729                                                                                                           end_ptr - start_ptr));
4730                 }
4731
4732                 /* else, convert fldnum to positive notation */
4733                 fldnum += numfields + 1;
4734
4735                 /* if nonexistent field, return empty string */
4736                 if (fldnum <= 0)
4737                 {
4738                         text_position_cleanup(&state);
4739                         PG_RETURN_TEXT_P(cstring_to_text(""));
4740                 }
4741
4742                 /* reset to pointing at first match, but now with positive fldnum */
4743                 text_position_reset(&state);
4744                 found = text_position_next(&state);
4745                 Assert(found);
4746         }
4747
4748         /* identify bounds of first field */
4749         start_ptr = VARDATA_ANY(inputstring);
4750         end_ptr = text_position_get_match_ptr(&state);
4751
4752         while (found && --fldnum > 0)
4753         {
4754                 /* identify bounds of next field */
4755                 start_ptr = end_ptr + fldsep_len;
4756                 found = text_position_next(&state);
4757                 if (found)
4758                         end_ptr = text_position_get_match_ptr(&state);
4759         }
4760
4761         text_position_cleanup(&state);
4762
4763         if (fldnum > 0)
4764         {
4765                 /* N'th field separator not found */
4766                 /* if last field requested, return it, else empty string */
4767                 if (fldnum == 1)
4768                 {
4769                         int                     last_len = start_ptr - VARDATA_ANY(inputstring);
4770
4771                         result_text = cstring_to_text_with_len(start_ptr,
4772                                                                                                    inputstring_len - last_len);
4773                 }
4774                 else
4775                         result_text = cstring_to_text("");
4776         }
4777         else
4778         {
4779                 /* non-last field requested */
4780                 result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4781         }
4782
4783         PG_RETURN_TEXT_P(result_text);
4784 }
4785
4786 /*
4787  * Convenience function to return true when two text params are equal.
4788  */
4789 static bool
4790 text_isequal(text *txt1, text *txt2, Oid collid)
4791 {
4792         return DatumGetBool(DirectFunctionCall2Coll(texteq,
4793                                                                                                 collid,
4794                                                                                                 PointerGetDatum(txt1),
4795                                                                                                 PointerGetDatum(txt2)));
4796 }
4797
4798 /*
4799  * text_to_array
4800  * parse input string and return text array of elements,
4801  * based on provided field separator
4802  */
4803 Datum
4804 text_to_array(PG_FUNCTION_ARGS)
4805 {
4806         SplitTextOutputData tstate;
4807
4808         /* For array output, tstate should start as all zeroes */
4809         memset(&tstate, 0, sizeof(tstate));
4810
4811         if (!split_text(fcinfo, &tstate))
4812                 PG_RETURN_NULL();
4813
4814         if (tstate.astate == NULL)
4815                 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4816
4817         PG_RETURN_ARRAYTYPE_P(makeArrayResult(tstate.astate,
4818                                                                                   CurrentMemoryContext));
4819 }
4820
4821 /*
4822  * text_to_array_null
4823  * parse input string and return text array of elements,
4824  * based on provided field separator and null string
4825  *
4826  * This is a separate entry point only to prevent the regression tests from
4827  * complaining about different argument sets for the same internal function.
4828  */
4829 Datum
4830 text_to_array_null(PG_FUNCTION_ARGS)
4831 {
4832         return text_to_array(fcinfo);
4833 }
4834
4835 /*
4836  * text_to_table
4837  * parse input string and return table of elements,
4838  * based on provided field separator
4839  */
4840 Datum
4841 text_to_table(PG_FUNCTION_ARGS)
4842 {
4843         ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4844         SplitTextOutputData tstate;
4845         MemoryContext old_cxt;
4846
4847         /* check to see if caller supports us returning a tuplestore */
4848         if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
4849                 ereport(ERROR,
4850                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4851                                  errmsg("set-valued function called in context that cannot accept a set")));
4852         if (!(rsi->allowedModes & SFRM_Materialize))
4853                 ereport(ERROR,
4854                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4855                                  errmsg("materialize mode required, but it is not allowed in this context")));
4856
4857         /* OK, prepare tuplestore in per-query memory */
4858         old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory);
4859
4860         tstate.astate = NULL;
4861         tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
4862         tstate.tupstore = tuplestore_begin_heap(true, false, work_mem);
4863
4864         MemoryContextSwitchTo(old_cxt);
4865
4866         (void) split_text(fcinfo, &tstate);
4867
4868         tuplestore_donestoring(tstate.tupstore);
4869
4870         rsi->returnMode = SFRM_Materialize;
4871         rsi->setResult = tstate.tupstore;
4872         rsi->setDesc = tstate.tupdesc;
4873
4874         return (Datum) 0;
4875 }
4876
4877 /*
4878  * text_to_table_null
4879  * parse input string and return table of elements,
4880  * based on provided field separator and null string
4881  *
4882  * This is a separate entry point only to prevent the regression tests from
4883  * complaining about different argument sets for the same internal function.
4884  */
4885 Datum
4886 text_to_table_null(PG_FUNCTION_ARGS)
4887 {
4888         return text_to_table(fcinfo);
4889 }
4890
4891 /*
4892  * Common code for text_to_array, text_to_array_null, text_to_table
4893  * and text_to_table_null functions.
4894  *
4895  * These are not strict so we have to test for null inputs explicitly.
4896  * Returns false if result is to be null, else returns true.
4897  *
4898  * Note that if the result is valid but empty (zero elements), we return
4899  * without changing *tstate --- caller must handle that case, too.
4900  */
4901 static bool
4902 split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4903 {
4904         text       *inputstring;
4905         text       *fldsep;
4906         text       *null_string;
4907         Oid                     collation = PG_GET_COLLATION();
4908         int                     inputstring_len;
4909         int                     fldsep_len;
4910         char       *start_ptr;
4911         text       *result_text;
4912
4913         /* when input string is NULL, then result is NULL too */
4914         if (PG_ARGISNULL(0))
4915                 return false;
4916
4917         inputstring = PG_GETARG_TEXT_PP(0);
4918
4919         /* fldsep can be NULL */
4920         if (!PG_ARGISNULL(1))
4921                 fldsep = PG_GETARG_TEXT_PP(1);
4922         else
4923                 fldsep = NULL;
4924
4925         /* null_string can be NULL or omitted */
4926         if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4927                 null_string = PG_GETARG_TEXT_PP(2);
4928         else
4929                 null_string = NULL;
4930
4931         if (fldsep != NULL)
4932         {
4933                 /*
4934                  * Normal case with non-null fldsep.  Use the text_position machinery
4935                  * to search for occurrences of fldsep.
4936                  */
4937                 TextPositionState state;
4938
4939                 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4940                 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4941
4942                 /* return empty set for empty input string */
4943                 if (inputstring_len < 1)
4944                         return true;
4945
4946                 /* empty field separator: return input string as a one-element set */
4947                 if (fldsep_len < 1)
4948                 {
4949                         split_text_accum_result(tstate, inputstring,
4950                                                                         null_string, collation);
4951                         return true;
4952                 }
4953
4954                 text_position_setup(inputstring, fldsep, collation, &state);
4955
4956                 start_ptr = VARDATA_ANY(inputstring);
4957
4958                 for (;;)
4959                 {
4960                         bool            found;
4961                         char       *end_ptr;
4962                         int                     chunk_len;
4963
4964                         CHECK_FOR_INTERRUPTS();
4965
4966                         found = text_position_next(&state);
4967                         if (!found)
4968                         {
4969                                 /* fetch last field */
4970                                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4971                                 end_ptr = NULL; /* not used, but some compilers complain */
4972                         }
4973                         else
4974                         {
4975                                 /* fetch non-last field */
4976                                 end_ptr = text_position_get_match_ptr(&state);
4977                                 chunk_len = end_ptr - start_ptr;
4978                         }
4979
4980                         /* build a temp text datum to pass to split_text_accum_result */
4981                         result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4982
4983                         /* stash away this field */
4984                         split_text_accum_result(tstate, result_text,
4985                                                                         null_string, collation);
4986
4987                         pfree(result_text);
4988
4989                         if (!found)
4990                                 break;
4991
4992                         start_ptr = end_ptr + fldsep_len;
4993                 }
4994
4995                 text_position_cleanup(&state);
4996         }
4997         else
4998         {
4999                 /*
5000                  * When fldsep is NULL, each character in the input string becomes a
5001                  * separate element in the result set.  The separator is effectively
5002                  * the space between characters.
5003                  */
5004                 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
5005
5006                 start_ptr = VARDATA_ANY(inputstring);
5007
5008                 while (inputstring_len > 0)
5009                 {
5010                         int                     chunk_len = pg_mblen(start_ptr);
5011
5012                         CHECK_FOR_INTERRUPTS();
5013
5014                         /* build a temp text datum to pass to split_text_accum_result */
5015                         result_text = cstring_to_text_with_len(start_ptr, chunk_len);
5016
5017                         /* stash away this field */
5018                         split_text_accum_result(tstate, result_text,
5019                                                                         null_string, collation);
5020
5021                         pfree(result_text);
5022
5023                         start_ptr += chunk_len;
5024                         inputstring_len -= chunk_len;
5025                 }
5026         }
5027
5028         return true;
5029 }
5030
5031 /*
5032  * Add text item to result set (table or array).
5033  *
5034  * This is also responsible for checking to see if the item matches
5035  * the null_string, in which case we should emit NULL instead.
5036  */
5037 static void
5038 split_text_accum_result(SplitTextOutputData *tstate,
5039                                                 text *field_value,
5040                                                 text *null_string,
5041                                                 Oid collation)
5042 {
5043         bool            is_null = false;
5044
5045         if (null_string && text_isequal(field_value, null_string, collation))
5046                 is_null = true;
5047
5048         if (tstate->tupstore)
5049         {
5050                 Datum           values[1];
5051                 bool            nulls[1];
5052
5053                 values[0] = PointerGetDatum(field_value);
5054                 nulls[0] = is_null;
5055
5056                 tuplestore_putvalues(tstate->tupstore,
5057                                                          tstate->tupdesc,
5058                                                          values,
5059                                                          nulls);
5060         }
5061         else
5062         {
5063                 tstate->astate = accumArrayResult(tstate->astate,
5064                                                                                   PointerGetDatum(field_value),
5065                                                                                   is_null,
5066                                                                                   TEXTOID,
5067                                                                                   CurrentMemoryContext);
5068         }
5069 }
5070
5071 /*
5072  * array_to_text
5073  * concatenate Cstring representation of input array elements
5074  * using provided field separator
5075  */
5076 Datum
5077 array_to_text(PG_FUNCTION_ARGS)
5078 {
5079         ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
5080         char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5081
5082         PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5083 }
5084
5085 /*
5086  * array_to_text_null
5087  * concatenate Cstring representation of input array elements
5088  * using provided field separator and null string
5089  *
5090  * This version is not strict so we have to test for null inputs explicitly.
5091  */
5092 Datum
5093 array_to_text_null(PG_FUNCTION_ARGS)
5094 {
5095         ArrayType  *v;
5096         char       *fldsep;
5097         char       *null_string;
5098
5099         /* returns NULL when first or second parameter is NULL */
5100         if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5101                 PG_RETURN_NULL();
5102
5103         v = PG_GETARG_ARRAYTYPE_P(0);
5104         fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5105
5106         /* NULL null string is passed through as a null pointer */
5107         if (!PG_ARGISNULL(2))
5108                 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5109         else
5110                 null_string = NULL;
5111
5112         PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5113 }
5114
5115 /*
5116  * common code for array_to_text and array_to_text_null functions
5117  */
5118 static text *
5119 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
5120                                            const char *fldsep, const char *null_string)
5121 {
5122         text       *result;
5123         int                     nitems,
5124                            *dims,
5125                                 ndims;
5126         Oid                     element_type;
5127         int                     typlen;
5128         bool            typbyval;
5129         char            typalign;
5130         StringInfoData buf;
5131         bool            printed = false;
5132         char       *p;
5133         bits8      *bitmap;
5134         int                     bitmask;
5135         int                     i;
5136         ArrayMetaState *my_extra;
5137
5138         ndims = ARR_NDIM(v);
5139         dims = ARR_DIMS(v);
5140         nitems = ArrayGetNItems(ndims, dims);
5141
5142         /* if there are no elements, return an empty string */
5143         if (nitems == 0)
5144                 return cstring_to_text_with_len("", 0);
5145
5146         element_type = ARR_ELEMTYPE(v);
5147         initStringInfo(&buf);
5148
5149         /*
5150          * We arrange to look up info about element type, including its output
5151          * conversion proc, only once per series of calls, assuming the element
5152          * type doesn't change underneath us.
5153          */
5154         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5155         if (my_extra == NULL)
5156         {
5157                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5158                                                                                                           sizeof(ArrayMetaState));
5159                 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5160                 my_extra->element_type = ~element_type;
5161         }
5162
5163         if (my_extra->element_type != element_type)
5164         {
5165                 /*
5166                  * Get info about element type, including its output conversion proc
5167                  */
5168                 get_type_io_data(element_type, IOFunc_output,
5169                                                  &my_extra->typlen, &my_extra->typbyval,
5170                                                  &my_extra->typalign, &my_extra->typdelim,
5171                                                  &my_extra->typioparam, &my_extra->typiofunc);
5172                 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5173                                           fcinfo->flinfo->fn_mcxt);
5174                 my_extra->element_type = element_type;
5175         }
5176         typlen = my_extra->typlen;
5177         typbyval = my_extra->typbyval;
5178         typalign = my_extra->typalign;
5179
5180         p = ARR_DATA_PTR(v);
5181         bitmap = ARR_NULLBITMAP(v);
5182         bitmask = 1;
5183
5184         for (i = 0; i < nitems; i++)
5185         {
5186                 Datum           itemvalue;
5187                 char       *value;
5188
5189                 /* Get source element, checking for NULL */
5190                 if (bitmap && (*bitmap & bitmask) == 0)
5191                 {
5192                         /* if null_string is NULL, we just ignore null elements */
5193                         if (null_string != NULL)
5194                         {
5195                                 if (printed)
5196                                         appendStringInfo(&buf, "%s%s", fldsep, null_string);
5197                                 else
5198                                         appendStringInfoString(&buf, null_string);
5199                                 printed = true;
5200                         }
5201                 }
5202                 else
5203                 {
5204                         itemvalue = fetch_att(p, typbyval, typlen);
5205
5206                         value = OutputFunctionCall(&my_extra->proc, itemvalue);
5207
5208                         if (printed)
5209                                 appendStringInfo(&buf, "%s%s", fldsep, value);
5210                         else
5211                                 appendStringInfoString(&buf, value);
5212                         printed = true;
5213
5214                         p = att_addlength_pointer(p, typlen, p);
5215                         p = (char *) att_align_nominal(p, typalign);
5216                 }
5217
5218                 /* advance bitmap pointer if any */
5219                 if (bitmap)
5220                 {
5221                         bitmask <<= 1;
5222                         if (bitmask == 0x100)
5223                         {
5224                                 bitmap++;
5225                                 bitmask = 1;
5226                         }
5227                 }
5228         }
5229
5230         result = cstring_to_text_with_len(buf.data, buf.len);
5231         pfree(buf.data);
5232
5233         return result;
5234 }
5235
5236 #define HEXBASE 16
5237 /*
5238  * Convert an int32 to a string containing a base 16 (hex) representation of
5239  * the number.
5240  */
5241 Datum
5242 to_hex32(PG_FUNCTION_ARGS)
5243 {
5244         uint32          value = (uint32) PG_GETARG_INT32(0);
5245         char       *ptr;
5246         const char *digits = "0123456789abcdef";
5247         char            buf[32];                /* bigger than needed, but reasonable */
5248
5249         ptr = buf + sizeof(buf) - 1;
5250         *ptr = '\0';
5251
5252         do
5253         {
5254                 *--ptr = digits[value % HEXBASE];
5255                 value /= HEXBASE;
5256         } while (ptr > buf && value);
5257
5258         PG_RETURN_TEXT_P(cstring_to_text(ptr));
5259 }
5260
5261 /*
5262  * Convert an int64 to a string containing a base 16 (hex) representation of
5263  * the number.
5264  */
5265 Datum
5266 to_hex64(PG_FUNCTION_ARGS)
5267 {
5268         uint64          value = (uint64) PG_GETARG_INT64(0);
5269         char       *ptr;
5270         const char *digits = "0123456789abcdef";
5271         char            buf[32];                /* bigger than needed, but reasonable */
5272
5273         ptr = buf + sizeof(buf) - 1;
5274         *ptr = '\0';
5275
5276         do
5277         {
5278                 *--ptr = digits[value % HEXBASE];
5279                 value /= HEXBASE;
5280         } while (ptr > buf && value);
5281
5282         PG_RETURN_TEXT_P(cstring_to_text(ptr));
5283 }
5284
5285 /*
5286  * Return the size of a datum, possibly compressed
5287  *
5288  * Works on any data type
5289  */
5290 Datum
5291 pg_column_size(PG_FUNCTION_ARGS)
5292 {
5293         Datum           value = PG_GETARG_DATUM(0);
5294         int32           result;
5295         int                     typlen;
5296
5297         /* On first call, get the input type's typlen, and save at *fn_extra */
5298         if (fcinfo->flinfo->fn_extra == NULL)
5299         {
5300                 /* Lookup the datatype of the supplied argument */
5301                 Oid                     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5302
5303                 typlen = get_typlen(argtypeid);
5304                 if (typlen == 0)                /* should not happen */
5305                         elog(ERROR, "cache lookup failed for type %u", argtypeid);
5306
5307                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5308                                                                                                           sizeof(int));
5309                 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5310         }
5311         else
5312                 typlen = *((int *) fcinfo->flinfo->fn_extra);
5313
5314         if (typlen == -1)
5315         {
5316                 /* varlena type, possibly toasted */
5317                 result = toast_datum_size(value);
5318         }
5319         else if (typlen == -2)
5320         {
5321                 /* cstring */
5322                 result = strlen(DatumGetCString(value)) + 1;
5323         }
5324         else
5325         {
5326                 /* ordinary fixed-width type */
5327                 result = typlen;
5328         }
5329
5330         PG_RETURN_INT32(result);
5331 }
5332
5333 /*
5334  * Return the compression method stored in the compressed attribute.  Return
5335  * NULL for non varlena type or uncompressed data.
5336  */
5337 Datum
5338 pg_column_compression(PG_FUNCTION_ARGS)
5339 {
5340         int                     typlen;
5341         char       *result;
5342         ToastCompressionId cmid;
5343
5344         /* On first call, get the input type's typlen, and save at *fn_extra */
5345         if (fcinfo->flinfo->fn_extra == NULL)
5346         {
5347                 /* Lookup the datatype of the supplied argument */
5348                 Oid                     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5349
5350                 typlen = get_typlen(argtypeid);
5351                 if (typlen == 0)                /* should not happen */
5352                         elog(ERROR, "cache lookup failed for type %u", argtypeid);
5353
5354                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5355                                                                                                           sizeof(int));
5356                 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5357         }
5358         else
5359                 typlen = *((int *) fcinfo->flinfo->fn_extra);
5360
5361         if (typlen != -1)
5362                 PG_RETURN_NULL();
5363
5364         /* get the compression method id stored in the compressed varlena */
5365         cmid = toast_get_compression_id((struct varlena *)
5366                                                                         DatumGetPointer(PG_GETARG_DATUM(0)));
5367         if (cmid == TOAST_INVALID_COMPRESSION_ID)
5368                 PG_RETURN_NULL();
5369
5370         /* convert compression method id to compression method name */
5371         switch (cmid)
5372         {
5373                 case TOAST_PGLZ_COMPRESSION_ID:
5374                         result = "pglz";
5375                         break;
5376                 case TOAST_LZ4_COMPRESSION_ID:
5377                         result = "lz4";
5378                         break;
5379                 default:
5380                         elog(ERROR, "invalid compression method id %d", cmid);
5381         }
5382
5383         PG_RETURN_TEXT_P(cstring_to_text(result));
5384 }
5385
5386 /*
5387  * string_agg - Concatenates values and returns string.
5388  *
5389  * Syntax: string_agg(value text, delimiter text) RETURNS text
5390  *
5391  * Note: Any NULL values are ignored. The first-call delimiter isn't
5392  * actually used at all, and on subsequent calls the delimiter precedes
5393  * the associated value.
5394  */
5395
5396 /* subroutine to initialize state */
5397 static StringInfo
5398 makeStringAggState(FunctionCallInfo fcinfo)
5399 {
5400         StringInfo      state;
5401         MemoryContext aggcontext;
5402         MemoryContext oldcontext;
5403
5404         if (!AggCheckCallContext(fcinfo, &aggcontext))
5405         {
5406                 /* cannot be called directly because of internal-type argument */
5407                 elog(ERROR, "string_agg_transfn called in non-aggregate context");
5408         }
5409
5410         /*
5411          * Create state in aggregate context.  It'll stay there across subsequent
5412          * calls.
5413          */
5414         oldcontext = MemoryContextSwitchTo(aggcontext);
5415         state = makeStringInfo();
5416         MemoryContextSwitchTo(oldcontext);
5417
5418         return state;
5419 }
5420
5421 Datum
5422 string_agg_transfn(PG_FUNCTION_ARGS)
5423 {
5424         StringInfo      state;
5425
5426         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5427
5428         /* Append the value unless null. */
5429         if (!PG_ARGISNULL(1))
5430         {
5431                 /* On the first time through, we ignore the delimiter. */
5432                 if (state == NULL)
5433                         state = makeStringAggState(fcinfo);
5434                 else if (!PG_ARGISNULL(2))
5435                         appendStringInfoText(state, PG_GETARG_TEXT_PP(2));      /* delimiter */
5436
5437                 appendStringInfoText(state, PG_GETARG_TEXT_PP(1));      /* value */
5438         }
5439
5440         /*
5441          * The transition type for string_agg() is declared to be "internal",
5442          * which is a pass-by-value type the same size as a pointer.
5443          */
5444         PG_RETURN_POINTER(state);
5445 }
5446
5447 Datum
5448 string_agg_finalfn(PG_FUNCTION_ARGS)
5449 {
5450         StringInfo      state;
5451
5452         /* cannot be called directly because of internal-type argument */
5453         Assert(AggCheckCallContext(fcinfo, NULL));
5454
5455         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5456
5457         if (state != NULL)
5458                 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5459         else
5460                 PG_RETURN_NULL();
5461 }
5462
5463 /*
5464  * Prepare cache with fmgr info for the output functions of the datatypes of
5465  * the arguments of a concat-like function, beginning with argument "argidx".
5466  * (Arguments before that will have corresponding slots in the resulting
5467  * FmgrInfo array, but we don't fill those slots.)
5468  */
5469 static FmgrInfo *
5470 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5471 {
5472         FmgrInfo   *foutcache;
5473         int                     i;
5474
5475         /* We keep the info in fn_mcxt so it survives across calls */
5476         foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5477                                                                                                 PG_NARGS() * sizeof(FmgrInfo));
5478
5479         for (i = argidx; i < PG_NARGS(); i++)
5480         {
5481                 Oid                     valtype;
5482                 Oid                     typOutput;
5483                 bool            typIsVarlena;
5484
5485                 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5486                 if (!OidIsValid(valtype))
5487                         elog(ERROR, "could not determine data type of concat() input");
5488
5489                 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5490                 fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5491         }
5492
5493         fcinfo->flinfo->fn_extra = foutcache;
5494
5495         return foutcache;
5496 }
5497
5498 /*
5499  * Implementation of both concat() and concat_ws().
5500  *
5501  * sepstr is the separator string to place between values.
5502  * argidx identifies the first argument to concatenate (counting from zero);
5503  * note that this must be constant across any one series of calls.
5504  *
5505  * Returns NULL if result should be NULL, else text value.
5506  */
5507 static text *
5508 concat_internal(const char *sepstr, int argidx,
5509                                 FunctionCallInfo fcinfo)
5510 {
5511         text       *result;
5512         StringInfoData str;
5513         FmgrInfo   *foutcache;
5514         bool            first_arg = true;
5515         int                     i;
5516
5517         /*
5518          * concat(VARIADIC some-array) is essentially equivalent to
5519          * array_to_text(), ie concat the array elements with the given separator.
5520          * So we just pass the case off to that code.
5521          */
5522         if (get_fn_expr_variadic(fcinfo->flinfo))
5523         {
5524                 ArrayType  *arr;
5525
5526                 /* Should have just the one argument */
5527                 Assert(argidx == PG_NARGS() - 1);
5528
5529                 /* concat(VARIADIC NULL) is defined as NULL */
5530                 if (PG_ARGISNULL(argidx))
5531                         return NULL;
5532
5533                 /*
5534                  * Non-null argument had better be an array.  We assume that any call
5535                  * context that could let get_fn_expr_variadic return true will have
5536                  * checked that a VARIADIC-labeled parameter actually is an array.  So
5537                  * it should be okay to just Assert that it's an array rather than
5538                  * doing a full-fledged error check.
5539                  */
5540                 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5541
5542                 /* OK, safe to fetch the array value */
5543                 arr = PG_GETARG_ARRAYTYPE_P(argidx);
5544
5545                 /*
5546                  * And serialize the array.  We tell array_to_text to ignore null
5547                  * elements, which matches the behavior of the loop below.
5548                  */
5549                 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5550         }
5551
5552         /* Normal case without explicit VARIADIC marker */
5553         initStringInfo(&str);
5554
5555         /* Get output function info, building it if first time through */
5556         foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5557         if (foutcache == NULL)
5558                 foutcache = build_concat_foutcache(fcinfo, argidx);
5559
5560         for (i = argidx; i < PG_NARGS(); i++)
5561         {
5562                 if (!PG_ARGISNULL(i))
5563                 {
5564                         Datum           value = PG_GETARG_DATUM(i);
5565
5566                         /* add separator if appropriate */
5567                         if (first_arg)
5568                                 first_arg = false;
5569                         else
5570                                 appendStringInfoString(&str, sepstr);
5571
5572                         /* call the appropriate type output function, append the result */
5573                         appendStringInfoString(&str,
5574                                                                    OutputFunctionCall(&foutcache[i], value));
5575                 }
5576         }
5577
5578         result = cstring_to_text_with_len(str.data, str.len);
5579         pfree(str.data);
5580
5581         return result;
5582 }
5583
5584 /*
5585  * Concatenate all arguments. NULL arguments are ignored.
5586  */
5587 Datum
5588 text_concat(PG_FUNCTION_ARGS)
5589 {
5590         text       *result;
5591
5592         result = concat_internal("", 0, fcinfo);
5593         if (result == NULL)
5594                 PG_RETURN_NULL();
5595         PG_RETURN_TEXT_P(result);
5596 }
5597
5598 /*
5599  * Concatenate all but first argument value with separators. The first
5600  * parameter is used as the separator. NULL arguments are ignored.
5601  */
5602 Datum
5603 text_concat_ws(PG_FUNCTION_ARGS)
5604 {
5605         char       *sep;
5606         text       *result;
5607
5608         /* return NULL when separator is NULL */
5609         if (PG_ARGISNULL(0))
5610                 PG_RETURN_NULL();
5611         sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5612
5613         result = concat_internal(sep, 1, fcinfo);
5614         if (result == NULL)
5615                 PG_RETURN_NULL();
5616         PG_RETURN_TEXT_P(result);
5617 }
5618
5619 /*
5620  * Return first n characters in the string. When n is negative,
5621  * return all but last |n| characters.
5622  */
5623 Datum
5624 text_left(PG_FUNCTION_ARGS)
5625 {
5626         int                     n = PG_GETARG_INT32(1);
5627
5628         if (n < 0)
5629         {
5630                 text       *str = PG_GETARG_TEXT_PP(0);
5631                 const char *p = VARDATA_ANY(str);
5632                 int                     len = VARSIZE_ANY_EXHDR(str);
5633                 int                     rlen;
5634
5635                 n = pg_mbstrlen_with_len(p, len) + n;
5636                 rlen = pg_mbcharcliplen(p, len, n);
5637                 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5638         }
5639         else
5640                 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5641 }
5642
5643 /*
5644  * Return last n characters in the string. When n is negative,
5645  * return all but first |n| characters.
5646  */
5647 Datum
5648 text_right(PG_FUNCTION_ARGS)
5649 {
5650         text       *str = PG_GETARG_TEXT_PP(0);
5651         const char *p = VARDATA_ANY(str);
5652         int                     len = VARSIZE_ANY_EXHDR(str);
5653         int                     n = PG_GETARG_INT32(1);
5654         int                     off;
5655
5656         if (n < 0)
5657                 n = -n;
5658         else
5659                 n = pg_mbstrlen_with_len(p, len) - n;
5660         off = pg_mbcharcliplen(p, len, n);
5661
5662         PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5663 }
5664
5665 /*
5666  * Return reversed string
5667  */
5668 Datum
5669 text_reverse(PG_FUNCTION_ARGS)
5670 {
5671         text       *str = PG_GETARG_TEXT_PP(0);
5672         const char *p = VARDATA_ANY(str);
5673         int                     len = VARSIZE_ANY_EXHDR(str);
5674         const char *endp = p + len;
5675         text       *result;
5676         char       *dst;
5677
5678         result = palloc(len + VARHDRSZ);
5679         dst = (char *) VARDATA(result) + len;
5680         SET_VARSIZE(result, len + VARHDRSZ);
5681
5682         if (pg_database_encoding_max_length() > 1)
5683         {
5684                 /* multibyte version */
5685                 while (p < endp)
5686                 {
5687                         int                     sz;
5688
5689                         sz = pg_mblen(p);
5690                         dst -= sz;
5691                         memcpy(dst, p, sz);
5692                         p += sz;
5693                 }
5694         }
5695         else
5696         {
5697                 /* single byte version */
5698                 while (p < endp)
5699                         *(--dst) = *p++;
5700         }
5701
5702         PG_RETURN_TEXT_P(result);
5703 }
5704
5705
/*
 * Definitions shared by text_format() and its parsing helpers
 */

/* flag bit recording that a '-' flag appeared in the format specifier */
#define TEXT_FORMAT_FLAG_MINUS	0x0001

/*
 * Step "ptr" forward by one character while inside a format specifier.
 * Reaching "end_ptr" means the specifier was cut short, which is reported
 * as an error.  "ptr" is evaluated exactly once.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (!(++(ptr) < (end_ptr))) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5719
5720 /*
5721  * Returns a formatted string
5722  */
5723 Datum
5724 text_format(PG_FUNCTION_ARGS)
5725 {
5726         text       *fmt;
5727         StringInfoData str;
5728         const char *cp;
5729         const char *start_ptr;
5730         const char *end_ptr;
5731         text       *result;
5732         int                     arg;
5733         bool            funcvariadic;
5734         int                     nargs;
5735         Datum      *elements = NULL;
5736         bool       *nulls = NULL;
5737         Oid                     element_type = InvalidOid;
5738         Oid                     prev_type = InvalidOid;
5739         Oid                     prev_width_type = InvalidOid;
5740         FmgrInfo        typoutputfinfo;
5741         FmgrInfo        typoutputinfo_width;
5742
5743         /* When format string is null, immediately return null */
5744         if (PG_ARGISNULL(0))
5745                 PG_RETURN_NULL();
5746
5747         /* If argument is marked VARIADIC, expand array into elements */
5748         if (get_fn_expr_variadic(fcinfo->flinfo))
5749         {
5750                 ArrayType  *arr;
5751                 int16           elmlen;
5752                 bool            elmbyval;
5753                 char            elmalign;
5754                 int                     nitems;
5755
5756                 /* Should have just the one argument */
5757                 Assert(PG_NARGS() == 2);
5758
5759                 /* If argument is NULL, we treat it as zero-length array */
5760                 if (PG_ARGISNULL(1))
5761                         nitems = 0;
5762                 else
5763                 {
5764                         /*
5765                          * Non-null argument had better be an array.  We assume that any
5766                          * call context that could let get_fn_expr_variadic return true
5767                          * will have checked that a VARIADIC-labeled parameter actually is
5768                          * an array.  So it should be okay to just Assert that it's an
5769                          * array rather than doing a full-fledged error check.
5770                          */
5771                         Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5772
5773                         /* OK, safe to fetch the array value */
5774                         arr = PG_GETARG_ARRAYTYPE_P(1);
5775
5776                         /* Get info about array element type */
5777                         element_type = ARR_ELEMTYPE(arr);
5778                         get_typlenbyvalalign(element_type,
5779                                                                  &elmlen, &elmbyval, &elmalign);
5780
5781                         /* Extract all array elements */
5782                         deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5783                                                           &elements, &nulls, &nitems);
5784                 }
5785
5786                 nargs = nitems + 1;
5787                 funcvariadic = true;
5788         }
5789         else
5790         {
5791                 /* Non-variadic case, we'll process the arguments individually */
5792                 nargs = PG_NARGS();
5793                 funcvariadic = false;
5794         }
5795
5796         /* Setup for main loop. */
5797         fmt = PG_GETARG_TEXT_PP(0);
5798         start_ptr = VARDATA_ANY(fmt);
5799         end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5800         initStringInfo(&str);
5801         arg = 1;                                        /* next argument position to print */
5802
5803         /* Scan format string, looking for conversion specifiers. */
5804         for (cp = start_ptr; cp < end_ptr; cp++)
5805         {
5806                 int                     argpos;
5807                 int                     widthpos;
5808                 int                     flags;
5809                 int                     width;
5810                 Datum           value;
5811                 bool            isNull;
5812                 Oid                     typid;
5813
5814                 /*
5815                  * If it's not the start of a conversion specifier, just copy it to
5816                  * the output buffer.
5817                  */
5818                 if (*cp != '%')
5819                 {
5820                         appendStringInfoCharMacro(&str, *cp);
5821                         continue;
5822                 }
5823
5824                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5825
5826                 /* Easy case: %% outputs a single % */
5827                 if (*cp == '%')
5828                 {
5829                         appendStringInfoCharMacro(&str, *cp);
5830                         continue;
5831                 }
5832
5833                 /* Parse the optional portions of the format specifier */
5834                 cp = text_format_parse_format(cp, end_ptr,
5835                                                                           &argpos, &widthpos,
5836                                                                           &flags, &width);
5837
5838                 /*
5839                  * Next we should see the main conversion specifier.  Whether or not
5840                  * an argument position was present, it's known that at least one
5841                  * character remains in the string at this point.  Experience suggests
5842                  * that it's worth checking that that character is one of the expected
5843                  * ones before we try to fetch arguments, so as to produce the least
5844                  * confusing response to a mis-formatted specifier.
5845                  */
5846                 if (strchr("sIL", *cp) == NULL)
5847                         ereport(ERROR,
5848                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5849                                          errmsg("unrecognized format() type specifier \"%.*s\"",
5850                                                         pg_mblen(cp), cp),
5851                                          errhint("For a single \"%%\" use \"%%%%\".")));
5852
5853                 /* If indirect width was specified, get its value */
5854                 if (widthpos >= 0)
5855                 {
5856                         /* Collect the specified or next argument position */
5857                         if (widthpos > 0)
5858                                 arg = widthpos;
5859                         if (arg >= nargs)
5860                                 ereport(ERROR,
5861                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5862                                                  errmsg("too few arguments for format()")));
5863
5864                         /* Get the value and type of the selected argument */
5865                         if (!funcvariadic)
5866                         {
5867                                 value = PG_GETARG_DATUM(arg);
5868                                 isNull = PG_ARGISNULL(arg);
5869                                 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5870                         }
5871                         else
5872                         {
5873                                 value = elements[arg - 1];
5874                                 isNull = nulls[arg - 1];
5875                                 typid = element_type;
5876                         }
5877                         if (!OidIsValid(typid))
5878                                 elog(ERROR, "could not determine data type of format() input");
5879
5880                         arg++;
5881
5882                         /* We can treat NULL width the same as zero */
5883                         if (isNull)
5884                                 width = 0;
5885                         else if (typid == INT4OID)
5886                                 width = DatumGetInt32(value);
5887                         else if (typid == INT2OID)
5888                                 width = DatumGetInt16(value);
5889                         else
5890                         {
5891                                 /* For less-usual datatypes, convert to text then to int */
5892                                 char       *str;
5893
5894                                 if (typid != prev_width_type)
5895                                 {
5896                                         Oid                     typoutputfunc;
5897                                         bool            typIsVarlena;
5898
5899                                         getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5900                                         fmgr_info(typoutputfunc, &typoutputinfo_width);
5901                                         prev_width_type = typid;
5902                                 }
5903
5904                                 str = OutputFunctionCall(&typoutputinfo_width, value);
5905
5906                                 /* pg_strtoint32 will complain about bad data or overflow */
5907                                 width = pg_strtoint32(str);
5908
5909                                 pfree(str);
5910                         }
5911                 }
5912
5913                 /* Collect the specified or next argument position */
5914                 if (argpos > 0)
5915                         arg = argpos;
5916                 if (arg >= nargs)
5917                         ereport(ERROR,
5918                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5919                                          errmsg("too few arguments for format()")));
5920
5921                 /* Get the value and type of the selected argument */
5922                 if (!funcvariadic)
5923                 {
5924                         value = PG_GETARG_DATUM(arg);
5925                         isNull = PG_ARGISNULL(arg);
5926                         typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5927                 }
5928                 else
5929                 {
5930                         value = elements[arg - 1];
5931                         isNull = nulls[arg - 1];
5932                         typid = element_type;
5933                 }
5934                 if (!OidIsValid(typid))
5935                         elog(ERROR, "could not determine data type of format() input");
5936
5937                 arg++;
5938
5939                 /*
5940                  * Get the appropriate typOutput function, reusing previous one if
5941                  * same type as previous argument.  That's particularly useful in the
5942                  * variadic-array case, but often saves work even for ordinary calls.
5943                  */
5944                 if (typid != prev_type)
5945                 {
5946                         Oid                     typoutputfunc;
5947                         bool            typIsVarlena;
5948
5949                         getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5950                         fmgr_info(typoutputfunc, &typoutputfinfo);
5951                         prev_type = typid;
5952                 }
5953
5954                 /*
5955                  * And now we can format the value.
5956                  */
5957                 switch (*cp)
5958                 {
5959                         case 's':
5960                         case 'I':
5961                         case 'L':
5962                                 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5963                                                                                           value, isNull,
5964                                                                                           flags, width);
5965                                 break;
5966                         default:
5967                                 /* should not get here, because of previous check */
5968                                 ereport(ERROR,
5969                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5970                                                  errmsg("unrecognized format() type specifier \"%.*s\"",
5971                                                                 pg_mblen(cp), cp),
5972                                                  errhint("For a single \"%%\" use \"%%%%\".")));
5973                                 break;
5974                 }
5975         }
5976
5977         /* Don't need deconstruct_array results anymore. */
5978         if (elements != NULL)
5979                 pfree(elements);
5980         if (nulls != NULL)
5981                 pfree(nulls);
5982
5983         /* Generate results. */
5984         result = cstring_to_text_with_len(str.data, str.len);
5985         pfree(str.data);
5986
5987         PG_RETURN_TEXT_P(result);
5988 }
5989
5990 /*
5991  * Parse contiguous digits as a decimal number.
5992  *
5993  * Returns true if some digits could be parsed.
5994  * The value is returned into *value, and *ptr is advanced to the next
5995  * character to be parsed.
5996  *
5997  * Note parsing invariant: at least one character is known available before
5998  * string end (end_ptr) at entry, and this is still true at exit.
5999  */
6000 static bool
6001 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
6002 {
6003         bool            found = false;
6004         const char *cp = *ptr;
6005         int                     val = 0;
6006
6007         while (*cp >= '0' && *cp <= '9')
6008         {
6009                 int8            digit = (*cp - '0');
6010
6011                 if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
6012                         unlikely(pg_add_s32_overflow(val, digit, &val)))
6013                         ereport(ERROR,
6014                                         (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6015                                          errmsg("number is out of range")));
6016                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6017                 found = true;
6018         }
6019
6020         *ptr = cp;
6021         *value = val;
6022
6023         return found;
6024 }
6025
6026 /*
6027  * Parse a format specifier (generally following the SUS printf spec).
6028  *
6029  * We have already advanced over the initial '%', and we are looking for
6030  * [argpos][flags][width]type (but the type character is not consumed here).
6031  *
6032  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6033  * Output parameters:
6034  *      argpos: argument position for value to be printed.  -1 means unspecified.
6035  *      widthpos: argument position for width.  Zero means the argument position
6036  *                      was unspecified (ie, take the next arg) and -1 means no width
6037  *                      argument (width was omitted or specified as a constant).
6038  *      flags: bitmask of flags.
6039  *      width: directly-specified width value.  Zero means the width was omitted
6040  *                      (note it's not necessary to distinguish this case from an explicit
6041  *                      zero width value).
6042  *
6043  * The function result is the next character position to be parsed, ie, the
6044  * location where the type character is/should be.
6045  *
6046  * Note parsing invariant: at least one character is known available before
6047  * string end (end_ptr) at entry, and this is still true at exit.
6048  */
6049 static const char *
6050 text_format_parse_format(const char *start_ptr, const char *end_ptr,
6051                                                  int *argpos, int *widthpos,
6052                                                  int *flags, int *width)
6053 {
6054         const char *cp = start_ptr;
6055         int                     n;
6056
6057         /* set defaults for output parameters */
6058         *argpos = -1;
6059         *widthpos = -1;
6060         *flags = 0;
6061         *width = 0;
6062
6063         /* try to identify first number */
6064         if (text_format_parse_digits(&cp, end_ptr, &n))
6065         {
6066                 if (*cp != '$')
6067                 {
6068                         /* Must be just a width and a type, so we're done */
6069                         *width = n;
6070                         return cp;
6071                 }
6072                 /* The number was argument position */
6073                 *argpos = n;
6074                 /* Explicit 0 for argument index is immediately refused */
6075                 if (n == 0)
6076                         ereport(ERROR,
6077                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6078                                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
6079                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6080         }
6081
6082         /* Handle flags (only minus is supported now) */
6083         while (*cp == '-')
6084         {
6085                 *flags |= TEXT_FORMAT_FLAG_MINUS;
6086                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6087         }
6088
6089         if (*cp == '*')
6090         {
6091                 /* Handle indirect width */
6092                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6093                 if (text_format_parse_digits(&cp, end_ptr, &n))
6094                 {
6095                         /* number in this position must be closed by $ */
6096                         if (*cp != '$')
6097                                 ereport(ERROR,
6098                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6099                                                  errmsg("width argument position must be ended by \"$\"")));
6100                         /* The number was width argument position */
6101                         *widthpos = n;
6102                         /* Explicit 0 for argument index is immediately refused */
6103                         if (n == 0)
6104                                 ereport(ERROR,
6105                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6106                                                  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6107                         ADVANCE_PARSE_POINTER(cp, end_ptr);
6108                 }
6109                 else
6110                         *widthpos = 0;          /* width's argument position is unspecified */
6111         }
6112         else
6113         {
6114                 /* Check for direct width specification */
6115                 if (text_format_parse_digits(&cp, end_ptr, &n))
6116                         *width = n;
6117         }
6118
6119         /* cp should now be pointing at type character */
6120         return cp;
6121 }
6122
6123 /*
6124  * Format a %s, %I, or %L conversion
6125  */
6126 static void
6127 text_format_string_conversion(StringInfo buf, char conversion,
6128                                                           FmgrInfo *typOutputInfo,
6129                                                           Datum value, bool isNull,
6130                                                           int flags, int width)
6131 {
6132         char       *str;
6133
6134         /* Handle NULL arguments before trying to stringify the value. */
6135         if (isNull)
6136         {
6137                 if (conversion == 's')
6138                         text_format_append_string(buf, "", flags, width);
6139                 else if (conversion == 'L')
6140                         text_format_append_string(buf, "NULL", flags, width);
6141                 else if (conversion == 'I')
6142                         ereport(ERROR,
6143                                         (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6144                                          errmsg("null values cannot be formatted as an SQL identifier")));
6145                 return;
6146         }
6147
6148         /* Stringify. */
6149         str = OutputFunctionCall(typOutputInfo, value);
6150
6151         /* Escape. */
6152         if (conversion == 'I')
6153         {
6154                 /* quote_identifier may or may not allocate a new string. */
6155                 text_format_append_string(buf, quote_identifier(str), flags, width);
6156         }
6157         else if (conversion == 'L')
6158         {
6159                 char       *qstr = quote_literal_cstr(str);
6160
6161                 text_format_append_string(buf, qstr, flags, width);
6162                 /* quote_literal_cstr() always allocates a new string */
6163                 pfree(qstr);
6164         }
6165         else
6166                 text_format_append_string(buf, str, flags, width);
6167
6168         /* Cleanup. */
6169         pfree(str);
6170 }
6171
6172 /*
6173  * Append str to buf, padding as directed by flags/width
6174  */
6175 static void
6176 text_format_append_string(StringInfo buf, const char *str,
6177                                                   int flags, int width)
6178 {
6179         bool            align_to_left = false;
6180         int                     len;
6181
6182         /* fast path for typical easy case */
6183         if (width == 0)
6184         {
6185                 appendStringInfoString(buf, str);
6186                 return;
6187         }
6188
6189         if (width < 0)
6190         {
6191                 /* Negative width: implicit '-' flag, then take absolute value */
6192                 align_to_left = true;
6193                 /* -INT_MIN is undefined */
6194                 if (width <= INT_MIN)
6195                         ereport(ERROR,
6196                                         (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6197                                          errmsg("number is out of range")));
6198                 width = -width;
6199         }
6200         else if (flags & TEXT_FORMAT_FLAG_MINUS)
6201                 align_to_left = true;
6202
6203         len = pg_mbstrlen(str);
6204         if (align_to_left)
6205         {
6206                 /* left justify */
6207                 appendStringInfoString(buf, str);
6208                 if (len < width)
6209                         appendStringInfoSpaces(buf, width - len);
6210         }
6211         else
6212         {
6213                 /* right justify */
6214                 if (len < width)
6215                         appendStringInfoSpaces(buf, width - len);
6216                 appendStringInfoString(buf, str);
6217         }
6218 }
6219
6220 /*
6221  * text_format_nv - nonvariadic wrapper for text_format function.
6222  *
6223  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6224  * which checks that all built-in functions that share the implementing C
6225  * function take the same number of arguments.
6226  */
6227 Datum
6228 text_format_nv(PG_FUNCTION_ARGS)
6229 {
6230         return text_format(fcinfo);
6231 }
6232
/*
 * rest_of_char_same
 *
 * Helper for the Levenshtein distance functions: report whether the first
 * "len" bytes of s1 and s2 are identical.  Bytes are compared from the last
 * one back to the first.  A "len" of zero or less yields true.
 *
 * For this use case a hand-written loop is faster than memcmp().
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6248
6249 /* Expand each Levenshtein distance variant */
6250 #include "levenshtein.c"
6251 #define LEVENSHTEIN_LESS_EQUAL
6252 #include "levenshtein.c"
6253
6254
6255 /*
6256  * Unicode support
6257  */
6258
6259 static UnicodeNormalizationForm
6260 unicode_norm_form_from_string(const char *formstr)
6261 {
6262         UnicodeNormalizationForm form = -1;
6263
6264         /*
6265          * Might as well check this while we're here.
6266          */
6267         if (GetDatabaseEncoding() != PG_UTF8)
6268                 ereport(ERROR,
6269                                 (errcode(ERRCODE_SYNTAX_ERROR),
6270                                  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6271
6272         if (pg_strcasecmp(formstr, "NFC") == 0)
6273                 form = UNICODE_NFC;
6274         else if (pg_strcasecmp(formstr, "NFD") == 0)
6275                 form = UNICODE_NFD;
6276         else if (pg_strcasecmp(formstr, "NFKC") == 0)
6277                 form = UNICODE_NFKC;
6278         else if (pg_strcasecmp(formstr, "NFKD") == 0)
6279                 form = UNICODE_NFKD;
6280         else
6281                 ereport(ERROR,
6282                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6283                                  errmsg("invalid normalization form: %s", formstr)));
6284
6285         return form;
6286 }
6287
6288 Datum
6289 unicode_normalize_func(PG_FUNCTION_ARGS)
6290 {
6291         text       *input = PG_GETARG_TEXT_PP(0);
6292         char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6293         UnicodeNormalizationForm form;
6294         int                     size;
6295         pg_wchar   *input_chars;
6296         pg_wchar   *output_chars;
6297         unsigned char *p;
6298         text       *result;
6299         int                     i;
6300
6301         form = unicode_norm_form_from_string(formstr);
6302
6303         /* convert to pg_wchar */
6304         size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6305         input_chars = palloc((size + 1) * sizeof(pg_wchar));
6306         p = (unsigned char *) VARDATA_ANY(input);
6307         for (i = 0; i < size; i++)
6308         {
6309                 input_chars[i] = utf8_to_unicode(p);
6310                 p += pg_utf_mblen(p);
6311         }
6312         input_chars[i] = (pg_wchar) '\0';
6313         Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6314
6315         /* action */
6316         output_chars = unicode_normalize(form, input_chars);
6317
6318         /* convert back to UTF-8 string */
6319         size = 0;
6320         for (pg_wchar *wp = output_chars; *wp; wp++)
6321         {
6322                 unsigned char buf[4];
6323
6324                 unicode_to_utf8(*wp, buf);
6325                 size += pg_utf_mblen(buf);
6326         }
6327
6328         result = palloc(size + VARHDRSZ);
6329         SET_VARSIZE(result, size + VARHDRSZ);
6330
6331         p = (unsigned char *) VARDATA_ANY(result);
6332         for (pg_wchar *wp = output_chars; *wp; wp++)
6333         {
6334                 unicode_to_utf8(*wp, p);
6335                 p += pg_utf_mblen(p);
6336         }
6337         Assert((char *) p == (char *) result + size + VARHDRSZ);
6338
6339         PG_RETURN_TEXT_P(result);
6340 }
6341
6342 /*
6343  * Check whether the string is in the specified Unicode normalization form.
6344  *
6345  * This is done by converting the string to the specified normal form and then
6346  * comparing that to the original string.  To speed that up, we also apply the
6347  * "quick check" algorithm specified in UAX #15, which can give a yes or no
6348  * answer for many strings by just scanning the string once.
6349  *
6350  * This function should generally be optimized for the case where the string
6351  * is in fact normalized.  In that case, we'll end up looking at the entire
6352  * string, so it's probably not worth doing any incremental conversion etc.
6353  */
6354 Datum
6355 unicode_is_normalized(PG_FUNCTION_ARGS)
6356 {
6357         text       *input = PG_GETARG_TEXT_PP(0);
6358         char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6359         UnicodeNormalizationForm form;
6360         int                     size;
6361         pg_wchar   *input_chars;
6362         pg_wchar   *output_chars;
6363         unsigned char *p;
6364         int                     i;
6365         UnicodeNormalizationQC quickcheck;
6366         int                     output_size;
6367         bool            result;
6368
6369         form = unicode_norm_form_from_string(formstr);
6370
6371         /* convert to pg_wchar */
6372         size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6373         input_chars = palloc((size + 1) * sizeof(pg_wchar));
6374         p = (unsigned char *) VARDATA_ANY(input);
6375         for (i = 0; i < size; i++)
6376         {
6377                 input_chars[i] = utf8_to_unicode(p);
6378                 p += pg_utf_mblen(p);
6379         }
6380         input_chars[i] = (pg_wchar) '\0';
6381         Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6382
6383         /* quick check (see UAX #15) */
6384         quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6385         if (quickcheck == UNICODE_NORM_QC_YES)
6386                 PG_RETURN_BOOL(true);
6387         else if (quickcheck == UNICODE_NORM_QC_NO)
6388                 PG_RETURN_BOOL(false);
6389
6390         /* normalize and compare with original */
6391         output_chars = unicode_normalize(form, input_chars);
6392
6393         output_size = 0;
6394         for (pg_wchar *wp = output_chars; *wp; wp++)
6395                 output_size++;
6396
6397         result = (size == output_size) &&
6398                 (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6399
6400         PG_RETURN_BOOL(result);
6401 }
6402
/*
 * isxdigits_n
 *
 * Return true if each of the first "n" bytes at "instr" is a hexadecimal
 * digit according to isxdigit().  An "n" of zero yields true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	while (instr < stop)
	{
		/* cast first: isxdigit() must not be handed a negative char */
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
6415
6416 static unsigned int
6417 hexval(unsigned char c)
6418 {
6419         if (c >= '0' && c <= '9')
6420                 return c - '0';
6421         if (c >= 'a' && c <= 'f')
6422                 return c - 'a' + 0xA;
6423         if (c >= 'A' && c <= 'F')
6424                 return c - 'A' + 0xA;
6425         elog(ERROR, "invalid hexadecimal digit");
6426         return 0;                                       /* not reached */
6427 }
6428
/*
 * hexval_n
 *
 * Interpret the first "n" bytes at "instr" as a big-endian hexadecimal
 * number and return its value.  Each byte is converted with hexval(), so a
 * non-hex byte raises an error.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	const char *stop = instr + n;

	/* Most significant digit first: shift in one nibble per digit. */
	for (const char *digit = instr; digit < stop; digit++)
		value = (value << 4) | hexval(*digit);

	return value;
}
6442
6443 /*
6444  * Replaces Unicode escape sequences by Unicode characters
6445  */
6446 Datum
6447 unistr(PG_FUNCTION_ARGS)
6448 {
6449         text       *input_text = PG_GETARG_TEXT_PP(0);
6450         char       *instr;
6451         int                     len;
6452         StringInfoData str;
6453         text       *result;
6454         pg_wchar        pair_first = 0;
6455         char            cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6456
6457         instr = VARDATA_ANY(input_text);
6458         len = VARSIZE_ANY_EXHDR(input_text);
6459
6460         initStringInfo(&str);
6461
6462         while (len > 0)
6463         {
6464                 if (instr[0] == '\\')
6465                 {
6466                         if (len >= 2 &&
6467                                 instr[1] == '\\')
6468                         {
6469                                 if (pair_first)
6470                                         goto invalid_pair;
6471                                 appendStringInfoChar(&str, '\\');
6472                                 instr += 2;
6473                                 len -= 2;
6474                         }
6475                         else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6476                                          (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6477                         {
6478                                 pg_wchar        unicode;
6479                                 int                     offset = instr[1] == 'u' ? 2 : 1;
6480
6481                                 unicode = hexval_n(instr + offset, 4);
6482
6483                                 if (!is_valid_unicode_codepoint(unicode))
6484                                         ereport(ERROR,
6485                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6486                                                         errmsg("invalid Unicode code point: %04X", unicode));
6487
6488                                 if (pair_first)
6489                                 {
6490                                         if (is_utf16_surrogate_second(unicode))
6491                                         {
6492                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6493                                                 pair_first = 0;
6494                                         }
6495                                         else
6496                                                 goto invalid_pair;
6497                                 }
6498                                 else if (is_utf16_surrogate_second(unicode))
6499                                         goto invalid_pair;
6500
6501                                 if (is_utf16_surrogate_first(unicode))
6502                                         pair_first = unicode;
6503                                 else
6504                                 {
6505                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6506                                         appendStringInfoString(&str, cbuf);
6507                                 }
6508
6509                                 instr += 4 + offset;
6510                                 len -= 4 + offset;
6511                         }
6512                         else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6513                         {
6514                                 pg_wchar        unicode;
6515
6516                                 unicode = hexval_n(instr + 2, 6);
6517
6518                                 if (!is_valid_unicode_codepoint(unicode))
6519                                         ereport(ERROR,
6520                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6521                                                         errmsg("invalid Unicode code point: %04X", unicode));
6522
6523                                 if (pair_first)
6524                                 {
6525                                         if (is_utf16_surrogate_second(unicode))
6526                                         {
6527                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6528                                                 pair_first = 0;
6529                                         }
6530                                         else
6531                                                 goto invalid_pair;
6532                                 }
6533                                 else if (is_utf16_surrogate_second(unicode))
6534                                         goto invalid_pair;
6535
6536                                 if (is_utf16_surrogate_first(unicode))
6537                                         pair_first = unicode;
6538                                 else
6539                                 {
6540                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6541                                         appendStringInfoString(&str, cbuf);
6542                                 }
6543
6544                                 instr += 8;
6545                                 len -= 8;
6546                         }
6547                         else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6548                         {
6549                                 pg_wchar        unicode;
6550
6551                                 unicode = hexval_n(instr + 2, 8);
6552
6553                                 if (!is_valid_unicode_codepoint(unicode))
6554                                         ereport(ERROR,
6555                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6556                                                         errmsg("invalid Unicode code point: %04X", unicode));
6557
6558                                 if (pair_first)
6559                                 {
6560                                         if (is_utf16_surrogate_second(unicode))
6561                                         {
6562                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6563                                                 pair_first = 0;
6564                                         }
6565                                         else
6566                                                 goto invalid_pair;
6567                                 }
6568                                 else if (is_utf16_surrogate_second(unicode))
6569                                         goto invalid_pair;
6570
6571                                 if (is_utf16_surrogate_first(unicode))
6572                                         pair_first = unicode;
6573                                 else
6574                                 {
6575                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6576                                         appendStringInfoString(&str, cbuf);
6577                                 }
6578
6579                                 instr += 10;
6580                                 len -= 10;
6581                         }
6582                         else
6583                                 ereport(ERROR,
6584                                                 (errcode(ERRCODE_SYNTAX_ERROR),
6585                                                  errmsg("invalid Unicode escape"),
6586                                                  errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6587                 }
6588                 else
6589                 {
6590                         if (pair_first)
6591                                 goto invalid_pair;
6592
6593                         appendStringInfoChar(&str, *instr++);
6594                         len--;
6595                 }
6596         }
6597
6598         /* unfinished surrogate pair? */
6599         if (pair_first)
6600                 goto invalid_pair;
6601
6602         result = cstring_to_text_with_len(str.data, str.len);
6603         pfree(str.data);
6604
6605         PG_RETURN_TEXT_P(result);
6606
6607 invalid_pair:
6608         ereport(ERROR,
6609                         (errcode(ERRCODE_SYNTAX_ERROR),
6610                          errmsg("invalid Unicode surrogate pair")));
6611         PG_RETURN_NULL();                       /* keep compiler quiet */
6612 }