src/backend/utils/adt/varlena.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * varlena.c
   4  *        Functions for the variable-length built-in types.
   5  *
   6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/utils/adt/varlena.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <limits.h>
  19
  20 #include "access/detoast.h"
  21 #include "access/toast_compression.h"
  22 #include "catalog/pg_collation.h"
  23 #include "catalog/pg_type.h"
  24 #include "common/hashfn.h"
  25 #include "common/int.h"
  26 #include "common/unicode_norm.h"
  27 #include "funcapi.h"
  28 #include "lib/hyperloglog.h"
  29 #include "libpq/pqformat.h"
  30 #include "miscadmin.h"
  31 #include "nodes/execnodes.h"
  32 #include "parser/scansup.h"
  33 #include "port/pg_bswap.h"
  34 #include "regex/regex.h"
  35 #include "utils/builtins.h"
  36 #include "utils/bytea.h"
  37 #include "utils/guc.h"
  38 #include "utils/lsyscache.h"
  39 #include "utils/memutils.h"
  40 #include "utils/pg_locale.h"
  41 #include "utils/sortsupport.h"
  42 #include "utils/varlena.h"
  43
  44
  45 /* GUC variable */
  46 int                     bytea_output = BYTEA_OUTPUT_HEX;
  47
  48 typedef struct varlena unknown;
  49 typedef struct varlena VarString;
  50
   51 /*
   52  * Search state shared by the text_position_* functions (setup, next, get_match_ptr, get_match_pos, cleanup).
   53  */
   54 typedef struct
   55 {
   56         bool            is_multibyte_char_in_char;      /* need to check char boundaries? */
   57
   58         char       *str1;                       /* haystack string */
   59         char       *str2;                       /* needle string */
   60         int                     len1;                   /* string lengths in bytes */
   61         int                     len2;                   /* presumably len1 belongs to str1 and len2 to str2 -- confirm */
   62
   63         /* Skip table for Boyer-Moore-Horspool search algorithm: */
   64         int                     skiptablemask;  /* mask for ANDing with skiptable subscripts */
   65         int                     skiptable[256]; /* skip distance for given mismatched char */
   66
   67         char       *last_match;         /* pointer to last match in 'str1' */
   68
   69         /*
   70          * Sometimes we need to convert the byte position of a match to a
   71          * character position.  These store the last position that was converted,
   72          * so that on the next call, we can continue from that point, rather than
   73          * count characters from the very beginning.
   74          */
   75         char       *refpoint;           /* pointer within original haystack string */
   76         int                     refpos;                 /* 0-based character offset of the same point */
   77 } TextPositionState;
  78
   79 typedef struct                  /* NOTE(review): looks like the private sort-support state for the varstr/bpchar/name fastcmp and abbreviation routines -- confirm */
   80 {
   81         char       *buf1;                       /* 1st string, or abbreviation original string
   82                                                                  * buf */
   83         char       *buf2;                       /* 2nd string, or abbreviation strxfrm() buf */
   84         int                     buflen1;                /* Allocated length of buf1 */
   85         int                     buflen2;                /* Allocated length of buf2 */
   86         int                     last_len1;              /* Length of last buf1 string/strxfrm() input */
   87         int                     last_len2;              /* Length of last buf2 string/strxfrm() blob */
   88         int                     last_returned;  /* Last comparison result (cache) */
   89         bool            cache_blob;             /* Does buf2 contain strxfrm() blob, etc? */
   90         bool            collate_c;              /* presumably true when sorting under the "C" collation -- confirm against the setup code */
   91         Oid                     typid;                  /* Actual datatype (text/bpchar/bytea/name) */
   92         hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
   93         hyperLogLogState full_card; /* Full key cardinality state */
   94         double          prop_card;              /* Required cardinality proportion */
   95         pg_locale_t locale;             /* locale handle; presumably consulted by the *_locale comparators -- confirm */
   96 } VarStringSortSupport;
  97
   98 /*
   99  * Output data for split_text(): we output either to an array or a table.
  100  * tupstore and tupdesc must be set up in advance to output to a table.
  101  */
  102 typedef struct
  103 {
  104         ArrayBuildState *astate;        /* array output accumulator (presumably unused for table output -- confirm) */
  105         Tuplestorestate *tupstore;      /* table output: destination tuplestore, set up by the caller */
  106         TupleDesc       tupdesc;        /* table output: tuple descriptor, set up by the caller */
  107 } SplitTextOutputData;
 108
 109 /*
 110  * This should be large enough that most strings will fit, but small enough
 111  * that we feel comfortable putting it on the stack
 112  */
 113 #define TEXTBUFLEN              1024
 114
 115 #define DatumGetUnknownP(X)                     ((unknown *) PG_DETOAST_DATUM(X))
 116 #define DatumGetUnknownPCopy(X)         ((unknown *) PG_DETOAST_DATUM_COPY(X))
 117 #define PG_GETARG_UNKNOWN_P(n)          DatumGetUnknownP(PG_GETARG_DATUM(n))
 118 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
 119 #define PG_RETURN_UNKNOWN_P(x)          PG_RETURN_POINTER(x)
 120
 121 #define DatumGetVarStringP(X)           ((VarString *) PG_DETOAST_DATUM(X))
 122 #define DatumGetVarStringPP(X)          ((VarString *) PG_DETOAST_DATUM_PACKED(X))
 123
 124 static int      varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
 125 static int      bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
 126 static int      namefastcmp_c(Datum x, Datum y, SortSupport ssup);
 127 static int      varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
 128 static int      namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
 129 static int      varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
 130 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
 131 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
 132 static int32 text_length(Datum str);
 133 static text *text_catenate(text *t1, text *t2);
 134 static text *text_substring(Datum str,
 135                                                         int32 start,
 136                                                         int32 length,
 137                                                         bool length_not_specified);
 138 static text *text_overlay(text *t1, text *t2, int sp, int sl);
 139 static int      text_position(text *t1, text *t2, Oid collid);
 140 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
 141 static bool text_position_next(TextPositionState *state);
 142 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
 143 static char *text_position_get_match_ptr(TextPositionState *state);
 144 static int      text_position_get_match_pos(TextPositionState *state);
 145 static void text_position_cleanup(TextPositionState *state);
 146 static void check_collation_set(Oid collid);
 147 static int      text_cmp(text *arg1, text *arg2, Oid collid);
 148 static bytea *bytea_catenate(bytea *t1, bytea *t2);
 149 static bytea *bytea_substring(Datum str,
 150                                                           int S,
 151                                                           int L,
 152                                                           bool length_not_specified);
 153 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
 154 static void appendStringInfoText(StringInfo str, const text *t);
 155 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
 156 static void split_text_accum_result(SplitTextOutputData *tstate,
 157                                                                         text *field_value,
 158                                                                         text *null_string,
 159                                                                         Oid collation);
 160 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
 161                                                                         const char *fldsep, const char *null_string);
 162 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
 163 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
 164                                                                          int *value);
 165 static const char *text_format_parse_format(const char *start_ptr,
 166                                                                                         const char *end_ptr,
 167                                                                                         int *argpos, int *widthpos,
 168                                                                                         int *flags, int *width);
 169 static void text_format_string_conversion(StringInfo buf, char conversion,
 170                                                                                   FmgrInfo *typOutputInfo,
 171                                                                                   Datum value, bool isNull,
 172                                                                                   int flags, int width);
 173 static void text_format_append_string(StringInfo buf, const char *str,
 174                                                                           int flags, int width);
 175
 176
 177 /*****************************************************************************
 178  *       CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                                                  *
 179  *****************************************************************************/
 180
 181 /*
 182  * cstring_to_text
 183  *
 184  * Create a text value from a null-terminated C string.
 185  *
 186  * The new text value is freshly palloc'd with a full-size VARHDR.
 187  */
 188 text *
 189 cstring_to_text(const char *s)
 190 {
 191         return cstring_to_text_with_len(s, strlen(s));
 192 }
 193
 194 /*
 195  * cstring_to_text_with_len
 196  *
 197  * Same as cstring_to_text except the caller specifies the string length;
 198  * the string need not be null_terminated.
 199  */
 200 text *
 201 cstring_to_text_with_len(const char *s, int len)
 202 {
 203         text       *result = (text *) palloc(len + VARHDRSZ);
 204
 205         SET_VARSIZE(result, len + VARHDRSZ);
 206         memcpy(VARDATA(result), s, len);
 207
 208         return result;
 209 }
 210
 211 /*
 212  * text_to_cstring
 213  *
 214  * Create a palloc'd, null-terminated C string from a text value.
 215  *
 216  * We support being passed a compressed or toasted text value.
 217  * This is a bit bogus since such values shouldn't really be referred to as
 218  * "text *", but it seems useful for robustness.  If we didn't handle that
 219  * case here, we'd need another routine that did, anyway.
 220  */
 221 char *
 222 text_to_cstring(const text *t)
 223 {
 224         /* must cast away the const, unfortunately */
 225         text       *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
 226         int                     len = VARSIZE_ANY_EXHDR(tunpacked);
 227         char       *result;
 228
 229         result = (char *) palloc(len + 1);
 230         memcpy(result, VARDATA_ANY(tunpacked), len);
 231         result[len] = '\0';
 232
 233         if (tunpacked != t)
 234                 pfree(tunpacked);
 235
 236         return result;
 237 }
 238
 239 /*
 240  * text_to_cstring_buffer
 241  *
 242  * Copy a text value into a caller-supplied buffer of size dst_len.
 243  *
 244  * The text string is truncated if necessary to fit.  The result is
 245  * guaranteed null-terminated (unless dst_len == 0).
 246  *
 247  * We support being passed a compressed or toasted text value.
 248  * This is a bit bogus since such values shouldn't really be referred to as
 249  * "text *", but it seems useful for robustness.  If we didn't handle that
 250  * case here, we'd need another routine that did, anyway.
 251  */
 252 void
 253 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
 254 {
 255         /* must cast away the const, unfortunately */
 256         text       *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
 257         size_t          src_len = VARSIZE_ANY_EXHDR(srcunpacked);
 258
 259         if (dst_len > 0)
 260         {
 261                 dst_len--;
 262                 if (dst_len >= src_len)
 263                         dst_len = src_len;
 264                 else                                    /* ensure truncation is encoding-safe */
 265                         dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
 266                 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
 267                 dst[dst_len] = '\0';
 268         }
 269
 270         if (srcunpacked != src)
 271                 pfree(srcunpacked);
 272 }
 273
 274
 275 /*****************************************************************************
 276  *       USER I/O ROUTINES                                                                                                               *
 277  *****************************************************************************/
 278
 279
 280 #define VAL(CH)                 ((CH) - '0')
 281 #define DIG(VAL)                ((VAL) + '0')
 282
 283 /*
 284  *              byteain                 - converts from printable representation of byte array
 285  *
 286  *              Non-printable characters must be passed as '\nnn' (octal) and are
 287  *              converted to internal form.  '\' must be passed as '\\'.
 288  *              ereport(ERROR, ...) if bad form.
 289  *
 290  *              BUGS:
 291  *                              The input is scanned twice.
 292  *                              The error checking of input is minimal.
 293  */
 294 Datum
 295 byteain(PG_FUNCTION_ARGS)
 296 {
 297         char       *inputText = PG_GETARG_CSTRING(0);
 298         Node       *escontext = fcinfo->context;
 299         char       *tp;
 300         char       *rp;
 301         int                     bc;
 302         bytea      *result;
 303
 304         /* Recognize hex input */
 305         if (inputText[0] == '\\' && inputText[1] == 'x')
 306         {
 307                 size_t          len = strlen(inputText);
 308
 309                 bc = (len - 2) / 2 + VARHDRSZ;  /* maximum possible length */
 310                 result = palloc(bc);
 311                 bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
 312                                                          escontext);
 313                 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
 314
 315                 PG_RETURN_BYTEA_P(result);
 316         }
 317
 318         /* Else, it's the traditional escaped style */
 319         for (bc = 0, tp = inputText; *tp != '\0'; bc++)
 320         {
 321                 if (tp[0] != '\\')
 322                         tp++;
 323                 else if ((tp[0] == '\\') &&
 324                                  (tp[1] >= '0' && tp[1] <= '3') &&
 325                                  (tp[2] >= '0' && tp[2] <= '7') &&
 326                                  (tp[3] >= '0' && tp[3] <= '7'))
 327                         tp += 4;
 328                 else if ((tp[0] == '\\') &&
 329                                  (tp[1] == '\\'))
 330                         tp += 2;
 331                 else
 332                 {
 333                         /*
 334                          * one backslash, not followed by another or ### valid octal
 335                          */
 336                         ereturn(escontext, (Datum) 0,
 337                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 338                                          errmsg("invalid input syntax for type %s", "bytea")));
 339                 }
 340         }
 341
 342         bc += VARHDRSZ;
 343
 344         result = (bytea *) palloc(bc);
 345         SET_VARSIZE(result, bc);
 346
 347         tp = inputText;
 348         rp = VARDATA(result);
 349         while (*tp != '\0')
 350         {
 351                 if (tp[0] != '\\')
 352                         *rp++ = *tp++;
 353                 else if ((tp[0] == '\\') &&
 354                                  (tp[1] >= '0' && tp[1] <= '3') &&
 355                                  (tp[2] >= '0' && tp[2] <= '7') &&
 356                                  (tp[3] >= '0' && tp[3] <= '7'))
 357                 {
 358                         bc = VAL(tp[1]);
 359                         bc <<= 3;
 360                         bc += VAL(tp[2]);
 361                         bc <<= 3;
 362                         *rp++ = bc + VAL(tp[3]);
 363
 364                         tp += 4;
 365                 }
 366                 else if ((tp[0] == '\\') &&
 367                                  (tp[1] == '\\'))
 368                 {
 369                         *rp++ = '\\';
 370                         tp += 2;
 371                 }
 372                 else
 373                 {
 374                         /*
 375                          * We should never get here. The first pass should not allow it.
 376                          */
 377                         ereturn(escontext, (Datum) 0,
 378                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 379                                          errmsg("invalid input syntax for type %s", "bytea")));
 380                 }
 381         }
 382
 383         PG_RETURN_BYTEA_P(result);
 384 }
 385
 386 /*
 387  *              byteaout                - converts to printable representation of byte array
 388  *
 389  *              In the traditional escaped format, non-printable characters are
 390  *              printed as '\nnn' (octal) and '\' as '\\'.
 391  */
 392 Datum
 393 byteaout(PG_FUNCTION_ARGS)
 394 {
 395         bytea      *vlena = PG_GETARG_BYTEA_PP(0);
 396         char       *result;
 397         char       *rp;
 398
 399         if (bytea_output == BYTEA_OUTPUT_HEX)
 400         {
 401                 /* Print hex format */
 402                 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
 403                 *rp++ = '\\';
 404                 *rp++ = 'x';
 405                 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
 406         }
 407         else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
 408         {
 409                 /* Print traditional escaped format */
 410                 char       *vp;
 411                 uint64          len;
 412                 int                     i;
 413
 414                 len = 1;                                /* empty string has 1 char */
 415                 vp = VARDATA_ANY(vlena);
 416                 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
 417                 {
 418                         if (*vp == '\\')
 419                                 len += 2;
 420                         else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 421                                 len += 4;
 422                         else
 423                                 len++;
 424                 }
 425
 426                 /*
 427                  * In principle len can't overflow uint32 if the input fit in 1GB, but
 428                  * for safety let's check rather than relying on palloc's internal
 429                  * check.
 430                  */
 431                 if (len > MaxAllocSize)
 432                         ereport(ERROR,
 433                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 434                                          errmsg_internal("result of bytea output conversion is too large")));
 435                 rp = result = (char *) palloc(len);
 436
 437                 vp = VARDATA_ANY(vlena);
 438                 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
 439                 {
 440                         if (*vp == '\\')
 441                         {
 442                                 *rp++ = '\\';
 443                                 *rp++ = '\\';
 444                         }
 445                         else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 446                         {
 447                                 int                     val;    /* holds unprintable chars */
 448
 449                                 val = *vp;
 450                                 rp[0] = '\\';
 451                                 rp[3] = DIG(val & 07);
 452                                 val >>= 3;
 453                                 rp[2] = DIG(val & 07);
 454                                 val >>= 3;
 455                                 rp[1] = DIG(val & 03);
 456                                 rp += 4;
 457                         }
 458                         else
 459                                 *rp++ = *vp;
 460                 }
 461         }
 462         else
 463         {
 464                 elog(ERROR, "unrecognized bytea_output setting: %d",
 465                          bytea_output);
 466                 rp = result = NULL;             /* keep compiler quiet */
 467         }
 468         *rp = '\0';
 469         PG_RETURN_CSTRING(result);
 470 }
 471
 472 /*
 473  *              bytearecv                       - converts external binary format to bytea
 474  */
 475 Datum
 476 bytearecv(PG_FUNCTION_ARGS)
 477 {
 478         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 479         bytea      *result;
 480         int                     nbytes;
 481
 482         nbytes = buf->len - buf->cursor;
 483         result = (bytea *) palloc(nbytes + VARHDRSZ);
 484         SET_VARSIZE(result, nbytes + VARHDRSZ);
 485         pq_copymsgbytes(buf, VARDATA(result), nbytes);
 486         PG_RETURN_BYTEA_P(result);
 487 }
 488
 489 /*
 490  *              byteasend                       - converts bytea to binary format
 491  *
 492  * This is a special case: just copy the input...
 493  */
 494 Datum
 495 byteasend(PG_FUNCTION_ARGS)
 496 {
 497         bytea      *vlena = PG_GETARG_BYTEA_P_COPY(0);
 498
 499         PG_RETURN_BYTEA_P(vlena);
 500 }
 501
 502 Datum
 503 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
 504 {
 505         StringInfo      state;
 506
 507         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
 508
 509         /* Append the value unless null. */
 510         if (!PG_ARGISNULL(1))
 511         {
 512                 bytea      *value = PG_GETARG_BYTEA_PP(1);
 513
 514                 /* On the first time through, we ignore the delimiter. */
 515                 if (state == NULL)
 516                         state = makeStringAggState(fcinfo);
 517                 else if (!PG_ARGISNULL(2))
 518                 {
 519                         bytea      *delim = PG_GETARG_BYTEA_PP(2);
 520
 521                         appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
 522                 }
 523
 524                 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
 525         }
 526
 527         /*
 528          * The transition type for string_agg() is declared to be "internal",
 529          * which is a pass-by-value type the same size as a pointer.
 530          */
 531         PG_RETURN_POINTER(state);
 532 }
 533
 534 Datum
 535 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
 536 {
 537         StringInfo      state;
 538
 539         /* cannot be called directly because of internal-type argument */
 540         Assert(AggCheckCallContext(fcinfo, NULL));
 541
 542         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
 543
 544         if (state != NULL)
 545         {
 546                 bytea      *result;
 547
 548                 result = (bytea *) palloc(state->len + VARHDRSZ);
 549                 SET_VARSIZE(result, state->len + VARHDRSZ);
 550                 memcpy(VARDATA(result), state->data, state->len);
 551                 PG_RETURN_BYTEA_P(result);
 552         }
 553         else
 554                 PG_RETURN_NULL();
 555 }
 556
 557 /*
 558  *              textin                  - converts "..." to internal representation
 559  */
 560 Datum
 561 textin(PG_FUNCTION_ARGS)
 562 {
 563         char       *inputText = PG_GETARG_CSTRING(0);
 564
 565         PG_RETURN_TEXT_P(cstring_to_text(inputText));
 566 }
 567
 568 /*
 569  *              textout                 - converts internal representation to "..."
 570  */
 571 Datum
 572 textout(PG_FUNCTION_ARGS)
 573 {
 574         Datum           txt = PG_GETARG_DATUM(0);
 575
 576         PG_RETURN_CSTRING(TextDatumGetCString(txt));
 577 }
 578
 579 /*
 580  *              textrecv                        - converts external binary format to text
 581  */
 582 Datum
 583 textrecv(PG_FUNCTION_ARGS)
 584 {
 585         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 586         text       *result;
 587         char       *str;
 588         int                     nbytes;
 589
 590         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 591
 592         result = cstring_to_text_with_len(str, nbytes);
 593         pfree(str);
 594         PG_RETURN_TEXT_P(result);
 595 }
 596
 597 /*
 598  *              textsend                        - converts text to binary format
 599  */
 600 Datum
 601 textsend(PG_FUNCTION_ARGS)
 602 {
 603         text       *t = PG_GETARG_TEXT_PP(0);
 604         StringInfoData buf;
 605
 606         pq_begintypsend(&buf);
 607         pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
 608         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 609 }
 610
 611
 612 /*
 613  *              unknownin                       - converts "..." to internal representation
 614  */
 615 Datum
 616 unknownin(PG_FUNCTION_ARGS)
 617 {
 618         char       *str = PG_GETARG_CSTRING(0);
 619
 620         /* representation is same as cstring */
 621         PG_RETURN_CSTRING(pstrdup(str));
 622 }
 623
 624 /*
 625  *              unknownout                      - converts internal representation to "..."
 626  */
 627 Datum
 628 unknownout(PG_FUNCTION_ARGS)
 629 {
 630         /* representation is same as cstring */
 631         char       *str = PG_GETARG_CSTRING(0);
 632
 633         PG_RETURN_CSTRING(pstrdup(str));
 634 }
 635
 636 /*
 637  *              unknownrecv                     - converts external binary format to unknown
 638  */
 639 Datum
 640 unknownrecv(PG_FUNCTION_ARGS)
 641 {
 642         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 643         char       *str;
 644         int                     nbytes;
 645
 646         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 647         /* representation is same as cstring */
 648         PG_RETURN_CSTRING(str);
 649 }
 650
 651 /*
 652  *              unknownsend                     - converts unknown to binary format
 653  */
 654 Datum
 655 unknownsend(PG_FUNCTION_ARGS)
 656 {
 657         /* representation is same as cstring */
 658         char       *str = PG_GETARG_CSTRING(0);
 659         StringInfoData buf;
 660
 661         pq_begintypsend(&buf);
 662         pq_sendtext(&buf, str, strlen(str));
 663         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 664 }
 665
 666
 667 /* ========== PUBLIC ROUTINES ========== */
 668
 669 /*
 670  * textlen -
 671  *        returns the logical length of a text*
 672  *         (which is less than the VARSIZE of the text*)
 673  */
 674 Datum
 675 textlen(PG_FUNCTION_ARGS)
 676 {
 677         Datum           str = PG_GETARG_DATUM(0);
 678
 679         /* try to avoid decompressing argument */
 680         PG_RETURN_INT32(text_length(str));
 681 }
 682
 683 /*
 684  * text_length -
 685  *      Does the real work for textlen()
 686  *
 687  *      This is broken out so it can be called directly by other string processing
 688  *      functions.  Note that the argument is passed as a Datum, to indicate that
 689  *      it may still be in compressed form.  We can avoid decompressing it at all
 690  *      in some cases.
 691  */
 692 static int32
 693 text_length(Datum str)
 694 {
 695         /* fastpath when max encoding length is one */
 696         if (pg_database_encoding_max_length() == 1)
 697                 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 698         else
 699         {
 700                 text       *t = DatumGetTextPP(str);
 701
 702                 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
 703                                                                                          VARSIZE_ANY_EXHDR(t)));
 704         }
 705 }
 706
 707 /*
 708  * textoctetlen -
 709  *        returns the physical length of a text*
 710  *         (which is less than the VARSIZE of the text*)
 711  */
 712 Datum
 713 textoctetlen(PG_FUNCTION_ARGS)
 714 {
 715         Datum           str = PG_GETARG_DATUM(0);
 716
 717         /* We need not detoast the input at all */
 718         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 719 }
 720
 721 /*
 722  * textcat -
 723  *        takes two text* and returns a text* that is the concatenation of
 724  *        the two.
 725  *
 726  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
 727  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
 728  * Allocate space for output in all cases.
 729  * XXX - thomas 1997-07-10
 730  */
 731 Datum
 732 textcat(PG_FUNCTION_ARGS)
 733 {
 734         text       *t1 = PG_GETARG_TEXT_PP(0);
 735         text       *t2 = PG_GETARG_TEXT_PP(1);
 736
 737         PG_RETURN_TEXT_P(text_catenate(t1, t2));
 738 }
 739
 740 /*
 741  * text_catenate
 742  *      Guts of textcat(), broken out so it can be used by other functions
 743  *
 744  * Arguments can be in short-header form, but not compressed or out-of-line
 745  */
 746 static text *
 747 text_catenate(text *t1, text *t2)
 748 {
 749         text       *result;
 750         int                     len1,
 751                                 len2,
 752                                 len;
 753         char       *ptr;
 754
 755         len1 = VARSIZE_ANY_EXHDR(t1);
 756         len2 = VARSIZE_ANY_EXHDR(t2);
 757
 758         /* paranoia ... probably should throw error instead? */
 759         if (len1 < 0)
 760                 len1 = 0;
 761         if (len2 < 0)
 762                 len2 = 0;
 763
 764         len = len1 + len2 + VARHDRSZ;
 765         result = (text *) palloc(len);
 766
 767         /* Set size of result string... */
 768         SET_VARSIZE(result, len);
 769
 770         /* Fill data field of result string... */
 771         ptr = VARDATA(result);
 772         if (len1 > 0)
 773                 memcpy(ptr, VARDATA_ANY(t1), len1);
 774         if (len2 > 0)
 775                 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
 776
 777         return result;
 778 }
 779
 780 /*
 781  * charlen_to_bytelen()
 782  *      Compute the number of bytes occupied by n characters starting at *p
 783  *
 784  * It is caller's responsibility that there actually are n characters;
 785  * the string need not be null-terminated.
 786  */
 787 static int
 788 charlen_to_bytelen(const char *p, int n)
 789 {
 790         if (pg_database_encoding_max_length() == 1)
 791         {
 792                 /* Optimization for single-byte encodings */
 793                 return n;
 794         }
 795         else
 796         {
 797                 const char *s;
 798
 799                 for (s = p; n > 0; n--)
 800                         s += pg_mblen(s);
 801
 802                 return s - p;
 803         }
 804 }
 805
 806 /*
 807  * text_substr()
 808  * Return a substring starting at the specified position.
 809  * - thomas 1997-12-31
 810  *
 811  * Input:
 812  *      - string
 813  *      - starting position (is one-based)
 814  *      - string length
 815  *
 816  * If the starting position is zero or less, then return from the start of the string
 817  *      adjusting the length to be consistent with the "negative start" per SQL.
 818  * If the length is less than zero, return the remaining string.
 819  *
 820  * Added multibyte support.
 821  * - Tatsuo Ishii 1998-4-21
 822  * Changed behavior if starting position is less than one to conform to SQL behavior.
 823  * Formerly returned the entire string; now returns a portion.
 824  * - Thomas Lockhart 1998-12-10
 825  * Now uses faster TOAST-slicing interface
 826  * - John Gray 2002-02-22
 827  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
 828  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
 829  * error; if E < 1, return '', not entire string). Fixed MB related bug when
 830  * S > LC and < LC + 4 sometimes garbage characters are returned.
 831  * - Joe Conway 2002-08-10
 832  */
 833 Datum
 834 text_substr(PG_FUNCTION_ARGS)
 835 {
 836         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
 837                                                                         PG_GETARG_INT32(1),
 838                                                                         PG_GETARG_INT32(2),
 839                                                                         false));
 840 }
 841
 842 /*
 843  * text_substr_no_len -
 844  *        Wrapper to avoid opr_sanity failure due to
 845  *        one function accepting a different number of args.
 846  */
 847 Datum
 848 text_substr_no_len(PG_FUNCTION_ARGS)
 849 {
 850         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
 851                                                                         PG_GETARG_INT32(1),
 852                                                                         -1, true));
 853 }
 854
 855 /*
 856  * text_substring -
 857  *      Does the real work for text_substr() and text_substr_no_len()
 858  *
 859  *      This is broken out so it can be called directly by other string processing
 860  *      functions.  Note that the argument is passed as a Datum, to indicate that
 861  *      it may still be in compressed/toasted form.  We can avoid detoasting all
 862  *      of it in some cases.
 863  *
 864  *      The result is always a freshly palloc'd datum.
 865  */
 866 static text *
 867 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 868 {
 869         int32           eml = pg_database_encoding_max_length();
 870         int32           S = start;              /* start position */
 871         int32           S1;                             /* adjusted start position */
 872         int32           L1;                             /* adjusted substring length */
 873         int32           E;                              /* end position */
 874
 875         /*
 876          * SQL99 says S can be zero or negative, but we still must fetch from the
 877          * start of the string.
 878          */
 879         S1 = Max(S, 1);
 880
 881         /* life is easy if the encoding max length is 1 */
 882         if (eml == 1)
 883         {
 884                 if (length_not_specified)       /* special case - get length to end of
 885                                                                          * string */
 886                         L1 = -1;
 887                 else if (length < 0)
 888                 {
 889                         /* SQL99 says to throw an error for E < S, i.e., negative length */
 890                         ereport(ERROR,
 891                                         (errcode(ERRCODE_SUBSTRING_ERROR),
 892                                          errmsg("negative substring length not allowed")));
 893                         L1 = -1;                        /* silence stupider compilers */
 894                 }
 895                 else if (pg_add_s32_overflow(S, length, &E))
 896                 {
 897                         /*
 898                          * L could be large enough for S + L to overflow, in which case
 899                          * the substring must run to end of string.
 900                          */
 901                         L1 = -1;
 902                 }
 903                 else
 904                 {
 905                         /*
 906                          * A zero or negative value for the end position can happen if the
 907                          * start was negative or one. SQL99 says to return a zero-length
 908                          * string.
 909                          */
 910                         if (E < 1)
 911                                 return cstring_to_text("");
 912
 913                         L1 = E - S1;
 914                 }
 915
 916                 /*
 917                  * If the start position is past the end of the string, SQL99 says to
 918                  * return a zero-length string -- DatumGetTextPSlice() will do that
 919                  * for us.  We need only convert S1 to zero-based starting position.
 920                  */
 921                 return DatumGetTextPSlice(str, S1 - 1, L1);
 922         }
 923         else if (eml > 1)
 924         {
 925                 /*
 926                  * When encoding max length is > 1, we can't get LC without
 927                  * detoasting, so we'll grab a conservatively large slice now and go
 928                  * back later to do the right thing
 929                  */
 930                 int32           slice_start;
 931                 int32           slice_size;
 932                 int32           slice_strlen;
 933                 text       *slice;
 934                 int32           E1;
 935                 int32           i;
 936                 char       *p;
 937                 char       *s;
 938                 text       *ret;
 939
 940                 /*
 941                  * We need to start at position zero because there is no way to know
 942                  * in advance which byte offset corresponds to the supplied start
 943                  * position.
 944                  */
 945                 slice_start = 0;
 946
 947                 if (length_not_specified)       /* special case - get length to end of
 948                                                                          * string */
 949                         slice_size = L1 = -1;
 950                 else if (length < 0)
 951                 {
 952                         /* SQL99 says to throw an error for E < S, i.e., negative length */
 953                         ereport(ERROR,
 954                                         (errcode(ERRCODE_SUBSTRING_ERROR),
 955                                          errmsg("negative substring length not allowed")));
 956                         slice_size = L1 = -1;   /* silence stupider compilers */
 957                 }
 958                 else if (pg_add_s32_overflow(S, length, &E))
 959                 {
 960                         /*
 961                          * L could be large enough for S + L to overflow, in which case
 962                          * the substring must run to end of string.
 963                          */
 964                         slice_size = L1 = -1;
 965                 }
 966                 else
 967                 {
 968                         /*
 969                          * A zero or negative value for the end position can happen if the
 970                          * start was negative or one. SQL99 says to return a zero-length
 971                          * string.
 972                          */
 973                         if (E < 1)
 974                                 return cstring_to_text("");
 975
 976                         /*
 977                          * if E is past the end of the string, the tuple toaster will
 978                          * truncate the length for us
 979                          */
 980                         L1 = E - S1;
 981
 982                         /*
 983                          * Total slice size in bytes can't be any longer than the start
 984                          * position plus substring length times the encoding max length.
 985                          * If that overflows, we can just use -1.
 986                          */
 987                         if (pg_mul_s32_overflow(E, eml, &slice_size))
 988                                 slice_size = -1;
 989                 }
 990
 991                 /*
 992                  * If we're working with an untoasted source, no need to do an extra
 993                  * copying step.
 994                  */
 995                 if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
 996                         VARATT_IS_EXTERNAL(DatumGetPointer(str)))
 997                         slice = DatumGetTextPSlice(str, slice_start, slice_size);
 998                 else
 999                         slice = (text *) DatumGetPointer(str);
1000
1001                 /* see if we got back an empty string */
1002                 if (VARSIZE_ANY_EXHDR(slice) == 0)
1003                 {
1004                         if (slice != (text *) DatumGetPointer(str))
1005                                 pfree(slice);
1006                         return cstring_to_text("");
1007                 }
1008
1009                 /* Now we can get the actual length of the slice in MB characters */
1010                 slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1011                                                                                         VARSIZE_ANY_EXHDR(slice));
1012
1013                 /*
1014                  * Check that the start position wasn't > slice_strlen. If so, SQL99
1015                  * says to return a zero-length string.
1016                  */
1017                 if (S1 > slice_strlen)
1018                 {
1019                         if (slice != (text *) DatumGetPointer(str))
1020                                 pfree(slice);
1021                         return cstring_to_text("");
1022                 }
1023
1024                 /*
1025                  * Adjust L1 and E1 now that we know the slice string length. Again
1026                  * remember that S1 is one based, and slice_start is zero based.
1027                  */
1028                 if (L1 > -1)
1029                         E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1030                 else
1031                         E1 = slice_start + 1 + slice_strlen;
1032
1033                 /*
1034                  * Find the start position in the slice; remember S1 is not zero based
1035                  */
1036                 p = VARDATA_ANY(slice);
1037                 for (i = 0; i < S1 - 1; i++)
1038                         p += pg_mblen(p);
1039
1040                 /* hang onto a pointer to our start position */
1041                 s = p;
1042
1043                 /*
1044                  * Count the actual bytes used by the substring of the requested
1045                  * length.
1046                  */
1047                 for (i = S1; i < E1; i++)
1048                         p += pg_mblen(p);
1049
1050                 ret = (text *) palloc(VARHDRSZ + (p - s));
1051                 SET_VARSIZE(ret, VARHDRSZ + (p - s));
1052                 memcpy(VARDATA(ret), s, (p - s));
1053
1054                 if (slice != (text *) DatumGetPointer(str))
1055                         pfree(slice);
1056
1057                 return ret;
1058         }
1059         else
1060                 elog(ERROR, "invalid backend encoding: encoding max length < 1");
1061
1062         /* not reached: suppress compiler warning */
1063         return NULL;
1064 }
1065
1066 /*
1067  * textoverlay
1068  *      Replace specified substring of first string with second
1069  *
1070  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1071  * This code is a direct implementation of what the standard says.
1072  */
1073 Datum
1074 textoverlay(PG_FUNCTION_ARGS)
1075 {
1076         text       *t1 = PG_GETARG_TEXT_PP(0);
1077         text       *t2 = PG_GETARG_TEXT_PP(1);
1078         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
1079         int                     sl = PG_GETARG_INT32(3);        /* substring length */
1080
1081         PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1082 }
1083
1084 Datum
1085 textoverlay_no_len(PG_FUNCTION_ARGS)
1086 {
1087         text       *t1 = PG_GETARG_TEXT_PP(0);
1088         text       *t2 = PG_GETARG_TEXT_PP(1);
1089         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
1090         int                     sl;
1091
1092         sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
1093         PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1094 }
1095
1096 static text *
1097 text_overlay(text *t1, text *t2, int sp, int sl)
1098 {
1099         text       *result;
1100         text       *s1;
1101         text       *s2;
1102         int                     sp_pl_sl;
1103
1104         /*
1105          * Check for possible integer-overflow cases.  For negative sp, throw a
1106          * "substring length" error because that's what should be expected
1107          * according to the spec's definition of OVERLAY().
1108          */
1109         if (sp <= 0)
1110                 ereport(ERROR,
1111                                 (errcode(ERRCODE_SUBSTRING_ERROR),
1112                                  errmsg("negative substring length not allowed")));
1113         if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1114                 ereport(ERROR,
1115                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1116                                  errmsg("integer out of range")));
1117
1118         s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1119         s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1120         result = text_catenate(s1, t2);
1121         result = text_catenate(result, s2);
1122
1123         return result;
1124 }
1125
1126 /*
1127  * textpos -
1128  *        Return the position of the specified substring.
1129  *        Implements the SQL POSITION() function.
1130  *        Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1131  * - thomas 1997-07-27
1132  */
1133 Datum
1134 textpos(PG_FUNCTION_ARGS)
1135 {
1136         text       *str = PG_GETARG_TEXT_PP(0);
1137         text       *search_str = PG_GETARG_TEXT_PP(1);
1138
1139         PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1140 }
1141
1142 /*
1143  * text_position -
1144  *      Does the real work for textpos()
1145  *
1146  * Inputs:
1147  *              t1 - string to be searched
1148  *              t2 - pattern to match within t1
1149  * Result:
1150  *              Character index of the first matched char, starting from 1,
1151  *              or 0 if no match.
1152  *
1153  *      This is broken out so it can be called directly by other string processing
1154  *      functions.
1155  */
1156 static int
1157 text_position(text *t1, text *t2, Oid collid)
1158 {
1159         TextPositionState state;
1160         int                     result;
1161
1162         /* Empty needle always matches at position 1 */
1163         if (VARSIZE_ANY_EXHDR(t2) < 1)
1164                 return 1;
1165
1166         /* Otherwise, can't match if haystack is shorter than needle */
1167         if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1168                 return 0;
1169
1170         text_position_setup(t1, t2, collid, &state);
1171         if (!text_position_next(&state))
1172                 result = 0;
1173         else
1174                 result = text_position_get_match_pos(&state);
1175         text_position_cleanup(&state);
1176         return result;
1177 }
1178
1179
1180 /*
1181  * text_position_setup, text_position_next, text_position_cleanup -
1182  *      Component steps of text_position()
1183  *
1184  * These are broken out so that a string can be efficiently searched for
1185  * multiple occurrences of the same pattern.  text_position_next may be
1186  * called multiple times, and it advances to the next match on each call.
1187  * text_position_get_match_ptr() and text_position_get_match_pos() return
1188  * a pointer or 1-based character position of the last match, respectively.
1189  *
1190  * The "state" variable is normally just a local variable in the caller.
1191  *
1192  * NOTE: text_position_next skips over the matched portion.  For example,
1193  * searching for "xx" in "xxx" returns only one match, not two.
1194  */
1195
1196 static void
1197 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1198 {
1199         int                     len1 = VARSIZE_ANY_EXHDR(t1);
1200         int                     len2 = VARSIZE_ANY_EXHDR(t2);
1201         pg_locale_t mylocale = 0;
1202
1203         check_collation_set(collid);
1204
1205         if (!lc_collate_is_c(collid))
1206                 mylocale = pg_newlocale_from_collation(collid);
1207
1208         if (mylocale && !mylocale->deterministic)
1209                 ereport(ERROR,
1210                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1211                                  errmsg("nondeterministic collations are not supported for substring searches")));
1212
1213         Assert(len1 > 0);
1214         Assert(len2 > 0);
1215
1216         /*
1217          * Even with a multi-byte encoding, we perform the search using the raw
1218          * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
1219          * because in UTF-8 the byte sequence of one character cannot contain
1220          * another character.  For other multi-byte encodings, we do the search
1221          * initially as a simple byte search, ignoring multibyte issues, but
1222          * verify afterwards that the match we found is at a character boundary,
1223          * and continue the search if it was a false match.
1224          */
1225         if (pg_database_encoding_max_length() == 1)
1226                 state->is_multibyte_char_in_char = false;
1227         else if (GetDatabaseEncoding() == PG_UTF8)
1228                 state->is_multibyte_char_in_char = false;
1229         else
1230                 state->is_multibyte_char_in_char = true;
1231
1232         state->str1 = VARDATA_ANY(t1);
1233         state->str2 = VARDATA_ANY(t2);
1234         state->len1 = len1;
1235         state->len2 = len2;
1236         state->last_match = NULL;
1237         state->refpoint = state->str1;
1238         state->refpos = 0;
1239
1240         /*
1241          * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1242          * notes we use the terminology that the "haystack" is the string to be
1243          * searched (t1) and the "needle" is the pattern being sought (t2).
1244          *
1245          * If the needle is empty or bigger than the haystack then there is no
1246          * point in wasting cycles initializing the table.  We also choose not to
1247          * use B-M-H for needles of length 1, since the skip table can't possibly
1248          * save anything in that case.
1249          */
1250         if (len1 >= len2 && len2 > 1)
1251         {
1252                 int                     searchlength = len1 - len2;
1253                 int                     skiptablemask;
1254                 int                     last;
1255                 int                     i;
1256                 const char *str2 = state->str2;
1257
1258                 /*
1259                  * First we must determine how much of the skip table to use.  The
1260                  * declaration of TextPositionState allows up to 256 elements, but for
1261                  * short search problems we don't really want to have to initialize so
1262                  * many elements --- it would take too long in comparison to the
1263                  * actual search time.  So we choose a useful skip table size based on
1264                  * the haystack length minus the needle length.  The closer the needle
1265                  * length is to the haystack length the less useful skipping becomes.
1266                  *
1267                  * Note: since we use bit-masking to select table elements, the skip
1268                  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1269                  */
1270                 if (searchlength < 16)
1271                         skiptablemask = 3;
1272                 else if (searchlength < 64)
1273                         skiptablemask = 7;
1274                 else if (searchlength < 128)
1275                         skiptablemask = 15;
1276                 else if (searchlength < 512)
1277                         skiptablemask = 31;
1278                 else if (searchlength < 2048)
1279                         skiptablemask = 63;
1280                 else if (searchlength < 4096)
1281                         skiptablemask = 127;
1282                 else
1283                         skiptablemask = 255;
1284                 state->skiptablemask = skiptablemask;
1285
1286                 /*
1287                  * Initialize the skip table.  We set all elements to the needle
1288                  * length, since this is the correct skip distance for any character
1289                  * not found in the needle.
1290                  */
1291                 for (i = 0; i <= skiptablemask; i++)
1292                         state->skiptable[i] = len2;
1293
1294                 /*
1295                  * Now examine the needle.  For each character except the last one,
1296                  * set the corresponding table element to the appropriate skip
1297                  * distance.  Note that when two characters share the same skip table
1298                  * entry, the one later in the needle must determine the skip
1299                  * distance.
1300                  */
1301                 last = len2 - 1;
1302
1303                 for (i = 0; i < last; i++)
1304                         state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1305         }
1306 }
1307
1308 /*
1309  * Advance to the next match, starting from the end of the previous match
1310  * (or the beginning of the string, on first call).  Returns true if a match
1311  * is found.
1312  *
1313  * Note that this refuses to match an empty-string needle.  Most callers
1314  * will have handled that case specially and we'll never see it here.
1315  */
1316 static bool
1317 text_position_next(TextPositionState *state)
1318 {
1319         int                     needle_len = state->len2;
1320         char       *start_ptr;
1321         char       *matchptr;
1322
1323         if (needle_len <= 0)
1324                 return false;                   /* result for empty pattern */
1325
1326         /* Start from the point right after the previous match. */
1327         if (state->last_match)
1328                 start_ptr = state->last_match + needle_len;
1329         else
1330                 start_ptr = state->str1;
1331
1332 retry:
1333         matchptr = text_position_next_internal(start_ptr, state);
1334
1335         if (!matchptr)
1336                 return false;
1337
1338         /*
1339          * Found a match for the byte sequence.  If this is a multibyte encoding,
1340          * where one character's byte sequence can appear inside a longer
1341          * multi-byte character, we need to verify that the match was at a
1342          * character boundary, not in the middle of a multi-byte character.
1343          */
1344         if (state->is_multibyte_char_in_char)
1345         {
1346                 /* Walk one character at a time, until we reach the match. */
1347
1348                 /* the search should never move backwards. */
1349                 Assert(state->refpoint <= matchptr);
1350
1351                 while (state->refpoint < matchptr)
1352                 {
1353                         /* step to next character. */
1354                         state->refpoint += pg_mblen(state->refpoint);
1355                         state->refpos++;
1356
1357                         /*
1358                          * If we stepped over the match's start position, then it was a
1359                          * false positive, where the byte sequence appeared in the middle
1360                          * of a multi-byte character.  Skip it, and continue the search at
1361                          * the next character boundary.
1362                          */
1363                         if (state->refpoint > matchptr)
1364                         {
1365                                 start_ptr = state->refpoint;
1366                                 goto retry;
1367                         }
1368                 }
1369         }
1370
1371         state->last_match = matchptr;
1372         return true;
1373 }
1374
1375 /*
1376  * Subroutine of text_position_next().  This searches for the raw byte
1377  * sequence, ignoring any multi-byte encoding issues.  Returns the first
1378  * match starting at 'start_ptr', or NULL if no match is found.
1379  */
1380 static char *
1381 text_position_next_internal(char *start_ptr, TextPositionState *state)
1382 {
1383         int                     haystack_len = state->len1;
1384         int                     needle_len = state->len2;
1385         int                     skiptablemask = state->skiptablemask;
1386         const char *haystack = state->str1;
1387         const char *needle = state->str2;
1388         const char *haystack_end = &haystack[haystack_len];
1389         const char *hptr;
1390
1391         Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1392
1393         if (needle_len == 1)
1394         {
1395                 /* No point in using B-M-H for a one-character needle */
1396                 char            nchar = *needle;
1397
1398                 hptr = start_ptr;
1399                 while (hptr < haystack_end)
1400                 {
1401                         if (*hptr == nchar)
1402                                 return (char *) hptr;
1403                         hptr++;
1404                 }
1405         }
1406         else
1407         {
1408                 const char *needle_last = &needle[needle_len - 1];
1409
1410                 /* Start at startpos plus the length of the needle */
1411                 hptr = start_ptr + needle_len - 1;
1412                 while (hptr < haystack_end)
1413                 {
1414                         /* Match the needle scanning *backward* */
1415                         const char *nptr;
1416                         const char *p;
1417
1418                         nptr = needle_last;
1419                         p = hptr;
1420                         while (*nptr == *p)
1421                         {
1422                                 /* Matched it all?      If so, return 1-based position */
1423                                 if (nptr == needle)
1424                                         return (char *) p;
1425                                 nptr--, p--;
1426                         }
1427
1428                         /*
1429                          * No match, so use the haystack char at hptr to decide how far to
1430                          * advance.  If the needle had any occurrence of that character
1431                          * (or more precisely, one sharing the same skiptable entry)
1432                          * before its last character, then we advance far enough to align
1433                          * the last such needle character with that haystack position.
1434                          * Otherwise we can advance by the whole needle length.
1435                          */
1436                         hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1437                 }
1438         }
1439
1440         return 0;                                       /* not found */
1441 }
1442
1443 /*
1444  * Return a pointer to the current match.
1445  *
1446  * The returned pointer points into the original haystack string.
1447  */
1448 static char *
1449 text_position_get_match_ptr(TextPositionState *state)
1450 {
1451         return state->last_match;
1452 }
1453
1454 /*
1455  * Return the offset of the current match.
1456  *
1457  * The offset is in characters, 1-based.
1458  */
1459 static int
1460 text_position_get_match_pos(TextPositionState *state)
1461 {
1462         /* Convert the byte position to char position. */
1463         state->refpos += pg_mbstrlen_with_len(state->refpoint,
1464                                                                                   state->last_match - state->refpoint);
1465         state->refpoint = state->last_match;
1466         return state->refpos + 1;
1467 }
1468
1469 /*
1470  * Reset search state to the initial state installed by text_position_setup.
1471  *
1472  * The next call to text_position_next will search from the beginning
1473  * of the string.
1474  */
1475 static void
1476 text_position_reset(TextPositionState *state)
1477 {
1478         state->last_match = NULL;
1479         state->refpoint = state->str1;
1480         state->refpos = 0;
1481 }
1482
1483 static void
1484 text_position_cleanup(TextPositionState *state)
1485 {
1486         /* no cleanup needed */
1487 }
1488
1489
1490 static void
1491 check_collation_set(Oid collid)
1492 {
1493         if (!OidIsValid(collid))
1494         {
1495                 /*
1496                  * This typically means that the parser could not resolve a conflict
1497                  * of implicit collations, so report it that way.
1498                  */
1499                 ereport(ERROR,
1500                                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1501                                  errmsg("could not determine which collation to use for string comparison"),
1502                                  errhint("Use the COLLATE clause to set the collation explicitly.")));
1503         }
1504 }
1505
1506 /* varstr_cmp()
1507  * Comparison function for text strings with given lengths.
1508  * Includes locale support, but must copy strings to temporary memory
1509  *      to allow null-termination for inputs to strcoll().
1510  * Returns an integer less than, equal to, or greater than zero, indicating
1511  * whether arg1 is less than, equal to, or greater than arg2.
1512  *
1513  * Note: many functions that depend on this are marked leakproof; therefore,
1514  * avoid reporting the actual contents of the input when throwing errors.
1515  * All errors herein should be things that can't happen except on corrupt
1516  * data, anyway; otherwise we will have trouble with indexing strings that
1517  * would cause them.
1518  */
1519 int
1520 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1521 {
1522         int                     result;
1523
1524         check_collation_set(collid);
1525
1526         /*
1527          * Unfortunately, there is no strncoll(), so in the non-C locale case we
1528          * have to do some memory copying.  This turns out to be significantly
1529          * slower, so we optimize the case where LC_COLLATE is C.  We also try to
1530          * optimize relatively-short strings by avoiding palloc/pfree overhead.
1531          */
1532         if (lc_collate_is_c(collid))
1533         {
1534                 result = memcmp(arg1, arg2, Min(len1, len2));
1535                 if ((result == 0) && (len1 != len2))
1536                         result = (len1 < len2) ? -1 : 1;
1537         }
1538         else
1539         {
1540                 char            a1buf[TEXTBUFLEN];
1541                 char            a2buf[TEXTBUFLEN];
1542                 char       *a1p,
1543                                    *a2p;
1544                 pg_locale_t mylocale;
1545
1546                 mylocale = pg_newlocale_from_collation(collid);
1547
1548                 /*
1549                  * memcmp() can't tell us which of two unequal strings sorts first,
1550                  * but it's a cheap way to tell if they're equal.  Testing shows that
1551                  * memcmp() followed by strcoll() is only trivially slower than
1552                  * strcoll() by itself, so we don't lose much if this doesn't work out
1553                  * very often, and if it does - for example, because there are many
1554                  * equal strings in the input - then we win big by avoiding expensive
1555                  * collation-aware comparisons.
1556                  */
1557                 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1558                         return 0;
1559
1560 #ifdef WIN32
1561                 /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1562                 if (GetDatabaseEncoding() == PG_UTF8
1563                         && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1564                 {
1565                         int                     a1len;
1566                         int                     a2len;
1567                         int                     r;
1568
1569                         if (len1 >= TEXTBUFLEN / 2)
1570                         {
1571                                 a1len = len1 * 2 + 2;
1572                                 a1p = palloc(a1len);
1573                         }
1574                         else
1575                         {
1576                                 a1len = TEXTBUFLEN;
1577                                 a1p = a1buf;
1578                         }
1579                         if (len2 >= TEXTBUFLEN / 2)
1580                         {
1581                                 a2len = len2 * 2 + 2;
1582                                 a2p = palloc(a2len);
1583                         }
1584                         else
1585                         {
1586                                 a2len = TEXTBUFLEN;
1587                                 a2p = a2buf;
1588                         }
1589
1590                         /* stupid Microsloth API does not work for zero-length input */
1591                         if (len1 == 0)
1592                                 r = 0;
1593                         else
1594                         {
1595                                 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1596                                                                                 (LPWSTR) a1p, a1len / 2);
1597                                 if (!r)
1598                                         ereport(ERROR,
1599                                                         (errmsg("could not convert string to UTF-16: error code %lu",
1600                                                                         GetLastError())));
1601                         }
1602                         ((LPWSTR) a1p)[r] = 0;
1603
1604                         if (len2 == 0)
1605                                 r = 0;
1606                         else
1607                         {
1608                                 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1609                                                                                 (LPWSTR) a2p, a2len / 2);
1610                                 if (!r)
1611                                         ereport(ERROR,
1612                                                         (errmsg("could not convert string to UTF-16: error code %lu",
1613                                                                         GetLastError())));
1614                         }
1615                         ((LPWSTR) a2p)[r] = 0;
1616
1617                         errno = 0;
1618 #ifdef HAVE_LOCALE_T
1619                         if (mylocale)
1620                                 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1621                         else
1622 #endif
1623                                 result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1624                         if (result == 2147483647)       /* _NLSCMPERROR; missing from mingw
1625                                                                                  * headers */
1626                                 ereport(ERROR,
1627                                                 (errmsg("could not compare Unicode strings: %m")));
1628
1629                         /* Break tie if necessary. */
1630                         if (result == 0 &&
1631                                 (!mylocale || mylocale->deterministic))
1632                         {
1633                                 result = memcmp(arg1, arg2, Min(len1, len2));
1634                                 if ((result == 0) && (len1 != len2))
1635                                         result = (len1 < len2) ? -1 : 1;
1636                         }
1637
1638                         if (a1p != a1buf)
1639                                 pfree(a1p);
1640                         if (a2p != a2buf)
1641                                 pfree(a2p);
1642
1643                         return result;
1644                 }
1645 #endif                                                  /* WIN32 */
1646
1647                 if (len1 >= TEXTBUFLEN)
1648                         a1p = (char *) palloc(len1 + 1);
1649                 else
1650                         a1p = a1buf;
1651                 if (len2 >= TEXTBUFLEN)
1652                         a2p = (char *) palloc(len2 + 1);
1653                 else
1654                         a2p = a2buf;
1655
1656                 memcpy(a1p, arg1, len1);
1657                 a1p[len1] = '\0';
1658                 memcpy(a2p, arg2, len2);
1659                 a2p[len2] = '\0';
1660
1661                 if (mylocale)
1662                 {
1663                         if (mylocale->provider == COLLPROVIDER_ICU)
1664                         {
1665 #ifdef USE_ICU
1666 #ifdef HAVE_UCOL_STRCOLLUTF8
1667                                 if (GetDatabaseEncoding() == PG_UTF8)
1668                                 {
1669                                         UErrorCode      status;
1670
1671                                         status = U_ZERO_ERROR;
1672                                         result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1673                                                                                           arg1, len1,
1674                                                                                           arg2, len2,
1675                                                                                           &status);
1676                                         if (U_FAILURE(status))
1677                                                 ereport(ERROR,
1678                                                                 (errmsg("collation failed: %s", u_errorName(status))));
1679                                 }
1680                                 else
1681 #endif
1682                                 {
1683                                         int32_t         ulen1,
1684                                                                 ulen2;
1685                                         UChar      *uchar1,
1686                                                            *uchar2;
1687
1688                                         ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1689                                         ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1690
1691                                         result = ucol_strcoll(mylocale->info.icu.ucol,
1692                                                                                   uchar1, ulen1,
1693                                                                                   uchar2, ulen2);
1694
1695                                         pfree(uchar1);
1696                                         pfree(uchar2);
1697                                 }
1698 #else                                                   /* not USE_ICU */
1699                                 /* shouldn't happen */
1700                                 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1701 #endif                                                  /* not USE_ICU */
1702                         }
1703                         else
1704                         {
1705 #ifdef HAVE_LOCALE_T
1706                                 result = strcoll_l(a1p, a2p, mylocale->info.lt);
1707 #else
1708                                 /* shouldn't happen */
1709                                 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1710 #endif
1711                         }
1712                 }
1713                 else
1714                         result = strcoll(a1p, a2p);
1715
1716                 /* Break tie if necessary. */
1717                 if (result == 0 &&
1718                         (!mylocale || mylocale->deterministic))
1719                         result = strcmp(a1p, a2p);
1720
1721                 if (a1p != a1buf)
1722                         pfree(a1p);
1723                 if (a2p != a2buf)
1724                         pfree(a2p);
1725         }
1726
1727         return result;
1728 }
1729
1730 /* text_cmp()
1731  * Internal comparison function for text strings.
1732  * Returns -1, 0 or 1
1733  */
1734 static int
1735 text_cmp(text *arg1, text *arg2, Oid collid)
1736 {
1737         char       *a1p,
1738                            *a2p;
1739         int                     len1,
1740                                 len2;
1741
1742         a1p = VARDATA_ANY(arg1);
1743         a2p = VARDATA_ANY(arg2);
1744
1745         len1 = VARSIZE_ANY_EXHDR(arg1);
1746         len2 = VARSIZE_ANY_EXHDR(arg2);
1747
1748         return varstr_cmp(a1p, len1, a2p, len2, collid);
1749 }
1750
1751 /*
1752  * Comparison functions for text strings.
1753  *
1754  * Note: btree indexes need these routines not to leak memory; therefore,
1755  * be careful to free working copies of toasted datums.  Most places don't
1756  * need to be so careful.
1757  */
1758
1759 Datum
1760 texteq(PG_FUNCTION_ARGS)
1761 {
1762         Oid                     collid = PG_GET_COLLATION();
1763         bool            locale_is_c = false;
1764         pg_locale_t mylocale = 0;
1765         bool            result;
1766
1767         check_collation_set(collid);
1768
1769         if (lc_collate_is_c(collid))
1770                 locale_is_c = true;
1771         else
1772                 mylocale = pg_newlocale_from_collation(collid);
1773
1774         if (locale_is_c || !mylocale || mylocale->deterministic)
1775         {
1776                 Datum           arg1 = PG_GETARG_DATUM(0);
1777                 Datum           arg2 = PG_GETARG_DATUM(1);
1778                 Size            len1,
1779                                         len2;
1780
1781                 /*
1782                  * Since we only care about equality or not-equality, we can avoid all
1783                  * the expense of strcoll() here, and just do bitwise comparison.  In
1784                  * fact, we don't even have to do a bitwise comparison if we can show
1785                  * the lengths of the strings are unequal; which might save us from
1786                  * having to detoast one or both values.
1787                  */
1788                 len1 = toast_raw_datum_size(arg1);
1789                 len2 = toast_raw_datum_size(arg2);
1790                 if (len1 != len2)
1791                         result = false;
1792                 else
1793                 {
1794                         text       *targ1 = DatumGetTextPP(arg1);
1795                         text       *targ2 = DatumGetTextPP(arg2);
1796
1797                         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1798                                                          len1 - VARHDRSZ) == 0);
1799
1800                         PG_FREE_IF_COPY(targ1, 0);
1801                         PG_FREE_IF_COPY(targ2, 1);
1802                 }
1803         }
1804         else
1805         {
1806                 text       *arg1 = PG_GETARG_TEXT_PP(0);
1807                 text       *arg2 = PG_GETARG_TEXT_PP(1);
1808
1809                 result = (text_cmp(arg1, arg2, collid) == 0);
1810
1811                 PG_FREE_IF_COPY(arg1, 0);
1812                 PG_FREE_IF_COPY(arg2, 1);
1813         }
1814
1815         PG_RETURN_BOOL(result);
1816 }
1817
1818 Datum
1819 textne(PG_FUNCTION_ARGS)
1820 {
1821         Oid                     collid = PG_GET_COLLATION();
1822         bool            locale_is_c = false;
1823         pg_locale_t mylocale = 0;
1824         bool            result;
1825
1826         check_collation_set(collid);
1827
1828         if (lc_collate_is_c(collid))
1829                 locale_is_c = true;
1830         else
1831                 mylocale = pg_newlocale_from_collation(collid);
1832
1833         if (locale_is_c || !mylocale || mylocale->deterministic)
1834         {
1835                 Datum           arg1 = PG_GETARG_DATUM(0);
1836                 Datum           arg2 = PG_GETARG_DATUM(1);
1837                 Size            len1,
1838                                         len2;
1839
1840                 /* See comment in texteq() */
1841                 len1 = toast_raw_datum_size(arg1);
1842                 len2 = toast_raw_datum_size(arg2);
1843                 if (len1 != len2)
1844                         result = true;
1845                 else
1846                 {
1847                         text       *targ1 = DatumGetTextPP(arg1);
1848                         text       *targ2 = DatumGetTextPP(arg2);
1849
1850                         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1851                                                          len1 - VARHDRSZ) != 0);
1852
1853                         PG_FREE_IF_COPY(targ1, 0);
1854                         PG_FREE_IF_COPY(targ2, 1);
1855                 }
1856         }
1857         else
1858         {
1859                 text       *arg1 = PG_GETARG_TEXT_PP(0);
1860                 text       *arg2 = PG_GETARG_TEXT_PP(1);
1861
1862                 result = (text_cmp(arg1, arg2, collid) != 0);
1863
1864                 PG_FREE_IF_COPY(arg1, 0);
1865                 PG_FREE_IF_COPY(arg2, 1);
1866         }
1867
1868         PG_RETURN_BOOL(result);
1869 }
1870
1871 Datum
1872 text_lt(PG_FUNCTION_ARGS)
1873 {
1874         text       *arg1 = PG_GETARG_TEXT_PP(0);
1875         text       *arg2 = PG_GETARG_TEXT_PP(1);
1876         bool            result;
1877
1878         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1879
1880         PG_FREE_IF_COPY(arg1, 0);
1881         PG_FREE_IF_COPY(arg2, 1);
1882
1883         PG_RETURN_BOOL(result);
1884 }
1885
1886 Datum
1887 text_le(PG_FUNCTION_ARGS)
1888 {
1889         text       *arg1 = PG_GETARG_TEXT_PP(0);
1890         text       *arg2 = PG_GETARG_TEXT_PP(1);
1891         bool            result;
1892
1893         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1894
1895         PG_FREE_IF_COPY(arg1, 0);
1896         PG_FREE_IF_COPY(arg2, 1);
1897
1898         PG_RETURN_BOOL(result);
1899 }
1900
1901 Datum
1902 text_gt(PG_FUNCTION_ARGS)
1903 {
1904         text       *arg1 = PG_GETARG_TEXT_PP(0);
1905         text       *arg2 = PG_GETARG_TEXT_PP(1);
1906         bool            result;
1907
1908         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1909
1910         PG_FREE_IF_COPY(arg1, 0);
1911         PG_FREE_IF_COPY(arg2, 1);
1912
1913         PG_RETURN_BOOL(result);
1914 }
1915
1916 Datum
1917 text_ge(PG_FUNCTION_ARGS)
1918 {
1919         text       *arg1 = PG_GETARG_TEXT_PP(0);
1920         text       *arg2 = PG_GETARG_TEXT_PP(1);
1921         bool            result;
1922
1923         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1924
1925         PG_FREE_IF_COPY(arg1, 0);
1926         PG_FREE_IF_COPY(arg2, 1);
1927
1928         PG_RETURN_BOOL(result);
1929 }
1930
1931 Datum
1932 text_starts_with(PG_FUNCTION_ARGS)
1933 {
1934         Datum           arg1 = PG_GETARG_DATUM(0);
1935         Datum           arg2 = PG_GETARG_DATUM(1);
1936         Oid                     collid = PG_GET_COLLATION();
1937         pg_locale_t mylocale = 0;
1938         bool            result;
1939         Size            len1,
1940                                 len2;
1941
1942         check_collation_set(collid);
1943
1944         if (!lc_collate_is_c(collid))
1945                 mylocale = pg_newlocale_from_collation(collid);
1946
1947         if (mylocale && !mylocale->deterministic)
1948                 ereport(ERROR,
1949                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1950                                  errmsg("nondeterministic collations are not supported for substring searches")));
1951
1952         len1 = toast_raw_datum_size(arg1);
1953         len2 = toast_raw_datum_size(arg2);
1954         if (len2 > len1)
1955                 result = false;
1956         else
1957         {
1958                 text       *targ1 = text_substring(arg1, 1, len2, false);
1959                 text       *targ2 = DatumGetTextPP(arg2);
1960
1961                 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1962                                                  VARSIZE_ANY_EXHDR(targ2)) == 0);
1963
1964                 PG_FREE_IF_COPY(targ1, 0);
1965                 PG_FREE_IF_COPY(targ2, 1);
1966         }
1967
1968         PG_RETURN_BOOL(result);
1969 }
1970
1971 Datum
1972 bttextcmp(PG_FUNCTION_ARGS)
1973 {
1974         text       *arg1 = PG_GETARG_TEXT_PP(0);
1975         text       *arg2 = PG_GETARG_TEXT_PP(1);
1976         int32           result;
1977
1978         result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1979
1980         PG_FREE_IF_COPY(arg1, 0);
1981         PG_FREE_IF_COPY(arg2, 1);
1982
1983         PG_RETURN_INT32(result);
1984 }
1985
1986 Datum
1987 bttextsortsupport(PG_FUNCTION_ARGS)
1988 {
1989         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1990         Oid                     collid = ssup->ssup_collation;
1991         MemoryContext oldcontext;
1992
1993         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1994
1995         /* Use generic string SortSupport */
1996         varstr_sortsupport(ssup, TEXTOID, collid);
1997
1998         MemoryContextSwitchTo(oldcontext);
1999
2000         PG_RETURN_VOID();
2001 }
2002
2003 /*
2004  * Generic sortsupport interface for character type's operator classes.
2005  * Includes locale support, and support for BpChar semantics (i.e. removing
2006  * trailing spaces before comparison).
2007  *
2008  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2009  * same representation.  Callers that always use the C collation (e.g.
2010  * non-collatable type callers like bytea) may have NUL bytes in their strings;
2011  * this will not work with any other collation, though.
2012  */
2013 void
2014 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
2015 {
2016         bool            abbreviate = ssup->abbreviate;
2017         bool            collate_c = false;
2018         VarStringSortSupport *sss;
2019         pg_locale_t locale = 0;
2020
2021         check_collation_set(collid);
2022
2023         /*
2024          * If possible, set ssup->comparator to a function which can be used to
2025          * directly compare two datums.  If we can do this, we'll avoid the
2026          * overhead of a trip through the fmgr layer for every comparison, which
2027          * can be substantial.
2028          *
2029          * Most typically, we'll set the comparator to varlenafastcmp_locale,
2030          * which uses strcoll() to perform comparisons.  We use that for the
2031          * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2032          * LC_COLLATE = C, we can make things quite a bit faster with
2033          * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2034          * memcmp() rather than strcoll().
2035          */
2036         if (lc_collate_is_c(collid))
2037         {
2038                 if (typid == BPCHAROID)
2039                         ssup->comparator = bpcharfastcmp_c;
2040                 else if (typid == NAMEOID)
2041                 {
2042                         ssup->comparator = namefastcmp_c;
2043                         /* Not supporting abbreviation with type NAME, for now */
2044                         abbreviate = false;
2045                 }
2046                 else
2047                         ssup->comparator = varstrfastcmp_c;
2048
2049                 collate_c = true;
2050         }
2051         else
2052         {
2053                 /*
2054                  * We need a collation-sensitive comparison.  To make things faster,
2055                  * we'll figure out the collation based on the locale id and cache the
2056                  * result.
2057                  */
2058                 locale = pg_newlocale_from_collation(collid);
2059
2060                 /*
2061                  * There is a further exception on Windows.  When the database
2062                  * encoding is UTF-8 and we are not using the C collation, complex
2063                  * hacks are required.  We don't currently have a comparator that
2064                  * handles that case, so we fall back on the slow method of having the
2065                  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2066                  * trampoline.  ICU locales work just the same on Windows, however.
2067                  */
2068 #ifdef WIN32
2069                 if (GetDatabaseEncoding() == PG_UTF8 &&
2070                         !(locale && locale->provider == COLLPROVIDER_ICU))
2071                         return;
2072 #endif
2073
2074                 /*
2075                  * We use varlenafastcmp_locale except for type NAME.
2076                  */
2077                 if (typid == NAMEOID)
2078                 {
2079                         ssup->comparator = namefastcmp_locale;
2080                         /* Not supporting abbreviation with type NAME, for now */
2081                         abbreviate = false;
2082                 }
2083                 else
2084                         ssup->comparator = varlenafastcmp_locale;
2085         }
2086
2087         /*
2088          * Unfortunately, it seems that abbreviation for non-C collations is
2089          * broken on many common platforms; testing of multiple versions of glibc
2090          * reveals that, for many locales, strcoll() and strxfrm() do not return
2091          * consistent results, which is fatal to this optimization.  While no
2092          * other libc other than Cygwin has so far been shown to have a problem,
2093          * we take the conservative course of action for right now and disable
2094          * this categorically.  (Users who are certain this isn't a problem on
2095          * their system can define TRUST_STRXFRM.)
2096          *
2097          * Even apart from the risk of broken locales, it's possible that there
2098          * are platforms where the use of abbreviated keys should be disabled at
2099          * compile time.  Having only 4 byte datums could make worst-case
2100          * performance drastically more likely, for example.  Moreover, macOS's
2101          * strxfrm() implementation is known to not effectively concentrate a
2102          * significant amount of entropy from the original string in earlier
2103          * transformed blobs.  It's possible that other supported platforms are
2104          * similarly encumbered.  So, if we ever get past disabling this
2105          * categorically, we may still want or need to disable it for particular
2106          * platforms.
2107          */
2108 #ifndef TRUST_STRXFRM
2109         if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2110                 abbreviate = false;
2111 #endif
2112
2113         /*
2114          * If we're using abbreviated keys, or if we're using a locale-aware
2115          * comparison, we need to initialize a VarStringSortSupport object. Both
2116          * cases will make use of the temporary buffers we initialize here for
2117          * scratch space (and to detect requirement for BpChar semantics from
2118          * caller), and the abbreviation case requires additional state.
2119          */
2120         if (abbreviate || !collate_c)
2121         {
2122                 sss = palloc(sizeof(VarStringSortSupport));
2123                 sss->buf1 = palloc(TEXTBUFLEN);
2124                 sss->buflen1 = TEXTBUFLEN;
2125                 sss->buf2 = palloc(TEXTBUFLEN);
2126                 sss->buflen2 = TEXTBUFLEN;
2127                 /* Start with invalid values */
2128                 sss->last_len1 = -1;
2129                 sss->last_len2 = -1;
2130                 /* Initialize */
2131                 sss->last_returned = 0;
2132                 sss->locale = locale;
2133
2134                 /*
2135                  * To avoid somehow confusing a strxfrm() blob and an original string,
2136                  * constantly keep track of the variety of data that buf1 and buf2
2137                  * currently contain.
2138                  *
2139                  * Comparisons may be interleaved with conversion calls.  Frequently,
2140                  * conversions and comparisons are batched into two distinct phases,
2141                  * but the correctness of caching cannot hinge upon this.  For
2142                  * comparison caching, buffer state is only trusted if cache_blob is
2143                  * found set to false, whereas strxfrm() caching only trusts the state
2144                  * when cache_blob is found set to true.
2145                  *
2146                  * Arbitrarily initialize cache_blob to true.
2147                  */
2148                 sss->cache_blob = true;
2149                 sss->collate_c = collate_c;
2150                 sss->typid = typid;
2151                 ssup->ssup_extra = sss;
2152
2153                 /*
2154                  * If possible, plan to use the abbreviated keys optimization.  The
2155                  * core code may switch back to authoritative comparator should
2156                  * abbreviation be aborted.
2157                  */
2158                 if (abbreviate)
2159                 {
2160                         sss->prop_card = 0.20;
2161                         initHyperLogLog(&sss->abbr_card, 10);
2162                         initHyperLogLog(&sss->full_card, 10);
2163                         ssup->abbrev_full_comparator = ssup->comparator;
2164                         ssup->comparator = ssup_datum_unsigned_cmp;
2165                         ssup->abbrev_converter = varstr_abbrev_convert;
2166                         ssup->abbrev_abort = varstr_abbrev_abort;
2167                 }
2168         }
2169 }
2170
2171 /*
2172  * sortsupport comparison func (for C locale case)
2173  */
2174 static int
2175 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2176 {
2177         VarString  *arg1 = DatumGetVarStringPP(x);
2178         VarString  *arg2 = DatumGetVarStringPP(y);
2179         char       *a1p,
2180                            *a2p;
2181         int                     len1,
2182                                 len2,
2183                                 result;
2184
2185         a1p = VARDATA_ANY(arg1);
2186         a2p = VARDATA_ANY(arg2);
2187
2188         len1 = VARSIZE_ANY_EXHDR(arg1);
2189         len2 = VARSIZE_ANY_EXHDR(arg2);
2190
2191         result = memcmp(a1p, a2p, Min(len1, len2));
2192         if ((result == 0) && (len1 != len2))
2193                 result = (len1 < len2) ? -1 : 1;
2194
2195         /* We can't afford to leak memory here. */
2196         if (PointerGetDatum(arg1) != x)
2197                 pfree(arg1);
2198         if (PointerGetDatum(arg2) != y)
2199                 pfree(arg2);
2200
2201         return result;
2202 }
2203
2204 /*
2205  * sortsupport comparison func (for BpChar C locale case)
2206  *
2207  * BpChar outsources its sortsupport to this module.  Specialization for the
2208  * varstr_sortsupport BpChar case, modeled on
2209  * internal_bpchar_pattern_compare().
2210  */
2211 static int
2212 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2213 {
2214         BpChar     *arg1 = DatumGetBpCharPP(x);
2215         BpChar     *arg2 = DatumGetBpCharPP(y);
2216         char       *a1p,
2217                            *a2p;
2218         int                     len1,
2219                                 len2,
2220                                 result;
2221
2222         a1p = VARDATA_ANY(arg1);
2223         a2p = VARDATA_ANY(arg2);
2224
2225         len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2226         len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2227
2228         result = memcmp(a1p, a2p, Min(len1, len2));
2229         if ((result == 0) && (len1 != len2))
2230                 result = (len1 < len2) ? -1 : 1;
2231
2232         /* We can't afford to leak memory here. */
2233         if (PointerGetDatum(arg1) != x)
2234                 pfree(arg1);
2235         if (PointerGetDatum(arg2) != y)
2236                 pfree(arg2);
2237
2238         return result;
2239 }
2240
2241 /*
2242  * sortsupport comparison func (for NAME C locale case)
2243  */
2244 static int
2245 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2246 {
2247         Name            arg1 = DatumGetName(x);
2248         Name            arg2 = DatumGetName(y);
2249
2250         return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2251 }
2252
2253 /*
2254  * sortsupport comparison func (for locale case with all varlena types)
2255  */
2256 static int
2257 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2258 {
2259         VarString  *arg1 = DatumGetVarStringPP(x);
2260         VarString  *arg2 = DatumGetVarStringPP(y);
2261         char       *a1p,
2262                            *a2p;
2263         int                     len1,
2264                                 len2,
2265                                 result;
2266
2267         a1p = VARDATA_ANY(arg1);
2268         a2p = VARDATA_ANY(arg2);
2269
2270         len1 = VARSIZE_ANY_EXHDR(arg1);
2271         len2 = VARSIZE_ANY_EXHDR(arg2);
2272
2273         result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2274
2275         /* We can't afford to leak memory here. */
2276         if (PointerGetDatum(arg1) != x)
2277                 pfree(arg1);
2278         if (PointerGetDatum(arg2) != y)
2279                 pfree(arg2);
2280
2281         return result;
2282 }
2283
2284 /*
2285  * sortsupport comparison func (for locale case with NAME type)
2286  */
2287 static int
2288 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2289 {
2290         Name            arg1 = DatumGetName(x);
2291         Name            arg2 = DatumGetName(y);
2292
2293         return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2294                                                                 NameStr(*arg2), strlen(NameStr(*arg2)),
2295                                                                 ssup);
2296 }
2297
/*
 * sortsupport comparison func for locale cases
 *
 * a1p/len1 and a2p/len2 give the two strings as a byte pointer plus a byte
 * length.  The inputs need not be NUL-terminated: the bytes are copied into
 * sss->buf1/sss->buf2 and terminated there.  ssup->ssup_extra must point to
 * the VarStringSortSupport working state, which this function updates.
 *
 * Returns 0 when the inputs are bytewise identical.  Otherwise returns the
 * collation provider's result, with a strcmp() tie-break applied unless the
 * collation is nondeterministic.  For BPCHAROID, trailing spaces are
 * stripped from both lengths before comparing.  The most recent pair of
 * strings and the most recent result are remembered in *sss so that a
 * repeat of the same comparison can skip the provider call.
 */
static int
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
{
        VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
        int                     result;
        bool            arg1_match;

        /* Fast pre-check for equality, as discussed in varstr_cmp() */
        if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
        {
                /*
                 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
                 * last_len2.  Existing contents of buffers might still be used by
                 * next call.
                 *
                 * It's fine to allow the comparison of BpChar padding bytes here,
                 * even though that implies that the memcmp() will usually be
                 * performed for BpChar callers (though multibyte characters could
                 * still prevent that from occurring).  The memcmp() is still very
                 * cheap, and BpChar's funny semantics have us remove trailing spaces
                 * (not limited to padding), so we need make no distinction between
                 * padding space characters and "real" space characters.
                 */
                return 0;
        }

        if (sss->typid == BPCHAROID)
        {
                /* Get true number of bytes, ignoring trailing spaces */
                len1 = bpchartruelen(a1p, len1);
                len2 = bpchartruelen(a2p, len2);
        }

        /*
         * Make sure each working buffer can hold its string plus a terminating
         * NUL (hence ">=" rather than ">").
         */
        if (len1 >= sss->buflen1)
        {
                sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
                sss->buf1 = repalloc(sss->buf1, sss->buflen1);
        }
        if (len2 >= sss->buflen2)
        {
                sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
                sss->buf2 = repalloc(sss->buf2, sss->buflen2);
        }

        /*
         * We're likely to be asked to compare the same strings repeatedly, and
         * memcmp() is so much cheaper than strcoll() that it pays to try to cache
         * comparisons, even though in general there is no reason to think that
         * that will work out (every string datum may be unique).  Caching does
         * not slow things down measurably when it doesn't work out, and can speed
         * things up by rather a lot when it does.  In part, this is because the
         * memcmp() compares data from cachelines that are needed in L1 cache even
         * when the last comparison's result cannot be reused.
         */
        arg1_match = true;
        if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
        {
                arg1_match = false;
                memcpy(sss->buf1, a1p, len1);
                sss->buf1[len1] = '\0';
                sss->last_len1 = len1;
        }

        /*
         * If we're comparing the same two strings as last time, we can return the
         * same answer without calling strcoll() again.  This is more likely than
         * it seems (at least with moderate to low cardinality sets), because
         * quicksort compares the same pivot against many values.
         */
        if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
        {
                memcpy(sss->buf2, a2p, len2);
                sss->buf2[len2] = '\0';
                sss->last_len2 = len2;
        }
        else if (arg1_match && !sss->cache_blob)
        {
                /*
                 * Both inputs match what the buffers already hold.  cache_blob is
                 * set by varstr_abbrev_convert() and cleared below after a real
                 * comparison, so last_returned is only reused when it is clear.
                 */
                /* Use result cached following last actual strcoll() call */
                return sss->last_returned;
        }

        /*
         * Do the real comparison.  Note that the ICU branches read the caller's
         * bytes (a1p/a2p) with explicit lengths, while the libc branches read
         * the NUL-terminated copies in buf1/buf2.
         */
        if (sss->locale)
        {
                if (sss->locale->provider == COLLPROVIDER_ICU)
                {
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
                        if (GetDatabaseEncoding() == PG_UTF8)
                        {
                                UErrorCode      status;

                                status = U_ZERO_ERROR;
                                result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
                                                                                  a1p, len1,
                                                                                  a2p, len2,
                                                                                  &status);
                                if (U_FAILURE(status))
                                        ereport(ERROR,
                                                        (errmsg("collation failed: %s", u_errorName(status))));
                        }
                        else
#endif
                        {
                                int32_t         ulen1,
                                                        ulen2;
                                UChar      *uchar1,
                                                   *uchar2;

                                /* Convert both inputs to UChar, compare, then free */
                                ulen1 = icu_to_uchar(&uchar1, a1p, len1);
                                ulen2 = icu_to_uchar(&uchar2, a2p, len2);

                                result = ucol_strcoll(sss->locale->info.icu.ucol,
                                                                          uchar1, ulen1,
                                                                          uchar2, ulen2);

                                pfree(uchar1);
                                pfree(uchar2);
                        }
#else                                                   /* not USE_ICU */
                        /* shouldn't happen */
                        elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif                                                  /* not USE_ICU */
                }
                else
                {
#ifdef HAVE_LOCALE_T
                        result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
#else
                        /* shouldn't happen */
                        elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
                }
        }
        else
                result = strcoll(sss->buf1, sss->buf2);

        /* Break tie if necessary. */
        if (result == 0 &&
                (!sss->locale || sss->locale->deterministic))
                result = strcmp(sss->buf1, sss->buf2);

        /* Cache result, perhaps saving an expensive strcoll() call next time */
        sss->cache_blob = false;
        sss->last_returned = result;
        return result;
}
2447
/*
 * Conversion routine for sortsupport.  Converts original to abbreviated key
 * representation.  Our encoding strategy is simple -- pack the first 8 bytes
 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
 * stored in reverse order), and treat it as an unsigned integer.  When the "C"
 * locale is used, or in case of bytea, just memcpy() from original instead.
 *
 * original is the datum to abbreviate.  ssup->ssup_extra supplies the
 * VarStringSortSupport state (working buffers, cached blob and HyperLogLog
 * counters), which this function updates.  The return value is the
 * abbreviated key, converted with DatumBigEndianToNative() so that it can be
 * compared as an unsigned integer.
 */
static Datum
varstr_abbrev_convert(Datum original, SortSupport ssup)
{
        VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
        VarString  *authoritative = DatumGetVarStringPP(original);
        char       *authoritative_data = VARDATA_ANY(authoritative);

        /* working state */
        Datum           res;
        char       *pres;
        int                     len;
        uint32          hash;

        pres = (char *) &res;
        /* memset(), so any non-overwritten bytes are NUL */
        memset(pres, 0, sizeof(Datum));
        len = VARSIZE_ANY_EXHDR(authoritative);

        /* Get number of bytes, ignoring trailing spaces */
        if (sss->typid == BPCHAROID)
                len = bpchartruelen(authoritative_data, len);

        /*
         * If we're using the C collation, use memcpy(), rather than strxfrm(), to
         * abbreviate keys.  The full comparator for the C locale is always
         * memcmp().  It would be incorrect to allow bytea callers (callers that
         * always force the C collation -- bytea isn't a collatable type, but this
         * approach is convenient) to use strxfrm().  This is because bytea
         * strings may contain NUL bytes.  Besides, this should be faster, too.
         *
         * More generally, it's okay that bytea callers can have NUL bytes in
         * strings because abbreviated cmp need not make a distinction between
         * terminating NUL bytes, and NUL bytes representing actual NULs in the
         * authoritative representation.  Hopefully a comparison at or past one
         * abbreviated key's terminating NUL byte will resolve the comparison
         * without consulting the authoritative representation; specifically, some
         * later non-NUL byte in the longer string can resolve the comparison
         * against a subsequent terminating NUL in the shorter string.  There will
         * usually be what is effectively a "length-wise" resolution there and
         * then.
         *
         * If that doesn't work out -- if all bytes in the longer string
         * positioned at or past the offset of the smaller string's (first)
         * terminating NUL are actually representative of NUL bytes in the
         * authoritative binary string (perhaps with some *terminating* NUL bytes
         * towards the end of the longer string iff it happens to still be small)
         * -- then an authoritative tie-breaker will happen, and do the right
         * thing: explicitly consider string length.
         */
        if (sss->collate_c)
                memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
        else
        {
                Size            bsize;
#ifdef USE_ICU
                int32_t         ulen = -1;
                UChar      *uchar = NULL;
#endif

                /*
                 * We're not using the C collation, so fall back on strxfrm or ICU
                 * analogs.
                 */

                /* By convention, we use buffer 1 to store and NUL-terminate */
                if (len >= sss->buflen1)
                {
                        sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
                        sss->buf1 = repalloc(sss->buf1, sss->buflen1);
                }

                /* Might be able to reuse strxfrm() blob from last call */
                if (sss->last_len1 == len && sss->cache_blob &&
                        memcmp(sss->buf1, authoritative_data, len) == 0)
                {
                        /* buf2 still holds the blob for this string; last_len2 is its size */
                        memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
                        /* No change affecting cardinality, so no hashing required */
                        goto done;
                }

                memcpy(sss->buf1, authoritative_data, len);

                /*
                 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
                 * necessary for ICU, but doesn't hurt.
                 */
                sss->buf1[len] = '\0';
                sss->last_len1 = len;

#ifdef USE_ICU
                /* When using ICU and not UTF8, convert string to UChar. */
                if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
                        GetDatabaseEncoding() != PG_UTF8)
                        ulen = icu_to_uchar(&uchar, sss->buf1, len);
#endif

                /*
                 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
                 * and try again.  Both of these functions have the result buffer
                 * content undefined if the result did not fit, so we need to retry
                 * until everything fits, even though we only need the first few bytes
                 * in the end.  When using ucol_nextSortKeyPart(), however, we only
                 * ask for as many bytes as we actually need.
                 */
                for (;;)
                {
#ifdef USE_ICU
                        if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
                        {
                                /*
                                 * When using UTF8, use the iteration interface so we only
                                 * need to produce as many bytes as we actually need.
                                 */
                                if (GetDatabaseEncoding() == PG_UTF8)
                                {
                                        UCharIterator iter;
                                        uint32_t        state[2];
                                        UErrorCode      status;

                                        uiter_setUTF8(&iter, sss->buf1, len);
                                        state[0] = state[1] = 0;        /* won't need that again */
                                        status = U_ZERO_ERROR;
                                        bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
                                                                                                 &iter,
                                                                                                 state,
                                                                                                 (uint8_t *) sss->buf2,
                                                                                                 Min(sizeof(Datum), sss->buflen2),
                                                                                                 &status);
                                        if (U_FAILURE(status))
                                                ereport(ERROR,
                                                                (errmsg("sort key generation failed: %s",
                                                                                u_errorName(status))));
                                }
                                else
                                        bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
                                                                                        uchar, ulen,
                                                                                        (uint8_t *) sss->buf2, sss->buflen2);
                        }
                        else
#endif
#ifdef HAVE_LOCALE_T
                        if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
                                bsize = strxfrm_l(sss->buf2, sss->buf1,
                                                                  sss->buflen2, sss->locale->info.lt);
                        else
#endif
                                bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);

                        /* Remember blob size; the reuse path above copies up to this much */
                        sss->last_len2 = bsize;
                        /* Result fit within buf2, so its contents are usable */
                        if (bsize < sss->buflen2)
                                break;

                        /*
                         * Grow buffer and retry.
                         */
                        sss->buflen2 = Max(bsize + 1,
                                                           Min(sss->buflen2 * 2, MaxAllocSize));
                        sss->buf2 = repalloc(sss->buf2, sss->buflen2);
                }

                /*
                 * Every Datum byte is always compared.  This is safe because the
                 * strxfrm() blob is itself NUL terminated, leaving no danger of
                 * misinterpreting any NUL bytes not intended to be interpreted as
                 * logically representing termination.
                 *
                 * (Actually, even if there were NUL bytes in the blob it would be
                 * okay.  See remarks on bytea case above.)
                 */
                memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));

#ifdef USE_ICU
                /* uchar is only non-NULL if the non-UTF8 ICU conversion ran above */
                if (uchar)
                        pfree(uchar);
#endif
        }

        /*
         * Maintain approximate cardinality of both abbreviated keys and original,
         * authoritative keys using HyperLogLog.  Used as cheap insurance against
         * the worst case, where we do many string transformations for no saving
         * in full strcoll()-based comparisons.  These statistics are used by
         * varstr_abbrev_abort().
         *
         * First, Hash key proper, or a significant fraction of it.  Mix in length
         * in order to compensate for cases where differences are past
         * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
         */
        hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
                                                                   Min(len, PG_CACHE_LINE_SIZE)));

        if (len > PG_CACHE_LINE_SIZE)
                hash ^= DatumGetUInt32(hash_uint32((uint32) len));

        addHyperLogLog(&sss->full_card, hash);

        /* Hash abbreviated key */
#if SIZEOF_DATUM == 8
        {
                uint32          lohalf,
                                        hihalf;

                /* XOR the two 32-bit halves so that all 8 bytes feed the hash */
                lohalf = (uint32) res;
                hihalf = (uint32) (res >> 32);
                hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
        }
#else                                                   /* SIZEOF_DATUM != 8 */
        hash = DatumGetUInt32(hash_uint32((uint32) res));
#endif

        addHyperLogLog(&sss->abbr_card, hash);

        /* Cache result, perhaps saving an expensive strxfrm() call next time */
        sss->cache_blob = true;
        /* The blob-reuse path jumps here, skipping the cardinality bookkeeping */
done:

        /*
         * Byteswap on little-endian machines.
         *
         * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
         * 3-way comparator) works correctly on all platforms.  If we didn't do
         * this, the comparator would have to call memcmp() with a pair of
         * pointers to the first byte of each abbreviated key, which is slower.
         */
        res = DatumBigEndianToNative(res);

        /* Don't leak memory here */
        if (PointerGetDatum(authoritative) != original)
                pfree(authoritative);

        return res;
}
2687
2688 /*
2689  * Callback for estimating effectiveness of abbreviated key optimization, using
2690  * heuristic rules.  Returns value indicating if the abbreviation optimization
2691  * should be aborted, based on its projected effectiveness.
2692  */
2693 static bool
2694 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2695 {
2696         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2697         double          abbrev_distinct,
2698                                 key_distinct;
2699
2700         Assert(ssup->abbreviate);
2701
2702         /* Have a little patience */
2703         if (memtupcount < 100)
2704                 return false;
2705
2706         abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2707         key_distinct = estimateHyperLogLog(&sss->full_card);
2708
2709         /*
2710          * Clamp cardinality estimates to at least one distinct value.  While
2711          * NULLs are generally disregarded, if only NULL values were seen so far,
2712          * that might misrepresent costs if we failed to clamp.
2713          */
2714         if (abbrev_distinct <= 1.0)
2715                 abbrev_distinct = 1.0;
2716
2717         if (key_distinct <= 1.0)
2718                 key_distinct = 1.0;
2719
2720         /*
2721          * In the worst case all abbreviated keys are identical, while at the same
2722          * time there are differences within full key strings not captured in
2723          * abbreviations.
2724          */
2725 #ifdef TRACE_SORT
2726         if (trace_sort)
2727         {
2728                 double          norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2729
2730                 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2731                          "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2732                          memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2733                          sss->prop_card);
2734         }
2735 #endif
2736
2737         /*
2738          * If the number of distinct abbreviated keys approximately matches the
2739          * number of distinct authoritative original keys, that's reason enough to
2740          * proceed.  We can win even with a very low cardinality set if most
2741          * tie-breakers only memcmp().  This is by far the most important
2742          * consideration.
2743          *
2744          * While comparisons that are resolved at the abbreviated key level are
2745          * considerably cheaper than tie-breakers resolved with memcmp(), both of
2746          * those two outcomes are so much cheaper than a full strcoll() once
2747          * sorting is underway that it doesn't seem worth it to weigh abbreviated
2748          * cardinality against the overall size of the set in order to more
2749          * accurately model costs.  Assume that an abbreviated comparison, and an
2750          * abbreviated comparison with a cheap memcmp()-based authoritative
2751          * resolution are equivalent.
2752          */
2753         if (abbrev_distinct > key_distinct * sss->prop_card)
2754         {
2755                 /*
2756                  * When we have exceeded 10,000 tuples, decay required cardinality
2757                  * aggressively for next call.
2758                  *
2759                  * This is useful because the number of comparisons required on
2760                  * average increases at a linearithmic rate, and at roughly 10,000
2761                  * tuples that factor will start to dominate over the linear costs of
2762                  * string transformation (this is a conservative estimate).  The decay
2763                  * rate is chosen to be a little less aggressive than halving -- which
2764                  * (since we're called at points at which memtupcount has doubled)
2765                  * would never see the cost model actually abort past the first call
2766                  * following a decay.  This decay rate is mostly a precaution against
2767                  * a sudden, violent swing in how well abbreviated cardinality tracks
2768                  * full key cardinality.  The decay also serves to prevent a marginal
2769                  * case from being aborted too late, when too much has already been
2770                  * invested in string transformation.
2771                  *
2772                  * It's possible for sets of several million distinct strings with
2773                  * mere tens of thousands of distinct abbreviated keys to still
2774                  * benefit very significantly.  This will generally occur provided
2775                  * each abbreviated key is a proxy for a roughly uniform number of the
2776                  * set's full keys. If it isn't so, we hope to catch that early and
2777                  * abort.  If it isn't caught early, by the time the problem is
2778                  * apparent it's probably not worth aborting.
2779                  */
2780                 if (memtupcount > 10000)
2781                         sss->prop_card *= 0.65;
2782
2783                 return false;
2784         }
2785
2786         /*
2787          * Abort abbreviation strategy.
2788          *
2789          * The worst case, where all abbreviated keys are identical while all
2790          * original strings differ will typically only see a regression of about
2791          * 10% in execution time for small to medium sized lists of strings.
2792          * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2793          * often expect very large improvements, particularly with sets of strings
2794          * of moderately high to high abbreviated cardinality.  There is little to
2795          * lose but much to gain, which our strategy reflects.
2796          */
2797 #ifdef TRACE_SORT
2798         if (trace_sort)
2799                 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2800                          "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2801                          memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2802 #endif
2803
2804         return true;
2805 }
2806
2807 /*
2808  * Generic equalimage support function for character type's operator classes.
2809  * Disables the use of deduplication with nondeterministic collations.
2810  */
2811 Datum
2812 btvarstrequalimage(PG_FUNCTION_ARGS)
2813 {
2814         /* Oid          opcintype = PG_GETARG_OID(0); */
2815         Oid                     collid = PG_GET_COLLATION();
2816
2817         check_collation_set(collid);
2818
2819         if (lc_collate_is_c(collid) ||
2820                 collid == DEFAULT_COLLATION_OID ||
2821                 get_collation_isdeterministic(collid))
2822                 PG_RETURN_BOOL(true);
2823         else
2824                 PG_RETURN_BOOL(false);
2825 }
2826
2827 Datum
2828 text_larger(PG_FUNCTION_ARGS)
2829 {
2830         text       *arg1 = PG_GETARG_TEXT_PP(0);
2831         text       *arg2 = PG_GETARG_TEXT_PP(1);
2832         text       *result;
2833
2834         result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2835
2836         PG_RETURN_TEXT_P(result);
2837 }
2838
2839 Datum
2840 text_smaller(PG_FUNCTION_ARGS)
2841 {
2842         text       *arg1 = PG_GETARG_TEXT_PP(0);
2843         text       *arg2 = PG_GETARG_TEXT_PP(1);
2844         text       *result;
2845
2846         result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2847
2848         PG_RETURN_TEXT_P(result);
2849 }
2850
2851
2852 /*
2853  * Cross-type comparison functions for types text and name.
2854  */
2855
2856 Datum
2857 nameeqtext(PG_FUNCTION_ARGS)
2858 {
2859         Name            arg1 = PG_GETARG_NAME(0);
2860         text       *arg2 = PG_GETARG_TEXT_PP(1);
2861         size_t          len1 = strlen(NameStr(*arg1));
2862         size_t          len2 = VARSIZE_ANY_EXHDR(arg2);
2863         Oid                     collid = PG_GET_COLLATION();
2864         bool            result;
2865
2866         check_collation_set(collid);
2867
2868         if (collid == C_COLLATION_OID)
2869                 result = (len1 == len2 &&
2870                                   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2871         else
2872                 result = (varstr_cmp(NameStr(*arg1), len1,
2873                                                          VARDATA_ANY(arg2), len2,
2874                                                          collid) == 0);
2875
2876         PG_FREE_IF_COPY(arg2, 1);
2877
2878         PG_RETURN_BOOL(result);
2879 }
2880
2881 Datum
2882 texteqname(PG_FUNCTION_ARGS)
2883 {
2884         text       *arg1 = PG_GETARG_TEXT_PP(0);
2885         Name            arg2 = PG_GETARG_NAME(1);
2886         size_t          len1 = VARSIZE_ANY_EXHDR(arg1);
2887         size_t          len2 = strlen(NameStr(*arg2));
2888         Oid                     collid = PG_GET_COLLATION();
2889         bool            result;
2890
2891         check_collation_set(collid);
2892
2893         if (collid == C_COLLATION_OID)
2894                 result = (len1 == len2 &&
2895                                   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2896         else
2897                 result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2898                                                          NameStr(*arg2), len2,
2899                                                          collid) == 0);
2900
2901         PG_FREE_IF_COPY(arg1, 0);
2902
2903         PG_RETURN_BOOL(result);
2904 }
2905
2906 Datum
2907 namenetext(PG_FUNCTION_ARGS)
2908 {
2909         Name            arg1 = PG_GETARG_NAME(0);
2910         text       *arg2 = PG_GETARG_TEXT_PP(1);
2911         size_t          len1 = strlen(NameStr(*arg1));
2912         size_t          len2 = VARSIZE_ANY_EXHDR(arg2);
2913         Oid                     collid = PG_GET_COLLATION();
2914         bool            result;
2915
2916         check_collation_set(collid);
2917
2918         if (collid == C_COLLATION_OID)
2919                 result = !(len1 == len2 &&
2920                                    memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2921         else
2922                 result = !(varstr_cmp(NameStr(*arg1), len1,
2923                                                           VARDATA_ANY(arg2), len2,
2924                                                           collid) == 0);
2925
2926         PG_FREE_IF_COPY(arg2, 1);
2927
2928         PG_RETURN_BOOL(result);
2929 }
2930
2931 Datum
2932 textnename(PG_FUNCTION_ARGS)
2933 {
2934         text       *arg1 = PG_GETARG_TEXT_PP(0);
2935         Name            arg2 = PG_GETARG_NAME(1);
2936         size_t          len1 = VARSIZE_ANY_EXHDR(arg1);
2937         size_t          len2 = strlen(NameStr(*arg2));
2938         Oid                     collid = PG_GET_COLLATION();
2939         bool            result;
2940
2941         check_collation_set(collid);
2942
2943         if (collid == C_COLLATION_OID)
2944                 result = !(len1 == len2 &&
2945                                    memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2946         else
2947                 result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2948                                                           NameStr(*arg2), len2,
2949                                                           collid) == 0);
2950
2951         PG_FREE_IF_COPY(arg1, 0);
2952
2953         PG_RETURN_BOOL(result);
2954 }
2955
2956 Datum
2957 btnametextcmp(PG_FUNCTION_ARGS)
2958 {
2959         Name            arg1 = PG_GETARG_NAME(0);
2960         text       *arg2 = PG_GETARG_TEXT_PP(1);
2961         int32           result;
2962
2963         result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2964                                                 VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2965                                                 PG_GET_COLLATION());
2966
2967         PG_FREE_IF_COPY(arg2, 1);
2968
2969         PG_RETURN_INT32(result);
2970 }
2971
2972 Datum
2973 bttextnamecmp(PG_FUNCTION_ARGS)
2974 {
2975         text       *arg1 = PG_GETARG_TEXT_PP(0);
2976         Name            arg2 = PG_GETARG_NAME(1);
2977         int32           result;
2978
2979         result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2980                                                 NameStr(*arg2), strlen(NameStr(*arg2)),
2981                                                 PG_GET_COLLATION());
2982
2983         PG_FREE_IF_COPY(arg1, 0);
2984
2985         PG_RETURN_INT32(result);
2986 }
2987
2988 #define CmpCall(cmpfunc) \
2989         DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2990                                                                                   PG_GET_COLLATION(), \
2991                                                                                   PG_GETARG_DATUM(0), \
2992                                                                                   PG_GETARG_DATUM(1)))
2993
2994 Datum
2995 namelttext(PG_FUNCTION_ARGS)
2996 {
2997         PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2998 }
2999
3000 Datum
3001 nameletext(PG_FUNCTION_ARGS)
3002 {
3003         PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
3004 }
3005
3006 Datum
3007 namegttext(PG_FUNCTION_ARGS)
3008 {
3009         PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
3010 }
3011
3012 Datum
3013 namegetext(PG_FUNCTION_ARGS)
3014 {
3015         PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
3016 }
3017
3018 Datum
3019 textltname(PG_FUNCTION_ARGS)
3020 {
3021         PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
3022 }
3023
3024 Datum
3025 textlename(PG_FUNCTION_ARGS)
3026 {
3027         PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
3028 }
3029
3030 Datum
3031 textgtname(PG_FUNCTION_ARGS)
3032 {
3033         PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3034 }
3035
3036 Datum
3037 textgename(PG_FUNCTION_ARGS)
3038 {
3039         PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3040 }
3041
3042 #undef CmpCall
3043
3044
3045 /*
3046  * The following operators support character-by-character comparison
3047  * of text datums, to allow building indexes suitable for LIKE clauses.
3048  * Note that the regular texteq/textne comparison operators, and regular
3049  * support functions 1 and 2 with "C" collation are assumed to be
3050  * compatible with these!
3051  */
3052
3053 static int
3054 internal_text_pattern_compare(text *arg1, text *arg2)
3055 {
3056         int                     result;
3057         int                     len1,
3058                                 len2;
3059
3060         len1 = VARSIZE_ANY_EXHDR(arg1);
3061         len2 = VARSIZE_ANY_EXHDR(arg2);
3062
3063         result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3064         if (result != 0)
3065                 return result;
3066         else if (len1 < len2)
3067                 return -1;
3068         else if (len1 > len2)
3069                 return 1;
3070         else
3071                 return 0;
3072 }
3073
3074
3075 Datum
3076 text_pattern_lt(PG_FUNCTION_ARGS)
3077 {
3078         text       *arg1 = PG_GETARG_TEXT_PP(0);
3079         text       *arg2 = PG_GETARG_TEXT_PP(1);
3080         int                     result;
3081
3082         result = internal_text_pattern_compare(arg1, arg2);
3083
3084         PG_FREE_IF_COPY(arg1, 0);
3085         PG_FREE_IF_COPY(arg2, 1);
3086
3087         PG_RETURN_BOOL(result < 0);
3088 }
3089
3090
3091 Datum
3092 text_pattern_le(PG_FUNCTION_ARGS)
3093 {
3094         text       *arg1 = PG_GETARG_TEXT_PP(0);
3095         text       *arg2 = PG_GETARG_TEXT_PP(1);
3096         int                     result;
3097
3098         result = internal_text_pattern_compare(arg1, arg2);
3099
3100         PG_FREE_IF_COPY(arg1, 0);
3101         PG_FREE_IF_COPY(arg2, 1);
3102
3103         PG_RETURN_BOOL(result <= 0);
3104 }
3105
3106
3107 Datum
3108 text_pattern_ge(PG_FUNCTION_ARGS)
3109 {
3110         text       *arg1 = PG_GETARG_TEXT_PP(0);
3111         text       *arg2 = PG_GETARG_TEXT_PP(1);
3112         int                     result;
3113
3114         result = internal_text_pattern_compare(arg1, arg2);
3115
3116         PG_FREE_IF_COPY(arg1, 0);
3117         PG_FREE_IF_COPY(arg2, 1);
3118
3119         PG_RETURN_BOOL(result >= 0);
3120 }
3121
3122
3123 Datum
3124 text_pattern_gt(PG_FUNCTION_ARGS)
3125 {
3126         text       *arg1 = PG_GETARG_TEXT_PP(0);
3127         text       *arg2 = PG_GETARG_TEXT_PP(1);
3128         int                     result;
3129
3130         result = internal_text_pattern_compare(arg1, arg2);
3131
3132         PG_FREE_IF_COPY(arg1, 0);
3133         PG_FREE_IF_COPY(arg2, 1);
3134
3135         PG_RETURN_BOOL(result > 0);
3136 }
3137
3138
3139 Datum
3140 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3141 {
3142         text       *arg1 = PG_GETARG_TEXT_PP(0);
3143         text       *arg2 = PG_GETARG_TEXT_PP(1);
3144         int                     result;
3145
3146         result = internal_text_pattern_compare(arg1, arg2);
3147
3148         PG_FREE_IF_COPY(arg1, 0);
3149         PG_FREE_IF_COPY(arg2, 1);
3150
3151         PG_RETURN_INT32(result);
3152 }
3153
3154
3155 Datum
3156 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3157 {
3158         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3159         MemoryContext oldcontext;
3160
3161         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3162
3163         /* Use generic string SortSupport, forcing "C" collation */
3164         varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3165
3166         MemoryContextSwitchTo(oldcontext);
3167
3168         PG_RETURN_VOID();
3169 }
3170
3171
3172 /*-------------------------------------------------------------
3173  * byteaoctetlen
3174  *
3175  * get the number of bytes contained in an instance of type 'bytea'
3176  *-------------------------------------------------------------
3177  */
3178 Datum
3179 byteaoctetlen(PG_FUNCTION_ARGS)
3180 {
3181         Datum           str = PG_GETARG_DATUM(0);
3182
3183         /* We need not detoast the input at all */
3184         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3185 }
3186
3187 /*
3188  * byteacat -
3189  *        takes two bytea* and returns a bytea* that is the concatenation of
3190  *        the two.
3191  *
3192  * Cloned from textcat and modified as required.
3193  */
3194 Datum
3195 byteacat(PG_FUNCTION_ARGS)
3196 {
3197         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3198         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3199
3200         PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3201 }
3202
3203 /*
3204  * bytea_catenate
3205  *      Guts of byteacat(), broken out so it can be used by other functions
3206  *
3207  * Arguments can be in short-header form, but not compressed or out-of-line
3208  */
3209 static bytea *
3210 bytea_catenate(bytea *t1, bytea *t2)
3211 {
3212         bytea      *result;
3213         int                     len1,
3214                                 len2,
3215                                 len;
3216         char       *ptr;
3217
3218         len1 = VARSIZE_ANY_EXHDR(t1);
3219         len2 = VARSIZE_ANY_EXHDR(t2);
3220
3221         /* paranoia ... probably should throw error instead? */
3222         if (len1 < 0)
3223                 len1 = 0;
3224         if (len2 < 0)
3225                 len2 = 0;
3226
3227         len = len1 + len2 + VARHDRSZ;
3228         result = (bytea *) palloc(len);
3229
3230         /* Set size of result string... */
3231         SET_VARSIZE(result, len);
3232
3233         /* Fill data field of result string... */
3234         ptr = VARDATA(result);
3235         if (len1 > 0)
3236                 memcpy(ptr, VARDATA_ANY(t1), len1);
3237         if (len2 > 0)
3238                 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3239
3240         return result;
3241 }
3242
/* Convert a C string into a bytea pointer by passing it through byteain(). */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, \
										CStringGetDatum(str_)))
3245
3246 /*
3247  * bytea_substr()
3248  * Return a substring starting at the specified position.
3249  * Cloned from text_substr and modified as required.
3250  *
3251  * Input:
3252  *      - string
3253  *      - starting position (is one-based)
3254  *      - string length (optional)
3255  *
3256  * If the starting position is zero or less, then return from the start of the string
3257  * adjusting the length to be consistent with the "negative start" per SQL.
3258  * If the length is less than zero, an ERROR is thrown. If no third argument
3259  * (length) is provided, the length to the end of the string is assumed.
3260  */
3261 Datum
3262 bytea_substr(PG_FUNCTION_ARGS)
3263 {
3264         PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3265                                                                           PG_GETARG_INT32(1),
3266                                                                           PG_GETARG_INT32(2),
3267                                                                           false));
3268 }
3269
3270 /*
3271  * bytea_substr_no_len -
3272  *        Wrapper to avoid opr_sanity failure due to
3273  *        one function accepting a different number of args.
3274  */
3275 Datum
3276 bytea_substr_no_len(PG_FUNCTION_ARGS)
3277 {
3278         PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3279                                                                           PG_GETARG_INT32(1),
3280                                                                           -1,
3281                                                                           true));
3282 }
3283
3284 static bytea *
3285 bytea_substring(Datum str,
3286                                 int S,
3287                                 int L,
3288                                 bool length_not_specified)
3289 {
3290         int32           S1;                             /* adjusted start position */
3291         int32           L1;                             /* adjusted substring length */
3292         int32           E;                              /* end position */
3293
3294         /*
3295          * The logic here should generally match text_substring().
3296          */
3297         S1 = Max(S, 1);
3298
3299         if (length_not_specified)
3300         {
3301                 /*
3302                  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3303                  * end of the string if we pass it a negative value for length.
3304                  */
3305                 L1 = -1;
3306         }
3307         else if (L < 0)
3308         {
3309                 /* SQL99 says to throw an error for E < S, i.e., negative length */
3310                 ereport(ERROR,
3311                                 (errcode(ERRCODE_SUBSTRING_ERROR),
3312                                  errmsg("negative substring length not allowed")));
3313                 L1 = -1;                                /* silence stupider compilers */
3314         }
3315         else if (pg_add_s32_overflow(S, L, &E))
3316         {
3317                 /*
3318                  * L could be large enough for S + L to overflow, in which case the
3319                  * substring must run to end of string.
3320                  */
3321                 L1 = -1;
3322         }
3323         else
3324         {
3325                 /*
3326                  * A zero or negative value for the end position can happen if the
3327                  * start was negative or one. SQL99 says to return a zero-length
3328                  * string.
3329                  */
3330                 if (E < 1)
3331                         return PG_STR_GET_BYTEA("");
3332
3333                 L1 = E - S1;
3334         }
3335
3336         /*
3337          * If the start position is past the end of the string, SQL99 says to
3338          * return a zero-length string -- DatumGetByteaPSlice() will do that for
3339          * us.  We need only convert S1 to zero-based starting position.
3340          */
3341         return DatumGetByteaPSlice(str, S1 - 1, L1);
3342 }
3343
3344 /*
3345  * byteaoverlay
3346  *      Replace specified substring of first string with second
3347  *
3348  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3349  * This code is a direct implementation of what the standard says.
3350  */
3351 Datum
3352 byteaoverlay(PG_FUNCTION_ARGS)
3353 {
3354         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3355         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3356         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
3357         int                     sl = PG_GETARG_INT32(3);        /* substring length */
3358
3359         PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3360 }
3361
3362 Datum
3363 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3364 {
3365         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3366         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3367         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
3368         int                     sl;
3369
3370         sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3371         PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3372 }
3373
3374 static bytea *
3375 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3376 {
3377         bytea      *result;
3378         bytea      *s1;
3379         bytea      *s2;
3380         int                     sp_pl_sl;
3381
3382         /*
3383          * Check for possible integer-overflow cases.  For negative sp, throw a
3384          * "substring length" error because that's what should be expected
3385          * according to the spec's definition of OVERLAY().
3386          */
3387         if (sp <= 0)
3388                 ereport(ERROR,
3389                                 (errcode(ERRCODE_SUBSTRING_ERROR),
3390                                  errmsg("negative substring length not allowed")));
3391         if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3392                 ereport(ERROR,
3393                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3394                                  errmsg("integer out of range")));
3395
3396         s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3397         s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3398         result = bytea_catenate(s1, t2);
3399         result = bytea_catenate(result, s2);
3400
3401         return result;
3402 }
3403
3404 /*
3405  * bit_count
3406  */
3407 Datum
3408 bytea_bit_count(PG_FUNCTION_ARGS)
3409 {
3410         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3411
3412         PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3413 }
3414
3415 /*
3416  * byteapos -
3417  *        Return the position of the specified substring.
3418  *        Implements the SQL POSITION() function.
3419  * Cloned from textpos and modified as required.
3420  */
3421 Datum
3422 byteapos(PG_FUNCTION_ARGS)
3423 {
3424         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3425         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3426         int                     pos;
3427         int                     px,
3428                                 p;
3429         int                     len1,
3430                                 len2;
3431         char       *p1,
3432                            *p2;
3433
3434         len1 = VARSIZE_ANY_EXHDR(t1);
3435         len2 = VARSIZE_ANY_EXHDR(t2);
3436
3437         if (len2 <= 0)
3438                 PG_RETURN_INT32(1);             /* result for empty pattern */
3439
3440         p1 = VARDATA_ANY(t1);
3441         p2 = VARDATA_ANY(t2);
3442
3443         pos = 0;
3444         px = (len1 - len2);
3445         for (p = 0; p <= px; p++)
3446         {
3447                 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3448                 {
3449                         pos = p + 1;
3450                         break;
3451                 };
3452                 p1++;
3453         };
3454
3455         PG_RETURN_INT32(pos);
3456 }
3457
3458 /*-------------------------------------------------------------
3459  * byteaGetByte
3460  *
3461  * this routine treats "bytea" as an array of bytes.
3462  * It returns the Nth byte (a number between 0 and 255).
3463  *-------------------------------------------------------------
3464  */
3465 Datum
3466 byteaGetByte(PG_FUNCTION_ARGS)
3467 {
3468         bytea      *v = PG_GETARG_BYTEA_PP(0);
3469         int32           n = PG_GETARG_INT32(1);
3470         int                     len;
3471         int                     byte;
3472
3473         len = VARSIZE_ANY_EXHDR(v);
3474
3475         if (n < 0 || n >= len)
3476                 ereport(ERROR,
3477                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3478                                  errmsg("index %d out of valid range, 0..%d",
3479                                                 n, len - 1)));
3480
3481         byte = ((unsigned char *) VARDATA_ANY(v))[n];
3482
3483         PG_RETURN_INT32(byte);
3484 }
3485
3486 /*-------------------------------------------------------------
3487  * byteaGetBit
3488  *
3489  * This routine treats a "bytea" type like an array of bits.
3490  * It returns the value of the Nth bit (0 or 1).
3491  *
3492  *-------------------------------------------------------------
3493  */
3494 Datum
3495 byteaGetBit(PG_FUNCTION_ARGS)
3496 {
3497         bytea      *v = PG_GETARG_BYTEA_PP(0);
3498         int64           n = PG_GETARG_INT64(1);
3499         int                     byteNo,
3500                                 bitNo;
3501         int                     len;
3502         int                     byte;
3503
3504         len = VARSIZE_ANY_EXHDR(v);
3505
3506         if (n < 0 || n >= (int64) len * 8)
3507                 ereport(ERROR,
3508                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3509                                  errmsg("index %lld out of valid range, 0..%lld",
3510                                                 (long long) n, (long long) len * 8 - 1)));
3511
3512         /* n/8 is now known < len, so safe to cast to int */
3513         byteNo = (int) (n / 8);
3514         bitNo = (int) (n % 8);
3515
3516         byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3517
3518         if (byte & (1 << bitNo))
3519                 PG_RETURN_INT32(1);
3520         else
3521                 PG_RETURN_INT32(0);
3522 }
3523
3524 /*-------------------------------------------------------------
3525  * byteaSetByte
3526  *
3527  * Given an instance of type 'bytea' creates a new one with
3528  * the Nth byte set to the given value.
3529  *
3530  *-------------------------------------------------------------
3531  */
3532 Datum
3533 byteaSetByte(PG_FUNCTION_ARGS)
3534 {
3535         bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
3536         int32           n = PG_GETARG_INT32(1);
3537         int32           newByte = PG_GETARG_INT32(2);
3538         int                     len;
3539
3540         len = VARSIZE(res) - VARHDRSZ;
3541
3542         if (n < 0 || n >= len)
3543                 ereport(ERROR,
3544                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3545                                  errmsg("index %d out of valid range, 0..%d",
3546                                                 n, len - 1)));
3547
3548         /*
3549          * Now set the byte.
3550          */
3551         ((unsigned char *) VARDATA(res))[n] = newByte;
3552
3553         PG_RETURN_BYTEA_P(res);
3554 }
3555
3556 /*-------------------------------------------------------------
3557  * byteaSetBit
3558  *
3559  * Given an instance of type 'bytea' creates a new one with
3560  * the Nth bit set to the given value.
3561  *
3562  *-------------------------------------------------------------
3563  */
3564 Datum
3565 byteaSetBit(PG_FUNCTION_ARGS)
3566 {
3567         bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
3568         int64           n = PG_GETARG_INT64(1);
3569         int32           newBit = PG_GETARG_INT32(2);
3570         int                     len;
3571         int                     oldByte,
3572                                 newByte;
3573         int                     byteNo,
3574                                 bitNo;
3575
3576         len = VARSIZE(res) - VARHDRSZ;
3577
3578         if (n < 0 || n >= (int64) len * 8)
3579                 ereport(ERROR,
3580                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3581                                  errmsg("index %lld out of valid range, 0..%lld",
3582                                                 (long long) n, (long long) len * 8 - 1)));
3583
3584         /* n/8 is now known < len, so safe to cast to int */
3585         byteNo = (int) (n / 8);
3586         bitNo = (int) (n % 8);
3587
3588         /*
3589          * sanity check!
3590          */
3591         if (newBit != 0 && newBit != 1)
3592                 ereport(ERROR,
3593                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3594                                  errmsg("new bit must be 0 or 1")));
3595
3596         /*
3597          * Update the byte.
3598          */
3599         oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3600
3601         if (newBit == 0)
3602                 newByte = oldByte & (~(1 << bitNo));
3603         else
3604                 newByte = oldByte | (1 << bitNo);
3605
3606         ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3607
3608         PG_RETURN_BYTEA_P(res);
3609 }
3610
3611
3612 /* text_name()
3613  * Converts a text type to a Name type.
3614  */
3615 Datum
3616 text_name(PG_FUNCTION_ARGS)
3617 {
3618         text       *s = PG_GETARG_TEXT_PP(0);
3619         Name            result;
3620         int                     len;
3621
3622         len = VARSIZE_ANY_EXHDR(s);
3623
3624         /* Truncate oversize input */
3625         if (len >= NAMEDATALEN)
3626                 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3627
3628         /* We use palloc0 here to ensure result is zero-padded */
3629         result = (Name) palloc0(NAMEDATALEN);
3630         memcpy(NameStr(*result), VARDATA_ANY(s), len);
3631
3632         PG_RETURN_NAME(result);
3633 }
3634
3635 /* name_text()
3636  * Converts a Name type to a text type.
3637  */
3638 Datum
3639 name_text(PG_FUNCTION_ARGS)
3640 {
3641         Name            s = PG_GETARG_NAME(0);
3642
3643         PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3644 }
3645
3646
3647 /*
3648  * textToQualifiedNameList - convert a text object to list of names
3649  *
3650  * This implements the input parsing needed by nextval() and other
3651  * functions that take a text parameter representing a qualified name.
3652  * We split the name at dots, downcase if not double-quoted, and
3653  * truncate names if they're too long.
3654  */
3655 List *
3656 textToQualifiedNameList(text *textval)
3657 {
3658         char       *rawname;
3659         List       *result = NIL;
3660         List       *namelist;
3661         ListCell   *l;
3662
3663         /* Convert to C string (handles possible detoasting). */
3664         /* Note we rely on being able to modify rawname below. */
3665         rawname = text_to_cstring(textval);
3666
3667         if (!SplitIdentifierString(rawname, '.', &namelist))
3668                 ereport(ERROR,
3669                                 (errcode(ERRCODE_INVALID_NAME),
3670                                  errmsg("invalid name syntax")));
3671
3672         if (namelist == NIL)
3673                 ereport(ERROR,
3674                                 (errcode(ERRCODE_INVALID_NAME),
3675                                  errmsg("invalid name syntax")));
3676
3677         foreach(l, namelist)
3678         {
3679                 char       *curname = (char *) lfirst(l);
3680
3681                 result = lappend(result, makeString(pstrdup(curname)));
3682         }
3683
3684         pfree(rawname);
3685         list_free(namelist);
3686
3687         return result;
3688 }
3689
3690 /*
3691  * SplitIdentifierString --- parse a string containing identifiers
3692  *
3693  * This is the guts of textToQualifiedNameList, and is exported for use in
3694  * other situations such as parsing GUC variables.  In the GUC case, it's
3695  * important to avoid memory leaks, so the API is designed to minimize the
3696  * amount of stuff that needs to be allocated and freed.
3697  *
3698  * Inputs:
3699  *      rawstring: the input string; must be overwritable!      On return, it's
3700  *                         been modified to contain the separated identifiers.
3701  *      separator: the separator punctuation expected between identifiers
3702  *                         (typically '.' or ',').  Whitespace may also appear around
3703  *                         identifiers.
3704  * Outputs:
3705  *      namelist: filled with a palloc'd list of pointers to identifiers within
3706  *                        rawstring.  Caller should list_free() this even on error return.
3707  *
3708  * Returns true if okay, false if there is a syntax error in the string.
3709  *
3710  * Note that an empty string is considered okay here, though not in
3711  * textToQualifiedNameList.
3712  */
3713 bool
3714 SplitIdentifierString(char *rawstring, char separator,
3715                                           List **namelist)
3716 {
3717         char       *nextp = rawstring;
3718         bool            done = false;
3719
3720         *namelist = NIL;
3721
3722         while (scanner_isspace(*nextp))
3723                 nextp++;                                /* skip leading whitespace */
3724
3725         if (*nextp == '\0')
3726                 return true;                    /* allow empty string */
3727
3728         /* At the top of the loop, we are at start of a new identifier. */
3729         do
3730         {
3731                 char       *curname;
3732                 char       *endp;
3733
3734                 if (*nextp == '"')
3735                 {
3736                         /* Quoted name --- collapse quote-quote pairs, no downcasing */
3737                         curname = nextp + 1;
3738                         for (;;)
3739                         {
3740                                 endp = strchr(nextp + 1, '"');
3741                                 if (endp == NULL)
3742                                         return false;   /* mismatched quotes */
3743                                 if (endp[1] != '"')
3744                                         break;          /* found end of quoted name */
3745                                 /* Collapse adjacent quotes into one quote, and look again */
3746                                 memmove(endp, endp + 1, strlen(endp));
3747                                 nextp = endp;
3748                         }
3749                         /* endp now points at the terminating quote */
3750                         nextp = endp + 1;
3751                 }
3752                 else
3753                 {
3754                         /* Unquoted name --- extends to separator or whitespace */
3755                         char       *downname;
3756                         int                     len;
3757
3758                         curname = nextp;
3759                         while (*nextp && *nextp != separator &&
3760                                    !scanner_isspace(*nextp))
3761                                 nextp++;
3762                         endp = nextp;
3763                         if (curname == nextp)
3764                                 return false;   /* empty unquoted name not allowed */
3765
3766                         /*
3767                          * Downcase the identifier, using same code as main lexer does.
3768                          *
3769                          * XXX because we want to overwrite the input in-place, we cannot
3770                          * support a downcasing transformation that increases the string
3771                          * length.  This is not a problem given the current implementation
3772                          * of downcase_truncate_identifier, but we'll probably have to do
3773                          * something about this someday.
3774                          */
3775                         len = endp - curname;
3776                         downname = downcase_truncate_identifier(curname, len, false);
3777                         Assert(strlen(downname) <= len);
3778                         strncpy(curname, downname, len);        /* strncpy is required here */
3779                         pfree(downname);
3780                 }
3781
3782                 while (scanner_isspace(*nextp))
3783                         nextp++;                        /* skip trailing whitespace */
3784
3785                 if (*nextp == separator)
3786                 {
3787                         nextp++;
3788                         while (scanner_isspace(*nextp))
3789                                 nextp++;                /* skip leading whitespace for next */
3790                         /* we expect another name, so done remains false */
3791                 }
3792                 else if (*nextp == '\0')
3793                         done = true;
3794                 else
3795                         return false;           /* invalid syntax */
3796
3797                 /* Now safe to overwrite separator with a null */
3798                 *endp = '\0';
3799
3800                 /* Truncate name if it's overlength */
3801                 truncate_identifier(curname, strlen(curname), false);
3802
3803                 /*
3804                  * Finished isolating current name --- add it to list
3805                  */
3806                 *namelist = lappend(*namelist, curname);
3807
3808                 /* Loop back if we didn't reach end of string */
3809         } while (!done);
3810
3811         return true;
3812 }
3813
3814
3815 /*
3816  * SplitDirectoriesString --- parse a string containing file/directory names
3817  *
3818  * This works fine on file names too; the function name is historical.
3819  *
3820  * This is similar to SplitIdentifierString, except that the parsing
3821  * rules are meant to handle pathnames instead of identifiers: there is
3822  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3823  * and we apply canonicalize_path() to each extracted string.  Because of the
3824  * last, the returned strings are separately palloc'd rather than being
3825  * pointers into rawstring --- but we still scribble on rawstring.
3826  *
3827  * Inputs:
3828  *      rawstring: the input string; must be modifiable!
3829  *      separator: the separator punctuation expected between directories
3830  *                         (typically ',' or ';').  Whitespace may also appear around
3831  *                         directories.
3832  * Outputs:
3833  *      namelist: filled with a palloc'd list of directory names.
3834  *                        Caller should list_free_deep() this even on error return.
3835  *
3836  * Returns true if okay, false if there is a syntax error in the string.
3837  *
3838  * Note that an empty string is considered okay here.
3839  */
3840 bool
3841 SplitDirectoriesString(char *rawstring, char separator,
3842                                            List **namelist)
3843 {
3844         char       *nextp = rawstring;
3845         bool            done = false;
3846
3847         *namelist = NIL;
3848
3849         while (scanner_isspace(*nextp))
3850                 nextp++;                                /* skip leading whitespace */
3851
3852         if (*nextp == '\0')
3853                 return true;                    /* allow empty string */
3854
3855         /* At the top of the loop, we are at start of a new directory. */
3856         do
3857         {
3858                 char       *curname;
3859                 char       *endp;
3860
3861                 if (*nextp == '"')
3862                 {
3863                         /* Quoted name --- collapse quote-quote pairs */
3864                         curname = nextp + 1;
3865                         for (;;)
3866                         {
3867                                 endp = strchr(nextp + 1, '"');
3868                                 if (endp == NULL)
3869                                         return false;   /* mismatched quotes */
3870                                 if (endp[1] != '"')
3871                                         break;          /* found end of quoted name */
3872                                 /* Collapse adjacent quotes into one quote, and look again */
3873                                 memmove(endp, endp + 1, strlen(endp));
3874                                 nextp = endp;
3875                         }
3876                         /* endp now points at the terminating quote */
3877                         nextp = endp + 1;
3878                 }
3879                 else
3880                 {
3881                         /* Unquoted name --- extends to separator or end of string */
3882                         curname = endp = nextp;
3883                         while (*nextp && *nextp != separator)
3884                         {
3885                                 /* trailing whitespace should not be included in name */
3886                                 if (!scanner_isspace(*nextp))
3887                                         endp = nextp + 1;
3888                                 nextp++;
3889                         }
3890                         if (curname == endp)
3891                                 return false;   /* empty unquoted name not allowed */
3892                 }
3893
3894                 while (scanner_isspace(*nextp))
3895                         nextp++;                        /* skip trailing whitespace */
3896
3897                 if (*nextp == separator)
3898                 {
3899                         nextp++;
3900                         while (scanner_isspace(*nextp))
3901                                 nextp++;                /* skip leading whitespace for next */
3902                         /* we expect another name, so done remains false */
3903                 }
3904                 else if (*nextp == '\0')
3905                         done = true;
3906                 else
3907                         return false;           /* invalid syntax */
3908
3909                 /* Now safe to overwrite separator with a null */
3910                 *endp = '\0';
3911
3912                 /* Truncate path if it's overlength */
3913                 if (strlen(curname) >= MAXPGPATH)
3914                         curname[MAXPGPATH - 1] = '\0';
3915
3916                 /*
3917                  * Finished isolating current name --- add it to list
3918                  */
3919                 curname = pstrdup(curname);
3920                 canonicalize_path(curname);
3921                 *namelist = lappend(*namelist, curname);
3922
3923                 /* Loop back if we didn't reach end of string */
3924         } while (!done);
3925
3926         return true;
3927 }
3928
3929
3930 /*
3931  * SplitGUCList --- parse a string containing identifiers or file names
3932  *
3933  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3934  * presuming whether the elements will be taken as identifiers or file names.
3935  * We assume the input has already been through flatten_set_variable_args(),
3936  * so that we need never downcase (if appropriate, that was done already).
3937  * Nor do we ever truncate, since we don't know the correct max length.
3938  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3939  * because any embedded whitespace should have led to double-quoting).
3940  * Otherwise the API is identical to SplitIdentifierString.
3941  *
3942  * XXX it's annoying to have so many copies of this string-splitting logic.
3943  * However, it's not clear that having one function with a bunch of option
3944  * flags would be much better.
3945  *
3946  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3947  * Be sure to update that if you have to change this.
3948  *
3949  * Inputs:
3950  *      rawstring: the input string; must be overwritable!      On return, it's
3951  *                         been modified to contain the separated identifiers.
3952  *      separator: the separator punctuation expected between identifiers
3953  *                         (typically '.' or ',').  Whitespace may also appear around
3954  *                         identifiers.
3955  * Outputs:
3956  *      namelist: filled with a palloc'd list of pointers to identifiers within
3957  *                        rawstring.  Caller should list_free() this even on error return.
3958  *
3959  * Returns true if okay, false if there is a syntax error in the string.
3960  */
3961 bool
3962 SplitGUCList(char *rawstring, char separator,
3963                          List **namelist)
3964 {
3965         char       *nextp = rawstring;
3966         bool            done = false;
3967
3968         *namelist = NIL;
3969
3970         while (scanner_isspace(*nextp))
3971                 nextp++;                                /* skip leading whitespace */
3972
3973         if (*nextp == '\0')
3974                 return true;                    /* allow empty string */
3975
3976         /* At the top of the loop, we are at start of a new identifier. */
3977         do
3978         {
3979                 char       *curname;
3980                 char       *endp;
3981
3982                 if (*nextp == '"')
3983                 {
3984                         /* Quoted name --- collapse quote-quote pairs */
3985                         curname = nextp + 1;
3986                         for (;;)
3987                         {
3988                                 endp = strchr(nextp + 1, '"');
3989                                 if (endp == NULL)
3990                                         return false;   /* mismatched quotes */
3991                                 if (endp[1] != '"')
3992                                         break;          /* found end of quoted name */
3993                                 /* Collapse adjacent quotes into one quote, and look again */
3994                                 memmove(endp, endp + 1, strlen(endp));
3995                                 nextp = endp;
3996                         }
3997                         /* endp now points at the terminating quote */
3998                         nextp = endp + 1;
3999                 }
4000                 else
4001                 {
4002                         /* Unquoted name --- extends to separator or whitespace */
4003                         curname = nextp;
4004                         while (*nextp && *nextp != separator &&
4005                                    !scanner_isspace(*nextp))
4006                                 nextp++;
4007                         endp = nextp;
4008                         if (curname == nextp)
4009                                 return false;   /* empty unquoted name not allowed */
4010                 }
4011
4012                 while (scanner_isspace(*nextp))
4013                         nextp++;                        /* skip trailing whitespace */
4014
4015                 if (*nextp == separator)
4016                 {
4017                         nextp++;
4018                         while (scanner_isspace(*nextp))
4019                                 nextp++;                /* skip leading whitespace for next */
4020                         /* we expect another name, so done remains false */
4021                 }
4022                 else if (*nextp == '\0')
4023                         done = true;
4024                 else
4025                         return false;           /* invalid syntax */
4026
4027                 /* Now safe to overwrite separator with a null */
4028                 *endp = '\0';
4029
4030                 /*
4031                  * Finished isolating current name --- add it to list
4032                  */
4033                 *namelist = lappend(*namelist, curname);
4034
4035                 /* Loop back if we didn't reach end of string */
4036         } while (!done);
4037
4038         return true;
4039 }
4040
4041
4042 /*****************************************************************************
4043  *      Comparison Functions used for bytea
4044  *
4045  * Note: btree indexes need these routines not to leak memory; therefore,
4046  * be careful to free working copies of toasted datums.  Most places don't
4047  * need to be so careful.
4048  *****************************************************************************/
4049
4050 Datum
4051 byteaeq(PG_FUNCTION_ARGS)
4052 {
4053         Datum           arg1 = PG_GETARG_DATUM(0);
4054         Datum           arg2 = PG_GETARG_DATUM(1);
4055         bool            result;
4056         Size            len1,
4057                                 len2;
4058
4059         /*
4060          * We can use a fast path for unequal lengths, which might save us from
4061          * having to detoast one or both values.
4062          */
4063         len1 = toast_raw_datum_size(arg1);
4064         len2 = toast_raw_datum_size(arg2);
4065         if (len1 != len2)
4066                 result = false;
4067         else
4068         {
4069                 bytea      *barg1 = DatumGetByteaPP(arg1);
4070                 bytea      *barg2 = DatumGetByteaPP(arg2);
4071
4072                 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4073                                                  len1 - VARHDRSZ) == 0);
4074
4075                 PG_FREE_IF_COPY(barg1, 0);
4076                 PG_FREE_IF_COPY(barg2, 1);
4077         }
4078
4079         PG_RETURN_BOOL(result);
4080 }
4081
4082 Datum
4083 byteane(PG_FUNCTION_ARGS)
4084 {
4085         Datum           arg1 = PG_GETARG_DATUM(0);
4086         Datum           arg2 = PG_GETARG_DATUM(1);
4087         bool            result;
4088         Size            len1,
4089                                 len2;
4090
4091         /*
4092          * We can use a fast path for unequal lengths, which might save us from
4093          * having to detoast one or both values.
4094          */
4095         len1 = toast_raw_datum_size(arg1);
4096         len2 = toast_raw_datum_size(arg2);
4097         if (len1 != len2)
4098                 result = true;
4099         else
4100         {
4101                 bytea      *barg1 = DatumGetByteaPP(arg1);
4102                 bytea      *barg2 = DatumGetByteaPP(arg2);
4103
4104                 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4105                                                  len1 - VARHDRSZ) != 0);
4106
4107                 PG_FREE_IF_COPY(barg1, 0);
4108                 PG_FREE_IF_COPY(barg2, 1);
4109         }
4110
4111         PG_RETURN_BOOL(result);
4112 }
4113
4114 Datum
4115 bytealt(PG_FUNCTION_ARGS)
4116 {
4117         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4118         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4119         int                     len1,
4120                                 len2;
4121         int                     cmp;
4122
4123         len1 = VARSIZE_ANY_EXHDR(arg1);
4124         len2 = VARSIZE_ANY_EXHDR(arg2);
4125
4126         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4127
4128         PG_FREE_IF_COPY(arg1, 0);
4129         PG_FREE_IF_COPY(arg2, 1);
4130
4131         PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4132 }
4133
4134 Datum
4135 byteale(PG_FUNCTION_ARGS)
4136 {
4137         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4138         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4139         int                     len1,
4140                                 len2;
4141         int                     cmp;
4142
4143         len1 = VARSIZE_ANY_EXHDR(arg1);
4144         len2 = VARSIZE_ANY_EXHDR(arg2);
4145
4146         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4147
4148         PG_FREE_IF_COPY(arg1, 0);
4149         PG_FREE_IF_COPY(arg2, 1);
4150
4151         PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4152 }
4153
4154 Datum
4155 byteagt(PG_FUNCTION_ARGS)
4156 {
4157         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4158         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4159         int                     len1,
4160                                 len2;
4161         int                     cmp;
4162
4163         len1 = VARSIZE_ANY_EXHDR(arg1);
4164         len2 = VARSIZE_ANY_EXHDR(arg2);
4165
4166         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4167
4168         PG_FREE_IF_COPY(arg1, 0);
4169         PG_FREE_IF_COPY(arg2, 1);
4170
4171         PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4172 }
4173
4174 Datum
4175 byteage(PG_FUNCTION_ARGS)
4176 {
4177         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4178         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4179         int                     len1,
4180                                 len2;
4181         int                     cmp;
4182
4183         len1 = VARSIZE_ANY_EXHDR(arg1);
4184         len2 = VARSIZE_ANY_EXHDR(arg2);
4185
4186         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4187
4188         PG_FREE_IF_COPY(arg1, 0);
4189         PG_FREE_IF_COPY(arg2, 1);
4190
4191         PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4192 }
4193
4194 Datum
4195 byteacmp(PG_FUNCTION_ARGS)
4196 {
4197         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
4198         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
4199         int                     len1,
4200                                 len2;
4201         int                     cmp;
4202
4203         len1 = VARSIZE_ANY_EXHDR(arg1);
4204         len2 = VARSIZE_ANY_EXHDR(arg2);
4205
4206         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4207         if ((cmp == 0) && (len1 != len2))
4208                 cmp = (len1 < len2) ? -1 : 1;
4209
4210         PG_FREE_IF_COPY(arg1, 0);
4211         PG_FREE_IF_COPY(arg2, 1);
4212
4213         PG_RETURN_INT32(cmp);
4214 }
4215
4216 Datum
4217 bytea_sortsupport(PG_FUNCTION_ARGS)
4218 {
4219         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4220         MemoryContext oldcontext;
4221
4222         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4223
4224         /* Use generic string SortSupport, forcing "C" collation */
4225         varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4226
4227         MemoryContextSwitchTo(oldcontext);
4228
4229         PG_RETURN_VOID();
4230 }
4231
4232 /*
4233  * appendStringInfoText
4234  *
4235  * Append a text to str.
4236  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4237  */
4238 static void
4239 appendStringInfoText(StringInfo str, const text *t)
4240 {
4241         appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4242 }
4243
4244 /*
4245  * replace_text
4246  * replace all occurrences of 'old_sub_str' in 'orig_str'
4247  * with 'new_sub_str' to form 'new_str'
4248  *
4249  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4250  * otherwise returns 'new_str'
4251  */
4252 Datum
4253 replace_text(PG_FUNCTION_ARGS)
4254 {
4255         text       *src_text = PG_GETARG_TEXT_PP(0);
4256         text       *from_sub_text = PG_GETARG_TEXT_PP(1);
4257         text       *to_sub_text = PG_GETARG_TEXT_PP(2);
4258         int                     src_text_len;
4259         int                     from_sub_text_len;
4260         TextPositionState state;
4261         text       *ret_text;
4262         int                     chunk_len;
4263         char       *curr_ptr;
4264         char       *start_ptr;
4265         StringInfoData str;
4266         bool            found;
4267
4268         src_text_len = VARSIZE_ANY_EXHDR(src_text);
4269         from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4270
4271         /* Return unmodified source string if empty source or pattern */
4272         if (src_text_len < 1 || from_sub_text_len < 1)
4273         {
4274                 PG_RETURN_TEXT_P(src_text);
4275         }
4276
4277         text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4278
4279         found = text_position_next(&state);
4280
4281         /* When the from_sub_text is not found, there is nothing to do. */
4282         if (!found)
4283         {
4284                 text_position_cleanup(&state);
4285                 PG_RETURN_TEXT_P(src_text);
4286         }
4287         curr_ptr = text_position_get_match_ptr(&state);
4288         start_ptr = VARDATA_ANY(src_text);
4289
4290         initStringInfo(&str);
4291
4292         do
4293         {
4294                 CHECK_FOR_INTERRUPTS();
4295
4296                 /* copy the data skipped over by last text_position_next() */
4297                 chunk_len = curr_ptr - start_ptr;
4298                 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4299
4300                 appendStringInfoText(&str, to_sub_text);
4301
4302                 start_ptr = curr_ptr + from_sub_text_len;
4303
4304                 found = text_position_next(&state);
4305                 if (found)
4306                         curr_ptr = text_position_get_match_ptr(&state);
4307         }
4308         while (found);
4309
4310         /* copy trailing data */
4311         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4312         appendBinaryStringInfo(&str, start_ptr, chunk_len);
4313
4314         text_position_cleanup(&state);
4315
4316         ret_text = cstring_to_text_with_len(str.data, str.len);
4317         pfree(str.data);
4318
4319         PG_RETURN_TEXT_P(ret_text);
4320 }
4321
4322 /*
4323  * check_replace_text_has_escape
4324  *
4325  * Returns 0 if text contains no backslashes that need processing.
4326  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4327  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4328  */
4329 static int
4330 check_replace_text_has_escape(const text *replace_text)
4331 {
4332         int                     result = 0;
4333         const char *p = VARDATA_ANY(replace_text);
4334         const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4335
4336         while (p < p_end)
4337         {
4338                 /* Find next escape char, if any. */
4339                 p = memchr(p, '\\', p_end - p);
4340                 if (p == NULL)
4341                         break;
4342                 p++;
4343                 /* Note: a backslash at the end doesn't require extra processing. */
4344                 if (p < p_end)
4345                 {
4346                         if (*p >= '1' && *p <= '9')
4347                                 return 2;               /* Found a submatch specifier, so done */
4348                         result = 1;                     /* Found some other sequence, keep looking */
4349                         p++;
4350                 }
4351         }
4352         return result;
4353 }
4354
4355 /*
4356  * appendStringInfoRegexpSubstr
4357  *
4358  * Append replace_text to str, substituting regexp back references for
4359  * \n escapes.  start_ptr is the start of the match in the source string,
4360  * at logical character position data_pos.
4361  */
4362 static void
4363 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4364                                                          regmatch_t *pmatch,
4365                                                          char *start_ptr, int data_pos)
4366 {
4367         const char *p = VARDATA_ANY(replace_text);
4368         const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4369
4370         while (p < p_end)
4371         {
4372                 const char *chunk_start = p;
4373                 int                     so;
4374                 int                     eo;
4375
4376                 /* Find next escape char, if any. */
4377                 p = memchr(p, '\\', p_end - p);
4378                 if (p == NULL)
4379                         p = p_end;
4380
4381                 /* Copy the text we just scanned over, if any. */
4382                 if (p > chunk_start)
4383                         appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4384
4385                 /* Done if at end of string, else advance over escape char. */
4386                 if (p >= p_end)
4387                         break;
4388                 p++;
4389
4390                 if (p >= p_end)
4391                 {
4392                         /* Escape at very end of input.  Treat same as unexpected char */
4393                         appendStringInfoChar(str, '\\');
4394                         break;
4395                 }
4396
4397                 if (*p >= '1' && *p <= '9')
4398                 {
4399                         /* Use the back reference of regexp. */
4400                         int                     idx = *p - '0';
4401
4402                         so = pmatch[idx].rm_so;
4403                         eo = pmatch[idx].rm_eo;
4404                         p++;
4405                 }
4406                 else if (*p == '&')
4407                 {
4408                         /* Use the entire matched string. */
4409                         so = pmatch[0].rm_so;
4410                         eo = pmatch[0].rm_eo;
4411                         p++;
4412                 }
4413                 else if (*p == '\\')
4414                 {
4415                         /* \\ means transfer one \ to output. */
4416                         appendStringInfoChar(str, '\\');
4417                         p++;
4418                         continue;
4419                 }
4420                 else
4421                 {
4422                         /*
4423                          * If escape char is not followed by any expected char, just treat
4424                          * it as ordinary data to copy.  (XXX would it be better to throw
4425                          * an error?)
4426                          */
4427                         appendStringInfoChar(str, '\\');
4428                         continue;
4429                 }
4430
4431                 if (so >= 0 && eo >= 0)
4432                 {
4433                         /*
4434                          * Copy the text that is back reference of regexp.  Note so and eo
4435                          * are counted in characters not bytes.
4436                          */
4437                         char       *chunk_start;
4438                         int                     chunk_len;
4439
4440                         Assert(so >= data_pos);
4441                         chunk_start = start_ptr;
4442                         chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4443                         chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4444                         appendBinaryStringInfo(str, chunk_start, chunk_len);
4445                 }
4446         }
4447 }
4448
4449 /*
4450  * replace_text_regexp
4451  *
4452  * replace substring(s) in src_text that match pattern with replace_text.
4453  * The replace_text can contain backslash markers to substitute
4454  * (parts of) the matched text.
4455  *
4456  * cflags: regexp compile flags.
4457  * collation: collation to use.
4458  * search_start: the character (not byte) offset in src_text at which to
4459  * begin searching.
4460  * n: if 0, replace all matches; if > 0, replace only the N'th match.
4461  */
4462 text *
4463 replace_text_regexp(text *src_text, text *pattern_text,
4464                                         text *replace_text,
4465                                         int cflags, Oid collation,
4466                                         int search_start, int n)
4467 {
4468         text       *ret_text;
4469         regex_t    *re;
4470         int                     src_text_len = VARSIZE_ANY_EXHDR(src_text);
4471         int                     nmatches = 0;
4472         StringInfoData buf;
4473         regmatch_t      pmatch[10];             /* main match, plus \1 to \9 */
4474         int                     nmatch = lengthof(pmatch);
4475         pg_wchar   *data;
4476         size_t          data_len;
4477         int                     data_pos;
4478         char       *start_ptr;
4479         int                     escape_status;
4480
4481         initStringInfo(&buf);
4482
4483         /* Convert data string to wide characters. */
4484         data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4485         data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4486
4487         /* Check whether replace_text has escapes, especially regexp submatches. */
4488         escape_status = check_replace_text_has_escape(replace_text);
4489
4490         /* If no regexp submatches, we can use REG_NOSUB. */
4491         if (escape_status < 2)
4492         {
4493                 cflags |= REG_NOSUB;
4494                 /* Also tell pg_regexec we only want the whole-match location. */
4495                 nmatch = 1;
4496         }
4497
4498         /* Prepare the regexp. */
4499         re = RE_compile_and_cache(pattern_text, cflags, collation);
4500
4501         /* start_ptr points to the data_pos'th character of src_text */
4502         start_ptr = (char *) VARDATA_ANY(src_text);
4503         data_pos = 0;
4504
4505         while (search_start <= data_len)
4506         {
4507                 int                     regexec_result;
4508
4509                 CHECK_FOR_INTERRUPTS();
4510
4511                 regexec_result = pg_regexec(re,
4512                                                                         data,
4513                                                                         data_len,
4514                                                                         search_start,
4515                                                                         NULL,   /* no details */
4516                                                                         nmatch,
4517                                                                         pmatch,
4518                                                                         0);
4519
4520                 if (regexec_result == REG_NOMATCH)
4521                         break;
4522
4523                 if (regexec_result != REG_OKAY)
4524                 {
4525                         char            errMsg[100];
4526
4527                         CHECK_FOR_INTERRUPTS();
4528                         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4529                         ereport(ERROR,
4530                                         (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4531                                          errmsg("regular expression failed: %s", errMsg)));
4532                 }
4533
4534                 /*
4535                  * Count matches, and decide whether to replace this match.
4536                  */
4537                 nmatches++;
4538                 if (n > 0 && nmatches != n)
4539                 {
4540                         /*
4541                          * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4542                          * we treat the matched text as if it weren't matched, and copy it
4543                          * to the output later.)
4544                          */
4545                         search_start = pmatch[0].rm_eo;
4546                         if (pmatch[0].rm_so == pmatch[0].rm_eo)
4547                                 search_start++;
4548                         continue;
4549                 }
4550
4551                 /*
4552                  * Copy the text to the left of the match position.  Note we are given
4553                  * character not byte indexes.
4554                  */
4555                 if (pmatch[0].rm_so - data_pos > 0)
4556                 {
4557                         int                     chunk_len;
4558
4559                         chunk_len = charlen_to_bytelen(start_ptr,
4560                                                                                    pmatch[0].rm_so - data_pos);
4561                         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4562
4563                         /*
4564                          * Advance start_ptr over that text, to avoid multiple rescans of
4565                          * it if the replace_text contains multiple back-references.
4566                          */
4567                         start_ptr += chunk_len;
4568                         data_pos = pmatch[0].rm_so;
4569                 }
4570
4571                 /*
4572                  * Copy the replace_text, processing escapes if any are present.
4573                  */
4574                 if (escape_status > 0)
4575                         appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4576                                                                                  start_ptr, data_pos);
4577                 else
4578                         appendStringInfoText(&buf, replace_text);
4579
4580                 /* Advance start_ptr and data_pos over the matched text. */
4581                 start_ptr += charlen_to_bytelen(start_ptr,
4582                                                                                 pmatch[0].rm_eo - data_pos);
4583                 data_pos = pmatch[0].rm_eo;
4584
4585                 /*
4586                  * If we only want to replace one occurrence, we're done.
4587                  */
4588                 if (n > 0)
4589                         break;
4590
4591                 /*
4592                  * Advance search position.  Normally we start the next search at the
4593                  * end of the previous match; but if the match was of zero length, we
4594                  * have to advance by one character, or we'd just find the same match
4595                  * again.
4596                  */
4597                 search_start = data_pos;
4598                 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4599                         search_start++;
4600         }
4601
4602         /*
4603          * Copy the text to the right of the last match.
4604          */
4605         if (data_pos < data_len)
4606         {
4607                 int                     chunk_len;
4608
4609                 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4610                 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4611         }
4612
4613         ret_text = cstring_to_text_with_len(buf.data, buf.len);
4614         pfree(buf.data);
4615         pfree(data);
4616
4617         return ret_text;
4618 }
4619
4620 /*
4621  * split_part
4622  * parse input string based on provided field separator
4623  * return N'th item (1 based, negative counts from end)
4624  */
4625 Datum
4626 split_part(PG_FUNCTION_ARGS)
4627 {
4628         text       *inputstring = PG_GETARG_TEXT_PP(0);
4629         text       *fldsep = PG_GETARG_TEXT_PP(1);
4630         int                     fldnum = PG_GETARG_INT32(2);
4631         int                     inputstring_len;
4632         int                     fldsep_len;
4633         TextPositionState state;
4634         char       *start_ptr;
4635         char       *end_ptr;
4636         text       *result_text;
4637         bool            found;
4638
4639         /* field number is 1 based */
4640         if (fldnum == 0)
4641                 ereport(ERROR,
4642                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4643                                  errmsg("field position must not be zero")));
4644
4645         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4646         fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4647
4648         /* return empty string for empty input string */
4649         if (inputstring_len < 1)
4650                 PG_RETURN_TEXT_P(cstring_to_text(""));
4651
4652         /* handle empty field separator */
4653         if (fldsep_len < 1)
4654         {
4655                 /* if first or last field, return input string, else empty string */
4656                 if (fldnum == 1 || fldnum == -1)
4657                         PG_RETURN_TEXT_P(inputstring);
4658                 else
4659                         PG_RETURN_TEXT_P(cstring_to_text(""));
4660         }
4661
4662         /* find the first field separator */
4663         text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4664
4665         found = text_position_next(&state);
4666
4667         /* special case if fldsep not found at all */
4668         if (!found)
4669         {
4670                 text_position_cleanup(&state);
4671                 /* if first or last field, return input string, else empty string */
4672                 if (fldnum == 1 || fldnum == -1)
4673                         PG_RETURN_TEXT_P(inputstring);
4674                 else
4675                         PG_RETURN_TEXT_P(cstring_to_text(""));
4676         }
4677
4678         /*
4679          * take care of a negative field number (i.e. count from the right) by
4680          * converting to a positive field number; we need total number of fields
4681          */
4682         if (fldnum < 0)
4683         {
4684                 /* we found a fldsep, so there are at least two fields */
4685                 int                     numfields = 2;
4686
4687                 while (text_position_next(&state))
4688                         numfields++;
4689
4690                 /* special case of last field does not require an extra pass */
4691                 if (fldnum == -1)
4692                 {
4693                         start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4694                         end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4695                         text_position_cleanup(&state);
4696                         PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4697                                                                                                           end_ptr - start_ptr));
4698                 }
4699
4700                 /* else, convert fldnum to positive notation */
4701                 fldnum += numfields + 1;
4702
4703                 /* if nonexistent field, return empty string */
4704                 if (fldnum <= 0)
4705                 {
4706                         text_position_cleanup(&state);
4707                         PG_RETURN_TEXT_P(cstring_to_text(""));
4708                 }
4709
4710                 /* reset to pointing at first match, but now with positive fldnum */
4711                 text_position_reset(&state);
4712                 found = text_position_next(&state);
4713                 Assert(found);
4714         }
4715
4716         /* identify bounds of first field */
4717         start_ptr = VARDATA_ANY(inputstring);
4718         end_ptr = text_position_get_match_ptr(&state);
4719
4720         while (found && --fldnum > 0)
4721         {
4722                 /* identify bounds of next field */
4723                 start_ptr = end_ptr + fldsep_len;
4724                 found = text_position_next(&state);
4725                 if (found)
4726                         end_ptr = text_position_get_match_ptr(&state);
4727         }
4728
4729         text_position_cleanup(&state);
4730
4731         if (fldnum > 0)
4732         {
4733                 /* N'th field separator not found */
4734                 /* if last field requested, return it, else empty string */
4735                 if (fldnum == 1)
4736                 {
4737                         int                     last_len = start_ptr - VARDATA_ANY(inputstring);
4738
4739                         result_text = cstring_to_text_with_len(start_ptr,
4740                                                                                                    inputstring_len - last_len);
4741                 }
4742                 else
4743                         result_text = cstring_to_text("");
4744         }
4745         else
4746         {
4747                 /* non-last field requested */
4748                 result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4749         }
4750
4751         PG_RETURN_TEXT_P(result_text);
4752 }
4753
4754 /*
4755  * Convenience function to return true when two text params are equal.
4756  */
4757 static bool
4758 text_isequal(text *txt1, text *txt2, Oid collid)
4759 {
4760         return DatumGetBool(DirectFunctionCall2Coll(texteq,
4761                                                                                                 collid,
4762                                                                                                 PointerGetDatum(txt1),
4763                                                                                                 PointerGetDatum(txt2)));
4764 }
4765
4766 /*
4767  * text_to_array
4768  * parse input string and return text array of elements,
4769  * based on provided field separator
4770  */
4771 Datum
4772 text_to_array(PG_FUNCTION_ARGS)
4773 {
4774         SplitTextOutputData tstate;
4775
4776         /* For array output, tstate should start as all zeroes */
4777         memset(&tstate, 0, sizeof(tstate));
4778
4779         if (!split_text(fcinfo, &tstate))
4780                 PG_RETURN_NULL();
4781
4782         if (tstate.astate == NULL)
4783                 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4784
4785         PG_RETURN_DATUM(makeArrayResult(tstate.astate,
4786                                                                                   CurrentMemoryContext));
4787 }
4788
4789 /*
4790  * text_to_array_null
4791  * parse input string and return text array of elements,
4792  * based on provided field separator and null string
4793  *
4794  * This is a separate entry point only to prevent the regression tests from
4795  * complaining about different argument sets for the same internal function.
4796  */
4797 Datum
4798 text_to_array_null(PG_FUNCTION_ARGS)
4799 {
4800         return text_to_array(fcinfo);
4801 }
4802
4803 /*
4804  * text_to_table
4805  * parse input string and return table of elements,
4806  * based on provided field separator
4807  */
4808 Datum
4809 text_to_table(PG_FUNCTION_ARGS)
4810 {
4811         ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4812         SplitTextOutputData tstate;
4813
4814         tstate.astate = NULL;
4815         InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
4816         tstate.tupstore = rsi->setResult;
4817         tstate.tupdesc = rsi->setDesc;
4818
4819         (void) split_text(fcinfo, &tstate);
4820
4821         return (Datum) 0;
4822 }
4823
4824 /*
4825  * text_to_table_null
4826  * parse input string and return table of elements,
4827  * based on provided field separator and null string
4828  *
4829  * This is a separate entry point only to prevent the regression tests from
4830  * complaining about different argument sets for the same internal function.
4831  */
4832 Datum
4833 text_to_table_null(PG_FUNCTION_ARGS)
4834 {
4835         return text_to_table(fcinfo);
4836 }
4837
4838 /*
4839  * Common code for text_to_array, text_to_array_null, text_to_table
4840  * and text_to_table_null functions.
4841  *
4842  * These are not strict so we have to test for null inputs explicitly.
4843  * Returns false if result is to be null, else returns true.
4844  *
4845  * Note that if the result is valid but empty (zero elements), we return
4846  * without changing *tstate --- caller must handle that case, too.
4847  */
4848 static bool
4849 split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4850 {
4851         text       *inputstring;
4852         text       *fldsep;
4853         text       *null_string;
4854         Oid                     collation = PG_GET_COLLATION();
4855         int                     inputstring_len;
4856         int                     fldsep_len;
4857         char       *start_ptr;
4858         text       *result_text;
4859
4860         /* when input string is NULL, then result is NULL too */
4861         if (PG_ARGISNULL(0))
4862                 return false;
4863
4864         inputstring = PG_GETARG_TEXT_PP(0);
4865
4866         /* fldsep can be NULL */
4867         if (!PG_ARGISNULL(1))
4868                 fldsep = PG_GETARG_TEXT_PP(1);
4869         else
4870                 fldsep = NULL;
4871
4872         /* null_string can be NULL or omitted */
4873         if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4874                 null_string = PG_GETARG_TEXT_PP(2);
4875         else
4876                 null_string = NULL;
4877
4878         if (fldsep != NULL)
4879         {
4880                 /*
4881                  * Normal case with non-null fldsep.  Use the text_position machinery
4882                  * to search for occurrences of fldsep.
4883                  */
4884                 TextPositionState state;
4885
4886                 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4887                 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4888
4889                 /* return empty set for empty input string */
4890                 if (inputstring_len < 1)
4891                         return true;
4892
4893                 /* empty field separator: return input string as a one-element set */
4894                 if (fldsep_len < 1)
4895                 {
4896                         split_text_accum_result(tstate, inputstring,
4897                                                                         null_string, collation);
4898                         return true;
4899                 }
4900
4901                 text_position_setup(inputstring, fldsep, collation, &state);
4902
4903                 start_ptr = VARDATA_ANY(inputstring);
4904
4905                 for (;;)
4906                 {
4907                         bool            found;
4908                         char       *end_ptr;
4909                         int                     chunk_len;
4910
4911                         CHECK_FOR_INTERRUPTS();
4912
4913                         found = text_position_next(&state);
4914                         if (!found)
4915                         {
4916                                 /* fetch last field */
4917                                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4918                                 end_ptr = NULL; /* not used, but some compilers complain */
4919                         }
4920                         else
4921                         {
4922                                 /* fetch non-last field */
4923                                 end_ptr = text_position_get_match_ptr(&state);
4924                                 chunk_len = end_ptr - start_ptr;
4925                         }
4926
4927                         /* build a temp text datum to pass to split_text_accum_result */
4928                         result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4929
4930                         /* stash away this field */
4931                         split_text_accum_result(tstate, result_text,
4932                                                                         null_string, collation);
4933
4934                         pfree(result_text);
4935
4936                         if (!found)
4937                                 break;
4938
4939                         start_ptr = end_ptr + fldsep_len;
4940                 }
4941
4942                 text_position_cleanup(&state);
4943         }
4944         else
4945         {
4946                 /*
4947                  * When fldsep is NULL, each character in the input string becomes a
4948                  * separate element in the result set.  The separator is effectively
4949                  * the space between characters.
4950                  */
4951                 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4952
4953                 start_ptr = VARDATA_ANY(inputstring);
4954
4955                 while (inputstring_len > 0)
4956                 {
4957                         int                     chunk_len = pg_mblen(start_ptr);
4958
4959                         CHECK_FOR_INTERRUPTS();
4960
4961                         /* build a temp text datum to pass to split_text_accum_result */
4962                         result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4963
4964                         /* stash away this field */
4965                         split_text_accum_result(tstate, result_text,
4966                                                                         null_string, collation);
4967
4968                         pfree(result_text);
4969
4970                         start_ptr += chunk_len;
4971                         inputstring_len -= chunk_len;
4972                 }
4973         }
4974
4975         return true;
4976 }
4977
4978 /*
4979  * Add text item to result set (table or array).
4980  *
4981  * This is also responsible for checking to see if the item matches
4982  * the null_string, in which case we should emit NULL instead.
4983  */
4984 static void
4985 split_text_accum_result(SplitTextOutputData *tstate,
4986                                                 text *field_value,
4987                                                 text *null_string,
4988                                                 Oid collation)
4989 {
4990         bool            is_null = false;
4991
4992         if (null_string && text_isequal(field_value, null_string, collation))
4993                 is_null = true;
4994
4995         if (tstate->tupstore)
4996         {
4997                 Datum           values[1];
4998                 bool            nulls[1];
4999
5000                 values[0] = PointerGetDatum(field_value);
5001                 nulls[0] = is_null;
5002
5003                 tuplestore_putvalues(tstate->tupstore,
5004                                                          tstate->tupdesc,
5005                                                          values,
5006                                                          nulls);
5007         }
5008         else
5009         {
5010                 tstate->astate = accumArrayResult(tstate->astate,
5011                                                                                   PointerGetDatum(field_value),
5012                                                                                   is_null,
5013                                                                                   TEXTOID,
5014                                                                                   CurrentMemoryContext);
5015         }
5016 }
5017
5018 /*
5019  * array_to_text
5020  * concatenate Cstring representation of input array elements
5021  * using provided field separator
5022  */
5023 Datum
5024 array_to_text(PG_FUNCTION_ARGS)
5025 {
5026         ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
5027         char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5028
5029         PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5030 }
5031
5032 /*
5033  * array_to_text_null
5034  * concatenate Cstring representation of input array elements
5035  * using provided field separator and null string
5036  *
5037  * This version is not strict so we have to test for null inputs explicitly.
5038  */
5039 Datum
5040 array_to_text_null(PG_FUNCTION_ARGS)
5041 {
5042         ArrayType  *v;
5043         char       *fldsep;
5044         char       *null_string;
5045
5046         /* returns NULL when first or second parameter is NULL */
5047         if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5048                 PG_RETURN_NULL();
5049
5050         v = PG_GETARG_ARRAYTYPE_P(0);
5051         fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5052
5053         /* NULL null string is passed through as a null pointer */
5054         if (!PG_ARGISNULL(2))
5055                 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5056         else
5057                 null_string = NULL;
5058
5059         PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5060 }
5061
5062 /*
5063  * common code for array_to_text and array_to_text_null functions
5064  */
5065 static text *
5066 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
5067                                            const char *fldsep, const char *null_string)
5068 {
5069         text       *result;
5070         int                     nitems,
5071                            *dims,
5072                                 ndims;
5073         Oid                     element_type;
5074         int                     typlen;
5075         bool            typbyval;
5076         char            typalign;
5077         StringInfoData buf;
5078         bool            printed = false;
5079         char       *p;
5080         bits8      *bitmap;
5081         int                     bitmask;
5082         int                     i;
5083         ArrayMetaState *my_extra;
5084
5085         ndims = ARR_NDIM(v);
5086         dims = ARR_DIMS(v);
5087         nitems = ArrayGetNItems(ndims, dims);
5088
5089         /* if there are no elements, return an empty string */
5090         if (nitems == 0)
5091                 return cstring_to_text_with_len("", 0);
5092
5093         element_type = ARR_ELEMTYPE(v);
5094         initStringInfo(&buf);
5095
5096         /*
5097          * We arrange to look up info about element type, including its output
5098          * conversion proc, only once per series of calls, assuming the element
5099          * type doesn't change underneath us.
5100          */
5101         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5102         if (my_extra == NULL)
5103         {
5104                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5105                                                                                                           sizeof(ArrayMetaState));
5106                 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5107                 my_extra->element_type = ~element_type;
5108         }
5109
5110         if (my_extra->element_type != element_type)
5111         {
5112                 /*
5113                  * Get info about element type, including its output conversion proc
5114                  */
5115                 get_type_io_data(element_type, IOFunc_output,
5116                                                  &my_extra->typlen, &my_extra->typbyval,
5117                                                  &my_extra->typalign, &my_extra->typdelim,
5118                                                  &my_extra->typioparam, &my_extra->typiofunc);
5119                 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5120                                           fcinfo->flinfo->fn_mcxt);
5121                 my_extra->element_type = element_type;
5122         }
5123         typlen = my_extra->typlen;
5124         typbyval = my_extra->typbyval;
5125         typalign = my_extra->typalign;
5126
5127         p = ARR_DATA_PTR(v);
5128         bitmap = ARR_NULLBITMAP(v);
5129         bitmask = 1;
5130
5131         for (i = 0; i < nitems; i++)
5132         {
5133                 Datum           itemvalue;
5134                 char       *value;
5135
5136                 /* Get source element, checking for NULL */
5137                 if (bitmap && (*bitmap & bitmask) == 0)
5138                 {
5139                         /* if null_string is NULL, we just ignore null elements */
5140                         if (null_string != NULL)
5141                         {
5142                                 if (printed)
5143                                         appendStringInfo(&buf, "%s%s", fldsep, null_string);
5144                                 else
5145                                         appendStringInfoString(&buf, null_string);
5146                                 printed = true;
5147                         }
5148                 }
5149                 else
5150                 {
5151                         itemvalue = fetch_att(p, typbyval, typlen);
5152
5153                         value = OutputFunctionCall(&my_extra->proc, itemvalue);
5154
5155                         if (printed)
5156                                 appendStringInfo(&buf, "%s%s", fldsep, value);
5157                         else
5158                                 appendStringInfoString(&buf, value);
5159                         printed = true;
5160
5161                         p = att_addlength_pointer(p, typlen, p);
5162                         p = (char *) att_align_nominal(p, typalign);
5163                 }
5164
5165                 /* advance bitmap pointer if any */
5166                 if (bitmap)
5167                 {
5168                         bitmask <<= 1;
5169                         if (bitmask == 0x100)
5170                         {
5171                                 bitmap++;
5172                                 bitmask = 1;
5173                         }
5174                 }
5175         }
5176
5177         result = cstring_to_text_with_len(buf.data, buf.len);
5178         pfree(buf.data);
5179
5180         return result;
5181 }
5182
5183 #define HEXBASE 16
5184 /*
5185  * Convert an int32 to a string containing a base 16 (hex) representation of
5186  * the number.
5187  */
5188 Datum
5189 to_hex32(PG_FUNCTION_ARGS)
5190 {
5191         uint32          value = (uint32) PG_GETARG_INT32(0);
5192         char       *ptr;
5193         const char *digits = "0123456789abcdef";
5194         char            buf[32];                /* bigger than needed, but reasonable */
5195
5196         ptr = buf + sizeof(buf) - 1;
5197         *ptr = '\0';
5198
5199         do
5200         {
5201                 *--ptr = digits[value % HEXBASE];
5202                 value /= HEXBASE;
5203         } while (ptr > buf && value);
5204
5205         PG_RETURN_TEXT_P(cstring_to_text(ptr));
5206 }
5207
5208 /*
5209  * Convert an int64 to a string containing a base 16 (hex) representation of
5210  * the number.
5211  */
5212 Datum
5213 to_hex64(PG_FUNCTION_ARGS)
5214 {
5215         uint64          value = (uint64) PG_GETARG_INT64(0);
5216         char       *ptr;
5217         const char *digits = "0123456789abcdef";
5218         char            buf[32];                /* bigger than needed, but reasonable */
5219
5220         ptr = buf + sizeof(buf) - 1;
5221         *ptr = '\0';
5222
5223         do
5224         {
5225                 *--ptr = digits[value % HEXBASE];
5226                 value /= HEXBASE;
5227         } while (ptr > buf && value);
5228
5229         PG_RETURN_TEXT_P(cstring_to_text(ptr));
5230 }
5231
5232 /*
5233  * Return the size of a datum, possibly compressed
5234  *
5235  * Works on any data type
5236  */
5237 Datum
5238 pg_column_size(PG_FUNCTION_ARGS)
5239 {
5240         Datum           value = PG_GETARG_DATUM(0);
5241         int32           result;
5242         int                     typlen;
5243
5244         /* On first call, get the input type's typlen, and save at *fn_extra */
5245         if (fcinfo->flinfo->fn_extra == NULL)
5246         {
5247                 /* Lookup the datatype of the supplied argument */
5248                 Oid                     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5249
5250                 typlen = get_typlen(argtypeid);
5251                 if (typlen == 0)                /* should not happen */
5252                         elog(ERROR, "cache lookup failed for type %u", argtypeid);
5253
5254                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5255                                                                                                           sizeof(int));
5256                 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5257         }
5258         else
5259                 typlen = *((int *) fcinfo->flinfo->fn_extra);
5260
5261         if (typlen == -1)
5262         {
5263                 /* varlena type, possibly toasted */
5264                 result = toast_datum_size(value);
5265         }
5266         else if (typlen == -2)
5267         {
5268                 /* cstring */
5269                 result = strlen(DatumGetCString(value)) + 1;
5270         }
5271         else
5272         {
5273                 /* ordinary fixed-width type */
5274                 result = typlen;
5275         }
5276
5277         PG_RETURN_INT32(result);
5278 }
5279
5280 /*
5281  * Return the compression method stored in the compressed attribute.  Return
5282  * NULL for non varlena type or uncompressed data.
5283  */
5284 Datum
5285 pg_column_compression(PG_FUNCTION_ARGS)
5286 {
5287         int                     typlen;
5288         char       *result;
5289         ToastCompressionId cmid;
5290
5291         /* On first call, get the input type's typlen, and save at *fn_extra */
5292         if (fcinfo->flinfo->fn_extra == NULL)
5293         {
5294                 /* Lookup the datatype of the supplied argument */
5295                 Oid                     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5296
5297                 typlen = get_typlen(argtypeid);
5298                 if (typlen == 0)                /* should not happen */
5299                         elog(ERROR, "cache lookup failed for type %u", argtypeid);
5300
5301                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5302                                                                                                           sizeof(int));
5303                 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5304         }
5305         else
5306                 typlen = *((int *) fcinfo->flinfo->fn_extra);
5307
5308         if (typlen != -1)
5309                 PG_RETURN_NULL();
5310
5311         /* get the compression method id stored in the compressed varlena */
5312         cmid = toast_get_compression_id((struct varlena *)
5313                                                                         DatumGetPointer(PG_GETARG_DATUM(0)));
5314         if (cmid == TOAST_INVALID_COMPRESSION_ID)
5315                 PG_RETURN_NULL();
5316
5317         /* convert compression method id to compression method name */
5318         switch (cmid)
5319         {
5320                 case TOAST_PGLZ_COMPRESSION_ID:
5321                         result = "pglz";
5322                         break;
5323                 case TOAST_LZ4_COMPRESSION_ID:
5324                         result = "lz4";
5325                         break;
5326                 default:
5327                         elog(ERROR, "invalid compression method id %d", cmid);
5328         }
5329
5330         PG_RETURN_TEXT_P(cstring_to_text(result));
5331 }
5332
5333 /*
5334  * string_agg - Concatenates values and returns string.
5335  *
5336  * Syntax: string_agg(value text, delimiter text) RETURNS text
5337  *
5338  * Note: Any NULL values are ignored. The first-call delimiter isn't
5339  * actually used at all, and on subsequent calls the delimiter precedes
5340  * the associated value.
5341  */
5342
5343 /* subroutine to initialize state */
5344 static StringInfo
5345 makeStringAggState(FunctionCallInfo fcinfo)
5346 {
5347         StringInfo      state;
5348         MemoryContext aggcontext;
5349         MemoryContext oldcontext;
5350
5351         if (!AggCheckCallContext(fcinfo, &aggcontext))
5352         {
5353                 /* cannot be called directly because of internal-type argument */
5354                 elog(ERROR, "string_agg_transfn called in non-aggregate context");
5355         }
5356
5357         /*
5358          * Create state in aggregate context.  It'll stay there across subsequent
5359          * calls.
5360          */
5361         oldcontext = MemoryContextSwitchTo(aggcontext);
5362         state = makeStringInfo();
5363         MemoryContextSwitchTo(oldcontext);
5364
5365         return state;
5366 }
5367
5368 Datum
5369 string_agg_transfn(PG_FUNCTION_ARGS)
5370 {
5371         StringInfo      state;
5372
5373         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5374
5375         /* Append the value unless null. */
5376         if (!PG_ARGISNULL(1))
5377         {
5378                 /* On the first time through, we ignore the delimiter. */
5379                 if (state == NULL)
5380                         state = makeStringAggState(fcinfo);
5381                 else if (!PG_ARGISNULL(2))
5382                         appendStringInfoText(state, PG_GETARG_TEXT_PP(2));      /* delimiter */
5383
5384                 appendStringInfoText(state, PG_GETARG_TEXT_PP(1));      /* value */
5385         }
5386
5387         /*
5388          * The transition type for string_agg() is declared to be "internal",
5389          * which is a pass-by-value type the same size as a pointer.
5390          */
5391         PG_RETURN_POINTER(state);
5392 }
5393
5394 Datum
5395 string_agg_finalfn(PG_FUNCTION_ARGS)
5396 {
5397         StringInfo      state;
5398
5399         /* cannot be called directly because of internal-type argument */
5400         Assert(AggCheckCallContext(fcinfo, NULL));
5401
5402         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5403
5404         if (state != NULL)
5405                 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5406         else
5407                 PG_RETURN_NULL();
5408 }
5409
5410 /*
5411  * Prepare cache with fmgr info for the output functions of the datatypes of
5412  * the arguments of a concat-like function, beginning with argument "argidx".
5413  * (Arguments before that will have corresponding slots in the resulting
5414  * FmgrInfo array, but we don't fill those slots.)
5415  */
5416 static FmgrInfo *
5417 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5418 {
5419         FmgrInfo   *foutcache;
5420         int                     i;
5421
5422         /* We keep the info in fn_mcxt so it survives across calls */
5423         foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5424                                                                                                 PG_NARGS() * sizeof(FmgrInfo));
5425
5426         for (i = argidx; i < PG_NARGS(); i++)
5427         {
5428                 Oid                     valtype;
5429                 Oid                     typOutput;
5430                 bool            typIsVarlena;
5431
5432                 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5433                 if (!OidIsValid(valtype))
5434                         elog(ERROR, "could not determine data type of concat() input");
5435
5436                 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5437                 fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5438         }
5439
5440         fcinfo->flinfo->fn_extra = foutcache;
5441
5442         return foutcache;
5443 }
5444
5445 /*
5446  * Implementation of both concat() and concat_ws().
5447  *
5448  * sepstr is the separator string to place between values.
5449  * argidx identifies the first argument to concatenate (counting from zero);
5450  * note that this must be constant across any one series of calls.
5451  *
5452  * Returns NULL if result should be NULL, else text value.
5453  */
5454 static text *
5455 concat_internal(const char *sepstr, int argidx,
5456                                 FunctionCallInfo fcinfo)
5457 {
5458         text       *result;
5459         StringInfoData str;
5460         FmgrInfo   *foutcache;
5461         bool            first_arg = true;
5462         int                     i;
5463
5464         /*
5465          * concat(VARIADIC some-array) is essentially equivalent to
5466          * array_to_text(), ie concat the array elements with the given separator.
5467          * So we just pass the case off to that code.
5468          */
5469         if (get_fn_expr_variadic(fcinfo->flinfo))
5470         {
5471                 ArrayType  *arr;
5472
5473                 /* Should have just the one argument */
5474                 Assert(argidx == PG_NARGS() - 1);
5475
5476                 /* concat(VARIADIC NULL) is defined as NULL */
5477                 if (PG_ARGISNULL(argidx))
5478                         return NULL;
5479
5480                 /*
5481                  * Non-null argument had better be an array.  We assume that any call
5482                  * context that could let get_fn_expr_variadic return true will have
5483                  * checked that a VARIADIC-labeled parameter actually is an array.  So
5484                  * it should be okay to just Assert that it's an array rather than
5485                  * doing a full-fledged error check.
5486                  */
5487                 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5488
5489                 /* OK, safe to fetch the array value */
5490                 arr = PG_GETARG_ARRAYTYPE_P(argidx);
5491
5492                 /*
5493                  * And serialize the array.  We tell array_to_text to ignore null
5494                  * elements, which matches the behavior of the loop below.
5495                  */
5496                 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5497         }
5498
5499         /* Normal case without explicit VARIADIC marker */
5500         initStringInfo(&str);
5501
5502         /* Get output function info, building it if first time through */
5503         foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5504         if (foutcache == NULL)
5505                 foutcache = build_concat_foutcache(fcinfo, argidx);
5506
5507         for (i = argidx; i < PG_NARGS(); i++)
5508         {
5509                 if (!PG_ARGISNULL(i))
5510                 {
5511                         Datum           value = PG_GETARG_DATUM(i);
5512
5513                         /* add separator if appropriate */
5514                         if (first_arg)
5515                                 first_arg = false;
5516                         else
5517                                 appendStringInfoString(&str, sepstr);
5518
5519                         /* call the appropriate type output function, append the result */
5520                         appendStringInfoString(&str,
5521                                                                    OutputFunctionCall(&foutcache[i], value));
5522                 }
5523         }
5524
5525         result = cstring_to_text_with_len(str.data, str.len);
5526         pfree(str.data);
5527
5528         return result;
5529 }
5530
5531 /*
5532  * Concatenate all arguments. NULL arguments are ignored.
5533  */
5534 Datum
5535 text_concat(PG_FUNCTION_ARGS)
5536 {
5537         text       *result;
5538
5539         result = concat_internal("", 0, fcinfo);
5540         if (result == NULL)
5541                 PG_RETURN_NULL();
5542         PG_RETURN_TEXT_P(result);
5543 }
5544
5545 /*
5546  * Concatenate all but first argument value with separators. The first
5547  * parameter is used as the separator. NULL arguments are ignored.
5548  */
5549 Datum
5550 text_concat_ws(PG_FUNCTION_ARGS)
5551 {
5552         char       *sep;
5553         text       *result;
5554
5555         /* return NULL when separator is NULL */
5556         if (PG_ARGISNULL(0))
5557                 PG_RETURN_NULL();
5558         sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5559
5560         result = concat_internal(sep, 1, fcinfo);
5561         if (result == NULL)
5562                 PG_RETURN_NULL();
5563         PG_RETURN_TEXT_P(result);
5564 }
5565
5566 /*
5567  * Return first n characters in the string. When n is negative,
5568  * return all but last |n| characters.
5569  */
5570 Datum
5571 text_left(PG_FUNCTION_ARGS)
5572 {
5573         int                     n = PG_GETARG_INT32(1);
5574
5575         if (n < 0)
5576         {
5577                 text       *str = PG_GETARG_TEXT_PP(0);
5578                 const char *p = VARDATA_ANY(str);
5579                 int                     len = VARSIZE_ANY_EXHDR(str);
5580                 int                     rlen;
5581
5582                 n = pg_mbstrlen_with_len(p, len) + n;
5583                 rlen = pg_mbcharcliplen(p, len, n);
5584                 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5585         }
5586         else
5587                 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5588 }
5589
5590 /*
5591  * Return last n characters in the string. When n is negative,
5592  * return all but first |n| characters.
5593  */
5594 Datum
5595 text_right(PG_FUNCTION_ARGS)
5596 {
5597         text       *str = PG_GETARG_TEXT_PP(0);
5598         const char *p = VARDATA_ANY(str);
5599         int                     len = VARSIZE_ANY_EXHDR(str);
5600         int                     n = PG_GETARG_INT32(1);
5601         int                     off;
5602
5603         if (n < 0)
5604                 n = -n;
5605         else
5606                 n = pg_mbstrlen_with_len(p, len) - n;
5607         off = pg_mbcharcliplen(p, len, n);
5608
5609         PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5610 }
5611
5612 /*
5613  * Return reversed string
5614  */
5615 Datum
5616 text_reverse(PG_FUNCTION_ARGS)
5617 {
5618         text       *str = PG_GETARG_TEXT_PP(0);
5619         const char *p = VARDATA_ANY(str);
5620         int                     len = VARSIZE_ANY_EXHDR(str);
5621         const char *endp = p + len;
5622         text       *result;
5623         char       *dst;
5624
5625         result = palloc(len + VARHDRSZ);
5626         dst = (char *) VARDATA(result) + len;
5627         SET_VARSIZE(result, len + VARHDRSZ);
5628
5629         if (pg_database_encoding_max_length() > 1)
5630         {
5631                 /* multibyte version */
5632                 while (p < endp)
5633                 {
5634                         int                     sz;
5635
5636                         sz = pg_mblen(p);
5637                         dst -= sz;
5638                         memcpy(dst, p, sz);
5639                         p += sz;
5640                 }
5641         }
5642         else
5643         {
5644                 /* single byte version */
5645                 while (p < endp)
5646                         *(--dst) = *p++;
5647         }
5648
5649         PG_RETURN_TEXT_P(result);
5650 }
5651
5652
/*
 * Support macros for text_format()
 *
 * TEXT_FORMAT_FLAG_MINUS is the bit set in a conversion's flags word when
 * the '-' flag was given.
 *
 * ADVANCE_PARSE_POINTER(ptr, end_ptr) moves "ptr" forward by one byte and
 * raises an error if that leaves nothing more to read before "end_ptr".
 * The format-string parsers rely on it to keep at least one character
 * available at all times.  "ptr" must be a modifiable lvalue.
 */
#define TEXT_FORMAT_FLAG_MINUS	(1 << 0)	/* is minus flag present? */

#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5666
5667 /*
5668  * Returns a formatted string
5669  */
5670 Datum
5671 text_format(PG_FUNCTION_ARGS)
5672 {
5673         text       *fmt;
5674         StringInfoData str;
5675         const char *cp;
5676         const char *start_ptr;
5677         const char *end_ptr;
5678         text       *result;
5679         int                     arg;
5680         bool            funcvariadic;
5681         int                     nargs;
5682         Datum      *elements = NULL;
5683         bool       *nulls = NULL;
5684         Oid                     element_type = InvalidOid;
5685         Oid                     prev_type = InvalidOid;
5686         Oid                     prev_width_type = InvalidOid;
5687         FmgrInfo        typoutputfinfo;
5688         FmgrInfo        typoutputinfo_width;
5689
5690         /* When format string is null, immediately return null */
5691         if (PG_ARGISNULL(0))
5692                 PG_RETURN_NULL();
5693
5694         /* If argument is marked VARIADIC, expand array into elements */
5695         if (get_fn_expr_variadic(fcinfo->flinfo))
5696         {
5697                 ArrayType  *arr;
5698                 int16           elmlen;
5699                 bool            elmbyval;
5700                 char            elmalign;
5701                 int                     nitems;
5702
5703                 /* Should have just the one argument */
5704                 Assert(PG_NARGS() == 2);
5705
5706                 /* If argument is NULL, we treat it as zero-length array */
5707                 if (PG_ARGISNULL(1))
5708                         nitems = 0;
5709                 else
5710                 {
5711                         /*
5712                          * Non-null argument had better be an array.  We assume that any
5713                          * call context that could let get_fn_expr_variadic return true
5714                          * will have checked that a VARIADIC-labeled parameter actually is
5715                          * an array.  So it should be okay to just Assert that it's an
5716                          * array rather than doing a full-fledged error check.
5717                          */
5718                         Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5719
5720                         /* OK, safe to fetch the array value */
5721                         arr = PG_GETARG_ARRAYTYPE_P(1);
5722
5723                         /* Get info about array element type */
5724                         element_type = ARR_ELEMTYPE(arr);
5725                         get_typlenbyvalalign(element_type,
5726                                                                  &elmlen, &elmbyval, &elmalign);
5727
5728                         /* Extract all array elements */
5729                         deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5730                                                           &elements, &nulls, &nitems);
5731                 }
5732
5733                 nargs = nitems + 1;
5734                 funcvariadic = true;
5735         }
5736         else
5737         {
5738                 /* Non-variadic case, we'll process the arguments individually */
5739                 nargs = PG_NARGS();
5740                 funcvariadic = false;
5741         }
5742
5743         /* Setup for main loop. */
5744         fmt = PG_GETARG_TEXT_PP(0);
5745         start_ptr = VARDATA_ANY(fmt);
5746         end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5747         initStringInfo(&str);
5748         arg = 1;                                        /* next argument position to print */
5749
5750         /* Scan format string, looking for conversion specifiers. */
5751         for (cp = start_ptr; cp < end_ptr; cp++)
5752         {
5753                 int                     argpos;
5754                 int                     widthpos;
5755                 int                     flags;
5756                 int                     width;
5757                 Datum           value;
5758                 bool            isNull;
5759                 Oid                     typid;
5760
5761                 /*
5762                  * If it's not the start of a conversion specifier, just copy it to
5763                  * the output buffer.
5764                  */
5765                 if (*cp != '%')
5766                 {
5767                         appendStringInfoCharMacro(&str, *cp);
5768                         continue;
5769                 }
5770
5771                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5772
5773                 /* Easy case: %% outputs a single % */
5774                 if (*cp == '%')
5775                 {
5776                         appendStringInfoCharMacro(&str, *cp);
5777                         continue;
5778                 }
5779
5780                 /* Parse the optional portions of the format specifier */
5781                 cp = text_format_parse_format(cp, end_ptr,
5782                                                                           &argpos, &widthpos,
5783                                                                           &flags, &width);
5784
5785                 /*
5786                  * Next we should see the main conversion specifier.  Whether or not
5787                  * an argument position was present, it's known that at least one
5788                  * character remains in the string at this point.  Experience suggests
5789                  * that it's worth checking that that character is one of the expected
5790                  * ones before we try to fetch arguments, so as to produce the least
5791                  * confusing response to a mis-formatted specifier.
5792                  */
5793                 if (strchr("sIL", *cp) == NULL)
5794                         ereport(ERROR,
5795                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5796                                          errmsg("unrecognized format() type specifier \"%.*s\"",
5797                                                         pg_mblen(cp), cp),
5798                                          errhint("For a single \"%%\" use \"%%%%\".")));
5799
5800                 /* If indirect width was specified, get its value */
5801                 if (widthpos >= 0)
5802                 {
5803                         /* Collect the specified or next argument position */
5804                         if (widthpos > 0)
5805                                 arg = widthpos;
5806                         if (arg >= nargs)
5807                                 ereport(ERROR,
5808                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5809                                                  errmsg("too few arguments for format()")));
5810
5811                         /* Get the value and type of the selected argument */
5812                         if (!funcvariadic)
5813                         {
5814                                 value = PG_GETARG_DATUM(arg);
5815                                 isNull = PG_ARGISNULL(arg);
5816                                 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5817                         }
5818                         else
5819                         {
5820                                 value = elements[arg - 1];
5821                                 isNull = nulls[arg - 1];
5822                                 typid = element_type;
5823                         }
5824                         if (!OidIsValid(typid))
5825                                 elog(ERROR, "could not determine data type of format() input");
5826
5827                         arg++;
5828
5829                         /* We can treat NULL width the same as zero */
5830                         if (isNull)
5831                                 width = 0;
5832                         else if (typid == INT4OID)
5833                                 width = DatumGetInt32(value);
5834                         else if (typid == INT2OID)
5835                                 width = DatumGetInt16(value);
5836                         else
5837                         {
5838                                 /* For less-usual datatypes, convert to text then to int */
5839                                 char       *str;
5840
5841                                 if (typid != prev_width_type)
5842                                 {
5843                                         Oid                     typoutputfunc;
5844                                         bool            typIsVarlena;
5845
5846                                         getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5847                                         fmgr_info(typoutputfunc, &typoutputinfo_width);
5848                                         prev_width_type = typid;
5849                                 }
5850
5851                                 str = OutputFunctionCall(&typoutputinfo_width, value);
5852
5853                                 /* pg_strtoint32 will complain about bad data or overflow */
5854                                 width = pg_strtoint32(str);
5855
5856                                 pfree(str);
5857                         }
5858                 }
5859
5860                 /* Collect the specified or next argument position */
5861                 if (argpos > 0)
5862                         arg = argpos;
5863                 if (arg >= nargs)
5864                         ereport(ERROR,
5865                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5866                                          errmsg("too few arguments for format()")));
5867
5868                 /* Get the value and type of the selected argument */
5869                 if (!funcvariadic)
5870                 {
5871                         value = PG_GETARG_DATUM(arg);
5872                         isNull = PG_ARGISNULL(arg);
5873                         typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5874                 }
5875                 else
5876                 {
5877                         value = elements[arg - 1];
5878                         isNull = nulls[arg - 1];
5879                         typid = element_type;
5880                 }
5881                 if (!OidIsValid(typid))
5882                         elog(ERROR, "could not determine data type of format() input");
5883
5884                 arg++;
5885
5886                 /*
5887                  * Get the appropriate typOutput function, reusing previous one if
5888                  * same type as previous argument.  That's particularly useful in the
5889                  * variadic-array case, but often saves work even for ordinary calls.
5890                  */
5891                 if (typid != prev_type)
5892                 {
5893                         Oid                     typoutputfunc;
5894                         bool            typIsVarlena;
5895
5896                         getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5897                         fmgr_info(typoutputfunc, &typoutputfinfo);
5898                         prev_type = typid;
5899                 }
5900
5901                 /*
5902                  * And now we can format the value.
5903                  */
5904                 switch (*cp)
5905                 {
5906                         case 's':
5907                         case 'I':
5908                         case 'L':
5909                                 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5910                                                                                           value, isNull,
5911                                                                                           flags, width);
5912                                 break;
5913                         default:
5914                                 /* should not get here, because of previous check */
5915                                 ereport(ERROR,
5916                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5917                                                  errmsg("unrecognized format() type specifier \"%.*s\"",
5918                                                                 pg_mblen(cp), cp),
5919                                                  errhint("For a single \"%%\" use \"%%%%\".")));
5920                                 break;
5921                 }
5922         }
5923
5924         /* Don't need deconstruct_array results anymore. */
5925         if (elements != NULL)
5926                 pfree(elements);
5927         if (nulls != NULL)
5928                 pfree(nulls);
5929
5930         /* Generate results. */
5931         result = cstring_to_text_with_len(str.data, str.len);
5932         pfree(str.data);
5933
5934         PG_RETURN_TEXT_P(result);
5935 }
5936
5937 /*
5938  * Parse contiguous digits as a decimal number.
5939  *
5940  * Returns true if some digits could be parsed.
5941  * The value is returned into *value, and *ptr is advanced to the next
5942  * character to be parsed.
5943  *
5944  * Note parsing invariant: at least one character is known available before
5945  * string end (end_ptr) at entry, and this is still true at exit.
5946  */
5947 static bool
5948 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5949 {
5950         bool            found = false;
5951         const char *cp = *ptr;
5952         int                     val = 0;
5953
5954         while (*cp >= '0' && *cp <= '9')
5955         {
5956                 int8            digit = (*cp - '0');
5957
5958                 if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5959                         unlikely(pg_add_s32_overflow(val, digit, &val)))
5960                         ereport(ERROR,
5961                                         (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5962                                          errmsg("number is out of range")));
5963                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5964                 found = true;
5965         }
5966
5967         *ptr = cp;
5968         *value = val;
5969
5970         return found;
5971 }
5972
5973 /*
5974  * Parse a format specifier (generally following the SUS printf spec).
5975  *
5976  * We have already advanced over the initial '%', and we are looking for
5977  * [argpos][flags][width]type (but the type character is not consumed here).
5978  *
5979  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5980  * Output parameters:
5981  *      argpos: argument position for value to be printed.  -1 means unspecified.
5982  *      widthpos: argument position for width.  Zero means the argument position
5983  *                      was unspecified (ie, take the next arg) and -1 means no width
5984  *                      argument (width was omitted or specified as a constant).
5985  *      flags: bitmask of flags.
5986  *      width: directly-specified width value.  Zero means the width was omitted
5987  *                      (note it's not necessary to distinguish this case from an explicit
5988  *                      zero width value).
5989  *
5990  * The function result is the next character position to be parsed, ie, the
5991  * location where the type character is/should be.
5992  *
5993  * Note parsing invariant: at least one character is known available before
5994  * string end (end_ptr) at entry, and this is still true at exit.
5995  */
5996 static const char *
5997 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5998                                                  int *argpos, int *widthpos,
5999                                                  int *flags, int *width)
6000 {
6001         const char *cp = start_ptr;
6002         int                     n;
6003
6004         /* set defaults for output parameters */
6005         *argpos = -1;
6006         *widthpos = -1;
6007         *flags = 0;
6008         *width = 0;
6009
6010         /* try to identify first number */
6011         if (text_format_parse_digits(&cp, end_ptr, &n))
6012         {
6013                 if (*cp != '$')
6014                 {
6015                         /* Must be just a width and a type, so we're done */
6016                         *width = n;
6017                         return cp;
6018                 }
6019                 /* The number was argument position */
6020                 *argpos = n;
6021                 /* Explicit 0 for argument index is immediately refused */
6022                 if (n == 0)
6023                         ereport(ERROR,
6024                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6025                                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
6026                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6027         }
6028
6029         /* Handle flags (only minus is supported now) */
6030         while (*cp == '-')
6031         {
6032                 *flags |= TEXT_FORMAT_FLAG_MINUS;
6033                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6034         }
6035
6036         if (*cp == '*')
6037         {
6038                 /* Handle indirect width */
6039                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6040                 if (text_format_parse_digits(&cp, end_ptr, &n))
6041                 {
6042                         /* number in this position must be closed by $ */
6043                         if (*cp != '$')
6044                                 ereport(ERROR,
6045                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6046                                                  errmsg("width argument position must be ended by \"$\"")));
6047                         /* The number was width argument position */
6048                         *widthpos = n;
6049                         /* Explicit 0 for argument index is immediately refused */
6050                         if (n == 0)
6051                                 ereport(ERROR,
6052                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6053                                                  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6054                         ADVANCE_PARSE_POINTER(cp, end_ptr);
6055                 }
6056                 else
6057                         *widthpos = 0;          /* width's argument position is unspecified */
6058         }
6059         else
6060         {
6061                 /* Check for direct width specification */
6062                 if (text_format_parse_digits(&cp, end_ptr, &n))
6063                         *width = n;
6064         }
6065
6066         /* cp should now be pointing at type character */
6067         return cp;
6068 }
6069
6070 /*
6071  * Format a %s, %I, or %L conversion
6072  */
6073 static void
6074 text_format_string_conversion(StringInfo buf, char conversion,
6075                                                           FmgrInfo *typOutputInfo,
6076                                                           Datum value, bool isNull,
6077                                                           int flags, int width)
6078 {
6079         char       *str;
6080
6081         /* Handle NULL arguments before trying to stringify the value. */
6082         if (isNull)
6083         {
6084                 if (conversion == 's')
6085                         text_format_append_string(buf, "", flags, width);
6086                 else if (conversion == 'L')
6087                         text_format_append_string(buf, "NULL", flags, width);
6088                 else if (conversion == 'I')
6089                         ereport(ERROR,
6090                                         (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6091                                          errmsg("null values cannot be formatted as an SQL identifier")));
6092                 return;
6093         }
6094
6095         /* Stringify. */
6096         str = OutputFunctionCall(typOutputInfo, value);
6097
6098         /* Escape. */
6099         if (conversion == 'I')
6100         {
6101                 /* quote_identifier may or may not allocate a new string. */
6102                 text_format_append_string(buf, quote_identifier(str), flags, width);
6103         }
6104         else if (conversion == 'L')
6105         {
6106                 char       *qstr = quote_literal_cstr(str);
6107
6108                 text_format_append_string(buf, qstr, flags, width);
6109                 /* quote_literal_cstr() always allocates a new string */
6110                 pfree(qstr);
6111         }
6112         else
6113                 text_format_append_string(buf, str, flags, width);
6114
6115         /* Cleanup. */
6116         pfree(str);
6117 }
6118
6119 /*
6120  * Append str to buf, padding as directed by flags/width
6121  */
6122 static void
6123 text_format_append_string(StringInfo buf, const char *str,
6124                                                   int flags, int width)
6125 {
6126         bool            align_to_left = false;
6127         int                     len;
6128
6129         /* fast path for typical easy case */
6130         if (width == 0)
6131         {
6132                 appendStringInfoString(buf, str);
6133                 return;
6134         }
6135
6136         if (width < 0)
6137         {
6138                 /* Negative width: implicit '-' flag, then take absolute value */
6139                 align_to_left = true;
6140                 /* -INT_MIN is undefined */
6141                 if (width <= INT_MIN)
6142                         ereport(ERROR,
6143                                         (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6144                                          errmsg("number is out of range")));
6145                 width = -width;
6146         }
6147         else if (flags & TEXT_FORMAT_FLAG_MINUS)
6148                 align_to_left = true;
6149
6150         len = pg_mbstrlen(str);
6151         if (align_to_left)
6152         {
6153                 /* left justify */
6154                 appendStringInfoString(buf, str);
6155                 if (len < width)
6156                         appendStringInfoSpaces(buf, width - len);
6157         }
6158         else
6159         {
6160                 /* right justify */
6161                 if (len < width)
6162                         appendStringInfoSpaces(buf, width - len);
6163                 appendStringInfoString(buf, str);
6164         }
6165 }
6166
6167 /*
6168  * text_format_nv - nonvariadic wrapper for text_format function.
6169  *
6170  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6171  * which checks that all built-in functions that share the implementing C
6172  * function take the same number of arguments.
6173  */
6174 Datum
6175 text_format_nv(PG_FUNCTION_ARGS)
6176 {
6177         return text_format(fcinfo);
6178 }
6179
/*
 * rest_of_char_same - do the first len bytes of s1 and s2 match?
 *
 * Helper for the Levenshtein distance functions, intended as a faster
 * alternative to memcmp() for that use.  The bytes are compared starting
 * from the last one and working backwards.  A len of zero or less always
 * yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6195
6196 /* Expand each Levenshtein distance variant */
6197 #include "levenshtein.c"
6198 #define LEVENSHTEIN_LESS_EQUAL
6199 #include "levenshtein.c"
6200
6201
6202 /*
6203  * The following *ClosestMatch() functions can be used to determine whether a
6204  * user-provided string resembles any known valid values, which is useful for
6205  * providing hints in log messages, among other things.  Use these functions
6206  * like so:
6207  *
6208  *              initClosestMatch(&state, source_string, max_distance);
6209  *
6210  *              for (int i = 0; i < num_valid_strings; i++)
6211  *                      updateClosestMatch(&state, valid_strings[i]);
6212  *
6213  *              closestMatch = getClosestMatch(&state);
6214  */
6215
6216 /*
6217  * Initialize the given state with the source string and maximum Levenshtein
6218  * distance to consider.
6219  */
6220 void
6221 initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
6222 {
6223         Assert(state);
6224         Assert(max_d >= 0);
6225
6226         state->source = source;
6227         state->min_d = -1;
6228         state->max_d = max_d;
6229         state->match = NULL;
6230 }
6231
6232 /*
6233  * If the candidate string is a closer match than the current one saved (or
6234  * there is no match saved), save it as the closest match.
6235  *
6236  * If the source or candidate string is NULL, empty, or too long, this function
6237  * takes no action.  Likewise, if the Levenshtein distance exceeds the maximum
6238  * allowed or more than half the characters are different, no action is taken.
6239  */
6240 void
6241 updateClosestMatch(ClosestMatchState *state, const char *candidate)
6242 {
6243         int                     dist;
6244
6245         Assert(state);
6246
6247         if (state->source == NULL || state->source[0] == '\0' ||
6248                 candidate == NULL || candidate[0] == '\0')
6249                 return;
6250
6251         /*
6252          * To avoid ERROR-ing, we check the lengths here instead of setting
6253          * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6254          */
6255         if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
6256                 strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
6257                 return;
6258
6259         dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
6260                                                                                  candidate, strlen(candidate), 1, 1, 1,
6261                                                                                  state->max_d, true);
6262         if (dist <= state->max_d &&
6263                 dist <= strlen(state->source) / 2 &&
6264                 (state->min_d == -1 || dist < state->min_d))
6265         {
6266                 state->min_d = dist;
6267                 state->match = candidate;
6268         }
6269 }
6270
6271 /*
6272  * Return the closest match.  If no suitable candidates were provided via
6273  * updateClosestMatch(), return NULL.
6274  */
6275 const char *
6276 getClosestMatch(ClosestMatchState *state)
6277 {
6278         Assert(state);
6279
6280         return state->match;
6281 }
6282
6283
6284 /*
6285  * Unicode support
6286  */
6287
6288 static UnicodeNormalizationForm
6289 unicode_norm_form_from_string(const char *formstr)
6290 {
6291         UnicodeNormalizationForm form = -1;
6292
6293         /*
6294          * Might as well check this while we're here.
6295          */
6296         if (GetDatabaseEncoding() != PG_UTF8)
6297                 ereport(ERROR,
6298                                 (errcode(ERRCODE_SYNTAX_ERROR),
6299                                  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6300
6301         if (pg_strcasecmp(formstr, "NFC") == 0)
6302                 form = UNICODE_NFC;
6303         else if (pg_strcasecmp(formstr, "NFD") == 0)
6304                 form = UNICODE_NFD;
6305         else if (pg_strcasecmp(formstr, "NFKC") == 0)
6306                 form = UNICODE_NFKC;
6307         else if (pg_strcasecmp(formstr, "NFKD") == 0)
6308                 form = UNICODE_NFKD;
6309         else
6310                 ereport(ERROR,
6311                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6312                                  errmsg("invalid normalization form: %s", formstr)));
6313
6314         return form;
6315 }
6316
6317 Datum
6318 unicode_normalize_func(PG_FUNCTION_ARGS)
6319 {
6320         text       *input = PG_GETARG_TEXT_PP(0);
6321         char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6322         UnicodeNormalizationForm form;
6323         int                     size;
6324         pg_wchar   *input_chars;
6325         pg_wchar   *output_chars;
6326         unsigned char *p;
6327         text       *result;
6328         int                     i;
6329
6330         form = unicode_norm_form_from_string(formstr);
6331
6332         /* convert to pg_wchar */
6333         size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6334         input_chars = palloc((size + 1) * sizeof(pg_wchar));
6335         p = (unsigned char *) VARDATA_ANY(input);
6336         for (i = 0; i < size; i++)
6337         {
6338                 input_chars[i] = utf8_to_unicode(p);
6339                 p += pg_utf_mblen(p);
6340         }
6341         input_chars[i] = (pg_wchar) '\0';
6342         Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6343
6344         /* action */
6345         output_chars = unicode_normalize(form, input_chars);
6346
6347         /* convert back to UTF-8 string */
6348         size = 0;
6349         for (pg_wchar *wp = output_chars; *wp; wp++)
6350         {
6351                 unsigned char buf[4];
6352
6353                 unicode_to_utf8(*wp, buf);
6354                 size += pg_utf_mblen(buf);
6355         }
6356
6357         result = palloc(size + VARHDRSZ);
6358         SET_VARSIZE(result, size + VARHDRSZ);
6359
6360         p = (unsigned char *) VARDATA_ANY(result);
6361         for (pg_wchar *wp = output_chars; *wp; wp++)
6362         {
6363                 unicode_to_utf8(*wp, p);
6364                 p += pg_utf_mblen(p);
6365         }
6366         Assert((char *) p == (char *) result + size + VARHDRSZ);
6367
6368         PG_RETURN_TEXT_P(result);
6369 }
6370
6371 /*
6372  * Check whether the string is in the specified Unicode normalization form.
6373  *
6374  * This is done by converting the string to the specified normal form and then
6375  * comparing that to the original string.  To speed that up, we also apply the
6376  * "quick check" algorithm specified in UAX #15, which can give a yes or no
6377  * answer for many strings by just scanning the string once.
6378  *
6379  * This function should generally be optimized for the case where the string
6380  * is in fact normalized.  In that case, we'll end up looking at the entire
6381  * string, so it's probably not worth doing any incremental conversion etc.
6382  */
6383 Datum
6384 unicode_is_normalized(PG_FUNCTION_ARGS)
6385 {
6386         text       *input = PG_GETARG_TEXT_PP(0);
6387         char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6388         UnicodeNormalizationForm form;
6389         int                     size;
6390         pg_wchar   *input_chars;
6391         pg_wchar   *output_chars;
6392         unsigned char *p;
6393         int                     i;
6394         UnicodeNormalizationQC quickcheck;
6395         int                     output_size;
6396         bool            result;
6397
6398         form = unicode_norm_form_from_string(formstr);
6399
6400         /* convert to pg_wchar */
6401         size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6402         input_chars = palloc((size + 1) * sizeof(pg_wchar));
6403         p = (unsigned char *) VARDATA_ANY(input);
6404         for (i = 0; i < size; i++)
6405         {
6406                 input_chars[i] = utf8_to_unicode(p);
6407                 p += pg_utf_mblen(p);
6408         }
6409         input_chars[i] = (pg_wchar) '\0';
6410         Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6411
6412         /* quick check (see UAX #15) */
6413         quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6414         if (quickcheck == UNICODE_NORM_QC_YES)
6415                 PG_RETURN_BOOL(true);
6416         else if (quickcheck == UNICODE_NORM_QC_NO)
6417                 PG_RETURN_BOOL(false);
6418
6419         /* normalize and compare with original */
6420         output_chars = unicode_normalize(form, input_chars);
6421
6422         output_size = 0;
6423         for (pg_wchar *wp = output_chars; *wp; wp++)
6424                 output_size++;
6425
6426         result = (size == output_size) &&
6427                 (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6428
6429         PG_RETURN_BOOL(result);
6430 }
6431
/*
 * Report whether each of the first n chars of instr is a hexadecimal digit.
 * With n == 0 nothing is examined and the answer is true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	while (instr < stop)
	{
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
6444
6445 static unsigned int
6446 hexval(unsigned char c)
6447 {
6448         if (c >= '0' && c <= '9')
6449                 return c - '0';
6450         if (c >= 'a' && c <= 'f')
6451                 return c - 'a' + 0xA;
6452         if (c >= 'A' && c <= 'F')
6453                 return c - 'A' + 0xA;
6454         elog(ERROR, "invalid hexadecimal digit");
6455         return 0;                                       /* not reached */
6456 }
6457
/*
 * Translate the first n chars of instr, taken as hexadecimal digits with the
 * most significant digit first, into a number.  Each char goes through
 * hexval(), which reports an error for a non-hexadecimal character.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	const char *stop = instr + n;

	/* shift what we have so far up one hex digit, then add the next one */
	for (const char *p = instr; p < stop; p++)
		value = (value << 4) + hexval(*p);

	return value;
}
6471
6472 /*
6473  * Replaces Unicode escape sequences by Unicode characters
6474  */
6475 Datum
6476 unistr(PG_FUNCTION_ARGS)
6477 {
6478         text       *input_text = PG_GETARG_TEXT_PP(0);
6479         char       *instr;
6480         int                     len;
6481         StringInfoData str;
6482         text       *result;
6483         pg_wchar        pair_first = 0;
6484         char            cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6485
6486         instr = VARDATA_ANY(input_text);
6487         len = VARSIZE_ANY_EXHDR(input_text);
6488
6489         initStringInfo(&str);
6490
6491         while (len > 0)
6492         {
6493                 if (instr[0] == '\\')
6494                 {
6495                         if (len >= 2 &&
6496                                 instr[1] == '\\')
6497                         {
6498                                 if (pair_first)
6499                                         goto invalid_pair;
6500                                 appendStringInfoChar(&str, '\\');
6501                                 instr += 2;
6502                                 len -= 2;
6503                         }
6504                         else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6505                                          (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6506                         {
6507                                 pg_wchar        unicode;
6508                                 int                     offset = instr[1] == 'u' ? 2 : 1;
6509
6510                                 unicode = hexval_n(instr + offset, 4);
6511
6512                                 if (!is_valid_unicode_codepoint(unicode))
6513                                         ereport(ERROR,
6514                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6515                                                         errmsg("invalid Unicode code point: %04X", unicode));
6516
6517                                 if (pair_first)
6518                                 {
6519                                         if (is_utf16_surrogate_second(unicode))
6520                                         {
6521                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6522                                                 pair_first = 0;
6523                                         }
6524                                         else
6525                                                 goto invalid_pair;
6526                                 }
6527                                 else if (is_utf16_surrogate_second(unicode))
6528                                         goto invalid_pair;
6529
6530                                 if (is_utf16_surrogate_first(unicode))
6531                                         pair_first = unicode;
6532                                 else
6533                                 {
6534                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6535                                         appendStringInfoString(&str, cbuf);
6536                                 }
6537
6538                                 instr += 4 + offset;
6539                                 len -= 4 + offset;
6540                         }
6541                         else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6542                         {
6543                                 pg_wchar        unicode;
6544
6545                                 unicode = hexval_n(instr + 2, 6);
6546
6547                                 if (!is_valid_unicode_codepoint(unicode))
6548                                         ereport(ERROR,
6549                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6550                                                         errmsg("invalid Unicode code point: %04X", unicode));
6551
6552                                 if (pair_first)
6553                                 {
6554                                         if (is_utf16_surrogate_second(unicode))
6555                                         {
6556                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6557                                                 pair_first = 0;
6558                                         }
6559                                         else
6560                                                 goto invalid_pair;
6561                                 }
6562                                 else if (is_utf16_surrogate_second(unicode))
6563                                         goto invalid_pair;
6564
6565                                 if (is_utf16_surrogate_first(unicode))
6566                                         pair_first = unicode;
6567                                 else
6568                                 {
6569                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6570                                         appendStringInfoString(&str, cbuf);
6571                                 }
6572
6573                                 instr += 8;
6574                                 len -= 8;
6575                         }
6576                         else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6577                         {
6578                                 pg_wchar        unicode;
6579
6580                                 unicode = hexval_n(instr + 2, 8);
6581
6582                                 if (!is_valid_unicode_codepoint(unicode))
6583                                         ereport(ERROR,
6584                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6585                                                         errmsg("invalid Unicode code point: %04X", unicode));
6586
6587                                 if (pair_first)
6588                                 {
6589                                         if (is_utf16_surrogate_second(unicode))
6590                                         {
6591                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6592                                                 pair_first = 0;
6593                                         }
6594                                         else
6595                                                 goto invalid_pair;
6596                                 }
6597                                 else if (is_utf16_surrogate_second(unicode))
6598                                         goto invalid_pair;
6599
6600                                 if (is_utf16_surrogate_first(unicode))
6601                                         pair_first = unicode;
6602                                 else
6603                                 {
6604                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6605                                         appendStringInfoString(&str, cbuf);
6606                                 }
6607
6608                                 instr += 10;
6609                                 len -= 10;
6610                         }
6611                         else
6612                                 ereport(ERROR,
6613                                                 (errcode(ERRCODE_SYNTAX_ERROR),
6614                                                  errmsg("invalid Unicode escape"),
6615                                                  errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6616                 }
6617                 else
6618                 {
6619                         if (pair_first)
6620                                 goto invalid_pair;
6621
6622                         appendStringInfoChar(&str, *instr++);
6623                         len--;
6624                 }
6625         }
6626
6627         /* unfinished surrogate pair? */
6628         if (pair_first)
6629                 goto invalid_pair;
6630
6631         result = cstring_to_text_with_len(str.data, str.len);
6632         pfree(str.data);
6633
6634         PG_RETURN_TEXT_P(result);
6635
6636 invalid_pair:
6637         ereport(ERROR,
6638                         (errcode(ERRCODE_SYNTAX_ERROR),
6639                          errmsg("invalid Unicode surrogate pair")));
6640         PG_RETURN_NULL();                       /* keep compiler quiet */
6641 }