src/backend/commands/copyfromparse.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * copyfromparse.c
   4  *              Parse CSV/text/binary format for COPY FROM.
   5  *
   6  * This file contains routines to parse the text, CSV and binary input
   7  * formats.  The main entry point is NextCopyFrom(), which parses the
   8  * next input line and returns it as Datums.
   9  *
  10  * In text/CSV mode, the parsing happens in multiple stages:
  11  *
  12  * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
  13  *                1.          2.            3.           4.
  14  *
  15  * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
  16  *    places it into 'raw_buf'.
  17  *
  18  * 2. CopyConvertBuf() calls the encoding conversion function to convert
  19  *    the data in 'raw_buf' from client to server encoding, placing the
  20  *    converted result in 'input_buf'.
  21  *
  22  * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
  23  *    It is responsible for finding the next newline marker, taking quote and
  24  *    escape characters into account according to the COPY options.  The line
  25  *    is copied into 'line_buf', with quotes and escape characters still
  26  *    intact.
  27  *
   28  * 4. The CopyReadAttributesText/CSV() functions take the input line from
  29  *    'line_buf', and splits it into fields, unescaping the data as required.
  30  *    The fields are stored in 'attribute_buf', and 'raw_fields' array holds
  31  *    pointers to each field.
  32  *
  33  * If encoding conversion is not required, a shortcut is taken in step 2 to
  34  * avoid copying the data unnecessarily.  The 'input_buf' pointer is set to
  35  * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
  36  * directly into 'input_buf'.  CopyConvertBuf() then merely validates that
  37  * the data is valid in the current encoding.
  38  *
  39  * In binary mode, the pipeline is much simpler.  Input is loaded into
  40  * 'raw_buf', and encoding conversion is done in the datatype-specific
  41  * receive functions, if required.  'input_buf' and 'line_buf' are not used,
  42  * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
   43  * data when it's passed to the receive function.
  44  *
  45  * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE).  'input_buf' is also
  46  * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required.  'line_buf'
  47  * and 'attribute_buf' are expanded on demand, to hold the longest line
  48  * encountered so far.
  49  *
  50  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
  51  * Portions Copyright (c) 1994, Regents of the University of California
  52  *
  53  *
  54  * IDENTIFICATION
  55  *        src/backend/commands/copyfromparse.c
  56  *
  57  *-------------------------------------------------------------------------
  58  */
  59 #include "postgres.h"
  60
  61 #include <ctype.h>
  62 #include <unistd.h>
  63 #include <sys/stat.h>
  64
  65 #include "commands/copy.h"
  66 #include "commands/copyfrom_internal.h"
  67 #include "commands/progress.h"
  68 #include "executor/executor.h"
  69 #include "libpq/libpq.h"
  70 #include "libpq/pqformat.h"
  71 #include "mb/pg_wchar.h"
  72 #include "miscadmin.h"
  73 #include "pgstat.h"
  74 #include "port/pg_bswap.h"
  75 #include "utils/memutils.h"
  76 #include "utils/rel.h"
  77
  78 #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
  79 #define OCTVALUE(c) ((c) - '0')
  80
  81 /*
  82  * These macros centralize code used to process line_buf and input_buf buffers.
  83  * They are macros because they often do continue/break control and to avoid
  84  * function call overhead in tight COPY loops.
  85  *
  86  * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
  87  * prevent the continue/break processing from working.  We end the "if (1)"
  88  * with "else ((void) 0)" to ensure the "if" does not unintentionally match
  89  * any "else" in the calling code, and to avoid any compiler warnings about
  90  * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
  91  */
  92
   93 /*
   94  * This keeps the character read at the top of the loop in the buffer
   95  * even if there is more than one read-ahead.
       *
       * Must be expanded inside a loop, in a scope that provides the local
       * variables input_buf_ptr, copy_buf_len, hit_eof, prev_raw_ptr and
       * need_data.  If looking 'extralen' bytes past input_buf_ptr would reach
       * or pass copy_buf_len while EOF has not been hit, input_buf_ptr is
       * rewound to prev_raw_ptr, need_data is set, and the enclosing loop is
       * continued.  Otherwise the macro does nothing.
   96  */
   97 #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
   98 if (1) \
   99 { \
  100         if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
  101         { \
  102                 input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
  103                 need_data = true; \
  104                 continue; \
  105         } \
  106 } else ((void) 0)
 107
  108 /*
       * This consumes the remainder of the buffer and breaks.
       *
       * Counterpart of the "not EOF" case: it fires when looking 'extralen'
       * bytes past input_buf_ptr would reach or pass copy_buf_len and hit_eof
       * is set.  When 'extralen' is nonzero, input_buf_ptr is advanced to
       * copy_buf_len.  In all firing cases 'result' is set to true and control
       * leaves the enclosing loop (or switch) via break.  Relies on the local
       * variables input_buf_ptr, copy_buf_len, hit_eof and result.
       */
  109 #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
  110 if (1) \
  111 { \
  112         if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
  113         { \
  114                 if (extralen) \
  115                         input_buf_ptr = copy_buf_len; /* consume the partial character */ \
  116                 /* backslash just before EOF, treat as data char */ \
  117                 result = true; \
  118                 break; \
  119         } \
  120 } else ((void) 0)
 121
  122 /*
  123  * Transfer any approved data to line_buf; must do this to be sure
  124  * there is some room in input_buf.
       *
       * "Approved data" is the byte range of cstate->input_buf from
       * cstate->input_buf_index up to (not including) input_buf_ptr.  It is
       * appended to cstate->line_buf and input_buf_index is then advanced to
       * input_buf_ptr.  Nothing happens when input_buf_ptr is not beyond
       * input_buf_index.  Relies on 'cstate' and the local input_buf_ptr being
       * in scope.
  125  */
  126 #define REFILL_LINEBUF \
  127 if (1) \
  128 { \
  129         if (input_buf_ptr > cstate->input_buf_index) \
  130         { \
  131                 appendBinaryStringInfo(&cstate->line_buf, \
  132                                                          cstate->input_buf + cstate->input_buf_index, \
  133                                                            input_buf_ptr - cstate->input_buf_index); \
  134                 cstate->input_buf_index = input_buf_ptr; \
  135         } \
  136 } else ((void) 0)
 137
  138 /*
       * Undo any read-ahead and jump out of the block.
       *
       * Resets input_buf_ptr to one past prev_raw_ptr and transfers control to
       * the label 'not_end_of_copy', which must exist in the function where the
       * macro is expanded.  Relies on the local variables input_buf_ptr and
       * prev_raw_ptr.
       */
  139 #define NO_END_OF_COPY_GOTO \
  140 if (1) \
  141 { \
  142         input_buf_ptr = prev_raw_ptr + 1; \
  143         goto not_end_of_copy; \
  144 } else ((void) 0)
 145
 146 /* NOTE: there's a copy of this in copyto.c */
 147 static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
 148
 149
 150 /* non-export function prototypes */
 151 static bool CopyReadLine(CopyFromState cstate);
 152 static bool CopyReadLineText(CopyFromState cstate);
 153 static int      CopyReadAttributesText(CopyFromState cstate);
 154 static int      CopyReadAttributesCSV(CopyFromState cstate);
 155 static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
 156                                                                          Oid typioparam, int32 typmod,
 157                                                                          bool *isnull);
 158
 159
 160 /* Low-level communications functions */
 161 static int      CopyGetData(CopyFromState cstate, void *databuf,
 162                                                 int minread, int maxread);
 163 static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
 164 static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
 165 static void CopyLoadInputBuf(CopyFromState cstate);
 166 static int      CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
 167
 168 void
 169 ReceiveCopyBegin(CopyFromState cstate)
 170 {
 171         StringInfoData buf;
 172         int                     natts = list_length(cstate->attnumlist);
 173         int16           format = (cstate->opts.binary ? 1 : 0);
 174         int                     i;
 175
 176         pq_beginmessage(&buf, 'G');
 177         pq_sendbyte(&buf, format);      /* overall format */
 178         pq_sendint16(&buf, natts);
 179         for (i = 0; i < natts; i++)
 180                 pq_sendint16(&buf, format); /* per-column formats */
 181         pq_endmessage(&buf);
 182         cstate->copy_src = COPY_FRONTEND;
 183         cstate->fe_msgbuf = makeStringInfo();
 184         /* We *must* flush here to ensure FE knows it can send. */
 185         pq_flush();
 186 }
 187
 188 void
 189 ReceiveCopyBinaryHeader(CopyFromState cstate)
 190 {
 191         char            readSig[11];
 192         int32           tmp;
 193
 194         /* Signature */
 195         if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
 196                 memcmp(readSig, BinarySignature, 11) != 0)
 197                 ereport(ERROR,
 198                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 199                                  errmsg("COPY file signature not recognized")));
 200         /* Flags field */
 201         if (!CopyGetInt32(cstate, &tmp))
 202                 ereport(ERROR,
 203                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 204                                  errmsg("invalid COPY file header (missing flags)")));
 205         if ((tmp & (1 << 16)) != 0)
 206                 ereport(ERROR,
 207                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 208                                  errmsg("invalid COPY file header (WITH OIDS)")));
 209         tmp &= ~(1 << 16);
 210         if ((tmp >> 16) != 0)
 211                 ereport(ERROR,
 212                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 213                                  errmsg("unrecognized critical flags in COPY file header")));
 214         /* Header extension length */
 215         if (!CopyGetInt32(cstate, &tmp) ||
 216                 tmp < 0)
 217                 ereport(ERROR,
 218                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 219                                  errmsg("invalid COPY file header (missing length)")));
 220         /* Skip extension header, if present */
 221         while (tmp-- > 0)
 222         {
 223                 if (CopyReadBinaryData(cstate, readSig, 1) != 1)
 224                         ereport(ERROR,
 225                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 226                                          errmsg("invalid COPY file header (wrong length)")));
 227         }
 228 }
 229
  230 /*
  231  * CopyGetData reads data from the source (file, frontend, or callback)
  232  *
  233  * We attempt to read at least minread, and at most maxread, bytes from
  234  * the source.  The actual number of bytes read is returned; if this is
  235  * less than minread, EOF was detected.
  236  *
       * cstate:  COPY state; cstate->copy_src selects which source is read.
       * databuf: destination buffer; up to maxread bytes are stored into it.
       * minread: number of bytes the frontend path keeps collecting for.  The
       *          file path issues a single fread() for maxread and does not
       *          consult minread; the callback path passes it through.
       * maxread: upper bound on the number of bytes stored in databuf.
       *
       * cstate->raw_reached_eof is set here when fread() returns zero bytes or
       * when the frontend sends CopyDone.  The callback path does not touch it.
       *
  237  * Note: when copying from the frontend, we expect a proper EOF mark per
  238  * protocol; if the frontend simply drops the connection, we raise error.
  239  * It seems unwise to allow the COPY IN to complete normally in that case.
  240  *
  241  * NB: no data conversion is applied here.
  242  */
  243 static int
  244 CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
  245 {
  246         int                     bytesread = 0;
  247
  248         switch (cstate->copy_src)
  249         {
  250                 case COPY_FILE:
  251                         bytesread = fread(databuf, 1, maxread, cstate->copy_file);
  252                         if (ferror(cstate->copy_file))
  253                                 ereport(ERROR,
  254                                                 (errcode_for_file_access(),
  255                                                  errmsg("could not read from COPY file: %m")));
  256                         if (bytesread == 0)
  257                                 cstate->raw_reached_eof = true;
  258                         break;
  259                 case COPY_FRONTEND:
                              /*
                               * Keep draining frontend messages until minread bytes
                               * have been gathered, databuf is full, or CopyDone has
                               * been seen.
                               */
  260                         while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
  261                         {
  262                                 int                     avail;
  263
                                      /* Current message fully consumed: fetch the next one */
  264                                 while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
  265                                 {
  266                                         /* Try to receive another message */
  267                                         int                     mtype;
  268                                         int                     maxmsglen;
  269
  270                         readmessage:
                                              /*
                                               * NOTE(review): cancel interrupts are held from
                                               * here until the message body has been read,
                                               * presumably so that a cancel cannot leave a
                                               * message half-read -- confirm against the
                                               * macro definitions.
                                               */
  271                                         HOLD_CANCEL_INTERRUPTS();
  272                                         pq_startmsgread();
  273                                         mtype = pq_getbyte();
  274                                         if (mtype == EOF)
  275                                                 ereport(ERROR,
  276                                                                 (errcode(ERRCODE_CONNECTION_FAILURE),
  277                                                                  errmsg("unexpected EOF on client connection with an open transaction")));
  278                                         /* Validate message type and set packet size limit */
  279                                         switch (mtype)
  280                                         {
  281                                                 case 'd':       /* CopyData */
  282                                                         maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
  283                                                         break;
  284                                                 case 'c':       /* CopyDone */
  285                                                 case 'f':       /* CopyFail */
  286                                                 case 'H':       /* Flush */
  287                                                 case 'S':       /* Sync */
  288                                                         maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
  289                                                         break;
  290                                                 default:
  291                                                         ereport(ERROR,
  292                                                                         (errcode(ERRCODE_PROTOCOL_VIOLATION),
  293                                                                          errmsg("unexpected message type 0x%02X during COPY from stdin",
  294                                                                                         mtype)));
  295                                                         maxmsglen = 0;  /* keep compiler quiet */
  296                                                         break;
  297                                         }
  298                                         /* Now collect the message body */
  299                                         if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
  300                                                 ereport(ERROR,
  301                                                                 (errcode(ERRCODE_CONNECTION_FAILURE),
  302                                                                  errmsg("unexpected EOF on client connection with an open transaction")));
  303                                         RESUME_CANCEL_INTERRUPTS();
  304                                         /* ... and process it */
  305                                         switch (mtype)
  306                                         {
  307                                                 case 'd':       /* CopyData */
  308                                                         break;
  309                                                 case 'c':       /* CopyDone */
  310                                                         /* COPY IN correctly terminated by frontend */
  311                                                         cstate->raw_reached_eof = true;
                                                              /* hand back whatever was gathered so far */
  312                                                         return bytesread;
  313                                                 case 'f':       /* CopyFail */
  314                                                         ereport(ERROR,
  315                                                                         (errcode(ERRCODE_QUERY_CANCELED),
  316                                                                          errmsg("COPY from stdin failed: %s",
  317                                                                                         pq_getmsgstring(cstate->fe_msgbuf))));
  318                                                         break;
  319                                                 case 'H':       /* Flush */
  320                                                 case 'S':       /* Sync */
  321
  322                                                         /*
  323                                                          * Ignore Flush/Sync for the convenience of client
  324                                                          * libraries (such as libpq) that may send those
  325                                                          * without noticing that the command they just
  326                                                          * sent was COPY.
  327                                                          */
  328                                                         goto readmessage;
  329                                                 default:
  330                                                         Assert(false);  /* NOT REACHED */
  331                                         }
  332                                 }
                                      /* Copy out as much of the current message as still fits */
  333                                 avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
  334                                 if (avail > maxread)
  335                                         avail = maxread;
  336                                 pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
  337                                 databuf = (void *) ((char *) databuf + avail);
  338                                 maxread -= avail;
  339                                 bytesread += avail;
  340                         }
  341                         break;
  342                 case COPY_CALLBACK:
                              /* The callback's return value is used as the byte count */
  343                         bytesread = cstate->data_source_cb(databuf, minread, maxread);
  344                         break;
  345         }
  346
  347         return bytesread;
  348 }
 349
 350
 351 /*
 352  * These functions do apply some data conversion
 353  */
 354
 355 /*
 356  * CopyGetInt32 reads an int32 that appears in network byte order
 357  *
 358  * Returns true if OK, false if EOF
 359  */
 360 static inline bool
 361 CopyGetInt32(CopyFromState cstate, int32 *val)
 362 {
 363         uint32          buf;
 364
 365         if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
 366         {
 367                 *val = 0;                               /* suppress compiler warning */
 368                 return false;
 369         }
 370         *val = (int32) pg_ntoh32(buf);
 371         return true;
 372 }
 373
 374 /*
 375  * CopyGetInt16 reads an int16 that appears in network byte order
 376  */
 377 static inline bool
 378 CopyGetInt16(CopyFromState cstate, int16 *val)
 379 {
 380         uint16          buf;
 381
 382         if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
 383         {
 384                 *val = 0;                               /* suppress compiler warning */
 385                 return false;
 386         }
 387         *val = (int16) pg_ntoh16(buf);
 388         return true;
 389 }
 390
 391
 392 /*
 393  * Perform encoding conversion on data in 'raw_buf', writing the converted
 394  * data into 'input_buf'.
 395  *
 396  * On entry, there must be some data to convert in 'raw_buf'.
 397  */
 398 static void
 399 CopyConvertBuf(CopyFromState cstate)
 400 {
 401         /*
 402          * If the file and server encoding are the same, no encoding conversion is
 403          * required.  However, we still need to verify that the input is valid for
 404          * the encoding.
 405          */
 406         if (!cstate->need_transcoding)
 407         {
 408                 /*
 409                  * When conversion is not required, input_buf and raw_buf are the
 410                  * same.  raw_buf_len is the total number of bytes in the buffer, and
 411                  * input_buf_len tracks how many of those bytes have already been
 412                  * verified.
 413                  */
 414                 int                     preverifiedlen = cstate->input_buf_len;
 415                 int                     unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
 416                 int                     nverified;
 417
 418                 if (unverifiedlen == 0)
 419                 {
 420                         /*
 421                          * If no more raw data is coming, report the EOF to the caller.
 422                          */
 423                         if (cstate->raw_reached_eof)
 424                                 cstate->input_reached_eof = true;
 425                         return;
 426                 }
 427
 428                 /*
 429                  * Verify the new data, including any residual unverified bytes from
 430                  * previous round.
 431                  */
 432                 nverified = pg_encoding_verifymbstr(cstate->file_encoding,
 433                                                                                         cstate->raw_buf + preverifiedlen,
 434                                                                                         unverifiedlen);
 435                 if (nverified == 0)
 436                 {
 437                         /*
 438                          * Could not verify anything.
 439                          *
 440                          * If there is no more raw input data coming, it means that there
 441                          * was an incomplete multi-byte sequence at the end.  Also, if
 442                          * there's "enough" input left, we should be able to verify at
 443                          * least one character, and a failure to do so means that we've
 444                          * hit an invalid byte sequence.
 445                          */
 446                         if (cstate->raw_reached_eof || unverifiedlen >= pg_database_encoding_max_length())
 447                                 cstate->input_reached_error = true;
 448                         return;
 449                 }
 450                 cstate->input_buf_len += nverified;
 451         }
 452         else
 453         {
 454                 /*
 455                  * Encoding conversion is needed.
 456                  */
 457                 int                     nbytes;
 458                 unsigned char *src;
 459                 int                     srclen;
 460                 unsigned char *dst;
 461                 int                     dstlen;
 462                 int                     convertedlen;
 463
 464                 if (RAW_BUF_BYTES(cstate) == 0)
 465                 {
 466                         /*
 467                          * If no more raw data is coming, report the EOF to the caller.
 468                          */
 469                         if (cstate->raw_reached_eof)
 470                                 cstate->input_reached_eof = true;
 471                         return;
 472                 }
 473
 474                 /*
 475                  * First, copy down any unprocessed data.
 476                  */
 477                 nbytes = INPUT_BUF_BYTES(cstate);
 478                 if (nbytes > 0 && cstate->input_buf_index > 0)
 479                         memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
 480                                         nbytes);
 481                 cstate->input_buf_index = 0;
 482                 cstate->input_buf_len = nbytes;
 483                 cstate->input_buf[nbytes] = '\0';
 484
 485                 src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
 486                 srclen = cstate->raw_buf_len - cstate->raw_buf_index;
 487                 dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
 488                 dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
 489
 490                 /*
 491                  * Do the conversion.  This might stop short, if there is an invalid
 492                  * byte sequence in the input.  We'll convert as much as we can in
 493                  * that case.
 494                  *
 495                  * Note: Even if we hit an invalid byte sequence, we don't report the
 496                  * error until all the valid bytes have been consumed.  The input
 497                  * might contain an end-of-input marker (\.), and we don't want to
 498                  * report an error if the invalid byte sequence is after the
 499                  * end-of-input marker.  We might unnecessarily convert some data
 500                  * after the end-of-input marker as long as it's valid for the
 501                  * encoding, but that's harmless.
 502                  */
 503                 convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
 504                                                                                                          cstate->file_encoding,
 505                                                                                                          GetDatabaseEncoding(),
 506                                                                                                          src, srclen,
 507                                                                                                          dst, dstlen,
 508                                                                                                          true);
 509                 if (convertedlen == 0)
 510                 {
 511                         /*
 512                          * Could not convert anything.  If there is no more raw input data
 513                          * coming, it means that there was an incomplete multi-byte
 514                          * sequence at the end.  Also, if there is plenty of input left,
 515                          * we should be able to convert at least one character, so a
 516                          * failure to do so must mean that we've hit a byte sequence
 517                          * that's invalid.
 518                          */
 519                         if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
 520                                 cstate->input_reached_error = true;
 521                         return;
 522                 }
 523                 cstate->raw_buf_index += convertedlen;
 524                 cstate->input_buf_len += strlen((char *) dst);
 525         }
 526 }
 527
 528 /*
 529  * Report an encoding or conversion error.
 530  */
 531 static void
 532 CopyConversionError(CopyFromState cstate)
 533 {
 534         Assert(cstate->raw_buf_len > 0);
 535         Assert(cstate->input_reached_error);
 536
 537         if (!cstate->need_transcoding)
 538         {
 539                 /*
 540                  * Everything up to input_buf_len was successfully verified, and
 541                  * input_buf_len points to the invalid or incomplete character.
 542                  */
 543                 report_invalid_encoding(cstate->file_encoding,
 544                                                                 cstate->raw_buf + cstate->input_buf_len,
 545                                                                 cstate->raw_buf_len - cstate->input_buf_len);
 546         }
 547         else
 548         {
 549                 /*
 550                  * raw_buf_index points to the invalid or untranslatable character. We
 551                  * let the conversion routine report the error, because it can provide
 552                  * a more specific error message than we could here.  An earlier call
 553                  * to the conversion routine in CopyConvertBuf() detected that there
 554                  * is an error, now we call the conversion routine again with
 555                  * noError=false, to have it throw the error.
 556                  */
 557                 unsigned char *src;
 558                 int                     srclen;
 559                 unsigned char *dst;
 560                 int                     dstlen;
 561
 562                 src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
 563                 srclen = cstate->raw_buf_len - cstate->raw_buf_index;
 564                 dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
 565                 dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
 566
 567                 (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
 568                                                                                          cstate->file_encoding,
 569                                                                                          GetDatabaseEncoding(),
 570                                                                                          src, srclen,
 571                                                                                          dst, dstlen,
 572                                                                                          false);
 573
 574                 /*
 575                  * The conversion routine should have reported an error, so this
 576                  * should not be reached.
 577                  */
 578                 elog(ERROR, "encoding conversion failed without error");
 579         }
 580 }
 581
 582 /*
 583  * Load more data from data source to raw_buf.
 584  *
 585  * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
 586  * beginning of the buffer, and we load new data after that.
 587  */
 588 static void
 589 CopyLoadRawBuf(CopyFromState cstate)
 590 {
 591         int                     nbytes;
 592         int                     inbytes;
 593
 594         /*
 595          * In text mode, if encoding conversion is not required, raw_buf and
 596          * input_buf point to the same buffer.  Their len/index better agree, too.
 597          */
 598         if (cstate->raw_buf == cstate->input_buf)
 599         {
 600                 Assert(!cstate->need_transcoding);
 601                 Assert(cstate->raw_buf_index == cstate->input_buf_index);
 602                 Assert(cstate->input_buf_len <= cstate->raw_buf_len);
 603         }
 604
 605         /*
 606          * Copy down the unprocessed data if any.
 607          */
 608         nbytes = RAW_BUF_BYTES(cstate);
 609         if (nbytes > 0 && cstate->raw_buf_index > 0)
 610                 memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
 611                                 nbytes);
 612         cstate->raw_buf_len -= cstate->raw_buf_index;
 613         cstate->raw_buf_index = 0;
 614
 615         /*
 616          * If raw_buf and input_buf are in fact the same buffer, adjust the
 617          * input_buf variables, too.
 618          */
 619         if (cstate->raw_buf == cstate->input_buf)
 620         {
 621                 cstate->input_buf_len -= cstate->input_buf_index;
 622                 cstate->input_buf_index = 0;
 623         }
 624
 625         /* Load more data */
 626         inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
 627                                                   1, RAW_BUF_SIZE - cstate->raw_buf_len);
 628         nbytes += inbytes;
 629         cstate->raw_buf[nbytes] = '\0';
 630         cstate->raw_buf_len = nbytes;
 631
 632         cstate->bytes_processed += inbytes;
 633         pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
 634
 635         if (inbytes == 0)
 636                 cstate->raw_reached_eof = true;
 637 }
 638
 639 /*
 640  * CopyLoadInputBuf loads some more data into input_buf
 641  *
 642  * On return, at least one more input character is loaded into
 643  * input_buf, or input_reached_eof is set.
 644  *
 645  * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
 646  * of the buffer and then we load more data after that.
 647  */
 648 static void
 649 CopyLoadInputBuf(CopyFromState cstate)
 650 {
 651         int                     nbytes = INPUT_BUF_BYTES(cstate);
 652
 653         /*
 654          * The caller has updated input_buf_index to indicate how much of the
 655          * input has been consumed and isn't needed anymore.  If input_buf is the
 656          * same physical area as raw_buf, update raw_buf_index accordingly.
 657          */
 658         if (cstate->raw_buf == cstate->input_buf)
 659         {
 660                 Assert(!cstate->need_transcoding);
 661                 Assert(cstate->input_buf_index >= cstate->raw_buf_index);
 662                 cstate->raw_buf_index = cstate->input_buf_index;
 663         }
 664
 665         for (;;)
 666         {
 667                 /* If we now have some unconverted data, try to convert it */
 668                 CopyConvertBuf(cstate);
 669
 670                 /* If we now have some more input bytes ready, return them */
 671                 if (INPUT_BUF_BYTES(cstate) > nbytes)
 672                         return;
 673
 674                 /*
 675                  * If we reached an invalid byte sequence, or we're at an incomplete
 676                  * multi-byte character but there is no more raw input data, report
 677                  * conversion error.
 678                  */
 679                 if (cstate->input_reached_error)
 680                         CopyConversionError(cstate);
 681
 682                 /* no more input, and everything has been converted */
 683                 if (cstate->input_reached_eof)
 684                         break;
 685
 686                 /* Try to load more raw data */
 687                 Assert(!cstate->raw_reached_eof);
 688                 CopyLoadRawBuf(cstate);
 689         }
 690 }
 691
 692 /*
 693  * CopyReadBinaryData
 694  *
 695  * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
 696  * and writes them to 'dest'.  Returns the number of bytes read (which
 697  * would be less than 'nbytes' only if we reach EOF).
 698  */
 699 static int
 700 CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
 701 {
 702         int                     copied_bytes = 0;
 703
 704         if (RAW_BUF_BYTES(cstate) >= nbytes)
 705         {
 706                 /* Enough bytes are present in the buffer. */
 707                 memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
 708                 cstate->raw_buf_index += nbytes;
 709                 copied_bytes = nbytes;
 710         }
 711         else
 712         {
 713                 /*
 714                  * Not enough bytes in the buffer, so must read from the file.  Need
 715                  * to loop since 'nbytes' could be larger than the buffer size.
 716                  */
 717                 do
 718                 {
 719                         int                     copy_bytes;
 720
 721                         /* Load more data if buffer is empty. */
 722                         if (RAW_BUF_BYTES(cstate) == 0)
 723                         {
 724                                 CopyLoadRawBuf(cstate);
 725                                 if (cstate->raw_reached_eof)
 726                                         break;          /* EOF */
 727                         }
 728
 729                         /* Transfer some bytes. */
 730                         copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
 731                         memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
 732                         cstate->raw_buf_index += copy_bytes;
 733                         dest += copy_bytes;
 734                         copied_bytes += copy_bytes;
 735                 } while (copied_bytes < nbytes);
 736         }
 737
 738         return copied_bytes;
 739 }
 740
 741 /*
 742  * Read raw fields in the next line for COPY FROM in text or csv mode.
 743  * Return false if no more lines.
 744  *
 745  * An internal temporary buffer is returned via 'fields'. It is valid until
 746  * the next call of the function. Since the function returns all raw fields
 747  * in the input file, 'nfields' could be different from the number of columns
 748  * in the relation.
 749  *
 750  * NOTE: force_not_null option are not applied to the returned fields.
 751  */
 752 bool
 753 NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
 754 {
 755         int                     fldct;
 756         bool            done;
 757
 758         /* only available for text or csv input */
 759         Assert(!cstate->opts.binary);
 760
 761         /* on input just throw the header line away */
 762         if (cstate->cur_lineno == 0 && cstate->opts.header_line)
 763         {
 764                 cstate->cur_lineno++;
 765                 if (CopyReadLine(cstate))
 766                         return false;           /* done */
 767         }
 768
 769         cstate->cur_lineno++;
 770
 771         /* Actually read the line into memory here */
 772         done = CopyReadLine(cstate);
 773
 774         /*
 775          * EOF at start of line means we're done.  If we see EOF after some
 776          * characters, we act as though it was newline followed by EOF, ie,
 777          * process the line and then exit loop on next iteration.
 778          */
 779         if (done && cstate->line_buf.len == 0)
 780                 return false;
 781
 782         /* Parse the line into de-escaped field values */
 783         if (cstate->opts.csv_mode)
 784                 fldct = CopyReadAttributesCSV(cstate);
 785         else
 786                 fldct = CopyReadAttributesText(cstate);
 787
 788         *fields = cstate->raw_fields;
 789         *nfields = fldct;
 790         return true;
 791 }
 792
 793 /*
 794  * Read next tuple from file for COPY FROM. Return false if no more tuples.
 795  *
 796  * 'econtext' is used to evaluate default expression for each columns not
 797  * read from the file. It can be NULL when no default values are used, i.e.
 798  * when all columns are read from the file.
 799  *
 800  * 'values' and 'nulls' arrays must be the same length as columns of the
 801  * relation passed to BeginCopyFrom. This function fills the arrays.
 802  */
 803 bool
 804 NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 805                          Datum *values, bool *nulls)
 806 {
 807         TupleDesc       tupDesc;
 808         AttrNumber      num_phys_attrs,
 809                                 attr_count,
 810                                 num_defaults = cstate->num_defaults;
 811         FmgrInfo   *in_functions = cstate->in_functions;
 812         Oid                *typioparams = cstate->typioparams;
 813         int                     i;
 814         int                *defmap = cstate->defmap;
 815         ExprState **defexprs = cstate->defexprs;
 816
 817         tupDesc = RelationGetDescr(cstate->rel);
 818         num_phys_attrs = tupDesc->natts;
 819         attr_count = list_length(cstate->attnumlist);
 820
 821         /* Initialize all values for row to NULL */
 822         MemSet(values, 0, num_phys_attrs * sizeof(Datum));
 823         MemSet(nulls, true, num_phys_attrs * sizeof(bool));
 824
 825         if (!cstate->opts.binary)
 826         {
 827                 char      **field_strings;
 828                 ListCell   *cur;
 829                 int                     fldct;
 830                 int                     fieldno;
 831                 char       *string;
 832
 833                 /* read raw fields in the next line */
 834                 if (!NextCopyFromRawFields(cstate, &field_strings, &fldct))
 835                         return false;
 836
 837                 /* check for overflowing fields */
 838                 if (attr_count > 0 && fldct > attr_count)
 839                         ereport(ERROR,
 840                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 841                                          errmsg("extra data after last expected column")));
 842
 843                 fieldno = 0;
 844
 845                 /* Loop to read the user attributes on the line. */
 846                 foreach(cur, cstate->attnumlist)
 847                 {
 848                         int                     attnum = lfirst_int(cur);
 849                         int                     m = attnum - 1;
 850                         Form_pg_attribute att = TupleDescAttr(tupDesc, m);
 851
 852                         if (fieldno >= fldct)
 853                                 ereport(ERROR,
 854                                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 855                                                  errmsg("missing data for column \"%s\"",
 856                                                                 NameStr(att->attname))));
 857                         string = field_strings[fieldno++];
 858
 859                         if (cstate->convert_select_flags &&
 860                                 !cstate->convert_select_flags[m])
 861                         {
 862                                 /* ignore input field, leaving column as NULL */
 863                                 continue;
 864                         }
 865
 866                         if (cstate->opts.csv_mode)
 867                         {
 868                                 if (string == NULL &&
 869                                         cstate->opts.force_notnull_flags[m])
 870                                 {
 871                                         /*
 872                                          * FORCE_NOT_NULL option is set and column is NULL -
 873                                          * convert it to the NULL string.
 874                                          */
 875                                         string = cstate->opts.null_print;
 876                                 }
 877                                 else if (string != NULL && cstate->opts.force_null_flags[m]
 878                                                  && strcmp(string, cstate->opts.null_print) == 0)
 879                                 {
 880                                         /*
 881                                          * FORCE_NULL option is set and column matches the NULL
 882                                          * string. It must have been quoted, or otherwise the
 883                                          * string would already have been set to NULL. Convert it
 884                                          * to NULL as specified.
 885                                          */
 886                                         string = NULL;
 887                                 }
 888                         }
 889
 890                         cstate->cur_attname = NameStr(att->attname);
 891                         cstate->cur_attval = string;
 892                         values[m] = InputFunctionCall(&in_functions[m],
 893                                                                                   string,
 894                                                                                   typioparams[m],
 895                                                                                   att->atttypmod);
 896                         if (string != NULL)
 897                                 nulls[m] = false;
 898                         cstate->cur_attname = NULL;
 899                         cstate->cur_attval = NULL;
 900                 }
 901
 902                 Assert(fieldno == attr_count);
 903         }
 904         else
 905         {
 906                 /* binary */
 907                 int16           fld_count;
 908                 ListCell   *cur;
 909
 910                 cstate->cur_lineno++;
 911
 912                 if (!CopyGetInt16(cstate, &fld_count))
 913                 {
 914                         /* EOF detected (end of file, or protocol-level EOF) */
 915                         return false;
 916                 }
 917
 918                 if (fld_count == -1)
 919                 {
 920                         /*
 921                          * Received EOF marker.  Wait for the protocol-level EOF, and
 922                          * complain if it doesn't come immediately.  In COPY FROM STDIN,
 923                          * this ensures that we correctly handle CopyFail, if client
 924                          * chooses to send that now.  When copying from file, we could
 925                          * ignore the rest of the file like in text mode, but we choose to
 926                          * be consistent with the COPY FROM STDIN case.
 927                          */
 928                         char            dummy;
 929
 930                         if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
 931                                 ereport(ERROR,
 932                                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 933                                                  errmsg("received copy data after EOF marker")));
 934                         return false;
 935                 }
 936
 937                 if (fld_count != attr_count)
 938                         ereport(ERROR,
 939                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 940                                          errmsg("row field count is %d, expected %d",
 941                                                         (int) fld_count, attr_count)));
 942
 943                 foreach(cur, cstate->attnumlist)
 944                 {
 945                         int                     attnum = lfirst_int(cur);
 946                         int                     m = attnum - 1;
 947                         Form_pg_attribute att = TupleDescAttr(tupDesc, m);
 948
 949                         cstate->cur_attname = NameStr(att->attname);
 950                         values[m] = CopyReadBinaryAttribute(cstate,
 951                                                                                                 &in_functions[m],
 952                                                                                                 typioparams[m],
 953                                                                                                 att->atttypmod,
 954                                                                                                 &nulls[m]);
 955                         cstate->cur_attname = NULL;
 956                 }
 957         }
 958
 959         /*
 960          * Now compute and insert any defaults available for the columns not
 961          * provided by the input data.  Anything not processed here or above will
 962          * remain NULL.
 963          */
 964         for (i = 0; i < num_defaults; i++)
 965         {
 966                 /*
 967                  * The caller must supply econtext and have switched into the
 968                  * per-tuple memory context in it.
 969                  */
 970                 Assert(econtext != NULL);
 971                 Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
 972
 973                 values[defmap[i]] = ExecEvalExpr(defexprs[i], econtext,
 974                                                                                  &nulls[defmap[i]]);
 975         }
 976
 977         return true;
 978 }
 979
 980 /*
 981  * Read the next input line and stash it in line_buf.
 982  *
 983  * Result is true if read was terminated by EOF, false if terminated
 984  * by newline.  The terminating newline or EOF marker is not included
 985  * in the final value of line_buf.
 986  */
 987 static bool
 988 CopyReadLine(CopyFromState cstate)
 989 {
 990         bool            result;
 991
 992         resetStringInfo(&cstate->line_buf);
 993         cstate->line_buf_valid = false;
 994
 995         /* Parse data and transfer into line_buf */
 996         result = CopyReadLineText(cstate);
 997
 998         if (result)
 999         {
1000                 /*
1001                  * Reached EOF.  In protocol version 3, we should ignore anything
1002                  * after \. up to the protocol end of copy data.  (XXX maybe better
1003                  * not to treat \. as special?)
1004                  */
1005                 if (cstate->copy_src == COPY_FRONTEND)
1006                 {
1007                         int                     inbytes;
1008
1009                         do
1010                         {
1011                                 inbytes = CopyGetData(cstate, cstate->input_buf,
1012                                                                           1, INPUT_BUF_SIZE);
1013                         } while (inbytes > 0);
1014                         cstate->input_buf_index = 0;
1015                         cstate->input_buf_len = 0;
1016                         cstate->raw_buf_index = 0;
1017                         cstate->raw_buf_len = 0;
1018                 }
1019         }
1020         else
1021         {
1022                 /*
1023                  * If we didn't hit EOF, then we must have transferred the EOL marker
1024                  * to line_buf along with the data.  Get rid of it.
1025                  */
1026                 switch (cstate->eol_type)
1027                 {
1028                         case EOL_NL:
1029                                 Assert(cstate->line_buf.len >= 1);
1030                                 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1031                                 cstate->line_buf.len--;
1032                                 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1033                                 break;
1034                         case EOL_CR:
1035                                 Assert(cstate->line_buf.len >= 1);
1036                                 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1037                                 cstate->line_buf.len--;
1038                                 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1039                                 break;
1040                         case EOL_CRNL:
1041                                 Assert(cstate->line_buf.len >= 2);
1042                                 Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1043                                 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1044                                 cstate->line_buf.len -= 2;
1045                                 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1046                                 break;
1047                         case EOL_UNKNOWN:
1048                                 /* shouldn't get here */
1049                                 Assert(false);
1050                                 break;
1051                 }
1052         }
1053
1054         /* Now it's safe to use the buffer in error messages */
1055         cstate->line_buf_valid = true;
1056
1057         return result;
1058 }
1059
1060 /*
1061  * CopyReadLineText - inner loop of CopyReadLine for text mode
1062  */
1063 static bool
1064 CopyReadLineText(CopyFromState cstate)
1065 {
1066         char       *copy_input_buf;
1067         int                     input_buf_ptr;
1068         int                     copy_buf_len;
1069         bool            need_data = false;
1070         bool            hit_eof = false;
1071         bool            result = false;
1072
1073         /* CSV variables */
1074         bool            first_char_in_line = true;
1075         bool            in_quote = false,
1076                                 last_was_esc = false;
1077         char            quotec = '\0';
1078         char            escapec = '\0';
1079
1080         if (cstate->opts.csv_mode)
1081         {
1082                 quotec = cstate->opts.quote[0];
1083                 escapec = cstate->opts.escape[0];
1084                 /* ignore special escape processing if it's the same as quotec */
1085                 if (quotec == escapec)
1086                         escapec = '\0';
1087         }
1088
1089         /*
1090          * The objective of this loop is to transfer the entire next input line
1091          * into line_buf.  Hence, we only care for detecting newlines (\r and/or
1092          * \n) and the end-of-copy marker (\.).
1093          *
1094          * In CSV mode, \r and \n inside a quoted field are just part of the data
1095          * value and are put in line_buf.  We keep just enough state to know if we
1096          * are currently in a quoted field or not.
1097          *
1098          * These four characters, and the CSV escape and quote characters, are
1099          * assumed the same in frontend and backend encodings.
1100          *
1101          * The input has already been converted to the database encoding.  All
1102          * supported server encodings have the property that all bytes in a
1103          * multi-byte sequence have the high bit set, so a multibyte character
1104          * cannot contain any newline or escape characters embedded in the
1105          * multibyte sequence.  Therefore, we can process the input byte-by-byte,
1106          * regardless of the encoding.
1107          *
1108          * For speed, we try to move data from input_buf to line_buf in chunks
1109          * rather than one character at a time.  input_buf_ptr points to the next
1110          * character to examine; any characters from input_buf_index to
1111          * input_buf_ptr have been determined to be part of the line, but not yet
1112          * transferred to line_buf.
1113          *
1114          * For a little extra speed within the loop, we copy input_buf and
1115          * input_buf_len into local variables.
1116          */
1117         copy_input_buf = cstate->input_buf;
1118         input_buf_ptr = cstate->input_buf_index;
1119         copy_buf_len = cstate->input_buf_len;
1120
1121         for (;;)
1122         {
1123                 int                     prev_raw_ptr;
1124                 char            c;
1125
1126                 /*
1127                  * Load more data if needed.  Ideally we would just force four bytes
1128                  * of read-ahead and avoid the many calls to
1129                  * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
1130                  * does not allow us to read too far ahead or we might read into the
1131                  * next data, so we read-ahead only as far we know we can.  One
1132                  * optimization would be to read-ahead four byte here if
1133                  * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
1134                  * considering the size of the buffer.
1135                  */
1136                 if (input_buf_ptr >= copy_buf_len || need_data)
1137                 {
1138                         REFILL_LINEBUF;
1139
1140                         CopyLoadInputBuf(cstate);
1141                         /* update our local variables */
1142                         hit_eof = cstate->input_reached_eof;
1143                         input_buf_ptr = cstate->input_buf_index;
1144                         copy_buf_len = cstate->input_buf_len;
1145
1146                         /*
1147                          * If we are completely out of data, break out of the loop,
1148                          * reporting EOF.
1149                          */
1150                         if (INPUT_BUF_BYTES(cstate) <= 0)
1151                         {
1152                                 result = true;
1153                                 break;
1154                         }
1155                         need_data = false;
1156                 }
1157
1158                 /* OK to fetch a character */
1159                 prev_raw_ptr = input_buf_ptr;
1160                 c = copy_input_buf[input_buf_ptr++];
1161
1162                 if (cstate->opts.csv_mode)
1163                 {
1164                         /*
1165                          * If character is '\\' or '\r', we may need to look ahead below.
1166                          * Force fetch of the next character if we don't already have it.
1167                          * We need to do this before changing CSV state, in case one of
1168                          * these characters is also the quote or escape character.
1169                          *
1170                          * Note: old-protocol does not like forced prefetch, but it's OK
1171                          * here since we cannot validly be at EOF.
1172                          */
1173                         if (c == '\\' || c == '\r')
1174                         {
1175                                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1176                         }
1177
1178                         /*
1179                          * Dealing with quotes and escapes here is mildly tricky. If the
1180                          * quote char is also the escape char, there's no problem - we
1181                          * just use the char as a toggle. If they are different, we need
1182                          * to ensure that we only take account of an escape inside a
1183                          * quoted field and immediately preceding a quote char, and not
1184                          * the second in an escape-escape sequence.
1185                          */
1186                         if (in_quote && c == escapec)
1187                                 last_was_esc = !last_was_esc;
1188                         if (c == quotec && !last_was_esc)
1189                                 in_quote = !in_quote;
1190                         if (c != escapec)
1191                                 last_was_esc = false;
1192
1193                         /*
1194                          * Updating the line count for embedded CR and/or LF chars is
1195                          * necessarily a little fragile - this test is probably about the
1196                          * best we can do.  (XXX it's arguable whether we should do this
1197                          * at all --- is cur_lineno a physical or logical count?)
1198                          */
1199                         if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
1200                                 cstate->cur_lineno++;
1201                 }
1202
1203                 /* Process \r */
1204                 if (c == '\r' && (!cstate->opts.csv_mode || !in_quote))
1205                 {
1206                         /* Check for \r\n on first line, _and_ handle \r\n. */
1207                         if (cstate->eol_type == EOL_UNKNOWN ||
1208                                 cstate->eol_type == EOL_CRNL)
1209                         {
1210                                 /*
1211                                  * If need more data, go back to loop top to load it.
1212                                  *
1213                                  * Note that if we are at EOF, c will wind up as '\0' because
1214                                  * of the guaranteed pad of input_buf.
1215                                  */
1216                                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1217
1218                                 /* get next char */
1219                                 c = copy_input_buf[input_buf_ptr];
1220
1221                                 if (c == '\n')
1222                                 {
1223                                         input_buf_ptr++;        /* eat newline */
1224                                         cstate->eol_type = EOL_CRNL;    /* in case not set yet */
1225                                 }
1226                                 else
1227                                 {
1228                                         /* found \r, but no \n */
1229                                         if (cstate->eol_type == EOL_CRNL)
1230                                                 ereport(ERROR,
1231                                                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1232                                                                  !cstate->opts.csv_mode ?
1233                                                                  errmsg("literal carriage return found in data") :
1234                                                                  errmsg("unquoted carriage return found in data"),
1235                                                                  !cstate->opts.csv_mode ?
1236                                                                  errhint("Use \"\\r\" to represent carriage return.") :
1237                                                                  errhint("Use quoted CSV field to represent carriage return.")));
1238
1239                                         /*
1240                                          * if we got here, it is the first line and we didn't find
1241                                          * \n, so don't consume the peeked character
1242                                          */
1243                                         cstate->eol_type = EOL_CR;
1244                                 }
1245                         }
1246                         else if (cstate->eol_type == EOL_NL)
1247                                 ereport(ERROR,
1248                                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1249                                                  !cstate->opts.csv_mode ?
1250                                                  errmsg("literal carriage return found in data") :
1251                                                  errmsg("unquoted carriage return found in data"),
1252                                                  !cstate->opts.csv_mode ?
1253                                                  errhint("Use \"\\r\" to represent carriage return.") :
1254                                                  errhint("Use quoted CSV field to represent carriage return.")));
1255                         /* If reach here, we have found the line terminator */
1256                         break;
1257                 }
1258
1259                 /* Process \n */
1260                 if (c == '\n' && (!cstate->opts.csv_mode || !in_quote))
1261                 {
1262                         if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
1263                                 ereport(ERROR,
1264                                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1265                                                  !cstate->opts.csv_mode ?
1266                                                  errmsg("literal newline found in data") :
1267                                                  errmsg("unquoted newline found in data"),
1268                                                  !cstate->opts.csv_mode ?
1269                                                  errhint("Use \"\\n\" to represent newline.") :
1270                                                  errhint("Use quoted CSV field to represent newline.")));
1271                         cstate->eol_type = EOL_NL;      /* in case not set yet */
1272                         /* If reach here, we have found the line terminator */
1273                         break;
1274                 }
1275
1276                 /*
1277                  * In CSV mode, we only recognize \. alone on a line.  This is because
1278                  * \. is a valid CSV data value.
1279                  */
1280                 if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line))
1281                 {
1282                         char            c2;
1283
1284                         IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1285                         IF_NEED_REFILL_AND_EOF_BREAK(0);
1286
1287                         /* -----
1288                          * get next character
1289                          * Note: we do not change c so if it isn't \., we can fall
1290                          * through and continue processing.
1291                          * -----
1292                          */
1293                         c2 = copy_input_buf[input_buf_ptr];
1294
1295                         if (c2 == '.')
1296                         {
1297                                 input_buf_ptr++;        /* consume the '.' */
1298
1299                                 /*
1300                                  * Note: if we loop back for more data here, it does not
1301                                  * matter that the CSV state change checks are re-executed; we
1302                                  * will come back here with no important state changed.
1303                                  */
1304                                 if (cstate->eol_type == EOL_CRNL)
1305                                 {
1306                                         /* Get the next character */
1307                                         IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1308                                         /* if hit_eof, c2 will become '\0' */
1309                                         c2 = copy_input_buf[input_buf_ptr++];
1310
1311                                         if (c2 == '\n')
1312                                         {
1313                                                 if (!cstate->opts.csv_mode)
1314                                                         ereport(ERROR,
1315                                                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1316                                                                          errmsg("end-of-copy marker does not match previous newline style")));
1317                                                 else
1318                                                         NO_END_OF_COPY_GOTO;
1319                                         }
1320                                         else if (c2 != '\r')
1321                                         {
1322                                                 if (!cstate->opts.csv_mode)
1323                                                         ereport(ERROR,
1324                                                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1325                                                                          errmsg("end-of-copy marker corrupt")));
1326                                                 else
1327                                                         NO_END_OF_COPY_GOTO;
1328                                         }
1329                                 }
1330
1331                                 /* Get the next character */
1332                                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1333                                 /* if hit_eof, c2 will become '\0' */
1334                                 c2 = copy_input_buf[input_buf_ptr++];
1335
1336                                 if (c2 != '\r' && c2 != '\n')
1337                                 {
1338                                         if (!cstate->opts.csv_mode)
1339                                                 ereport(ERROR,
1340                                                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1341                                                                  errmsg("end-of-copy marker corrupt")));
1342                                         else
1343                                                 NO_END_OF_COPY_GOTO;
1344                                 }
1345
1346                                 if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
1347                                         (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1348                                         (cstate->eol_type == EOL_CR && c2 != '\r'))
1349                                 {
1350                                         ereport(ERROR,
1351                                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1352                                                          errmsg("end-of-copy marker does not match previous newline style")));
1353                                 }
1354
1355                                 /*
1356                                  * Transfer only the data before the \. into line_buf, then
1357                                  * discard the data and the \. sequence.
1358                                  */
1359                                 if (prev_raw_ptr > cstate->input_buf_index)
1360                                         appendBinaryStringInfo(&cstate->line_buf,
1361                                                                                    cstate->input_buf + cstate->input_buf_index,
1362                                                                                    prev_raw_ptr - cstate->input_buf_index);
1363                                 cstate->input_buf_index = input_buf_ptr;
1364                                 result = true;  /* report EOF */
1365                                 break;
1366                         }
1367                         else if (!cstate->opts.csv_mode)
1368                         {
1369                                 /*
1370                                  * If we are here, it means we found a backslash followed by
1371                                  * something other than a period.  In non-CSV mode, anything
1372                                  * after a backslash is special, so we skip over that second
1373                                  * character too.  If we didn't do that \\. would be
1374                                  * considered an end-of-copy marker, while in non-CSV mode it is a
1375                                  * literal backslash followed by a period.  In CSV mode,
1376                                  * backslashes are not special, so we want to process the
1377                                  * character after the backslash just like a normal character,
1378                                  * so we don't increment in those cases.
1379                                  */
1380                                 input_buf_ptr++;
1381                         }
1382                 }
1383
1384                 /*
1385                  * This label is for CSV cases where \. appears at the start of a
1386                  * line, but there is more text after it, meaning it was a data value.
1387                  * We are more strict for \. in CSV mode because \. could be a data
1388                  * value, while in non-CSV mode, \. cannot be a data value.
1389                  */
1390 not_end_of_copy:
1391                 first_char_in_line = false;
1392         }                                                       /* end of outer loop */
1393
1394         /*
1395          * Transfer any still-uncopied data to line_buf.
1396          */
1397         REFILL_LINEBUF;
1398
1399         return result;
1400 }
1401
/*
 * GetDecimalFromHex
 *		Map a single hexadecimal digit character to its numeric value.
 *
 * '0'..'9' yield 0..9.  Any other character is folded to lower case and
 * measured from 'a', so 'a'/'A' yield 10 and 'f'/'F' yield 15.  No
 * validation happens here; the callers visible in this file test the
 * character with isxdigit() before calling.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char uc = (unsigned char) hex;

	return isdigit(uc) ? (hex - '0') : (tolower(uc) - 'a' + 10);
}
1413
1414 /*
1415  * Parse the current line into separate attributes (fields),
1416  * performing de-escaping as needed.
1417  *
1418  * The input is in line_buf.  We use attribute_buf to hold the result
1419  * strings.  cstate->raw_fields[k] is set to point to the k'th attribute
1420  * string, or NULL when the input matches the null marker string.
1421  * This array is expanded as necessary.
1422  *
1423  * (Note that the caller cannot check for nulls since the returned
1424  * string would be the post-de-escaping equivalent, which may look
1425  * the same as some valid data string.)
1426  *
1427  * delim is the column delimiter string (must be just one byte for now).
1428  * null_print is the null marker string.  Note that this is compared to
1429  * the pre-de-escaped input string.
1430  *
1431  * The return value is the number of fields actually read.
1432  */
1433 static int
1434 CopyReadAttributesText(CopyFromState cstate)
1435 {
1436         char            delimc = cstate->opts.delim[0];
1437         int                     fieldno;
1438         char       *output_ptr;
1439         char       *cur_ptr;
1440         char       *line_end_ptr;
1441
1442         /*
1443          * We need a special case for zero-column tables: check that the input
1444          * line is empty, and return.
1445          */
1446         if (cstate->max_fields <= 0)
1447         {
1448                 if (cstate->line_buf.len != 0)
1449                         ereport(ERROR,
1450                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1451                                          errmsg("extra data after last expected column")));
1452                 return 0;
1453         }
1454
1455         resetStringInfo(&cstate->attribute_buf);
1456
1457         /*
1458          * The de-escaped attributes will certainly not be longer than the input
1459          * data line, so we can just force attribute_buf to be large enough and
1460          * then transfer data without any checks for enough space.  We need to do
1461          * it this way because enlarging attribute_buf mid-stream would invalidate
1462          * pointers already stored into cstate->raw_fields[].
1463          */
1464         if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1465                 enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1466         output_ptr = cstate->attribute_buf.data;
1467
1468         /* set pointer variables for loop */
1469         cur_ptr = cstate->line_buf.data;
1470         line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1471
1472         /* Outer loop iterates over fields */
1473         fieldno = 0;
1474         for (;;)
1475         {
1476                 bool            found_delim = false;
1477                 char       *start_ptr;
1478                 char       *end_ptr;
1479                 int                     input_len;
1480                 bool            saw_non_ascii = false;
1481
1482                 /* Make sure there is enough space for the next value */
1483                 if (fieldno >= cstate->max_fields)
1484                 {
1485                         cstate->max_fields *= 2;
1486                         cstate->raw_fields =
1487                                 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1488                 }
1489
1490                 /* Remember start of field on both input and output sides */
1491                 start_ptr = cur_ptr;
1492                 cstate->raw_fields[fieldno] = output_ptr;
1493
1494                 /*
1495                  * Scan data for field.
1496                  *
1497                  * Note that in this loop, we are scanning to locate the end of field
1498                  * and also speculatively performing de-escaping.  Once we find the
1499                  * end-of-field, we can match the raw field contents against the null
1500                  * marker string.  Only after that comparison fails do we know that
1501                  * de-escaping is actually the right thing to do; therefore we *must
1502                  * not* throw any syntax errors before we've done the null-marker
1503                  * check.
1504                  */
1505                 for (;;)
1506                 {
1507                         char            c;
1508
1509                         end_ptr = cur_ptr;
1510                         if (cur_ptr >= line_end_ptr)
1511                                 break;
1512                         c = *cur_ptr++;
1513                         if (c == delimc)
1514                         {
1515                                 found_delim = true;
1516                                 break;
1517                         }
1518                         if (c == '\\')
1519                         {
1520                                 if (cur_ptr >= line_end_ptr)
1521                                         break;
1522                                 c = *cur_ptr++;
1523                                 switch (c)
1524                                 {
1525                                         case '0':
1526                                         case '1':
1527                                         case '2':
1528                                         case '3':
1529                                         case '4':
1530                                         case '5':
1531                                         case '6':
1532                                         case '7':
1533                                                 {
1534                                                         /* handle \013 */
1535                                                         int                     val;
1536
1537                                                         val = OCTVALUE(c);
1538                                                         if (cur_ptr < line_end_ptr)
1539                                                         {
1540                                                                 c = *cur_ptr;
1541                                                                 if (ISOCTAL(c))
1542                                                                 {
1543                                                                         cur_ptr++;
1544                                                                         val = (val << 3) + OCTVALUE(c);
1545                                                                         if (cur_ptr < line_end_ptr)
1546                                                                         {
1547                                                                                 c = *cur_ptr;
1548                                                                                 if (ISOCTAL(c))
1549                                                                                 {
1550                                                                                         cur_ptr++;
1551                                                                                         val = (val << 3) + OCTVALUE(c);
1552                                                                                 }
1553                                                                         }
1554                                                                 }
1555                                                         }
1556                                                         c = val & 0377;
1557                                                         if (c == '\0' || IS_HIGHBIT_SET(c))
1558                                                                 saw_non_ascii = true;
1559                                                 }
1560                                                 break;
1561                                         case 'x':
1562                                                 /* Handle \x3F */
1563                                                 if (cur_ptr < line_end_ptr)
1564                                                 {
1565                                                         char            hexchar = *cur_ptr;
1566
1567                                                         if (isxdigit((unsigned char) hexchar))
1568                                                         {
1569                                                                 int                     val = GetDecimalFromHex(hexchar);
1570
1571                                                                 cur_ptr++;
1572                                                                 if (cur_ptr < line_end_ptr)
1573                                                                 {
1574                                                                         hexchar = *cur_ptr;
1575                                                                         if (isxdigit((unsigned char) hexchar))
1576                                                                         {
1577                                                                                 cur_ptr++;
1578                                                                                 val = (val << 4) + GetDecimalFromHex(hexchar);
1579                                                                         }
1580                                                                 }
1581                                                                 c = val & 0xff;
1582                                                                 if (c == '\0' || IS_HIGHBIT_SET(c))
1583                                                                         saw_non_ascii = true;
1584                                                         }
1585                                                 }
1586                                                 break;
1587                                         case 'b':
1588                                                 c = '\b';
1589                                                 break;
1590                                         case 'f':
1591                                                 c = '\f';
1592                                                 break;
1593                                         case 'n':
1594                                                 c = '\n';
1595                                                 break;
1596                                         case 'r':
1597                                                 c = '\r';
1598                                                 break;
1599                                         case 't':
1600                                                 c = '\t';
1601                                                 break;
1602                                         case 'v':
1603                                                 c = '\v';
1604                                                 break;
1605
1606                                                 /*
1607                                                  * in all other cases, take the char after '\'
1608                                                  * literally
1609                                                  */
1610                                 }
1611                         }
1612
1613                         /* Add c to output string */
1614                         *output_ptr++ = c;
1615                 }
1616
1617                 /* Check whether raw input matched null marker */
1618                 input_len = end_ptr - start_ptr;
1619                 if (input_len == cstate->opts.null_print_len &&
1620                         strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1621                         cstate->raw_fields[fieldno] = NULL;
1622                 else
1623                 {
1624                         /*
1625                          * At this point we know the field is supposed to contain data.
1626                          *
1627                          * If we de-escaped any non-7-bit-ASCII chars, make sure the
1628                          * resulting string is valid data for the db encoding.
1629                          */
1630                         if (saw_non_ascii)
1631                         {
1632                                 char       *fld = cstate->raw_fields[fieldno];
1633
1634                                 pg_verifymbstr(fld, output_ptr - fld, false);
1635                         }
1636                 }
1637
1638                 /* Terminate attribute value in output area */
1639                 *output_ptr++ = '\0';
1640
1641                 fieldno++;
1642                 /* Done if we hit EOL instead of a delim */
1643                 if (!found_delim)
1644                         break;
1645         }
1646
1647         /* Clean up state of attribute_buf */
1648         output_ptr--;
1649         Assert(*output_ptr == '\0');
1650         cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1651
1652         return fieldno;
1653 }
1654
1655 /*
1656  * Parse the current line into separate attributes (fields),
1657  * performing de-escaping as needed.  This has exactly the same API as
1658  * CopyReadAttributesText, except we parse the fields according to
1659  * "standard" (i.e. common) CSV usage.
1660  */
1661 static int
1662 CopyReadAttributesCSV(CopyFromState cstate)
1663 {
1664         char            delimc = cstate->opts.delim[0];
1665         char            quotec = cstate->opts.quote[0];
1666         char            escapec = cstate->opts.escape[0];
1667         int                     fieldno;
1668         char       *output_ptr;
1669         char       *cur_ptr;
1670         char       *line_end_ptr;
1671
1672         /*
1673          * We need a special case for zero-column tables: check that the input
1674          * line is empty, and return.
1675          */
1676         if (cstate->max_fields <= 0)
1677         {
1678                 if (cstate->line_buf.len != 0)
1679                         ereport(ERROR,
1680                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1681                                          errmsg("extra data after last expected column")));
1682                 return 0;
1683         }
1684
1685         resetStringInfo(&cstate->attribute_buf);
1686
1687         /*
1688          * The de-escaped attributes will certainly not be longer than the input
1689          * data line, so we can just force attribute_buf to be large enough and
1690          * then transfer data without any checks for enough space.  We need to do
1691          * it this way because enlarging attribute_buf mid-stream would invalidate
1692          * pointers already stored into cstate->raw_fields[].
1693          */
1694         if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1695                 enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1696         output_ptr = cstate->attribute_buf.data;
1697
1698         /* set pointer variables for loop */
1699         cur_ptr = cstate->line_buf.data;
1700         line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1701
1702         /* Outer loop iterates over fields */
1703         fieldno = 0;
1704         for (;;)
1705         {
1706                 bool            found_delim = false;
1707                 bool            saw_quote = false;
1708                 char       *start_ptr;
1709                 char       *end_ptr;
1710                 int                     input_len;
1711
1712                 /* Make sure there is enough space for the next value */
1713                 if (fieldno >= cstate->max_fields)
1714                 {
1715                         cstate->max_fields *= 2;
1716                         cstate->raw_fields =
1717                                 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1718                 }
1719
1720                 /* Remember start of field on both input and output sides */
1721                 start_ptr = cur_ptr;
1722                 cstate->raw_fields[fieldno] = output_ptr;
1723
1724                 /*
1725                  * Scan data for field,
1726                  *
1727                  * The loop starts in "not quote" mode and then toggles between that
1728                  * and "in quote" mode. The loop exits normally if it is in "not
1729                  * quote" mode and a delimiter or line end is seen.
1730                  */
1731                 for (;;)
1732                 {
1733                         char            c;
1734
1735                         /* Not in quote */
1736                         for (;;)
1737                         {
1738                                 end_ptr = cur_ptr;
1739                                 if (cur_ptr >= line_end_ptr)
1740                                         goto endfield;
1741                                 c = *cur_ptr++;
1742                                 /* unquoted field delimiter */
1743                                 if (c == delimc)
1744                                 {
1745                                         found_delim = true;
1746                                         goto endfield;
1747                                 }
1748                                 /* start of quoted field (or part of field) */
1749                                 if (c == quotec)
1750                                 {
1751                                         saw_quote = true;
1752                                         break;
1753                                 }
1754                                 /* Add c to output string */
1755                                 *output_ptr++ = c;
1756                         }
1757
1758                         /* In quote */
1759                         for (;;)
1760                         {
1761                                 end_ptr = cur_ptr;
1762                                 if (cur_ptr >= line_end_ptr)
1763                                         ereport(ERROR,
1764                                                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1765                                                          errmsg("unterminated CSV quoted field")));
1766
1767                                 c = *cur_ptr++;
1768
1769                                 /* escape within a quoted field */
1770                                 if (c == escapec)
1771                                 {
1772                                         /*
1773                                          * peek at the next char if available, and escape it if it
1774                                          * is an escape char or a quote char
1775                                          */
1776                                         if (cur_ptr < line_end_ptr)
1777                                         {
1778                                                 char            nextc = *cur_ptr;
1779
1780                                                 if (nextc == escapec || nextc == quotec)
1781                                                 {
1782                                                         *output_ptr++ = nextc;
1783                                                         cur_ptr++;
1784                                                         continue;
1785                                                 }
1786                                         }
1787                                 }
1788
1789                                 /*
1790                                  * end of quoted field. Must do this test after testing for
1791                                  * escape in case quote char and escape char are the same
1792                                  * (which is the common case).
1793                                  */
1794                                 if (c == quotec)
1795                                         break;
1796
1797                                 /* Add c to output string */
1798                                 *output_ptr++ = c;
1799                         }
1800                 }
1801 endfield:
1802
1803                 /* Terminate attribute value in output area */
1804                 *output_ptr++ = '\0';
1805
1806                 /* Check whether raw input matched null marker */
1807                 input_len = end_ptr - start_ptr;
1808                 if (!saw_quote && input_len == cstate->opts.null_print_len &&
1809                         strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1810                         cstate->raw_fields[fieldno] = NULL;
1811
1812                 fieldno++;
1813                 /* Done if we hit EOL instead of a delim */
1814                 if (!found_delim)
1815                         break;
1816         }
1817
1818         /* Clean up state of attribute_buf */
1819         output_ptr--;
1820         Assert(*output_ptr == '\0');
1821         cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1822
1823         return fieldno;
1824 }
1825
1826
1827 /*
1828  * Read a binary attribute
1829  */
1830 static Datum
1831 CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
1832                                                 Oid typioparam, int32 typmod,
1833                                                 bool *isnull)
1834 {
1835         int32           fld_size;
1836         Datum           result;
1837
1838         if (!CopyGetInt32(cstate, &fld_size))
1839                 ereport(ERROR,
1840                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1841                                  errmsg("unexpected EOF in COPY data")));
1842         if (fld_size == -1)
1843         {
1844                 *isnull = true;
1845                 return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
1846         }
1847         if (fld_size < 0)
1848                 ereport(ERROR,
1849                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1850                                  errmsg("invalid field size")));
1851
1852         /* reset attribute_buf to empty, and load raw data in it */
1853         resetStringInfo(&cstate->attribute_buf);
1854
1855         enlargeStringInfo(&cstate->attribute_buf, fld_size);
1856         if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
1857                                                    fld_size) != fld_size)
1858                 ereport(ERROR,
1859                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1860                                  errmsg("unexpected EOF in COPY data")));
1861
1862         cstate->attribute_buf.len = fld_size;
1863         cstate->attribute_buf.data[fld_size] = '\0';
1864
1865         /* Call the column type's binary input converter */
1866         result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
1867                                                                  typioparam, typmod);
1868
1869         /* Trouble if it didn't eat the whole buffer */
1870         if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
1871                 ereport(ERROR,
1872                                 (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
1873                                  errmsg("incorrect binary data format")));
1874
1875         *isnull = false;
1876         return result;
1877 }