src/backend/replication/pgoutput/pgoutput.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * pgoutput.c
   4  *              Logical Replication output plugin
   5  *
   6  * Copyright (c) 2012-2024, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *                src/backend/replication/pgoutput/pgoutput.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #include "postgres.h"
  14
  15 #include "access/tupconvert.h"
  16 #include "catalog/partition.h"
  17 #include "catalog/pg_publication.h"
  18 #include "catalog/pg_publication_rel.h"
  19 #include "catalog/pg_subscription.h"
  20 #include "commands/defrem.h"
  21 #include "commands/subscriptioncmds.h"
  22 #include "executor/executor.h"
  23 #include "fmgr.h"
  24 #include "nodes/makefuncs.h"
  25 #include "parser/parse_relation.h"
  26 #include "replication/logical.h"
  27 #include "replication/logicalproto.h"
  28 #include "replication/origin.h"
  29 #include "replication/pgoutput.h"
  30 #include "utils/builtins.h"
  31 #include "utils/inval.h"
  32 #include "utils/lsyscache.h"
  33 #include "utils/memutils.h"
  34 #include "utils/rel.h"
  35 #include "utils/syscache.h"
  36 #include "utils/varlena.h"
  37
  38 PG_MODULE_MAGIC;
  39
  40 static void pgoutput_startup(LogicalDecodingContext *ctx,
  41                                                          OutputPluginOptions *opt, bool is_init);
  42 static void pgoutput_shutdown(LogicalDecodingContext *ctx);
  43 static void pgoutput_begin_txn(LogicalDecodingContext *ctx,
  44                                                            ReorderBufferTXN *txn);
  45 static void pgoutput_commit_txn(LogicalDecodingContext *ctx,
  46                                                                 ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
  47 static void pgoutput_change(LogicalDecodingContext *ctx,
  48                                                         ReorderBufferTXN *txn, Relation relation,
  49                                                         ReorderBufferChange *change);
  50 static void pgoutput_truncate(LogicalDecodingContext *ctx,
  51                                                           ReorderBufferTXN *txn, int nrelations, Relation relations[],
  52                                                           ReorderBufferChange *change);
  53 static void pgoutput_message(LogicalDecodingContext *ctx,
  54                                                          ReorderBufferTXN *txn, XLogRecPtr message_lsn,
  55                                                          bool transactional, const char *prefix,
  56                                                          Size sz, const char *message);
  57 static bool pgoutput_origin_filter(LogicalDecodingContext *ctx,
  58                                                                    RepOriginId origin_id);
  59 static void pgoutput_begin_prepare_txn(LogicalDecodingContext *ctx,
  60                                                                            ReorderBufferTXN *txn);
  61 static void pgoutput_prepare_txn(LogicalDecodingContext *ctx,
  62                                                                  ReorderBufferTXN *txn, XLogRecPtr prepare_lsn);
  63 static void pgoutput_commit_prepared_txn(LogicalDecodingContext *ctx,
  64                                                                                  ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
  65 static void pgoutput_rollback_prepared_txn(LogicalDecodingContext *ctx,
  66                                                                                    ReorderBufferTXN *txn,
  67                                                                                    XLogRecPtr prepare_end_lsn,
  68                                                                                    TimestampTz prepare_time);
  69 static void pgoutput_stream_start(struct LogicalDecodingContext *ctx,
  70                                                                   ReorderBufferTXN *txn);
  71 static void pgoutput_stream_stop(struct LogicalDecodingContext *ctx,
  72                                                                  ReorderBufferTXN *txn);
  73 static void pgoutput_stream_abort(struct LogicalDecodingContext *ctx,
  74                                                                   ReorderBufferTXN *txn,
  75                                                                   XLogRecPtr abort_lsn);
  76 static void pgoutput_stream_commit(struct LogicalDecodingContext *ctx,
  77                                                                    ReorderBufferTXN *txn,
  78                                                                    XLogRecPtr commit_lsn);
  79 static void pgoutput_stream_prepare_txn(LogicalDecodingContext *ctx,
  80                                                                                 ReorderBufferTXN *txn, XLogRecPtr prepare_lsn);
  81
  82 static bool publications_valid;
  83
  84 static List *LoadPublications(List *pubnames);
  85 static void publication_invalidation_cb(Datum arg, int cacheid,
  86                                                                                 uint32 hashvalue);
  87 static void send_relation_and_attrs(Relation relation, TransactionId xid,
  88                                                                         LogicalDecodingContext *ctx,
  89                                                                         Bitmapset *columns);
  90 static void send_repl_origin(LogicalDecodingContext *ctx,
  91                                                          RepOriginId origin_id, XLogRecPtr origin_lsn,
  92                                                          bool send_origin);
  93
  94 /*
  95  * Only 3 publication actions are used for row filtering ("insert", "update",
  96  * "delete"). See RelationSyncEntry.exprstate[].
  97  */
  98 enum RowFilterPubAction
  99 {
 100         PUBACTION_INSERT,
 101         PUBACTION_UPDATE,
 102         PUBACTION_DELETE,
 103 };
 104
 105 #define NUM_ROWFILTER_PUBACTIONS (PUBACTION_DELETE+1)
 106
 107 /*
 108  * Entry in the map used to remember which relation schemas we sent.
 109  *
 110  * The schema_sent flag determines if the current schema record for the
 111  * relation (and for its ancestor if publish_as_relid is set) was already
 112  * sent to the subscriber (in which case we don't need to send it again).
 113  *
 114  * The schema cache on downstream is however updated only at commit time,
 115  * and with streamed transactions the commit order may be different from
 116  * the order the transactions are sent in. Also, the (sub) transactions
 117  * might get aborted so we need to send the schema for each (sub) transaction
 118  * so that we don't lose the schema information on abort. For handling this,
 119  * we maintain the list of xids (streamed_txns) for those we have already sent
 120  * the schema.
 121  *
 122  * For partitions, 'pubactions' considers not only the table's own
 123  * publications, but also those of all of its ancestors.
 124  */
 125 typedef struct RelationSyncEntry
 126 {
 127         Oid                     relid;                  /* relation oid */
 128
 129         bool            replicate_valid;        /* overall validity flag for entry */
 130
 131         bool            schema_sent;
 132         List       *streamed_txns;      /* streamed toplevel transactions with this
 133                                                                  * schema */
 134
 135         /* are we publishing this rel? */
 136         PublicationActions pubactions;
 137
 138         /*
 139          * ExprState array for row filter. Different publication actions don't
 140          * allow multiple expressions to always be combined into one, because
 141          * updates or deletes restrict the column in expression to be part of the
 142          * replica identity index whereas inserts do not have this restriction, so
 143          * there is one ExprState per publication action.
 144          */
 145         ExprState  *exprstate[NUM_ROWFILTER_PUBACTIONS];
 146         EState     *estate;                     /* executor state used for row filter */
 147         TupleTableSlot *new_slot;       /* slot for storing new tuple */
 148         TupleTableSlot *old_slot;       /* slot for storing old tuple */
 149
 150         /*
 151          * OID of the relation to publish changes as.  For a partition, this may
 152          * be set to one of its ancestors whose schema will be used when
 153          * replicating changes, if publish_via_partition_root is set for the
 154          * publication.
 155          */
 156         Oid                     publish_as_relid;
 157
 158         /*
 159          * Map used when replicating using an ancestor's schema to convert tuples
 160          * from partition's type to the ancestor's; NULL if publish_as_relid is
 161          * same as 'relid' or if unnecessary due to partition and the ancestor
 162          * having identical TupleDesc.
 163          */
 164         AttrMap    *attrmap;
 165
 166         /*
 167          * Columns included in the publication, or NULL if all columns are
 168          * included implicitly.  Note that the attnums in this bitmap are not
 169          * shifted by FirstLowInvalidHeapAttributeNumber.
 170          */
 171         Bitmapset  *columns;
 172
 173         /*
 174          * Private context to store additional data for this entry - state for the
 175          * row filter expressions, column list, etc.
 176          */
 177         MemoryContext entry_cxt;
 178 } RelationSyncEntry;
 179
 180 /*
 181  * Maintain a per-transaction level variable to track whether the transaction
 182  * has sent BEGIN. BEGIN is only sent when the first change in a transaction
 183  * is processed. This makes it possible to skip sending a pair of BEGIN/COMMIT
 184  * messages for empty transactions which saves network bandwidth.
 185  *
 186  * This optimization is not used for prepared transactions because if the
 187  * WALSender restarts after prepare of a transaction and before commit prepared
 188  * of the same transaction then we won't be able to figure out if we have
 189  * skipped sending BEGIN/PREPARE of a transaction as it was empty. This is
 190  * because we would have lost the in-memory txndata information that was
 191  * present prior to the restart. This will result in sending a spurious
 192  * COMMIT PREPARED without a corresponding prepared transaction at the
 193  * downstream which would lead to an error when it tries to process it.
 194  *
 195  * XXX We could achieve this optimization by changing protocol to send
 196  * additional information so that downstream can detect that the corresponding
 197  * prepare has not been sent. However, adding such a check for every
 198  * transaction in the downstream could be costly so we might want to do it
 199  * optionally.
 200  *
 201  * We also don't have this optimization for streamed transactions because
 202  * they can contain prepared transactions.
 203  */
 204 typedef struct PGOutputTxnData
 205 {
 206         bool            sent_begin_txn; /* flag indicating whether BEGIN has been sent */
 207 } PGOutputTxnData;
 208
 209 /* Map used to remember which relation schemas we sent. */
 210 static HTAB *RelationSyncCache = NULL;
 211
 212 static void init_rel_sync_cache(MemoryContext cachectx);
 213 static void cleanup_rel_sync_cache(TransactionId xid, bool is_commit);
 214 static RelationSyncEntry *get_rel_sync_entry(PGOutputData *data,
 215                                                                                          Relation relation);
 216 static void rel_sync_cache_relation_cb(Datum arg, Oid relid);
 217 static void rel_sync_cache_publication_cb(Datum arg, int cacheid,
 218                                                                                   uint32 hashvalue);
 219 static void set_schema_sent_in_streamed_txn(RelationSyncEntry *entry,
 220                                                                                         TransactionId xid);
 221 static bool get_schema_sent_in_streamed_txn(RelationSyncEntry *entry,
 222                                                                                         TransactionId xid);
 223 static void init_tuple_slot(PGOutputData *data, Relation relation,
 224                                                         RelationSyncEntry *entry);
 225
 226 /* row filter routines */
 227 static EState *create_estate_for_relation(Relation rel);
 228 static void pgoutput_row_filter_init(PGOutputData *data,
 229                                                                          List *publications,
 230                                                                          RelationSyncEntry *entry);
 231 static bool pgoutput_row_filter_exec_expr(ExprState *state,
 232                                                                                   ExprContext *econtext);
 233 static bool pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot,
 234                                                                 TupleTableSlot **new_slot_ptr,
 235                                                                 RelationSyncEntry *entry,
 236                                                                 ReorderBufferChangeType *action);
 237
 238 /* column list routines */
 239 static void pgoutput_column_list_init(PGOutputData *data,
 240                                                                           List *publications,
 241                                                                           RelationSyncEntry *entry);
 242
 243 /*
 244  * Specify output plugin callbacks
 245  */
 246 void
 247 _PG_output_plugin_init(OutputPluginCallbacks *cb)
 248 {
 249         cb->startup_cb = pgoutput_startup;
 250         cb->begin_cb = pgoutput_begin_txn;
 251         cb->change_cb = pgoutput_change;
 252         cb->truncate_cb = pgoutput_truncate;
 253         cb->message_cb = pgoutput_message;
 254         cb->commit_cb = pgoutput_commit_txn;
 255
 256         cb->begin_prepare_cb = pgoutput_begin_prepare_txn;
 257         cb->prepare_cb = pgoutput_prepare_txn;
 258         cb->commit_prepared_cb = pgoutput_commit_prepared_txn;
 259         cb->rollback_prepared_cb = pgoutput_rollback_prepared_txn;
 260         cb->filter_by_origin_cb = pgoutput_origin_filter;
 261         cb->shutdown_cb = pgoutput_shutdown;
 262
 263         /* transaction streaming */
 264         cb->stream_start_cb = pgoutput_stream_start;
 265         cb->stream_stop_cb = pgoutput_stream_stop;
 266         cb->stream_abort_cb = pgoutput_stream_abort;
 267         cb->stream_commit_cb = pgoutput_stream_commit;
 268         cb->stream_change_cb = pgoutput_change;
 269         cb->stream_message_cb = pgoutput_message;
 270         cb->stream_truncate_cb = pgoutput_truncate;
 271         /* transaction streaming - two-phase commit */
 272         cb->stream_prepare_cb = pgoutput_stream_prepare_txn;
 273 }
 274
 275 static void
 276 parse_output_parameters(List *options, PGOutputData *data)
 277 {
 278         ListCell   *lc;
 279         bool            protocol_version_given = false;
 280         bool            publication_names_given = false;
 281         bool            binary_option_given = false;
 282         bool            messages_option_given = false;
 283         bool            streaming_given = false;
 284         bool            two_phase_option_given = false;
 285         bool            origin_option_given = false;
 286
 287         data->binary = false;
 288         data->streaming = LOGICALREP_STREAM_OFF;
 289         data->messages = false;
 290         data->two_phase = false;
 291
 292         foreach(lc, options)
 293         {
 294                 DefElem    *defel = (DefElem *) lfirst(lc);
 295
 296                 Assert(defel->arg == NULL || IsA(defel->arg, String));
 297
 298                 /* Check each param, whether or not we recognize it */
 299                 if (strcmp(defel->defname, "proto_version") == 0)
 300                 {
 301                         unsigned long parsed;
 302                         char       *endptr;
 303
 304                         if (protocol_version_given)
 305                                 ereport(ERROR,
 306                                                 (errcode(ERRCODE_SYNTAX_ERROR),
 307                                                  errmsg("conflicting or redundant options")));
 308                         protocol_version_given = true;
 309
 310                         errno = 0;
 311                         parsed = strtoul(strVal(defel->arg), &endptr, 10);
 312                         if (errno != 0 || *endptr != '\0')
 313                                 ereport(ERROR,
 314                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 315                                                  errmsg("invalid proto_version")));
 316
 317                         if (parsed > PG_UINT32_MAX)
 318                                 ereport(ERROR,
 319                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 320                                                  errmsg("proto_version \"%s\" out of range",
 321                                                                 strVal(defel->arg))));
 322
 323                         data->protocol_version = (uint32) parsed;
 324                 }
 325                 else if (strcmp(defel->defname, "publication_names") == 0)
 326                 {
 327                         if (publication_names_given)
 328                                 ereport(ERROR,
 329                                                 (errcode(ERRCODE_SYNTAX_ERROR),
 330                                                  errmsg("conflicting or redundant options")));
 331                         publication_names_given = true;
 332
 333                         if (!SplitIdentifierString(strVal(defel->arg), ',',
 334                                                                            &data->publication_names))
 335                                 ereport(ERROR,
 336                                                 (errcode(ERRCODE_INVALID_NAME),
 337                                                  errmsg("invalid publication_names syntax")));
 338                 }
 339                 else if (strcmp(defel->defname, "binary") == 0)
 340                 {
 341                         if (binary_option_given)
 342                                 ereport(ERROR,
 343                                                 (errcode(ERRCODE_SYNTAX_ERROR),
 344                                                  errmsg("conflicting or redundant options")));
 345                         binary_option_given = true;
 346
 347                         data->binary = defGetBoolean(defel);
 348                 }
 349                 else if (strcmp(defel->defname, "messages") == 0)
 350                 {
 351                         if (messages_option_given)
 352                                 ereport(ERROR,
 353                                                 (errcode(ERRCODE_SYNTAX_ERROR),
 354                                                  errmsg("conflicting or redundant options")));
 355                         messages_option_given = true;
 356
 357                         data->messages = defGetBoolean(defel);
 358                 }
 359                 else if (strcmp(defel->defname, "streaming") == 0)
 360                 {
 361                         if (streaming_given)
 362                                 ereport(ERROR,
 363                                                 (errcode(ERRCODE_SYNTAX_ERROR),
 364                                                  errmsg("conflicting or redundant options")));
 365                         streaming_given = true;
 366
 367                         data->streaming = defGetStreamingMode(defel);
 368                 }
 369                 else if (strcmp(defel->defname, "two_phase") == 0)
 370                 {
 371                         if (two_phase_option_given)
 372                                 ereport(ERROR,
 373                                                 (errcode(ERRCODE_SYNTAX_ERROR),
 374                                                  errmsg("conflicting or redundant options")));
 375                         two_phase_option_given = true;
 376
 377                         data->two_phase = defGetBoolean(defel);
 378                 }
 379                 else if (strcmp(defel->defname, "origin") == 0)
 380                 {
 381                         char       *origin;
 382
 383                         if (origin_option_given)
 384                                 ereport(ERROR,
 385                                                 errcode(ERRCODE_SYNTAX_ERROR),
 386                                                 errmsg("conflicting or redundant options"));
 387                         origin_option_given = true;
 388
 389                         origin = defGetString(defel);
 390                         if (pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) == 0)
 391                                 data->publish_no_origin = true;
 392                         else if (pg_strcasecmp(origin, LOGICALREP_ORIGIN_ANY) == 0)
 393                                 data->publish_no_origin = false;
 394                         else
 395                                 ereport(ERROR,
 396                                                 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 397                                                 errmsg("unrecognized origin value: \"%s\"", origin));
 398                 }
 399                 else
 400                         elog(ERROR, "unrecognized pgoutput option: %s", defel->defname);
 401         }
 402
 403         /* Check required options */
 404         if (!protocol_version_given)
 405                 ereport(ERROR,
 406                                 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 407                                 errmsg("option \"%s\" missing", "proto_version"));
 408         if (!publication_names_given)
 409                 ereport(ERROR,
 410                                 errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 411                                 errmsg("option \"%s\" missing", "publication_names"));
 412 }
 413
 414 /*
 415  * Initialize this plugin
 416  */
 417 static void
 418 pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
 419                                  bool is_init)
 420 {
 421         PGOutputData *data = palloc0(sizeof(PGOutputData));
 422         static bool publication_callback_registered = false;
 423
 424         /* Create our memory context for private allocations. */
 425         data->context = AllocSetContextCreate(ctx->context,
 426                                                                                   "logical replication output context",
 427                                                                                   ALLOCSET_DEFAULT_SIZES);
 428
 429         data->cachectx = AllocSetContextCreate(ctx->context,
 430                                                                                    "logical replication cache context",
 431                                                                                    ALLOCSET_DEFAULT_SIZES);
 432
 433         ctx->output_plugin_private = data;
 434
 435         /* This plugin uses binary protocol. */
 436         opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT;
 437
 438         /*
 439          * This is replication start and not slot initialization.
 440          *
 441          * Parse and validate options passed by the client.
 442          */
 443         if (!is_init)
 444         {
 445                 /* Parse the params and ERROR if we see any we don't recognize */
 446                 parse_output_parameters(ctx->output_plugin_options, data);
 447
 448                 /* Check if we support requested protocol */
 449                 if (data->protocol_version > LOGICALREP_PROTO_MAX_VERSION_NUM)
 450                         ereport(ERROR,
 451                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 452                                          errmsg("client sent proto_version=%d but server only supports protocol %d or lower",
 453                                                         data->protocol_version, LOGICALREP_PROTO_MAX_VERSION_NUM)));
 454
 455                 if (data->protocol_version < LOGICALREP_PROTO_MIN_VERSION_NUM)
 456                         ereport(ERROR,
 457                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 458                                          errmsg("client sent proto_version=%d but server only supports protocol %d or higher",
 459                                                         data->protocol_version, LOGICALREP_PROTO_MIN_VERSION_NUM)));
 460
 461                 /*
 462                  * Decide whether to enable streaming. It is disabled by default, in
 463                  * which case we just update the flag in decoding context. Otherwise
 464                  * we only allow it with sufficient version of the protocol, and when
 465                  * the output plugin supports it.
 466                  */
 467                 if (data->streaming == LOGICALREP_STREAM_OFF)
 468                         ctx->streaming = false;
 469                 else if (data->streaming == LOGICALREP_STREAM_ON &&
 470                                  data->protocol_version < LOGICALREP_PROTO_STREAM_VERSION_NUM)
 471                         ereport(ERROR,
 472                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 473                                          errmsg("requested proto_version=%d does not support streaming, need %d or higher",
 474                                                         data->protocol_version, LOGICALREP_PROTO_STREAM_VERSION_NUM)));
 475                 else if (data->streaming == LOGICALREP_STREAM_PARALLEL &&
 476                                  data->protocol_version < LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM)
 477                         ereport(ERROR,
 478                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 479                                          errmsg("requested proto_version=%d does not support parallel streaming, need %d or higher",
 480                                                         data->protocol_version, LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM)));
 481                 else if (!ctx->streaming)
 482                         ereport(ERROR,
 483                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 484                                          errmsg("streaming requested, but not supported by output plugin")));
 485
 486                 /*
 487                  * Here, we just check whether the two-phase option is passed by
 488                  * plugin and decide whether to enable it at later point of time. It
 489                  * remains enabled if the previous start-up has done so. But we only
 490                  * allow the option to be passed in with sufficient version of the
 491                  * protocol, and when the output plugin supports it.
 492                  */
 493                 if (!data->two_phase)
 494                         ctx->twophase_opt_given = false;
 495                 else if (data->protocol_version < LOGICALREP_PROTO_TWOPHASE_VERSION_NUM)
 496                         ereport(ERROR,
 497                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 498                                          errmsg("requested proto_version=%d does not support two-phase commit, need %d or higher",
 499                                                         data->protocol_version, LOGICALREP_PROTO_TWOPHASE_VERSION_NUM)));
 500                 else if (!ctx->twophase)
 501                         ereport(ERROR,
 502                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 503                                          errmsg("two-phase commit requested, but not supported by output plugin")));
 504                 else
 505                         ctx->twophase_opt_given = true;
 506
 507                 /* Init publication state. */
 508                 data->publications = NIL;
 509                 publications_valid = false;
 510
 511                 /*
 512                  * Register callback for pg_publication if we didn't already do that
 513                  * during some previous call in this process.
 514                  */
 515                 if (!publication_callback_registered)
 516                 {
 517                         CacheRegisterSyscacheCallback(PUBLICATIONOID,
 518                                                                                   publication_invalidation_cb,
 519                                                                                   (Datum) 0);
 520                         publication_callback_registered = true;
 521                 }
 522
 523                 /* Initialize relation schema cache. */
 524                 init_rel_sync_cache(CacheMemoryContext);
 525         }
 526         else
 527         {
 528                 /*
 529                  * Disable the streaming and prepared transactions during the slot
 530                  * initialization mode.
 531                  */
 532                 ctx->streaming = false;
 533                 ctx->twophase = false;
 534         }
 535 }
 536
 537 /*
 538  * BEGIN callback.
 539  *
 540  * Don't send the BEGIN message here instead postpone it until the first
 541  * change. In logical replication, a common scenario is to replicate a set of
 542  * tables (instead of all tables) and transactions whose changes were on
 543  * the table(s) that are not published will produce empty transactions. These
 544  * empty transactions will send BEGIN and COMMIT messages to subscribers,
 545  * using bandwidth on something with little/no use for logical replication.
 546  */
 547 static void
 548 pgoutput_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
 549 {
 550         PGOutputTxnData *txndata = MemoryContextAllocZero(ctx->context,
 551                                                                                                           sizeof(PGOutputTxnData));
 552
 553         txn->output_plugin_private = txndata;
 554 }
 555
 556 /*
 557  * Send BEGIN.
 558  *
 559  * This is called while processing the first change of the transaction.
 560  */
 561 static void
 562 pgoutput_send_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
 563 {
 564         bool            send_replication_origin = txn->origin_id != InvalidRepOriginId;
 565         PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private;
 566
 567         Assert(txndata);
 568         Assert(!txndata->sent_begin_txn);
 569
 570         OutputPluginPrepareWrite(ctx, !send_replication_origin);
 571         logicalrep_write_begin(ctx->out, txn);
 572         txndata->sent_begin_txn = true;
 573
 574         send_repl_origin(ctx, txn->origin_id, txn->origin_lsn,
 575                                          send_replication_origin);
 576
 577         OutputPluginWrite(ctx, true);
 578 }
 579
 580 /*
 581  * COMMIT callback
 582  */
 583 static void
 584 pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
 585                                         XLogRecPtr commit_lsn)
 586 {
 587         PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private;
 588         bool            sent_begin_txn;
 589
 590         Assert(txndata);
 591
 592         /*
 593          * We don't need to send the commit message unless some relevant change
 594          * from this transaction has been sent to the downstream.
 595          */
 596         sent_begin_txn = txndata->sent_begin_txn;
 597         OutputPluginUpdateProgress(ctx, !sent_begin_txn);
 598         pfree(txndata);
 599         txn->output_plugin_private = NULL;
 600
 601         if (!sent_begin_txn)
 602         {
 603                 elog(DEBUG1, "skipped replication of an empty transaction with XID: %u", txn->xid);
 604                 return;
 605         }
 606
 607         OutputPluginPrepareWrite(ctx, true);
 608         logicalrep_write_commit(ctx->out, txn, commit_lsn);
 609         OutputPluginWrite(ctx, true);
 610 }
 611
 612 /*
 613  * BEGIN PREPARE callback
 614  */
 615 static void
 616 pgoutput_begin_prepare_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
 617 {
 618         bool            send_replication_origin = txn->origin_id != InvalidRepOriginId;
 619
 620         OutputPluginPrepareWrite(ctx, !send_replication_origin);
 621         logicalrep_write_begin_prepare(ctx->out, txn);
 622
 623         send_repl_origin(ctx, txn->origin_id, txn->origin_lsn,
 624                                          send_replication_origin);
 625
 626         OutputPluginWrite(ctx, true);
 627 }
 628
 629 /*
 630  * PREPARE callback
 631  */
 632 static void
 633 pgoutput_prepare_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
 634                                          XLogRecPtr prepare_lsn)
 635 {
 636         OutputPluginUpdateProgress(ctx, false);
 637
 638         OutputPluginPrepareWrite(ctx, true);
 639         logicalrep_write_prepare(ctx->out, txn, prepare_lsn);
 640         OutputPluginWrite(ctx, true);
 641 }
 642
 643 /*
 644  * COMMIT PREPARED callback
 645  */
 646 static void
 647 pgoutput_commit_prepared_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
 648                                                          XLogRecPtr commit_lsn)
 649 {
 650         OutputPluginUpdateProgress(ctx, false);
 651
 652         OutputPluginPrepareWrite(ctx, true);
 653         logicalrep_write_commit_prepared(ctx->out, txn, commit_lsn);
 654         OutputPluginWrite(ctx, true);
 655 }
 656
 657 /*
 658  * ROLLBACK PREPARED callback
 659  */
 660 static void
 661 pgoutput_rollback_prepared_txn(LogicalDecodingContext *ctx,
 662                                                            ReorderBufferTXN *txn,
 663                                                            XLogRecPtr prepare_end_lsn,
 664                                                            TimestampTz prepare_time)
 665 {
 666         OutputPluginUpdateProgress(ctx, false);
 667
 668         OutputPluginPrepareWrite(ctx, true);
 669         logicalrep_write_rollback_prepared(ctx->out, txn, prepare_end_lsn,
 670                                                                            prepare_time);
 671         OutputPluginWrite(ctx, true);
 672 }
 673
 674 /*
 675  * Write the current schema of the relation and its ancestor (if any) if not
 676  * done yet.
 677  */
 678 static void
 679 maybe_send_schema(LogicalDecodingContext *ctx,
 680                                   ReorderBufferChange *change,
 681                                   Relation relation, RelationSyncEntry *relentry)
 682 {
 683         PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
 684         bool            schema_sent;
 685         TransactionId xid = InvalidTransactionId;
 686         TransactionId topxid = InvalidTransactionId;
 687
 688         /*
 689          * Remember XID of the (sub)transaction for the change. We don't care if
 690          * it's top-level transaction or not (we have already sent that XID in
 691          * start of the current streaming block).
 692          *
 693          * If we're not in a streaming block, just use InvalidTransactionId and
 694          * the write methods will not include it.
 695          */
 696         if (data->in_streaming)
 697                 xid = change->txn->xid;
 698
 699         if (rbtxn_is_subtxn(change->txn))
 700                 topxid = rbtxn_get_toptxn(change->txn)->xid;
 701         else
 702                 topxid = xid;
 703
 704         /*
 705          * Do we need to send the schema? We do track streamed transactions
 706          * separately, because those may be applied later (and the regular
 707          * transactions won't see their effects until then) and in an order that
 708          * we don't know at this point.
 709          *
 710          * XXX There is a scope of optimization here. Currently, we always send
 711          * the schema first time in a streaming transaction but we can probably
 712          * avoid that by checking 'relentry->schema_sent' flag. However, before
 713          * doing that we need to study its impact on the case where we have a mix
 714          * of streaming and non-streaming transactions.
 715          */
 716         if (data->in_streaming)
 717                 schema_sent = get_schema_sent_in_streamed_txn(relentry, topxid);
 718         else
 719                 schema_sent = relentry->schema_sent;
 720
 721         /* Nothing to do if we already sent the schema. */
 722         if (schema_sent)
 723                 return;
 724
 725         /*
 726          * Send the schema.  If the changes will be published using an ancestor's
 727          * schema, not the relation's own, send that ancestor's schema before
 728          * sending relation's own (XXX - maybe sending only the former suffices?).
 729          */
 730         if (relentry->publish_as_relid != RelationGetRelid(relation))
 731         {
 732                 Relation        ancestor = RelationIdGetRelation(relentry->publish_as_relid);
 733
 734                 send_relation_and_attrs(ancestor, xid, ctx, relentry->columns);
 735                 RelationClose(ancestor);
 736         }
 737
 738         send_relation_and_attrs(relation, xid, ctx, relentry->columns);
 739
 740         if (data->in_streaming)
 741                 set_schema_sent_in_streamed_txn(relentry, topxid);
 742         else
 743                 relentry->schema_sent = true;
 744 }
 745
 746 /*
 747  * Sends a relation
 748  */
 749 static void
 750 send_relation_and_attrs(Relation relation, TransactionId xid,
 751                                                 LogicalDecodingContext *ctx,
 752                                                 Bitmapset *columns)
 753 {
 754         TupleDesc       desc = RelationGetDescr(relation);
 755         int                     i;
 756
 757         /*
 758          * Write out type info if needed.  We do that only for user-created types.
 759          * We use FirstGenbkiObjectId as the cutoff, so that we only consider
 760          * objects with hand-assigned OIDs to be "built in", not for instance any
 761          * function or type defined in the information_schema. This is important
 762          * because only hand-assigned OIDs can be expected to remain stable across
 763          * major versions.
 764          */
 765         for (i = 0; i < desc->natts; i++)
 766         {
 767                 Form_pg_attribute att = TupleDescAttr(desc, i);
 768
 769                 if (att->attisdropped || att->attgenerated)
 770                         continue;
 771
 772                 if (att->atttypid < FirstGenbkiObjectId)
 773                         continue;
 774
 775                 /* Skip this attribute if it's not present in the column list */
 776                 if (columns != NULL && !bms_is_member(att->attnum, columns))
 777                         continue;
 778
 779                 OutputPluginPrepareWrite(ctx, false);
 780                 logicalrep_write_typ(ctx->out, xid, att->atttypid);
 781                 OutputPluginWrite(ctx, false);
 782         }
 783
 784         OutputPluginPrepareWrite(ctx, false);
 785         logicalrep_write_rel(ctx->out, xid, relation, columns);
 786         OutputPluginWrite(ctx, false);
 787 }
 788
 789 /*
 790  * Executor state preparation for evaluation of row filter expressions for the
 791  * specified relation.
 792  */
 793 static EState *
 794 create_estate_for_relation(Relation rel)
 795 {
 796         EState     *estate;
 797         RangeTblEntry *rte;
 798         List       *perminfos = NIL;
 799
 800         estate = CreateExecutorState();
 801
 802         rte = makeNode(RangeTblEntry);
 803         rte->rtekind = RTE_RELATION;
 804         rte->relid = RelationGetRelid(rel);
 805         rte->relkind = rel->rd_rel->relkind;
 806         rte->rellockmode = AccessShareLock;
 807
 808         addRTEPermissionInfo(&perminfos, rte);
 809
 810         ExecInitRangeTable(estate, list_make1(rte), perminfos);
 811
 812         estate->es_output_cid = GetCurrentCommandId(false);
 813
 814         return estate;
 815 }
 816
 817 /*
 818  * Evaluates row filter.
 819  *
 820  * If the row filter evaluates to NULL, it is taken as false i.e. the change
 821  * isn't replicated.
 822  */
 823 static bool
 824 pgoutput_row_filter_exec_expr(ExprState *state, ExprContext *econtext)
 825 {
 826         Datum           ret;
 827         bool            isnull;
 828
 829         Assert(state != NULL);
 830
 831         ret = ExecEvalExprSwitchContext(state, econtext, &isnull);
 832
 833         elog(DEBUG3, "row filter evaluates to %s (isnull: %s)",
 834                  isnull ? "false" : DatumGetBool(ret) ? "true" : "false",
 835                  isnull ? "true" : "false");
 836
 837         if (isnull)
 838                 return false;
 839
 840         return DatumGetBool(ret);
 841 }
 842
 843 /*
 844  * Make sure the per-entry memory context exists.
 845  */
 846 static void
 847 pgoutput_ensure_entry_cxt(PGOutputData *data, RelationSyncEntry *entry)
 848 {
 849         Relation        relation;
 850
 851         /* The context may already exist, in which case bail out. */
 852         if (entry->entry_cxt)
 853                 return;
 854
 855         relation = RelationIdGetRelation(entry->publish_as_relid);
 856
 857         entry->entry_cxt = AllocSetContextCreate(data->cachectx,
 858                                                                                          "entry private context",
 859                                                                                          ALLOCSET_SMALL_SIZES);
 860
 861         MemoryContextCopyAndSetIdentifier(entry->entry_cxt,
 862                                                                           RelationGetRelationName(relation));
 863 }
 864
 865 /*
 866  * Initialize the row filter.
 867  */
 868 static void
 869 pgoutput_row_filter_init(PGOutputData *data, List *publications,
 870                                                  RelationSyncEntry *entry)
 871 {
 872         ListCell   *lc;
 873         List       *rfnodes[] = {NIL, NIL, NIL};        /* One per pubaction */
 874         bool            no_filter[] = {false, false, false};    /* One per pubaction */
 875         MemoryContext oldctx;
 876         int                     idx;
 877         bool            has_filter = true;
 878         Oid                     schemaid = get_rel_namespace(entry->publish_as_relid);
 879
 880         /*
 881          * Find if there are any row filters for this relation. If there are, then
 882          * prepare the necessary ExprState and cache it in entry->exprstate. To
 883          * build an expression state, we need to ensure the following:
 884          *
 885          * All the given publication-table mappings must be checked.
 886          *
 887          * Multiple publications might have multiple row filters for this
 888          * relation. Since row filter usage depends on the DML operation, there
 889          * are multiple lists (one for each operation) to which row filters will
 890          * be appended.
 891          *
 892          * FOR ALL TABLES and FOR TABLES IN SCHEMA implies "don't use row filter
 893          * expression" so it takes precedence.
 894          */
 895         foreach(lc, publications)
 896         {
 897                 Publication *pub = lfirst(lc);
 898                 HeapTuple       rftuple = NULL;
 899                 Datum           rfdatum = 0;
 900                 bool            pub_no_filter = true;
 901
 902                 /*
 903                  * If the publication is FOR ALL TABLES, or the publication includes a
 904                  * FOR TABLES IN SCHEMA where the table belongs to the referred
 905                  * schema, then it is treated the same as if there are no row filters
 906                  * (even if other publications have a row filter).
 907                  */
 908                 if (!pub->alltables &&
 909                         !SearchSysCacheExists2(PUBLICATIONNAMESPACEMAP,
 910                                                                    ObjectIdGetDatum(schemaid),
 911                                                                    ObjectIdGetDatum(pub->oid)))
 912                 {
 913                         /*
 914                          * Check for the presence of a row filter in this publication.
 915                          */
 916                         rftuple = SearchSysCache2(PUBLICATIONRELMAP,
 917                                                                           ObjectIdGetDatum(entry->publish_as_relid),
 918                                                                           ObjectIdGetDatum(pub->oid));
 919
 920                         if (HeapTupleIsValid(rftuple))
 921                         {
 922                                 /* Null indicates no filter. */
 923                                 rfdatum = SysCacheGetAttr(PUBLICATIONRELMAP, rftuple,
 924                                                                                   Anum_pg_publication_rel_prqual,
 925                                                                                   &pub_no_filter);
 926                         }
 927                 }
 928
 929                 if (pub_no_filter)
 930                 {
 931                         if (rftuple)
 932                                 ReleaseSysCache(rftuple);
 933
 934                         no_filter[PUBACTION_INSERT] |= pub->pubactions.pubinsert;
 935                         no_filter[PUBACTION_UPDATE] |= pub->pubactions.pubupdate;
 936                         no_filter[PUBACTION_DELETE] |= pub->pubactions.pubdelete;
 937
 938                         /*
 939                          * Quick exit if all the DML actions are publicized via this
 940                          * publication.
 941                          */
 942                         if (no_filter[PUBACTION_INSERT] &&
 943                                 no_filter[PUBACTION_UPDATE] &&
 944                                 no_filter[PUBACTION_DELETE])
 945                         {
 946                                 has_filter = false;
 947                                 break;
 948                         }
 949
 950                         /* No additional work for this publication. Next one. */
 951                         continue;
 952                 }
 953
 954                 /* Form the per pubaction row filter lists. */
 955                 if (pub->pubactions.pubinsert && !no_filter[PUBACTION_INSERT])
 956                         rfnodes[PUBACTION_INSERT] = lappend(rfnodes[PUBACTION_INSERT],
 957                                                                                                 TextDatumGetCString(rfdatum));
 958                 if (pub->pubactions.pubupdate && !no_filter[PUBACTION_UPDATE])
 959                         rfnodes[PUBACTION_UPDATE] = lappend(rfnodes[PUBACTION_UPDATE],
 960                                                                                                 TextDatumGetCString(rfdatum));
 961                 if (pub->pubactions.pubdelete && !no_filter[PUBACTION_DELETE])
 962                         rfnodes[PUBACTION_DELETE] = lappend(rfnodes[PUBACTION_DELETE],
 963                                                                                                 TextDatumGetCString(rfdatum));
 964
 965                 ReleaseSysCache(rftuple);
 966         }                                                       /* loop all subscribed publications */
 967
 968         /* Clean the row filter */
 969         for (idx = 0; idx < NUM_ROWFILTER_PUBACTIONS; idx++)
 970         {
 971                 if (no_filter[idx])
 972                 {
 973                         list_free_deep(rfnodes[idx]);
 974                         rfnodes[idx] = NIL;
 975                 }
 976         }
 977
 978         if (has_filter)
 979         {
 980                 Relation        relation = RelationIdGetRelation(entry->publish_as_relid);
 981
 982                 pgoutput_ensure_entry_cxt(data, entry);
 983
 984                 /*
 985                  * Now all the filters for all pubactions are known. Combine them when
 986                  * their pubactions are the same.
 987                  */
 988                 oldctx = MemoryContextSwitchTo(entry->entry_cxt);
 989                 entry->estate = create_estate_for_relation(relation);
 990                 for (idx = 0; idx < NUM_ROWFILTER_PUBACTIONS; idx++)
 991                 {
 992                         List       *filters = NIL;
 993                         Expr       *rfnode;
 994
 995                         if (rfnodes[idx] == NIL)
 996                                 continue;
 997
 998                         foreach(lc, rfnodes[idx])
 999                                 filters = lappend(filters, stringToNode((char *) lfirst(lc)));
1000
1001                         /* combine the row filter and cache the ExprState */
1002                         rfnode = make_orclause(filters);
1003                         entry->exprstate[idx] = ExecPrepareExpr(rfnode, entry->estate);
1004                 }                                               /* for each pubaction */
1005                 MemoryContextSwitchTo(oldctx);
1006
1007                 RelationClose(relation);
1008         }
1009 }
1010
1011 /*
1012  * Initialize the column list.
1013  */
1014 static void
1015 pgoutput_column_list_init(PGOutputData *data, List *publications,
1016                                                   RelationSyncEntry *entry)
1017 {
1018         ListCell   *lc;
1019         bool            first = true;
1020         Relation        relation = RelationIdGetRelation(entry->publish_as_relid);
1021
1022         /*
1023          * Find if there are any column lists for this relation. If there are,
1024          * build a bitmap using the column lists.
1025          *
1026          * Multiple publications might have multiple column lists for this
1027          * relation.
1028          *
1029          * Note that we don't support the case where the column list is different
1030          * for the same table when combining publications. See comments atop
1031          * fetch_table_list. But one can later change the publication so we still
1032          * need to check all the given publication-table mappings and report an
1033          * error if any publications have a different column list.
1034          *
1035          * FOR ALL TABLES and FOR TABLES IN SCHEMA imply "don't use column list".
1036          */
1037         foreach(lc, publications)
1038         {
1039                 Publication *pub = lfirst(lc);
1040                 HeapTuple       cftuple = NULL;
1041                 Datum           cfdatum = 0;
1042                 Bitmapset  *cols = NULL;
1043
1044                 /*
1045                  * If the publication is FOR ALL TABLES then it is treated the same as
1046                  * if there are no column lists (even if other publications have a
1047                  * list).
1048                  */
1049                 if (!pub->alltables)
1050                 {
1051                         bool            pub_no_list = true;
1052
1053                         /*
1054                          * Check for the presence of a column list in this publication.
1055                          *
1056                          * Note: If we find no pg_publication_rel row, it's a publication
1057                          * defined for a whole schema, so it can't have a column list,
1058                          * just like a FOR ALL TABLES publication.
1059                          */
1060                         cftuple = SearchSysCache2(PUBLICATIONRELMAP,
1061                                                                           ObjectIdGetDatum(entry->publish_as_relid),
1062                                                                           ObjectIdGetDatum(pub->oid));
1063
1064                         if (HeapTupleIsValid(cftuple))
1065                         {
1066                                 /* Lookup the column list attribute. */
1067                                 cfdatum = SysCacheGetAttr(PUBLICATIONRELMAP, cftuple,
1068                                                                                   Anum_pg_publication_rel_prattrs,
1069                                                                                   &pub_no_list);
1070
1071                                 /* Build the column list bitmap in the per-entry context. */
1072                                 if (!pub_no_list)       /* when not null */
1073                                 {
1074                                         int                     i;
1075                                         int                     nliveatts = 0;
1076                                         TupleDesc       desc = RelationGetDescr(relation);
1077
1078                                         pgoutput_ensure_entry_cxt(data, entry);
1079
1080                                         cols = pub_collist_to_bitmapset(cols, cfdatum,
1081                                                                                                         entry->entry_cxt);
1082
1083                                         /* Get the number of live attributes. */
1084                                         for (i = 0; i < desc->natts; i++)
1085                                         {
1086                                                 Form_pg_attribute att = TupleDescAttr(desc, i);
1087
1088                                                 if (att->attisdropped || att->attgenerated)
1089                                                         continue;
1090
1091                                                 nliveatts++;
1092                                         }
1093
1094                                         /*
1095                                          * If column list includes all the columns of the table,
1096                                          * set it to NULL.
1097                                          */
1098                                         if (bms_num_members(cols) == nliveatts)
1099                                         {
1100                                                 bms_free(cols);
1101                                                 cols = NULL;
1102                                         }
1103                                 }
1104
1105                                 ReleaseSysCache(cftuple);
1106                         }
1107                 }
1108
1109                 if (first)
1110                 {
1111                         entry->columns = cols;
1112                         first = false;
1113                 }
1114                 else if (!bms_equal(entry->columns, cols))
1115                         ereport(ERROR,
1116                                         errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1117                                         errmsg("cannot use different column lists for table \"%s.%s\" in different publications",
1118                                                    get_namespace_name(RelationGetNamespace(relation)),
1119                                                    RelationGetRelationName(relation)));
1120         }                                                       /* loop all subscribed publications */
1121
1122         RelationClose(relation);
1123 }
1124
1125 /*
1126  * Initialize the slot for storing new and old tuples, and build the map that
1127  * will be used to convert the relation's tuples into the ancestor's format.
1128  */
1129 static void
1130 init_tuple_slot(PGOutputData *data, Relation relation,
1131                                 RelationSyncEntry *entry)
1132 {
1133         MemoryContext oldctx;
1134         TupleDesc       oldtupdesc;
1135         TupleDesc       newtupdesc;
1136
1137         oldctx = MemoryContextSwitchTo(data->cachectx);
1138
1139         /*
1140          * Create tuple table slots. Create a copy of the TupleDesc as it needs to
1141          * live as long as the cache remains.
1142          */
1143         oldtupdesc = CreateTupleDescCopyConstr(RelationGetDescr(relation));
1144         newtupdesc = CreateTupleDescCopyConstr(RelationGetDescr(relation));
1145
1146         entry->old_slot = MakeSingleTupleTableSlot(oldtupdesc, &TTSOpsHeapTuple);
1147         entry->new_slot = MakeSingleTupleTableSlot(newtupdesc, &TTSOpsHeapTuple);
1148
1149         MemoryContextSwitchTo(oldctx);
1150
1151         /*
1152          * Cache the map that will be used to convert the relation's tuples into
1153          * the ancestor's format, if needed.
1154          */
1155         if (entry->publish_as_relid != RelationGetRelid(relation))
1156         {
1157                 Relation        ancestor = RelationIdGetRelation(entry->publish_as_relid);
1158                 TupleDesc       indesc = RelationGetDescr(relation);
1159                 TupleDesc       outdesc = RelationGetDescr(ancestor);
1160
1161                 /* Map must live as long as the session does. */
1162                 oldctx = MemoryContextSwitchTo(CacheMemoryContext);
1163
1164                 entry->attrmap = build_attrmap_by_name_if_req(indesc, outdesc, false);
1165
1166                 MemoryContextSwitchTo(oldctx);
1167                 RelationClose(ancestor);
1168         }
1169 }
1170
1171 /*
1172  * Change is checked against the row filter if any.
1173  *
1174  * Returns true if the change is to be replicated, else false.
1175  *
1176  * For inserts, evaluate the row filter for new tuple.
1177  * For deletes, evaluate the row filter for old tuple.
1178  * For updates, evaluate the row filter for old and new tuple.
1179  *
1180  * For updates, if both evaluations are true, we allow sending the UPDATE and
1181  * if both the evaluations are false, it doesn't replicate the UPDATE. Now, if
1182  * only one of the tuples matches the row filter expression, we transform
1183  * UPDATE to DELETE or INSERT to avoid any data inconsistency based on the
1184  * following rules:
1185  *
1186  * Case 1: old-row (no match)    new-row (no match)  -> (drop change)
1187  * Case 2: old-row (no match)    new row (match)     -> INSERT
1188  * Case 3: old-row (match)       new-row (no match)  -> DELETE
1189  * Case 4: old-row (match)       new row (match)     -> UPDATE
1190  *
1191  * The new action is updated in the action parameter.
1192  *
1193  * The new slot could be updated when transforming the UPDATE into INSERT,
1194  * because the original new tuple might not have column values from the replica
1195  * identity.
1196  *
1197  * Examples:
1198  * Let's say the old tuple satisfies the row filter but the new tuple doesn't.
1199  * Since the old tuple satisfies, the initial table synchronization copied this
1200  * row (or another method was used to guarantee that there is data
1201  * consistency).  However, after the UPDATE the new tuple doesn't satisfy the
1202  * row filter, so from a data consistency perspective, that row should be
1203  * removed on the subscriber. The UPDATE should be transformed into a DELETE
1204  * statement and be sent to the subscriber. Keeping this row on the subscriber
1205  * is undesirable because it doesn't reflect what was defined in the row filter
1206  * expression on the publisher. This row on the subscriber would likely not be
1207  * modified by replication again. If someone inserted a new row with the same
1208  * old identifier, replication could stop due to a constraint violation.
1209  *
1210  * Let's say the old tuple doesn't match the row filter but the new tuple does.
1211  * Since the old tuple doesn't satisfy, the initial table synchronization
1212  * probably didn't copy this row. However, after the UPDATE the new tuple does
1213  * satisfy the row filter, so from a data consistency perspective, that row
1214  * should be inserted on the subscriber. Otherwise, subsequent UPDATE or DELETE
1215  * statements have no effect (it matches no row -- see
1216  * apply_handle_update_internal()). So, the UPDATE should be transformed into an
1217  * INSERT statement and be sent to the subscriber. However, this might surprise
1218  * someone who expects the data set to satisfy the row filter expression on the
1219  * provider.
1220  */
1221 static bool
1222 pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot,
1223                                         TupleTableSlot **new_slot_ptr, RelationSyncEntry *entry,
1224                                         ReorderBufferChangeType *action)
1225 {
1226         TupleDesc       desc;
1227         int                     i;
1228         bool            old_matched,
1229                                 new_matched,
1230                                 result;
1231         TupleTableSlot *tmp_new_slot;
1232         TupleTableSlot *new_slot = *new_slot_ptr;
1233         ExprContext *ecxt;
1234         ExprState  *filter_exprstate;
1235
1236         /*
1237          * We need this map to avoid relying on ReorderBufferChangeType enums
1238          * having specific values.
1239          */
1240         static const int map_changetype_pubaction[] = {
1241                 [REORDER_BUFFER_CHANGE_INSERT] = PUBACTION_INSERT,
1242                 [REORDER_BUFFER_CHANGE_UPDATE] = PUBACTION_UPDATE,
1243                 [REORDER_BUFFER_CHANGE_DELETE] = PUBACTION_DELETE
1244         };
1245
1246         Assert(*action == REORDER_BUFFER_CHANGE_INSERT ||
1247                    *action == REORDER_BUFFER_CHANGE_UPDATE ||
1248                    *action == REORDER_BUFFER_CHANGE_DELETE);
1249
1250         Assert(new_slot || old_slot);
1251
1252         /* Get the corresponding row filter */
1253         filter_exprstate = entry->exprstate[map_changetype_pubaction[*action]];
1254
1255         /* Bail out if there is no row filter */
1256         if (!filter_exprstate)
1257                 return true;
1258
1259         elog(DEBUG3, "table \"%s.%s\" has row filter",
1260                  get_namespace_name(RelationGetNamespace(relation)),
1261                  RelationGetRelationName(relation));
1262
1263         ResetPerTupleExprContext(entry->estate);
1264
1265         ecxt = GetPerTupleExprContext(entry->estate);
1266
1267         /*
1268          * For the following occasions where there is only one tuple, we can
1269          * evaluate the row filter for that tuple and return.
1270          *
1271          * For inserts, we only have the new tuple.
1272          *
1273          * For updates, we can have only a new tuple when none of the replica
1274          * identity columns changed and none of those columns have external data
1275          * but we still need to evaluate the row filter for the new tuple as the
1276          * existing values of those columns might not match the filter. Also,
1277          * users can use constant expressions in the row filter, so we anyway need
1278          * to evaluate it for the new tuple.
1279          *
1280          * For deletes, we only have the old tuple.
1281          */
1282         if (!new_slot || !old_slot)
1283         {
1284                 ecxt->ecxt_scantuple = new_slot ? new_slot : old_slot;
1285                 result = pgoutput_row_filter_exec_expr(filter_exprstate, ecxt);
1286
1287                 return result;
1288         }
1289
1290         /*
1291          * Both the old and new tuples must be valid only for updates and need to
1292          * be checked against the row filter.
1293          */
1294         Assert(map_changetype_pubaction[*action] == PUBACTION_UPDATE);
1295
1296         slot_getallattrs(new_slot);
1297         slot_getallattrs(old_slot);
1298
1299         tmp_new_slot = NULL;
1300         desc = RelationGetDescr(relation);
1301
1302         /*
1303          * The new tuple might not have all the replica identity columns, in which
1304          * case it needs to be copied over from the old tuple.
1305          */
1306         for (i = 0; i < desc->natts; i++)
1307         {
1308                 Form_pg_attribute att = TupleDescAttr(desc, i);
1309
1310                 /*
1311                  * if the column in the new tuple or old tuple is null, nothing to do
1312                  */
1313                 if (new_slot->tts_isnull[i] || old_slot->tts_isnull[i])
1314                         continue;
1315
1316                 /*
1317                  * Unchanged toasted replica identity columns are only logged in the
1318                  * old tuple. Copy this over to the new tuple. The changed (or WAL
1319                  * Logged) toast values are always assembled in memory and set as
1320                  * VARTAG_INDIRECT. See ReorderBufferToastReplace.
1321                  */
1322                 if (att->attlen == -1 &&
1323                         VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) &&
1324                         !VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i]))
1325                 {
1326                         if (!tmp_new_slot)
1327                         {
1328                                 tmp_new_slot = MakeSingleTupleTableSlot(desc, &TTSOpsVirtual);
1329                                 ExecClearTuple(tmp_new_slot);
1330
1331                                 memcpy(tmp_new_slot->tts_values, new_slot->tts_values,
1332                                            desc->natts * sizeof(Datum));
1333                                 memcpy(tmp_new_slot->tts_isnull, new_slot->tts_isnull,
1334                                            desc->natts * sizeof(bool));
1335                         }
1336
1337                         tmp_new_slot->tts_values[i] = old_slot->tts_values[i];
1338                         tmp_new_slot->tts_isnull[i] = old_slot->tts_isnull[i];
1339                 }
1340         }
1341
1342         ecxt->ecxt_scantuple = old_slot;
1343         old_matched = pgoutput_row_filter_exec_expr(filter_exprstate, ecxt);
1344
1345         if (tmp_new_slot)
1346         {
1347                 ExecStoreVirtualTuple(tmp_new_slot);
1348                 ecxt->ecxt_scantuple = tmp_new_slot;
1349         }
1350         else
1351                 ecxt->ecxt_scantuple = new_slot;
1352
1353         new_matched = pgoutput_row_filter_exec_expr(filter_exprstate, ecxt);
1354
1355         /*
1356          * Case 1: if both tuples don't match the row filter, bailout. Send
1357          * nothing.
1358          */
1359         if (!old_matched && !new_matched)
1360                 return false;
1361
1362         /*
1363          * Case 2: if the old tuple doesn't satisfy the row filter but the new
1364          * tuple does, transform the UPDATE into INSERT.
1365          *
1366          * Use the newly transformed tuple that must contain the column values for
1367          * all the replica identity columns. This is required to ensure that the
1368          * while inserting the tuple in the downstream node, we have all the
1369          * required column values.
1370          */
1371         if (!old_matched && new_matched)
1372         {
1373                 *action = REORDER_BUFFER_CHANGE_INSERT;
1374
1375                 if (tmp_new_slot)
1376                         *new_slot_ptr = tmp_new_slot;
1377         }
1378
1379         /*
1380          * Case 3: if the old tuple satisfies the row filter but the new tuple
1381          * doesn't, transform the UPDATE into DELETE.
1382          *
1383          * This transformation does not require another tuple. The Old tuple will
1384          * be used for DELETE.
1385          */
1386         else if (old_matched && !new_matched)
1387                 *action = REORDER_BUFFER_CHANGE_DELETE;
1388
1389         /*
1390          * Case 4: if both tuples match the row filter, transformation isn't
1391          * required. (*action is default UPDATE).
1392          */
1393
1394         return true;
1395 }
1396
1397 /*
1398  * Sends the decoded DML over wire.
1399  *
1400  * This is called both in streaming and non-streaming modes.
1401  */
1402 static void
1403 pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
1404                                 Relation relation, ReorderBufferChange *change)
1405 {
1406         PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
1407         PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private;
1408         MemoryContext old;
1409         RelationSyncEntry *relentry;
1410         TransactionId xid = InvalidTransactionId;
1411         Relation        ancestor = NULL;
1412         Relation        targetrel = relation;
1413         ReorderBufferChangeType action = change->action;
1414         TupleTableSlot *old_slot = NULL;
1415         TupleTableSlot *new_slot = NULL;
1416
1417         if (!is_publishable_relation(relation))
1418                 return;
1419
1420         /*
1421          * Remember the xid for the change in streaming mode. We need to send xid
1422          * with each change in the streaming mode so that subscriber can make
1423          * their association and on aborts, it can discard the corresponding
1424          * changes.
1425          */
1426         if (data->in_streaming)
1427                 xid = change->txn->xid;
1428
1429         relentry = get_rel_sync_entry(data, relation);
1430
1431         /* First check the table filter */
1432         switch (action)
1433         {
1434                 case REORDER_BUFFER_CHANGE_INSERT:
1435                         if (!relentry->pubactions.pubinsert)
1436                                 return;
1437                         break;
1438                 case REORDER_BUFFER_CHANGE_UPDATE:
1439                         if (!relentry->pubactions.pubupdate)
1440                                 return;
1441                         break;
1442                 case REORDER_BUFFER_CHANGE_DELETE:
1443                         if (!relentry->pubactions.pubdelete)
1444                                 return;
1445
1446                         /*
1447                          * This is only possible if deletes are allowed even when replica
1448                          * identity is not defined for a table. Since the DELETE action
1449                          * can't be published, we simply return.
1450                          */
1451                         if (!change->data.tp.oldtuple)
1452                         {
1453                                 elog(DEBUG1, "didn't send DELETE change because of missing oldtuple");
1454                                 return;
1455                         }
1456                         break;
1457                 default:
1458                         Assert(false);
1459         }
1460
1461         /* Avoid leaking memory by using and resetting our own context */
1462         old = MemoryContextSwitchTo(data->context);
1463
1464         /* Switch relation if publishing via root. */
1465         if (relentry->publish_as_relid != RelationGetRelid(relation))
1466         {
1467                 Assert(relation->rd_rel->relispartition);
1468                 ancestor = RelationIdGetRelation(relentry->publish_as_relid);
1469                 targetrel = ancestor;
1470         }
1471
1472         if (change->data.tp.oldtuple)
1473         {
1474                 old_slot = relentry->old_slot;
1475                 ExecStoreHeapTuple(change->data.tp.oldtuple, old_slot, false);
1476
1477                 /* Convert tuple if needed. */
1478                 if (relentry->attrmap)
1479                 {
1480                         TupleTableSlot *slot = MakeTupleTableSlot(RelationGetDescr(targetrel),
1481                                                                                                           &TTSOpsVirtual);
1482
1483                         old_slot = execute_attr_map_slot(relentry->attrmap, old_slot, slot);
1484                 }
1485         }
1486
1487         if (change->data.tp.newtuple)
1488         {
1489                 new_slot = relentry->new_slot;
1490                 ExecStoreHeapTuple(change->data.tp.newtuple, new_slot, false);
1491
1492                 /* Convert tuple if needed. */
1493                 if (relentry->attrmap)
1494                 {
1495                         TupleTableSlot *slot = MakeTupleTableSlot(RelationGetDescr(targetrel),
1496                                                                                                           &TTSOpsVirtual);
1497
1498                         new_slot = execute_attr_map_slot(relentry->attrmap, new_slot, slot);
1499                 }
1500         }
1501
1502         /*
1503          * Check row filter.
1504          *
1505          * Updates could be transformed to inserts or deletes based on the results
1506          * of the row filter for old and new tuple.
1507          */
1508         if (!pgoutput_row_filter(targetrel, old_slot, &new_slot, relentry, &action))
1509                 goto cleanup;
1510
1511         /*
1512          * Send BEGIN if we haven't yet.
1513          *
1514          * We send the BEGIN message after ensuring that we will actually send the
1515          * change. This avoids sending a pair of BEGIN/COMMIT messages for empty
1516          * transactions.
1517          */
1518         if (txndata && !txndata->sent_begin_txn)
1519                 pgoutput_send_begin(ctx, txn);
1520
1521         /*
1522          * Schema should be sent using the original relation because it also sends
1523          * the ancestor's relation.
1524          */
1525         maybe_send_schema(ctx, change, relation, relentry);
1526
1527         OutputPluginPrepareWrite(ctx, true);
1528
1529         /* Send the data */
1530         switch (action)
1531         {
1532                 case REORDER_BUFFER_CHANGE_INSERT:
1533                         logicalrep_write_insert(ctx->out, xid, targetrel, new_slot,
1534                                                                         data->binary, relentry->columns);
1535                         break;
1536                 case REORDER_BUFFER_CHANGE_UPDATE:
1537                         logicalrep_write_update(ctx->out, xid, targetrel, old_slot,
1538                                                                         new_slot, data->binary, relentry->columns);
1539                         break;
1540                 case REORDER_BUFFER_CHANGE_DELETE:
1541                         logicalrep_write_delete(ctx->out, xid, targetrel, old_slot,
1542                                                                         data->binary, relentry->columns);
1543                         break;
1544                 default:
1545                         Assert(false);
1546         }
1547
1548         OutputPluginWrite(ctx, true);
1549
1550 cleanup:
1551         if (RelationIsValid(ancestor))
1552         {
1553                 RelationClose(ancestor);
1554                 ancestor = NULL;
1555         }
1556
1557         /* Drop the new slots that were used to store the converted tuples. */
1558         if (relentry->attrmap)
1559         {
1560                 if (old_slot)
1561                         ExecDropSingleTupleTableSlot(old_slot);
1562
1563                 if (new_slot)
1564                         ExecDropSingleTupleTableSlot(new_slot);
1565         }
1566
1567         MemoryContextSwitchTo(old);
1568         MemoryContextReset(data->context);
1569 }
1570
1571 static void
1572 pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
1573                                   int nrelations, Relation relations[], ReorderBufferChange *change)
1574 {
1575         PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
1576         PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private;
1577         MemoryContext old;
1578         RelationSyncEntry *relentry;
1579         int                     i;
1580         int                     nrelids;
1581         Oid                *relids;
1582         TransactionId xid = InvalidTransactionId;
1583
1584         /* Remember the xid for the change in streaming mode. See pgoutput_change. */
1585         if (data->in_streaming)
1586                 xid = change->txn->xid;
1587
1588         old = MemoryContextSwitchTo(data->context);
1589
1590         relids = palloc0(nrelations * sizeof(Oid));
1591         nrelids = 0;
1592
1593         for (i = 0; i < nrelations; i++)
1594         {
1595                 Relation        relation = relations[i];
1596                 Oid                     relid = RelationGetRelid(relation);
1597
1598                 if (!is_publishable_relation(relation))
1599                         continue;
1600
1601                 relentry = get_rel_sync_entry(data, relation);
1602
1603                 if (!relentry->pubactions.pubtruncate)
1604                         continue;
1605
1606                 /*
1607                  * Don't send partitions if the publication wants to send only the
1608                  * root tables through it.
1609                  */
1610                 if (relation->rd_rel->relispartition &&
1611                         relentry->publish_as_relid != relid)
1612                         continue;
1613
1614                 relids[nrelids++] = relid;
1615
1616                 /* Send BEGIN if we haven't yet */
1617                 if (txndata && !txndata->sent_begin_txn)
1618                         pgoutput_send_begin(ctx, txn);
1619
1620                 maybe_send_schema(ctx, change, relation, relentry);
1621         }
1622
1623         if (nrelids > 0)
1624         {
1625                 OutputPluginPrepareWrite(ctx, true);
1626                 logicalrep_write_truncate(ctx->out,
1627                                                                   xid,
1628                                                                   nrelids,
1629                                                                   relids,
1630                                                                   change->data.truncate.cascade,
1631                                                                   change->data.truncate.restart_seqs);
1632                 OutputPluginWrite(ctx, true);
1633         }
1634
1635         MemoryContextSwitchTo(old);
1636         MemoryContextReset(data->context);
1637 }
1638
1639 static void
1640 pgoutput_message(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
1641                                  XLogRecPtr message_lsn, bool transactional, const char *prefix, Size sz,
1642                                  const char *message)
1643 {
1644         PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
1645         TransactionId xid = InvalidTransactionId;
1646
1647         if (!data->messages)
1648                 return;
1649
1650         /*
1651          * Remember the xid for the message in streaming mode. See
1652          * pgoutput_change.
1653          */
1654         if (data->in_streaming)
1655                 xid = txn->xid;
1656
1657         /*
1658          * Output BEGIN if we haven't yet. Avoid for non-transactional messages.
1659          */
1660         if (transactional)
1661         {
1662                 PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private;
1663
1664                 /* Send BEGIN if we haven't yet */
1665                 if (txndata && !txndata->sent_begin_txn)
1666                         pgoutput_send_begin(ctx, txn);
1667         }
1668
1669         OutputPluginPrepareWrite(ctx, true);
1670         logicalrep_write_message(ctx->out,
1671                                                          xid,
1672                                                          message_lsn,
1673                                                          transactional,
1674                                                          prefix,
1675                                                          sz,
1676                                                          message);
1677         OutputPluginWrite(ctx, true);
1678 }
1679
1680 /*
1681  * Return true if the data is associated with an origin and the user has
1682  * requested the changes that don't have an origin, false otherwise.
1683  */
1684 static bool
1685 pgoutput_origin_filter(LogicalDecodingContext *ctx,
1686                                            RepOriginId origin_id)
1687 {
1688         PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
1689
1690         if (data->publish_no_origin && origin_id != InvalidRepOriginId)
1691                 return true;
1692
1693         return false;
1694 }
1695
1696 /*
1697  * Shutdown the output plugin.
1698  *
1699  * Note, we don't need to clean the data->context and data->cachectx as
1700  * they are child contexts of the ctx->context so they will be cleaned up by
1701  * logical decoding machinery.
1702  */
1703 static void
1704 pgoutput_shutdown(LogicalDecodingContext *ctx)
1705 {
1706         if (RelationSyncCache)
1707         {
1708                 hash_destroy(RelationSyncCache);
1709                 RelationSyncCache = NULL;
1710         }
1711 }
1712
1713 /*
1714  * Load publications from the list of publication names.
1715  */
1716 static List *
1717 LoadPublications(List *pubnames)
1718 {
1719         List       *result = NIL;
1720         ListCell   *lc;
1721
1722         foreach(lc, pubnames)
1723         {
1724                 char       *pubname = (char *) lfirst(lc);
1725                 Publication *pub = GetPublicationByName(pubname, false);
1726
1727                 result = lappend(result, pub);
1728         }
1729
1730         return result;
1731 }
1732
1733 /*
1734  * Publication syscache invalidation callback.
1735  *
1736  * Called for invalidations on pg_publication.
1737  */
1738 static void
1739 publication_invalidation_cb(Datum arg, int cacheid, uint32 hashvalue)
1740 {
1741         publications_valid = false;
1742
1743         /*
1744          * Also invalidate per-relation cache so that next time the filtering info
1745          * is checked it will be updated with the new publication settings.
1746          */
1747         rel_sync_cache_publication_cb(arg, cacheid, hashvalue);
1748 }
1749
1750 /*
1751  * START STREAM callback
1752  */
1753 static void
1754 pgoutput_stream_start(struct LogicalDecodingContext *ctx,
1755                                           ReorderBufferTXN *txn)
1756 {
1757         PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
1758         bool            send_replication_origin = txn->origin_id != InvalidRepOriginId;
1759
1760         /* we can't nest streaming of transactions */
1761         Assert(!data->in_streaming);
1762
1763         /*
1764          * If we already sent the first stream for this transaction then don't
1765          * send the origin id in the subsequent streams.
1766          */
1767         if (rbtxn_is_streamed(txn))
1768                 send_replication_origin = false;
1769
1770         OutputPluginPrepareWrite(ctx, !send_replication_origin);
1771         logicalrep_write_stream_start(ctx->out, txn->xid, !rbtxn_is_streamed(txn));
1772
1773         send_repl_origin(ctx, txn->origin_id, InvalidXLogRecPtr,
1774                                          send_replication_origin);
1775
1776         OutputPluginWrite(ctx, true);
1777
1778         /* we're streaming a chunk of transaction now */
1779         data->in_streaming = true;
1780 }
1781
1782 /*
1783  * STOP STREAM callback
1784  */
1785 static void
1786 pgoutput_stream_stop(struct LogicalDecodingContext *ctx,
1787                                          ReorderBufferTXN *txn)
1788 {
1789         PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
1790
1791         /* we should be streaming a transaction */
1792         Assert(data->in_streaming);
1793
1794         OutputPluginPrepareWrite(ctx, true);
1795         logicalrep_write_stream_stop(ctx->out);
1796         OutputPluginWrite(ctx, true);
1797
1798         /* we've stopped streaming a transaction */
1799         data->in_streaming = false;
1800 }
1801
1802 /*
1803  * Notify downstream to discard the streamed transaction (along with all
1804  * it's subtransactions, if it's a toplevel transaction).
1805  */
1806 static void
1807 pgoutput_stream_abort(struct LogicalDecodingContext *ctx,
1808                                           ReorderBufferTXN *txn,
1809                                           XLogRecPtr abort_lsn)
1810 {
1811         ReorderBufferTXN *toptxn;
1812         PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
1813         bool            write_abort_info = (data->streaming == LOGICALREP_STREAM_PARALLEL);
1814
1815         /*
1816          * The abort should happen outside streaming block, even for streamed
1817          * transactions. The transaction has to be marked as streamed, though.
1818          */
1819         Assert(!data->in_streaming);
1820
1821         /* determine the toplevel transaction */
1822         toptxn = rbtxn_get_toptxn(txn);
1823
1824         Assert(rbtxn_is_streamed(toptxn));
1825
1826         OutputPluginPrepareWrite(ctx, true);
1827         logicalrep_write_stream_abort(ctx->out, toptxn->xid, txn->xid, abort_lsn,
1828                                                                   txn->xact_time.abort_time, write_abort_info);
1829
1830         OutputPluginWrite(ctx, true);
1831
1832         cleanup_rel_sync_cache(toptxn->xid, false);
1833 }
1834
1835 /*
1836  * Notify downstream to apply the streamed transaction (along with all
1837  * it's subtransactions).
1838  */
1839 static void
1840 pgoutput_stream_commit(struct LogicalDecodingContext *ctx,
1841                                            ReorderBufferTXN *txn,
1842                                            XLogRecPtr commit_lsn)
1843 {
1844         PGOutputData *data PG_USED_FOR_ASSERTS_ONLY = (PGOutputData *) ctx->output_plugin_private;
1845
1846         /*
1847          * The commit should happen outside streaming block, even for streamed
1848          * transactions. The transaction has to be marked as streamed, though.
1849          */
1850         Assert(!data->in_streaming);
1851         Assert(rbtxn_is_streamed(txn));
1852
1853         OutputPluginUpdateProgress(ctx, false);
1854
1855         OutputPluginPrepareWrite(ctx, true);
1856         logicalrep_write_stream_commit(ctx->out, txn, commit_lsn);
1857         OutputPluginWrite(ctx, true);
1858
1859         cleanup_rel_sync_cache(txn->xid, true);
1860 }
1861
1862 /*
1863  * PREPARE callback (for streaming two-phase commit).
1864  *
1865  * Notify the downstream to prepare the transaction.
1866  */
1867 static void
1868 pgoutput_stream_prepare_txn(LogicalDecodingContext *ctx,
1869                                                         ReorderBufferTXN *txn,
1870                                                         XLogRecPtr prepare_lsn)
1871 {
1872         Assert(rbtxn_is_streamed(txn));
1873
1874         OutputPluginUpdateProgress(ctx, false);
1875         OutputPluginPrepareWrite(ctx, true);
1876         logicalrep_write_stream_prepare(ctx->out, txn, prepare_lsn);
1877         OutputPluginWrite(ctx, true);
1878 }
1879
1880 /*
1881  * Initialize the relation schema sync cache for a decoding session.
1882  *
1883  * The hash table is destroyed at the end of a decoding session. While
1884  * relcache invalidations still exist and will still be invoked, they
1885  * will just see the null hash table global and take no action.
1886  */
1887 static void
1888 init_rel_sync_cache(MemoryContext cachectx)
1889 {
1890         HASHCTL         ctl;
1891         static bool relation_callbacks_registered = false;
1892
1893         /* Nothing to do if hash table already exists */
1894         if (RelationSyncCache != NULL)
1895                 return;
1896
1897         /* Make a new hash table for the cache */
1898         ctl.keysize = sizeof(Oid);
1899         ctl.entrysize = sizeof(RelationSyncEntry);
1900         ctl.hcxt = cachectx;
1901
1902         RelationSyncCache = hash_create("logical replication output relation cache",
1903                                                                         128, &ctl,
1904                                                                         HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
1905
1906         Assert(RelationSyncCache != NULL);
1907
1908         /* No more to do if we already registered callbacks */
1909         if (relation_callbacks_registered)
1910                 return;
1911
1912         /* We must update the cache entry for a relation after a relcache flush */
1913         CacheRegisterRelcacheCallback(rel_sync_cache_relation_cb, (Datum) 0);
1914
1915         /*
1916          * Flush all cache entries after a pg_namespace change, in case it was a
1917          * schema rename affecting a relation being replicated.
1918          */
1919         CacheRegisterSyscacheCallback(NAMESPACEOID,
1920                                                                   rel_sync_cache_publication_cb,
1921                                                                   (Datum) 0);
1922
1923         /*
1924          * Flush all cache entries after any publication changes.  (We need no
1925          * callback entry for pg_publication, because publication_invalidation_cb
1926          * will take care of it.)
1927          */
1928         CacheRegisterSyscacheCallback(PUBLICATIONRELMAP,
1929                                                                   rel_sync_cache_publication_cb,
1930                                                                   (Datum) 0);
1931         CacheRegisterSyscacheCallback(PUBLICATIONNAMESPACEMAP,
1932                                                                   rel_sync_cache_publication_cb,
1933                                                                   (Datum) 0);
1934
1935         relation_callbacks_registered = true;
1936 }
1937
1938 /*
1939  * We expect relatively small number of streamed transactions.
1940  */
1941 static bool
1942 get_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid)
1943 {
1944         return list_member_xid(entry->streamed_txns, xid);
1945 }
1946
1947 /*
1948  * Add the xid in the rel sync entry for which we have already sent the schema
1949  * of the relation.
1950  */
1951 static void
1952 set_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid)
1953 {
1954         MemoryContext oldctx;
1955
1956         oldctx = MemoryContextSwitchTo(CacheMemoryContext);
1957
1958         entry->streamed_txns = lappend_xid(entry->streamed_txns, xid);
1959
1960         MemoryContextSwitchTo(oldctx);
1961 }
1962
1963 /*
1964  * Find or create entry in the relation schema cache.
1965  *
1966  * This looks up publications that the given relation is directly or
1967  * indirectly part of (the latter if it's really the relation's ancestor that
1968  * is part of a publication) and fills up the found entry with the information
1969  * about which operations to publish and whether to use an ancestor's schema
1970  * when publishing.
1971  */
1972 static RelationSyncEntry *
1973 get_rel_sync_entry(PGOutputData *data, Relation relation)
1974 {
1975         RelationSyncEntry *entry;
1976         bool            found;
1977         MemoryContext oldctx;
1978         Oid                     relid = RelationGetRelid(relation);
1979
1980         Assert(RelationSyncCache != NULL);
1981
1982         /* Find cached relation info, creating if not found */
1983         entry = (RelationSyncEntry *) hash_search(RelationSyncCache,
1984                                                                                           &relid,
1985                                                                                           HASH_ENTER, &found);
1986         Assert(entry != NULL);
1987
1988         /* initialize entry, if it's new */
1989         if (!found)
1990         {
1991                 entry->replicate_valid = false;
1992                 entry->schema_sent = false;
1993                 entry->streamed_txns = NIL;
1994                 entry->pubactions.pubinsert = entry->pubactions.pubupdate =
1995                         entry->pubactions.pubdelete = entry->pubactions.pubtruncate = false;
1996                 entry->new_slot = NULL;
1997                 entry->old_slot = NULL;
1998                 memset(entry->exprstate, 0, sizeof(entry->exprstate));
1999                 entry->entry_cxt = NULL;
2000                 entry->publish_as_relid = InvalidOid;
2001                 entry->columns = NULL;
2002                 entry->attrmap = NULL;
2003         }
2004
2005         /* Validate the entry */
2006         if (!entry->replicate_valid)
2007         {
2008                 Oid                     schemaId = get_rel_namespace(relid);
2009                 List       *pubids = GetRelationPublications(relid);
2010
2011                 /*
2012                  * We don't acquire a lock on the namespace system table as we build
2013                  * the cache entry using a historic snapshot and all the later changes
2014                  * are absorbed while decoding WAL.
2015                  */
2016                 List       *schemaPubids = GetSchemaPublications(schemaId);
2017                 ListCell   *lc;
2018                 Oid                     publish_as_relid = relid;
2019                 int                     publish_ancestor_level = 0;
2020                 bool            am_partition = get_rel_relispartition(relid);
2021                 char            relkind = get_rel_relkind(relid);
2022                 List       *rel_publications = NIL;
2023
2024                 /* Reload publications if needed before use. */
2025                 if (!publications_valid)
2026                 {
2027                         oldctx = MemoryContextSwitchTo(CacheMemoryContext);
2028                         if (data->publications)
2029                         {
2030                                 list_free_deep(data->publications);
2031                                 data->publications = NIL;
2032                         }
2033                         data->publications = LoadPublications(data->publication_names);
2034                         MemoryContextSwitchTo(oldctx);
2035                         publications_valid = true;
2036                 }
2037
2038                 /*
2039                  * Reset schema_sent status as the relation definition may have
2040                  * changed.  Also reset pubactions to empty in case rel was dropped
2041                  * from a publication.  Also free any objects that depended on the
2042                  * earlier definition.
2043                  */
2044                 entry->schema_sent = false;
2045                 list_free(entry->streamed_txns);
2046                 entry->streamed_txns = NIL;
2047                 bms_free(entry->columns);
2048                 entry->columns = NULL;
2049                 entry->pubactions.pubinsert = false;
2050                 entry->pubactions.pubupdate = false;
2051                 entry->pubactions.pubdelete = false;
2052                 entry->pubactions.pubtruncate = false;
2053
2054                 /*
2055                  * Tuple slots cleanups. (Will be rebuilt later if needed).
2056                  */
2057                 if (entry->old_slot)
2058                         ExecDropSingleTupleTableSlot(entry->old_slot);
2059                 if (entry->new_slot)
2060                         ExecDropSingleTupleTableSlot(entry->new_slot);
2061
2062                 entry->old_slot = NULL;
2063                 entry->new_slot = NULL;
2064
2065                 if (entry->attrmap)
2066                         free_attrmap(entry->attrmap);
2067                 entry->attrmap = NULL;
2068
2069                 /*
2070                  * Row filter cache cleanups.
2071                  */
2072                 if (entry->entry_cxt)
2073                         MemoryContextDelete(entry->entry_cxt);
2074
2075                 entry->entry_cxt = NULL;
2076                 entry->estate = NULL;
2077                 memset(entry->exprstate, 0, sizeof(entry->exprstate));
2078
2079                 /*
2080                  * Build publication cache. We can't use one provided by relcache as
2081                  * relcache considers all publications that the given relation is in,
2082                  * but here we only need to consider ones that the subscriber
2083                  * requested.
2084                  */
2085                 foreach(lc, data->publications)
2086                 {
2087                         Publication *pub = lfirst(lc);
2088                         bool            publish = false;
2089
2090                         /*
2091                          * Under what relid should we publish changes in this publication?
2092                          * We'll use the top-most relid across all publications. Also
2093                          * track the ancestor level for this publication.
2094                          */
2095                         Oid                     pub_relid = relid;
2096                         int                     ancestor_level = 0;
2097
2098                         /*
2099                          * If this is a FOR ALL TABLES publication, pick the partition
2100                          * root and set the ancestor level accordingly.
2101                          */
2102                         if (pub->alltables)
2103                         {
2104                                 publish = true;
2105                                 if (pub->pubviaroot && am_partition)
2106                                 {
2107                                         List       *ancestors = get_partition_ancestors(relid);
2108
2109                                         pub_relid = llast_oid(ancestors);
2110                                         ancestor_level = list_length(ancestors);
2111                                 }
2112                         }
2113
2114                         if (!publish)
2115                         {
2116                                 bool            ancestor_published = false;
2117
2118                                 /*
2119                                  * For a partition, check if any of the ancestors are
2120                                  * published.  If so, note down the topmost ancestor that is
2121                                  * published via this publication, which will be used as the
2122                                  * relation via which to publish the partition's changes.
2123                                  */
2124                                 if (am_partition)
2125                                 {
2126                                         Oid                     ancestor;
2127                                         int                     level;
2128                                         List       *ancestors = get_partition_ancestors(relid);
2129
2130                                         ancestor = GetTopMostAncestorInPublication(pub->oid,
2131                                                                                                                            ancestors,
2132                                                                                                                            &level);
2133
2134                                         if (ancestor != InvalidOid)
2135                                         {
2136                                                 ancestor_published = true;
2137                                                 if (pub->pubviaroot)
2138                                                 {
2139                                                         pub_relid = ancestor;
2140                                                         ancestor_level = level;
2141                                                 }
2142                                         }
2143                                 }
2144
2145                                 if (list_member_oid(pubids, pub->oid) ||
2146                                         list_member_oid(schemaPubids, pub->oid) ||
2147                                         ancestor_published)
2148                                         publish = true;
2149                         }
2150
2151                         /*
2152                          * If the relation is to be published, determine actions to
2153                          * publish, and list of columns, if appropriate.
2154                          *
2155                          * Don't publish changes for partitioned tables, because
2156                          * publishing those of its partitions suffices, unless partition
2157                          * changes won't be published due to pubviaroot being set.
2158                          */
2159                         if (publish &&
2160                                 (relkind != RELKIND_PARTITIONED_TABLE || pub->pubviaroot))
2161                         {
2162                                 entry->pubactions.pubinsert |= pub->pubactions.pubinsert;
2163                                 entry->pubactions.pubupdate |= pub->pubactions.pubupdate;
2164                                 entry->pubactions.pubdelete |= pub->pubactions.pubdelete;
2165                                 entry->pubactions.pubtruncate |= pub->pubactions.pubtruncate;
2166
2167                                 /*
2168                                  * We want to publish the changes as the top-most ancestor
2169                                  * across all publications. So we need to check if the already
2170                                  * calculated level is higher than the new one. If yes, we can
2171                                  * ignore the new value (as it's a child). Otherwise the new
2172                                  * value is an ancestor, so we keep it.
2173                                  */
2174                                 if (publish_ancestor_level > ancestor_level)
2175                                         continue;
2176
2177                                 /*
2178                                  * If we found an ancestor higher up in the tree, discard the
2179                                  * list of publications through which we replicate it, and use
2180                                  * the new ancestor.
2181                                  */
2182                                 if (publish_ancestor_level < ancestor_level)
2183                                 {
2184                                         publish_as_relid = pub_relid;
2185                                         publish_ancestor_level = ancestor_level;
2186
2187                                         /* reset the publication list for this relation */
2188                                         rel_publications = NIL;
2189                                 }
2190                                 else
2191                                 {
2192                                         /* Same ancestor level, has to be the same OID. */
2193                                         Assert(publish_as_relid == pub_relid);
2194                                 }
2195
2196                                 /* Track publications for this ancestor. */
2197                                 rel_publications = lappend(rel_publications, pub);
2198                         }
2199                 }
2200
2201                 entry->publish_as_relid = publish_as_relid;
2202
2203                 /*
2204                  * Initialize the tuple slot, map, and row filter. These are only used
2205                  * when publishing inserts, updates, or deletes.
2206                  */
2207                 if (entry->pubactions.pubinsert || entry->pubactions.pubupdate ||
2208                         entry->pubactions.pubdelete)
2209                 {
2210                         /* Initialize the tuple slot and map */
2211                         init_tuple_slot(data, relation, entry);
2212
2213                         /* Initialize the row filter */
2214                         pgoutput_row_filter_init(data, rel_publications, entry);
2215
2216                         /* Initialize the column list */
2217                         pgoutput_column_list_init(data, rel_publications, entry);
2218                 }
2219
2220                 list_free(pubids);
2221                 list_free(schemaPubids);
2222                 list_free(rel_publications);
2223
2224                 entry->replicate_valid = true;
2225         }
2226
2227         return entry;
2228 }
2229
2230 /*
2231  * Cleanup list of streamed transactions and update the schema_sent flag.
2232  *
2233  * When a streamed transaction commits or aborts, we need to remove the
2234  * toplevel XID from the schema cache. If the transaction aborted, the
2235  * subscriber will simply throw away the schema records we streamed, so
2236  * we don't need to do anything else.
2237  *
2238  * If the transaction is committed, the subscriber will update the relation
2239  * cache - so tweak the schema_sent flag accordingly.
2240  */
2241 static void
2242 cleanup_rel_sync_cache(TransactionId xid, bool is_commit)
2243 {
2244         HASH_SEQ_STATUS hash_seq;
2245         RelationSyncEntry *entry;
2246
2247         Assert(RelationSyncCache != NULL);
2248
2249         hash_seq_init(&hash_seq, RelationSyncCache);
2250         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2251         {
2252                 /*
2253                  * We can set the schema_sent flag for an entry that has committed xid
2254                  * in the list as that ensures that the subscriber would have the
2255                  * corresponding schema and we don't need to send it unless there is
2256                  * any invalidation for that relation.
2257                  */
2258                 foreach_xid(streamed_txn, entry->streamed_txns)
2259                 {
2260                         if (xid == streamed_txn)
2261                         {
2262                                 if (is_commit)
2263                                         entry->schema_sent = true;
2264
2265                                 entry->streamed_txns =
2266                                         foreach_delete_current(entry->streamed_txns, streamed_txn);
2267                                 break;
2268                         }
2269                 }
2270         }
2271 }
2272
2273 /*
2274  * Relcache invalidation callback
2275  */
2276 static void
2277 rel_sync_cache_relation_cb(Datum arg, Oid relid)
2278 {
2279         RelationSyncEntry *entry;
2280
2281         /*
2282          * We can get here if the plugin was used in SQL interface as the
2283          * RelationSyncCache is destroyed when the decoding finishes, but there is
2284          * no way to unregister the relcache invalidation callback.
2285          */
2286         if (RelationSyncCache == NULL)
2287                 return;
2288
2289         /*
2290          * Nobody keeps pointers to entries in this hash table around outside
2291          * logical decoding callback calls - but invalidation events can come in
2292          * *during* a callback if we do any syscache access in the callback.
2293          * Because of that we must mark the cache entry as invalid but not damage
2294          * any of its substructure here.  The next get_rel_sync_entry() call will
2295          * rebuild it all.
2296          */
2297         if (OidIsValid(relid))
2298         {
2299                 /*
2300                  * Getting invalidations for relations that aren't in the table is
2301                  * entirely normal.  So we don't care if it's found or not.
2302                  */
2303                 entry = (RelationSyncEntry *) hash_search(RelationSyncCache, &relid,
2304                                                                                                   HASH_FIND, NULL);
2305                 if (entry != NULL)
2306                         entry->replicate_valid = false;
2307         }
2308         else
2309         {
2310                 /* Whole cache must be flushed. */
2311                 HASH_SEQ_STATUS status;
2312
2313                 hash_seq_init(&status, RelationSyncCache);
2314                 while ((entry = (RelationSyncEntry *) hash_seq_search(&status)) != NULL)
2315                 {
2316                         entry->replicate_valid = false;
2317                 }
2318         }
2319 }
2320
2321 /*
2322  * Publication relation/schema map syscache invalidation callback
2323  *
2324  * Called for invalidations on pg_publication, pg_publication_rel,
2325  * pg_publication_namespace, and pg_namespace.
2326  */
2327 static void
2328 rel_sync_cache_publication_cb(Datum arg, int cacheid, uint32 hashvalue)
2329 {
2330         HASH_SEQ_STATUS status;
2331         RelationSyncEntry *entry;
2332
2333         /*
2334          * We can get here if the plugin was used in SQL interface as the
2335          * RelationSyncCache is destroyed when the decoding finishes, but there is
2336          * no way to unregister the invalidation callbacks.
2337          */
2338         if (RelationSyncCache == NULL)
2339                 return;
2340
2341         /*
2342          * We have no easy way to identify which cache entries this invalidation
2343          * event might have affected, so just mark them all invalid.
2344          */
2345         hash_seq_init(&status, RelationSyncCache);
2346         while ((entry = (RelationSyncEntry *) hash_seq_search(&status)) != NULL)
2347         {
2348                 entry->replicate_valid = false;
2349         }
2350 }
2351
2352 /* Send Replication origin */
2353 static void
2354 send_repl_origin(LogicalDecodingContext *ctx, RepOriginId origin_id,
2355                                  XLogRecPtr origin_lsn, bool send_origin)
2356 {
2357         if (send_origin)
2358         {
2359                 char       *origin;
2360
2361                 /*----------
2362                  * XXX: which behaviour do we want here?
2363                  *
2364                  * Alternatives:
2365                  *  - don't send origin message if origin name not found
2366                  *    (that's what we do now)
2367                  *  - throw error - that will break replication, not good
2368                  *  - send some special "unknown" origin
2369                  *----------
2370                  */
2371                 if (replorigin_by_oid(origin_id, true, &origin))
2372                 {
2373                         /* Message boundary */
2374                         OutputPluginWrite(ctx, false);
2375                         OutputPluginPrepareWrite(ctx, true);
2376
2377                         logicalrep_write_origin(ctx->out, origin, origin_lsn);
2378                 }
2379         }
2380 }