Update copyright for 2022
[pgsql.git] / src / backend / replication / logical / decode.c
blob1d22208c1ad6745b218d04f33793bf64e38e28c9
1 /* -------------------------------------------------------------------------
3 * decode.c
4 * This module decodes WAL records read using xlogreader.h's APIs for the
5 * purpose of logical decoding by passing information to the
6 * reorderbuffer module (containing the actual changes) and to the
7 * snapbuild module to build a fitting catalog snapshot (to be able to
8 * properly decode the changes in the reorderbuffer).
10 * NOTE:
11 * This basically tries to handle all low level xlog stuff for
12 * reorderbuffer.c and snapbuild.c. There's some minor leakage where a
13 * specific record's struct is used to pass data along, but those just
14 * happen to contain the right amount of data in a convenient
15 * format. There isn't and shouldn't be much intelligence about the
16 * contents of records in here except turning them into a more usable
17 * format.
19 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
20 * Portions Copyright (c) 1994, Regents of the University of California
22 * IDENTIFICATION
23 * src/backend/replication/logical/decode.c
25 * -------------------------------------------------------------------------
27 #include "postgres.h"
29 #include "access/heapam.h"
30 #include "access/heapam_xlog.h"
31 #include "access/transam.h"
32 #include "access/xact.h"
33 #include "access/xlog_internal.h"
34 #include "access/xlogreader.h"
35 #include "access/xlogrecord.h"
36 #include "access/xlogutils.h"
37 #include "catalog/pg_control.h"
38 #include "replication/decode.h"
39 #include "replication/logical.h"
40 #include "replication/message.h"
41 #include "replication/origin.h"
42 #include "replication/reorderbuffer.h"
43 #include "replication/snapbuild.h"
44 #include "storage/standby.h"
46 typedef struct XLogRecordBuffer
48 XLogRecPtr origptr;
49 XLogRecPtr endptr;
50 XLogReaderState *record;
51 } XLogRecordBuffer;
53 /* RMGR Handlers */
54 static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
55 static void DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
56 static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
57 static void DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
58 static void DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
59 static void DecodeLogicalMsgOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
61 /* individual record(group)'s handlers */
62 static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
63 static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
64 static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
65 static void DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
66 static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
67 static void DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
69 static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
70 xl_xact_parsed_commit *parsed, TransactionId xid,
71 bool two_phase);
72 static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
73 xl_xact_parsed_abort *parsed, TransactionId xid,
74 bool two_phase);
75 static void DecodePrepare(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
76 xl_xact_parsed_prepare *parsed);
79 /* common function to decode tuples */
80 static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tup);
82 /* helper functions for decoding transactions */
83 static inline bool FilterPrepare(LogicalDecodingContext *ctx,
84 TransactionId xid, const char *gid);
85 static bool DecodeTXNNeedSkip(LogicalDecodingContext *ctx,
86 XLogRecordBuffer *buf, Oid dbId,
87 RepOriginId origin_id);
90 * Take every XLogReadRecord()ed record and perform the actions required to
91 * decode it using the output plugin already setup in the logical decoding
92 * context.
94 * NB: Note that every record's xid needs to be processed by reorderbuffer
95 * (xids contained in the content of records are not relevant for this rule).
96 * That means that for records which'd otherwise not go through the
97 * reorderbuffer ReorderBufferProcessXid() has to be called. We don't want to
98 * call ReorderBufferProcessXid for each record type by default, because
99 * e.g. empty xacts can be handled more efficiently if there's no previous
100 * state for them.
102 * We also support the ability to fast forward thru records, skipping some
103 * record types completely - see individual record types for details.
105 void
106 LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *record)
108 XLogRecordBuffer buf;
109 TransactionId txid;
111 buf.origptr = ctx->reader->ReadRecPtr;
112 buf.endptr = ctx->reader->EndRecPtr;
113 buf.record = record;
115 txid = XLogRecGetTopXid(record);
118 * If the top-level xid is valid, we need to assign the subxact to the
119 * top-level xact. We need to do this for all records, hence we do it
120 * before the switch.
122 if (TransactionIdIsValid(txid))
124 ReorderBufferAssignChild(ctx->reorder,
125 txid,
126 record->decoded_record->xl_xid,
127 buf.origptr);
130 /* cast so we get a warning when new rmgrs are added */
131 switch ((RmgrId) XLogRecGetRmid(record))
134 * Rmgrs we care about for logical decoding. Add new rmgrs in
135 * rmgrlist.h's order.
137 case RM_XLOG_ID:
138 DecodeXLogOp(ctx, &buf);
139 break;
141 case RM_XACT_ID:
142 DecodeXactOp(ctx, &buf);
143 break;
145 case RM_STANDBY_ID:
146 DecodeStandbyOp(ctx, &buf);
147 break;
149 case RM_HEAP2_ID:
150 DecodeHeap2Op(ctx, &buf);
151 break;
153 case RM_HEAP_ID:
154 DecodeHeapOp(ctx, &buf);
155 break;
157 case RM_LOGICALMSG_ID:
158 DecodeLogicalMsgOp(ctx, &buf);
159 break;
162 * Rmgrs irrelevant for logical decoding; they describe stuff not
163 * represented in logical decoding. Add new rmgrs in rmgrlist.h's
164 * order.
166 case RM_SMGR_ID:
167 case RM_CLOG_ID:
168 case RM_DBASE_ID:
169 case RM_TBLSPC_ID:
170 case RM_MULTIXACT_ID:
171 case RM_RELMAP_ID:
172 case RM_BTREE_ID:
173 case RM_HASH_ID:
174 case RM_GIN_ID:
175 case RM_GIST_ID:
176 case RM_SEQ_ID:
177 case RM_SPGIST_ID:
178 case RM_BRIN_ID:
179 case RM_COMMIT_TS_ID:
180 case RM_REPLORIGIN_ID:
181 case RM_GENERIC_ID:
182 /* just deal with xid, and done */
183 ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record),
184 buf.origptr);
185 break;
186 case RM_NEXT_ID:
187 elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) XLogRecGetRmid(buf.record));
192 * Handle rmgr XLOG_ID records for DecodeRecordIntoReorderBuffer().
194 static void
195 DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
197 SnapBuild *builder = ctx->snapshot_builder;
198 uint8 info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK;
200 ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record),
201 buf->origptr);
203 switch (info)
205 /* this is also used in END_OF_RECOVERY checkpoints */
206 case XLOG_CHECKPOINT_SHUTDOWN:
207 case XLOG_END_OF_RECOVERY:
208 SnapBuildSerializationPoint(builder, buf->origptr);
210 break;
211 case XLOG_CHECKPOINT_ONLINE:
214 * a RUNNING_XACTS record will have been logged near to this, we
215 * can restart from there.
217 break;
218 case XLOG_NOOP:
219 case XLOG_NEXTOID:
220 case XLOG_SWITCH:
221 case XLOG_BACKUP_END:
222 case XLOG_PARAMETER_CHANGE:
223 case XLOG_RESTORE_POINT:
224 case XLOG_FPW_CHANGE:
225 case XLOG_FPI_FOR_HINT:
226 case XLOG_FPI:
227 case XLOG_OVERWRITE_CONTRECORD:
228 break;
229 default:
230 elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
235 * Handle rmgr XACT_ID records for DecodeRecordIntoReorderBuffer().
237 static void
238 DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
240 SnapBuild *builder = ctx->snapshot_builder;
241 ReorderBuffer *reorder = ctx->reorder;
242 XLogReaderState *r = buf->record;
243 uint8 info = XLogRecGetInfo(r) & XLOG_XACT_OPMASK;
246 * If the snapshot isn't yet fully built, we cannot decode anything, so
247 * bail out.
249 if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
250 return;
252 switch (info)
254 case XLOG_XACT_COMMIT:
255 case XLOG_XACT_COMMIT_PREPARED:
257 xl_xact_commit *xlrec;
258 xl_xact_parsed_commit parsed;
259 TransactionId xid;
260 bool two_phase = false;
262 xlrec = (xl_xact_commit *) XLogRecGetData(r);
263 ParseCommitRecord(XLogRecGetInfo(buf->record), xlrec, &parsed);
265 if (!TransactionIdIsValid(parsed.twophase_xid))
266 xid = XLogRecGetXid(r);
267 else
268 xid = parsed.twophase_xid;
271 * We would like to process the transaction in a two-phase
272 * manner iff output plugin supports two-phase commits and
273 * doesn't filter the transaction at prepare time.
275 if (info == XLOG_XACT_COMMIT_PREPARED)
276 two_phase = !(FilterPrepare(ctx, xid,
277 parsed.twophase_gid));
279 DecodeCommit(ctx, buf, &parsed, xid, two_phase);
280 break;
282 case XLOG_XACT_ABORT:
283 case XLOG_XACT_ABORT_PREPARED:
285 xl_xact_abort *xlrec;
286 xl_xact_parsed_abort parsed;
287 TransactionId xid;
288 bool two_phase = false;
290 xlrec = (xl_xact_abort *) XLogRecGetData(r);
291 ParseAbortRecord(XLogRecGetInfo(buf->record), xlrec, &parsed);
293 if (!TransactionIdIsValid(parsed.twophase_xid))
294 xid = XLogRecGetXid(r);
295 else
296 xid = parsed.twophase_xid;
299 * We would like to process the transaction in a two-phase
300 * manner iff output plugin supports two-phase commits and
301 * doesn't filter the transaction at prepare time.
303 if (info == XLOG_XACT_ABORT_PREPARED)
304 two_phase = !(FilterPrepare(ctx, xid,
305 parsed.twophase_gid));
307 DecodeAbort(ctx, buf, &parsed, xid, two_phase);
308 break;
310 case XLOG_XACT_ASSIGNMENT:
313 * We assign subxact to the toplevel xact while processing each
314 * record if required. So, we don't need to do anything here. See
315 * LogicalDecodingProcessRecord.
317 break;
318 case XLOG_XACT_INVALIDATIONS:
320 TransactionId xid;
321 xl_xact_invals *invals;
323 xid = XLogRecGetXid(r);
324 invals = (xl_xact_invals *) XLogRecGetData(r);
327 * Execute the invalidations for xid-less transactions,
328 * otherwise, accumulate them so that they can be processed at
329 * the commit time.
331 if (TransactionIdIsValid(xid))
333 if (!ctx->fast_forward)
334 ReorderBufferAddInvalidations(reorder, xid,
335 buf->origptr,
336 invals->nmsgs,
337 invals->msgs);
338 ReorderBufferXidSetCatalogChanges(ctx->reorder, xid,
339 buf->origptr);
341 else if ((!ctx->fast_forward))
342 ReorderBufferImmediateInvalidation(ctx->reorder,
343 invals->nmsgs,
344 invals->msgs);
346 break;
347 case XLOG_XACT_PREPARE:
349 xl_xact_parsed_prepare parsed;
350 xl_xact_prepare *xlrec;
352 /* ok, parse it */
353 xlrec = (xl_xact_prepare *) XLogRecGetData(r);
354 ParsePrepareRecord(XLogRecGetInfo(buf->record),
355 xlrec, &parsed);
358 * We would like to process the transaction in a two-phase
359 * manner iff output plugin supports two-phase commits and
360 * doesn't filter the transaction at prepare time.
362 if (FilterPrepare(ctx, parsed.twophase_xid,
363 parsed.twophase_gid))
365 ReorderBufferProcessXid(reorder, parsed.twophase_xid,
366 buf->origptr);
367 break;
371 * Note that if the prepared transaction has locked [user]
372 * catalog tables exclusively then decoding prepare can block
373 * till the main transaction is committed because it needs to
374 * lock the catalog tables.
376 * XXX Now, this can even lead to a deadlock if the prepare
377 * transaction is waiting to get it logically replicated for
378 * distributed 2PC. This can be avoided by disallowing
379 * preparing transactions that have locked [user] catalog
380 * tables exclusively but as of now, we ask users not to do
381 * such an operation.
383 DecodePrepare(ctx, buf, &parsed);
384 break;
386 default:
387 elog(ERROR, "unexpected RM_XACT_ID record type: %u", info);
392 * Handle rmgr STANDBY_ID records for DecodeRecordIntoReorderBuffer().
394 static void
395 DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
397 SnapBuild *builder = ctx->snapshot_builder;
398 XLogReaderState *r = buf->record;
399 uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK;
401 ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr);
403 switch (info)
405 case XLOG_RUNNING_XACTS:
407 xl_running_xacts *running = (xl_running_xacts *) XLogRecGetData(r);
409 SnapBuildProcessRunningXacts(builder, buf->origptr, running);
412 * Abort all transactions that we keep track of, that are
413 * older than the record's oldestRunningXid. This is the most
414 * convenient spot for doing so since, in contrast to shutdown
415 * or end-of-recovery checkpoints, we have information about
416 * all running transactions which includes prepared ones,
417 * while shutdown checkpoints just know that no non-prepared
418 * transactions are in progress.
420 ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid);
422 break;
423 case XLOG_STANDBY_LOCK:
424 break;
425 case XLOG_INVALIDATIONS:
428 * We are processing the invalidations at the command level via
429 * XLOG_XACT_INVALIDATIONS. So we don't need to do anything here.
431 break;
432 default:
433 elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info);
438 * Handle rmgr HEAP2_ID records for DecodeRecordIntoReorderBuffer().
440 static void
441 DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
443 uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK;
444 TransactionId xid = XLogRecGetXid(buf->record);
445 SnapBuild *builder = ctx->snapshot_builder;
447 ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr);
450 * If we don't have snapshot or we are just fast-forwarding, there is no
451 * point in decoding changes.
453 if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT ||
454 ctx->fast_forward)
455 return;
457 switch (info)
459 case XLOG_HEAP2_MULTI_INSERT:
460 if (!ctx->fast_forward &&
461 SnapBuildProcessChange(builder, xid, buf->origptr))
462 DecodeMultiInsert(ctx, buf);
463 break;
464 case XLOG_HEAP2_NEW_CID:
466 xl_heap_new_cid *xlrec;
468 xlrec = (xl_heap_new_cid *) XLogRecGetData(buf->record);
469 SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec);
471 break;
473 case XLOG_HEAP2_REWRITE:
476 * Although these records only exist to serve the needs of logical
477 * decoding, all the work happens as part of crash or archive
478 * recovery, so we don't need to do anything here.
480 break;
483 * Everything else here is just low level physical stuff we're not
484 * interested in.
486 case XLOG_HEAP2_FREEZE_PAGE:
487 case XLOG_HEAP2_PRUNE:
488 case XLOG_HEAP2_VACUUM:
489 case XLOG_HEAP2_VISIBLE:
490 case XLOG_HEAP2_LOCK_UPDATED:
491 break;
492 default:
493 elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info);
498 * Handle rmgr HEAP_ID records for DecodeRecordIntoReorderBuffer().
500 static void
501 DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
503 uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK;
504 TransactionId xid = XLogRecGetXid(buf->record);
505 SnapBuild *builder = ctx->snapshot_builder;
507 ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr);
510 * If we don't have snapshot or we are just fast-forwarding, there is no
511 * point in decoding data changes.
513 if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT ||
514 ctx->fast_forward)
515 return;
517 switch (info)
519 case XLOG_HEAP_INSERT:
520 if (SnapBuildProcessChange(builder, xid, buf->origptr))
521 DecodeInsert(ctx, buf);
522 break;
525 * Treat HOT update as normal updates. There is no useful
526 * information in the fact that we could make it a HOT update
527 * locally and the WAL layout is compatible.
529 case XLOG_HEAP_HOT_UPDATE:
530 case XLOG_HEAP_UPDATE:
531 if (SnapBuildProcessChange(builder, xid, buf->origptr))
532 DecodeUpdate(ctx, buf);
533 break;
535 case XLOG_HEAP_DELETE:
536 if (SnapBuildProcessChange(builder, xid, buf->origptr))
537 DecodeDelete(ctx, buf);
538 break;
540 case XLOG_HEAP_TRUNCATE:
541 if (SnapBuildProcessChange(builder, xid, buf->origptr))
542 DecodeTruncate(ctx, buf);
543 break;
545 case XLOG_HEAP_INPLACE:
548 * Inplace updates are only ever performed on catalog tuples and
549 * can, per definition, not change tuple visibility. Since we
550 * don't decode catalog tuples, we're not interested in the
551 * record's contents.
553 * In-place updates can be used either by XID-bearing transactions
554 * (e.g. in CREATE INDEX CONCURRENTLY) or by XID-less
555 * transactions (e.g. VACUUM). In the former case, the commit
556 * record will include cache invalidations, so we mark the
557 * transaction as catalog modifying here. Currently that's
558 * redundant because the commit will do that as well, but once we
559 * support decoding in-progress relations, this will be important.
561 if (!TransactionIdIsValid(xid))
562 break;
564 SnapBuildProcessChange(builder, xid, buf->origptr);
565 ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr);
566 break;
568 case XLOG_HEAP_CONFIRM:
569 if (SnapBuildProcessChange(builder, xid, buf->origptr))
570 DecodeSpecConfirm(ctx, buf);
571 break;
573 case XLOG_HEAP_LOCK:
574 /* we don't care about row level locks for now */
575 break;
577 default:
578 elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info);
579 break;
584 * Ask output plugin whether we want to skip this PREPARE and send
585 * this transaction as a regular commit later.
587 static inline bool
588 FilterPrepare(LogicalDecodingContext *ctx, TransactionId xid,
589 const char *gid)
592 * Skip if decoding of two-phase transactions at PREPARE time is not
593 * enabled. In that case, all two-phase transactions are considered
594 * filtered out and will be applied as regular transactions at COMMIT
595 * PREPARED.
597 if (!ctx->twophase)
598 return true;
601 * The filter_prepare callback is optional. When not supplied, all
602 * prepared transactions should go through.
604 if (ctx->callbacks.filter_prepare_cb == NULL)
605 return false;
607 return filter_prepare_cb_wrapper(ctx, xid, gid);
610 static inline bool
611 FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id)
613 if (ctx->callbacks.filter_by_origin_cb == NULL)
614 return false;
616 return filter_by_origin_cb_wrapper(ctx, origin_id);
620 * Handle rmgr LOGICALMSG_ID records for DecodeRecordIntoReorderBuffer().
622 static void
623 DecodeLogicalMsgOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
625 SnapBuild *builder = ctx->snapshot_builder;
626 XLogReaderState *r = buf->record;
627 TransactionId xid = XLogRecGetXid(r);
628 uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK;
629 RepOriginId origin_id = XLogRecGetOrigin(r);
630 Snapshot snapshot;
631 xl_logical_message *message;
633 if (info != XLOG_LOGICAL_MESSAGE)
634 elog(ERROR, "unexpected RM_LOGICALMSG_ID record type: %u", info);
636 ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr);
639 * If we don't have snapshot or we are just fast-forwarding, there is no
640 * point in decoding messages.
642 if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT ||
643 ctx->fast_forward)
644 return;
646 message = (xl_logical_message *) XLogRecGetData(r);
648 if (message->dbId != ctx->slot->data.database ||
649 FilterByOrigin(ctx, origin_id))
650 return;
652 if (message->transactional &&
653 !SnapBuildProcessChange(builder, xid, buf->origptr))
654 return;
655 else if (!message->transactional &&
656 (SnapBuildCurrentState(builder) != SNAPBUILD_CONSISTENT ||
657 SnapBuildXactNeedsSkip(builder, buf->origptr)))
658 return;
660 snapshot = SnapBuildGetOrBuildSnapshot(builder, xid);
661 ReorderBufferQueueMessage(ctx->reorder, xid, snapshot, buf->endptr,
662 message->transactional,
663 message->message, /* first part of message is
664 * prefix */
665 message->message_size,
666 message->message + message->prefix_size);
670 * Consolidated commit record handling between the different form of commit
671 * records.
673 * 'two_phase' indicates that caller wants to process the transaction in two
674 * phases, first process prepare if not already done and then process
675 * commit_prepared.
677 static void
678 DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
679 xl_xact_parsed_commit *parsed, TransactionId xid,
680 bool two_phase)
682 XLogRecPtr origin_lsn = InvalidXLogRecPtr;
683 TimestampTz commit_time = parsed->xact_time;
684 RepOriginId origin_id = XLogRecGetOrigin(buf->record);
685 int i;
687 if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN)
689 origin_lsn = parsed->origin_lsn;
690 commit_time = parsed->origin_timestamp;
693 SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid,
694 parsed->nsubxacts, parsed->subxacts);
696 /* ----
697 * Check whether we are interested in this specific transaction, and tell
698 * the reorderbuffer to forget the content of the (sub-)transactions
699 * if not.
701 * We can't just use ReorderBufferAbort() here, because we need to execute
702 * the transaction's invalidations. This currently won't be needed if
703 * we're just skipping over the transaction because currently we only do
704 * so during startup, to get to the first transaction the client needs. As
705 * we have reset the catalog caches before starting to read WAL, and we
706 * haven't yet touched any catalogs, there can't be anything to invalidate.
707 * But if we're "forgetting" this commit because it happened in another
708 * database, the invalidations might be important, because they could be
709 * for shared catalogs and we might have loaded data into the relevant
710 * syscaches.
711 * ---
713 if (DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id))
715 for (i = 0; i < parsed->nsubxacts; i++)
717 ReorderBufferForget(ctx->reorder, parsed->subxacts[i], buf->origptr);
719 ReorderBufferForget(ctx->reorder, xid, buf->origptr);
721 return;
724 /* tell the reorderbuffer about the surviving subtransactions */
725 for (i = 0; i < parsed->nsubxacts; i++)
727 ReorderBufferCommitChild(ctx->reorder, xid, parsed->subxacts[i],
728 buf->origptr, buf->endptr);
732 * Send the final commit record if the transaction data is already
733 * decoded, otherwise, process the entire transaction.
735 if (two_phase)
737 ReorderBufferFinishPrepared(ctx->reorder, xid, buf->origptr, buf->endptr,
738 SnapBuildGetTwoPhaseAt(ctx->snapshot_builder),
739 commit_time, origin_id, origin_lsn,
740 parsed->twophase_gid, true);
742 else
744 ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr,
745 commit_time, origin_id, origin_lsn);
749 * Update the decoding stats at transaction prepare/commit/abort.
750 * Additionally we send the stats when we spill or stream the changes to
751 * avoid losing them in case the decoding is interrupted. It is not clear
752 * that sending more or less frequently than this would be better.
754 UpdateDecodingStats(ctx);
758 * Decode PREPARE record. Similar logic as in DecodeCommit.
760 * Note that we don't skip prepare even if have detected concurrent abort
761 * because it is quite possible that we had already sent some changes before we
762 * detect abort in which case we need to abort those changes in the subscriber.
763 * To abort such changes, we do send the prepare and then the rollback prepared
764 * which is what happened on the publisher-side as well. Now, we can invent a
765 * new abort API wherein in such cases we send abort and skip sending prepared
766 * and rollback prepared but then it is not that straightforward because we
767 * might have streamed this transaction by that time in which case it is
768 * handled when the rollback is encountered. It is not impossible to optimize
769 * the concurrent abort case but it can introduce design complexity w.r.t
770 * handling different cases so leaving it for now as it doesn't seem worth it.
772 static void
773 DecodePrepare(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
774 xl_xact_parsed_prepare *parsed)
776 SnapBuild *builder = ctx->snapshot_builder;
777 XLogRecPtr origin_lsn = parsed->origin_lsn;
778 TimestampTz prepare_time = parsed->xact_time;
779 XLogRecPtr origin_id = XLogRecGetOrigin(buf->record);
780 int i;
781 TransactionId xid = parsed->twophase_xid;
783 if (parsed->origin_timestamp != 0)
784 prepare_time = parsed->origin_timestamp;
787 * Remember the prepare info for a txn so that it can be used later in
788 * commit prepared if required. See ReorderBufferFinishPrepared.
790 if (!ReorderBufferRememberPrepareInfo(ctx->reorder, xid, buf->origptr,
791 buf->endptr, prepare_time, origin_id,
792 origin_lsn))
793 return;
795 /* We can't start streaming unless a consistent state is reached. */
796 if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
798 ReorderBufferSkipPrepare(ctx->reorder, xid);
799 return;
803 * Check whether we need to process this transaction. See
804 * DecodeTXNNeedSkip for the reasons why we sometimes want to skip the
805 * transaction.
807 * We can't call ReorderBufferForget as we did in DecodeCommit as the txn
808 * hasn't yet been committed, removing this txn before a commit might
809 * result in the computation of an incorrect restart_lsn. See
810 * SnapBuildProcessRunningXacts. But we need to process cache
811 * invalidations if there are any for the reasons mentioned in
812 * DecodeCommit.
814 if (DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id))
816 ReorderBufferSkipPrepare(ctx->reorder, xid);
817 ReorderBufferInvalidate(ctx->reorder, xid, buf->origptr);
818 return;
821 /* Tell the reorderbuffer about the surviving subtransactions. */
822 for (i = 0; i < parsed->nsubxacts; i++)
824 ReorderBufferCommitChild(ctx->reorder, xid, parsed->subxacts[i],
825 buf->origptr, buf->endptr);
828 /* replay actions of all transaction + subtransactions in order */
829 ReorderBufferPrepare(ctx->reorder, xid, parsed->twophase_gid);
832 * Update the decoding stats at transaction prepare/commit/abort.
833 * Additionally we send the stats when we spill or stream the changes to
834 * avoid losing them in case the decoding is interrupted. It is not clear
835 * that sending more or less frequently than this would be better.
837 UpdateDecodingStats(ctx);
842 * Get the data from the various forms of abort records and pass it on to
843 * snapbuild.c and reorderbuffer.c.
845 * 'two_phase' indicates to finish prepared transaction.
847 static void
848 DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
849 xl_xact_parsed_abort *parsed, TransactionId xid,
850 bool two_phase)
852 int i;
853 XLogRecPtr origin_lsn = InvalidXLogRecPtr;
854 TimestampTz abort_time = parsed->xact_time;
855 XLogRecPtr origin_id = XLogRecGetOrigin(buf->record);
856 bool skip_xact;
858 if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN)
860 origin_lsn = parsed->origin_lsn;
861 abort_time = parsed->origin_timestamp;
865 * Check whether we need to process this transaction. See
866 * DecodeTXNNeedSkip for the reasons why we sometimes want to skip the
867 * transaction.
869 skip_xact = DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id);
872 * Send the final rollback record for a prepared transaction unless we
873 * need to skip it. For non-two-phase xacts, simply forget the xact.
875 if (two_phase && !skip_xact)
877 ReorderBufferFinishPrepared(ctx->reorder, xid, buf->origptr, buf->endptr,
878 InvalidXLogRecPtr,
879 abort_time, origin_id, origin_lsn,
880 parsed->twophase_gid, false);
882 else
884 for (i = 0; i < parsed->nsubxacts; i++)
886 ReorderBufferAbort(ctx->reorder, parsed->subxacts[i],
887 buf->record->EndRecPtr);
890 ReorderBufferAbort(ctx->reorder, xid, buf->record->EndRecPtr);
893 /* update the decoding stats */
894 UpdateDecodingStats(ctx);
898 * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
900 * Deletes can contain the new tuple.
902 static void
903 DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
905 Size datalen;
906 char *tupledata;
907 Size tuplelen;
908 XLogReaderState *r = buf->record;
909 xl_heap_insert *xlrec;
910 ReorderBufferChange *change;
911 RelFileNode target_node;
913 xlrec = (xl_heap_insert *) XLogRecGetData(r);
916 * Ignore insert records without new tuples (this does happen when
917 * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL).
919 if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
920 return;
922 /* only interested in our database */
923 XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
924 if (target_node.dbNode != ctx->slot->data.database)
925 return;
927 /* output plugin doesn't look for this origin, no need to queue */
928 if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
929 return;
931 change = ReorderBufferGetChange(ctx->reorder);
932 if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE))
933 change->action = REORDER_BUFFER_CHANGE_INSERT;
934 else
935 change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT;
936 change->origin_id = XLogRecGetOrigin(r);
938 memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));
940 tupledata = XLogRecGetBlockData(r, 0, &datalen);
941 tuplelen = datalen - SizeOfHeapHeader;
943 change->data.tp.newtuple =
944 ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
946 DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple);
948 change->data.tp.clear_toast_afterwards = true;
950 ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
951 change,
952 xlrec->flags & XLH_INSERT_ON_TOAST_RELATION);
956 * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
957 * in the record, from wal into proper tuplebufs.
959 * Updates can possibly contain a new tuple and the old primary key.
961 static void
962 DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
964 XLogReaderState *r = buf->record;
965 xl_heap_update *xlrec;
966 ReorderBufferChange *change;
967 char *data;
968 RelFileNode target_node;
970 xlrec = (xl_heap_update *) XLogRecGetData(r);
972 /* only interested in our database */
973 XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
974 if (target_node.dbNode != ctx->slot->data.database)
975 return;
977 /* output plugin doesn't look for this origin, no need to queue */
978 if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
979 return;
981 change = ReorderBufferGetChange(ctx->reorder);
982 change->action = REORDER_BUFFER_CHANGE_UPDATE;
983 change->origin_id = XLogRecGetOrigin(r);
984 memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));
986 if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE)
988 Size datalen;
989 Size tuplelen;
991 data = XLogRecGetBlockData(r, 0, &datalen);
993 tuplelen = datalen - SizeOfHeapHeader;
995 change->data.tp.newtuple =
996 ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
998 DecodeXLogTuple(data, datalen, change->data.tp.newtuple);
1001 if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD)
1003 Size datalen;
1004 Size tuplelen;
1006 /* caution, remaining data in record is not aligned */
1007 data = XLogRecGetData(r) + SizeOfHeapUpdate;
1008 datalen = XLogRecGetDataLen(r) - SizeOfHeapUpdate;
1009 tuplelen = datalen - SizeOfHeapHeader;
1011 change->data.tp.oldtuple =
1012 ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
1014 DecodeXLogTuple(data, datalen, change->data.tp.oldtuple);
1017 change->data.tp.clear_toast_afterwards = true;
1019 ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
1020 change, false);
1024 * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs.
1026 * Deletes can possibly contain the old primary key.
1028 static void
1029 DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
1031 XLogReaderState *r = buf->record;
1032 xl_heap_delete *xlrec;
1033 ReorderBufferChange *change;
1034 RelFileNode target_node;
1036 xlrec = (xl_heap_delete *) XLogRecGetData(r);
1038 /* only interested in our database */
1039 XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
1040 if (target_node.dbNode != ctx->slot->data.database)
1041 return;
1043 /* output plugin doesn't look for this origin, no need to queue */
1044 if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
1045 return;
1047 change = ReorderBufferGetChange(ctx->reorder);
1049 if (xlrec->flags & XLH_DELETE_IS_SUPER)
1050 change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT;
1051 else
1052 change->action = REORDER_BUFFER_CHANGE_DELETE;
1054 change->origin_id = XLogRecGetOrigin(r);
1056 memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));
1058 /* old primary key stored */
1059 if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
1061 Size datalen = XLogRecGetDataLen(r) - SizeOfHeapDelete;
1062 Size tuplelen = datalen - SizeOfHeapHeader;
1064 Assert(XLogRecGetDataLen(r) > (SizeOfHeapDelete + SizeOfHeapHeader));
1066 change->data.tp.oldtuple =
1067 ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
1069 DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete,
1070 datalen, change->data.tp.oldtuple);
1073 change->data.tp.clear_toast_afterwards = true;
1075 ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
1076 change, false);
1080 * Parse XLOG_HEAP_TRUNCATE from wal
1082 static void
1083 DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
1085 XLogReaderState *r = buf->record;
1086 xl_heap_truncate *xlrec;
1087 ReorderBufferChange *change;
1089 xlrec = (xl_heap_truncate *) XLogRecGetData(r);
1091 /* only interested in our database */
1092 if (xlrec->dbId != ctx->slot->data.database)
1093 return;
1095 /* output plugin doesn't look for this origin, no need to queue */
1096 if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
1097 return;
1099 change = ReorderBufferGetChange(ctx->reorder);
1100 change->action = REORDER_BUFFER_CHANGE_TRUNCATE;
1101 change->origin_id = XLogRecGetOrigin(r);
1102 if (xlrec->flags & XLH_TRUNCATE_CASCADE)
1103 change->data.truncate.cascade = true;
1104 if (xlrec->flags & XLH_TRUNCATE_RESTART_SEQS)
1105 change->data.truncate.restart_seqs = true;
1106 change->data.truncate.nrelids = xlrec->nrelids;
1107 change->data.truncate.relids = ReorderBufferGetRelids(ctx->reorder,
1108 xlrec->nrelids);
1109 memcpy(change->data.truncate.relids, xlrec->relids,
1110 xlrec->nrelids * sizeof(Oid));
1111 ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
1112 buf->origptr, change, false);
1116 * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs.
1118 * Currently MULTI_INSERT will always contain the full tuples.
1120 static void
1121 DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
1123 XLogReaderState *r = buf->record;
1124 xl_heap_multi_insert *xlrec;
1125 int i;
1126 char *data;
1127 char *tupledata;
1128 Size tuplelen;
1129 RelFileNode rnode;
1131 xlrec = (xl_heap_multi_insert *) XLogRecGetData(r);
1134 * Ignore insert records without new tuples. This happens when a
1135 * multi_insert is done on a catalog or on a non-persistent relation.
1137 if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
1138 return;
1140 /* only interested in our database */
1141 XLogRecGetBlockTag(r, 0, &rnode, NULL, NULL);
1142 if (rnode.dbNode != ctx->slot->data.database)
1143 return;
1145 /* output plugin doesn't look for this origin, no need to queue */
1146 if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
1147 return;
1150 * We know that this multi_insert isn't for a catalog, so the block should
1151 * always have data even if a full-page write of it is taken.
1153 tupledata = XLogRecGetBlockData(r, 0, &tuplelen);
1154 Assert(tupledata != NULL);
1156 data = tupledata;
1157 for (i = 0; i < xlrec->ntuples; i++)
1159 ReorderBufferChange *change;
1160 xl_multi_insert_tuple *xlhdr;
1161 int datalen;
1162 ReorderBufferTupleBuf *tuple;
1163 HeapTupleHeader header;
1165 change = ReorderBufferGetChange(ctx->reorder);
1166 change->action = REORDER_BUFFER_CHANGE_INSERT;
1167 change->origin_id = XLogRecGetOrigin(r);
1169 memcpy(&change->data.tp.relnode, &rnode, sizeof(RelFileNode));
1171 xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data);
1172 data = ((char *) xlhdr) + SizeOfMultiInsertTuple;
1173 datalen = xlhdr->datalen;
1175 change->data.tp.newtuple =
1176 ReorderBufferGetTupleBuf(ctx->reorder, datalen);
1178 tuple = change->data.tp.newtuple;
1179 header = tuple->tuple.t_data;
1181 /* not a disk based tuple */
1182 ItemPointerSetInvalid(&tuple->tuple.t_self);
1185 * We can only figure this out after reassembling the transactions.
1187 tuple->tuple.t_tableOid = InvalidOid;
1189 tuple->tuple.t_len = datalen + SizeofHeapTupleHeader;
1191 memset(header, 0, SizeofHeapTupleHeader);
1193 memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader,
1194 (char *) data,
1195 datalen);
1196 header->t_infomask = xlhdr->t_infomask;
1197 header->t_infomask2 = xlhdr->t_infomask2;
1198 header->t_hoff = xlhdr->t_hoff;
1201 * Reset toast reassembly state only after the last row in the last
1202 * xl_multi_insert_tuple record emitted by one heap_multi_insert()
1203 * call.
1205 if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
1206 (i + 1) == xlrec->ntuples)
1207 change->data.tp.clear_toast_afterwards = true;
1208 else
1209 change->data.tp.clear_toast_afterwards = false;
1211 ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
1212 buf->origptr, change, false);
1214 /* move to the next xl_multi_insert_tuple entry */
1215 data += datalen;
1217 Assert(data == tupledata + tuplelen);
1221 * Parse XLOG_HEAP_CONFIRM from wal into a confirmation change.
1223 * This is pretty trivial, all the state essentially already setup by the
1224 * speculative insertion.
1226 static void
1227 DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
1229 XLogReaderState *r = buf->record;
1230 ReorderBufferChange *change;
1231 RelFileNode target_node;
1233 /* only interested in our database */
1234 XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
1235 if (target_node.dbNode != ctx->slot->data.database)
1236 return;
1238 /* output plugin doesn't look for this origin, no need to queue */
1239 if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
1240 return;
1242 change = ReorderBufferGetChange(ctx->reorder);
1243 change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM;
1244 change->origin_id = XLogRecGetOrigin(r);
1246 memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));
1248 change->data.tp.clear_toast_afterwards = true;
1250 ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
1251 change, false);
1256 * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
1257 * (but not by heap_multi_insert) into a tuplebuf.
1259 * The size 'len' and the pointer 'data' in the record need to be
1260 * computed outside as they are record specific.
1262 static void
1263 DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
1265 xl_heap_header xlhdr;
1266 int datalen = len - SizeOfHeapHeader;
1267 HeapTupleHeader header;
1269 Assert(datalen >= 0);
1271 tuple->tuple.t_len = datalen + SizeofHeapTupleHeader;
1272 header = tuple->tuple.t_data;
1274 /* not a disk based tuple */
1275 ItemPointerSetInvalid(&tuple->tuple.t_self);
1277 /* we can only figure this out after reassembling the transactions */
1278 tuple->tuple.t_tableOid = InvalidOid;
1280 /* data is not stored aligned, copy to aligned storage */
1281 memcpy((char *) &xlhdr,
1282 data,
1283 SizeOfHeapHeader);
1285 memset(header, 0, SizeofHeapTupleHeader);
1287 memcpy(((char *) tuple->tuple.t_data) + SizeofHeapTupleHeader,
1288 data + SizeOfHeapHeader,
1289 datalen);
1291 header->t_infomask = xlhdr.t_infomask;
1292 header->t_infomask2 = xlhdr.t_infomask2;
1293 header->t_hoff = xlhdr.t_hoff;
1297 * Check whether we are interested in this specific transaction.
1299 * There can be several reasons we might not be interested in this
1300 * transaction:
1301 * 1) We might not be interested in decoding transactions up to this
1302 * LSN. This can happen because we previously decoded it and now just
1303 * are restarting or if we haven't assembled a consistent snapshot yet.
1304 * 2) The transaction happened in another database.
1305 * 3) The output plugin is not interested in the origin.
1306 * 4) We are doing fast-forwarding
1308 static bool
1309 DecodeTXNNeedSkip(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
1310 Oid txn_dbid, RepOriginId origin_id)
1312 return (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr) ||
1313 (txn_dbid != InvalidOid && txn_dbid != ctx->slot->data.database) ||
1314 ctx->fast_forward || FilterByOrigin(ctx, origin_id));