PostgreSQL Source Code git master
Loading...
Searching...
No Matches
reorderbuffer.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * reorderbuffer.c
4 * PostgreSQL logical replay/reorder buffer management
5 *
6 *
7 * Copyright (c) 2012-2026, PostgreSQL Global Development Group
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/replication/logical/reorderbuffer.c
12 *
13 * NOTES
14 * This module gets handed individual pieces of transactions in the order
15 * they are written to the WAL and is responsible for reassembling them into
16 * toplevel transaction sized pieces. When a transaction is completely
17 * reassembled - signaled by reading the transaction commit record - it
18 * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 * individual changes. The output plugins rely on snapshots built by
20 * snapbuild.c which hands them to us.
21 *
22 * Transactions and subtransactions/savepoints in postgres are not
23 * immediately linked to each other from outside the performing
24 * backend. Only at commit/abort (or special xact_assignment records) they
25 * are linked together. Which means that we will have to splice together a
26 * toplevel transaction from its subtransactions. To do that efficiently we
27 * build a binary heap indexed by the smallest current lsn of the individual
28 * subtransactions' changestreams. As the individual streams are inherently
29 * ordered by LSN - since that is where we build them from - the transaction
30 * can easily be reassembled by always using the subtransaction with the
31 * smallest current LSN from the heap.
32 *
33 * In order to cope with large transactions - which can be several times as
34 * big as the available memory - this module supports spooling the contents
35 * of large transactions to disk. When the transaction is replayed the
36 * contents of individual (sub-)transactions will be read from disk in
37 * chunks.
38 *
39 * This module also has to deal with reassembling toast records from the
40 * individual chunks stored in WAL. When a new (or initial) version of a
41 * tuple is stored in WAL it will always be preceded by the toast chunks
42 * emitted for the columns stored out of line. Within a single toplevel
43 * transaction there will be no other data carrying records between a row's
44 * toast chunks and the row data itself. See ReorderBufferToast* for
45 * details.
46 *
47 * ReorderBuffer uses two special memory context types - SlabContext for
48 * allocations of fixed-length structures (changes and transactions), and
49 * GenerationContext for the variable-length transaction data (allocated
50 * and freed in groups with similar lifespans).
51 *
52 * To limit the amount of memory used by decoded changes, we track memory
53 * used at the reorder buffer level (i.e. total amount of memory), and for
54 * each transaction. When the total amount of used memory exceeds the
55 * limit, the transaction consuming the most memory is then serialized to
56 * disk.
57 *
58 * Only decoded changes are evicted from memory (spilled to disk), not the
59 * transaction records. The number of toplevel transactions is limited,
60 * but a transaction with many subtransactions may still consume significant
61 * amounts of memory. However, the transaction records are fairly small and
62 * are not included in the memory limit.
63 *
64 * The current eviction algorithm is very simple - the transaction is
65 * picked merely by size, while it might be useful to also consider age
66 * (LSN) of the changes for example. With the new Generational memory
67 * allocator, evicting the oldest changes would make it more likely the
68 * memory gets actually freed.
69 *
70 * We use a max-heap with transaction size as the key to efficiently find
71 * the largest transaction. We update the max-heap whenever the memory
72 * counter is updated; however transactions with size 0 are not stored in
73 * the heap, because they have no changes to evict.
74 *
75 * We still rely on max_changes_in_memory when loading serialized changes
76 * back into memory. At that point we can't use the memory limit directly
77 * as we load the subxacts independently. One option to deal with this
78 * would be to count the subxacts, and allow each to allocate 1/N of the
79 * memory limit. That however does not seem very appealing, because with
80 * many subtransactions it may easily cause thrashing (short cycles of
81 * deserializing and applying very few changes). We probably should give
82 * a bit more memory to the oldest subtransactions, because it's likely
83 * they are the source for the next sequence of changes.
84 *
85 * -------------------------------------------------------------------------
86 */
87#include "postgres.h"
88
89#include <unistd.h>
90#include <sys/stat.h>
91
92#include "access/detoast.h"
93#include "access/heapam.h"
94#include "access/rewriteheap.h"
95#include "access/transam.h"
96#include "access/xact.h"
98#include "catalog/catalog.h"
99#include "common/int.h"
100#include "lib/binaryheap.h"
101#include "miscadmin.h"
102#include "pgstat.h"
103#include "replication/logical.h"
105#include "replication/slot.h"
106#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107#include "storage/bufmgr.h"
108#include "storage/fd.h"
109#include "storage/procarray.h"
110#include "storage/sinval.h"
111#include "utils/builtins.h"
112#include "utils/inval.h"
113#include "utils/memutils.h"
114#include "utils/rel.h"
116#include "utils/wait_event.h"
117
118/*
119 * Each transaction has an 8MB limit for invalidation messages distributed from
120 * other transactions. This limit is set considering scenarios with many
121 * concurrent logical decoding operations. When the distributed invalidation
122 * messages reach this threshold, the transaction is marked as
123 * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost
124 * some inval messages and hence don't know what needs to be invalidated.
125 */
126#define MAX_DISTR_INVAL_MSG_PER_TXN \
127 ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage))
128
129/* entry for a hash table we use to map from xid to our transaction state */
135
136/* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
142
150
151/* Virtual file descriptor with file offset tracking */
152typedef struct TXNEntryFile
153{
154 File vfd; /* -1 when the file is closed */
155 off_t curOffset; /* offset for next write or read. Reset to 0
156 * when vfd is opened. */
158
159/* k-way in-order change iteration support structures */
168
176
177/* toast datastructures */
179{
180 Oid chunk_id; /* toast_table.chunk_id */
181 int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
182 * have seen */
183 Size num_chunks; /* number of chunks we've already seen */
184 Size size; /* combined size of chunks seen */
185 dlist_head chunks; /* linked list of chunks */
186 varlena *reconstructed; /* reconstructed varlena now pointed to in
187 * main tup */
189
190/* Disk serialization support datastructures */
197
/* True if 'action' is the internal speculative-insert change type. */
198#define IsSpecInsert(action) \
199( \
200 ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
201)
/* True if 'action' is the internal speculative confirm or abort change type. */
202#define IsSpecConfirmOrAbort(action) \
203( \
204 (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
205 ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
206)
/*
 * True if 'action' is an insert, an update, or an internal speculative
 * insert.
 */
207#define IsInsertOrUpdate(action) \
208( \
209 (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
210 ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
211 ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
212)
213
214/*
215 * Maximum number of changes kept in memory, per transaction. After that,
216 * changes are spooled to disk.
217 *
218 * The current value should be sufficient to decode the entire transaction
219 * without hitting disk in OLTP workloads, while starting to spool to disk in
220 * other workloads reasonably fast.
221 *
222 * At some point in the future it probably makes sense to have a more elaborate
223 * resource management here, but it's not entirely clear what that would look
224 * like.
225 */
227static const Size max_changes_in_memory = 4096; /* XXX for restore only */
228
229/* GUC variable */
231
232/* ---------------------------------------
233 * primary reorderbuffer support routines
234 * ---------------------------------------
235 */
239 TransactionId xid, bool create, bool *is_new,
240 XLogRecPtr lsn, bool create_as_top);
243
245
246/* ---------------------------------------
247 * support functions for lsn-order iterating over the ->changes of a
248 * transaction and its subtransactions
249 *
250 * used for iteration over the k-way heap merge of a transaction and its
251 * subtransactions
252 * ---------------------------------------
253 */
260
261/*
262 * ---------------------------------------
263 * Disk serialization support functions
264 * ---------------------------------------
265 */
269 int fd, ReorderBufferChange *change);
271 TXNEntryFile *file, XLogSegNo *segno);
273 char *data);
276 bool txn_prepared);
279static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
280static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
281 TransactionId xid, XLogSegNo segno);
282static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
283
287
288/*
289 * ---------------------------------------
290 * Streaming support functions
291 * ---------------------------------------
292 */
293static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
297
298/* ---------------------------------------
299 * toast reassembly support
300 * ---------------------------------------
301 */
305 Relation relation, ReorderBufferChange *change);
307 Relation relation, ReorderBufferChange *change);
308
309/*
310 * ---------------------------------------
311 * memory accounting
312 * ---------------------------------------
313 */
316 ReorderBufferChange *change,
317 ReorderBufferTXN *txn,
318 bool addition, Size sz);
319
320/*
321 * Allocate a new ReorderBuffer and clean out any old serialized state from
322 * prior ReorderBuffer instances for the same slot.
323 */
326{
327 ReorderBuffer *buffer;
330
332
333 /* allocate memory in own context, to have better accountability */
335 "ReorderBuffer",
337
338 buffer =
340
341 memset(&hash_ctl, 0, sizeof(hash_ctl));
342
343 buffer->context = new_ctx;
344
346 "Change",
348 sizeof(ReorderBufferChange));
349
351 "TXN",
353 sizeof(ReorderBufferTXN));
354
355 /*
356 * To minimize memory fragmentation caused by long-running transactions
357 * with changes spanning multiple memory blocks, we use a single
358 * fixed-size memory block for decoded tuple storage. The performance
359 * testing showed that the default memory block size maintains logical
360 * decoding performance without causing fragmentation due to concurrent
361 * transactions. One might think that we can use the max size as
362 * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
363 * the memory fragmentation.
364 */
366 "Tuples",
370
371 hash_ctl.keysize = sizeof(TransactionId);
372 hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
373 hash_ctl.hcxt = buffer->context;
374
375 buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
377
379 buffer->by_txn_last_txn = NULL;
380
381 buffer->outbuf = NULL;
382 buffer->outbufsize = 0;
383 buffer->size = 0;
384
385 /* txn_heap is ordered by transaction size */
387
388 buffer->spillTxns = 0;
389 buffer->spillCount = 0;
390 buffer->spillBytes = 0;
391 buffer->streamTxns = 0;
392 buffer->streamCount = 0;
393 buffer->streamBytes = 0;
394 buffer->memExceededCount = 0;
395 buffer->totalTxns = 0;
396 buffer->totalBytes = 0;
397
399
400 dlist_init(&buffer->toplevel_by_lsn);
402 dclist_init(&buffer->catchange_txns);
403
404 /*
405 * Ensure there's no stale data from prior uses of this slot, in case some
406 * prior exit avoided calling ReorderBufferFree. Failure to do this can
407 * produce duplicated txns, and it's very cheap if there's nothing there.
408 */
410
411 return buffer;
412}
413
414/*
415 * Free a ReorderBuffer
416 */
417void
419{
420 MemoryContext context = rb->context;
421
422 /*
423 * We free separately allocated data by entirely scrapping reorderbuffer's
424 * memory context.
425 */
426 MemoryContextDelete(context);
427
428 /* Free disk space used by unconsumed reorder buffers */
430}
431
432/*
433 * Allocate a new ReorderBufferTXN.
434 */
435static ReorderBufferTXN *
437{
438 ReorderBufferTXN *txn;
439
440 txn = (ReorderBufferTXN *)
441 MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
442
443 memset(txn, 0, sizeof(ReorderBufferTXN));
444
445 dlist_init(&txn->changes);
446 dlist_init(&txn->tuplecids);
447 dlist_init(&txn->subtxns);
448
449 /* InvalidCommandId is not zero, so set it explicitly */
452
453 return txn;
454}
455
456/*
457 * Free a ReorderBufferTXN.
458 */
459static void
461{
462 /* clean the lookup cache if we were cached (quite likely) */
463 if (rb->by_txn_last_xid == txn->xid)
464 {
465 rb->by_txn_last_xid = InvalidTransactionId;
466 rb->by_txn_last_txn = NULL;
467 }
468
469 /* free data that's contained */
470
471 if (txn->gid != NULL)
472 {
473 pfree(txn->gid);
474 txn->gid = NULL;
475 }
476
477 if (txn->tuplecid_hash != NULL)
478 {
480 txn->tuplecid_hash = NULL;
481 }
482
483 if (txn->invalidations)
484 {
485 pfree(txn->invalidations);
486 txn->invalidations = NULL;
487 }
488
490 {
493 }
494
495 /* Reset the toast hash */
497
498 /* All changes must be deallocated */
499 Assert(txn->size == 0);
500
501 pfree(txn);
502}
503
504/*
505 * Allocate a ReorderBufferChange.
506 */
509{
510 ReorderBufferChange *change;
511
512 change = (ReorderBufferChange *)
513 MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
514
515 memset(change, 0, sizeof(ReorderBufferChange));
516 return change;
517}
518
519/*
520 * Free a ReorderBufferChange and update memory accounting, if requested.
521 */
522void
524 bool upd_mem)
525{
526 /* update memory accounting info */
527 if (upd_mem)
530
531 /* free contained data */
532 switch (change->action)
533 {
538 if (change->data.tp.newtuple)
539 {
541 change->data.tp.newtuple = NULL;
542 }
543
544 if (change->data.tp.oldtuple)
545 {
547 change->data.tp.oldtuple = NULL;
548 }
549 break;
551 if (change->data.msg.prefix != NULL)
552 pfree(change->data.msg.prefix);
553 change->data.msg.prefix = NULL;
554 if (change->data.msg.message != NULL)
555 pfree(change->data.msg.message);
556 change->data.msg.message = NULL;
557 break;
559 if (change->data.inval.invalidations)
560 pfree(change->data.inval.invalidations);
561 change->data.inval.invalidations = NULL;
562 break;
564 if (change->data.snapshot)
565 {
567 change->data.snapshot = NULL;
568 }
569 break;
570 /* no data in addition to the struct itself */
572 if (change->data.truncate.relids != NULL)
573 {
575 change->data.truncate.relids = NULL;
576 }
577 break;
582 break;
583 }
584
585 pfree(change);
586}
587
588/*
589 * Allocate a HeapTuple fitting a tuple of size tuple_len (excluding header
590 * overhead).
591 */
594{
595 HeapTuple tuple;
597
598 alloc_len = tuple_len + SizeofHeapTupleHeader;
599
600 tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
602 tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
603
604 return tuple;
605}
606
607/*
608 * Free a HeapTuple returned by ReorderBufferAllocTupleBuf().
609 */
610void
612{
613 pfree(tuple);
614}
615
616/*
617 * Allocate an array for relids of truncated relations.
618 *
619 * We use the global memory context (for the whole reorder buffer), because
620 * none of the existing ones seems like a good match (some are SLAB, so we
621 * can't use those, and tup_context is meant for tuple data, not relids). We
622 * could add yet another context, but it seems like overkill - TRUNCATE is
623 * not a particularly common operation, so it does not seem worth it.
624 */
625Oid *
627{
628 Oid *relids;
630
631 alloc_len = sizeof(Oid) * nrelids;
632
633 relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
634
635 return relids;
636}
637
638/*
639 * Free an array of relids.
640 */
641void
643{
644 pfree(relids);
645}
646
647/*
648 * Return the ReorderBufferTXN from the given buffer, specified by Xid.
649 * If create is true, and a transaction doesn't already exist, create it
650 * (with the given LSN, and as top transaction if that's specified);
651 * when this happens, is_new is set to true.
652 */
653static ReorderBufferTXN *
655 bool *is_new, XLogRecPtr lsn, bool create_as_top)
656{
657 ReorderBufferTXN *txn;
659 bool found;
660
662
663 /*
664 * Check the one-entry lookup cache first
665 */
666 if (TransactionIdIsValid(rb->by_txn_last_xid) &&
667 rb->by_txn_last_xid == xid)
668 {
669 txn = rb->by_txn_last_txn;
670
671 if (txn != NULL)
672 {
673 /* found it, and it's valid */
674 if (is_new)
675 *is_new = false;
676 return txn;
677 }
678
679 /*
680 * cached as non-existent, and asked not to create? Then nothing else
681 * to do.
682 */
683 if (!create)
684 return NULL;
685 /* otherwise fall through to create it */
686 }
687
688 /*
689 * The cache wasn't hit, or it yielded a "does-not-exist" and we want to
690 * create an entry; either way, consult the hash table.
691 */
692
693 /* search the lookup table */
695 hash_search(rb->by_txn,
696 &xid,
697 create ? HASH_ENTER : HASH_FIND,
698 &found);
699 if (found)
700 txn = ent->txn;
701 else if (create)
702 {
703 /* initialize the new entry, if creation was requested */
704 Assert(ent != NULL);
706
708 ent->txn->xid = xid;
709 txn = ent->txn;
710 txn->first_lsn = lsn;
711 txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
712
713 if (create_as_top)
714 {
715 dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
717 }
718 }
719 else
720 txn = NULL; /* not found and not asked to create */
721
722 /* update cache */
723 rb->by_txn_last_xid = xid;
724 rb->by_txn_last_txn = txn;
725
726 if (is_new)
727 *is_new = !found;
728
729 Assert(!create || txn != NULL);
730 return txn;
731}
732
733/*
734 * Record the partial change for the streaming of in-progress transactions. We
735 * can stream only complete changes so if we have a partial change like toast
736 * table insert or speculative insert then we mark such a 'txn' so that it
737 * can't be streamed. We also ensure that if the changes in such a 'txn' can
738 * be streamed and are above logical_decoding_work_mem threshold then we stream
739 * them as soon as we have a complete change.
740 */
741static void
743 ReorderBufferChange *change,
744 bool toast_insert)
745{
746 ReorderBufferTXN *toptxn;
747
748 /*
749 * The partial changes need to be processed only while streaming
750 * in-progress transactions.
751 */
753 return;
754
755 /* Get the top transaction. */
756 toptxn = rbtxn_get_toptxn(txn);
757
758 /*
759 * Indicate a partial change for toast inserts. The change will be
760 * considered as complete once we get the insert or update on the main
761 * table and we are sure that the pending toast chunks are not required
762 * anymore.
763 *
764 * If we allow streaming when there are pending toast chunks then such
765 * chunks won't be released till the insert (multi_insert) is complete and
766 * we expect the txn to have streamed all changes after streaming. This
767 * restriction is mainly to ensure the correctness of streamed
768 * transactions and it doesn't seem worth lifting such a restriction
769 * just to allow this case because anyway we will stream the transaction
770 * once such an insert is complete.
771 */
772 if (toast_insert)
774 else if (rbtxn_has_partial_change(toptxn) &&
775 IsInsertOrUpdate(change->action) &&
778
779 /*
780 * Indicate a partial change for speculative inserts. The change will be
781 * considered as complete once we get the speculative confirm or abort
782 * token.
783 */
784 if (IsSpecInsert(change->action))
786 else if (rbtxn_has_partial_change(toptxn) &&
789
790 /*
791 * Stream the transaction if it was serialized before and the changes are
792 * now complete in the top-level transaction.
793 *
794 * The reason for doing the streaming of such a transaction as soon as we
795 * get the complete change for it is that previously it would have reached
796 * the memory threshold and wouldn't get streamed because of incomplete
797 * changes. Delaying such transactions would increase apply lag for them.
798 */
800 !(rbtxn_has_partial_change(toptxn)) &&
801 rbtxn_is_serialized(txn) &&
803 ReorderBufferStreamTXN(rb, toptxn);
804}
805
806/*
807 * Queue a change into a transaction so it can be replayed upon commit or will be
808 * streamed when we reach logical_decoding_work_mem threshold.
809 */
810void
812 ReorderBufferChange *change, bool toast_insert)
813{
814 ReorderBufferTXN *txn;
815
816 txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
817
818 /*
819 * If we have detected that the transaction is aborted while streaming the
820 * previous changes or by checking its CLOG, there is no point in
821 * collecting further changes for it.
822 */
823 if (rbtxn_is_aborted(txn))
824 {
825 /*
826 * We don't need to update memory accounting for this change as we
827 * have not added it to the queue yet.
828 */
829 ReorderBufferFreeChange(rb, change, false);
830 return;
831 }
832
833 /*
834 * The changes that are sent downstream are considered streamable. We
835 * remember such transactions so that only those will later be considered
836 * for streaming.
837 */
838 if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
844 {
845 ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
846
848 }
849
850 change->lsn = lsn;
851 change->txn = txn;
852
854 dlist_push_tail(&txn->changes, &change->node);
855 txn->nentries++;
856 txn->nentries_mem++;
857
858 /* update memory accounting information */
861
862 /* process partial change */
864
865 /* check the memory limits and evict something if needed */
867}
868
869/*
870 * A transactional message is queued to be processed upon commit and a
871 * non-transactional message gets processed immediately.
872 */
873void
876 bool transactional, const char *prefix,
877 Size message_size, const char *message)
878{
879 if (transactional)
880 {
881 MemoryContext oldcontext;
882 ReorderBufferChange *change;
883
885
886 /*
887 * We don't expect snapshots for transactional changes - we'll use the
888 * snapshot derived later during apply (unless the change gets
889 * skipped).
890 */
891 Assert(!snap);
892
893 oldcontext = MemoryContextSwitchTo(rb->context);
894
897 change->data.msg.prefix = pstrdup(prefix);
898 change->data.msg.message_size = message_size;
899 change->data.msg.message = palloc(message_size);
900 memcpy(change->data.msg.message, message, message_size);
901
902 ReorderBufferQueueChange(rb, xid, lsn, change, false);
903
904 MemoryContextSwitchTo(oldcontext);
905 }
906 else
907 {
908 ReorderBufferTXN *txn = NULL;
909 volatile Snapshot snapshot_now = snap;
910
911 /* Non-transactional changes require a valid snapshot. */
912 Assert(snapshot_now);
913
914 if (xid != InvalidTransactionId)
915 txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
916
917 /* setup snapshot to allow catalog access */
918 SetupHistoricSnapshot(snapshot_now, NULL);
919 PG_TRY();
920 {
921 rb->message(rb, txn, lsn, false, prefix, message_size, message);
922
924 }
925 PG_CATCH();
926 {
928 PG_RE_THROW();
929 }
930 PG_END_TRY();
931 }
932}
933
934/*
935 * AssertTXNLsnOrder
936 * Verify LSN ordering of transaction lists in the reorderbuffer
937 *
938 * Other LSN-related invariants are checked too.
939 *
940 * No-op if assertions are not in use.
941 */
942static void
944{
945#ifdef USE_ASSERT_CHECKING
946 LogicalDecodingContext *ctx = rb->private_data;
947 dlist_iter iter;
950
951 /*
952 * Skip the verification if we have not yet reached the LSN at which we
953 * start decoding the contents of transactions. Until we reach that LSN,
954 * we could have transactions whose top-level transaction and
955 * subtransaction are not yet associated with each other and which
956 * consequently have the same LSN. We don't guarantee this association
957 * until we try to decode the actual contents of a transaction. The ordering
958 * of the records prior to the start_decoding_at LSN should have been
959 * checked before the restart.
960 */
962 return;
963
964 dlist_foreach(iter, &rb->toplevel_by_lsn)
965 {
967 iter.cur);
968
969 /* start LSN must be set */
970 Assert(XLogRecPtrIsValid(cur_txn->first_lsn));
971
972 /* If there is an end LSN, it must not be lower than the start LSN */
973 if (XLogRecPtrIsValid(cur_txn->end_lsn))
974 Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
975
976 /* Current initial LSN must be strictly higher than previous */
979
980 /* known-as-subtxn txns must not be listed */
982
983 prev_first_lsn = cur_txn->first_lsn;
984 }
985
986 dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
987 {
989 base_snapshot_node,
990 iter.cur);
991
992 /* base snapshot (and its LSN) must be set */
993 Assert(cur_txn->base_snapshot != NULL);
994 Assert(XLogRecPtrIsValid(cur_txn->base_snapshot_lsn));
995
996 /* current LSN must be strictly higher than previous */
998 Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
999
1000 /* known-as-subtxn txns must not be listed */
1002
1003 prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
1004 }
1005#endif
1006}
1007
1008/*
1009 * AssertChangeLsnOrder
1010 *
1011 * Check ordering of changes in the (sub)transaction.
1012 */
1013static void
1015{
1016#ifdef USE_ASSERT_CHECKING
1017 dlist_iter iter;
1019
1020 dlist_foreach(iter, &txn->changes)
1021 {
1023
1025
1028 Assert(txn->first_lsn <= cur_change->lsn);
1029
1030 if (XLogRecPtrIsValid(txn->end_lsn))
1031 Assert(cur_change->lsn <= txn->end_lsn);
1032
1034
1035 prev_lsn = cur_change->lsn;
1036 }
1037#endif
1038}
1039
1040/*
1041 * ReorderBufferGetOldestTXN
1042 * Return oldest transaction in reorderbuffer
1043 */
1046{
1047 ReorderBufferTXN *txn;
1048
1050
1051 if (dlist_is_empty(&rb->toplevel_by_lsn))
1052 return NULL;
1053
1054 txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1055
1058 return txn;
1059}
1060
1061/*
1062 * ReorderBufferGetOldestXmin
1063 * Return oldest Xmin in reorderbuffer
1064 *
1065 * Returns oldest possibly running Xid from the point of view of snapshots
1066 * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1067 * there are none.
1068 *
1069 * Since snapshots are assigned monotonically, this equals the Xmin of the
1070 * base snapshot with minimal base_snapshot_lsn.
1071 */
1074{
1075 ReorderBufferTXN *txn;
1076
1078
1079 if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1080 return InvalidTransactionId;
1081
1082 txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1083 &rb->txns_by_base_snapshot_lsn);
1084 return txn->base_snapshot->xmin;
1085}
1086
1087void
1089{
1090 rb->current_restart_decoding_lsn = ptr;
1091}
1092
1093/*
1094 * ReorderBufferAssignChild
1095 *
1096 * Make note that we know that subxid is a subtransaction of xid, seen as of
1097 * the given lsn.
1098 */
1099void
1101 TransactionId subxid, XLogRecPtr lsn)
1102{
1103 ReorderBufferTXN *txn;
1105 bool new_top;
1106 bool new_sub;
1107
1108 txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1109 subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1110
1111 if (!new_sub)
1112 {
1114 {
1115 /* already associated, nothing to do */
1116 return;
1117 }
1118 else
1119 {
1120 /*
1121 * We already saw this transaction, but initially added it to the
1122 * list of top-level txns. Now that we know it's not top-level,
1123 * remove it from there.
1124 */
1125 dlist_delete(&subtxn->node);
1126 }
1127 }
1128
1129 subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1130 subtxn->toplevel_xid = xid;
1131 Assert(subtxn->nsubtxns == 0);
1132
1133 /* set the reference to top-level transaction */
1134 subtxn->toptxn = txn;
1135
1136 /* add to subtransaction list */
1137 dlist_push_tail(&txn->subtxns, &subtxn->node);
1138 txn->nsubtxns++;
1139
1140 /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1142
1143 /* Verify LSN-ordering invariant */
1145}
1146
1147/*
1148 * ReorderBufferTransferSnapToParent
1149 * Transfer base snapshot from subtxn to top-level txn, if needed
1150 *
1151 * This is done if the top-level txn doesn't have a base snapshot, or if the
1152 * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1153 * snapshot's LSN. This can happen if there are no changes in the toplevel
1154 * txn but there are some in the subtxn, or the first change in subtxn has
1155 * earlier LSN than first change in the top-level txn and we learned about
1156 * their kinship only now.
1157 *
1158 * The subtransaction's snapshot is cleared regardless of the transfer
1159 * happening, since it's not needed anymore in either case.
1160 *
1161 * We do this as soon as we become aware of their kinship, to avoid queueing
1162 * extra snapshots to txns known-as-subtxns -- only top-level txns will
1163 * receive further snapshots.
1164 */
1165static void
1168{
1169 Assert(subtxn->toplevel_xid == txn->xid);
1170
1171 if (subtxn->base_snapshot != NULL)
1172 {
1173 if (txn->base_snapshot == NULL ||
1174 subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1175 {
1176 /*
1177 * If the toplevel transaction already has a base snapshot but
1178 * it's newer than the subxact's, purge it.
1179 */
1180 if (txn->base_snapshot != NULL)
1181 {
1184 }
1185
1186 /*
1187 * The snapshot is now the top transaction's; transfer it, and
1188 * adjust the list position of the top transaction in the list by
1189 * moving it to where the subtransaction is.
1190 */
1191 txn->base_snapshot = subtxn->base_snapshot;
1192 txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1193 dlist_insert_before(&subtxn->base_snapshot_node,
1194 &txn->base_snapshot_node);
1195
1196 /*
1197 * The subtransaction doesn't have a snapshot anymore (so it
1198 * mustn't be in the list.)
1199 */
1200 subtxn->base_snapshot = NULL;
1201 subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1202 dlist_delete(&subtxn->base_snapshot_node);
1203 }
1204 else
1205 {
1206 /* Base snap of toplevel is fine, so subxact's is not needed */
1207 SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1208 dlist_delete(&subtxn->base_snapshot_node);
1209 subtxn->base_snapshot = NULL;
1210 subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1211 }
1212 }
1213}
1214
1215/*
1216 * Associate a subtransaction with its toplevel transaction at commit
1217 * time. There may be no further changes added after this.
1218 */
1219void
1221 TransactionId subxid, XLogRecPtr commit_lsn,
1222 XLogRecPtr end_lsn)
1223{
1225
1226 subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1227 InvalidXLogRecPtr, false);
1228
1229 /*
1230 * No need to do anything if that subtxn didn't contain any changes
1231 */
1232 if (!subtxn)
1233 return;
1234
1235 subtxn->final_lsn = commit_lsn;
1236 subtxn->end_lsn = end_lsn;
1237
1238 /*
1239 * Assign this subxact as a child of the toplevel xact (no-op if already
1240 * done.)
1241 */
1243}
1244
1245
1246/*
1247 * Support for efficiently iterating over a transaction's and its
1248 * subtransactions' changes.
1249 *
1250 * We do this by doing a k-way merge between transactions/subtransactions.
1251 * For that we model the current heads of the different transactions as a
1252 * binary heap so we easily know which (sub-)transaction has the change with
1253 * the smallest lsn next.
1254 *
1255 * We assume the changes in individual transactions are already sorted by LSN.
1256 */
1257
1258/*
1259 * Binary heap comparison function: a smaller LSN compares as larger.
1260 */
1261static int
1263{
1265 XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1266 XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1267
1268 if (pos_a < pos_b)
1269 return 1;
1270 else if (pos_a == pos_b)
1271 return 0;
1272 return -1;
1273}
1274
1275/*
1276 * Allocate & initialize an iterator which iterates in lsn order over a
1277 * transaction and all its subtransactions.
1278 *
1279 * Note: The iterator state is returned through iter_state parameter rather
1280 * than the function's return value. This is because the state gets cleaned up
1281 * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1282 * back the state even if this function throws an exception.
 *
 * *iter_state is set to NULL on entry and is only assigned after the entry
 * array and the heap have been allocated. The state is allocated in
 * rb->context and sized for one entry per (sub)transaction whose nentries
 * is greater than zero.
1283 */
1284static void
1287{
1288 Size nr_txns = 0;
1291 int32 off;
1292
1293 *iter_state = NULL;
1294
1295 /* Check ordering of changes in the toplevel transaction. */
1297
1298 /*
1299 * Calculate the size of our heap: one element for every transaction that
1300 * contains changes. (Besides the transactions already in the reorder
1301 * buffer, we count the one we were directly passed.)
1302 */
1303 if (txn->nentries > 0)
1304 nr_txns++;
1305
1307 {
1309
1311
1312 /* Check ordering of changes in this subtransaction. */
1314
1315 if (cur_txn->nentries > 0)
1316 nr_txns++;
1317 }
1318
1319 /* allocate iteration state */
1321 MemoryContextAllocZero(rb->context,
1323 sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1324
1325 state->nr_txns = nr_txns;
1326 dlist_init(&state->old_change);
1327
/*
 * Give every entry a file descriptor of -1 and segment number 0.
 * NOTE(review): -1 presumably means "no file open"; the iterator cleanup
 * only closes descriptors that differ from -1.
 */
1328 for (off = 0; off < state->nr_txns; off++)
1329 {
1330 state->entries[off].file.vfd = -1;
1331 state->entries[off].segno = 0;
1332 }
1333
1334 /* allocate heap */
1335 state->heap = binaryheap_allocate(state->nr_txns,
1337 state);
1338
1339 /* Now that the state fields are initialized, it is safe to return it. */
1340 *iter_state = state;
1341
1342 /*
1343 * Now insert items into the binary heap, in an unordered fashion. (We
1344 * will run a heap assembly step at the end; this is more efficient.)
1345 */
1346
1347 off = 0;
1348
1349 /* add toplevel transaction if it contains changes */
1350 if (txn->nentries > 0)
1351 {
1353
1354 if (rbtxn_is_serialized(txn))
1355 {
1356 /* serialize remaining changes */
1358 ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1359 &state->entries[off].segno);
1360 }
1361
1363 &txn->changes);
1364
/* remember this transaction's current change and its lsn in the entry */
1365 state->entries[off].lsn = cur_change->lsn;
1366 state->entries[off].change = cur_change;
1367 state->entries[off].txn = txn;
1368
1370 }
1371
1372 /* add subtransactions if they contain changes */
1374 {
1376
1378
1379 if (cur_txn->nentries > 0)
1380 {
1382
1384 {
1385 /* serialize remaining changes */
1388 &state->entries[off].file,
1389 &state->entries[off].segno);
1390 }
1392 &cur_txn->changes);
1393
/* same bookkeeping as for the toplevel transaction above */
1394 state->entries[off].lsn = cur_change->lsn;
1395 state->entries[off].change = cur_change;
1396 state->entries[off].txn = cur_txn;
1397
1399 }
1400 }
1401
1402 /* assemble a valid binary heap */
1403 binaryheap_build(state->heap);
1404}
1405
1406/*
1407 * Return the next change when iterating over a transaction and its
1408 * subtransactions.
1409 *
1410 * Returns NULL when no further changes exist.
 *
 * The change returned is the current one of the entry at the top of the
 * heap. Before returning, that entry is advanced to the transaction's next
 * in-memory change or, failing that, to a change obtained after restoring
 * more changes from disk.
 *
 * A change handed out right before such a restore is unlinked from the
 * transaction's list and parked on state->old_change; it is freed at the
 * start of the following call.
1411 */
1412static ReorderBufferChange *
1414{
1415 ReorderBufferChange *change;
1417 int32 off;
1418
1419 /* nothing there anymore */
1420 if (binaryheap_empty(state->heap))
1421 return NULL;
1422
/* the heap stores indexes into state->entries */
1423 off = DatumGetInt32(binaryheap_first(state->heap));
1424 entry = &state->entries[off];
1425
1426 /* free memory we might have "leaked" in the previous *Next call */
1427 if (!dlist_is_empty(&state->old_change))
1428 {
1429 change = dlist_container(ReorderBufferChange, node,
1430 dlist_pop_head_node(&state->old_change));
1431 ReorderBufferFreeChange(rb, change, true);
1432 Assert(dlist_is_empty(&state->old_change));
1433 }
1434
1435 change = entry->change;
1436
1437 /*
1438 * update heap with information about which transaction has the next
1439 * relevant change in LSN order
1440 */
1441
1442 /* there are in-memory changes */
1443 if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1444 {
1445 dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1448
1449 /* txn stays the same */
1450 state->entries[off].lsn = next_change->lsn;
1451 state->entries[off].change = next_change;
1452
1454 return change;
1455 }
1456
1457 /* try to load changes from disk */
1458 if (entry->txn->nentries != entry->txn->nentries_mem)
1459 {
1460 /*
1461 * Ugly: restoring changes will reuse *Change records, thus delete the
1462 * current one from the per-tx list and only free in the next call.
1463 */
1464 dlist_delete(&change->node);
1465 dlist_push_tail(&state->old_change, &change->node);
1466
1467 /*
1468 * Update the total bytes processed by the txn for which we are
1469 * releasing the current set of changes and restoring the new set of
1470 * changes.
1471 */
1472 rb->totalBytes += entry->txn->size;
1473 if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1474 &state->entries[off].segno))
1475 {
1476 /* successfully restored changes from disk */
1479 &entry->txn->changes);
1480
1481 elog(DEBUG2, "restored %u/%u changes from disk",
1482 (uint32) entry->txn->nentries_mem,
1483 (uint32) entry->txn->nentries);
1484
1485 Assert(entry->txn->nentries_mem);
1486 /* txn stays the same */
1487 state->entries[off].lsn = next_change->lsn;
1488 state->entries[off].change = next_change;
1490
1491 return change;
1492 }
1493 }
1494
1495 /* ok, no changes there anymore, remove */
1497
1498 return change;
1499}
1500
1501/*
1502 * Deallocate the iterator
 *
 * Closes the file of every entry whose vfd is not -1, frees a change that
 * may still be parked on state->old_change, then frees the heap and the
 * state itself. The state must not be used afterwards.
1503 */
1504static void
1507{
1508 int32 off;
1509
1510 for (off = 0; off < state->nr_txns; off++)
1511 {
1512 if (state->entries[off].file.vfd != -1)
1513 FileClose(state->entries[off].file.vfd);
1514 }
1515
1516 /* free memory we might have "leaked" in the last *Next call */
1517 if (!dlist_is_empty(&state->old_change))
1518 {
1519 ReorderBufferChange *change;
1520
1521 change = dlist_container(ReorderBufferChange, node,
1522 dlist_pop_head_node(&state->old_change));
1523 ReorderBufferFreeChange(rb, change, true);
/* at most one change is expected to be parked at a time */
1524 Assert(dlist_is_empty(&state->old_change));
1525 }
1526
1527 binaryheap_free(state->heap);
1528 pfree(state);
1529}
1530
1531/*
1532 * Cleanup the contents of a transaction, usually after the transaction
1533 * committed or aborted.
 *
 * Walks txn's subtransactions, frees the changes and tuplecids queued in
 * txn, handles the base snapshot and the last streamed-run snapshot when
 * set, unlinks txn from its containing lists and removes its entry from
 * rb->by_txn (asserting the entry existed).
1534 */
1535static void
1537{
1538 bool found;
1539 dlist_mutable_iter iter;
1540 Size mem_freed = 0;
1541
1542 /* cleanup subtransactions & their changes */
1543 dlist_foreach_modify(iter, &txn->subtxns)
1544 {
1546
1548
1549 /*
1550 * Subtransactions are always associated to the toplevel TXN, even if
1551 * they originally were happening inside another subtxn, so we won't
1552 * ever recurse more than one level deep here.
1553 */
1555 Assert(subtxn->nsubtxns == 0);
1556
1558 }
1559
1560 /* cleanup changes in the txn */
1561 dlist_foreach_modify(iter, &txn->changes)
1562 {
1563 ReorderBufferChange *change;
1564
1565 change = dlist_container(ReorderBufferChange, node, iter.cur);
1566
1567 /* Check we're not mixing changes from different transactions. */
1568 Assert(change->txn == txn);
1569
1570 /*
1571 * Instead of updating the memory counter for individual changes, we
1572 * sum up the size of memory to free so we can update the memory
1573 * counter all together below. This saves costs of maintaining the
1574 * max-heap.
1575 */
1577
/*
 * NOTE(review): the final 'false' presumably suppresses the per-change
 * memory counter update described above -- confirm against
 * ReorderBufferFreeChange.
 */
1578 ReorderBufferFreeChange(rb, change, false);
1579 }
1580
1581 /* Update the memory counter */
1583
1584 /*
1585 * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1586 * They are always stored in the toplevel transaction.
1587 */
1588 dlist_foreach_modify(iter, &txn->tuplecids)
1589 {
1590 ReorderBufferChange *change;
1591
1592 change = dlist_container(ReorderBufferChange, node, iter.cur);
1593
1594 /* Check we're not mixing changes from different transactions. */
1595 Assert(change->txn == txn);
1597
1598 ReorderBufferFreeChange(rb, change, true);
1599 }
1600
1601 /*
1602 * Cleanup the base snapshot, if set.
1603 */
1604 if (txn->base_snapshot != NULL)
1605 {
1608 }
1609
1610 /*
1611 * Cleanup the snapshot for the last streamed run.
1612 */
1613 if (txn->snapshot_now != NULL)
1614 {
1617 }
1618
1619 /*
1620 * Remove TXN from its containing lists.
1621 *
1622 * Note: if txn is known as subxact, we are deleting the TXN from its
1623 * parent's list of known subxacts; this leaves the parent's nsubxacts
1624 * count too high, but we don't care. Otherwise, we are deleting the TXN
1625 * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1626 * list of catalog modifying transactions as well.
1627 */
1628 dlist_delete(&txn->node);
1630 dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1631
1632 /* now remove reference from buffer */
1633 hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1634 Assert(found);
1635
1636 /* remove entries spilled to disk */
1637 if (rbtxn_is_serialized(txn))
1639
1640 /* deallocate */
1642}
1643
1644/*
1645 * Discard changes from a transaction (and subtransactions), either after
1646 * streaming, decoding them at PREPARE, or detecting the transaction abort.
1647 * Keep the remaining info - transactions, tuplecids, invalidations and
1648 * snapshots.
1649 *
1650 * We additionally remove tuplecids after decoding the transaction at prepare
1651 * time as we only need to perform invalidation at rollback or commit prepared.
1652 *
1653 * 'txn_prepared' indicates that we have decoded the transaction at prepare
1654 * time.
 *
 * On return txn->nentries and txn->nentries_mem are both 0 and
 * txn->tuplecid_hash is NULL.
1655 */
1656static void
1658{
1659 dlist_mutable_iter iter;
1660 Size mem_freed = 0;
1661
1662 /* cleanup subtransactions & their changes */
1663 dlist_foreach_modify(iter, &txn->subtxns)
1664 {
1666
1668
1669 /*
1670 * Subtransactions are always associated to the toplevel TXN, even if
1671 * they originally were happening inside another subtxn, so we won't
1672 * ever recurse more than one level deep here.
1673 */
1675 Assert(subtxn->nsubtxns == 0);
1676
1679 }
1680
1681 /* cleanup changes in the txn */
1682 dlist_foreach_modify(iter, &txn->changes)
1683 {
1684 ReorderBufferChange *change;
1685
1686 change = dlist_container(ReorderBufferChange, node, iter.cur);
1687
1688 /* Check we're not mixing changes from different transactions. */
1689 Assert(change->txn == txn);
1690
1691 /* remove the change from its containing list */
1692 dlist_delete(&change->node);
1693
1694 /*
1695 * Instead of updating the memory counter for individual changes, we
1696 * sum up the size of memory to free so we can update the memory
1697 * counter all together below. This saves costs of maintaining the
1698 * max-heap.
1699 */
1701
1702 ReorderBufferFreeChange(rb, change, false);
1703 }
1704
1705 /* Update the memory counter */
1707
1708 if (txn_prepared)
1709 {
1710 /*
1711 * If this is a prepared txn, cleanup the tuplecids we stored for
1712 * decoding catalog snapshot access. They are always stored in the
1713 * toplevel transaction.
1714 */
1715 dlist_foreach_modify(iter, &txn->tuplecids)
1716 {
1717 ReorderBufferChange *change;
1718
1719 change = dlist_container(ReorderBufferChange, node, iter.cur);
1720
1721 /* Check we're not mixing changes from different transactions. */
1722 Assert(change->txn == txn);
1724
1725 /* Remove the change from its containing list. */
1726 dlist_delete(&change->node);
1727
1728 ReorderBufferFreeChange(rb, change, true);
1729 }
1730 }
1731
1732 /*
1733 * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1734 * memory. We could also keep the hash table and update it with new ctid
1735 * values, but this seems simpler and good enough for now.
1736 */
1737 if (txn->tuplecid_hash != NULL)
1738 {
1740 txn->tuplecid_hash = NULL;
1741 }
1742
1743 /* If this txn is serialized then clean the disk space. */
1744 if (rbtxn_is_serialized(txn))
1745 {
1748
1749 /*
1750 * We set this flag to indicate if the transaction is ever serialized.
1751 * We need this to accurately update the stats as otherwise the same
1752 * transaction can be counted as serialized multiple times.
1753 */
1755 }
1756
1757 /* also reset the number of entries in the transaction */
1758 txn->nentries_mem = 0;
1759 txn->nentries = 0;
1760}
1761
1762/*
1763 * Check the transaction status by CLOG lookup and discard all changes if
1764 * the transaction is aborted. The transaction status is cached in
1765 * txn->txn_flags so we can skip future changes and avoid CLOG lookups on the
1766 * next call.
1767 *
1768 * Return true if the transaction is aborted, otherwise return false.
1769 *
1770 * When the 'debug_logical_replication_streaming' is set to "immediate", we
1771 * don't check the transaction status, meaning the caller will always process
1772 * this transaction.
 *
 * Outcomes, in the order the checks are made (after the quick return for
 * regression tests):
 *  - rbtxn_is_committed(txn): return false
 *  - rbtxn_is_aborted(txn): return true; txn->size is asserted to be 0
 *  - a further early "return false" precedes the commit lookup
 *  - TransactionIdDidCommit(txn->xid): return false
 *  - otherwise: return true; txn->size is asserted to be 0
1773 */
1774static bool
1776{
1777 /* Quick return for regression tests */
1779 return false;
1780
1781 /*
1782 * Quick return if the transaction status is already known.
1783 */
1784
1785 if (rbtxn_is_committed(txn))
1786 return false;
1787 if (rbtxn_is_aborted(txn))
1788 {
1789 /* Already-aborted transactions should not have any changes */
1790 Assert(txn->size == 0);
1791
1792 return true;
1793 }
1794
1795 /* Otherwise, check the transaction status using CLOG lookup */
1796
1798 return false;
1799
1800 if (TransactionIdDidCommit(txn->xid))
1801 {
1802 /*
1803 * Remember the transaction is committed so that we can skip CLOG
1804 * check next time, avoiding the pressure on CLOG lookup.
1805 */
1806 Assert(!rbtxn_is_aborted(txn));
1808 return false;
1809 }
1810
1811 /*
1812 * The transaction aborted. We discard both the changes collected so far
1813 * and the toast reconstruction data. The full cleanup will happen as part
1814 * of decoding ABORT record of this transaction.
1815 */
1818
1819 /* All changes should be discarded */
1820 Assert(txn->size == 0);
1821
1822 /*
1823 * Mark the transaction as aborted so we can ignore future changes of this
1824 * transaction.
1825 */
1828
1829 return true;
1830}
1831
1832/*
1833 * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1834 * HeapTupleSatisfiesHistoricMVCC.
 *
 * Unless the early return below is taken, a new hash table is created in
 * rb->context, stored in txn->tuplecid_hash and filled from the changes
 * queued in txn->tuplecids. When the same key shows up again, cmin is
 * asserted to be unchanged and only cmax is overwritten.
1835 */
1836static void
1838{
1839 dlist_iter iter;
1841
1843 return;
1844
1846 hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1847 hash_ctl.hcxt = rb->context;
1848
1849 /*
1850 * create the hash with the exact number of to-be-stored tuplecids from
1851 * the start
1852 */
1853 txn->tuplecid_hash =
1854 hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1856
1857 dlist_foreach(iter, &txn->tuplecids)
1858 {
1861 bool found;
1862 ReorderBufferChange *change;
1863
1864 change = dlist_container(ReorderBufferChange, node, iter.cur);
1865
1867
1868 /* be careful about padding */
1869 memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1870
1871 key.rlocator = change->data.tuplecid.locator;
1872
1874 &key.tid);
1875
1877 hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1878 if (!found)
1879 {
/* first sighting of this key: record all three command ids */
1880 ent->cmin = change->data.tuplecid.cmin;
1881 ent->cmax = change->data.tuplecid.cmax;
1882 ent->combocid = change->data.tuplecid.combocid;
1883 }
1884 else
1885 {
1886 /*
1887 * Maybe we already saw this tuple before in this transaction, but
1888 * if so it must have the same cmin.
1889 */
1890 Assert(ent->cmin == change->data.tuplecid.cmin);
1891
1892 /*
1893 * cmax may be initially invalid, but once set it can only grow,
1894 * and never become invalid again.
1895 */
1896 Assert((ent->cmax == InvalidCommandId) ||
1897 ((change->data.tuplecid.cmax != InvalidCommandId) &&
1898 (change->data.tuplecid.cmax > ent->cmax)));
1899 ent->cmax = change->data.tuplecid.cmax;
1900 }
1901 }
1902}
1903
1904/*
1905 * Copy a provided snapshot so we can modify it privately. This is needed so
1906 * that catalog modifying transactions can look into intermediate catalog
1907 * states.
 *
 * orig_snap is the snapshot to copy, txn supplies the xids placed in the
 * copy's subxip array, and cid becomes the copy's curcid.
 *
 * The copy is a single allocation in rb->context: the SnapshotData struct,
 * followed by the xip array, followed by the subxip array. It is returned
 * with copied = true, active_count = 1 and regd_count = 0, and with subxip
 * sorted using xidComparator.
1908 */
1909static Snapshot
1912{
1913 Snapshot snap;
1914 dlist_iter iter;
1915 int i = 0;
1916 Size size;
1917
/* struct + xcnt xip entries + room for the toplevel xid and nsubtxns subxids */
1918 size = sizeof(SnapshotData) +
1919 sizeof(TransactionId) * orig_snap->xcnt +
1920 sizeof(TransactionId) * (txn->nsubtxns + 1);
1921
1922 snap = MemoryContextAllocZero(rb->context, size);
1923 memcpy(snap, orig_snap, sizeof(SnapshotData));
1924
1925 snap->copied = true;
1926 snap->active_count = 1; /* mark as active so nobody frees it */
1927 snap->regd_count = 0;
/* the xip array lives directly behind the struct in the same allocation */
1928 snap->xip = (TransactionId *) (snap + 1);
1929
1930 memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1931
1932 /*
1933 * snap->subxip contains all txids that belong to our transaction which we
1934 * need to check via cmin/cmax. That's why we store the toplevel
1935 * transaction in there as well.
1936 */
1937 snap->subxip = snap->xip + snap->xcnt;
1938 snap->subxip[i++] = txn->xid;
1939
1940 /*
1941 * txn->nsubtxns isn't decreased when subtransactions abort, so count
1942 * manually. Since it's an upper boundary it is safe to use it for the
1943 * allocation above.
1944 */
1945 snap->subxcnt = 1;
1946
1947 dlist_foreach(iter, &txn->subtxns)
1948 {
1950
1952 snap->subxip[i++] = sub_txn->xid;
1953 snap->subxcnt++;
1954 }
1955
1956 /* sort so we can bsearch() later */
1957 qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1958
1959 /* store the specified current CommandId */
1960 snap->curcid = cid;
1961
1962 return snap;
1963}
1964
1965/*
1966 * Free a previously ReorderBufferCopySnap'ed snapshot
 *
 * A snapshot flagged as copied is released with pfree(); any other snapshot
 * is handled by the else branch instead of being pfree'd.
 * NOTE(review): the else branch presumably drops a reference rather than
 * freeing the memory -- confirm.
1967 */
1968static void
1970{
1971 if (snap->copied)
1972 pfree(snap);
1973 else
1975}
1976
1977/*
1978 * If the transaction was (partially) streamed, we need to prepare or commit
1979 * it in a 'streamed' way. That is, we first stream the remaining part of the
1980 * transaction, and then invoke stream_prepare or stream_commit message as per
1981 * the case.
 *
 * For a prepared transaction the stream_prepare callback is invoked with
 * txn->final_lsn and the transaction is then truncated with the
 * txn_prepared flag set; otherwise the stream_commit callback is invoked
 * with txn->final_lsn.
1982 */
1983static void
1985{
1986 /* we should only call this for previously streamed transactions */
1988
1990
1991 if (rbtxn_is_prepared(txn))
1992 {
1993 /*
1994 * Note, we send stream prepare even if a concurrent abort is
1995 * detected. See DecodePrepare for more information.
1996 */
1998 rb->stream_prepare(rb, txn, txn->final_lsn);
2000
2001 /*
2002 * This is a PREPARED transaction, part of a two-phase commit. The
2003 * full cleanup will happen as part of the COMMIT PREPAREDs, so now
2004 * just truncate txn by removing changes and tuplecids.
2005 */
2006 ReorderBufferTruncateTXN(rb, txn, true);
2007 /* Reset the CheckXidAlive */
2009 }
2010 else
2011 {
2012 rb->stream_commit(rb, txn, txn->final_lsn);
2014 }
2015}
2016
2017/*
2018 * Set xid to detect concurrent aborts.
2019 *
2020 * While streaming an in-progress transaction or decoding a prepared
2021 * transaction there is a possibility that the (sub)transaction might get
2022 * aborted concurrently. In such case if the (sub)transaction has catalog
2023 * update then we might decode the tuple using wrong catalog version. For
2024 * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
2025 * the transaction 501 updates the catalog tuple and after that we will have
2026 * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
2027 * aborted and some other transaction say 502 updates the same catalog tuple
2028 * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
2029 * problem is that when we try to decode the tuple inserted/updated in 501
2030 * after the catalog update, we will see the catalog tuple with (xmin: 500,
2031 * xmax: 502) as visible because it will consider that the tuple is deleted by
2032 * xid 502 which is not visible to our snapshot. And when we will try to
2033 * decode with that catalog tuple, it can lead to a wrong result or a crash.
2034 * So, it is necessary to detect concurrent aborts to allow streaming of
2035 * in-progress transactions or decoding of prepared transactions.
2036 *
2037 * For detecting the concurrent abort we set CheckXidAlive to the current
2038 * (sub)transaction's xid for which this change belongs to. And, during
2039 * catalog scan we can check the status of the xid and if it is aborted we will
2040 * report a specific error so that we can stop streaming current transaction
2041 * and discard the already streamed changes on such an error. We might have
2042 * already streamed some of the changes for the aborted (sub)transaction, but
2043 * that is fine because when we decode the abort we will stream abort message
2044 * to truncate the changes in the subscriber. Similarly, for prepared
2045 * transactions, we stop decoding if concurrent abort is detected and then
2046 * rollback the changes when rollback prepared is encountered. See
2047 * DecodePrepare.
 *
 * In short: after the early-return check, CheckXidAlive is set to xid when
 * TransactionIdDidCommit(xid) reports that xid has not committed; a
 * committed xid takes the else branch instead.
2048 */
2049static inline void
2051{
2052 /*
2053 * If the input transaction id is already set as a CheckXidAlive then
2054 * nothing to do.
2055 */
2057 return;
2058
2059 /*
2060 * setup CheckXidAlive if it's not committed yet. We don't check if the
2061 * xid is aborted. That will happen during catalog access.
2062 */
2063 if (!TransactionIdDidCommit(xid))
2064 CheckXidAlive = xid;
2065 else
2067}
2068
2069/*
2070 * Helper function for ReorderBufferProcessTXN for applying change.
 *
 * Forwards (rb, txn, relation, change) to the rb->stream_change callback
 * when 'streaming' is true and to rb->apply_change otherwise.
2071 */
2072static inline void
2074 Relation relation, ReorderBufferChange *change,
2075 bool streaming)
2076{
2077 if (streaming)
2078 rb->stream_change(rb, txn, relation, change);
2079 else
2080 rb->apply_change(rb, txn, relation, change);
2081}
2082
2083/*
2084 * Helper function for ReorderBufferProcessTXN for applying the truncate.
 *
 * Forwards (rb, txn, nrelations, relations, change) to the
 * rb->stream_truncate callback when 'streaming' is true and to
 * rb->apply_truncate otherwise.
2085 */
2086static inline void
2088 int nrelations, Relation *relations,
2089 ReorderBufferChange *change, bool streaming)
2090{
2091 if (streaming)
2092 rb->stream_truncate(rb, txn, nrelations, relations, change);
2093 else
2094 rb->apply_truncate(rb, txn, nrelations, relations, change);
2095}
2096
2097/*
2098 * Helper function for ReorderBufferProcessTXN for applying the message.
 *
 * Calls rb->stream_message when 'streaming' is true and rb->message
 * otherwise. Both callbacks receive the same arguments: the change's lsn,
 * the constant true, and the prefix, message_size and message taken from
 * change->data.msg.
 *
 * NOTE(review): the literal 'true' is presumably the callbacks'
 * "transactional" flag -- confirm against the callback signatures.
2099 */
2100static inline void
2102 ReorderBufferChange *change, bool streaming)
2103{
2104 if (streaming)
2105 rb->stream_message(rb, txn, change->lsn, true,
2106 change->data.msg.prefix,
2107 change->data.msg.message_size,
2108 change->data.msg.message);
2109 else
2110 rb->message(rb, txn, change->lsn, true,
2111 change->data.msg.prefix,
2112 change->data.msg.message_size,
2113 change->data.msg.message);
2114}
2115
2116/*
2117 * Function to store the command id and snapshot at the end of the current
2118 * stream so that we can reuse the same while sending the next stream.
 *
 * command_id is stored in txn->command_id. If snapshot_now is already
 * flagged as copied, the pointer itself is stored in txn->snapshot_now;
 * otherwise a copy made with ReorderBufferCopySnap is stored.
2119 */
2120static inline void
2122 Snapshot snapshot_now, CommandId command_id)
2123{
2124 txn->command_id = command_id;
2125
2126 /* Avoid copying if it's already copied. */
2127 if (snapshot_now->copied)
2128 txn->snapshot_now = snapshot_now;
2129 else
2130 txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2131 txn, command_id);
2132}
2133
2134/*
2135 * Mark the given transaction as streamed if it's a top-level transaction
2136 * or has changes.
 *
 * "Has changes" is tested as txn->nentries_mem != 0.
 * NOTE(review): nentries_mem presumably counts only the changes currently
 * held in memory -- confirm.
2137 */
2138static void
2140{
2141 /*
2142 * The top-level transaction is always marked as streamed, even if it
2143 * does not contain any changes (that is, when all the changes are in
2144 * subtransactions).
2145 *
2146 * For subtransactions, we only mark them as streamed when there are
2147 * changes in them.
2148 *
2149 * We do it this way because of aborts - we don't want to send aborts for
2150 * XIDs the downstream is not aware of. And of course, it always knows
2151 * about the top-level xact (we send the XID in all messages), but we
2152 * never stream XIDs of empty subxacts.
2153 */
2154 if (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))
2156}
2157
2158/*
2159 * Helper function for ReorderBufferProcessTXN to handle the concurrent
2160 * abort of the streaming transaction. This resets the TXN such that it
2161 * can be used to stream the remaining data of transaction being processed.
2162 * This can happen when the subtransaction is aborted and we still want to
2163 * continue processing the main or other subtransactions data.
 *
 * snapshot_now and command_id are saved into txn via
 * ReorderBufferSaveTXNSnapshot, and last_lsn is passed to the stream_stop
 * callback, but only when rbtxn_is_streamed(txn) holds. On return
 * txn->size is asserted to be 0.
2164 */
2165static void
2167 Snapshot snapshot_now,
2168 CommandId command_id,
2169 XLogRecPtr last_lsn,
2171{
2172 /* Discard the changes that we just streamed */
2174
2175 /* Free all resources allocated for toast reconstruction */
2177
2178 /* Return the spec insert change if it is not NULL */
2179 if (specinsert != NULL)
2180 {
2182 specinsert = NULL;
2183 }
2184
2185 /*
2186 * For the streaming case, stop the stream and remember the command ID and
2187 * snapshot for the streaming run.
2188 */
2189 if (rbtxn_is_streamed(txn))
2190 {
2191 rb->stream_stop(rb, txn, last_lsn);
2192 ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2193 }
2194
2195 /* All changes must be deallocated */
2196 Assert(txn->size == 0);
2197}
2198
2199/*
2200 * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2201 *
2202 * Send data of a transaction (and its subtransactions) to the
2203 * output plugin. We iterate over the top and subtransactions (using a k-way
2204 * merge) and replay the changes in lsn order.
2205 *
2206 * If streaming is true then data will be sent using stream API.
2207 *
2208 * Note: "volatile" markers on some parameters are to avoid trouble with
2209 * PG_TRY inside the function.
2210 */
2211static void
2213 XLogRecPtr commit_lsn,
2214 volatile Snapshot snapshot_now,
2215 volatile CommandId command_id,
2216 bool streaming)
2217{
2218 bool using_subtxn;
2224 volatile bool stream_started = false;
2225 ReorderBufferTXN *volatile curtxn = NULL;
2226
2227 /* build data to be able to lookup the CommandIds of catalog tuples */
2229
2230 /* setup the initial snapshot */
2231 SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2232
2233 /*
2234 * Decoding needs access to syscaches et al., which in turn use
2235 * heavyweight locks and such. Thus we need to have enough state around to
2236 * keep track of those. The easiest way is to simply use a transaction
2237 * internally. That also allows us to easily enforce that nothing writes
2238 * to the database by checking for xid assignments.
2239 *
2240 * When we're called via the SQL SRF there's already a transaction
2241 * started, so start an explicit subtransaction there.
2242 */
2244
2245 PG_TRY();
2246 {
2247 ReorderBufferChange *change;
2248 int changes_count = 0; /* used to accumulate the number of
2249 * changes */
2250
2251 if (using_subtxn)
2252 BeginInternalSubTransaction(streaming ? "stream" : "replay");
2253 else
2255
2256 /*
2257 * We only need to send begin/begin-prepare for non-streamed
2258 * transactions.
2259 */
2260 if (!streaming)
2261 {
2262 if (rbtxn_is_prepared(txn))
2263 rb->begin_prepare(rb, txn);
2264 else
2265 rb->begin(rb, txn);
2266 }
2267
2269 while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2270 {
2271 Relation relation = NULL;
2272 Oid reloid;
2273
2275
2276 /*
2277 * We can't call start stream callback before processing first
2278 * change.
2279 */
2281 {
2282 if (streaming)
2283 {
2284 txn->origin_id = change->origin_id;
2285 rb->stream_start(rb, txn, change->lsn);
2286 stream_started = true;
2287 }
2288 }
2289
2290 /*
2291 * Enforce correct ordering of changes, merged from multiple
2292 * subtransactions. The changes may have the same LSN due to
2293 * MULTI_INSERT xlog records.
2294 */
2296
2297 prev_lsn = change->lsn;
2298
2299 /*
2300 * Set the current xid to detect concurrent aborts. This is
2301 * required for the cases when we decode the changes before the
2302 * COMMIT record is processed.
2303 */
2304 if (streaming || rbtxn_is_prepared(change->txn))
2305 {
2306 curtxn = change->txn;
2308 }
2309
2310 switch (change->action)
2311 {
2313
2314 /*
2315 * Confirmation for speculative insertion arrived. Simply
2316 * use as a normal record. It'll be cleaned up at the end
2317 * of INSERT processing.
2318 */
2319 if (specinsert == NULL)
2320 elog(ERROR, "invalid ordering of speculative insertion changes");
2321 Assert(specinsert->data.tp.oldtuple == NULL);
2322 change = specinsert;
2324
2325 /* intentionally fall through */
2330 Assert(snapshot_now);
2331
2332 reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2333 change->data.tp.rlocator.relNumber);
2334
2335 /*
2336 * Mapped catalog tuple without data, emitted while
2337 * catalog table was in the process of being rewritten. We
2338 * can fail to look up the relfilenumber, because the
2339 * relmapper has no "historic" view, in contrast to the
2340 * normal catalog during decoding. Thus repeated rewrites
2341 * can cause a lookup failure. That's OK because we do not
2342 * decode catalog changes anyway. Normally such tuples
2343 * would be skipped over below, but we can't identify
2344 * whether the table should be logically logged without
2345 * mapping the relfilenumber to the oid.
2346 */
2347 if (reloid == InvalidOid &&
2348 change->data.tp.newtuple == NULL &&
2349 change->data.tp.oldtuple == NULL)
2350 goto change_done;
2351 else if (reloid == InvalidOid)
2352 elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2353 relpathperm(change->data.tp.rlocator,
2354 MAIN_FORKNUM).str);
2355
2356 relation = RelationIdGetRelation(reloid);
2357
2358 if (!RelationIsValid(relation))
2359 elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2360 reloid,
2361 relpathperm(change->data.tp.rlocator,
2362 MAIN_FORKNUM).str);
2363
2364 if (!RelationIsLogicallyLogged(relation))
2365 goto change_done;
2366
2367 /*
2368 * Ignore temporary heaps created during DDL unless the
2369 * plugin has asked for them.
2370 */
2371 if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2372 goto change_done;
2373
2374 /*
2375 * For now ignore sequence changes entirely. Most of the
2376 * time they don't log changes using records we
2377 * understand, so it doesn't make sense to handle the few
2378 * cases we do.
2379 */
2380 if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2381 goto change_done;
2382
2383 /* user-triggered change */
2384 if (!IsToastRelation(relation))
2385 {
2386 ReorderBufferToastReplace(rb, txn, relation, change);
2387 ReorderBufferApplyChange(rb, txn, relation, change,
2388 streaming);
2389
2390 /*
2391 * Only clear reassembled toast chunks if we're sure
2392 * they're not required anymore. The creator of the
2393 * tuple tells us.
2394 */
2395 if (change->data.tp.clear_toast_afterwards)
2397 }
2398 /* we're not interested in toast deletions */
2399 else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2400 {
2401 /*
2402 * Need to reassemble the full toasted Datum in
2403 * memory, to ensure the chunks don't get reused till
2404 * we're done remove it from the list of this
2405 * transaction's changes. Otherwise it will get
2406 * freed/reused while restoring spooled data from
2407 * disk.
2408 */
2409 Assert(change->data.tp.newtuple != NULL);
2410
2411 dlist_delete(&change->node);
2412 ReorderBufferToastAppendChunk(rb, txn, relation,
2413 change);
2414 }
2415
2417
2418 /*
2419 * If speculative insertion was confirmed, the record
2420 * isn't needed anymore.
2421 */
2422 if (specinsert != NULL)
2423 {
2425 specinsert = NULL;
2426 }
2427
2428 if (RelationIsValid(relation))
2429 {
2430 RelationClose(relation);
2431 relation = NULL;
2432 }
2433 break;
2434
2436
2437 /*
2438 * Speculative insertions are dealt with by delaying the
2439 * processing of the insert until the confirmation record
2440 * arrives. For that we simply unlink the record from the
2441 * chain, so it does not get freed/reused while restoring
2442 * spooled data from disk.
2443 *
2444 * This is safe in the face of concurrent catalog changes
2445 * because the relevant relation can't be changed between
2446 * speculative insertion and confirmation due to
2447 * CheckTableNotInUse() and locking.
2448 */
2449
2450 /* Previous speculative insertion must be aborted */
2452
2453 /* and memorize the pending insertion */
2454 dlist_delete(&change->node);
2455 specinsert = change;
2456 break;
2457
2459
2460 /*
2461 * Abort for speculative insertion arrived. So cleanup the
2462 * specinsert tuple and toast hash.
2463 *
2464 * Note that we get the spec abort change for each toast
2465 * entry but we need to perform the cleanup only the first
2466 * time we get it for the main table.
2467 */
2468 if (specinsert != NULL)
2469 {
2470 /*
2471 * We must clean the toast hash before processing a
2472 * completely new tuple to avoid confusion about the
2473 * previous tuple's toast chunks.
2474 */
2477
2478 /* We don't need this record anymore. */
2480 specinsert = NULL;
2481 }
2482 break;
2483
2485 {
2486 int i;
2487 int nrelids = change->data.truncate.nrelids;
2488 int nrelations = 0;
2489 Relation *relations;
2490
2491 relations = palloc0_array(Relation, nrelids);
2492 for (i = 0; i < nrelids; i++)
2493 {
2494 Oid relid = change->data.truncate.relids[i];
2495 Relation rel;
2496
2497 rel = RelationIdGetRelation(relid);
2498
2499 if (!RelationIsValid(rel))
2500 elog(ERROR, "could not open relation with OID %u", relid);
2501
2502 if (!RelationIsLogicallyLogged(rel))
2503 continue;
2504
2505 relations[nrelations++] = rel;
2506 }
2507
2508 /* Apply the truncate. */
2510 relations, change,
2511 streaming);
2512
2513 for (i = 0; i < nrelations; i++)
2514 RelationClose(relations[i]);
2515
2516 break;
2517 }
2518
2520 ReorderBufferApplyMessage(rb, txn, change, streaming);
2521 break;
2522
2524 /* Execute the invalidation messages locally */
2526 change->data.inval.invalidations);
2527 break;
2528
2530 /* get rid of the old */
2532
2533 if (snapshot_now->copied)
2534 {
2535 ReorderBufferFreeSnap(rb, snapshot_now);
2536 snapshot_now =
2538 txn, command_id);
2539 }
2540
2541 /*
2542 * Restored from disk, need to be careful not to double
2543 * free. We could introduce refcounting for that, but for
2544 * now this seems infrequent enough not to care.
2545 */
2546 else if (change->data.snapshot->copied)
2547 {
2548 snapshot_now =
2550 txn, command_id);
2551 }
2552 else
2553 {
2554 snapshot_now = change->data.snapshot;
2555 }
2556
2557 /* and continue with the new one */
2558 SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2559 break;
2560
2563
2564 if (command_id < change->data.command_id)
2565 {
2566 command_id = change->data.command_id;
2567
2568 if (!snapshot_now->copied)
2569 {
2570 /* we don't use the global one anymore */
2571 snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2572 txn, command_id);
2573 }
2574
2575 snapshot_now->curcid = command_id;
2576
2578 SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2579 }
2580
2581 break;
2582
2584 elog(ERROR, "tuplecid value in changequeue");
2585 break;
2586 }
2587
2588 /*
2589 * It is possible that the data is not sent to downstream for a
2590 * long time either because the output plugin filtered it or there
2591 * is a DDL that generates a lot of data that is not processed by
2592 * the plugin. So, in such cases, the downstream can timeout. To
2593 * avoid that we try to send a keepalive message if required.
2594 * Trying to send a keepalive message after every change has some
2595 * overhead, but testing showed there is no noticeable overhead if
2596 * we do it after every ~100 changes.
2597 */
2598#define CHANGES_THRESHOLD 100
2599
2601 {
2602 rb->update_progress_txn(rb, txn, prev_lsn);
2603 changes_count = 0;
2604 }
2605 }
2606
2607 /* speculative insertion record must be freed by now */
2609
2610 /* clean up the iterator */
2612 iterstate = NULL;
2613
2614 /*
2615 * Update total transaction count and total bytes processed by the
2616 * transaction and its subtransactions. Ensure to not count the
2617 * streamed transaction multiple times.
2618 *
2619 * Note that the statistics computation has to be done after
2620 * ReorderBufferIterTXNFinish as it releases the serialized change
2621 * which we have already accounted in ReorderBufferIterTXNNext.
2622 */
2623 if (!rbtxn_is_streamed(txn))
2624 rb->totalTxns++;
2625
2626 rb->totalBytes += txn->total_size;
2627
2628 /*
2629 * Done with current changes, send the last message for this set of
2630 * changes depending upon streaming mode.
2631 */
2632 if (streaming)
2633 {
2634 if (stream_started)
2635 {
2636 rb->stream_stop(rb, txn, prev_lsn);
2637 stream_started = false;
2638 }
2639 }
2640 else
2641 {
2642 /*
2643 * Call either PREPARE (for two-phase transactions) or COMMIT (for
2644 * regular ones).
2645 */
2646 if (rbtxn_is_prepared(txn))
2647 {
2649 rb->prepare(rb, txn, commit_lsn);
2651 }
2652 else
2653 rb->commit(rb, txn, commit_lsn);
2654 }
2655
2656 /* this is just a sanity check against bad output plugin behaviour */
2658 elog(ERROR, "output plugin used XID %u",
2660
2661 /*
2662 * Remember the command ID and snapshot for the next set of changes in
2663 * streaming mode.
2664 */
2665 if (streaming)
2666 ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2667 else if (snapshot_now->copied)
2668 ReorderBufferFreeSnap(rb, snapshot_now);
2669
2670 /* cleanup */
2672
2673 /*
2674 * Aborting the current (sub-)transaction as a whole has the right
2675 * semantics. We want all locks acquired in here to be released, not
2676 * reassigned to the parent, and we do not want any database access to
2677 * have persistent effects.
2678 */
2680
2681 /* make sure there's no cache pollution */
2683 {
2686 }
2687 else
2688 {
2692 }
2693
2694 if (using_subtxn)
2695 {
2698 CurrentResourceOwner = cowner;
2699 }
2700
2701 /*
2702 * We are here due to one of the four reasons: 1. Decoding an
2703 * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2704 * prepared txn that was (partially) streamed. 4. Decoding a committed
2705 * txn.
2706 *
2707 * For 1, we allow truncation of txn data by removing the changes
2708 * already streamed but still keeping other things like invalidations,
2709 * snapshot, and tuplecids. For 2 and 3, we indicate
2710 * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2711 * data as the entire transaction has been decoded except for commit.
2712 * For 4, as the entire txn has been decoded, we can fully clean up
2713 * the TXN reorder buffer.
2714 */
2715 if (streaming || rbtxn_is_prepared(txn))
2716 {
2717 if (streaming)
2719
2721 /* Reset the CheckXidAlive */
2723 }
2724 else
2726 }
2727 PG_CATCH();
2728 {
2731
2732 /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2733 if (iterstate)
2735
2737
2738 /*
2739 * Force cache invalidation to happen outside of a valid transaction
2740 * to prevent catalog access as we just caught an error.
2741 */
2743
2744 /* make sure there's no cache pollution */
2746 {
2749 }
2750 else
2751 {
2755 }
2756
2757 if (using_subtxn)
2758 {
2761 CurrentResourceOwner = cowner;
2762 }
2763
2764 /*
2765 * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2766 * abort of the (sub)transaction we are streaming or preparing. We
2767 * need to do the cleanup and return gracefully on this error, see
2768 * SetupCheckXidLive.
2769 *
2770 * This error code can be thrown by one of the callbacks we call
2771 * during decoding so we need to ensure that we return gracefully only
2772 * when we are sending the data in streaming mode and the streaming is
2773 * not finished yet or when we are sending the data out on a PREPARE
2774 * during a two-phase commit.
2775 */
2776 if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2778 {
2779 /* curtxn must be set for streaming or prepared transactions */
2780 Assert(curtxn);
2781
2782 /* Cleanup the temporary error state. */
2785 errdata = NULL;
2786
2787 /* Remember the transaction is aborted. */
2789 curtxn->txn_flags |= RBTXN_IS_ABORTED;
2790
2791 /* Mark the transaction as streamed if appropriate */
2792 if (stream_started)
2794
2795 /* Reset the TXN so that it is allowed to stream remaining data. */
2796 ReorderBufferResetTXN(rb, txn, snapshot_now,
2797 command_id, prev_lsn,
2798 specinsert);
2799 }
2800 else
2801 {
2804 PG_RE_THROW();
2805 }
2806 }
2807 PG_END_TRY();
2808}
2809
2810/*
2811 * Perform the replay of a transaction and its non-aborted subtransactions.
2812 *
2813 * Subtransactions have to be processed beforehand by
2814 * ReorderBufferCommitChild(), even if they were previously assigned to the
2815 * toplevel transaction with ReorderBufferAssignChild().
2816 *
2817 * This interface is called once a prepare or toplevel commit is read for both
2818 * streamed as well as non-streamed transactions.
2819 */
2820static void
2823 XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2824 TimestampTz commit_time,
2825 ReplOriginId origin_id, XLogRecPtr origin_lsn)
2826{
2827 Snapshot snapshot_now;
2828 CommandId command_id = FirstCommandId;
2829
2830 txn->final_lsn = commit_lsn;
2831 txn->end_lsn = end_lsn;
2832 txn->commit_time = commit_time;
2833 txn->origin_id = origin_id;
2834 txn->origin_lsn = origin_lsn;
2835
2836 /*
2837 * If the transaction was (partially) streamed, we need to commit it in a
2838 * 'streamed' way. That is, we first stream the remaining part of the
2839 * transaction, and then invoke stream_commit message.
2840 *
2841 * Called after everything (origin ID, LSN, ...) is stored in the
2842 * transaction to avoid passing that information directly.
2843 */
2844 if (rbtxn_is_streamed(txn))
2845 {
2847 return;
2848 }
2849
2850 /*
2851 * If this transaction has no snapshot, it didn't make any changes to the
2852 * database, so there's nothing to decode. Note that
2853 * ReorderBufferCommitChild will have transferred any snapshots from
2854 * subtransactions if there were any.
2855 */
2856 if (txn->base_snapshot == NULL)
2857 {
2858 Assert(txn->ninvalidations == 0);
2859
2860 /*
2861 * Removing this txn before a commit might result in the computation
2862 * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2863 */
2864 if (!rbtxn_is_prepared(txn))
2866 return;
2867 }
2868
2869 snapshot_now = txn->base_snapshot;
2870
2871 /* Process and send the changes to output plugin. */
2872 ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2873 command_id, false);
2874}
2875
2876/*
2877 * Commit a transaction.
2878 *
2879 * See comments for ReorderBufferReplay().
2880 */
2881void
2883 XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2884 TimestampTz commit_time,
2885 ReplOriginId origin_id, XLogRecPtr origin_lsn)
2886{
2887 ReorderBufferTXN *txn;
2888
2889 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2890 false);
2891
2892 /* unknown transaction, nothing to replay */
2893 if (txn == NULL)
2894 return;
2895
2896 ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2897 origin_id, origin_lsn);
2898}
2899
2900/*
2901 * Record the prepare information for a transaction. Also, mark the transaction
2902 * as a prepared transaction.
2903 */
2904bool
2906 XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2907 TimestampTz prepare_time,
2908 ReplOriginId origin_id, XLogRecPtr origin_lsn)
2909{
2910 ReorderBufferTXN *txn;
2911
2912 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2913
2914 /* unknown transaction, nothing to do */
2915 if (txn == NULL)
2916 return false;
2917
2918 /*
2919 * Remember the prepare information to be later used by commit prepared in
2920 * case we skip doing prepare.
2921 */
2922 txn->final_lsn = prepare_lsn;
2923 txn->end_lsn = end_lsn;
2924 txn->prepare_time = prepare_time;
2925 txn->origin_id = origin_id;
2926 txn->origin_lsn = origin_lsn;
2927
2928 /* Mark this transaction as a prepared transaction */
2931
2932 return true;
2933}
2934
2935/* Remember that we have skipped prepare */
2936void
2938{
2939 ReorderBufferTXN *txn;
2940
2941 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2942
2943 /* unknown transaction, nothing to do */
2944 if (txn == NULL)
2945 return;
2946
2947 /* txn must have been marked as a prepared transaction */
2950}
2951
2952/*
2953 * Prepare a two-phase transaction.
2954 *
2955 * See comments for ReorderBufferReplay().
2956 */
2957void
2959 char *gid)
2960{
2961 ReorderBufferTXN *txn;
2962
2963 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2964 false);
2965
2966 /* unknown transaction, nothing to replay */
2967 if (txn == NULL)
2968 return;
2969
2970 /*
2971 * txn must have been marked as a prepared transaction and must have
2972 * neither been skipped nor sent a prepare. Also, the prepare info must
2973 * have been updated in it by now.
2974 */
2977
2978 txn->gid = pstrdup(gid);
2979
2980 ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2981 txn->prepare_time, txn->origin_id, txn->origin_lsn);
2982
2983 /*
2984 * Send a prepare if not already done so. This might occur if we have
2985 * detected a concurrent abort while replaying the non-streaming
2986 * transaction.
2987 */
2988 if (!rbtxn_sent_prepare(txn))
2989 {
2990 rb->prepare(rb, txn, txn->final_lsn);
2992 }
2993}
2994
2995/*
2996 * This is used to handle COMMIT/ROLLBACK PREPARED.
2997 */
2998void
3000 XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
3001 XLogRecPtr two_phase_at,
3002 TimestampTz commit_time, ReplOriginId origin_id,
3003 XLogRecPtr origin_lsn, char *gid, bool is_commit)
3004{
3005 ReorderBufferTXN *txn;
3006 XLogRecPtr prepare_end_lsn;
3007 TimestampTz prepare_time;
3008
3009 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
3010
3011 /* unknown transaction, nothing to do */
3012 if (txn == NULL)
3013 return;
3014
3015 /*
3016 * By this time the txn has the prepare record information, remember it to
3017 * be later used for rollback.
3018 */
3019 prepare_end_lsn = txn->end_lsn;
3020 prepare_time = txn->prepare_time;
3021
3022 /* add the gid in the txn */
3023 txn->gid = pstrdup(gid);
3024
3025 /*
3026 * It is possible that this transaction is not decoded at prepare time
3027 * either because by that time we didn't have a consistent snapshot, or
3028 * two_phase was not enabled, or it was decoded earlier but we have
3029 * restarted. We only need to send the prepare if it was not decoded
3030 * earlier. We don't need to decode the xact for aborts if it is not done
3031 * already.
3032 */
3033 if ((txn->final_lsn < two_phase_at) && is_commit)
3034 {
3035 /*
3036 * txn must have been marked as a prepared transaction and skipped but
3037 * not sent a prepare. Also, the prepare info must have been updated
3038 * in txn even if we skip prepare.
3039 */
3043
3044 /*
3045 * By this time the txn has the prepare record information and it is
3046 * important to use that so that downstream gets the accurate
3047 * information. If we passed the commit information here instead,
3048 * the downstream could behave as if it had already replayed the
3049 * commit prepared after the restart.
3050 */
3051 ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
3052 txn->prepare_time, txn->origin_id, txn->origin_lsn);
3053 }
3054
3055 txn->final_lsn = commit_lsn;
3056 txn->end_lsn = end_lsn;
3057 txn->commit_time = commit_time;
3058 txn->origin_id = origin_id;
3059 txn->origin_lsn = origin_lsn;
3060
3061 if (is_commit)
3062 rb->commit_prepared(rb, txn, commit_lsn);
3063 else
3064 rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
3065
3066 /* cleanup: make sure there's no cache pollution */
3068 txn->invalidations);
3070}
3071
3072/*
3073 * Abort a transaction that possibly has previous changes. Needs to be first
3074 * called for subtransactions and then for the toplevel xid.
3075 *
3076 * NB: Transactions handled here have to have actively aborted (i.e. have
3077 * produced an abort record). Implicitly aborted transactions are handled via
3078 * ReorderBufferAbortOld(); transactions we're just not interested in, but
3079 * which have committed are handled in ReorderBufferForget().
3080 *
3081 * This function purges this transaction and its contents from memory and
3082 * disk.
3083 */
3084void
3086 TimestampTz abort_time)
3087{
3088 ReorderBufferTXN *txn;
3089
3090 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3091 false);
3092
3093 /* unknown, nothing to remove */
3094 if (txn == NULL)
3095 return;
3096
3097 txn->abort_time = abort_time;
3098
3099 /* For streamed transactions notify the remote node about the abort. */
3100 if (rbtxn_is_streamed(txn))
3101 {
3102 rb->stream_abort(rb, txn, lsn);
3103
3104 /*
3105 * We might have decoded changes for this transaction that could load
3106 * the cache as per the current transaction's view (consider DDLs
3107 * that happened in this transaction). We don't want the decoding of future
3108 * transactions to use those cache entries so execute only the inval
3109 * messages in this transaction.
3110 */
3111 if (txn->ninvalidations > 0)
3113 txn->invalidations);
3114 }
3115
3116 /* cosmetic... */
3117 txn->final_lsn = lsn;
3118
3119 /* remove potential on-disk data, and deallocate */
3121}
3122
3123/*
3124 * Abort all transactions that aren't actually running anymore because the
3125 * server restarted.
3126 *
3127 * NB: These really have to be transactions that have aborted due to a server
3128 * crash/immediate restart, as we don't deal with invalidations here.
3129 */
3130void
3132{
3134
3135 /*
3136 * Iterate through all (potential) toplevel TXNs and abort all that are
3137 * older than what possibly can be running. Once we've found the first
3138 * that is alive we stop, there might be some that acquired an xid earlier
3139 * but started writing later, but it's unlikely and they will be cleaned
3140 * up in a later call to this function.
3141 */
3142 dlist_foreach_modify(it, &rb->toplevel_by_lsn)
3143 {
3144 ReorderBufferTXN *txn;
3145
3146 txn = dlist_container(ReorderBufferTXN, node, it.cur);
3147
3148 if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
3149 {
3150 elog(DEBUG2, "aborting old transaction %u", txn->xid);
3151
3152 /* Notify the remote node about the crash/immediate restart. */
3153 if (rbtxn_is_streamed(txn))
3154 rb->stream_abort(rb, txn, InvalidXLogRecPtr);
3155
3156 /* remove potential on-disk data, and deallocate this tx */
3158 }
3159 else
3160 return;
3161 }
3162}
3163
3164/*
3165 * Forget the contents of a transaction if we aren't interested in its
3166 * contents. Needs to be first called for subtransactions and then for the
3167 * toplevel xid.
3168 *
3169 * This is significantly different to ReorderBufferAbort() because
3170 * transactions that have committed need to be treated differently from aborted
3171 * ones since they may have modified the catalog.
3172 *
3173 * Note that this is only allowed to be called in the moment a transaction
3174 * commit has just been read, not earlier; otherwise later records referring
3175 * to this xid might re-create the transaction incompletely.
3176 */
3177void
3179{
3180 ReorderBufferTXN *txn;
3181
3182 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3183 false);
3184
3185 /* unknown, nothing to forget */
3186 if (txn == NULL)
3187 return;
3188
3189 /* this transaction mustn't be streamed */
3191
3192 /* cosmetic... */
3193 txn->final_lsn = lsn;
3194
3195 /*
3196 * Process only cache invalidation messages in this transaction if there
3197 * are any. Even if we're not interested in the transaction's contents, it
3198 * could have manipulated the catalog and we need to update the caches
3199 * according to that.
3200 */
3201 if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3203 txn->invalidations);
3204 else
3205 Assert(txn->ninvalidations == 0);
3206
3207 /* remove potential on-disk data, and deallocate */
3209}
3210
3211/*
3212 * Invalidate cache for those transactions that need to be skipped just in case
3213 * catalogs were manipulated as part of the transaction.
3214 *
3215 * Note that this is a special-purpose function for prepared transactions where
3216 * we don't want to clean up the TXN even when we decide to skip it. See
3217 * DecodePrepare.
3218 */
3219void
3221{
3222 ReorderBufferTXN *txn;
3223
3224 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3225 false);
3226
3227 /* unknown, nothing to do */
3228 if (txn == NULL)
3229 return;
3230
3231 /*
3232 * Process cache invalidation messages if there are any. Even if we're not
3233 * interested in the transaction's contents, it could have manipulated the
3234 * catalog and we need to update the caches according to that.
3235 */
3236 if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3238 txn->invalidations);
3239 else
3240 Assert(txn->ninvalidations == 0);
3241}
3242
3243
3244/*
3245 * Execute invalidations happening outside the context of a decoded
3246 * transaction. That currently happens either for xid-less commits
3247 * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3248 * transactions (via ReorderBufferForget()).
3249 */
3250void
3252 SharedInvalidationMessage *invalidations)
3253{
3257 int i;
3258
3259 if (use_subtxn)
3261
3262 /*
3263 * Force invalidations to happen outside of a valid transaction - that way
3264 * entries will just be marked as invalid without accessing the catalog.
3265 * That's advantageous because we don't need to setup the full state
3266 * necessary for catalog access.
3267 */
3268 if (use_subtxn)
3270
3271 for (i = 0; i < ninvalidations; i++)
3272 LocalExecuteInvalidationMessage(&invalidations[i]);
3273
3274 if (use_subtxn)
3275 {
3278 CurrentResourceOwner = cowner;
3279 }
3280}
3281
3282/*
3283 * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3284 * least once for every xid in XLogRecord->xl_xid (other places in records
3285 * may, but do not have to be passed through here).
3286 *
3287 * Reorderbuffer keeps some data structures about transactions in LSN order,
3288 * for efficiency. To do that it has to know about when transactions are seen
3289 * first in the WAL. As many types of records are not actually interesting for
3290 * logical decoding, they do not necessarily pass through here.
3291 */
3292void
3294{
3295 /* many records won't have an xid assigned, centralize check here */
3296 if (xid != InvalidTransactionId)
3297 ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3298}
3299
3300/*
3301 * Add a new snapshot to this transaction that may only be used after lsn 'lsn'
3302 * because the previous snapshot doesn't describe the catalog correctly for
3303 * following rows.
3304 */
3305void
3316
3317/*
3318 * Set up the transaction's base snapshot.
3319 *
3320 * If we know that xid is a subtransaction, set the base snapshot on the
3321 * top-level transaction instead.
3322 */
3323void
3326{
3327 ReorderBufferTXN *txn;
3328 bool is_new;
3329
3330 Assert(snap != NULL);
3331
3332 /*
3333 * Fetch the transaction to operate on. If we know it's a subtransaction,
3334 * operate on its top-level transaction instead.
3335 */
3336 txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3337 if (rbtxn_is_known_subxact(txn))
3338 txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3339 NULL, InvalidXLogRecPtr, false);
3340 Assert(txn->base_snapshot == NULL);
3341
3342 txn->base_snapshot = snap;
3343 txn->base_snapshot_lsn = lsn;
3344 dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3345
3347}
3348
3349/*
3350 * Access the catalog with this CommandId at this point in the changestream.
3351 *
3352 * May only be called for command ids > 1
3353 */
3354void
3365
3366/*
3367 * Update memory counters to account for the new or removed change.
3368 *
3369 * We update two counters - in the reorder buffer, and in the transaction
3370 * containing the change. The reorder buffer counter allows us to quickly
3371 * decide if we reached the memory limit, the transaction counter allows
3372 * us to quickly pick the largest transaction for eviction.
3373 *
3374 * At least one of txn or change must be non-NULL. We update the memory
3375 * counter of txn if it's non-NULL, otherwise change->txn.
3376 *
3377 * When streaming is enabled, we need to update the toplevel transaction
3378 * counters instead - we don't really care about subtransactions as we
3379 * can't stream them individually anyway, and we only pick toplevel
3380 * transactions for eviction. So only toplevel transactions matter.
3381 */
3382static void
3384 ReorderBufferChange *change,
3385 ReorderBufferTXN *txn,
3386 bool addition, Size sz)
3387{
3388 ReorderBufferTXN *toptxn;
3389
3390 Assert(txn || change);
3391
3392 /*
3393 * Ignore tuple CID changes, because those are not evicted when reaching
3394 * memory limit. So we just don't count them, because it might easily
3395 * trigger a pointless attempt to spill.
3396 */
3397 if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3398 return;
3399
3400 if (sz == 0)
3401 return;
3402
3403 if (txn == NULL)
3404 txn = change->txn;
3405 Assert(txn != NULL);
3406
3407 /*
3408 * Update the total size in top level as well. This is later used to
3409 * compute the decoding stats.
3410 */
3411 toptxn = rbtxn_get_toptxn(txn);
3412
3413 if (addition)
3414 {
3415 Size oldsize = txn->size;
3416
3417 txn->size += sz;
3418 rb->size += sz;
3419
3420 /* Update the total size in the top transaction. */
3421 toptxn->total_size += sz;
3422
3423 /* Update the max-heap */
3424 if (oldsize != 0)
3425 pairingheap_remove(rb->txn_heap, &txn->txn_node);
3426 pairingheap_add(rb->txn_heap, &txn->txn_node);
3427 }
3428 else
3429 {
3430 Assert((rb->size >= sz) && (txn->size >= sz));
3431 txn->size -= sz;
3432 rb->size -= sz;
3433
3434 /* Update the total size in the top transaction. */
3435 toptxn->total_size -= sz;
3436
3437 /* Update the max-heap */
3438 pairingheap_remove(rb->txn_heap, &txn->txn_node);
3439 if (txn->size != 0)
3440 pairingheap_add(rb->txn_heap, &txn->txn_node);
3441 }
3442
3443 Assert(txn->size <= rb->size);
3444}
3445
3446/*
3447 * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3448 *
3449 * We do not include this change type in memory accounting, because we
3450 * keep CIDs in a separate list and do not evict them when reaching
3451 * the memory limit.
3452 */
3453void
3455 XLogRecPtr lsn, RelFileLocator locator,
3456 ItemPointerData tid, CommandId cmin,
3457 CommandId cmax, CommandId combocid)
3458{
3460 ReorderBufferTXN *txn;
3461
3462 txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3463
3464 change->data.tuplecid.locator = locator;
3465 change->data.tuplecid.tid = tid;
3466 change->data.tuplecid.cmin = cmin;
3467 change->data.tuplecid.cmax = cmax;
3468 change->data.tuplecid.combocid = combocid;
3469 change->lsn = lsn;
3470 change->txn = txn;
3472
3473 dlist_push_tail(&txn->tuplecids, &change->node);
3474 txn->ntuplecids++;
3475}
3476
3477/*
3478 * Add new invalidation messages to the reorder buffer queue.
3479 */
3480static void
3482 XLogRecPtr lsn, Size nmsgs,
3484{
3485 ReorderBufferChange *change;
3486
3487 change = ReorderBufferAllocChange(rb);
3489 change->data.inval.ninvalidations = nmsgs;
3491 memcpy(change->data.inval.invalidations, msgs,
3492 sizeof(SharedInvalidationMessage) * nmsgs);
3493
3494 ReorderBufferQueueChange(rb, xid, lsn, change, false);
3495}
3496
3497/*
3498 * A helper function for ReorderBufferAddInvalidations() and
3499 * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation
3500 * messages to the **invals_out.
3501 */
3502static void
3525
3526/*
3527 * Accumulate the invalidations for executing them later.
3528 *
3529 * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3530 * accumulates all the invalidation messages in the toplevel transaction, if
3531 * available, otherwise in the current transaction, as well as in the form of
3532 * a change in the reorder buffer. We need to record them as a change
3533 * so that we can execute only the required invalidations instead of executing
3534 * all the invalidations on each CommandId increment. We also need to
3535 * accumulate these in the txn buffer because in some cases where we skip
3536 * processing the transaction (see ReorderBufferForget), we need to execute
3537 * all the invalidations together.
3538 */
3539void
3541 XLogRecPtr lsn, Size nmsgs,
3543{
3544 ReorderBufferTXN *txn;
3545 MemoryContext oldcontext;
3546
3547 txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3548
3549 oldcontext = MemoryContextSwitchTo(rb->context);
3550
3551 /*
3552 * Collect all the invalidations under the top transaction, if available,
3553 * so that we can execute them all together. See comments atop this
3554 * function.
3555 */
3556 txn = rbtxn_get_toptxn(txn);
3557
3558 Assert(nmsgs > 0);
3559
3561 &txn->ninvalidations,
3562 msgs, nmsgs);
3563
3564 ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3565
3566 MemoryContextSwitchTo(oldcontext);
3567}
3568
3569/*
3570 * Accumulate the invalidations distributed by other committed transactions
3571 * for executing them later.
3572 *
3573 * This function is similar to ReorderBufferAddInvalidations() but stores
3574 * the given inval messages to the txn->invalidations_distributed with the
3575 * overflow check.
3576 *
3577 * This needs to be called by committed transactions to distribute their
3578 * inval messages to in-progress transactions.
3579 */
3580void
3582 XLogRecPtr lsn, Size nmsgs,
3584{
3585 ReorderBufferTXN *txn;
3586 MemoryContext oldcontext;
3587
3588 txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3589
3590 oldcontext = MemoryContextSwitchTo(rb->context);
3591
3592 /*
3593 * Collect all the invalidations under the top transaction, if available,
3594 * so that we can execute them all together. See the comments atop
3595 * ReorderBufferAddInvalidations().
3596 */
3597 txn = rbtxn_get_toptxn(txn);
3598
3599 Assert(nmsgs > 0);
3600
3602 {
3603 /*
3604 * Check the transaction has enough space for storing distributed
3605 * invalidation messages.
3606 */
3608 {
3609 /*
3610 * Mark the invalidation message as overflowed and free up the
3611 * messages accumulated so far.
3612 */
3614
3616 {
3620 }
3621 }
3622 else
3625 msgs, nmsgs);
3626 }
3627
3628 /* Queue the invalidation messages into the transaction */
3629 ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3630
3631 MemoryContextSwitchTo(oldcontext);
3632}
3633
3634/*
3635 * Apply all invalidations we know. Possibly we only need parts at this point
3636 * in the changestream but we don't know which those are.
3637 */
3638static void
3640{
3641 int i;
3642
3643 for (i = 0; i < nmsgs; i++)
3645}
3646
3647/*
3648 * Mark a transaction as containing catalog changes
3649 */
3650void
3652 XLogRecPtr lsn)
3653{
3654 ReorderBufferTXN *txn;
3655
3656 txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3657
3658 if (!rbtxn_has_catalog_changes(txn))
3659 {
3661 dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3662 }
3663
3664 /*
3665 * Mark top-level transaction as having catalog changes too if one of its
3666 * children has so that the ReorderBufferBuildTupleCidHash can
3667 * conveniently check just top-level transaction and decide whether to
3668 * build the hash table or not.
3669 */
3670 if (rbtxn_is_subtxn(txn))
3671 {
3672 ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3673
3674 if (!rbtxn_has_catalog_changes(toptxn))
3675 {
3677 dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3678 }
3679 }
3680}
3681
3682/*
3683 * Return palloc'ed array of the transactions that have changed catalogs.
3684 * The returned array is sorted in xidComparator order.
3685 *
3686 * The caller must free the returned array when done with it.
3687 */
3690{
3691 dlist_iter iter;
3692 TransactionId *xids = NULL;
3693 size_t xcnt = 0;
3694
3695 /* Quick return if the list is empty */
3696 if (dclist_count(&rb->catchange_txns) == 0)
3697 return NULL;
3698
3699 /* Initialize XID array */
3700 xids = palloc_array(TransactionId, dclist_count(&rb->catchange_txns));
3701 dclist_foreach(iter, &rb->catchange_txns)
3702 {
3704 catchange_node,
3705 iter.cur);
3706
3708
3709 xids[xcnt++] = txn->xid;
3710 }
3711
3712 qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3713
3714 Assert(xcnt == dclist_count(&rb->catchange_txns));
3715 return xids;
3716}
3717
3718/*
3719 * Query whether a transaction is already *known* to contain catalog
3720 * changes. This can be wrong until directly before the commit!
3721 */
3722bool
3724{
3725 ReorderBufferTXN *txn;
3726
3727 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3728 false);
3729 if (txn == NULL)
3730 return false;
3731
3732 return rbtxn_has_catalog_changes(txn);
3733}
3734
3735/*
3736 * ReorderBufferXidHasBaseSnapshot
3737 * Have we already set the base snapshot for the given txn/subtxn?
 *
 * Returns false if no entry exists for xid. For a known subtransaction the
 * answer is taken from the entry looked up via its toplevel_xid.
3738 */
3739bool
3741{
3742 ReorderBufferTXN *txn;
3743
3744 txn = ReorderBufferTXNByXid(rb, xid, false,
3745 NULL, InvalidXLogRecPtr, false);
3746
3747 /* transaction isn't known yet, ergo no snapshot */
3748 if (txn == NULL)
3749 return false;
3750
3751 /* a known subtxn? operate on top-level txn instead */
 /*
  * NOTE(review): the result of this second lookup is dereferenced below
  * without a NULL check, i.e. the top-level entry of a known subxact is
  * assumed to exist - confirm.
  */
3752 if (rbtxn_is_known_subxact(txn))
3753 txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3754 NULL, InvalidXLogRecPtr, false);
3755
3756 return txn->base_snapshot != NULL;
3757}
3758
3759
3760/*
3761 * ---------------------------------------
3762 * Disk serialization support
3763 * ---------------------------------------
3764 */
3765
3766/*
3767 * Ensure the IO buffer is >= sz.
 *
 * rb->outbuf is allocated in rb->context on first use (outbufsize == 0) and
 * enlarged with repalloc() when it is smaller than sz. It is never shrunk
 * here. Since repalloc() may move the buffer, callers must re-read
 * rb->outbuf after calling this.
3768 */
3769static void
3771{
3772 if (!rb->outbufsize)
3773 {
 /* no buffer yet: allocate exactly sz bytes */
3774 rb->outbuf = MemoryContextAlloc(rb->context, sz);
3775 rb->outbufsize = sz;
3776 }
3777 else if (rb->outbufsize < sz)
3778 {
 /* existing buffer too small: grow it to exactly sz bytes */
3779 rb->outbuf = repalloc(rb->outbuf, sz);
3780 rb->outbufsize = sz;
3781 }
3782}
3783
3784
3785/*
 * Compare two transactions by size.
 *
 * Returns -1, 1 or 0 when ta->size is smaller than, larger than or equal to
 * tb->size, respectively.
 */
3786static int
3788{
3791
3792 if (ta->size < tb->size)
3793 return -1;
3794 if (ta->size > tb->size)
3795 return 1;
3796 return 0;
3797}
3798
3799/*
3800 * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
 *
 * The candidate is the first element of rb->txn_heap. The assertions below
 * require that one exists, that it holds some memory, and that it does not
 * account for more than the buffer's total (rb->size).
3801 */
3802static ReorderBufferTXN *
3804{
3806
3807 /* Get the largest transaction from the max-heap */
3809 pairingheap_first(rb->txn_heap));
3810
3811 Assert(largest);
3812 Assert(largest->size > 0);
3813 Assert(largest->size <= rb->size);
3814
3815 return largest;
3816}
3817
3818/*
3819 * Find the largest streamable (and non-aborted) toplevel transaction to evict
3820 * (by streaming).
3821 *
3822 * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3823 * should give us the same transaction (because we don't update memory account
3824 * for subtransaction with streaming, so it's always 0). But we can simply
3825 * iterate over the limited number of toplevel transactions that have a base
3826 * snapshot. There is no use of selecting a transaction that doesn't have base
3827 * snapshot because we don't decode such transactions. Also, we do not select
3828 * the transaction which doesn't have any streamable change.
3829 *
3830 * Note that, we skip transactions that contain incomplete changes. There
3831 * is a scope of optimization here such that we can select the largest
3832 * transaction which has incomplete changes. But that will make the code and
3833 * design quite complex and that might not be worth the benefit. If we plan to
3834 * stream the transactions that contain incomplete changes then we need to
3835 * find a way to partially stream/truncate the transaction changes in-memory
3836 * and build a mechanism to partially truncate the spilled files.
3837 * Additionally, whenever we partially stream the transaction we need to
3838 * maintain the last streamed lsn and next time we need to restore from that
3839 * segment and the offset in WAL. As we stream the changes from the top
3840 * transaction and restore them subtransaction wise, we need to even remember
3841 * the subxact from where we streamed the last change.
 *
 * Candidates are compared by txn->total_size; a transaction whose
 * total_size is 0 is never chosen. Whatever remains in "largest" after the
 * scan is returned.
3842 */
3843static ReorderBufferTXN *
3845{
3846 dlist_iter iter;
3847 Size largest_size = 0;
3849
3850 /* Find the largest top-level transaction having a base snapshot. */
3851 dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3852 {
3853 ReorderBufferTXN *txn;
3854
3855 txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3856
3857 /* must not be a subtxn */
3859 /* base_snapshot must be set */
3860 Assert(txn->base_snapshot != NULL);
3861
3862 /* Don't consider these kinds of transactions for eviction. */
3863 if (rbtxn_has_partial_change(txn) ||
3865 rbtxn_is_aborted(txn))
3866 continue;
3867
3868 /* Find the largest of the eviction candidates. */
 /* strictly larger only: among equal sizes the first one seen wins */
3869 if ((largest == NULL || txn->total_size > largest_size) &&
3870 (txn->total_size > 0))
3871 {
3872 largest = txn;
3873 largest_size = txn->total_size;
3874 }
3875 }
3876
3877 return largest;
3878}
3879
3880/*
3881 * Check whether the logical_decoding_work_mem limit was reached, and if yes
3882 * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3883 * disk or send to the output plugin until we reach under the memory limit.
3884 *
3885 * If debug_logical_replication_streaming is set to "immediate", stream or
3886 * serialize the changes immediately.
3887 *
 * The limit compared against rb->size is logical_decoding_work_mem * 1024.
 * rb->memExceededCount is incremented once per call that finds the limit
 * reached, regardless of how many transactions are then evicted.
 *
3888 * XXX At this point we select the transactions until we reach under the memory
3889 * limit, but we might also adopt a more elaborate eviction strategy - for example
3890 * evicting enough transactions to free a certain fraction (e.g. 50%) of the memory
3891 * limit.
3892 */
3893static void
3895{
3896 ReorderBufferTXN *txn;
 /* stays true only if the loop below evicts nothing */
3897 bool update_stats = true;
3898
3899 if (rb->size >= logical_decoding_work_mem * (Size) 1024)
3900 {
3901 /*
3902 * Update the statistics as the memory usage has reached the limit. We
3903 * report the statistics update later in this function since we can
3904 * update the slot statistics altogether while streaming or
3905 * serializing transactions in most cases.
3906 */
3907 rb->memExceededCount += 1;
3908 }
3910 {
3911 /*
3912 * Bail out if debug_logical_replication_streaming is buffered and we
3913 * haven't exceeded the memory limit.
3914 */
3915 return;
3916 }
3917
3918 /*
3919 * If debug_logical_replication_streaming is immediate, loop until there's
3920 * no change. Otherwise, loop until we reach under the memory limit. One
3921 * might think that just by evicting the largest (sub)transaction we will
3922 * come under the memory limit based on assumption that the selected
3923 * transaction is at least as large as the most recent change (which
3924 * caused us to go over the memory limit). However, that is not true
3925 * because a user can reduce the logical_decoding_work_mem to a smaller
3926 * value before the most recent change.
3927 */
3928 while (rb->size >= logical_decoding_work_mem * (Size) 1024 ||
3930 rb->size > 0))
3931 {
3932 /*
3933 * Pick the largest non-aborted transaction and evict it from memory
3934 * by streaming, if possible. Otherwise, spill to disk.
3935 */
3938 {
3939 /* we know there has to be one, because the size is not zero */
3940 Assert(txn && rbtxn_is_toptxn(txn));
3941 Assert(txn->total_size > 0);
3942 Assert(rb->size >= txn->total_size);
3943
3944 /* skip the transaction if aborted */
3946 continue;
3947
3949 }
3950 else
3951 {
3952 /*
3953 * Pick the largest transaction (or subtransaction) and evict it
3954 * from memory by serializing it to disk.
3955 */
3957
3958 /* we know there has to be one, because the size is not zero */
3959 Assert(txn);
3960 Assert(txn->size > 0);
3961 Assert(rb->size >= txn->size);
3962
3963 /* skip the transaction if aborted */
3965 continue;
3966
3968 }
3969
3970 /*
3971 * After eviction, the transaction should have no entries in memory,
3972 * and should use 0 bytes for changes.
3973 */
3974 Assert(txn->size == 0);
3975 Assert(txn->nentries_mem == 0);
3976
3977 /*
3978 * We've reported the memExceededCount update while streaming or
3979 * serializing the transaction.
3980 */
3981 update_stats = false;
3982 }
3983
 /* nothing was evicted above, so the statistics still need reporting */
3984 if (update_stats)
3986
3987 /* We must be under the memory limit now. */
3988 Assert(rb->size < logical_decoding_work_mem * (Size) 1024);
3989}
3990
3991/*
3992 * Spill data of a large transaction (and its subtransactions) to disk.
 *
 * Each in-memory change is written with ReorderBufferSerializeChange(),
 * unlinked from its list and freed. Afterwards txn->nentries_mem is reset to
 * 0 and the memory accounting is reduced by the transaction's former size.
 * The spill counters (spillCount, spillBytes, spillTxns) are only touched if
 * at least one change was written.
3993 */
3994static void
3996{
 /* -1 means that no spill file is currently open */
3999 int fd = -1;
 /* number of changes written by this call */
4001 Size spilled = 0;
 /* captured up front; used for accounting after the changes are freed */
4002 Size size = txn->size;
4003
4004 elog(DEBUG2, "spill %u changes in XID %u to disk",
4005 (uint32) txn->nentries_mem, txn->xid);
4006
4007 /* do the same to all child TXs */
4009 {
4011
4014 }
4015
4016 /* serialize changestream */
4018 {
4019 ReorderBufferChange *change;
4020
4021 change = dlist_container(ReorderBufferChange, node, change_i.cur);
4022
4023 /*
4024 * store in segment in which it belongs by start lsn, don't split over
4025 * multiple segments though
4026 */
4027 if (fd == -1 ||
4029 {
4030 char path[MAXPGPATH];
4031
4032 if (fd != -1)
4034
4036
4037 /*
4038 * No need to care about TLIs here, only used during a single run,
4039 * so each LSN only maps to a specific WAL record.
4040 */
4042 curOpenSegNo);
4043
4044 /* open segment, create it if necessary */
4045 fd = OpenTransientFile(path,
4047
4048 if (fd < 0)
4049 ereport(ERROR,
4051 errmsg("could not open file \"%s\": %m", path)));
4052 }
4053
 /* write the change out, then drop its in-memory copy */
4054 ReorderBufferSerializeChange(rb, txn, fd, change);
4055 dlist_delete(&change->node);
4056 ReorderBufferFreeChange(rb, change, false);
4057
4058 spilled++;
4059 }
4060
4061 /* Update the memory counter */
4062 ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
4063
4064 /* update the statistics iff we have spilled anything */
4065 if (spilled)
4066 {
4067 rb->spillCount += 1;
4068 rb->spillBytes += size;
4069
4070 /* don't consider already serialized transactions */
4071 rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
4072
4073 /* update the decoding stats */
4075 }
4076
 /* every in-memory entry must have been written out */
4077 Assert(spilled == txn->nentries_mem);
4079 txn->nentries_mem = 0;
4081
4082 if (fd != -1)
4084}
4085
4086/*
4087 * Serialize individual change to disk.
 *
 * The record is assembled in rb->outbuf: a ReorderBufferDiskChange header
 * carrying a copy of the ReorderBufferChange struct, followed by the
 * action-specific payload. ondisk->size holds the total record length, and
 * the whole record is written to fd with one write() call. A short or
 * failed write raises ERROR. On success txn->final_lsn is advanced to
 * change->lsn if that is larger.
4088 */
4089static void
4091 int fd, ReorderBufferChange *change)
4092{
4095
4097
4098 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
 /* copy the fixed-size struct; pointer members are fixed up on restore */
4099 memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
4100
4101 switch (change->action)
4102 {
4103 /* fall through these, they're all similar enough */
4108 {
 /*
  * Tuple payload: for the old and then the new tuple (each only if
  * present) a HeapTupleData struct followed by t_len bytes of data.
  */
4109 char *data;
4111 newtup;
4112 Size oldlen = 0;
4113 Size newlen = 0;
4114
4115 oldtup = change->data.tp.oldtuple;
4116 newtup = change->data.tp.newtuple;
4117
4118 if (oldtup)
4119 {
4120 sz += sizeof(HeapTupleData);
4121 oldlen = oldtup->t_len;
4122 sz += oldlen;
4123 }
4124
4125 if (newtup)
4126 {
4127 sz += sizeof(HeapTupleData);
4128 newlen = newtup->t_len;
4129 sz += newlen;
4130 }
4131
4132 /* make sure we have enough space */
4134
4135 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4136 /* might have been reallocated above */
4137 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4138
4139 if (oldlen)
4140 {
4141 memcpy(data, oldtup, sizeof(HeapTupleData));
4142 data += sizeof(HeapTupleData);
4143
4144 memcpy(data, oldtup->t_data, oldlen);
4145 data += oldlen;
4146 }
4147
4148 if (newlen)
4149 {
4150 memcpy(data, newtup, sizeof(HeapTupleData));
4151 data += sizeof(HeapTupleData);
4152
4153 memcpy(data, newtup->t_data, newlen);
4154 data += newlen;
4155 }
4156 break;
4157 }
4159 {
 /*
  * Message payload: Size prefix_size, the prefix including its
  * terminating NUL, Size message_size, then the message bytes.
  */
4160 char *data;
4161 Size prefix_size = strlen(change->data.msg.prefix) + 1;
4162
4163 sz += prefix_size + change->data.msg.message_size +
4164 sizeof(Size) + sizeof(Size);
4166
4167 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4168
4169 /* might have been reallocated above */
4170 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4171
4172 /* write the prefix including the size */
4173 memcpy(data, &prefix_size, sizeof(Size));
4174 data += sizeof(Size);
4175 memcpy(data, change->data.msg.prefix,
4176 prefix_size);
4177 data += prefix_size;
4178
4179 /* write the message including the size */
4180 memcpy(data, &change->data.msg.message_size, sizeof(Size));
4181 data += sizeof(Size);
4182 memcpy(data, change->data.msg.message,
4183 change->data.msg.message_size);
4184 data += change->data.msg.message_size;
4185
4186 break;
4187 }
4189 {
 /* invalidation payload: inval_size bytes appended after the header */
4190 char *data;
4192 change->data.inval.ninvalidations;
4193
4194 sz += inval_size;
4195
4197 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4198
4199 /* might have been reallocated above */
4200 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4202 data += inval_size;
4203
4204 break;
4205 }
4207 {
 /*
  * Snapshot payload: the SnapshotData struct, then the xip array
  * (xcnt entries), then the subxip array (subxcnt entries).
  */
4208 Snapshot snap;
4209 char *data;
4210
4211 snap = change->data.snapshot;
4212
4213 sz += sizeof(SnapshotData) +
4214 sizeof(TransactionId) * snap->xcnt +
4215 sizeof(TransactionId) * snap->subxcnt;
4216
4217 /* make sure we have enough space */
4219 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4220 /* might have been reallocated above */
4221 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4222
4223 memcpy(data, snap, sizeof(SnapshotData));
4224 data += sizeof(SnapshotData);
4225
4226 if (snap->xcnt)
4227 {
4228 memcpy(data, snap->xip,
4229 sizeof(TransactionId) * snap->xcnt);
4230 data += sizeof(TransactionId) * snap->xcnt;
4231 }
4232
4233 if (snap->subxcnt)
4234 {
4235 memcpy(data, snap->subxip,
4236 sizeof(TransactionId) * snap->subxcnt);
4237 data += sizeof(TransactionId) * snap->subxcnt;
4238 }
4239 break;
4240 }
4242 {
 /* truncate payload: the array of nrelids relation OIDs */
4243 Size size;
4244 char *data;
4245
4246 /* account for the OIDs of truncated relations */
4247 size = sizeof(Oid) * change->data.truncate.nrelids;
4248 sz += size;
4249
4250 /* make sure we have enough space */
4252
4253 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4254 /* might have been reallocated above */
4255 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4256
4257 memcpy(data, change->data.truncate.relids, size);
4258 data += size;
4259
4260 break;
4261 }
4266 /* ReorderBufferChange contains everything important */
4267 break;
4268 }
4269
 /* record the total length so the reader knows how much to fetch */
4270 ondisk->size = sz;
4271
 /* cleared so that a short write which leaves errno unset is detectable */
4272 errno = 0;
4274 if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
4275 {
 /* preserve errno across the cleanup done before reporting */
4276 int save_errno = errno;
4277
4279
4280 /* if write didn't set errno, assume problem is no disk space */
4282 ereport(ERROR,
4284 errmsg("could not write to data file for XID %u: %m",
4285 txn->xid)));
4286 }
4288
4289 /*
4290 * Keep the transaction's final_lsn up to date with each change we send to
4291 * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
4292 * only do this on commit and abort records, but that doesn't work if a
4293 * system crash leaves a transaction without its abort record).
4294 *
4295 * Make sure not to move it backwards.
4296 */
4297 if (txn->final_lsn < change->lsn)
4298 txn->final_lsn = change->lsn;
4299
4300 Assert(ondisk->change.action == change->action);
4301}
4302
4303/*
 * Returns true, if the output plugin supports streaming, false, otherwise.
 *
 * The answer is the "streaming" flag of the LogicalDecodingContext stored
 * in rb->private_data.
 */
4304static inline bool
4306{
4307 LogicalDecodingContext *ctx = rb->private_data;
4308
4309 return ctx->streaming;
4310}
4311
4312/*
 * Returns true, if the streaming can be started now, false, otherwise.
 *
 * Consults the snapshot builder of the LogicalDecodingContext stored in
 * rb->private_data, together with the reader's current record position
 * (ctx->reader->ReadRecPtr).
 */
4313static inline bool
4315{
4316 LogicalDecodingContext *ctx = rb->private_data;
4317 SnapBuild *builder = ctx->snapshot_builder;
4318
4319 /* We can't start streaming unless a consistent state is reached. */
4321 return false;
4322
4323 /*
4324 * We can't start streaming immediately even if the streaming is enabled
4325 * because we previously decoded this transaction and now just are
4326 * restarting.
4327 */
4329 !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
4330 return true;
4331
4332 return false;
4333}
4334
4335/*
4336 * Send data of a large transaction (and its subtransactions) to the
4337 * output plugin, but using the stream API.
 *
 * Must be called for a top-level transaction only (asserted). Returns
 * without doing anything if this is the first streaming attempt and the
 * transaction has no base snapshot. Otherwise the changes are handed to
 * ReorderBufferProcessTXN() and the stream counters (streamCount,
 * streamBytes, streamTxns) are updated afterwards.
4338 */
4339static void
4341{
4342 Snapshot snapshot_now;
4343 CommandId command_id;
4344 Size stream_bytes;
4345 bool txn_is_streamed;
4346
4347 /* We can never reach here for a subtransaction. */
4348 Assert(rbtxn_is_toptxn(txn));
4349
4350 /*
4351 * We can't make any assumptions about base snapshot here, similar to what
4352 * ReorderBufferCommit() does. That relies on base_snapshot getting
4353 * transferred from subxact in ReorderBufferCommitChild(), but that was
4354 * not yet called as the transaction is in-progress.
4355 *
4356 * So just walk the subxacts and use the same logic here. But we only need
4357 * to do that once, when the transaction is streamed for the first time.
4358 * After that we need to reuse the snapshot from the previous run.
4359 *
4360 * Unlike DecodeCommit which adds xids of all the subtransactions in
4361 * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4362 * we do add them to subxip array instead via ReorderBufferCopySnap. This
4363 * allows the catalog changes made in subtransactions decoded till now to
4364 * be visible.
4365 */
4366 if (txn->snapshot_now == NULL)
4367 {
4369
4370 /* make sure this transaction is streamed for the first time */
4372
4373 /* at the beginning we should have invalid command ID */
4375
4377 {
4379
4382 }
4383
4384 /*
4385 * If this transaction has no snapshot, it didn't make any changes to
4386 * the database till now, so there's nothing to decode.
4387 */
4388 if (txn->base_snapshot == NULL)
4389 {
4390 Assert(txn->ninvalidations == 0);
4391 return;
4392 }
4393
 /* first run: start from the base snapshot at the first command ID */
4394 command_id = FirstCommandId;
4395 snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4396 txn, command_id);
4397 }
4398 else
4399 {
4400 /* the transaction must have been already streamed */
4402
4403 /*
4404 * Nah, we already have snapshot from the previous streaming run. We
4405 * assume new subxacts can't move the LSN backwards, and so can't beat
4406 * the LSN condition in the previous branch (so no need to walk
4407 * through subxacts again). In fact, we must not do that as we may be
4408 * using snapshot half-way through the subxact.
4409 */
4410 command_id = txn->command_id;
4411
4412 /*
4413 * We can't use txn->snapshot_now directly because after the last
4414 * streaming run, we might have got some new sub-transactions. So we
4415 * need to add them to the snapshot.
4416 */
4417 snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4418 txn, command_id);
4419
4420 /* Free the previously copied snapshot. */
4421 Assert(txn->snapshot_now->copied);
4423 txn->snapshot_now = NULL;
4424 }
4425
4426 /*
4427 * Remember this information to be used later to update stats. We can't
4428 * update the stats here as an error while processing the changes would
4429 * lead to the accumulation of stats even though we haven't streamed all
4430 * the changes.
4431 */
4433 stream_bytes = txn->total_size;
4434
4435 /* Process and send the changes to output plugin. */
4436 ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4437 command_id, true);
4438
 /* reached only if processing did not error out; now count the run */
4439 rb->streamCount += 1;
4440 rb->streamBytes += stream_bytes;
4441
4442 /* Don't consider already streamed transaction. */
4443 rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4444
4445 /* update the decoding stats */
4447
 /* after streaming the transaction must hold no changes at all */
4449 Assert(txn->nentries == 0);
4450 Assert(txn->nentries_mem == 0);
4451}
4452
4453/*
4454 * Size of a change in memory.
 *
 * Starts from sizeof(ReorderBufferChange) and adds an action-specific
 * amount for data the change refers to: tuples (HeapTupleData plus t_len
 * each), message prefix and body plus two Size fields, invalidation
 * messages, snapshot struct plus xip/subxip arrays, or the truncated
 * relations' OIDs. For the remaining actions nothing is added.
4455 */
4456static Size
4458{
4459 Size sz = sizeof(ReorderBufferChange);
4460
4461 switch (change->action)
4462 {
4463 /* fall through these, they're all similar enough */
4468 {
4470 newtup;
4471 Size oldlen = 0;
4472 Size newlen = 0;
4473
4474 oldtup = change->data.tp.oldtuple;
4475 newtup = change->data.tp.newtuple;
4476
4477 if (oldtup)
4478 {
4479 sz += sizeof(HeapTupleData);
4480 oldlen = oldtup->t_len;
4481 sz += oldlen;
4482 }
4483
4484 if (newtup)
4485 {
4486 sz += sizeof(HeapTupleData);
4487 newlen = newtup->t_len;
4488 sz += newlen;
4489 }
4490
4491 break;
4492 }
4494 {
 /* the prefix is counted including its terminating NUL */
4495 Size prefix_size = strlen(change->data.msg.prefix) + 1;
4496
4497 sz += prefix_size + change->data.msg.message_size +
4498 sizeof(Size) + sizeof(Size);
4499
4500 break;
4501 }
4503 {
4504 sz += sizeof(SharedInvalidationMessage) *
4505 change->data.inval.ninvalidations;
4506 break;
4507 }
4509 {
4510 Snapshot snap;
4511
4512 snap = change->data.snapshot;
4513
4514 sz += sizeof(SnapshotData) +
4515 sizeof(TransactionId) * snap->xcnt +
4516 sizeof(TransactionId) * snap->subxcnt;
4517
4518 break;
4519 }
4521 {
4522 sz += sizeof(Oid) * change->data.truncate.nrelids;
4523
4524 break;
4525 }
4530 /* ReorderBufferChange contains everything important */
4531 break;
4532 }
4533
4534 return sz;
4535}
4536
4537
4538/*
4539 * Restore a number of changes spilled to disk back into memory.
 *
 * Returns the number of changes restored by this call. Reading stops once
 * max_changes_in_memory changes were restored or *segno has passed
 * last_segno. The read position is kept in *segno and in file (vfd and
 * curOffset), presumably so that a later call can continue where this one
 * stopped - confirm against the callers.
 *
 * A segment whose spill file does not exist (ENOENT) is skipped. Any other
 * open failure, a read error, or a truncated record raises ERROR.
4540 */
4541static Size
4543 TXNEntryFile *file, XLogSegNo *segno)
4544{
4545 Size restored = 0;
4548 File *fd = &file->vfd;
4549
4552
4553 /* free current entries, so we have memory for more */
4555 {
4558
4559 dlist_delete(&cleanup->node);
4561 }
4562 txn->nentries_mem = 0;
4564
4566
4567 while (restored < max_changes_in_memory && *segno <= last_segno)
4568 {
4569 int readBytes;
4571
4573
 /* no file open: open the spill file belonging to the current segment */
4574 if (*fd == -1)
4575 {
4576 char path[MAXPGPATH];
4577
4578 /* first time in */
4579 if (*segno == 0)
4580 XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4581
4582 Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4583
4584 /*
4585 * No need to care about TLIs here, only used during a single run,
4586 * so each LSN only maps to a specific WAL record.
4587 */
4589 *segno);
4590
4592
4593 /* No harm in resetting the offset even in case of failure */
4594 file->curOffset = 0;
4595
 /* nothing was spilled for this segment; move on to the next one */
4596 if (*fd < 0 && errno == ENOENT)
4597 {
4598 *fd = -1;
4599 (*segno)++;
4600 continue;
4601 }
4602 else if (*fd < 0)
4603 ereport(ERROR,
4605 errmsg("could not open file \"%s\": %m",
4606 path)));
4607 }
4608
4609 /*
4610 * Read the statically sized part of a change which has information
4611 * about the total size. If we couldn't read a record, we're at the
4612 * end of this file.
4613 */
4615 readBytes = FileRead(file->vfd, rb->outbuf,
4618
4619 /* eof */
4620 if (readBytes == 0)
4621 {
4622 FileClose(*fd);
4623 *fd = -1;
4624 (*segno)++;
4625 continue;
4626 }
4627 else if (readBytes < 0)
4628 ereport(ERROR,
4630 errmsg("could not read from reorderbuffer spill file: %m")));
4631 else if (readBytes != sizeof(ReorderBufferDiskChange))
4632 ereport(ERROR,
4634 errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4635 readBytes,
4636 (uint32) sizeof(ReorderBufferDiskChange))));
4637
4638 file->curOffset += readBytes;
4639
4640 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4641
4643 sizeof(ReorderBufferDiskChange) + ondisk->size);
 /* re-fetch the pointer: rb->outbuf may have been reallocated */
4644 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4645
 /* read the variable-length remainder of the record after the header */
4646 readBytes = FileRead(file->vfd,
4647 rb->outbuf + sizeof(ReorderBufferDiskChange),
4648 ondisk->size - sizeof(ReorderBufferDiskChange),
4649 file->curOffset,
4651
4652 if (readBytes < 0)
4653 ereport(ERROR,
4655 errmsg("could not read from reorderbuffer spill file: %m")));
4656 else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4657 ereport(ERROR,
4659 errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4660 readBytes,
4661 (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4662
4663 file->curOffset += readBytes;
4664
4665 /*
4666 * ok, read a full change from disk, now restore it into proper
4667 * in-memory format
4668 */
4669 ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4670 restored++;
4671 }
4672
4673 return restored;
4674}
4675
4676/*
4677 * Convert change from its on-disk format to in-memory format and queue it onto
4678 * the TXN's ->changes list.
4679 *
4680 * Note: although "data" is declared char*, at entry it points to a
4681 * maxalign'd buffer, making it safe in most of this function to assume
4682 * that the pointed-to data is suitably aligned for direct access.
 *
 * "data" must point at a complete on-disk record: a ReorderBufferDiskChange
 * header followed by the action-specific payload. A new ReorderBufferChange
 * is allocated, its fixed part is copied from the header, and data that the
 * on-disk copy only referred to by now-stale pointers is rebuilt from the
 * payload. txn->nentries_mem is incremented for the queued change.
4683 */
4684static void
4686 char *data)
4687{
4689 ReorderBufferChange *change;
4690
4691 ondisk = (ReorderBufferDiskChange *) data;
4692
4693 change = ReorderBufferAllocChange(rb);
4694
4695 /* copy static part */
4696 memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4697
 /* step past the header; data now points at the payload */
4698 data += sizeof(ReorderBufferDiskChange);
4699
4700 /* restore individual stuff */
4701 switch (change->action)
4702 {
4703 /* fall through these, they're all similar enough */
 /*
  * The tuple pointers copied from disk are stale; they are only tested
  * for non-NULL to learn whether a tuple follows in the payload.
  */
4708 if (change->data.tp.oldtuple)
4709 {
4710 uint32 tuplelen = ((HeapTuple) data)->t_len;
4711
4712 change->data.tp.oldtuple =
4714
4715 /* restore ->tuple */
4716 memcpy(change->data.tp.oldtuple, data,
4717 sizeof(HeapTupleData));
4718 data += sizeof(HeapTupleData);
4719
4720 /* reset t_data pointer into the new tuplebuf */
4721 change->data.tp.oldtuple->t_data =
4722 (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4723
4724 /* restore tuple data itself */
4726 data += tuplelen;
4727 }
4728
4729 if (change->data.tp.newtuple)
4730 {
4731 /* here, data might not be suitably aligned! */
4733
4735 sizeof(uint32));
4736
4737 change->data.tp.newtuple =
4739
4740 /* restore ->tuple */
4741 memcpy(change->data.tp.newtuple, data,
4742 sizeof(HeapTupleData));
4743 data += sizeof(HeapTupleData);
4744
4745 /* reset t_data pointer into the new tuplebuf */
4746 change->data.tp.newtuple->t_data =
4747 (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4748
4749 /* restore tuple data itself */
4751 data += tuplelen;
4752 }
4753
4754 break;
4756 {
4757 Size prefix_size;
4758
4759 /* read prefix */
4760 memcpy(&prefix_size, data, sizeof(Size));
4761 data += sizeof(Size);
4762 change->data.msg.prefix = MemoryContextAlloc(rb->context,
4763 prefix_size);
4764 memcpy(change->data.msg.prefix, data, prefix_size);
 /* the stored prefix length includes the terminating NUL */
4765 Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4766 data += prefix_size;
4767
4768 /* read the message */
4769 memcpy(&change->data.msg.message_size, data, sizeof(Size));
4770 data += sizeof(Size);
4771 change->data.msg.message = MemoryContextAlloc(rb->context,
4772 change->data.msg.message_size);
4773 memcpy(change->data.msg.message, data,
4774 change->data.msg.message_size);
4775 data += change->data.msg.message_size;
4776
4777 break;
4778 }
4780 {
4782 change->data.inval.ninvalidations;
4783
4784 change->data.inval.invalidations =
4785 MemoryContextAlloc(rb->context, inval_size);
4786
4787 /* read the message */
4789
4790 break;
4791 }
4793 {
4796 Size size;
4797
4798 oldsnap = (Snapshot) data;
4799
4800 size = sizeof(SnapshotData) +
4801 sizeof(TransactionId) * oldsnap->xcnt +
4802 sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4803
4804 change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4805
4806 newsnap = change->data.snapshot;
4807
 /*
  * One allocation holds the struct followed by both XID arrays; point
  * xip and subxip into it.
  */
4808 memcpy(newsnap, data, size);
4809 newsnap->xip = (TransactionId *)
4810 (((char *) newsnap) + sizeof(SnapshotData));
4811 newsnap->subxip = newsnap->xip + newsnap->xcnt;
4812 newsnap->copied = true;
4813 break;
4814 }
4815 /* the base struct contains all the data, easy peasy */
4817 {
4818 Oid *relids;
4819
4820 relids = ReorderBufferAllocRelids(rb, change->data.truncate.nrelids);
4821 memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4822 change->data.truncate.relids = relids;
4823
4824 break;
4825 }
4830 break;
4831 }
4832
4833 dlist_push_tail(&txn->changes, &change->node);
4834 txn->nentries_mem++;
4835
4836 /*
4837 * Update memory accounting for the restored change. We need to do this
4838 * although we don't check the memory limit when restoring the changes in
4839 * this branch (we only do that when initially queueing the changes after
4840 * decoding), because we will release the changes later, and that will
4841 * update the accounting too (subtracting the size from the counters). And
4842 * we don't want to underflow there.
4843 */
4845 ReorderBufferChangeSize(change));
4846}
4847
4848/*
4849 * Remove all on-disk stored data for the passed-in transaction.
 *
 * Tries to unlink the spill file of every segment number from "first"
 * through "last" inclusive. A file that does not exist (ENOENT) is silently
 * ignored; any other unlink() failure raises ERROR.
4850 */
4851static void
4853{
4854 XLogSegNo first;
4855 XLogSegNo cur;
4856 XLogSegNo last;
4857
4860
4863
4864 /* iterate over all possible filenames, and delete them */
4865 for (cur = first; cur <= last; cur++)
4866 {
4867 char path[MAXPGPATH];
4868
4870 if (unlink(path) != 0 && errno != ENOENT)
4871 ereport(ERROR,
4873 errmsg("could not remove file \"%s\": %m", path)));
4874 }
4875}
4876
4877/*
4878 * Remove any leftover serialized reorder buffers from a slot directory after a
4879 * prior crash or decoding session exit.
 *
 * Every directory entry under PG_REPLSLOT_DIR/slotname whose name starts
 * with "xid" is unlinked; a failing unlink() raises ERROR. Nothing is done
 * if the path exists but is not a directory.
4880 */
4881static void
4883{
4884 DIR *spill_dir;
4885 struct dirent *spill_de;
4886 struct stat statbuf;
4887 char path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
4888
4889 sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
4890
4891 /* we're only handling directories here, skip if it's not ours */
4892 if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4893 return;
4894
4895 spill_dir = AllocateDir(path);
 /*
  * NOTE(review): "path" is overwritten inside the loop, so later
  * ReadDirExtended() calls receive the last file path instead of the
  * directory name. Presumably that argument is only used for error
  * reporting - confirm.
  */
4896 while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4897 {
4898 /* only look at names that can be ours */
4899 if (strncmp(spill_de->d_name, "xid", 3) == 0)
4900 {
4901 snprintf(path, sizeof(path),
4902 "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
4903 spill_de->d_name);
4904
4905 if (unlink(path) != 0)
4906 ereport(ERROR,
4908 errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
4909 path, PG_REPLSLOT_DIR, slotname)));
4910 }
4911 }
4913}
4914
4915/*
4916 * Given a replication slot, transaction ID and segment number, fill in the
4917 * corresponding spill file into 'path', which is a caller-owned buffer of size
4918 * at least MAXPGPATH.
 *
 * The file name has the form "xid-<xid>-lsn-<hi>-<lo>.spill", where the two
 * hexadecimal fields come from LSN_FORMAT_ARGS(recptr). The result is
 * written with snprintf() bounded by MAXPGPATH.
4919 */
4920static void
4922 XLogSegNo segno)
4923{
4925
4927
4928 snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
4931 xid, LSN_FORMAT_ARGS(recptr));
4932}
4933
4934/*
4935 * Delete all data spilled to disk after we've restarted/crashed. It will be
4936 * recreated when the respective slots are reused.
 *
 * Walks the directory entries, skipping "." and ".." as well as any name
 * that ReplicationSlotValidateName() rejects, and cleans up the spill files
 * of each remaining entry.
4937 */
4938void
4940{
4942 struct dirent *logical_de;
4943
4946 {
4947 if (strcmp(logical_de->d_name, ".") == 0 ||
4948 strcmp(logical_de->d_name, "..") == 0)
4949 continue;
4950
4951 /* if it cannot be a slot, skip the directory */
4952 if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2))
4953 continue;
4954
4955 /*
4956 * ok, has to be a surviving logical slot, iterate and delete
4957 * everything starting with xid-*
4958 */
4960 }
4962}
4963
4964/* ---------------------------------------
4965 * toast reassembly support
4966 * ---------------------------------------
4967 */
4968
4969/*
4970 * Initialize per tuple toast reconstruction support.
 *
 * Creates txn->toast_hash, which must not exist yet (asserted). Its key is
 * an Oid, its entries are ReorderBufferToastEnt structs, and it is
 * allocated in rb->context.
4971 */
4972static void
4974{
4976
4977 Assert(txn->toast_hash == NULL);
4978
4979 hash_ctl.keysize = sizeof(Oid);
4980 hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4981 hash_ctl.hcxt = rb->context;
4982 txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4984}
4985
4986/*
4987 * Per toast-chunk handling for toast reconstruction
4988 *
4989 * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4990 * toasted Datum comes along.
 *
 * "relation" must be a toast relation (asserted). The chunk id, chunk
 * sequence number and chunk data are read from attributes 1, 2 and 3 of
 * change->data.tp.newtuple. Chunks of one chunk id must arrive in order,
 * starting at sequence 0; anything else raises ERROR. The change itself is
 * linked into the entry's chunk list rather than copied.
4991 */
4992static void
4994 Relation relation, ReorderBufferChange *change)
4995{
4998 bool found;
5000 bool isnull;
5001 Pointer chunk;
5002 TupleDesc desc = RelationGetDescr(relation);
5003 Oid chunk_id;
5005
5006 if (txn->toast_hash == NULL)
5008
5009 Assert(IsToastRelation(relation));
5010
5011 newtup = change->data.tp.newtuple;
5012 chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
5013 Assert(!isnull);
5014 chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
5015 Assert(!isnull);
5016
5018 hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
5019
 /* first chunk seen for this chunk_id: initialize the new hash entry */
5020 if (!found)
5021 {
5022 Assert(ent->chunk_id == chunk_id);
5023 ent->num_chunks = 0;
5024 ent->last_chunk_seq = 0;
5025 ent->size = 0;
5026 ent->reconstructed = NULL;
5027 dlist_init(&ent->chunks);
5028
5029 if (chunk_seq != 0)
5030 elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
5031 chunk_seq, chunk_id);
5032 }
 /* later chunks must directly follow the previously stored one */
5033 else if (found && chunk_seq != ent->last_chunk_seq + 1)
5034 elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
5035 chunk_seq, chunk_id, ent->last_chunk_seq + 1);
5036
5037 chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
5038 Assert(!isnull);
5039
5040 /* calculate size so we can allocate the right size at once later */
5041 if (!VARATT_IS_EXTENDED(chunk))
5042 chunksize = VARSIZE(chunk) - VARHDRSZ;
5043 else if (VARATT_IS_SHORT(chunk))
5044 /* could happen due to heap_form_tuple doing its thing */
5046 else
5047 elog(ERROR, "unexpected type of toast chunk");
5048
 /* account for the chunk and queue the change on the entry's list */
5049 ent->size += chunksize;
5050 ent->last_chunk_seq = chunk_seq;
5051 ent->num_chunks++;
5052 dlist_push_tail(&ent->chunks, &change->node);
5053}
5054
5055/*
5056 * Rejigger change->newtuple to point to in-memory toast tuples instead of
5057 * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
5058 *
5059 * We cannot replace unchanged toast tuples though, so those will still point
5060 * to on-disk toast data.
5061 *
5062 * While updating the existing change with detoasted tuple data, we need to
5063 * update the memory accounting info, because the change size will differ.
5064 * Otherwise the accounting may get out of sync, triggering serialization
5065 * at unexpected times.
5066 *
5067 * We simply subtract size of the change before rejiggering the tuple, and
5068 * then add the new size. This makes it look like the change was removed
5069 * and then added back, except it only tweaks the accounting info.
5070 *
5071 * In particular it can't trigger serialization, which would be pointless
5072 * anyway as it happens during commit processing right before handing
5073 * the change to the output plugin.
5074 */
5075static void
5077 Relation relation, ReorderBufferChange *change)
5078{
/*
 * NOTE(review): the line numbers of this listing are not contiguous.  The
 * function-name line (5076), several local declarations (5084, 5086, 5088,
 * 5132-5134, 5138-5139, 5188-5189) and a number of statements (5105, 5156,
 * 5159, 5162, 5167-5168, 5176, 5192, 5194, 5213, 5216-5217, 5220, 5244,
 * 5253, 5255) are missing.  Comments added below describe only what the
 * visible lines establish.
 */
5079 TupleDesc desc;
5080 int natt;
5081 Datum *attrs;
5082 bool *isnull;
/*
 * Per-attribute flag array; an element is set to true in the loop below once
 * a replacement datum has been assigned for that attribute.
 */
5083 bool *free;
5085 Relation toast_rel;
5087 MemoryContext oldcontext;
5089 Size old_size;
5090
5091 /* no toast tuples changed */
5092 if (txn->toast_hash == NULL)
5093 return;
5094
5095 /*
5096 * We're going to modify the size of the change. So, to make sure the
5097 * accounting is correct we record the current change size and then after
5098 * re-computing the change we'll subtract the recorded size and then
5099 * re-add the new change size at the end. We don't immediately subtract
5100 * the old size because if there is any error before we add the new size,
5101 * we will release the changes and that will update the accounting info
5102 * (subtracting the size from the counters). And we don't want to
5103 * underflow there.
5104 */
/*
 * NOTE(review): the statement that records the current size (source line
 * 5105, presumably assigning old_size) is missing from the listing.
 */
5106
/* allocations made from here on come from rb->context */
5107 oldcontext = MemoryContextSwitchTo(rb->context);
5108
5109 /* we should only have toast tuples in an INSERT or UPDATE */
5110 Assert(change->data.tp.newtuple);
5111
5112 desc = RelationGetDescr(relation);
5113
5114 toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
5115 if (!RelationIsValid(toast_rel))
5116 elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
5117 relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
5118
5119 toast_desc = RelationGetDescr(toast_rel);
5120
5121 /* should we allocate from stack instead? */
5122 attrs = palloc0_array(Datum, desc->natts);
5123 isnull = palloc0_array(bool, desc->natts);
5124 free = palloc0_array(bool, desc->natts);
5125
5126 newtup = change->data.tp.newtuple;
5127
/* split the new tuple into per-attribute datums and null flags */
5128 heap_deform_tuple(newtup, desc, attrs, isnull);
5129
/*
 * Visit every attribute; only non-dropped, non-null varlena attributes whose
 * toast value has an entry in txn->toast_hash get past the "continue"
 * checks and are rebuilt from their chunks.
 */
5130 for (natt = 0; natt < desc->natts; natt++)
5131 {
5135
5136 /* va_rawsize is the size of the original datum -- including header */
5137 varatt_external toast_pointer;
5140 varlena *reconstructed;
5141 dlist_iter it;
5142 Size data_done = 0;
5143
5144 if (attr->attisdropped)
5145 continue;
5146
5147 /* not a varlena datatype */
5148 if (attr->attlen != -1)
5149 continue;
5150
5151 /* no data */
5152 if (isnull[natt])
5153 continue;
5154
5155 /* ok, we know we have a toast datum */
5157
5158 /* no need to do anything if the tuple isn't external */
5160 continue;
5161
5163
5164 /*
5165 * Check whether the toast tuple changed, replace if so.
5166 */
/*
 * The lookup is keyed by toast_pointer.va_valueid and uses HASH_FIND, so no
 * entry is created; a NULL result means no chunks were queued for this
 * value and the attribute is left as it is.
 */
5169 &toast_pointer.va_valueid,
5170 HASH_FIND,
5171 NULL);
5172 if (ent == NULL)
5173 continue;
5174
5175 new_datum =
5177
5178 free[natt] = true;
5179
/*
 * The reassembly buffer is sized from va_rawsize and remembered in the hash
 * entry, which is how ReorderBufferToastReset() can release it later.
 */
5180 reconstructed = palloc0(toast_pointer.va_rawsize);
5181
5182 ent->reconstructed = reconstructed;
5183
5184 /* stitch toast tuple back together from its parts */
5185 dlist_foreach(it, &ent->chunks)
5186 {
5187 bool cisnull;
5190 Pointer chunk;
5191
5193 ctup = cchange->data.tp.newtuple;
5195
5196 Assert(!cisnull);
5197 Assert(!VARATT_IS_EXTERNAL(chunk));
5198 Assert(!VARATT_IS_SHORT(chunk));
5199
/* append this chunk's payload right after what was copied so far */
5200 memcpy(VARDATA(reconstructed) + data_done,
5201 VARDATA(chunk),
5202 VARSIZE(chunk) - VARHDRSZ);
5203 data_done += VARSIZE(chunk) - VARHDRSZ;
5204 }
5205 Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
5206
5207 /* make sure it's marked as compressed or not */
5208 if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
5209 SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
5210 else
5211 SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
5212
/*
 * The indirect pointer is aimed at the reassembled datum.  NOTE(review): the
 * statements around it (source lines 5213, 5216-5217, 5220) are missing, so
 * how it is written into new_datum / attrs[natt] is not visible here.
 */
5214 redirect_pointer.pointer = reconstructed;
5215
5218 sizeof(redirect_pointer));
5219
5221 }
5222
5223 /*
5224 * Build tuple in separate memory & copy tuple back into the tuplebuf
5225 * passed to the output plugin. We can't directly heap_fill_tuple() into
5226 * the tuplebuf because attrs[] will point back into the current content.
5227 */
5228 tmphtup = heap_form_tuple(desc, attrs, isnull);
5229 Assert(newtup->t_len <= MaxHeapTupleSize);
5230 Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
5231
/* overwrite the tuple body in place and adopt the new length */
5232 memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
5233 newtup->t_len = tmphtup->t_len;
5234
5235 /*
5236 * free resources we won't further need, more persistent stuff will be
5237 * free'd in ReorderBufferToastReset().
5238 */
5239 RelationClose(toast_rel);
5240 pfree(tmphtup);
5241 for (natt = 0; natt < desc->natts; natt++)
5242 {
/*
 * NOTE(review): the statement guarded by this "if" (source line 5244) is
 * missing from the listing; presumably it releases the replacement datum
 * for this attribute -- confirm.
 */
5243 if (free[natt])
5245 }
5246 pfree(attrs);
5247 pfree(free);
5248 pfree(isnull);
5249
5250 MemoryContextSwitchTo(oldcontext);
5251
/*
 * NOTE(review): the two accounting calls announced by the comments below
 * (source lines 5253 and 5255) are missing from the listing; only the last
 * argument of the second call is visible.
 */
5252 /* subtract the old change size */
5254 /* now add the change back, with the correct size */
5256 ReorderBufferChangeSize(change));
5257}
5258
5259/*
5260 * Free all resources allocated for toast reconstruction.
5261 */
5262static void
5264{
/*
 * Releases everything held by txn->toast_hash: each entry's reassembled
 * datum (if any) and every queued chunk change, then clears the pointer so
 * a later call returns immediately.
 *
 * NOTE(review): this listing skips source lines 5263, 5265-5266, 5272-5273,
 * 5275, 5283 and 5290.  The function-name line, the local declarations, the
 * loop header that walks the hash, and the statement just before
 * "txn->toast_hash = NULL" are therefore not visible here.
 */
5267
/* nothing was ever queued for this transaction */
5268 if (txn->toast_hash == NULL)
5269 return;
5270
5271 /* sequentially walk over the hash and free everything */
5274 {
5276
/* buffer set by ReorderBufferToastReplace(), when it rebuilt this value */
5277 if (ent->reconstructed != NULL)
5278 pfree(ent->reconstructed);
5279
/* the _modify iterator is needed because each node is unlinked in the loop */
5280 dlist_foreach_modify(it, &ent->chunks)
5281 {
5282 ReorderBufferChange *change =
5284
5285 dlist_delete(&change->node);
5286 ReorderBufferFreeChange(rb, change, true);
5287 }
5288 }
5289
5291 txn->toast_hash = NULL;
5292}
5293
5294
5295/* ---------------------------------------
5296 * Visibility support for logical decoding
5297 *
5298 *
5299 * Lookup actual cmin/cmax values when using decoding snapshot. We can't
5300 * always rely on stored cmin/cmax values because of two scenarios:
5301 *
5302 * * A tuple got changed multiple times during a single transaction and thus
5303 * has got a combo CID. Combo CIDs are only valid for the duration of a
5304 * single transaction.
5305 * * A tuple with a cmin but no cmax (and thus no combo CID) got
5306 * deleted/updated in another transaction than the one which created it
5307 * which we are looking at right now. As only one of cmin, cmax or combo CID
5308 * is actually stored in the heap we don't have access to the value we
5309 * need anymore.
5310 *
5311 * To resolve those problems we have a per-transaction hash of (cmin,
5312 * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5313 * (cmin, cmax) values. That also takes care of combo CIDs by simply
5314 * not caring about them at all. As we have the real cmin/cmax values
5315 * combo CIDs aren't interesting.
5316 *
5317 * As we only care about catalog tuples here the overhead of this
5318 * hashtable should be acceptable.
5319 *
5320 * Heap rewrites complicate this a bit, check rewriteheap.c for
5321 * details.
5322 * -------------------------------------------------------------------------
5323 */
5324
5325/* struct for sorting mapping files by LSN efficiently */
5326typedef struct RewriteMappingFile
5327{
5331
5332#ifdef NOT_USED
5333static void
5335{
/*
 * Debugging aid: emits one DEBUG3 log line per mapping entry, showing the
 * entry's relfilelocator, tuple id, cmin and cmax.  The function sits inside
 * "#ifdef NOT_USED", so it is only compiled when that symbol is defined.
 *
 * NOTE(review): this listing skips source lines 5334, 5336-5337, 5339-5340
 * and 5347.  The function-name line, the local declarations, the loop header
 * and one elog argument (between the block number and cmin) are therefore
 * not visible here.
 */
5338
5341 {
5342 elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
5343 ent->key.rlocator.dbOid,
5344 ent->key.rlocator.spcOid,
5345 ent->key.rlocator.relNumber,
5346 ItemPointerGetBlockNumber(&ent->key.tid),
5348 ent->cmin,
5349 ent->cmax
5350 );
5351 }
5352}
5353#endif
5354
5355/*
5356 * Apply a single mapping file to tuplecid_data.
5357 *
5358 * The mapping file has to have been verified to be a) committed b) for our
5359 * transaction c) applied in LSN order.
5360 */
5361static void
5363{
/*
 * Reads the named file from PG_LOGICAL_MAPPINGS_DIR as a sequence of
 * fixed-size LogicalRewriteMappingData records.  For every record whose old
 * location has an entry in tuplecid_data, an entry for the new location is
 * entered and, if it did not exist yet, filled with the old entry's
 * cmin/cmax/combocid.  Open, read, short-read and close failures raise an
 * ERROR.
 *
 * NOTE(review): this listing skips source lines 5362, 5367, 5370, 5373,
 * 5378-5380, 5387, 5389, 5393, 5400, 5406, 5410-5411, 5418, 5421 and 5445.
 * The function-name line, the declarations of key/map/ent/new_ent, the call
 * that opens the file, and the lookup statements are therefore only partly
 * visible.
 */
5364 char path[MAXPGPATH];
5365 int fd;
5366 int readBytes;
5368
/*
 * NOTE(review): sprintf does not bound the write into path[]; this relies on
 * directory name plus fname fitting in MAXPGPATH.  A bounded snprintf would
 * make that explicit -- consider when touching this code.
 */
5369 sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
5371 if (fd < 0)
5372 ereport(ERROR,
5374 errmsg("could not open file \"%s\": %m", path)));
5375
5376 while (true)
5377 {
5381 bool found;
5382
5383 /* be careful about padding */
5384 memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5385
5386 /* read all mappings till the end of the file */
5388 readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5390
/* a clean EOF ends the loop; a partial record is reported as an error */
5391 if (readBytes < 0)
5392 ereport(ERROR,
5394 errmsg("could not read file \"%s\": %m",
5395 path)));
5396 else if (readBytes == 0) /* EOF */
5397 break;
5398 else if (readBytes != sizeof(LogicalRewriteMappingData))
5399 ereport(ERROR,
5401 errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5402 path, readBytes,
5403 (int32) sizeof(LogicalRewriteMappingData))));
5404
/*
 * First probe with the tuple's old location.  NOTE(review): the start of the
 * tid copy (source line 5406) and the lookup that sets "ent" (source lines
 * 5410-5411) are missing from the listing.
 */
5405 key.rlocator = map.old_locator;
5407 &key.tid);
5408
5409
5412
5413 /* no existing mapping, no need to update */
5414 if (!ent)
5415 continue;
5416
/* now reuse the key for the tuple's new location and enter it */
5417 key.rlocator = map.new_locator;
5419 &key.tid);
5420
5422 hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5423
5424 if (found)
5425 {
5426 /*
5427 * Make sure the existing mapping makes sense. We sometimes update
5428 * old records that did not yet have a cmax (e.g. pg_class' own
5429 * entry while rewriting it) during rewrites, so allow that.
5430 */
5431 Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5432 Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5433 }
5434 else
5435 {
5436 /* update mapping */
5437 new_ent->cmin = ent->cmin;
5438 new_ent->cmax = ent->cmax;
5439 new_ent->combocid = ent->combocid;
5440 }
5441 }
5442
5443 if (CloseTransientFile(fd) != 0)
5444 ereport(ERROR,
5446 errmsg("could not close file \"%s\": %m", path)));
5447}
5448
5449
5450/*
5451 * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5452 */
5453static bool
5455{
5456 return bsearch(&xid, xip, num,
5457 sizeof(TransactionId), xidComparator) != NULL;
5458}
5459
5460/*
5461 * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5462 */
5463static int
5465{
5468
5469 return pg_cmp_u64(a->lsn, b->lsn);
5470}
5471
5472/*
5473 * Apply any existing logical remapping files if there are any targeted at our
5474 * transaction for relid.
5475 */
5476static void
5478{
/*
 * Collects the "map-" files that belong to relid in the relevant database,
 * whose mapped xid is one of snapshot->subxip, then applies them one by one
 * (the comment below says in LSN order) and frees each list element.
 *
 * NOTE(review): this listing skips source lines 5477, 5479, 5485-5486,
 * 5490-5492, 5495, 5505, 5521, 5529, 5534, 5537, 5541 and 5545.  The
 * function-name line, the directory-scan loop header, the filename-parsing
 * call, the allocation of "f", the sort call and the per-file apply call
 * are therefore not visible here.
 */
5480 struct dirent *mapping_de;
5481 List *files = NIL;
5482 ListCell *file;
/* shared relations are matched against InvalidOid instead of our database */
5483 Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5484
5487 {
5488 Oid f_dboid;
5489 Oid f_relid;
5493 uint32 f_hi,
5494 f_lo;
5496
5497 if (strcmp(mapping_de->d_name, ".") == 0 ||
5498 strcmp(mapping_de->d_name, "..") == 0)
5499 continue;
5500
5501 /* Ignore files that aren't ours */
5502 if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5503 continue;
5504
/*
 * Six fields are expected from the file name; anything else is an ERROR.
 * NOTE(review): the parsing call and its format string (source line 5505)
 * are missing from the listing.
 */
5506 &f_dboid, &f_relid, &f_hi, &f_lo,
5507 &f_mapped_xid, &f_create_xid) != 6)
5508 elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5509
/* the LSN is stored in the name as two 32-bit halves */
5510 f_lsn = ((uint64) f_hi) << 32 | f_lo;
5511
5512 /* mapping for another database */
5513 if (f_dboid != dboid)
5514 continue;
5515
5516 /* mapping for another relation */
5517 if (f_relid != relid)
5518 continue;
5519
5520 /* did the creating transaction abort? */
5522 continue;
5523
5524 /* not for our transaction */
5525 if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5526 continue;
5527
5528 /* ok, relevant, queue for apply */
5530 f->lsn = f_lsn;
/*
 * NOTE(review): strcpy is unbounded; this assumes f->fname is large enough
 * for any d_name.  The RewriteMappingFile definition is not visible in this
 * listing -- confirm the buffer size.
 */
5531 strcpy(f->fname, mapping_de->d_name);
5532 files = lappend(files, f);
5533 }
5535
5536 /* sort files so we apply them in LSN order */
5538
5539 foreach(file, files)
5540 {
5542
5543 elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5544 snapshot->subxip[0]);
/* each queued element is released once handled */
5546 pfree(f);
5547 }
5548}
5549
5550/*
5551 * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5552 * combo CIDs.
5553 */
5554bool
5556 Snapshot snapshot,
5557 HeapTuple htup, Buffer buffer,
5558 CommandId *cmin, CommandId *cmax)
5559{
/*
 * Returns true and stores the entry's cmin/cmax through the non-NULL output
 * pointers when a mapping exists for the tuple's (relfilelocator, tid).
 * Returns false when tuplecid_data is NULL or when no mapping is found even
 * after one attempt to refresh the mappings.
 *
 * NOTE(review): this listing skips source lines 5555, 5560-5561, 5596-5597
 * and 5607.  The function-name line, the declarations of key and ent, the
 * hash lookup after the "restart" label, and the refresh call inside the
 * retry branch are therefore not visible here.
 */
5562 ForkNumber forkno;
5563 BlockNumber blockno;
5564 bool updated_mapping = false;
5565
5566 /*
5567 * Return unresolved if tuplecid_data is not valid. That's because when
5568 * streaming in-progress transactions we may run into tuples with the CID
5569 * before actually decoding them. Think e.g. about INSERT followed by
5570 * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5571 * INSERT. So in such cases, we assume the CID is from the future
5572 * command.
5573 */
5574 if (tuplecid_data == NULL)
5575 return false;
5576
5577 /* be careful about padding */
5578 memset(&key, 0, sizeof(key));
5579
5580 Assert(!BufferIsLocal(buffer));
5581
5582 /*
5583 * get relfilelocator from the buffer, no convenient way to access it
5584 * other than that.
5585 */
5586 BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5587
5588 /* tuples can only be in the main fork */
5589 Assert(forkno == MAIN_FORKNUM);
5590 Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5591
/* second half of the key: the tuple's own item pointer */
5592 ItemPointerCopy(&htup->t_self,
5593 &key.tid);
5594
5595restart:
5598
5599 /*
5600 * failed to find a mapping, check whether the table was rewritten and
5601 * apply mapping if so, but only do that once - there can be no new
5602 * mappings while we are in here since we have to hold a lock on the
5603 * relation.
5604 */
5605 if (ent == NULL && !updated_mapping)
5606 {
5608 /* now check but don't update for a mapping again */
5609 updated_mapping = true;
5610 goto restart;
5611 }
5612 else if (ent == NULL)
5613 return false;
5614
/* either output pointer may be NULL if the caller does not need the value */
5615 if (cmin)
5616 *cmin = ent->cmin;
5617 if (cmax)
5618 *cmax = ent->cmax;
5619 return true;
5620}
5621
5622/*
5623 * Count invalidation messages of specified transaction.
5624 *
5625 * Returns number of messages, and msgs is set to the pointer of the linked
5626 * list for the messages.
5627 */
5628uint32
5631{
5632 ReorderBufferTXN *txn;
5633
5634 txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
5635 false);
5636
5637 if (txn == NULL)
5638 return 0;
5639
5640 *msgs = txn->invalidations;
5641
5642 return txn->ninvalidations;
5643}
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
uint32 BlockNumber
Definition block.h:31
static int32 next
Definition blutils.c:225
static void cleanup(void)
Definition bootstrap.c:879
int Buffer
Definition buf.h:23
#define BufferIsLocal(buffer)
Definition buf.h:37
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4378
#define NameStr(name)
Definition c.h:837
#define InvalidCommandId
Definition c.h:755
#define VARHDRSZ
Definition c.h:783
#define Assert(condition)
Definition c.h:945
#define PG_BINARY
Definition c.h:1376
#define FLEXIBLE_ARRAY_MEMBER
Definition c.h:552
#define FirstCommandId
Definition c.h:754
int32_t int32
Definition c.h:614
uint64_t uint64
Definition c.h:619
#define unlikely(x)
Definition c.h:432
uint32_t uint32
Definition c.h:618
#define pg_fallthrough
Definition c.h:152
void * Pointer
Definition c.h:609
uint32 CommandId
Definition c.h:752
uint32 TransactionId
Definition c.h:738
size_t Size
Definition c.h:691
bool IsToastRelation(Relation relation)
Definition catalog.c:206
bool IsSharedRelation(Oid relationId)
Definition catalog.c:304
int64 TimestampTz
Definition timestamp.h:39
#define INDIRECT_POINTER_SIZE
Definition detoast.h:34
#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr)
Definition detoast.h:22
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition dynahash.c:358
void hash_destroy(HTAB *hashp)
Definition dynahash.c:865
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1380
struct cursor * cur
Definition ecpg.c:29
Datum arg
Definition elog.c:1322
void FreeErrorData(ErrorData *edata)
Definition elog.c:2013
int errcode_for_file_access(void)
Definition elog.c:897
ErrorData * CopyErrorData(void)
Definition elog.c:1941
void FlushErrorState(void)
Definition elog.c:2062
#define PG_RE_THROW()
Definition elog.h:405
#define DEBUG3
Definition elog.h:28
#define PG_TRY(...)
Definition elog.h:372
#define DEBUG2
Definition elog.h:29
#define PG_END_TRY(...)
Definition elog.h:397
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define PG_CATCH(...)
Definition elog.h:382
#define elog(elevel,...)
Definition elog.h:226
#define INFO
Definition elog.h:34
#define ereport(elevel,...)
Definition elog.h:150
struct SnapshotData * Snapshot
Definition execnodes.h:59
int FreeDir(DIR *dir)
Definition fd.c:3009
int CloseTransientFile(int fd)
Definition fd.c:2855
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition fd.c:2972
void FileClose(File file)
Definition fd.c:1966
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1563
DIR * AllocateDir(const char *dirname)
Definition fd.c:2891
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2957
int OpenTransientFile(const char *fileName, int fileFlags)
Definition fd.c:2678
static ssize_t FileRead(File file, void *buffer, size_t amount, pgoff_t offset, uint32 wait_event_info)
Definition fd.h:225
int File
Definition fd.h:51
#define palloc_object(type)
Definition fe_memutils.h:74
#define repalloc_array(pointer, type, count)
Definition fe_memutils.h:78
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_array(type, count)
Definition fe_memutils.h:77
MemoryContext GenerationContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition generation.c:162
Oid MyDatabaseId
Definition globals.c:94
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition heaptuple.c:1037
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition heaptuple.c:1266
@ HASH_FIND
Definition hsearch.h:113
@ HASH_REMOVE
Definition hsearch.h:115
@ HASH_ENTER
Definition hsearch.h:114
#define HASH_CONTEXT
Definition hsearch.h:102
#define HASH_ELEM
Definition hsearch.h:95
#define HASH_BLOBS
Definition hsearch.h:97
#define HEAPTUPLESIZE
Definition htup.h:73
HeapTupleData * HeapTuple
Definition htup.h:71
HeapTupleHeaderData * HeapTupleHeader
Definition htup.h:23
#define SizeofHeapTupleHeader
#define MaxHeapTupleSize
static Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
static dlist_node * dlist_pop_head_node(dlist_head *head)
Definition ilist.h:450
#define dlist_foreach(iter, lhead)
Definition ilist.h:623
static void dlist_init(dlist_head *head)
Definition ilist.h:314
#define dclist_container(type, membername, ptr)
Definition ilist.h:947
static bool dlist_has_next(const dlist_head *head, const dlist_node *node)
Definition ilist.h:503
static void dclist_push_tail(dclist_head *head, dlist_node *node)
Definition ilist.h:709
static void dlist_insert_before(dlist_node *before, dlist_node *node)
Definition ilist.h:393
#define dlist_head_element(type, membername, lhead)
Definition ilist.h:603
static dlist_node * dlist_next_node(dlist_head *head, dlist_node *node)
Definition ilist.h:537
static void dlist_delete(dlist_node *node)
Definition ilist.h:405
static uint32 dclist_count(const dclist_head *head)
Definition ilist.h:932
#define dlist_foreach_modify(iter, lhead)
Definition ilist.h:640
static bool dlist_is_empty(const dlist_head *head)
Definition ilist.h:336
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition ilist.h:364
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition ilist.h:763
static void dclist_init(dclist_head *head)
Definition ilist.h:671
#define dlist_container(type, membername, ptr)
Definition ilist.h:593
#define dclist_foreach(iter, lhead)
Definition ilist.h:970
static int pg_cmp_u64(uint64 a, uint64 b)
Definition int.h:731
#define write(a, b, c)
Definition win32.h:14
#define read(a, b, c)
Definition win32.h:13
void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
Definition inval.c:823
void InvalidateSystemCaches(void)
Definition inval.c:916
int b
Definition isn.c:74
int a
Definition isn.c:73
int i
Definition isn.c:77
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition itemptr.h:172
List * lappend(List *list, void *datum)
Definition list.c:339
void list_sort(List *list, list_sort_comparator cmp)
Definition list.c:1674
void UpdateDecodingStats(LogicalDecodingContext *ctx)
Definition logical.c:1943
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition mcxt.c:1266
char * pstrdup(const char *in)
Definition mcxt.c:1781
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc0(Size size)
Definition mcxt.c:1417
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
#define SLAB_DEFAULT_BLOCK_SIZE
Definition memutils.h:189
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
static char * errmsg
void pairingheap_remove(pairingheap *heap, pairingheap_node *node)
void pairingheap_add(pairingheap *heap, pairingheap_node *node)
pairingheap * pairingheap_allocate(pairingheap_comparator compare, void *arg)
Definition pairingheap.c:42
pairingheap_node * pairingheap_first(pairingheap *heap)
#define pairingheap_container(type, membername, ptr)
Definition pairingheap.h:43
#define pairingheap_const_container(type, membername, ptr)
Definition pairingheap.h:51
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
#define MAXPGPATH
const void * data
#define lfirst(lc)
Definition pg_list.h:172
#define NIL
Definition pg_list.h:68
#define sprintf
Definition port.h:262
#define snprintf
Definition port.h:260
#define qsort(a, b, c, d)
Definition port.h:495
static Datum PointerGetDatum(const void *X)
Definition postgres.h:342
static Oid DatumGetObjectId(Datum X)
Definition postgres.h:242
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
static Datum Int32GetDatum(int32 X)
Definition postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202
#define InvalidOid
unsigned int Oid
static int fd(const char *x, int i)
static int fb(int x)
bool TransactionIdIsInProgress(TransactionId xid)
Definition procarray.c:1401
#define RelationIsLogicallyLogged(relation)
Definition rel.h:710
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationIsValid(relation)
Definition rel.h:489
Relation RelationIdGetRelation(Oid relationId)
Definition relcache.c:2088
void RelationClose(Relation relation)
Definition relcache.c:2209
Oid RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
static int file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
void ReorderBufferFreeRelids(ReorderBuffer *rb, Oid *relids)
void ReorderBufferFreeChange(ReorderBuffer *rb, ReorderBufferChange *change, bool upd_mem)
static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change)
void ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, CommandId cid)
static void ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
static void ReorderBufferReplay(ReorderBufferTXN *txn, ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn, TimestampTz commit_time, ReplOriginId origin_id, XLogRecPtr origin_lsn)
static void ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out, uint32 *ninvals_out, SharedInvalidationMessage *msgs_new, Size nmsgs_new)
static ReorderBufferTXN * ReorderBufferLargestTXN(ReorderBuffer *rb)
void ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, RelFileLocator locator, ItemPointerData tid, CommandId cmin, CommandId cmax, CommandId combocid)
void ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, TimestampTz abort_time)
static bool ReorderBufferCanStartStreaming(ReorderBuffer *rb)
static void ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, Snapshot snapshot_now, CommandId command_id, XLogRecPtr last_lsn, ReorderBufferChange *specinsert)
bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
void ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
TransactionId ReorderBufferGetOldestXmin(ReorderBuffer *rb)
static int ReorderBufferIterCompare(Datum a, Datum b, void *arg)
static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, ReorderBufferIterTXNState *volatile *iter_state)
bool ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, Snapshot snapshot, HeapTuple htup, Buffer buffer, CommandId *cmin, CommandId *cmax)
static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change)
void ReorderBufferFreeTupleBuf(HeapTuple tuple)
void ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, ReorderBufferChange *change, bool toast_insert)
void ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid, char *gid)
uint32 ReorderBufferGetInvalidations(ReorderBuffer *rb, TransactionId xid, SharedInvalidationMessage **msgs)
void ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
void ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid, TransactionId subxid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn)
TransactionId * ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
static void ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn, Snapshot snapshot_now, CommandId command_id)
#define IsSpecInsert(action)
static Size ReorderBufferChangeSize(ReorderBufferChange *change)
ReorderBuffer * ReorderBufferAllocate(void)
int logical_decoding_work_mem
static void AssertChangeLsnOrder(ReorderBufferTXN *txn)
static bool ReorderBufferCanStream(ReorderBuffer *rb)
static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
static void ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change, bool streaming)
void ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
bool ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid, XLogRecPtr prepare_lsn, XLogRecPtr end_lsn, TimestampTz prepare_time, ReplOriginId origin_id, XLogRecPtr origin_lsn)
void ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn, XLogRecPtr two_phase_at, TimestampTz commit_time, ReplOriginId origin_id, XLogRecPtr origin_lsn, char *gid, bool is_commit)
static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, int fd, ReorderBufferChange *change)
void ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Size nmsgs, SharedInvalidationMessage *msgs)
void ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn, TimestampTz commit_time, ReplOriginId origin_id, XLogRecPtr origin_lsn)
int debug_logical_replication_streaming
void ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Size nmsgs, SharedInvalidationMessage *msgs)
#define IsInsertOrUpdate(action)
static void ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
void ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, Snapshot snap, XLogRecPtr lsn, bool transactional, const char *prefix, Size message_size, const char *message)
bool ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
void ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
#define CHANGES_THRESHOLD
static ReorderBufferTXN * ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
static bool ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, char *data)
HeapTuple ReorderBufferAllocTupleBuf(ReorderBuffer *rb, Size tuple_len)
static void AssertTXNLsnOrder(ReorderBuffer *rb)
#define MAX_DISTR_INVAL_MSG_PER_TXN
static void ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn, ReorderBufferChange *change, bool streaming)
static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
static void ReorderBufferCleanupSerializedTXNs(const char *slotname)
ReorderBufferChange * ReorderBufferAllocChange(ReorderBuffer *rb)
void ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
static void SetupCheckXidLive(TransactionId xid)
static bool TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, ReorderBufferTXN *txn, CommandId cid)
static void ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn, int nrelations, Relation *relations, ReorderBufferChange *change, bool streaming)
static void ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn, ReorderBufferChange *change, bool toast_insert)
static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
static void UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
static void ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Size nmsgs, SharedInvalidationMessage *msgs)
static ReorderBufferTXN * ReorderBufferAllocTXN(ReorderBuffer *rb)
static void ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, SharedInvalidationMessage *invalidations)
static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, ReorderBufferTXN *subtxn)
static void ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
static ReorderBufferChange * ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
Oid * ReorderBufferAllocRelids(ReorderBuffer *rb, int nrelids)
static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, ReorderBufferChange *change, ReorderBufferTXN *txn, bool addition, Size sz)
static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, TXNEntryFile *file, XLogSegNo *segno)
void ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid, TransactionId subxid, XLogRecPtr lsn)
void ReorderBufferFree(ReorderBuffer *rb)
static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, XLogSegNo segno)
#define IsSpecConfirmOrAbort(action)
static void ApplyLogicalMappingFile(HTAB *tuplecid_data, const char *fname)
static const Size max_changes_in_memory
void StartupReorderBuffer(void)
void ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
static ReorderBufferTXN * ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, bool *is_new, XLogRecPtr lsn, bool create_as_top)
static void ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn)
ReorderBufferTXN * ReorderBufferGetOldestTXN(ReorderBuffer *rb)
static void ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, XLogRecPtr commit_lsn, volatile Snapshot snapshot_now, volatile CommandId command_id, bool streaming)
#define rbtxn_is_committed(txn)
#define rbtxn_has_streamable_change(txn)
#define rbtxn_has_catalog_changes(txn)
@ DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE
@ DEBUG_LOGICAL_REP_STREAMING_BUFFERED
#define RBTXN_PREPARE_STATUS_MASK
#define rbtxn_is_serialized_clear(txn)
#define RBTXN_IS_STREAMED
#define rbtxn_is_prepared(txn)
#define RBTXN_HAS_PARTIAL_CHANGE
#define rbtxn_is_streamed(txn)
#define RBTXN_SENT_PREPARE
#define rbtxn_is_toptxn(txn)
#define rbtxn_get_toptxn(txn)
#define rbtxn_is_known_subxact(txn)
#define rbtxn_is_subtxn(txn)
#define RBTXN_HAS_CATALOG_CHANGES
#define RBTXN_IS_COMMITTED
#define PG_LOGICAL_MAPPINGS_DIR
#define RBTXN_DISTR_INVAL_OVERFLOWED
#define RBTXN_IS_SERIALIZED_CLEAR
#define rbtxn_sent_prepare(txn)
#define RBTXN_IS_PREPARED
#define rbtxn_distr_inval_overflowed(txn)
#define RBTXN_SKIPPED_PREPARE
#define RBTXN_HAS_STREAMABLE_CHANGE
@ REORDER_BUFFER_CHANGE_INVALIDATION
@ REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM
@ REORDER_BUFFER_CHANGE_INSERT
@ REORDER_BUFFER_CHANGE_MESSAGE
@ REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT
@ REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID
@ REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID
@ REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT
@ REORDER_BUFFER_CHANGE_TRUNCATE
@ REORDER_BUFFER_CHANGE_DELETE
@ REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT
@ REORDER_BUFFER_CHANGE_UPDATE
#define rbtxn_is_aborted(txn)
#define RBTXN_IS_SERIALIZED
#define rbtxn_is_serialized(txn)
#define RBTXN_IS_ABORTED
#define RBTXN_IS_SUBXACT
#define rbtxn_has_partial_change(txn)
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
#define LOGICAL_REWRITE_FORMAT
Definition rewriteheap.h:54
MemoryContext SlabContextCreate(MemoryContext parent, const char *name, Size blockSize, Size chunkSize)
Definition slab.c:322
ReplicationSlot * MyReplicationSlot
Definition slot.c:149
bool ReplicationSlotValidateName(const char *name, bool allow_reserved_name, int elevel)
Definition slot.c:268
#define PG_REPLSLOT_DIR
Definition slot.h:21
void SnapBuildSnapDecRefcount(Snapshot snap)
Definition snapbuild.c:331
bool SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
Definition snapbuild.c:307
SnapBuildState SnapBuildCurrentState(SnapBuild *builder)
Definition snapbuild.c:280
@ SNAPBUILD_CONSISTENT
Definition snapbuild.h:50
void TeardownHistoricSnapshot(bool is_error)
Definition snapmgr.c:1685
void SetupHistoricSnapshot(Snapshot historic_snapshot, HTAB *tuplecids)
Definition snapmgr.c:1669
static HTAB * tuplecid_data
Definition snapmgr.c:163
#define free(a)
bool attisdropped
Definition tupdesc.h:78
Definition dirent.c:26
Size keysize
Definition hsearch.h:75
ItemPointerData t_self
Definition htup.h:65
uint32 t_len
Definition htup.h:64
HeapTupleHeader t_data
Definition htup.h:68
Oid t_tableOid
Definition htup.h:66
Definition pg_list.h:54
XLogReaderState * reader
Definition logical.h:42
struct SnapBuild * snapshot_builder
Definition logical.h:44
ItemPointerData new_tid
Definition rewriteheap.h:40
RelFileLocator old_locator
Definition rewriteheap.h:37
ItemPointerData old_tid
Definition rewriteheap.h:39
RelFileLocator new_locator
Definition rewriteheap.h:38
RelFileNumber relNumber
Form_pg_class rd_rel
Definition rel.h:111
ReorderBufferChangeType action
struct ReorderBufferChange::@117::@119 truncate
struct ReorderBufferChange::@117::@121 tuplecid
RelFileLocator rlocator
ItemPointerData tid
union ReorderBufferChange::@117 data
struct ReorderBufferChange::@117::@122 inval
struct ReorderBufferChange::@117::@120 msg
struct ReorderBufferTXN * txn
RelFileLocator locator
ReplOriginId origin_id
SharedInvalidationMessage * invalidations
struct ReorderBufferChange::@117::@118 tp
ReorderBufferChange change
ReorderBufferChange * change
ReorderBufferTXN * txn
ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]
ReorderBufferTXN * txn
XLogRecPtr restart_decoding_lsn
pairingheap_node txn_node
TimestampTz commit_time
XLogRecPtr base_snapshot_lsn
TransactionId toplevel_xid
dlist_node catchange_node
SharedInvalidationMessage * invalidations
dlist_head tuplecids
XLogRecPtr first_lsn
TimestampTz abort_time
XLogRecPtr final_lsn
void * output_plugin_private
uint32 ninvalidations_distributed
XLogRecPtr origin_lsn
TimestampTz prepare_time
TransactionId xid
ReplOriginId origin_id
dlist_node base_snapshot_node
SharedInvalidationMessage * invalidations_distributed
ReorderBufferTupleCidKey key
dlist_head txns_by_base_snapshot_lsn
MemoryContext context
dclist_head catchange_txns
MemoryContext change_context
ReorderBufferTXN * by_txn_last_txn
TransactionId by_txn_last_xid
MemoryContext tup_context
dlist_head toplevel_by_lsn
pairingheap * txn_heap
MemoryContext txn_context
XLogRecPtr current_restart_decoding_lsn
ReplicationSlotPersistentData data
Definition slot.h:213
char fname[MAXPGPATH]
TransactionId xmin
Definition snapshot.h:153
int32 subxcnt
Definition snapshot.h:177
CommandId curcid
Definition snapshot.h:183
uint32 xcnt
Definition snapshot.h:165
TransactionId * subxip
Definition snapshot.h:176
XLogRecPtr EndRecPtr
Definition xlogreader.h:206
XLogRecPtr ReadRecPtr
Definition xlogreader.h:205
dlist_node * cur
Definition ilist.h:179
dlist_node * cur
Definition ilist.h:200
int32 va_rawsize
Definition varatt.h:34
Definition c.h:778
bool TransactionIdDidCommit(TransactionId transactionId)
Definition transam.c:126
#define InvalidTransactionId
Definition transam.h:31
#define TransactionIdEquals(id1, id2)
Definition transam.h:43
#define TransactionIdIsValid(xid)
Definition transam.h:41
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition transam.h:263
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:193
#define VARHDRSZ_SHORT
Definition varatt.h:278
static bool VARATT_IS_SHORT(const void *PTR)
Definition varatt.h:403
static void SET_VARSIZE_COMPRESSED(void *PTR, Size len)
Definition varatt.h:446
static bool VARATT_IS_EXTENDED(const void *PTR)
Definition varatt.h:410
static bool VARATT_IS_EXTERNAL(const void *PTR)
Definition varatt.h:354
static char * VARDATA_EXTERNAL(const void *PTR)
Definition varatt.h:340
static Size VARSIZE(const void *PTR)
Definition varatt.h:298
static char * VARDATA(const void *PTR)
Definition varatt.h:305
static void SET_VARTAG_EXTERNAL(void *PTR, vartag_external tag)
Definition varatt.h:453
static Size VARATT_EXTERNAL_GET_EXTSIZE(varatt_external toast_pointer)
Definition varatt.h:507
@ VARTAG_INDIRECT
Definition varatt.h:86
static void SET_VARSIZE(void *PTR, Size len)
Definition varatt.h:432
static bool VARATT_EXTERNAL_IS_COMPRESSED(varatt_external toast_pointer)
Definition varatt.h:536
static Size VARSIZE_SHORT(const void *PTR)
Definition varatt.h:312
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
#define lstat(path, sb)
Definition win32_port.h:275
#define S_ISDIR(m)
Definition win32_port.h:315
bool IsTransactionOrTransactionBlock(void)
Definition xact.c:5012
void BeginInternalSubTransaction(const char *name)
Definition xact.c:4717
TransactionId CheckXidAlive
Definition xact.c:101
void RollbackAndReleaseCurrentSubTransaction(void)
Definition xact.c:4819
void StartTransactionCommand(void)
Definition xact.c:3081
TransactionId GetCurrentTransactionIdIfAny(void)
Definition xact.c:473
TransactionId GetCurrentTransactionId(void)
Definition xact.c:456
void AbortCurrentTransaction(void)
Definition xact.c:3473
int xidComparator(const void *arg1, const void *arg2)
Definition xid.c:152
int wal_segment_size
Definition xlog.c:147
#define XLogSegNoOffsetToRecPtr(segno, offset, wal_segsz_bytes, dest)
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
#define LSN_FORMAT_ARGS(lsn)
Definition xlogdefs.h:47
uint16 ReplOriginId
Definition xlogdefs.h:69
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
uint64 XLogSegNo
Definition xlogdefs.h:52