1 /*-------------------------------------------------------------------------
2  *
3  * reorderbuffer.c
4  * PostgreSQL logical replay/reorder buffer management
5  *
6  *
7  * Copyright (c) 2012-2022, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/logical/reorderbuffer.c
12  *
13  * NOTES
14  * This module gets handed individual pieces of transactions in the order
15  * they are written to the WAL and is responsible for reassembling them into
16  * toplevel-transaction-sized pieces. When a transaction is completely
17  * reassembled - signaled by reading the transaction commit record - it
18  * will then call the output plugin (cf. ReorderBufferCommit()) with the
19  * individual changes. The output plugins rely on snapshots built by
20  * snapbuild.c which hands them to us.
21  *
22  * Transactions and subtransactions/savepoints in postgres are not
23  * immediately linked to each other from outside the performing
24  * backend. Only at commit/abort (or in special xact_assignment records) are
25  * they linked together, which means that we have to splice together a
26  * toplevel transaction from its subtransactions. To do that efficiently we
27  * build a binary heap indexed by the smallest current lsn of the individual
28  * subtransactions' changestreams. As the individual streams are inherently
29  * ordered by LSN - since that is where we build them from - the transaction
30  * can easily be reassembled by always using the subtransaction with the
31  * smallest current LSN from the heap.
32  *
33  * In order to cope with large transactions - which can be several times as
34  * big as the available memory - this module supports spooling the contents
35  * of large transactions to disk. When the transaction is replayed, the
36  * contents of individual (sub-)transactions will be read from disk in
37  * chunks.
38  *
39  * This module also has to deal with reassembling toast records from the
40  * individual chunks stored in WAL. When a new (or initial) version of a
41  * tuple is stored in WAL it will always be preceded by the toast chunks
42  * emitted for the columns stored out of line. Within a single toplevel
43  * transaction there will be no other data carrying records between a row's
44  * toast chunks and the row data itself. See ReorderBufferToast* for
45  * details.
46  *
47  * ReorderBuffer uses two special memory context types - SlabContext for
48  * allocations of fixed-length structures (changes and transactions), and
49  * GenerationContext for the variable-length transaction data (allocated
50  * and freed in groups with similar lifespans).
51  *
52  * To limit the amount of memory used by decoded changes, we track memory
53  * used at the reorder buffer level (i.e. total amount of memory), and for
54  * each transaction. When the total amount of used memory exceeds the
55  * limit, the transaction consuming the most memory is then serialized to
56  * disk.
57  *
58  * Only decoded changes are evicted from memory (spilled to disk), not the
59  * transaction records. The number of toplevel transactions is limited,
60  * but a transaction with many subtransactions may still consume significant
61  * amounts of memory. However, the transaction records are fairly small and
62  * are not included in the memory limit.
63  *
64  * The current eviction algorithm is very simple - the transaction is
65  * picked merely by size, while it might be useful to also consider age
66  * (LSN) of the changes for example. With the new Generational memory
67  * allocator, evicting the oldest changes would make it more likely the
68  * memory gets actually freed.
69  *
70  * We still rely on max_changes_in_memory when loading serialized changes
71  * back into memory. At that point we can't use the memory limit directly
72  * as we load the subxacts independently. One option to deal with this
73  * would be to count the subxacts, and allow each to allocate 1/N of the
74  * memory limit. That however does not seem very appealing, because with
75  * many subtransactions it may easily cause thrashing (short cycles of
76  * deserializing and applying very few changes). We probably should give
77  * a bit more memory to the oldest subtransactions, because it's likely
78  * they are the source for the next sequence of changes.
79  *
80  * -------------------------------------------------------------------------
81  */
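/*
 * Illustrative sketch of the k-way merge described in the NOTES above,
 * assuming per-(sub)transaction change streams already sorted by LSN. The
 * real implementation further down uses a binary heap (lib/binaryheap.h);
 * this standalone sketch scans linearly for brevity, and every name in it
 * is hypothetical rather than part of this module.
 */
typedef struct SketchStream
{
	const unsigned long long *lsns; /* one (sub)xact's change LSNs, ascending */
	int			nlsns;			/* number of changes in this stream */
	int			pos;			/* next unread change */
} SketchStream;

/* Return the smallest unread LSN across all streams, or 0 once drained. */
static unsigned long long
sketch_merge_next(SketchStream *streams, int nstreams)
{
	int			best = -1;

	for (int i = 0; i < nstreams; i++)
	{
		if (streams[i].pos >= streams[i].nlsns)
			continue;			/* this stream is exhausted */
		if (best < 0 ||
			streams[i].lsns[streams[i].pos] <
			streams[best].lsns[streams[best].pos])
			best = i;
	}

	return (best < 0) ? 0 : streams[best].lsns[streams[best].pos++];
}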
82 #include "postgres.h"
83 
84 #include <unistd.h>
85 #include <sys/stat.h>
86 
87 #include "access/detoast.h"
88 #include "access/heapam.h"
89 #include "access/rewriteheap.h"
90 #include "access/transam.h"
91 #include "access/xact.h"
92 #include "access/xlog_internal.h"
93 #include "catalog/catalog.h"
94 #include "lib/binaryheap.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "replication/logical.h"
98 #include "replication/reorderbuffer.h"
99 #include "replication/slot.h"
100 #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
101 #include "storage/bufmgr.h"
102 #include "storage/fd.h"
103 #include "storage/sinval.h"
104 #include "utils/builtins.h"
105 #include "utils/combocid.h"
106 #include "utils/memdebug.h"
107 #include "utils/memutils.h"
108 #include "utils/rel.h"
109 #include "utils/relfilenodemap.h"
110 
111 
112 /* entry for a hash table we use to map from xid to our transaction state */
113 typedef struct ReorderBufferTXNByIdEnt
114 {
115  TransactionId xid;
116  ReorderBufferTXN *txn;
117 } ReorderBufferTXNByIdEnt;
118 
119 /* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */
120 typedef struct ReorderBufferTupleCidKey
121 {
122  RelFileNode relnode;
123  ItemPointerData tid;
124 } ReorderBufferTupleCidKey;
125 
126 typedef struct ReorderBufferTupleCidEnt
127 {
128  ReorderBufferTupleCidKey key;
129  CommandId cmin;
130  CommandId cmax;
131  CommandId combocid; /* just for debugging */
132 } ReorderBufferTupleCidEnt;
133 
134 /* Virtual file descriptor with file offset tracking */
135 typedef struct TXNEntryFile
136 {
137  File vfd; /* -1 when the file is closed */
138  off_t curOffset; /* offset for next write or read. Reset to 0
139  * when vfd is opened. */
140 } TXNEntryFile;
141 
142 /* k-way in-order change iteration support structures */
143 typedef struct ReorderBufferIterTXNEntry
144 {
145  XLogRecPtr lsn;
146  ReorderBufferChange *change;
147  ReorderBufferTXN *txn;
148  TXNEntryFile file;
149  XLogSegNo segno;
150 } ReorderBufferIterTXNEntry;
151 
152 typedef struct ReorderBufferIterTXNState
153 {
154  binaryheap *heap;
155  Size nr_txns;
156  dlist_head old_change;
157  ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
158 } ReorderBufferIterTXNState;
159 
160 /* toast datastructures */
161 typedef struct ReorderBufferToastEnt
162 {
163  Oid chunk_id; /* toast_table.chunk_id */
164  int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
165  * have seen */
166  Size num_chunks; /* number of chunks we've already seen */
167  Size size; /* combined size of chunks seen */
168  dlist_head chunks; /* linked list of chunks */
169  struct varlena *reconstructed; /* reconstructed varlena now pointed to in
170  * main tup */
171 } ReorderBufferToastEnt;
172 
173 /* Disk serialization support datastructures */
174 typedef struct ReorderBufferDiskChange
175 {
176  Size size;
177  ReorderBufferChange change;
178  /* data follows */
179 } ReorderBufferDiskChange;
180 
181 #define IsSpecInsert(action) \
182 ( \
183  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
184 )
185 #define IsSpecConfirmOrAbort(action) \
186 ( \
187  (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
188  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
189 )
190 #define IsInsertOrUpdate(action) \
191 ( \
192  (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
193  ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
194  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
195 )
196 
197 /*
198  * Maximum number of changes kept in memory, per transaction. After that,
199  * changes are spooled to disk.
200  *
201  * The current value should be sufficient to decode the entire transaction
202  * without hitting disk in OLTP workloads, while starting to spool to disk in
203  * other workloads reasonably fast.
204  *
205  * At some point in the future it probably makes sense to have a more elaborate
206  * resource management here, but it's not entirely clear what that would look
207  * like.
208  */
209 int logical_decoding_work_mem;
210 static const Size max_changes_in_memory = 4096; /* XXX for restore only */
211 
212 /* ---------------------------------------
213  * primary reorderbuffer support routines
214  * ---------------------------------------
215  */
216 static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
217 static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
218 static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
219  TransactionId xid, bool create, bool *is_new,
220  XLogRecPtr lsn, bool create_as_top);
221 static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
222  ReorderBufferTXN *subtxn);
223 
224 static void AssertTXNLsnOrder(ReorderBuffer *rb);
225 
226 /* ---------------------------------------
227  * support functions for lsn-order iterating over the ->changes of a
228  * transaction and its subtransactions
229  *
230  * used for iteration over the k-way heap merge of a transaction and its
231  * subtransactions
232  * ---------------------------------------
233  */
234 static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
235  ReorderBufferIterTXNState *volatile *iter_state);
236 static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
237 static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
238  ReorderBufferIterTXNState *state);
239 static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
240 
241 /*
242  * ---------------------------------------
243  * Disk serialization support functions
244  * ---------------------------------------
245  */
246 static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
247 static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
248 static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
249  int fd, ReorderBufferChange *change);
250 static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
251  TXNEntryFile *file, XLogSegNo *segno);
252 static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
253  char *change);
254 static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
255 static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
256  bool txn_prepared);
257 static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
258 static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
259  TransactionId xid, XLogSegNo segno);
260 
261 static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
262 static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
263  ReorderBufferTXN *txn, CommandId cid);
264 
265 /*
266  * ---------------------------------------
267  * Streaming support functions
268  * ---------------------------------------
269  */
270 static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
271 static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
272 static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
273 static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
274 
275 /* ---------------------------------------
276  * toast reassembly support
277  * ---------------------------------------
278  */
279 static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
280 static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
281 static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
282  Relation relation, ReorderBufferChange *change);
283 static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
284  Relation relation, ReorderBufferChange *change);
285 
286 /*
287  * ---------------------------------------
288  * memory accounting
289  * ---------------------------------------
290  */
291 static Size ReorderBufferChangeSize(ReorderBufferChange *change);
292 static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
293  ReorderBufferChange *change,
294  bool addition, Size sz);
295 
296 /*
297  * Allocate a new ReorderBuffer and clean out any old serialized state from
298  * prior ReorderBuffer instances for the same slot.
299  */
300 ReorderBuffer *
301 ReorderBufferAllocate(void)
302 {
303  ReorderBuffer *buffer;
304  HASHCTL hash_ctl;
305  MemoryContext new_ctx;
306 
307  Assert(MyReplicationSlot != NULL);
308 
309  /* allocate memory in own context, to have better accountability */
310  new_ctx = AllocSetContextCreate(CurrentMemoryContext,
311  "ReorderBuffer",
312  ALLOCSET_DEFAULT_SIZES);
313 
314  buffer =
315  (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
316 
317  memset(&hash_ctl, 0, sizeof(hash_ctl));
318 
319  buffer->context = new_ctx;
320 
321  buffer->change_context = SlabContextCreate(new_ctx,
322  "Change",
323  SLAB_DEFAULT_BLOCK_SIZE,
324  sizeof(ReorderBufferChange));
325 
326  buffer->txn_context = SlabContextCreate(new_ctx,
327  "TXN",
328  SLAB_DEFAULT_BLOCK_SIZE,
329  sizeof(ReorderBufferTXN));
330 
331  /*
332  * XXX the allocation sizes used below pre-date generation context's block
333  * growing code. These values should likely be benchmarked and set to
334  * more suitable values.
335  */
336  buffer->tup_context = GenerationContextCreate(new_ctx,
337  "Tuples",
338  SLAB_LARGE_BLOCK_SIZE,
339  SLAB_LARGE_BLOCK_SIZE,
340  SLAB_LARGE_BLOCK_SIZE);
341 
342  hash_ctl.keysize = sizeof(TransactionId);
343  hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
344  hash_ctl.hcxt = buffer->context;
345 
346  buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
347  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
348 
349  buffer->by_txn_last_xid = InvalidTransactionId;
350  buffer->by_txn_last_txn = NULL;
351 
352  buffer->outbuf = NULL;
353  buffer->outbufsize = 0;
354  buffer->size = 0;
355 
356  buffer->spillTxns = 0;
357  buffer->spillCount = 0;
358  buffer->spillBytes = 0;
359  buffer->streamTxns = 0;
360  buffer->streamCount = 0;
361  buffer->streamBytes = 0;
362  buffer->totalTxns = 0;
363  buffer->totalBytes = 0;
364 
365  buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
366 
367  dlist_init(&buffer->toplevel_by_lsn);
368  dlist_init(&buffer->txns_by_base_snapshot_lsn);
369 
370  /*
371  * Ensure there's no stale data from prior uses of this slot, in case some
372  * prior exit avoided calling ReorderBufferFree. Failure to do this can
373  * produce duplicated txns, and it's very cheap if there's nothing there.
374  */
375  ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
376 
377  return buffer;
378 }
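/*
 * For orientation, the memory context hierarchy created above looks like:
 *
 *	ReorderBuffer (aset)     - the rb struct, hash tables, misc allocations
 *	  Change (slab)          - fixed-size ReorderBufferChange entries
 *	  TXN (slab)             - fixed-size ReorderBufferTXN entries
 *	  Tuples (generation)    - variable-size tuple data
 *
 * Deleting the parent context in ReorderBufferFree() below releases all of
 * them at once.
 */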
379 
380 /*
381  * Free a ReorderBuffer
382  */
383 void
384 ReorderBufferFree(ReorderBuffer *rb)
385 {
386  MemoryContext context = rb->context;
387 
388  /*
389  * We free separately allocated data by entirely scrapping reorderbuffer's
390  * memory context.
391  */
392  MemoryContextDelete(context);
393 
394  /* Free disk space used by unconsumed reorder buffers */
395  ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
396 }
397 
398 /*
399  * Get an unused, possibly preallocated, ReorderBufferTXN.
400  */
401 static ReorderBufferTXN *
402 ReorderBufferGetTXN(ReorderBuffer *rb)
403 {
404  ReorderBufferTXN *txn;
405 
406  txn = (ReorderBufferTXN *)
407  MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
408 
409  memset(txn, 0, sizeof(ReorderBufferTXN));
410 
411  dlist_init(&txn->changes);
412  dlist_init(&txn->tuplecids);
413  dlist_init(&txn->subtxns);
414 
415  /* InvalidCommandId is not zero, so set it explicitly */
416  txn->command_id = InvalidCommandId;
417  txn->output_plugin_private = NULL;
418 
419  return txn;
420 }
421 
422 /*
423  * Free a ReorderBufferTXN.
424  */
425 static void
426 ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
427 {
428  /* clean the lookup cache if we were cached (quite likely) */
429  if (rb->by_txn_last_xid == txn->xid)
430  {
431  rb->by_txn_last_xid = InvalidTransactionId;
432  rb->by_txn_last_txn = NULL;
433  }
434 
435  /* free data that's contained */
436 
437  if (txn->gid != NULL)
438  {
439  pfree(txn->gid);
440  txn->gid = NULL;
441  }
442 
443  if (txn->tuplecid_hash != NULL)
444  {
445  hash_destroy(txn->tuplecid_hash);
446  txn->tuplecid_hash = NULL;
447  }
448 
449  if (txn->invalidations)
450  {
451  pfree(txn->invalidations);
452  txn->invalidations = NULL;
453  }
454 
455  /* Reset the toast hash */
456  ReorderBufferToastReset(rb, txn);
457 
458  pfree(txn);
459 }
460 
461 /*
462  * Get a fresh ReorderBufferChange.
463  */
464 ReorderBufferChange *
465 ReorderBufferGetChange(ReorderBuffer *rb)
466 {
467  ReorderBufferChange *change;
468 
469  change = (ReorderBufferChange *)
470  MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
471 
472  memset(change, 0, sizeof(ReorderBufferChange));
473  return change;
474 }
475 
476 /*
477  * Free a ReorderBufferChange and update memory accounting, if requested.
478  */
479 void
480 ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change,
481  bool upd_mem)
482 {
483  /* update memory accounting info */
484  if (upd_mem)
485  ReorderBufferChangeMemoryUpdate(rb, change, false,
486  ReorderBufferChangeSize(change));
487 
488  /* free contained data */
489  switch (change->action)
490  {
491  case REORDER_BUFFER_CHANGE_INSERT:
492  case REORDER_BUFFER_CHANGE_UPDATE:
493  case REORDER_BUFFER_CHANGE_DELETE:
494  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
495  if (change->data.tp.newtuple)
496  {
497  ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple);
498  change->data.tp.newtuple = NULL;
499  }
500 
501  if (change->data.tp.oldtuple)
502  {
503  ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple);
504  change->data.tp.oldtuple = NULL;
505  }
506  break;
507  case REORDER_BUFFER_CHANGE_MESSAGE:
508  if (change->data.msg.prefix != NULL)
509  pfree(change->data.msg.prefix);
510  change->data.msg.prefix = NULL;
511  if (change->data.msg.message != NULL)
512  pfree(change->data.msg.message);
513  change->data.msg.message = NULL;
514  break;
515  case REORDER_BUFFER_CHANGE_INVALIDATION:
516  if (change->data.inval.invalidations)
517  pfree(change->data.inval.invalidations);
518  change->data.inval.invalidations = NULL;
519  break;
520  case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
521  if (change->data.snapshot)
522  {
523  ReorderBufferFreeSnap(rb, change->data.snapshot);
524  change->data.snapshot = NULL;
525  }
526  break;
527  /* no data in addition to the struct itself */
528  case REORDER_BUFFER_CHANGE_TRUNCATE:
529  if (change->data.truncate.relids != NULL)
530  {
531  ReorderBufferReturnRelids(rb, change->data.truncate.relids);
532  change->data.truncate.relids = NULL;
533  }
534  break;
535  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
536  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
537  case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
538  case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
539  break;
540  }
541 
542  pfree(change);
543 }
544 
545 /*
546  * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size
547  * tuple_len (excluding header overhead).
548  */
549 ReorderBufferTupleBuf *
550 ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
551 {
552  ReorderBufferTupleBuf *tuple;
553  Size alloc_len;
554 
555  alloc_len = tuple_len + SizeofHeapTupleHeader;
556 
557  tuple = (ReorderBufferTupleBuf *)
558  MemoryContextAlloc(rb->tup_context,
559  sizeof(ReorderBufferTupleBuf) +
560  MAXIMUM_ALIGNOF + alloc_len);
561  tuple->alloc_tuple_size = alloc_len;
562  tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
563 
564  return tuple;
565 }
566 
567 /*
568  * Free a ReorderBufferTupleBuf.
569  */
570 void
571 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
572 {
573  pfree(tuple);
574 }
575 
576 /*
577  * Get an array for relids of truncated relations.
578  *
579  * We use the global memory context (for the whole reorder buffer), because
580  * none of the existing ones seems like a good match (some are SLAB, so we
581  * can't use those, and tup_context is meant for tuple data, not relids). We
582  * could add yet another context, but it seems like overkill - TRUNCATE is
583  * not a particularly common operation, so it does not seem worth it.
584  */
585 Oid *
586 ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids)
587 {
588  Oid *relids;
589  Size alloc_len;
590 
591  alloc_len = sizeof(Oid) * nrelids;
592 
593  relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
594 
595  return relids;
596 }
597 
598 /*
599  * Free an array of relids.
600  */
601 void
602 ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
603 {
604  pfree(relids);
605 }
606 
607 /*
608  * Return the ReorderBufferTXN from the given buffer, specified by Xid.
609  * If create is true, and a transaction doesn't already exist, create it
610  * (with the given LSN, and as top transaction if that's specified);
611  * when this happens, is_new is set to true.
612  */
613 static ReorderBufferTXN *
614 ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
615  bool *is_new, XLogRecPtr lsn, bool create_as_top)
616 {
617  ReorderBufferTXN *txn;
618  ReorderBufferTXNByIdEnt *ent;
619  bool found;
620 
621  Assert(TransactionIdIsValid(xid));
622 
623  /*
624  * Check the one-entry lookup cache first
625  */
626  if (TransactionIdIsValid(rb->by_txn_last_xid) &&
627  rb->by_txn_last_xid == xid)
628  {
629  txn = rb->by_txn_last_txn;
630 
631  if (txn != NULL)
632  {
633  /* found it, and it's valid */
634  if (is_new)
635  *is_new = false;
636  return txn;
637  }
638 
639  /*
640  * cached as non-existent, and asked not to create? Then nothing else
641  * to do.
642  */
643  if (!create)
644  return NULL;
645  /* otherwise fall through to create it */
646  }
647 
648  /*
649  * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
650  * create an entry.
651  */
652 
653  /* search the lookup table */
654  ent = (ReorderBufferTXNByIdEnt *)
655  hash_search(rb->by_txn,
656  (void *) &xid,
657  create ? HASH_ENTER : HASH_FIND,
658  &found);
659  if (found)
660  txn = ent->txn;
661  else if (create)
662  {
663  /* initialize the new entry, if creation was requested */
664  Assert(ent != NULL);
665  Assert(lsn != InvalidXLogRecPtr);
666 
667  ent->txn = ReorderBufferGetTXN(rb);
668  ent->txn->xid = xid;
669  txn = ent->txn;
670  txn->first_lsn = lsn;
671  txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
672 
673  if (create_as_top)
674  {
675  dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
676  AssertTXNLsnOrder(rb);
677  }
678  }
679  else
680  txn = NULL; /* not found and not asked to create */
681 
682  /* update cache */
683  rb->by_txn_last_xid = xid;
684  rb->by_txn_last_txn = txn;
685 
686  if (is_new)
687  *is_new = !found;
688 
689  Assert(!create || txn != NULL);
690  return txn;
691 }
692 
693 /*
694  * Record the partial change for the streaming of in-progress transactions. We
695  * can stream only complete changes so if we have a partial change like toast
696  * table insert or speculative insert then we mark such a 'txn' so that it
697  * can't be streamed. We also ensure that if the changes in such a 'txn' are
698  * above the logical_decoding_work_mem threshold, we stream them as soon as we
699  * have a complete change.
700  */
701 static void
702 ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
703  ReorderBufferChange *change,
704  bool toast_insert)
705 {
706  ReorderBufferTXN *toptxn;
707 
708  /*
709  * The partial changes need to be processed only while streaming
710  * in-progress transactions.
711  */
712  if (!ReorderBufferCanStream(rb))
713  return;
714 
715  /* Get the top transaction. */
716  if (txn->toptxn != NULL)
717  toptxn = txn->toptxn;
718  else
719  toptxn = txn;
720 
721  /*
722  * Indicate a partial change for toast inserts. The change will be
723  * considered as complete once we get the insert or update on the main
724  * table and we are sure that the pending toast chunks are not required
725  * anymore.
726  *
727  * If we allow streaming when there are pending toast chunks then such
728  * chunks won't be released till the insert (multi_insert) is complete and
729  * we expect the txn to have streamed all changes after streaming. This
730  * restriction is mainly to ensure the correctness of streamed
731  * transactions and it doesn't seem worth uplifting such a restriction
732  * just to allow this case because anyway we will stream the transaction
733  * once such an insert is complete.
734  */
735  if (toast_insert)
736  toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
737  else if (rbtxn_has_partial_change(toptxn) &&
738  IsInsertOrUpdate(change->action) &&
739  change->data.tp.clear_toast_afterwards)
740  toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
741 
742  /*
743  * Indicate a partial change for speculative inserts. The change will be
744  * considered as complete once we get the speculative confirm or abort
745  * token.
746  */
747  if (IsSpecInsert(change->action))
748  toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
749  else if (rbtxn_has_partial_change(toptxn) &&
750  IsSpecConfirmOrAbort(change->action))
751  toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
752 
753  /*
754  * Stream the transaction if it is serialized before and the changes are
755  * now complete in the top-level transaction.
756  *
757  * The reason for doing the streaming of such a transaction as soon as we
758  * get the complete change for it is that previously it would have reached
759  * the memory threshold and wouldn't get streamed because of incomplete
760  * changes. Delaying such transactions would increase apply lag for them.
761  */
762  if (ReorderBufferCanStartStreaming(rb) &&
763  !(rbtxn_has_partial_change(toptxn)) &&
764  rbtxn_is_serialized(txn))
765  ReorderBufferStreamTXN(rb, toptxn);
766 }
767 
768 /*
769  * Queue a change into a transaction so it can be replayed upon commit, or
770  * streamed when we reach the logical_decoding_work_mem threshold.
771  */
772 void
773 ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
774  ReorderBufferChange *change, bool toast_insert)
775 {
776  ReorderBufferTXN *txn;
777 
778  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
779 
780  /*
781  * While streaming the previous changes we have detected that the
782  * transaction is aborted. So there is no point in collecting further
783  * changes for it.
784  */
785  if (txn->concurrent_abort)
786  {
787  /*
788  * We don't need to update memory accounting for this change as we
789  * have not added it to the queue yet.
790  */
791  ReorderBufferReturnChange(rb, change, false);
792  return;
793  }
794 
795  change->lsn = lsn;
796  change->txn = txn;
797 
798  Assert(InvalidXLogRecPtr != lsn);
799  dlist_push_tail(&txn->changes, &change->node);
800  txn->nentries++;
801  txn->nentries_mem++;
802 
803  /* update memory accounting information */
804  ReorderBufferChangeMemoryUpdate(rb, change, true,
805  ReorderBufferChangeSize(change));
806 
807  /* process partial change */
808  ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
809 
810  /* check the memory limits and evict something if needed */
811  ReorderBufferCheckMemoryLimit(rb);
812 }
813 
814 /*
815  * A transactional message is queued to be processed upon commit and a
816  * non-transactional message gets processed immediately.
817  */
818 void
819 ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
820  Snapshot snapshot, XLogRecPtr lsn,
821  bool transactional, const char *prefix,
822  Size message_size, const char *message)
823 {
824  if (transactional)
825  {
826  MemoryContext oldcontext;
827  ReorderBufferChange *change;
828 
829  Assert(xid != InvalidTransactionId);
830 
831  oldcontext = MemoryContextSwitchTo(rb->context);
832 
833  change = ReorderBufferGetChange(rb);
834  change->action = REORDER_BUFFER_CHANGE_MESSAGE;
835  change->data.msg.prefix = pstrdup(prefix);
836  change->data.msg.message_size = message_size;
837  change->data.msg.message = palloc(message_size);
838  memcpy(change->data.msg.message, message, message_size);
839 
840  ReorderBufferQueueChange(rb, xid, lsn, change, false);
841 
842  MemoryContextSwitchTo(oldcontext);
843  }
844  else
845  {
846  ReorderBufferTXN *txn = NULL;
847  volatile Snapshot snapshot_now = snapshot;
848 
849  if (xid != InvalidTransactionId)
850  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
851 
852  /* setup snapshot to allow catalog access */
853  SetupHistoricSnapshot(snapshot_now, NULL);
854  PG_TRY();
855  {
856  rb->message(rb, txn, lsn, false, prefix, message_size, message);
857 
858  TeardownHistoricSnapshot(false);
859  }
860  PG_CATCH();
861  {
862  TeardownHistoricSnapshot(true);
863  PG_RE_THROW();
864  }
865  PG_END_TRY();
866  }
867 }
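/*
 * For context, a sketch of the emitting side of the messages handled above:
 * logical messages normally enter WAL via LogLogicalMessage() (see
 * replication/logical/message.c), e.g. through the pg_logical_emit_message()
 * SQL function. The wrapper below is purely illustrative and not part of
 * this module.
 */
#ifdef REORDERBUFFER_MESSAGE_EXAMPLE	/* illustration only */
#include "replication/message.h"

static XLogRecPtr
emit_example_message(bool transactional)
{
	static const char payload[] = "payload bytes";

	/*
	 * transactional = true means the decoded message is queued above and
	 * delivered at commit; false means it is handed to the output plugin
	 * immediately, with a historic snapshot set up for catalog access.
	 */
	return LogLogicalMessage("example-prefix", payload, sizeof(payload),
							 transactional);
}
#endif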
868 
869 /*
870  * AssertTXNLsnOrder
871  * Verify LSN ordering of transaction lists in the reorderbuffer
872  *
873  * Other LSN-related invariants are checked too.
874  *
875  * No-op if assertions are not in use.
876  */
877 static void
878 AssertTXNLsnOrder(ReorderBuffer *rb)
879 {
880 #ifdef USE_ASSERT_CHECKING
881  dlist_iter iter;
882  XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
883  XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
884 
885  dlist_foreach(iter, &rb->toplevel_by_lsn)
886  {
887  ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
888  iter.cur);
889 
890  /* start LSN must be set */
891  Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
892 
893  /* If there is an end LSN, it must be higher than start LSN */
894  if (cur_txn->end_lsn != InvalidXLogRecPtr)
895  Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
896 
897  /* Current initial LSN must be strictly higher than previous */
898  if (prev_first_lsn != InvalidXLogRecPtr)
899  Assert(prev_first_lsn < cur_txn->first_lsn);
900 
901  /* known-as-subtxn txns must not be listed */
902  Assert(!rbtxn_is_known_subxact(cur_txn));
903 
904  prev_first_lsn = cur_txn->first_lsn;
905  }
906 
907  dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
908  {
909  ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
910  base_snapshot_node,
911  iter.cur);
912 
913  /* base snapshot (and its LSN) must be set */
914  Assert(cur_txn->base_snapshot != NULL);
915  Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
916 
917  /* current LSN must be strictly higher than previous */
918  if (prev_base_snap_lsn != InvalidXLogRecPtr)
919  Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
920 
921  /* known-as-subtxn txns must not be listed */
922  Assert(!rbtxn_is_known_subxact(cur_txn));
923 
924  prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
925  }
926 #endif
927 }
928 
929 /*
930  * AssertChangeLsnOrder
931  *
932  * Check ordering of changes in the (sub)transaction.
933  */
934 static void
935 AssertChangeLsnOrder(ReorderBufferTXN *txn)
936 {
937 #ifdef USE_ASSERT_CHECKING
938  dlist_iter iter;
939  XLogRecPtr prev_lsn = txn->first_lsn;
940 
941  dlist_foreach(iter, &txn->changes)
942  {
943  ReorderBufferChange *cur_change;
944 
945  cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
946 
948  Assert(cur_change->lsn != InvalidXLogRecPtr);
949  Assert(txn->first_lsn <= cur_change->lsn);
950 
951  if (txn->end_lsn != InvalidXLogRecPtr)
952  Assert(cur_change->lsn <= txn->end_lsn);
953 
954  Assert(prev_lsn <= cur_change->lsn);
955 
956  prev_lsn = cur_change->lsn;
957  }
958 #endif
959 }
960 
961 /*
962  * ReorderBufferGetOldestTXN
963  * Return oldest transaction in reorderbuffer
964  */
965 ReorderBufferTXN *
966 ReorderBufferGetOldestTXN(ReorderBuffer *rb)
967 {
968  ReorderBufferTXN *txn;
969 
970  AssertTXNLsnOrder(rb);
971 
972  if (dlist_is_empty(&rb->toplevel_by_lsn))
973  return NULL;
974 
975  txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
976 
977  Assert(txn->first_lsn != InvalidXLogRecPtr);
978  Assert(txn->restart_decoding_lsn != InvalidXLogRecPtr);
979  return txn;
980 }
981 
982 /*
983  * ReorderBufferGetOldestXmin
984  * Return oldest Xmin in reorderbuffer
985  *
986  * Returns oldest possibly running Xid from the point of view of snapshots
987  * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
988  * there are none.
989  *
990  * Since snapshots are assigned monotonically, this equals the Xmin of the
991  * base snapshot with minimal base_snapshot_lsn.
992  */
993 TransactionId
994 ReorderBufferGetOldestXmin(ReorderBuffer *rb)
995 {
996  ReorderBufferTXN *txn;
997 
998  AssertTXNLsnOrder(rb);
999 
1000  if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1001  return InvalidTransactionId;
1002 
1003  txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1004  &rb->txns_by_base_snapshot_lsn);
1005  return txn->base_snapshot->xmin;
1006 }
1007 
1008 void
1009 ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1010 {
1011  rb->current_restart_decoding_lsn = ptr;
1012 }
1013 
1014 /*
1015  * ReorderBufferAssignChild
1016  *
1017  * Make note that we know that subxid is a subtransaction of xid, seen as of
1018  * the given lsn.
1019  */
1020 void
1021 ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1022  TransactionId subxid, XLogRecPtr lsn)
1023 {
1024  ReorderBufferTXN *txn;
1025  ReorderBufferTXN *subtxn;
1026  bool new_top;
1027  bool new_sub;
1028 
1029  txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1030  subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1031 
1032  if (!new_sub)
1033  {
1034  if (rbtxn_is_known_subxact(subtxn))
1035  {
1036  /* already associated, nothing to do */
1037  return;
1038  }
1039  else
1040  {
1041  /*
1042  * We already saw this transaction, but initially added it to the
1043  * list of top-level txns. Now that we know it's not top-level,
1044  * remove it from there.
1045  */
1046  dlist_delete(&subtxn->node);
1047  }
1048  }
1049 
1050  subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1051  subtxn->toplevel_xid = xid;
1052  Assert(subtxn->nsubtxns == 0);
1053 
1054  /* set the reference to top-level transaction */
1055  subtxn->toptxn = txn;
1056 
1057  /* add to subtransaction list */
1058  dlist_push_tail(&txn->subtxns, &subtxn->node);
1059  txn->nsubtxns++;
1060 
1061  /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1062  ReorderBufferTransferSnapToParent(txn, subtxn);
1063 
1064  /* Verify LSN-ordering invariant */
1065  AssertTXNLsnOrder(rb);
1066 }
1067 
1068 /*
1069  * ReorderBufferTransferSnapToParent
1070  * Transfer base snapshot from subtxn to top-level txn, if needed
1071  *
1072  * This is done if the top-level txn doesn't have a base snapshot, or if the
1073  * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1074  * snapshot's LSN. This can happen if there are no changes in the toplevel
1075  * txn but there are some in the subtxn, or the first change in subtxn has
1076  * earlier LSN than first change in the top-level txn and we learned about
1077  * their kinship only now.
1078  *
1079  * The subtransaction's snapshot is cleared regardless of the transfer
1080  * happening, since it's not needed anymore in either case.
1081  *
1082  * We do this as soon as we become aware of their kinship, to avoid queueing
1083  * extra snapshots to txns known-as-subtxns -- only top-level txns will
1084  * receive further snapshots.
1085  */
1086 static void
1087 ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1088  ReorderBufferTXN *subtxn)
1089 {
1090  Assert(subtxn->toplevel_xid == txn->xid);
1091 
1092  if (subtxn->base_snapshot != NULL)
1093  {
1094  if (txn->base_snapshot == NULL ||
1095  subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1096  {
1097  /*
1098  * If the toplevel transaction already has a base snapshot but
1099  * it's newer than the subxact's, purge it.
1100  */
1101  if (txn->base_snapshot != NULL)
1102  {
1103  SnapBuildSnapDecRefcount(txn->base_snapshot);
1104  dlist_delete(&txn->base_snapshot_node);
1105  }
1106 
1107  /*
1108  * The snapshot is now the top transaction's; transfer it, and
1109  * adjust the list position of the top transaction in the list by
1110  * moving it to where the subtransaction is.
1111  */
1112  txn->base_snapshot = subtxn->base_snapshot;
1113  txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1114  dlist_insert_before(&subtxn->base_snapshot_node,
1115  &txn->base_snapshot_node);
1116 
1117  /*
1118  * The subtransaction doesn't have a snapshot anymore (so it
1119  * mustn't be in the list.)
1120  */
1121  subtxn->base_snapshot = NULL;
1122  subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1123  dlist_delete(&subtxn->base_snapshot_node);
1124  }
1125  else
1126  {
1127  /* Base snap of toplevel is fine, so subxact's is not needed */
1128  SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1129  dlist_delete(&subtxn->base_snapshot_node);
1130  subtxn->base_snapshot = NULL;
1131  subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1132  }
1133  }
1134 }
1135 
1136 /*
1137  * Associate a subtransaction with its toplevel transaction at commit
1138  * time. There may be no further changes added after this.
1139  */
1140 void
1141 ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1142  TransactionId subxid, XLogRecPtr commit_lsn,
1143  XLogRecPtr end_lsn)
1144 {
1145  ReorderBufferTXN *subtxn;
1146 
1147  subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1148  InvalidXLogRecPtr, false);
1149 
1150  /*
1151  * No need to do anything if that subtxn didn't contain any changes
1152  */
1153  if (!subtxn)
1154  return;
1155 
1156  subtxn->final_lsn = commit_lsn;
1157  subtxn->end_lsn = end_lsn;
1158 
1159  /*
1160  * Assign this subxact as a child of the toplevel xact (no-op if already
1161  * done.)
1162  */
1163  ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1164 }
1165 
1166 
1167 /*
1168  * Support for efficiently iterating over a transaction's and its
1169  * subtransactions' changes.
1170  *
1171  * We do this by performing a k-way merge between transactions/subtransactions. For that
1172  * we model the current heads of the different transactions as a binary heap
1173  * so we easily know which (sub-)transaction has the change with the smallest
1174  * lsn next.
1175  *
1176  * We assume the changes in individual transactions are already sorted by LSN.
1177  */
1178 
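/*
 * Typical use of this iterator, as in ReorderBufferProcessTXN() further
 * down (error cleanup via PG_CATCH omitted in this sketch, and apply()
 * stands in for whatever is done per change):
 *
 *	ReorderBufferIterTXNState *volatile iterstate = NULL;
 *	ReorderBufferChange *change;
 *
 *	ReorderBufferIterTXNInit(rb, txn, &iterstate);
 *	while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
 *		apply(change);
 *	ReorderBufferIterTXNFinish(rb, iterstate);
 */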
1179 /*
1180  * Binary heap comparison function.
1181  */
1182 static int
1183 ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1184 {
1185  ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1186  XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1187  XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1188 
1189  if (pos_a < pos_b)
1190  return 1;
1191  else if (pos_a == pos_b)
1192  return 0;
1193  return -1;
1194 }
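/*
 * Note the inverted comparison above: the entry with the smaller LSN reports
 * itself as "greater". Since lib/binaryheap.h implements a max-heap, the
 * inversion makes binaryheap_first() return the entry with the *smallest*
 * LSN, which is exactly what the k-way merge needs.
 */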
1195 
1196 /*
1197  * Allocate & initialize an iterator which iterates in lsn order over a
1198  * transaction and all its subtransactions.
1199  *
1200  * Note: The iterator state is returned through iter_state parameter rather
1201  * than the function's return value. This is because the state gets cleaned up
1202  * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1203  * back the state even if this function throws an exception.
1204  */
1205 static void
1206 ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1207  ReorderBufferIterTXNState *volatile *iter_state)
1208 {
1209  Size nr_txns = 0;
1210  ReorderBufferIterTXNState *state;
1211  dlist_iter cur_txn_i;
1212  int32 off;
1213 
1214  *iter_state = NULL;
1215 
1216  /* Check ordering of changes in the toplevel transaction. */
1217  AssertChangeLsnOrder(txn);
1218 
1219  /*
1220  * Calculate the size of our heap: one element for every transaction that
1221  * contains changes. (Besides the transactions already in the reorder
1222  * buffer, we count the one we were directly passed.)
1223  */
1224  if (txn->nentries > 0)
1225  nr_txns++;
1226 
1227  dlist_foreach(cur_txn_i, &txn->subtxns)
1228  {
1229  ReorderBufferTXN *cur_txn;
1230 
1231  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1232 
1233  /* Check ordering of changes in this subtransaction. */
1234  AssertChangeLsnOrder(cur_txn);
1235 
1236  if (cur_txn->nentries > 0)
1237  nr_txns++;
1238  }
1239 
1240  /* allocate iteration state */
1241  state = (ReorderBufferIterTXNState *)
1242  MemoryContextAllocZero(rb->context,
1243  sizeof(ReorderBufferIterTXNState) +
1244  sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1245 
1246  state->nr_txns = nr_txns;
1247  dlist_init(&state->old_change);
1248 
1249  for (off = 0; off < state->nr_txns; off++)
1250  {
1251  state->entries[off].file.vfd = -1;
1252  state->entries[off].segno = 0;
1253  }
1254 
1255  /* allocate heap */
1256  state->heap = binaryheap_allocate(state->nr_txns,
1257  ReorderBufferIterCompare,
1258  state);
1259 
1260  /* Now that the state fields are initialized, it is safe to return it. */
1261  *iter_state = state;
1262 
1263  /*
1264  * Now insert items into the binary heap, in an unordered fashion. (We
1265  * will run a heap assembly step at the end; this is more efficient.)
1266  */
1267 
1268  off = 0;
1269 
1270  /* add toplevel transaction if it contains changes */
1271  if (txn->nentries > 0)
1272  {
1273  ReorderBufferChange *cur_change;
1274 
1275  if (rbtxn_is_serialized(txn))
1276  {
1277  /* serialize remaining changes */
1278  ReorderBufferSerializeTXN(rb, txn);
1279  ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1280  &state->entries[off].segno);
1281  }
1282 
1283  cur_change = dlist_head_element(ReorderBufferChange, node,
1284  &txn->changes);
1285 
1286  state->entries[off].lsn = cur_change->lsn;
1287  state->entries[off].change = cur_change;
1288  state->entries[off].txn = txn;
1289 
1290  binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1291  }
1292 
1293  /* add subtransactions if they contain changes */
1294  dlist_foreach(cur_txn_i, &txn->subtxns)
1295  {
1296  ReorderBufferTXN *cur_txn;
1297 
1298  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1299 
1300  if (cur_txn->nentries > 0)
1301  {
1302  ReorderBufferChange *cur_change;
1303 
1304  if (rbtxn_is_serialized(cur_txn))
1305  {
1306  /* serialize remaining changes */
1307  ReorderBufferSerializeTXN(rb, cur_txn);
1308  ReorderBufferRestoreChanges(rb, cur_txn,
1309  &state->entries[off].file,
1310  &state->entries[off].segno);
1311  }
1312  cur_change = dlist_head_element(ReorderBufferChange, node,
1313  &cur_txn->changes);
1314 
1315  state->entries[off].lsn = cur_change->lsn;
1316  state->entries[off].change = cur_change;
1317  state->entries[off].txn = cur_txn;
1318 
1319  binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1320  }
1321  }
1322 
1323  /* assemble a valid binary heap */
1324  binaryheap_build(state->heap);
1325 }
1326 
1327 /*
1328  * Return the next change when iterating over a transaction and its
1329  * subtransactions.
1330  *
1331  * Returns NULL when no further changes exist.
1332  */
1333 static ReorderBufferChange *
1334 ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1335 {
1336  ReorderBufferChange *change;
1337  ReorderBufferIterTXNEntry *entry;
1338  int32 off;
1339 
1340  /* nothing there anymore */
1341  if (state->heap->bh_size == 0)
1342  return NULL;
1343 
1344  off = DatumGetInt32(binaryheap_first(state->heap));
1345  entry = &state->entries[off];
1346 
1347  /* free memory we might have "leaked" in the previous *Next call */
1348  if (!dlist_is_empty(&state->old_change))
1349  {
1350  change = dlist_container(ReorderBufferChange, node,
1351  dlist_pop_head_node(&state->old_change));
1352  ReorderBufferReturnChange(rb, change, true);
1353  Assert(dlist_is_empty(&state->old_change));
1354  }
1355 
1356  change = entry->change;
1357 
1358  /*
1359  * update heap with information about which transaction has the next
1360  * relevant change in LSN order
1361  */
1362 
1363  /* there are in-memory changes */
1364  if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1365  {
1366  dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1367  ReorderBufferChange *next_change =
1368  dlist_container(ReorderBufferChange, node, next);
1369 
1370  /* txn stays the same */
1371  state->entries[off].lsn = next_change->lsn;
1372  state->entries[off].change = next_change;
1373 
1374  binaryheap_replace_first(state->heap, Int32GetDatum(off));
1375  return change;
1376  }
1377 
1378  /* try to load changes from disk */
1379  if (entry->txn->nentries != entry->txn->nentries_mem)
1380  {
1381  /*
1382  * Ugly: restoring changes will reuse *Change records, thus delete the
1383  * current one from the per-tx list and only free in the next call.
1384  */
1385  dlist_delete(&change->node);
1386  dlist_push_tail(&state->old_change, &change->node);
1387 
1388  /*
1389  * Update the total bytes processed by the txn for which we are
1390  * releasing the current set of changes and restoring the new set of
1391  * changes.
1392  */
1393  rb->totalBytes += entry->txn->size;
1394  if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1395  &state->entries[off].segno))
1396  {
1397  /* successfully restored changes from disk */
1398  ReorderBufferChange *next_change =
1399  dlist_head_element(ReorderBufferChange, node,
1400  &entry->txn->changes);
1401 
1402  elog(DEBUG2, "restored %u/%u changes from disk",
1403  (uint32) entry->txn->nentries_mem,
1404  (uint32) entry->txn->nentries);
1405 
1406  Assert(entry->txn->nentries_mem);
1407  /* txn stays the same */
1408  state->entries[off].lsn = next_change->lsn;
1409  state->entries[off].change = next_change;
1410  binaryheap_replace_first(state->heap, Int32GetDatum(off));
1411 
1412  return change;
1413  }
1414  }
1415 
1416  /* ok, no changes there anymore, remove */
1417  binaryheap_remove_first(state->heap);
1418 
1419  return change;
1420 }
1421 
1422 /*
1423  * Deallocate the iterator
1424  */
1425 static void
1426 ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1427  ReorderBufferIterTXNState *state)
1428 {
1429  int32 off;
1430 
1431  for (off = 0; off < state->nr_txns; off++)
1432  {
1433  if (state->entries[off].file.vfd != -1)
1434  FileClose(state->entries[off].file.vfd);
1435  }
1436 
1437  /* free memory we might have "leaked" in the last *Next call */
1438  if (!dlist_is_empty(&state->old_change))
1439  {
1440  ReorderBufferChange *change;
1441 
1442  change = dlist_container(ReorderBufferChange, node,
1443  dlist_pop_head_node(&state->old_change));
1444  ReorderBufferReturnChange(rb, change, true);
1445  Assert(dlist_is_empty(&state->old_change));
1446  }
1447 
1448  binaryheap_free(state->heap);
1449  pfree(state);
1450 }
1451 
1452 /*
1453  * Cleanup the contents of a transaction, usually after the transaction
1454  * committed or aborted.
1455  */
1456 static void
1457 ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1458 {
1459  bool found;
1460  dlist_mutable_iter iter;
1461 
1462  /* cleanup subtransactions & their changes */
1463  dlist_foreach_modify(iter, &txn->subtxns)
1464  {
1465  ReorderBufferTXN *subtxn;
1466 
1467  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1468 
1469  /*
1470  * Subtransactions are always associated to the toplevel TXN, even if
1471  * they originally were happening inside another subtxn, so we won't
1472  * ever recurse more than one level deep here.
1473  */
1474  Assert(rbtxn_is_known_subxact(subtxn));
1475  Assert(subtxn->nsubtxns == 0);
1476 
1477  ReorderBufferCleanupTXN(rb, subtxn);
1478  }
1479 
1480  /* cleanup changes in the txn */
1481  dlist_foreach_modify(iter, &txn->changes)
1482  {
1483  ReorderBufferChange *change;
1484 
1485  change = dlist_container(ReorderBufferChange, node, iter.cur);
1486 
1487  /* Check we're not mixing changes from different transactions. */
1488  Assert(change->txn == txn);
1489 
1490  ReorderBufferReturnChange(rb, change, true);
1491  }
1492 
1493  /*
1494  * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1495  * They are always stored in the toplevel transaction.
1496  */
1497  dlist_foreach_modify(iter, &txn->tuplecids)
1498  {
1499  ReorderBufferChange *change;
1500 
1501  change = dlist_container(ReorderBufferChange, node, iter.cur);
1502 
1503  /* Check we're not mixing changes from different transactions. */
1504  Assert(change->txn == txn);
1505  Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1506 
1507  ReorderBufferReturnChange(rb, change, true);
1508  }
1509 
1510  /*
1511  * Cleanup the base snapshot, if set.
1512  */
1513  if (txn->base_snapshot != NULL)
1514  {
1515  SnapBuildSnapDecRefcount(txn->base_snapshot);
1516  dlist_delete(&txn->base_snapshot_node);
1517  }
1518 
1519  /*
1520  * Cleanup the snapshot for the last streamed run.
1521  */
1522  if (txn->snapshot_now != NULL)
1523  {
1524  Assert(rbtxn_is_streamed(txn));
1525  ReorderBufferFreeSnap(rb, txn->snapshot_now);
1526  }
1527 
1528  /*
1529  * Remove TXN from its containing list.
1530  *
1531  * Note: if txn is known as subxact, we are deleting the TXN from its
1532  * parent's list of known subxacts; this leaves the parent's nsubxacts
1533  * count too high, but we don't care. Otherwise, we are deleting the TXN
1534  * from the LSN-ordered list of toplevel TXNs.
1535  */
1536  dlist_delete(&txn->node);
1537 
1538  /* now remove reference from buffer */
1539  hash_search(rb->by_txn,
1540  (void *) &txn->xid,
1541  HASH_REMOVE,
1542  &found);
1543  Assert(found);
1544 
1545  /* remove entries spilled to disk */
1546  if (rbtxn_is_serialized(txn))
1547  ReorderBufferRestoreCleanup(rb, txn);
1548 
1549  /* deallocate */
1550  ReorderBufferReturnTXN(rb, txn);
1551 }
1552 
1553 /*
1554  * Discard changes from a transaction (and subtransactions), either after
1555  * streaming or decoding them at PREPARE. Keep the remaining info -
1556  * transactions, tuplecids, invalidations and snapshots.
1557  *
1558  * We additionally remove tuplecids after decoding the transaction at prepare
1559  * time as we only need to perform invalidation at rollback or commit prepared.
1560  *
1561  * 'txn_prepared' indicates that we have decoded the transaction at prepare
1562  * time.
1563  */
1564 static void
1565 ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1566 {
1567  dlist_mutable_iter iter;
1568 
1569  /* cleanup subtransactions & their changes */
1570  dlist_foreach_modify(iter, &txn->subtxns)
1571  {
1572  ReorderBufferTXN *subtxn;
1573 
1574  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1575 
1576  /*
1577  * Subtransactions are always associated to the toplevel TXN, even if
1578  * they originally were happening inside another subtxn, so we won't
1579  * ever recurse more than one level deep here.
1580  */
1581  Assert(rbtxn_is_known_subxact(subtxn));
1582  Assert(subtxn->nsubtxns == 0);
1583 
1584  ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1585  }
1586 
1587  /* cleanup changes in the txn */
1588  dlist_foreach_modify(iter, &txn->changes)
1589  {
1590  ReorderBufferChange *change;
1591 
1592  change = dlist_container(ReorderBufferChange, node, iter.cur);
1593 
1594  /* Check we're not mixing changes from different transactions. */
1595  Assert(change->txn == txn);
1596 
1597  /* remove the change from it's containing list */
1598  dlist_delete(&change->node);
1599 
1600  ReorderBufferReturnChange(rb, change, true);
1601  }
1602 
1603  /*
1604  * Mark the transaction as streamed.
1605  *
1606  * The toplevel transaction, identified by (toptxn==NULL), is marked as
1607  * streamed always, even if it does not contain any changes (that is, when
1608  * all the changes are in subtransactions).
1609  *
1610  * For subtransactions, we only mark them as streamed when there are
1611  * changes in them.
1612  *
1613  * We do it this way because of aborts - we don't want to send aborts for
1614  * XIDs the downstream is not aware of. And of course, it always knows
1615  * about the toplevel xact (we send the XID in all messages), but we never
1616  * stream XIDs of empty subxacts.
1617  */
1618  if ((!txn_prepared) && ((!txn->toptxn) || (txn->nentries_mem != 0)))
1619  txn->txn_flags |= RBTXN_IS_STREAMED;
1620 
1621  if (txn_prepared)
1622  {
1623  /*
1624  * If this is a prepared txn, cleanup the tuplecids we stored for
1625  * decoding catalog snapshot access. They are always stored in the
1626  * toplevel transaction.
1627  */
1628  dlist_foreach_modify(iter, &txn->tuplecids)
1629  {
1630  ReorderBufferChange *change;
1631 
1632  change = dlist_container(ReorderBufferChange, node, iter.cur);
1633 
1634  /* Check we're not mixing changes from different transactions. */
1635  Assert(change->txn == txn);
1636  Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1637 
1638  /* Remove the change from its containing list. */
1639  dlist_delete(&change->node);
1640 
1641  ReorderBufferReturnChange(rb, change, true);
1642  }
1643  }
1644 
1645  /*
1646  * Destroy the (relfilenode, ctid) hashtable, so that we don't leak any
1647  * memory. We could also keep the hash table and update it with new ctid
1648  * values, but this seems simpler and good enough for now.
1649  */
1650  if (txn->tuplecid_hash != NULL)
1651  {
1652  hash_destroy(txn->tuplecid_hash);
1653  txn->tuplecid_hash = NULL;
1654  }
1655 
1656  /* If this txn is serialized then clean the disk space. */
1657  if (rbtxn_is_serialized(txn))
1658  {
1659  ReorderBufferRestoreCleanup(rb, txn);
1660  txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1661 
1662  /*
1663  * We set this flag to indicate if the transaction is ever serialized.
1664  * We need this to accurately update the stats as otherwise the same
1665  * transaction can be counted as serialized multiple times.
1666  */
1667  txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1668  }
1669 
1670  /* also reset the number of entries in the transaction */
1671  txn->nentries_mem = 0;
1672  txn->nentries = 0;
1673 }
1674 
1675 /*
1676  * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by
1677  * HeapTupleSatisfiesHistoricMVCC.
1678  */
1679 static void
1680 ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1681 {
1682  dlist_iter iter;
1683  HASHCTL hash_ctl;
1684 
1685  if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1686  return;
1687 
1688  hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1689  hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1690  hash_ctl.hcxt = rb->context;
1691 
1692  /*
1693  * create the hash with the exact number of to-be-stored tuplecids from
1694  * the start
1695  */
1696  txn->tuplecid_hash =
1697  hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1698  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1699 
1700  dlist_foreach(iter, &txn->tuplecids)
1701  {
1702  ReorderBufferTupleCidKey key;
1703  ReorderBufferTupleCidEnt *ent;
1704  bool found;
1705  ReorderBufferChange *change;
1706 
1707  change = dlist_container(ReorderBufferChange, node, iter.cur);
1708 
1709  Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1710 
1711  /* be careful about padding */
1712  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1713 
1714  key.relnode = change->data.tuplecid.node;
1715 
1716  ItemPointerCopy(&change->data.tuplecid.tid,
1717  &key.tid);
1718 
1719  ent = (ReorderBufferTupleCidEnt *)
1720  hash_search(txn->tuplecid_hash,
1721  (void *) &key,
1722  HASH_ENTER,
1723  &found);
1724  if (!found)
1725  {
1726  ent->cmin = change->data.tuplecid.cmin;
1727  ent->cmax = change->data.tuplecid.cmax;
1728  ent->combocid = change->data.tuplecid.combocid;
1729  }
1730  else
1731  {
1732  /*
1733  * Maybe we already saw this tuple before in this transaction, but
1734  * if so it must have the same cmin.
1735  */
1736  Assert(ent->cmin == change->data.tuplecid.cmin);
1737 
1738  /*
1739  * cmax may be initially invalid, but once set it can only grow,
1740  * and never become invalid again.
1741  */
1742  Assert((ent->cmax == InvalidCommandId) ||
1743  ((change->data.tuplecid.cmax != InvalidCommandId) &&
1744  (change->data.tuplecid.cmax > ent->cmax)));
1745  ent->cmax = change->data.tuplecid.cmax;
1746  }
1747  }
1748 }
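/*
 * Lookups against the hash built above happen via
 * ResolveCminCmaxDuringDecoding() (near the end of this file), keyed in the
 * same way the entries were stored; roughly:
 *
 *	ReorderBufferTupleCidKey key;
 *	ReorderBufferTupleCidEnt *ent;
 *
 *	memset(&key, 0, sizeof(key));	(padding must be zeroed, as above)
 *	key.relnode = relnode;
 *	ItemPointerCopy(&tuple_tid, &key.tid);
 *	ent = hash_search(tuplecid_hash, (void *) &key, HASH_FIND, NULL);
 *	if (ent != NULL)
 *		... use ent->cmin / ent->cmax for the visibility decision ...
 */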
1749 
1750 /*
1751  * Copy a provided snapshot so we can modify it privately. This is needed so
1752  * that catalog modifying transactions can look into intermediate catalog
1753  * states.
1754  */
1755 static Snapshot
1756 ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1757  ReorderBufferTXN *txn, CommandId cid)
1758 {
1759  Snapshot snap;
1760  dlist_iter iter;
1761  int i = 0;
1762  Size size;
1763 
1764  size = sizeof(SnapshotData) +
1765  sizeof(TransactionId) * orig_snap->xcnt +
1766  sizeof(TransactionId) * (txn->nsubtxns + 1);
1767 
1768  snap = MemoryContextAllocZero(rb->context, size);
1769  memcpy(snap, orig_snap, sizeof(SnapshotData));
1770 
1771  snap->copied = true;
1772  snap->active_count = 1; /* mark as active so nobody frees it */
1773  snap->regd_count = 0;
1774  snap->xip = (TransactionId *) (snap + 1);
1775 
1776  memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1777 
1778  /*
1779  * snap->subxip contains all txids that belong to our transaction which we
1780  * need to check via cmin/cmax. That's why we store the toplevel
1781  * transaction in there as well.
1782  */
1783  snap->subxip = snap->xip + snap->xcnt;
1784  snap->subxip[i++] = txn->xid;
1785 
1786  /*
1787  * subxcnt isn't decreased when subtransactions abort, so count manually.
1788  * Since it's an upper boundary it is safe to use it for the allocation
1789  * above.
1790  */
1791  snap->subxcnt = 1;
1792 
1793  dlist_foreach(iter, &txn->subtxns)
1794  {
1795  ReorderBufferTXN *sub_txn;
1796 
1797  sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1798  snap->subxip[i++] = sub_txn->xid;
1799  snap->subxcnt++;
1800  }
1801 
1802  /* sort so we can bsearch() later */
1803  qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1804 
1805  /* store the specified current CommandId */
1806  snap->curcid = cid;
1807 
1808  return snap;
1809 }
1810 
1811 /*
1812  * Free a previously ReorderBufferCopySnap'ed snapshot
1813  */
1814 static void
1815 ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1816 {
1817  if (snap->copied)
1818  pfree(snap);
1819  else
1820  SnapBuildSnapDecRefcount(snap);
1821 }
1822 
1823 /*
1824  * If the transaction was (partially) streamed, we need to prepare or commit
1825  * it in a 'streamed' way. That is, we first stream the remaining part of the
1826  * transaction, and then invoke stream_prepare or stream_commit message as per
1827  * the case.
1828  */
1829 static void
1830 ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1831 {
1832  /* we should only call this for previously streamed transactions */
1833  Assert(rbtxn_is_streamed(txn));
1834 
1835  ReorderBufferStreamTXN(rb, txn);
1836 
1837  if (rbtxn_prepared(txn))
1838  {
1839  /*
1840  * Note, we send stream prepare even if a concurrent abort is
1841  * detected. See DecodePrepare for more information.
1842  */
1843  rb->stream_prepare(rb, txn, txn->final_lsn);
1844 
1845  /*
1846  * This is a PREPARED transaction, part of a two-phase commit. The
1847  * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1848  * just truncate txn by removing changes and tuple_cids.
1849  */
1850  ReorderBufferTruncateTXN(rb, txn, true);
1851  /* Reset the CheckXidAlive */
1852  CheckXidAlive = InvalidTransactionId;
1853  }
1854  else
1855  {
1856  rb->stream_commit(rb, txn, txn->final_lsn);
1857  ReorderBufferCleanupTXN(rb, txn);
1858  }
1859 }
1860 
1861 /*
1862  * Set xid to detect concurrent aborts.
1863  *
1864  * While streaming an in-progress transaction or decoding a prepared
1865  * transaction there is a possibility that the (sub)transaction might get
1866  * aborted concurrently. In such case if the (sub)transaction has catalog
1867  * update then we might decode the tuple using wrong catalog version. For
1868  * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1869  * the transaction 501 updates the catalog tuple and after that we will have
1870  * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1871  * aborted and some other transaction say 502 updates the same catalog tuple
1872  * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1873  * problem is that when we try to decode the tuple inserted/updated in 501
1874  * after the catalog update, we will see the catalog tuple with (xmin: 500,
1875  * xmax: 502) as visible because it will consider that the tuple is deleted by
1876  * xid 502 which is not visible to our snapshot. And when we will try to
1877  * decode with that catalog tuple, it can lead to a wrong result or a crash.
1878  * So, it is necessary to detect concurrent aborts to allow streaming of
1879  * in-progress transactions or decoding of prepared transactions.
1880  *
1881  * For detecting the concurrent abort we set CheckXidAlive to the current
1882  * (sub)transaction's xid for which this change belongs to. And, during
1883  * catalog scan we can check the status of the xid and if it is aborted we will
1884  * report a specific error so that we can stop streaming current transaction
1885  * and discard the already streamed changes on such an error. We might have
1886  * already streamed some of the changes for the aborted (sub)transaction, but
1887  * that is fine because when we decode the abort we will stream abort message
1888  * to truncate the changes in the subscriber. Similarly, for prepared
1889  * transactions, we stop decoding if concurrent abort is detected and then
1890  * rollback the changes when rollback prepared is encountered. See
1891  * DecodePrepare.
1892  */
1893 static inline void
1894 SetupCheckXidLive(TransactionId xid)
1895 {
1896  /*
1897  * If the input transaction id is already set as a CheckXidAlive then
1898  * nothing to do.
1899  */
1900  if (CheckXidAlive == xid)
1901  return;
1902 
1903  /*
1904  * setup CheckXidAlive if it's not committed yet. We don't check if the
1905  * xid is aborted. That will happen during catalog access.
1906  */
1907  if (!TransactionIdDidCommit(xid))
1908  CheckXidAlive = xid;
1909  else
1910  CheckXidAlive = InvalidTransactionId;
1911 }
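/*
 * The consuming side of CheckXidAlive lives in the catalog access paths
 * (see the systable_* routines in access/index/genam.c); the check there is
 * roughly:
 *
 *	if (TransactionIdIsValid(CheckXidAlive) &&
 *		!TransactionIdIsInProgress(CheckXidAlive) &&
 *		!TransactionIdDidCommit(CheckXidAlive))
 *		ereport(ERROR,
 *				(errcode(ERRCODE_TRANSACTION_ROLLBACK),
 *				 errmsg("transaction aborted during system catalog scan")));
 *
 * which surfaces a concurrent abort as a distinct error that the streaming
 * and prepared-transaction decoding paths can recognize and handle.
 */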
1912 
1913 /*
1914  * Helper function for ReorderBufferProcessTXN for applying change.
1915  */
1916 static inline void
1917 ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
1918  Relation relation, ReorderBufferChange *change,
1919  bool streaming)
1920 {
1921  if (streaming)
1922  rb->stream_change(rb, txn, relation, change);
1923  else
1924  rb->apply_change(rb, txn, relation, change);
1925 }
1926 
1927 /*
1928  * Helper function for ReorderBufferProcessTXN for applying the truncate.
1929  */
1930 static inline void
1931 ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
1932  int nrelations, Relation *relations,
1933  ReorderBufferChange *change, bool streaming)
1934 {
1935  if (streaming)
1936  rb->stream_truncate(rb, txn, nrelations, relations, change);
1937  else
1938  rb->apply_truncate(rb, txn, nrelations, relations, change);
1939 }
1940 
1941 /*
1942  * Helper function for ReorderBufferProcessTXN for applying the message.
1943  */
1944 static inline void
1945 ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
1946  ReorderBufferChange *change, bool streaming)
1947 {
1948  if (streaming)
1949  rb->stream_message(rb, txn, change->lsn, true,
1950  change->data.msg.prefix,
1951  change->data.msg.message_size,
1952  change->data.msg.message);
1953  else
1954  rb->message(rb, txn, change->lsn, true,
1955  change->data.msg.prefix,
1956  change->data.msg.message_size,
1957  change->data.msg.message);
1958 }
1959 
1960 /*
1961  * Function to store the command id and snapshot at the end of the current
1962  * stream so that we can reuse the same while sending the next stream.
1963  */
1964 static inline void
1965 ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
1966  Snapshot snapshot_now, CommandId command_id)
1967 {
1968  txn->command_id = command_id;
1969 
1970  /* Avoid copying if it's already copied. */
1971  if (snapshot_now->copied)
1972  txn->snapshot_now = snapshot_now;
1973  else
1974  txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
1975  txn, command_id);
1976 }
1977 
1978 /*
1979  * Helper function for ReorderBufferProcessTXN to handle the concurrent
1980  * abort of the streaming transaction. This resets the TXN such that it
1981  * can be used to stream the remaining data of transaction being processed.
1982  * This can happen when the subtransaction is aborted and we still want to
1983  * continue processing the main or other subtransactions data.
1984  */
1985 static void
1986 ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
1987  Snapshot snapshot_now,
1988  CommandId command_id,
1989  XLogRecPtr last_lsn,
1990  ReorderBufferChange *specinsert)
1991 {
1992  /* Discard the changes that we just streamed */
1993  ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
1994 
1995  /* Free all resources allocated for toast reconstruction */
1996  ReorderBufferToastReset(rb, txn);
1997 
1998  /* Return the spec insert change if it is not NULL */
1999  if (specinsert != NULL)
2000  {
2001  ReorderBufferReturnChange(rb, specinsert, true);
2002  specinsert = NULL;
2003  }
2004 
2005  /*
2006  * For the streaming case, stop the stream and remember the command ID and
2007  * snapshot for the streaming run.
2008  */
2009  if (rbtxn_is_streamed(txn))
2010  {
2011  rb->stream_stop(rb, txn, last_lsn);
2012  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2013  }
2014 }
2015 
2016 /*
2017  * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2018  *
2019  * Send data of a transaction (and its subtransactions) to the
2020  * output plugin. We iterate over the top and subtransactions (using a k-way
2021  * merge) and replay the changes in lsn order.
2022  *
2023  * If streaming is true then data will be sent using stream API.
2024  *
2025  * Note: "volatile" markers on some parameters are to avoid trouble with
2026  * PG_TRY inside the function.
2027  */
2028 static void
2029 ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2030  XLogRecPtr commit_lsn,
2031  volatile Snapshot snapshot_now,
2032  volatile CommandId command_id,
2033  bool streaming)
2034 {
2035  bool using_subtxn;
2036  MemoryContext ccxt = CurrentMemoryContext;
2037  ReorderBufferIterTXNState *volatile iterstate = NULL;
2038  volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2039  ReorderBufferChange *volatile specinsert = NULL;
2040  volatile bool stream_started = false;
2041  ReorderBufferTXN *volatile curtxn = NULL;
2042 
2043  /* build data to be able to lookup the CommandIds of catalog tuples */
2044  ReorderBufferBuildTupleCidHash(rb, txn);
2045 
2046  /* setup the initial snapshot */
2047  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2048 
2049  /*
2050  * Decoding needs access to syscaches et al., which in turn use
2051  * heavyweight locks and such. Thus we need to have enough state around to
2052  * keep track of those. The easiest way is to simply use a transaction
2053  * internally. That also allows us to easily enforce that nothing writes
2054  * to the database by checking for xid assignments.
2055  *
2056  * When we're called via the SQL SRF there's already a transaction
2057  * started, so start an explicit subtransaction there.
2058  */
2059  using_subtxn = IsTransactionOrTransactionBlock();
2060 
2061  PG_TRY();
2062  {
2063  ReorderBufferChange *change;
2064 
2065  if (using_subtxn)
2066  BeginInternalSubTransaction(streaming ? "stream" : "replay");
2067  else
2068  StartTransactionCommand();
2069 
2070  /*
2071  * We only need to send begin/begin-prepare for non-streamed
2072  * transactions.
2073  */
2074  if (!streaming)
2075  {
2076  if (rbtxn_prepared(txn))
2077  rb->begin_prepare(rb, txn);
2078  else
2079  rb->begin(rb, txn);
2080  }
2081 
2082  ReorderBufferIterTXNInit(rb, txn, &iterstate);
2083  while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2084  {
2085  Relation relation = NULL;
2086  Oid reloid;
2087 
2088  /*
2089  * We can't call the stream start callback before processing the
2090  * first change.
2091  */
2092  if (prev_lsn == InvalidXLogRecPtr)
2093  {
2094  if (streaming)
2095  {
2096  txn->origin_id = change->origin_id;
2097  rb->stream_start(rb, txn, change->lsn);
2098  stream_started = true;
2099  }
2100  }
2101 
2102  /*
2103  * Enforce correct ordering of changes, merged from multiple
2104  * subtransactions. The changes may have the same LSN due to
2105  * MULTI_INSERT xlog records.
2106  */
2107  Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2108 
2109  prev_lsn = change->lsn;
2110 
2111  /*
2112  * Set the current xid to detect concurrent aborts. This is
2113  * required for the cases when we decode the changes before the
2114  * COMMIT record is processed.
2115  */
2116  if (streaming || rbtxn_prepared(change->txn))
2117  {
2118  curtxn = change->txn;
2119  SetupCheckXidLive(curtxn->xid);
2120  }
2121 
2122  switch (change->action)
2123  {
2124  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2125 
2126  /*
2127  * Confirmation for speculative insertion arrived. Simply
2128  * use it as a normal record. It'll be cleaned up at the end
2129  * of INSERT processing.
2130  */
2131  if (specinsert == NULL)
2132  elog(ERROR, "invalid ordering of speculative insertion changes");
2133  Assert(specinsert->data.tp.oldtuple == NULL);
2134  change = specinsert;
2135  change->action = REORDER_BUFFER_CHANGE_INSERT;
2136 
2137  /* intentionally fall through */
2138  case REORDER_BUFFER_CHANGE_INSERT:
2139  case REORDER_BUFFER_CHANGE_UPDATE:
2140  case REORDER_BUFFER_CHANGE_DELETE:
2141  Assert(snapshot_now);
2142 
2143  reloid = RelidByRelfilenode(change->data.tp.relnode.spcNode,
2144  change->data.tp.relnode.relNode);
2145 
2146  /*
2147  * Mapped catalog tuple without data, emitted while
2148  * catalog table was in the process of being rewritten. We
2149  * can fail to look up the relfilenode, because the
2150  * relmapper has no "historic" view, in contrast to the
2151  * normal catalog during decoding. Thus repeated rewrites
2152  * can cause a lookup failure. That's OK because we do not
2153  * decode catalog changes anyway. Normally such tuples
2154  * would be skipped over below, but we can't identify
2155  * whether the table should be logically logged without
2156  * mapping the relfilenode to the oid.
2157  */
2158  if (reloid == InvalidOid &&
2159  change->data.tp.newtuple == NULL &&
2160  change->data.tp.oldtuple == NULL)
2161  goto change_done;
2162  else if (reloid == InvalidOid)
2163  elog(ERROR, "could not map filenode \"%s\" to relation OID",
2164  relpathperm(change->data.tp.relnode,
2165  MAIN_FORKNUM));
2166 
2167  relation = RelationIdGetRelation(reloid);
2168 
2169  if (!RelationIsValid(relation))
2170  elog(ERROR, "could not open relation with OID %u (for filenode \"%s\")",
2171  reloid,
2172  relpathperm(change->data.tp.relnode,
2173  MAIN_FORKNUM));
2174 
2175  if (!RelationIsLogicallyLogged(relation))
2176  goto change_done;
2177 
2178  /*
2179  * Ignore temporary heaps created during DDL unless the
2180  * plugin has asked for them.
2181  */
2182  if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2183  goto change_done;
2184 
2185  /*
2186  * For now ignore sequence changes entirely. Most of the
2187  * time they don't log changes using records we
2188  * understand, so it doesn't make sense to handle the few
2189  * cases we do.
2190  */
2191  if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2192  goto change_done;
2193 
2194  /* user-triggered change */
2195  if (!IsToastRelation(relation))
2196  {
2197  ReorderBufferToastReplace(rb, txn, relation, change);
2198  ReorderBufferApplyChange(rb, txn, relation, change,
2199  streaming);
2200 
2201  /*
2202  * Only clear reassembled toast chunks if we're sure
2203  * they're not required anymore. The creator of the
2204  * tuple tells us.
2205  */
2206  if (change->data.tp.clear_toast_afterwards)
2207  ReorderBufferToastReset(rb, txn);
2208  }
2209  /* we're not interested in toast deletions */
2210  else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2211  {
2212  /*
2213  * We need to reassemble the full toasted Datum in
2214  * memory; to ensure the chunks don't get reused till
2215  * we're done, remove it from the list of this
2216  * transaction's changes. Otherwise it will get
2217  * freed/reused while restoring spooled data from
2218  * disk.
2219  */
2220  Assert(change->data.tp.newtuple != NULL);
2221 
2222  dlist_delete(&change->node);
2223  ReorderBufferToastAppendChunk(rb, txn, relation,
2224  change);
2225  }
2226 
2227  change_done:
2228 
2229  /*
2230  * If speculative insertion was confirmed, the record
2231  * isn't needed anymore.
2232  */
2233  if (specinsert != NULL)
2234  {
2235  ReorderBufferReturnChange(rb, specinsert, true);
2236  specinsert = NULL;
2237  }
2238 
2239  if (RelationIsValid(relation))
2240  {
2241  RelationClose(relation);
2242  relation = NULL;
2243  }
2244  break;
2245 
2246  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2247 
2248  /*
2249  * Speculative insertions are dealt with by delaying the
2250  * processing of the insert until the confirmation record
2251  * arrives. For that we simply unlink the record from the
2252  * chain, so it does not get freed/reused while restoring
2253  * spooled data from disk.
2254  *
2255  * This is safe in the face of concurrent catalog changes
2256  * because the relevant relation can't be changed between
2257  * speculative insertion and confirmation due to
2258  * CheckTableNotInUse() and locking.
2259  */
2260 
2261  /* clear out a pending (and thus failed) speculation */
2262  if (specinsert != NULL)
2263  {
2264  ReorderBufferReturnChange(rb, specinsert, true);
2265  specinsert = NULL;
2266  }
2267 
2268  /* and memorize the pending insertion */
2269  dlist_delete(&change->node);
2270  specinsert = change;
2271  break;
2272 
2273  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
2274 
2275  /*
2276  * Abort for speculative insertion arrived. So clean up the
2277  * specinsert tuple and toast hash.
2278  *
2279  * Note that we get the spec abort change for each toast
2280  * entry but we need to perform the cleanup only the first
2281  * time we get it for the main table.
2282  */
2283  if (specinsert != NULL)
2284  {
2285  /*
2286  * We must clean the toast hash before processing a
2287  * completely new tuple to avoid confusion about the
2288  * previous tuple's toast chunks.
2289  */
2290  Assert(change->data.tp.clear_toast_afterwards);
2291  ReorderBufferToastReset(rb, txn);
2292 
2293  /* We don't need this record anymore. */
2294  ReorderBufferReturnChange(rb, specinsert, true);
2295  specinsert = NULL;
2296  }
2297  break;
2298 
2299  case REORDER_BUFFER_CHANGE_TRUNCATE:
2300  {
2301  int i;
2302  int nrelids = change->data.truncate.nrelids;
2303  int nrelations = 0;
2304  Relation *relations;
2305 
2306  relations = palloc0(nrelids * sizeof(Relation));
2307  for (i = 0; i < nrelids; i++)
2308  {
2309  Oid relid = change->data.truncate.relids[i];
2310  Relation relation;
2311 
2312  relation = RelationIdGetRelation(relid);
2313 
2314  if (!RelationIsValid(relation))
2315  elog(ERROR, "could not open relation with OID %u", relid);
2316 
2317  if (!RelationIsLogicallyLogged(relation))
2318  continue;
2319 
2320  relations[nrelations++] = relation;
2321  }
2322 
2323  /* Apply the truncate. */
2324  ReorderBufferApplyTruncate(rb, txn, nrelations,
2325  relations, change,
2326  streaming);
2327 
2328  for (i = 0; i < nrelations; i++)
2329  RelationClose(relations[i]);
2330 
2331  break;
2332  }
2333 
2334  case REORDER_BUFFER_CHANGE_MESSAGE:
2335  ReorderBufferApplyMessage(rb, txn, change, streaming);
2336  break;
2337 
2338  case REORDER_BUFFER_CHANGE_INVALIDATION:
2339  /* Execute the invalidation messages locally */
2340  ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2341  change->data.inval.invalidations);
2342  break;
2343 
2344  case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2345  /* get rid of the old */
2346  TeardownHistoricSnapshot(false);
2347 
2348  if (snapshot_now->copied)
2349  {
2350  ReorderBufferFreeSnap(rb, snapshot_now);
2351  snapshot_now =
2352  ReorderBufferCopySnap(rb, change->data.snapshot,
2353  txn, command_id);
2354  }
2355 
2356  /*
2357  * Restored from disk, we need to be careful not to double
2358  * free. We could introduce refcounting for that, but for
2359  * now this seems infrequent enough not to care.
2360  */
2361  else if (change->data.snapshot->copied)
2362  {
2363  snapshot_now =
2364  ReorderBufferCopySnap(rb, change->data.snapshot,
2365  txn, command_id);
2366  }
2367  else
2368  {
2369  snapshot_now = change->data.snapshot;
2370  }
2371 
2372  /* and continue with the new one */
2373  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2374  break;
2375 
2376  case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
2377  Assert(change->data.command_id != InvalidCommandId);
2378 
2379  if (command_id < change->data.command_id)
2380  {
2381  command_id = change->data.command_id;
2382 
2383  if (!snapshot_now->copied)
2384  {
2385  /* we don't use the global one anymore */
2386  snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2387  txn, command_id);
2388  }
2389 
2390  snapshot_now->curcid = command_id;
2391 
2392  TeardownHistoricSnapshot(false);
2393  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2394  }
2395 
2396  break;
2397 
2398  case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2399  elog(ERROR, "tuplecid value in changequeue");
2400  break;
2401  }
2402  }
2403 
2404  /* speculative insertion record must be freed by now */
2405  Assert(!specinsert);
2406 
2407  /* clean up the iterator */
2408  ReorderBufferIterTXNFinish(rb, iterstate);
2409  iterstate = NULL;
2410 
2411  /*
2412  * Update total transaction count and total bytes processed by the
2413  * transaction and its subtransactions. Make sure not to count a
2414  * streamed transaction multiple times.
2415  *
2416  * Note that the statistics computation has to be done after
2417  * ReorderBufferIterTXNFinish as it releases the serialized change
2418  * which we have already accounted for in ReorderBufferIterTXNNext.
2419  */
2420  if (!rbtxn_is_streamed(txn))
2421  rb->totalTxns++;
2422 
2423  rb->totalBytes += txn->total_size;
2424 
2425  /*
2426  * Done with current changes, send the last message for this set of
2427  * changes depending upon streaming mode.
2428  */
2429  if (streaming)
2430  {
2431  if (stream_started)
2432  {
2433  rb->stream_stop(rb, txn, prev_lsn);
2434  stream_started = false;
2435  }
2436  }
2437  else
2438  {
2439  /*
2440  * Call either PREPARE (for two-phase transactions) or COMMIT (for
2441  * regular ones).
2442  */
2443  if (rbtxn_prepared(txn))
2444  rb->prepare(rb, txn, commit_lsn);
2445  else
2446  rb->commit(rb, txn, commit_lsn);
2447  }
2448 
2449  /* this is just a sanity check against bad output plugin behaviour */
2450  if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
2451  elog(ERROR, "output plugin used XID %u",
2452  GetCurrentTransactionId());
2453 
2454  /*
2455  * Remember the command ID and snapshot for the next set of changes in
2456  * streaming mode.
2457  */
2458  if (streaming)
2459  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2460  else if (snapshot_now->copied)
2461  ReorderBufferFreeSnap(rb, snapshot_now);
2462 
2463  /* cleanup */
2464  TeardownHistoricSnapshot(false);
2465 
2466  /*
2467  * Aborting the current (sub-)transaction as a whole has the right
2468  * semantics. We want all locks acquired in here to be released, not
2469  * reassigned to the parent, and we do not want any database access to
2470  * have persistent effects.
2471  */
2472  AbortCurrentTransaction();
2473 
2474  /* make sure there's no cache pollution */
2475  ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2476 
2477  if (using_subtxn)
2478  RollbackAndReleaseCurrentSubTransaction();
2479 
2480  /*
2481  * We are here due to one of four reasons: 1. Decoding an
2482  * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2483  * prepared txn that was (partially) streamed. 4. Decoding a committed
2484  * txn.
2485  *
2486  * For 1, we allow truncation of txn data by removing the changes
2487  * already streamed but still keeping other things like invalidations,
2488  * snapshot, and tuplecids. For 2 and 3, we tell
2489  * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2490  * data as the entire transaction has been decoded except for commit.
2491  * For 4, as the entire txn has been decoded, we can fully clean up
2492  * the TXN reorder buffer.
2493  */
2494  if (streaming || rbtxn_prepared(txn))
2495  {
2496  ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
2497  /* Reset the CheckXidAlive */
2498  CheckXidAlive = InvalidTransactionId;
2499  }
2500  else
2501  ReorderBufferCleanupTXN(rb, txn);
2502  }
2503  PG_CATCH();
2504  {
2505  MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2506  ErrorData *errdata = CopyErrorData();
2507 
2508  /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2509  if (iterstate)
2510  ReorderBufferIterTXNFinish(rb, iterstate);
2511 
2512  ReorderBufferToastReset(rb, txn);
2513 
2514  /*
2515  * Force cache invalidation to happen outside of a valid transaction
2516  * to prevent catalog access as we just caught an error.
2517  */
2518  AbortCurrentTransaction();
2519 
2520  /* make sure there's no cache pollution */
2521  ReorderBufferExecuteInvalidations(txn->ninvalidations,
2522  txn->invalidations);
2523 
2524  if (using_subtxn)
2525  RollbackAndReleaseCurrentSubTransaction();
2526 
2527  /*
2528  * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2529  * abort of the (sub)transaction we are streaming or preparing. We
2530  * need to do the cleanup and return gracefully on this error, see
2531  * SetupCheckXidLive.
2532  *
2533  * This error code can be thrown by one of the callbacks we call
2534  * during decoding, so we need to ensure that we return gracefully only
2535  * when we are sending the data in streaming mode and the streaming is
2536  * not finished yet, or when we are sending the data out on a PREPARE
2537  * during a two-phase commit.
2538  */
2539  if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2540  (stream_started || rbtxn_prepared(txn)))
2541  {
2542  /* curtxn must be set for streaming or prepared transactions */
2543  Assert(curtxn);
2544 
2545  /* Cleanup the temporary error state. */
2546  FlushErrorState();
2547  FreeErrorData(errdata);
2548  errdata = NULL;
2549  curtxn->concurrent_abort = true;
2550 
2551  /* Reset the TXN so that it is allowed to stream remaining data. */
2552  ReorderBufferResetTXN(rb, txn, snapshot_now,
2553  command_id, prev_lsn,
2554  specinsert);
2555  }
2556  else
2557  {
2558  ReorderBufferCleanupTXN(rb, txn);
2559  MemoryContextSwitchTo(ecxt);
2560  PG_RE_THROW();
2561  }
2562  }
2563  PG_END_TRY();
2564 }
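/*
 * Illustrative sketch (not part of reorderbuffer.c): the plugin-side view of
 * the callbacks ReorderBufferProcessTXN drives for a non-streamed, regular
 * commit is begin_cb, then change_cb once per change in LSN order, then
 * commit_cb. The my_* handlers below are hypothetical stubs; the callback
 * struct and signatures are the standard output plugin API.
 */
#include "postgres.h"
#include "fmgr.h"
#include "replication/output_plugin.h"

PG_MODULE_MAGIC;

static void
my_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
	/* invoked once per decoded transaction, before any change */
}

static void
my_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
		  Relation relation, ReorderBufferChange *change)
{
	/* invoked for each INSERT/UPDATE/DELETE, merged across subxacts */
}

static void
my_commit(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
		  XLogRecPtr commit_lsn)
{
	/* invoked last, once the commit record has been decoded */
}

void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
	cb->begin_cb = my_begin;
	cb->change_cb = my_change;
	cb->commit_cb = my_commit;
}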
2565 
2566 /*
2567  * Perform the replay of a transaction and its non-aborted subtransactions.
2568  *
2569  * Subtransactions have to be processed beforehand by
2570  * ReorderBufferCommitChild(), even if they were previously assigned to the
2571  * toplevel transaction with ReorderBufferAssignChild().
2572  *
2573  * This interface is called once a prepare or toplevel commit is read for both
2574  * streamed as well as non-streamed transactions.
2575  */
2576 static void
2577 ReorderBufferReplay(ReorderBufferTXN *txn,
2578  ReorderBuffer *rb, TransactionId xid,
2579  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2580  TimestampTz commit_time,
2581  RepOriginId origin_id, XLogRecPtr origin_lsn)
2582 {
2583  Snapshot snapshot_now;
2584  CommandId command_id = FirstCommandId;
2585 
2586  txn->final_lsn = commit_lsn;
2587  txn->end_lsn = end_lsn;
2588  txn->xact_time.commit_time = commit_time;
2589  txn->origin_id = origin_id;
2590  txn->origin_lsn = origin_lsn;
2591 
2592  /*
2593  * If the transaction was (partially) streamed, we need to commit it in a
2594  * 'streamed' way. That is, we first stream the remaining part of the
2595  * transaction, and then invoke the stream_commit callback.
2596  *
2597  * Called after everything (origin ID, LSN, ...) is stored in the
2598  * transaction to avoid passing that information directly.
2599  */
2600  if (rbtxn_is_streamed(txn))
2601  {
2602  ReorderBufferStreamCommit(rb, txn);
2603  return;
2604  }
2605 
2606  /*
2607  * If this transaction has no snapshot, it didn't make any changes to the
2608  * database, so there's nothing to decode. Note that
2609  * ReorderBufferCommitChild will have transferred any snapshots from
2610  * subtransactions if there were any.
2611  */
2612  if (txn->base_snapshot == NULL)
2613  {
2614  Assert(txn->ninvalidations == 0);
2615 
2616  /*
2617  * Removing this txn before a commit might result in the computation
2618  * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2619  */
2620  if (!rbtxn_prepared(txn))
2621  ReorderBufferCleanupTXN(rb, txn);
2622  return;
2623  }
2624 
2625  snapshot_now = txn->base_snapshot;
2626 
2627  /* Process and send the changes to output plugin. */
2628  ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2629  command_id, false);
2630 }
2631 
2632 /*
2633  * Commit a transaction.
2634  *
2635  * See comments for ReorderBufferReplay().
2636  */
2637 void
2638 ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2639  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2640  TimestampTz commit_time,
2641  RepOriginId origin_id, XLogRecPtr origin_lsn)
2642 {
2643  ReorderBufferTXN *txn;
2644 
2645  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2646  false);
2647 
2648  /* unknown transaction, nothing to replay */
2649  if (txn == NULL)
2650  return;
2651 
2652  ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2653  origin_id, origin_lsn);
2654 }
2655 
2656 /*
2657  * Record the prepare information for a transaction.
2658  */
2659 bool
2660 ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2661  XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2662  TimestampTz prepare_time,
2663  RepOriginId origin_id, XLogRecPtr origin_lsn)
2664 {
2665  ReorderBufferTXN *txn;
2666 
2667  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2668 
2669  /* unknown transaction, nothing to do */
2670  if (txn == NULL)
2671  return false;
2672 
2673  /*
2674  * Remember the prepare information to be later used by commit prepared in
2675  * case we skip doing prepare.
2676  */
2677  txn->final_lsn = prepare_lsn;
2678  txn->end_lsn = end_lsn;
2679  txn->xact_time.prepare_time = prepare_time;
2680  txn->origin_id = origin_id;
2681  txn->origin_lsn = origin_lsn;
2682 
2683  return true;
2684 }
2685 
2686 /* Remember that we have skipped prepare */
2687 void
2688 ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2689 {
2690  ReorderBufferTXN *txn;
2691 
2692  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2693 
2694  /* unknown transaction, nothing to do */
2695  if (txn == NULL)
2696  return;
2697 
2698  txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2699 }
2700 
2701 /*
2702  * Prepare a two-phase transaction.
2703  *
2704  * See comments for ReorderBufferReplay().
2705  */
2706 void
2707 ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2708  char *gid)
2709 {
2710  ReorderBufferTXN *txn;
2711 
2712  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2713  false);
2714 
2715  /* unknown transaction, nothing to replay */
2716  if (txn == NULL)
2717  return;
2718 
2719  txn->txn_flags |= RBTXN_PREPARE;
2720  txn->gid = pstrdup(gid);
2721 
2722  /* The prepare info must have been updated in txn by now. */
2723  Assert(txn->final_lsn != InvalidXLogRecPtr);
2724 
2725  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2726  txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2727 
2728  /*
2729  * We send the prepare for concurrently aborted xacts so that, later,
2730  * when rollback prepared is decoded and sent, the downstream is able
2731  * to roll back such a xact. See comments atop DecodePrepare.
2732  *
2733  * Note, for the concurrent_abort + streaming case a stream_prepare was
2734  * already sent within the ReorderBufferReplay call above.
2735  */
2736  if (txn->concurrent_abort && !rbtxn_is_streamed(txn))
2737  rb->prepare(rb, txn, txn->final_lsn);
2738 }
2739 
2740 /*
2741  * This is used to handle COMMIT/ROLLBACK PREPARED.
2742  */
2743 void
2744 ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
2745  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2746  XLogRecPtr two_phase_at,
2747  TimestampTz commit_time, RepOriginId origin_id,
2748  XLogRecPtr origin_lsn, char *gid, bool is_commit)
2749 {
2750  ReorderBufferTXN *txn;
2751  XLogRecPtr prepare_end_lsn;
2752  TimestampTz prepare_time;
2753 
2754  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
2755 
2756  /* unknown transaction, nothing to do */
2757  if (txn == NULL)
2758  return;
2759 
2760  /*
2761  * By this time the txn has the prepare record information; remember it
2762  * to be used later for rollback.
2763  */
2764  prepare_end_lsn = txn->end_lsn;
2765  prepare_time = txn->xact_time.prepare_time;
2766 
2767  /* add the gid in the txn */
2768  txn->gid = pstrdup(gid);
2769 
2770  /*
2771  * It is possible that this transaction is not decoded at prepare time
2772  * either because by that time we didn't have a consistent snapshot, or
2773  * two_phase was not enabled, or it was decoded earlier but we have
2774  * restarted. We only need to send the prepare if it was not decoded
2775  * earlier. We don't need to decode the xact for aborts if it is not
2776  * already done.
2777  */
2778  if ((txn->final_lsn < two_phase_at) && is_commit)
2779  {
2780  txn->txn_flags |= RBTXN_PREPARE;
2781 
2782  /*
2783  * The prepare info must have been updated in txn even if we skip
2784  * prepare.
2785  */
2786  Assert(txn->final_lsn != InvalidXLogRecPtr);
2787 
2788  /*
2789  * By this time the txn has the prepare record information and it is
2790  * important to use that so that downstream gets the accurate
2791  * information. If we had instead passed commit information here, the
2792  * downstream could behave as if it had already replayed the commit
2793  * prepared after the restart.
2794  */
2795  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2796  txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2797  }
2798 
2799  txn->final_lsn = commit_lsn;
2800  txn->end_lsn = end_lsn;
2801  txn->xact_time.commit_time = commit_time;
2802  txn->origin_id = origin_id;
2803  txn->origin_lsn = origin_lsn;
2804 
2805  if (is_commit)
2806  rb->commit_prepared(rb, txn, commit_lsn);
2807  else
2808  rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2809 
2810  /* cleanup: make sure there's no cache pollution */
2811  ReorderBufferExecuteInvalidations(txn->ninvalidations,
2812  txn->invalidations);
2813  ReorderBufferCleanupTXN(rb, txn);
2814 }
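/*
 * Illustrative sketch (not part of this file): for two-phase transactions,
 * the sequence an output plugin observes is begin_prepare_cb, change_cb...,
 * prepare_cb at PREPARE TRANSACTION, and then, when ReorderBufferFinishPrepared
 * runs for COMMIT PREPARED or ROLLBACK PREPARED, commit_prepared_cb or
 * rollback_prepared_cb. A plugin opts in by filling the corresponding
 * callback slots; the my_* names are hypothetical:
 *
 *     cb->begin_prepare_cb = my_begin_prepare;
 *     cb->prepare_cb = my_prepare;
 *     cb->commit_prepared_cb = my_commit_prepared;
 *     cb->rollback_prepared_cb = my_rollback_prepared;
 */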
2815 
2816 /*
2817  * Abort a transaction that possibly has previous changes. Needs to be first
2818  * called for subtransactions and then for the toplevel xid.
2819  *
2820  * NB: Transactions handled here have to have actively aborted (i.e. have
2821  * produced an abort record). Implicitly aborted transactions are handled via
2822  * ReorderBufferAbortOld(); transactions we're just not interested in, but
2823  * which have committed are handled in ReorderBufferForget().
2824  *
2825  * This function purges this transaction and its contents from memory and
2826  * disk.
2827  */
2828 void
2829 ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2830 {
2831  ReorderBufferTXN *txn;
2832 
2833  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2834  false);
2835 
2836  /* unknown, nothing to remove */
2837  if (txn == NULL)
2838  return;
2839 
2840  /* For streamed transactions notify the remote node about the abort. */
2841  if (rbtxn_is_streamed(txn))
2842  {
2843  rb->stream_abort(rb, txn, lsn);
2844 
2845  /*
2846  * We might have decoded changes for this transaction that could load
2847  * the cache as per the current transaction's view (consider DDLs that
2848  * happened in this transaction). We don't want the decoding of future
2849  * transactions to use those cache entries so execute invalidations.
2850  */
2851  if (txn->ninvalidations > 0)
2852  ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2853  txn->invalidations);
2854  }
2855 
2856  /* cosmetic... */
2857  txn->final_lsn = lsn;
2858 
2859  /* remove potential on-disk data, and deallocate */
2860  ReorderBufferCleanupTXN(rb, txn);
2861 }
2862 
2863 /*
2864  * Abort all transactions that aren't actually running anymore because the
2865  * server restarted.
2866  *
2867  * NB: These really have to be transactions that have aborted due to a server
2868  * crash/immediate restart, as we don't deal with invalidations here.
2869  */
2870 void
2871 ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
2872 {
2873  dlist_mutable_iter it;
2874 
2875  /*
2876  * Iterate through all (potential) toplevel TXNs and abort all that are
2877  * older than what possibly can be running. Once we've found the first
2878  * one that is still alive we stop; there might be some that acquired an
2879  * xid earlier but started writing later, but that's unlikely and they
2880  * will be cleaned up in a later call to this function.
2881  */
2882  dlist_foreach_modify(it, &rb->toplevel_by_lsn)
2883  {
2884  ReorderBufferTXN *txn;
2885 
2886  txn = dlist_container(ReorderBufferTXN, node, it.cur);
2887 
2888  if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2889  {
2890  elog(DEBUG2, "aborting old transaction %u", txn->xid);
2891 
2892  /* remove potential on-disk data, and deallocate this tx */
2893  ReorderBufferCleanupTXN(rb, txn);
2894  }
2895  else
2896  return;
2897  }
2898 }
2899 
2900 /*
2901  * Forget the contents of a transaction if we aren't interested in its
2902  * contents. Needs to be first called for subtransactions and then for the
2903  * toplevel xid.
2904  *
2905  * This is significantly different from ReorderBufferAbort() because
2906  * transactions that have committed need to be treated differently from aborted
2907  * ones since they may have modified the catalog.
2908  *
2909  * Note that this is only allowed to be called in the moment a transaction
2910  * commit has just been read, not earlier; otherwise later records referring
2911  * to this xid might re-create the transaction incompletely.
2912  */
2913 void
2914 ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2915 {
2916  ReorderBufferTXN *txn;
2917 
2918  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2919  false);
2920 
2921  /* unknown, nothing to forget */
2922  if (txn == NULL)
2923  return;
2924 
2925  /* For streamed transactions notify the remote node about the abort. */
2926  if (rbtxn_is_streamed(txn))
2927  rb->stream_abort(rb, txn, lsn);
2928 
2929  /* cosmetic... */
2930  txn->final_lsn = lsn;
2931 
2932  /*
2933  * Process cache invalidation messages if there are any. Even if we're not
2934  * interested in the transaction's contents, it could have manipulated the
2935  * catalog and we need to update the caches according to that.
2936  */
2937  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2938  ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2939  txn->invalidations);
2940  else
2941  Assert(txn->ninvalidations == 0);
2942 
2943  /* remove potential on-disk data, and deallocate */
2944  ReorderBufferCleanupTXN(rb, txn);
2945 }
2946 
2947 /*
2948  * Invalidate cache for those transactions that need to be skipped just in case
2949  * catalogs were manipulated as part of the transaction.
2950  *
2951  * Note that this is a special-purpose function for prepared transactions where
2952  * we don't want to clean up the TXN even when we decide to skip it. See
2953  * DecodePrepare.
2954  */
2955 void
2956 ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2957 {
2958  ReorderBufferTXN *txn;
2959 
2960  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2961  false);
2962 
2963  /* unknown, nothing to do */
2964  if (txn == NULL)
2965  return;
2966 
2967  /*
2968  * Process cache invalidation messages if there are any. Even if we're not
2969  * interested in the transaction's contents, it could have manipulated the
2970  * catalog and we need to update the caches according to that.
2971  */
2972  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2973  ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2974  txn->invalidations);
2975  else
2976  Assert(txn->ninvalidations == 0);
2977 }
2978 
2979 
2980 /*
2981  * Execute invalidations happening outside the context of a decoded
2982  * transaction. That currently happens either for xid-less commits
2983  * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
2984  * transactions (via ReorderBufferForget()).
2985  */
2986 void
2987 ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
2988  SharedInvalidationMessage *invalidations)
2989 {
2990  bool use_subtxn = IsTransactionOrTransactionBlock();
2991  int i;
2992 
2993  if (use_subtxn)
2994  BeginInternalSubTransaction("replay");
2995 
2996  /*
2997  * Force invalidations to happen outside of a valid transaction - that way
2998  * entries will just be marked as invalid without accessing the catalog.
2999  * That's advantageous because we don't need to setup the full state
3000  * necessary for catalog access.
3001  */
3002  if (use_subtxn)
3003  AbortCurrentTransaction();
3004 
3005  for (i = 0; i < ninvalidations; i++)
3006  LocalExecuteInvalidationMessage(&invalidations[i]);
3007 
3008  if (use_subtxn)
3009  RollbackAndReleaseCurrentSubTransaction();
3010 }
3011 
3012 /*
3013  * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3014  * least once for every xid in XLogRecord->xl_xid (other places in records
3015  * may, but do not have to be passed through here).
3016  *
3017  * Reorderbuffer keeps some data structures about transactions in LSN order,
3018  * for efficiency. To do that it has to know when transactions are first
3019  * seen in the WAL. As many types of records are not actually interesting for
3020  * logical decoding, they do not necessarily pass through here.
3021  */
3022 void
3023 ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3024 {
3025  /* many records won't have an xid assigned, centralize check here */
3026  if (xid != InvalidTransactionId)
3027  ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3028 }
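/*
 * Illustrative sketch (paraphrasing the call site in decode.c, not verbatim):
 * the decoding loop funnels every record's xid through here before doing any
 * record-type-specific work, so the buffer learns about each transaction at
 * its first WAL record. decode_one_record is a hypothetical wrapper.
 */
static void
decode_one_record(LogicalDecodingContext *ctx, XLogReaderState *record,
				  XLogRecPtr origptr)
{
	/* register the xid (if any) at this record's start LSN */
	ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record), origptr);

	/* ...then dispatch on the record's resource manager, as decode.c does */
}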
3029 
3030 /*
3031  * Add a new snapshot to this transaction that may only be used after lsn
3032  * 'lsn' because the previous snapshot doesn't describe the catalog correctly
3033  * for following rows.
3034  */
3035 void
3036 ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3037  XLogRecPtr lsn, Snapshot snap)
3038 {
3039  ReorderBufferChange *change = ReorderBufferGetChange(rb);
3040 
3041  change->data.snapshot = snap;
3042  change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3043 
3044  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3045 }
3046 
3047 /*
3048  * Set up the transaction's base snapshot.
3049  *
3050  * If we know that xid is a subtransaction, set the base snapshot on the
3051  * top-level transaction instead.
3052  */
3053 void
3054 ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3055  XLogRecPtr lsn, Snapshot snap)
3056 {
3057  ReorderBufferTXN *txn;
3058  bool is_new;
3059 
3060  AssertArg(snap != NULL);
3061 
3062  /*
3063  * Fetch the transaction to operate on. If we know it's a subtransaction,
3064  * operate on its top-level transaction instead.
3065  */
3066  txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3067  if (rbtxn_is_known_subxact(txn))
3068  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3069  NULL, InvalidXLogRecPtr, false);
3070  Assert(txn->base_snapshot == NULL);
3071 
3072  txn->base_snapshot = snap;
3073  txn->base_snapshot_lsn = lsn;
3074  dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3075 
3076  AssertTXNLsnOrder(rb);
3077 }
3078 
3079 /*
3080  * Access the catalog with this CommandId at this point in the changestream.
3081  *
3082  * May only be called for command ids > 1
3083  */
3084 void
3085 ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3086  XLogRecPtr lsn, CommandId cid)
3087 {
3088  ReorderBufferChange *change = ReorderBufferGetChange(rb);
3089 
3090  change->data.command_id = cid;
3091  change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3092 
3093  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3094 }
3095 
3096 /*
3097  * Update memory counters to account for the new or removed change.
3098  *
3099  * We update two counters - in the reorder buffer, and in the transaction
3100  * containing the change. The reorder buffer counter allows us to quickly
3101  * decide if we reached the memory limit, the transaction counter allows
3102  * us to quickly pick the largest transaction for eviction.
3103  *
3104  * When streaming is enabled, we need to update the toplevel transaction
3105  * counters instead - we don't really care about subtransactions as we
3106  * can't stream them individually anyway, and we only pick toplevel
3107  * transactions for eviction. So only toplevel transactions matter.
3108  */
3109 static void
3110 ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3111  ReorderBufferChange *change,
3112  bool addition, Size sz)
3113 {
3114  ReorderBufferTXN *txn;
3115  ReorderBufferTXN *toptxn;
3116 
3117  Assert(change->txn);
3118 
3119  /*
3120  * Ignore tuple CID changes, because those are not evicted when reaching
3121  * the memory limit. So we just don't count them, because counting them
3122  * might easily trigger a pointless attempt to spill.
3123  */
3124  if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3125  return;
3126 
3127  txn = change->txn;
3128 
3129  /*
3130  * Update the total size in top level as well. This is later used to
3131  * compute the decoding stats.
3132  */
3133  if (txn->toptxn != NULL)
3134  toptxn = txn->toptxn;
3135  else
3136  toptxn = txn;
3137 
3138  if (addition)
3139  {
3140  txn->size += sz;
3141  rb->size += sz;
3142 
3143  /* Update the total size in the top transaction. */
3144  toptxn->total_size += sz;
3145  }
3146  else
3147  {
3148  Assert((rb->size >= sz) && (txn->size >= sz));
3149  txn->size -= sz;
3150  rb->size -= sz;
3151 
3152  /* Update the total size in the top transaction. */
3153  toptxn->total_size -= sz;
3154  }
3155 
3156  Assert(txn->size <= rb->size);
3157 }
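/*
 * Worked example (editorial illustration): adding a change of sz = 100 bytes
 * that belongs to a subtransaction whose toplevel parent is toptxn updates
 *
 *     txn->size          += 100;    subxact's own spillable size
 *     rb->size           += 100;    buffer-wide total, checked against
 *                                   logical_decoding_work_mem
 *     toptxn->total_size += 100;    toplevel total, used when picking a
 *                                   transaction to stream
 *
 * and removing the change later subtracts the same amounts.
 */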
3158 
3159 /*
3160  * Add new (relfilenode, tid) -> (cmin, cmax) mappings.
3161  *
3162  * We do not include this change type in memory accounting, because we
3163  * keep CIDs in a separate list and do not evict them when reaching
3164  * the memory limit.
3165  */
3166 void
3167 ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3168  XLogRecPtr lsn, RelFileNode node,
3169  ItemPointerData tid, CommandId cmin,
3170  CommandId cmax, CommandId combocid)
3171 {
3172  ReorderBufferChange *change = ReorderBufferGetChange(rb);
3173  ReorderBufferTXN *txn;
3174 
3175  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3176 
3177  change->data.tuplecid.node = node;
3178  change->data.tuplecid.tid = tid;
3179  change->data.tuplecid.cmin = cmin;
3180  change->data.tuplecid.cmax = cmax;
3181  change->data.tuplecid.combocid = combocid;
3182  change->lsn = lsn;
3183  change->txn = txn;
3184  change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3185 
3186  dlist_push_tail(&txn->tuplecids, &change->node);
3187  txn->ntuplecids++;
3188 }
3189 
3190 /*
3191  * Setup the invalidation of the toplevel transaction.
3192  *
3193  * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3194  * accumulates all the invalidation messages in the toplevel transaction, as
3195  * well as in the form of a change in the reorder buffer. We record it as a
3196  * change so that we can execute only the required invalidations instead of
3197  * executing all the invalidations on each CommandId increment. We also need
3198  * to accumulate these in the toplevel transaction because in some cases we
3199  * skip processing the transaction (see ReorderBufferForget), and then we
3200  * need to execute all the invalidations together.
3201  */
3202 void
3203 ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3204  XLogRecPtr lsn, Size nmsgs,
3205  SharedInvalidationMessage *msgs)
3206 {
3207  ReorderBufferTXN *txn;
3208  MemoryContext oldcontext;
3209  ReorderBufferChange *change;
3210 
3211  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3212 
3213  oldcontext = MemoryContextSwitchTo(rb->context);
3214 
3215  /*
3216  * Collect all the invalidations under the top transaction so that we can
3217  * execute them all together. See comment atop this function
3218  */
3219  if (txn->toptxn)
3220  txn = txn->toptxn;
3221 
3222  Assert(nmsgs > 0);
3223 
3224  /* Accumulate invalidations. */
3225  if (txn->ninvalidations == 0)
3226  {
3227  txn->ninvalidations = nmsgs;
3228  txn->invalidations = (SharedInvalidationMessage *)
3229  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3230  memcpy(txn->invalidations, msgs,
3231  sizeof(SharedInvalidationMessage) * nmsgs);
3232  }
3233  else
3234  {
3235  txn->invalidations = (SharedInvalidationMessage *)
3236  repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) *
3237  (txn->ninvalidations + nmsgs));
3238 
3239  memcpy(txn->invalidations + txn->ninvalidations, msgs,
3240  nmsgs * sizeof(SharedInvalidationMessage));
3241  txn->ninvalidations += nmsgs;
3242  }
3243 
3244  change = ReorderBufferGetChange(rb);
3245  change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3246  change->data.inval.ninvalidations = nmsgs;
3247  change->data.inval.invalidations = (SharedInvalidationMessage *)
3248  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3249  memcpy(change->data.inval.invalidations, msgs,
3250  sizeof(SharedInvalidationMessage) * nmsgs);
3251 
3252  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3253 
3254  MemoryContextSwitchTo(oldcontext);
3255 }
3256 
3257 /*
3258  * Apply all invalidations we know. Possibly we only need parts at this point
3259  * in the changestream but we don't know which those are.
3260  */
3261 static void
3262 ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3263 {
3264  int i;
3265 
3266  for (i = 0; i < nmsgs; i++)
3267  LocalExecuteInvalidationMessage(&msgs[i]);
3268 }
3269 
3270 /*
3271  * Mark a transaction as containing catalog changes
3272  */
3273 void
3274 ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3275  XLogRecPtr lsn)
3276 {
3277  ReorderBufferTXN *txn;
3278 
3279  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3280 
3281  txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3282 
3283  /*
3284  * Mark top-level transaction as having catalog changes too if one of its
3285  * children has so that the ReorderBufferBuildTupleCidHash can
3286  * conveniently check just top-level transaction and decide whether to
3287  * build the hash table or not.
3288  */
3289  if (txn->toptxn != NULL)
3290  txn->toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3291 }
3292 
3293 /*
3294  * Query whether a transaction is already *known* to contain catalog
3295  * changes. This can be wrong until directly before the commit!
3296  */
3297 bool
3298 ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3299 {
3300  ReorderBufferTXN *txn;
3301 
3302  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3303  false);
3304  if (txn == NULL)
3305  return false;
3306 
3307  return rbtxn_has_catalog_changes(txn);
3308 }
3309 
3310 /*
3311  * ReorderBufferXidHasBaseSnapshot
3312  * Have we already set the base snapshot for the given txn/subtxn?
3313  */
3314 bool
3315 ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3316 {
3317  ReorderBufferTXN *txn;
3318 
3319  txn = ReorderBufferTXNByXid(rb, xid, false,
3320  NULL, InvalidXLogRecPtr, false);
3321 
3322  /* transaction isn't known yet, ergo no snapshot */
3323  if (txn == NULL)
3324  return false;
3325 
3326  /* a known subtxn? operate on top-level txn instead */
3327  if (rbtxn_is_known_subxact(txn))
3328  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3329  NULL, InvalidXLogRecPtr, false);
3330 
3331  return txn->base_snapshot != NULL;
3332 }
3333 
3334 
3335 /*
3336  * ---------------------------------------
3337  * Disk serialization support
3338  * ---------------------------------------
3339  */
3340 
3341 /*
3342  * Ensure the IO buffer is >= sz.
3343  */
3344 static void
3345 ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3346 {
3347  if (!rb->outbufsize)
3348  {
3349  rb->outbuf = MemoryContextAlloc(rb->context, sz);
3350  rb->outbufsize = sz;
3351  }
3352  else if (rb->outbufsize < sz)
3353  {
3354  rb->outbuf = repalloc(rb->outbuf, sz);
3355  rb->outbufsize = sz;
3356  }
3357 }
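/*
 * Usage sketch (editorial, distilled from ReorderBufferSerializeChange below):
 * reserve the full record size up front, then write through rb->outbuf. Since
 * the buffer only ever grows, repeated spills reuse a single allocation
 * instead of palloc'ing per change; payload_len below is hypothetical.
 *
 *     Size sz = sizeof(ReorderBufferDiskChange) + payload_len;
 *
 *     ReorderBufferSerializeReserve(rb, sz);
 *     ondisk = (ReorderBufferDiskChange *) rb->outbuf;   -- may have moved
 */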
3358 
3359 /*
3360  * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3361  *
3362  * XXX With many subtransactions this might be quite slow, because we'll have
3363  * to walk through all of them. There are some options for how we could
3364  * improve that: (a) maintain some secondary structure with transactions
3365  * sorted by the amount of changes, (b) look not for the single largest
3366  * transaction but e.g. for a transaction using at least some fraction of the
3367  * memory limit, and (c) evict multiple transactions at once, e.g. to free a
3368  * given portion of the memory limit (e.g. 50%).
3369  */
3370 static ReorderBufferTXN *
3371 ReorderBufferLargestTXN(ReorderBuffer *rb)
3372 {
3373  HASH_SEQ_STATUS hash_seq;
3374  ReorderBufferTXNByIdEnt *ent;
3375  ReorderBufferTXN *largest = NULL;
3376 
3377  hash_seq_init(&hash_seq, rb->by_txn);
3378  while ((ent = hash_seq_search(&hash_seq)) != NULL)
3379  {
3380  ReorderBufferTXN *txn = ent->txn;
3381 
3382  /* if the current transaction is larger, remember it */
3383  if ((!largest) || (txn->size > largest->size))
3384  largest = txn;
3385  }
3386 
3387  Assert(largest);
3388  Assert(largest->size > 0);
3389  Assert(largest->size <= rb->size);
3390 
3391  return largest;
3392 }
3393 
3394 /*
3395  * Find the largest toplevel transaction to evict (by streaming).
3396  *
3397  * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3398  * should give us the same transaction (because we don't update the memory
3399  * accounting for subtransactions with streaming, so it's always 0). But we
3400  * can simply iterate over the limited number of toplevel transactions that
3401  * have a base snapshot. There is no point in selecting a transaction that
3402  * doesn't have a base snapshot because we don't decode such transactions.
3403  *
3404  * Note that we skip transactions that contain incomplete changes. There
3405  * is scope for optimization here such that we could select the largest
3406  * transaction which has incomplete changes. But that would make the code and
3407  * design quite complex, and it might not be worth the benefit. If we plan to
3408  * stream the transactions that contain incomplete changes then we need to
3409  * find a way to partially stream/truncate the transaction changes in-memory
3410  * and build a mechanism to partially truncate the spilled files.
3411  * Additionally, whenever we partially stream the transaction we need to
3412  * maintain the last streamed lsn and next time we need to restore from that
3413  * segment and the offset in WAL. As we stream the changes from the top
3414  * transaction and restore them subtransaction wise, we even need to remember
3415  * the subxact from where we streamed the last change.
3416  */
3417 static ReorderBufferTXN *
3418 ReorderBufferLargestTopTXN(ReorderBuffer *rb)
3419 {
3420  dlist_iter iter;
3421  Size largest_size = 0;
3422  ReorderBufferTXN *largest = NULL;
3423 
3424  /* Find the largest top-level transaction having a base snapshot. */
3425  dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3426  {
3427  ReorderBufferTXN *txn;
3428 
3429  txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3430 
3431  /* must not be a subtxn */
3432  Assert(!rbtxn_is_known_subxact(txn));
3433  /* base_snapshot must be set */
3434  Assert(txn->base_snapshot != NULL);
3435 
3436  if ((largest == NULL || txn->total_size > largest_size) &&
3437  (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)))
3438  {
3439  largest = txn;
3440  largest_size = txn->total_size;
3441  }
3442  }
3443 
3444  return largest;
3445 }
3446 
3447 /*
3448  * Check whether the logical_decoding_work_mem limit was reached, and if yes
3449  * pick the largest (sub)transaction at a time to evict and spill its changes
3450  * to disk until we are under the memory limit.
3451  *
3452  * XXX At this point we select transactions until we are under the memory
3453  * limit, but we might also adopt a more elaborate eviction strategy - for
3454  * example evicting enough transactions to free a certain fraction (e.g. 50%)
3455  * of the memory limit.
3456  */
3457 static void
3458 ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3459 {
3460  ReorderBufferTXN *txn;
3461 
3462  /* bail out if we haven't exceeded the memory limit */
3463  if (rb->size < logical_decoding_work_mem * 1024L)
3464  return;
3465 
3466  /*
3467  * Loop until we are under the memory limit. One might think that just
3468  * by evicting the largest (sub)transaction we will come under the memory
3469  * limit based on the assumption that the selected transaction is at least as
3470  * large as the most recent change (which caused us to go over the memory
3471  * limit). However, that is not true because a user can reduce the
3472  * logical_decoding_work_mem to a smaller value before the most recent
3473  * change.
3474  */
3475  while (rb->size >= logical_decoding_work_mem * 1024L)
3476  {
3477  /*
3478  * Pick the largest transaction (or subtransaction) and evict it from
3479  * memory by streaming, if possible. Otherwise, spill to disk.
3480  */
3481  if (ReorderBufferCanStartStreaming(rb) &&
3482  (txn = ReorderBufferLargestTopTXN(rb)) != NULL)
3483  {
3484  /* we know there has to be one, because the size is not zero */
3485  Assert(txn && !txn->toptxn);
3486  Assert(txn->total_size > 0);
3487  Assert(rb->size >= txn->total_size);
3488 
3489  ReorderBufferStreamTXN(rb, txn);
3490  }
3491  else
3492  {
3493  /*
3494  * Pick the largest transaction (or subtransaction) and evict it
3495  * from memory by serializing it to disk.
3496  */
3497  txn = ReorderBufferLargestTXN(rb);
3498 
3499  /* we know there has to be one, because the size is not zero */
3500  Assert(txn);
3501  Assert(txn->size > 0);
3502  Assert(rb->size >= txn->size);
3503 
3504  ReorderBufferSerializeTXN(rb, txn);
3505  }
3506 
3507  /*
3508  * After eviction, the transaction should have no entries in memory,
3509  * and should use 0 bytes for changes.
3510  */
3511  Assert(txn->size == 0);
3512  Assert(txn->nentries_mem == 0);
3513  }
3514 
3515  /* We must be under the memory limit now. */
3516  Assert(rb->size < logical_decoding_work_mem * 1024L);
3517 }
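/*
 * Editorial note on the arithmetic above: logical_decoding_work_mem is a GUC
 * measured in kilobytes, hence the byte threshold of
 * logical_decoding_work_mem * 1024L. With the default of 64MB (65536 kB),
 * eviction begins once decoded changes exceed 65536 * 1024 = 67108864 bytes.
 */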
3518 
3519 /*
3520  * Spill data of a large transaction (and its subtransactions) to disk.
3521  */
3522 static void
3523 ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3524 {
3525  dlist_iter subtxn_i;
3526  dlist_mutable_iter change_i;
3527  int fd = -1;
3528  XLogSegNo curOpenSegNo = 0;
3529  Size spilled = 0;
3530  Size size = txn->size;
3531 
3532  elog(DEBUG2, "spill %u changes in XID %u to disk",
3533  (uint32) txn->nentries_mem, txn->xid);
3534 
3535  /* do the same to all child TXs */
3536  dlist_foreach(subtxn_i, &txn->subtxns)
3537  {
3538  ReorderBufferTXN *subtxn;
3539 
3540  subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3541  ReorderBufferSerializeTXN(rb, subtxn);
3542  }
3543 
3544  /* serialize changestream */
3545  dlist_foreach_modify(change_i, &txn->changes)
3546  {
3547  ReorderBufferChange *change;
3548 
3549  change = dlist_container(ReorderBufferChange, node, change_i.cur);
3550 
3551  /*
3552  * Store the change in the segment to which it belongs by start LSN;
3553  * don't split a change over multiple segments, though.
3554  */
3555  if (fd == -1 ||
3556  !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3557  {
3558  char path[MAXPGPATH];
3559 
3560  if (fd != -1)
3561  CloseTransientFile(fd);
3562 
3563  XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3564 
3565  /*
3566  * No need to care about TLIs here, only used during a single run,
3567  * so each LSN only maps to a specific WAL record.
3568  */
3569  ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
3570  curOpenSegNo);
3571 
3572  /* open segment, create it if necessary */
3573  fd = OpenTransientFile(path,
3574  O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3575 
3576  if (fd < 0)
3577  ereport(ERROR,
3578  (errcode_for_file_access(),
3579  errmsg("could not open file \"%s\": %m", path)));
3580  }
3581 
3582  ReorderBufferSerializeChange(rb, txn, fd, change);
3583  dlist_delete(&change->node);
3584  ReorderBufferReturnChange(rb, change, true);
3585 
3586  spilled++;
3587  }
3588 
3589  /* update the statistics iff we have spilled anything */
3590  if (spilled)
3591  {
3592  rb->spillCount += 1;
3593  rb->spillBytes += size;
3594 
3595  /* don't consider already serialized transactions */
3596  rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3597 
3598  /* update the decoding stats */
3599  UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3600  }
3601 
3602  Assert(spilled == txn->nentries_mem);
3603  Assert(dlist_is_empty(&txn->changes));
3604  txn->nentries_mem = 0;
3605  txn->txn_flags |= RBTXN_IS_SERIALIZED;
3606 
3607  if (fd != -1)
3608  CloseTransientFile(fd);
3609 }
3610 
3611 /*
3612  * Serialize individual change to disk.
3613  */
3614 static void
3615 ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
3616  int fd, ReorderBufferChange *change)
3617 {
3618  ReorderBufferDiskChange *ondisk;
3619  Size sz = sizeof(ReorderBufferDiskChange);
3620 
3621  ReorderBufferSerializeReserve(rb, sz);
3622 
3623  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3624  memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3625 
3626  switch (change->action)
3627  {
3628  /* fall through these, they're all similar enough */
3629  case REORDER_BUFFER_CHANGE_INSERT:
3630  case REORDER_BUFFER_CHANGE_UPDATE:
3631  case REORDER_BUFFER_CHANGE_DELETE:
3632  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3633  {
3634  char *data;
3635  ReorderBufferTupleBuf *oldtup,
3636  *newtup;
3637  Size oldlen = 0;
3638  Size newlen = 0;
3639 
3640  oldtup = change->data.tp.oldtuple;
3641  newtup = change->data.tp.newtuple;
3642 
3643  if (oldtup)
3644  {
3645  sz += sizeof(HeapTupleData);
3646  oldlen = oldtup->tuple.t_len;
3647  sz += oldlen;
3648  }
3649 
3650  if (newtup)
3651  {
3652  sz += sizeof(HeapTupleData);
3653  newlen = newtup->tuple.t_len;
3654  sz += newlen;
3655  }
3656 
3657  /* make sure we have enough space */
3658  ReorderBufferSerializeReserve(rb, sz);
3659 
3660  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3661  /* might have been reallocated above */
3662  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3663 
3664  if (oldlen)
3665  {
3666  memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
3667  data += sizeof(HeapTupleData);
3668 
3669  memcpy(data, oldtup->tuple.t_data, oldlen);
3670  data += oldlen;
3671  }
3672 
3673  if (newlen)
3674  {
3675  memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
3676  data += sizeof(HeapTupleData);
3677 
3678  memcpy(data, newtup->tuple.t_data, newlen);
3679  data += newlen;
3680  }
3681  break;
3682  }
3683  case REORDER_BUFFER_CHANGE_MESSAGE:
3684  {
3685  char *data;
3686  Size prefix_size = strlen(change->data.msg.prefix) + 1;
3687 
3688  sz += prefix_size + change->data.msg.message_size +
3689  sizeof(Size) + sizeof(Size);
3690  ReorderBufferSerializeReserve(rb, sz);
3691 
3692  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3693 
3694  /* might have been reallocated above */
3695  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3696 
3697  /* write the prefix including the size */
3698  memcpy(data, &prefix_size, sizeof(Size));
3699  data += sizeof(Size);
3700  memcpy(data, change->data.msg.prefix,
3701  prefix_size);
3702  data += prefix_size;
3703 
3704  /* write the message including the size */
3705  memcpy(data, &change->data.msg.message_size, sizeof(Size));
3706  data += sizeof(Size);
3707  memcpy(data, change->data.msg.message,
3708  change->data.msg.message_size);
3709  data += change->data.msg.message_size;
3710 
3711  break;
3712  }
3713  case REORDER_BUFFER_CHANGE_INVALIDATION:
3714  {
3715  char *data;
3716  Size inval_size = sizeof(SharedInvalidationMessage) *
3717  change->data.inval.ninvalidations;
3718 
3719  sz += inval_size;
3720 
3721  ReorderBufferSerializeReserve(rb, sz);
3722  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3723 
3724  /* might have been reallocated above */
3725  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3726  memcpy(data, change->data.inval.invalidations, inval_size);
3727  data += inval_size;
3728 
3729  break;
3730  }
3731  case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
3732  {
3733  Snapshot snap;
3734  char *data;
3735 
3736  snap = change->data.snapshot;
3737 
3738  sz += sizeof(SnapshotData) +
3739  sizeof(TransactionId) * snap->xcnt +
3740  sizeof(TransactionId) * snap->subxcnt;
3741 
3742  /* make sure we have enough space */
3743  ReorderBufferSerializeReserve(rb, sz);
3744  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3745  /* might have been reallocated above */
3746  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3747 
3748  memcpy(data, snap, sizeof(SnapshotData));
3749  data += sizeof(SnapshotData);
3750 
3751  if (snap->xcnt)
3752  {
3753  memcpy(data, snap->xip,
3754  sizeof(TransactionId) * snap->xcnt);
3755  data += sizeof(TransactionId) * snap->xcnt;
3756  }
3757 
3758  if (snap->subxcnt)
3759  {
3760  memcpy(data, snap->subxip,
3761  sizeof(TransactionId) * snap->subxcnt);
3762  data += sizeof(TransactionId) * snap->subxcnt;
3763  }
3764  break;
3765  }
3766  case REORDER_BUFFER_CHANGE_TRUNCATE:
3767  {
3768  Size size;
3769  char *data;
3770 
3771  /* account for the OIDs of truncated relations */
3772  size = sizeof(Oid) * change->data.truncate.nrelids;
3773  sz += size;
3774 
3775  /* make sure we have enough space */
3776  ReorderBufferSerializeReserve(rb, sz);
3777 
3778  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3779  /* might have been reallocated above */
3780  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3781 
3782  memcpy(data, change->data.truncate.relids, size);
3783  data += size;
3784 
3785  break;
3786  }
3787  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
3788  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
3789  case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
3790  case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
3791  /* ReorderBufferChange contains everything important */
3792  break;
3793  }
3794 
3795  ondisk->size = sz;
3796 
3797  errno = 0;
3798  pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
3799  if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3800  {
3801  int save_errno = errno;
3802 
3803  CloseTransientFile(fd);
3804 
3805  /* if write didn't set errno, assume problem is no disk space */
3806  errno = save_errno ? save_errno : ENOSPC;
3807  ereport(ERROR,
3808  (errcode_for_file_access(),
3809  errmsg("could not write to data file for XID %u: %m",
3810  txn->xid)));
3811  }
3812  pgstat_report_wait_end();
3813 
3814  /*
3815  * Keep the transaction's final_lsn up to date with each change we send to
3816  * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3817  * only do this on commit and abort records, but that doesn't work if a
3818  * system crash leaves a transaction without its abort record).
3819  *
3820  * Make sure not to move it backwards.
3821  */
3822  if (txn->final_lsn < change->lsn)
3823  txn->final_lsn = change->lsn;
3824 
3825  Assert(ondisk->change.action == change->action);
3826 }
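/*
 * Resulting on-disk record layout (editorial summary of the code above, not
 * an independent format specification):
 *
 *     +-----------------------------------------+
 *     | ReorderBufferDiskChange                 |
 *     |   size   - total record length in bytes |
 *     |   change - flat copy of the change      |
 *     +-----------------------------------------+
 *     | action-specific payload, e.g. for       |
 *     | INSERT/UPDATE/DELETE:                   |
 *     |   [HeapTupleData][old tuple data]       |
 *     |   [HeapTupleData][new tuple data]       |
 *     +-----------------------------------------+
 *
 * ReorderBufferRestoreChanges later reads records back using the 'size'
 * field and fixes up the pointers inside 'change'.
 */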
3827 
3828 /* Returns true if the output plugin supports streaming, false otherwise. */
3829 static inline bool
3830 ReorderBufferCanStream(ReorderBuffer *rb)
3831 {
3832  LogicalDecodingContext *ctx = rb->private_data;
3833 
3834  return ctx->streaming;
3835 }
3836 
3837 /* Returns true if streaming can be started now, false otherwise. */
3838 static inline bool
3839 ReorderBufferCanStartStreaming(ReorderBuffer *rb)
3840 {
3841  LogicalDecodingContext *ctx = rb->private_data;
3842  SnapBuild *builder = ctx->snapshot_builder;
3843 
3844  /* We can't start streaming unless a consistent state is reached. */
3845  if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
3846  return false;
3847 
3848  /*
3849  * We can't start streaming immediately even if the streaming is enabled
3850  * because we previously decoded this transaction and now just are
3851  * restarting.
3852  */
3853  if (ReorderBufferCanStream(rb) &&
3854  !SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr))
3855  return true;
3856 
3857  return false;
3858 }
3859 
3860 /*
3861  * Send data of a large transaction (and its subtransactions) to the
3862  * output plugin, but using the stream API.
3863  */
3864 static void
3865 ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3866 {
3867  Snapshot snapshot_now;
3868  CommandId command_id;
3869  Size stream_bytes;
3870  bool txn_is_streamed;
3871 
3872  /* We can never reach here for a subtransaction. */
3873  Assert(txn->toptxn == NULL);
3874 
3875  /*
3876  * We can't make any assumptions about base snapshot here, similar to what
3877  * ReorderBufferCommit() does. That relies on base_snapshot getting
3878  * transferred from subxact in ReorderBufferCommitChild(), but that was
3879  * not yet called as the transaction is still in progress.
3880  *
3881  * So just walk the subxacts and use the same logic here. But we only need
3882  * to do that once, when the transaction is streamed for the first time.
3883  * After that we need to reuse the snapshot from the previous run.
3884  *
3885  * Unlike DecodeCommit which adds xids of all the subtransactions in
3886  * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here
3887  * but we do add them to subxip array instead via ReorderBufferCopySnap.
3888  * This allows the catalog changes made in subtransactions decoded till
3889  * now to be visible.
3890  */
3891  if (txn->snapshot_now == NULL)
3892  {
3893  dlist_iter subxact_i;
3894 
3895  /* make sure this transaction is streamed for the first time */
3896  Assert(!rbtxn_is_streamed(txn));
3897 
3898  /* at the beginning we should have invalid command ID */
3899  Assert(txn->command_id == InvalidCommandId);
3900 
3901  dlist_foreach(subxact_i, &txn->subtxns)
3902  {
3903  ReorderBufferTXN *subtxn;
3904 
3905  subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
3906  ReorderBufferTransferSnapToParent(txn, subtxn);
3907  }
3908 
3909  /*
3910  * If this transaction has no snapshot, it didn't make any changes to
3911  * the database till now, so there's nothing to decode.
3912  */
3913  if (txn->base_snapshot == NULL)
3914  {
3915  Assert(txn->ninvalidations == 0);
3916  return;
3917  }
3918 
3919  command_id = FirstCommandId;
3920  snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
3921  txn, command_id);
3922  }
3923  else
3924  {
3925  /* the transaction must have been already streamed */
3926  Assert(rbtxn_is_streamed(txn));
3927 
3928  /*
3929  * We already have a snapshot from the previous streaming run. We
3930  * assume new subxacts can't move the LSN backwards, and so can't beat
3931  * the LSN condition in the previous branch (so no need to walk
3932  * through subxacts again). In fact, we must not do that as we may be
3933  * using a snapshot half-way through the subxact.
3934  */
3935  command_id = txn->command_id;
3936 
3937  /*
3938  * We can't use txn->snapshot_now directly because after the last
3939  * streaming run, we might have got some new sub-transactions. So we
3940  * need to add them to the snapshot.
3941  */
3942  snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
3943  txn, command_id);
3944 
3945  /* Free the previously copied snapshot. */
3946  Assert(txn->snapshot_now->copied);
3947  ReorderBufferFreeSnap(rb, txn->snapshot_now);
3948  txn->snapshot_now = NULL;
3949  }
3950 
3951  /*
3952  * Remember this information to be used later to update stats. We can't
3953  * update the stats here, as an error while processing the changes would
3954  * leave stats accumulated for changes we never actually streamed in
3955  * full.
3956  */
3957  txn_is_streamed = rbtxn_is_streamed(txn);
3958  stream_bytes = txn->total_size;
3959 
3960  /* Process and send the changes to output plugin. */
3961  ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
3962  command_id, true);
3963 
3964  rb->streamCount += 1;
3965  rb->streamBytes += stream_bytes;
3966 
3967  /* Don't count a transaction that was already streamed. */
3968  rb->streamTxns += (txn_is_streamed) ? 0 : 1;
3969 
3970  /* update the decoding stats */
3971  UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3972 
3973  Assert(dlist_is_empty(&txn->changes));
3974  Assert(txn->nentries == 0);
3975  Assert(txn->nentries_mem == 0);
3976 }
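/*
 * Illustrative sketch (not part of reorderbuffer.c): the "capture first,
 * account on success" pattern used by ReorderBufferStreamTXN() above.
 * Counters are snapshotted before the fallible work and folded into the
 * stats only once that work has succeeded, so an error cannot leave
 * half-streamed bytes accounted for.  All names here are hypothetical.
 */
typedef struct SketchStreamStats
{
	unsigned long streamCount;
	unsigned long streamBytes;
} SketchStreamStats;

static int
sketch_stream_once(SketchStreamStats *stats, unsigned long txn_size,
				   int (*process_changes) (void))
{
	unsigned long pending_bytes = txn_size; /* remember before processing */

	if (process_changes() != 0)
		return -1;				/* error: stats remain untouched */

	stats->streamCount += 1;	/* success: fold in the captured values */
	stats->streamBytes += pending_bytes;
	return 0;
}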
3977 
3978 /*
3979  * Size of a change in memory.
3980  */
3981 static Size
3982 ReorderBufferChangeSize(ReorderBufferChange *change)
3983 {
3984  Size sz = sizeof(ReorderBufferChange);
3985 
3986  switch (change->action)
3987  {
3988  /* fall through these, they're all similar enough */
3989  case REORDER_BUFFER_CHANGE_INSERT:
3990  case REORDER_BUFFER_CHANGE_UPDATE:
3991  case REORDER_BUFFER_CHANGE_DELETE:
3992  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3993  {
3994  ReorderBufferTupleBuf *oldtup,
3995  *newtup;
3996  Size oldlen = 0;
3997  Size newlen = 0;
3998 
3999  oldtup = change->data.tp.oldtuple;
4000  newtup = change->data.tp.newtuple;
4001 
4002  if (oldtup)
4003  {
4004  sz += sizeof(HeapTupleData);
4005  oldlen = oldtup->tuple.t_len;
4006  sz += oldlen;
4007  }
4008 
4009  if (newtup)
4010  {
4011  sz += sizeof(HeapTupleData);
4012  newlen = newtup->tuple.t_len;
4013  sz += newlen;
4014  }
4015 
4016  break;
4017  }
4018  case REORDER_BUFFER_CHANGE_MESSAGE:
4019  {
4020  Size prefix_size = strlen(change->data.msg.prefix) + 1;
4021 
4022  sz += prefix_size + change->data.msg.message_size +
4023  sizeof(Size) + sizeof(Size);
4024 
4025  break;
4026  }
4027  case REORDER_BUFFER_CHANGE_INVALIDATION:
4028  {
4029  sz += sizeof(SharedInvalidationMessage) *
4030  change->data.inval.ninvalidations;
4031  break;
4032  }
4033  case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4034  {
4035  Snapshot snap;
4036 
4037  snap = change->data.snapshot;
4038 
4039  sz += sizeof(SnapshotData) +
4040  sizeof(TransactionId) * snap->xcnt +
4041  sizeof(TransactionId) * snap->subxcnt;
4042 
4043  break;
4044  }
4045  case REORDER_BUFFER_CHANGE_TRUNCATE:
4046  {
4047  sz += sizeof(Oid) * change->data.truncate.nrelids;
4048 
4049  break;
4050  }
4051  case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4052  case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4053  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4054  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4055  /* ReorderBufferChange contains everything important */
4056  break;
4057  }
4058 
4059  return sz;
4060 }
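/*
 * Illustrative sketch (not part of reorderbuffer.c): how a per-change size
 * function like ReorderBufferChangeSize() above can drive two-level memory
 * accounting, per transaction and for the buffer as a whole.  Minimal,
 * hypothetical types; the real structures carry much more state.
 */
#include <stdbool.h>
#include <stddef.h>

typedef struct SketchTxn { size_t size; } SketchTxn;
typedef struct SketchBuf { size_t size; size_t limit; } SketchBuf;

/* returns true when the buffer now exceeds its configured memory limit */
static bool
sketch_account_change(SketchBuf *rb, SketchTxn *txn, size_t change_sz,
					  bool addition)
{
	if (addition)
	{
		rb->size += change_sz;
		txn->size += change_sz;
	}
	else
	{
		rb->size -= change_sz;	/* caller passes the same size on removal */
		txn->size -= change_sz;
	}
	return rb->size > rb->limit;
}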
4061 
4062 
4063 /*
4064  * Restore a number of changes spilled to disk back into memory.
4065  */
4066 static Size
4067 ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4068  TXNEntryFile *file, XLogSegNo *segno)
4069 {
4070  Size restored = 0;
4071  XLogSegNo last_segno;
4072  dlist_mutable_iter cleanup_iter;
4073  File *fd = &file->vfd;
4074 
4075  Assert(txn->first_lsn != InvalidXLogRecPtr);
4076  Assert(txn->final_lsn != InvalidXLogRecPtr);
4077 
4078  /* free current entries, so we have memory for more */
4079  dlist_foreach_modify(cleanup_iter, &txn->changes)
4080  {
4081  ReorderBufferChange *cleanup =
4082  dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4083 
4084  dlist_delete(&cleanup->node);
4085  ReorderBufferReturnChange(rb, cleanup, true);
4086  }
4087  txn->nentries_mem = 0;
4088  Assert(dlist_is_empty(&txn->changes));
4089 
4090  XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4091 
4092  while (restored < max_changes_in_memory && *segno <= last_segno)
4093  {
4094  int readBytes;
4095  ReorderBufferDiskChange *ondisk;
4096 
4097  if (*fd == -1)
4098  {
4099  char path[MAXPGPATH];
4100 
4101  /* first time in */
4102  if (*segno == 0)
4103  XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4104 
4105  Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4106 
4107  /*
4108  * No need to care about TLIs here, only used during a single run,
4109  * so each LSN only maps to a specific WAL record.
4110  */
4111  ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4112  *segno);
4113 
4114  *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4115 
4116  /* No harm in resetting the offset even in case of failure */
4117  file->curOffset = 0;
4118 
4119  if (*fd < 0 && errno == ENOENT)
4120  {
4121  *fd = -1;
4122  (*segno)++;
4123  continue;
4124  }
4125  else if (*fd < 0)
4126  ereport(ERROR,
4127  (errcode_for_file_access(),
4128  errmsg("could not open file \"%s\": %m",
4129  path)));
4130  }
4131 
4132  /*
4133  * Read the statically sized part of a change which has information
4134  * about the total size. If we couldn't read a record, we're at the
4135  * end of this file.
4136  */
4137  ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
4138  readBytes = FileRead(file->vfd, rb->outbuf,
4139  sizeof(ReorderBufferDiskChange),
4140  file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4141 
4142  /* eof */
4143  if (readBytes == 0)
4144  {
4145  FileClose(*fd);
4146  *fd = -1;
4147  (*segno)++;
4148  continue;
4149  }
4150  else if (readBytes < 0)
4151  ereport(ERROR,
4152  (errcode_for_file_access(),
4153  errmsg("could not read from reorderbuffer spill file: %m")));
4154  else if (readBytes != sizeof(ReorderBufferDiskChange))
4155  ereport(ERROR,
4156  (errcode(ERRCODE_DATA_CORRUPTED),
4157  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4158  readBytes,
4159  (uint32) sizeof(ReorderBufferDiskChange))));
4160 
4161  file->curOffset += readBytes;
4162 
4163  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4164 
4165  ReorderBufferSerializeReserve(rb,
4166  sizeof(ReorderBufferDiskChange) + ondisk->size);
4167  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4168 
4169  readBytes = FileRead(file->vfd,
4170  rb->outbuf + sizeof(ReorderBufferDiskChange),
4171  ondisk->size - sizeof(ReorderBufferDiskChange),
4172  file->curOffset,
4173  WAIT_EVENT_REORDER_BUFFER_READ);
4174 
4175  if (readBytes < 0)
4176  ereport(ERROR,
4177  (errcode_for_file_access(),
4178  errmsg("could not read from reorderbuffer spill file: %m")));
4179  else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4180  ereport(ERROR,
4181  (errcode(ERRCODE_DATA_CORRUPTED),
4182  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4183  readBytes,
4184  (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4185 
4186  file->curOffset += readBytes;
4187 
4188  /*
4189  * ok, read a full change from disk, now restore it into proper
4190  * in-memory format
4191  */
4192  ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4193  restored++;
4194  }
4195 
4196  return restored;
4197 }
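/*
 * Illustrative sketch (not part of reorderbuffer.c): the header-then-payload
 * record layout that ReorderBufferRestoreChanges() reads back above.  Each
 * on-disk record starts with a fixed-size header whose 'size' field covers
 * the whole record, so a reader pulls the header first and then the rest.
 * Plain POSIX I/O, hypothetical types, errors reduced to return codes.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

typedef struct SketchDiskHeader
{
	uint32_t	size;			/* total record size, header included */
} SketchDiskHeader;

/* returns 1 on success, 0 on clean EOF, -1 on error/corruption */
static int
sketch_read_record(int fd, char **buf, size_t *buflen)
{
	SketchDiskHeader hdr;
	ssize_t		nread = read(fd, &hdr, sizeof(hdr));

	if (nread == 0)
		return 0;				/* clean EOF between records */
	if (nread != (ssize_t) sizeof(hdr) || hdr.size < sizeof(hdr))
		return -1;				/* short read or corrupt header */

	if (*buflen < hdr.size)		/* grow the reassembly buffer if needed */
	{
		char	   *tmp = realloc(*buf, hdr.size);

		if (tmp == NULL)
			return -1;
		*buf = tmp;
		*buflen = hdr.size;
	}

	/* keep header and payload together, as the restore code expects */
	memcpy(*buf, &hdr, sizeof(hdr));
	nread = read(fd, *buf + sizeof(hdr), hdr.size - sizeof(hdr));
	return (nread == (ssize_t) (hdr.size - sizeof(hdr))) ? 1 : -1;
}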
4198 
4199 /*
4200  * Convert change from its on-disk format to in-memory format and queue it onto
4201  * the TXN's ->changes list.
4202  *
4203  * Note: although "data" is declared char*, at entry it points to a
4204  * maxalign'd buffer, making it safe in most of this function to assume
4205  * that the pointed-to data is suitably aligned for direct access.
4206  */
4207 static void
4208 ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4209  char *data)
4210 {
4211  ReorderBufferDiskChange *ondisk;
4212  ReorderBufferChange *change;
4213 
4214  ondisk = (ReorderBufferDiskChange *) data;
4215 
4216  change = ReorderBufferGetChange(rb);
4217 
4218  /* copy static part */
4219  memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4220 
4221  data += sizeof(ReorderBufferDiskChange);
4222 
4223  /* restore individual stuff */
4224  switch (change->action)
4225  {
4226  /* fall through these, they're all similar enough */
4227  case REORDER_BUFFER_CHANGE_INSERT:
4228  case REORDER_BUFFER_CHANGE_UPDATE:
4229  case REORDER_BUFFER_CHANGE_DELETE:
4230  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4231  if (change->data.tp.oldtuple)
4232  {
4233  uint32 tuplelen = ((HeapTuple) data)->t_len;
4234 
4235  change->data.tp.oldtuple =
4236  ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4237 
4238  /* restore ->tuple */
4239  memcpy(&change->data.tp.oldtuple->tuple, data,
4240  sizeof(HeapTupleData));
4241  data += sizeof(HeapTupleData);
4242 
4243  /* reset t_data pointer into the new tuplebuf */
4244  change->data.tp.oldtuple->tuple.t_data =
4245  ReorderBufferTupleBufData(change->data.tp.oldtuple);
4246 
4247  /* restore tuple data itself */
4248  memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
4249  data += tuplelen;
4250  }
4251 
4252  if (change->data.tp.newtuple)
4253  {
4254  /* here, data might not be suitably aligned! */
4255  uint32 tuplelen;
4256 
4257  memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4258  sizeof(uint32));
4259 
4260  change->data.tp.newtuple =
4261  ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4262 
4263  /* restore ->tuple */
4264  memcpy(&change->data.tp.newtuple->tuple, data,
4265  sizeof(HeapTupleData));
4266  data += sizeof(HeapTupleData);
4267 
4268  /* reset t_data pointer into the new tuplebuf */
4269  change->data.tp.newtuple->tuple.t_data =
4270  ReorderBufferTupleBufData(change->data.tp.newtuple);
4271 
4272  /* restore tuple data itself */
4273  memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
4274  data += tuplelen;
4275  }
4276 
4277  break;
4278  case REORDER_BUFFER_CHANGE_MESSAGE:
4279  {
4280  Size prefix_size;
4281 
4282  /* read prefix */
4283  memcpy(&prefix_size, data, sizeof(Size));
4284  data += sizeof(Size);
4285  change->data.msg.prefix = MemoryContextAlloc(rb->context,
4286  prefix_size);
4287  memcpy(change->data.msg.prefix, data, prefix_size);
4288  Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4289  data += prefix_size;
4290 
4291  /* read the message */
4292  memcpy(&change->data.msg.message_size, data, sizeof(Size));
4293  data += sizeof(Size);
4294  change->data.msg.message = MemoryContextAlloc(rb->context,
4295  change->data.msg.message_size);
4296  memcpy(change->data.msg.message, data,
4297  change->data.msg.message_size);
4298  data += change->data.msg.message_size;
4299 
4300  break;
4301  }
4302  case REORDER_BUFFER_CHANGE_INVALIDATION:
4303  {
4304  Size inval_size = sizeof(SharedInvalidationMessage) *
4305  change->data.inval.ninvalidations;
4306 
4307  change->data.inval.invalidations =
4308  MemoryContextAlloc(rb->context, inval_size);
4309 
4310  /* read the message */
4311  memcpy(change->data.inval.invalidations, data, inval_size);
4312 
4313  break;
4314  }
4315  case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4316  {
4317  Snapshot oldsnap;
4318  Snapshot newsnap;
4319  Size size;
4320 
4321  oldsnap = (Snapshot) data;
4322 
4323  size = sizeof(SnapshotData) +
4324  sizeof(TransactionId) * oldsnap->xcnt +
4325  sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4326 
4327  change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4328 
4329  newsnap = change->data.snapshot;
4330 
4331  memcpy(newsnap, data, size);
4332  newsnap->xip = (TransactionId *)
4333  (((char *) newsnap) + sizeof(SnapshotData));
4334  newsnap->subxip = newsnap->xip + newsnap->xcnt;
4335  newsnap->copied = true;
4336  break;
4337  }
4338  /* the base struct contains all the data, easy peasy */
4339  case REORDER_BUFFER_CHANGE_TRUNCATE:
4340  {
4341  Oid *relids;
4342 
4343  relids = ReorderBufferGetRelids(rb,
4344  change->data.truncate.nrelids);
4345  memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4346  change->data.truncate.relids = relids;
4347 
4348  break;
4349  }
4350  case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4351  case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4352  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4353  case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4354  break;
4355  }
4356 
4357  dlist_push_tail(&txn->changes, &change->node);
4358  txn->nentries_mem++;
4359 
4360  /*
4361  * Update memory accounting for the restored change. We need to do this
4362  * although we don't check the memory limit when restoring the changes in
4363  * this branch (we only do that when initially queueing the changes after
4364  * decoding), because we will release the changes later, and that will
4365  * update the accounting too (subtracting the size from the counters). And
4366  * we don't want to underflow there.
4367  */
4368  ReorderBufferChangeMemoryUpdate(rb, change, true,
4369  ReorderBufferChangeSize(change));
4370 }
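/*
 * Illustrative sketch (not part of reorderbuffer.c): the memcpy idiom that
 * ReorderBufferRestoreChange() uses above for fields that may sit at
 * unaligned offsets (the newtuple's t_len, for instance).  Dereferencing a
 * misaligned pointer is undefined behavior on some platforms; copying the
 * bytes out through memcpy is always safe.
 */
#include <stdint.h>
#include <string.h>

static uint32_t
sketch_read_u32(const char *data, size_t offset)
{
	uint32_t	value;

	memcpy(&value, data + offset, sizeof(uint32_t)); /* alignment-safe */
	return value;
}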
4371 
4372 /*
4373  * Remove all on-disk data stored for the passed-in transaction.
4374  */
4375 static void
4376 ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4377 {
4378  XLogSegNo first;
4379  XLogSegNo cur;
4380  XLogSegNo last;
4381 
4382  Assert(txn->first_lsn != InvalidXLogRecPtr);
4383  Assert(txn->final_lsn != InvalidXLogRecPtr);
4384 
4385  XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4386  XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4387 
4388  /* iterate over all possible filenames, and delete them */
4389  for (cur = first; cur <= last; cur++)
4390  {
4391  char path[MAXPGPATH];
4392 
4393  ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4394  if (unlink(path) != 0 && errno != ENOENT)
4395  ereport(ERROR,
4396  (errcode_for_file_access(),
4397  errmsg("could not remove file \"%s\": %m", path)));
4398  }
4399 }
4400 
4401 /*
4402  * Remove any leftover serialized reorder buffers from a slot directory after a
4403  * prior crash or decoding session exit.
4404  */
4405 static void
4406 ReorderBufferCleanupSerializedTXNs(const char *slotname)
4407 {
4408  DIR *spill_dir;
4409  struct dirent *spill_de;
4410  struct stat statbuf;
4411  char path[MAXPGPATH * 2 + 12];
4412 
4413  sprintf(path, "pg_replslot/%s", slotname);
4414 
4415  /* we're only handling directories here, skip if it's not ours */
4416  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4417  return;
4418 
4419  spill_dir = AllocateDir(path);
4420  while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4421  {
4422  /* only look at names that can be ours */
4423  if (strncmp(spill_de->d_name, "xid", 3) == 0)
4424  {
4425  snprintf(path, sizeof(path),
4426  "pg_replslot/%s/%s", slotname,
4427  spill_de->d_name);
4428 
4429  if (unlink(path) != 0)
4430  ereport(ERROR,
4431  (errcode_for_file_access(),
4432  errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4433  path, slotname)));
4434  }
4435  }
4436  FreeDir(spill_dir);
4437 }
4438 
4439 /*
4440  * Given a replication slot, transaction ID and segment number, fill in the
4441  * name of the corresponding spill file into 'path', which is a caller-owned
4442  * buffer of size at least MAXPGPATH.
4443  */
4444 static void
4445 ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4446  XLogSegNo segno)
4447 {
4448  XLogRecPtr recptr;
4449 
4450  XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4451 
4452  snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4453  NameStr(slot->data.name),
4454  xid, LSN_FORMAT_ARGS(recptr));
4455 }
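/*
 * Illustrative sketch (not part of reorderbuffer.c): the spill file name
 * format produced by ReorderBufferSerializedPath() above, reproduced with
 * plain snprintf.  The two %X fields stand in for LSN_FORMAT_ARGS, i.e. the
 * high and low 32 bits of the segment's starting LSN.
 */
#include <stdint.h>
#include <stdio.h>

static void
sketch_spill_path(char *path, size_t len, const char *slotname,
				  uint32_t xid, uint64_t seg_start_lsn)
{
	snprintf(path, len, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
			 slotname, xid,
			 (uint32_t) (seg_start_lsn >> 32),
			 (uint32_t) seg_start_lsn);
}

/*
 * e.g. slot "myslot", xid 1234, segment start 0x16B3748 yields
 *      pg_replslot/myslot/xid-1234-lsn-0-16B3748.spill
 */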
4456 
4457 /*
4458  * Delete all data spilled to disk after we've restarted/crashed. It will be
4459  * recreated when the respective slots are reused.
4460  */
4461 void
4462 StartupReorderBuffer(void)
4463 {
4464  DIR *logical_dir;
4465  struct dirent *logical_de;
4466 
4467  logical_dir = AllocateDir("pg_replslot");
4468  while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4469  {
4470  if (strcmp(logical_de->d_name, ".") == 0 ||
4471  strcmp(logical_de->d_name, "..") == 0)
4472  continue;
4473 
4474  /* if it cannot be a slot, skip the directory */
4475  if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4476  continue;
4477 
4478  /*
4479  * ok, has to be a surviving logical slot, iterate and delete
4480  * everything starting with xid-*
4481  */
4482  ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4483  }
4484  FreeDir(logical_dir);
4485 }
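/*
 * Illustrative sketch (not part of reorderbuffer.c): the scan-and-filter
 * directory cleanup pattern used by StartupReorderBuffer() and
 * ReorderBufferCleanupSerializedTXNs() above, written with POSIX dirent
 * instead of the backend's AllocateDir/ReadDir.  Entries matching a prefix
 * are unlinked; everything else is left alone.
 */
#include <dirent.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
sketch_cleanup_dir(const char *dir, const char *prefix)
{
	DIR		   *d = opendir(dir);
	struct dirent *de;
	char		path[4096];

	if (d == NULL)
		return;					/* a sketch; real code reports errors */
	while ((de = readdir(d)) != NULL)
	{
		if (strncmp(de->d_name, prefix, strlen(prefix)) != 0)
			continue;			/* not one of ours */
		snprintf(path, sizeof(path), "%s/%s", dir, de->d_name);
		(void) unlink(path);
	}
	closedir(d);
}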
4486 
4487 /* ---------------------------------------
4488  * toast reassembly support
4489  * ---------------------------------------
4490  */
4491 
4492 /*
4493  * Initialize per-tuple toast reconstruction support.
4494  */
4495 static void
4496 ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4497 {
4498  HASHCTL hash_ctl;
4499 
4500  Assert(txn->toast_hash == NULL);
4501 
4502  hash_ctl.keysize = sizeof(Oid);
4503  hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4504  hash_ctl.hcxt = rb->context;
4505  txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4506  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4507 }
4508 
4509 /*
4510  * Per toast-chunk handling for toast reconstruction
4511  *
4512  * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4513  * toasted Datum comes along.
4514  */
4515 static void
4516 ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4517  Relation relation, ReorderBufferChange *change)
4518 {
4519  ReorderBufferToastEnt *ent;
4520  ReorderBufferTupleBuf *newtup;
4521  bool found;
4522  int32 chunksize;
4523  bool isnull;
4524  Pointer chunk;
4525  TupleDesc desc = RelationGetDescr(relation);
4526  Oid chunk_id;
4527  int32 chunk_seq;
4528 
4529  if (txn->toast_hash == NULL)
4530  ReorderBufferToastInitHash(rb, txn);
4531 
4532  Assert(IsToastRelation(relation));
4533 
4534  newtup = change->data.tp.newtuple;
4535  chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull));
4536  Assert(!isnull);
4537  chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull));
4538  Assert(!isnull);
4539 
4540  ent = (ReorderBufferToastEnt *)
4541  hash_search(txn->toast_hash,
4542  (void *) &chunk_id,
4543  HASH_ENTER,
4544  &found);
4545 
4546  if (!found)
4547  {
4548  Assert(ent->chunk_id == chunk_id);
4549  ent->num_chunks = 0;
4550  ent->last_chunk_seq = 0;
4551  ent->size = 0;
4552  ent->reconstructed = NULL;
4553  dlist_init(&ent->chunks);
4554 
4555  if (chunk_seq != 0)
4556  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4557  chunk_seq, chunk_id);
4558  }
4559  else if (found && chunk_seq != ent->last_chunk_seq + 1)
4560  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4561  chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4562 
4563  chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull));
4564  Assert(!isnull);
4565 
4566  /* calculate size so we can allocate the right size at once later */
4567  if (!VARATT_IS_EXTENDED(chunk))
4568  chunksize = VARSIZE(chunk) - VARHDRSZ;
4569  else if (VARATT_IS_SHORT(chunk))
4570  /* could happen due to heap_form_tuple doing its thing */
4571  chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4572  else
4573  elog(ERROR, "unexpected type of toast chunk");
4574 
4575  ent->size += chunksize;
4576  ent->last_chunk_seq = chunk_seq;
4577  ent->num_chunks++;
4578  dlist_push_tail(&ent->chunks, &change->node);
4579 }
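/*
 * Illustrative sketch (not part of reorderbuffer.c): the per-chunk
 * bookkeeping done by ReorderBufferToastAppendChunk() above, in miniature.
 * A chunk is accepted only if its sequence number continues the previous
 * one, and the running size total lets the reassembled datum be allocated
 * in a single allocation later.  Hypothetical types; returns false on a gap.
 */
#include <stdbool.h>
#include <stddef.h>

typedef struct SketchToastEnt
{
	int			last_chunk_seq;
	int			num_chunks;
	size_t		size;
} SketchToastEnt;

static bool
sketch_append_chunk(SketchToastEnt *ent, int chunk_seq, size_t chunksize)
{
	if (ent->num_chunks == 0 ? chunk_seq != 0
		: chunk_seq != ent->last_chunk_seq + 1)
		return false;			/* out-of-order chunk: corrupt stream */

	ent->size += chunksize;
	ent->last_chunk_seq = chunk_seq;
	ent->num_chunks++;
	return true;
}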
4580 
4581 /*
4582  * Rejigger change->newtuple to point to in-memory toast tuples instead of
4583  * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
4584  *
4585  * We cannot replace unchanged toast tuples though, so those will still point
4586  * to on-disk toast data.
4587  *
4588  * While updating the existing change with detoasted tuple data, we need to
4589  * update the memory accounting info, because the change size will differ.
4590  * Otherwise the accounting may get out of sync, triggering serialization
4591  * at unexpected times.
4592  *
4593  * We simply subtract the size of the change before rejiggering the tuple,
4594  * and then add the new size. This makes it look like the change was removed
4595  * and then added back, except it only tweaks the accounting info.
4596  *
4597  * In particular it can't trigger serialization, which would be pointless
4598  * anyway as it happens during commit processing right before handing
4599  * the change to the output plugin.
4600  */
4601 static void
4602 ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
4603  Relation relation, ReorderBufferChange *change)
4604 {
4605  TupleDesc desc;
4606  int natt;
4607  Datum *attrs;
4608  bool *isnull;
4609  bool *free;
4610  HeapTuple tmphtup;
4611  Relation toast_rel;
4612  TupleDesc toast_desc;
4613  MemoryContext oldcontext;
4614  ReorderBufferTupleBuf *newtup;
4615  Size old_size;
4616 
4617  /* no toast tuples changed */
4618  if (txn->toast_hash == NULL)
4619  return;
4620 
4621  /*
4622  * We're going to modify the size of the change. So, to make sure the
4623  * accounting is correct we record the current change size and then after
4624  * re-computing the change we'll subtract the recorded size and then
4625  * re-add the new change size at the end. We don't immediately subtract
4626  * the old size because if there is any error before we add the new size,
4627  * we will release the changes and that will update the accounting info
4628  * (subtracting the size from the counters). And we don't want to
4629  * underflow there.
4630  */
4631  old_size = ReorderBufferChangeSize(change);
4632 
4633  oldcontext = MemoryContextSwitchTo(rb->context);
4634 
4635  /* we should only have toast tuples in an INSERT or UPDATE */
4636  Assert(change->data.tp.newtuple);
4637 
4638  desc = RelationGetDescr(relation);
4639 
4640  toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4641  if (!RelationIsValid(toast_rel))
4642  elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
4643  relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
4644 
4645  toast_desc = RelationGetDescr(toast_rel);
4646 
4647  /* should we allocate from stack instead? */
4648  attrs = palloc0(sizeof(Datum) * desc->natts);
4649  isnull = palloc0(sizeof(bool) * desc->natts);
4650  free = palloc0(sizeof(bool) * desc->natts);
4651 
4652  newtup = change->data.tp.newtuple;
4653 
4654  heap_deform_tuple(&newtup->tuple, desc, attrs, isnull);
4655 
4656  for (natt = 0; natt < desc->natts; natt++)
4657  {
4658  Form_pg_attribute attr = TupleDescAttr(desc, natt);
4659  ReorderBufferToastEnt *ent;
4660  struct varlena *varlena;
4661 
4662  /* va_rawsize is the size of the original datum -- including header */
4663  struct varatt_external toast_pointer;
4664  struct varatt_indirect redirect_pointer;
4665  struct varlena *new_datum = NULL;
4666  struct varlena *reconstructed;
4667  dlist_iter it;
4668  Size data_done = 0;
4669 
4670  /* system columns aren't toasted */
4671  if (attr->attnum < 0)
4672  continue;
4673 
4674  if (attr->attisdropped)
4675  continue;
4676 
4677  /* not a varlena datatype */
4678  if (attr->attlen != -1)
4679  continue;
4680 
4681  /* no data */
4682  if (isnull[natt])
4683  continue;
4684 
4685  /* ok, we know we have a toast datum */
4686  varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4687 
4688  /* no need to do anything if the tuple isn't external */
4689  if (!VARATT_IS_EXTERNAL(varlena))
4690  continue;
4691 
4692  VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4693 
4694  /*
4695  * Check whether the toast tuple changed, replace if so.
4696  */
4697  ent = (ReorderBufferToastEnt *)
4698  hash_search(txn->toast_hash,
4699  (void *) &toast_pointer.va_valueid,
4700  HASH_FIND,
4701  NULL);
4702  if (ent == NULL)
4703  continue;
4704 
4705  new_datum =
4706  (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
4707 
4708  free[natt] = true;
4709 
4710  reconstructed = palloc0(toast_pointer.va_rawsize);
4711 
4712  ent->reconstructed = reconstructed;
4713 
4714  /* stitch toast tuple back together from its parts */
4715  dlist_foreach(it, &ent->chunks)
4716  {
4717  bool isnull;
4718  ReorderBufferChange *cchange;
4719  ReorderBufferTupleBuf *ctup;
4720  Pointer chunk;
4721 
4722  cchange = dlist_container(ReorderBufferChange, node, it.cur);
4723  ctup = cchange->data.tp.newtuple;
4724  chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull));
4725 
4726  Assert(!isnull);
4727  Assert(!VARATT_IS_EXTERNAL(chunk));
4728  Assert(!VARATT_IS_SHORT(chunk));
4729 
4730  memcpy(VARDATA(reconstructed) + data_done,
4731  VARDATA(chunk),
4732  VARSIZE(chunk) - VARHDRSZ);
4733  data_done += VARSIZE(chunk) - VARHDRSZ;
4734  }
4735  Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
4736 
4737  /* make sure it's marked as compressed or not */
4738  if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4739  SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4740  else
4741  SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4742 
4743  memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4744  redirect_pointer.pointer = reconstructed;
4745 
4746  SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
4747  memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4748  sizeof(redirect_pointer));
4749 
4750  attrs[natt] = PointerGetDatum(new_datum);
4751  }
4752 
4753  /*
4754  * Build tuple in separate memory & copy tuple back into the tuplebuf
4755  * passed to the output plugin. We can't directly heap_fill_tuple() into
4756  * the tuplebuf because attrs[] will point back into the current content.
4757  */
4758  tmphtup = heap_form_tuple(desc, attrs, isnull);
4759  Assert(newtup->tuple.t_len <= MaxHeapTupleSize);
4760  Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data);
4761 
4762  memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len);
4763  newtup->tuple.t_len = tmphtup->t_len;
4764 
4765  /*
4766  * Free resources we won't need any further; more persistent stuff will
4767  * be freed in ReorderBufferToastReset().
4768  */
4769  RelationClose(toast_rel);
4770  pfree(tmphtup);
4771  for (natt = 0; natt < desc->natts; natt++)
4772  {
4773  if (free[natt])
4774  pfree(DatumGetPointer(attrs[natt]));
4775  }
4776  pfree(attrs);
4777  pfree(free);
4778  pfree(isnull);
4779 
4780  MemoryContextSwitchTo(oldcontext);
4781 
4782  /* subtract the old change size */
4783  ReorderBufferChangeMemoryUpdate(rb, change, false, old_size);
4784  /* now add the change back, with the correct size */
4785  ReorderBufferChangeMemoryUpdate(rb, change, true,
4786  ReorderBufferChangeSize(change));
4787 }
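/*
 * Illustrative sketch (not part of reorderbuffer.c): stitching a datum back
 * together from its chunks, as ReorderBufferToastReplace() does above.
 * Each chunk contributes its payload at the running offset; the caller
 * sized the destination from the total accumulated while the chunks were
 * collected.  Hypothetical flat arrays stand in for the dlist of changes.
 */
#include <stddef.h>
#include <string.h>

static size_t
sketch_stitch_chunks(char *dest, const char *const *chunks,
					 const size_t *chunk_sizes, int nchunks)
{
	size_t		data_done = 0;

	for (int i = 0; i < nchunks; i++)
	{
		memcpy(dest + data_done, chunks[i], chunk_sizes[i]);
		data_done += chunk_sizes[i];
	}
	return data_done;			/* must match the precomputed total */
}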
4788 
4789 /*
4790  * Free all resources allocated for toast reconstruction.
4791  */
4792 static void
4793 ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
4794 {
4795  HASH_SEQ_STATUS hstat;
4796  ReorderBufferToastEnt *ent;
4797 
4798  if (txn->toast_hash == NULL)
4799  return;
4800 
4801  /* sequentially walk over the hash and free everything */
4802  hash_seq_init(&hstat, txn->toast_hash);
4803  while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4804  {
4805  dlist_mutable_iter it;
4806 
4807  if (ent->reconstructed != NULL)
4808  pfree(ent->reconstructed);
4809 
4810  dlist_foreach_modify(it, &ent->chunks)
4811  {
4812  ReorderBufferChange *change =
4813  dlist_container(ReorderBufferChange, node, it.cur);
4814 
4815  dlist_delete(&change->node);
4816  ReorderBufferReturnChange(rb, change, true);
4817  }
4818  }
4819 
4820  hash_destroy(txn->toast_hash);
4821  txn->toast_hash = NULL;
4822 }
4823 
4824 
4825 /* ---------------------------------------
4826  * Visibility support for logical decoding
4827  *
4828  *
4829  * Look up the actual cmin/cmax values when using a decoding snapshot. We
4830  * can't always rely on stored cmin/cmax values because of two scenarios:
4831  *
4832  * * A tuple got changed multiple times during a single transaction and thus
4833  * has got a combo CID. Combo CIDs are only valid for the duration of a
4834  * single transaction.
4835  * * A tuple with a cmin but no cmax (and thus no combo CID) got
4836  * deleted/updated in a transaction other than the one which created it,
4837  * which is the transaction we are looking at right now. As only one of
4838  * cmin, cmax or combo CID is actually stored in the heap, we don't have
4839  * access to the value we need anymore.
4840  *
4841  * To resolve those problems we have a per-transaction hash of (cmin,
4842  * cmax) tuples keyed by (relfilenode, ctid) which contains the actual
4843  * (cmin, cmax) values. That also takes care of combo CIDs by simply
4844  * not caring about them at all. As we have the real cmin/cmax values
4845  * combo CIDs aren't interesting.
4846  *
4847  * As we only care about catalog tuples here the overhead of this
4848  * hashtable should be acceptable.
4849  *
4850  * Heap rewrites complicate this a bit, check rewriteheap.c for
4851  * details.
4852  * -------------------------------------------------------------------------
4853  */
4854 
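/*
 * Illustrative sketch (not part of reorderbuffer.c): the shape of the
 * per-transaction lookup described above.  The key identifies a specific
 * tuple version on disk; the entry carries the command IDs that the heap
 * itself can no longer supply.  Hypothetical, simplified types standing in
 * for the backend's hash table entries.
 */
#include <stdint.h>

typedef struct SketchTupleCidKey
{
	uint32_t	dbNode;			/* which relation (relfilenode)... */
	uint32_t	spcNode;
	uint32_t	relNode;
	uint32_t	blockno;		/* ...and which tuple within it (ctid) */
	uint16_t	offset;
} SketchTupleCidKey;

typedef struct SketchTupleCidEnt
{
	SketchTupleCidKey key;
	uint32_t	cmin;			/* real values, so combo CIDs never matter */
	uint32_t	cmax;
} SketchTupleCidEnt;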
4855 /* struct for sorting mapping files by LSN efficiently */
4856 typedef struct RewriteMappingFile
4857 {
4858  XLogRecPtr lsn; /* LSN at which the file was written */
4859  char fname[MAXPGPATH]; /* filename */
4860 } RewriteMappingFile;
4861 
4862 #ifdef NOT_USED
4863 static void
4864 DisplayMapping(HTAB *tuplecid_data)
4865 {
4866  HASH_SEQ_STATUS hstat;
4867  ReorderBufferTupleCidEnt *ent;
4868 
4869  hash_seq_init(&hstat, tuplecid_data);
4870  while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
4871  {
4872  elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
4873  ent->key.relnode.dbNode,
4874  ent->key.relnode.spcNode,
4875  ent->key.relnode.relNode,
4876  ItemPointerGetBlockNumber(&ent->key.tid),
4877  ItemPointerGetOffsetNumber(&ent->key.tid),
4878  ent->cmin,
4879  ent->cmax
4880  );
4881  }
4882 }
4883 #endif
4884 
4885 /*
4886  * Apply a single mapping file to tuplecid_data.
4887  *
4888  * The mapping file has to have been verified to be a) committed b) for our
4889  * transaction c) applied in LSN order.
4890  */
4891 static void
4892 ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
4893 {
4894  char path[MAXPGPATH];
4895  int fd;
4896  int readBytes;
4897  LogicalRewriteMappingData map;
4898 
4899  sprintf(path, "pg_logical/mappings/%s", fname);
4900  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
4901  if (fd < 0)
4902  ereport(ERROR,
4903  (errcode_for_file_access(),
4904  errmsg("could not open file \"%s\": %m", path)));
4905 
4906  while (true)
4907  {
4908  ReorderBufferTupleCidKey key;
4909  ReorderBufferTupleCidEnt *ent;
4910  ReorderBufferTupleCidEnt *new_ent;
4911  bool found;
4912 
4913  /* be careful about padding */
4914  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
4915 
4916  /* read all mappings till the end of the file */
4917  pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
4918  readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
4919  pgstat_report_wait_end();
4920 
4921  if (readBytes < 0)
4922  ereport(ERROR,
4923  (errcode_for_file_access(),
4924  errmsg("could not read file \"%s\": %m",
4925  path)));
4926  else if (readBytes == 0) /* EOF */
4927  break;
4928  else if (readBytes != sizeof(LogicalRewriteMappingData))
4929  ereport(ERROR,
4930  (errcode(ERRCODE_DATA_CORRUPTED),
4931  errmsg("could not read from file \"%s\": read %d instead of %d bytes",
4932  path, readBytes,
4933  (int32) sizeof(LogicalRewriteMappingData))));
4934 
4935  key.relnode = map.old_node;
4936  ItemPointerCopy(&map.old_tid,
4937  &key.tid);
4938 
4939 
4940  ent = (ReorderBufferTupleCidEnt *)
4941  hash_search(tuplecid_data,
4942  (void *) &key,
4943  HASH_FIND,
4944  NULL);
4945 
4946  /* no existing mapping, no need to update */
4947  if (!ent)
4948  continue;
4949 
4950  key.relnode = map.new_node;
4951  ItemPointerCopy(&map.new_tid,
4952  &key.tid);
4953 
4954  new_ent = (ReorderBufferTupleCidEnt *)
4955  hash_search(tuplecid_data,
4956  (void *) &key,
4957  HASH_ENTER,
4958  &found);
4959 
4960  if (found)
4961  {
4962  /*
4963  * Make sure the existing mapping makes sense. We sometimes update
4964  * old records that did not yet have a cmax (e.g. pg_class' own
4965  * entry while rewriting it) during rewrites, so allow that.
4966  */
4967  Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
4968  Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
4969  }
4970  else
4971  {
4972  /* update mapping */
4973  new_ent->cmin = ent->cmin;
4974  new_ent->cmax = ent->cmax;
4975  new_ent->combocid = ent->combocid;
4976  }
4977  }
4978 
4979  if (CloseTransientFile(fd) != 0)
4980  ereport(ERROR,
4981  (errcode_for_file_access(),
4982  errmsg("could not close file \"%s\": %m", path)));
4983 }
4984 
4985 
4986 /*
4987  * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
4988  */
4989 static bool
4990 TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
4991 {
4992  return bsearch(&xid, xip, num,
4993  sizeof(TransactionId), xidComparator) != NULL;
4994 }
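/*
 * Illustrative sketch (not part of reorderbuffer.c): a self-contained
 * version of the bsearch-based membership test above, usable on any sorted
 * array of 32-bit IDs.  The comparator mirrors what xidComparator does for
 * plain numeric ordering.
 */
#include <stdint.h>
#include <stdlib.h>

static int
sketch_cmp_u32(const void *a, const void *b)
{
	uint32_t	va = *(const uint32_t *) a;
	uint32_t	vb = *(const uint32_t *) b;

	return (va > vb) - (va < vb);
}

static int
sketch_in_sorted_array(uint32_t xid, const uint32_t *xip, size_t num)
{
	return bsearch(&xid, xip, num, sizeof(uint32_t), sketch_cmp_u32) != NULL;
}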
4995 
4996 /*
4997  * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
4998  */
4999 static int
5000 file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5001 {
5002  RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5003  RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5004 
5005  if (a->lsn < b->lsn)
5006  return -1;
5007  else if (a->lsn > b->lsn)
5008  return 1;
5009  return 0;
5010 }
5011 
5012 /*
5013  * Apply any existing logical remapping files that are targeted at our
5014  * transaction for relid.
5015  */
5016 static void
5017 UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5018 {
5019  DIR *mapping_dir;
5020  struct dirent *mapping_de;
5021  List *files = NIL;
5022  ListCell *file;
5023  Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5024 
5025  mapping_dir = AllocateDir("pg_logical/mappings");
5026  while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
5027  {
5028  Oid f_dboid;
5029  Oid f_relid;
5030  TransactionId f_mapped_xid;
5031  TransactionId f_create_xid;
5032  XLogRecPtr f_lsn;
5033  uint32 f_hi,
5034  f_lo;
5035  RewriteMappingFile *f;
5036 
5037  if (strcmp(mapping_de->d_name, ".") == 0 ||
5038  strcmp(mapping_de->d_name, "..") == 0)
5039  continue;
5040 
5041  /* Ignore files that aren't ours */
5042  if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5043  continue;
5044 
5045  if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5046  &f_dboid, &f_relid, &f_hi, &f_lo,
5047  &f_mapped_xid, &f_create_xid) != 6)
5048  elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5049 
5050  f_lsn = ((uint64) f_hi) << 32 | f_lo;
5051 
5052  /* mapping for another database */
5053  if (f_dboid != dboid)
5054  continue;
5055 
5056  /* mapping for another relation */
5057  if (f_relid != relid)
5058  continue;
5059 
5060  /* did the creating transaction abort? */
5061  if (!TransactionIdDidCommit(f_create_xid))
5062  continue;
5063 
5064  /* not for our transaction */
5065  if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5066  continue;
5067 
5068  /* ok, relevant, queue for apply */
5069  f = palloc(sizeof(RewriteMappingFile));
5070  f->lsn = f_lsn;
5071  strcpy(f->fname, mapping_de->d_name);
5072  files = lappend(files, f);
5073  }
5074  FreeDir(mapping_dir);
5075 
5076  /* sort files so we apply them in LSN order */
5077  list_sort(files, file_sort_by_lsn);
5078 
5079  foreach(file, files)
5080  {
5081  RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5082 
5083  elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5084  snapshot->subxip[0]);
5085  ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
5086  pfree(f);
5087  }
5088 }
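/*
 * Illustrative sketch (not part of reorderbuffer.c): picking apart a
 * rewrite mapping file name of the kind scanned above.  The format string
 * mirrors the "map-<db>-<rel>-<lsn hi>_<lsn lo>-<mapped xid>-<create xid>"
 * layout parsed via LOGICAL_REWRITE_FORMAT; treat the exact macro contents
 * as an assumption here.
 */
#include <stdio.h>

static int
sketch_parse_mapping_name(const char *fname)
{
	unsigned int dboid, relid, hi, lo, mapped_xid, create_xid;

	if (sscanf(fname, "map-%x-%x-%X_%X-%x-%x",
			   &dboid, &relid, &hi, &lo, &mapped_xid, &create_xid) != 6)
		return -1;				/* not a mapping file we understand */

	printf("rel %u in db %u, lsn %X/%X, xid %u (created by xid %u)\n",
		   relid, dboid, hi, lo, mapped_xid, create_xid);
	return 0;
}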
5089 
5090 /*
5091  * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5092  * combo CIDs.
5093  */
5094 bool
5095 ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5096  Snapshot snapshot,
5097  HeapTuple htup, Buffer buffer,
5098  CommandId *cmin, CommandId *cmax)
5099 {
5100  ReorderBufferTupleCidKey key;
5101  ReorderBufferTupleCidEnt *ent;
5102  ForkNumber forkno;
5103  BlockNumber blockno;
5104  bool updated_mapping = false;
5105 
5106  /*
5107  * Return unresolved if tuplecid_data is not valid. That's because when
5108  * streaming in-progress transactions we may run into tuples with the CID
5109  * before actually decoding them. Think e.g. about INSERT followed by
5110  * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5111  * INSERT. So in such cases, we assume the CID is from the future
5112  * command.
5113  */
5114  if (tuplecid_data == NULL)
5115  return false;
5116 
5117  /* be careful about padding */
5118  memset(&key, 0, sizeof(key));
5119 
5120  Assert(!BufferIsLocal(buffer));
5121 
5122  /*
5123  * get relfilenode from the buffer, no convenient way to access it other
5124  * than that.
5125  */
5126  BufferGetTag(buffer, &key.relnode, &forkno, &blockno);
5127 
5128  /* tuples can only be in the main fork */
5129  Assert(forkno == MAIN_FORKNUM);
5130  Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5131 
5132  ItemPointerCopy(&htup->t_self,
5133  &key.tid);
5134 
5135 restart:
5136  ent = (ReorderBufferTupleCidEnt *)
5137  hash_search(tuplecid_data,
5138  (void *) &key,
5139  HASH_FIND,
5140  NULL);
5141 
5142  /*
5143  * failed to find a mapping, check whether the table was rewritten and
5144  * apply mapping if so, but only do that once - there can be no new
5145  * mappings while we are in here since we have to hold a lock on the
5146  * relation.
5147  */
5148  if (ent == NULL && !updated_mapping)
5149  {
5150  UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5151  /* now check but don't update for a mapping again */
5152  updated_mapping = true;
5153  goto restart;
5154  }
5155  else if (ent == NULL)
5156  return false;
5157 
5158  if (cmin)
5159  *cmin = ent->cmin;
5160  if (cmax)
5161  *cmax = ent->cmax;
5162  return true;
5163 }