PostgreSQL Source Code  git master
reorderbuffer.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * reorderbuffer.c
4  * PostgreSQL logical replay/reorder buffer management
5  *
6  *
7  * Copyright (c) 2012-2022, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/logical/reorderbuffer.c
12  *
13  * NOTES
14  * This module gets handed individual pieces of transactions in the order
15  * they are written to the WAL and is responsible to reassemble them into
16  * toplevel transaction sized pieces. When a transaction is completely
17  * reassembled - signaled by reading the transaction commit record - it
18  * will then call the output plugin (cf. ReorderBufferCommit()) with the
19  * individual changes. The output plugins rely on snapshots built by
20  * snapbuild.c which hands them to us.
21  *
22  * Transactions and subtransactions/savepoints in postgres are not
23  * immediately linked to each other from outside the performing
24  * backend. Only at commit/abort (or special xact_assignment records) they
25  * are linked together. Which means that we will have to splice together a
26  * toplevel transaction from its subtransactions. To do that efficiently we
27  * build a binary heap indexed by the smallest current lsn of the individual
28  * subtransactions' changestreams. As the individual streams are inherently
29  * ordered by LSN - since that is where we build them from - the transaction
30  * can easily be reassembled by always using the subtransaction with the
31  * smallest current LSN from the heap.
32  *
33  * In order to cope with large transactions - which can be several times as
34  * big as the available memory - this module supports spooling the contents
35  * of a large transactions to disk. When the transaction is replayed the
36  * contents of individual (sub-)transactions will be read from disk in
37  * chunks.
38  *
39  * This module also has to deal with reassembling toast records from the
40  * individual chunks stored in WAL. When a new (or initial) version of a
41  * tuple is stored in WAL it will always be preceded by the toast chunks
42  * emitted for the columns stored out of line. Within a single toplevel
43  * transaction there will be no other data carrying records between a row's
44  * toast chunks and the row data itself. See ReorderBufferToast* for
45  * details.
46  *
47  * ReorderBuffer uses two special memory context types - SlabContext for
48  * allocations of fixed-length structures (changes and transactions), and
49  * GenerationContext for the variable-length transaction data (allocated
50  * and freed in groups with similar lifespans).
51  *
52  * To limit the amount of memory used by decoded changes, we track memory
53  * used at the reorder buffer level (i.e. total amount of memory), and for
54  * each transaction. When the total amount of used memory exceeds the
55  * limit, the transaction consuming the most memory is then serialized to
56  * disk.
57  *
58  * Only decoded changes are evicted from memory (spilled to disk), not the
59  * transaction records. The number of toplevel transactions is limited,
60  * but a transaction with many subtransactions may still consume significant
61  * amounts of memory. However, the transaction records are fairly small and
62  * are not included in the memory limit.
63  *
64  * The current eviction algorithm is very simple - the transaction is
65  * picked merely by size, while it might be useful to also consider age
66  * (LSN) of the changes for example. With the new Generational memory
67  * allocator, evicting the oldest changes would make it more likely the
68  * memory gets actually freed.
69  *
70  * We still rely on max_changes_in_memory when loading serialized changes
71  * back into memory. At that point we can't use the memory limit directly
72  * as we load the subxacts independently. One option to deal with this
73  * would be to count the subxacts, and allow each to allocate 1/N of the
74  * memory limit. That however does not seem very appealing, because with
75  * many subtransactions it may easily cause thrashing (short cycles of
76  * deserializing and applying very few changes). We probably should give
77  * a bit more memory to the oldest subtransactions, because it's likely
78  * they are the source for the next sequence of changes.
79  *
80  * -------------------------------------------------------------------------
81  */
82 #include "postgres.h"
83 
84 #include <unistd.h>
85 #include <sys/stat.h>
86 
87 #include "access/detoast.h"
88 #include "access/heapam.h"
89 #include "access/rewriteheap.h"
90 #include "access/transam.h"
91 #include "access/xact.h"
92 #include "access/xlog_internal.h"
93 #include "catalog/catalog.h"
94 #include "lib/binaryheap.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "replication/logical.h"
99 #include "replication/slot.h"
100 #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
101 #include "storage/bufmgr.h"
102 #include "storage/fd.h"
103 #include "storage/sinval.h"
104 #include "utils/builtins.h"
105 #include "utils/combocid.h"
106 #include "utils/memdebug.h"
107 #include "utils/memutils.h"
108 #include "utils/rel.h"
109 #include "utils/relfilenumbermap.h"
110 
111 
112 /* entry for a hash table we use to map from xid to our transaction state */
114 {
118 
119 /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
121 {
125 
127 {
131  CommandId combocid; /* just for debugging */
133 
134 /* Virtual file descriptor with file offset tracking */
135 typedef struct TXNEntryFile
136 {
137  File vfd; /* -1 when the file is closed */
138  off_t curOffset; /* offset for next write or read. Reset to 0
139  * when vfd is opened. */
141 
142 /* k-way in-order change iteration support structures */
144 {
151 
153 {
159 
160 /* toast datastructures */
161 typedef struct ReorderBufferToastEnt
162 {
163  Oid chunk_id; /* toast_table.chunk_id */
164  int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
165  * have seen */
166  Size num_chunks; /* number of chunks we've already seen */
167  Size size; /* combined size of chunks seen */
168  dlist_head chunks; /* linked list of chunks */
169  struct varlena *reconstructed; /* reconstructed varlena now pointed to in
170  * main tup */
172 
173 /* Disk serialization support datastructures */
175 {
178  /* data follows */
180 
181 #define IsSpecInsert(action) \
182 ( \
183  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
184 )
185 #define IsSpecConfirmOrAbort(action) \
186 ( \
187  (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
188  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
189 )
190 #define IsInsertOrUpdate(action) \
191 ( \
192  (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
193  ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
194  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
195 )
196 
197 /*
198  * Maximum number of changes kept in memory, per transaction. After that,
199  * changes are spooled to disk.
200  *
201  * The current value should be sufficient to decode the entire transaction
202  * without hitting disk in OLTP workloads, while starting to spool to disk in
203  * other workloads reasonably fast.
204  *
205  * At some point in the future it probably makes sense to have a more elaborate
206  * resource management here, but it's not entirely clear what that would look
207  * like.
208  */
210 static const Size max_changes_in_memory = 4096; /* XXX for restore only */
211 
212 /* ---------------------------------------
213  * primary reorderbuffer support routines
214  * ---------------------------------------
215  */
219  TransactionId xid, bool create, bool *is_new,
220  XLogRecPtr lsn, bool create_as_top);
222  ReorderBufferTXN *subtxn);
223 
224 static void AssertTXNLsnOrder(ReorderBuffer *rb);
225 
226 /* ---------------------------------------
227  * support functions for lsn-order iterating over the ->changes of a
228  * transaction and its subtransactions
229  *
230  * used for iteration over the k-way heap merge of a transaction and its
231  * subtransactions
232  * ---------------------------------------
233  */
235  ReorderBufferIterTXNState *volatile *iter_state);
240 
241 /*
242  * ---------------------------------------
243  * Disk serialization support functions
244  * ---------------------------------------
245  */
249  int fd, ReorderBufferChange *change);
251  TXNEntryFile *file, XLogSegNo *segno);
253  char *data);
256  bool txn_prepared);
257 static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
258 static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
259  TransactionId xid, XLogSegNo segno);
260 
261 static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
263  ReorderBufferTXN *txn, CommandId cid);
264 
265 /*
266  * ---------------------------------------
267  * Streaming support functions
268  * ---------------------------------------
269  */
270 static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
271 static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
274 
275 /* ---------------------------------------
276  * toast reassembly support
277  * ---------------------------------------
278  */
282  Relation relation, ReorderBufferChange *change);
284  Relation relation, ReorderBufferChange *change);
285 
286 /*
287  * ---------------------------------------
288  * memory accounting
289  * ---------------------------------------
290  */
293  ReorderBufferChange *change,
294  bool addition, Size sz);
295 
296 /*
297  * Allocate a new ReorderBuffer and clean out any old serialized state from
298  * prior ReorderBuffer instances for the same slot.
299  */
302 {
303  ReorderBuffer *buffer;
304  HASHCTL hash_ctl;
305  MemoryContext new_ctx;
306 
307  Assert(MyReplicationSlot != NULL);
308 
309  /* allocate memory in own context, to have better accountability */
311  "ReorderBuffer",
313 
314  buffer =
315  (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
316 
317  memset(&hash_ctl, 0, sizeof(hash_ctl));
318 
319  buffer->context = new_ctx;
320 
321  buffer->change_context = SlabContextCreate(new_ctx,
322  "Change",
324  sizeof(ReorderBufferChange));
325 
326  buffer->txn_context = SlabContextCreate(new_ctx,
327  "TXN",
329  sizeof(ReorderBufferTXN));
330 
331  /*
332  * XXX the allocation sizes used below pre-date generation context's block
333  * growing code. These values should likely be benchmarked and set to
334  * more suitable values.
335  */
336  buffer->tup_context = GenerationContextCreate(new_ctx,
337  "Tuples",
341 
342  hash_ctl.keysize = sizeof(TransactionId);
343  hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
344  hash_ctl.hcxt = buffer->context;
345 
346  buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
348 
350  buffer->by_txn_last_txn = NULL;
351 
352  buffer->outbuf = NULL;
353  buffer->outbufsize = 0;
354  buffer->size = 0;
355 
356  buffer->spillTxns = 0;
357  buffer->spillCount = 0;
358  buffer->spillBytes = 0;
359  buffer->streamTxns = 0;
360  buffer->streamCount = 0;
361  buffer->streamBytes = 0;
362  buffer->totalTxns = 0;
363  buffer->totalBytes = 0;
364 
366 
367  dlist_init(&buffer->toplevel_by_lsn);
369  dclist_init(&buffer->catchange_txns);
370 
371  /*
372  * Ensure there's no stale data from prior uses of this slot, in case some
373  * prior exit avoided calling ReorderBufferFree. Failure to do this can
374  * produce duplicated txns, and it's very cheap if there's nothing there.
375  */
377 
378  return buffer;
379 }
380 
381 /*
382  * Free a ReorderBuffer
383  */
384 void
386 {
387  MemoryContext context = rb->context;
388 
389  /*
390  * We free separately allocated data by entirely scrapping reorderbuffer's
391  * memory context.
392  */
393  MemoryContextDelete(context);
394 
395  /* Free disk space used by unconsumed reorder buffers */
397 }
398 
399 /*
400  * Get an unused, possibly preallocated, ReorderBufferTXN.
401  */
402 static ReorderBufferTXN *
404 {
405  ReorderBufferTXN *txn;
406 
407  txn = (ReorderBufferTXN *)
409 
410  memset(txn, 0, sizeof(ReorderBufferTXN));
411 
412  dlist_init(&txn->changes);
413  dlist_init(&txn->tuplecids);
414  dlist_init(&txn->subtxns);
415 
416  /* InvalidCommandId is not zero, so set it explicitly */
418  txn->output_plugin_private = NULL;
419 
420  return txn;
421 }
422 
423 /*
424  * Free a ReorderBufferTXN.
425  */
426 static void
428 {
429  /* clean the lookup cache if we were cached (quite likely) */
430  if (rb->by_txn_last_xid == txn->xid)
431  {
433  rb->by_txn_last_txn = NULL;
434  }
435 
436  /* free data that's contained */
437 
438  if (txn->gid != NULL)
439  {
440  pfree(txn->gid);
441  txn->gid = NULL;
442  }
443 
444  if (txn->tuplecid_hash != NULL)
445  {
447  txn->tuplecid_hash = NULL;
448  }
449 
450  if (txn->invalidations)
451  {
452  pfree(txn->invalidations);
453  txn->invalidations = NULL;
454  }
455 
456  /* Reset the toast hash */
457  ReorderBufferToastReset(rb, txn);
458 
459  pfree(txn);
460 }
461 
462 /*
463  * Get a fresh ReorderBufferChange.
464  */
467 {
468  ReorderBufferChange *change;
469 
470  change = (ReorderBufferChange *)
472 
473  memset(change, 0, sizeof(ReorderBufferChange));
474  return change;
475 }
476 
477 /*
478  * Free a ReorderBufferChange and update memory accounting, if requested.
479  */
480 void
482  bool upd_mem)
483 {
484  /* update memory accounting info */
485  if (upd_mem)
486  ReorderBufferChangeMemoryUpdate(rb, change, false,
487  ReorderBufferChangeSize(change));
488 
489  /* free contained data */
490  switch (change->action)
491  {
496  if (change->data.tp.newtuple)
497  {
498  ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple);
499  change->data.tp.newtuple = NULL;
500  }
501 
502  if (change->data.tp.oldtuple)
503  {
504  ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple);
505  change->data.tp.oldtuple = NULL;
506  }
507  break;
509  if (change->data.msg.prefix != NULL)
510  pfree(change->data.msg.prefix);
511  change->data.msg.prefix = NULL;
512  if (change->data.msg.message != NULL)
513  pfree(change->data.msg.message);
514  change->data.msg.message = NULL;
515  break;
517  if (change->data.inval.invalidations)
518  pfree(change->data.inval.invalidations);
519  change->data.inval.invalidations = NULL;
520  break;
522  if (change->data.snapshot)
523  {
524  ReorderBufferFreeSnap(rb, change->data.snapshot);
525  change->data.snapshot = NULL;
526  }
527  break;
528  /* no data in addition to the struct itself */
530  if (change->data.truncate.relids != NULL)
531  {
532  ReorderBufferReturnRelids(rb, change->data.truncate.relids);
533  change->data.truncate.relids = NULL;
534  }
535  break;
540  break;
541  }
542 
543  pfree(change);
544 }
545 
546 /*
547  * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size
548  * tuple_len (excluding header overhead).
549  */
552 {
553  ReorderBufferTupleBuf *tuple;
554  Size alloc_len;
555 
556  alloc_len = tuple_len + SizeofHeapTupleHeader;
557 
558  tuple = (ReorderBufferTupleBuf *)
560  sizeof(ReorderBufferTupleBuf) +
561  MAXIMUM_ALIGNOF + alloc_len);
562  tuple->alloc_tuple_size = alloc_len;
563  tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
564 
565  return tuple;
566 }
567 
568 /*
569  * Free a ReorderBufferTupleBuf.
570  */
571 void
573 {
574  pfree(tuple);
575 }
576 
577 /*
578  * Get an array for relids of truncated relations.
579  *
580  * We use the global memory context (for the whole reorder buffer), because
581  * none of the existing ones seems like a good match (some are SLAB, so we
582  * can't use those, and tup_context is meant for tuple data, not relids). We
583  * could add yet another context, but it seems like an overkill - TRUNCATE is
584  * not particularly common operation, so it does not seem worth it.
585  */
586 Oid *
588 {
589  Oid *relids;
590  Size alloc_len;
591 
592  alloc_len = sizeof(Oid) * nrelids;
593 
594  relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
595 
596  return relids;
597 }
598 
599 /*
600  * Free an array of relids.
601  */
602 void
604 {
605  pfree(relids);
606 }
607 
608 /*
609  * Return the ReorderBufferTXN from the given buffer, specified by Xid.
610  * If create is true, and a transaction doesn't already exist, create it
611  * (with the given LSN, and as top transaction if that's specified);
612  * when this happens, is_new is set to true.
613  */
614 static ReorderBufferTXN *
616  bool *is_new, XLogRecPtr lsn, bool create_as_top)
617 {
618  ReorderBufferTXN *txn;
620  bool found;
621 
623 
624  /*
625  * Check the one-entry lookup cache first
626  */
628  rb->by_txn_last_xid == xid)
629  {
630  txn = rb->by_txn_last_txn;
631 
632  if (txn != NULL)
633  {
634  /* found it, and it's valid */
635  if (is_new)
636  *is_new = false;
637  return txn;
638  }
639 
640  /*
641  * cached as non-existent, and asked not to create? Then nothing else
642  * to do.
643  */
644  if (!create)
645  return NULL;
646  /* otherwise fall through to create it */
647  }
648 
649  /*
650  * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
651  * create an entry.
652  */
653 
654  /* search the lookup table */
655  ent = (ReorderBufferTXNByIdEnt *)
656  hash_search(rb->by_txn,
657  (void *) &xid,
658  create ? HASH_ENTER : HASH_FIND,
659  &found);
660  if (found)
661  txn = ent->txn;
662  else if (create)
663  {
664  /* initialize the new entry, if creation was requested */
665  Assert(ent != NULL);
666  Assert(lsn != InvalidXLogRecPtr);
667 
668  ent->txn = ReorderBufferGetTXN(rb);
669  ent->txn->xid = xid;
670  txn = ent->txn;
671  txn->first_lsn = lsn;
673 
674  if (create_as_top)
675  {
676  dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
677  AssertTXNLsnOrder(rb);
678  }
679  }
680  else
681  txn = NULL; /* not found and not asked to create */
682 
683  /* update cache */
684  rb->by_txn_last_xid = xid;
685  rb->by_txn_last_txn = txn;
686 
687  if (is_new)
688  *is_new = !found;
689 
690  Assert(!create || txn != NULL);
691  return txn;
692 }
693 
694 /*
695  * Record the partial change for the streaming of in-progress transactions. We
696  * can stream only complete changes so if we have a partial change like toast
697  * table insert or speculative insert then we mark such a 'txn' so that it
698  * can't be streamed. We also ensure that if the changes in such a 'txn' are
699  * above logical_decoding_work_mem threshold then we stream them as soon as we
700  * have a complete change.
701  */
702 static void
704  ReorderBufferChange *change,
705  bool toast_insert)
706 {
707  ReorderBufferTXN *toptxn;
708 
709  /*
710  * The partial changes need to be processed only while streaming
711  * in-progress transactions.
712  */
713  if (!ReorderBufferCanStream(rb))
714  return;
715 
716  /* Get the top transaction. */
717  if (txn->toptxn != NULL)
718  toptxn = txn->toptxn;
719  else
720  toptxn = txn;
721 
722  /*
723  * Indicate a partial change for toast inserts. The change will be
724  * considered as complete once we get the insert or update on the main
725  * table and we are sure that the pending toast chunks are not required
726  * anymore.
727  *
728  * If we allow streaming when there are pending toast chunks then such
729  * chunks won't be released till the insert (multi_insert) is complete and
730  * we expect the txn to have streamed all changes after streaming. This
731  * restriction is mainly to ensure the correctness of streamed
732  * transactions and it doesn't seem worth uplifting such a restriction
733  * just to allow this case because anyway we will stream the transaction
734  * once such an insert is complete.
735  */
736  if (toast_insert)
738  else if (rbtxn_has_partial_change(toptxn) &&
739  IsInsertOrUpdate(change->action) &&
740  change->data.tp.clear_toast_afterwards)
742 
743  /*
744  * Indicate a partial change for speculative inserts. The change will be
745  * considered as complete once we get the speculative confirm or abort
746  * token.
747  */
748  if (IsSpecInsert(change->action))
750  else if (rbtxn_has_partial_change(toptxn) &&
751  IsSpecConfirmOrAbort(change->action))
753 
754  /*
755  * Stream the transaction if it is serialized before and the changes are
756  * now complete in the top-level transaction.
757  *
758  * The reason for doing the streaming of such a transaction as soon as we
759  * get the complete change for it is that previously it would have reached
760  * the memory threshold and wouldn't get streamed because of incomplete
761  * changes. Delaying such transactions would increase apply lag for them.
762  */
764  !(rbtxn_has_partial_change(toptxn)) &&
765  rbtxn_is_serialized(txn))
766  ReorderBufferStreamTXN(rb, toptxn);
767 }
768 
769 /*
770  * Queue a change into a transaction so it can be replayed upon commit or will be
771  * streamed when we reach logical_decoding_work_mem threshold.
772  */
773 void
775  ReorderBufferChange *change, bool toast_insert)
776 {
777  ReorderBufferTXN *txn;
778 
779  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
780 
781  /*
782  * While streaming the previous changes we have detected that the
783  * transaction is aborted. So there is no point in collecting further
784  * changes for it.
785  */
786  if (txn->concurrent_abort)
787  {
788  /*
789  * We don't need to update memory accounting for this change as we
790  * have not added it to the queue yet.
791  */
792  ReorderBufferReturnChange(rb, change, false);
793  return;
794  }
795 
796  change->lsn = lsn;
797  change->txn = txn;
798 
799  Assert(InvalidXLogRecPtr != lsn);
800  dlist_push_tail(&txn->changes, &change->node);
801  txn->nentries++;
802  txn->nentries_mem++;
803 
804  /* update memory accounting information */
805  ReorderBufferChangeMemoryUpdate(rb, change, true,
806  ReorderBufferChangeSize(change));
807 
808  /* process partial change */
809  ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
810 
811  /* check the memory limits and evict something if needed */
813 }
814 
815 /*
816  * A transactional message is queued to be processed upon commit and a
817  * non-transactional message gets processed immediately.
818  */
819 void
821  Snapshot snap, XLogRecPtr lsn,
822  bool transactional, const char *prefix,
823  Size message_size, const char *message)
824 {
825  if (transactional)
826  {
827  MemoryContext oldcontext;
828  ReorderBufferChange *change;
829 
831 
832  oldcontext = MemoryContextSwitchTo(rb->context);
833 
834  change = ReorderBufferGetChange(rb);
836  change->data.msg.prefix = pstrdup(prefix);
837  change->data.msg.message_size = message_size;
838  change->data.msg.message = palloc(message_size);
839  memcpy(change->data.msg.message, message, message_size);
840 
841  ReorderBufferQueueChange(rb, xid, lsn, change, false);
842 
843  MemoryContextSwitchTo(oldcontext);
844  }
845  else
846  {
847  ReorderBufferTXN *txn = NULL;
848  volatile Snapshot snapshot_now = snap;
849 
850  if (xid != InvalidTransactionId)
851  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
852 
853  /* setup snapshot to allow catalog access */
854  SetupHistoricSnapshot(snapshot_now, NULL);
855  PG_TRY();
856  {
857  rb->message(rb, txn, lsn, false, prefix, message_size, message);
858 
860  }
861  PG_CATCH();
862  {
864  PG_RE_THROW();
865  }
866  PG_END_TRY();
867  }
868 }
869 
870 /*
871  * AssertTXNLsnOrder
872  * Verify LSN ordering of transaction lists in the reorderbuffer
873  *
874  * Other LSN-related invariants are checked too.
875  *
876  * No-op if assertions are not in use.
877  */
878 static void
880 {
881 #ifdef USE_ASSERT_CHECKING
883  dlist_iter iter;
884  XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
885  XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
886 
887  /*
888  * Skip the verification if we don't reach the LSN at which we start
889  * decoding the contents of transactions yet because until we reach the
890  * LSN, we could have transactions that don't have the association between
891  * the top-level transaction and subtransaction yet and consequently have
892  * the same LSN. We don't guarantee this association until we try to
893  * decode the actual contents of transaction. The ordering of the records
894  * prior to the start_decoding_at LSN should have been checked before the
895  * restart.
896  */
898  return;
899 
900  dlist_foreach(iter, &rb->toplevel_by_lsn)
901  {
903  iter.cur);
904 
905  /* start LSN must be set */
906  Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
907 
908  /* If there is an end LSN, it must be higher than start LSN */
909  if (cur_txn->end_lsn != InvalidXLogRecPtr)
910  Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
911 
912  /* Current initial LSN must be strictly higher than previous */
913  if (prev_first_lsn != InvalidXLogRecPtr)
914  Assert(prev_first_lsn < cur_txn->first_lsn);
915 
916  /* known-as-subtxn txns must not be listed */
917  Assert(!rbtxn_is_known_subxact(cur_txn));
918 
919  prev_first_lsn = cur_txn->first_lsn;
920  }
921 
923  {
925  base_snapshot_node,
926  iter.cur);
927 
928  /* base snapshot (and its LSN) must be set */
929  Assert(cur_txn->base_snapshot != NULL);
931 
932  /* current LSN must be strictly higher than previous */
933  if (prev_base_snap_lsn != InvalidXLogRecPtr)
934  Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
935 
936  /* known-as-subtxn txns must not be listed */
937  Assert(!rbtxn_is_known_subxact(cur_txn));
938 
939  prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
940  }
941 #endif
942 }
943 
944 /*
945  * AssertChangeLsnOrder
946  *
947  * Check ordering of changes in the (sub)transaction.
948  */
949 static void
951 {
952 #ifdef USE_ASSERT_CHECKING
953  dlist_iter iter;
954  XLogRecPtr prev_lsn = txn->first_lsn;
955 
956  dlist_foreach(iter, &txn->changes)
957  {
958  ReorderBufferChange *cur_change;
959 
960  cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
961 
963  Assert(cur_change->lsn != InvalidXLogRecPtr);
964  Assert(txn->first_lsn <= cur_change->lsn);
965 
966  if (txn->end_lsn != InvalidXLogRecPtr)
967  Assert(cur_change->lsn <= txn->end_lsn);
968 
969  Assert(prev_lsn <= cur_change->lsn);
970 
971  prev_lsn = cur_change->lsn;
972  }
973 #endif
974 }
975 
976 /*
977  * ReorderBufferGetOldestTXN
978  * Return oldest transaction in reorderbuffer
979  */
982 {
983  ReorderBufferTXN *txn;
984 
985  AssertTXNLsnOrder(rb);
986 
988  return NULL;
989 
991 
994  return txn;
995 }
996 
997 /*
998  * ReorderBufferGetOldestXmin
999  * Return oldest Xmin in reorderbuffer
1000  *
1001  * Returns oldest possibly running Xid from the point of view of snapshots
1002  * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1003  * there are none.
1004  *
1005  * Since snapshots are assigned monotonically, this equals the Xmin of the
1006  * base snapshot with minimal base_snapshot_lsn.
1007  */
1010 {
1011  ReorderBufferTXN *txn;
1012 
1013  AssertTXNLsnOrder(rb);
1014 
1016  return InvalidTransactionId;
1017 
1018  txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1020  return txn->base_snapshot->xmin;
1021 }
1022 
1023 void
1025 {
1026  rb->current_restart_decoding_lsn = ptr;
1027 }
1028 
1029 /*
1030  * ReorderBufferAssignChild
1031  *
1032  * Make note that we know that subxid is a subtransaction of xid, seen as of
1033  * the given lsn.
1034  */
1035 void
1037  TransactionId subxid, XLogRecPtr lsn)
1038 {
1039  ReorderBufferTXN *txn;
1040  ReorderBufferTXN *subtxn;
1041  bool new_top;
1042  bool new_sub;
1043 
1044  txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1045  subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1046 
1047  if (!new_sub)
1048  {
1049  if (rbtxn_is_known_subxact(subtxn))
1050  {
1051  /* already associated, nothing to do */
1052  return;
1053  }
1054  else
1055  {
1056  /*
1057  * We already saw this transaction, but initially added it to the
1058  * list of top-level txns. Now that we know it's not top-level,
1059  * remove it from there.
1060  */
1061  dlist_delete(&subtxn->node);
1062  }
1063  }
1064 
1065  subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1066  subtxn->toplevel_xid = xid;
1067  Assert(subtxn->nsubtxns == 0);
1068 
1069  /* set the reference to top-level transaction */
1070  subtxn->toptxn = txn;
1071 
1072  /* add to subtransaction list */
1073  dlist_push_tail(&txn->subtxns, &subtxn->node);
1074  txn->nsubtxns++;
1075 
1076  /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1077  ReorderBufferTransferSnapToParent(txn, subtxn);
1078 
1079  /* Verify LSN-ordering invariant */
1080  AssertTXNLsnOrder(rb);
1081 }
1082 
1083 /*
1084  * ReorderBufferTransferSnapToParent
1085  * Transfer base snapshot from subtxn to top-level txn, if needed
1086  *
1087  * This is done if the top-level txn doesn't have a base snapshot, or if the
1088  * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1089  * snapshot's LSN. This can happen if there are no changes in the toplevel
1090  * txn but there are some in the subtxn, or the first change in subtxn has
1091  * earlier LSN than first change in the top-level txn and we learned about
1092  * their kinship only now.
1093  *
1094  * The subtransaction's snapshot is cleared regardless of the transfer
1095  * happening, since it's not needed anymore in either case.
1096  *
1097  * We do this as soon as we become aware of their kinship, to avoid queueing
1098  * extra snapshots to txns known-as-subtxns -- only top-level txns will
1099  * receive further snapshots.
1100  */
1101 static void
1103  ReorderBufferTXN *subtxn)
1104 {
1105  Assert(subtxn->toplevel_xid == txn->xid);
1106 
1107  if (subtxn->base_snapshot != NULL)
1108  {
1109  if (txn->base_snapshot == NULL ||
1110  subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1111  {
1112  /*
1113  * If the toplevel transaction already has a base snapshot but
1114  * it's newer than the subxact's, purge it.
1115  */
1116  if (txn->base_snapshot != NULL)
1117  {
1120  }
1121 
1122  /*
1123  * The snapshot is now the top transaction's; transfer it, and
1124  * adjust the list position of the top transaction in the list by
1125  * moving it to where the subtransaction is.
1126  */
1127  txn->base_snapshot = subtxn->base_snapshot;
1128  txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1130  &txn->base_snapshot_node);
1131 
1132  /*
1133  * The subtransaction doesn't have a snapshot anymore (so it
1134  * mustn't be in the list.)
1135  */
1136  subtxn->base_snapshot = NULL;
1138  dlist_delete(&subtxn->base_snapshot_node);
1139  }
1140  else
1141  {
1142  /* Base snap of toplevel is fine, so subxact's is not needed */
1144  dlist_delete(&subtxn->base_snapshot_node);
1145  subtxn->base_snapshot = NULL;
1147  }
1148  }
1149 }
1150 
1151 /*
1152  * Associate a subtransaction with its toplevel transaction at commit
1153  * time. There may be no further changes added after this.
1154  */
1155 void
1157  TransactionId subxid, XLogRecPtr commit_lsn,
1158  XLogRecPtr end_lsn)
1159 {
1160  ReorderBufferTXN *subtxn;
1161 
1162  subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1163  InvalidXLogRecPtr, false);
1164 
1165  /*
1166  * No need to do anything if that subtxn didn't contain any changes
1167  */
1168  if (!subtxn)
1169  return;
1170 
1171  subtxn->final_lsn = commit_lsn;
1172  subtxn->end_lsn = end_lsn;
1173 
1174  /*
1175  * Assign this subxact as a child of the toplevel xact (no-op if already
1176  * done.)
1177  */
1178  ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1179 }
1180 
1181 
1182 /*
1183  * Support for efficiently iterating over a transaction's and its
1184  * subtransactions' changes.
1185  *
1186  * We do by doing a k-way merge between transactions/subtransactions. For that
1187  * we model the current heads of the different transactions as a binary heap
1188  * so we easily know which (sub-)transaction has the change with the smallest
1189  * lsn next.
1190  *
1191  * We assume the changes in individual transactions are already sorted by LSN.
1192  */
1193 
1194 /*
1195  * Binary heap comparison function.
1196  */
1197 static int
1199 {
1201  XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1202  XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1203 
1204  if (pos_a < pos_b)
1205  return 1;
1206  else if (pos_a == pos_b)
1207  return 0;
1208  return -1;
1209 }
1210 
1211 /*
1212  * Allocate & initialize an iterator which iterates in lsn order over a
1213  * transaction and all its subtransactions.
1214  *
1215  * Note: The iterator state is returned through iter_state parameter rather
1216  * than the function's return value. This is because the state gets cleaned up
1217  * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1218  * back the state even if this function throws an exception.
1219  */
1220 static void
1222  ReorderBufferIterTXNState *volatile *iter_state)
1223 {
1224  Size nr_txns = 0;
1226  dlist_iter cur_txn_i;
1227  int32 off;
1228 
1229  *iter_state = NULL;
1230 
1231  /* Check ordering of changes in the toplevel transaction. */
1232  AssertChangeLsnOrder(txn);
1233 
1234  /*
1235  * Calculate the size of our heap: one element for every transaction that
1236  * contains changes. (Besides the transactions already in the reorder
1237  * buffer, we count the one we were directly passed.)
1238  */
1239  if (txn->nentries > 0)
1240  nr_txns++;
1241 
1242  dlist_foreach(cur_txn_i, &txn->subtxns)
1243  {
1244  ReorderBufferTXN *cur_txn;
1245 
1246  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1247 
1248  /* Check ordering of changes in this subtransaction. */
1249  AssertChangeLsnOrder(cur_txn);
1250 
1251  if (cur_txn->nentries > 0)
1252  nr_txns++;
1253  }
1254 
1255  /* allocate iteration state */
1258  sizeof(ReorderBufferIterTXNState) +
1259  sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1260 
1261  state->nr_txns = nr_txns;
1262  dlist_init(&state->old_change);
1263 
1264  for (off = 0; off < state->nr_txns; off++)
1265  {
1266  state->entries[off].file.vfd = -1;
1267  state->entries[off].segno = 0;
1268  }
1269 
1270  /* allocate heap */
1271  state->heap = binaryheap_allocate(state->nr_txns,
1273  state);
1274 
1275  /* Now that the state fields are initialized, it is safe to return it. */
1276  *iter_state = state;
1277 
1278  /*
1279  * Now insert items into the binary heap, in an unordered fashion. (We
1280  * will run a heap assembly step at the end; this is more efficient.)
1281  */
1282 
1283  off = 0;
1284 
1285  /* add toplevel transaction if it contains changes */
1286  if (txn->nentries > 0)
1287  {
1288  ReorderBufferChange *cur_change;
1289 
1290  if (rbtxn_is_serialized(txn))
1291  {
1292  /* serialize remaining changes */
1293  ReorderBufferSerializeTXN(rb, txn);
1294  ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1295  &state->entries[off].segno);
1296  }
1297 
1298  cur_change = dlist_head_element(ReorderBufferChange, node,
1299  &txn->changes);
1300 
1301  state->entries[off].lsn = cur_change->lsn;
1302  state->entries[off].change = cur_change;
1303  state->entries[off].txn = txn;
1304 
1306  }
1307 
1308  /* add subtransactions if they contain changes */
1309  dlist_foreach(cur_txn_i, &txn->subtxns)
1310  {
1311  ReorderBufferTXN *cur_txn;
1312 
1313  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1314 
1315  if (cur_txn->nentries > 0)
1316  {
1317  ReorderBufferChange *cur_change;
1318 
1319  if (rbtxn_is_serialized(cur_txn))
1320  {
1321  /* serialize remaining changes */
1322  ReorderBufferSerializeTXN(rb, cur_txn);
1323  ReorderBufferRestoreChanges(rb, cur_txn,
1324  &state->entries[off].file,
1325  &state->entries[off].segno);
1326  }
1327  cur_change = dlist_head_element(ReorderBufferChange, node,
1328  &cur_txn->changes);
1329 
1330  state->entries[off].lsn = cur_change->lsn;
1331  state->entries[off].change = cur_change;
1332  state->entries[off].txn = cur_txn;
1333 
1335  }
1336  }
1337 
1338  /* assemble a valid binary heap */
1339  binaryheap_build(state->heap);
1340 }
1341 
1342 /*
1343  * Return the next change when iterating over a transaction and its
1344  * subtransactions.
1345  *
1346  * Returns NULL when no further changes exist.
1347  */
1348 static ReorderBufferChange *
1350 {
1351  ReorderBufferChange *change;
1353  int32 off;
1354 
1355  /* nothing there anymore */
1356  if (state->heap->bh_size == 0)
1357  return NULL;
1358 
1359  off = DatumGetInt32(binaryheap_first(state->heap));
1360  entry = &state->entries[off];
1361 
1362  /* free memory we might have "leaked" in the previous *Next call */
1363  if (!dlist_is_empty(&state->old_change))
1364  {
1365  change = dlist_container(ReorderBufferChange, node,
1366  dlist_pop_head_node(&state->old_change));
1367  ReorderBufferReturnChange(rb, change, true);
1368  Assert(dlist_is_empty(&state->old_change));
1369  }
1370 
1371  change = entry->change;
1372 
1373  /*
1374  * update heap with information about which transaction has the next
1375  * relevant change in LSN order
1376  */
1377 
1378  /* there are in-memory changes */
1379  if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1380  {
1381  dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1382  ReorderBufferChange *next_change =
1384 
1385  /* txn stays the same */
1386  state->entries[off].lsn = next_change->lsn;
1387  state->entries[off].change = next_change;
1388 
1390  return change;
1391  }
1392 
1393  /* try to load changes from disk */
1394  if (entry->txn->nentries != entry->txn->nentries_mem)
1395  {
1396  /*
1397  * Ugly: restoring changes will reuse *Change records, thus delete the
1398  * current one from the per-tx list and only free in the next call.
1399  */
1400  dlist_delete(&change->node);
1401  dlist_push_tail(&state->old_change, &change->node);
1402 
1403  /*
1404  * Update the total bytes processed by the txn for which we are
1405  * releasing the current set of changes and restoring the new set of
1406  * changes.
1407  */
1408  rb->totalBytes += entry->txn->size;
1409  if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1410  &state->entries[off].segno))
1411  {
1412  /* successfully restored changes from disk */
1413  ReorderBufferChange *next_change =
1415  &entry->txn->changes);
1416 
1417  elog(DEBUG2, "restored %u/%u changes from disk",
1418  (uint32) entry->txn->nentries_mem,
1419  (uint32) entry->txn->nentries);
1420 
1421  Assert(entry->txn->nentries_mem);
1422  /* txn stays the same */
1423  state->entries[off].lsn = next_change->lsn;
1424  state->entries[off].change = next_change;
1426 
1427  return change;
1428  }
1429  }
1430 
1431  /* ok, no changes there anymore, remove */
1433 
1434  return change;
1435 }
1436 
1437 /*
1438  * Deallocate the iterator
1439  */
1440 static void
1443 {
1444  int32 off;
1445 
1446  for (off = 0; off < state->nr_txns; off++)
1447  {
1448  if (state->entries[off].file.vfd != -1)
1449  FileClose(state->entries[off].file.vfd);
1450  }
1451 
1452  /* free memory we might have "leaked" in the last *Next call */
1453  if (!dlist_is_empty(&state->old_change))
1454  {
1455  ReorderBufferChange *change;
1456 
1457  change = dlist_container(ReorderBufferChange, node,
1458  dlist_pop_head_node(&state->old_change));
1459  ReorderBufferReturnChange(rb, change, true);
1460  Assert(dlist_is_empty(&state->old_change));
1461  }
1462 
1463  binaryheap_free(state->heap);
1464  pfree(state);
1465 }
1466 
1467 /*
1468  * Cleanup the contents of a transaction, usually after the transaction
1469  * committed or aborted.
1470  */
1471 static void
1473 {
1474  bool found;
1475  dlist_mutable_iter iter;
1476 
1477  /* cleanup subtransactions & their changes */
1478  dlist_foreach_modify(iter, &txn->subtxns)
1479  {
1480  ReorderBufferTXN *subtxn;
1481 
1482  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1483 
1484  /*
1485  * Subtransactions are always associated to the toplevel TXN, even if
1486  * they originally were happening inside another subtxn, so we won't
1487  * ever recurse more than one level deep here.
1488  */
1489  Assert(rbtxn_is_known_subxact(subtxn));
1490  Assert(subtxn->nsubtxns == 0);
1491 
1492  ReorderBufferCleanupTXN(rb, subtxn);
1493  }
1494 
1495  /* cleanup changes in the txn */
1496  dlist_foreach_modify(iter, &txn->changes)
1497  {
1498  ReorderBufferChange *change;
1499 
1500  change = dlist_container(ReorderBufferChange, node, iter.cur);
1501 
1502  /* Check we're not mixing changes from different transactions. */
1503  Assert(change->txn == txn);
1504 
1505  ReorderBufferReturnChange(rb, change, true);
1506  }
1507 
1508  /*
1509  * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1510  * They are always stored in the toplevel transaction.
1511  */
1512  dlist_foreach_modify(iter, &txn->tuplecids)
1513  {
1514  ReorderBufferChange *change;
1515 
1516  change = dlist_container(ReorderBufferChange, node, iter.cur);
1517 
1518  /* Check we're not mixing changes from different transactions. */
1519  Assert(change->txn == txn);
1521 
1522  ReorderBufferReturnChange(rb, change, true);
1523  }
1524 
1525  /*
1526  * Cleanup the base snapshot, if set.
1527  */
1528  if (txn->base_snapshot != NULL)
1529  {
1532  }
1533 
1534  /*
1535  * Cleanup the snapshot for the last streamed run.
1536  */
1537  if (txn->snapshot_now != NULL)
1538  {
1539  Assert(rbtxn_is_streamed(txn));
1541  }
1542 
1543  /*
1544  * Remove TXN from its containing lists.
1545  *
1546  * Note: if txn is known as subxact, we are deleting the TXN from its
1547  * parent's list of known subxacts; this leaves the parent's nsubxacts
1548  * count too high, but we don't care. Otherwise, we are deleting the TXN
1549  * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1550  * list of catalog modifying transactions as well.
1551  */
1552  dlist_delete(&txn->node);
1553  if (rbtxn_has_catalog_changes(txn))
1555 
1556  /* now remove reference from buffer */
1557  hash_search(rb->by_txn,
1558  (void *) &txn->xid,
1559  HASH_REMOVE,
1560  &found);
1561  Assert(found);
1562 
1563  /* remove entries spilled to disk */
1564  if (rbtxn_is_serialized(txn))
1565  ReorderBufferRestoreCleanup(rb, txn);
1566 
1567  /* deallocate */
1568  ReorderBufferReturnTXN(rb, txn);
1569 }
1570 
1571 /*
1572  * Discard changes from a transaction (and subtransactions), either after
1573  * streaming or decoding them at PREPARE. Keep the remaining info -
1574  * transactions, tuplecids, invalidations and snapshots.
1575  *
1576  * We additionally remove tuplecids after decoding the transaction at prepare
1577  * time as we only need to perform invalidation at rollback or commit prepared.
1578  *
1579  * 'txn_prepared' indicates that we have decoded the transaction at prepare
1580  * time.
1581  */
1582 static void
1584 {
1585  dlist_mutable_iter iter;
1586 
1587  /* cleanup subtransactions & their changes */
1588  dlist_foreach_modify(iter, &txn->subtxns)
1589  {
1590  ReorderBufferTXN *subtxn;
1591 
1592  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1593 
1594  /*
1595  * Subtransactions are always associated to the toplevel TXN, even if
1596  * they originally were happening inside another subtxn, so we won't
1597  * ever recurse more than one level deep here.
1598  */
1599  Assert(rbtxn_is_known_subxact(subtxn));
1600  Assert(subtxn->nsubtxns == 0);
1601 
1602  ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1603  }
1604 
1605  /* cleanup changes in the txn */
1606  dlist_foreach_modify(iter, &txn->changes)
1607  {
1608  ReorderBufferChange *change;
1609 
1610  change = dlist_container(ReorderBufferChange, node, iter.cur);
1611 
1612  /* Check we're not mixing changes from different transactions. */
1613  Assert(change->txn == txn);
1614 
1615  /* remove the change from it's containing list */
1616  dlist_delete(&change->node);
1617 
1618  ReorderBufferReturnChange(rb, change, true);
1619  }
1620 
1621  /*
1622  * Mark the transaction as streamed.
1623  *
1624  * The toplevel transaction, identified by (toptxn==NULL), is marked as
1625  * streamed always, even if it does not contain any changes (that is, when
1626  * all the changes are in subtransactions).
1627  *
1628  * For subtransactions, we only mark them as streamed when there are
1629  * changes in them.
1630  *
1631  * We do it this way because of aborts - we don't want to send aborts for
1632  * XIDs the downstream is not aware of. And of course, it always knows
1633  * about the toplevel xact (we send the XID in all messages), but we never
1634  * stream XIDs of empty subxacts.
1635  */
1636  if ((!txn_prepared) && ((!txn->toptxn) || (txn->nentries_mem != 0)))
1637  txn->txn_flags |= RBTXN_IS_STREAMED;
1638 
1639  if (txn_prepared)
1640  {
1641  /*
1642  * If this is a prepared txn, cleanup the tuplecids we stored for
1643  * decoding catalog snapshot access. They are always stored in the
1644  * toplevel transaction.
1645  */
1646  dlist_foreach_modify(iter, &txn->tuplecids)
1647  {
1648  ReorderBufferChange *change;
1649 
1650  change = dlist_container(ReorderBufferChange, node, iter.cur);
1651 
1652  /* Check we're not mixing changes from different transactions. */
1653  Assert(change->txn == txn);
1655 
1656  /* Remove the change from its containing list. */
1657  dlist_delete(&change->node);
1658 
1659  ReorderBufferReturnChange(rb, change, true);
1660  }
1661  }
1662 
1663  /*
1664  * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1665  * memory. We could also keep the hash table and update it with new ctid
1666  * values, but this seems simpler and good enough for now.
1667  */
1668  if (txn->tuplecid_hash != NULL)
1669  {
1671  txn->tuplecid_hash = NULL;
1672  }
1673 
1674  /* If this txn is serialized then clean the disk space. */
1675  if (rbtxn_is_serialized(txn))
1676  {
1677  ReorderBufferRestoreCleanup(rb, txn);
1678  txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1679 
1680  /*
1681  * We set this flag to indicate if the transaction is ever serialized.
1682  * We need this to accurately update the stats as otherwise the same
1683  * transaction can be counted as serialized multiple times.
1684  */
1686  }
1687 
1688  /* also reset the number of entries in the transaction */
1689  txn->nentries_mem = 0;
1690  txn->nentries = 0;
1691 }
1692 
1693 /*
1694  * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1695  * HeapTupleSatisfiesHistoricMVCC.
1696  */
1697 static void
1699 {
1700  dlist_iter iter;
1701  HASHCTL hash_ctl;
1702 
1704  return;
1705 
1706  hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1707  hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1708  hash_ctl.hcxt = rb->context;
1709 
1710  /*
1711  * create the hash with the exact number of to-be-stored tuplecids from
1712  * the start
1713  */
1714  txn->tuplecid_hash =
1715  hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1717 
1718  dlist_foreach(iter, &txn->tuplecids)
1719  {
1722  bool found;
1723  ReorderBufferChange *change;
1724 
1725  change = dlist_container(ReorderBufferChange, node, iter.cur);
1726 
1728 
1729  /* be careful about padding */
1730  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1731 
1732  key.rlocator = change->data.tuplecid.locator;
1733 
1734  ItemPointerCopy(&change->data.tuplecid.tid,
1735  &key.tid);
1736 
1737  ent = (ReorderBufferTupleCidEnt *)
1739  (void *) &key,
1740  HASH_ENTER,
1741  &found);
1742  if (!found)
1743  {
1744  ent->cmin = change->data.tuplecid.cmin;
1745  ent->cmax = change->data.tuplecid.cmax;
1746  ent->combocid = change->data.tuplecid.combocid;
1747  }
1748  else
1749  {
1750  /*
1751  * Maybe we already saw this tuple before in this transaction, but
1752  * if so it must have the same cmin.
1753  */
1754  Assert(ent->cmin == change->data.tuplecid.cmin);
1755 
1756  /*
1757  * cmax may be initially invalid, but once set it can only grow,
1758  * and never become invalid again.
1759  */
1760  Assert((ent->cmax == InvalidCommandId) ||
1761  ((change->data.tuplecid.cmax != InvalidCommandId) &&
1762  (change->data.tuplecid.cmax > ent->cmax)));
1763  ent->cmax = change->data.tuplecid.cmax;
1764  }
1765  }
1766 }
1767 
1768 /*
1769  * Copy a provided snapshot so we can modify it privately. This is needed so
1770  * that catalog modifying transactions can look into intermediate catalog
1771  * states.
1772  */
1773 static Snapshot
1775  ReorderBufferTXN *txn, CommandId cid)
1776 {
1777  Snapshot snap;
1778  dlist_iter iter;
1779  int i = 0;
1780  Size size;
1781 
1782  size = sizeof(SnapshotData) +
1783  sizeof(TransactionId) * orig_snap->xcnt +
1784  sizeof(TransactionId) * (txn->nsubtxns + 1);
1785 
1786  snap = MemoryContextAllocZero(rb->context, size);
1787  memcpy(snap, orig_snap, sizeof(SnapshotData));
1788 
1789  snap->copied = true;
1790  snap->active_count = 1; /* mark as active so nobody frees it */
1791  snap->regd_count = 0;
1792  snap->xip = (TransactionId *) (snap + 1);
1793 
1794  memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1795 
1796  /*
1797  * snap->subxip contains all txids that belong to our transaction which we
1798  * need to check via cmin/cmax. That's why we store the toplevel
1799  * transaction in there as well.
1800  */
1801  snap->subxip = snap->xip + snap->xcnt;
1802  snap->subxip[i++] = txn->xid;
1803 
1804  /*
1805  * subxcnt isn't decreased when subtransactions abort, so count manually.
1806  * Since it's an upper boundary it is safe to use it for the allocation
1807  * above.
1808  */
1809  snap->subxcnt = 1;
1810 
1811  dlist_foreach(iter, &txn->subtxns)
1812  {
1813  ReorderBufferTXN *sub_txn;
1814 
1815  sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1816  snap->subxip[i++] = sub_txn->xid;
1817  snap->subxcnt++;
1818  }
1819 
1820  /* sort so we can bsearch() later */
1821  qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1822 
1823  /* store the specified current CommandId */
1824  snap->curcid = cid;
1825 
1826  return snap;
1827 }
1828 
1829 /*
1830  * Free a previously ReorderBufferCopySnap'ed snapshot
1831  */
1832 static void
1834 {
1835  if (snap->copied)
1836  pfree(snap);
1837  else
1839 }
1840 
1841 /*
1842  * If the transaction was (partially) streamed, we need to prepare or commit
1843  * it in a 'streamed' way. That is, we first stream the remaining part of the
1844  * transaction, and then invoke stream_prepare or stream_commit message as per
1845  * the case.
1846  */
1847 static void
1849 {
1850  /* we should only call this for previously streamed transactions */
1851  Assert(rbtxn_is_streamed(txn));
1852 
1853  ReorderBufferStreamTXN(rb, txn);
1854 
1855  if (rbtxn_prepared(txn))
1856  {
1857  /*
1858  * Note, we send stream prepare even if a concurrent abort is
1859  * detected. See DecodePrepare for more information.
1860  */
1861  rb->stream_prepare(rb, txn, txn->final_lsn);
1862 
1863  /*
1864  * This is a PREPARED transaction, part of a two-phase commit. The
1865  * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1866  * just truncate txn by removing changes and tuple_cids.
1867  */
1868  ReorderBufferTruncateTXN(rb, txn, true);
1869  /* Reset the CheckXidAlive */
1871  }
1872  else
1873  {
1874  rb->stream_commit(rb, txn, txn->final_lsn);
1875  ReorderBufferCleanupTXN(rb, txn);
1876  }
1877 }
1878 
1879 /*
1880  * Set xid to detect concurrent aborts.
1881  *
1882  * While streaming an in-progress transaction or decoding a prepared
1883  * transaction there is a possibility that the (sub)transaction might get
1884  * aborted concurrently. In such case if the (sub)transaction has catalog
1885  * update then we might decode the tuple using wrong catalog version. For
1886  * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1887  * the transaction 501 updates the catalog tuple and after that we will have
1888  * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1889  * aborted and some other transaction say 502 updates the same catalog tuple
1890  * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1891  * problem is that when we try to decode the tuple inserted/updated in 501
1892  * after the catalog update, we will see the catalog tuple with (xmin: 500,
1893  * xmax: 502) as visible because it will consider that the tuple is deleted by
1894  * xid 502 which is not visible to our snapshot. And when we will try to
1895  * decode with that catalog tuple, it can lead to a wrong result or a crash.
1896  * So, it is necessary to detect concurrent aborts to allow streaming of
1897  * in-progress transactions or decoding of prepared transactions.
1898  *
1899  * For detecting the concurrent abort we set CheckXidAlive to the current
1900  * (sub)transaction's xid for which this change belongs to. And, during
1901  * catalog scan we can check the status of the xid and if it is aborted we will
1902  * report a specific error so that we can stop streaming current transaction
1903  * and discard the already streamed changes on such an error. We might have
1904  * already streamed some of the changes for the aborted (sub)transaction, but
1905  * that is fine because when we decode the abort we will stream abort message
1906  * to truncate the changes in the subscriber. Similarly, for prepared
1907  * transactions, we stop decoding if concurrent abort is detected and then
1908  * rollback the changes when rollback prepared is encountered. See
1909  * DecodePrepare.
1910  */
1911 static inline void
1913 {
1914  /*
1915  * If the input transaction id is already set as a CheckXidAlive then
1916  * nothing to do.
1917  */
1919  return;
1920 
1921  /*
1922  * setup CheckXidAlive if it's not committed yet. We don't check if the
1923  * xid is aborted. That will happen during catalog access.
1924  */
1925  if (!TransactionIdDidCommit(xid))
1926  CheckXidAlive = xid;
1927  else
1929 }
1930 
1931 /*
1932  * Helper function for ReorderBufferProcessTXN for applying change.
1933  */
1934 static inline void
1936  Relation relation, ReorderBufferChange *change,
1937  bool streaming)
1938 {
1939  if (streaming)
1940  rb->stream_change(rb, txn, relation, change);
1941  else
1942  rb->apply_change(rb, txn, relation, change);
1943 }
1944 
1945 /*
1946  * Helper function for ReorderBufferProcessTXN for applying the truncate.
1947  */
1948 static inline void
1950  int nrelations, Relation *relations,
1951  ReorderBufferChange *change, bool streaming)
1952 {
1953  if (streaming)
1954  rb->stream_truncate(rb, txn, nrelations, relations, change);
1955  else
1956  rb->apply_truncate(rb, txn, nrelations, relations, change);
1957 }
1958 
1959 /*
1960  * Helper function for ReorderBufferProcessTXN for applying the message.
1961  */
1962 static inline void
1964  ReorderBufferChange *change, bool streaming)
1965 {
1966  if (streaming)
1967  rb->stream_message(rb, txn, change->lsn, true,
1968  change->data.msg.prefix,
1969  change->data.msg.message_size,
1970  change->data.msg.message);
1971  else
1972  rb->message(rb, txn, change->lsn, true,
1973  change->data.msg.prefix,
1974  change->data.msg.message_size,
1975  change->data.msg.message);
1976 }
1977 
1978 /*
1979  * Function to store the command id and snapshot at the end of the current
1980  * stream so that we can reuse the same while sending the next stream.
1981  */
1982 static inline void
1984  Snapshot snapshot_now, CommandId command_id)
1985 {
1986  txn->command_id = command_id;
1987 
1988  /* Avoid copying if it's already copied. */
1989  if (snapshot_now->copied)
1990  txn->snapshot_now = snapshot_now;
1991  else
1992  txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
1993  txn, command_id);
1994 }
1995 
1996 /*
1997  * Helper function for ReorderBufferProcessTXN to handle the concurrent
1998  * abort of the streaming transaction. This resets the TXN such that it
1999  * can be used to stream the remaining data of transaction being processed.
2000  * This can happen when the subtransaction is aborted and we still want to
2001  * continue processing the main or other subtransactions data.
2002  */
2003 static void
2005  Snapshot snapshot_now,
2006  CommandId command_id,
2007  XLogRecPtr last_lsn,
2008  ReorderBufferChange *specinsert)
2009 {
2010  /* Discard the changes that we just streamed */
2012 
2013  /* Free all resources allocated for toast reconstruction */
2014  ReorderBufferToastReset(rb, txn);
2015 
2016  /* Return the spec insert change if it is not NULL */
2017  if (specinsert != NULL)
2018  {
2019  ReorderBufferReturnChange(rb, specinsert, true);
2020  specinsert = NULL;
2021  }
2022 
2023  /*
2024  * For the streaming case, stop the stream and remember the command ID and
2025  * snapshot for the streaming run.
2026  */
2027  if (rbtxn_is_streamed(txn))
2028  {
2029  rb->stream_stop(rb, txn, last_lsn);
2030  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2031  }
2032 }
2033 
2034 /*
2035  * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2036  *
2037  * Send data of a transaction (and its subtransactions) to the
2038  * output plugin. We iterate over the top and subtransactions (using a k-way
2039  * merge) and replay the changes in lsn order.
2040  *
2041  * If streaming is true then data will be sent using stream API.
2042  *
2043  * Note: "volatile" markers on some parameters are to avoid trouble with
2044  * PG_TRY inside the function.
2045  */
2046 static void
2048  XLogRecPtr commit_lsn,
2049  volatile Snapshot snapshot_now,
2050  volatile CommandId command_id,
2051  bool streaming)
2052 {
2053  bool using_subtxn;
2055  ReorderBufferIterTXNState *volatile iterstate = NULL;
2056  volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2057  ReorderBufferChange *volatile specinsert = NULL;
2058  volatile bool stream_started = false;
2059  ReorderBufferTXN *volatile curtxn = NULL;
2060 
2061  /* build data to be able to lookup the CommandIds of catalog tuples */
2063 
2064  /* setup the initial snapshot */
2065  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2066 
2067  /*
2068  * Decoding needs access to syscaches et al., which in turn use
2069  * heavyweight locks and such. Thus we need to have enough state around to
2070  * keep track of those. The easiest way is to simply use a transaction
2071  * internally. That also allows us to easily enforce that nothing writes
2072  * to the database by checking for xid assignments.
2073  *
2074  * When we're called via the SQL SRF there's already a transaction
2075  * started, so start an explicit subtransaction there.
2076  */
2077  using_subtxn = IsTransactionOrTransactionBlock();
2078 
2079  PG_TRY();
2080  {
2081  ReorderBufferChange *change;
2082 
2083  if (using_subtxn)
2084  BeginInternalSubTransaction(streaming ? "stream" : "replay");
2085  else
2087 
2088  /*
2089  * We only need to send begin/begin-prepare for non-streamed
2090  * transactions.
2091  */
2092  if (!streaming)
2093  {
2094  if (rbtxn_prepared(txn))
2095  rb->begin_prepare(rb, txn);
2096  else
2097  rb->begin(rb, txn);
2098  }
2099 
2100  ReorderBufferIterTXNInit(rb, txn, &iterstate);
2101  while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2102  {
2103  Relation relation = NULL;
2104  Oid reloid;
2105 
2107 
2108  /*
2109  * We can't call start stream callback before processing first
2110  * change.
2111  */
2112  if (prev_lsn == InvalidXLogRecPtr)
2113  {
2114  if (streaming)
2115  {
2116  txn->origin_id = change->origin_id;
2117  rb->stream_start(rb, txn, change->lsn);
2118  stream_started = true;
2119  }
2120  }
2121 
2122  /*
2123  * Enforce correct ordering of changes, merged from multiple
2124  * subtransactions. The changes may have the same LSN due to
2125  * MULTI_INSERT xlog records.
2126  */
2127  Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2128 
2129  prev_lsn = change->lsn;
2130 
2131  /*
2132  * Set the current xid to detect concurrent aborts. This is
2133  * required for the cases when we decode the changes before the
2134  * COMMIT record is processed.
2135  */
2136  if (streaming || rbtxn_prepared(change->txn))
2137  {
2138  curtxn = change->txn;
2139  SetupCheckXidLive(curtxn->xid);
2140  }
2141 
2142  switch (change->action)
2143  {
2145 
2146  /*
2147  * Confirmation for speculative insertion arrived. Simply
2148  * use as a normal record. It'll be cleaned up at the end
2149  * of INSERT processing.
2150  */
2151  if (specinsert == NULL)
2152  elog(ERROR, "invalid ordering of speculative insertion changes");
2153  Assert(specinsert->data.tp.oldtuple == NULL);
2154  change = specinsert;
2156 
2157  /* intentionally fall through */
2161  Assert(snapshot_now);
2162 
2163  reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2164  change->data.tp.rlocator.relNumber);
2165 
2166  /*
2167  * Mapped catalog tuple without data, emitted while
2168  * catalog table was in the process of being rewritten. We
2169  * can fail to look up the relfilenumber, because the
2170  * relmapper has no "historic" view, in contrast to the
2171  * normal catalog during decoding. Thus repeated rewrites
2172  * can cause a lookup failure. That's OK because we do not
2173  * decode catalog changes anyway. Normally such tuples
2174  * would be skipped over below, but we can't identify
2175  * whether the table should be logically logged without
2176  * mapping the relfilenumber to the oid.
2177  */
2178  if (reloid == InvalidOid &&
2179  change->data.tp.newtuple == NULL &&
2180  change->data.tp.oldtuple == NULL)
2181  goto change_done;
2182  else if (reloid == InvalidOid)
2183  elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2184  relpathperm(change->data.tp.rlocator,
2185  MAIN_FORKNUM));
2186 
2187  relation = RelationIdGetRelation(reloid);
2188 
2189  if (!RelationIsValid(relation))
2190  elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2191  reloid,
2192  relpathperm(change->data.tp.rlocator,
2193  MAIN_FORKNUM));
2194 
2195  if (!RelationIsLogicallyLogged(relation))
2196  goto change_done;
2197 
2198  /*
2199  * Ignore temporary heaps created during DDL unless the
2200  * plugin has asked for them.
2201  */
2202  if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2203  goto change_done;
2204 
2205  /*
2206  * For now ignore sequence changes entirely. Most of the
2207  * time they don't log changes using records we
2208  * understand, so it doesn't make sense to handle the few
2209  * cases we do.
2210  */
2211  if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2212  goto change_done;
2213 
2214  /* user-triggered change */
2215  if (!IsToastRelation(relation))
2216  {
2217  ReorderBufferToastReplace(rb, txn, relation, change);
2218  ReorderBufferApplyChange(rb, txn, relation, change,
2219  streaming);
2220 
2221  /*
2222  * Only clear reassembled toast chunks if we're sure
2223  * they're not required anymore. The creator of the
2224  * tuple tells us.
2225  */
2226  if (change->data.tp.clear_toast_afterwards)
2227  ReorderBufferToastReset(rb, txn);
2228  }
2229  /* we're not interested in toast deletions */
2230  else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2231  {
2232  /*
2233  * Need to reassemble the full toasted Datum in
2234  * memory, to ensure the chunks don't get reused till
2235  * we're done remove it from the list of this
2236  * transaction's changes. Otherwise it will get
2237  * freed/reused while restoring spooled data from
2238  * disk.
2239  */
2240  Assert(change->data.tp.newtuple != NULL);
2241 
2242  dlist_delete(&change->node);
2243  ReorderBufferToastAppendChunk(rb, txn, relation,
2244  change);
2245  }
2246 
2247  change_done:
2248 
2249  /*
2250  * If speculative insertion was confirmed, the record
2251  * isn't needed anymore.
2252  */
2253  if (specinsert != NULL)
2254  {
2255  ReorderBufferReturnChange(rb, specinsert, true);
2256  specinsert = NULL;
2257  }
2258 
2259  if (RelationIsValid(relation))
2260  {
2261  RelationClose(relation);
2262  relation = NULL;
2263  }
2264  break;
2265 
2267 
2268  /*
2269  * Speculative insertions are dealt with by delaying the
2270  * processing of the insert until the confirmation record
2271  * arrives. For that we simply unlink the record from the
2272  * chain, so it does not get freed/reused while restoring
2273  * spooled data from disk.
2274  *
2275  * This is safe in the face of concurrent catalog changes
2276  * because the relevant relation can't be changed between
2277  * speculative insertion and confirmation due to
2278  * CheckTableNotInUse() and locking.
2279  */
2280 
2281  /* clear out a pending (and thus failed) speculation */
2282  if (specinsert != NULL)
2283  {
2284  ReorderBufferReturnChange(rb, specinsert, true);
2285  specinsert = NULL;
2286  }
2287 
2288  /* and memorize the pending insertion */
2289  dlist_delete(&change->node);
2290  specinsert = change;
2291  break;
2292 
2294 
2295  /*
2296  * Abort for speculative insertion arrived. So cleanup the
2297  * specinsert tuple and toast hash.
2298  *
2299  * Note that we get the spec abort change for each toast
2300  * entry but we need to perform the cleanup only the first
2301  * time we get it for the main table.
2302  */
2303  if (specinsert != NULL)
2304  {
2305  /*
2306  * We must clean the toast hash before processing a
2307  * completely new tuple to avoid confusion about the
2308  * previous tuple's toast chunks.
2309  */
2310  Assert(change->data.tp.clear_toast_afterwards);
2311  ReorderBufferToastReset(rb, txn);
2312 
2313  /* We don't need this record anymore. */
2314  ReorderBufferReturnChange(rb, specinsert, true);
2315  specinsert = NULL;
2316  }
2317  break;
2318 
2320  {
2321  int i;
2322  int nrelids = change->data.truncate.nrelids;
2323  int nrelations = 0;
2324  Relation *relations;
2325 
2326  relations = palloc0(nrelids * sizeof(Relation));
2327  for (i = 0; i < nrelids; i++)
2328  {
2329  Oid relid = change->data.truncate.relids[i];
2330  Relation rel;
2331 
2332  rel = RelationIdGetRelation(relid);
2333 
2334  if (!RelationIsValid(rel))
2335  elog(ERROR, "could not open relation with OID %u", relid);
2336 
2337  if (!RelationIsLogicallyLogged(rel))
2338  continue;
2339 
2340  relations[nrelations++] = rel;
2341  }
2342 
2343  /* Apply the truncate. */
2344  ReorderBufferApplyTruncate(rb, txn, nrelations,
2345  relations, change,
2346  streaming);
2347 
2348  for (i = 0; i < nrelations; i++)
2349  RelationClose(relations[i]);
2350 
2351  break;
2352  }
2353 
2355  ReorderBufferApplyMessage(rb, txn, change, streaming);
2356  break;
2357 
2359  /* Execute the invalidation messages locally */
2360  ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2361  change->data.inval.invalidations);
2362  break;
2363 
2365  /* get rid of the old */
2366  TeardownHistoricSnapshot(false);
2367 
2368  if (snapshot_now->copied)
2369  {
2370  ReorderBufferFreeSnap(rb, snapshot_now);
2371  snapshot_now =
2372  ReorderBufferCopySnap(rb, change->data.snapshot,
2373  txn, command_id);
2374  }
2375 
2376  /*
2377  * Restored from disk, need to be careful not to double
2378  * free. We could introduce refcounting for that, but for
2379  * now this seems infrequent enough not to care.
2380  */
2381  else if (change->data.snapshot->copied)
2382  {
2383  snapshot_now =
2384  ReorderBufferCopySnap(rb, change->data.snapshot,
2385  txn, command_id);
2386  }
2387  else
2388  {
2389  snapshot_now = change->data.snapshot;
2390  }
2391 
2392  /* and continue with the new one */
2393  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2394  break;
2395 
2397  Assert(change->data.command_id != InvalidCommandId);
2398 
2399  if (command_id < change->data.command_id)
2400  {
2401  command_id = change->data.command_id;
2402 
2403  if (!snapshot_now->copied)
2404  {
2405  /* we don't use the global one anymore */
2406  snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2407  txn, command_id);
2408  }
2409 
2410  snapshot_now->curcid = command_id;
2411 
2412  TeardownHistoricSnapshot(false);
2413  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2414  }
2415 
2416  break;
2417 
2419  elog(ERROR, "tuplecid value in changequeue");
2420  break;
2421  }
2422  }
2423 
2424  /* speculative insertion record must be freed by now */
2425  Assert(!specinsert);
2426 
2427  /* clean up the iterator */
2428  ReorderBufferIterTXNFinish(rb, iterstate);
2429  iterstate = NULL;
2430 
2431  /*
2432  * Update total transaction count and total bytes processed by the
2433  * transaction and its subtransactions. Ensure to not count the
2434  * streamed transaction multiple times.
2435  *
2436  * Note that the statistics computation has to be done after
2437  * ReorderBufferIterTXNFinish as it releases the serialized change
2438  * which we have already accounted in ReorderBufferIterTXNNext.
2439  */
2440  if (!rbtxn_is_streamed(txn))
2441  rb->totalTxns++;
2442 
2443  rb->totalBytes += txn->total_size;
2444 
2445  /*
2446  * Done with current changes, send the last message for this set of
2447  * changes depending upon streaming mode.
2448  */
2449  if (streaming)
2450  {
2451  if (stream_started)
2452  {
2453  rb->stream_stop(rb, txn, prev_lsn);
2454  stream_started = false;
2455  }
2456  }
2457  else
2458  {
2459  /*
2460  * Call either PREPARE (for two-phase transactions) or COMMIT (for
2461  * regular ones).
2462  */
2463  if (rbtxn_prepared(txn))
2464  rb->prepare(rb, txn, commit_lsn);
2465  else
2466  rb->commit(rb, txn, commit_lsn);
2467  }
2468 
2469  /* this is just a sanity check against bad output plugin behaviour */
2471  elog(ERROR, "output plugin used XID %u",
2473 
2474  /*
2475  * Remember the command ID and snapshot for the next set of changes in
2476  * streaming mode.
2477  */
2478  if (streaming)
2479  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2480  else if (snapshot_now->copied)
2481  ReorderBufferFreeSnap(rb, snapshot_now);
2482 
2483  /* cleanup */
2484  TeardownHistoricSnapshot(false);
2485 
2486  /*
2487  * Aborting the current (sub-)transaction as a whole has the right
2488  * semantics. We want all locks acquired in here to be released, not
2489  * reassigned to the parent and we do not want any database access
2490  * have persistent effects.
2491  */
2493 
2494  /* make sure there's no cache pollution */
2496 
2497  if (using_subtxn)
2499 
2500  /*
2501  * We are here due to one of the four reasons: 1. Decoding an
2502  * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2503  * prepared txn that was (partially) streamed. 4. Decoding a committed
2504  * txn.
2505  *
2506  * For 1, we allow truncation of txn data by removing the changes
2507  * already streamed but still keeping other things like invalidations,
2508  * snapshot, and tuplecids. For 2 and 3, we indicate
2509  * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2510  * data as the entire transaction has been decoded except for commit.
2511  * For 4, as the entire txn has been decoded, we can fully clean up
2512  * the TXN reorder buffer.
2513  */
2514  if (streaming || rbtxn_prepared(txn))
2515  {
2517  /* Reset the CheckXidAlive */
2519  }
2520  else
2521  ReorderBufferCleanupTXN(rb, txn);
2522  }
2523  PG_CATCH();
2524  {
2525  MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2526  ErrorData *errdata = CopyErrorData();
2527 
2528  /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2529  if (iterstate)
2530  ReorderBufferIterTXNFinish(rb, iterstate);
2531 
2533 
2534  /*
2535  * Force cache invalidation to happen outside of a valid transaction
2536  * to prevent catalog access as we just caught an error.
2537  */
2539 
2540  /* make sure there's no cache pollution */
2542  txn->invalidations);
2543 
2544  if (using_subtxn)
2546 
2547  /*
2548  * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2549  * abort of the (sub)transaction we are streaming or preparing. We
2550  * need to do the cleanup and return gracefully on this error, see
2551  * SetupCheckXidLive.
2552  *
2553  * This error code can be thrown by one of the callbacks we call
2554  * during decoding so we need to ensure that we return gracefully only
2555  * when we are sending the data in streaming mode and the streaming is
2556  * not finished yet or when we are sending the data out on a PREPARE
2557  * during a two-phase commit.
2558  */
2559  if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2560  (stream_started || rbtxn_prepared(txn)))
2561  {
2562  /* curtxn must be set for streaming or prepared transactions */
2563  Assert(curtxn);
2564 
2565  /* Cleanup the temporary error state. */
2566  FlushErrorState();
2567  FreeErrorData(errdata);
2568  errdata = NULL;
2569  curtxn->concurrent_abort = true;
2570 
2571  /* Reset the TXN so that it is allowed to stream remaining data. */
2572  ReorderBufferResetTXN(rb, txn, snapshot_now,
2573  command_id, prev_lsn,
2574  specinsert);
2575  }
2576  else
2577  {
2578  ReorderBufferCleanupTXN(rb, txn);
2579  MemoryContextSwitchTo(ecxt);
2580  PG_RE_THROW();
2581  }
2582  }
2583  PG_END_TRY();
2584 }
2585 
2586 /*
2587  * Perform the replay of a transaction and its non-aborted subtransactions.
2588  *
2589  * Subtransactions previously have to be processed by
2590  * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2591  * transaction with ReorderBufferAssignChild.
2592  *
2593  * This interface is called once a prepare or toplevel commit is read for both
2594  * streamed as well as non-streamed transactions.
2595  */
2596 static void
2598  ReorderBuffer *rb, TransactionId xid,
2599  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2600  TimestampTz commit_time,
2601  RepOriginId origin_id, XLogRecPtr origin_lsn)
2602 {
2603  Snapshot snapshot_now;
2604  CommandId command_id = FirstCommandId;
2605 
2606  txn->final_lsn = commit_lsn;
2607  txn->end_lsn = end_lsn;
2608  txn->xact_time.commit_time = commit_time;
2609  txn->origin_id = origin_id;
2610  txn->origin_lsn = origin_lsn;
2611 
2612  /*
2613  * If the transaction was (partially) streamed, we need to commit it in a
2614  * 'streamed' way. That is, we first stream the remaining part of the
2615  * transaction, and then invoke stream_commit message.
2616  *
2617  * Called after everything (origin ID, LSN, ...) is stored in the
2618  * transaction to avoid passing that information directly.
2619  */
2620  if (rbtxn_is_streamed(txn))
2621  {
2622  ReorderBufferStreamCommit(rb, txn);
2623  return;
2624  }
2625 
2626  /*
2627  * If this transaction has no snapshot, it didn't make any changes to the
2628  * database, so there's nothing to decode. Note that
2629  * ReorderBufferCommitChild will have transferred any snapshots from
2630  * subtransactions if there were any.
2631  */
2632  if (txn->base_snapshot == NULL)
2633  {
2634  Assert(txn->ninvalidations == 0);
2635 
2636  /*
2637  * Removing this txn before a commit might result in the computation
2638  * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2639  */
2640  if (!rbtxn_prepared(txn))
2641  ReorderBufferCleanupTXN(rb, txn);
2642  return;
2643  }
2644 
2645  snapshot_now = txn->base_snapshot;
2646 
2647  /* Process and send the changes to output plugin. */
2648  ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2649  command_id, false);
2650 }
2651 
2652 /*
2653  * Commit a transaction.
2654  *
2655  * See comments for ReorderBufferReplay().
2656  */
2657 void
2659  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2660  TimestampTz commit_time,
2661  RepOriginId origin_id, XLogRecPtr origin_lsn)
2662 {
2663  ReorderBufferTXN *txn;
2664 
2665  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2666  false);
2667 
2668  /* unknown transaction, nothing to replay */
2669  if (txn == NULL)
2670  return;
2671 
2672  ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2673  origin_id, origin_lsn);
2674 }
2675 
2676 /*
2677  * Record the prepare information for a transaction.
2678  */
2679 bool
2681  XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2682  TimestampTz prepare_time,
2683  RepOriginId origin_id, XLogRecPtr origin_lsn)
2684 {
2685  ReorderBufferTXN *txn;
2686 
2687  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2688 
2689  /* unknown transaction, nothing to do */
2690  if (txn == NULL)
2691  return false;
2692 
2693  /*
2694  * Remember the prepare information to be later used by commit prepared in
2695  * case we skip doing prepare.
2696  */
2697  txn->final_lsn = prepare_lsn;
2698  txn->end_lsn = end_lsn;
2699  txn->xact_time.prepare_time = prepare_time;
2700  txn->origin_id = origin_id;
2701  txn->origin_lsn = origin_lsn;
2702 
2703  return true;
2704 }
2705 
2706 /* Remember that we have skipped prepare */
2707 void
2709 {
2710  ReorderBufferTXN *txn;
2711 
2712  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2713 
2714  /* unknown transaction, nothing to do */
2715  if (txn == NULL)
2716  return;
2717 
2719 }
2720 
2721 /*
2722  * Prepare a two-phase transaction.
2723  *
2724  * See comments for ReorderBufferReplay().
2725  */
2726 void
2728  char *gid)
2729 {
2730  ReorderBufferTXN *txn;
2731 
2732  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2733  false);
2734 
2735  /* unknown transaction, nothing to replay */
2736  if (txn == NULL)
2737  return;
2738 
2739  txn->txn_flags |= RBTXN_PREPARE;
2740  txn->gid = pstrdup(gid);
2741 
2742  /* The prepare info must have been updated in txn by now. */
2744 
2745  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2746  txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2747 
2748  /*
2749  * We send the prepare for the concurrently aborted xacts so that later
2750  * when rollback prepared is decoded and sent, the downstream should be
2751  * able to rollback such a xact. See comments atop DecodePrepare.
2752  *
2753  * Note, for the concurrent_abort + streaming case a stream_prepare was
2754  * already sent within the ReorderBufferReplay call above.
2755  */
2756  if (txn->concurrent_abort && !rbtxn_is_streamed(txn))
2757  rb->prepare(rb, txn, txn->final_lsn);
2758 }
2759 
2760 /*
2761  * This is used to handle COMMIT/ROLLBACK PREPARED.
2762  */
2763 void
2765  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2766  XLogRecPtr two_phase_at,
2767  TimestampTz commit_time, RepOriginId origin_id,
2768  XLogRecPtr origin_lsn, char *gid, bool is_commit)
2769 {
2770  ReorderBufferTXN *txn;
2771  XLogRecPtr prepare_end_lsn;
2772  TimestampTz prepare_time;
2773 
2774  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
2775 
2776  /* unknown transaction, nothing to do */
2777  if (txn == NULL)
2778  return;
2779 
2780  /*
2781  * By this time the txn has the prepare record information, remember it to
2782  * be later used for rollback.
2783  */
2784  prepare_end_lsn = txn->end_lsn;
2785  prepare_time = txn->xact_time.prepare_time;
2786 
2787  /* add the gid in the txn */
2788  txn->gid = pstrdup(gid);
2789 
2790  /*
2791  * It is possible that this transaction is not decoded at prepare time
2792  * either because by that time we didn't have a consistent snapshot, or
2793  * two_phase was not enabled, or it was decoded earlier but we have
2794  * restarted. We only need to send the prepare if it was not decoded
2795  * earlier. We don't need to decode the xact for aborts if it is not done
2796  * already.
2797  */
2798  if ((txn->final_lsn < two_phase_at) && is_commit)
2799  {
2800  txn->txn_flags |= RBTXN_PREPARE;
2801 
2802  /*
2803  * The prepare info must have been updated in txn even if we skip
2804  * prepare.
2805  */
2807 
2808  /*
2809  * By this time the txn has the prepare record information and it is
2810  * important to use that so that downstream gets the accurate
2811  * information. If instead, we have passed commit information here
2812  * then downstream can behave as it has already replayed commit
2813  * prepared after the restart.
2814  */
2815  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2816  txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2817  }
2818 
2819  txn->final_lsn = commit_lsn;
2820  txn->end_lsn = end_lsn;
2821  txn->xact_time.commit_time = commit_time;
2822  txn->origin_id = origin_id;
2823  txn->origin_lsn = origin_lsn;
2824 
2825  if (is_commit)
2826  rb->commit_prepared(rb, txn, commit_lsn);
2827  else
2828  rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2829 
2830  /* cleanup: make sure there's no cache pollution */
2832  txn->invalidations);
2833  ReorderBufferCleanupTXN(rb, txn);
2834 }
2835 
2836 /*
2837  * Abort a transaction that possibly has previous changes. Needs to be first
2838  * called for subtransactions and then for the toplevel xid.
2839  *
2840  * NB: Transactions handled here have to have actively aborted (i.e. have
2841  * produced an abort record). Implicitly aborted transactions are handled via
2842  * ReorderBufferAbortOld(); transactions we're just not interested in, but
2843  * which have committed are handled in ReorderBufferForget().
2844  *
2845  * This function purges this transaction and its contents from memory and
2846  * disk.
2847  */
2848 void
2850 {
2851  ReorderBufferTXN *txn;
2852 
2853  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2854  false);
2855 
2856  /* unknown, nothing to remove */
2857  if (txn == NULL)
2858  return;
2859 
2860  /* For streamed transactions notify the remote node about the abort. */
2861  if (rbtxn_is_streamed(txn))
2862  {
2863  rb->stream_abort(rb, txn, lsn);
2864 
2865  /*
2866  * We might have decoded changes for this transaction that could load
2867  * the cache as per the current transaction's view (consider DDL's
2868  * happened in this transaction). We don't want the decoding of future
2869  * transactions to use those cache entries so execute invalidations.
2870  */
2871  if (txn->ninvalidations > 0)
2873  txn->invalidations);
2874  }
2875 
2876  /* cosmetic... */
2877  txn->final_lsn = lsn;
2878 
2879  /* remove potential on-disk data, and deallocate */
2880  ReorderBufferCleanupTXN(rb, txn);
2881 }
2882 
2883 /*
2884  * Abort all transactions that aren't actually running anymore because the
2885  * server restarted.
2886  *
2887  * NB: These really have to be transactions that have aborted due to a server
2888  * crash/immediate restart, as we don't deal with invalidations here.
2889  */
2890 void
2892 {
2893  dlist_mutable_iter it;
2894 
2895  /*
2896  * Iterate through all (potential) toplevel TXNs and abort all that are
2897  * older than what possibly can be running. Once we've found the first
2898  * that is alive we stop, there might be some that acquired an xid earlier
2899  * but started writing later, but it's unlikely and they will be cleaned
2900  * up in a later call to this function.
2901  */
2903  {
2904  ReorderBufferTXN *txn;
2905 
2906  txn = dlist_container(ReorderBufferTXN, node, it.cur);
2907 
2908  if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2909  {
2910  elog(DEBUG2, "aborting old transaction %u", txn->xid);
2911 
2912  /* remove potential on-disk data, and deallocate this tx */
2913  ReorderBufferCleanupTXN(rb, txn);
2914  }
2915  else
2916  return;
2917  }
2918 }
2919 
2920 /*
2921  * Forget the contents of a transaction if we aren't interested in its
2922  * contents. Needs to be first called for subtransactions and then for the
2923  * toplevel xid.
2924  *
2925  * This is significantly different to ReorderBufferAbort() because
2926  * transactions that have committed need to be treated differently from aborted
2927  * ones since they may have modified the catalog.
2928  *
2929  * Note that this is only allowed to be called in the moment a transaction
2930  * commit has just been read, not earlier; otherwise later records referring
2931  * to this xid might re-create the transaction incompletely.
2932  */
2933 void
2935 {
2936  ReorderBufferTXN *txn;
2937 
2938  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2939  false);
2940 
2941  /* unknown, nothing to forget */
2942  if (txn == NULL)
2943  return;
2944 
2945  /* For streamed transactions notify the remote node about the abort. */
2946  if (rbtxn_is_streamed(txn))
2947  rb->stream_abort(rb, txn, lsn);
2948 
2949  /* cosmetic... */
2950  txn->final_lsn = lsn;
2951 
2952  /*
2953  * Process cache invalidation messages if there are any. Even if we're not
2954  * interested in the transaction's contents, it could have manipulated the
2955  * catalog and we need to update the caches according to that.
2956  */
2957  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2959  txn->invalidations);
2960  else
2961  Assert(txn->ninvalidations == 0);
2962 
2963  /* remove potential on-disk data, and deallocate */
2964  ReorderBufferCleanupTXN(rb, txn);
2965 }
2966 
2967 /*
2968  * Invalidate cache for those transactions that need to be skipped just in case
2969  * catalogs were manipulated as part of the transaction.
2970  *
2971  * Note that this is a special-purpose function for prepared transactions where
2972  * we don't want to clean up the TXN even when we decide to skip it. See
2973  * DecodePrepare.
2974  */
2975 void
2977 {
2978  ReorderBufferTXN *txn;
2979 
2980  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2981  false);
2982 
2983  /* unknown, nothing to do */
2984  if (txn == NULL)
2985  return;
2986 
2987  /*
2988  * Process cache invalidation messages if there are any. Even if we're not
2989  * interested in the transaction's contents, it could have manipulated the
2990  * catalog and we need to update the caches according to that.
2991  */
2992  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2994  txn->invalidations);
2995  else
2996  Assert(txn->ninvalidations == 0);
2997 }
2998 
2999 
3000 /*
3001  * Execute invalidations happening outside the context of a decoded
3002  * transaction. That currently happens either for xid-less commits
3003  * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3004  * transactions (via ReorderBufferForget()).
3005  */
3006 void
3008  SharedInvalidationMessage *invalidations)
3009 {
3010  bool use_subtxn = IsTransactionOrTransactionBlock();
3011  int i;
3012 
3013  if (use_subtxn)
3014  BeginInternalSubTransaction("replay");
3015 
3016  /*
3017  * Force invalidations to happen outside of a valid transaction - that way
3018  * entries will just be marked as invalid without accessing the catalog.
3019  * That's advantageous because we don't need to setup the full state
3020  * necessary for catalog access.
3021  */
3022  if (use_subtxn)
3024 
3025  for (i = 0; i < ninvalidations; i++)
3026  LocalExecuteInvalidationMessage(&invalidations[i]);
3027 
3028  if (use_subtxn)
3030 }
3031 
3032 /*
3033  * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3034  * least once for every xid in XLogRecord->xl_xid (other places in records
3035  * may, but do not have to be passed through here).
3036  *
3037  * Reorderbuffer keeps some datastructures about transactions in LSN order,
3038  * for efficiency. To do that it has to know about when transactions are seen
3039  * first in the WAL. As many types of records are not actually interesting for
3040  * logical decoding, they do not necessarily pass though here.
3041  */
3042 void
3044 {
3045  /* many records won't have an xid assigned, centralize check here */
3046  if (xid != InvalidTransactionId)
3047  ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3048 }
3049 
3050 /*
3051  * Add a new snapshot to this transaction that may only used after lsn 'lsn'
3052  * because the previous snapshot doesn't describe the catalog correctly for
3053  * following rows.
3054  */
3055 void
3057  XLogRecPtr lsn, Snapshot snap)
3058 {
3060 
3061  change->data.snapshot = snap;
3063 
3064  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3065 }
3066 
3067 /*
3068  * Set up the transaction's base snapshot.
3069  *
3070  * If we know that xid is a subtransaction, set the base snapshot on the
3071  * top-level transaction instead.
3072  */
3073 void
3075  XLogRecPtr lsn, Snapshot snap)
3076 {
3077  ReorderBufferTXN *txn;
3078  bool is_new;
3079 
3080  Assert(snap != NULL);
3081 
3082  /*
3083  * Fetch the transaction to operate on. If we know it's a subtransaction,
3084  * operate on its top-level transaction instead.
3085  */
3086  txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3087  if (rbtxn_is_known_subxact(txn))
3088  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3089  NULL, InvalidXLogRecPtr, false);
3090  Assert(txn->base_snapshot == NULL);
3091 
3092  txn->base_snapshot = snap;
3093  txn->base_snapshot_lsn = lsn;
3095 
3096  AssertTXNLsnOrder(rb);
3097 }
3098 
3099 /*
3100  * Access the catalog with this CommandId at this point in the changestream.
3101  *
3102  * May only be called for command ids > 1
3103  */
3104 void
3106  XLogRecPtr lsn, CommandId cid)
3107 {
3109 
3110  change->data.command_id = cid;
3112 
3113  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3114 }
3115 
3116 /*
3117  * Update memory counters to account for the new or removed change.
3118  *
3119  * We update two counters - in the reorder buffer, and in the transaction
3120  * containing the change. The reorder buffer counter allows us to quickly
3121  * decide if we reached the memory limit, the transaction counter allows
3122  * us to quickly pick the largest transaction for eviction.
3123  *
3124  * When streaming is enabled, we need to update the toplevel transaction
3125  * counters instead - we don't really care about subtransactions as we
3126  * can't stream them individually anyway, and we only pick toplevel
3127  * transactions for eviction. So only toplevel transactions matter.
3128  */
3129 static void
3131  ReorderBufferChange *change,
3132  bool addition, Size sz)
3133 {
3134  ReorderBufferTXN *txn;
3135  ReorderBufferTXN *toptxn;
3136 
3137  Assert(change->txn);
3138 
3139  /*
3140  * Ignore tuple CID changes, because those are not evicted when reaching
3141  * memory limit. So we just don't count them, because it might easily
3142  * trigger a pointless attempt to spill.
3143  */
3145  return;
3146 
3147  txn = change->txn;
3148 
3149  /*
3150  * Update the total size in top level as well. This is later used to
3151  * compute the decoding stats.
3152  */
3153  if (txn->toptxn != NULL)
3154  toptxn = txn->toptxn;
3155  else
3156  toptxn = txn;
3157 
3158  if (addition)
3159  {
3160  txn->size += sz;
3161  rb->size += sz;
3162 
3163  /* Update the total size in the top transaction. */
3164  toptxn->total_size += sz;
3165  }
3166  else
3167  {
3168  Assert((rb->size >= sz) && (txn->size >= sz));
3169  txn->size -= sz;
3170  rb->size -= sz;
3171 
3172  /* Update the total size in the top transaction. */
3173  toptxn->total_size -= sz;
3174  }
3175 
3176  Assert(txn->size <= rb->size);
3177 }
3178 
3179 /*
3180  * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3181  *
3182  * We do not include this change type in memory accounting, because we
3183  * keep CIDs in a separate list and do not evict them when reaching
3184  * the memory limit.
3185  */
3186 void
3188  XLogRecPtr lsn, RelFileLocator locator,
3189  ItemPointerData tid, CommandId cmin,
3190  CommandId cmax, CommandId combocid)
3191 {
3193  ReorderBufferTXN *txn;
3194 
3195  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3196 
3197  change->data.tuplecid.locator = locator;
3198  change->data.tuplecid.tid = tid;
3199  change->data.tuplecid.cmin = cmin;
3200  change->data.tuplecid.cmax = cmax;
3201  change->data.tuplecid.combocid = combocid;
3202  change->lsn = lsn;
3203  change->txn = txn;
3205 
3206  dlist_push_tail(&txn->tuplecids, &change->node);
3207  txn->ntuplecids++;
3208 }
3209 
3210 /*
3211  * Accumulate the invalidations for executing them later.
3212  *
3213  * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3214  * accumulates all the invalidation messages in the toplevel transaction, if
3215  * available, otherwise in the current transaction, as well as in the form of
3216  * change in reorder buffer. We require to record it in form of the change
3217  * so that we can execute only the required invalidations instead of executing
3218  * all the invalidations on each CommandId increment. We also need to
3219  * accumulate these in the txn buffer because in some cases where we skip
3220  * processing the transaction (see ReorderBufferForget), we need to execute
3221  * all the invalidations together.
3222  */
3223 void
3225  XLogRecPtr lsn, Size nmsgs,
3227 {
3228  ReorderBufferTXN *txn;
3229  MemoryContext oldcontext;
3230  ReorderBufferChange *change;
3231 
3232  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3233 
3234  oldcontext = MemoryContextSwitchTo(rb->context);
3235 
3236  /*
3237  * Collect all the invalidations under the top transaction, if available,
3238  * so that we can execute them all together. See comments atop this
3239  * function.
3240  */
3241  if (txn->toptxn)
3242  txn = txn->toptxn;
3243 
3244  Assert(nmsgs > 0);
3245 
3246  /* Accumulate invalidations. */
3247  if (txn->ninvalidations == 0)
3248  {
3249  txn->ninvalidations = nmsgs;
3251  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3252  memcpy(txn->invalidations, msgs,
3253  sizeof(SharedInvalidationMessage) * nmsgs);
3254  }
3255  else
3256  {
3259  (txn->ninvalidations + nmsgs));
3260 
3261  memcpy(txn->invalidations + txn->ninvalidations, msgs,
3262  nmsgs * sizeof(SharedInvalidationMessage));
3263  txn->ninvalidations += nmsgs;
3264  }
3265 
3266  change = ReorderBufferGetChange(rb);
3268  change->data.inval.ninvalidations = nmsgs;
3269  change->data.inval.invalidations = (SharedInvalidationMessage *)
3270  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3271  memcpy(change->data.inval.invalidations, msgs,
3272  sizeof(SharedInvalidationMessage) * nmsgs);
3273 
3274  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3275 
3276  MemoryContextSwitchTo(oldcontext);
3277 }
3278 
3279 /*
3280  * Apply all invalidations we know. Possibly we only need parts at this point
3281  * in the changestream but we don't know which those are.
3282  */
3283 static void
3285 {
3286  int i;
3287 
3288  for (i = 0; i < nmsgs; i++)
3290 }
3291 
3292 /*
3293  * Mark a transaction as containing catalog changes
3294  */
3295 void
3297  XLogRecPtr lsn)
3298 {
3299  ReorderBufferTXN *txn;
3300  ReorderBufferTXN *toptxn;
3301 
3302  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3303 
3304  if (!rbtxn_has_catalog_changes(txn))
3305  {
3308  }
3309 
3310  /*
3311  * Mark top-level transaction as having catalog changes too if one of its
3312  * children has so that the ReorderBufferBuildTupleCidHash can
3313  * conveniently check just top-level transaction and decide whether to
3314  * build the hash table or not.
3315  */
3316  toptxn = txn->toptxn;
3317  if (toptxn != NULL && !rbtxn_has_catalog_changes(toptxn))
3318  {
3321  }
3322 }
3323 
3324 /*
3325  * Return palloc'ed array of the transactions that have changed catalogs.
3326  * The returned array is sorted in xidComparator order.
3327  *
3328  * The caller must free the returned array when done with it.
3329  */
3330 TransactionId *
3332 {
3333  dlist_iter iter;
3334  TransactionId *xids = NULL;
3335  size_t xcnt = 0;
3336 
3337  /* Quick return if the list is empty */
3338  if (dclist_count(&rb->catchange_txns) == 0)
3339  return NULL;
3340 
3341  /* Initialize XID array */
3342  xids = (TransactionId *) palloc(sizeof(TransactionId) *
3344  dclist_foreach(iter, &rb->catchange_txns)
3345  {
3347  catchange_node,
3348  iter.cur);
3349 
3351 
3352  xids[xcnt++] = txn->xid;
3353  }
3354 
3355  qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3356 
3357  Assert(xcnt == dclist_count(&rb->catchange_txns));
3358  return xids;
3359 }
3360 
3361 /*
3362  * Query whether a transaction is already *known* to contain catalog
3363  * changes. This can be wrong until directly before the commit!
3364  */
3365 bool
3367 {
3368  ReorderBufferTXN *txn;
3369 
3370  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3371  false);
3372  if (txn == NULL)
3373  return false;
3374 
3375  return rbtxn_has_catalog_changes(txn);
3376 }
3377 
3378 /*
3379  * ReorderBufferXidHasBaseSnapshot
3380  * Have we already set the base snapshot for the given txn/subtxn?
3381  */
3382 bool
3384 {
3385  ReorderBufferTXN *txn;
3386 
3387  txn = ReorderBufferTXNByXid(rb, xid, false,
3388  NULL, InvalidXLogRecPtr, false);
3389 
3390  /* transaction isn't known yet, ergo no snapshot */
3391  if (txn == NULL)
3392  return false;
3393 
3394  /* a known subtxn? operate on top-level txn instead */
3395  if (rbtxn_is_known_subxact(txn))
3396  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3397  NULL, InvalidXLogRecPtr, false);
3398 
3399  return txn->base_snapshot != NULL;
3400 }
3401 
3402 
3403 /*
3404  * ---------------------------------------
3405  * Disk serialization support
3406  * ---------------------------------------
3407  */
3408 
3409 /*
3410  * Ensure the IO buffer is >= sz.
3411  */
3412 static void
3414 {
3415  if (!rb->outbufsize)
3416  {
3417  rb->outbuf = MemoryContextAlloc(rb->context, sz);
3418  rb->outbufsize = sz;
3419  }
3420  else if (rb->outbufsize < sz)
3421  {
3422  rb->outbuf = repalloc(rb->outbuf, sz);
3423  rb->outbufsize = sz;
3424  }
3425 }
3426 
3427 /*
3428  * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3429  *
3430  * XXX With many subtransactions this might be quite slow, because we'll have
3431  * to walk through all of them. There are some options how we could improve
3432  * that: (a) maintain some secondary structure with transactions sorted by
3433  * amount of changes, (b) not looking for the entirely largest transaction,
3434  * but e.g. for transaction using at least some fraction of the memory limit,
3435  * and (c) evicting multiple transactions at once, e.g. to free a given portion
3436  * of the memory limit (e.g. 50%).
3437  */
3438 static ReorderBufferTXN *
3440 {
3441  HASH_SEQ_STATUS hash_seq;
3443  ReorderBufferTXN *largest = NULL;
3444 
3445  hash_seq_init(&hash_seq, rb->by_txn);
3446  while ((ent = hash_seq_search(&hash_seq)) != NULL)
3447  {
3448  ReorderBufferTXN *txn = ent->txn;
3449 
3450  /* if the current transaction is larger, remember it */
3451  if ((!largest) || (txn->size > largest->size))
3452  largest = txn;
3453  }
3454 
3455  Assert(largest);
3456  Assert(largest->size > 0);
3457  Assert(largest->size <= rb->size);
3458 
3459  return largest;
3460 }
3461 
3462 /*
3463  * Find the largest toplevel transaction to evict (by streaming).
3464  *
3465  * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3466  * should give us the same transaction (because we don't update memory account
3467  * for subtransaction with streaming, so it's always 0). But we can simply
3468  * iterate over the limited number of toplevel transactions that have a base
3469  * snapshot. There is no use of selecting a transaction that doesn't have base
3470  * snapshot because we don't decode such transactions.
3471  *
3472  * Note that, we skip transactions that contains incomplete changes. There
3473  * is a scope of optimization here such that we can select the largest
3474  * transaction which has incomplete changes. But that will make the code and
3475  * design quite complex and that might not be worth the benefit. If we plan to
3476  * stream the transactions that contains incomplete changes then we need to
3477  * find a way to partially stream/truncate the transaction changes in-memory
3478  * and build a mechanism to partially truncate the spilled files.
3479  * Additionally, whenever we partially stream the transaction we need to
3480  * maintain the last streamed lsn and next time we need to restore from that
3481  * segment and the offset in WAL. As we stream the changes from the top
3482  * transaction and restore them subtransaction wise, we need to even remember
3483  * the subxact from where we streamed the last change.
3484  */
3485 static ReorderBufferTXN *
3487 {
3488  dlist_iter iter;
3489  Size largest_size = 0;
3490  ReorderBufferTXN *largest = NULL;
3491 
3492  /* Find the largest top-level transaction having a base snapshot. */
3494  {
3495  ReorderBufferTXN *txn;
3496 
3497  txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3498 
3499  /* must not be a subtxn */
3501  /* base_snapshot must be set */
3502  Assert(txn->base_snapshot != NULL);
3503 
3504  if ((largest == NULL || txn->total_size > largest_size) &&
3505  (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)))
3506  {
3507  largest = txn;
3508  largest_size = txn->total_size;
3509  }
3510  }
3511 
3512  return largest;
3513 }
3514 
3515 /*
3516  * Check whether the logical_decoding_work_mem limit was reached, and if yes
3517  * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3518  * disk until we reach under the memory limit.
3519  *
3520  * XXX At this point we select the transactions until we reach under the memory
3521  * limit, but we might also adapt a more elaborate eviction strategy - for example
3522  * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3523  * limit.
3524  */
3525 static void
3527 {
3528  ReorderBufferTXN *txn;
3529 
3530  /* bail out if we haven't exceeded the memory limit */
3531  if (rb->size < logical_decoding_work_mem * 1024L)
3532  return;
3533 
3534  /*
3535  * Loop until we reach under the memory limit. One might think that just
3536  * by evicting the largest (sub)transaction we will come under the memory
3537  * limit based on assumption that the selected transaction is at least as
3538  * large as the most recent change (which caused us to go over the memory
3539  * limit). However, that is not true because a user can reduce the
3540  * logical_decoding_work_mem to a smaller value before the most recent
3541  * change.
3542  */
3543  while (rb->size >= logical_decoding_work_mem * 1024L)
3544  {
3545  /*
3546  * Pick the largest transaction (or subtransaction) and evict it from
3547  * memory by streaming, if possible. Otherwise, spill to disk.
3548  */
3550  (txn = ReorderBufferLargestTopTXN(rb)) != NULL)
3551  {
3552  /* we know there has to be one, because the size is not zero */
3553  Assert(txn && !txn->toptxn);
3554  Assert(txn->total_size > 0);
3555  Assert(rb->size >= txn->total_size);
3556 
3557  ReorderBufferStreamTXN(rb, txn);
3558  }
3559  else
3560  {
3561  /*
3562  * Pick the largest transaction (or subtransaction) and evict it
3563  * from memory by serializing it to disk.
3564  */
3565  txn = ReorderBufferLargestTXN(rb);
3566 
3567  /* we know there has to be one, because the size is not zero */
3568  Assert(txn);
3569  Assert(txn->size > 0);
3570  Assert(rb->size >= txn->size);
3571 
3572  ReorderBufferSerializeTXN(rb, txn);
3573  }
3574 
3575  /*
3576  * After eviction, the transaction should have no entries in memory,
3577  * and should use 0 bytes for changes.
3578  */
3579  Assert(txn->size == 0);
3580  Assert(txn->nentries_mem == 0);
3581  }
3582 
3583  /* We must be under the memory limit now. */
3584  Assert(rb->size < logical_decoding_work_mem * 1024L);
3585 }
3586 
3587 /*
3588  * Spill data of a large transaction (and its subtransactions) to disk.
3589  */
3590 static void
3592 {
3593  dlist_iter subtxn_i;
3594  dlist_mutable_iter change_i;
3595  int fd = -1;
3596  XLogSegNo curOpenSegNo = 0;
3597  Size spilled = 0;
3598  Size size = txn->size;
3599 
3600  elog(DEBUG2, "spill %u changes in XID %u to disk",
3601  (uint32) txn->nentries_mem, txn->xid);
3602 
3603  /* do the same to all child TXs */
3604  dlist_foreach(subtxn_i, &txn->subtxns)
3605  {
3606  ReorderBufferTXN *subtxn;
3607 
3608  subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3609  ReorderBufferSerializeTXN(rb, subtxn);
3610  }
3611 
3612  /* serialize changestream */
3613  dlist_foreach_modify(change_i, &txn->changes)
3614  {
3615  ReorderBufferChange *change;
3616 
3617  change = dlist_container(ReorderBufferChange, node, change_i.cur);
3618 
3619  /*
3620  * store in segment in which it belongs by start lsn, don't split over
3621  * multiple segments tho
3622  */
3623  if (fd == -1 ||
3624  !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3625  {
3626  char path[MAXPGPATH];
3627 
3628  if (fd != -1)
3630 
3631  XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3632 
3633  /*
3634  * No need to care about TLIs here, only used during a single run,
3635  * so each LSN only maps to a specific WAL record.
3636  */
3638  curOpenSegNo);
3639 
3640  /* open segment, create it if necessary */
3641  fd = OpenTransientFile(path,
3642  O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3643 
3644  if (fd < 0)
3645  ereport(ERROR,
3647  errmsg("could not open file \"%s\": %m", path)));
3648  }
3649 
3650  ReorderBufferSerializeChange(rb, txn, fd, change);
3651  dlist_delete(&change->node);
3652  ReorderBufferReturnChange(rb, change, true);
3653 
3654  spilled++;
3655  }
3656 
3657  /* update the statistics iff we have spilled anything */
3658  if (spilled)
3659  {
3660  rb->spillCount += 1;
3661  rb->spillBytes += size;
3662 
3663  /* don't consider already serialized transactions */
3664  rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3665 
3666  /* update the decoding stats */
3668  }
3669 
3670  Assert(spilled == txn->nentries_mem);
3671  Assert(dlist_is_empty(&txn->changes));
3672  txn->nentries_mem = 0;
3674 
3675  if (fd != -1)
3677 }
3678 
3679 /*
3680  * Serialize individual change to disk.
3681  */
3682 static void
3684  int fd, ReorderBufferChange *change)
3685 {
3686  ReorderBufferDiskChange *ondisk;
3687  Size sz = sizeof(ReorderBufferDiskChange);
3688 
3690 
3691  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3692  memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3693 
3694  switch (change->action)
3695  {
3696  /* fall through these, they're all similar enough */
3701  {
3702  char *data;
3703  ReorderBufferTupleBuf *oldtup,
3704  *newtup;
3705  Size oldlen = 0;
3706  Size newlen = 0;
3707 
3708  oldtup = change->data.tp.oldtuple;
3709  newtup = change->data.tp.newtuple;
3710 
3711  if (oldtup)
3712  {
3713  sz += sizeof(HeapTupleData);
3714  oldlen = oldtup->tuple.t_len;
3715  sz += oldlen;
3716  }
3717 
3718  if (newtup)
3719  {
3720  sz += sizeof(HeapTupleData);
3721  newlen = newtup->tuple.t_len;
3722  sz += newlen;
3723  }
3724 
3725  /* make sure we have enough space */
3727 
3728  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3729  /* might have been reallocated above */
3730  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3731 
3732  if (oldlen)
3733  {
3734  memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
3735  data += sizeof(HeapTupleData);
3736 
3737  memcpy(data, oldtup->tuple.t_data, oldlen);
3738  data += oldlen;
3739  }
3740 
3741  if (newlen)
3742  {
3743  memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
3744  data += sizeof(HeapTupleData);
3745 
3746  memcpy(data, newtup->tuple.t_data, newlen);
3747  data += newlen;
3748  }
3749  break;
3750  }
3752  {
3753  char *data;
3754  Size prefix_size = strlen(change->data.msg.prefix) + 1;
3755 
3756  sz += prefix_size + change->data.msg.message_size +
3757  sizeof(Size) + sizeof(Size);
3759 
3760  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3761 
3762  /* might have been reallocated above */
3763  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3764 
3765  /* write the prefix including the size */
3766  memcpy(data, &prefix_size, sizeof(Size));
3767  data += sizeof(Size);
3768  memcpy(data, change->data.msg.prefix,
3769  prefix_size);
3770  data += prefix_size;
3771 
3772  /* write the message including the size */
3773  memcpy(data, &change->data.msg.message_size, sizeof(Size));
3774  data += sizeof(Size);
3775  memcpy(data, change->data.msg.message,
3776  change->data.msg.message_size);
3777  data += change->data.msg.message_size;
3778 
3779  break;
3780  }
3782  {
3783  char *data;
3784  Size inval_size = sizeof(SharedInvalidationMessage) *
3785  change->data.inval.ninvalidations;
3786 
3787  sz += inval_size;
3788 
3790  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3791 
3792  /* might have been reallocated above */
3793  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3794  memcpy(data, change->data.inval.invalidations, inval_size);
3795  data += inval_size;
3796 
3797  break;
3798  }
3800  {
3801  Snapshot snap;
3802  char *data;
3803 
3804  snap = change->data.snapshot;
3805 
3806  sz += sizeof(SnapshotData) +
3807  sizeof(TransactionId) * snap->xcnt +
3808  sizeof(TransactionId) * snap->subxcnt;
3809 
3810  /* make sure we have enough space */
3812  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3813  /* might have been reallocated above */
3814  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3815 
3816  memcpy(data, snap, sizeof(SnapshotData));
3817  data += sizeof(SnapshotData);
3818 
3819  if (snap->xcnt)
3820  {
3821  memcpy(data, snap->xip,
3822  sizeof(TransactionId) * snap->xcnt);
3823  data += sizeof(TransactionId) * snap->xcnt;
3824  }
3825 
3826  if (snap->subxcnt)
3827  {
3828  memcpy(data, snap->subxip,
3829  sizeof(TransactionId) * snap->subxcnt);
3830  data += sizeof(TransactionId) * snap->subxcnt;
3831  }
3832  break;
3833  }
3835  {
3836  Size size;
3837  char *data;
3838 
3839  /* account for the OIDs of truncated relations */
3840  size = sizeof(Oid) * change->data.truncate.nrelids;
3841  sz += size;
3842 
3843  /* make sure we have enough space */
3845 
3846  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3847  /* might have been reallocated above */
3848  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3849 
3850  memcpy(data, change->data.truncate.relids, size);
3851  data += size;
3852 
3853  break;
3854  }
3859  /* ReorderBufferChange contains everything important */
3860  break;
3861  }
3862 
3863  ondisk->size = sz;
3864 
3865  errno = 0;
3867  if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3868  {
3869  int save_errno = errno;
3870 
3872 
3873  /* if write didn't set errno, assume problem is no disk space */
3874  errno = save_errno ? save_errno : ENOSPC;
3875  ereport(ERROR,
3877  errmsg("could not write to data file for XID %u: %m",
3878  txn->xid)));
3879  }
3881 
3882  /*
3883  * Keep the transaction's final_lsn up to date with each change we send to
3884  * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3885  * only do this on commit and abort records, but that doesn't work if a
3886  * system crash leaves a transaction without its abort record).
3887  *
3888  * Make sure not to move it backwards.
3889  */
3890  if (txn->final_lsn < change->lsn)
3891  txn->final_lsn = change->lsn;
3892 
3893  Assert(ondisk->change.action == change->action);
3894 }
3895 
3896 /* Returns true, if the output plugin supports streaming, false, otherwise. */
3897 static inline bool
3899 {
3901 
3902  return ctx->streaming;
3903 }
3904 
3905 /* Returns true, if the streaming can be started now, false, otherwise. */
3906 static inline bool
3908 {
3910  SnapBuild *builder = ctx->snapshot_builder;
3911 
3912  /* We can't start streaming unless a consistent state is reached. */
3914  return false;
3915 
3916  /*
3917  * We can't start streaming immediately even if the streaming is enabled
3918  * because we previously decoded this transaction and now just are
3919  * restarting.
3920  */
3921  if (ReorderBufferCanStream(rb) &&
3922  !SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr))
3923  return true;
3924 
3925  return false;
3926 }
3927 
3928 /*
3929  * Send data of a large transaction (and its subtransactions) to the
3930  * output plugin, but using the stream API.
3931  */
3932 static void
3934 {
3935  Snapshot snapshot_now;
3936  CommandId command_id;
3937  Size stream_bytes;
3938  bool txn_is_streamed;
3939 
3940  /* We can never reach here for a subtransaction. */
3941  Assert(txn->toptxn == NULL);
3942 
3943  /*
3944  * We can't make any assumptions about base snapshot here, similar to what
3945  * ReorderBufferCommit() does. That relies on base_snapshot getting
3946  * transferred from subxact in ReorderBufferCommitChild(), but that was
3947  * not yet called as the transaction is in-progress.
3948  *
3949  * So just walk the subxacts and use the same logic here. But we only need
3950  * to do that once, when the transaction is streamed for the first time.
3951  * After that we need to reuse the snapshot from the previous run.
3952  *
3953  * Unlike DecodeCommit which adds xids of all the subtransactions in
3954  * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here
3955  * but we do add them to subxip array instead via ReorderBufferCopySnap.
3956  * This allows the catalog changes made in subtransactions decoded till
3957  * now to be visible.
3958  */
3959  if (txn->snapshot_now == NULL)
3960  {
3961  dlist_iter subxact_i;
3962 
3963  /* make sure this transaction is streamed for the first time */
3964  Assert(!rbtxn_is_streamed(txn));
3965 
3966  /* at the beginning we should have invalid command ID */
3968 
3969  dlist_foreach(subxact_i, &txn->subtxns)
3970  {
3971  ReorderBufferTXN *subtxn;
3972 
3973  subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
3974  ReorderBufferTransferSnapToParent(txn, subtxn);
3975  }
3976 
3977  /*
3978  * If this transaction has no snapshot, it didn't make any changes to
3979  * the database till now, so there's nothing to decode.
3980  */
3981  if (txn->base_snapshot == NULL)
3982  {
3983  Assert(txn->ninvalidations == 0);
3984  return;
3985  }
3986 
3987  command_id = FirstCommandId;
3988  snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
3989  txn, command_id);
3990  }
3991  else
3992  {
3993  /* the transaction must have been already streamed */
3994  Assert(rbtxn_is_streamed(txn));
3995 
3996  /*
3997  * Nah, we already have snapshot from the previous streaming run. We
3998  * assume new subxacts can't move the LSN backwards, and so can't beat
3999  * the LSN condition in the previous branch (so no need to walk
4000  * through subxacts again). In fact, we must not do that as we may be
4001  * using snapshot half-way through the subxact.
4002  */
4003  command_id = txn->command_id;
4004 
4005  /*
4006  * We can't use txn->snapshot_now directly because after the last
4007  * streaming run, we might have got some new sub-transactions. So we
4008  * need to add them to the snapshot.
4009  */
4010  snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4011  txn, command_id);
4012 
4013  /* Free the previously copied snapshot. */
4014  Assert(txn->snapshot_now->copied);
4016  txn->snapshot_now = NULL;
4017  }
4018 
4019  /*
4020  * Remember this information to be used later to update stats. We can't
4021  * update the stats here as an error while processing the changes would
4022  * lead to the accumulation of stats even though we haven't streamed all
4023  * the changes.
4024  */
4025  txn_is_streamed = rbtxn_is_streamed(txn);
4026  stream_bytes = txn->total_size;
4027 
4028  /* Process and send the changes to output plugin. */
4029  ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4030  command_id, true);
4031 
4032  rb->streamCount += 1;
4033  rb->streamBytes += stream_bytes;
4034 
4035  /* Don't consider already streamed transaction. */
4036  rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4037 
4038  /* update the decoding stats */
4040 
4041  Assert(dlist_is_empty(&txn->changes));
4042  Assert(txn->nentries == 0);
4043  Assert(txn->nentries_mem == 0);
4044 }
4045 
4046 /*
4047  * Size of a change in memory.
4048  */
4049 static Size
4051 {
4052  Size sz = sizeof(ReorderBufferChange);
4053 
4054  switch (change->action)
4055  {
4056  /* fall through these, they're all similar enough */
4061  {
4062  ReorderBufferTupleBuf *oldtup,
4063  *newtup;
4064  Size oldlen = 0;
4065  Size newlen = 0;
4066 
4067  oldtup = change->data.tp.oldtuple;
4068  newtup = change->data.tp.newtuple;
4069 
4070  if (oldtup)
4071  {
4072  sz += sizeof(HeapTupleData);
4073  oldlen = oldtup->tuple.t_len;
4074  sz += oldlen;
4075  }
4076 
4077  if (newtup)
4078  {
4079  sz += sizeof(HeapTupleData);
4080  newlen = newtup->tuple.t_len;
4081  sz += newlen;
4082  }
4083 
4084  break;
4085  }
4087  {
4088  Size prefix_size = strlen(change->data.msg.prefix) + 1;
4089 
4090  sz += prefix_size + change->data.msg.message_size +
4091  sizeof(Size) + sizeof(Size);
4092 
4093  break;
4094  }
4096  {
4097  sz += sizeof(SharedInvalidationMessage) *
4098  change->data.inval.ninvalidations;
4099  break;
4100  }
4102  {
4103  Snapshot snap;
4104 
4105  snap = change->data.snapshot;
4106 
4107  sz += sizeof(SnapshotData) +
4108  sizeof(TransactionId) * snap->xcnt +
4109  sizeof(TransactionId) * snap->subxcnt;
4110 
4111  break;
4112  }
4114  {
4115  sz += sizeof(Oid) * change->data.truncate.nrelids;
4116 
4117  break;
4118  }
4123  /* ReorderBufferChange contains everything important */
4124  break;
4125  }
4126 
4127  return sz;
4128 }
4129 
4130 
4131 /*
4132  * Restore a number of changes spilled to disk back into memory.
4133  */
4134 static Size
4136  TXNEntryFile *file, XLogSegNo *segno)
4137 {
4138  Size restored = 0;
4139  XLogSegNo last_segno;
4140  dlist_mutable_iter cleanup_iter;
4141  File *fd = &file->vfd;
4142 
4145 
4146  /* free current entries, so we have memory for more */
4147  dlist_foreach_modify(cleanup_iter, &txn->changes)
4148  {
4150  dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4151 
4152  dlist_delete(&cleanup->node);
4154  }
4155  txn->nentries_mem = 0;
4156  Assert(dlist_is_empty(&txn->changes));
4157 
4158  XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4159 
4160  while (restored < max_changes_in_memory && *segno <= last_segno)
4161  {
4162  int readBytes;
4163  ReorderBufferDiskChange *ondisk;
4164 
4166 
4167  if (*fd == -1)
4168  {
4169  char path[MAXPGPATH];
4170 
4171  /* first time in */
4172  if (*segno == 0)
4173  XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4174 
4175  Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4176 
4177  /*
4178  * No need to care about TLIs here, only used during a single run,
4179  * so each LSN only maps to a specific WAL record.
4180  */
4182  *segno);
4183 
4184  *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4185 
4186  /* No harm in resetting the offset even in case of failure */
4187  file->curOffset = 0;
4188 
4189  if (*fd < 0 && errno == ENOENT)
4190  {
4191  *fd = -1;
4192  (*segno)++;
4193  continue;
4194  }
4195  else if (*fd < 0)
4196  ereport(ERROR,
4198  errmsg("could not open file \"%s\": %m",
4199  path)));
4200  }
4201 
4202  /*
4203  * Read the statically sized part of a change which has information
4204  * about the total size. If we couldn't read a record, we're at the
4205  * end of this file.
4206  */
4208  readBytes = FileRead(file->vfd, rb->outbuf,
4209  sizeof(ReorderBufferDiskChange),
4211 
4212  /* eof */
4213  if (readBytes == 0)
4214  {
4215  FileClose(*fd);
4216  *fd = -1;
4217  (*segno)++;
4218  continue;
4219  }
4220  else if (readBytes < 0)
4221  ereport(ERROR,
4223  errmsg("could not read from reorderbuffer spill file: %m")));
4224  else if (readBytes != sizeof(ReorderBufferDiskChange))
4225  ereport(ERROR,
4227  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4228  readBytes,
4229  (uint32) sizeof(ReorderBufferDiskChange))));
4230 
4231  file->curOffset += readBytes;
4232 
4233  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4234 
4236  sizeof(ReorderBufferDiskChange) + ondisk->size);
4237  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4238 
4239  readBytes = FileRead(file->vfd,
4240  rb->outbuf + sizeof(ReorderBufferDiskChange),
4241  ondisk->size - sizeof(ReorderBufferDiskChange),
4242  file->curOffset,
4244 
4245  if (readBytes < 0)
4246  ereport(ERROR,
4248  errmsg("could not read from reorderbuffer spill file: %m")));
4249  else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4250  ereport(ERROR,
4252  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4253  readBytes,
4254  (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4255 
4256  file->curOffset += readBytes;
4257 
4258  /*
4259  * ok, read a full change from disk, now restore it into proper
4260  * in-memory format
4261  */
4262  ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4263  restored++;
4264  }
4265 
4266  return restored;
4267 }
4268 
4269 /*
4270  * Convert change from its on-disk format to in-memory format and queue it onto
4271  * the TXN's ->changes list.
4272  *
4273  * Note: although "data" is declared char*, at entry it points to a
4274  * maxalign'd buffer, making it safe in most of this function to assume
4275  * that the pointed-to data is suitably aligned for direct access.
4276  */
4277 static void
4279  char *data)
4280 {
4281  ReorderBufferDiskChange *ondisk;
4282  ReorderBufferChange *change;
4283 
4284  ondisk = (ReorderBufferDiskChange *) data;
4285 
4286  change = ReorderBufferGetChange(rb);
4287 
4288  /* copy static part */
4289  memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4290 
4291  data += sizeof(ReorderBufferDiskChange);
4292 
4293  /* restore individual stuff */
4294  switch (change->action)
4295  {
4296  /* fall through these, they're all similar enough */
4301  if (change->data.tp.oldtuple)
4302  {
4303  uint32 tuplelen = ((HeapTuple) data)->t_len;
4304 
4305  change->data.tp.oldtuple =
4307 
4308  /* restore ->tuple */
4309  memcpy(&change->data.tp.oldtuple->tuple, data,
4310  sizeof(HeapTupleData));
4311  data += sizeof(HeapTupleData);
4312 
4313  /* reset t_data pointer into the new tuplebuf */
4314  change->data.tp.oldtuple->tuple.t_data =
4315  ReorderBufferTupleBufData(change->data.tp.oldtuple);
4316 
4317  /* restore tuple data itself */
4318  memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
4319  data += tuplelen;
4320  }
4321 
4322  if (change->data.tp.newtuple)
4323  {
4324  /* here, data might not be suitably aligned! */
4325  uint32 tuplelen;
4326 
4327  memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4328  sizeof(uint32));
4329 
4330  change->data.tp.newtuple =
4332 
4333  /* restore ->tuple */
4334  memcpy(&change->data.tp.newtuple->tuple, data,
4335  sizeof(HeapTupleData));
4336  data += sizeof(HeapTupleData);
4337 
4338  /* reset t_data pointer into the new tuplebuf */
4339  change->data.tp.newtuple->tuple.t_data =
4340  ReorderBufferTupleBufData(change->data.tp.newtuple);
4341 
4342  /* restore tuple data itself */
4343  memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
4344  data += tuplelen;
4345  }
4346 
4347  break;
4349  {
4350  Size prefix_size;
4351 
4352  /* read prefix */
4353  memcpy(&prefix_size, data, sizeof(Size));
4354  data += sizeof(Size);
4355  change->data.msg.prefix = MemoryContextAlloc(rb->context,
4356  prefix_size);
4357  memcpy(change->data.msg.prefix, data, prefix_size);
4358  Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4359  data += prefix_size;
4360 
4361  /* read the message */
4362  memcpy(&change->data.msg.message_size, data, sizeof(Size));
4363  data += sizeof(Size);
4364  change->data.msg.message = MemoryContextAlloc(rb->context,
4365  change->data.msg.message_size);
4366  memcpy(change->data.msg.message, data,
4367  change->data.msg.message_size);
4368  data += change->data.msg.message_size;
4369 
4370  break;
4371  }
4373  {
4374  Size inval_size = sizeof(SharedInvalidationMessage) *
4375  change->data.inval.ninvalidations;
4376 
4377  change->data.inval.invalidations =
4378  MemoryContextAlloc(rb->context, inval_size);
4379 
4380  /* read the message */
4381  memcpy(change->data.inval.invalidations, data, inval_size);
4382 
4383  break;
4384  }
4386  {
4387  Snapshot oldsnap;
4388  Snapshot newsnap;
4389  Size size;
4390 
4391  oldsnap = (Snapshot) data;
4392 
4393  size = sizeof(SnapshotData) +
4394  sizeof(TransactionId) * oldsnap->xcnt +
4395  sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4396 
4397  change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4398 
4399  newsnap = change->data.snapshot;
4400 
4401  memcpy(newsnap, data, size);
4402  newsnap->xip = (TransactionId *)
4403  (((char *) newsnap) + sizeof(SnapshotData));
4404  newsnap->subxip = newsnap->xip + newsnap->xcnt;
4405  newsnap->copied = true;
4406  break;
4407  }
4408  /* the base struct contains all the data, easy peasy */
4410  {
4411  Oid *relids;
4412 
4413  relids = ReorderBufferGetRelids(rb,
4414  change->data.truncate.nrelids);
4415  memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4416  change->data.truncate.relids = relids;
4417 
4418  break;
4419  }
4424  break;
4425  }
4426 
4427  dlist_push_tail(&txn->changes, &change->node);
4428  txn->nentries_mem++;
4429 
4430  /*
4431  * Update memory accounting for the restored change. We need to do this
4432  * although we don't check the memory limit when restoring the changes in
4433  * this branch (we only do that when initially queueing the changes after
4434  * decoding), because we will release the changes later, and that will
4435  * update the accounting too (subtracting the size from the counters). And
4436  * we don't want to underflow there.
4437  */
4438  ReorderBufferChangeMemoryUpdate(rb, change, true,
4439  ReorderBufferChangeSize(change));
4440 }
4441 
4442 /*
4443  * Remove all on-disk stored for the passed in transaction.
4444  */
4445 static void
4447 {
4448  XLogSegNo first;
4449  XLogSegNo cur;
4450  XLogSegNo last;
4451 
4454 
4455  XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4456  XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4457 
4458  /* iterate over all possible filenames, and delete them */
4459  for (cur = first; cur <= last; cur++)
4460  {
4461  char path[MAXPGPATH];
4462 
4464  if (unlink(path) != 0 && errno != ENOENT)
4465  ereport(ERROR,
4467  errmsg("could not remove file \"%s\": %m", path)));
4468  }
4469 }
4470 
4471 /*
4472  * Remove any leftover serialized reorder buffers from a slot directory after a
4473  * prior crash or decoding session exit.
4474  */
4475 static void
4477 {
4478  DIR *spill_dir;
4479  struct dirent *spill_de;
4480  struct stat statbuf;
4481  char path[MAXPGPATH * 2 + 12];
4482 
4483  sprintf(path, "pg_replslot/%s", slotname);
4484 
4485  /* we're only handling directories here, skip if it's not ours */
4486  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4487  return;
4488 
4489  spill_dir = AllocateDir(path);
4490  while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4491  {
4492  /* only look at names that can be ours */
4493  if (strncmp(spill_de->d_name, "xid", 3) == 0)
4494  {
4495  snprintf(path, sizeof(path),
4496  "pg_replslot/%s/%s", slotname,
4497  spill_de->d_name);
4498 
4499  if (unlink(path) != 0)
4500  ereport(ERROR,
4502  errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4503  path, slotname)));
4504  }
4505  }
4506  FreeDir(spill_dir);
4507 }
4508 
4509 /*
4510  * Given a replication slot, transaction ID and segment number, fill in the
4511  * corresponding spill file into 'path', which is a caller-owned buffer of size
4512  * at least MAXPGPATH.
4513  */
4514 static void
4516  XLogSegNo segno)
4517 {
4518  XLogRecPtr recptr;
4519 
4520  XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4521 
4522  snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4524  xid, LSN_FORMAT_ARGS(recptr));
4525 }
4526 
4527 /*
4528  * Delete all data spilled to disk after we've restarted/crashed. It will be
4529  * recreated when the respective slots are reused.
4530  */
4531 void
4533 {
4534  DIR *logical_dir;
4535  struct dirent *logical_de;
4536 
4537  logical_dir = AllocateDir("pg_replslot");
4538  while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4539  {
4540  if (strcmp(logical_de->d_name, ".") == 0 ||
4541  strcmp(logical_de->d_name, "..") == 0)
4542  continue;
4543 
4544  /* if it cannot be a slot, skip the directory */
4545  if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4546  continue;
4547 
4548  /*
4549  * ok, has to be a surviving logical slot, iterate and delete
4550  * everything starting with xid-*
4551  */
4553  }
4554  FreeDir(logical_dir);
4555 }
4556 
4557 /* ---------------------------------------
4558  * toast reassembly support
4559  * ---------------------------------------
4560  */
4561 
4562 /*
4563  * Initialize per tuple toast reconstruction support.
4564  */
4565 static void
4567 {
4568  HASHCTL hash_ctl;
4569 
4570  Assert(txn->toast_hash == NULL);
4571 
4572  hash_ctl.keysize = sizeof(Oid);
4573  hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4574  hash_ctl.hcxt = rb->context;
4575  txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4577 }
4578 
4579 /*
4580  * Per toast-chunk handling for toast reconstruction
4581  *
4582  * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4583  * toasted Datum comes along.
4584  */
4585 static void
4587  Relation relation, ReorderBufferChange *change)
4588 {
4589  ReorderBufferToastEnt *ent;
4590  ReorderBufferTupleBuf *newtup;
4591  bool found;
4592  int32 chunksize;
4593  bool isnull;
4594  Pointer chunk;
4595  TupleDesc desc = RelationGetDescr(relation);
4596  Oid chunk_id;
4597  int32 chunk_seq;
4598 
4599  if (txn->toast_hash == NULL)
4600  ReorderBufferToastInitHash(rb, txn);
4601 
4602  Assert(IsToastRelation(relation));
4603 
4604  newtup = change->data.tp.newtuple;
4605  chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull));
4606  Assert(!isnull);
4607  chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull));
4608  Assert(!isnull);
4609 
4610  ent = (ReorderBufferToastEnt *)
4611  hash_search(txn->toast_hash,
4612  (void *) &chunk_id,
4613  HASH_ENTER,
4614  &found);
4615 
4616  if (!found)
4617  {
4618  Assert(ent->chunk_id == chunk_id);
4619  ent->num_chunks = 0;
4620  ent->last_chunk_seq = 0;
4621  ent->size = 0;
4622  ent->reconstructed = NULL;
4623  dlist_init(&ent->chunks);
4624 
4625  if (chunk_seq != 0)
4626  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4627  chunk_seq, chunk_id);
4628  }
4629  else if (found && chunk_seq != ent->last_chunk_seq + 1)
4630  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4631  chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4632 
4633  chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull));
4634  Assert(!isnull);
4635 
4636  /* calculate size so we can allocate the right size at once later */
4637  if (!VARATT_IS_EXTENDED(chunk))
4638  chunksize = VARSIZE(chunk) - VARHDRSZ;
4639  else if (VARATT_IS_SHORT(chunk))
4640  /* could happen due to heap_form_tuple doing its thing */
4641  chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4642  else
4643  elog(ERROR, "unexpected type of toast chunk");
4644 
4645  ent->size += chunksize;
4646  ent->last_chunk_seq = chunk_seq;
4647  ent->num_chunks++;
4648  dlist_push_tail(&ent->chunks, &change->node);
4649 }
4650 
4651 /*
4652  * Rejigger change->newtuple to point to in-memory toast tuples instead to
4653  * on-disk toast tuples that may not longer exist (think DROP TABLE or VACUUM).
4654  *
4655  * We cannot replace unchanged toast tuples though, so those will still point
4656  * to on-disk toast data.
4657  *
4658  * While updating the existing change with detoasted tuple data, we need to
4659  * update the memory accounting info, because the change size will differ.
4660  * Otherwise the accounting may get out of sync, triggering serialization
4661  * at unexpected times.
4662  *
4663  * We simply subtract size of the change before rejiggering the tuple, and
4664  * then adding the new size. This makes it look like the change was removed
4665  * and then added back, except it only tweaks the accounting info.
4666  *
4667  * In particular it can't trigger serialization, which would be pointless
4668  * anyway as it happens during commit processing right before handing
4669  * the change to the output plugin.
4670  */
4671 static void
4673  Relation relation, ReorderBufferChange *change)
4674 {
4675  TupleDesc desc;
4676  int natt;
4677  Datum *attrs;
4678  bool *isnull;
4679  bool *free;
4680  HeapTuple tmphtup;
4681  Relation toast_rel;
4682  TupleDesc toast_desc;
4683  MemoryContext oldcontext;
4684  ReorderBufferTupleBuf *newtup;
4685  Size old_size;
4686 
4687  /* no toast tuples changed */
4688  if (txn->toast_hash == NULL)
4689  return;
4690 
4691  /*
4692  * We're going to modify the size of the change. So, to make sure the
4693  * accounting is correct we record the current change size and then after
4694  * re-computing the change we'll subtract the recorded size and then
4695  * re-add the new change size at the end. We don't immediately subtract
4696  * the old size because if there is any error before we add the new size,
4697  * we will release the changes and that will update the accounting info
4698  * (subtracting the size from the counters). And we don't want to
4699  * underflow there.
4700  */
4701  old_size = ReorderBufferChangeSize(change);
4702 
4703  oldcontext = MemoryContextSwitchTo(rb->context);
4704 
4705  /* we should only have toast tuples in an INSERT or UPDATE */
4706  Assert(change->data.tp.newtuple);
4707 
4708  desc = RelationGetDescr(relation);
4709 
4710  toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4711  if (!RelationIsValid(toast_rel))
4712  elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
4713  relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
4714 
4715  toast_desc = RelationGetDescr(toast_rel);
4716 
4717  /* should we allocate from stack instead? */
4718  attrs = palloc0(sizeof(Datum) * desc->natts);
4719  isnull = palloc0(sizeof(bool) * desc->natts);
4720  free = palloc0(sizeof(bool) * desc->natts);
4721 
4722  newtup = change->data.tp.newtuple;
4723 
4724  heap_deform_tuple(&newtup->tuple, desc, attrs, isnull);
4725 
4726  for (natt = 0; natt < desc->natts; natt++)
4727  {
4728  Form_pg_attribute attr = TupleDescAttr(desc, natt);
4729  ReorderBufferToastEnt *ent;
4730  struct varlena *varlena;
4731 
4732  /* va_rawsize is the size of the original datum -- including header */
4733  struct varatt_external toast_pointer;
4734  struct varatt_indirect redirect_pointer;
4735  struct varlena *new_datum = NULL;
4736  struct varlena *reconstructed;
4737  dlist_iter it;
4738  Size data_done = 0;
4739 
4740  /* system columns aren't toasted */
4741  if (attr->attnum < 0)
4742  continue;
4743 
4744  if (attr->attisdropped)
4745  continue;
4746 
4747  /* not a varlena datatype */
4748  if (attr->attlen != -1)
4749  continue;
4750 
4751  /* no data */
4752  if (isnull[natt])
4753  continue;
4754 
4755  /* ok, we know we have a toast datum */
4756  varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4757 
4758  /* no need to do anything if the tuple isn't external */
4760  continue;
4761 
4762  VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4763 
4764  /*
4765  * Check whether the toast tuple changed, replace if so.
4766  */
4767  ent = (ReorderBufferToastEnt *)
4768  hash_search(txn->toast_hash,
4769  (void *) &toast_pointer.va_valueid,
4770  HASH_FIND,
4771  NULL);
4772  if (ent == NULL)
4773  continue;
4774 
4775  new_datum =
4777 
4778  free[natt] = true;
4779 
4780  reconstructed = palloc0(toast_pointer.va_rawsize);
4781 
4782  ent->reconstructed = reconstructed;
4783 
4784  /* stitch toast tuple back together from its parts */
4785  dlist_foreach(it, &ent->chunks)
4786  {
4787  bool isnull;
4788  ReorderBufferChange *cchange;
4789  ReorderBufferTupleBuf *ctup;
4790  Pointer chunk;
4791 
4792  cchange = dlist_container(ReorderBufferChange, node, it.cur);
4793  ctup = cchange->data.tp.newtuple;
4794  chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull));
4795 
4796  Assert(!isnull);
4797  Assert(!VARATT_IS_EXTERNAL(chunk));
4798  Assert(!VARATT_IS_SHORT(chunk));
4799 
4800  memcpy(VARDATA(reconstructed) + data_done,
4801  VARDATA(chunk),
4802  VARSIZE(chunk) - VARHDRSZ);
4803  data_done += VARSIZE(chunk) - VARHDRSZ;
4804  }
4805  Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
4806 
4807  /* make sure its marked as compressed or not */
4808  if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4809  SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4810  else
4811  SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4812 
4813  memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4814  redirect_pointer.pointer = reconstructed;
4815 
4817  memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4818  sizeof(redirect_pointer));
4819 
4820  attrs[natt] = PointerGetDatum(new_datum);
4821  }
4822 
4823  /*
4824  * Build tuple in separate memory & copy tuple back into the tuplebuf
4825  * passed to the output plugin. We can't directly heap_fill_tuple() into
4826  * the tuplebuf because attrs[] will point back into the current content.
4827  */
4828  tmphtup = heap_form_tuple(desc, attrs, isnull);
4829  Assert(newtup->tuple.t_len <= MaxHeapTupleSize);
4830  Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data);
4831 
4832  memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len);
4833  newtup->tuple.t_len = tmphtup->t_len;
4834 
4835  /*
4836  * free resources we won't further need, more persistent stuff will be
4837  * free'd in ReorderBufferToastReset().
4838  */
4839  RelationClose(toast_rel);
4840  pfree(tmphtup);
4841  for (natt = 0; natt < desc->natts; natt++)
4842  {
4843  if (free[natt])
4844  pfree(DatumGetPointer(attrs[natt]));
4845  }
4846  pfree(attrs);
4847  pfree(free);
4848  pfree(isnull);
4849 
4850  MemoryContextSwitchTo(oldcontext);
4851 
4852  /* subtract the old change size */
4853  ReorderBufferChangeMemoryUpdate(rb, change, false, old_size);
4854  /* now add the change back, with the correct size */
4855  ReorderBufferChangeMemoryUpdate(rb, change, true,
4856  ReorderBufferChangeSize(change));
4857 }
4858 
4859 /*
4860  * Free all resources allocated for toast reconstruction.
4861  */
4862 static void
4864 {
4865  HASH_SEQ_STATUS hstat;
4866  ReorderBufferToastEnt *ent;
4867 
4868  if (txn->toast_hash == NULL)
4869  return;
4870 
4871  /* sequentially walk over the hash and free everything */
4872  hash_seq_init(&hstat, txn->toast_hash);
4873  while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4874  {
4875  dlist_mutable_iter it;
4876 
4877  if (ent->reconstructed != NULL)
4878  pfree(ent->reconstructed);
4879 
4880  dlist_foreach_modify(it, &ent->chunks)
4881  {
4882  ReorderBufferChange *change =
4884 
4885  dlist_delete(&change->node);
4886  ReorderBufferReturnChange(rb, change, true);
4887  }
4888  }
4889 
4890  hash_destroy(txn->toast_hash);
4891  txn->toast_hash = NULL;
4892 }
4893 
4894 
4895 /* ---------------------------------------
4896  * Visibility support for logical decoding
4897  *
4898  *
4899  * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4900  * always rely on stored cmin/cmax values because of two scenarios:
4901  *
4902  * * A tuple got changed multiple times during a single transaction and thus
4903  * has got a combo CID. Combo CIDs are only valid for the duration of a
4904  * single transaction.
4905  * * A tuple with a cmin but no cmax (and thus no combo CID) got
4906  * deleted/updated in another transaction than the one which created it
4907  * which we are looking at right now. As only one of cmin, cmax or combo CID
4908  * is actually stored in the heap we don't have access to the value we
4909  * need anymore.
4910  *
4911  * To resolve those problems we have a per-transaction hash of (cmin,
4912  * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
4913  * (cmin, cmax) values. That also takes care of combo CIDs by simply
4914  * not caring about them at all. As we have the real cmin/cmax values
4915  * combo CIDs aren't interesting.
4916  *
4917  * As we only care about catalog tuples here the overhead of this
4918  * hashtable should be acceptable.
4919  *
4920  * Heap rewrites complicate this a bit, check rewriteheap.c for
4921  * details.
4922  * -------------------------------------------------------------------------
4923  */
4924 
4925 /* struct for sorting mapping files by LSN efficiently */
4926 typedef struct RewriteMappingFile
4927 {
4931 
4932 #ifdef NOT_USED
4933 static void
4934 DisplayMapping(HTAB *tuplecid_data)
4935 {
4936  HASH_SEQ_STATUS hstat;
4938 
4939  hash_seq_init(&hstat, tuplecid_data);
4940  while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
4941  {
4942  elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
4943  ent->key.rlocator.dbOid,
4944  ent->key.rlocator.spcOid,
4945  ent->key.rlocator.relNumber,
4948  ent->cmin,
4949  ent->cmax
4950  );
4951  }
4952 }
4953 #endif
4954 
4955 /*
4956  * Apply a single mapping file to tuplecid_data.
4957  *
4958  * The mapping file has to have been verified to be a) committed b) for our
4959  * transaction c) applied in LSN order.
4960  */
4961 static void
4962 ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
4963 {
4964  char path[MAXPGPATH];
4965  int fd;
4966  int readBytes;
4968 
4969  sprintf(path, "pg_logical/mappings/%s", fname);
4970  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
4971  if (fd < 0)
4972  ereport(ERROR,
4974  errmsg("could not open file \"%s\": %m", path)));
4975 
4976  while (true)
4977  {
4980  ReorderBufferTupleCidEnt *new_ent;
4981  bool found;
4982 
4983  /* be careful about padding */
4984  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
4985 
4986  /* read all mappings till the end of the file */
4988  readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
4990 
4991  if (readBytes < 0)
4992  ereport(ERROR,
4994  errmsg("could not read file \"%s\": %m",
4995  path)));
4996  else if (readBytes == 0) /* EOF */
4997  break;
4998  else if (readBytes != sizeof(LogicalRewriteMappingData))
4999  ereport(ERROR,
5001  errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5002  path, readBytes,
5003  (int32) sizeof(LogicalRewriteMappingData))));
5004 
5005  key.rlocator = map.old_locator;
5006  ItemPointerCopy(&map.old_tid,
5007  &key.tid);
5008 
5009 
5010  ent = (ReorderBufferTupleCidEnt *)
5012  (void *) &key,
5013  HASH_FIND,
5014  NULL);
5015 
5016  /* no existing mapping, no need to update */
5017  if (!ent)
5018  continue;
5019 
5020  key.rlocator = map.new_locator;
5021  ItemPointerCopy(&map.new_tid,
5022  &key.tid);
5023 
5024  new_ent = (ReorderBufferTupleCidEnt *)
5026  (void *) &key,
5027  HASH_ENTER,
5028  &found);
5029 
5030  if (found)
5031  {
5032  /*
5033  * Make sure the existing mapping makes sense. We sometime update
5034  * old records that did not yet have a cmax (e.g. pg_class' own
5035  * entry while rewriting it) during rewrites, so allow that.
5036  */
5037  Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5038  Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5039  }
5040  else
5041  {
5042  /* update mapping */
5043  new_ent->cmin = ent->cmin;
5044  new_ent->cmax = ent->cmax;
5045  new_ent->combocid = ent->combocid;
5046  }
5047  }
5048 
5049  if (CloseTransientFile(fd) != 0)
5050  ereport(ERROR,
5052  errmsg("could not close file \"%s\": %m", path)));
5053 }
5054 
5055 
5056 /*
5057  * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5058  */
5059 static bool
5061 {
5062  return bsearch(&xid, xip, num,
5063  sizeof(TransactionId), xidComparator) != NULL;
5064 }
5065 
5066 /*
5067  * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5068  */
5069 static int
5070 file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5071 {
5074 
5075  if (a->lsn < b->lsn)
5076  return -1;
5077  else if (a->lsn > b->lsn)
5078  return 1;
5079  return 0;
5080 }
5081 
5082 /*
5083  * Apply any existing logical remapping files if there are any targeted at our
5084  * transaction for relid.
5085  */
5086 static void
5088 {
5089  DIR *mapping_dir;
5090  struct dirent *mapping_de;
5091  List *files = NIL;
5092  ListCell *file;
5093  Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5094 
5095  mapping_dir = AllocateDir("pg_logical/mappings");
5096  while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
5097  {
5098  Oid f_dboid;
5099  Oid f_relid;
5100  TransactionId f_mapped_xid;
5101  TransactionId f_create_xid;
5102  XLogRecPtr f_lsn;
5103  uint32 f_hi,
5104  f_lo;
5105  RewriteMappingFile *f;
5106 
5107  if (strcmp(mapping_de->d_name, ".") == 0 ||
5108  strcmp(mapping_de->d_name, "..") == 0)
5109  continue;
5110 
5111  /* Ignore files that aren't ours */
5112  if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5113  continue;
5114 
5115  if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5116  &f_dboid, &f_relid, &f_hi, &f_lo,
5117  &f_mapped_xid, &f_create_xid) != 6)
5118  elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5119 
5120  f_lsn = ((uint64) f_hi) << 32 | f_lo;
5121 
5122  /* mapping for another database */
5123  if (f_dboid != dboid)
5124  continue;
5125 
5126  /* mapping for another relation */
5127  if (f_relid != relid)
5128  continue;
5129 
5130  /* did the creating transaction abort? */
5131  if (!TransactionIdDidCommit(f_create_xid))
5132  continue;
5133 
5134  /* not for our transaction */
5135  if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5136  continue;
5137 
5138  /* ok, relevant, queue for apply */
5139  f = palloc(sizeof(RewriteMappingFile));
5140  f->lsn = f_lsn;
5141  strcpy(f->fname, mapping_de->d_name);
5142  files = lappend(files, f);
5143  }
5144  FreeDir(mapping_dir);
5145 
5146  /* sort files so we apply them in LSN order */
5147  list_sort(files, file_sort_by_lsn);
5148 
5149  foreach(file, files)
5150  {
5152 
5153  elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5154  snapshot->subxip[0]);
5156  pfree(f);
5157  }
5158 }
5159 
5160 /*
5161  * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5162  * combo CIDs.
5163  */
5164 bool
5166  Snapshot snapshot,
5167  HeapTuple htup, Buffer buffer,
5168  CommandId *cmin, CommandId *cmax)
5169 {
5172  ForkNumber forkno;
5173  BlockNumber blockno;
5174  bool updated_mapping = false;
5175 
5176  /*
5177  * Return unresolved if tuplecid_data is not valid. That's because when
5178  * streaming in-progress transactions we may run into tuples with the CID
5179  * before actually decoding them. Think e.g. about INSERT followed by
5180  * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5181  * INSERT. So in such cases, we assume the CID is from the future
5182  * command.
5183  */
5184  if (tuplecid_data == NULL)
5185  return false;
5186 
5187  /* be careful about padding */
5188  memset(&key, 0, sizeof(key));
5189 
5190  Assert(!BufferIsLocal(buffer));
5191 
5192  /*
5193  * get relfilelocator from the buffer, no convenient way to access it
5194  * other than that.
5195  */
5196  BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5197 
5198  /* tuples can only be in the main fork */
5199  Assert(forkno == MAIN_FORKNUM);
5200  Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5201 
5202  ItemPointerCopy(&htup->t_self,
5203  &key.tid);
5204 
5205 restart:
5206  ent = (ReorderBufferTupleCidEnt *)
5208  (void *) &key,
5209  HASH_FIND,
5210  NULL);
5211 
5212  /*
5213  * failed to find a mapping, check whether the table was rewritten and
5214  * apply mapping if so, but only do that once - there can be no new
5215  * mappings while we are in here since we have to hold a lock on the
5216  * relation.
5217  */
5218  if (ent == NULL && !updated_mapping)
5219  {
5221  /* now check but don't update for a mapping again */
5222  updated_mapping = true;
5223  goto restart;
5224  }
5225  else if (ent == NULL)
5226  return false;
5227 
5228  if (cmin)
5229  *cmin = ent->cmin;
5230  if (cmax)
5231  *cmax = ent->cmax;
5232  return true;
5233 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:125
void binaryheap_add_unordered(binaryheap *heap, Datum d)
Definition: binaryheap.c:109
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:32
Datum binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:173
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:68
void binaryheap_replace_first(binaryheap *heap, Datum d)
Definition: binaryheap.c:207
Datum binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:158
uint32 BlockNumber
Definition: block.h:31
static int32 next
Definition: blutils.c:219
static void cleanup(void)
Definition: bootstrap.c:696
int Buffer
Definition: buf.h:23
#define BufferIsLocal(buffer)
Definition: buf.h:37
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:2784
#define NameStr(name)
Definition: c.h:682
#define InvalidCommandId
Definition: c.h:605
unsigned int uint32
Definition: c.h:442
signed int int32
Definition: c.h:430
char * Pointer
Definition: c.h:419
#define VARHDRSZ
Definition: c.h:628
#define PG_BINARY
Definition: c.h:1209
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:362
#define FirstCommandId
Definition: c.h:604
uint32 CommandId
Definition: c.h:602
uint32 TransactionId
Definition: c.h:588
size_t Size
Definition: c.h:541
bool IsToastRelation(Relation relation)
Definition: catalog.c:147
bool IsSharedRelation(Oid relationId)
Definition: catalog.c:245
int64 TimestampTz
Definition: timestamp.h:39
#define INDIRECT_POINTER_SIZE
Definition: detoast.h:34
#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr)
Definition: detoast.h:22
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:863
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1431
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1421
struct cursor * cur
Definition: ecpg.c:28
void FreeErrorData(ErrorData *edata)
Definition: elog.c:1613
int errcode_for_file_access(void)
Definition: elog.c:718
void FlushErrorState(void)
Definition: elog.c:1651
int errmsg(const char *fmt,...)
Definition: elog.c:906
ErrorData * CopyErrorData(void)
Definition: elog.c:1557
#define PG_RE_THROW()
Definition: elog.h:350
#define DEBUG3
Definition: elog.h:24
#define PG_TRY(...)
Definition: elog.h:309
#define DEBUG2
Definition: elog.h:25
#define PG_END_TRY(...)
Definition: elog.h:334
#define DEBUG1
Definition: elog.h:26
#define ERROR
Definition: elog.h:35
#define PG_CATCH(...)
Definition: elog.h:319
#define INFO
Definition: elog.h:30
#define ereport(elevel,...)
Definition: elog.h:145
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2709
int FreeDir(DIR *dir)
Definition: fd.c:2761
int CloseTransientFile(int fd)
Definition: fd.c:2609
void FileClose(File file)
Definition: fd.c:1883
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1488
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2724
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2034
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2433
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2643
int File
Definition: fd.h:54
MemoryContext GenerationContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: generation.c:158
Oid MyDatabaseId
Definition: globals.c:89
#define free(a)
Definition: header.h:65
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: heaptuple.c:1020
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition: heaptuple.c:1249
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
HeapTupleData * HeapTuple
Definition: htup.h:71
struct HeapTupleData HeapTupleData
#define SizeofHeapTupleHeader
Definition: htup_details.h:184
#define MaxHeapTupleSize
Definition: htup_details.h:554
static Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: htup_details.h:745
#define dlist_foreach(iter, lhead)
Definition: ilist.h:573
static void dlist_init(dlist_head *head)
Definition: ilist.h:314
#define dclist_container(type, membername, ptr)
Definition: ilist.h:884
static void dclist_push_tail(dclist_head *head, dlist_node *node)
Definition: ilist.h:659
static void dlist_insert_before(dlist_node *before, dlist_node *node)
Definition: ilist.h:382
#define dlist_head_element(type, membername, lhead)
Definition: ilist.h:553
static bool dlist_is_empty(dlist_head *head)
Definition: ilist.h:325
static void dlist_delete(dlist_node *node)
Definition: ilist.h:394
static dlist_node * dlist_next_node(dlist_head *head, dlist_node *node)
Definition: ilist.h:487
static dlist_node * dlist_pop_head_node(dlist_head *head)
Definition: ilist.h:415
static bool dlist_has_next(dlist_head *head, dlist_node *node)
Definition: ilist.h:468
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:590
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition: ilist.h:353
static uint32 dclist_count(dclist_head *head)
Definition: ilist.h:869
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:713
static void dclist_init(dclist_head *head)
Definition: ilist.h:621
#define dlist_container(type, membername, ptr)
Definition: ilist.h:543
#define dclist_foreach(iter, lhead)
Definition: ilist.h:907
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
Definition: inval.c:615
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
Assert(fmt[strlen(fmt) - 1] !='\n')
void list_sort(List *list, list_sort_comparator cmp)
Definition: list.c:1673
List * lappend(List *list, void *datum)
Definition: list.c:338
void UpdateDecodingStats(LogicalDecodingContext *ctx)
Definition: logical.c:1833
char * pstrdup(const char *in)
Definition: mcxt.c:1483
void pfree(void *pointer)
Definition: mcxt.c:1306
void * palloc0(Size size)
Definition: mcxt.c:1230
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1037
MemoryContext CurrentMemoryContext
Definition: mcxt.c:124
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1321
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:994
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:376
void * palloc(Size size)
Definition: mcxt.c:1199
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:153
#define SLAB_DEFAULT_BLOCK_SIZE
Definition: memutils.h:182
#define SLAB_LARGE_BLOCK_SIZE
Definition: memutils.h:183
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:135
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:207
void * arg
#define MAXPGPATH
const void * data
#define lfirst(lc)
Definition: pg_list.h:170
#define NIL
Definition: pg_list.h:66
#define sprintf
Definition: port.h:240
#define snprintf
Definition: port.h:238
#define qsort(a, b, c, d)
Definition: port.h:445
#define VARHDRSZ_SHORT
Definition: postgres.h:293
#define VARSIZE_SHORT(PTR)
Definition: postgres.h:319
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:670
#define VARATT_IS_EXTENDED(PTR)
Definition: postgres.h:341
uintptr_t Datum
Definition: postgres.h:412
#define VARATT_IS_SHORT(PTR)
Definition: postgres.h:340
static Oid DatumGetObjectId(Datum X)
Definition: postgres.h:590
#define SET_VARSIZE_COMPRESSED(PTR, len)
Definition: postgres.h:345
#define VARDATA(PTR)
Definition: postgres.h:316
#define SET_VARTAG_EXTERNAL(PTR, tag)
Definition: postgres.h:347
#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)
Definition: postgres.h:392
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:660
#define VARDATA_EXTERNAL(PTR)
Definition: postgres.h:324
#define SET_VARSIZE(PTR, len)
Definition: postgres.h:343
#define VARSIZE(PTR)
Definition: postgres.h:317
#define VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer)
Definition: postgres.h:372
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:560
#define VARATT_IS_EXTERNAL(PTR)
Definition: postgres.h:327
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:550
@ VARTAG_INDIRECT
Definition: postgres.h:124
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define RelationIsLogicallyLogged(relation)
Definition: rel.h:699
#define RelationGetDescr(relation)
Definition: rel.h:527
#define RelationGetRelationName(relation)
Definition: rel.h:535
#define RelationIsValid(relation)
Definition: rel.h:474
Relation RelationIdGetRelation(Oid relationId)
Definition: relcache.c:2054
void RelationClose(Relation relation)
Definition: relcache.c:2160
Oid RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber)
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
#define relpathperm(rlocator, forknum)
Definition: relpath.h:90
static int file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
void ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change, bool upd_mem)
static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change)
TransactionId * ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
void ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
void ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, CommandId cid)
static void ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, RelFileLocator locator, ItemPointerData tid, CommandId cmin, CommandId cmax, CommandId combocid)
static void ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
void ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
static bool ReorderBufferCanStartStreaming(ReorderBuffer *rb)
static void ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, Snapshot snapshot_now, CommandId command_id, XLogRecPtr last_lsn, ReorderBufferChange *specinsert)
struct ReorderBufferDiskChange ReorderBufferDiskChange
bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
void ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
TransactionId ReorderBufferGetOldestXmin(ReorderBuffer *rb)
static int ReorderBufferIterCompare(Datum a, Datum b, void *arg)
static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, ReorderBufferIterTXNState *volatile *iter_state)
bool ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, Snapshot snapshot, HeapTuple htup, Buffer buffer, CommandId *cmin, CommandId *cmax)
static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change)
void ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, ReorderBufferChange *change, bool toast_insert)
static void ReorderBufferReplay(ReorderBufferTXN *txn, ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn, TimestampTz commit_time, RepOriginId origin_id, XLogRecPtr origin_lsn)
void ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid, char *gid)
ReorderBuffer * ReorderBufferAllocate(void)