PostgreSQL Source Code  git master
reorderbuffer.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * reorderbuffer.c
4  * PostgreSQL logical replay/reorder buffer management
5  *
6  *
7  * Copyright (c) 2012-2023, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/logical/reorderbuffer.c
12  *
13  * NOTES
14  * This module gets handed individual pieces of transactions in the order
15  * they are written to the WAL and is responsible to reassemble them into
16  * toplevel transaction sized pieces. When a transaction is completely
17  * reassembled - signaled by reading the transaction commit record - it
18  * will then call the output plugin (cf. ReorderBufferCommit()) with the
19  * individual changes. The output plugins rely on snapshots built by
20  * snapbuild.c which hands them to us.
21  *
22  * Transactions and subtransactions/savepoints in postgres are not
23  * immediately linked to each other from outside the performing
24  * backend. Only at commit/abort (or special xact_assignment records) they
25  * are linked together. Which means that we will have to splice together a
26  * toplevel transaction from its subtransactions. To do that efficiently we
27  * build a binary heap indexed by the smallest current lsn of the individual
28  * subtransactions' changestreams. As the individual streams are inherently
29  * ordered by LSN - since that is where we build them from - the transaction
30  * can easily be reassembled by always using the subtransaction with the
31  * smallest current LSN from the heap.
32  *
33  * In order to cope with large transactions - which can be several times as
34  * big as the available memory - this module supports spooling the contents
35  * of a large transactions to disk. When the transaction is replayed the
36  * contents of individual (sub-)transactions will be read from disk in
37  * chunks.
38  *
39  * This module also has to deal with reassembling toast records from the
40  * individual chunks stored in WAL. When a new (or initial) version of a
41  * tuple is stored in WAL it will always be preceded by the toast chunks
42  * emitted for the columns stored out of line. Within a single toplevel
43  * transaction there will be no other data carrying records between a row's
44  * toast chunks and the row data itself. See ReorderBufferToast* for
45  * details.
46  *
47  * ReorderBuffer uses two special memory context types - SlabContext for
48  * allocations of fixed-length structures (changes and transactions), and
49  * GenerationContext for the variable-length transaction data (allocated
50  * and freed in groups with similar lifespans).
51  *
52  * To limit the amount of memory used by decoded changes, we track memory
53  * used at the reorder buffer level (i.e. total amount of memory), and for
54  * each transaction. When the total amount of used memory exceeds the
55  * limit, the transaction consuming the most memory is then serialized to
56  * disk.
57  *
58  * Only decoded changes are evicted from memory (spilled to disk), not the
59  * transaction records. The number of toplevel transactions is limited,
60  * but a transaction with many subtransactions may still consume significant
61  * amounts of memory. However, the transaction records are fairly small and
62  * are not included in the memory limit.
63  *
64  * The current eviction algorithm is very simple - the transaction is
65  * picked merely by size, while it might be useful to also consider age
66  * (LSN) of the changes for example. With the new Generational memory
67  * allocator, evicting the oldest changes would make it more likely the
68  * memory gets actually freed.
69  *
70  * We still rely on max_changes_in_memory when loading serialized changes
71  * back into memory. At that point we can't use the memory limit directly
72  * as we load the subxacts independently. One option to deal with this
73  * would be to count the subxacts, and allow each to allocate 1/N of the
74  * memory limit. That however does not seem very appealing, because with
75  * many subtransactions it may easily cause thrashing (short cycles of
76  * deserializing and applying very few changes). We probably should give
77  * a bit more memory to the oldest subtransactions, because it's likely
78  * they are the source for the next sequence of changes.
79  *
80  * -------------------------------------------------------------------------
81  */
82 #include "postgres.h"
83 
84 #include <unistd.h>
85 #include <sys/stat.h>
86 
87 #include "access/detoast.h"
88 #include "access/heapam.h"
89 #include "access/rewriteheap.h"
90 #include "access/transam.h"
91 #include "access/xact.h"
92 #include "access/xlog_internal.h"
93 #include "catalog/catalog.h"
94 #include "lib/binaryheap.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "replication/logical.h"
99 #include "replication/slot.h"
100 #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
101 #include "storage/bufmgr.h"
102 #include "storage/fd.h"
103 #include "storage/sinval.h"
104 #include "utils/builtins.h"
105 #include "utils/combocid.h"
106 #include "utils/memdebug.h"
107 #include "utils/memutils.h"
108 #include "utils/rel.h"
109 #include "utils/relfilenumbermap.h"
110 
111 
112 /* entry for a hash table we use to map from xid to our transaction state */
114 {
118 
119 /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
121 {
125 
127 {
131  CommandId combocid; /* just for debugging */
133 
134 /* Virtual file descriptor with file offset tracking */
135 typedef struct TXNEntryFile
136 {
137  File vfd; /* -1 when the file is closed */
138  off_t curOffset; /* offset for next write or read. Reset to 0
139  * when vfd is opened. */
141 
142 /* k-way in-order change iteration support structures */
144 {
151 
153 {
159 
160 /* toast datastructures */
161 typedef struct ReorderBufferToastEnt
162 {
163  Oid chunk_id; /* toast_table.chunk_id */
164  int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
165  * have seen */
166  Size num_chunks; /* number of chunks we've already seen */
167  Size size; /* combined size of chunks seen */
168  dlist_head chunks; /* linked list of chunks */
169  struct varlena *reconstructed; /* reconstructed varlena now pointed to in
170  * main tup */
172 
173 /* Disk serialization support datastructures */
175 {
178  /* data follows */
180 
181 #define IsSpecInsert(action) \
182 ( \
183  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
184 )
185 #define IsSpecConfirmOrAbort(action) \
186 ( \
187  (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
188  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
189 )
190 #define IsInsertOrUpdate(action) \
191 ( \
192  (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
193  ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
194  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
195 )
196 
197 /*
198  * Maximum number of changes kept in memory, per transaction. After that,
199  * changes are spooled to disk.
200  *
201  * The current value should be sufficient to decode the entire transaction
202  * without hitting disk in OLTP workloads, while starting to spool to disk in
203  * other workloads reasonably fast.
204  *
205  * At some point in the future it probably makes sense to have a more elaborate
206  * resource management here, but it's not entirely clear what that would look
207  * like.
208  */
210 static const Size max_changes_in_memory = 4096; /* XXX for restore only */
211 
212 /* GUC variable */
214 
215 /* ---------------------------------------
216  * primary reorderbuffer support routines
217  * ---------------------------------------
218  */
222  TransactionId xid, bool create, bool *is_new,
223  XLogRecPtr lsn, bool create_as_top);
225  ReorderBufferTXN *subtxn);
226 
227 static void AssertTXNLsnOrder(ReorderBuffer *rb);
228 
229 /* ---------------------------------------
230  * support functions for lsn-order iterating over the ->changes of a
231  * transaction and its subtransactions
232  *
233  * used for iteration over the k-way heap merge of a transaction and its
234  * subtransactions
235  * ---------------------------------------
236  */
238  ReorderBufferIterTXNState *volatile *iter_state);
243 
244 /*
245  * ---------------------------------------
246  * Disk serialization support functions
247  * ---------------------------------------
248  */
252  int fd, ReorderBufferChange *change);
254  TXNEntryFile *file, XLogSegNo *segno);
256  char *data);
259  bool txn_prepared);
260 static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
261 static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
262  TransactionId xid, XLogSegNo segno);
263 
264 static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
266  ReorderBufferTXN *txn, CommandId cid);
267 
268 /*
269  * ---------------------------------------
270  * Streaming support functions
271  * ---------------------------------------
272  */
273 static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
274 static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
277 
278 /* ---------------------------------------
279  * toast reassembly support
280  * ---------------------------------------
281  */
285  Relation relation, ReorderBufferChange *change);
287  Relation relation, ReorderBufferChange *change);
288 
289 /*
290  * ---------------------------------------
291  * memory accounting
292  * ---------------------------------------
293  */
296  ReorderBufferChange *change,
297  bool addition, Size sz);
298 
299 /*
300  * Allocate a new ReorderBuffer and clean out any old serialized state from
301  * prior ReorderBuffer instances for the same slot.
302  */
305 {
306  ReorderBuffer *buffer;
307  HASHCTL hash_ctl;
308  MemoryContext new_ctx;
309 
310  Assert(MyReplicationSlot != NULL);
311 
312  /* allocate memory in own context, to have better accountability */
314  "ReorderBuffer",
316 
317  buffer =
318  (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
319 
320  memset(&hash_ctl, 0, sizeof(hash_ctl));
321 
322  buffer->context = new_ctx;
323 
324  buffer->change_context = SlabContextCreate(new_ctx,
325  "Change",
327  sizeof(ReorderBufferChange));
328 
329  buffer->txn_context = SlabContextCreate(new_ctx,
330  "TXN",
332  sizeof(ReorderBufferTXN));
333 
334  /*
335  * XXX the allocation sizes used below pre-date generation context's block
336  * growing code. These values should likely be benchmarked and set to
337  * more suitable values.
338  */
339  buffer->tup_context = GenerationContextCreate(new_ctx,
340  "Tuples",
344 
345  hash_ctl.keysize = sizeof(TransactionId);
346  hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
347  hash_ctl.hcxt = buffer->context;
348 
349  buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
351 
353  buffer->by_txn_last_txn = NULL;
354 
355  buffer->outbuf = NULL;
356  buffer->outbufsize = 0;
357  buffer->size = 0;
358 
359  buffer->spillTxns = 0;
360  buffer->spillCount = 0;
361  buffer->spillBytes = 0;
362  buffer->streamTxns = 0;
363  buffer->streamCount = 0;
364  buffer->streamBytes = 0;
365  buffer->totalTxns = 0;
366  buffer->totalBytes = 0;
367 
369 
370  dlist_init(&buffer->toplevel_by_lsn);
372  dclist_init(&buffer->catchange_txns);
373 
374  /*
375  * Ensure there's no stale data from prior uses of this slot, in case some
376  * prior exit avoided calling ReorderBufferFree. Failure to do this can
377  * produce duplicated txns, and it's very cheap if there's nothing there.
378  */
380 
381  return buffer;
382 }
383 
384 /*
385  * Free a ReorderBuffer
386  */
387 void
389 {
390  MemoryContext context = rb->context;
391 
392  /*
393  * We free separately allocated data by entirely scrapping reorderbuffer's
394  * memory context.
395  */
396  MemoryContextDelete(context);
397 
398  /* Free disk space used by unconsumed reorder buffers */
400 }
401 
402 /*
403  * Get an unused, possibly preallocated, ReorderBufferTXN.
404  */
405 static ReorderBufferTXN *
407 {
408  ReorderBufferTXN *txn;
409 
410  txn = (ReorderBufferTXN *)
412 
413  memset(txn, 0, sizeof(ReorderBufferTXN));
414 
415  dlist_init(&txn->changes);
416  dlist_init(&txn->tuplecids);
417  dlist_init(&txn->subtxns);
418 
419  /* InvalidCommandId is not zero, so set it explicitly */
421  txn->output_plugin_private = NULL;
422 
423  return txn;
424 }
425 
426 /*
427  * Free a ReorderBufferTXN.
428  */
429 static void
431 {
432  /* clean the lookup cache if we were cached (quite likely) */
433  if (rb->by_txn_last_xid == txn->xid)
434  {
436  rb->by_txn_last_txn = NULL;
437  }
438 
439  /* free data that's contained */
440 
441  if (txn->gid != NULL)
442  {
443  pfree(txn->gid);
444  txn->gid = NULL;
445  }
446 
447  if (txn->tuplecid_hash != NULL)
448  {
450  txn->tuplecid_hash = NULL;
451  }
452 
453  if (txn->invalidations)
454  {
455  pfree(txn->invalidations);
456  txn->invalidations = NULL;
457  }
458 
459  /* Reset the toast hash */
460  ReorderBufferToastReset(rb, txn);
461 
462  pfree(txn);
463 }
464 
465 /*
466  * Get a fresh ReorderBufferChange.
467  */
470 {
471  ReorderBufferChange *change;
472 
473  change = (ReorderBufferChange *)
475 
476  memset(change, 0, sizeof(ReorderBufferChange));
477  return change;
478 }
479 
480 /*
481  * Free a ReorderBufferChange and update memory accounting, if requested.
482  */
483 void
485  bool upd_mem)
486 {
487  /* update memory accounting info */
488  if (upd_mem)
489  ReorderBufferChangeMemoryUpdate(rb, change, false,
490  ReorderBufferChangeSize(change));
491 
492  /* free contained data */
493  switch (change->action)
494  {
499  if (change->data.tp.newtuple)
500  {
501  ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple);
502  change->data.tp.newtuple = NULL;
503  }
504 
505  if (change->data.tp.oldtuple)
506  {
507  ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple);
508  change->data.tp.oldtuple = NULL;
509  }
510  break;
512  if (change->data.msg.prefix != NULL)
513  pfree(change->data.msg.prefix);
514  change->data.msg.prefix = NULL;
515  if (change->data.msg.message != NULL)
516  pfree(change->data.msg.message);
517  change->data.msg.message = NULL;
518  break;
520  if (change->data.inval.invalidations)
521  pfree(change->data.inval.invalidations);
522  change->data.inval.invalidations = NULL;
523  break;
525  if (change->data.snapshot)
526  {
527  ReorderBufferFreeSnap(rb, change->data.snapshot);
528  change->data.snapshot = NULL;
529  }
530  break;
531  /* no data in addition to the struct itself */
533  if (change->data.truncate.relids != NULL)
534  {
535  ReorderBufferReturnRelids(rb, change->data.truncate.relids);
536  change->data.truncate.relids = NULL;
537  }
538  break;
543  break;
544  }
545 
546  pfree(change);
547 }
548 
549 /*
550  * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size
551  * tuple_len (excluding header overhead).
552  */
555 {
556  ReorderBufferTupleBuf *tuple;
557  Size alloc_len;
558 
559  alloc_len = tuple_len + SizeofHeapTupleHeader;
560 
561  tuple = (ReorderBufferTupleBuf *)
563  sizeof(ReorderBufferTupleBuf) +
564  MAXIMUM_ALIGNOF + alloc_len);
565  tuple->alloc_tuple_size = alloc_len;
566  tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
567 
568  return tuple;
569 }
570 
571 /*
572  * Free a ReorderBufferTupleBuf.
573  */
574 void
576 {
577  pfree(tuple);
578 }
579 
580 /*
581  * Get an array for relids of truncated relations.
582  *
583  * We use the global memory context (for the whole reorder buffer), because
584  * none of the existing ones seems like a good match (some are SLAB, so we
585  * can't use those, and tup_context is meant for tuple data, not relids). We
586  * could add yet another context, but it seems like an overkill - TRUNCATE is
587  * not particularly common operation, so it does not seem worth it.
588  */
589 Oid *
591 {
592  Oid *relids;
593  Size alloc_len;
594 
595  alloc_len = sizeof(Oid) * nrelids;
596 
597  relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
598 
599  return relids;
600 }
601 
602 /*
603  * Free an array of relids.
604  */
605 void
607 {
608  pfree(relids);
609 }
610 
611 /*
612  * Return the ReorderBufferTXN from the given buffer, specified by Xid.
613  * If create is true, and a transaction doesn't already exist, create it
614  * (with the given LSN, and as top transaction if that's specified);
615  * when this happens, is_new is set to true.
616  */
617 static ReorderBufferTXN *
619  bool *is_new, XLogRecPtr lsn, bool create_as_top)
620 {
621  ReorderBufferTXN *txn;
623  bool found;
624 
626 
627  /*
628  * Check the one-entry lookup cache first
629  */
631  rb->by_txn_last_xid == xid)
632  {
633  txn = rb->by_txn_last_txn;
634 
635  if (txn != NULL)
636  {
637  /* found it, and it's valid */
638  if (is_new)
639  *is_new = false;
640  return txn;
641  }
642 
643  /*
644  * cached as non-existent, and asked not to create? Then nothing else
645  * to do.
646  */
647  if (!create)
648  return NULL;
649  /* otherwise fall through to create it */
650  }
651 
652  /*
653  * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
654  * create an entry.
655  */
656 
657  /* search the lookup table */
658  ent = (ReorderBufferTXNByIdEnt *)
659  hash_search(rb->by_txn,
660  &xid,
661  create ? HASH_ENTER : HASH_FIND,
662  &found);
663  if (found)
664  txn = ent->txn;
665  else if (create)
666  {
667  /* initialize the new entry, if creation was requested */
668  Assert(ent != NULL);
669  Assert(lsn != InvalidXLogRecPtr);
670 
671  ent->txn = ReorderBufferGetTXN(rb);
672  ent->txn->xid = xid;
673  txn = ent->txn;
674  txn->first_lsn = lsn;
676 
677  if (create_as_top)
678  {
679  dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
680  AssertTXNLsnOrder(rb);
681  }
682  }
683  else
684  txn = NULL; /* not found and not asked to create */
685 
686  /* update cache */
687  rb->by_txn_last_xid = xid;
688  rb->by_txn_last_txn = txn;
689 
690  if (is_new)
691  *is_new = !found;
692 
693  Assert(!create || txn != NULL);
694  return txn;
695 }
696 
697 /*
698  * Record the partial change for the streaming of in-progress transactions. We
699  * can stream only complete changes so if we have a partial change like toast
700  * table insert or speculative insert then we mark such a 'txn' so that it
701  * can't be streamed. We also ensure that if the changes in such a 'txn' can
702  * be streamed and are above logical_decoding_work_mem threshold then we stream
703  * them as soon as we have a complete change.
704  */
705 static void
707  ReorderBufferChange *change,
708  bool toast_insert)
709 {
710  ReorderBufferTXN *toptxn;
711 
712  /*
713  * The partial changes need to be processed only while streaming
714  * in-progress transactions.
715  */
716  if (!ReorderBufferCanStream(rb))
717  return;
718 
719  /* Get the top transaction. */
720  toptxn = rbtxn_get_toptxn(txn);
721 
722  /*
723  * Indicate a partial change for toast inserts. The change will be
724  * considered as complete once we get the insert or update on the main
725  * table and we are sure that the pending toast chunks are not required
726  * anymore.
727  *
728  * If we allow streaming when there are pending toast chunks then such
729  * chunks won't be released till the insert (multi_insert) is complete and
730  * we expect the txn to have streamed all changes after streaming. This
731  * restriction is mainly to ensure the correctness of streamed
732  * transactions and it doesn't seem worth uplifting such a restriction
733  * just to allow this case because anyway we will stream the transaction
734  * once such an insert is complete.
735  */
736  if (toast_insert)
738  else if (rbtxn_has_partial_change(toptxn) &&
739  IsInsertOrUpdate(change->action) &&
740  change->data.tp.clear_toast_afterwards)
742 
743  /*
744  * Indicate a partial change for speculative inserts. The change will be
745  * considered as complete once we get the speculative confirm or abort
746  * token.
747  */
748  if (IsSpecInsert(change->action))
750  else if (rbtxn_has_partial_change(toptxn) &&
751  IsSpecConfirmOrAbort(change->action))
753 
754  /*
755  * Stream the transaction if it is serialized before and the changes are
756  * now complete in the top-level transaction.
757  *
758  * The reason for doing the streaming of such a transaction as soon as we
759  * get the complete change for it is that previously it would have reached
760  * the memory threshold and wouldn't get streamed because of incomplete
761  * changes. Delaying such transactions would increase apply lag for them.
762  */
764  !(rbtxn_has_partial_change(toptxn)) &&
765  rbtxn_is_serialized(txn) &&
767  ReorderBufferStreamTXN(rb, toptxn);
768 }
769 
770 /*
771  * Queue a change into a transaction so it can be replayed upon commit or will be
772  * streamed when we reach logical_decoding_work_mem threshold.
773  */
774 void
776  ReorderBufferChange *change, bool toast_insert)
777 {
778  ReorderBufferTXN *txn;
779 
780  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
781 
782  /*
783  * While streaming the previous changes we have detected that the
784  * transaction is aborted. So there is no point in collecting further
785  * changes for it.
786  */
787  if (txn->concurrent_abort)
788  {
789  /*
790  * We don't need to update memory accounting for this change as we
791  * have not added it to the queue yet.
792  */
793  ReorderBufferReturnChange(rb, change, false);
794  return;
795  }
796 
797  /*
798  * The changes that are sent downstream are considered streamable. We
799  * remember such transactions so that only those will later be considered
800  * for streaming.
801  */
802  if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
808  {
809  ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
810 
812  }
813 
814  change->lsn = lsn;
815  change->txn = txn;
816 
817  Assert(InvalidXLogRecPtr != lsn);
818  dlist_push_tail(&txn->changes, &change->node);
819  txn->nentries++;
820  txn->nentries_mem++;
821 
822  /* update memory accounting information */
823  ReorderBufferChangeMemoryUpdate(rb, change, true,
824  ReorderBufferChangeSize(change));
825 
826  /* process partial change */
827  ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
828 
829  /* check the memory limits and evict something if needed */
831 }
832 
833 /*
834  * A transactional message is queued to be processed upon commit and a
835  * non-transactional message gets processed immediately.
836  */
837 void
839  Snapshot snap, XLogRecPtr lsn,
840  bool transactional, const char *prefix,
841  Size message_size, const char *message)
842 {
843  if (transactional)
844  {
845  MemoryContext oldcontext;
846  ReorderBufferChange *change;
847 
849 
850  /*
851  * We don't expect snapshots for transactional changes - we'll use the
852  * snapshot derived later during apply (unless the change gets
853  * skipped).
854  */
855  Assert(!snap);
856 
857  oldcontext = MemoryContextSwitchTo(rb->context);
858 
859  change = ReorderBufferGetChange(rb);
861  change->data.msg.prefix = pstrdup(prefix);
862  change->data.msg.message_size = message_size;
863  change->data.msg.message = palloc(message_size);
864  memcpy(change->data.msg.message, message, message_size);
865 
866  ReorderBufferQueueChange(rb, xid, lsn, change, false);
867 
868  MemoryContextSwitchTo(oldcontext);
869  }
870  else
871  {
872  ReorderBufferTXN *txn = NULL;
873  volatile Snapshot snapshot_now = snap;
874 
875  /* Non-transactional changes require a valid snapshot. */
876  Assert(snapshot_now);
877 
878  if (xid != InvalidTransactionId)
879  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
880 
881  /* setup snapshot to allow catalog access */
882  SetupHistoricSnapshot(snapshot_now, NULL);
883  PG_TRY();
884  {
885  rb->message(rb, txn, lsn, false, prefix, message_size, message);
886 
888  }
889  PG_CATCH();
890  {
892  PG_RE_THROW();
893  }
894  PG_END_TRY();
895  }
896 }
897 
898 /*
899  * AssertTXNLsnOrder
900  * Verify LSN ordering of transaction lists in the reorderbuffer
901  *
902  * Other LSN-related invariants are checked too.
903  *
904  * No-op if assertions are not in use.
905  */
906 static void
908 {
909 #ifdef USE_ASSERT_CHECKING
911  dlist_iter iter;
912  XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
913  XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
914 
915  /*
916  * Skip the verification if we don't reach the LSN at which we start
917  * decoding the contents of transactions yet because until we reach the
918  * LSN, we could have transactions that don't have the association between
919  * the top-level transaction and subtransaction yet and consequently have
920  * the same LSN. We don't guarantee this association until we try to
921  * decode the actual contents of transaction. The ordering of the records
922  * prior to the start_decoding_at LSN should have been checked before the
923  * restart.
924  */
926  return;
927 
928  dlist_foreach(iter, &rb->toplevel_by_lsn)
929  {
931  iter.cur);
932 
933  /* start LSN must be set */
934  Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
935 
936  /* If there is an end LSN, it must be higher than start LSN */
937  if (cur_txn->end_lsn != InvalidXLogRecPtr)
938  Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
939 
940  /* Current initial LSN must be strictly higher than previous */
941  if (prev_first_lsn != InvalidXLogRecPtr)
942  Assert(prev_first_lsn < cur_txn->first_lsn);
943 
944  /* known-as-subtxn txns must not be listed */
945  Assert(!rbtxn_is_known_subxact(cur_txn));
946 
947  prev_first_lsn = cur_txn->first_lsn;
948  }
949 
951  {
953  base_snapshot_node,
954  iter.cur);
955 
956  /* base snapshot (and its LSN) must be set */
957  Assert(cur_txn->base_snapshot != NULL);
959 
960  /* current LSN must be strictly higher than previous */
961  if (prev_base_snap_lsn != InvalidXLogRecPtr)
962  Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
963 
964  /* known-as-subtxn txns must not be listed */
965  Assert(!rbtxn_is_known_subxact(cur_txn));
966 
967  prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
968  }
969 #endif
970 }
971 
972 /*
973  * AssertChangeLsnOrder
974  *
975  * Check ordering of changes in the (sub)transaction.
976  */
977 static void
979 {
980 #ifdef USE_ASSERT_CHECKING
981  dlist_iter iter;
982  XLogRecPtr prev_lsn = txn->first_lsn;
983 
984  dlist_foreach(iter, &txn->changes)
985  {
986  ReorderBufferChange *cur_change;
987 
988  cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
989 
991  Assert(cur_change->lsn != InvalidXLogRecPtr);
992  Assert(txn->first_lsn <= cur_change->lsn);
993 
994  if (txn->end_lsn != InvalidXLogRecPtr)
995  Assert(cur_change->lsn <= txn->end_lsn);
996 
997  Assert(prev_lsn <= cur_change->lsn);
998 
999  prev_lsn = cur_change->lsn;
1000  }
1001 #endif
1002 }
1003 
1004 /*
1005  * ReorderBufferGetOldestTXN
1006  * Return oldest transaction in reorderbuffer
1007  */
1010 {
1011  ReorderBufferTXN *txn;
1012 
1013  AssertTXNLsnOrder(rb);
1014 
1015  if (dlist_is_empty(&rb->toplevel_by_lsn))
1016  return NULL;
1017 
1019 
1022  return txn;
1023 }
1024 
1025 /*
1026  * ReorderBufferGetOldestXmin
1027  * Return oldest Xmin in reorderbuffer
1028  *
1029  * Returns oldest possibly running Xid from the point of view of snapshots
1030  * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1031  * there are none.
1032  *
1033  * Since snapshots are assigned monotonically, this equals the Xmin of the
1034  * base snapshot with minimal base_snapshot_lsn.
1035  */
1038 {
1039  ReorderBufferTXN *txn;
1040 
1041  AssertTXNLsnOrder(rb);
1042 
1044  return InvalidTransactionId;
1045 
1046  txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1048  return txn->base_snapshot->xmin;
1049 }
1050 
1051 void
1053 {
1054  rb->current_restart_decoding_lsn = ptr;
1055 }
1056 
1057 /*
1058  * ReorderBufferAssignChild
1059  *
1060  * Make note that we know that subxid is a subtransaction of xid, seen as of
1061  * the given lsn.
1062  */
1063 void
1065  TransactionId subxid, XLogRecPtr lsn)
1066 {
1067  ReorderBufferTXN *txn;
1068  ReorderBufferTXN *subtxn;
1069  bool new_top;
1070  bool new_sub;
1071 
1072  txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1073  subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1074 
1075  if (!new_sub)
1076  {
1077  if (rbtxn_is_known_subxact(subtxn))
1078  {
1079  /* already associated, nothing to do */
1080  return;
1081  }
1082  else
1083  {
1084  /*
1085  * We already saw this transaction, but initially added it to the
1086  * list of top-level txns. Now that we know it's not top-level,
1087  * remove it from there.
1088  */
1089  dlist_delete(&subtxn->node);
1090  }
1091  }
1092 
1093  subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1094  subtxn->toplevel_xid = xid;
1095  Assert(subtxn->nsubtxns == 0);
1096 
1097  /* set the reference to top-level transaction */
1098  subtxn->toptxn = txn;
1099 
1100  /* add to subtransaction list */
1101  dlist_push_tail(&txn->subtxns, &subtxn->node);
1102  txn->nsubtxns++;
1103 
1104  /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1105  ReorderBufferTransferSnapToParent(txn, subtxn);
1106 
1107  /* Verify LSN-ordering invariant */
1108  AssertTXNLsnOrder(rb);
1109 }
1110 
1111 /*
1112  * ReorderBufferTransferSnapToParent
1113  * Transfer base snapshot from subtxn to top-level txn, if needed
1114  *
1115  * This is done if the top-level txn doesn't have a base snapshot, or if the
1116  * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1117  * snapshot's LSN. This can happen if there are no changes in the toplevel
1118  * txn but there are some in the subtxn, or the first change in subtxn has
1119  * earlier LSN than first change in the top-level txn and we learned about
1120  * their kinship only now.
1121  *
1122  * The subtransaction's snapshot is cleared regardless of the transfer
1123  * happening, since it's not needed anymore in either case.
1124  *
1125  * We do this as soon as we become aware of their kinship, to avoid queueing
1126  * extra snapshots to txns known-as-subtxns -- only top-level txns will
1127  * receive further snapshots.
1128  */
1129 static void
1131  ReorderBufferTXN *subtxn)
1132 {
1133  Assert(subtxn->toplevel_xid == txn->xid);
1134 
1135  if (subtxn->base_snapshot != NULL)
1136  {
1137  if (txn->base_snapshot == NULL ||
1138  subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1139  {
1140  /*
1141  * If the toplevel transaction already has a base snapshot but
1142  * it's newer than the subxact's, purge it.
1143  */
1144  if (txn->base_snapshot != NULL)
1145  {
1148  }
1149 
1150  /*
1151  * The snapshot is now the top transaction's; transfer it, and
1152  * adjust the list position of the top transaction in the list by
1153  * moving it to where the subtransaction is.
1154  */
1155  txn->base_snapshot = subtxn->base_snapshot;
1156  txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1158  &txn->base_snapshot_node);
1159 
1160  /*
1161  * The subtransaction doesn't have a snapshot anymore (so it
1162  * mustn't be in the list.)
1163  */
1164  subtxn->base_snapshot = NULL;
1166  dlist_delete(&subtxn->base_snapshot_node);
1167  }
1168  else
1169  {
1170  /* Base snap of toplevel is fine, so subxact's is not needed */
1172  dlist_delete(&subtxn->base_snapshot_node);
1173  subtxn->base_snapshot = NULL;
1175  }
1176  }
1177 }
1178 
1179 /*
1180  * Associate a subtransaction with its toplevel transaction at commit
1181  * time. There may be no further changes added after this.
1182  */
1183 void
1185  TransactionId subxid, XLogRecPtr commit_lsn,
1186  XLogRecPtr end_lsn)
1187 {
1188  ReorderBufferTXN *subtxn;
1189 
1190  subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1191  InvalidXLogRecPtr, false);
1192 
1193  /*
1194  * No need to do anything if that subtxn didn't contain any changes
1195  */
1196  if (!subtxn)
1197  return;
1198 
1199  subtxn->final_lsn = commit_lsn;
1200  subtxn->end_lsn = end_lsn;
1201 
1202  /*
1203  * Assign this subxact as a child of the toplevel xact (no-op if already
1204  * done.)
1205  */
1206  ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1207 }
1208 
1209 
1210 /*
1211  * Support for efficiently iterating over a transaction's and its
1212  * subtransactions' changes.
1213  *
1214  * We do by doing a k-way merge between transactions/subtransactions. For that
1215  * we model the current heads of the different transactions as a binary heap
1216  * so we easily know which (sub-)transaction has the change with the smallest
1217  * lsn next.
1218  *
1219  * We assume the changes in individual transactions are already sorted by LSN.
1220  */
1221 
1222 /*
1223  * Binary heap comparison function.
1224  */
1225 static int
1227 {
1229  XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1230  XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1231 
1232  if (pos_a < pos_b)
1233  return 1;
1234  else if (pos_a == pos_b)
1235  return 0;
1236  return -1;
1237 }
1238 
1239 /*
1240  * Allocate & initialize an iterator which iterates in lsn order over a
1241  * transaction and all its subtransactions.
1242  *
1243  * Note: The iterator state is returned through iter_state parameter rather
1244  * than the function's return value. This is because the state gets cleaned up
1245  * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1246  * back the state even if this function throws an exception.
1247  */
1248 static void
1250  ReorderBufferIterTXNState *volatile *iter_state)
1251 {
1252  Size nr_txns = 0;
1254  dlist_iter cur_txn_i;
1255  int32 off;
1256 
1257  *iter_state = NULL;
1258 
1259  /* Check ordering of changes in the toplevel transaction. */
1260  AssertChangeLsnOrder(txn);
1261 
1262  /*
1263  * Calculate the size of our heap: one element for every transaction that
1264  * contains changes. (Besides the transactions already in the reorder
1265  * buffer, we count the one we were directly passed.)
1266  */
1267  if (txn->nentries > 0)
1268  nr_txns++;
1269 
1270  dlist_foreach(cur_txn_i, &txn->subtxns)
1271  {
1272  ReorderBufferTXN *cur_txn;
1273 
1274  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1275 
1276  /* Check ordering of changes in this subtransaction. */
1277  AssertChangeLsnOrder(cur_txn);
1278 
1279  if (cur_txn->nentries > 0)
1280  nr_txns++;
1281  }
1282 
1283  /* allocate iteration state */
1286  sizeof(ReorderBufferIterTXNState) +
1287  sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1288 
1289  state->nr_txns = nr_txns;
1290  dlist_init(&state->old_change);
1291 
1292  for (off = 0; off < state->nr_txns; off++)
1293  {
1294  state->entries[off].file.vfd = -1;
1295  state->entries[off].segno = 0;
1296  }
1297 
1298  /* allocate heap */
1299  state->heap = binaryheap_allocate(state->nr_txns,
1301  state);
1302 
1303  /* Now that the state fields are initialized, it is safe to return it. */
1304  *iter_state = state;
1305 
1306  /*
1307  * Now insert items into the binary heap, in an unordered fashion. (We
1308  * will run a heap assembly step at the end; this is more efficient.)
1309  */
1310 
1311  off = 0;
1312 
1313  /* add toplevel transaction if it contains changes */
1314  if (txn->nentries > 0)
1315  {
1316  ReorderBufferChange *cur_change;
1317 
1318  if (rbtxn_is_serialized(txn))
1319  {
1320  /* serialize remaining changes */
1321  ReorderBufferSerializeTXN(rb, txn);
1322  ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1323  &state->entries[off].segno);
1324  }
1325 
1326  cur_change = dlist_head_element(ReorderBufferChange, node,
1327  &txn->changes);
1328 
1329  state->entries[off].lsn = cur_change->lsn;
1330  state->entries[off].change = cur_change;
1331  state->entries[off].txn = txn;
1332 
1334  }
1335 
1336  /* add subtransactions if they contain changes */
1337  dlist_foreach(cur_txn_i, &txn->subtxns)
1338  {
1339  ReorderBufferTXN *cur_txn;
1340 
1341  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1342 
1343  if (cur_txn->nentries > 0)
1344  {
1345  ReorderBufferChange *cur_change;
1346 
1347  if (rbtxn_is_serialized(cur_txn))
1348  {
1349  /* serialize remaining changes */
1350  ReorderBufferSerializeTXN(rb, cur_txn);
1351  ReorderBufferRestoreChanges(rb, cur_txn,
1352  &state->entries[off].file,
1353  &state->entries[off].segno);
1354  }
1355  cur_change = dlist_head_element(ReorderBufferChange, node,
1356  &cur_txn->changes);
1357 
1358  state->entries[off].lsn = cur_change->lsn;
1359  state->entries[off].change = cur_change;
1360  state->entries[off].txn = cur_txn;
1361 
1363  }
1364  }
1365 
1366  /* assemble a valid binary heap */
1367  binaryheap_build(state->heap);
1368 }
1369 
1370 /*
1371  * Return the next change when iterating over a transaction and its
1372  * subtransactions.
1373  *
1374  * Returns NULL when no further changes exist.
1375  */
1376 static ReorderBufferChange *
1378 {
1379  ReorderBufferChange *change;
1381  int32 off;
1382 
1383  /* nothing there anymore */
1384  if (state->heap->bh_size == 0)
1385  return NULL;
1386 
1387  off = DatumGetInt32(binaryheap_first(state->heap));
1388  entry = &state->entries[off];
1389 
1390  /* free memory we might have "leaked" in the previous *Next call */
1391  if (!dlist_is_empty(&state->old_change))
1392  {
1393  change = dlist_container(ReorderBufferChange, node,
1394  dlist_pop_head_node(&state->old_change));
1395  ReorderBufferReturnChange(rb, change, true);
1396  Assert(dlist_is_empty(&state->old_change));
1397  }
1398 
1399  change = entry->change;
1400 
1401  /*
1402  * update heap with information about which transaction has the next
1403  * relevant change in LSN order
1404  */
1405 
1406  /* there are in-memory changes */
1407  if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1408  {
1409  dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1410  ReorderBufferChange *next_change =
1412 
1413  /* txn stays the same */
1414  state->entries[off].lsn = next_change->lsn;
1415  state->entries[off].change = next_change;
1416 
1418  return change;
1419  }
1420 
1421  /* try to load changes from disk */
1422  if (entry->txn->nentries != entry->txn->nentries_mem)
1423  {
1424  /*
1425  * Ugly: restoring changes will reuse *Change records, thus delete the
1426  * current one from the per-tx list and only free in the next call.
1427  */
1428  dlist_delete(&change->node);
1429  dlist_push_tail(&state->old_change, &change->node);
1430 
1431  /*
1432  * Update the total bytes processed by the txn for which we are
1433  * releasing the current set of changes and restoring the new set of
1434  * changes.
1435  */
1436  rb->totalBytes += entry->txn->size;
1437  if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1438  &state->entries[off].segno))
1439  {
1440  /* successfully restored changes from disk */
1441  ReorderBufferChange *next_change =
1443  &entry->txn->changes);
1444 
1445  elog(DEBUG2, "restored %u/%u changes from disk",
1446  (uint32) entry->txn->nentries_mem,
1447  (uint32) entry->txn->nentries);
1448 
1449  Assert(entry->txn->nentries_mem);
1450  /* txn stays the same */
1451  state->entries[off].lsn = next_change->lsn;
1452  state->entries[off].change = next_change;
1454 
1455  return change;
1456  }
1457  }
1458 
1459  /* ok, no changes there anymore, remove */
1461 
1462  return change;
1463 }
1464 
1465 /*
1466  * Deallocate the iterator
1467  */
1468 static void
1471 {
1472  int32 off;
1473 
1474  for (off = 0; off < state->nr_txns; off++)
1475  {
1476  if (state->entries[off].file.vfd != -1)
1477  FileClose(state->entries[off].file.vfd);
1478  }
1479 
1480  /* free memory we might have "leaked" in the last *Next call */
1481  if (!dlist_is_empty(&state->old_change))
1482  {
1483  ReorderBufferChange *change;
1484 
1485  change = dlist_container(ReorderBufferChange, node,
1486  dlist_pop_head_node(&state->old_change));
1487  ReorderBufferReturnChange(rb, change, true);
1488  Assert(dlist_is_empty(&state->old_change));
1489  }
1490 
1491  binaryheap_free(state->heap);
1492  pfree(state);
1493 }
1494 
1495 /*
1496  * Cleanup the contents of a transaction, usually after the transaction
1497  * committed or aborted.
1498  */
1499 static void
1501 {
1502  bool found;
1503  dlist_mutable_iter iter;
1504 
1505  /* cleanup subtransactions & their changes */
1506  dlist_foreach_modify(iter, &txn->subtxns)
1507  {
1508  ReorderBufferTXN *subtxn;
1509 
1510  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1511 
1512  /*
1513  * Subtransactions are always associated to the toplevel TXN, even if
1514  * they originally were happening inside another subtxn, so we won't
1515  * ever recurse more than one level deep here.
1516  */
1517  Assert(rbtxn_is_known_subxact(subtxn));
1518  Assert(subtxn->nsubtxns == 0);
1519 
1520  ReorderBufferCleanupTXN(rb, subtxn);
1521  }
1522 
1523  /* cleanup changes in the txn */
1524  dlist_foreach_modify(iter, &txn->changes)
1525  {
1526  ReorderBufferChange *change;
1527 
1528  change = dlist_container(ReorderBufferChange, node, iter.cur);
1529 
1530  /* Check we're not mixing changes from different transactions. */
1531  Assert(change->txn == txn);
1532 
1533  ReorderBufferReturnChange(rb, change, true);
1534  }
1535 
1536  /*
1537  * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1538  * They are always stored in the toplevel transaction.
1539  */
1540  dlist_foreach_modify(iter, &txn->tuplecids)
1541  {
1542  ReorderBufferChange *change;
1543 
1544  change = dlist_container(ReorderBufferChange, node, iter.cur);
1545 
1546  /* Check we're not mixing changes from different transactions. */
1547  Assert(change->txn == txn);
1549 
1550  ReorderBufferReturnChange(rb, change, true);
1551  }
1552 
1553  /*
1554  * Cleanup the base snapshot, if set.
1555  */
1556  if (txn->base_snapshot != NULL)
1557  {
1560  }
1561 
1562  /*
1563  * Cleanup the snapshot for the last streamed run.
1564  */
1565  if (txn->snapshot_now != NULL)
1566  {
1567  Assert(rbtxn_is_streamed(txn));
1569  }
1570 
1571  /*
1572  * Remove TXN from its containing lists.
1573  *
1574  * Note: if txn is known as subxact, we are deleting the TXN from its
1575  * parent's list of known subxacts; this leaves the parent's nsubxacts
1576  * count too high, but we don't care. Otherwise, we are deleting the TXN
1577  * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1578  * list of catalog modifying transactions as well.
1579  */
1580  dlist_delete(&txn->node);
1581  if (rbtxn_has_catalog_changes(txn))
1583 
1584  /* now remove reference from buffer */
1585  hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1586  Assert(found);
1587 
1588  /* remove entries spilled to disk */
1589  if (rbtxn_is_serialized(txn))
1590  ReorderBufferRestoreCleanup(rb, txn);
1591 
1592  /* deallocate */
1593  ReorderBufferReturnTXN(rb, txn);
1594 }
1595 
1596 /*
1597  * Discard changes from a transaction (and subtransactions), either after
1598  * streaming or decoding them at PREPARE. Keep the remaining info -
1599  * transactions, tuplecids, invalidations and snapshots.
1600  *
1601  * We additionally remove tuplecids after decoding the transaction at prepare
1602  * time as we only need to perform invalidation at rollback or commit prepared.
1603  *
1604  * 'txn_prepared' indicates that we have decoded the transaction at prepare
1605  * time.
1606  */
1607 static void
1609 {
1610  dlist_mutable_iter iter;
1611 
1612  /* cleanup subtransactions & their changes */
1613  dlist_foreach_modify(iter, &txn->subtxns)
1614  {
1615  ReorderBufferTXN *subtxn;
1616 
1617  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1618 
1619  /*
1620  * Subtransactions are always associated to the toplevel TXN, even if
1621  * they originally were happening inside another subtxn, so we won't
1622  * ever recurse more than one level deep here.
1623  */
1624  Assert(rbtxn_is_known_subxact(subtxn));
1625  Assert(subtxn->nsubtxns == 0);
1626 
1627  ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1628  }
1629 
1630  /* cleanup changes in the txn */
1631  dlist_foreach_modify(iter, &txn->changes)
1632  {
1633  ReorderBufferChange *change;
1634 
1635  change = dlist_container(ReorderBufferChange, node, iter.cur);
1636 
1637  /* Check we're not mixing changes from different transactions. */
1638  Assert(change->txn == txn);
1639 
1640  /* remove the change from it's containing list */
1641  dlist_delete(&change->node);
1642 
1643  ReorderBufferReturnChange(rb, change, true);
1644  }
1645 
1646  /*
1647  * Mark the transaction as streamed.
1648  *
1649  * The top-level transaction, is marked as streamed always, even if it
1650  * does not contain any changes (that is, when all the changes are in
1651  * subtransactions).
1652  *
1653  * For subtransactions, we only mark them as streamed when there are
1654  * changes in them.
1655  *
1656  * We do it this way because of aborts - we don't want to send aborts for
1657  * XIDs the downstream is not aware of. And of course, it always knows
1658  * about the toplevel xact (we send the XID in all messages), but we never
1659  * stream XIDs of empty subxacts.
1660  */
1661  if ((!txn_prepared) && (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0)))
1662  txn->txn_flags |= RBTXN_IS_STREAMED;
1663 
1664  if (txn_prepared)
1665  {
1666  /*
1667  * If this is a prepared txn, cleanup the tuplecids we stored for
1668  * decoding catalog snapshot access. They are always stored in the
1669  * toplevel transaction.
1670  */
1671  dlist_foreach_modify(iter, &txn->tuplecids)
1672  {
1673  ReorderBufferChange *change;
1674 
1675  change = dlist_container(ReorderBufferChange, node, iter.cur);
1676 
1677  /* Check we're not mixing changes from different transactions. */
1678  Assert(change->txn == txn);
1680 
1681  /* Remove the change from its containing list. */
1682  dlist_delete(&change->node);
1683 
1684  ReorderBufferReturnChange(rb, change, true);
1685  }
1686  }
1687 
1688  /*
1689  * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1690  * memory. We could also keep the hash table and update it with new ctid
1691  * values, but this seems simpler and good enough for now.
1692  */
1693  if (txn->tuplecid_hash != NULL)
1694  {
1696  txn->tuplecid_hash = NULL;
1697  }
1698 
1699  /* If this txn is serialized then clean the disk space. */
1700  if (rbtxn_is_serialized(txn))
1701  {
1702  ReorderBufferRestoreCleanup(rb, txn);
1703  txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1704 
1705  /*
1706  * We set this flag to indicate if the transaction is ever serialized.
1707  * We need this to accurately update the stats as otherwise the same
1708  * transaction can be counted as serialized multiple times.
1709  */
1711  }
1712 
1713  /* also reset the number of entries in the transaction */
1714  txn->nentries_mem = 0;
1715  txn->nentries = 0;
1716 }
1717 
1718 /*
1719  * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1720  * HeapTupleSatisfiesHistoricMVCC.
1721  */
1722 static void
1724 {
1725  dlist_iter iter;
1726  HASHCTL hash_ctl;
1727 
1729  return;
1730 
1731  hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1732  hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1733  hash_ctl.hcxt = rb->context;
1734 
1735  /*
1736  * create the hash with the exact number of to-be-stored tuplecids from
1737  * the start
1738  */
1739  txn->tuplecid_hash =
1740  hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1742 
1743  dlist_foreach(iter, &txn->tuplecids)
1744  {
1747  bool found;
1748  ReorderBufferChange *change;
1749 
1750  change = dlist_container(ReorderBufferChange, node, iter.cur);
1751 
1753 
1754  /* be careful about padding */
1755  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1756 
1757  key.rlocator = change->data.tuplecid.locator;
1758 
1759  ItemPointerCopy(&change->data.tuplecid.tid,
1760  &key.tid);
1761 
1762  ent = (ReorderBufferTupleCidEnt *)
1763  hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1764  if (!found)
1765  {
1766  ent->cmin = change->data.tuplecid.cmin;
1767  ent->cmax = change->data.tuplecid.cmax;
1768  ent->combocid = change->data.tuplecid.combocid;
1769  }
1770  else
1771  {
1772  /*
1773  * Maybe we already saw this tuple before in this transaction, but
1774  * if so it must have the same cmin.
1775  */
1776  Assert(ent->cmin == change->data.tuplecid.cmin);
1777 
1778  /*
1779  * cmax may be initially invalid, but once set it can only grow,
1780  * and never become invalid again.
1781  */
1782  Assert((ent->cmax == InvalidCommandId) ||
1783  ((change->data.tuplecid.cmax != InvalidCommandId) &&
1784  (change->data.tuplecid.cmax > ent->cmax)));
1785  ent->cmax = change->data.tuplecid.cmax;
1786  }
1787  }
1788 }
1789 
1790 /*
1791  * Copy a provided snapshot so we can modify it privately. This is needed so
1792  * that catalog modifying transactions can look into intermediate catalog
1793  * states.
1794  */
1795 static Snapshot
1797  ReorderBufferTXN *txn, CommandId cid)
1798 {
1799  Snapshot snap;
1800  dlist_iter iter;
1801  int i = 0;
1802  Size size;
1803 
1804  size = sizeof(SnapshotData) +
1805  sizeof(TransactionId) * orig_snap->xcnt +
1806  sizeof(TransactionId) * (txn->nsubtxns + 1);
1807 
1808  snap = MemoryContextAllocZero(rb->context, size);
1809  memcpy(snap, orig_snap, sizeof(SnapshotData));
1810 
1811  snap->copied = true;
1812  snap->active_count = 1; /* mark as active so nobody frees it */
1813  snap->regd_count = 0;
1814  snap->xip = (TransactionId *) (snap + 1);
1815 
1816  memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1817 
1818  /*
1819  * snap->subxip contains all txids that belong to our transaction which we
1820  * need to check via cmin/cmax. That's why we store the toplevel
1821  * transaction in there as well.
1822  */
1823  snap->subxip = snap->xip + snap->xcnt;
1824  snap->subxip[i++] = txn->xid;
1825 
1826  /*
1827  * subxcnt isn't decreased when subtransactions abort, so count manually.
1828  * Since it's an upper boundary it is safe to use it for the allocation
1829  * above.
1830  */
1831  snap->subxcnt = 1;
1832 
1833  dlist_foreach(iter, &txn->subtxns)
1834  {
1835  ReorderBufferTXN *sub_txn;
1836 
1837  sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1838  snap->subxip[i++] = sub_txn->xid;
1839  snap->subxcnt++;
1840  }
1841 
1842  /* sort so we can bsearch() later */
1843  qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1844 
1845  /* store the specified current CommandId */
1846  snap->curcid = cid;
1847 
1848  return snap;
1849 }
1850 
1851 /*
1852  * Free a previously ReorderBufferCopySnap'ed snapshot
1853  */
1854 static void
1856 {
1857  if (snap->copied)
1858  pfree(snap);
1859  else
1861 }
1862 
1863 /*
1864  * If the transaction was (partially) streamed, we need to prepare or commit
1865  * it in a 'streamed' way. That is, we first stream the remaining part of the
1866  * transaction, and then invoke stream_prepare or stream_commit message as per
1867  * the case.
1868  */
1869 static void
1871 {
1872  /* we should only call this for previously streamed transactions */
1873  Assert(rbtxn_is_streamed(txn));
1874 
1875  ReorderBufferStreamTXN(rb, txn);
1876 
1877  if (rbtxn_prepared(txn))
1878  {
1879  /*
1880  * Note, we send stream prepare even if a concurrent abort is
1881  * detected. See DecodePrepare for more information.
1882  */
1883  rb->stream_prepare(rb, txn, txn->final_lsn);
1884 
1885  /*
1886  * This is a PREPARED transaction, part of a two-phase commit. The
1887  * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1888  * just truncate txn by removing changes and tuplecids.
1889  */
1890  ReorderBufferTruncateTXN(rb, txn, true);
1891  /* Reset the CheckXidAlive */
1893  }
1894  else
1895  {
1896  rb->stream_commit(rb, txn, txn->final_lsn);
1897  ReorderBufferCleanupTXN(rb, txn);
1898  }
1899 }
1900 
1901 /*
1902  * Set xid to detect concurrent aborts.
1903  *
1904  * While streaming an in-progress transaction or decoding a prepared
1905  * transaction there is a possibility that the (sub)transaction might get
1906  * aborted concurrently. In such case if the (sub)transaction has catalog
1907  * update then we might decode the tuple using wrong catalog version. For
1908  * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1909  * the transaction 501 updates the catalog tuple and after that we will have
1910  * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1911  * aborted and some other transaction say 502 updates the same catalog tuple
1912  * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1913  * problem is that when we try to decode the tuple inserted/updated in 501
1914  * after the catalog update, we will see the catalog tuple with (xmin: 500,
1915  * xmax: 502) as visible because it will consider that the tuple is deleted by
1916  * xid 502 which is not visible to our snapshot. And when we will try to
1917  * decode with that catalog tuple, it can lead to a wrong result or a crash.
1918  * So, it is necessary to detect concurrent aborts to allow streaming of
1919  * in-progress transactions or decoding of prepared transactions.
1920  *
1921  * For detecting the concurrent abort we set CheckXidAlive to the current
1922  * (sub)transaction's xid for which this change belongs to. And, during
1923  * catalog scan we can check the status of the xid and if it is aborted we will
1924  * report a specific error so that we can stop streaming current transaction
1925  * and discard the already streamed changes on such an error. We might have
1926  * already streamed some of the changes for the aborted (sub)transaction, but
1927  * that is fine because when we decode the abort we will stream abort message
1928  * to truncate the changes in the subscriber. Similarly, for prepared
1929  * transactions, we stop decoding if concurrent abort is detected and then
1930  * rollback the changes when rollback prepared is encountered. See
1931  * DecodePrepare.
1932  */
1933 static inline void
1935 {
1936  /*
1937  * If the input transaction id is already set as a CheckXidAlive then
1938  * nothing to do.
1939  */
1941  return;
1942 
1943  /*
1944  * setup CheckXidAlive if it's not committed yet. We don't check if the
1945  * xid is aborted. That will happen during catalog access.
1946  */
1947  if (!TransactionIdDidCommit(xid))
1948  CheckXidAlive = xid;
1949  else
1951 }
1952 
1953 /*
1954  * Helper function for ReorderBufferProcessTXN for applying change.
1955  */
1956 static inline void
1958  Relation relation, ReorderBufferChange *change,
1959  bool streaming)
1960 {
1961  if (streaming)
1962  rb->stream_change(rb, txn, relation, change);
1963  else
1964  rb->apply_change(rb, txn, relation, change);
1965 }
1966 
1967 /*
1968  * Helper function for ReorderBufferProcessTXN for applying the truncate.
1969  */
1970 static inline void
1972  int nrelations, Relation *relations,
1973  ReorderBufferChange *change, bool streaming)
1974 {
1975  if (streaming)
1976  rb->stream_truncate(rb, txn, nrelations, relations, change);
1977  else
1978  rb->apply_truncate(rb, txn, nrelations, relations, change);
1979 }
1980 
1981 /*
1982  * Helper function for ReorderBufferProcessTXN for applying the message.
1983  */
1984 static inline void
1986  ReorderBufferChange *change, bool streaming)
1987 {
1988  if (streaming)
1989  rb->stream_message(rb, txn, change->lsn, true,
1990  change->data.msg.prefix,
1991  change->data.msg.message_size,
1992  change->data.msg.message);
1993  else
1994  rb->message(rb, txn, change->lsn, true,
1995  change->data.msg.prefix,
1996  change->data.msg.message_size,
1997  change->data.msg.message);
1998 }
1999 
2000 /*
2001  * Function to store the command id and snapshot at the end of the current
2002  * stream so that we can reuse the same while sending the next stream.
2003  */
2004 static inline void
2006  Snapshot snapshot_now, CommandId command_id)
2007 {
2008  txn->command_id = command_id;
2009 
2010  /* Avoid copying if it's already copied. */
2011  if (snapshot_now->copied)
2012  txn->snapshot_now = snapshot_now;
2013  else
2014  txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2015  txn, command_id);
2016 }
2017 
2018 /*
2019  * Helper function for ReorderBufferProcessTXN to handle the concurrent
2020  * abort of the streaming transaction. This resets the TXN such that it
2021  * can be used to stream the remaining data of transaction being processed.
2022  * This can happen when the subtransaction is aborted and we still want to
2023  * continue processing the main or other subtransactions data.
2024  */
2025 static void
2027  Snapshot snapshot_now,
2028  CommandId command_id,
2029  XLogRecPtr last_lsn,
2030  ReorderBufferChange *specinsert)
2031 {
2032  /* Discard the changes that we just streamed */
2034 
2035  /* Free all resources allocated for toast reconstruction */
2036  ReorderBufferToastReset(rb, txn);
2037 
2038  /* Return the spec insert change if it is not NULL */
2039  if (specinsert != NULL)
2040  {
2041  ReorderBufferReturnChange(rb, specinsert, true);
2042  specinsert = NULL;
2043  }
2044 
2045  /*
2046  * For the streaming case, stop the stream and remember the command ID and
2047  * snapshot for the streaming run.
2048  */
2049  if (rbtxn_is_streamed(txn))
2050  {
2051  rb->stream_stop(rb, txn, last_lsn);
2052  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2053  }
2054 }
2055 
2056 /*
2057  * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2058  *
2059  * Send data of a transaction (and its subtransactions) to the
2060  * output plugin. We iterate over the top and subtransactions (using a k-way
2061  * merge) and replay the changes in lsn order.
2062  *
2063  * If streaming is true then data will be sent using stream API.
2064  *
2065  * Note: "volatile" markers on some parameters are to avoid trouble with
2066  * PG_TRY inside the function.
2067  */
2068 static void
2070  XLogRecPtr commit_lsn,
2071  volatile Snapshot snapshot_now,
2072  volatile CommandId command_id,
2073  bool streaming)
2074 {
2075  bool using_subtxn;
2077  ReorderBufferIterTXNState *volatile iterstate = NULL;
2078  volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2079  ReorderBufferChange *volatile specinsert = NULL;
2080  volatile bool stream_started = false;
2081  ReorderBufferTXN *volatile curtxn = NULL;
2082 
2083  /* build data to be able to lookup the CommandIds of catalog tuples */
2085 
2086  /* setup the initial snapshot */
2087  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2088 
2089  /*
2090  * Decoding needs access to syscaches et al., which in turn use
2091  * heavyweight locks and such. Thus we need to have enough state around to
2092  * keep track of those. The easiest way is to simply use a transaction
2093  * internally. That also allows us to easily enforce that nothing writes
2094  * to the database by checking for xid assignments.
2095  *
2096  * When we're called via the SQL SRF there's already a transaction
2097  * started, so start an explicit subtransaction there.
2098  */
2099  using_subtxn = IsTransactionOrTransactionBlock();
2100 
2101  PG_TRY();
2102  {
2103  ReorderBufferChange *change;
2104  int changes_count = 0; /* used to accumulate the number of
2105  * changes */
2106 
2107  if (using_subtxn)
2108  BeginInternalSubTransaction(streaming ? "stream" : "replay");
2109  else
2111 
2112  /*
2113  * We only need to send begin/begin-prepare for non-streamed
2114  * transactions.
2115  */
2116  if (!streaming)
2117  {
2118  if (rbtxn_prepared(txn))
2119  rb->begin_prepare(rb, txn);
2120  else
2121  rb->begin(rb, txn);
2122  }
2123 
2124  ReorderBufferIterTXNInit(rb, txn, &iterstate);
2125  while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2126  {
2127  Relation relation = NULL;
2128  Oid reloid;
2129 
2131 
2132  /*
2133  * We can't call start stream callback before processing first
2134  * change.
2135  */
2136  if (prev_lsn == InvalidXLogRecPtr)
2137  {
2138  if (streaming)
2139  {
2140  txn->origin_id = change->origin_id;
2141  rb->stream_start(rb, txn, change->lsn);
2142  stream_started = true;
2143  }
2144  }
2145 
2146  /*
2147  * Enforce correct ordering of changes, merged from multiple
2148  * subtransactions. The changes may have the same LSN due to
2149  * MULTI_INSERT xlog records.
2150  */
2151  Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2152 
2153  prev_lsn = change->lsn;
2154 
2155  /*
2156  * Set the current xid to detect concurrent aborts. This is
2157  * required for the cases when we decode the changes before the
2158  * COMMIT record is processed.
2159  */
2160  if (streaming || rbtxn_prepared(change->txn))
2161  {
2162  curtxn = change->txn;
2163  SetupCheckXidLive(curtxn->xid);
2164  }
2165 
2166  switch (change->action)
2167  {
2169 
2170  /*
2171  * Confirmation for speculative insertion arrived. Simply
2172  * use as a normal record. It'll be cleaned up at the end
2173  * of INSERT processing.
2174  */
2175  if (specinsert == NULL)
2176  elog(ERROR, "invalid ordering of speculative insertion changes");
2177  Assert(specinsert->data.tp.oldtuple == NULL);
2178  change = specinsert;
2180 
2181  /* intentionally fall through */
2185  Assert(snapshot_now);
2186 
2187  reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2188  change->data.tp.rlocator.relNumber);
2189 
2190  /*
2191  * Mapped catalog tuple without data, emitted while
2192  * catalog table was in the process of being rewritten. We
2193  * can fail to look up the relfilenumber, because the
2194  * relmapper has no "historic" view, in contrast to the
2195  * normal catalog during decoding. Thus repeated rewrites
2196  * can cause a lookup failure. That's OK because we do not
2197  * decode catalog changes anyway. Normally such tuples
2198  * would be skipped over below, but we can't identify
2199  * whether the table should be logically logged without
2200  * mapping the relfilenumber to the oid.
2201  */
2202  if (reloid == InvalidOid &&
2203  change->data.tp.newtuple == NULL &&
2204  change->data.tp.oldtuple == NULL)
2205  goto change_done;
2206  else if (reloid == InvalidOid)
2207  elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2208  relpathperm(change->data.tp.rlocator,
2209  MAIN_FORKNUM));
2210 
2211  relation = RelationIdGetRelation(reloid);
2212 
2213  if (!RelationIsValid(relation))
2214  elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2215  reloid,
2216  relpathperm(change->data.tp.rlocator,
2217  MAIN_FORKNUM));
2218 
2219  if (!RelationIsLogicallyLogged(relation))
2220  goto change_done;
2221 
2222  /*
2223  * Ignore temporary heaps created during DDL unless the
2224  * plugin has asked for them.
2225  */
2226  if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2227  goto change_done;
2228 
2229  /*
2230  * For now ignore sequence changes entirely. Most of the
2231  * time they don't log changes using records we
2232  * understand, so it doesn't make sense to handle the few
2233  * cases we do.
2234  */
2235  if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2236  goto change_done;
2237 
2238  /* user-triggered change */
2239  if (!IsToastRelation(relation))
2240  {
2241  ReorderBufferToastReplace(rb, txn, relation, change);
2242  ReorderBufferApplyChange(rb, txn, relation, change,
2243  streaming);
2244 
2245  /*
2246  * Only clear reassembled toast chunks if we're sure
2247  * they're not required anymore. The creator of the
2248  * tuple tells us.
2249  */
2250  if (change->data.tp.clear_toast_afterwards)
2251  ReorderBufferToastReset(rb, txn);
2252  }
2253  /* we're not interested in toast deletions */
2254  else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2255  {
2256  /*
2257  * Need to reassemble the full toasted Datum in
2258  * memory, to ensure the chunks don't get reused till
2259  * we're done remove it from the list of this
2260  * transaction's changes. Otherwise it will get
2261  * freed/reused while restoring spooled data from
2262  * disk.
2263  */
2264  Assert(change->data.tp.newtuple != NULL);
2265 
2266  dlist_delete(&change->node);
2267  ReorderBufferToastAppendChunk(rb, txn, relation,
2268  change);
2269  }
2270 
2271  change_done:
2272 
2273  /*
2274  * If speculative insertion was confirmed, the record
2275  * isn't needed anymore.
2276  */
2277  if (specinsert != NULL)
2278  {
2279  ReorderBufferReturnChange(rb, specinsert, true);
2280  specinsert = NULL;
2281  }
2282 
2283  if (RelationIsValid(relation))
2284  {
2285  RelationClose(relation);
2286  relation = NULL;
2287  }
2288  break;
2289 
2291 
2292  /*
2293  * Speculative insertions are dealt with by delaying the
2294  * processing of the insert until the confirmation record
2295  * arrives. For that we simply unlink the record from the
2296  * chain, so it does not get freed/reused while restoring
2297  * spooled data from disk.
2298  *
2299  * This is safe in the face of concurrent catalog changes
2300  * because the relevant relation can't be changed between
2301  * speculative insertion and confirmation due to
2302  * CheckTableNotInUse() and locking.
2303  */
2304 
2305  /* clear out a pending (and thus failed) speculation */
2306  if (specinsert != NULL)
2307  {
2308  ReorderBufferReturnChange(rb, specinsert, true);
2309  specinsert = NULL;
2310  }
2311 
2312  /* and memorize the pending insertion */
2313  dlist_delete(&change->node);
2314  specinsert = change;
2315  break;
2316 
2318 
2319  /*
2320  * Abort for speculative insertion arrived. So cleanup the
2321  * specinsert tuple and toast hash.
2322  *
2323  * Note that we get the spec abort change for each toast
2324  * entry but we need to perform the cleanup only the first
2325  * time we get it for the main table.
2326  */
2327  if (specinsert != NULL)
2328  {
2329  /*
2330  * We must clean the toast hash before processing a
2331  * completely new tuple to avoid confusion about the
2332  * previous tuple's toast chunks.
2333  */
2334  Assert(change->data.tp.clear_toast_afterwards);
2335  ReorderBufferToastReset(rb, txn);
2336 
2337  /* We don't need this record anymore. */
2338  ReorderBufferReturnChange(rb, specinsert, true);
2339  specinsert = NULL;
2340  }
2341  break;
2342 
2344  {
2345  int i;
2346  int nrelids = change->data.truncate.nrelids;
2347  int nrelations = 0;
2348  Relation *relations;
2349 
2350  relations = palloc0(nrelids * sizeof(Relation));
2351  for (i = 0; i < nrelids; i++)
2352  {
2353  Oid relid = change->data.truncate.relids[i];
2354  Relation rel;
2355 
2356  rel = RelationIdGetRelation(relid);
2357 
2358  if (!RelationIsValid(rel))
2359  elog(ERROR, "could not open relation with OID %u", relid);
2360 
2361  if (!RelationIsLogicallyLogged(rel))
2362  continue;
2363 
2364  relations[nrelations++] = rel;
2365  }
2366 
2367  /* Apply the truncate. */
2368  ReorderBufferApplyTruncate(rb, txn, nrelations,
2369  relations, change,
2370  streaming);
2371 
2372  for (i = 0; i < nrelations; i++)
2373  RelationClose(relations[i]);
2374 
2375  break;
2376  }
2377 
2379  ReorderBufferApplyMessage(rb, txn, change, streaming);
2380  break;
2381 
2383  /* Execute the invalidation messages locally */
2384  ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2385  change->data.inval.invalidations);
2386  break;
2387 
2389  /* get rid of the old */
2390  TeardownHistoricSnapshot(false);
2391 
2392  if (snapshot_now->copied)
2393  {
2394  ReorderBufferFreeSnap(rb, snapshot_now);
2395  snapshot_now =
2396  ReorderBufferCopySnap(rb, change->data.snapshot,
2397  txn, command_id);
2398  }
2399 
2400  /*
2401  * Restored from disk, need to be careful not to double
2402  * free. We could introduce refcounting for that, but for
2403  * now this seems infrequent enough not to care.
2404  */
2405  else if (change->data.snapshot->copied)
2406  {
2407  snapshot_now =
2408  ReorderBufferCopySnap(rb, change->data.snapshot,
2409  txn, command_id);
2410  }
2411  else
2412  {
2413  snapshot_now = change->data.snapshot;
2414  }
2415 
2416  /* and continue with the new one */
2417  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2418  break;
2419 
2421  Assert(change->data.command_id != InvalidCommandId);
2422 
2423  if (command_id < change->data.command_id)
2424  {
2425  command_id = change->data.command_id;
2426 
2427  if (!snapshot_now->copied)
2428  {
2429  /* we don't use the global one anymore */
2430  snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2431  txn, command_id);
2432  }
2433 
2434  snapshot_now->curcid = command_id;
2435 
2436  TeardownHistoricSnapshot(false);
2437  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2438  }
2439 
2440  break;
2441 
2443  elog(ERROR, "tuplecid value in changequeue");
2444  break;
2445  }
2446 
2447  /*
2448  * It is possible that the data is not sent to downstream for a
2449  * long time either because the output plugin filtered it or there
2450  * is a DDL that generates a lot of data that is not processed by
2451  * the plugin. So, in such cases, the downstream can timeout. To
2452  * avoid that we try to send a keepalive message if required.
2453  * Trying to send a keepalive message after every change has some
2454  * overhead, but testing showed there is no noticeable overhead if
2455  * we do it after every ~100 changes.
2456  */
2457 #define CHANGES_THRESHOLD 100
2458 
2459  if (++changes_count >= CHANGES_THRESHOLD)
2460  {
2461  rb->update_progress_txn(rb, txn, change->lsn);
2462  changes_count = 0;
2463  }
2464  }
2465 
2466  /* speculative insertion record must be freed by now */
2467  Assert(!specinsert);
2468 
2469  /* clean up the iterator */
2470  ReorderBufferIterTXNFinish(rb, iterstate);
2471  iterstate = NULL;
2472 
2473  /*
2474  * Update total transaction count and total bytes processed by the
2475  * transaction and its subtransactions. Ensure to not count the
2476  * streamed transaction multiple times.
2477  *
2478  * Note that the statistics computation has to be done after
2479  * ReorderBufferIterTXNFinish as it releases the serialized change
2480  * which we have already accounted in ReorderBufferIterTXNNext.
2481  */
2482  if (!rbtxn_is_streamed(txn))
2483  rb->totalTxns++;
2484 
2485  rb->totalBytes += txn->total_size;
2486 
2487  /*
2488  * Done with current changes, send the last message for this set of
2489  * changes depending upon streaming mode.
2490  */
2491  if (streaming)
2492  {
2493  if (stream_started)
2494  {
2495  rb->stream_stop(rb, txn, prev_lsn);
2496  stream_started = false;
2497  }
2498  }
2499  else
2500  {
2501  /*
2502  * Call either PREPARE (for two-phase transactions) or COMMIT (for
2503  * regular ones).
2504  */
2505  if (rbtxn_prepared(txn))
2506  rb->prepare(rb, txn, commit_lsn);
2507  else
2508  rb->commit(rb, txn, commit_lsn);
2509  }
2510 
2511  /* this is just a sanity check against bad output plugin behaviour */
2513  elog(ERROR, "output plugin used XID %u",
2515 
2516  /*
2517  * Remember the command ID and snapshot for the next set of changes in
2518  * streaming mode.
2519  */
2520  if (streaming)
2521  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2522  else if (snapshot_now->copied)
2523  ReorderBufferFreeSnap(rb, snapshot_now);
2524 
2525  /* cleanup */
2526  TeardownHistoricSnapshot(false);
2527 
2528  /*
2529  * Aborting the current (sub-)transaction as a whole has the right
2530  * semantics. We want all locks acquired in here to be released, not
2531  * reassigned to the parent and we do not want any database access
2532  * have persistent effects.
2533  */
2535 
2536  /* make sure there's no cache pollution */
2538 
2539  if (using_subtxn)
2541 
2542  /*
2543  * We are here due to one of the four reasons: 1. Decoding an
2544  * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2545  * prepared txn that was (partially) streamed. 4. Decoding a committed
2546  * txn.
2547  *
2548  * For 1, we allow truncation of txn data by removing the changes
2549  * already streamed but still keeping other things like invalidations,
2550  * snapshot, and tuplecids. For 2 and 3, we indicate
2551  * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2552  * data as the entire transaction has been decoded except for commit.
2553  * For 4, as the entire txn has been decoded, we can fully clean up
2554  * the TXN reorder buffer.
2555  */
2556  if (streaming || rbtxn_prepared(txn))
2557  {
2559  /* Reset the CheckXidAlive */
2561  }
2562  else
2563  ReorderBufferCleanupTXN(rb, txn);
2564  }
2565  PG_CATCH();
2566  {
2567  MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2568  ErrorData *errdata = CopyErrorData();
2569 
2570  /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2571  if (iterstate)
2572  ReorderBufferIterTXNFinish(rb, iterstate);
2573 
2575 
2576  /*
2577  * Force cache invalidation to happen outside of a valid transaction
2578  * to prevent catalog access as we just caught an error.
2579  */
2581 
2582  /* make sure there's no cache pollution */
2584  txn->invalidations);
2585 
2586  if (using_subtxn)
2588 
2589  /*
2590  * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2591  * abort of the (sub)transaction we are streaming or preparing. We
2592  * need to do the cleanup and return gracefully on this error, see
2593  * SetupCheckXidLive.
2594  *
2595  * This error code can be thrown by one of the callbacks we call
2596  * during decoding so we need to ensure that we return gracefully only
2597  * when we are sending the data in streaming mode and the streaming is
2598  * not finished yet or when we are sending the data out on a PREPARE
2599  * during a two-phase commit.
2600  */
2601  if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2602  (stream_started || rbtxn_prepared(txn)))
2603  {
2604  /* curtxn must be set for streaming or prepared transactions */
2605  Assert(curtxn);
2606 
2607  /* Cleanup the temporary error state. */
2608  FlushErrorState();
2609  FreeErrorData(errdata);
2610  errdata = NULL;
2611  curtxn->concurrent_abort = true;
2612 
2613  /* Reset the TXN so that it is allowed to stream remaining data. */
2614  ReorderBufferResetTXN(rb, txn, snapshot_now,
2615  command_id, prev_lsn,
2616  specinsert);
2617  }
2618  else
2619  {
2620  ReorderBufferCleanupTXN(rb, txn);
2621  MemoryContextSwitchTo(ecxt);
2622  PG_RE_THROW();
2623  }
2624  }
2625  PG_END_TRY();
2626 }
2627 
2628 /*
2629  * Perform the replay of a transaction and its non-aborted subtransactions.
2630  *
2631  * Subtransactions previously have to be processed by
2632  * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2633  * transaction with ReorderBufferAssignChild.
2634  *
2635  * This interface is called once a prepare or toplevel commit is read for both
2636  * streamed as well as non-streamed transactions.
2637  */
2638 static void
2640  ReorderBuffer *rb, TransactionId xid,
2641  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2642  TimestampTz commit_time,
2643  RepOriginId origin_id, XLogRecPtr origin_lsn)
2644 {
2645  Snapshot snapshot_now;
2646  CommandId command_id = FirstCommandId;
2647 
2648  txn->final_lsn = commit_lsn;
2649  txn->end_lsn = end_lsn;
2650  txn->xact_time.commit_time = commit_time;
2651  txn->origin_id = origin_id;
2652  txn->origin_lsn = origin_lsn;
2653 
2654  /*
2655  * If the transaction was (partially) streamed, we need to commit it in a
2656  * 'streamed' way. That is, we first stream the remaining part of the
2657  * transaction, and then invoke stream_commit message.
2658  *
2659  * Called after everything (origin ID, LSN, ...) is stored in the
2660  * transaction to avoid passing that information directly.
2661  */
2662  if (rbtxn_is_streamed(txn))
2663  {
2664  ReorderBufferStreamCommit(rb, txn);
2665  return;
2666  }
2667 
2668  /*
2669  * If this transaction has no snapshot, it didn't make any changes to the
2670  * database, so there's nothing to decode. Note that
2671  * ReorderBufferCommitChild will have transferred any snapshots from
2672  * subtransactions if there were any.
2673  */
2674  if (txn->base_snapshot == NULL)
2675  {
2676  Assert(txn->ninvalidations == 0);
2677 
2678  /*
2679  * Removing this txn before a commit might result in the computation
2680  * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2681  */
2682  if (!rbtxn_prepared(txn))
2683  ReorderBufferCleanupTXN(rb, txn);
2684  return;
2685  }
2686 
2687  snapshot_now = txn->base_snapshot;
2688 
2689  /* Process and send the changes to output plugin. */
2690  ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2691  command_id, false);
2692 }
2693 
2694 /*
2695  * Commit a transaction.
2696  *
2697  * See comments for ReorderBufferReplay().
2698  */
2699 void
2701  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2702  TimestampTz commit_time,
2703  RepOriginId origin_id, XLogRecPtr origin_lsn)
2704 {
2705  ReorderBufferTXN *txn;
2706 
2707  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2708  false);
2709 
2710  /* unknown transaction, nothing to replay */
2711  if (txn == NULL)
2712  return;
2713 
2714  ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2715  origin_id, origin_lsn);
2716 }
2717 
2718 /*
2719  * Record the prepare information for a transaction.
2720  */
2721 bool
2723  XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2724  TimestampTz prepare_time,
2725  RepOriginId origin_id, XLogRecPtr origin_lsn)
2726 {
2727  ReorderBufferTXN *txn;
2728 
2729  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2730 
2731  /* unknown transaction, nothing to do */
2732  if (txn == NULL)
2733  return false;
2734 
2735  /*
2736  * Remember the prepare information to be later used by commit prepared in
2737  * case we skip doing prepare.
2738  */
2739  txn->final_lsn = prepare_lsn;
2740  txn->end_lsn = end_lsn;
2741  txn->xact_time.prepare_time = prepare_time;
2742  txn->origin_id = origin_id;
2743  txn->origin_lsn = origin_lsn;
2744 
2745  return true;
2746 }
2747 
2748 /* Remember that we have skipped prepare */
2749 void
2751 {
2752  ReorderBufferTXN *txn;
2753 
2754  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2755 
2756  /* unknown transaction, nothing to do */
2757  if (txn == NULL)
2758  return;
2759 
2761 }
2762 
2763 /*
2764  * Prepare a two-phase transaction.
2765  *
2766  * See comments for ReorderBufferReplay().
2767  */
2768 void
2770  char *gid)
2771 {
2772  ReorderBufferTXN *txn;
2773 
2774  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2775  false);
2776 
2777  /* unknown transaction, nothing to replay */
2778  if (txn == NULL)
2779  return;
2780 
2781  txn->txn_flags |= RBTXN_PREPARE;
2782  txn->gid = pstrdup(gid);
2783 
2784  /* The prepare info must have been updated in txn by now. */
2786 
2787  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2788  txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2789 
2790  /*
2791  * We send the prepare for the concurrently aborted xacts so that later
2792  * when rollback prepared is decoded and sent, the downstream should be
2793  * able to rollback such a xact. See comments atop DecodePrepare.
2794  *
2795  * Note, for the concurrent_abort + streaming case a stream_prepare was
2796  * already sent within the ReorderBufferReplay call above.
2797  */
2798  if (txn->concurrent_abort && !rbtxn_is_streamed(txn))
2799  rb->prepare(rb, txn, txn->final_lsn);
2800 }
2801 
2802 /*
2803  * This is used to handle COMMIT/ROLLBACK PREPARED.
2804  */
2805 void
2807  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2808  XLogRecPtr two_phase_at,
2809  TimestampTz commit_time, RepOriginId origin_id,
2810  XLogRecPtr origin_lsn, char *gid, bool is_commit)
2811 {
2812  ReorderBufferTXN *txn;
2813  XLogRecPtr prepare_end_lsn;
2814  TimestampTz prepare_time;
2815 
2816  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
2817 
2818  /* unknown transaction, nothing to do */
2819  if (txn == NULL)
2820  return;
2821 
2822  /*
2823  * By this time the txn has the prepare record information, remember it to
2824  * be later used for rollback.
2825  */
2826  prepare_end_lsn = txn->end_lsn;
2827  prepare_time = txn->xact_time.prepare_time;
2828 
2829  /* add the gid in the txn */
2830  txn->gid = pstrdup(gid);
2831 
2832  /*
2833  * It is possible that this transaction is not decoded at prepare time
2834  * either because by that time we didn't have a consistent snapshot, or
2835  * two_phase was not enabled, or it was decoded earlier but we have
2836  * restarted. We only need to send the prepare if it was not decoded
2837  * earlier. We don't need to decode the xact for aborts if it is not done
2838  * already.
2839  */
2840  if ((txn->final_lsn < two_phase_at) && is_commit)
2841  {
2842  txn->txn_flags |= RBTXN_PREPARE;
2843 
2844  /*
2845  * The prepare info must have been updated in txn even if we skip
2846  * prepare.
2847  */
2849 
2850  /*
2851  * By this time the txn has the prepare record information and it is
2852  * important to use that so that downstream gets the accurate
2853  * information. If instead, we have passed commit information here
2854  * then downstream can behave as it has already replayed commit
2855  * prepared after the restart.
2856  */
2857  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2858  txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2859  }
2860 
2861  txn->final_lsn = commit_lsn;
2862  txn->end_lsn = end_lsn;
2863  txn->xact_time.commit_time = commit_time;
2864  txn->origin_id = origin_id;
2865  txn->origin_lsn = origin_lsn;
2866 
2867  if (is_commit)
2868  rb->commit_prepared(rb, txn, commit_lsn);
2869  else
2870  rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2871 
2872  /* cleanup: make sure there's no cache pollution */
2874  txn->invalidations);
2875  ReorderBufferCleanupTXN(rb, txn);
2876 }
2877 
2878 /*
2879  * Abort a transaction that possibly has previous changes. Needs to be first
2880  * called for subtransactions and then for the toplevel xid.
2881  *
2882  * NB: Transactions handled here have to have actively aborted (i.e. have
2883  * produced an abort record). Implicitly aborted transactions are handled via
2884  * ReorderBufferAbortOld(); transactions we're just not interested in, but
2885  * which have committed are handled in ReorderBufferForget().
2886  *
2887  * This function purges this transaction and its contents from memory and
2888  * disk.
2889  */
2890 void
2892  TimestampTz abort_time)
2893 {
2894  ReorderBufferTXN *txn;
2895 
2896  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2897  false);
2898 
2899  /* unknown, nothing to remove */
2900  if (txn == NULL)
2901  return;
2902 
2903  txn->xact_time.abort_time = abort_time;
2904 
2905  /* For streamed transactions notify the remote node about the abort. */
2906  if (rbtxn_is_streamed(txn))
2907  {
2908  rb->stream_abort(rb, txn, lsn);
2909 
2910  /*
2911  * We might have decoded changes for this transaction that could load
2912  * the cache as per the current transaction's view (consider DDL's
2913  * happened in this transaction). We don't want the decoding of future
2914  * transactions to use those cache entries so execute invalidations.
2915  */
2916  if (txn->ninvalidations > 0)
2918  txn->invalidations);
2919  }
2920 
2921  /* cosmetic... */
2922  txn->final_lsn = lsn;
2923 
2924  /* remove potential on-disk data, and deallocate */
2925  ReorderBufferCleanupTXN(rb, txn);
2926 }
2927 
2928 /*
2929  * Abort all transactions that aren't actually running anymore because the
2930  * server restarted.
2931  *
2932  * NB: These really have to be transactions that have aborted due to a server
2933  * crash/immediate restart, as we don't deal with invalidations here.
2934  */
2935 void
2937 {
2938  dlist_mutable_iter it;
2939 
2940  /*
2941  * Iterate through all (potential) toplevel TXNs and abort all that are
2942  * older than what possibly can be running. Once we've found the first
2943  * that is alive we stop, there might be some that acquired an xid earlier
2944  * but started writing later, but it's unlikely and they will be cleaned
2945  * up in a later call to this function.
2946  */
2948  {
2949  ReorderBufferTXN *txn;
2950 
2951  txn = dlist_container(ReorderBufferTXN, node, it.cur);
2952 
2953  if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2954  {
2955  elog(DEBUG2, "aborting old transaction %u", txn->xid);
2956 
2957  /* Notify the remote node about the crash/immediate restart. */
2958  if (rbtxn_is_streamed(txn))
2959  rb->stream_abort(rb, txn, InvalidXLogRecPtr);
2960 
2961  /* remove potential on-disk data, and deallocate this tx */
2962  ReorderBufferCleanupTXN(rb, txn);
2963  }
2964  else
2965  return;
2966  }
2967 }
2968 
2969 /*
2970  * Forget the contents of a transaction if we aren't interested in its
2971  * contents. Needs to be first called for subtransactions and then for the
2972  * toplevel xid.
2973  *
2974  * This is significantly different to ReorderBufferAbort() because
2975  * transactions that have committed need to be treated differently from aborted
2976  * ones since they may have modified the catalog.
2977  *
2978  * Note that this is only allowed to be called in the moment a transaction
2979  * commit has just been read, not earlier; otherwise later records referring
2980  * to this xid might re-create the transaction incompletely.
2981  */
2982 void
2984 {
2985  ReorderBufferTXN *txn;
2986 
2987  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2988  false);
2989 
2990  /* unknown, nothing to forget */
2991  if (txn == NULL)
2992  return;
2993 
2994  /* this transaction mustn't be streamed */
2995  Assert(!rbtxn_is_streamed(txn));
2996 
2997  /* cosmetic... */
2998  txn->final_lsn = lsn;
2999 
3000  /*
3001  * Process cache invalidation messages if there are any. Even if we're not
3002  * interested in the transaction's contents, it could have manipulated the
3003  * catalog and we need to update the caches according to that.
3004  */
3005  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3007  txn->invalidations);
3008  else
3009  Assert(txn->ninvalidations == 0);
3010 
3011  /* remove potential on-disk data, and deallocate */
3012  ReorderBufferCleanupTXN(rb, txn);
3013 }
3014 
3015 /*
3016  * Invalidate cache for those transactions that need to be skipped just in case
3017  * catalogs were manipulated as part of the transaction.
3018  *
3019  * Note that this is a special-purpose function for prepared transactions where
3020  * we don't want to clean up the TXN even when we decide to skip it. See
3021  * DecodePrepare.
3022  */
3023 void
3025 {
3026  ReorderBufferTXN *txn;
3027 
3028  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3029  false);
3030 
3031  /* unknown, nothing to do */
3032  if (txn == NULL)
3033  return;
3034 
3035  /*
3036  * Process cache invalidation messages if there are any. Even if we're not
3037  * interested in the transaction's contents, it could have manipulated the
3038  * catalog and we need to update the caches according to that.
3039  */
3040  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3042  txn->invalidations);
3043  else
3044  Assert(txn->ninvalidations == 0);
3045 }
3046 
3047 
3048 /*
3049  * Execute invalidations happening outside the context of a decoded
3050  * transaction. That currently happens either for xid-less commits
3051  * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3052  * transactions (via ReorderBufferForget()).
3053  */
3054 void
3056  SharedInvalidationMessage *invalidations)
3057 {
3058  bool use_subtxn = IsTransactionOrTransactionBlock();
3059  int i;
3060 
3061  if (use_subtxn)
3062  BeginInternalSubTransaction("replay");
3063 
3064  /*
3065  * Force invalidations to happen outside of a valid transaction - that way
3066  * entries will just be marked as invalid without accessing the catalog.
3067  * That's advantageous because we don't need to setup the full state
3068  * necessary for catalog access.
3069  */
3070  if (use_subtxn)
3072 
3073  for (i = 0; i < ninvalidations; i++)
3074  LocalExecuteInvalidationMessage(&invalidations[i]);
3075 
3076  if (use_subtxn)
3078 }
3079 
3080 /*
3081  * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3082  * least once for every xid in XLogRecord->xl_xid (other places in records
3083  * may, but do not have to be passed through here).
3084  *
3085  * Reorderbuffer keeps some datastructures about transactions in LSN order,
3086  * for efficiency. To do that it has to know about when transactions are seen
3087  * first in the WAL. As many types of records are not actually interesting for
3088  * logical decoding, they do not necessarily pass though here.
3089  */
3090 void
3092 {
3093  /* many records won't have an xid assigned, centralize check here */
3094  if (xid != InvalidTransactionId)
3095  ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3096 }
3097 
3098 /*
3099  * Add a new snapshot to this transaction that may only used after lsn 'lsn'
3100  * because the previous snapshot doesn't describe the catalog correctly for
3101  * following rows.
3102  */
3103 void
3105  XLogRecPtr lsn, Snapshot snap)
3106 {
3108 
3109  change->data.snapshot = snap;
3111 
3112  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3113 }
3114 
3115 /*
3116  * Set up the transaction's base snapshot.
3117  *
3118  * If we know that xid is a subtransaction, set the base snapshot on the
3119  * top-level transaction instead.
3120  */
3121 void
3123  XLogRecPtr lsn, Snapshot snap)
3124 {
3125  ReorderBufferTXN *txn;
3126  bool is_new;
3127 
3128  Assert(snap != NULL);
3129 
3130  /*
3131  * Fetch the transaction to operate on. If we know it's a subtransaction,
3132  * operate on its top-level transaction instead.
3133  */
3134  txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3135  if (rbtxn_is_known_subxact(txn))
3136  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3137  NULL, InvalidXLogRecPtr, false);
3138  Assert(txn->base_snapshot == NULL);
3139 
3140  txn->base_snapshot = snap;
3141  txn->base_snapshot_lsn = lsn;
3143 
3144  AssertTXNLsnOrder(rb);
3145 }
3146 
3147 /*
3148  * Access the catalog with this CommandId at this point in the changestream.
3149  *
3150  * May only be called for command ids > 1
3151  */
3152 void
3154  XLogRecPtr lsn, CommandId cid)
3155 {
3157 
3158  change->data.command_id = cid;
3160 
3161  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3162 }
3163 
3164 /*
3165  * Update memory counters to account for the new or removed change.
3166  *
3167  * We update two counters - in the reorder buffer, and in the transaction
3168  * containing the change. The reorder buffer counter allows us to quickly
3169  * decide if we reached the memory limit, the transaction counter allows
3170  * us to quickly pick the largest transaction for eviction.
3171  *
3172  * When streaming is enabled, we need to update the toplevel transaction
3173  * counters instead - we don't really care about subtransactions as we
3174  * can't stream them individually anyway, and we only pick toplevel
3175  * transactions for eviction. So only toplevel transactions matter.
3176  */
3177 static void
3179  ReorderBufferChange *change,
3180  bool addition, Size sz)
3181 {
3182  ReorderBufferTXN *txn;
3183  ReorderBufferTXN *toptxn;
3184 
3185  Assert(change->txn);
3186 
3187  /*
3188  * Ignore tuple CID changes, because those are not evicted when reaching
3189  * memory limit. So we just don't count them, because it might easily
3190  * trigger a pointless attempt to spill.
3191  */
3193  return;
3194 
3195  txn = change->txn;
3196 
3197  /*
3198  * Update the total size in top level as well. This is later used to
3199  * compute the decoding stats.
3200  */
3201  toptxn = rbtxn_get_toptxn(txn);
3202 
3203  if (addition)
3204  {
3205  txn->size += sz;
3206  rb->size += sz;
3207 
3208  /* Update the total size in the top transaction. */
3209  toptxn->total_size += sz;
3210  }
3211  else
3212  {
3213  Assert((rb->size >= sz) && (txn->size >= sz));
3214  txn->size -= sz;
3215  rb->size -= sz;
3216 
3217  /* Update the total size in the top transaction. */
3218  toptxn->total_size -= sz;
3219  }
3220 
3221  Assert(txn->size <= rb->size);
3222 }
3223 
3224 /*
3225  * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3226  *
3227  * We do not include this change type in memory accounting, because we
3228  * keep CIDs in a separate list and do not evict them when reaching
3229  * the memory limit.
3230  */
3231 void
3233  XLogRecPtr lsn, RelFileLocator locator,
3234  ItemPointerData tid, CommandId cmin,
3235  CommandId cmax, CommandId combocid)
3236 {
3238  ReorderBufferTXN *txn;
3239 
3240  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3241 
3242  change->data.tuplecid.locator = locator;
3243  change->data.tuplecid.tid = tid;
3244  change->data.tuplecid.cmin = cmin;
3245  change->data.tuplecid.cmax = cmax;
3246  change->data.tuplecid.combocid = combocid;
3247  change->lsn = lsn;
3248  change->txn = txn;
3250 
3251  dlist_push_tail(&txn->tuplecids, &change->node);
3252  txn->ntuplecids++;
3253 }
3254 
3255 /*
3256  * Accumulate the invalidations for executing them later.
3257  *
3258  * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3259  * accumulates all the invalidation messages in the toplevel transaction, if
3260  * available, otherwise in the current transaction, as well as in the form of
3261  * change in reorder buffer. We require to record it in form of the change
3262  * so that we can execute only the required invalidations instead of executing
3263  * all the invalidations on each CommandId increment. We also need to
3264  * accumulate these in the txn buffer because in some cases where we skip
3265  * processing the transaction (see ReorderBufferForget), we need to execute
3266  * all the invalidations together.
3267  */
3268 void
3270  XLogRecPtr lsn, Size nmsgs,
3272 {
3273  ReorderBufferTXN *txn;
3274  MemoryContext oldcontext;
3275  ReorderBufferChange *change;
3276 
3277  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3278 
3279  oldcontext = MemoryContextSwitchTo(rb->context);
3280 
3281  /*
3282  * Collect all the invalidations under the top transaction, if available,
3283  * so that we can execute them all together. See comments atop this
3284  * function.
3285  */
3286  txn = rbtxn_get_toptxn(txn);
3287 
3288  Assert(nmsgs > 0);
3289 
3290  /* Accumulate invalidations. */
3291  if (txn->ninvalidations == 0)
3292  {
3293  txn->ninvalidations = nmsgs;
3295  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3296  memcpy(txn->invalidations, msgs,
3297  sizeof(SharedInvalidationMessage) * nmsgs);
3298  }
3299  else
3300  {
3303  (txn->ninvalidations + nmsgs));
3304 
3305  memcpy(txn->invalidations + txn->ninvalidations, msgs,
3306  nmsgs * sizeof(SharedInvalidationMessage));
3307  txn->ninvalidations += nmsgs;
3308  }
3309 
3310  change = ReorderBufferGetChange(rb);
3312  change->data.inval.ninvalidations = nmsgs;
3313  change->data.inval.invalidations = (SharedInvalidationMessage *)
3314  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3315  memcpy(change->data.inval.invalidations, msgs,
3316  sizeof(SharedInvalidationMessage) * nmsgs);
3317 
3318  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3319 
3320  MemoryContextSwitchTo(oldcontext);
3321 }
3322 
3323 /*
3324  * Apply all invalidations we know. Possibly we only need parts at this point
3325  * in the changestream but we don't know which those are.
3326  */
3327 static void
3329 {
3330  int i;
3331 
3332  for (i = 0; i < nmsgs; i++)
3334 }
3335 
3336 /*
3337  * Mark a transaction as containing catalog changes
3338  */
3339 void
3341  XLogRecPtr lsn)
3342 {
3343  ReorderBufferTXN *txn;
3344 
3345  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3346 
3347  if (!rbtxn_has_catalog_changes(txn))
3348  {
3351  }
3352 
3353  /*
3354  * Mark top-level transaction as having catalog changes too if one of its
3355  * children has so that the ReorderBufferBuildTupleCidHash can
3356  * conveniently check just top-level transaction and decide whether to
3357  * build the hash table or not.
3358  */
3359  if (rbtxn_is_subtxn(txn))
3360  {
3361  ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3362 
3363  if (!rbtxn_has_catalog_changes(toptxn))
3364  {
3367  }
3368  }
3369 }
3370 
3371 /*
3372  * Return palloc'ed array of the transactions that have changed catalogs.
3373  * The returned array is sorted in xidComparator order.
3374  *
3375  * The caller must free the returned array when done with it.
3376  */
3377 TransactionId *
3379 {
3380  dlist_iter iter;
3381  TransactionId *xids = NULL;
3382  size_t xcnt = 0;
3383 
3384  /* Quick return if the list is empty */
3385  if (dclist_count(&rb->catchange_txns) == 0)
3386  return NULL;
3387 
3388  /* Initialize XID array */
3389  xids = (TransactionId *) palloc(sizeof(TransactionId) *
3391  dclist_foreach(iter, &rb->catchange_txns)
3392  {
3394  catchange_node,
3395  iter.cur);
3396 
3398 
3399  xids[xcnt++] = txn->xid;
3400  }
3401 
3402  qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3403 
3404  Assert(xcnt == dclist_count(&rb->catchange_txns));
3405  return xids;
3406 }
3407 
3408 /*
3409  * Query whether a transaction is already *known* to contain catalog
3410  * changes. This can be wrong until directly before the commit!
3411  */
3412 bool
3414 {
3415  ReorderBufferTXN *txn;
3416 
3417  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3418  false);
3419  if (txn == NULL)
3420  return false;
3421 
3422  return rbtxn_has_catalog_changes(txn);
3423 }
3424 
3425 /*
3426  * ReorderBufferXidHasBaseSnapshot
3427  * Have we already set the base snapshot for the given txn/subtxn?
3428  */
3429 bool
3431 {
3432  ReorderBufferTXN *txn;
3433 
3434  txn = ReorderBufferTXNByXid(rb, xid, false,
3435  NULL, InvalidXLogRecPtr, false);
3436 
3437  /* transaction isn't known yet, ergo no snapshot */
3438  if (txn == NULL)
3439  return false;
3440 
3441  /* a known subtxn? operate on top-level txn instead */
3442  if (rbtxn_is_known_subxact(txn))
3443  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3444  NULL, InvalidXLogRecPtr, false);
3445 
3446  return txn->base_snapshot != NULL;
3447 }
3448 
3449 
3450 /*
3451  * ---------------------------------------
3452  * Disk serialization support
3453  * ---------------------------------------
3454  */
3455 
3456 /*
3457  * Ensure the IO buffer is >= sz.
3458  */
3459 static void
3461 {
3462  if (!rb->outbufsize)
3463  {
3464  rb->outbuf = MemoryContextAlloc(rb->context, sz);
3465  rb->outbufsize = sz;
3466  }
3467  else if (rb->outbufsize < sz)
3468  {
3469  rb->outbuf = repalloc(rb->outbuf, sz);
3470  rb->outbufsize = sz;
3471  }
3472 }
3473 
3474 /*
3475  * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3476  *
3477  * XXX With many subtransactions this might be quite slow, because we'll have
3478  * to walk through all of them. There are some options how we could improve
3479  * that: (a) maintain some secondary structure with transactions sorted by
3480  * amount of changes, (b) not looking for the entirely largest transaction,
3481  * but e.g. for transaction using at least some fraction of the memory limit,
3482  * and (c) evicting multiple transactions at once, e.g. to free a given portion
3483  * of the memory limit (e.g. 50%).
3484  */
3485 static ReorderBufferTXN *
3487 {
3488  HASH_SEQ_STATUS hash_seq;
3490  ReorderBufferTXN *largest = NULL;
3491 
3492  hash_seq_init(&hash_seq, rb->by_txn);
3493  while ((ent = hash_seq_search(&hash_seq)) != NULL)
3494  {
3495  ReorderBufferTXN *txn = ent->txn;
3496 
3497  /* if the current transaction is larger, remember it */
3498  if ((!largest) || (txn->size > largest->size))
3499  largest = txn;
3500  }
3501 
3502  Assert(largest);
3503  Assert(largest->size > 0);
3504  Assert(largest->size <= rb->size);
3505 
3506  return largest;
3507 }
3508 
3509 /*
3510  * Find the largest streamable toplevel transaction to evict (by streaming).
3511  *
3512  * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3513  * should give us the same transaction (because we don't update memory account
3514  * for subtransaction with streaming, so it's always 0). But we can simply
3515  * iterate over the limited number of toplevel transactions that have a base
3516  * snapshot. There is no use of selecting a transaction that doesn't have base
3517  * snapshot because we don't decode such transactions. Also, we do not select
3518  * the transaction which doesn't have any streamable change.
3519  *
3520  * Note that, we skip transactions that contains incomplete changes. There
3521  * is a scope of optimization here such that we can select the largest
3522  * transaction which has incomplete changes. But that will make the code and
3523  * design quite complex and that might not be worth the benefit. If we plan to
3524  * stream the transactions that contains incomplete changes then we need to
3525  * find a way to partially stream/truncate the transaction changes in-memory
3526  * and build a mechanism to partially truncate the spilled files.
3527  * Additionally, whenever we partially stream the transaction we need to
3528  * maintain the last streamed lsn and next time we need to restore from that
3529  * segment and the offset in WAL. As we stream the changes from the top
3530  * transaction and restore them subtransaction wise, we need to even remember
3531  * the subxact from where we streamed the last change.
3532  */
3533 static ReorderBufferTXN *
3535 {
3536  dlist_iter iter;
3537  Size largest_size = 0;
3538  ReorderBufferTXN *largest = NULL;
3539 
3540  /* Find the largest top-level transaction having a base snapshot. */
3542  {
3543  ReorderBufferTXN *txn;
3544 
3545  txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3546 
3547  /* must not be a subtxn */
3549  /* base_snapshot must be set */
3550  Assert(txn->base_snapshot != NULL);
3551 
3552  if ((largest == NULL || txn->total_size > largest_size) &&
3553  (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)) &&
3555  {
3556  largest = txn;
3557  largest_size = txn->total_size;
3558  }
3559  }
3560 
3561  return largest;
3562 }
3563 
3564 /*
3565  * Check whether the logical_decoding_work_mem limit was reached, and if yes
3566  * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3567  * disk or send to the output plugin until we reach under the memory limit.
3568  *
3569  * If debug_logical_replication_streaming is set to "immediate", stream or
3570  * serialize the changes immediately.
3571  *
3572  * XXX At this point we select the transactions until we reach under the memory
3573  * limit, but we might also adapt a more elaborate eviction strategy - for example
3574  * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3575  * limit.
3576  */
3577 static void
3579 {
3580  ReorderBufferTXN *txn;
3581 
3582  /*
3583  * Bail out if debug_logical_replication_streaming is buffered and we
3584  * haven't exceeded the memory limit.
3585  */
3587  rb->size < logical_decoding_work_mem * 1024L)
3588  return;
3589 
3590  /*
3591  * If debug_logical_replication_streaming is immediate, loop until there's
3592  * no change. Otherwise, loop until we reach under the memory limit. One
3593  * might think that just by evicting the largest (sub)transaction we will
3594  * come under the memory limit based on assumption that the selected
3595  * transaction is at least as large as the most recent change (which
3596  * caused us to go over the memory limit). However, that is not true
3597  * because a user can reduce the logical_decoding_work_mem to a smaller
3598  * value before the most recent change.
3599  */
3600  while (rb->size >= logical_decoding_work_mem * 1024L ||
3602  rb->size > 0))
3603  {
3604  /*
3605  * Pick the largest transaction and evict it from memory by streaming,
3606  * if possible. Otherwise, spill to disk.
3607  */
3609  (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3610  {
3611  /* we know there has to be one, because the size is not zero */
3612  Assert(txn && rbtxn_is_toptxn(txn));
3613  Assert(txn->total_size > 0);
3614  Assert(rb->size >= txn->total_size);
3615 
3616  ReorderBufferStreamTXN(rb, txn);
3617  }
3618  else
3619  {
3620  /*
3621  * Pick the largest transaction (or subtransaction) and evict it
3622  * from memory by serializing it to disk.
3623  */
3624  txn = ReorderBufferLargestTXN(rb);
3625 
3626  /* we know there has to be one, because the size is not zero */
3627  Assert(txn);
3628  Assert(txn->size > 0);
3629  Assert(rb->size >= txn->size);
3630 
3631  ReorderBufferSerializeTXN(rb, txn);
3632  }
3633 
3634  /*
3635  * After eviction, the transaction should have no entries in memory,
3636  * and should use 0 bytes for changes.
3637  */
3638  Assert(txn->size == 0);
3639  Assert(txn->nentries_mem == 0);
3640  }
3641 
3642  /* We must be under the memory limit now. */
3643  Assert(rb->size < logical_decoding_work_mem * 1024L);
3644 }
3645 
3646 /*
3647  * Spill data of a large transaction (and its subtransactions) to disk.
3648  */
3649 static void
3651 {
3652  dlist_iter subtxn_i;
3653  dlist_mutable_iter change_i;
3654  int fd = -1;
3655  XLogSegNo curOpenSegNo = 0;
3656  Size spilled = 0;
3657  Size size = txn->size;
3658 
3659  elog(DEBUG2, "spill %u changes in XID %u to disk",
3660  (uint32) txn->nentries_mem, txn->xid);
3661 
3662  /* do the same to all child TXs */
3663  dlist_foreach(subtxn_i, &txn->subtxns)
3664  {
3665  ReorderBufferTXN *subtxn;
3666 
3667  subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3668  ReorderBufferSerializeTXN(rb, subtxn);
3669  }
3670 
3671  /* serialize changestream */
3672  dlist_foreach_modify(change_i, &txn->changes)
3673  {
3674  ReorderBufferChange *change;
3675 
3676  change = dlist_container(ReorderBufferChange, node, change_i.cur);
3677 
3678  /*
3679  * store in segment in which it belongs by start lsn, don't split over
3680  * multiple segments tho
3681  */
3682  if (fd == -1 ||
3683  !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3684  {
3685  char path[MAXPGPATH];
3686 
3687  if (fd != -1)
3689 
3690  XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3691 
3692  /*
3693  * No need to care about TLIs here, only used during a single run,
3694  * so each LSN only maps to a specific WAL record.
3695  */
3697  curOpenSegNo);
3698 
3699  /* open segment, create it if necessary */
3700  fd = OpenTransientFile(path,
3701  O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3702 
3703  if (fd < 0)
3704  ereport(ERROR,
3706  errmsg("could not open file \"%s\": %m", path)));
3707  }
3708 
3709  ReorderBufferSerializeChange(rb, txn, fd, change);
3710  dlist_delete(&change->node);
3711  ReorderBufferReturnChange(rb, change, true);
3712 
3713  spilled++;
3714  }
3715 
3716  /* update the statistics iff we have spilled anything */
3717  if (spilled)
3718  {
3719  rb->spillCount += 1;
3720  rb->spillBytes += size;
3721 
3722  /* don't consider already serialized transactions */
3723  rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3724 
3725  /* update the decoding stats */
3727  }
3728 
3729  Assert(spilled == txn->nentries_mem);
3730  Assert(dlist_is_empty(&txn->changes));
3731  txn->nentries_mem = 0;
3733 
3734  if (fd != -1)
3736 }
3737 
3738 /*
3739  * Serialize individual change to disk.
3740  */
3741 static void
3743  int fd, ReorderBufferChange *change)
3744 {
3745  ReorderBufferDiskChange *ondisk;
3746  Size sz = sizeof(ReorderBufferDiskChange);
3747 
3749 
3750  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3751  memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3752 
3753  switch (change->action)
3754  {
3755  /* fall through these, they're all similar enough */
3760  {
3761  char *data;
3762  ReorderBufferTupleBuf *oldtup,
3763  *newtup;
3764  Size oldlen = 0;
3765  Size newlen = 0;
3766 
3767  oldtup = change->data.tp.oldtuple;
3768  newtup = change->data.tp.newtuple;
3769 
3770  if (oldtup)
3771  {
3772  sz += sizeof(HeapTupleData);
3773  oldlen = oldtup->tuple.t_len;
3774  sz += oldlen;
3775  }
3776 
3777  if (newtup)
3778  {
3779  sz += sizeof(HeapTupleData);
3780  newlen = newtup->tuple.t_len;
3781  sz += newlen;
3782  }
3783 
3784  /* make sure we have enough space */
3786 
3787  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3788  /* might have been reallocated above */
3789  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3790 
3791  if (oldlen)
3792  {
3793  memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
3794  data += sizeof(HeapTupleData);
3795 
3796  memcpy(data, oldtup->tuple.t_data, oldlen);
3797  data += oldlen;
3798  }
3799 
3800  if (newlen)
3801  {
3802  memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
3803  data += sizeof(HeapTupleData);
3804 
3805  memcpy(data, newtup->tuple.t_data, newlen);
3806  data += newlen;
3807  }
3808  break;
3809  }
3811  {
3812  char *data;
3813  Size prefix_size = strlen(change->data.msg.prefix) + 1;
3814 
3815  sz += prefix_size + change->data.msg.message_size +
3816  sizeof(Size) + sizeof(Size);
3818 
3819  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3820 
3821  /* might have been reallocated above */
3822  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3823 
3824  /* write the prefix including the size */
3825  memcpy(data, &prefix_size, sizeof(Size));
3826  data += sizeof(Size);
3827  memcpy(data, change->data.msg.prefix,
3828  prefix_size);
3829  data += prefix_size;
3830 
3831  /* write the message including the size */
3832  memcpy(data, &change->data.msg.message_size, sizeof(Size));
3833  data += sizeof(Size);
3834  memcpy(data, change->data.msg.message,
3835  change->data.msg.message_size);
3836  data += change->data.msg.message_size;
3837 
3838  break;
3839  }
3841  {
3842  char *data;
3843  Size inval_size = sizeof(SharedInvalidationMessage) *
3844  change->data.inval.ninvalidations;
3845 
3846  sz += inval_size;
3847 
3849  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3850 
3851  /* might have been reallocated above */
3852  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3853  memcpy(data, change->data.inval.invalidations, inval_size);
3854  data += inval_size;
3855 
3856  break;
3857  }
3859  {
3860  Snapshot snap;
3861  char *data;
3862 
3863  snap = change->data.snapshot;
3864 
3865  sz += sizeof(SnapshotData) +
3866  sizeof(TransactionId) * snap->xcnt +
3867  sizeof(TransactionId) * snap->subxcnt;
3868 
3869  /* make sure we have enough space */
3871  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3872  /* might have been reallocated above */
3873  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3874 
3875  memcpy(data, snap, sizeof(SnapshotData));
3876  data += sizeof(SnapshotData);
3877 
3878  if (snap->xcnt)
3879  {
3880  memcpy(data, snap->xip,
3881  sizeof(TransactionId) * snap->xcnt);
3882  data += sizeof(TransactionId) * snap->xcnt;
3883  }
3884 
3885  if (snap->subxcnt)
3886  {
3887  memcpy(data, snap->subxip,
3888  sizeof(TransactionId) * snap->subxcnt);
3889  data += sizeof(TransactionId) * snap->subxcnt;
3890  }
3891  break;
3892  }
3894  {
3895  Size size;
3896  char *data;
3897 
3898  /* account for the OIDs of truncated relations */
3899  size = sizeof(Oid) * change->data.truncate.nrelids;
3900  sz += size;
3901 
3902  /* make sure we have enough space */
3904 
3905  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3906  /* might have been reallocated above */
3907  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3908 
3909  memcpy(data, change->data.truncate.relids, size);
3910  data += size;
3911 
3912  break;
3913  }
3918  /* ReorderBufferChange contains everything important */
3919  break;
3920  }
3921 
3922  ondisk->size = sz;
3923 
3924  errno = 0;
3925  pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
3926  if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3927  {
3928  int save_errno = errno;
3929 
3931 
3932  /* if write didn't set errno, assume problem is no disk space */
3933  errno = save_errno ? save_errno : ENOSPC;
3934  ereport(ERROR,
3936  errmsg("could not write to data file for XID %u: %m",
3937  txn->xid)));
3938  }
3940 
3941  /*
3942  * Keep the transaction's final_lsn up to date with each change we send to
3943  * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3944  * only do this on commit and abort records, but that doesn't work if a
3945  * system crash leaves a transaction without its abort record).
3946  *
3947  * Make sure not to move it backwards.
3948  */
3949  if (txn->final_lsn < change->lsn)
3950  txn->final_lsn = change->lsn;
3951 
3952  Assert(ondisk->change.action == change->action);
3953 }
3954 
3955 /* Returns true, if the output plugin supports streaming, false, otherwise. */
3956 static inline bool
3958 {
3960 
3961  return ctx->streaming;
3962 }
3963 
3964 /* Returns true, if the streaming can be started now, false, otherwise. */
3965 static inline bool
3967 {
3969  SnapBuild *builder = ctx->snapshot_builder;
3970 
3971  /* We can't start streaming unless a consistent state is reached. */
3973  return false;
3974 
3975  /*
3976  * We can't start streaming immediately even if the streaming is enabled
3977  * because we previously decoded this transaction and now just are
3978  * restarting.
3979  */
3980  if (ReorderBufferCanStream(rb) &&
3981  !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
3982  return true;
3983 
3984  return false;
3985 }
3986 
3987 /*
3988  * Send data of a large transaction (and its subtransactions) to the
3989  * output plugin, but using the stream API.
3990  */
3991 static void
3993 {
3994  Snapshot snapshot_now;
3995  CommandId command_id;
3996  Size stream_bytes;
3997  bool txn_is_streamed;
3998 
3999  /* We can never reach here for a subtransaction. */
4000  Assert(rbtxn_is_toptxn(txn));
4001 
4002  /*
4003  * We can't make any assumptions about base snapshot here, similar to what
4004  * ReorderBufferCommit() does. That relies on base_snapshot getting
4005  * transferred from subxact in ReorderBufferCommitChild(), but that was
4006  * not yet called as the transaction is in-progress.
4007  *
4008  * So just walk the subxacts and use the same logic here. But we only need
4009  * to do that once, when the transaction is streamed for the first time.
4010  * After that we need to reuse the snapshot from the previous run.
4011  *
4012  * Unlike DecodeCommit which adds xids of all the subtransactions in
4013  * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4014  * we do add them to subxip array instead via ReorderBufferCopySnap. This
4015  * allows the catalog changes made in subtransactions decoded till now to
4016  * be visible.
4017  */
4018  if (txn->snapshot_now == NULL)
4019  {
4020  dlist_iter subxact_i;
4021 
4022  /* make sure this transaction is streamed for the first time */
4023  Assert(!rbtxn_is_streamed(txn));
4024 
4025  /* at the beginning we should have invalid command ID */
4027 
4028  dlist_foreach(subxact_i, &txn->subtxns)
4029  {
4030  ReorderBufferTXN *subtxn;
4031 
4032  subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4033  ReorderBufferTransferSnapToParent(txn, subtxn);
4034  }
4035 
4036  /*
4037  * If this transaction has no snapshot, it didn't make any changes to
4038  * the database till now, so there's nothing to decode.
4039  */
4040  if (txn->base_snapshot == NULL)
4041  {
4042  Assert(txn->ninvalidations == 0);
4043  return;
4044  }
4045 
4046  command_id = FirstCommandId;
4047  snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4048  txn, command_id);
4049  }
4050  else
4051  {
4052  /* the transaction must have been already streamed */
4053  Assert(rbtxn_is_streamed(txn));
4054 
4055  /*
4056  * Nah, we already have snapshot from the previous streaming run. We
4057  * assume new subxacts can't move the LSN backwards, and so can't beat
4058  * the LSN condition in the previous branch (so no need to walk
4059  * through subxacts again). In fact, we must not do that as we may be
4060  * using snapshot half-way through the subxact.
4061  */
4062  command_id = txn->command_id;
4063 
4064  /*
4065  * We can't use txn->snapshot_now directly because after the last
4066  * streaming run, we might have got some new sub-transactions. So we
4067  * need to add them to the snapshot.
4068  */
4069  snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4070  txn, command_id);
4071 
4072  /* Free the previously copied snapshot. */
4073  Assert(txn->snapshot_now->copied);
4075  txn->snapshot_now = NULL;
4076  }
4077 
4078  /*
4079  * Remember this information to be used later to update stats. We can't
4080  * update the stats here as an error while processing the changes would
4081  * lead to the accumulation of stats even though we haven't streamed all
4082  * the changes.
4083  */
4084  txn_is_streamed = rbtxn_is_streamed(txn);
4085  stream_bytes = txn->total_size;
4086 
4087  /* Process and send the changes to output plugin. */
4088  ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4089  command_id, true);
4090 
4091  rb->streamCount += 1;
4092  rb->streamBytes += stream_bytes;
4093 
4094  /* Don't consider already streamed transaction. */
4095  rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4096 
4097  /* update the decoding stats */
4099 
4100  Assert(dlist_is_empty(&txn->changes));
4101  Assert(txn->nentries == 0);
4102  Assert(txn->nentries_mem == 0);
4103 }
4104 
4105 /*
4106  * Size of a change in memory.
4107  */
4108 static Size
4110 {
4111  Size sz = sizeof(ReorderBufferChange);
4112 
4113  switch (change->action)
4114  {
4115  /* fall through these, they're all similar enough */
4120  {
4121  ReorderBufferTupleBuf *oldtup,
4122  *newtup;
4123  Size oldlen = 0;
4124  Size newlen = 0;
4125 
4126  oldtup = change->data.tp.oldtuple;
4127  newtup = change->data.tp.newtuple;
4128 
4129  if (oldtup)
4130  {
4131  sz += sizeof(HeapTupleData);
4132  oldlen = oldtup->tuple.t_len;
4133  sz += oldlen;
4134  }
4135 
4136  if (newtup)
4137  {
4138  sz += sizeof(HeapTupleData);
4139  newlen = newtup->tuple.t_len;
4140  sz += newlen;
4141  }
4142 
4143  break;
4144  }
4146  {
4147  Size prefix_size = strlen(change->data.msg.prefix) + 1;
4148 
4149  sz += prefix_size + change->data.msg.message_size +
4150  sizeof(Size) + sizeof(Size);
4151 
4152  break;
4153  }
4155  {
4156  sz += sizeof(SharedInvalidationMessage) *
4157  change->data.inval.ninvalidations;
4158  break;
4159  }
4161  {
4162  Snapshot snap;
4163 
4164  snap = change->data.snapshot;
4165 
4166  sz += sizeof(SnapshotData) +
4167  sizeof(TransactionId) * snap->xcnt +
4168  sizeof(TransactionId) * snap->subxcnt;
4169 
4170  break;
4171  }
4173  {
4174  sz += sizeof(Oid) * change->data.truncate.nrelids;
4175 
4176  break;
4177  }
4182  /* ReorderBufferChange contains everything important */
4183  break;
4184  }
4185 
4186  return sz;
4187 }
4188 
4189 
4190 /*
4191  * Restore a number of changes spilled to disk back into memory.
4192  */
4193 static Size
4195  TXNEntryFile *file, XLogSegNo *segno)
4196 {
4197  Size restored = 0;
4198  XLogSegNo last_segno;
4199  dlist_mutable_iter cleanup_iter;
4200  File *fd = &file->vfd;
4201 
4204 
4205  /* free current entries, so we have memory for more */
4206  dlist_foreach_modify(cleanup_iter, &txn->changes)
4207  {
4209  dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4210 
4211  dlist_delete(&cleanup->node);
4213  }
4214  txn->nentries_mem = 0;
4215  Assert(dlist_is_empty(&txn->changes));
4216 
4217  XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4218 
4219  while (restored < max_changes_in_memory && *segno <= last_segno)
4220  {
4221  int readBytes;
4222  ReorderBufferDiskChange *ondisk;
4223 
4225 
4226  if (*fd == -1)
4227  {
4228  char path[MAXPGPATH];
4229 
4230  /* first time in */
4231  if (*segno == 0)
4232  XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4233 
4234  Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4235 
4236  /*
4237  * No need to care about TLIs here, only used during a single run,
4238  * so each LSN only maps to a specific WAL record.
4239  */
4241  *segno);
4242 
4243  *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4244 
4245  /* No harm in resetting the offset even in case of failure */
4246  file->curOffset = 0;
4247 
4248  if (*fd < 0 && errno == ENOENT)
4249  {
4250  *fd = -1;
4251  (*segno)++;
4252  continue;
4253  }
4254  else if (*fd < 0)
4255  ereport(ERROR,
4257  errmsg("could not open file \"%s\": %m",
4258  path)));
4259  }
4260 
4261  /*
4262  * Read the statically sized part of a change which has information
4263  * about the total size. If we couldn't read a record, we're at the
4264  * end of this file.
4265  */
4267  readBytes = FileRead(file->vfd, rb->outbuf,
4268  sizeof(ReorderBufferDiskChange),
4269  file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4270 
4271  /* eof */
4272  if (readBytes == 0)
4273  {
4274  FileClose(*fd);
4275  *fd = -1;
4276  (*segno)++;
4277  continue;
4278  }
4279  else if (readBytes < 0)
4280  ereport(ERROR,
4282  errmsg("could not read from reorderbuffer spill file: %m")));
4283  else if (readBytes != sizeof(ReorderBufferDiskChange))
4284  ereport(ERROR,
4286  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4287  readBytes,
4288  (uint32) sizeof(ReorderBufferDiskChange))));
4289 
4290  file->curOffset += readBytes;
4291 
4292  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4293 
4295  sizeof(ReorderBufferDiskChange) + ondisk->size);
4296  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4297 
4298  readBytes = FileRead(file->vfd,
4299  rb->outbuf + sizeof(ReorderBufferDiskChange),
4300  ondisk->size - sizeof(ReorderBufferDiskChange),
4301  file->curOffset,
4302  WAIT_EVENT_REORDER_BUFFER_READ);
4303 
4304  if (readBytes < 0)
4305  ereport(ERROR,
4307  errmsg("could not read from reorderbuffer spill file: %m")));
4308  else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4309  ereport(ERROR,
4311  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4312  readBytes,
4313  (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4314 
4315  file->curOffset += readBytes;
4316 
4317  /*
4318  * ok, read a full change from disk, now restore it into proper
4319  * in-memory format
4320  */
4321  ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4322  restored++;
4323  }
4324 
4325  return restored;
4326 }
4327 
4328 /*
4329  * Convert change from its on-disk format to in-memory format and queue it onto
4330  * the TXN's ->changes list.
4331  *
4332  * Note: although "data" is declared char*, at entry it points to a
4333  * maxalign'd buffer, making it safe in most of this function to assume
4334  * that the pointed-to data is suitably aligned for direct access.
4335  */
4336 static void
4338  char *data)
4339 {
4340  ReorderBufferDiskChange *ondisk;
4341  ReorderBufferChange *change;
4342 
4343  ondisk = (ReorderBufferDiskChange *) data;
4344 
4345  change = ReorderBufferGetChange(rb);
4346 
4347  /* copy static part */
4348  memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4349 
4350  data += sizeof(ReorderBufferDiskChange);
4351 
4352  /* restore individual stuff */
4353  switch (change->action)
4354  {
4355  /* fall through these, they're all similar enough */
4360  if (change->data.tp.oldtuple)
4361  {
4362  uint32 tuplelen = ((HeapTuple) data)->t_len;
4363 
4364  change->data.tp.oldtuple =
4366 
4367  /* restore ->tuple */
4368  memcpy(&change->data.tp.oldtuple->tuple, data,
4369  sizeof(HeapTupleData));
4370  data += sizeof(HeapTupleData);
4371 
4372  /* reset t_data pointer into the new tuplebuf */
4373  change->data.tp.oldtuple->tuple.t_data =
4374  ReorderBufferTupleBufData(change->data.tp.oldtuple);
4375 
4376  /* restore tuple data itself */
4377  memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
4378  data += tuplelen;
4379  }
4380 
4381  if (change->data.tp.newtuple)
4382  {
4383  /* here, data might not be suitably aligned! */
4384  uint32 tuplelen;
4385 
4386  memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4387  sizeof(uint32));
4388 
4389  change->data.tp.newtuple =
4391 
4392  /* restore ->tuple */
4393  memcpy(&change->data.tp.newtuple->tuple, data,
4394  sizeof(HeapTupleData));
4395  data += sizeof(HeapTupleData);
4396 
4397  /* reset t_data pointer into the new tuplebuf */
4398  change->data.tp.newtuple->tuple.t_data =
4399  ReorderBufferTupleBufData(change->data.tp.newtuple);
4400 
4401  /* restore tuple data itself */
4402  memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
4403  data += tuplelen;
4404  }
4405 
4406  break;
4408  {
4409  Size prefix_size;
4410 
4411  /* read prefix */
4412  memcpy(&prefix_size, data, sizeof(Size));
4413  data += sizeof(Size);
4414  change->data.msg.prefix = MemoryContextAlloc(rb->context,
4415  prefix_size);
4416  memcpy(change->data.msg.prefix, data, prefix_size);
4417  Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4418  data += prefix_size;
4419 
4420  /* read the message */
4421  memcpy(&change->data.msg.message_size, data, sizeof(Size));
4422  data += sizeof(Size);
4423  change->data.msg.message = MemoryContextAlloc(rb->context,
4424  change->data.msg.message_size);
4425  memcpy(change->data.msg.message, data,
4426  change->data.msg.message_size);
4427  data += change->data.msg.message_size;
4428 
4429  break;
4430  }
4432  {
4433  Size inval_size = sizeof(SharedInvalidationMessage) *
4434  change->data.inval.ninvalidations;
4435 
4436  change->data.inval.invalidations =
4437  MemoryContextAlloc(rb->context, inval_size);
4438 
4439  /* read the message */
4440  memcpy(change->data.inval.invalidations, data, inval_size);
4441 
4442  break;
4443  }
4445  {
4446  Snapshot oldsnap;
4447  Snapshot newsnap;
4448  Size size;
4449 
4450  oldsnap = (Snapshot) data;
4451 
4452  size = sizeof(SnapshotData) +
4453  sizeof(TransactionId) * oldsnap->xcnt +
4454  sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4455 
4456  change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4457 
4458  newsnap = change->data.snapshot;
4459 
4460  memcpy(newsnap, data, size);
4461  newsnap->xip = (TransactionId *)
4462  (((char *) newsnap) + sizeof(SnapshotData));
4463  newsnap->subxip = newsnap->xip + newsnap->xcnt;
4464  newsnap->copied = true;
4465  break;
4466  }
4467  /* the base struct contains all the data, easy peasy */
4469  {
4470  Oid *relids;
4471 
4472  relids = ReorderBufferGetRelids(rb,
4473  change->data.truncate.nrelids);
4474  memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4475  change->data.truncate.relids = relids;
4476 
4477  break;
4478  }
4483  break;
4484  }
4485 
4486  dlist_push_tail(&txn->changes, &change->node);
4487  txn->nentries_mem++;
4488 
4489  /*
4490  * Update memory accounting for the restored change. We need to do this
4491  * although we don't check the memory limit when restoring the changes in
4492  * this branch (we only do that when initially queueing the changes after
4493  * decoding), because we will release the changes later, and that will
4494  * update the accounting too (subtracting the size from the counters). And
4495  * we don't want to underflow there.
4496  */
4497  ReorderBufferChangeMemoryUpdate(rb, change, true,
4498  ReorderBufferChangeSize(change));
4499 }
4500 
4501 /*
4502  * Remove all on-disk stored for the passed in transaction.
4503  */
4504 static void
4506 {
4507  XLogSegNo first;
4508  XLogSegNo cur;
4509  XLogSegNo last;
4510 
4513 
4514  XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4515  XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4516 
4517  /* iterate over all possible filenames, and delete them */
4518  for (cur = first; cur <= last; cur++)
4519  {
4520  char path[MAXPGPATH];
4521 
4523  if (unlink(path) != 0 && errno != ENOENT)
4524  ereport(ERROR,
4526  errmsg("could not remove file \"%s\": %m", path)));
4527  }
4528 }
4529 
4530 /*
4531  * Remove any leftover serialized reorder buffers from a slot directory after a
4532  * prior crash or decoding session exit.
4533  */
4534 static void
4536 {
4537  DIR *spill_dir;
4538  struct dirent *spill_de;
4539  struct stat statbuf;
4540  char path[MAXPGPATH * 2 + 12];
4541 
4542  sprintf(path, "pg_replslot/%s", slotname);
4543 
4544  /* we're only handling directories here, skip if it's not ours */
4545  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4546  return;
4547 
4548  spill_dir = AllocateDir(path);
4549  while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4550  {
4551  /* only look at names that can be ours */
4552  if (strncmp(spill_de->d_name, "xid", 3) == 0)
4553  {
4554  snprintf(path, sizeof(path),
4555  "pg_replslot/%s/%s", slotname,
4556  spill_de->d_name);
4557 
4558  if (unlink(path) != 0)
4559  ereport(ERROR,
4561  errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4562  path, slotname)));
4563  }
4564  }
4565  FreeDir(spill_dir);
4566 }
4567 
4568 /*
4569  * Given a replication slot, transaction ID and segment number, fill in the
4570  * corresponding spill file into 'path', which is a caller-owned buffer of size
4571  * at least MAXPGPATH.
4572  */
4573 static void
4575  XLogSegNo segno)
4576 {
4577  XLogRecPtr recptr;
4578 
4579  XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4580 
4581  snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4583  xid, LSN_FORMAT_ARGS(recptr));
4584 }
4585 
4586 /*
4587  * Delete all data spilled to disk after we've restarted/crashed. It will be
4588  * recreated when the respective slots are reused.
4589  */
4590 void
4592 {
4593  DIR *logical_dir;
4594  struct dirent *logical_de;
4595 
4596  logical_dir = AllocateDir("pg_replslot");
4597  while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4598  {
4599  if (strcmp(logical_de->d_name, ".") == 0 ||
4600  strcmp(logical_de->d_name, "..") == 0)
4601  continue;
4602 
4603  /* if it cannot be a slot, skip the directory */
4604  if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4605  continue;
4606 
4607  /*
4608  * ok, has to be a surviving logical slot, iterate and delete
4609  * everything starting with xid-*
4610  */
4612  }
4613  FreeDir(logical_dir);
4614 }
4615 
4616 /* ---------------------------------------
4617  * toast reassembly support
4618  * ---------------------------------------
4619  */
4620 
4621 /*
4622  * Initialize per tuple toast reconstruction support.
4623  */
4624 static void
4626 {
4627  HASHCTL hash_ctl;
4628 
4629  Assert(txn->toast_hash == NULL);
4630 
4631  hash_ctl.keysize = sizeof(Oid);
4632  hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4633  hash_ctl.hcxt = rb->context;
4634  txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4636 }
4637 
4638 /*
4639  * Per toast-chunk handling for toast reconstruction
4640  *
4641  * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4642  * toasted Datum comes along.
4643  */
4644 static void
4646  Relation relation, ReorderBufferChange *change)
4647 {
4648  ReorderBufferToastEnt *ent;
4649  ReorderBufferTupleBuf *newtup;
4650  bool found;
4651  int32 chunksize;
4652  bool isnull;
4653  Pointer chunk;
4654  TupleDesc desc = RelationGetDescr(relation);
4655  Oid chunk_id;
4656  int32 chunk_seq;
4657 
4658  if (txn->toast_hash == NULL)
4659  ReorderBufferToastInitHash(rb, txn);
4660 
4661  Assert(IsToastRelation(relation));
4662 
4663  newtup = change->data.tp.newtuple;
4664  chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull));
4665  Assert(!isnull);
4666  chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull));
4667  Assert(!isnull);
4668 
4669  ent = (ReorderBufferToastEnt *)
4670  hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
4671 
4672  if (!found)
4673  {
4674  Assert(ent->chunk_id == chunk_id);
4675  ent->num_chunks = 0;
4676  ent->last_chunk_seq = 0;
4677  ent->size = 0;
4678  ent->reconstructed = NULL;
4679  dlist_init(&ent->chunks);
4680 
4681  if (chunk_seq != 0)
4682  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4683  chunk_seq, chunk_id);
4684  }
4685  else if (found && chunk_seq != ent->last_chunk_seq + 1)
4686  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4687  chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4688 
4689  chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull));
4690  Assert(!isnull);
4691 
4692  /* calculate size so we can allocate the right size at once later */
4693  if (!VARATT_IS_EXTENDED(chunk))
4694  chunksize = VARSIZE(chunk) - VARHDRSZ;
4695  else if (VARATT_IS_SHORT(chunk))
4696  /* could happen due to heap_form_tuple doing its thing */
4697  chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4698  else
4699  elog(ERROR, "unexpected type of toast chunk");
4700 
4701  ent->size += chunksize;
4702  ent->last_chunk_seq = chunk_seq;
4703  ent->num_chunks++;
4704  dlist_push_tail(&ent->chunks, &change->node);
4705 }
4706 
4707 /*
4708  * Rejigger change->newtuple to point to in-memory toast tuples instead to
4709  * on-disk toast tuples that may not longer exist (think DROP TABLE or VACUUM).
4710  *
4711  * We cannot replace unchanged toast tuples though, so those will still point
4712  * to on-disk toast data.
4713  *
4714  * While updating the existing change with detoasted tuple data, we need to
4715  * update the memory accounting info, because the change size will differ.
4716  * Otherwise the accounting may get out of sync, triggering serialization
4717  * at unexpected times.
4718  *
4719  * We simply subtract size of the change before rejiggering the tuple, and
4720  * then adding the new size. This makes it look like the change was removed
4721  * and then added back, except it only tweaks the accounting info.
4722  *
4723  * In particular it can't trigger serialization, which would be pointless
4724  * anyway as it happens during commit processing right before handing
4725  * the change to the output plugin.
4726  */
4727 static void
4729  Relation relation, ReorderBufferChange *change)
4730 {
4731  TupleDesc desc;
4732  int natt;
4733  Datum *attrs;
4734  bool *isnull;
4735  bool *free;
4736  HeapTuple tmphtup;
4737  Relation toast_rel;
4738  TupleDesc toast_desc;
4739  MemoryContext oldcontext;
4740  ReorderBufferTupleBuf *newtup;
4741  Size old_size;
4742 
4743  /* no toast tuples changed */
4744  if (txn->toast_hash == NULL)
4745  return;
4746 
4747  /*
4748  * We're going to modify the size of the change. So, to make sure the
4749  * accounting is correct we record the current change size and then after
4750  * re-computing the change we'll subtract the recorded size and then
4751  * re-add the new change size at the end. We don't immediately subtract
4752  * the old size because if there is any error before we add the new size,
4753  * we will release the changes and that will update the accounting info
4754  * (subtracting the size from the counters). And we don't want to
4755  * underflow there.
4756  */
4757  old_size = ReorderBufferChangeSize(change);
4758 
4759  oldcontext = MemoryContextSwitchTo(rb->context);
4760 
4761  /* we should only have toast tuples in an INSERT or UPDATE */
4762  Assert(change->data.tp.newtuple);
4763 
4764  desc = RelationGetDescr(relation);
4765 
4766  toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4767  if (!RelationIsValid(toast_rel))
4768  elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
4769  relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
4770 
4771  toast_desc = RelationGetDescr(toast_rel);
4772 
4773  /* should we allocate from stack instead? */
4774  attrs = palloc0(sizeof(Datum) * desc->natts);
4775  isnull = palloc0(sizeof(bool) * desc->natts);
4776  free = palloc0(sizeof(bool) * desc->natts);
4777 
4778  newtup = change->data.tp.newtuple;
4779 
4780  heap_deform_tuple(&newtup->tuple, desc, attrs, isnull);
4781 
4782  for (natt = 0; natt < desc->natts; natt++)
4783  {
4784  Form_pg_attribute attr = TupleDescAttr(desc, natt);
4785  ReorderBufferToastEnt *ent;
4786  struct varlena *varlena;
4787 
4788  /* va_rawsize is the size of the original datum -- including header */
4789  struct varatt_external toast_pointer;
4790  struct varatt_indirect redirect_pointer;
4791  struct varlena *new_datum = NULL;
4792  struct varlena *reconstructed;
4793  dlist_iter it;
4794  Size data_done = 0;
4795 
4796  /* system columns aren't toasted */
4797  if (attr->attnum < 0)
4798  continue;
4799 
4800  if (attr->attisdropped)
4801  continue;
4802 
4803  /* not a varlena datatype */
4804  if (attr->attlen != -1)
4805  continue;
4806 
4807  /* no data */
4808  if (isnull[natt])
4809  continue;
4810 
4811  /* ok, we know we have a toast datum */
4812  varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4813 
4814  /* no need to do anything if the tuple isn't external */
4816  continue;
4817 
4818  VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4819 
4820  /*
4821  * Check whether the toast tuple changed, replace if so.
4822  */
4823  ent = (ReorderBufferToastEnt *)
4824  hash_search(txn->toast_hash,
4825  &toast_pointer.va_valueid,
4826  HASH_FIND,
4827  NULL);
4828  if (ent == NULL)
4829  continue;
4830 
4831  new_datum =
4833 
4834  free[natt] = true;
4835 
4836  reconstructed = palloc0(toast_pointer.va_rawsize);
4837 
4838  ent->reconstructed = reconstructed;
4839 
4840  /* stitch toast tuple back together from its parts */
4841  dlist_foreach(it, &ent->chunks)
4842  {
4843  bool cisnull;
4844  ReorderBufferChange *cchange;
4845  ReorderBufferTupleBuf *ctup;
4846  Pointer chunk;
4847 
4848  cchange = dlist_container(ReorderBufferChange, node, it.cur);
4849  ctup = cchange->data.tp.newtuple;
4850  chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &cisnull));
4851 
4852  Assert(!cisnull);
4853  Assert(!VARATT_IS_EXTERNAL(chunk));
4854  Assert(!VARATT_IS_SHORT(chunk));
4855 
4856  memcpy(VARDATA(reconstructed) + data_done,
4857  VARDATA(chunk),
4858  VARSIZE(chunk) - VARHDRSZ);
4859  data_done += VARSIZE(chunk) - VARHDRSZ;
4860  }
4861  Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
4862 
4863  /* make sure its marked as compressed or not */
4864  if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4865  SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4866  else
4867  SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4868 
4869  memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4870  redirect_pointer.pointer = reconstructed;
4871 
4873  memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4874  sizeof(redirect_pointer));
4875 
4876  attrs[natt] = PointerGetDatum(new_datum);
4877  }
4878 
4879  /*
4880  * Build tuple in separate memory & copy tuple back into the tuplebuf
4881  * passed to the output plugin. We can't directly heap_fill_tuple() into
4882  * the tuplebuf because attrs[] will point back into the current content.
4883  */
4884  tmphtup = heap_form_tuple(desc, attrs, isnull);
4885  Assert(newtup->tuple.t_len <= MaxHeapTupleSize);
4886  Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data);
4887 
4888  memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len);
4889  newtup->tuple.t_len = tmphtup->t_len;
4890 
4891  /*
4892  * free resources we won't further need, more persistent stuff will be
4893  * free'd in ReorderBufferToastReset().
4894  */
4895  RelationClose(toast_rel);
4896  pfree(tmphtup);
4897  for (natt = 0; natt < desc->natts; natt++)
4898  {
4899  if (free[natt])
4900  pfree(DatumGetPointer(attrs[natt]));
4901  }
4902  pfree(attrs);
4903  pfree(free);
4904  pfree(isnull);
4905 
4906  MemoryContextSwitchTo(oldcontext);
4907 
4908  /* subtract the old change size */
4909  ReorderBufferChangeMemoryUpdate(rb, change, false, old_size);
4910  /* now add the change back, with the correct size */
4911  ReorderBufferChangeMemoryUpdate(rb, change, true,
4912  ReorderBufferChangeSize(change));
4913 }
4914 
4915 /*
4916  * Free all resources allocated for toast reconstruction.
4917  */
4918 static void
4920 {
4921  HASH_SEQ_STATUS hstat;
4922  ReorderBufferToastEnt *ent;
4923 
4924  if (txn->toast_hash == NULL)
4925  return;
4926 
4927  /* sequentially walk over the hash and free everything */
4928  hash_seq_init(&hstat, txn->toast_hash);
4929  while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4930  {
4931  dlist_mutable_iter it;
4932 
4933  if (ent->reconstructed != NULL)
4934  pfree(ent->reconstructed);
4935 
4936  dlist_foreach_modify(it, &ent->chunks)
4937  {
4938  ReorderBufferChange *change =
4940 
4941  dlist_delete(&change->node);
4942  ReorderBufferReturnChange(rb, change, true);
4943  }
4944  }
4945 
4946  hash_destroy(txn->toast_hash);
4947  txn->toast_hash = NULL;
4948 }
4949 
4950 
4951 /* ---------------------------------------
4952  * Visibility support for logical decoding
4953  *
4954  *
4955  * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4956  * always rely on stored cmin/cmax values because of two scenarios:
4957  *
4958  * * A tuple got changed multiple times during a single transaction and thus
4959  * has got a combo CID. Combo CIDs are only valid for the duration of a
4960  * single transaction.
4961  * * A tuple with a cmin but no cmax (and thus no combo CID) got
4962  * deleted/updated in another transaction than the one which created it
4963  * which we are looking at right now. As only one of cmin, cmax or combo CID
4964  * is actually stored in the heap we don't have access to the value we
4965  * need anymore.
4966  *
4967  * To resolve those problems we have a per-transaction hash of (cmin,
4968  * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
4969  * (cmin, cmax) values. That also takes care of combo CIDs by simply
4970  * not caring about them at all. As we have the real cmin/cmax values
4971  * combo CIDs aren't interesting.
4972  *
4973  * As we only care about catalog tuples here the overhead of this
4974  * hashtable should be acceptable.
4975  *
4976  * Heap rewrites complicate this a bit, check rewriteheap.c for
4977  * details.
4978  * -------------------------------------------------------------------------
4979  */
4980 
4981 /* struct for sorting mapping files by LSN efficiently */
4982 typedef struct RewriteMappingFile
4983 {
4987 
4988 #ifdef NOT_USED
4989 static void
4990 DisplayMapping(HTAB *tuplecid_data)
4991 {
4992  HASH_SEQ_STATUS hstat;
4994 
4995  hash_seq_init(&hstat, tuplecid_data);
4996  while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
4997  {
4998  elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
4999  ent->key.rlocator.dbOid,
5000  ent->key.rlocator.spcOid,
5001  ent->key.rlocator.relNumber,
5004  ent->cmin,
5005  ent->cmax
5006  );
5007  }
5008 }
5009 #endif
5010 
5011 /*
5012  * Apply a single mapping file to tuplecid_data.
5013  *
5014  * The mapping file has to have been verified to be a) committed b) for our
5015  * transaction c) applied in LSN order.
5016  */
5017 static void
5018 ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
5019 {
5020  char path[MAXPGPATH];
5021  int fd;
5022  int readBytes;
5024 
5025  sprintf(path, "pg_logical/mappings/%s", fname);
5026  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
5027  if (fd < 0)
5028  ereport(ERROR,
5030  errmsg("could not open file \"%s\": %m", path)));
5031 
5032  while (true)
5033  {
5036  ReorderBufferTupleCidEnt *new_ent;
5037  bool found;
5038 
5039  /* be careful about padding */
5040  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5041 
5042  /* read all mappings till the end of the file */
5043  pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
5044  readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5046 
5047  if (readBytes < 0)
5048  ereport(ERROR,
5050  errmsg("could not read file \"%s\": %m",
5051  path)));
5052  else if (readBytes == 0) /* EOF */
5053  break;
5054  else if (readBytes != sizeof(LogicalRewriteMappingData))
5055  ereport(ERROR,
5057  errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5058  path, readBytes,
5059  (int32) sizeof(LogicalRewriteMappingData))));
5060 
5061  key.rlocator = map.old_locator;
5062  ItemPointerCopy(&map.old_tid,
5063  &key.tid);
5064 
5065 
5066  ent = (ReorderBufferTupleCidEnt *)
5068 
5069  /* no existing mapping, no need to update */
5070  if (!ent)
5071  continue;
5072 
5073  key.rlocator = map.new_locator;
5074  ItemPointerCopy(&map.new_tid,
5075  &key.tid);
5076 
5077  new_ent = (ReorderBufferTupleCidEnt *)
5079 
5080  if (found)
5081  {
5082  /*
5083  * Make sure the existing mapping makes sense. We sometime update
5084  * old records that did not yet have a cmax (e.g. pg_class' own
5085  * entry while rewriting it) during rewrites, so allow that.
5086  */
5087  Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5088  Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5089  }
5090  else
5091  {
5092  /* update mapping */
5093  new_ent->cmin = ent->cmin;
5094  new_ent->cmax = ent->cmax;
5095  new_ent->combocid = ent->combocid;
5096  }
5097  }
5098 
5099  if (CloseTransientFile(fd) != 0)
5100  ereport(ERROR,
5102  errmsg("could not close file \"%s\": %m", path)));
5103 }
5104 
5105 
5106 /*
5107  * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5108  */
5109 static bool
5111 {
5112  return bsearch(&xid, xip, num,
5113  sizeof(TransactionId), xidComparator) != NULL;
5114 }
5115 
5116 /*
5117  * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5118  */
5119 static int
5120 file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5121 {
5124 
5125  if (a->lsn < b->lsn)
5126  return -1;
5127  else if (a->lsn > b->lsn)
5128  return 1;
5129  return 0;
5130 }
5131 
5132 /*
5133  * Apply any existing logical remapping files if there are any targeted at our
5134  * transaction for relid.
5135  */
5136 static void
5138 {
5139  DIR *mapping_dir;
5140  struct dirent *mapping_de;
5141  List *files = NIL;
5142  ListCell *file;
5143  Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5144 
5145  mapping_dir = AllocateDir("pg_logical/mappings");
5146  while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
5147  {
5148  Oid f_dboid;
5149  Oid f_relid;
5150  TransactionId f_mapped_xid;
5151  TransactionId f_create_xid;
5152  XLogRecPtr f_lsn;
5153  uint32 f_hi,
5154  f_lo;
5155  RewriteMappingFile *f;
5156 
5157  if (strcmp(mapping_de->d_name, ".") == 0 ||
5158  strcmp(mapping_de->d_name, "..") == 0)
5159  continue;
5160 
5161  /* Ignore files that aren't ours */
5162  if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5163  continue;
5164 
5165  if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5166  &f_dboid, &f_relid, &f_hi, &f_lo,
5167  &f_mapped_xid, &f_create_xid) != 6)
5168  elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5169 
5170  f_lsn = ((uint64) f_hi) << 32 | f_lo;
5171 
5172  /* mapping for another database */
5173  if (f_dboid != dboid)
5174  continue;
5175 
5176  /* mapping for another relation */
5177  if (f_relid != relid)
5178  continue;
5179 
5180  /* did the creating transaction abort? */
5181  if (!TransactionIdDidCommit(f_create_xid))
5182  continue;
5183 
5184  /* not for our transaction */
5185  if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5186  continue;
5187 
5188  /* ok, relevant, queue for apply */
5189  f = palloc(sizeof(RewriteMappingFile));
5190  f->lsn = f_lsn;
5191  strcpy(f->fname, mapping_de->d_name);
5192  files = lappend(files, f);
5193  }
5194  FreeDir(mapping_dir);
5195 
5196  /* sort files so we apply them in LSN order */
5197  list_sort(files, file_sort_by_lsn);
5198 
5199  foreach(file, files)
5200  {
5202 
5203  elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5204  snapshot->subxip[0]);
5206  pfree(f);
5207  }
5208 }
5209 
5210 /*
5211  * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5212  * combo CIDs.
5213  */
5214 bool
5216  Snapshot snapshot,
5217  HeapTuple htup, Buffer buffer,
5218  CommandId *cmin, CommandId *cmax)
5219 {
5222  ForkNumber forkno;
5223  BlockNumber blockno;
5224  bool updated_mapping = false;
5225 
5226  /*
5227  * Return unresolved if tuplecid_data is not valid. That's because when
5228  * streaming in-progress transactions we may run into tuples with the CID
5229  * before actually decoding them. Think e.g. about INSERT followed by
5230  * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5231  * INSERT. So in such cases, we assume the CID is from the future
5232  * command.
5233  */
5234  if (tuplecid_data == NULL)
5235  return false;
5236 
5237  /* be careful about padding */
5238  memset(&key, 0, sizeof(key));
5239 
5240  Assert(!BufferIsLocal(buffer));
5241 
5242  /*
5243  * get relfilelocator from the buffer, no convenient way to access it
5244  * other than that.
5245  */
5246  BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5247 
5248  /* tuples can only be in the main fork */
5249  Assert(forkno == MAIN_FORKNUM);
5250  Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5251 
5252  ItemPointerCopy(&htup->t_self,
5253  &key.tid);
5254 
5255 restart:
5256  ent = (ReorderBufferTupleCidEnt *)
5258 
5259  /*
5260  * failed to find a mapping, check whether the table was rewritten and
5261  * apply mapping if so, but only do that once - there can be no new
5262  * mappings while we are in here since we have to hold a lock on the
5263  * relation.
5264  */
5265  if (ent == NULL && !updated_mapping)
5266  {
5268  /* now check but don't update for a mapping again */
5269  updated_mapping = true;
5270  goto restart;
5271  }
5272  else if (ent == NULL)
5273  return false;
5274 
5275  if (cmin)
5276  *cmin = ent->cmin;
5277  if (cmax)
5278  *cmax = ent->cmax;
5279  return true;
5280 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
uint32 BlockNumber
Definition: block.h:31
static int32 next
Definition: blutils.c:219
static void cleanup(void)
Definition: bootstrap.c:687
int Buffer
Definition: buf.h:23
#define BufferIsLocal(buffer)
Definition: buf.h:37
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:3311
#define NameStr(name)
Definition: c.h:735
#define InvalidCommandId
Definition: c.h:658
unsigned int uint32
Definition: c.h:495
signed int int32
Definition: c.h:483
char * Pointer
Definition: c.h:472
#define VARHDRSZ
Definition: c.h:681
#define PG_BINARY
Definition: c.h:1283
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:387
#define FirstCommandId
Definition: c.h:657
uint32 CommandId
Definition: c.h:655
uint32 TransactionId
Definition: c.h:641
size_t Size
Definition: c.h:594
bool IsToastRelation(Relation relation)
Definition: catalog.c:147
bool IsSharedRelation(Oid relationId)
Definition: catalog.c:245
int64 TimestampTz
Definition: timestamp.h:39
#define INDIRECT_POINTER_SIZE
Definition: detoast.h:34
#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr)
Definition: detoast.h:22
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:863
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1431
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1421
struct cursor * cur
Definition: ecpg.c:28
void FreeErrorData(ErrorData *edata)
Definition: elog.c:1776
int errcode_for_file_access(void)
Definition: elog.c:881
void FlushErrorState(void)
Definition: elog.c:1825
int errmsg(const char *fmt,...)
Definition: elog.c:1069
ErrorData * CopyErrorData(void)
Definition: elog.c:1720
#define PG_RE_THROW()
Definition: elog.h:411
#define DEBUG3
Definition: elog.h:28
#define PG_TRY(...)
Definition: elog.h:370
#define DEBUG2
Definition: elog.h:29
#define PG_END_TRY(...)
Definition: elog.h:395
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:380
#define INFO
Definition: elog.h:34
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2854
int FreeDir(DIR *dir)
Definition: fd.c:2906
int CloseTransientFile(int fd)
Definition: fd.c:2754
void FileClose(File file)
Definition: fd.c:1930
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1527
int FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2088
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2869
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2578
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2788
int File
Definition: fd.h:49
MemoryContext GenerationContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: generation.c:157
Oid MyDatabaseId
Definition: globals.c:89
#define free(a)
Definition: header.h:65
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: heaptuple.c:1108
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition: heaptuple.c:1337
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
HeapTupleData * HeapTuple
Definition: htup.h:71
struct HeapTupleData HeapTupleData
#define SizeofHeapTupleHeader
Definition: htup_details.h:185
#define MaxHeapTupleSize
Definition: htup_details.h:558
static Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: htup_details.h:749
#define dlist_foreach(iter, lhead)
Definition: ilist.h:623
static void dlist_init(dlist_head *head)
Definition: ilist.h:314
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
static bool dlist_has_next(const dlist_head *head, const dlist_node *node)
Definition: ilist.h:503
static void dclist_push_tail(dclist_head *head, dlist_node *node)
Definition: ilist.h:709
static void dlist_insert_before(dlist_node *before, dlist_node *node)
Definition: ilist.h:393
#define dlist_head_element(type, membername, lhead)
Definition: ilist.h:603
static void dlist_delete(dlist_node *node)
Definition: ilist.h:405
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static dlist_node * dlist_next_node(dlist_head *head, dlist_node *node)
Definition: ilist.h:537
static dlist_node * dlist_pop_head_node(dlist_head *head)
Definition: ilist.h:450
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:640
static bool dlist_is_empty(const dlist_head *head)
Definition: ilist.h:336
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition: ilist.h:364
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
static void dclist_init(dclist_head *head)
Definition: ilist.h:671
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
Definition: inval.c:615
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
Assert(fmt[strlen(fmt) - 1] !='\n')
void list_sort(List *list, list_sort_comparator cmp)
Definition: list.c:1673
List * lappend(List *list, void *datum)
Definition: list.c:338
void UpdateDecodingStats(LogicalDecodingContext *ctx)
Definition: logical.c:1912
char * pstrdup(const char *in)
Definition: mcxt.c:1644
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc0(Size size)
Definition: mcxt.c:1257
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1064
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1476
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1021
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:403
void * palloc(Size size)
Definition: mcxt.c:1226
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:153
#define SLAB_DEFAULT_BLOCK_SIZE
Definition: memutils.h:182
#define SLAB_LARGE_BLOCK_SIZE
Definition: memutils.h:183
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:209
void * arg
#define MAXPGPATH
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
#define sprintf
Definition: port.h:240
#define snprintf
Definition: port.h:238
#define qsort(a, b, c, d)
Definition: port.h:445
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static Oid DatumGetObjectId(Datum X)
Definition: postgres.h:242
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define RelationIsLogicallyLogged(relation)
Definition: rel.h:702
#define RelationGetDescr(relation)
Definition: rel.h:530
#define RelationGetRelationName(relation)
Definition: rel.h:538
#define RelationIsValid(relation)
Definition: rel.h:477
Relation RelationIdGetRelation(Oid relationId)
Definition: relcache.c:2054
void RelationClose(Relation relation)
Definition: relcache.c:2160
Oid RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber)
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
#define relpathperm(rlocator, forknum)
Definition: relpath.h:90
static int file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
void ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change, bool upd_mem)
static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change)
TransactionId * ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
void ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
void ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, CommandId cid)
static void ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, RelFileLocator locator, ItemPointerData tid, CommandId cmin, CommandId cmax, CommandId combocid)
static void ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
void ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
void ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, TimestampTz abort_time)
static bool ReorderBufferCanStartStreaming(ReorderBuffer *rb)
static void ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, Snapshot snapshot_now, CommandId command_id, XLog