PostgreSQL Source Code  git master
reorderbuffer.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * reorderbuffer.c
4  * PostgreSQL logical replay/reorder buffer management
5  *
6  *
7  * Copyright (c) 2012-2021, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/reorderbuffer.c
12  *
13  * NOTES
14  * This module gets handed individual pieces of transactions in the order
15  * they are written to the WAL and is responsible for reassembling them into
16  * toplevel transaction sized pieces. When a transaction is completely
17  * reassembled - signaled by reading the transaction commit record - it
18  * will then call the output plugin (cf. ReorderBufferCommit()) with the
19  * individual changes. The output plugins rely on snapshots built by
20  * snapbuild.c which hands them to us.
21  *
22  * Transactions and subtransactions/savepoints in postgres are not
23  * immediately linked to each other from outside the performing
24  * backend. Only at commit/abort (or special xact_assignment records) are they
25  * linked together, which means that we will have to splice together a
26  * toplevel transaction from its subtransactions. To do that efficiently we
27  * build a binary heap indexed by the smallest current lsn of the individual
28  * subtransactions' changestreams. As the individual streams are inherently
29  * ordered by LSN - since that is where we build them from - the transaction
30  * can easily be reassembled by always using the subtransaction with the
31  * smallest current LSN from the heap.
32  *
33  * In order to cope with large transactions - which can be several times as
34  * big as the available memory - this module supports spooling the contents
35  * of a large transaction to disk. When the transaction is replayed the
36  * contents of individual (sub-)transactions will be read from disk in
37  * chunks.
38  *
39  * This module also has to deal with reassembling toast records from the
40  * individual chunks stored in WAL. When a new (or initial) version of a
41  * tuple is stored in WAL it will always be preceded by the toast chunks
42  * emitted for the columns stored out of line. Within a single toplevel
43  * transaction there will be no other data carrying records between a row's
44  * toast chunks and the row data itself. See ReorderBufferToast* for
45  * details.
46  *
47  * ReorderBuffer uses two special memory context types - SlabContext for
48  * allocations of fixed-length structures (changes and transactions), and
49  * GenerationContext for the variable-length transaction data (allocated
50  * and freed in groups with similar lifespans).
51  *
52  * To limit the amount of memory used by decoded changes, we track memory
53  * used at the reorder buffer level (i.e. total amount of memory), and for
54  * each transaction. When the total amount of used memory exceeds the
55  * limit, the transaction consuming the most memory is then serialized to
56  * disk.
57  *
58  * Only decoded changes are evicted from memory (spilled to disk), not the
59  * transaction records. The number of toplevel transactions is limited,
60  * but a transaction with many subtransactions may still consume significant
61  * amounts of memory. However, the transaction records are fairly small and
62  * are not included in the memory limit.
63  *
64  * The current eviction algorithm is very simple - the transaction is
65  * picked merely by size, while it might be useful to also consider age
66  * (LSN) of the changes for example. With the new Generational memory
67  * allocator, evicting the oldest changes would make it more likely the
68  * memory gets actually freed.
69  *
70  * We still rely on max_changes_in_memory when loading serialized changes
71  * back into memory. At that point we can't use the memory limit directly
72  * as we load the subxacts independently. One option to deal with this
73  * would be to count the subxacts, and allow each to allocate 1/N of the
74  * memory limit. That however does not seem very appealing, because with
75  * many subtransactions it may easily cause thrashing (short cycles of
76  * deserializing and applying very few changes). We probably should give
77  * a bit more memory to the oldest subtransactions, because it's likely
78  * they are the source for the next sequence of changes.
79  *
80  * -------------------------------------------------------------------------
81  */
82 #include "postgres.h"
83 
84 #include <unistd.h>
85 #include <sys/stat.h>
86 
87 #include "access/detoast.h"
88 #include "access/heapam.h"
89 #include "access/rewriteheap.h"
90 #include "access/transam.h"
91 #include "access/xact.h"
92 #include "access/xlog_internal.h"
93 #include "catalog/catalog.h"
94 #include "lib/binaryheap.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "replication/logical.h"
99 #include "replication/slot.h"
100 #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
101 #include "storage/bufmgr.h"
102 #include "storage/fd.h"
103 #include "storage/sinval.h"
104 #include "utils/builtins.h"
105 #include "utils/combocid.h"
106 #include "utils/memdebug.h"
107 #include "utils/memutils.h"
108 #include "utils/rel.h"
109 #include "utils/relfilenodemap.h"
110 
111 
112 /* entry for a hash table we use to map from xid to our transaction state */
114 {
118 
119 /* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */
121 {
125 
127 {
131  CommandId combocid; /* just for debugging */
133 
134 /*
 * Virtual file descriptor with file offset tracking.
 *
 * Pairs a File handle with the position at which the next read or write on
 * that file is expected to happen. The change iterator keeps one of these
 * per entry and starts each of them out with vfd set to -1, i.e. with no
 * file open.
 */
135 typedef struct TXNEntryFile
136 {
137  File vfd; /* -1 when the file is closed */
138  off_t curOffset; /* offset for next write or read. Reset to 0
139  * when vfd is opened. */
140 } TXNEntryFile;
141 
142 /* k-way in-order change iteration support structures */
144 {
151 
153 {
159 
160 /* toast datastructures */
161 typedef struct ReorderBufferToastEnt
162 {
163  Oid chunk_id; /* toast_table.chunk_id */
164  int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
165  * have seen */
166  Size num_chunks; /* number of chunks we've already seen */
167  Size size; /* combined size of chunks seen */
168  dlist_head chunks; /* linked list of chunks */
169  struct varlena *reconstructed; /* reconstructed varlena now pointed to in
170  * main tup */
172 
173 /* Disk serialization support datastructures */
175 {
178  /* data follows */
180 
/*
 * True if 'action' is REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT, i.e. the
 * change is a speculative insertion. The partial-change processing in this
 * file uses it to set RBTXN_HAS_SPEC_INSERT on the top-level transaction.
 */
181 #define IsSpecInsert(action) \
182 ( \
183  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
184 )
/*
 * True if 'action' is REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM, i.e. the
 * change confirms an earlier speculative insertion. The partial-change
 * processing in this file uses it to clear RBTXN_HAS_SPEC_INSERT again.
 */
185 #define IsSpecConfirm(action) \
186 ( \
187  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) \
188 )
/*
 * True if 'action' is an insert, an update, or a speculative insert.
 *
 * The partial-change processing in this file uses it to decide when a
 * pending toast insert has been completed by the change on the main table.
 *
 * Note that 'action' appears three times in the expansion, so the argument
 * should be free of side effects.
 */
189 #define IsInsertOrUpdate(action) \
190 ( \
191  (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
192  ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
193  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
194 )
195 
196 /*
197  * Maximum number of changes kept in memory, per transaction. After that,
198  * changes are spooled to disk.
199  *
200  * The current value should be sufficient to decode the entire transaction
201  * without hitting disk in OLTP workloads, while starting to spool to disk in
202  * other workloads reasonably fast.
203  *
204  * At some point in the future it probably makes sense to have a more elaborate
205  * resource management here, but it's not entirely clear what that would look
206  * like.
207  */
/*
 * As the NOTES in the file header explain, this limit is still relied upon
 * when loading serialized changes back into memory: the memory limit cannot
 * be used directly at that point because subxacts are loaded independently.
 */
209 static const Size max_changes_in_memory = 4096; /* XXX for restore only */
210 
211 /* ---------------------------------------
212  * primary reorderbuffer support routines
213  * ---------------------------------------
214  */
218  TransactionId xid, bool create, bool *is_new,
219  XLogRecPtr lsn, bool create_as_top);
221  ReorderBufferTXN *subtxn);
222 
223 static void AssertTXNLsnOrder(ReorderBuffer *rb);
224 
225 /* ---------------------------------------
226  * support functions for lsn-order iterating over the ->changes of a
227  * transaction and its subtransactions
228  *
229  * used for iteration over the k-way heap merge of a transaction and its
230  * subtransactions
231  * ---------------------------------------
232  */
234  ReorderBufferIterTXNState *volatile *iter_state);
239 
240 /*
241  * ---------------------------------------
242  * Disk serialization support functions
243  * ---------------------------------------
244  */
248  int fd, ReorderBufferChange *change);
250  TXNEntryFile *file, XLogSegNo *segno);
252  char *change);
255  bool txn_prepared);
256 static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
257 static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
258  TransactionId xid, XLogSegNo segno);
259 
260 static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
263 
264 /*
265  * ---------------------------------------
266  * Streaming support functions
267  * ---------------------------------------
268  */
269 static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
270 static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
273 
274 /* ---------------------------------------
275  * toast reassembly support
276  * ---------------------------------------
277  */
281  Relation relation, ReorderBufferChange *change);
283  Relation relation, ReorderBufferChange *change);
284 
285 /*
286  * ---------------------------------------
287  * memory accounting
288  * ---------------------------------------
289  */
292  ReorderBufferChange *change, bool addition);
293 
294 /*
295  * Allocate a new ReorderBuffer and clean out any old serialized state from
296  * prior ReorderBuffer instances for the same slot.
297  */
300 {
301  ReorderBuffer *buffer;
302  HASHCTL hash_ctl;
303  MemoryContext new_ctx;
304 
305  Assert(MyReplicationSlot != NULL);
306 
307  /* allocate memory in own context, to have better accountability */
309  "ReorderBuffer",
311 
312  buffer =
313  (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
314 
315  memset(&hash_ctl, 0, sizeof(hash_ctl));
316 
317  buffer->context = new_ctx;
318 
319  buffer->change_context = SlabContextCreate(new_ctx,
320  "Change",
322  sizeof(ReorderBufferChange));
323 
324  buffer->txn_context = SlabContextCreate(new_ctx,
325  "TXN",
327  sizeof(ReorderBufferTXN));
328 
329  buffer->tup_context = GenerationContextCreate(new_ctx,
330  "Tuples",
332 
333  hash_ctl.keysize = sizeof(TransactionId);
334  hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
335  hash_ctl.hcxt = buffer->context;
336 
337  buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
339 
341  buffer->by_txn_last_txn = NULL;
342 
343  buffer->outbuf = NULL;
344  buffer->outbufsize = 0;
345  buffer->size = 0;
346 
347  buffer->spillTxns = 0;
348  buffer->spillCount = 0;
349  buffer->spillBytes = 0;
350  buffer->streamTxns = 0;
351  buffer->streamCount = 0;
352  buffer->streamBytes = 0;
353  buffer->totalTxns = 0;
354  buffer->totalBytes = 0;
355 
357 
358  dlist_init(&buffer->toplevel_by_lsn);
360 
361  /*
362  * Ensure there's no stale data from prior uses of this slot, in case some
363  * prior exit avoided calling ReorderBufferFree. Failure to do this can
364  * produce duplicated txns, and it's very cheap if there's nothing there.
365  */
367 
368  return buffer;
369 }
370 
371 /*
372  * Free a ReorderBuffer
373  */
374 void
376 {
377  MemoryContext context = rb->context;
378 
379  /*
380  * We free separately allocated data by entirely scrapping reorderbuffer's
381  * memory context.
382  */
383  MemoryContextDelete(context);
384 
385  /* Free disk space used by unconsumed reorder buffers */
387 }
388 
389 /*
390  * Get an unused, possibly preallocated, ReorderBufferTXN.
391  */
392 static ReorderBufferTXN *
394 {
396 
397  txn = (ReorderBufferTXN *)
399 
400  memset(txn, 0, sizeof(ReorderBufferTXN));
401 
402  dlist_init(&txn->changes);
403  dlist_init(&txn->tuplecids);
404  dlist_init(&txn->subtxns);
405 
406  /* InvalidCommandId is not zero, so set it explicitly */
408  txn->output_plugin_private = NULL;
409 
410  return txn;
411 }
412 
413 /*
414  * Free a ReorderBufferTXN.
415  */
416 static void
418 {
419  /* clean the lookup cache if we were cached (quite likely) */
420  if (rb->by_txn_last_xid == txn->xid)
421  {
423  rb->by_txn_last_txn = NULL;
424  }
425 
426  /* free data that's contained */
427 
428  if (txn->gid != NULL)
429  {
430  pfree(txn->gid);
431  txn->gid = NULL;
432  }
433 
434  if (txn->tuplecid_hash != NULL)
435  {
437  txn->tuplecid_hash = NULL;
438  }
439 
440  if (txn->invalidations)
441  {
442  pfree(txn->invalidations);
443  txn->invalidations = NULL;
444  }
445 
446  pfree(txn);
447 }
448 
449 /*
450  * Get a fresh ReorderBufferChange.
451  */
454 {
455  ReorderBufferChange *change;
456 
457  change = (ReorderBufferChange *)
459 
460  memset(change, 0, sizeof(ReorderBufferChange));
461  return change;
462 }
463 
464 /*
465  * Free a ReorderBufferChange and update memory accounting, if requested.
466  */
467 void
469  bool upd_mem)
470 {
471  /* update memory accounting info */
472  if (upd_mem)
473  ReorderBufferChangeMemoryUpdate(rb, change, false);
474 
475  /* free contained data */
476  switch (change->action)
477  {
482  if (change->data.tp.newtuple)
483  {
484  ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple);
485  change->data.tp.newtuple = NULL;
486  }
487 
488  if (change->data.tp.oldtuple)
489  {
490  ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple);
491  change->data.tp.oldtuple = NULL;
492  }
493  break;
495  if (change->data.msg.prefix != NULL)
496  pfree(change->data.msg.prefix);
497  change->data.msg.prefix = NULL;
498  if (change->data.msg.message != NULL)
499  pfree(change->data.msg.message);
500  change->data.msg.message = NULL;
501  break;
503  if (change->data.inval.invalidations)
504  pfree(change->data.inval.invalidations);
505  change->data.inval.invalidations = NULL;
506  break;
508  if (change->data.snapshot)
509  {
510  ReorderBufferFreeSnap(rb, change->data.snapshot);
511  change->data.snapshot = NULL;
512  }
513  break;
514  /* no data in addition to the struct itself */
516  if (change->data.truncate.relids != NULL)
517  {
518  ReorderBufferReturnRelids(rb, change->data.truncate.relids);
519  change->data.truncate.relids = NULL;
520  }
521  break;
525  break;
526  }
527 
528  pfree(change);
529 }
530 
531 /*
532  * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size
533  * tuple_len (excluding header overhead).
534  */
537 {
538  ReorderBufferTupleBuf *tuple;
539  Size alloc_len;
540 
541  alloc_len = tuple_len + SizeofHeapTupleHeader;
542 
543  tuple = (ReorderBufferTupleBuf *)
545  sizeof(ReorderBufferTupleBuf) +
546  MAXIMUM_ALIGNOF + alloc_len);
547  tuple->alloc_tuple_size = alloc_len;
548  tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
549 
550  return tuple;
551 }
552 
553 /*
554  * Free a ReorderBufferTupleBuf.
555  */
556 void
558 {
559  pfree(tuple);
560 }
561 
562 /*
563  * Get an array for relids of truncated relations.
564  *
565  * We use the global memory context (for the whole reorder buffer), because
566  * none of the existing ones seems like a good match (some are SLAB, so we
567  * can't use those, and tup_context is meant for tuple data, not relids). We
568  * could add yet another context, but that seems like overkill - TRUNCATE is
569  * not a particularly common operation, so it does not seem worth it.
570  */
571 Oid *
573 {
574  Oid *relids;
575  Size alloc_len;
576 
577  alloc_len = sizeof(Oid) * nrelids;
578 
579  relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
580 
581  return relids;
582 }
583 
584 /*
585  * Free an array of relids.
586  */
587 void
589 {
590  pfree(relids);
591 }
592 
593 /*
594  * Return the ReorderBufferTXN from the given buffer, specified by Xid.
595  * If create is true, and a transaction doesn't already exist, create it
596  * (with the given LSN, and as top transaction if that's specified);
597  * when this happens, is_new is set to true.
598  */
599 static ReorderBufferTXN *
601  bool *is_new, XLogRecPtr lsn, bool create_as_top)
602 {
605  bool found;
606 
608 
609  /*
610  * Check the one-entry lookup cache first
611  */
613  rb->by_txn_last_xid == xid)
614  {
615  txn = rb->by_txn_last_txn;
616 
617  if (txn != NULL)
618  {
619  /* found it, and it's valid */
620  if (is_new)
621  *is_new = false;
622  return txn;
623  }
624 
625  /*
626  * cached as non-existent, and asked not to create? Then nothing else
627  * to do.
628  */
629  if (!create)
630  return NULL;
631  /* otherwise fall through to create it */
632  }
633 
634  /*
635  * We get here if the cache wasn't hit, or if it yielded a "does-not-exist"
636  * and we want to create an entry.
637  */
638 
639  /* search the lookup table */
640  ent = (ReorderBufferTXNByIdEnt *)
641  hash_search(rb->by_txn,
642  (void *) &xid,
643  create ? HASH_ENTER : HASH_FIND,
644  &found);
645  if (found)
646  txn = ent->txn;
647  else if (create)
648  {
649  /* initialize the new entry, if creation was requested */
650  Assert(ent != NULL);
651  Assert(lsn != InvalidXLogRecPtr);
652 
653  ent->txn = ReorderBufferGetTXN(rb);
654  ent->txn->xid = xid;
655  txn = ent->txn;
656  txn->first_lsn = lsn;
658 
659  if (create_as_top)
660  {
661  dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
662  AssertTXNLsnOrder(rb);
663  }
664  }
665  else
666  txn = NULL; /* not found and not asked to create */
667 
668  /* update cache */
669  rb->by_txn_last_xid = xid;
670  rb->by_txn_last_txn = txn;
671 
672  if (is_new)
673  *is_new = !found;
674 
675  Assert(!create || txn != NULL);
676  return txn;
677 }
678 
679 /*
680  * Record the partial change for the streaming of in-progress transactions. We
681  * can stream only complete changes so if we have a partial change like toast
682  * table insert or speculative insert then we mark such a 'txn' so that it
683  * can't be streamed. We also ensure that if the changes in such a 'txn' are
684  * above logical_decoding_work_mem threshold then we stream them as soon as we
685  * have a complete change.
686  */
687 static void
689  ReorderBufferChange *change,
690  bool toast_insert)
691 {
692  ReorderBufferTXN *toptxn;
693 
694  /*
695  * The partial changes need to be processed only while streaming
696  * in-progress transactions.
697  */
698  if (!ReorderBufferCanStream(rb))
699  return;
700 
701  /* Get the top transaction. */
702  if (txn->toptxn != NULL)
703  toptxn = txn->toptxn;
704  else
705  toptxn = txn;
706 
707  /*
708  * Set the toast insert bit whenever we get toast insert to indicate a
709  * partial change and clear it when we get the insert or update on main
710  * table (Both update and insert will do the insert in the toast table).
711  */
712  if (toast_insert)
714  else if (rbtxn_has_toast_insert(toptxn) &&
715  IsInsertOrUpdate(change->action))
716  toptxn->txn_flags &= ~RBTXN_HAS_TOAST_INSERT;
717 
718  /*
719  * Set the spec insert bit whenever we get the speculative insert to
720  * indicate the partial change and clear the same on speculative confirm.
721  */
722  if (IsSpecInsert(change->action))
723  toptxn->txn_flags |= RBTXN_HAS_SPEC_INSERT;
724  else if (IsSpecConfirm(change->action))
725  {
726  /*
727  * Speculative confirm change must be preceded by speculative
728  * insertion.
729  */
730  Assert(rbtxn_has_spec_insert(toptxn));
731  toptxn->txn_flags &= ~RBTXN_HAS_SPEC_INSERT;
732  }
733 
734  /*
735  * Stream the transaction if it was serialized before and the changes are
736  * now complete in the top-level transaction.
737  *
738  * The reason for doing the streaming of such a transaction as soon as we
739  * get the complete change for it is that previously it would have reached
740  * the memory threshold and wouldn't get streamed because of incomplete
741  * changes. Delaying such transactions would increase apply lag for them.
742  */
744  !(rbtxn_has_incomplete_tuple(toptxn)) &&
745  rbtxn_is_serialized(txn))
746  ReorderBufferStreamTXN(rb, toptxn);
747 }
748 
749 /*
750  * Queue a change into a transaction so it can be replayed upon commit or will be
751  * streamed when we reach logical_decoding_work_mem threshold.
752  */
753 void
755  ReorderBufferChange *change, bool toast_insert)
756 {
758 
759  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
760 
761  /*
762  * While streaming the previous changes we have detected that the
763  * transaction is aborted. So there is no point in collecting further
764  * changes for it.
765  */
766  if (txn->concurrent_abort)
767  {
768  /*
769  * We don't need to update memory accounting for this change as we
770  * have not added it to the queue yet.
771  */
772  ReorderBufferReturnChange(rb, change, false);
773  return;
774  }
775 
776  change->lsn = lsn;
777  change->txn = txn;
778 
779  Assert(InvalidXLogRecPtr != lsn);
780  dlist_push_tail(&txn->changes, &change->node);
781  txn->nentries++;
782  txn->nentries_mem++;
783 
784  /* update memory accounting information */
785  ReorderBufferChangeMemoryUpdate(rb, change, true);
786 
787  /* process partial change */
788  ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
789 
790  /* check the memory limits and evict something if needed */
792 }
793 
794 /*
795  * A transactional message is queued to be processed upon commit and a
796  * non-transactional message gets processed immediately.
797  */
798 void
800  Snapshot snapshot, XLogRecPtr lsn,
801  bool transactional, const char *prefix,
802  Size message_size, const char *message)
803 {
804  if (transactional)
805  {
806  MemoryContext oldcontext;
807  ReorderBufferChange *change;
808 
810 
811  oldcontext = MemoryContextSwitchTo(rb->context);
812 
813  change = ReorderBufferGetChange(rb);
815  change->data.msg.prefix = pstrdup(prefix);
816  change->data.msg.message_size = message_size;
817  change->data.msg.message = palloc(message_size);
818  memcpy(change->data.msg.message, message, message_size);
819 
820  ReorderBufferQueueChange(rb, xid, lsn, change, false);
821 
822  MemoryContextSwitchTo(oldcontext);
823  }
824  else
825  {
826  ReorderBufferTXN *txn = NULL;
827  volatile Snapshot snapshot_now = snapshot;
828 
829  if (xid != InvalidTransactionId)
830  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
831 
832  /* setup snapshot to allow catalog access */
833  SetupHistoricSnapshot(snapshot_now, NULL);
834  PG_TRY();
835  {
836  rb->message(rb, txn, lsn, false, prefix, message_size, message);
837 
839  }
840  PG_CATCH();
841  {
843  PG_RE_THROW();
844  }
845  PG_END_TRY();
846  }
847 }
848 
849 /*
850  * AssertTXNLsnOrder
851  * Verify LSN ordering of transaction lists in the reorderbuffer
852  *
853  * Other LSN-related invariants are checked too.
854  *
855  * No-op if assertions are not in use.
856  */
857 static void
859 {
860 #ifdef USE_ASSERT_CHECKING
861  dlist_iter iter;
862  XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
863  XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
864 
865  dlist_foreach(iter, &rb->toplevel_by_lsn)
866  {
868  iter.cur);
869 
870  /* start LSN must be set */
871  Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
872 
873  /* If there is an end LSN, it must not be lower than the start LSN */
874  if (cur_txn->end_lsn != InvalidXLogRecPtr)
875  Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
876 
877  /* Current initial LSN must be strictly higher than previous */
878  if (prev_first_lsn != InvalidXLogRecPtr)
879  Assert(prev_first_lsn < cur_txn->first_lsn);
880 
881  /* known-as-subtxn txns must not be listed */
882  Assert(!rbtxn_is_known_subxact(cur_txn));
883 
884  prev_first_lsn = cur_txn->first_lsn;
885  }
886 
888  {
890  base_snapshot_node,
891  iter.cur);
892 
893  /* base snapshot (and its LSN) must be set */
894  Assert(cur_txn->base_snapshot != NULL);
896 
897  /* current LSN must be strictly higher than previous */
898  if (prev_base_snap_lsn != InvalidXLogRecPtr)
899  Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
900 
901  /* known-as-subtxn txns must not be listed */
902  Assert(!rbtxn_is_known_subxact(cur_txn));
903 
904  prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
905  }
906 #endif
907 }
908 
909 /*
910  * AssertChangeLsnOrder
911  *
912  * Check ordering of changes in the (sub)transaction.
913  */
914 static void
916 {
917 #ifdef USE_ASSERT_CHECKING
918  dlist_iter iter;
919  XLogRecPtr prev_lsn = txn->first_lsn;
920 
921  dlist_foreach(iter, &txn->changes)
922  {
923  ReorderBufferChange *cur_change;
924 
925  cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
926 
928  Assert(cur_change->lsn != InvalidXLogRecPtr);
929  Assert(txn->first_lsn <= cur_change->lsn);
930 
931  if (txn->end_lsn != InvalidXLogRecPtr)
932  Assert(cur_change->lsn <= txn->end_lsn);
933 
934  Assert(prev_lsn <= cur_change->lsn);
935 
936  prev_lsn = cur_change->lsn;
937  }
938 #endif
939 }
940 
941 /*
942  * ReorderBufferGetOldestTXN
943  * Return oldest transaction in reorderbuffer
944  */
947 {
949 
950  AssertTXNLsnOrder(rb);
951 
953  return NULL;
954 
956 
959  return txn;
960 }
961 
962 /*
963  * ReorderBufferGetOldestXmin
964  * Return oldest Xmin in reorderbuffer
965  *
966  * Returns oldest possibly running Xid from the point of view of snapshots
967  * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
968  * there are none.
969  *
970  * Since snapshots are assigned monotonically, this equals the Xmin of the
971  * base snapshot with minimal base_snapshot_lsn.
972  */
975 {
977 
978  AssertTXNLsnOrder(rb);
979 
981  return InvalidTransactionId;
982 
983  txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
985  return txn->base_snapshot->xmin;
986 }
987 
988 void
990 {
992 }
993 
994 /*
995  * ReorderBufferAssignChild
996  *
997  * Make note that we know that subxid is a subtransaction of xid, seen as of
998  * the given lsn.
999  */
1000 void
1002  TransactionId subxid, XLogRecPtr lsn)
1003 {
1005  ReorderBufferTXN *subtxn;
1006  bool new_top;
1007  bool new_sub;
1008 
1009  txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1010  subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1011 
1012  if (!new_sub)
1013  {
1014  if (rbtxn_is_known_subxact(subtxn))
1015  {
1016  /* already associated, nothing to do */
1017  return;
1018  }
1019  else
1020  {
1021  /*
1022  * We already saw this transaction, but initially added it to the
1023  * list of top-level txns. Now that we know it's not top-level,
1024  * remove it from there.
1025  */
1026  dlist_delete(&subtxn->node);
1027  }
1028  }
1029 
1030  subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1031  subtxn->toplevel_xid = xid;
1032  Assert(subtxn->nsubtxns == 0);
1033 
1034  /* set the reference to top-level transaction */
1035  subtxn->toptxn = txn;
1036 
1037  /* add to subtransaction list */
1038  dlist_push_tail(&txn->subtxns, &subtxn->node);
1039  txn->nsubtxns++;
1040 
1041  /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1042  ReorderBufferTransferSnapToParent(txn, subtxn);
1043 
1044  /* Verify LSN-ordering invariant */
1045  AssertTXNLsnOrder(rb);
1046 }
1047 
1048 /*
1049  * ReorderBufferTransferSnapToParent
1050  * Transfer base snapshot from subtxn to top-level txn, if needed
1051  *
1052  * This is done if the top-level txn doesn't have a base snapshot, or if the
1053  * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1054  * snapshot's LSN. This can happen if there are no changes in the toplevel
1055  * txn but there are some in the subtxn, or the first change in subtxn has
1056  * earlier LSN than first change in the top-level txn and we learned about
1057  * their kinship only now.
1058  *
1059  * The subtransaction's snapshot is cleared regardless of the transfer
1060  * happening, since it's not needed anymore in either case.
1061  *
1062  * We do this as soon as we become aware of their kinship, to avoid queueing
1063  * extra snapshots to txns known-as-subtxns -- only top-level txns will
1064  * receive further snapshots.
1065  */
1066 static void
1068  ReorderBufferTXN *subtxn)
1069 {
1070  Assert(subtxn->toplevel_xid == txn->xid);
1071 
1072  if (subtxn->base_snapshot != NULL)
1073  {
1074  if (txn->base_snapshot == NULL ||
1075  subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1076  {
1077  /*
1078  * If the toplevel transaction already has a base snapshot but
1079  * it's newer than the subxact's, purge it.
1080  */
1081  if (txn->base_snapshot != NULL)
1082  {
1085  }
1086 
1087  /*
1088  * The snapshot is now the top transaction's; transfer it, and
1089  * adjust the list position of the top transaction in the list by
1090  * moving it to where the subtransaction is.
1091  */
1092  txn->base_snapshot = subtxn->base_snapshot;
1093  txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1095  &txn->base_snapshot_node);
1096 
1097  /*
1098  * The subtransaction doesn't have a snapshot anymore (so it
1099  * mustn't be in the list.)
1100  */
1101  subtxn->base_snapshot = NULL;
1103  dlist_delete(&subtxn->base_snapshot_node);
1104  }
1105  else
1106  {
1107  /* Base snap of toplevel is fine, so subxact's is not needed */
1109  dlist_delete(&subtxn->base_snapshot_node);
1110  subtxn->base_snapshot = NULL;
1112  }
1113  }
1114 }
1115 
1116 /*
1117  * Associate a subtransaction with its toplevel transaction at commit
1118  * time. There may be no further changes added after this.
1119  */
1120 void
1122  TransactionId subxid, XLogRecPtr commit_lsn,
1123  XLogRecPtr end_lsn)
1124 {
1125  ReorderBufferTXN *subtxn;
1126 
1127  subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1128  InvalidXLogRecPtr, false);
1129 
1130  /*
1131  * No need to do anything if that subtxn didn't contain any changes
1132  */
1133  if (!subtxn)
1134  return;
1135 
1136  subtxn->final_lsn = commit_lsn;
1137  subtxn->end_lsn = end_lsn;
1138 
1139  /*
1140  * Assign this subxact as a child of the toplevel xact (no-op if already
1141  * done.)
1142  */
1143  ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1144 }
1145 
1146 
1147 /*
1148  * Support for efficiently iterating over a transaction's and its
1149  * subtransactions' changes.
1150  *
1151  * We do this by doing a k-way merge between transactions/subtransactions. For that
1152  * we model the current heads of the different transactions as a binary heap
1153  * so we easily know which (sub-)transaction has the change with the smallest
1154  * lsn next.
1155  *
1156  * We assume the changes in individual transactions are already sorted by LSN.
1157  */
1158 
1159 /*
1160  * Binary heap comparison function.
1161  */
1162 static int
1164 {
1166  XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1167  XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1168 
1169  if (pos_a < pos_b)
1170  return 1;
1171  else if (pos_a == pos_b)
1172  return 0;
1173  return -1;
1174 }
1175 
1176 /*
1177  * Allocate & initialize an iterator which iterates in lsn order over a
1178  * transaction and all its subtransactions.
1179  *
1180  * Note: The iterator state is returned through the iter_state parameter rather
1181  * than the function's return value. This is because the state gets cleaned up
1182  * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1183  * back the state even if this function throws an exception.
1184  */
1185 static void
1187  ReorderBufferIterTXNState *volatile *iter_state)
1188 {
1189  Size nr_txns = 0;
1191  dlist_iter cur_txn_i;
1192  int32 off;
1193 
1194  *iter_state = NULL;
1195 
1196  /* Check ordering of changes in the toplevel transaction. */
1197  AssertChangeLsnOrder(txn);
1198 
1199  /*
1200  * Calculate the size of our heap: one element for every transaction that
1201  * contains changes. (Besides the transactions already in the reorder
1202  * buffer, we count the one we were directly passed.)
1203  */
1204  if (txn->nentries > 0)
1205  nr_txns++;
1206 
1207  dlist_foreach(cur_txn_i, &txn->subtxns)
1208  {
1209  ReorderBufferTXN *cur_txn;
1210 
1211  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1212 
1213  /* Check ordering of changes in this subtransaction. */
1214  AssertChangeLsnOrder(cur_txn);
1215 
1216  if (cur_txn->nentries > 0)
1217  nr_txns++;
1218  }
1219 
1220  /* allocate iteration state, including the trailing entries[] array */
1221  state = (ReorderBufferIterTXNState *)
1223  sizeof(ReorderBufferIterTXNState) +
1224  sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1225 
1226  state->nr_txns = nr_txns;
1227  dlist_init(&state->old_change);
1228 
1229  for (off = 0; off < state->nr_txns; off++)
1230  {
1231  state->entries[off].file.vfd = -1; /* -1 means no file is open for this entry */
1232  state->entries[off].segno = 0;
1233  }
1234 
1235  /* allocate heap, sized for one element per counted transaction */
1236  state->heap = binaryheap_allocate(state->nr_txns,
1238  state);
1239 
1240  /* Now that the state fields are initialized, it is safe to return it. */
1241  *iter_state = state;
1242 
1243  /*
1244  * Now insert items into the binary heap, in an unordered fashion. (We
1245  * will run a heap assembly step at the end; this is more efficient.)
1246  */
1247 
1248  off = 0;
1249 
1250  /* add toplevel transaction if it contains changes */
1251  if (txn->nentries > 0)
1252  {
1253  ReorderBufferChange *cur_change;
1254 
1255  if (rbtxn_is_serialized(txn))
1256  {
1257  /* serialize remaining changes, then restore from disk */
1258  ReorderBufferSerializeTXN(rb, txn);
1259  ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1260  &state->entries[off].segno);
1261  }
1262 
1263  cur_change = dlist_head_element(ReorderBufferChange, node,
1264  &txn->changes);
1265 
1266  state->entries[off].lsn = cur_change->lsn;
1267  state->entries[off].change = cur_change;
1268  state->entries[off].txn = txn;
1269 
1271  }
1272 
1273  /* add subtransactions if they contain changes */
1274  dlist_foreach(cur_txn_i, &txn->subtxns)
1275  {
1276  ReorderBufferTXN *cur_txn;
1277 
1278  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1279 
1280  if (cur_txn->nentries > 0)
1281  {
1282  ReorderBufferChange *cur_change;
1283 
1284  if (rbtxn_is_serialized(cur_txn))
1285  {
1286  /* serialize remaining changes, then restore from disk */
1287  ReorderBufferSerializeTXN(rb, cur_txn);
1288  ReorderBufferRestoreChanges(rb, cur_txn,
1289  &state->entries[off].file,
1290  &state->entries[off].segno);
1291  }
1292  cur_change = dlist_head_element(ReorderBufferChange, node,
1293  &cur_txn->changes);
1294 
1295  state->entries[off].lsn = cur_change->lsn;
1296  state->entries[off].change = cur_change;
1297  state->entries[off].txn = cur_txn;
1298 
1300  }
1301  }
1302 
1303  /* assemble a valid binary heap */
1304  binaryheap_build(state->heap);
1305 }
1306 
1307 /*
1308  * Return the next change when iterating over a transaction and its
1309  * subtransactions.
1310  *
1311  * Returns NULL when no further changes exist (the heap is empty).
1312  */
1313 static ReorderBufferChange *
1315 {
1316  ReorderBufferChange *change;
1318  int32 off;
1319 
1320  /* nothing there anymore */
1321  if (state->heap->bh_size == 0)
1322  return NULL;
1323 
1324  off = DatumGetInt32(binaryheap_first(state->heap));
1325  entry = &state->entries[off];
1326 
1327  /* free memory we might have "leaked" in the previous *Next call */
1328  if (!dlist_is_empty(&state->old_change))
1329  {
1330  change = dlist_container(ReorderBufferChange, node,
1331  dlist_pop_head_node(&state->old_change));
1332  ReorderBufferReturnChange(rb, change, true);
1333  Assert(dlist_is_empty(&state->old_change));
1334  }
1335 
1336  change = entry->change;
1337 
1338  /*
1339  * update heap with information about which transaction has the next
1340  * relevant change in LSN order
1341  */
1342 
1343  /* there are in-memory changes */
1344  if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1345  {
1346  dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1347  ReorderBufferChange *next_change =
1348  dlist_container(ReorderBufferChange, node, next);
1349 
1350  /* txn stays the same */
1351  state->entries[off].lsn = next_change->lsn;
1352  state->entries[off].change = next_change;
1353 
1355  return change;
1356  }
1357 
1358  /* try to load changes from disk */
1359  if (entry->txn->nentries != entry->txn->nentries_mem)
1360  {
1361  /*
1362  * Ugly: restoring changes will reuse *Change records, thus delete the
1363  * current one from the per-tx list and only free it in the next call.
1364  */
1365  dlist_delete(&change->node);
1366  dlist_push_tail(&state->old_change, &change->node);
1367 
1368  /*
1369  * Update the total bytes processed before releasing the current set
1370  * of changes and restoring the new set of changes.
1371  */
1372  rb->totalBytes += rb->size;
1373  if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1374  &state->entries[off].segno))
1375  {
1376  /* successfully restored changes from disk */
1377  ReorderBufferChange *next_change =
1379  &entry->txn->changes);
1380 
1381  elog(DEBUG2, "restored %u/%u changes from disk",
1382  (uint32) entry->txn->nentries_mem,
1383  (uint32) entry->txn->nentries);
1384 
1385  Assert(entry->txn->nentries_mem);
1386  /* txn stays the same */
1387  state->entries[off].lsn = next_change->lsn;
1388  state->entries[off].change = next_change;
1390 
1391  return change;
1392  }
1393  }
1394 
1395  /* ok, no changes there anymore, remove this entry from the heap */
1396  binaryheap_remove_first(state->heap);
1397 
1398  return change;
1399 }
1400 
1401 /*
1402  * Deallocate the iterator, closing any files it still has open
1403  */
1404 static void
1407 {
1408  int32 off;
1409 
1410  for (off = 0; off < state->nr_txns; off++)
1411  {
1412  if (state->entries[off].file.vfd != -1)
1413  FileClose(state->entries[off].file.vfd);
1414  }
1415 
1416  /* free memory we might have "leaked" in the last *Next call */
1417  if (!dlist_is_empty(&state->old_change))
1418  {
1419  ReorderBufferChange *change;
1420 
1421  change = dlist_container(ReorderBufferChange, node,
1422  dlist_pop_head_node(&state->old_change));
1423  ReorderBufferReturnChange(rb, change, true);
1424  Assert(dlist_is_empty(&state->old_change));
1425  }
1426 
1427  binaryheap_free(state->heap);
1428  pfree(state);
1429 }
1430 
1431 /*
1432  * Cleanup the contents of a transaction, usually after the transaction
1433  * committed or aborted.
1434  */
1435 static void
1437 {
1438  bool found;
1439  dlist_mutable_iter iter;
1440 
1441  /* cleanup subtransactions & their changes */
1442  dlist_foreach_modify(iter, &txn->subtxns)
1443  {
1444  ReorderBufferTXN *subtxn;
1445 
1446  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1447 
1448  /*
1449  * Subtransactions are always associated to the toplevel TXN, even if
1450  * they originally were happening inside another subtxn, so we won't
1451  * ever recurse more than one level deep here.
1452  */
1453  Assert(rbtxn_is_known_subxact(subtxn));
1454  Assert(subtxn->nsubtxns == 0);
1455 
1456  ReorderBufferCleanupTXN(rb, subtxn);
1457  }
1458 
1459  /* cleanup changes in the txn */
1460  dlist_foreach_modify(iter, &txn->changes)
1461  {
1462  ReorderBufferChange *change;
1463 
1464  change = dlist_container(ReorderBufferChange, node, iter.cur);
1465 
1466  /* Check we're not mixing changes from different transactions. */
1467  Assert(change->txn == txn);
1468 
1469  ReorderBufferReturnChange(rb, change, true);
1470  }
1471 
1472  /*
1473  * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1474  * They are always stored in the toplevel transaction.
1475  */
1476  dlist_foreach_modify(iter, &txn->tuplecids)
1477  {
1478  ReorderBufferChange *change;
1479 
1480  change = dlist_container(ReorderBufferChange, node, iter.cur);
1481 
1482  /* Check we're not mixing changes from different transactions. */
1483  Assert(change->txn == txn);
1485 
1486  ReorderBufferReturnChange(rb, change, true);
1487  }
1488 
1489  /*
1490  * Cleanup the base snapshot, if set.
1491  */
1492  if (txn->base_snapshot != NULL)
1493  {
1496  }
1497 
1498  /*
1499  * Cleanup the snapshot for the last streamed run.
1500  */
1501  if (txn->snapshot_now != NULL)
1502  {
1503  Assert(rbtxn_is_streamed(txn));
1505  }
1506 
1507  /*
1508  * Remove TXN from its containing list.
1509  *
1510  * Note: if txn is known as subxact, we are deleting the TXN from its
1511  * parent's list of known subxacts; this leaves the parent's nsubtxns
1512  * count too high, but we don't care. Otherwise, we are deleting the TXN
1513  * from the LSN-ordered list of toplevel TXNs.
1514  */
1515  dlist_delete(&txn->node);
1516 
1517  /* now remove reference from buffer */
1518  hash_search(rb->by_txn,
1519  (void *) &txn->xid,
1520  HASH_REMOVE,
1521  &found);
1522  Assert(found);
1523 
1524  /* remove entries spilled to disk */
1525  if (rbtxn_is_serialized(txn))
1526  ReorderBufferRestoreCleanup(rb, txn);
1527 
1528  /* deallocate */
1529  ReorderBufferReturnTXN(rb, txn);
1530 }
1531 
1532 /*
1533  * Discard changes from a transaction (and subtransactions), either after
1534  * streaming or decoding them at PREPARE. Keep the remaining info -
1535  * transactions, tuplecids, invalidations and snapshots.
1536  *
1537  * We additionally remove tuplecids after decoding the transaction at prepare
1538  * time as we only need to perform invalidation at rollback or commit prepared.
1539  *
1540  * 'txn_prepared' indicates that we have decoded the transaction at prepare
1541  * time.
1542  */
1543 static void
1545 {
1546  dlist_mutable_iter iter;
1547 
1548  /* cleanup subtransactions & their changes */
1549  dlist_foreach_modify(iter, &txn->subtxns)
1550  {
1551  ReorderBufferTXN *subtxn;
1552 
1553  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1554 
1555  /*
1556  * Subtransactions are always associated to the toplevel TXN, even if
1557  * they originally were happening inside another subtxn, so we won't
1558  * ever recurse more than one level deep here.
1559  */
1560  Assert(rbtxn_is_known_subxact(subtxn));
1561  Assert(subtxn->nsubtxns == 0);
1562 
1563  ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1564  }
1565 
1566  /* cleanup changes in the txn */
1567  dlist_foreach_modify(iter, &txn->changes)
1568  {
1569  ReorderBufferChange *change;
1570 
1571  change = dlist_container(ReorderBufferChange, node, iter.cur);
1572 
1573  /* Check we're not mixing changes from different transactions. */
1574  Assert(change->txn == txn);
1575 
1576  /* remove the change from its containing list */
1577  dlist_delete(&change->node);
1578 
1579  ReorderBufferReturnChange(rb, change, true);
1580  }
1581 
1582  /*
1583  * Mark the transaction as streamed.
1584  *
1585  * The toplevel transaction, identified by (toptxn==NULL), is marked as
1586  * streamed always, even if it does not contain any changes (that is, when
1587  * all the changes are in subtransactions).
1588  *
1589  * For subtransactions, we only mark them as streamed when there are
1590  * changes in them.
1591  *
1592  * We do it this way because of aborts - we don't want to send aborts for
1593  * XIDs the downstream is not aware of. And of course, it always knows
1594  * about the toplevel xact (we send the XID in all messages), but we never
1595  * stream XIDs of empty subxacts.
1596  */
1597  if ((!txn_prepared) && ((!txn->toptxn) || (txn->nentries_mem != 0)))
1598  txn->txn_flags |= RBTXN_IS_STREAMED;
1599 
1600  if (txn_prepared)
1601  {
1602  /*
1603  * If this is a prepared txn, cleanup the tuplecids we stored for
1604  * decoding catalog snapshot access. They are always stored in the
1605  * toplevel transaction.
1606  */
1607  dlist_foreach_modify(iter, &txn->tuplecids)
1608  {
1609  ReorderBufferChange *change;
1610 
1611  change = dlist_container(ReorderBufferChange, node, iter.cur);
1612 
1613  /* Check we're not mixing changes from different transactions. */
1614  Assert(change->txn == txn);
1616 
1617  /* Remove the change from its containing list. */
1618  dlist_delete(&change->node);
1619 
1620  ReorderBufferReturnChange(rb, change, true);
1621  }
1622  }
1623 
1624  /*
1625  * Destroy the (relfilenode, ctid) hashtable, so that we don't leak any
1626  * memory. We could also keep the hash table and update it with new ctid
1627  * values, but this seems simpler and good enough for now.
1628  */
1629  if (txn->tuplecid_hash != NULL)
1630  {
1632  txn->tuplecid_hash = NULL;
1633  }
1634 
1635  /* If this txn is serialized then clean the disk space. */
1636  if (rbtxn_is_serialized(txn))
1637  {
1638  ReorderBufferRestoreCleanup(rb, txn);
1639  txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1640 
1641  /*
1642  * We set this flag to indicate if the transaction is ever serialized.
1643  * We need this to accurately update the stats as otherwise the same
1644  * transaction can be counted as serialized multiple times.
1645  */
1647  }
1648 
1649  /* also reset the number of entries in the transaction */
1650  txn->nentries_mem = 0;
1651  txn->nentries = 0;
1652 }
1653 
1654 /*
1655  * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by
1656  * HeapTupleSatisfiesHistoricMVCC.
1657  */
1658 static void
1660 {
1661  dlist_iter iter;
1662  HASHCTL hash_ctl;
1663 
1665  return;
1666 
1667  hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1668  hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1669  hash_ctl.hcxt = rb->context;
1670 
1671  /*
1672  * create the hash with the exact number of to-be-stored tuplecids from
1673  * the start
1674  */
1675  txn->tuplecid_hash =
1676  hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1678 
1679  dlist_foreach(iter, &txn->tuplecids)
1680  {
1683  bool found;
1684  ReorderBufferChange *change;
1685 
1686  change = dlist_container(ReorderBufferChange, node, iter.cur);
1687 
1689 
1690  /* be careful about padding: zero the whole key before filling it in */
1691  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1692 
1693  key.relnode = change->data.tuplecid.node;
1694 
1695  ItemPointerCopy(&change->data.tuplecid.tid,
1696  &key.tid);
1697 
1698  ent = (ReorderBufferTupleCidEnt *)
1700  (void *) &key,
1702  &found);
1703  if (!found)
1704  {
1705  ent->cmin = change->data.tuplecid.cmin;
1706  ent->cmax = change->data.tuplecid.cmax;
1707  ent->combocid = change->data.tuplecid.combocid;
1708  }
1709  else
1710  {
1711  /*
1712  * Maybe we already saw this tuple before in this transaction, but
1713  * if so it must have the same cmin.
1714  */
1715  Assert(ent->cmin == change->data.tuplecid.cmin);
1716 
1717  /*
1718  * cmax may be initially invalid, but once set it can only grow,
1719  * and never become invalid again.
1720  */
1721  Assert((ent->cmax == InvalidCommandId) ||
1722  ((change->data.tuplecid.cmax != InvalidCommandId) &&
1723  (change->data.tuplecid.cmax > ent->cmax)));
1724  ent->cmax = change->data.tuplecid.cmax;
1725  }
1726  }
1727 }
1728 
1729 /*
1730  * Copy a provided snapshot so we can modify it privately. This is needed so
1731  * that catalog modifying transactions can look into intermediate catalog
1732  * states.
1733  */
1734 static Snapshot
1737 {
1738  Snapshot snap;
1739  dlist_iter iter;
1740  int i = 0;
1741  Size size;
1742 
1743  size = sizeof(SnapshotData) +
1744  sizeof(TransactionId) * orig_snap->xcnt +
1745  sizeof(TransactionId) * (txn->nsubtxns + 1);
1746 
1747  snap = MemoryContextAllocZero(rb->context, size);
1748  memcpy(snap, orig_snap, sizeof(SnapshotData));
1749 
1750  snap->copied = true;
1751  snap->active_count = 1; /* mark as active so nobody frees it */
1752  snap->regd_count = 0;
1753  snap->xip = (TransactionId *) (snap + 1);
1754 
1755  memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1756 
1757  /*
1758  * snap->subxip contains all txids that belong to our transaction which we
1759  * need to check via cmin/cmax. That's why we store the toplevel
1760  * transaction in there as well.
1761  */
1762  snap->subxip = snap->xip + snap->xcnt;
1763  snap->subxip[i++] = txn->xid;
1764 
1765  /*
1766  * txn->nsubtxns isn't decreased when subtransactions abort, so count
1767  * manually. Since it's an upper boundary it is safe to use it for the
1768  * allocation above.
1769  */
1770  snap->subxcnt = 1;
1771 
1772  dlist_foreach(iter, &txn->subtxns)
1773  {
1774  ReorderBufferTXN *sub_txn;
1775 
1776  sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1777  snap->subxip[i++] = sub_txn->xid;
1778  snap->subxcnt++;
1779  }
1780 
1781  /* sort so we can bsearch() later */
1782  qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1783 
1784  /* store the specified current CommandId */
1785  snap->curcid = cid;
1786 
1787  return snap;
1788 }
1789 
1790 /*
1791  * Free a snapshot; one marked as copied (see ReorderBufferCopySnap) is pfree'd
1792  */
1793 static void
1795 {
1796  if (snap->copied)
1797  pfree(snap);
1798  else
1800 }
1801 
1802 /*
1803  * If the transaction was (partially) streamed, we need to prepare or commit
1804  * it in a 'streamed' way. That is, we first stream the remaining part of the
1805  * transaction, and then invoke the stream_prepare or stream_commit callback,
1806  * as appropriate (followed by a truncate or a full cleanup, respectively).
1807  */
1808 static void
1810 {
1811  /* we should only call this for previously streamed transactions */
1812  Assert(rbtxn_is_streamed(txn));
1813 
1814  ReorderBufferStreamTXN(rb, txn);
1815 
1816  if (rbtxn_prepared(txn))
1817  {
1818  /*
1819  * Note, we send stream prepare even if a concurrent abort is
1820  * detected. See DecodePrepare for more information.
1821  */
1822  rb->stream_prepare(rb, txn, txn->final_lsn);
1823 
1824  /*
1825  * This is a PREPARED transaction, part of a two-phase commit. The
1826  * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1827  * just truncate txn by removing changes and tuple_cids.
1828  */
1829  ReorderBufferTruncateTXN(rb, txn, true);
1830  /* Reset CheckXidAlive */
1832  }
1833  else
1834  {
1835  rb->stream_commit(rb, txn, txn->final_lsn);
1836  ReorderBufferCleanupTXN(rb, txn);
1837  }
1838 }
1839 
1840 /*
1841  * Set xid to detect concurrent aborts.
1842  *
1843  * While streaming an in-progress transaction or decoding a prepared
1844  * transaction there is a possibility that the (sub)transaction might get
1845  * aborted concurrently. In such case if the (sub)transaction has catalog
1846  * update then we might decode the tuple using wrong catalog version. For
1847  * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1848  * the transaction 501 updates the catalog tuple and after that we will have
1849  * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1850  * aborted and some other transaction say 502 updates the same catalog tuple
1851  * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1852  * problem is that when we try to decode the tuple inserted/updated in 501
1853  * after the catalog update, we will see the catalog tuple with (xmin: 500,
1854  * xmax: 502) as visible because it will consider that the tuple is deleted by
1855  * xid 502 which is not visible to our snapshot. And when we will try to
1856  * decode with that catalog tuple, it can lead to a wrong result or a crash.
1857  * So, it is necessary to detect concurrent aborts to allow streaming of
1858  * in-progress transactions or decoding of prepared transactions.
1859  *
1860  * For detecting the concurrent abort we set CheckXidAlive to the xid of the
1861  * current (sub)transaction to which this change belongs. And, during
1862  * catalog scan we can check the status of the xid and if it is aborted we will
1863  * report a specific error so that we can stop streaming current transaction
1864  * and discard the already streamed changes on such an error. We might have
1865  * already streamed some of the changes for the aborted (sub)transaction, but
1866  * that is fine because when we decode the abort we will stream abort message
1867  * to truncate the changes in the subscriber. Similarly, for prepared
1868  * transactions, we stop decoding if concurrent abort is detected and then
1869  * rollback the changes when rollback prepared is encountered. See
1870  * DecodePrepare.
1871  */
1872 static inline void
1874 {
1875  /*
1876  * If the input transaction id is already set as a CheckXidAlive then
1877  * nothing to do.
1878  */
1880  return;
1881 
1882  /*
1883  * setup CheckXidAlive if it's not committed yet. We don't check if the
1884  * xid is aborted. That will happen during catalog access.
1885  */
1886  if (!TransactionIdDidCommit(xid))
1887  CheckXidAlive = xid;
1888  else
1890 }
1891 
1892 /*
1893  * Helper for ReorderBufferProcessTXN: pass a change to stream_change when streaming, else to apply_change.
1894  */
1895 static inline void
1897  Relation relation, ReorderBufferChange *change,
1898  bool streaming)
1899 {
1900  if (streaming)
1901  rb->stream_change(rb, txn, relation, change);
1902  else
1903  rb->apply_change(rb, txn, relation, change);
1904 }
1905 
1906 /*
1907  * Helper for ReorderBufferProcessTXN: pass a truncate to stream_truncate when streaming, else to apply_truncate.
1908  */
1909 static inline void
1911  int nrelations, Relation *relations,
1912  ReorderBufferChange *change, bool streaming)
1913 {
1914  if (streaming)
1915  rb->stream_truncate(rb, txn, nrelations, relations, change);
1916  else
1917  rb->apply_truncate(rb, txn, nrelations, relations, change);
1918 }
1919 
1920 /*
1921  * Helper for ReorderBufferProcessTXN: pass a message change to stream_message when streaming, else to message.
1922  */
1923 static inline void
1925  ReorderBufferChange *change, bool streaming)
1926 {
1927  if (streaming)
1928  rb->stream_message(rb, txn, change->lsn, true,
1929  change->data.msg.prefix,
1930  change->data.msg.message_size,
1931  change->data.msg.message);
1932  else
1933  rb->message(rb, txn, change->lsn, true,
1934  change->data.msg.prefix,
1935  change->data.msg.message_size,
1936  change->data.msg.message);
1937 }
1938 
1939 /*
1940  * Function to store the command id and snapshot at the end of the current
1941  * stream so that we can reuse them when sending the next stream.
1942  */
1943 static inline void
1945  Snapshot snapshot_now, CommandId command_id)
1946 {
1947  txn->command_id = command_id;
1948 
1949  /* Avoid copying if it's already copied; otherwise store a private copy. */
1950  if (snapshot_now->copied)
1951  txn->snapshot_now = snapshot_now;
1952  else
1953  txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
1954  txn, command_id);
1955 }
1956 
1957 /*
1958  * Helper function for ReorderBufferProcessTXN to handle the concurrent
1959  * abort of the streaming transaction. This resets the TXN such that it
1960  * can be used to stream the remaining data of the transaction being processed.
1961  * This can happen when the subtransaction is aborted and we still want to
1962  * continue processing the main or other subtransactions' data.
1963  */
1964 static void
1966  Snapshot snapshot_now,
1967  CommandId command_id,
1968  XLogRecPtr last_lsn,
1969  ReorderBufferChange *specinsert)
1970 {
1971  /* Discard the changes that we just streamed */
1973 
1974  /* Free all resources allocated for toast reconstruction */
1975  ReorderBufferToastReset(rb, txn);
1976 
1977  /* Return the spec insert change if it is not NULL */
1978  if (specinsert != NULL)
1979  {
1980  ReorderBufferReturnChange(rb, specinsert, true);
1981  specinsert = NULL; /* only clears this function's by-value copy of the pointer */
1982  }
1983 
1984  /*
1985  * For the streaming case, stop the stream and remember the command ID and
1986  * snapshot for the streaming run.
1987  */
1988  if (rbtxn_is_streamed(txn))
1989  {
1990  rb->stream_stop(rb, txn, last_lsn);
1991  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
1992  }
1993 }
1994 
1995 /*
1996  * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
1997  *
1998  * Send data of a transaction (and its subtransactions) to the
1999  * output plugin. We iterate over the top and subtransactions (using a k-way
2000  * merge) and replay the changes in lsn order.
2001  *
2002  * If streaming is true then data will be sent using stream API.
2003  *
2004  * Note: "volatile" markers on some parameters are to avoid trouble with
2005  * PG_TRY inside the function.
2006  */
2007 static void
2009  XLogRecPtr commit_lsn,
2010  volatile Snapshot snapshot_now,
2011  volatile CommandId command_id,
2012  bool streaming)
2013 {
2014  bool using_subtxn;
2016  ReorderBufferIterTXNState *volatile iterstate = NULL;
2017  volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2018  ReorderBufferChange *volatile specinsert = NULL;
2019  volatile bool stream_started = false;
2020  ReorderBufferTXN *volatile curtxn = NULL;
2021 
2022  /* build data to be able to lookup the CommandIds of catalog tuples */
2024 
2025  /* setup the initial snapshot */
2026  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2027 
2028  /*
2029  * Decoding needs access to syscaches et al., which in turn use
2030  * heavyweight locks and such. Thus we need to have enough state around to
2031  * keep track of those. The easiest way is to simply use a transaction
2032  * internally. That also allows us to easily enforce that nothing writes
2033  * to the database by checking for xid assignments.
2034  *
2035  * When we're called via the SQL SRF there's already a transaction
2036  * started, so start an explicit subtransaction there.
2037  */
2038  using_subtxn = IsTransactionOrTransactionBlock();
2039 
2040  PG_TRY();
2041  {
2042  ReorderBufferChange *change;
2043 
2044  if (using_subtxn)
2045  BeginInternalSubTransaction(streaming ? "stream" : "replay");
2046  else
2048 
2049  /*
2050  * We only need to send begin/begin-prepare for non-streamed
2051  * transactions.
2052  */
2053  if (!streaming)
2054  {
2055  if (rbtxn_prepared(txn))
2056  rb->begin_prepare(rb, txn);
2057  else
2058  rb->begin(rb, txn);
2059  }
2060 
2061  ReorderBufferIterTXNInit(rb, txn, &iterstate);
2062  while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2063  {
2064  Relation relation = NULL;
2065  Oid reloid;
2066 
2067  /*
2068  * We can't call start stream callback before processing first
2069  * change.
2070  */
2071  if (prev_lsn == InvalidXLogRecPtr)
2072  {
2073  if (streaming)
2074  {
2075  txn->origin_id = change->origin_id;
2076  rb->stream_start(rb, txn, change->lsn);
2077  stream_started = true;
2078  }
2079  }
2080 
2081  /*
2082  * Enforce correct ordering of changes, merged from multiple
2083  * subtransactions. The changes may have the same LSN due to
2084  * MULTI_INSERT xlog records.
2085  */
2086  Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2087 
2088  prev_lsn = change->lsn;
2089 
2090  /*
2091  * Set the current xid to detect concurrent aborts. This is
2092  * required for the cases when we decode the changes before the
2093  * COMMIT record is processed.
2094  */
2095  if (streaming || rbtxn_prepared(change->txn))
2096  {
2097  curtxn = change->txn;
2098  SetupCheckXidLive(curtxn->xid);
2099  }
2100 
2101  switch (change->action)
2102  {
2104 
2105  /*
2106  * Confirmation for speculative insertion arrived. Simply
2107  * use as a normal record. It'll be cleaned up at the end
2108  * of INSERT processing.
2109  */
2110  if (specinsert == NULL)
2111  elog(ERROR, "invalid ordering of speculative insertion changes");
2112  Assert(specinsert->data.tp.oldtuple == NULL);
2113  change = specinsert;
2115 
2116  /* intentionally fall through */
2120  Assert(snapshot_now);
2121 
2122  reloid = RelidByRelfilenode(change->data.tp.relnode.spcNode,
2123  change->data.tp.relnode.relNode);
2124 
2125  /*
2126  * Mapped catalog tuple without data, emitted while
2127  * catalog table was in the process of being rewritten. We
2128  * can fail to look up the relfilenode, because the
2129  * relmapper has no "historic" view, in contrast to the
2130  * normal catalog during decoding. Thus repeated rewrites
2131  * can cause a lookup failure. That's OK because we do not
2132  * decode catalog changes anyway. Normally such tuples
2133  * would be skipped over below, but we can't identify
2134  * whether the table should be logically logged without
2135  * mapping the relfilenode to the oid.
2136  */
2137  if (reloid == InvalidOid &&
2138  change->data.tp.newtuple == NULL &&
2139  change->data.tp.oldtuple == NULL)
2140  goto change_done;
2141  else if (reloid == InvalidOid)
2142  elog(ERROR, "could not map filenode \"%s\" to relation OID",
2143  relpathperm(change->data.tp.relnode,
2144  MAIN_FORKNUM));
2145 
2146  relation = RelationIdGetRelation(reloid);
2147 
2148  if (!RelationIsValid(relation))
2149  elog(ERROR, "could not open relation with OID %u (for filenode \"%s\")",
2150  reloid,
2151  relpathperm(change->data.tp.relnode,
2152  MAIN_FORKNUM));
2153 
2154  if (!RelationIsLogicallyLogged(relation))
2155  goto change_done;
2156 
2157  /*
2158  * Ignore temporary heaps created during DDL unless the
2159  * plugin has asked for them.
2160  */
2161  if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2162  goto change_done;
2163 
2164  /*
2165  * For now ignore sequence changes entirely. Most of the
2166  * time they don't log changes using records we
2167  * understand, so it doesn't make sense to handle the few
2168  * cases we do.
2169  */
2170  if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2171  goto change_done;
2172 
2173  /* user-triggered change */
2174  if (!IsToastRelation(relation))
2175  {
2176  ReorderBufferToastReplace(rb, txn, relation, change);
2177  ReorderBufferApplyChange(rb, txn, relation, change,
2178  streaming);
2179 
2180  /*
2181  * Only clear reassembled toast chunks if we're sure
2182  * they're not required anymore. The creator of the
2183  * tuple tells us.
2184  */
2185  if (change->data.tp.clear_toast_afterwards)
2186  ReorderBufferToastReset(rb, txn);
2187  }
2188  /* we're not interested in toast deletions */
2189  else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2190  {
2191  /*
2192  * Need to reassemble the full toasted Datum in
2193  * memory, to ensure the chunks don't get reused till
2194  * we're done remove it from the list of this
2195  * transaction's changes. Otherwise it will get
2196  * freed/reused while restoring spooled data from
2197  * disk.
2198  */
2199  Assert(change->data.tp.newtuple != NULL);
2200 
2201  dlist_delete(&change->node);
2202  ReorderBufferToastAppendChunk(rb, txn, relation,
2203  change);
2204  }
2205 
2206  change_done:
2207 
2208  /*
2209  * Either speculative insertion was confirmed, or it was
2210  * unsuccessful and the record isn't needed anymore.
2211  */
2212  if (specinsert != NULL)
2213  {
2214  ReorderBufferReturnChange(rb, specinsert, true);
2215  specinsert = NULL;
2216  }
2217 
2218  if (RelationIsValid(relation))
2219  {
2220  RelationClose(relation);
2221  relation = NULL;
2222  }
2223  break;
2224 
2226 
2227  /*
2228  * Speculative insertions are dealt with by delaying the
2229  * processing of the insert until the confirmation record
2230  * arrives. For that we simply unlink the record from the
2231  * chain, so it does not get freed/reused while restoring
2232  * spooled data from disk.
2233  *
2234  * This is safe in the face of concurrent catalog changes
2235  * because the relevant relation can't be changed between
2236  * speculative insertion and confirmation due to
2237  * CheckTableNotInUse() and locking.
2238  */
2239 
2240  /* clear out a pending (and thus failed) speculation */
2241  if (specinsert != NULL)
2242  {
2243  ReorderBufferReturnChange(rb, specinsert, true);
2244  specinsert = NULL;
2245  }
2246 
2247  /* and memorize the pending insertion */
2248  dlist_delete(&change->node);
2249  specinsert = change;
2250  break;
2251 
2253  {
2254  int i;
2255  int nrelids = change->data.truncate.nrelids;
2256  int nrelations = 0;
2257  Relation *relations;
2258 
2259  relations = palloc0(nrelids * sizeof(Relation));
2260  for (i = 0; i < nrelids; i++)
2261  {
2262  Oid relid = change->data.truncate.relids[i];
2263  Relation relation;
2264 
2265  relation = RelationIdGetRelation(relid);
2266 
2267  if (!RelationIsValid(relation))
2268  elog(ERROR, "could not open relation with OID %u", relid);
2269 
2270  if (!RelationIsLogicallyLogged(relation))
2271  continue;
2272 
2273  relations[nrelations++] = relation;
2274  }
2275 
2276  /* Apply the truncate. */
2277  ReorderBufferApplyTruncate(rb, txn, nrelations,
2278  relations, change,
2279  streaming);
2280 
2281  for (i = 0; i < nrelations; i++)
2282  RelationClose(relations[i]);
2283 
2284  break;
2285  }
2286 
2288  ReorderBufferApplyMessage(rb, txn, change, streaming);
2289  break;
2290 
2292  /* Execute the invalidation messages locally */
2294  change->data.inval.ninvalidations,
2295  change->data.inval.invalidations);
2296  break;
2297 
2299  /* get rid of the old */
2300  TeardownHistoricSnapshot(false);
2301 
2302  if (snapshot_now->copied)
2303  {
2304  ReorderBufferFreeSnap(rb, snapshot_now);
2305  snapshot_now =
2306  ReorderBufferCopySnap(rb, change->data.snapshot,
2307  txn, command_id);
2308  }
2309 
2310  /*
2311  * Restored from disk, need to be careful not to double
2312  * free. We could introduce refcounting for that, but for
2313  * now this seems infrequent enough not to care.
2314  */
2315  else if (change->data.snapshot->copied)
2316  {
2317  snapshot_now =
2318  ReorderBufferCopySnap(rb, change->data.snapshot,
2319  txn, command_id);
2320  }
2321  else
2322  {
2323  snapshot_now = change->data.snapshot;
2324  }
2325 
2326  /* and continue with the new one */
2327  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2328  break;
2329 
2331  Assert(change->data.command_id != InvalidCommandId);
2332 
2333  if (command_id < change->data.command_id)
2334  {
2335  command_id = change->data.command_id;
2336 
2337  if (!snapshot_now->copied)
2338  {
2339  /* we don't use the global one anymore */
2340  snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2341  txn, command_id);
2342  }
2343 
2344  snapshot_now->curcid = command_id;
2345 
2346  TeardownHistoricSnapshot(false);
2347  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2348  }
2349 
2350  break;
2351 
2353  elog(ERROR, "tuplecid value in changequeue");
2354  break;
2355  }
2356  }
2357 
2358  /*
2359  * There's a speculative insertion remaining, just clean it up, it
2360  * can't have been successful, otherwise we'd have gotten a confirmation
2361  * record.
2362  */
2363  if (specinsert)
2364  {
2365  ReorderBufferReturnChange(rb, specinsert, true);
2366  specinsert = NULL;
2367  }
2368 
2369  /* clean up the iterator */
2370  ReorderBufferIterTXNFinish(rb, iterstate);
2371  iterstate = NULL;
2372 
2373  /*
2374  * Update total transaction count and total transaction bytes
2375  * processed. Ensure to not count the streamed transaction multiple
2376  * times.
2377  *
2378  * Note that the statistics computation has to be done after
2379  * ReorderBufferIterTXNFinish as it releases the serialized change
2380  * which we have already accounted in ReorderBufferIterTXNNext.
2381  */
2382  if (!rbtxn_is_streamed(txn))
2383  rb->totalTxns++;
2384 
2385  rb->totalBytes += rb->size;
2386 
2387  /*
2388  * Done with current changes, send the last message for this set of
2389  * changes depending upon streaming mode.
2390  */
2391  if (streaming)
2392  {
2393  if (stream_started)
2394  {
2395  rb->stream_stop(rb, txn, prev_lsn);
2396  stream_started = false;
2397  }
2398  }
2399  else
2400  {
2401  /*
2402  * Call either PREPARE (for two-phase transactions) or COMMIT (for
2403  * regular ones).
2404  */
2405  if (rbtxn_prepared(txn))
2406  rb->prepare(rb, txn, commit_lsn);
2407  else
2408  rb->commit(rb, txn, commit_lsn);
2409  }
2410 
2411  /* this is just a sanity check against bad output plugin behaviour */
2413  elog(ERROR, "output plugin used XID %u",
2415 
2416  /*
2417  * Remember the command ID and snapshot for the next set of changes in
2418  * streaming mode.
2419  */
2420  if (streaming)
2421  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2422  else if (snapshot_now->copied)
2423  ReorderBufferFreeSnap(rb, snapshot_now);
2424 
2425  /* cleanup */
2426  TeardownHistoricSnapshot(false);
2427 
2428  /*
2429  * Aborting the current (sub-)transaction as a whole has the right
2430  * semantics. We want all locks acquired in here to be released, not
2431  * reassigned to the parent and we do not want any database access
2432  * have persistent effects.
2433  */
2435 
2436  /* make sure there's no cache pollution */
2438 
2439  if (using_subtxn)
2441 
2442  /*
2443  * We are here due to one of the four reasons: 1. Decoding an
2444  * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2445  * prepared txn that was (partially) streamed. 4. Decoding a committed
2446  * txn.
2447  *
2448  * For 1, we allow truncation of txn data by removing the changes
2449  * already streamed but still keeping other things like invalidations,
2450  * snapshot, and tuplecids. For 2 and 3, we indicate
2451  * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2452  * data as the entire transaction has been decoded except for commit.
2453  * For 4, as the entire txn has been decoded, we can fully clean up
2454  * the TXN reorder buffer.
2455  */
2456  if (streaming || rbtxn_prepared(txn))
2457  {
2459  /* Reset the CheckXidAlive */
2461  }
2462  else
2463  ReorderBufferCleanupTXN(rb, txn);
2464  }
2465  PG_CATCH();
2466  {
2467  MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2468  ErrorData *errdata = CopyErrorData();
2469 
2470  /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2471  if (iterstate)
2472  ReorderBufferIterTXNFinish(rb, iterstate);
2473 
2475 
2476  /*
2477  * Force cache invalidation to happen outside of a valid transaction
2478  * to prevent catalog access as we just caught an error.
2479  */
2481 
2482  /* make sure there's no cache pollution */
2484  txn->invalidations);
2485 
2486  if (using_subtxn)
2488 
2489  /*
2490  * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2491  * abort of the (sub)transaction we are streaming or preparing. We
2492  * need to do the cleanup and return gracefully on this error, see
2493  * SetupCheckXidLive.
2494  */
2495  if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK)
2496  {
2497  /*
2498  * This error can occur either when we are sending the data in
2499  * streaming mode and the streaming is not finished yet or when we
2500  * are sending the data out on a PREPARE during a two-phase
2501  * commit.
2502  */
2503  Assert(streaming || rbtxn_prepared(txn));
2504  Assert(stream_started || rbtxn_prepared(txn));
2505 
2506  /* Cleanup the temporary error state. */
2507  FlushErrorState();
2508  FreeErrorData(errdata);
2509  errdata = NULL;
2510  curtxn->concurrent_abort = true;
2511 
2512  /* Reset the TXN so that it is allowed to stream remaining data. */
2513  ReorderBufferResetTXN(rb, txn, snapshot_now,
2514  command_id, prev_lsn,
2515  specinsert);
2516  }
2517  else
2518  {
2519  ReorderBufferCleanupTXN(rb, txn);
2520  MemoryContextSwitchTo(ecxt);
2521  PG_RE_THROW();
2522  }
2523  }
2524  PG_END_TRY();
2525 }
2526 
2527 /*
2528  * Perform the replay of a transaction and its non-aborted subtransactions.
2529  *
2530  * Subtransactions previously have to be processed by
2531  * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2532  * transaction with ReorderBufferAssignChild.
2533  *
2534  * This interface is called once a prepare or toplevel commit is read for both
2535  * streamed as well as non-streamed transactions.
2536  */
2537 static void
2540  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2541  TimestampTz commit_time,
2542  RepOriginId origin_id, XLogRecPtr origin_lsn)
2543 {
2544  Snapshot snapshot_now;
2545  CommandId command_id = FirstCommandId;
2546 
2547  txn->final_lsn = commit_lsn;
2548  txn->end_lsn = end_lsn;
2549  txn->commit_time = commit_time;
2550  txn->origin_id = origin_id;
2551  txn->origin_lsn = origin_lsn;
2552 
2553  /*
2554  * If the transaction was (partially) streamed, we need to commit it in a
2555  * 'streamed' way. That is, we first stream the remaining part of the
2556  * transaction, and then invoke stream_commit message.
2557  *
2558  * Called after everything (origin ID, LSN, ...) is stored in the
2559  * transaction to avoid passing that information directly.
2560  */
2561  if (rbtxn_is_streamed(txn))
2562  {
2563  ReorderBufferStreamCommit(rb, txn);
2564  return;
2565  }
2566 
2567  /*
2568  * If this transaction has no snapshot, it didn't make any changes to the
2569  * database, so there's nothing to decode. Note that
2570  * ReorderBufferCommitChild will have transferred any snapshots from
2571  * subtransactions if there were any.
2572  */
2573  if (txn->base_snapshot == NULL)
2574  {
2575  Assert(txn->ninvalidations == 0);
2576 
2577  /*
2578  * Removing this txn before a commit might result in the computation
2579  * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2580  */
2581  if (!rbtxn_prepared(txn))
2582  ReorderBufferCleanupTXN(rb, txn);
2583  return;
2584  }
2585 
2586  snapshot_now = txn->base_snapshot;
2587 
2588  /* Process and send the changes to output plugin. */
2589  ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2590  command_id, false);
2591 }
2592 
2593 /*
2594  * Commit a transaction.
2595  *
2596  * See comments for ReorderBufferReplay().
2597  */
2598 void
2600  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2601  TimestampTz commit_time,
2602  RepOriginId origin_id, XLogRecPtr origin_lsn)
2603 {
2605 
2606  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2607  false);
2608 
2609  /* unknown transaction, nothing to replay */
2610  if (txn == NULL)
2611  return;
2612 
2613  ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2614  origin_id, origin_lsn);
2615 }
2616 
2617 /*
2618  * Record the prepare information for a transaction.
2619  */
2620 bool
2622  XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2623  TimestampTz prepare_time,
2624  RepOriginId origin_id, XLogRecPtr origin_lsn)
2625 {
2627 
2628  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2629 
2630  /* unknown transaction, nothing to do */
2631  if (txn == NULL)
2632  return false;
2633 
2634  /*
2635  * Remember the prepare information to be later used by commit prepared in
2636  * case we skip doing prepare.
2637  */
2638  txn->final_lsn = prepare_lsn;
2639  txn->end_lsn = end_lsn;
2640  txn->commit_time = prepare_time;
2641  txn->origin_id = origin_id;
2642  txn->origin_lsn = origin_lsn;
2643 
2644  return true;
2645 }
2646 
2647 /* Remember that we have skipped prepare */
2648 void
2650 {
2652 
2653  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2654 
2655  /* unknown transaction, nothing to do */
2656  if (txn == NULL)
2657  return;
2658 
2660 }
2661 
2662 /*
2663  * Prepare a two-phase transaction.
2664  *
2665  * See comments for ReorderBufferReplay().
2666  */
2667 void
2669  char *gid)
2670 {
2672 
2673  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2674  false);
2675 
2676  /* unknown transaction, nothing to replay */
2677  if (txn == NULL)
2678  return;
2679 
2680  txn->txn_flags |= RBTXN_PREPARE;
2681  txn->gid = pstrdup(gid);
2682 
2683  /* The prepare info must have been updated in txn by now. */
2685 
2686  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2687  txn->commit_time, txn->origin_id, txn->origin_lsn);
2688 
2689  /*
2690  * We send the prepare for the concurrently aborted xacts so that later
2691  * when rollback prepared is decoded and sent, the downstream should be
2692  * able to rollback such a xact. See comments atop DecodePrepare.
2693  */
2694  if (txn->concurrent_abort)
2695  rb->prepare(rb, txn, txn->final_lsn);
2696 }
2697 
2698 /*
2699  * This is used to handle COMMIT/ROLLBACK PREPARED.
2700  */
2701 void
2703  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2704  XLogRecPtr initial_consistent_point,
2705  TimestampTz commit_time, RepOriginId origin_id,
2706  XLogRecPtr origin_lsn, char *gid, bool is_commit)
2707 {
2709  XLogRecPtr prepare_end_lsn;
2710  TimestampTz prepare_time;
2711 
2712  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
2713 
2714  /* unknown transaction, nothing to do */
2715  if (txn == NULL)
2716  return;
2717 
2718  /*
2719  * By this time the txn has the prepare record information, remember it to
2720  * be later used for rollback.
2721  */
2722  prepare_end_lsn = txn->end_lsn;
2723  prepare_time = txn->commit_time;
2724 
2725  /* add the gid in the txn */
2726  txn->gid = pstrdup(gid);
2727 
2728  /*
2729  * It is possible that this transaction is not decoded at prepare time
2730  * either because by that time we didn't have a consistent snapshot or it
2731  * was decoded earlier but we have restarted. We only need to send the
2732  * prepare if it was not decoded earlier. We don't need to decode the xact
2733  * for aborts if it is not done already.
2734  */
2735  if ((txn->final_lsn < initial_consistent_point) && is_commit)
2736  {
2737  txn->txn_flags |= RBTXN_PREPARE;
2738 
2739  /*
2740  * The prepare info must have been updated in txn even if we skip
2741  * prepare.
2742  */
2744 
2745  /*
2746  * By this time the txn has the prepare record information and it is
2747  * important to use that so that downstream gets the accurate
2748  * information. If instead, we have passed commit information here
2749  * then downstream can behave as it has already replayed commit
2750  * prepared after the restart.
2751  */
2752  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2753  txn->commit_time, txn->origin_id, txn->origin_lsn);
2754  }
2755 
2756  txn->final_lsn = commit_lsn;
2757  txn->end_lsn = end_lsn;
2758  txn->commit_time = commit_time;
2759  txn->origin_id = origin_id;
2760  txn->origin_lsn = origin_lsn;
2761 
2762  if (is_commit)
2763  rb->commit_prepared(rb, txn, commit_lsn);
2764  else
2765  rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2766 
2767  /* cleanup: make sure there's no cache pollution */
2769  txn->invalidations);
2770  ReorderBufferCleanupTXN(rb, txn);
2771 }
2772 
2773 /*
2774  * Abort a transaction that possibly has previous changes. Needs to be first
2775  * called for subtransactions and then for the toplevel xid.
2776  *
2777  * NB: Transactions handled here have to have actively aborted (i.e. have
2778  * produced an abort record). Implicitly aborted transactions are handled via
2779  * ReorderBufferAbortOld(); transactions we're just not interested in, but
2780  * which have committed are handled in ReorderBufferForget().
2781  *
2782  * This function purges this transaction and its contents from memory and
2783  * disk.
2784  */
2785 void
2787 {
2789 
2790  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2791  false);
2792 
2793  /* unknown, nothing to remove */
2794  if (txn == NULL)
2795  return;
2796 
2797  /* For streamed transactions notify the remote node about the abort. */
2798  if (rbtxn_is_streamed(txn))
2799  {
2800  rb->stream_abort(rb, txn, lsn);
2801 
2802  /*
2803  * We might have decoded changes for this transaction that could load
2804  * the cache as per the current transaction's view (consider DDL's
2805  * happened in this transaction). We don't want the decoding of future
2806  * transactions to use those cache entries so execute invalidations.
2807  */
2808  if (txn->ninvalidations > 0)
2810  txn->invalidations);
2811  }
2812 
2813  /* cosmetic... */
2814  txn->final_lsn = lsn;
2815 
2816  /* remove potential on-disk data, and deallocate */
2817  ReorderBufferCleanupTXN(rb, txn);
2818 }
2819 
2820 /*
2821  * Abort all transactions that aren't actually running anymore because the
2822  * server restarted.
2823  *
2824  * NB: These really have to be transactions that have aborted due to a server
2825  * crash/immediate restart, as we don't deal with invalidations here.
2826  */
2827 void
2829 {
2830  dlist_mutable_iter it;
2831 
2832  /*
2833  * Iterate through all (potential) toplevel TXNs and abort all that are
2834  * older than what possibly can be running. Once we've found the first
2835  * that is alive we stop, there might be some that acquired an xid earlier
2836  * but started writing later, but it's unlikely and they will be cleaned
2837  * up in a later call to this function.
2838  */
2840  {
2842 
2843  txn = dlist_container(ReorderBufferTXN, node, it.cur);
2844 
2845  if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2846  {
2847  elog(DEBUG2, "aborting old transaction %u", txn->xid);
2848 
2849  /* remove potential on-disk data, and deallocate this tx */
2850  ReorderBufferCleanupTXN(rb, txn);
2851  }
2852  else
2853  return;
2854  }
2855 }
2856 
2857 /*
2858  * Forget the contents of a transaction if we aren't interested in its
2859  * contents. Needs to be first called for subtransactions and then for the
2860  * toplevel xid.
2861  *
2862  * This is significantly different to ReorderBufferAbort() because
2863  * transactions that have committed need to be treated differently from aborted
2864  * ones since they may have modified the catalog.
2865  *
2866  * Note that this is only allowed to be called in the moment a transaction
2867  * commit has just been read, not earlier; otherwise later records referring
2868  * to this xid might re-create the transaction incompletely.
2869  */
2870 void
2872 {
2874 
2875  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2876  false);
2877 
2878  /* unknown, nothing to forget */
2879  if (txn == NULL)
2880  return;
2881 
2882  /* For streamed transactions notify the remote node about the abort. */
2883  if (rbtxn_is_streamed(txn))
2884  rb->stream_abort(rb, txn, lsn);
2885 
2886  /* cosmetic... */
2887  txn->final_lsn = lsn;
2888 
2889  /*
2890  * Process cache invalidation messages if there are any. Even if we're not
2891  * interested in the transaction's contents, it could have manipulated the
2892  * catalog and we need to update the caches according to that.
2893  */
2894  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2896  txn->invalidations);
2897  else
2898  Assert(txn->ninvalidations == 0);
2899 
2900  /* remove potential on-disk data, and deallocate */
2901  ReorderBufferCleanupTXN(rb, txn);
2902 }
2903 
2904 /*
2905  * Invalidate cache for those transactions that need to be skipped just in case
2906  * catalogs were manipulated as part of the transaction.
2907  *
2908  * Note that this is a special-purpose function for prepared transactions where
2909  * we don't want to clean up the TXN even when we decide to skip it. See
2910  * DecodePrepare.
2911  */
2912 void
2914 {
2916 
2917  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2918  false);
2919 
2920  /* unknown, nothing to do */
2921  if (txn == NULL)
2922  return;
2923 
2924  /*
2925  * Process cache invalidation messages if there are any. Even if we're not
2926  * interested in the transaction's contents, it could have manipulated the
2927  * catalog and we need to update the caches according to that.
2928  */
2929  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2931  txn->invalidations);
2932  else
2933  Assert(txn->ninvalidations == 0);
2934 }
2935 
2936 
2937 /*
2938  * Execute invalidations happening outside the context of a decoded
2939  * transaction. That currently happens either for xid-less commits
2940  * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
2941  * transactions (via ReorderBufferForget()).
2942  */
2943 void
2945  SharedInvalidationMessage *invalidations)
2946 {
2947  bool use_subtxn = IsTransactionOrTransactionBlock();
2948  int i;
2949 
2950  if (use_subtxn)
2951  BeginInternalSubTransaction("replay");
2952 
2953  /*
2954  * Force invalidations to happen outside of a valid transaction - that way
2955  * entries will just be marked as invalid without accessing the catalog.
2956  * That's advantageous because we don't need to setup the full state
2957  * necessary for catalog access.
2958  */
2959  if (use_subtxn)
2961 
2962  for (i = 0; i < ninvalidations; i++)
2963  LocalExecuteInvalidationMessage(&invalidations[i]);
2964 
2965  if (use_subtxn)
2967 }
2968 
2969 /*
2970  * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
2971  * least once for every xid in XLogRecord->xl_xid (other places in records
2972  * may, but do not have to be passed through here).
2973  *
2974  * Reorderbuffer keeps some datastructures about transactions in LSN order,
2975  * for efficiency. To do that it has to know about when transactions are seen
2976  * first in the WAL. As many types of records are not actually interesting for
2977  * logical decoding, they do not necessarily pass through here.
2978  */
2979 void
2981 {
2982  /* many records won't have an xid assigned, centralize check here */
2983  if (xid != InvalidTransactionId)
2984  ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
2985 }
2986 
2987 /*
2988  * Add a new snapshot to this transaction that may only be used after lsn 'lsn'
2989  * because the previous snapshot doesn't describe the catalog correctly for
2990  * following rows.
2991  */
2992 void
2994  XLogRecPtr lsn, Snapshot snap)
2995 {
2997 
2998  change->data.snapshot = snap;
3000 
3001  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3002 }
3003 
3004 /*
3005  * Set up the transaction's base snapshot.
3006  *
3007  * If we know that xid is a subtransaction, set the base snapshot on the
3008  * top-level transaction instead.
3009  */
3010 void
3012  XLogRecPtr lsn, Snapshot snap)
3013 {
3015  bool is_new;
3016 
3017  AssertArg(snap != NULL);
3018 
3019  /*
3020  * Fetch the transaction to operate on. If we know it's a subtransaction,
3021  * operate on its top-level transaction instead.
3022  */
3023  txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3024  if (rbtxn_is_known_subxact(txn))
3025  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3026  NULL, InvalidXLogRecPtr, false);
3027  Assert(txn->base_snapshot == NULL);
3028 
3029  txn->base_snapshot = snap;
3030  txn->base_snapshot_lsn = lsn;
3032 
3033  AssertTXNLsnOrder(rb);
3034 }
3035 
3036 /*
3037  * Access the catalog with this CommandId at this point in the changestream.
3038  *
3039  * May only be called for command ids > 1
3040  */
3041 void
3043  XLogRecPtr lsn, CommandId cid)
3044 {
3046 
3047  change->data.command_id = cid;
3049 
3050  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3051 }
3052 
3053 /*
3054  * Update memory counters to account for the new or removed change.
3055  *
3056  * We update two counters - in the reorder buffer, and in the transaction
3057  * containing the change. The reorder buffer counter allows us to quickly
3058  * decide if we reached the memory limit, the transaction counter allows
3059  * us to quickly pick the largest transaction for eviction.
3060  *
3061  * When streaming is enabled, we need to update the toplevel transaction
3062  * counters instead - we don't really care about subtransactions as we
3063  * can't stream them individually anyway, and we only pick toplevel
3064  * transactions for eviction. So only toplevel transactions matter.
3065  */
3066 static void
3068  ReorderBufferChange *change,
3069  bool addition)
3070 {
3071  Size sz;
3073  ReorderBufferTXN *toptxn = NULL;
3074 
3075  Assert(change->txn);
3076 
3077  /*
3078  * Ignore tuple CID changes, because those are not evicted when reaching
3079  * memory limit. So we just don't count them, because it might easily
3080  * trigger a pointless attempt to spill.
3081  */
3083  return;
3084 
3085  txn = change->txn;
3086 
3087  /* If streaming supported, update the total size in top level as well. */
3088  if (ReorderBufferCanStream(rb))
3089  {
3090  if (txn->toptxn != NULL)
3091  toptxn = txn->toptxn;
3092  else
3093  toptxn = txn;
3094  }
3095 
3096  sz = ReorderBufferChangeSize(change);
3097 
3098  if (addition)
3099  {
3100  txn->size += sz;
3101  rb->size += sz;
3102 
3103  /* Update the total size in the top transaction. */
3104  if (toptxn)
3105  toptxn->total_size += sz;
3106  }
3107  else
3108  {
3109  Assert((rb->size >= sz) && (txn->size >= sz));
3110  txn->size -= sz;
3111  rb->size -= sz;
3112 
3113  /* Update the total size in the top transaction. */
3114  if (toptxn)
3115  toptxn->total_size -= sz;
3116  }
3117 
3118  Assert(txn->size <= rb->size);
3119 }
3120 
3121 /*
3122  * Add new (relfilenode, tid) -> (cmin, cmax) mappings.
3123  *
3124  * We do not include this change type in memory accounting, because we
3125  * keep CIDs in a separate list and do not evict them when reaching
3126  * the memory limit.
3127  */
3128 void
3130  XLogRecPtr lsn, RelFileNode node,
3131  ItemPointerData tid, CommandId cmin,
3132  CommandId cmax, CommandId combocid)
3133 {
3136 
3137  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3138 
3139  change->data.tuplecid.node = node;
3140  change->data.tuplecid.tid = tid;
3141  change->data.tuplecid.cmin = cmin;
3142  change->data.tuplecid.cmax = cmax;
3143  change->data.tuplecid.combocid = combocid;
3144  change->lsn = lsn;
3145  change->txn = txn;
3147 
3148  dlist_push_tail(&txn->tuplecids, &change->node);
3149  txn->ntuplecids++;
3150 }
3151 
3152 /*
3153  * Setup the invalidation of the toplevel transaction.
3154  *
3155  * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3156  * accumulates all the invalidation messages in the toplevel transaction as
3157  * well as in the form of a change in the reorder buffer. We need to record it in
3158  * the form of a change so that we can execute only the required invalidations
3159  * instead of executing all the invalidations on each CommandId increment. We
3160  * also need to accumulate these in the toplevel transaction because in some
3161  * cases we skip processing the transaction (see ReorderBufferForget), we need
3162  * to execute all the invalidations together.
3163  */
3164 void
3166  XLogRecPtr lsn, Size nmsgs,
3168 {
3170  MemoryContext oldcontext;
3171  ReorderBufferChange *change;
3172 
3173  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3174 
3175  oldcontext = MemoryContextSwitchTo(rb->context);
3176 
3177  /*
3178  * Collect all the invalidations under the top transaction so that we can
3179  * execute them all together. See comment atop this function
3180  */
3181  if (txn->toptxn)
3182  txn = txn->toptxn;
3183 
3184  Assert(nmsgs > 0);
3185 
3186  /* Accumulate invalidations. */
3187  if (txn->ninvalidations == 0)
3188  {
3189  txn->ninvalidations = nmsgs;
3191  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3192  memcpy(txn->invalidations, msgs,
3193  sizeof(SharedInvalidationMessage) * nmsgs);
3194  }
3195  else
3196  {
3199  (txn->ninvalidations + nmsgs));
3200 
3201  memcpy(txn->invalidations + txn->ninvalidations, msgs,
3202  nmsgs * sizeof(SharedInvalidationMessage));
3203  txn->ninvalidations += nmsgs;
3204  }
3205 
3206  change = ReorderBufferGetChange(rb);
3208  change->data.inval.ninvalidations = nmsgs;
3209  change->data.inval.invalidations = (SharedInvalidationMessage *)
3210  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3211  memcpy(change->data.inval.invalidations, msgs,
3212  sizeof(SharedInvalidationMessage) * nmsgs);
3213 
3214  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3215 
3216  MemoryContextSwitchTo(oldcontext);
3217 }
3218 
3219 /*
3220  * Apply all invalidations we know. Possibly we only need parts at this point
3221  * in the changestream but we don't know which those are.
3222  */
3223 static void
3225 {
3226  int i;
3227 
3228  for (i = 0; i < nmsgs; i++)
3230 }
3231 
3232 /*
3233  * Mark a transaction as containing catalog changes
3234  */
3235 void
3237  XLogRecPtr lsn)
3238 {
3240 
3241  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3242 
3244 
3245  /*
3246  * Mark top-level transaction as having catalog changes too if one of its
3247  * children has so that the ReorderBufferBuildTupleCidHash can
3248  * conveniently check just top-level transaction and decide whether to
3249  * build the hash table or not.
3250  */
3251  if (txn->toptxn != NULL)
3253 }
3254 
3255 /*
3256  * Query whether a transaction is already *known* to contain catalog
3257  * changes. This can be wrong until directly before the commit!
3258  */
3259 bool
3261 {
3263 
3264  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3265  false);
3266  if (txn == NULL)
3267  return false;
3268 
3269  return rbtxn_has_catalog_changes(txn);
3270 }
3271 
3272 /*
3273  * ReorderBufferXidHasBaseSnapshot
3274  * Have we already set the base snapshot for the given txn/subtxn?
3275  */
3276 bool
3278 {
3280 
3281  txn = ReorderBufferTXNByXid(rb, xid, false,
3282  NULL, InvalidXLogRecPtr, false);
3283 
3284  /* transaction isn't known yet, ergo no snapshot */
3285  if (txn == NULL)
3286  return false;
3287 
3288  /* a known subtxn? operate on top-level txn instead */
3289  if (rbtxn_is_known_subxact(txn))
3290  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3291  NULL, InvalidXLogRecPtr, false);
3292 
3293  return txn->base_snapshot != NULL;
3294 }
3295 
3296 
3297 /*
3298  * ---------------------------------------
3299  * Disk serialization support
3300  * ---------------------------------------
3301  */
3302 
3303 /*
3304  * Ensure the IO buffer is >= sz.
3305  */
3306 static void
3308 {
3309  if (!rb->outbufsize)
3310  {
3311  rb->outbuf = MemoryContextAlloc(rb->context, sz);
3312  rb->outbufsize = sz;
3313  }
3314  else if (rb->outbufsize < sz)
3315  {
3316  rb->outbuf = repalloc(rb->outbuf, sz);
3317  rb->outbufsize = sz;
3318  }
3319 }
3320 
3321 /*
3322  * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3323  *
3324  * XXX With many subtransactions this might be quite slow, because we'll have
3325  * to walk through all of them. There are some options how we could improve
3326  * that: (a) maintain some secondary structure with transactions sorted by
3327  * amount of changes, (b) not looking for the entirely largest transaction,
3328  * but e.g. for transaction using at least some fraction of the memory limit,
3329  * and (c) evicting multiple transactions at once, e.g. to free a given portion
3330  * of the memory limit (e.g. 50%).
3331  */
3332 static ReorderBufferTXN *
3334 {
3335  HASH_SEQ_STATUS hash_seq;
3337  ReorderBufferTXN *largest = NULL;
3338 
3339  hash_seq_init(&hash_seq, rb->by_txn);
3340  while ((ent = hash_seq_search(&hash_seq)) != NULL)
3341  {
3342  ReorderBufferTXN *txn = ent->txn;
3343 
3344  /* if the current transaction is larger, remember it */
3345  if ((!largest) || (txn->size > largest->size))
3346  largest = txn;
3347  }
3348 
3349  Assert(largest);
3350  Assert(largest->size > 0);
3351  Assert(largest->size <= rb->size);
3352 
3353  return largest;
3354 }
3355 
3356 /*
3357  * Find the largest toplevel transaction to evict (by streaming).
3358  *
3359  * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3360  * should give us the same transaction (because we don't update memory account
3361  * for subtransaction with streaming, so it's always 0). But we can simply
3362  * iterate over the limited number of toplevel transactions.
3363  *
3364  * Note that we skip transactions that contain incomplete changes. There
3365  * is scope for optimization here such that we could select the largest transaction
3366  * which has complete changes. But that will make the code and design quite complex
3367  * and that might not be worth the benefit. If we plan to stream the transactions
3368  * that contain incomplete changes then we need to find a way to partially
3369  * stream/truncate the transaction changes in-memory and build a mechanism to
3370  * partially truncate the spilled files. Additionally, whenever we partially
3371  * stream the transaction we need to maintain the last streamed lsn and next time
3372  * we need to restore from that segment and the offset in WAL. As we stream the
3373  * changes from the top transaction and restore them subtransaction wise, we need
3374  * to even remember the subxact from where we streamed the last change.
      *
      * Returns the qualifying transaction with the greatest total_size, or NULL
      * when no toplevel transaction has total_size > 0 and only complete changes.
3375  */
3376 static ReorderBufferTXN *
3378 {
3379  dlist_iter iter;
3380  Size largest_size = 0;
3381  ReorderBufferTXN *largest = NULL;
3382 
3383  /* Find the largest top-level transaction. */
3384  dlist_foreach(iter, &rb->toplevel_by_lsn)
3385  {
3387 
3388  txn = dlist_container(ReorderBufferTXN, node, iter.cur);
3389 
      /*
       * Accept txn if we have no candidate yet, or if it is strictly larger
       * than the current candidate; empty transactions and those with
       * incomplete tuples are never eligible.
       */
3390  if ((largest == NULL || txn->total_size > largest_size) &&
3391  (txn->total_size > 0) && !(rbtxn_has_incomplete_tuple(txn)))
3392  {
3393  largest = txn;
3394  largest_size = txn->total_size;
3395  }
3396  }
3397 
3398  return largest;
3399 }
3400 
3401 /*
3402  * Check whether the logical_decoding_work_mem limit was reached, and if yes
3403  * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3404  * disk until we reach under the memory limit.
3405  *
3406  * XXX At this point we select the transactions until we reach under the memory
3407  * limit, but we might also adapt a more elaborate eviction strategy - for example
3408  * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3409  * limit.
      *
      * The threshold compared against rb->size is logical_decoding_work_mem * 1024L,
      * so the setting is presumably expressed in kilobytes (confirm against the
      * GUC definition).
3410  */
3411 static void
3413 {
3415 
3416  /* bail out if we haven't exceeded the memory limit */
3417  if (rb->size < logical_decoding_work_mem * 1024L)
3418  return;
3419 
3420  /*
3421  * Loop until we reach under the memory limit. One might think that just
3422  * by evicting the largest (sub)transaction we will come under the memory
3423  * limit based on assumption that the selected transaction is at least as
3424  * large as the most recent change (which caused us to go over the memory
3425  * limit). However, that is not true because a user can reduce the
3426  * logical_decoding_work_mem to a smaller value before the most recent
3427  * change.
3428  */
3429  while (rb->size >= logical_decoding_work_mem * 1024L)
3430  {
3431  /*
3432  * Pick the largest transaction (or subtransaction) and evict it from
3433  * memory by streaming, if possible. Otherwise, spill to disk.
3434  */
3436  (txn = ReorderBufferLargestTopTXN(rb)) != NULL)
3437  {
      /* Streaming branch: the victim is asserted to be a top-level transaction. */
3438  /* we know there has to be one, because the size is not zero */
3439  Assert(txn && !txn->toptxn);
3440  Assert(txn->total_size > 0);
3441  Assert(rb->size >= txn->total_size);
3442 
3443  ReorderBufferStreamTXN(rb, txn);
3444  }
3445  else
3446  {
3447  /*
3448  * Pick the largest transaction (or subtransaction) and evict it
3449  * from memory by serializing it to disk.
3450  */
3451  txn = ReorderBufferLargestTXN(rb);
3452 
3453  /* we know there has to be one, because the size is not zero */
3454  Assert(txn);
3455  Assert(txn->size > 0);
3456  Assert(rb->size >= txn->size);
3457 
3458  ReorderBufferSerializeTXN(rb, txn);
3459  }
3460 
3461  /*
3462  * After eviction, the transaction should have no entries in memory,
3463  * and should use 0 bytes for changes.
3464  */
3465  Assert(txn->size == 0);
3466  Assert(txn->nentries_mem == 0);
3467  }
3468 
3469  /* We must be under the memory limit now. */
3470  Assert(rb->size < logical_decoding_work_mem * 1024L);
3471 }
3472 
3473 /*
3474  * Spill data of a large transaction (and its subtransactions) to disk.
      *
      * Subtransactions are spilled first by recursing over txn->subtxns; then
      * every change on txn->changes is written out, removed from the list and
      * returned. Changes are grouped into one file per WAL segment, selected by
      * the change's LSN. Spill statistics on rb are updated only if at least one
      * change of this transaction was written.
3475  */
3476 static void
3478 {
3479  dlist_iter subtxn_i;
3480  dlist_mutable_iter change_i;
3481  int fd = -1;
3482  XLogSegNo curOpenSegNo = 0;
3483  Size spilled = 0;
3484  Size size = txn->size; /* remembered up front; used for the spillBytes statistic */
3485 
3486  elog(DEBUG2, "spill %u changes in XID %u to disk",
3487  (uint32) txn->nentries_mem, txn->xid);
3488 
3489  /* do the same to all child TXs */
3490  dlist_foreach(subtxn_i, &txn->subtxns)
3491  {
3492  ReorderBufferTXN *subtxn;
3493 
3494  subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3495  ReorderBufferSerializeTXN(rb, subtxn);
3496  }
3497 
3498  /* serialize changestream */
3499  dlist_foreach_modify(change_i, &txn->changes)
3500  {
3501  ReorderBufferChange *change;
3502 
3503  change = dlist_container(ReorderBufferChange, node, change_i.cur);
3504 
3505  /*
3506  * store in segment in which it belongs by start lsn, don't split over
3507  * multiple segments tho
3508  */
3509  if (fd == -1 ||
3510  !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3511  {
3512  char path[MAXPGPATH];
3513 
      /* switching segments: close the previous spill file, if any */
3514  if (fd != -1)
3515  CloseTransientFile(fd);
3516 
3517  XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3518 
3519  /*
3520  * No need to care about TLIs here, only used during a single run,
3521  * so each LSN only maps to a specific WAL record.
3522  */
3524  curOpenSegNo);
3525 
3526  /* open segment, create it if necessary */
3527  fd = OpenTransientFile(path,
3528  O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3529 
3530  if (fd < 0)
3531  ereport(ERROR,
3533  errmsg("could not open file \"%s\": %m", path)));
3534  }
3535 
      /* write the change, then unlink it from the in-memory list and release it */
3536  ReorderBufferSerializeChange(rb, txn, fd, change);
3537  dlist_delete(&change->node);
3538  ReorderBufferReturnChange(rb, change, true);
3539 
3540  spilled++;
3541  }
3542 
3543  /* update the statistics iff we have spilled anything */
3544  if (spilled)
3545  {
3546  rb->spillCount += 1;
3547  rb->spillBytes += size;
3548 
3549  /* don't consider already serialized transactions */
3550  rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3551  }
3552 
      /* every in-memory entry must have been written out by the loop above */
3553  Assert(spilled == txn->nentries_mem);
3554  Assert(dlist_is_empty(&txn->changes));
3555  txn->nentries_mem = 0;
3557 
3558  if (fd != -1)
3559  CloseTransientFile(fd);
3560 }
3561 
3562 /*
3563  * Serialize individual change to disk.
      *
      * The record is assembled in rb->outbuf: a ReorderBufferDiskChange header
      * (which embeds a byte copy of the ReorderBufferChange) followed directly
      * by an action-specific payload. sz accumulates the total record size, is
      * stored in ondisk->size, and the whole record is written to fd with a
      * single write(). On a failed or short write the fd is closed and an ERROR
      * is raised; errno defaults to ENOSPC when write() did not set it.
      *
      * Also advances txn->final_lsn to change->lsn if that is larger.
3564  */
3565 static void
3567  int fd, ReorderBufferChange *change)
3568 {
3569  ReorderBufferDiskChange *ondisk;
3570  Size sz = sizeof(ReorderBufferDiskChange);
3571 
3573 
3574  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3575  memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3576 
3577  switch (change->action)
3578  {
3579  /* fall through these, they're all similar enough */
3584  {
3585  char *data;
3586  ReorderBufferTupleBuf *oldtup,
3587  *newtup;
3588  Size oldlen = 0;
3589  Size newlen = 0;
3590 
3591  oldtup = change->data.tp.oldtuple;
3592  newtup = change->data.tp.newtuple;
3593 
      /* each present tuple contributes its HeapTupleData header plus t_len bytes */
3594  if (oldtup)
3595  {
3596  sz += sizeof(HeapTupleData);
3597  oldlen = oldtup->tuple.t_len;
3598  sz += oldlen;
3599  }
3600 
3601  if (newtup)
3602  {
3603  sz += sizeof(HeapTupleData);
3604  newlen = newtup->tuple.t_len;
3605  sz += newlen;
3606  }
3607 
3608  /* make sure we have enough space */
3610 
3611  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3612  /* might have been reallocated above */
3613  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3614 
3615  if (oldlen)
3616  {
3617  memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
3618  data += sizeof(HeapTupleData);
3619 
3620  memcpy(data, oldtup->tuple.t_data, oldlen);
3621  data += oldlen;
3622  }
3623 
3624  if (newlen)
3625  {
3626  memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
3627  data += sizeof(HeapTupleData);
3628 
3629  memcpy(data, newtup->tuple.t_data, newlen);
3630  data += newlen;
3631  }
3632  break;
3633  }
3635  {
3636  char *data;
3637  Size prefix_size = strlen(change->data.msg.prefix) + 1;
3638 
      /* payload: prefix length, prefix (with NUL), message length, message */
3639  sz += prefix_size + change->data.msg.message_size +
3640  sizeof(Size) + sizeof(Size);
3642 
3643  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3644 
3645  /* might have been reallocated above */
3646  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3647 
3648  /* write the prefix including the size */
3649  memcpy(data, &prefix_size, sizeof(Size));
3650  data += sizeof(Size);
3651  memcpy(data, change->data.msg.prefix,
3652  prefix_size);
3653  data += prefix_size;
3654 
3655  /* write the message including the size */
3656  memcpy(data, &change->data.msg.message_size, sizeof(Size));
3657  data += sizeof(Size);
3658  memcpy(data, change->data.msg.message,
3659  change->data.msg.message_size);
3660  data += change->data.msg.message_size;
3661 
3662  break;
3663  }
3665  {
3666  char *data;
3667  Size inval_size = sizeof(SharedInvalidationMessage) *
3668  change->data.inval.ninvalidations;
3669 
3670  sz += inval_size;
3671 
3673  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3674 
3675  /* might have been reallocated above */
3676  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3677  memcpy(data, change->data.inval.invalidations, inval_size);
3678  data += inval_size;
3679 
3680  break;
3681  }
3683  {
3684  Snapshot snap;
3685  char *data;
3686 
3687  snap = change->data.snapshot;
3688 
      /* payload: the SnapshotData struct followed by the xip and subxip arrays */
3689  sz += sizeof(SnapshotData) +
3690  sizeof(TransactionId) * snap->xcnt +
3691  sizeof(TransactionId) * snap->subxcnt;
3692 
3693  /* make sure we have enough space */
3695  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3696  /* might have been reallocated above */
3697  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3698 
3699  memcpy(data, snap, sizeof(SnapshotData));
3700  data += sizeof(SnapshotData);
3701 
3702  if (snap->xcnt)
3703  {
3704  memcpy(data, snap->xip,
3705  sizeof(TransactionId) * snap->xcnt);
3706  data += sizeof(TransactionId) * snap->xcnt;
3707  }
3708 
3709  if (snap->subxcnt)
3710  {
3711  memcpy(data, snap->subxip,
3712  sizeof(TransactionId) * snap->subxcnt);
3713  data += sizeof(TransactionId) * snap->subxcnt;
3714  }
3715  break;
3716  }
3718  {
3719  Size size;
3720  char *data;
3721 
3722  /* account for the OIDs of truncated relations */
3723  size = sizeof(Oid) * change->data.truncate.nrelids;
3724  sz += size;
3725 
3726  /* make sure we have enough space */
3728 
3729  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3730  /* might have been reallocated above */
3731  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3732 
3733  memcpy(data, change->data.truncate.relids, size);
3734  data += size;
3735 
3736  break;
3737  }
3741  /* ReorderBufferChange contains everything important */
3742  break;
3743  }
3744 
      /* record the total on-disk length in the header before writing */
3745  ondisk->size = sz;
3746 
      /* clear errno so a short write that leaves it untouched can be detected */
3747  errno = 0;
3749  if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3750  {
      /* save errno first: CloseTransientFile may overwrite it */
3751  int save_errno = errno;
3752 
3753  CloseTransientFile(fd);
3754 
3755  /* if write didn't set errno, assume problem is no disk space */
3756  errno = save_errno ? save_errno : ENOSPC;
3757  ereport(ERROR,
3759  errmsg("could not write to data file for XID %u: %m",
3760  txn->xid)));
3761  }
3763 
3764  /*
3765  * Keep the transaction's final_lsn up to date with each change we send to
3766  * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3767  * only do this on commit and abort records, but that doesn't work if a
3768  * system crash leaves a transaction without its abort record).
3769  *
3770  * Make sure not to move it backwards.
3771  */
3772  if (txn->final_lsn < change->lsn)
3773  txn->final_lsn = change->lsn;
3774 
3775  Assert(ondisk->change.action == change->action);
3776 }
3777 
3778 /*
      * Returns true if the output plugin supports streaming, false otherwise.
      *
      * The answer is simply the ctx->streaming flag.
      * NOTE(review): the declaration and initialization of ctx are not visible
      * in this listing.
      */
3779 static inline bool
3781 {
3783 
3784  return ctx->streaming;
3785 }
3786 
3787 /*
      * Returns true if streaming can be started now, false otherwise.
      *
      * Streaming is allowed only when ReorderBufferCanStream(rb) is true and
      * SnapBuildXactNeedsSkip() reports false for ctx->reader->EndRecPtr.
      */
3788 static inline bool
3790 {
3792  SnapBuild *builder = ctx->snapshot_builder;
3793 
3794  /* We can't start streaming unless a consistent state is reached. */
3796  return false;
3797 
3798  /*
3799  * We can't start streaming immediately even if the streaming is enabled
3800  * because we previously decoded this transaction and now just are
3801  * restarting.
3802  */
3803  if (ReorderBufferCanStream(rb) &&
3804  !SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr))
3805  return true;
3806 
3807  return false;
3808 }
3809 
3810 /*
3811  * Send data of a large transaction (and its subtransactions) to the
3812  * output plugin, but using the stream API.
      *
      * Must be called for a top-level transaction only (asserted). Returns
      * early without streaming anything when the transaction has no base
      * snapshot. The streamCount/streamBytes/streamTxns counters on rb are
      * updated only after ReorderBufferProcessTXN has returned.
3813  */
3814 static void
3816 {
3817  Snapshot snapshot_now;
3818  CommandId command_id;
3819  Size stream_bytes;
3820  bool txn_is_streamed;
3821 
3822  /* We can never reach here for a subtransaction. */
3823  Assert(txn->toptxn == NULL);
3824 
3825  /*
3826  * We can't make any assumptions about base snapshot here, similar to what
3827  * ReorderBufferCommit() does. That relies on base_snapshot getting
3828  * transferred from subxact in ReorderBufferCommitChild(), but that was
3829  * not yet called as the transaction is in-progress.
3830  *
3831  * So just walk the subxacts and use the same logic here. But we only need
3832  * to do that once, when the transaction is streamed for the first time.
3833  * After that we need to reuse the snapshot from the previous run.
3834  *
3835  * Unlike DecodeCommit which adds xids of all the subtransactions in
3836  * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here
3837  * but we do add them to subxip array instead via ReorderBufferCopySnap.
3838  * This allows the catalog changes made in subtransactions decoded till
3839  * now to be visible.
3840  */
3841  if (txn->snapshot_now == NULL)
3842  {
3843  dlist_iter subxact_i;
3844 
3845  /* make sure this transaction is streamed for the first time */
3846  Assert(!rbtxn_is_streamed(txn));
3847 
3848  /* at the beginning we should have invalid command ID */
3850 
3851  dlist_foreach(subxact_i, &txn->subtxns)
3852  {
3853  ReorderBufferTXN *subtxn;
3854 
3855  subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
3856  ReorderBufferTransferSnapToParent(txn, subtxn);
3857  }
3858 
3859  /*
3860  * If this transaction has no snapshot, it didn't make any changes to
3861  * the database till now, so there's nothing to decode.
3862  */
3863  if (txn->base_snapshot == NULL)
3864  {
3865  Assert(txn->ninvalidations == 0);
3866  return;
3867  }
3868 
      /* first streaming run: start from the base snapshot at FirstCommandId */
3869  command_id = FirstCommandId;
3870  snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
3871  txn, command_id);
3872  }
3873  else
3874  {
3875  /* the transaction must have been already streamed */
3876  Assert(rbtxn_is_streamed(txn));
3877 
3878  /*
3879  * Nah, we already have snapshot from the previous streaming run. We
3880  * assume new subxacts can't move the LSN backwards, and so can't beat
3881  * the LSN condition in the previous branch (so no need to walk
3882  * through subxacts again). In fact, we must not do that as we may be
3883  * using snapshot half-way through the subxact.
3884  */
3885  command_id = txn->command_id;
3886 
3887  /*
3888  * We can't use txn->snapshot_now directly because after the last
3889  * streaming run, we might have got some new sub-transactions. So we
3890  * need to add them to the snapshot.
3891  */
3892  snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
3893  txn, command_id);
3894 
3895  /* Free the previously copied snapshot. */
3896  Assert(txn->snapshot_now->copied);
3898  txn->snapshot_now = NULL;
3899  }
3900 
3901  /*
3902  * Remember this information to be used later to update stats. We can't
3903  * update the stats here as an error while processing the changes would
3904  * lead to the accumulation of stats even though we haven't streamed all
3905  * the changes.
3906  */
3907  txn_is_streamed = rbtxn_is_streamed(txn);
3908  stream_bytes = txn->total_size;
3909 
3910  /* Process and send the changes to output plugin. */
3911  ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
3912  command_id, true);
3913 
3914  rb->streamCount += 1;
3915  rb->streamBytes += stream_bytes;
3916 
3917  /* Don't consider already streamed transaction. */
3918  rb->streamTxns += (txn_is_streamed) ? 0 : 1;
3919 
      /* after streaming, nothing of this transaction may remain queued */
3920  Assert(dlist_is_empty(&txn->changes));
3921  Assert(txn->nentries == 0);
3922  Assert(txn->nentries_mem == 0);
3923 }
3924 
3925 /*
3926  * Size of a change in memory.
      *
      * Starts from sizeof(ReorderBufferChange) and adds an action-specific
      * payload size. The per-action formulas are the same ones used when a
      * change is serialized to disk.
3927  */
3928 static Size
3930 {
3931  Size sz = sizeof(ReorderBufferChange);
3932 
3933  switch (change->action)
3934  {
3935  /* fall through these, they're all similar enough */
3940  {
3941  ReorderBufferTupleBuf *oldtup,
3942  *newtup;
3943  Size oldlen = 0;
3944  Size newlen = 0;
3945 
3946  oldtup = change->data.tp.oldtuple;
3947  newtup = change->data.tp.newtuple;
3948 
      /* each present tuple counts its HeapTupleData header plus t_len bytes */
3949  if (oldtup)
3950  {
3951  sz += sizeof(HeapTupleData);
3952  oldlen = oldtup->tuple.t_len;
3953  sz += oldlen;
3954  }
3955 
3956  if (newtup)
3957  {
3958  sz += sizeof(HeapTupleData);
3959  newlen = newtup->tuple.t_len;
3960  sz += newlen;
3961  }
3962 
3963  break;
3964  }
3966  {
3967  Size prefix_size = strlen(change->data.msg.prefix) + 1;
3968 
      /* prefix (with NUL) and message bytes, plus two Size length fields */
3969  sz += prefix_size + change->data.msg.message_size +
3970  sizeof(Size) + sizeof(Size);
3971 
3972  break;
3973  }
3975  {
3976  sz += sizeof(SharedInvalidationMessage) *
3977  change->data.inval.ninvalidations;
3978  break;
3979  }
3981  {
3982  Snapshot snap;
3983 
3984  snap = change->data.snapshot;
3985 
3986  sz += sizeof(SnapshotData) +
3987  sizeof(TransactionId) * snap->xcnt +
3988  sizeof(TransactionId) * snap->subxcnt;
3989 
3990  break;
3991  }
3993  {
3994  sz += sizeof(Oid) * change->data.truncate.nrelids;
3995 
3996  break;
3997  }
4001  /* ReorderBufferChange contains everything important */
4002  break;
4003  }
4004 
4005  return sz;
4006 }
4007 
4008 
4009 /*
4010  * Restore a number of changes spilled to disk back into memory.
      *
      * First releases all changes currently on txn->changes, then reads records
      * from the spill files until max_changes_in_memory changes were restored
      * or *segno has moved past the segment containing txn->final_lsn.
      *
      * *segno is the segment being read; it is initialized from txn->first_lsn
      * when it is 0 and advanced whenever a segment file does not exist
      * (ENOENT) or has been read to its end. file->curOffset tracks the read
      * position within the open file. Returns the number of changes restored.
4011  */
4012 static Size
4014  TXNEntryFile *file, XLogSegNo *segno)
4015 {
4016  Size restored = 0;
4017  XLogSegNo last_segno;
4018  dlist_mutable_iter cleanup_iter;
4019  File *fd = &file->vfd;
4020 
4023 
4024  /* free current entries, so we have memory for more */
4025  dlist_foreach_modify(cleanup_iter, &txn->changes)
4026  {
4028  dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4029 
4030  dlist_delete(&cleanup->node);
4031  ReorderBufferReturnChange(rb, cleanup, true);
4032  }
4033  txn->nentries_mem = 0;
4034  Assert(dlist_is_empty(&txn->changes));
4035 
4036  XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4037 
4038  while (restored < max_changes_in_memory && *segno <= last_segno)
4039  {
4040  int readBytes;
4041  ReorderBufferDiskChange *ondisk;
4042 
      /* no file open: open the spill file for the current segment */
4043  if (*fd == -1)
4044  {
4045  char path[MAXPGPATH];
4046 
4047  /* first time in */
4048  if (*segno == 0)
4049  XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4050 
4051  Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4052 
4053  /*
4054  * No need to care about TLIs here, only used during a single run,
4055  * so each LSN only maps to a specific WAL record.
4056  */
4058  *segno);
4059 
4060  *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4061 
4062  /* No harm in resetting the offset even in case of failure */
4063  file->curOffset = 0;
4064 
      /* a missing file is not an error: move on to the next segment */
4065  if (*fd < 0 && errno == ENOENT)
4066  {
4067  *fd = -1;
4068  (*segno)++;
4069  continue;
4070  }
4071  else if (*fd < 0)
4072  ereport(ERROR,
4074  errmsg("could not open file \"%s\": %m",
4075  path)));
4076  }
4077 
4078  /*
4079  * Read the statically sized part of a change which has information
4080  * about the total size. If we couldn't read a record, we're at the
4081  * end of this file.
4082  */
4084  readBytes = FileRead(file->vfd, rb->outbuf,
4085  sizeof(ReorderBufferDiskChange),
4087 
4088  /* eof */
4089  if (readBytes == 0)
4090  {
4091  FileClose(*fd);
4092  *fd = -1;
4093  (*segno)++;
4094  continue;
4095  }
4096  else if (readBytes < 0)
4097  ereport(ERROR,
4099  errmsg("could not read from reorderbuffer spill file: %m")));
4100  else if (readBytes != sizeof(ReorderBufferDiskChange))
4101  ereport(ERROR,
4103  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4104  readBytes,
4105  (uint32) sizeof(ReorderBufferDiskChange))));
4106 
4107  file->curOffset += readBytes;
4108 
4109  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4110 
4112  sizeof(ReorderBufferDiskChange) + ondisk->size);
      /* re-derive the pointer from rb->outbuf before using ondisk->size below */
4113  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4114 
      /* read the variable-length remainder; ondisk->size includes the header */
4115  readBytes = FileRead(file->vfd,
4116  rb->outbuf + sizeof(ReorderBufferDiskChange),
4117  ondisk->size - sizeof(ReorderBufferDiskChange),
4118  file->curOffset,
4120 
4121  if (readBytes < 0)
4122  ereport(ERROR,
4124  errmsg("could not read from reorderbuffer spill file: %m")));
4125  else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4126  ereport(ERROR,
4128  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4129  readBytes,
4130  (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4131 
4132  file->curOffset += readBytes;
4133 
4134  /*
4135  * ok, read a full change from disk, now restore it into proper
4136  * in-memory format
4137  */
4138  ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4139  restored++;
4140  }
4141 
4142  return restored;
4143 }
4144 
4145 /*
4146  * Convert change from its on-disk format to in-memory format and queue it onto
4147  * the TXN's ->changes list.
4148  *
4149  * Note: although "data" is declared char*, at entry it points to a
4150  * maxalign'd buffer, making it safe in most of this function to assume
4151  * that the pointed-to data is suitably aligned for direct access.
      *
      * "data" starts with a ReorderBufferDiskChange header; the action-specific
      * payload follows it. The restored change is appended to txn->changes,
      * txn->nentries_mem is incremented, and the memory accounting is updated.
4152  */
4153 static void
4155  char *data)
4156 {
4157  ReorderBufferDiskChange *ondisk;
4158  ReorderBufferChange *change;
4159 
4160  ondisk = (ReorderBufferDiskChange *) data;
4161 
4162  change = ReorderBufferGetChange(rb);
4163 
4164  /* copy static part */
4165  memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4166 
      /* step over the header; data now points at the payload */
4167  data += sizeof(ReorderBufferDiskChange);
4168 
4169  /* restore individual stuff */
4170  switch (change->action)
4171  {
4172  /* fall through these, they're all similar enough */
      /*
       * The tuple pointers copied from disk are stale; they are used here
       * only as "was a tuple present" flags and are then replaced.
       */
4177  if (change->data.tp.oldtuple)
4178  {
4179  uint32 tuplelen = ((HeapTuple) data)->t_len;
4180 
4181  change->data.tp.oldtuple =
4183 
4184  /* restore ->tuple */
4185  memcpy(&change->data.tp.oldtuple->tuple, data,
4186  sizeof(HeapTupleData));
4187  data += sizeof(HeapTupleData);
4188 
4189  /* reset t_data pointer into the new tuplebuf */
4190  change->data.tp.oldtuple->tuple.t_data =
4191  ReorderBufferTupleBufData(change->data.tp.oldtuple);
4192 
4193  /* restore tuple data itself */
4194  memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
4195  data += tuplelen;
4196  }
4197 
4198  if (change->data.tp.newtuple)
4199  {
4200  /* here, data might not be suitably aligned! */
4201  uint32 tuplelen;
4202 
      /* hence t_len is fetched with memcpy rather than a direct dereference */
4203  memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4204  sizeof(uint32));
4205 
4206  change->data.tp.newtuple =
4208 
4209  /* restore ->tuple */
4210  memcpy(&change->data.tp.newtuple->tuple, data,
4211  sizeof(HeapTupleData));
4212  data += sizeof(HeapTupleData);
4213 
4214  /* reset t_data pointer into the new tuplebuf */
4215  change->data.tp.newtuple->tuple.t_data =
4216  ReorderBufferTupleBufData(change->data.tp.newtuple);
4217 
4218  /* restore tuple data itself */
4219  memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
4220  data += tuplelen;
4221  }
4222 
4223  break;
4225  {
4226  Size prefix_size;
4227 
4228  /* read prefix */
4229  memcpy(&prefix_size, data, sizeof(Size));
4230  data += sizeof(Size);
4231  change->data.msg.prefix = MemoryContextAlloc(rb->context,
4232  prefix_size);
4233  memcpy(change->data.msg.prefix, data, prefix_size);
4234  Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4235  data += prefix_size;
4236 
4237  /* read the message */
4238  memcpy(&change->data.msg.message_size, data, sizeof(Size));
4239  data += sizeof(Size);
4240  change->data.msg.message = MemoryContextAlloc(rb->context,
4241  change->data.msg.message_size);
4242  memcpy(change->data.msg.message, data,
4243  change->data.msg.message_size);
4244  data += change->data.msg.message_size;
4245 
4246  break;
4247  }
4249  {
4250  Size inval_size = sizeof(SharedInvalidationMessage) *
4251  change->data.inval.ninvalidations;
4252 
4253  change->data.inval.invalidations =
4254  MemoryContextAlloc(rb->context, inval_size);
4255 
4256  /* read the message */
4257  memcpy(change->data.inval.invalidations, data, inval_size);
4258 
4259  break;
4260  }
4262  {
4263  Snapshot oldsnap;
4264  Snapshot newsnap;
4265  Size size;
4266 
4267  oldsnap = (Snapshot) data;
4268 
4269  size = sizeof(SnapshotData) +
4270  sizeof(TransactionId) * oldsnap->xcnt +
4271  sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4272 
4273  change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4274 
4275  newsnap = change->data.snapshot;
4276 
      /*
       * Copy struct and trailing arrays in one go, then repoint xip/subxip
       * at the arrays that now live directly behind the new struct.
       */
4277  memcpy(newsnap, data, size);
4278  newsnap->xip = (TransactionId *)
4279  (((char *) newsnap) + sizeof(SnapshotData));
4280  newsnap->subxip = newsnap->xip + newsnap->xcnt;
4281  newsnap->copied = true;
4282  break;
4283  }
4284  /* the base struct contains all the data, easy peasy */
4286  {
4287  Oid *relids;
4288 
4289  relids = ReorderBufferGetRelids(rb,
4290  change->data.truncate.nrelids);
4291  memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4292  change->data.truncate.relids = relids;
4293 
4294  break;
4295  }
4299  break;
4300  }
4301 
4302  dlist_push_tail(&txn->changes, &change->node);
4303  txn->nentries_mem++;
4304 
4305  /*
4306  * Update memory accounting for the restored change. We need to do this
4307  * although we don't check the memory limit when restoring the changes in
4308  * this branch (we only do that when initially queueing the changes after
4309  * decoding), because we will release the changes later, and that will
4310  * update the accounting too (subtracting the size from the counters). And
4311  * we don't want to underflow there.
4312  */
4313  ReorderBufferChangeMemoryUpdate(rb, change, true);
4314 }
4315 
4316 /*
4317  * Remove all on-disk data stored for the passed-in transaction.
      *
      * Tries to unlink one spill file for every WAL segment between the segment
      * of txn->first_lsn and the segment of txn->final_lsn (both inclusive).
      * Segments without a file (ENOENT) are skipped silently; any other unlink
      * failure raises an ERROR.
4318  */
4319 static void
4321 {
4322  XLogSegNo first;
4323  XLogSegNo cur;
4324  XLogSegNo last;
4325 
4328 
4329  XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4330  XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4331 
4332  /* iterate over all possible filenames, and delete them */
4333  for (cur = first; cur <= last; cur++)
4334  {
4335  char path[MAXPGPATH];
4336 
4338  if (unlink(path) != 0 && errno != ENOENT)
4339  ereport(ERROR,
4341  errmsg("could not remove file \"%s\": %m", path)));
4342  }
4343 }
4344 
4345 /*
4346  * Remove any leftover serialized reorder buffers from a slot directory after a
4347  * prior crash or decoding session exit.
      *
      * Unlinks every entry of pg_replslot/<slotname> whose name starts with
      * "xid". Returns without doing anything if that path exists but is not a
      * directory. Any unlink failure raises an ERROR.
4348  */
4349 static void
4351 {
4352  DIR *spill_dir;
4353  struct dirent *spill_de;
4354  struct stat statbuf;
4355  char path[MAXPGPATH * 2 + 12];
4356 
4357  sprintf(path, "pg_replslot/%s", slotname);
4358 
4359  /* we're only handling directories here, skip if it's not ours */
4360  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4361  return;
4362 
4363  spill_dir = AllocateDir(path);
      /*
       * NOTE(review): path is overwritten with a file name inside the loop
       * below, yet it is also the directory-name argument passed to
       * ReadDirExtended on later iterations. Confirm that argument is only
       * used for error reporting.
       */
4364  while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4365  {
4366  /* only look at names that can be ours */
4367  if (strncmp(spill_de->d_name, "xid", 3) == 0)
4368  {
4369  snprintf(path, sizeof(path),
4370  "pg_replslot/%s/%s", slotname,
4371  spill_de->d_name);
4372 
4373  if (unlink(path) != 0)
4374  ereport(ERROR,
4376  errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4377  path, slotname)));
4378  }
4379  }
4380  FreeDir(spill_dir);
4381 }
4382 
4383 /*
4384  * Given a replication slot, transaction ID and segment number, fill in the
4385  * corresponding spill file into 'path', which is a caller-owned buffer of size
4386  * at least MAXPGPATH.
      *
      * The name has the form pg_replslot/<slot>/xid-<xid>-lsn-<hi>-<lo>.spill.
      * Output is bounded to MAXPGPATH bytes by snprintf.
4387  */
4388 static void
4390  XLogSegNo segno)
4391 {
4392  XLogRecPtr recptr;
4393 
      /* recptr is presumably the LSN at offset 0 of segment segno */
4394  XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4395 
4396  snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4398  xid, LSN_FORMAT_ARGS(recptr));
4399 }
4400 
4401 /*
4402  * Delete all data spilled to disk after we've restarted/crashed. It will be
4403  * recreated when the respective slots are reused.
      *
      * Walks the entries of the pg_replslot directory, skipping "." and ".."
      * and any entry whose name is not a valid replication slot name.
4404  */
4405 void
4407 {
4408  DIR *logical_dir;
4409  struct dirent *logical_de;
4410 
4411  logical_dir = AllocateDir("pg_replslot");
4412  while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4413  {
4414  if (strcmp(logical_de->d_name, ".") == 0 ||
4415  strcmp(logical_de->d_name, "..") == 0)
4416  continue;
4417 
4418  /* if it cannot be a slot, skip the directory */
4419  if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4420  continue;
4421 
4422  /*
4423  * ok, has to be a surviving logical slot, iterate and delete
4424  * everything starting with xid-*
4425  */
4427  }
4428  FreeDir(logical_dir);
4429 }
4430 
4431 /* ---------------------------------------
4432  * toast reassembly support
4433  * ---------------------------------------
4434  */
4435 
4436 /*
4437  * Initialize per tuple toast reconstruction support.
      *
      * Creates txn->toast_hash, which must not exist yet (asserted). Keys are
      * Oids, entries are ReorderBufferToastEnt, and the memory context handed
      * to the hash table is rb->context.
4438  */
4439 static void
4441 {
4442  HASHCTL hash_ctl;
4443 
4444  Assert(txn->toast_hash == NULL);
4445 
4446  hash_ctl.keysize = sizeof(Oid);
4447  hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4448  hash_ctl.hcxt = rb->context;
4449  txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4451 }
4452 
4453 /*
4454  * Per toast-chunk handling for toast reconstruction
4455  *
4456  * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4457  * toasted Datum comes along.
      *
      * Chunks are collected per chunk_id in txn->toast_hash (created on first
      * use). They must arrive in order: the first chunk seen for a chunk_id
      * must have sequence number 0 and each later one must be exactly
      * last_chunk_seq + 1, otherwise an ERROR is raised. The change itself is
      * linked onto the entry's chunk list.
4458  */
4459 static void
4461  Relation relation, ReorderBufferChange *change)
4462 {
4463  ReorderBufferToastEnt *ent;
4464  ReorderBufferTupleBuf *newtup;
4465  bool found;
4466  int32 chunksize;
4467  bool isnull;
4468  Pointer chunk;
4469  TupleDesc desc = RelationGetDescr(relation);
4470  Oid chunk_id;
4471  int32 chunk_seq;
4472 
4473  if (txn->toast_hash == NULL)
4474  ReorderBufferToastInitHash(rb, txn);
4475 
4476  Assert(IsToastRelation(relation));
4477 
      /* attribute 1 is read as the chunk id, attribute 2 as the chunk sequence number */
4478  newtup = change->data.tp.newtuple;
4479  chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull));
4480  Assert(!isnull);
4481  chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull));
4482  Assert(!isnull);
4483 
      /* find the entry for this chunk_id, creating it if necessary */
4484  ent = (ReorderBufferToastEnt *)
4485  hash_search(txn->toast_hash,
4486  (void *) &chunk_id,
4487  HASH_ENTER,
4488  &found);
4489 
4490  if (!found)
4491  {
      /* new entry: initialize it and insist that the first chunk is seq 0 */
4492  Assert(ent->chunk_id == chunk_id);
4493  ent->num_chunks = 0;
4494  ent->last_chunk_seq = 0;
4495  ent->size = 0;
4496  ent->reconstructed = NULL;
4497  dlist_init(&ent->chunks);
4498 
4499  if (chunk_seq != 0)
4500  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4501  chunk_seq, chunk_id);
4502  }
4503  else if (found && chunk_seq != ent->last_chunk_seq + 1)
4504  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4505  chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4506 
      /* attribute 3 is the chunk payload */
4507  chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull));
4508  Assert(!isnull);
4509 
4510  /* calculate size so we can allocate the right size at once later */
4511  if (!VARATT_IS_EXTENDED(chunk))
4512  chunksize = VARSIZE(chunk) - VARHDRSZ;
4513  else if (VARATT_IS_SHORT(chunk))
4514  /* could happen due to heap_form_tuple doing its thing */
4515  chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4516  else
4517  elog(ERROR, "unexpected type of toast chunk");
4518 
4519  ent->size += chunksize;
4520  ent->last_chunk_seq = chunk_seq;
4521  ent->num_chunks++;
4522  dlist_push_tail(&ent->chunks, &change->node);
4523 }
4524 
4525 /*
4526  * Rejigger change->newtuple to point to in-memory toast tuples instead of
4527  * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
4528  *
4529  * We cannot replace unchanged toast tuples though, so those will still point
4530  * to on-disk toast data.
4531  *
4532  * While updating the existing change with detoasted tuple data, we need to
4533  * update the memory accounting info, because the change size will differ.
4534  * Otherwise the accounting may get out of sync, triggering serialization
4535  * at unexpected times.
4536  *
4537  * We simply subtract size of the change before rejiggering the tuple, and
4538  * then adding the new size. This makes it look like the change was removed
4539  * and then added back, except it only tweaks the accounting info.
4540  *
4541  * In particular it can't trigger serialization, which would be pointless
4542  * anyway as it happens during commit processing right before handing
4543  * the change to the output plugin.
4544  */
4545 static void
4547  Relation relation, ReorderBufferChange *change)
4548 {
4549  TupleDesc desc;
4550  int natt;
4551  Datum *attrs;
4552  bool *isnull;
4553  bool *free;
4554  HeapTuple tmphtup;
4555  Relation toast_rel;
4556  TupleDesc toast_desc;
4557  MemoryContext oldcontext;
4558  ReorderBufferTupleBuf *newtup;
4559 
4560  /* no toast tuples changed */
4561  if (txn->toast_hash == NULL)
4562  return;
4563 
4564  /*
4565  * We're going to modify the size of the change, so to make sure the
4566  * accounting is correct we'll make it look like we're removing the change
4567  * now (with the old size), and then re-add it at the end.
4568  */
4569  ReorderBufferChangeMemoryUpdate(rb, change, false);
4570 
4571  oldcontext = MemoryContextSwitchTo(rb->context);
4572 
4573  /* we should only have toast tuples in an INSERT or UPDATE */
4574  Assert(change->data.tp.newtuple);
4575 
4576  desc = RelationGetDescr(relation);
4577 
4578  toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4579  if (!RelationIsValid(toast_rel))
4580  elog(ERROR, "could not open relation with OID %u",
4581  relation->rd_rel->reltoastrelid);
4582 
4583  toast_desc = RelationGetDescr(toast_rel);
4584 
4585  /* should we allocate from stack instead? */
4586  attrs = palloc0(sizeof(Datum) * desc->natts);
4587  isnull = palloc0(sizeof(bool) * desc->natts);
4588  free = palloc0(sizeof(bool) * desc->natts);
4589 
4590  newtup = change->data.tp.newtuple;
4591 
4592  heap_deform_tuple(&newtup->tuple, desc, attrs, isnull);
4593 
4594  for (natt = 0; natt < desc->natts; natt++)
4595  {
4596  Form_pg_attribute attr = TupleDescAttr(desc, natt);
4597  ReorderBufferToastEnt *ent;
4598  struct varlena *varlena;
4599 
4600  /* va_rawsize is the size of the original datum -- including header */
4601  struct varatt_external toast_pointer;
4602  struct varatt_indirect redirect_pointer;
4603  struct varlena *new_datum = NULL;
4604  struct varlena *reconstructed;
4605  dlist_iter it;
4606  Size data_done = 0;
4607 
4608  /* system columns aren't toasted */
4609  if (attr->attnum < 0)
4610  continue;
4611 
4612  if (attr->attisdropped)
4613  continue;
4614 
4615  /* not a varlena datatype */
4616  if (attr->attlen != -1)
4617  continue;
4618 
4619  /* no data */
4620  if (isnull[natt])
4621  continue;
4622 
4623  /* ok, we know we have a toast datum */
4624  varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4625 
4626  /* no need to do anything if the tuple isn't external */
4627  if (!VARATT_IS_EXTERNAL(varlena))
4628  continue;
4629 
4630  VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4631 
4632  /*
4633  * Check whether the toast tuple changed, replace if so.
4634  */
4635  ent = (ReorderBufferToastEnt *)
4636  hash_search(txn->toast_hash,
4637  (void *) &toast_pointer.va_valueid,
4638  HASH_FIND,
4639  NULL);
4640  if (ent == NULL)
4641  continue;
4642 
4643  new_datum =
4644  (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
4645 
4646  free[natt] = true;
4647 
4648  reconstructed = palloc0(toast_pointer.va_rawsize);
4649 
4650  ent->reconstructed = reconstructed;
4651 
4652  /* stitch toast tuple back together from its parts */
4653  dlist_foreach(it, &ent->chunks)
4654  {
4655  bool isnull;
4656  ReorderBufferChange *cchange;
4657  ReorderBufferTupleBuf *ctup;
4658  Pointer chunk;
4659 
4660  cchange = dlist_container(ReorderBufferChange, node, it.cur);
4661  ctup = cchange->data.tp.newtuple;
4662  chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull));
4663 
4664  Assert(!isnull);
4665  Assert(!VARATT_IS_EXTERNAL(chunk));
4666  Assert(!VARATT_IS_SHORT(chunk));
4667 
4668  memcpy(VARDATA(reconstructed) + data_done,
4669  VARDATA(chunk),
4670  VARSIZE(chunk) - VARHDRSZ);
4671  data_done += VARSIZE(chunk) - VARHDRSZ;
4672  }
4673  Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
4674 
4675  /* make sure its marked as compressed or not */
4676  if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4677  SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4678  else
4679  SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4680 
4681  memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4682  redirect_pointer.pointer = reconstructed;
4683 
4685  memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4686  sizeof(redirect_pointer));
4687 
4688  attrs[natt] = PointerGetDatum(new_datum);
4689  }
4690 
4691  /*
4692  * Build tuple in separate memory & copy tuple back into the tuplebuf
4693  * passed to the output plugin. We can't directly heap_fill_tuple() into
4694  * the tuplebuf because attrs[] will point back into the current content.
4695  */
4696  tmphtup = heap_form_tuple(desc, attrs, isnull);
4697  Assert(newtup->tuple.t_len <= MaxHeapTupleSize);
4698  Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data);
4699 
4700  memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len);
4701  newtup->tuple.t_len = tmphtup->t_len;
4702 
4703  /*
4704  * free resources we won't further need, more persistent stuff will be
4705  * free'd in ReorderBufferToastReset().
4706  */
4707  RelationClose(toast_rel);
4708  pfree(tmphtup);
4709  for (natt = 0; natt < desc->natts; natt++)
4710  {
4711  if (free[natt])
4712  pfree(DatumGetPointer(attrs[natt]));
4713  }
4714  pfree(attrs);
4715  pfree(free);
4716  pfree(isnull);
4717 
4718  MemoryContextSwitchTo(oldcontext);
4719 
4720  /* now add the change back, with the correct size */
4721  ReorderBufferChangeMemoryUpdate(rb, change, true);
4722 }
4723 
4724 /*
4725  * Free all resources allocated for toast reconstruction.
      *
      * For every entry in txn->toast_hash this frees the reconstructed datum
      * (if one was built), unlinks each queued chunk change from the entry's
      * list and hands it to ReorderBufferReturnChange(). Afterwards the hash
      * itself is destroyed and txn->toast_hash is reset to NULL. Does nothing
      * when txn->toast_hash is already NULL.
4726  */
4727 static void
4729 {
4730  HASH_SEQ_STATUS hstat;
4731  ReorderBufferToastEnt *ent;
4732 
4733  if (txn->toast_hash == NULL)
4734  return;
4735 
4736  /* sequentially walk over the hash and free everything */
4737  hash_seq_init(&hstat, txn->toast_hash);
4738  while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4739  {
4740  dlist_mutable_iter it;
4741 
4742  if (ent->reconstructed != NULL)
4743  pfree(ent->reconstructed);
4744 
      /* the _modify iterator is used because nodes are deleted while walking */
4745  dlist_foreach_modify(it, &ent->chunks)
4746  {
4747  ReorderBufferChange *change =
4749 
4750  dlist_delete(&change->node);
4751  ReorderBufferReturnChange(rb, change, true);
4752  }
4753  }
4754 
      /* clear the pointer so a later call sees that there is nothing to free */
4755  hash_destroy(txn->toast_hash);
4756  txn->toast_hash = NULL;
4757 }
4758 
4759 
4760 /* ---------------------------------------
4761  * Visibility support for logical decoding
4762  *
4763  *
4764  * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4765  * always rely on stored cmin/cmax values because of two scenarios:
4766  *
4767  * * A tuple got changed multiple times during a single transaction and thus
4768  * has got a combo CID. Combo CIDs are only valid for the duration of a
4769  * single transaction.
4770  * * A tuple with a cmin but no cmax (and thus no combo CID) got
4771  * deleted/updated in another transaction than the one which created it
4772  * which we are looking at right now. As only one of cmin, cmax or combo CID
4773  * is actually stored in the heap we don't have access to the value we
4774  * need anymore.
4775  *
4776  * To resolve those problems we have a per-transaction hash of (cmin,
4777  * cmax) tuples keyed by (relfilenode, ctid) which contains the actual
4778  * (cmin, cmax) values. That also takes care of combo CIDs by simply
4779  * not caring about them at all. As we have the real cmin/cmax values
4780  * combo CIDs aren't interesting.
4781  *
4782  * As we only care about catalog tuples here the overhead of this
4783  * hashtable should be acceptable.
4784  *
4785  * Heap rewrites complicate this a bit, check rewriteheap.c for
4786  * details.
4787  * -------------------------------------------------------------------------
4788  */
4789 
4790 /* struct for sorting mapping files by LSN efficiently */
4791 typedef struct RewriteMappingFile
4792 {
4794  char fname[MAXPGPATH]; /* file name as read from the mappings directory */
4796 
4797 #ifdef NOT_USED
     /*
      * Debugging aid, only compiled when NOT_USED is defined: walks every entry
      * of tuplecid_data and logs its key and cmin/cmax values at DEBUG3.
      */
4798 static void
4799 DisplayMapping(HTAB *tuplecid_data)
4800 {
4801  HASH_SEQ_STATUS hstat;
4803 
4804  hash_seq_init(&hstat, tuplecid_data);
4805  while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
4806  {
4807  elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
4808  ent->key.relnode.dbNode,
4809  ent->key.relnode.spcNode,
4810  ent->key.relnode.relNode,
4813  ent->cmin,
4814  ent->cmax
4815  );
4816  }
4817 }
4818 #endif
4819 
4820 /*
4821  * Apply a single mapping file to tuplecid_data.
4822  *
4823  * The mapping file has to have been verified to be a) committed b) for our
4824  * transaction c) applied in LSN order.
      *
      * fname is the bare file name; it is opened below "pg_logical/mappings/".
      * The file is read as a sequence of fixed-size LogicalRewriteMappingData
      * records. For each record whose old (relfilenode, tid) location already
      * has an entry in tuplecid_data, an entry for the new location is created
      * with the same cmin/cmax/combocid. Any open, read, short-read or close
      * failure is reported with ereport(ERROR).
      *
      * NOTE(review): relid is not referenced in the visible body.
4825  */
4826 static void
4827 ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
4828 {
4829  char path[MAXPGPATH];
4830  int fd;
4831  int readBytes;
4833 
      /*
       * NOTE(review): sprintf is unbounded; this relies on fname being short
       * enough for path[MAXPGPATH] -- confirm against the caller.
       */
4834  sprintf(path, "pg_logical/mappings/%s", fname);
4835  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
4836  if (fd < 0)
4837  ereport(ERROR,
4839  errmsg("could not open file \"%s\": %m", path)));
4840 
4841  while (true)
4842  {
4845  ReorderBufferTupleCidEnt *new_ent;
4846  bool found;
4847 
4848  /* be careful about padding */
4849  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
4850 
4851  /* read all mappings till the end of the file */
4853  readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
4855 
4856  if (readBytes < 0)
4857  ereport(ERROR,
4859  errmsg("could not read file \"%s\": %m",
4860  path)));
4861  else if (readBytes == 0) /* EOF */
4862  break;
4863  else if (readBytes != sizeof(LogicalRewriteMappingData))
4864  ereport(ERROR,
4866  errmsg("could not read from file \"%s\": read %d instead of %d bytes",
4867  path, readBytes,
4868  (int32) sizeof(LogicalRewriteMappingData))));
4869 
      /* first look up the tuple's location before the rewrite */
4870  key.relnode = map.old_node;
4871  ItemPointerCopy(&map.old_tid,
4872  &key.tid);
4873 
4874 
4875  ent = (ReorderBufferTupleCidEnt *)
4876  hash_search(tuplecid_data,
4877  (void *) &key,
4878  HASH_FIND,
4879  NULL);
4880 
4881  /* no existing mapping, no need to update */
4882  if (!ent)
4883  continue;
4884 
      /* then find or create the entry for the location after the rewrite */
4885  key.relnode = map.new_node;
4886  ItemPointerCopy(&map.new_tid,
4887  &key.tid);
4888 
4889  new_ent = (ReorderBufferTupleCidEnt *)
4890  hash_search(tuplecid_data,
4891  (void *) &key,
4892  HASH_ENTER,
4893  &found);
4894 
4895  if (found)
4896  {
4897  /*
4898  * Make sure the existing mapping makes sense. We sometimes update
4899  * old records that did not yet have a cmax (e.g. pg_class' own
4900  * entry while rewriting it) during rewrites, so allow that.
4901  */
4902  Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
4903  Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
4904  }
4905  else
4906  {
4907  /* new entry: carry over the values recorded for the old location */
4908  new_ent->cmin = ent->cmin;
4909  new_ent->cmax = ent->cmax;
4910  new_ent->combocid = ent->combocid;
4911  }
4912  }
4913 
4914  if (CloseTransientFile(fd) != 0)
4915  ereport(ERROR,
4917  errmsg("could not close file \"%s\": %m", path)));
4918 }
4919 
4920 
4921 /*
4922  * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
      *
      * 'num' is the number of elements in 'xip'. The lookup uses bsearch() with
      * xidComparator, so 'xip' must be sorted in the order that comparator
      * defines. Returns true if a matching element exists, false otherwise.
4923  */
4924 static bool
4926 {
4927  return bsearch(&xid, xip, num,
4928  sizeof(TransactionId), xidComparator) != NULL;
4929 }
4930 
4931 /*
4932  * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
      *
      * Returns -1 if a's lsn is smaller than b's, 1 if it is larger and 0 if
      * they are equal, i.e. it orders by ascending lsn.
4933  */
4934 static int
4935 file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
4936 {
4939 
      /* explicit comparisons rather than a subtraction, which could not be returned as int safely */
4940  if (a->lsn < b->lsn)
4941  return -1;
4942  else if (a->lsn > b->lsn)
4943  return 1;
4944  return 0;
4945 }
4946 
4947 /*
4948  * Apply any existing logical remapping files if there are any targeted at our
4949  * transaction for relid.
      *
      * Scans the directory "pg_logical/mappings", parses every "map-" file name
      * and keeps the files that match our database (InvalidOid for shared
      * relations), match relid, were created by a committed transaction and are
      * mapped to an xid contained in snapshot->subxip. The surviving files are
      * sorted by the LSN encoded in their name and applied to tuplecid_data one
      * by one via ApplyLogicalMappingFile().
4950  */
4951 static void
4953 {
4954  DIR *mapping_dir;
4955  struct dirent *mapping_de;
4956  List *files = NIL;
4957  ListCell *file;
4958  Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
4959 
4960  mapping_dir = AllocateDir("pg_logical/mappings");
4961  while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
4962  {
4963  Oid f_dboid;
4964  Oid f_relid;
4965  TransactionId f_mapped_xid;
4966  TransactionId f_create_xid;
4967  XLogRecPtr f_lsn;
4968  uint32 f_hi,
4969  f_lo;
4970  RewriteMappingFile *f;
4971 
      /* skip the directory's self and parent entries */
4972  if (strcmp(mapping_de->d_name, ".") == 0 ||
4973  strcmp(mapping_de->d_name, "..") == 0)
4974  continue;
4975 
4976  /* Ignore files that aren't ours */
4977  if (strncmp(mapping_de->d_name, "map-", 4) != 0)
4978  continue;
4979 
      /* a "map-" file whose name does not yield all six fields is an error */
4980  if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
4981  &f_dboid, &f_relid, &f_hi, &f_lo,
4982  &f_mapped_xid, &f_create_xid) != 6)
4983  elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
4984 
      /* combine the two 32-bit halves parsed from the name into one LSN */
4985  f_lsn = ((uint64) f_hi) << 32 | f_lo;
4986 
4987  /* mapping for another database */
4988  if (f_dboid != dboid)
4989  continue;
4990 
4991  /* mapping for another relation */
4992  if (f_relid != relid)
4993  continue;
4994 
4995  /* did the creating transaction abort? */
4996  if (!TransactionIdDidCommit(f_create_xid))
4997  continue;
4998 
4999  /* not for our transaction */
5000  if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5001  continue;
5002 
5003  /* ok, relevant, queue for apply */
5004  f = palloc(sizeof(RewriteMappingFile));
5005  f->lsn = f_lsn;
      /*
       * NOTE(review): strcpy is unbounded; presumably d_name always fits into
       * fname[MAXPGPATH] -- confirm.
       */
5006  strcpy(f->fname, mapping_de->d_name);
5007  files = lappend(files, f);
5008  }
5009  FreeDir(mapping_dir);
5010 
5011  /* sort files so we apply them in LSN order */
5012  list_sort(files, file_sort_by_lsn);
5013 
5014  foreach(file, files)
5015  {
5017 
      /*
       * Reading subxip[0] is safe here: a file is only queued after its xid
       * was found in subxip, so the array cannot be empty in this loop.
       */
5018  elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5019  snapshot->subxip[0]);
5020  ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
5021  pfree(f);
5022  }
5023 }
5024 
5025 /*
5026  * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5027  * combo CIDs.
      *
      * The tuple is identified by the relfilenode taken from 'buffer' plus
      * htup->t_self. Returns false if tuplecid_data is NULL or if no entry is
      * found even after applying rewrite mapping files once. Otherwise returns
      * true and stores the entry's values in *cmin and *cmax; either pointer
      * may be NULL, in which case that value is not returned.
5028  */
5029 bool
5031  Snapshot snapshot,
5032  HeapTuple htup, Buffer buffer,
5033  CommandId *cmin, CommandId *cmax)
5034 {
5037  ForkNumber forkno;
5038  BlockNumber blockno;
5039  bool updated_mapping = false; /* set once the mapping files were applied */
5040 
5041  /*
5042  * Return unresolved if tuplecid_data is not valid. That's because when
5043  * streaming in-progress transactions we may run into tuples with the CID
5044  * before actually decoding them. Think e.g. about INSERT followed by
5045  * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5046  * INSERT. So in such cases, we assume the CID is from the future
5047  * command.
5048  */
5049  if (tuplecid_data == NULL)
5050  return false;
5051 
5052  /* be careful about padding */
5053  memset(&key, 0, sizeof(key));
5054 
5055  Assert(!BufferIsLocal(buffer));
5056 
5057  /*
5058  * get relfilenode from the buffer, no convenient way to access it other
5059  * than that.
5060  */
5061  BufferGetTag(buffer, &key.relnode, &forkno, &blockno);
5062 
5063  /* tuples can only be in the main fork */
5064  Assert(forkno == MAIN_FORKNUM);
5065  Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5066 
5067  ItemPointerCopy(&htup->t_self,
5068  &key.tid);
5069 
      /* re-entered at most once, after UpdateLogicalMappings() has run */
5070 restart:
5071  ent = (ReorderBufferTupleCidEnt *)
5072  hash_search(tuplecid_data,
5073  (void *) &key,
5074  HASH_FIND,
5075  NULL);
5076 
5077  /*
5078  * failed to find a mapping, check whether the table was rewritten and
5079  * apply mapping if so, but only do that once - there can be no new
5080  * mappings while we are in here since we have to hold a lock on the
5081  * relation.
5082  */
5083  if (ent == NULL && !updated_mapping)
5084  {
5085  UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5086  /* now check but don't update for a mapping again */
5087  updated_mapping = true;
5088  goto restart;
5089  }
5090  else if (ent == NULL)
5091  return false;
5092 
      /* both output pointers are optional */
5093  if (cmin)
5094  *cmin = ent->cmin;
5095  if (cmax)
5096  *cmax = ent->cmax;
5097  return true;
5098 }
static void ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
XLogRecPtr first_lsn
bool ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
#define NIL
Definition: pg_list.h:65
uint32 CommandId
Definition: c.h:601
TimestampTz commit_time
struct ReorderBufferToastEnt ReorderBufferToastEnt
struct TXNEntryFile TXNEntryFile
void AbortCurrentTransaction(void)
Definition: xact.c:3210
ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]
#define SizeofHeapTupleHeader
Definition: htup_details.h:184
bool IsToastRelation(Relation relation)
Definition: catalog.c:145
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:862
void ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, Snapshot snapshot, XLogRecPtr lsn, bool transactional, const char *prefix, Size message_size, const char *message)
#define relpathperm(rnode, forknum)
Definition: relpath.h:83
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
Snapshot base_snapshot
ReorderBufferApplyChangeCB apply_change
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:218
#define AllocSetContextCreate
Definition: memutils.h:173
HeapTupleData * HeapTuple
Definition: htup.h:71
#define rbtxn_prepared(txn)
void * private_data
dlist_node base_snapshot_node
#define DEBUG1
Definition: elog.h:25
dlist_node * cur
Definition: ilist.h:180
static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change)
RepOriginId origin_id
void StartupReorderBuffer(void)
#define VARDATA(PTR)
Definition: postgres.h:315
static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
#define fastgetattr(tup, attnum, tupleDesc, isnull)
Definition: htup_details.h:711
static void pgstat_report_wait_end(void)
Definition: wait_event.h:277
static int32 next
Definition: blutils.c:219
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1484
#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)
Definition: postgres.h:391
#define RBTXN_HAS_TOAST_INSERT
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
int wal_segment_size
Definition: xlog.c:121
void ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:543
uint32 TransactionId
Definition: c.h:587
void ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, TXNEntryFile *file, XLogSegNo *segno)
bool copied
Definition: snapshot.h:185
bool SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
Definition: snapbuild.c:385
void ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn, TimestampTz commit_time, RepOriginId origin_id, XLogRecPtr origin_lsn)
#define RBTXN_IS_STREAMED
MemoryContext hcxt
Definition: hsearch.h:86
#define DatumGetInt32(X)
Definition: postgres.h:516
int sqlerrcode
Definition: elog.h:381
#define RelationGetDescr(relation)
Definition: rel.h:483
static ReorderBufferTXN * ReorderBufferLargestTXN(ReorderBuffer *rb)
static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, ReorderBufferTXN *subtxn)
#define DEBUG3
Definition: elog.h:23
SnapBuildState SnapBuildCurrentState(SnapBuild *builder)
Definition: snapbuild.c:367
#define write(a, b, c)
Definition: win32.h:14
#define VARHDRSZ_SHORT
Definition: postgres.h:292
TransactionId by_txn_last_xid
#define VARSIZE(PTR)
Definition: postgres.h:316
ErrorData * CopyErrorData(void)
Definition: elog.c:1560
#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr)
Definition: detoast.h:22
int64 TimestampTz
Definition: timestamp.h:39
#define PointerGetDatum(X)
Definition: postgres.h:600
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
ReorderBufferTXN * txn
static bool TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
#define VARHDRSZ
Definition: c.h:627
#define dlist_foreach(iter, lhead)
Definition: ilist.h:526
ReorderBufferStreamAbortCB stream_abort
XLogRecPtr current_restart_decoding_lsn
#define DatumGetObjectId(X)
Definition: postgres.h:544
char * pstrdup(const char *in)
Definition: mcxt.c:1299
#define rbtxn_has_incomplete_tuple(txn)
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2715
static ReorderBufferTXN * ReorderBufferGetTXN(ReorderBuffer *rb)
static void ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, Snapshot snapshot_now, CommandId command_id, XLogRecPtr last_lsn, ReorderBufferChange *specinsert)
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition: ilist.h:317
struct ReorderBufferTXN * txn
Definition: reorderbuffer.h:87
Oid RelidByRelfilenode(Oid reltablespace, Oid relfilenode)
void ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
void ReorderBufferFree(ReorderBuffer *rb)
#define rbtxn_is_serialized_clear(txn)
uint16 RepOriginId
Definition: xlogdefs.h:65
Size entrysize
Definition: hsearch.h:76
CommandId command_id
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:350
Snapshot snapshot_now
struct cursor * cur
Definition: ecpg.c:28
char fname[MAXPGPATH]
int32 va_rawsize
Definition: postgres.h:71
bool IsTransactionOrTransactionBlock(void)
Definition: xact.c:4701
#define INFO
Definition: elog.h:33
MemoryContext SlabContextCreate(MemoryContext parent, const char *name, Size blockSize, Size chunkSize)
Definition: slab.c:175
void binaryheap_replace_first(binaryheap *heap, Datum d)
Definition: binaryheap.c:204
void ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, RelFileNode node, ItemPointerData tid, CommandId cmin, CommandId cmax, CommandId combocid)
static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
uint32 BlockNumber
Definition: block.h:31
void TeardownHistoricSnapshot(bool is_error)
Definition: snapmgr.c:2051
ReorderBufferCommitCB commit
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: heaptuple.c:1020
ReorderBufferChange * ReorderBufferGetChange(ReorderBuffer *rb)
#define RelationIsLogicallyLogged(relation)
Definition: rel.h:643
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:954
void ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn, XLogRecPtr initial_consistent_point, TimestampTz commit_time, RepOriginId origin_id, XLogRecPtr origin_lsn, char *gid, bool is_commit)
static int64 files
Definition: pg_checksums.c:34
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:125
ReplicationSlotPersistentData data
Definition: slot.h:156
struct ReorderBufferTupleCidKey ReorderBufferTupleCidKey
struct SnapshotData * Snapshot
Definition: snapshot.h:121
Form_pg_class rd_rel
Definition: rel.h:110
unsigned int Oid
Definition: postgres_ext.h:31
XLogRecPtr base_snapshot_lsn
ReorderBufferStreamCommitCB stream_commit
Definition: dirent.h:9
uint32 regd_count
Definition: snapshot.h:205
enum ReorderBufferChangeType action
Definition: reorderbuffer.h:84
void binaryheap_add_unordered(binaryheap *heap, Datum d)
Definition: binaryheap.c:110
MemoryContext change_context
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define VARDATA_EXTERNAL(PTR)
Definition: postgres.h:323
#define PG_BINARY
Definition: c.h:1271
void FlushErrorState(void)
Definition: elog.c:1654
XLogRecPtr origin_lsn
static void ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
signed int int32
Definition: c.h:429
#define FirstCommandId
Definition: c.h:603
XLogRecPtr EndRecPtr
Definition: xlogreader.h:179
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
void ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
#define RBTXN_SKIPPED_PREPARE
HeapTupleHeader t_data
Definition: htup.h:68
#define VARATT_IS_EXTERNAL(PTR)
Definition: postgres.h:326
#define sprintf
Definition: port.h:218
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:174
#define rbtxn_is_streamed(txn)
static dlist_node * dlist_next_node(dlist_head *head, dlist_node *node)
Definition: ilist.h:440
Definition: dynahash.c:219
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
ReorderBufferStreamMessageCB stream_message
#define RBTXN_HAS_CATALOG_CHANGES
#define ReorderBufferTupleBufData(p)
Definition: reorderbuffer.h:38
#define dlist_container(type, membername, ptr)
Definition: ilist.h:496
void pfree(void *pointer)
Definition: mcxt.c:1169
char * Pointer
Definition: c.h:418
void FreeErrorData(ErrorData *edata)
Definition: elog.c:1616
static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
Definition: dirent.c:25
#define ERROR
Definition: elog.h:46
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2423
static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
#define rbtxn_has_toast_insert(txn)
#define SLAB_LARGE_BLOCK_SIZE
Definition: memutils.h:225
#define VARATT_IS_SHORT(PTR)
Definition: postgres.h:339
#define RelationIsValid(relation)
Definition: rel.h:430
dlist_head changes
void ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
dlist_head txns_by_base_snapshot_lsn
Datum binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:159
#define IsSpecInsert(action)
ReorderBufferStreamPrepareCB stream_prepare
#define MAXPGPATH
ItemPointerData t_self
Definition: htup.h:65
ReorderBufferTupleCidKey key
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:195
#define DEBUG2
Definition: elog.h:24
static bool ReorderBufferCanStream(ReorderBuffer *rb)
TransactionId GetCurrentTransactionId(void)
Definition: xact.c:438
void ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, SharedInvalidationMessage *invalidations)
uint32 t_len
Definition: htup.h:64
#define MaxHeapTupleSize
Definition: htup_details.h:559
void ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
struct varlena * reconstructed
void RollbackAndReleaseCurrentSubTransaction(void)
Definition: xact.c:4512
#define SET_VARTAG_EXTERNAL(PTR, tag)
Definition: postgres.h:346
uint64 XLogSegNo
Definition: xlogdefs.h:48
int errcode_for_file_access(void)
Definition: elog.c:721
HeapTupleData tuple
Definition: reorderbuffer.h:29
struct SnapshotData SnapshotData
TransactionId GetCurrentTransactionIdIfAny(void)
Definition: xact.c:455
static ReorderBufferTXN * ReorderBufferLargestTopTXN(ReorderBuffer *rb)
static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, XLogSegNo segno)
#define InvalidTransactionId
Definition: transam.h:31
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:349
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:203
bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
unsigned int uint32
Definition: c.h:441
XLogRecPtr final_lsn
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2634
#define RBTXN_IS_SUBXACT
Oid t_tableOid
Definition: htup.h:66
static int file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
ReorderBufferBeginCB begin_prepare
struct ReorderBufferTXN * toptxn
void RelationClose(Relation relation)
Definition: relcache.c:2096
TransactionId xmin
Definition: snapshot.h:157
MemoryContext CurrentMemoryContext
Definition: mcxt.c:42
static void dlist_delete(dlist_node *node)
Definition: ilist.h:358
struct ReorderBufferChange::@97::@102 inval
#define INDIRECT_POINTER_SIZE
Definition: detoast.h:34
ReorderBufferMessageCB message
ReorderBufferStreamChangeCB stream_change
#define AssertArg(condition)
Definition: c.h:806
int bh_size
Definition: binaryheap.h:32
union ReorderBufferChange::@97 data
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:300
TransactionId * xip
Definition: snapshot.h:168
static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, ReorderBufferTXN *txn, CommandId cid)
#define VARSIZE_SHORT(PTR)
Definition: postgres.h:318
ForkNumber
Definition: relpath.h:40
RepOriginId origin_id
Definition: reorderbuffer.h:89
ReorderBufferStreamTruncateCB stream_truncate
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event