PostgreSQL Source Code  git master
reorderbuffer.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * reorderbuffer.c
4  * PostgreSQL logical replay/reorder buffer management
5  *
6  *
7  * Copyright (c) 2012-2021, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/reorderbuffer.c
12  *
13  * NOTES
14  * This module gets handed individual pieces of transactions in the order
15  * they are written to the WAL and is responsible for reassembling them into
16  * toplevel transaction sized pieces. When a transaction is completely
17  * reassembled - signaled by reading the transaction commit record - it
18  * will then call the output plugin (cf. ReorderBufferCommit()) with the
19  * individual changes. The output plugins rely on snapshots built by
20  * snapbuild.c which hands them to us.
21  *
22  * Transactions and subtransactions/savepoints in postgres are not
23  * immediately linked to each other from outside the performing
24  * backend. Only at commit/abort (or special xact_assignment records) they
25  * are linked together. Which means that we will have to splice together a
26  * toplevel transaction from its subtransactions. To do that efficiently we
27  * build a binary heap indexed by the smallest current lsn of the individual
28  * subtransactions' changestreams. As the individual streams are inherently
29  * ordered by LSN - since that is where we build them from - the transaction
30  * can easily be reassembled by always using the subtransaction with the
31  * smallest current LSN from the heap.
32  *
33  * In order to cope with large transactions - which can be several times as
34  * big as the available memory - this module supports spooling the contents
35  * of a large transaction to disk. When the transaction is replayed the
36  * contents of individual (sub-)transactions will be read from disk in
37  * chunks.
38  *
39  * This module also has to deal with reassembling toast records from the
40  * individual chunks stored in WAL. When a new (or initial) version of a
41  * tuple is stored in WAL it will always be preceded by the toast chunks
42  * emitted for the columns stored out of line. Within a single toplevel
43  * transaction there will be no other data carrying records between a row's
44  * toast chunks and the row data itself. See ReorderBufferToast* for
45  * details.
46  *
47  * ReorderBuffer uses two special memory context types - SlabContext for
48  * allocations of fixed-length structures (changes and transactions), and
49  * GenerationContext for the variable-length transaction data (allocated
50  * and freed in groups with similar lifespans).
51  *
52  * To limit the amount of memory used by decoded changes, we track memory
53  * used at the reorder buffer level (i.e. total amount of memory), and for
54  * each transaction. When the total amount of used memory exceeds the
55  * limit, the transaction consuming the most memory is then serialized to
56  * disk.
57  *
58  * Only decoded changes are evicted from memory (spilled to disk), not the
59  * transaction records. The number of toplevel transactions is limited,
60  * but a transaction with many subtransactions may still consume significant
61  * amounts of memory. However, the transaction records are fairly small and
62  * are not included in the memory limit.
63  *
64  * The current eviction algorithm is very simple - the transaction is
65  * picked merely by size, while it might be useful to also consider age
66  * (LSN) of the changes for example. With the new Generational memory
67  * allocator, evicting the oldest changes would make it more likely the
68  * memory gets actually freed.
69  *
70  * We still rely on max_changes_in_memory when loading serialized changes
71  * back into memory. At that point we can't use the memory limit directly
72  * as we load the subxacts independently. One option to deal with this
73  * would be to count the subxacts, and allow each to allocate 1/N of the
74  * memory limit. That however does not seem very appealing, because with
75  * many subtransactions it may easily cause thrashing (short cycles of
76  * deserializing and applying very few changes). We probably should give
77  * a bit more memory to the oldest subtransactions, because it's likely
78  * they are the source for the next sequence of changes.
79  *
80  * -------------------------------------------------------------------------
81  */
82 #include "postgres.h"
83 
84 #include <unistd.h>
85 #include <sys/stat.h>
86 
87 #include "access/detoast.h"
88 #include "access/heapam.h"
89 #include "access/rewriteheap.h"
90 #include "access/transam.h"
91 #include "access/xact.h"
92 #include "access/xlog_internal.h"
93 #include "catalog/catalog.h"
94 #include "lib/binaryheap.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "replication/logical.h"
99 #include "replication/slot.h"
100 #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
101 #include "storage/bufmgr.h"
102 #include "storage/fd.h"
103 #include "storage/sinval.h"
104 #include "utils/builtins.h"
105 #include "utils/combocid.h"
106 #include "utils/memdebug.h"
107 #include "utils/memutils.h"
108 #include "utils/rel.h"
109 #include "utils/relfilenodemap.h"
110 
111 
112 /* entry for a hash table we use to map from xid to our transaction state */
114 {
118 
119 /* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */
121 {
125 
127 {
131  CommandId combocid; /* just for debugging */
133 
134 /* Virtual file descriptor with file offset tracking */
/*
 * The LSN-order change iterator initialises the vfd of each of its entries'
 * file to -1 before any file has been opened for that entry.
 */
135 typedef struct TXNEntryFile
136 {
137  File vfd; /* -1 when the file is closed */
138  off_t curOffset; /* offset for next write or read. Reset to 0
139  * when vfd is opened. */
140 } TXNEntryFile;
141 
142 /* k-way in-order change iteration support structures */
144 {
151 
153 {
159 
160 /* toast datastructures */
161 typedef struct ReorderBufferToastEnt
162 {
163  Oid chunk_id; /* toast_table.chunk_id */
164  int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
165  * have seen */
166  Size num_chunks; /* number of chunks we've already seen */
167  Size size; /* combined size of chunks seen */
168  dlist_head chunks; /* linked list of chunks */
169  struct varlena *reconstructed; /* reconstructed varlena now pointed to in
170  * main tup */
172 
173 /* Disk serialization support datastructures */
175 {
178  /* data follows */
180 
/* True when 'action' is the internal speculative-insert change type. */
181 #define IsSpecInsert(action) \
182 ( \
183  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
184 )
/* True when 'action' is the internal speculative-confirm change type. */
185 #define IsSpecConfirm(action) \
186 ( \
187  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) \
188 )
/*
 * True when 'action' is an INSERT, an UPDATE, or an internal speculative
 * insert. Note that a speculative insert therefore satisfies both this macro
 * and IsSpecInsert().
 */
189 #define IsInsertOrUpdate(action) \
190 ( \
191  (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
192  ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
193  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
194 )
195 
196 /*
197  * Maximum number of changes kept in memory, per transaction. After that,
198  * changes are spooled to disk.
199  *
200  * The current value should be sufficient to decode the entire transaction
201  * without hitting disk in OLTP workloads, while starting to spool to disk in
202  * other workloads reasonably fast.
203  *
204  * At some point in the future it probably makes sense to have a more elaborate
205  * resource management here, but it's not entirely clear what that would look
206  * like.
207  */
/*
 * NB: per the notes in the file header, this count is still relied upon when
 * loading serialized changes back into memory; what gets spilled to disk is
 * decided from the tracked memory usage instead.
 */
209 static const Size max_changes_in_memory = 4096; /* XXX for restore only */
210 
211 /* ---------------------------------------
212  * primary reorderbuffer support routines
213  * ---------------------------------------
214  */
218  TransactionId xid, bool create, bool *is_new,
219  XLogRecPtr lsn, bool create_as_top);
221  ReorderBufferTXN *subtxn);
222 
223 static void AssertTXNLsnOrder(ReorderBuffer *rb);
224 
225 /* ---------------------------------------
226  * support functions for lsn-order iterating over the ->changes of a
227  * transaction and its subtransactions
228  *
229  * used for iteration over the k-way heap merge of a transaction and its
230  * subtransactions
231  * ---------------------------------------
232  */
234  ReorderBufferIterTXNState *volatile *iter_state);
239 
240 /*
241  * ---------------------------------------
242  * Disk serialization support functions
243  * ---------------------------------------
244  */
248  int fd, ReorderBufferChange *change);
250  TXNEntryFile *file, XLogSegNo *segno);
252  char *change);
255  bool txn_prepared);
256 static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
257 static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
258  TransactionId xid, XLogSegNo segno);
259 
260 static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
263 
264 /*
265  * ---------------------------------------
266  * Streaming support functions
267  * ---------------------------------------
268  */
269 static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
270 static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
273 
274 /* ---------------------------------------
275  * toast reassembly support
276  * ---------------------------------------
277  */
281  Relation relation, ReorderBufferChange *change);
283  Relation relation, ReorderBufferChange *change);
284 
285 /*
286  * ---------------------------------------
287  * memory accounting
288  * ---------------------------------------
289  */
292  ReorderBufferChange *change, bool addition);
293 
294 /*
295  * Allocate a new ReorderBuffer and clean out any old serialized state from
296  * prior ReorderBuffer instances for the same slot.
297  */
300 {
301  ReorderBuffer *buffer;
302  HASHCTL hash_ctl;
303  MemoryContext new_ctx;
304 
305  Assert(MyReplicationSlot != NULL);
306 
307  /* allocate memory in own context, to have better accountability */
309  "ReorderBuffer",
311 
312  buffer =
313  (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
314 
315  memset(&hash_ctl, 0, sizeof(hash_ctl));
316 
317  buffer->context = new_ctx;
318 
319  buffer->change_context = SlabContextCreate(new_ctx,
320  "Change",
322  sizeof(ReorderBufferChange));
323 
324  buffer->txn_context = SlabContextCreate(new_ctx,
325  "TXN",
327  sizeof(ReorderBufferTXN));
328 
329  buffer->tup_context = GenerationContextCreate(new_ctx,
330  "Tuples",
332 
333  hash_ctl.keysize = sizeof(TransactionId);
334  hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
335  hash_ctl.hcxt = buffer->context;
336 
337  buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
339 
341  buffer->by_txn_last_txn = NULL;
342 
343  buffer->outbuf = NULL;
344  buffer->outbufsize = 0;
345  buffer->size = 0;
346 
347  buffer->spillTxns = 0;
348  buffer->spillCount = 0;
349  buffer->spillBytes = 0;
350  buffer->streamTxns = 0;
351  buffer->streamCount = 0;
352  buffer->streamBytes = 0;
353 
355 
356  dlist_init(&buffer->toplevel_by_lsn);
358 
359  /*
360  * Ensure there's no stale data from prior uses of this slot, in case some
361  * prior exit avoided calling ReorderBufferFree. Failure to do this can
362  * produce duplicated txns, and it's very cheap if there's nothing there.
363  */
365 
366  return buffer;
367 }
368 
369 /*
370  * Free a ReorderBuffer
371  */
372 void
374 {
375  MemoryContext context = rb->context;
376 
377  /*
378  * We free separately allocated data by entirely scrapping reorderbuffer's
379  * memory context.
380  */
381  MemoryContextDelete(context);
382 
383  /* Free disk space used by unconsumed reorder buffers */
385 }
386 
387 /*
388  * Get an unused, possibly preallocated, ReorderBufferTXN.
389  */
390 static ReorderBufferTXN *
392 {
394 
395  txn = (ReorderBufferTXN *)
397 
398  memset(txn, 0, sizeof(ReorderBufferTXN));
399 
400  dlist_init(&txn->changes);
401  dlist_init(&txn->tuplecids);
402  dlist_init(&txn->subtxns);
403 
404  /* InvalidCommandId is not zero, so set it explicitly */
406  txn->output_plugin_private = NULL;
407 
408  return txn;
409 }
410 
411 /*
412  * Free a ReorderBufferTXN.
413  */
414 static void
416 {
417  /* clean the lookup cache if we were cached (quite likely) */
418  if (rb->by_txn_last_xid == txn->xid)
419  {
421  rb->by_txn_last_txn = NULL;
422  }
423 
424  /* free data that's contained */
425 
426  if (txn->gid != NULL)
427  {
428  pfree(txn->gid);
429  txn->gid = NULL;
430  }
431 
432  if (txn->tuplecid_hash != NULL)
433  {
435  txn->tuplecid_hash = NULL;
436  }
437 
438  if (txn->invalidations)
439  {
440  pfree(txn->invalidations);
441  txn->invalidations = NULL;
442  }
443 
444  pfree(txn);
445 }
446 
447 /*
448  * Get a fresh ReorderBufferChange.
449  */
452 {
453  ReorderBufferChange *change;
454 
455  change = (ReorderBufferChange *)
457 
458  memset(change, 0, sizeof(ReorderBufferChange));
459  return change;
460 }
461 
462 /*
463  * Free a ReorderBufferChange and update memory accounting, if requested.
464  */
465 void
467  bool upd_mem)
468 {
469  /* update memory accounting info */
470  if (upd_mem)
471  ReorderBufferChangeMemoryUpdate(rb, change, false);
472 
473  /* free contained data */
474  switch (change->action)
475  {
480  if (change->data.tp.newtuple)
481  {
482  ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple);
483  change->data.tp.newtuple = NULL;
484  }
485 
486  if (change->data.tp.oldtuple)
487  {
488  ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple);
489  change->data.tp.oldtuple = NULL;
490  }
491  break;
493  if (change->data.msg.prefix != NULL)
494  pfree(change->data.msg.prefix);
495  change->data.msg.prefix = NULL;
496  if (change->data.msg.message != NULL)
497  pfree(change->data.msg.message);
498  change->data.msg.message = NULL;
499  break;
501  if (change->data.inval.invalidations)
502  pfree(change->data.inval.invalidations);
503  change->data.inval.invalidations = NULL;
504  break;
506  if (change->data.snapshot)
507  {
508  ReorderBufferFreeSnap(rb, change->data.snapshot);
509  change->data.snapshot = NULL;
510  }
511  break;
512  /* no data in addition to the struct itself */
514  if (change->data.truncate.relids != NULL)
515  {
516  ReorderBufferReturnRelids(rb, change->data.truncate.relids);
517  change->data.truncate.relids = NULL;
518  }
519  break;
523  break;
524  }
525 
526  pfree(change);
527 }
528 
529 /*
530  * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size
531  * tuple_len (excluding header overhead).
532  */
535 {
536  ReorderBufferTupleBuf *tuple;
537  Size alloc_len;
538 
539  alloc_len = tuple_len + SizeofHeapTupleHeader;
540 
541  tuple = (ReorderBufferTupleBuf *)
543  sizeof(ReorderBufferTupleBuf) +
544  MAXIMUM_ALIGNOF + alloc_len);
545  tuple->alloc_tuple_size = alloc_len;
546  tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
547 
548  return tuple;
549 }
550 
551 /*
552  * Free a ReorderBufferTupleBuf.
553  */
554 void
556 {
557  pfree(tuple);
558 }
559 
560 /*
561  * Get an array for relids of truncated relations.
562  *
563  * We use the global memory context (for the whole reorder buffer), because
564  * none of the existing ones seems like a good match (some are SLAB, so we
565  * can't use those, and tup_context is meant for tuple data, not relids). We
566  * could add yet another context, but it seems like overkill - TRUNCATE is
567  * not a particularly common operation, so it does not seem worth it.
568  */
569 Oid *
571 {
572  Oid *relids;
573  Size alloc_len;
574 
575  alloc_len = sizeof(Oid) * nrelids;
576 
577  relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
578 
579  return relids;
580 }
581 
582 /*
583  * Free an array of relids.
584  */
585 void
587 {
588  pfree(relids);
589 }
590 
591 /*
592  * Return the ReorderBufferTXN from the given buffer, specified by Xid.
593  * If create is true, and a transaction doesn't already exist, create it
594  * (with the given LSN, and as top transaction if that's specified);
595  * when this happens, is_new is set to true.
596  */
597 static ReorderBufferTXN *
599  bool *is_new, XLogRecPtr lsn, bool create_as_top)
600 {
603  bool found;
604 
606 
607  /*
608  * Check the one-entry lookup cache first
609  */
611  rb->by_txn_last_xid == xid)
612  {
613  txn = rb->by_txn_last_txn;
614 
615  if (txn != NULL)
616  {
617  /* found it, and it's valid */
618  if (is_new)
619  *is_new = false;
620  return txn;
621  }
622 
623  /*
624  * cached as non-existent, and asked not to create? Then nothing else
625  * to do.
626  */
627  if (!create)
628  return NULL;
629  /* otherwise fall through to create it */
630  }
631 
632  /*
633  * The cache wasn't hit, or it yielded a "does-not-exist" and we want
634  * to create an entry: consult the hash table.
635  */
636 
637  /* search the lookup table */
638  ent = (ReorderBufferTXNByIdEnt *)
639  hash_search(rb->by_txn,
640  (void *) &xid,
641  create ? HASH_ENTER : HASH_FIND,
642  &found);
643  if (found)
644  txn = ent->txn;
645  else if (create)
646  {
647  /* initialize the new entry, if creation was requested */
648  Assert(ent != NULL);
649  Assert(lsn != InvalidXLogRecPtr);
650 
651  ent->txn = ReorderBufferGetTXN(rb);
652  ent->txn->xid = xid;
653  txn = ent->txn;
654  txn->first_lsn = lsn;
656 
657  if (create_as_top)
658  {
659  dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
660  AssertTXNLsnOrder(rb);
661  }
662  }
663  else
664  txn = NULL; /* not found and not asked to create */
665 
666  /* update cache */
667  rb->by_txn_last_xid = xid;
668  rb->by_txn_last_txn = txn;
669 
670  if (is_new)
671  *is_new = !found;
672 
673  Assert(!create || txn != NULL);
674  return txn;
675 }
676 
677 /*
678  * Record the partial change for the streaming of in-progress transactions. We
679  * can stream only complete changes so if we have a partial change like toast
680  * table insert or speculative insert then we mark such a 'txn' so that it
681  * can't be streamed. We also ensure that if the changes in such a 'txn' are
682  * above logical_decoding_work_mem threshold then we stream them as soon as we
683  * have a complete change.
684  */
685 static void
687  ReorderBufferChange *change,
688  bool toast_insert)
689 {
690  ReorderBufferTXN *toptxn;
691 
692  /*
693  * The partial changes need to be processed only while streaming
694  * in-progress transactions.
695  */
696  if (!ReorderBufferCanStream(rb))
697  return;
698 
699  /* Get the top transaction. */
700  if (txn->toptxn != NULL)
701  toptxn = txn->toptxn;
702  else
703  toptxn = txn;
704 
705  /*
706  * Set the toast insert bit whenever we get toast insert to indicate a
707  * partial change and clear it when we get the insert or update on main
708  * table (Both update and insert will do the insert in the toast table).
709  */
710  if (toast_insert)
712  else if (rbtxn_has_toast_insert(toptxn) &&
713  IsInsertOrUpdate(change->action))
714  toptxn->txn_flags &= ~RBTXN_HAS_TOAST_INSERT;
715 
716  /*
717  * Set the spec insert bit whenever we get the speculative insert to
718  * indicate the partial change and clear the same on speculative confirm.
719  */
720  if (IsSpecInsert(change->action))
721  toptxn->txn_flags |= RBTXN_HAS_SPEC_INSERT;
722  else if (IsSpecConfirm(change->action))
723  {
724  /*
725  * Speculative confirm change must be preceded by speculative
726  * insertion.
727  */
728  Assert(rbtxn_has_spec_insert(toptxn));
729  toptxn->txn_flags &= ~RBTXN_HAS_SPEC_INSERT;
730  }
731 
732  /*
733  * Stream the transaction if it is serialized before and the changes are
734  * now complete in the top-level transaction.
735  *
736  * The reason for doing the streaming of such a transaction as soon as we
737  * get the complete change for it is that previously it would have reached
738  * the memory threshold and wouldn't get streamed because of incomplete
739  * changes. Delaying such transactions would increase apply lag for them.
740  */
742  !(rbtxn_has_incomplete_tuple(toptxn)) &&
743  rbtxn_is_serialized(txn))
744  ReorderBufferStreamTXN(rb, toptxn);
745 }
746 
747 /*
748  * Queue a change into a transaction so it can be replayed upon commit or will be
749  * streamed when we reach logical_decoding_work_mem threshold.
750  */
751 void
753  ReorderBufferChange *change, bool toast_insert)
754 {
756 
757  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
758 
759  /*
760  * While streaming the previous changes we have detected that the
761  * transaction is aborted. So there is no point in collecting further
762  * changes for it.
763  */
764  if (txn->concurrent_abort)
765  {
766  /*
767  * We don't need to update memory accounting for this change as we
768  * have not added it to the queue yet.
769  */
770  ReorderBufferReturnChange(rb, change, false);
771  return;
772  }
773 
774  change->lsn = lsn;
775  change->txn = txn;
776 
777  Assert(InvalidXLogRecPtr != lsn);
778  dlist_push_tail(&txn->changes, &change->node);
779  txn->nentries++;
780  txn->nentries_mem++;
781 
782  /* update memory accounting information */
783  ReorderBufferChangeMemoryUpdate(rb, change, true);
784 
785  /* process partial change */
786  ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
787 
788  /* check the memory limits and evict something if needed */
790 }
791 
792 /*
793  * A transactional message is queued to be processed upon commit and a
794  * non-transactional message gets processed immediately.
795  */
796 void
798  Snapshot snapshot, XLogRecPtr lsn,
799  bool transactional, const char *prefix,
800  Size message_size, const char *message)
801 {
802  if (transactional)
803  {
804  MemoryContext oldcontext;
805  ReorderBufferChange *change;
806 
808 
809  oldcontext = MemoryContextSwitchTo(rb->context);
810 
811  change = ReorderBufferGetChange(rb);
813  change->data.msg.prefix = pstrdup(prefix);
814  change->data.msg.message_size = message_size;
815  change->data.msg.message = palloc(message_size);
816  memcpy(change->data.msg.message, message, message_size);
817 
818  ReorderBufferQueueChange(rb, xid, lsn, change, false);
819 
820  MemoryContextSwitchTo(oldcontext);
821  }
822  else
823  {
824  ReorderBufferTXN *txn = NULL;
825  volatile Snapshot snapshot_now = snapshot;
826 
827  if (xid != InvalidTransactionId)
828  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
829 
830  /* setup snapshot to allow catalog access */
831  SetupHistoricSnapshot(snapshot_now, NULL);
832  PG_TRY();
833  {
834  rb->message(rb, txn, lsn, false, prefix, message_size, message);
835 
837  }
838  PG_CATCH();
839  {
841  PG_RE_THROW();
842  }
843  PG_END_TRY();
844  }
845 }
846 
847 /*
848  * AssertTXNLsnOrder
849  * Verify LSN ordering of transaction lists in the reorderbuffer
850  *
851  * Other LSN-related invariants are checked too.
852  *
853  * No-op if assertions are not in use.
854  */
855 static void
857 {
858 #ifdef USE_ASSERT_CHECKING
859  dlist_iter iter;
860  XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
861  XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
862 
863  dlist_foreach(iter, &rb->toplevel_by_lsn)
864  {
866  iter.cur);
867 
868  /* start LSN must be set */
869  Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
870 
871  /* If there is an end LSN, it must not be lower than the start LSN */
872  if (cur_txn->end_lsn != InvalidXLogRecPtr)
873  Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
874 
875  /* Current initial LSN must be strictly higher than previous */
876  if (prev_first_lsn != InvalidXLogRecPtr)
877  Assert(prev_first_lsn < cur_txn->first_lsn);
878 
879  /* known-as-subtxn txns must not be listed */
880  Assert(!rbtxn_is_known_subxact(cur_txn));
881 
882  prev_first_lsn = cur_txn->first_lsn;
883  }
884 
886  {
888  base_snapshot_node,
889  iter.cur);
890 
891  /* base snapshot (and its LSN) must be set */
892  Assert(cur_txn->base_snapshot != NULL);
894 
895  /* current LSN must be strictly higher than previous */
896  if (prev_base_snap_lsn != InvalidXLogRecPtr)
897  Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
898 
899  /* known-as-subtxn txns must not be listed */
900  Assert(!rbtxn_is_known_subxact(cur_txn));
901 
902  prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
903  }
904 #endif
905 }
906 
907 /*
908  * AssertChangeLsnOrder
909  *
910  * Check ordering of changes in the (sub)transaction.
911  */
912 static void
914 {
915 #ifdef USE_ASSERT_CHECKING
916  dlist_iter iter;
917  XLogRecPtr prev_lsn = txn->first_lsn;
918 
919  dlist_foreach(iter, &txn->changes)
920  {
921  ReorderBufferChange *cur_change;
922 
923  cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
924 
926  Assert(cur_change->lsn != InvalidXLogRecPtr);
927  Assert(txn->first_lsn <= cur_change->lsn);
928 
929  if (txn->end_lsn != InvalidXLogRecPtr)
930  Assert(cur_change->lsn <= txn->end_lsn);
931 
932  Assert(prev_lsn <= cur_change->lsn);
933 
934  prev_lsn = cur_change->lsn;
935  }
936 #endif
937 }
938 
939 /*
940  * ReorderBufferGetOldestTXN
941  * Return oldest transaction in reorderbuffer
942  */
945 {
947 
948  AssertTXNLsnOrder(rb);
949 
951  return NULL;
952 
954 
957  return txn;
958 }
959 
960 /*
961  * ReorderBufferGetOldestXmin
962  * Return oldest Xmin in reorderbuffer
963  *
964  * Returns oldest possibly running Xid from the point of view of snapshots
965  * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
966  * there are none.
967  *
968  * Since snapshots are assigned monotonically, this equals the Xmin of the
969  * base snapshot with minimal base_snapshot_lsn.
970  */
973 {
975 
976  AssertTXNLsnOrder(rb);
977 
979  return InvalidTransactionId;
980 
981  txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
983  return txn->base_snapshot->xmin;
984 }
985 
986 void
988 {
990 }
991 
992 /*
993  * ReorderBufferAssignChild
994  *
995  * Make note that we know that subxid is a subtransaction of xid, seen as of
996  * the given lsn.
997  */
998 void
1000  TransactionId subxid, XLogRecPtr lsn)
1001 {
1003  ReorderBufferTXN *subtxn;
1004  bool new_top;
1005  bool new_sub;
1006 
1007  txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1008  subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1009 
1010  if (!new_sub)
1011  {
1012  if (rbtxn_is_known_subxact(subtxn))
1013  {
1014  /* already associated, nothing to do */
1015  return;
1016  }
1017  else
1018  {
1019  /*
1020  * We already saw this transaction, but initially added it to the
1021  * list of top-level txns. Now that we know it's not top-level,
1022  * remove it from there.
1023  */
1024  dlist_delete(&subtxn->node);
1025  }
1026  }
1027 
1028  subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1029  subtxn->toplevel_xid = xid;
1030  Assert(subtxn->nsubtxns == 0);
1031 
1032  /* set the reference to top-level transaction */
1033  subtxn->toptxn = txn;
1034 
1035  /* add to subtransaction list */
1036  dlist_push_tail(&txn->subtxns, &subtxn->node);
1037  txn->nsubtxns++;
1038 
1039  /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1040  ReorderBufferTransferSnapToParent(txn, subtxn);
1041 
1042  /* Verify LSN-ordering invariant */
1043  AssertTXNLsnOrder(rb);
1044 }
1045 
1046 /*
1047  * ReorderBufferTransferSnapToParent
1048  * Transfer base snapshot from subtxn to top-level txn, if needed
1049  *
1050  * This is done if the top-level txn doesn't have a base snapshot, or if the
1051  * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1052  * snapshot's LSN. This can happen if there are no changes in the toplevel
1053  * txn but there are some in the subtxn, or the first change in subtxn has
1054  * earlier LSN than first change in the top-level txn and we learned about
1055  * their kinship only now.
1056  *
1057  * The subtransaction's snapshot is cleared regardless of the transfer
1058  * happening, since it's not needed anymore in either case.
1059  *
1060  * We do this as soon as we become aware of their kinship, to avoid queueing
1061  * extra snapshots to txns known-as-subtxns -- only top-level txns will
1062  * receive further snapshots.
1063  */
1064 static void
1066  ReorderBufferTXN *subtxn)
1067 {
1068  Assert(subtxn->toplevel_xid == txn->xid);
1069 
1070  if (subtxn->base_snapshot != NULL)
1071  {
1072  if (txn->base_snapshot == NULL ||
1073  subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1074  {
1075  /*
1076  * If the toplevel transaction already has a base snapshot but
1077  * it's newer than the subxact's, purge it.
1078  */
1079  if (txn->base_snapshot != NULL)
1080  {
1083  }
1084 
1085  /*
1086  * The snapshot is now the top transaction's; transfer it, and
1087  * adjust the list position of the top transaction in the list by
1088  * moving it to where the subtransaction is.
1089  */
1090  txn->base_snapshot = subtxn->base_snapshot;
1091  txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1093  &txn->base_snapshot_node);
1094 
1095  /*
1096  * The subtransaction doesn't have a snapshot anymore (so it
1097  * mustn't be in the list.)
1098  */
1099  subtxn->base_snapshot = NULL;
1101  dlist_delete(&subtxn->base_snapshot_node);
1102  }
1103  else
1104  {
1105  /* Base snap of toplevel is fine, so subxact's is not needed */
1107  dlist_delete(&subtxn->base_snapshot_node);
1108  subtxn->base_snapshot = NULL;
1110  }
1111  }
1112 }
1113 
1114 /*
1115  * Associate a subtransaction with its toplevel transaction at commit
1116  * time. There may be no further changes added after this.
1117  */
1118 void
1120  TransactionId subxid, XLogRecPtr commit_lsn,
1121  XLogRecPtr end_lsn)
1122 {
1123  ReorderBufferTXN *subtxn;
1124 
1125  subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1126  InvalidXLogRecPtr, false);
1127 
1128  /*
1129  * No need to do anything if that subtxn didn't contain any changes
1130  */
1131  if (!subtxn)
1132  return;
1133 
1134  subtxn->final_lsn = commit_lsn;
1135  subtxn->end_lsn = end_lsn;
1136 
1137  /*
1138  * Assign this subxact as a child of the toplevel xact (no-op if already
1139  * done.)
1140  */
1141  ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1142 }
1143 
1144 
1145 /*
1146  * Support for efficiently iterating over a transaction's and its
1147  * subtransactions' changes.
1148  *
1149  * We do this by doing a k-way merge between transactions/subtransactions. For that
1150  * we model the current heads of the different transactions as a binary heap
1151  * so we easily know which (sub-)transaction has the change with the smallest
1152  * lsn next.
1153  *
1154  * We assume the changes in individual transactions are already sorted by LSN.
1155  */
1156 
1157 /*
1158  * Binary heap comparison function.
1159  */
1160 static int
1162 {
1164  XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1165  XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1166 
1167  if (pos_a < pos_b)
1168  return 1;
1169  else if (pos_a == pos_b)
1170  return 0;
1171  return -1;
1172 }
1173 
1174 /*
1175  * Allocate & initialize an iterator which iterates in lsn order over a
1176  * transaction and all its subtransactions.
1177  *
1178  * Note: The iterator state is returned through iter_state parameter rather
1179  * than the function's return value. This is because the state gets cleaned up
1180  * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1181  * back the state even if this function throws an exception.
      *
      * *iter_state is set to NULL on entry and is only assigned after the
      * per-entry file/segno fields and the heap have been set up, so the caller
      * sees either NULL or a state whose fields are initialized.
      *
      * NOTE(review): several listing lines are absent from this extract: 1184
      * (function name and leading parameters), 1188 (presumably the declaration
      * of "state"), 1220 (the allocator call), 1235 (the middle argument of
      * binaryheap_allocate) and 1268/1297 (see the notes in the body).
1182  */
1183 static void
1185  ReorderBufferIterTXNState *volatile *iter_state)
1186 {
1187  Size nr_txns = 0;
1189  dlist_iter cur_txn_i;
1190  int32 off;
1191 
1192  *iter_state = NULL;
1193 
1194  /* Check ordering of changes in the toplevel transaction. */
1195  AssertChangeLsnOrder(txn);
1196 
1197  /*
1198  * Calculate the size of our heap: one element for every transaction that
1199  * contains changes. (Besides the transactions already in the reorder
1200  * buffer, we count the one we were directly passed.)
1201  */
1202  if (txn->nentries > 0)
1203  nr_txns++;
1204 
1205  dlist_foreach(cur_txn_i, &txn->subtxns)
1206  {
1207  ReorderBufferTXN *cur_txn;
1208 
1209  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1210 
1211  /* Check ordering of changes in this subtransaction. */
1212  AssertChangeLsnOrder(cur_txn);
1213 
1214  if (cur_txn->nentries > 0)
1215  nr_txns++;
1216  }
1217 
1218  /* allocate iteration state */
      /* the size covers the state header plus one entry per counted txn */
1219  state = (ReorderBufferIterTXNState *)
1221  sizeof(ReorderBufferIterTXNState) +
1222  sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1223 
1224  state->nr_txns = nr_txns;
1225  dlist_init(&state->old_change);
1226 
      /* vfd == -1 marks "no file open"; ReorderBufferIterTXNFinish tests it */
1227  for (off = 0; off < state->nr_txns; off++)
1228  {
1229  state->entries[off].file.vfd = -1;
1230  state->entries[off].segno = 0;
1231  }
1232 
1233  /* allocate heap */
1234  state->heap = binaryheap_allocate(state->nr_txns,
1236  state);
1237 
1238  /* Now that the state fields are initialized, it is safe to return it. */
1239  *iter_state = state;
1240 
1241  /*
1242  * Now insert items into the binary heap, in an unordered fashion. (We
1243  * will run a heap assembly step at the end; this is more efficient.)
1244  */
1245 
1246  off = 0;
1247 
1248  /* add toplevel transaction if it contains changes */
1249  if (txn->nentries > 0)
1250  {
1251  ReorderBufferChange *cur_change;
1252 
1253  if (rbtxn_is_serialized(txn))
1254  {
1255  /* serialize remaining changes */
1256  ReorderBufferSerializeTXN(rb, txn);
1257  ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1258  &state->entries[off].segno);
1259  }
1260 
      /* the entry starts out at the first change on the txn's list */
1261  cur_change = dlist_head_element(ReorderBufferChange, node,
1262  &txn->changes);
1263 
1264  state->entries[off].lsn = cur_change->lsn;
1265  state->entries[off].change = cur_change;
1266  state->entries[off].txn = txn;
1267 
      /*
       * NOTE(review): listing line 1268 is absent here. Presumably it adds
       * "off" to the heap and advances it for the next entry -- confirm.
       */
1269  }
1270 
1271  /* add subtransactions if they contain changes */
1272  dlist_foreach(cur_txn_i, &txn->subtxns)
1273  {
1274  ReorderBufferTXN *cur_txn;
1275 
1276  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1277 
1278  if (cur_txn->nentries > 0)
1279  {
1280  ReorderBufferChange *cur_change;
1281 
1282  if (rbtxn_is_serialized(cur_txn))
1283  {
1284  /* serialize remaining changes */
1285  ReorderBufferSerializeTXN(rb, cur_txn);
1286  ReorderBufferRestoreChanges(rb, cur_txn,
1287  &state->entries[off].file,
1288  &state->entries[off].segno);
1289  }
1290  cur_change = dlist_head_element(ReorderBufferChange, node,
1291  &cur_txn->changes);
1292 
1293  state->entries[off].lsn = cur_change->lsn;
1294  state->entries[off].change = cur_change;
1295  state->entries[off].txn = cur_txn;
1296 
      /*
       * NOTE(review): listing line 1297 is absent here; presumably the
       * same heap insertion as for the toplevel transaction -- confirm.
       */
1298  }
1299  }
1300 
1301  /* assemble a valid binary heap */
1302  binaryheap_build(state->heap);
1303 }
1304 
1305 /*
1306  * Return the next change when iterating over a transaction and its
1307  * subtransactions.
1308  *
1309  * Returns NULL when no further changes exist.
      *
      * The change handed back is the current one of the entry at the top of the
      * heap. Before returning, that entry is either advanced to the following
      * in-memory change, refilled through ReorderBufferRestoreChanges() when the
      * txn's nentries differs from nentries_mem, or removed from the heap when
      * neither applies.
      *
      * A change parked on state->old_change is handed to
      * ReorderBufferReturnChange() at the start of the following call.
      *
      * NOTE(review): listing lines 1312 (function name and parameters), 1315
      * (presumably the declaration of "entry"), 1352, 1371 and 1382 are absent
      * from this extract; see the notes in the body.
1310  */
1311 static ReorderBufferChange *
1313 {
1314  ReorderBufferChange *change;
1316  int32 off;
1317 
1318  /* nothing there anymore */
1319  if (state->heap->bh_size == 0)
1320  return NULL;
1321 
1322  off = DatumGetInt32(binaryheap_first(state->heap));
1323  entry = &state->entries[off];
1324 
1325  /* free memory we might have "leaked" in the previous *Next call */
1326  if (!dlist_is_empty(&state->old_change))
1327  {
1328  change = dlist_container(ReorderBufferChange, node,
1329  dlist_pop_head_node(&state->old_change));
1330  ReorderBufferReturnChange(rb, change, true);
1331  Assert(dlist_is_empty(&state->old_change));
1332  }
1333 
1334  change = entry->change;
1335 
1336  /*
1337  * update heap with information about which transaction has the next
1338  * relevant change in LSN order
1339  */
1340 
1341  /* there are in-memory changes */
1342  if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1343  {
1344  dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1345  ReorderBufferChange *next_change =
1346  dlist_container(ReorderBufferChange, node, next);
1347 
1348  /* txn stays the same */
1349  state->entries[off].lsn = next_change->lsn;
1350  state->entries[off].change = next_change;
1351 
      /*
       * NOTE(review): listing line 1352 is absent here. Presumably it
       * re-sifts the heap's first element now that its lsn changed --
       * confirm.
       */
1353  return change;
1354  }
1355 
1356  /* try to load changes from disk */
1357  if (entry->txn->nentries != entry->txn->nentries_mem)
1358  {
1359  /*
1360  * Ugly: restoring changes will reuse *Change records, thus delete the
1361  * current one from the per-tx list and only free in the next call.
1362  */
1363  dlist_delete(&change->node);
1364  dlist_push_tail(&state->old_change, &change->node);
1365 
1366  if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1367  &state->entries[off].segno))
1368  {
1369  /* successfully restored changes from disk */
      /*
       * NOTE(review): listing line 1371 (the start of the initializer
       * expression) is absent; the surviving argument suggests the head
       * element of entry->txn->changes -- confirm.
       */
1370  ReorderBufferChange *next_change =
1372  &entry->txn->changes);
1373 
1374  elog(DEBUG2, "restored %u/%u changes from disk",
1375  (uint32) entry->txn->nentries_mem,
1376  (uint32) entry->txn->nentries);
1377 
1378  Assert(entry->txn->nentries_mem);
1379  /* txn stays the same */
1380  state->entries[off].lsn = next_change->lsn;
1381  state->entries[off].change = next_change;
      /* NOTE(review): listing line 1382 is absent here as well. */
1383 
1384  return change;
1385  }
1386  }
1387 
1388  /* ok, no changes there anymore, remove */
1389  binaryheap_remove_first(state->heap);
1390 
1391  return change;
1392 }
1393 
1394 /*
1395  * Deallocate the iterator
      *
      * Closes every per-entry file whose vfd is not -1, hands a change still
      * parked on state->old_change to ReorderBufferReturnChange(), then frees
      * the heap and the state itself.
      *
      * NOTE(review): listing lines 1398-1399 (function name and parameters) are
      * absent from this extract.
1396  */
1397 static void
1400 {
1401  int32 off;
1402 
1403  for (off = 0; off < state->nr_txns; off++)
1404  {
1405  if (state->entries[off].file.vfd != -1)
1406  FileClose(state->entries[off].file.vfd);
1407  }
1408 
1409  /* free memory we might have "leaked" in the last *Next call */
1410  if (!dlist_is_empty(&state->old_change))
1411  {
1412  ReorderBufferChange *change;
1413 
1414  change = dlist_container(ReorderBufferChange, node,
1415  dlist_pop_head_node(&state->old_change));
1416  ReorderBufferReturnChange(rb, change, true);
1417  Assert(dlist_is_empty(&state->old_change));
1418  }
1419 
1420  binaryheap_free(state->heap);
1421  pfree(state);
1422 }
1423 
1424 /*
1425  * Cleanup the contents of a transaction, usually after the transaction
1426  * committed or aborted.
      *
      * Order of work: recurse into the subtransactions, return the changes and
      * tuplecids, handle the snapshots, unlink the txn from its containing list,
      * remove it from rb->by_txn, remove spilled data if the txn is serialized,
      * and finally hand the txn to ReorderBufferReturnTXN().
      *
      * NOTE(review): listing lines 1429 (function name and parameters), 1477,
      * 1487-1488 and 1497 are absent from this extract; see the notes in the
      * body.
1427  */
1428 static void
1430 {
1431  bool found;
1432  dlist_mutable_iter iter;
1433 
1434  /* cleanup subtransactions & their changes */
1435  dlist_foreach_modify(iter, &txn->subtxns)
1436  {
1437  ReorderBufferTXN *subtxn;
1438 
1439  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1440 
1441  /*
1442  * Subtransactions are always associated to the toplevel TXN, even if
1443  * they originally were happening inside another subtxn, so we won't
1444  * ever recurse more than one level deep here.
1445  */
1446  Assert(rbtxn_is_known_subxact(subtxn));
1447  Assert(subtxn->nsubtxns == 0);
1448 
1449  ReorderBufferCleanupTXN(rb, subtxn);
1450  }
1451 
1452  /* cleanup changes in the txn */
      /*
       * Unlike ReorderBufferTruncateTXN, the changes are not unlinked from the
       * list one by one here; the txn itself is released at the end.
       */
1453  dlist_foreach_modify(iter, &txn->changes)
1454  {
1455  ReorderBufferChange *change;
1456 
1457  change = dlist_container(ReorderBufferChange, node, iter.cur);
1458 
1459  /* Check we're not mixing changes from different transactions. */
1460  Assert(change->txn == txn);
1461 
1462  ReorderBufferReturnChange(rb, change, true);
1463  }
1464 
1465  /*
1466  * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1467  * They are always stored in the toplevel transaction.
1468  */
1469  dlist_foreach_modify(iter, &txn->tuplecids)
1470  {
1471  ReorderBufferChange *change;
1472 
1473  change = dlist_container(ReorderBufferChange, node, iter.cur);
1474 
1475  /* Check we're not mixing changes from different transactions. */
1476  Assert(change->txn == txn);
      /* NOTE(review): listing line 1477 is absent here. */
1478 
1479  ReorderBufferReturnChange(rb, change, true);
1480  }
1481 
1482  /*
1483  * Cleanup the base snapshot, if set.
1484  */
1485  if (txn->base_snapshot != NULL)
1486  {
      /*
       * NOTE(review): listing lines 1487-1488 are absent, which leaves this
       * branch empty in the extract; the statements that release the base
       * snapshot are not visible here.
       */
1489  }
1490 
1491  /*
1492  * Cleanup the snapshot for the last streamed run.
1493  */
1494  if (txn->snapshot_now != NULL)
1495  {
1496  Assert(rbtxn_is_streamed(txn));
      /* NOTE(review): listing line 1497 (presumably the release) is absent. */
1498  }
1499 
1500  /*
1501  * Remove TXN from its containing list.
1502  *
1503  * Note: if txn is known as subxact, we are deleting the TXN from its
1504  * parent's list of known subxacts; this leaves the parent's nsubxacts
1505  * count too high, but we don't care. Otherwise, we are deleting the TXN
1506  * from the LSN-ordered list of toplevel TXNs.
1507  */
1508  dlist_delete(&txn->node);
1509 
1510  /* now remove reference from buffer */
1511  hash_search(rb->by_txn,
1512  (void *) &txn->xid,
1513  HASH_REMOVE,
1514  &found);
1515  Assert(found);
1516 
1517  /* remove entries spilled to disk */
1518  if (rbtxn_is_serialized(txn))
1519  ReorderBufferRestoreCleanup(rb, txn);
1520 
1521  /* deallocate */
1522  ReorderBufferReturnTXN(rb, txn);
1523 }
1524 
1525 /*
1526  * Discard changes from a transaction (and subtransactions), either after
1527  * streaming or decoding them at PREPARE. Keep the remaining info -
1528  * transactions, tuplecids, invalidations and snapshots.
1529  *
1530  * We additionally remove tuplecids after decoding the transaction at prepare
1531  * time as we only need to perform invalidation at rollback or commit prepared.
1532  *
1533  * 'txn_prepared' indicates that we have decoded the transaction at prepare
1534  * time.
      *
      * On return the txn's nentries and nentries_mem are both zero; the txn
      * itself stays registered (contrast ReorderBufferCleanupTXN).
      *
      * NOTE(review): listing lines 1537 (function name and parameters), 1608,
      * 1624 and 1639 are absent from this extract; see the notes in the body.
1535  */
1536 static void
1538 {
1539  dlist_mutable_iter iter;
1540 
1541  /* cleanup subtransactions & their changes */
1542  dlist_foreach_modify(iter, &txn->subtxns)
1543  {
1544  ReorderBufferTXN *subtxn;
1545 
1546  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1547 
1548  /*
1549  * Subtransactions are always associated to the toplevel TXN, even if
1550  * they originally were happening inside another subtxn, so we won't
1551  * ever recurse more than one level deep here.
1552  */
1553  Assert(rbtxn_is_known_subxact(subtxn));
1554  Assert(subtxn->nsubtxns == 0);
1555 
1556  ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1557  }
1558 
1559  /* cleanup changes in the txn */
1560  dlist_foreach_modify(iter, &txn->changes)
1561  {
1562  ReorderBufferChange *change;
1563 
1564  change = dlist_container(ReorderBufferChange, node, iter.cur);
1565 
1566  /* Check we're not mixing changes from different transactions. */
1567  Assert(change->txn == txn);
1568 
1569  /* remove the change from its containing list */
1570  dlist_delete(&change->node);
1571 
1572  ReorderBufferReturnChange(rb, change, true);
1573  }
1574 
1575  /*
1576  * Mark the transaction as streamed.
1577  *
1578  * The toplevel transaction, identified by (toptxn==NULL), is marked as
1579  * streamed always, even if it does not contain any changes (that is, when
1580  * all the changes are in subtransactions).
1581  *
1582  * For subtransactions, we only mark them as streamed when there are
1583  * changes in them.
1584  *
1585  * We do it this way because of aborts - we don't want to send aborts for
1586  * XIDs the downstream is not aware of. And of course, it always knows
1587  * about the toplevel xact (we send the XID in all messages), but we never
1588  * stream XIDs of empty subxacts.
      *
      * The flag is never set when txn_prepared is true.
1589  */
1590  if ((!txn_prepared) && ((!txn->toptxn) || (txn->nentries_mem != 0)))
1591  txn->txn_flags |= RBTXN_IS_STREAMED;
1592 
1593  if (txn_prepared)
1594  {
1595  /*
1596  * If this is a prepared txn, cleanup the tuplecids we stored for
1597  * decoding catalog snapshot access. They are always stored in the
1598  * toplevel transaction.
1599  */
1600  dlist_foreach_modify(iter, &txn->tuplecids)
1601  {
1602  ReorderBufferChange *change;
1603 
1604  change = dlist_container(ReorderBufferChange, node, iter.cur);
1605 
1606  /* Check we're not mixing changes from different transactions. */
1607  Assert(change->txn == txn);
      /* NOTE(review): listing line 1608 is absent here. */
1609 
1610  /* Remove the change from its containing list. */
1611  dlist_delete(&change->node);
1612 
1613  ReorderBufferReturnChange(rb, change, true);
1614  }
1615  }
1616 
1617  /*
1618  * Destroy the (relfilenode, ctid) hashtable, so that we don't leak any
1619  * memory. We could also keep the hash table and update it with new ctid
1620  * values, but this seems simpler and good enough for now.
1621  */
1622  if (txn->tuplecid_hash != NULL)
1623  {
      /*
       * NOTE(review): listing line 1624 (presumably the call that destroys
       * the hash table) is absent; only the pointer reset is visible.
       */
1625  txn->tuplecid_hash = NULL;
1626  }
1627 
1628  /* If this txn is serialized then clean the disk space. */
1629  if (rbtxn_is_serialized(txn))
1630  {
1631  ReorderBufferRestoreCleanup(rb, txn);
1632  txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1633 
1634  /*
1635  * We set this flag to indicate if the transaction is ever serialized.
1636  * We need this to accurately update the stats as otherwise the same
1637  * transaction can be counted as serialized multiple times.
1638  */
      /*
       * NOTE(review): listing line 1639, the statement that sets the flag
       * described above, is absent from this extract.
       */
1640  }
1641 
1642  /* also reset the number of entries in the transaction */
1643  txn->nentries_mem = 0;
1644  txn->nentries = 0;
1645 }
1646 
1647 /*
1648  * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by
1649  * HeapTupleSatisfiesHistoricMVCC.
      *
      * The table is stored in txn->tuplecid_hash and filled from the changes on
      * txn->tuplecids. When the same key shows up again, cmin must match (this
      * is only asserted) and cmax is overwritten with the later value.
      *
      * NOTE(review): many listing lines are absent from this extract: 1652
      * (function name and parameters), 1657 (the condition guarding the early
      * return), 1670 (the tail of the hash_create call), 1674-1675 (presumably
      * the declarations of "key" and "ent"), 1681, and 1692/1694 (parts of the
      * lookup call). Read the body with that in mind.
1650  */
1651 static void
1653 {
1654  dlist_iter iter;
1655  HASHCTL hash_ctl;
1656 
      /* NOTE(review): the condition for this return (line 1657) is absent. */
1658  return;
1659 
1660  hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1661  hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1662  hash_ctl.hcxt = rb->context;
1663 
1664  /*
1665  * create the hash with the exact number of to-be-stored tuplecids from
1666  * the start
1667  */
1668  txn->tuplecid_hash =
1669  hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1671 
1672  dlist_foreach(iter, &txn->tuplecids)
1673  {
1676  bool found;
1677  ReorderBufferChange *change;
1678 
1679  change = dlist_container(ReorderBufferChange, node, iter.cur);
1680 
1682 
1683  /* be careful about padding */
      /* the whole key is zeroed before the fields are filled in */
1684  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1685 
1686  key.relnode = change->data.tuplecid.node;
1687 
1688  ItemPointerCopy(&change->data.tuplecid.tid,
1689  &key.tid);
1690 
1691  ent = (ReorderBufferTupleCidEnt *)
1693  (void *) &key,
1695  &found);
1696  if (!found)
1697  {
      /* first sighting of this key: take all values from the change */
1698  ent->cmin = change->data.tuplecid.cmin;
1699  ent->cmax = change->data.tuplecid.cmax;
1700  ent->combocid = change->data.tuplecid.combocid;
1701  }
1702  else
1703  {
1704  /*
1705  * Maybe we already saw this tuple before in this transaction, but
1706  * if so it must have the same cmin.
1707  */
1708  Assert(ent->cmin == change->data.tuplecid.cmin);
1709 
1710  /*
1711  * cmax may be initially invalid, but once set it can only grow,
1712  * and never become invalid again.
1713  */
1714  Assert((ent->cmax == InvalidCommandId) ||
1715  ((change->data.tuplecid.cmax != InvalidCommandId) &&
1716  (change->data.tuplecid.cmax > ent->cmax)));
1717  ent->cmax = change->data.tuplecid.cmax;
1718  }
1719  }
1720 }
1721 
1722 /*
1723  * Copy a provided snapshot so we can modify it privately. This is needed so
1724  * that catalog modifying transactions can look into intermediate catalog
1725  * states.
      *
      * The copy is a single allocation in rb->context: the SnapshotData struct,
      * followed by the xip array, followed by the subxip array. The result has
      * copied = true, active_count = 1, regd_count = 0 and curcid = cid; its
      * subxip holds txn->xid plus the xid of every entry on txn->subtxns, sorted
      * with xidComparator.
      *
      * NOTE(review): listing lines 1728-1729 (function name and parameters) are
      * absent from this extract.
1726  */
1727 static Snapshot
1730 {
1731  Snapshot snap;
1732  dlist_iter iter;
1733  int i = 0;
1734  Size size;
1735 
      /* room for the struct, orig_snap's xip, and toplevel xid + subxacts */
1736  size = sizeof(SnapshotData) +
1737  sizeof(TransactionId) * orig_snap->xcnt +
1738  sizeof(TransactionId) * (txn->nsubtxns + 1);
1739 
1740  snap = MemoryContextAllocZero(rb->context, size);
1741  memcpy(snap, orig_snap, sizeof(SnapshotData));
1742 
1743  snap->copied = true;
1744  snap->active_count = 1; /* mark as active so nobody frees it */
1745  snap->regd_count = 0;
      /* xip lives directly behind the struct inside the same allocation */
1746  snap->xip = (TransactionId *) (snap + 1);
1747 
1748  memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1749 
1750  /*
1751  * snap->subxip contains all txids that belong to our transaction which we
1752  * need to check via cmin/cmax. That's why we store the toplevel
1753  * transaction in there as well.
1754  */
1755  snap->subxip = snap->xip + snap->xcnt;
1756  snap->subxip[i++] = txn->xid;
1757 
1758  /*
1759  * subxcnt isn't decreased when subtransactions abort, so count manually.
1760  * Since it's an upper boundary it is safe to use it for the allocation
1761  * above.
      *
      * NOTE(review): "subxcnt" here presumably means txn->nsubtxns, which is
      * what sizes the allocation above -- confirm.
1762  */
1763  snap->subxcnt = 1;
1764 
1765  dlist_foreach(iter, &txn->subtxns)
1766  {
1767  ReorderBufferTXN *sub_txn;
1768 
1769  sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1770  snap->subxip[i++] = sub_txn->xid;
1771  snap->subxcnt++;
1772  }
1773 
1774  /* sort so we can bsearch() later */
1775  qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1776 
1777  /* store the specified current CommandId */
1778  snap->curcid = cid;
1779 
1780  return snap;
1781 }
1782 
1783 /*
1784  * Free a previously ReorderBufferCopySnap'ed snapshot
      *
      * A snapshot with copied set is released with pfree().
      *
      * NOTE(review): listing lines 1787 (function name and parameters) and 1792
      * (the statement of the else branch, i.e. the handling of snapshots that
      * are not copies) are absent from this extract.
1785  */
1786 static void
1788 {
1789  if (snap->copied)
1790  pfree(snap);
1791  else
1793 }
1794 
1795 /*
1796  * If the transaction was (partially) streamed, we need to prepare or commit
1797  * it in a 'streamed' way. That is, we first stream the remaining part of the
1798  * transaction, and then invoke stream_prepare or stream_commit message as per
1799  * the case.
      *
      * After the callback, a prepared txn is only truncated
      * (ReorderBufferTruncateTXN with txn_prepared = true), whereas any other
      * txn is fully cleaned up with ReorderBufferCleanupTXN.
      *
      * NOTE(review): listing lines 1802 (function name and parameters) and 1824
      * are absent from this extract.
1800  */
1801 static void
1803 {
1804  /* we should only call this for previously streamed transactions */
1805  Assert(rbtxn_is_streamed(txn));
1806 
1807  ReorderBufferStreamTXN(rb, txn);
1808 
1809  if (rbtxn_prepared(txn))
1810  {
1811  /*
1812  * Note, we send stream prepare even if a concurrent abort is
1813  * detected. See DecodePrepare for more information.
1814  */
1815  rb->stream_prepare(rb, txn, txn->final_lsn);
1816 
1817  /*
1818  * This is a PREPARED transaction, part of a two-phase commit. The
1819  * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1820  * just truncate txn by removing changes and tuple_cids.
1821  */
1822  ReorderBufferTruncateTXN(rb, txn, true);
1823  /* Reset the CheckXidAlive */
      /* NOTE(review): the statement performing that reset (1824) is absent. */
1825  }
1826  else
1827  {
1828  rb->stream_commit(rb, txn, txn->final_lsn);
1829  ReorderBufferCleanupTXN(rb, txn);
1830  }
1831 }
1832 
1833 /*
1834  * Set xid to detect concurrent aborts.
1835  *
1836  * While streaming an in-progress transaction or decoding a prepared
1837  * transaction there is a possibility that the (sub)transaction might get
1838  * aborted concurrently. In such case if the (sub)transaction has catalog
1839  * update then we might decode the tuple using wrong catalog version. For
1840  * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1841  * the transaction 501 updates the catalog tuple and after that we will have
1842  * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1843  * aborted and some other transaction say 502 updates the same catalog tuple
1844  * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1845  * problem is that when we try to decode the tuple inserted/updated in 501
1846  * after the catalog update, we will see the catalog tuple with (xmin: 500,
1847  * xmax: 502) as visible because it will consider that the tuple is deleted by
1848  * xid 502 which is not visible to our snapshot. And when we will try to
1849  * decode with that catalog tuple, it can lead to a wrong result or a crash.
1850  * So, it is necessary to detect concurrent aborts to allow streaming of
1851  * in-progress transactions or decoding of prepared transactions.
1852  *
1853  * For detecting the concurrent abort we set CheckXidAlive to the current
1854  * (sub)transaction's xid for which this change belongs to. And, during
1855  * catalog scan we can check the status of the xid and if it is aborted we will
1856  * report a specific error so that we can stop streaming current transaction
1857  * and discard the already streamed changes on such an error. We might have
1858  * already streamed some of the changes for the aborted (sub)transaction, but
1859  * that is fine because when we decode the abort we will stream abort message
1860  * to truncate the changes in the subscriber. Similarly, for prepared
1861  * transactions, we stop decoding if concurrent abort is detected and then
1862  * rollback the changes when rollback prepared is encountered. See
1863  * DecodePrepare.
      *
      * NOTE(review): listing lines 1866 (function name and parameters), 1872
      * (the condition guarding the early return) and 1882 (the statement of the
      * else branch) are absent from this extract.
1864  */
1865 static inline void
1867 {
1868  /*
1869  * If the input transaction id is already set as a CheckXidAlive then
1870  * nothing to do.
1871  */
1873  return;
1874 
1875  /*
1876  * setup CheckXidAlive if it's not committed yet. We don't check if the
1877  * xid is aborted. That will happen during catalog access.
1878  */
1879  if (!TransactionIdDidCommit(xid))
1880  CheckXidAlive = xid;
1881  else
1883 }
1884 
1885 /*
1886  * Helper function for ReorderBufferProcessTXN for applying change.
      *
      * Dispatches to rb->stream_change when 'streaming' is true and to
      * rb->apply_change otherwise; both receive the same arguments.
      *
      * NOTE(review): listing line 1889 (function name and leading parameters) is
      * absent from this extract.
1887  */
1888 static inline void
1890  Relation relation, ReorderBufferChange *change,
1891  bool streaming)
1892 {
1893  if (streaming)
1894  rb->stream_change(rb, txn, relation, change);
1895  else
1896  rb->apply_change(rb, txn, relation, change);
1897 }
1898 
1899 /*
1900  * Helper function for ReorderBufferProcessTXN for applying the truncate.
      *
      * Dispatches to rb->stream_truncate when 'streaming' is true and to
      * rb->apply_truncate otherwise; both receive the same arguments.
      *
      * NOTE(review): listing line 1903 (function name and leading parameters) is
      * absent from this extract.
1901  */
1902 static inline void
1904  int nrelations, Relation *relations,
1905  ReorderBufferChange *change, bool streaming)
1906 {
1907  if (streaming)
1908  rb->stream_truncate(rb, txn, nrelations, relations, change);
1909  else
1910  rb->apply_truncate(rb, txn, nrelations, relations, change);
1911 }
1912 
1913 /*
1914  * Helper function for ReorderBufferProcessTXN for applying the message.
      *
      * Dispatches to rb->stream_message when 'streaming' is true and to
      * rb->message otherwise. Both are passed the change's lsn, the constant
      * true, and the prefix, size and payload taken from change->data.msg.
      * NOTE(review): the constant presumably marks the message as
      * transactional -- confirm against the callback signature.
      *
      * NOTE(review): listing line 1917 (function name and leading parameters) is
      * absent from this extract.
1915  */
1916 static inline void
1918  ReorderBufferChange *change, bool streaming)
1919 {
1920  if (streaming)
1921  rb->stream_message(rb, txn, change->lsn, true,
1922  change->data.msg.prefix,
1923  change->data.msg.message_size,
1924  change->data.msg.message);
1925  else
1926  rb->message(rb, txn, change->lsn, true,
1927  change->data.msg.prefix,
1928  change->data.msg.message_size,
1929  change->data.msg.message);
1930 }
1931 
1932 /*
1933  * Function to store the command id and snapshot at the end of the current
1934  * stream so that we can reuse the same while sending the next stream.
      *
      * Stores command_id in txn->command_id. txn->snapshot_now becomes
      * snapshot_now itself when that is already a copy, and otherwise a fresh
      * copy made with ReorderBufferCopySnap().
      *
      * NOTE(review): listing line 1937 (function name and leading parameters) is
      * absent from this extract.
1935  */
1936 static inline void
1938  Snapshot snapshot_now, CommandId command_id)
1939 {
1940  txn->command_id = command_id;
1941 
1942  /* Avoid copying if it's already copied. */
1943  if (snapshot_now->copied)
1944  txn->snapshot_now = snapshot_now;
1945  else
1946  txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
1947  txn, command_id);
1948 }
1949 
1950 /*
1951  * Helper function for ReorderBufferProcessTXN to handle the concurrent
1952  * abort of the streaming transaction. This resets the TXN such that it
1953  * can be used to stream the remaining data of transaction being processed.
1954  * This can happen when the subtransaction is aborted and we still want to
1955  * continue processing the main or other subtransactions data.
      *
      * NOTE(review): listing lines 1958 (function name and leading parameters)
      * and 1965 (the statement that discards the streamed changes) are absent
      * from this extract.
1956  */
1957 static void
1959  Snapshot snapshot_now,
1960  CommandId command_id,
1961  XLogRecPtr last_lsn,
1962  ReorderBufferChange *specinsert)
1963 {
1964  /* Discard the changes that we just streamed */
1966 
1967  /* Free all resources allocated for toast reconstruction */
1968  ReorderBufferToastReset(rb, txn);
1969 
1970  /* Return the spec insert change if it is not NULL */
1971  if (specinsert != NULL)
1972  {
1973  ReorderBufferReturnChange(rb, specinsert, true);
      /*
       * NOTE(review): specinsert is passed by value, so this only clears the
       * local copy; the caller's pointer is left untouched.
       */
1974  specinsert = NULL;
1975  }
1976 
1977  /*
1978  * For the streaming case, stop the stream and remember the command ID and
1979  * snapshot for the streaming run.
1980  */
1981  if (rbtxn_is_streamed(txn))
1982  {
1983  rb->stream_stop(rb, txn, last_lsn);
1984  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
1985  }
1986 }
1987 
1988 /*
1989  * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
1990  *
1991  * Send data of a transaction (and its subtransactions) to the
1992  * output plugin. We iterate over the top and subtransactions (using a k-way
1993  * merge) and replay the changes in lsn order.
1994  *
1995  * If streaming is true then data will be sent using stream API.
1996  *
1997  * Note: "volatile" markers on some parameters are to avoid trouble with
1998  * PG_TRY inside the function.
1999  */
2000 static void
2002  XLogRecPtr commit_lsn,
2003  volatile Snapshot snapshot_now,
2004  volatile CommandId command_id,
2005  bool streaming)
2006 {
2007  bool using_subtxn;
2009  ReorderBufferIterTXNState *volatile iterstate = NULL;
2010  volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2011  ReorderBufferChange *volatile specinsert = NULL;
2012  volatile bool stream_started = false;
2013  ReorderBufferTXN *volatile curtxn = NULL;
2014 
2015  /* build data to be able to lookup the CommandIds of catalog tuples */
2017 
2018  /* setup the initial snapshot */
2019  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2020 
2021  /*
2022  * Decoding needs access to syscaches et al., which in turn use
2023  * heavyweight locks and such. Thus we need to have enough state around to
2024  * keep track of those. The easiest way is to simply use a transaction
2025  * internally. That also allows us to easily enforce that nothing writes
2026  * to the database by checking for xid assignments.
2027  *
2028  * When we're called via the SQL SRF there's already a transaction
2029  * started, so start an explicit subtransaction there.
2030  */
2031  using_subtxn = IsTransactionOrTransactionBlock();
2032 
2033  PG_TRY();
2034  {
2035  ReorderBufferChange *change;
2036 
2037  if (using_subtxn)
2038  BeginInternalSubTransaction(streaming ? "stream" : "replay");
2039  else
2041 
2042  /*
2043  * We only need to send begin/begin-prepare for non-streamed
2044  * transactions.
2045  */
2046  if (!streaming)
2047  {
2048  if (rbtxn_prepared(txn))
2049  rb->begin_prepare(rb, txn);
2050  else
2051  rb->begin(rb, txn);
2052  }
2053 
2054  ReorderBufferIterTXNInit(rb, txn, &iterstate);
2055  while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2056  {
2057  Relation relation = NULL;
2058  Oid reloid;
2059 
2060  /*
2061  * We can't call start stream callback before processing first
2062  * change.
2063  */
2064  if (prev_lsn == InvalidXLogRecPtr)
2065  {
2066  if (streaming)
2067  {
2068  txn->origin_id = change->origin_id;
2069  rb->stream_start(rb, txn, change->lsn);
2070  stream_started = true;
2071  }
2072  }
2073 
2074  /*
2075  * Enforce correct ordering of changes, merged from multiple
2076  * subtransactions. The changes may have the same LSN due to
2077  * MULTI_INSERT xlog records.
2078  */
2079  Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2080 
2081  prev_lsn = change->lsn;
2082 
2083  /*
2084  * Set the current xid to detect concurrent aborts. This is
2085  * required for the cases when we decode the changes before the
2086  * COMMIT record is processed.
2087  */
2088  if (streaming || rbtxn_prepared(change->txn))
2089  {
2090  curtxn = change->txn;
2091  SetupCheckXidLive(curtxn->xid);
2092  }
2093 
2094  switch (change->action)
2095  {
2097 
2098  /*
2099  * Confirmation for speculative insertion arrived. Simply
2100  * use as a normal record. It'll be cleaned up at the end
2101  * of INSERT processing.
2102  */
2103  if (specinsert == NULL)
2104  elog(ERROR, "invalid ordering of speculative insertion changes");
2105  Assert(specinsert->data.tp.oldtuple == NULL);
2106  change = specinsert;
2108 
2109  /* intentionally fall through */
2113  Assert(snapshot_now);
2114 
2115  reloid = RelidByRelfilenode(change->data.tp.relnode.spcNode,
2116  change->data.tp.relnode.relNode);
2117 
2118  /*
2119  * Mapped catalog tuple without data, emitted while
2120  * catalog table was in the process of being rewritten. We
2121  * can fail to look up the relfilenode, because the
2122  * relmapper has no "historic" view, in contrast to the
2123  * normal catalog during decoding. Thus repeated rewrites
2124  * can cause a lookup failure. That's OK because we do not
2125  * decode catalog changes anyway. Normally such tuples
2126  * would be skipped over below, but we can't identify
2127  * whether the table should be logically logged without
2128  * mapping the relfilenode to the oid.
2129  */
2130  if (reloid == InvalidOid &&
2131  change->data.tp.newtuple == NULL &&
2132  change->data.tp.oldtuple == NULL)
2133  goto change_done;
2134  else if (reloid == InvalidOid)
2135  elog(ERROR, "could not map filenode \"%s\" to relation OID",
2136  relpathperm(change->data.tp.relnode,
2137  MAIN_FORKNUM));
2138 
2139  relation = RelationIdGetRelation(reloid);
2140 
2141  if (!RelationIsValid(relation))
2142  elog(ERROR, "could not open relation with OID %u (for filenode \"%s\")",
2143  reloid,
2144  relpathperm(change->data.tp.relnode,
2145  MAIN_FORKNUM));
2146 
2147  if (!RelationIsLogicallyLogged(relation))
2148  goto change_done;
2149 
2150  /*
2151  * Ignore temporary heaps created during DDL unless the
2152  * plugin has asked for them.
2153  */
2154  if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2155  goto change_done;
2156 
2157  /*
2158  * For now ignore sequence changes entirely. Most of the
2159  * time they don't log changes using records we
2160  * understand, so it doesn't make sense to handle the few
2161  * cases we do.
2162  */
2163  if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2164  goto change_done;
2165 
2166  /* user-triggered change */
2167  if (!IsToastRelation(relation))
2168  {
2169  ReorderBufferToastReplace(rb, txn, relation, change);
2170  ReorderBufferApplyChange(rb, txn, relation, change,
2171  streaming);
2172 
2173  /*
2174  * Only clear reassembled toast chunks if we're sure
2175  * they're not required anymore. The creator of the
2176  * tuple tells us.
2177  */
2178  if (change->data.tp.clear_toast_afterwards)
2179  ReorderBufferToastReset(rb, txn);
2180  }
2181  /* we're not interested in toast deletions */
2182  else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2183  {
2184  /*
2185  * Need to reassemble the full toasted Datum in
2186  * memory. To ensure the chunks don't get reused until
2187  * we're done, remove it from the list of this
2188  * transaction's changes. Otherwise it will get
2189  * freed/reused while restoring spooled data from
2190  * disk.
2191  */
2192  Assert(change->data.tp.newtuple != NULL);
2193 
2194  dlist_delete(&change->node);
2195  ReorderBufferToastAppendChunk(rb, txn, relation,
2196  change);
2197  }
2198 
2199  change_done:
2200 
2201  /*
2202  * Either speculative insertion was confirmed, or it was
2203  * unsuccessful and the record isn't needed anymore.
2204  */
2205  if (specinsert != NULL)
2206  {
2207  ReorderBufferReturnChange(rb, specinsert, true);
2208  specinsert = NULL;
2209  }
2210 
2211  if (RelationIsValid(relation))
2212  {
2213  RelationClose(relation);
2214  relation = NULL;
2215  }
2216  break;
2217 
2219 
2220  /*
2221  * Speculative insertions are dealt with by delaying the
2222  * processing of the insert until the confirmation record
2223  * arrives. For that we simply unlink the record from the
2224  * chain, so it does not get freed/reused while restoring
2225  * spooled data from disk.
2226  *
2227  * This is safe in the face of concurrent catalog changes
2228  * because the relevant relation can't be changed between
2229  * speculative insertion and confirmation due to
2230  * CheckTableNotInUse() and locking.
2231  */
2232 
2233  /* clear out a pending (and thus failed) speculation */
2234  if (specinsert != NULL)
2235  {
2236  ReorderBufferReturnChange(rb, specinsert, true);
2237  specinsert = NULL;
2238  }
2239 
2240  /* and memorize the pending insertion */
2241  dlist_delete(&change->node);
2242  specinsert = change;
2243  break;
2244 
2246  {
2247  int i;
2248  int nrelids = change->data.truncate.nrelids;
2249  int nrelations = 0;
2250  Relation *relations;
2251 
2252  relations = palloc0(nrelids * sizeof(Relation));
2253  for (i = 0; i < nrelids; i++)
2254  {
2255  Oid relid = change->data.truncate.relids[i];
2256  Relation relation;
2257 
2258  relation = RelationIdGetRelation(relid);
2259 
2260  if (!RelationIsValid(relation))
2261  elog(ERROR, "could not open relation with OID %u", relid);
2262 
2263  if (!RelationIsLogicallyLogged(relation))
2264  continue;
2265 
2266  relations[nrelations++] = relation;
2267  }
2268 
2269  /* Apply the truncate. */
2270  ReorderBufferApplyTruncate(rb, txn, nrelations,
2271  relations, change,
2272  streaming);
2273 
2274  for (i = 0; i < nrelations; i++)
2275  RelationClose(relations[i]);
2276 
2277  break;
2278  }
2279 
2281  ReorderBufferApplyMessage(rb, txn, change, streaming);
2282  break;
2283 
2285  /* Execute the invalidation messages locally */
2287  change->data.inval.ninvalidations,
2288  change->data.inval.invalidations);
2289  break;
2290 
2292  /* get rid of the old */
2293  TeardownHistoricSnapshot(false);
2294 
2295  if (snapshot_now->copied)
2296  {
2297  ReorderBufferFreeSnap(rb, snapshot_now);
2298  snapshot_now =
2299  ReorderBufferCopySnap(rb, change->data.snapshot,
2300  txn, command_id);
2301  }
2302 
2303  /*
2304  * Restored from disk, need to be careful not to double
2305  * free. We could introduce refcounting for that, but for
2306  * now this seems infrequent enough not to care.
2307  */
2308  else if (change->data.snapshot->copied)
2309  {
2310  snapshot_now =
2311  ReorderBufferCopySnap(rb, change->data.snapshot,
2312  txn, command_id);
2313  }
2314  else
2315  {
2316  snapshot_now = change->data.snapshot;
2317  }
2318 
2319  /* and continue with the new one */
2320  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2321  break;
2322 
2324  Assert(change->data.command_id != InvalidCommandId);
2325 
2326  if (command_id < change->data.command_id)
2327  {
2328  command_id = change->data.command_id;
2329 
2330  if (!snapshot_now->copied)
2331  {
2332  /* we don't use the global one anymore */
2333  snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2334  txn, command_id);
2335  }
2336 
2337  snapshot_now->curcid = command_id;
2338 
2339  TeardownHistoricSnapshot(false);
2340  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2341  }
2342 
2343  break;
2344 
2346  elog(ERROR, "tuplecid value in changequeue");
2347  break;
2348  }
2349  }
2350 
2351  /*
2352  * There's a speculative insertion remaining, just clean in up, it
2353  * can't have been successful, otherwise we'd gotten a confirmation
2354  * record.
2355  */
2356  if (specinsert)
2357  {
2358  ReorderBufferReturnChange(rb, specinsert, true);
2359  specinsert = NULL;
2360  }
2361 
2362  /* clean up the iterator */
2363  ReorderBufferIterTXNFinish(rb, iterstate);
2364  iterstate = NULL;
2365 
2366  /*
2367  * Done with current changes, send the last message for this set of
2368  * changes depending upon streaming mode.
2369  */
2370  if (streaming)
2371  {
2372  if (stream_started)
2373  {
2374  rb->stream_stop(rb, txn, prev_lsn);
2375  stream_started = false;
2376  }
2377  }
2378  else
2379  {
2380  /*
2381  * Call either PREPARE (for two-phase transactions) or COMMIT (for
2382  * regular ones).
2383  */
2384  if (rbtxn_prepared(txn))
2385  rb->prepare(rb, txn, commit_lsn);
2386  else
2387  rb->commit(rb, txn, commit_lsn);
2388  }
2389 
2390  /* this is just a sanity check against bad output plugin behaviour */
2392  elog(ERROR, "output plugin used XID %u",
2394 
2395  /*
2396  * Remember the command ID and snapshot for the next set of changes in
2397  * streaming mode.
2398  */
2399  if (streaming)
2400  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2401  else if (snapshot_now->copied)
2402  ReorderBufferFreeSnap(rb, snapshot_now);
2403 
2404  /* cleanup */
2405  TeardownHistoricSnapshot(false);
2406 
2407  /*
2408  * Aborting the current (sub-)transaction as a whole has the right
2409  * semantics. We want all locks acquired in here to be released, not
2410  * reassigned to the parent and we do not want any database access
2411  * have persistent effects.
2412  */
2414 
2415  /* make sure there's no cache pollution */
2417 
2418  if (using_subtxn)
2420 
2421  /*
2422  * We are here due to one of the four reasons: 1. Decoding an
2423  * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2424  * prepared txn that was (partially) streamed. 4. Decoding a committed
2425  * txn.
2426  *
2427  * For 1, we allow truncation of txn data by removing the changes
2428  * already streamed but still keeping other things like invalidations,
2429  * snapshot, and tuplecids. For 2 and 3, we indicate
2430  * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2431  * data as the entire transaction has been decoded except for commit.
2432  * For 4, as the entire txn has been decoded, we can fully clean up
2433  * the TXN reorder buffer.
2434  */
2435  if (streaming || rbtxn_prepared(txn))
2436  {
2438  /* Reset the CheckXidAlive */
2440  }
2441  else
2442  ReorderBufferCleanupTXN(rb, txn);
2443  }
2444  PG_CATCH();
2445  {
2446  MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2447  ErrorData *errdata = CopyErrorData();
2448 
2449  /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2450  if (iterstate)
2451  ReorderBufferIterTXNFinish(rb, iterstate);
2452 
2454 
2455  /*
2456  * Force cache invalidation to happen outside of a valid transaction
2457  * to prevent catalog access as we just caught an error.
2458  */
2460 
2461  /* make sure there's no cache pollution */
2463  txn->invalidations);
2464 
2465  if (using_subtxn)
2467 
2468  /*
2469  * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2470  * abort of the (sub)transaction we are streaming or preparing. We
2471  * need to do the cleanup and return gracefully on this error, see
2472  * SetupCheckXidLive.
2473  */
2474  if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK)
2475  {
2476  /*
2477  * This error can occur either when we are sending the data in
2478  * streaming mode and the streaming is not finished yet or when we
2479  * are sending the data out on a PREPARE during a two-phase
2480  * commit.
2481  */
2482  Assert(streaming || rbtxn_prepared(txn));
2483  Assert(stream_started || rbtxn_prepared(txn));
2484 
2485  /* Cleanup the temporary error state. */
2486  FlushErrorState();
2487  FreeErrorData(errdata);
2488  errdata = NULL;
2489  curtxn->concurrent_abort = true;
2490 
2491  /* Reset the TXN so that it is allowed to stream remaining data. */
2492  ReorderBufferResetTXN(rb, txn, snapshot_now,
2493  command_id, prev_lsn,
2494  specinsert);
2495  }
2496  else
2497  {
2498  ReorderBufferCleanupTXN(rb, txn);
2499  MemoryContextSwitchTo(ecxt);
2500  PG_RE_THROW();
2501  }
2502  }
2503  PG_END_TRY();
2504 }
2505 
2506 /*
2507  * Perform the replay of a transaction and its non-aborted subtransactions.
2508  *
2509  * Subtransactions previously have to be processed by
2510  * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2511  * transaction with ReorderBufferAssignChild.
2512  *
2513  * This interface is called once a prepare or toplevel commit is read for both
2514  * streamed as well as non-streamed transactions.
2515  */
2516 static void
2519  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2520  TimestampTz commit_time,
2521  RepOriginId origin_id, XLogRecPtr origin_lsn)
2522 {
2523  Snapshot snapshot_now;
2524  CommandId command_id = FirstCommandId;
2525 
2526  txn->final_lsn = commit_lsn;
2527  txn->end_lsn = end_lsn;
2528  txn->commit_time = commit_time;
2529  txn->origin_id = origin_id;
2530  txn->origin_lsn = origin_lsn;
2531 
2532  /*
2533  * If the transaction was (partially) streamed, we need to commit it in a
2534  * 'streamed' way. That is, we first stream the remaining part of the
2535  * transaction, and then invoke stream_commit message.
2536  *
2537  * Called after everything (origin ID, LSN, ...) is stored in the
2538  * transaction to avoid passing that information directly.
2539  */
2540  if (rbtxn_is_streamed(txn))
2541  {
2542  ReorderBufferStreamCommit(rb, txn);
2543  return;
2544  }
2545 
2546  /*
2547  * If this transaction has no snapshot, it didn't make any changes to the
2548  * database, so there's nothing to decode. Note that
2549  * ReorderBufferCommitChild will have transferred any snapshots from
2550  * subtransactions if there were any.
2551  */
2552  if (txn->base_snapshot == NULL)
2553  {
2554  Assert(txn->ninvalidations == 0);
2555 
2556  /*
2557  * Removing this txn before a commit might result in the computation
2558  * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2559  */
2560  if (!rbtxn_prepared(txn))
2561  ReorderBufferCleanupTXN(rb, txn);
2562  return;
2563  }
2564 
2565  snapshot_now = txn->base_snapshot;
2566 
2567  /* Process and send the changes to output plugin. */
2568  ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2569  command_id, false);
2570 }
2571 
2572 /*
2573  * Commit a transaction.
2574  *
2575  * See comments for ReorderBufferReplay().
2576  */
2577 void
2579  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2580  TimestampTz commit_time,
2581  RepOriginId origin_id, XLogRecPtr origin_lsn)
2582 {
2584 
2585  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2586  false);
2587 
2588  /* unknown transaction, nothing to replay */
2589  if (txn == NULL)
2590  return;
2591 
2592  ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2593  origin_id, origin_lsn);
2594 }
2595 
2596 /*
2597  * Record the prepare information for a transaction.
2598  */
2599 bool
2601  XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2602  TimestampTz prepare_time,
2603  RepOriginId origin_id, XLogRecPtr origin_lsn)
2604 {
2606 
2607  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2608 
2609  /* unknown transaction, nothing to do */
2610  if (txn == NULL)
2611  return false;
2612 
2613  /*
2614  * Remember the prepare information to be later used by commit prepared in
2615  * case we skip doing prepare.
2616  */
2617  txn->final_lsn = prepare_lsn;
2618  txn->end_lsn = end_lsn;
2619  txn->commit_time = prepare_time;
2620  txn->origin_id = origin_id;
2621  txn->origin_lsn = origin_lsn;
2622 
2623  return true;
2624 }
2625 
2626 /* Remember that we have skipped prepare */
2627 void
2629 {
2631 
2632  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2633 
2634  /* unknown transaction, nothing to do */
2635  if (txn == NULL)
2636  return;
2637 
2639 }
2640 
2641 /*
2642  * Prepare a two-phase transaction.
2643  *
2644  * See comments for ReorderBufferReplay().
2645  */
2646 void
2648  char *gid)
2649 {
2651 
2652  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2653  false);
2654 
2655  /* unknown transaction, nothing to replay */
2656  if (txn == NULL)
2657  return;
2658 
2659  txn->txn_flags |= RBTXN_PREPARE;
2660  txn->gid = pstrdup(gid);
2661 
2662  /* The prepare info must have been updated in txn by now. */
2664 
2665  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2666  txn->commit_time, txn->origin_id, txn->origin_lsn);
2667 }
2668 
2669 /*
2670  * This is used to handle COMMIT/ROLLBACK PREPARED.
2671  */
2672 void
2674  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2675  TimestampTz commit_time, RepOriginId origin_id,
2676  XLogRecPtr origin_lsn, char *gid, bool is_commit)
2677 {
2679  XLogRecPtr prepare_end_lsn;
2680  TimestampTz prepare_time;
2681 
2682  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, commit_lsn, false);
2683 
2684  /* unknown transaction, nothing to do */
2685  if (txn == NULL)
2686  return;
2687 
2688  /*
2689  * By this time the txn has the prepare record information, remember it to
2690  * be later used for rollback.
2691  */
2692  prepare_end_lsn = txn->end_lsn;
2693  prepare_time = txn->commit_time;
2694 
2695  /* add the gid in the txn */
2696  txn->gid = pstrdup(gid);
2697 
2698  /*
2699  * It is possible that this transaction is not decoded at prepare time
2700  * either because by that time we didn't have a consistent snapshot or it
2701  * was decoded earlier but we have restarted. We can't distinguish between
2702  * those two cases so we send the prepare in both the cases and let
2703  * downstream decide whether to process or skip it. We don't need to
2704  * decode the xact for aborts if it is not done already.
2705  */
2706  if (!rbtxn_prepared(txn) && is_commit)
2707  {
2708  txn->txn_flags |= RBTXN_PREPARE;
2709 
2710  /*
2711  * The prepare info must have been updated in txn even if we skip
2712  * prepare.
2713  */
2715 
2716  /*
2717  * By this time the txn has the prepare record information and it is
2718  * important to use that so that downstream gets the accurate
2719  * information. If instead, we have passed commit information here
2720  * then downstream can behave as it has already replayed commit
2721  * prepared after the restart.
2722  */
2723  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2724  txn->commit_time, txn->origin_id, txn->origin_lsn);
2725  }
2726 
2727  txn->final_lsn = commit_lsn;
2728  txn->end_lsn = end_lsn;
2729  txn->commit_time = commit_time;
2730  txn->origin_id = origin_id;
2731  txn->origin_lsn = origin_lsn;
2732 
2733  if (is_commit)
2734  rb->commit_prepared(rb, txn, commit_lsn);
2735  else
2736  rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2737 
2738  /* cleanup: make sure there's no cache pollution */
2740  txn->invalidations);
2741  ReorderBufferCleanupTXN(rb, txn);
2742 }
2743 
2744 /*
2745  * Abort a transaction that possibly has previous changes. Needs to be first
2746  * called for subtransactions and then for the toplevel xid.
2747  *
2748  * NB: Transactions handled here have to have actively aborted (i.e. have
2749  * produced an abort record). Implicitly aborted transactions are handled via
2750  * ReorderBufferAbortOld(); transactions we're just not interested in, but
2751  * which have committed are handled in ReorderBufferForget().
2752  *
2753  * This function purges this transaction and its contents from memory and
2754  * disk.
2755  */
2756 void
2758 {
2760 
2761  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2762  false);
2763 
2764  /* unknown, nothing to remove */
2765  if (txn == NULL)
2766  return;
2767 
2768  /* For streamed transactions notify the remote node about the abort. */
2769  if (rbtxn_is_streamed(txn))
2770  {
2771  rb->stream_abort(rb, txn, lsn);
2772 
2773  /*
2774  * We might have decoded changes for this transaction that could load
2775  * the cache as per the current transaction's view (consider DDL's
2776  * happened in this transaction). We don't want the decoding of future
2777  * transactions to use those cache entries so execute invalidations.
2778  */
2779  if (txn->ninvalidations > 0)
2781  txn->invalidations);
2782  }
2783 
2784  /* cosmetic... */
2785  txn->final_lsn = lsn;
2786 
2787  /* remove potential on-disk data, and deallocate */
2788  ReorderBufferCleanupTXN(rb, txn);
2789 }
2790 
2791 /*
2792  * Abort all transactions that aren't actually running anymore because the
2793  * server restarted.
2794  *
2795  * NB: These really have to be transactions that have aborted due to a server
2796  * crash/immediate restart, as we don't deal with invalidations here.
2797  */
2798 void
2800 {
2801  dlist_mutable_iter it;
2802 
2803  /*
2804  * Iterate through all (potential) toplevel TXNs and abort all that are
2805  * older than what possibly can be running. Once we've found the first
2806  * that is alive we stop, there might be some that acquired an xid earlier
2807  * but started writing later, but it's unlikely and they will be cleaned
2808  * up in a later call to this function.
2809  */
2811  {
2813 
2814  txn = dlist_container(ReorderBufferTXN, node, it.cur);
2815 
2816  if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2817  {
2818  elog(DEBUG2, "aborting old transaction %u", txn->xid);
2819 
2820  /* remove potential on-disk data, and deallocate this tx */
2821  ReorderBufferCleanupTXN(rb, txn);
2822  }
2823  else
2824  return;
2825  }
2826 }
2827 
2828 /*
2829  * Forget the contents of a transaction if we aren't interested in its
2830  * contents. Needs to be first called for subtransactions and then for the
2831  * toplevel xid.
2832  *
2833  * This is significantly different to ReorderBufferAbort() because
2834  * transactions that have committed need to be treated differently from aborted
2835  * ones since they may have modified the catalog.
2836  *
2837  * Note that this is only allowed to be called in the moment a transaction
2838  * commit has just been read, not earlier; otherwise later records referring
2839  * to this xid might re-create the transaction incompletely.
2840  */
2841 void
2843 {
2845 
2846  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2847  false);
2848 
2849  /* unknown, nothing to forget */
2850  if (txn == NULL)
2851  return;
2852 
2853  /* For streamed transactions notify the remote node about the abort. */
2854  if (rbtxn_is_streamed(txn))
2855  rb->stream_abort(rb, txn, lsn);
2856 
2857  /* cosmetic... */
2858  txn->final_lsn = lsn;
2859 
2860  /*
2861  * Process cache invalidation messages if there are any. Even if we're not
2862  * interested in the transaction's contents, it could have manipulated the
2863  * catalog and we need to update the caches according to that.
2864  */
2865  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2867  txn->invalidations);
2868  else
2869  Assert(txn->ninvalidations == 0);
2870 
2871  /* remove potential on-disk data, and deallocate */
2872  ReorderBufferCleanupTXN(rb, txn);
2873 }
2874 
2875 /*
2876  * Invalidate cache for those transactions that need to be skipped just in case
2877  * catalogs were manipulated as part of the transaction.
2878  *
2879  * Note that this is a special-purpose function for prepared transactions where
2880  * we don't want to clean up the TXN even when we decide to skip it. See
2881  * DecodePrepare.
2882  */
2883 void
2885 {
2887 
2888  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2889  false);
2890 
2891  /* unknown, nothing to do */
2892  if (txn == NULL)
2893  return;
2894 
2895  /*
2896  * Process cache invalidation messages if there are any. Even if we're not
2897  * interested in the transaction's contents, it could have manipulated the
2898  * catalog and we need to update the caches according to that.
2899  */
2900  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2902  txn->invalidations);
2903  else
2904  Assert(txn->ninvalidations == 0);
2905 }
2906 
2907 
2908 /*
2909  * Execute invalidations happening outside the context of a decoded
2910  * transaction. That currently happens either for xid-less commits
2911  * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
2912  * transactions (via ReorderBufferForget()).
2913  */
2914 void
2916  SharedInvalidationMessage *invalidations)
2917 {
2918  bool use_subtxn = IsTransactionOrTransactionBlock();
2919  int i;
2920 
2921  if (use_subtxn)
2922  BeginInternalSubTransaction("replay");
2923 
2924  /*
2925  * Force invalidations to happen outside of a valid transaction - that way
2926  * entries will just be marked as invalid without accessing the catalog.
2927  * That's advantageous because we don't need to setup the full state
2928  * necessary for catalog access.
2929  */
2930  if (use_subtxn)
2932 
2933  for (i = 0; i < ninvalidations; i++)
2934  LocalExecuteInvalidationMessage(&invalidations[i]);
2935 
2936  if (use_subtxn)
2938 }
2939 
2940 /*
2941  * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
2942  * least once for every xid in XLogRecord->xl_xid (other places in records
2943  * may, but do not have to be passed through here).
2944  *
2945  * Reorderbuffer keeps some datastructures about transactions in LSN order,
2946  * for efficiency. To do that it has to know about when transactions are seen
2947  * first in the WAL. As many types of records are not actually interesting for
2948  * logical decoding, they do not necessarily pass though here.
2949  */
2950 void
2952 {
2953  /* many records won't have an xid assigned, centralize check here */
2954  if (xid != InvalidTransactionId)
2955  ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
2956 }
2957 
2958 /*
2959  * Add a new snapshot to this transaction that may only used after lsn 'lsn'
2960  * because the previous snapshot doesn't describe the catalog correctly for
2961  * following rows.
2962  */
2963 void
2965  XLogRecPtr lsn, Snapshot snap)
2966 {
2968 
2969  change->data.snapshot = snap;
2971 
2972  ReorderBufferQueueChange(rb, xid, lsn, change, false);
2973 }
2974 
2975 /*
2976  * Set up the transaction's base snapshot.
2977  *
2978  * If we know that xid is a subtransaction, set the base snapshot on the
2979  * top-level transaction instead.
2980  */
2981 void
2983  XLogRecPtr lsn, Snapshot snap)
2984 {
2986  bool is_new;
2987 
2988  AssertArg(snap != NULL);
2989 
2990  /*
2991  * Fetch the transaction to operate on. If we know it's a subtransaction,
2992  * operate on its top-level transaction instead.
2993  */
2994  txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
2995  if (rbtxn_is_known_subxact(txn))
2996  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
2997  NULL, InvalidXLogRecPtr, false);
2998  Assert(txn->base_snapshot == NULL);
2999 
3000  txn->base_snapshot = snap;
3001  txn->base_snapshot_lsn = lsn;
3003 
3004  AssertTXNLsnOrder(rb);
3005 }
3006 
3007 /*
3008  * Access the catalog with this CommandId at this point in the changestream.
3009  *
3010  * May only be called for command ids > 1
3011  */
3012 void
3014  XLogRecPtr lsn, CommandId cid)
3015 {
3017 
3018  change->data.command_id = cid;
3020 
3021  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3022 }
3023 
3024 /*
3025  * Update memory counters to account for the new or removed change.
3026  *
3027  * We update two counters - in the reorder buffer, and in the transaction
3028  * containing the change. The reorder buffer counter allows us to quickly
3029  * decide if we reached the memory limit, the transaction counter allows
3030  * us to quickly pick the largest transaction for eviction.
3031  *
3032  * When streaming is enabled, we need to update the toplevel transaction
3033  * counters instead - we don't really care about subtransactions as we
3034  * can't stream them individually anyway, and we only pick toplevel
3035  * transactions for eviction. So only toplevel transactions matter.
3036  */
3037 static void
3039  ReorderBufferChange *change,
3040  bool addition)
3041 {
3042  Size sz;
3044  ReorderBufferTXN *toptxn = NULL;
3045 
3046  Assert(change->txn);
3047 
3048  /*
3049  * Ignore tuple CID changes, because those are not evicted when reaching
3050  * memory limit. So we just don't count them, because it might easily
3051  * trigger a pointless attempt to spill.
3052  */
3054  return;
3055 
3056  txn = change->txn;
3057 
3058  /* If streaming supported, update the total size in top level as well. */
3059  if (ReorderBufferCanStream(rb))
3060  {
3061  if (txn->toptxn != NULL)
3062  toptxn = txn->toptxn;
3063  else
3064  toptxn = txn;
3065  }
3066 
3067  sz = ReorderBufferChangeSize(change);
3068 
3069  if (addition)
3070  {
3071  txn->size += sz;
3072  rb->size += sz;
3073 
3074  /* Update the total size in the top transaction. */
3075  if (toptxn)
3076  toptxn->total_size += sz;
3077  }
3078  else
3079  {
3080  Assert((rb->size >= sz) && (txn->size >= sz));
3081  txn->size -= sz;
3082  rb->size -= sz;
3083 
3084  /* Update the total size in the top transaction. */
3085  if (toptxn)
3086  toptxn->total_size -= sz;
3087  }
3088 
3089  Assert(txn->size <= rb->size);
3090 }
3091 
3092 /*
3093  * Add new (relfilenode, tid) -> (cmin, cmax) mappings.
3094  *
3095  * We do not include this change type in memory accounting, because we
3096  * keep CIDs in a separate list and do not evict them when reaching
3097  * the memory limit.
3098  */
3099 void
3101  XLogRecPtr lsn, RelFileNode node,
3102  ItemPointerData tid, CommandId cmin,
3103  CommandId cmax, CommandId combocid)
3104 {
3107 
3108  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3109 
3110  change->data.tuplecid.node = node;
3111  change->data.tuplecid.tid = tid;
3112  change->data.tuplecid.cmin = cmin;
3113  change->data.tuplecid.cmax = cmax;
3114  change->data.tuplecid.combocid = combocid;
3115  change->lsn = lsn;
3116  change->txn = txn;
3118 
3119  dlist_push_tail(&txn->tuplecids, &change->node);
3120  txn->ntuplecids++;
3121 }
3122 
3123 /*
3124  * Setup the invalidation of the toplevel transaction.
3125  *
3126  * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3127  * accumulates all the invalidation messages in the toplevel transaction as
3128  * well as in the form of change in reorder buffer. We require to record it in
3129  * form of the change so that we can execute only the required invalidations
3130  * instead of executing all the invalidations on each CommandId increment. We
3131  * also need to accumulate these in the toplevel transaction because in some
3132  * cases we skip processing the transaction (see ReorderBufferForget), we need
3133  * to execute all the invalidations together.
3134  */
3135 void
3137  XLogRecPtr lsn, Size nmsgs,
3139 {
3141  MemoryContext oldcontext;
3142  ReorderBufferChange *change;
3143 
3144  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3145 
3146  oldcontext = MemoryContextSwitchTo(rb->context);
3147 
3148  /*
3149  * Collect all the invalidations under the top transaction so that we can
3150  * execute them all together. See comment atop this function
3151  */
3152  if (txn->toptxn)
3153  txn = txn->toptxn;
3154 
3155  Assert(nmsgs > 0);
3156 
3157  /* Accumulate invalidations. */
3158  if (txn->ninvalidations == 0)
3159  {
3160  txn->ninvalidations = nmsgs;
3162  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3163  memcpy(txn->invalidations, msgs,
3164  sizeof(SharedInvalidationMessage) * nmsgs);
3165  }
3166  else
3167  {
3170  (txn->ninvalidations + nmsgs));
3171 
3172  memcpy(txn->invalidations + txn->ninvalidations, msgs,
3173  nmsgs * sizeof(SharedInvalidationMessage));
3174  txn->ninvalidations += nmsgs;
3175  }
3176 
3177  change = ReorderBufferGetChange(rb);
3179  change->data.inval.ninvalidations = nmsgs;
3180  change->data.inval.invalidations = (SharedInvalidationMessage *)
3181  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3182  memcpy(change->data.inval.invalidations, msgs,
3183  sizeof(SharedInvalidationMessage) * nmsgs);
3184 
3185  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3186 
3187  MemoryContextSwitchTo(oldcontext);
3188 }
3189 
3190 /*
3191  * Apply all invalidations we know. Possibly we only need parts at this point
3192  * in the changestream but we don't know which those are.
3193  */
3194 static void
3196 {
3197  int i;
3198 
3199  for (i = 0; i < nmsgs; i++)
3201 }
3202 
3203 /*
3204  * Mark a transaction as containing catalog changes
3205  */
3206 void
3208  XLogRecPtr lsn)
3209 {
3211 
3212  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3213 
3215 
3216  /*
3217  * Mark top-level transaction as having catalog changes too if one of its
3218  * children has so that the ReorderBufferBuildTupleCidHash can
3219  * conveniently check just top-level transaction and decide whether to
3220  * build the hash table or not.
3221  */
3222  if (txn->toptxn != NULL)
3224 }
3225 
3226 /*
3227  * Query whether a transaction is already *known* to contain catalog
3228  * changes. This can be wrong until directly before the commit!
3229  */
3230 bool
3232 {
3234 
3235  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3236  false);
3237  if (txn == NULL)
3238  return false;
3239 
3240  return rbtxn_has_catalog_changes(txn);
3241 }
3242 
3243 /*
3244  * ReorderBufferXidHasBaseSnapshot
3245  * Have we already set the base snapshot for the given txn/subtxn?
3246  */
3247 bool
3249 {
3251 
3252  txn = ReorderBufferTXNByXid(rb, xid, false,
3253  NULL, InvalidXLogRecPtr, false);
3254 
3255  /* transaction isn't known yet, ergo no snapshot */
3256  if (txn == NULL)
3257  return false;
3258 
3259  /* a known subtxn? operate on top-level txn instead */
3260  if (rbtxn_is_known_subxact(txn))
3261  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3262  NULL, InvalidXLogRecPtr, false);
3263 
3264  return txn->base_snapshot != NULL;
3265 }
3266 
3267 
3268 /*
3269  * ---------------------------------------
3270  * Disk serialization support
3271  * ---------------------------------------
3272  */
3273 
3274 /*
3275  * Ensure the IO buffer is >= sz.
3276  */
3277 static void
3279 {
3280  if (!rb->outbufsize)
3281  {
3282  rb->outbuf = MemoryContextAlloc(rb->context, sz);
3283  rb->outbufsize = sz;
3284  }
3285  else if (rb->outbufsize < sz)
3286  {
3287  rb->outbuf = repalloc(rb->outbuf, sz);
3288  rb->outbufsize = sz;
3289  }
3290 }
3291 
3292 /*
3293  * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3294  *
3295  * XXX With many subtransactions this might be quite slow, because we'll have
3296  * to walk through all of them. There are some options how we could improve
3297  * that: (a) maintain some secondary structure with transactions sorted by
3298  * amount of changes, (b) not looking for the entirely largest transaction,
3299  * but e.g. for transaction using at least some fraction of the memory limit,
3300  * and (c) evicting multiple transactions at once, e.g. to free a given portion
3301  * of the memory limit (e.g. 50%).
3302  */
3303 static ReorderBufferTXN *
3305 {
3306  HASH_SEQ_STATUS hash_seq;
3308  ReorderBufferTXN *largest = NULL;
3309 
3310  hash_seq_init(&hash_seq, rb->by_txn);
3311  while ((ent = hash_seq_search(&hash_seq)) != NULL)
3312  {
3313  ReorderBufferTXN *txn = ent->txn;
3314 
3315  /* if the current transaction is larger, remember it */
3316  if ((!largest) || (txn->size > largest->size))
3317  largest = txn;
3318  }
3319 
3320  Assert(largest);
3321  Assert(largest->size > 0);
3322  Assert(largest->size <= rb->size);
3323 
3324  return largest;
3325 }
3326 
3327 /*
3328  * Find the largest toplevel transaction to evict (by streaming).
3329  *
3330  * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3331  * should give us the same transaction (because we don't update memory account
3332  * for subtransaction with streaming, so it's always 0). But we can simply
3333  * iterate over the limited number of toplevel transactions.
3334  *
3335  * Note that, we skip transactions that contains incomplete changes. There
3336  * is a scope of optimization here such that we can select the largest transaction
3337  * which has complete changes. But that will make the code and design quite complex
3338  * and that might not be worth the benefit. If we plan to stream the transactions
3339  * that contains incomplete changes then we need to find a way to partially
3340  * stream/truncate the transaction changes in-memory and build a mechanism to
3341  * partially truncate the spilled files. Additionally, whenever we partially
3342  * stream the transaction we need to maintain the last streamed lsn and next time
3343  * we need to restore from that segment and the offset in WAL. As we stream the
3344  * changes from the top transaction and restore them subtransaction wise, we need
3345  * to even remember the subxact from where we streamed the last change.
3346  */
3347 static ReorderBufferTXN *
3349 {
3350  dlist_iter iter;
3351  Size largest_size = 0;
3352  ReorderBufferTXN *largest = NULL;
3353 
3354  /* Find the largest top-level transaction. */
3355  dlist_foreach(iter, &rb->toplevel_by_lsn)
3356  {
3358 
3359  txn = dlist_container(ReorderBufferTXN, node, iter.cur);
3360 
3361  if ((largest != NULL || txn->total_size > largest_size) &&
3362  (txn->total_size > 0) && !(rbtxn_has_incomplete_tuple(txn)))
3363  {
3364  largest = txn;
3365  largest_size = txn->total_size;
3366  }
3367  }
3368 
3369  return largest;
3370 }
3371 
3372 /*
3373  * Check whether the logical_decoding_work_mem limit was reached, and if yes
3374  * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3375  * disk until we reach under the memory limit.
3376  *
3377  * XXX At this point we select the transactions until we reach under the memory
3378  * limit, but we might also adopt a more elaborate eviction strategy - for example
3379  * evicting enough transactions to free a certain fraction (e.g. 50%) of the memory
3380  * limit.
 *
 * The limit test is rb->size against logical_decoding_work_mem * 1024L.
 * Each loop iteration evicts exactly one transaction: when the streaming
 * branch is taken and ReorderBufferLargestTopTXN() returns a candidate it is
 * handed to ReorderBufferStreamTXN(); otherwise the result of
 * ReorderBufferLargestTXN() is written out with ReorderBufferSerializeTXN().
3381  */
3382 static void
3384 {
3386 
3387  /* bail out if we haven't exceeded the memory limit */
3388  if (rb->size < logical_decoding_work_mem * 1024L)
3389  return;
3390 
3391  /*
3392  * Loop until we reach under the memory limit. One might think that just
3393  * by evicting the largest (sub)transaction we will come under the memory
3394  * limit based on assumption that the selected transaction is at least as
3395  * large as the most recent change (which caused us to go over the memory
3396  * limit). However, that is not true because a user can reduce the
3397  * logical_decoding_work_mem to a smaller value before the most recent
3398  * change.
3399  */
3400  while (rb->size >= logical_decoding_work_mem * 1024L)
3401  {
3402  /*
3403  * Pick the largest transaction (or subtransaction) and evict it from
3404  * memory by streaming, if possible. Otherwise, spill to disk.
3405  */
 /* NOTE(review): the first line of this condition is not visible in this listing */
3407  (txn = ReorderBufferLargestTopTXN(rb)) != NULL)
3408  {
3409  /* we know there has to be one, because the size is not zero */
3410  Assert(txn && !txn->toptxn);
3411  Assert(txn->total_size > 0);
3412  Assert(rb->size >= txn->total_size);
3413 
3414  ReorderBufferStreamTXN(rb, txn);
3415  }
3416  else
3417  {
3418  /*
3419  * Pick the largest transaction (or subtransaction) and evict it
3420  * from memory by serializing it to disk.
3421  */
3422  txn = ReorderBufferLargestTXN(rb);
3423 
3424  /* we know there has to be one, because the size is not zero */
3425  Assert(txn);
3426  Assert(txn->size > 0);
3427  Assert(rb->size >= txn->size);
3428 
3429  ReorderBufferSerializeTXN(rb, txn);
3430  }
3431 
3432  /*
3433  * After eviction, the transaction should have no entries in memory,
3434  * and should use 0 bytes for changes.
3435  */
3436  Assert(txn->size == 0);
3437  Assert(txn->nentries_mem == 0);
3438  }
3439 
3440  /* We must be under the memory limit now. */
3441  Assert(rb->size < logical_decoding_work_mem * 1024L);
3442 }
3443 
3444 /*
3445  * Spill data of a large transaction (and its subtransactions) to disk.
 *
 * Recurses into every entry of txn->subtxns first, then writes each change
 * on txn->changes through ReorderBufferSerializeChange(). A new spill file
 * is opened whenever the change's LSN falls outside the WAL segment of the
 * currently open file. Every written change is unlinked from the list and
 * released. The spill statistics (spillCount, spillBytes, spillTxns) are
 * only touched when at least one change was written.
3446  */
3447 static void
3449 {
3450  dlist_iter subtxn_i;
3451  dlist_mutable_iter change_i;
3452  int fd = -1;
3453  XLogSegNo curOpenSegNo = 0;
3454  Size spilled = 0;
 /* sampled before anything is written; this is what gets added to spillBytes */
3455  Size size = txn->size;
3456 
3457  elog(DEBUG2, "spill %u changes in XID %u to disk",
3458  (uint32) txn->nentries_mem, txn->xid);
3459 
3460  /* do the same to all child TXs */
3461  dlist_foreach(subtxn_i, &txn->subtxns)
3462  {
3463  ReorderBufferTXN *subtxn;
3464 
3465  subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3466  ReorderBufferSerializeTXN(rb, subtxn);
3467  }
3468 
3469  /* serialize changestream */
3470  dlist_foreach_modify(change_i, &txn->changes)
3471  {
3472  ReorderBufferChange *change;
3473 
3474  change = dlist_container(ReorderBufferChange, node, change_i.cur);
3475 
3476  /*
3477  * store in segment in which it belongs by start lsn, don't split over
3478  * multiple segments though
3479  */
3480  if (fd == -1 ||
3481  !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3482  {
3483  char path[MAXPGPATH];
3484 
3485  if (fd != -1)
3486  CloseTransientFile(fd);
3487 
3488  XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3489 
3490  /*
3491  * No need to care about TLIs here, only used during a single run,
3492  * so each LSN only maps to a specific WAL record.
3493  */
3495  curOpenSegNo);
3496 
3497  /* open segment, create it if necessary */
3498  fd = OpenTransientFile(path,
3499  O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3500 
3501  if (fd < 0)
3502  ereport(ERROR,
3504  errmsg("could not open file \"%s\": %m", path)));
3505  }
3506 
 /* write the change, then drop it from the in-memory list */
3507  ReorderBufferSerializeChange(rb, txn, fd, change);
3508  dlist_delete(&change->node);
3509  ReorderBufferReturnChange(rb, change, true);
3510 
3511  spilled++;
3512  }
3513 
3514  /* update the statistics iff we have spilled anything */
3515  if (spilled)
3516  {
3517  rb->spillCount += 1;
3518  rb->spillBytes += size;
3519 
3520  /* don't consider already serialized transactions */
3521  rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3522  }
3523 
3524  Assert(spilled == txn->nentries_mem);
3525  Assert(dlist_is_empty(&txn->changes));
3526  txn->nentries_mem = 0;
3528 
3529  if (fd != -1)
3530  CloseTransientFile(fd);
3531 }
3532 
3533 /*
3534  * Serialize individual change to disk.
 *
 * The on-disk record is assembled in rb->outbuf: a ReorderBufferDiskChange
 * header holding a byte copy of the ReorderBufferChange, followed by an
 * action-specific payload. ondisk->size is set to the total record length
 * and the record is written to fd with one write() call. A short or failed
 * write closes fd and raises ERROR; an unset errno is reported as ENOSPC.
 * Finally txn->final_lsn is advanced to change->lsn if that is larger.
 *
 * NOTE(review): the case labels are not visible in this listing; the
 * branches below are described by the change->data member each one reads.
3535  */
3536 static void
3538  int fd, ReorderBufferChange *change)
3539 {
3540  ReorderBufferDiskChange *ondisk;
3541  Size sz = sizeof(ReorderBufferDiskChange);
3542 
3544 
3545  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3546  memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3547 
3548  switch (change->action)
3549  {
3550  /* fall through these, they're all similar enough */
 /* payload: [HeapTupleData + old tuple bytes] [HeapTupleData + new tuple bytes] */
3555  {
3556  char *data;
3557  ReorderBufferTupleBuf *oldtup,
3558  *newtup;
3559  Size oldlen = 0;
3560  Size newlen = 0;
3561 
3562  oldtup = change->data.tp.oldtuple;
3563  newtup = change->data.tp.newtuple;
3564 
3565  if (oldtup)
3566  {
3567  sz += sizeof(HeapTupleData);
3568  oldlen = oldtup->tuple.t_len;
3569  sz += oldlen;
3570  }
3571 
3572  if (newtup)
3573  {
3574  sz += sizeof(HeapTupleData);
3575  newlen = newtup->tuple.t_len;
3576  sz += newlen;
3577  }
3578 
3579  /* make sure we have enough space */
3581 
3582  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3583  /* might have been reallocated above */
3584  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3585 
3586  if (oldlen)
3587  {
3588  memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
3589  data += sizeof(HeapTupleData);
3590 
3591  memcpy(data, oldtup->tuple.t_data, oldlen);
3592  data += oldlen;
3593  }
3594 
3595  if (newlen)
3596  {
3597  memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
3598  data += sizeof(HeapTupleData);
3599 
3600  memcpy(data, newtup->tuple.t_data, newlen);
3601  data += newlen;
3602  }
3603  break;
3604  }
 /* payload: prefix length, prefix (with terminating NUL), message length, message */
3606  {
3607  char *data;
3608  Size prefix_size = strlen(change->data.msg.prefix) + 1;
3609 
3610  sz += prefix_size + change->data.msg.message_size +
3611  sizeof(Size) + sizeof(Size);
3613 
3614  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3615 
3616  /* might have been reallocated above */
3617  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3618 
3619  /* write the prefix including the size */
3620  memcpy(data, &prefix_size, sizeof(Size));
3621  data += sizeof(Size);
3622  memcpy(data, change->data.msg.prefix,
3623  prefix_size);
3624  data += prefix_size;
3625 
3626  /* write the message including the size */
3627  memcpy(data, &change->data.msg.message_size, sizeof(Size));
3628  data += sizeof(Size);
3629  memcpy(data, change->data.msg.message,
3630  change->data.msg.message_size);
3631  data += change->data.msg.message_size;
3632 
3633  break;
3634  }
 /* payload: the array of ninvalidations SharedInvalidationMessage entries */
3636  {
3637  char *data;
3638  Size inval_size = sizeof(SharedInvalidationMessage) *
3639  change->data.inval.ninvalidations;
3640 
3641  sz += inval_size;
3642 
3644  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3645 
3646  /* might have been reallocated above */
3647  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3648  memcpy(data, change->data.inval.invalidations, inval_size);
3649  data += inval_size;
3650 
3651  break;
3652  }
 /* payload: SnapshotData, then the xip array (xcnt entries), then subxip (subxcnt entries) */
3654  {
3655  Snapshot snap;
3656  char *data;
3657 
3658  snap = change->data.snapshot;
3659 
3660  sz += sizeof(SnapshotData) +
3661  sizeof(TransactionId) * snap->xcnt +
3662  sizeof(TransactionId) * snap->subxcnt;
3663 
3664  /* make sure we have enough space */
3666  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3667  /* might have been reallocated above */
3668  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3669 
3670  memcpy(data, snap, sizeof(SnapshotData));
3671  data += sizeof(SnapshotData);
3672 
3673  if (snap->xcnt)
3674  {
3675  memcpy(data, snap->xip,
3676  sizeof(TransactionId) * snap->xcnt);
3677  data += sizeof(TransactionId) * snap->xcnt;
3678  }
3679 
3680  if (snap->subxcnt)
3681  {
3682  memcpy(data, snap->subxip,
3683  sizeof(TransactionId) * snap->subxcnt);
3684  data += sizeof(TransactionId) * snap->subxcnt;
3685  }
3686  break;
3687  }
 /* payload: the array of nrelids relation OIDs */
3689  {
3690  Size size;
3691  char *data;
3692 
3693  /* account for the OIDs of truncated relations */
3694  size = sizeof(Oid) * change->data.truncate.nrelids;
3695  sz += size;
3696 
3697  /* make sure we have enough space */
3699 
3700  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3701  /* might have been reallocated above */
3702  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3703 
3704  memcpy(data, change->data.truncate.relids, size);
3705  data += size;
3706 
3707  break;
3708  }
3712  /* ReorderBufferChange contains everything important */
3713  break;
3714  }
3715 
 /* total record length: header plus whatever payload was appended above */
3716  ondisk->size = sz;
3717 
 /* clear errno so a short write that leaves it unset can be told apart below */
3718  errno = 0;
3720  if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3721  {
3722  int save_errno = errno;
3723 
3724  CloseTransientFile(fd);
3725 
3726  /* if write didn't set errno, assume problem is no disk space */
3727  errno = save_errno ? save_errno : ENOSPC;
3728  ereport(ERROR,
3730  errmsg("could not write to data file for XID %u: %m",
3731  txn->xid)));
3732  }
3734 
3735  /*
3736  * Keep the transaction's final_lsn up to date with each change we send to
3737  * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3738  * only do this on commit and abort records, but that doesn't work if a
3739  * system crash leaves a transaction without its abort record).
3740  *
3741  * Make sure not to move it backwards.
3742  */
3743  if (txn->final_lsn < change->lsn)
3744  txn->final_lsn = change->lsn;
3745 
3746  Assert(ondisk->change.action == change->action);
3747 }
3748 
3749 /*
 * Returns true if the output plugin supports streaming, false otherwise.
 * The answer is simply the value of ctx->streaming.
 *
 * NOTE(review): the line that declares ctx is not visible in this listing.
 */
3750 static inline bool
3752 {
3754 
3755  return ctx->streaming;
3756 }
3757 
3758 /*
 * Returns true if streaming can be started now, false otherwise.
 *
 * Streaming may start only when ReorderBufferCanStream(rb) holds and
 * SnapBuildXactNeedsSkip() reports that the current record end position
 * (ctx->reader->EndRecPtr) does not need to be skipped. An earlier check
 * on the snapshot builder state returns false first.
 *
 * NOTE(review): the condition of that first check and the declaration of
 * ctx are not visible in this listing.
 */
3759 static inline bool
3761 {
3763  SnapBuild *builder = ctx->snapshot_builder;
3764 
3765  /* We can't start streaming unless a consistent state is reached. */
3767  return false;
3768 
3769  /*
3770  * We can't start streaming immediately even if the streaming is enabled
3771  * because we previously decoded this transaction and now just are
3772  * restarting.
3773  */
3774  if (ReorderBufferCanStream(rb) &&
3775  !SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr))
3776  return true;
3777 
3778  return false;
3779 }
3780 
3781 /*
3782  * Send data of a large transaction (and its subtransactions) to the
3783  * output plugin, but using the stream API.
 *
 * On the first streaming run (txn->snapshot_now == NULL) the snapshots of
 * all subtransactions are transferred to the parent and a copy of
 * txn->base_snapshot is used with FirstCommandId. If there is no base
 * snapshot the function returns without doing anything. On later runs a
 * fresh copy of txn->snapshot_now is made with txn->command_id and
 * txn->snapshot_now is reset to NULL.
 *
 * The changes are then processed by ReorderBufferProcessTXN(), and the
 * streamCount, streamBytes and streamTxns counters of rb are updated.
3784  */
3785 static void
3787 {
3788  Snapshot snapshot_now;
3789  CommandId command_id;
3790  Size stream_bytes;
3791  bool txn_is_streamed;
3792 
3793  /* We can never reach here for a subtransaction. */
3794  Assert(txn->toptxn == NULL);
3795 
3796  /*
3797  * We can't make any assumptions about base snapshot here, similar to what
3798  * ReorderBufferCommit() does. That relies on base_snapshot getting
3799  * transferred from subxact in ReorderBufferCommitChild(), but that was
3800  * not yet called as the transaction is in-progress.
3801  *
3802  * So just walk the subxacts and use the same logic here. But we only need
3803  * to do that once, when the transaction is streamed for the first time.
3804  * After that we need to reuse the snapshot from the previous run.
3805  *
3806  * Unlike DecodeCommit which adds xids of all the subtransactions in
3807  * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here
3808  * but we do add them to subxip array instead via ReorderBufferCopySnap.
3809  * This allows the catalog changes made in subtransactions decoded till
3810  * now to be visible.
3811  */
3812  if (txn->snapshot_now == NULL)
3813  {
3814  dlist_iter subxact_i;
3815 
3816  /* make sure this transaction is streamed for the first time */
3817  Assert(!rbtxn_is_streamed(txn));
3818 
3819  /* at the beginning we should have invalid command ID */
3821 
3822  dlist_foreach(subxact_i, &txn->subtxns)
3823  {
3824  ReorderBufferTXN *subtxn;
3825 
3826  subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
3827  ReorderBufferTransferSnapToParent(txn, subtxn);
3828  }
3829 
3830  /*
3831  * If this transaction has no snapshot, it didn't make any changes to
3832  * the database till now, so there's nothing to decode.
3833  */
3834  if (txn->base_snapshot == NULL)
3835  {
3836  Assert(txn->ninvalidations == 0);
3837  return;
3838  }
3839 
3840  command_id = FirstCommandId;
3841  snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
3842  txn, command_id);
3843  }
3844  else
3845  {
3846  /* the transaction must have been already streamed */
3847  Assert(rbtxn_is_streamed(txn));
3848 
3849  /*
3850  * Nah, we already have snapshot from the previous streaming run. We
3851  * assume new subxacts can't move the LSN backwards, and so can't beat
3852  * the LSN condition in the previous branch (so no need to walk
3853  * through subxacts again). In fact, we must not do that as we may be
3854  * using snapshot half-way through the subxact.
3855  */
3856  command_id = txn->command_id;
3857 
3858  /*
3859  * We can't use txn->snapshot_now directly because after the last
3860  * streaming run, we might have got some new sub-transactions. So we
3861  * need to add them to the snapshot.
3862  */
3863  snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
3864  txn, command_id);
3865 
3866  /* Free the previously copied snapshot. */
3867  Assert(txn->snapshot_now->copied);
3869  txn->snapshot_now = NULL;
3870  }
3871 
3872  /*
3873  * Remember this information to be used later to update stats. We can't
3874  * update the stats here as an error while processing the changes would
3875  * lead to the accumulation of stats even though we haven't streamed all
3876  * the changes.
3877  */
3878  txn_is_streamed = rbtxn_is_streamed(txn);
3879  stream_bytes = txn->total_size;
3880 
3881  /* Process and send the changes to output plugin. */
 /* NOTE(review): the trailing "true" is presumably the streaming flag - confirm against ReorderBufferProcessTXN */
3882  ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
3883  command_id, true);
3884 
3885  rb->streamCount += 1;
3886  rb->streamBytes += stream_bytes;
3887 
3888  /* Don't consider already streamed transaction. */
3889  rb->streamTxns += (txn_is_streamed) ? 0 : 1;
3890 
3891  Assert(dlist_is_empty(&txn->changes));
3892  Assert(txn->nentries == 0);
3893  Assert(txn->nentries_mem == 0);
3894 }
3895 
3896 /*
3897  * Size of a change in memory.
 *
 * Returns sizeof(ReorderBufferChange) plus an action-dependent amount:
 * header and length of each present tuple, prefix and message bytes plus
 * two Size fields, the invalidation array, the snapshot struct with its
 * xip/subxip arrays, or the truncated-relation OID array. Remaining actions
 * add nothing.
 *
 * NOTE(review): the case labels are not visible in this listing.
3898  */
3899 static Size
3901 {
3902  Size sz = sizeof(ReorderBufferChange);
3903 
3904  switch (change->action)
3905  {
3906  /* fall through these, they're all similar enough */
3911  {
3912  ReorderBufferTupleBuf *oldtup,
3913  *newtup;
3914  Size oldlen = 0;
3915  Size newlen = 0;
3916 
3917  oldtup = change->data.tp.oldtuple;
3918  newtup = change->data.tp.newtuple;
3919 
3920  if (oldtup)
3921  {
3922  sz += sizeof(HeapTupleData);
3923  oldlen = oldtup->tuple.t_len;
3924  sz += oldlen;
3925  }
3926 
3927  if (newtup)
3928  {
3929  sz += sizeof(HeapTupleData);
3930  newlen = newtup->tuple.t_len;
3931  sz += newlen;
3932  }
3933 
3934  break;
3935  }
3937  {
3938  Size prefix_size = strlen(change->data.msg.prefix) + 1;
3939 
3940  sz += prefix_size + change->data.msg.message_size +
3941  sizeof(Size) + sizeof(Size);
3942 
3943  break;
3944  }
3946  {
3947  sz += sizeof(SharedInvalidationMessage) *
3948  change->data.inval.ninvalidations;
3949  break;
3950  }
3952  {
3953  Snapshot snap;
3954 
3955  snap = change->data.snapshot;
3956 
3957  sz += sizeof(SnapshotData) +
3958  sizeof(TransactionId) * snap->xcnt +
3959  sizeof(TransactionId) * snap->subxcnt;
3960 
3961  break;
3962  }
3964  {
3965  sz += sizeof(Oid) * change->data.truncate.nrelids;
3966 
3967  break;
3968  }
3972  /* ReorderBufferChange contains everything important */
3973  break;
3974  }
3975 
3976  return sz;
3977 }
3978 
3979 
3980 /*
3981  * Restore a number of changes spilled to disk back into memory.
 *
 * First releases every change currently on txn->changes. Then reads
 * serialized changes, one at a time, until max_changes_in_memory have been
 * restored or *segno has moved past the segment containing txn->final_lsn.
 * *segno and file->curOffset track the read position; file->vfd is set
 * to -1 while no spill file is open. A missing segment file (ENOENT) is
 * skipped. Each record is read in two steps: the fixed
 * ReorderBufferDiskChange header, then the remaining ondisk->size minus
 * header bytes. Any other open or read failure raises ERROR.
 *
 * Returns the number of changes restored.
3982  */
3983 static Size
3985  TXNEntryFile *file, XLogSegNo *segno)
3986 {
3987  Size restored = 0;
3988  XLogSegNo last_segno;
3989  dlist_mutable_iter cleanup_iter;
3990  File *fd = &file->vfd;
3991 
3994 
3995  /* free current entries, so we have memory for more */
3996  dlist_foreach_modify(cleanup_iter, &txn->changes)
3997  {
3999  dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4000 
4001  dlist_delete(&cleanup->node);
4002  ReorderBufferReturnChange(rb, cleanup, true);
4003  }
4004  txn->nentries_mem = 0;
4005  Assert(dlist_is_empty(&txn->changes));
4006 
 /* last segment that can hold spilled changes of this transaction */
4007  XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4008 
4009  while (restored < max_changes_in_memory && *segno <= last_segno)
4010  {
4011  int readBytes;
4012  ReorderBufferDiskChange *ondisk;
4013 
4014  if (*fd == -1)
4015  {
4016  char path[MAXPGPATH];
4017 
4018  /* first time in */
4019  if (*segno == 0)
4020  XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4021 
4022  Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4023 
4024  /*
4025  * No need to care about TLIs here, only used during a single run,
4026  * so each LSN only maps to a specific WAL record.
4027  */
4029  *segno);
4030 
4031  *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4032 
4033  /* No harm in resetting the offset even in case of failure */
4034  file->curOffset = 0;
4035 
 /* no spill file for this segment: move on to the next one */
4036  if (*fd < 0 && errno == ENOENT)
4037  {
4038  *fd = -1;
4039  (*segno)++;
4040  continue;
4041  }
4042  else if (*fd < 0)
4043  ereport(ERROR,
4045  errmsg("could not open file \"%s\": %m",
4046  path)));
4047  }
4048 
4049  /*
4050  * Read the statically sized part of a change which has information
4051  * about the total size. If we couldn't read a record, we're at the
4052  * end of this file.
4053  */
4055  readBytes = FileRead(file->vfd, rb->outbuf,
4056  sizeof(ReorderBufferDiskChange),
4058 
4059  /* eof */
4060  if (readBytes == 0)
4061  {
4062  FileClose(*fd);
4063  *fd = -1;
4064  (*segno)++;
4065  continue;
4066  }
4067  else if (readBytes < 0)
4068  ereport(ERROR,
4070  errmsg("could not read from reorderbuffer spill file: %m")));
4071  else if (readBytes != sizeof(ReorderBufferDiskChange))
4072  ereport(ERROR,
4074  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4075  readBytes,
4076  (uint32) sizeof(ReorderBufferDiskChange))));
4077 
4078  file->curOffset += readBytes;
4079 
4080  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4081 
4083  sizeof(ReorderBufferDiskChange) + ondisk->size);
 /* re-fetch the pointer from rb->outbuf before reading the payload */
4084  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4085 
 /* read the variable-length remainder directly behind the header */
4086  readBytes = FileRead(file->vfd,
4087  rb->outbuf + sizeof(ReorderBufferDiskChange),
4088  ondisk->size - sizeof(ReorderBufferDiskChange),
4089  file->curOffset,
4091 
4092  if (readBytes < 0)
4093  ereport(ERROR,
4095  errmsg("could not read from reorderbuffer spill file: %m")));
4096  else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4097  ereport(ERROR,
4099  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4100  readBytes,
4101  (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4102 
4103  file->curOffset += readBytes;
4104 
4105  /*
4106  * ok, read a full change from disk, now restore it into proper
4107  * in-memory format
4108  */
4109  ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4110  restored++;
4111  }
4112 
4113  return restored;
4114 }
4115 
4116 /*
4117  * Convert change from its on-disk format to in-memory format and queue it onto
4118  * the TXN's ->changes list.
4119  *
4120  * Note: although "data" is declared char*, at entry it points to a
4121  * maxalign'd buffer, making it safe in most of this function to assume
4122  * that the pointed-to data is suitably aligned for direct access.
 *
 * A new ReorderBufferChange is obtained from ReorderBufferGetChange() and
 * filled with a byte copy of the stored struct. Pointer members copied
 * that way are then replaced with newly allocated copies of the payload
 * that follows the header. The change is appended to txn->changes,
 * txn->nentries_mem is incremented and the memory accounting is updated.
 *
 * NOTE(review): the case labels and the tuple-buffer allocation calls are
 * not visible in this listing.
4123  */
4124 static void
4126  char *data)
4127 {
4128  ReorderBufferDiskChange *ondisk;
4129  ReorderBufferChange *change;
4130 
4131  ondisk = (ReorderBufferDiskChange *) data;
4132 
4133  change = ReorderBufferGetChange(rb);
4134 
4135  /* copy static part */
4136  memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4137 
 /* from here on, data walks over the variable-length payload */
4138  data += sizeof(ReorderBufferDiskChange);
4139 
4140  /* restore individual stuff */
4141  switch (change->action)
4142  {
4143  /* fall through these, they're all similar enough */
 /* the copied pointer value is only used as a "tuple was present" flag */
4148  if (change->data.tp.oldtuple)
4149  {
4150  uint32 tuplelen = ((HeapTuple) data)->t_len;
4151 
4152  change->data.tp.oldtuple =
4154 
4155  /* restore ->tuple */
4156  memcpy(&change->data.tp.oldtuple->tuple, data,
4157  sizeof(HeapTupleData));
4158  data += sizeof(HeapTupleData);
4159 
4160  /* reset t_data pointer into the new tuplebuf */
4161  change->data.tp.oldtuple->tuple.t_data =
4162  ReorderBufferTupleBufData(change->data.tp.oldtuple);
4163 
4164  /* restore tuple data itself */
4165  memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
4166  data += tuplelen;
4167  }
4168 
4169  if (change->data.tp.newtuple)
4170  {
4171  /* here, data might not be suitably aligned! */
4172  uint32 tuplelen;
4173 
 /* hence t_len is fetched with memcpy instead of a direct access */
4174  memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4175  sizeof(uint32));
4176 
4177  change->data.tp.newtuple =
4179 
4180  /* restore ->tuple */
4181  memcpy(&change->data.tp.newtuple->tuple, data,
4182  sizeof(HeapTupleData));
4183  data += sizeof(HeapTupleData);
4184 
4185  /* reset t_data pointer into the new tuplebuf */
4186  change->data.tp.newtuple->tuple.t_data =
4187  ReorderBufferTupleBufData(change->data.tp.newtuple);
4188 
4189  /* restore tuple data itself */
4190  memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
4191  data += tuplelen;
4192  }
4193 
4194  break;
4196  {
4197  Size prefix_size;
4198 
4199  /* read prefix */
4200  memcpy(&prefix_size, data, sizeof(Size));
4201  data += sizeof(Size);
4202  change->data.msg.prefix = MemoryContextAlloc(rb->context,
4203  prefix_size);
4204  memcpy(change->data.msg.prefix, data, prefix_size);
4205  Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4206  data += prefix_size;
4207 
4208  /* read the message */
4209  memcpy(&change->data.msg.message_size, data, sizeof(Size));
4210  data += sizeof(Size);
4211  change->data.msg.message = MemoryContextAlloc(rb->context,
4212  change->data.msg.message_size);
4213  memcpy(change->data.msg.message, data,
4214  change->data.msg.message_size);
4215  data += change->data.msg.message_size;
4216 
4217  break;
4218  }
4220  {
4221  Size inval_size = sizeof(SharedInvalidationMessage) *
4222  change->data.inval.ninvalidations;
4223 
4224  change->data.inval.invalidations =
4225  MemoryContextAlloc(rb->context, inval_size);
4226 
4227  /* read the message */
4228  memcpy(change->data.inval.invalidations, data, inval_size);
4229 
4230  break;
4231  }
4233  {
4234  Snapshot oldsnap;
4235  Snapshot newsnap;
4236  Size size;
4237 
4238  oldsnap = (Snapshot) data;
4239 
4240  size = sizeof(SnapshotData) +
4241  sizeof(TransactionId) * oldsnap->xcnt +
4242  sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4243 
4244  change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4245 
4246  newsnap = change->data.snapshot;
4247 
4248  memcpy(newsnap, data, size);
 /* repoint the arrays into the trailing storage of the new allocation */
4249  newsnap->xip = (TransactionId *)
4250  (((char *) newsnap) + sizeof(SnapshotData));
4251  newsnap->subxip = newsnap->xip + newsnap->xcnt;
4252  newsnap->copied = true;
4253  break;
4254  }
4255  /* the base struct contains all the data, easy peasy */
4257  {
4258  Oid *relids;
4259 
4260  relids = ReorderBufferGetRelids(rb,
4261  change->data.truncate.nrelids);
4262  memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4263  change->data.truncate.relids = relids;
4264 
4265  break;
4266  }
4270  break;
4271  }
4272 
4273  dlist_push_tail(&txn->changes, &change->node);
4274  txn->nentries_mem++;
4275 
4276  /*
4277  * Update memory accounting for the restored change. We need to do this
4278  * although we don't check the memory limit when restoring the changes in
4279  * this branch (we only do that when initially queueing the changes after
4280  * decoding), because we will release the changes later, and that will
4281  * update the accounting too (subtracting the size from the counters). And
4282  * we don't want to underflow there.
4283  */
4284  ReorderBufferChangeMemoryUpdate(rb, change, true);
4285 }
4286 
4287 /*
4288  * Remove all on-disk stored data for the passed in transaction.
 *
 * Tries to unlink one file for every WAL segment number between the
 * segment of txn->first_lsn and the segment of txn->final_lsn, inclusive.
 * A file that does not exist (ENOENT) is ignored; any other unlink failure
 * raises ERROR.
 *
 * NOTE(review): the line that fills in "path" is not visible in this
 * listing.
4289  */
4290 static void
4292 {
4293  XLogSegNo first;
4294  XLogSegNo cur;
4295  XLogSegNo last;
4296 
4299 
4300  XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4301  XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4302 
4303  /* iterate over all possible filenames, and delete them */
4304  for (cur = first; cur <= last; cur++)
4305  {
4306  char path[MAXPGPATH];
4307 
4309  if (unlink(path) != 0 && errno != ENOENT)
4310  ereport(ERROR,
4312  errmsg("could not remove file \"%s\": %m", path)));
4313  }
4314 }
4315 
4316 /*
4317  * Remove any leftover serialized reorder buffers from a slot directory after a
4318  * prior crash or decoding session exit.
 *
 * Unlinks every entry of pg_replslot/<slotname> whose name starts with
 * "xid". Returns without doing anything when that path exists but is not a
 * directory. Directory read problems are reported at INFO level through
 * ReadDirExtended(); a failed unlink raises ERROR.
4319  */
4320 static void
4322 {
4323  DIR *spill_dir;
4324  struct dirent *spill_de;
4325  struct stat statbuf;
4326  char path[MAXPGPATH * 2 + 12];
4327 
 /* NOTE(review): unbounded sprintf; relies on path[] being large enough for any slotname */
4328  sprintf(path, "pg_replslot/%s", slotname);
4329 
4330  /* we're only handling directories here, skip if it's not ours */
4331  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4332  return;
4333 
4334  spill_dir = AllocateDir(path);
4335  while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4336  {
4337  /* only look at names that can be ours */
4338  if (strncmp(spill_de->d_name, "xid", 3) == 0)
4339  {
 /* path is reused here for the full name of the entry to delete */
4340  snprintf(path, sizeof(path),
4341  "pg_replslot/%s/%s", slotname,
4342  spill_de->d_name);
4343 
4344  if (unlink(path) != 0)
4345  ereport(ERROR,
4347  errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4348  path, slotname)));
4349  }
4350  }
4351  FreeDir(spill_dir);
4352 }
4353 
4354 /*
4355  * Given a replication slot, transaction ID and segment number, fill in the
4356  * corresponding spill file into 'path', which is a caller-owned buffer of size
4357  * at least MAXPGPATH.
 *
 * The file name has the form
 * pg_replslot/<slot>/xid-<xid>-lsn-<hi>-<lo>.spill, where <hi> and <lo> are
 * the upper and lower 32 bits, in hex, of the LSN at offset 0 of segment
 * 'segno'.
 *
 * NOTE(review): the argument supplying the slot name is not visible in
 * this listing.
4358  */
4359 static void
4361  XLogSegNo segno)
4362 {
4363  XLogRecPtr recptr;
4364 
 /* LSN of the first byte of the segment */
4365  XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4366 
4367  snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4369  xid,
4370  (uint32) (recptr >> 32), (uint32) recptr);
4371 }
4372 
4373 /*
4374  * Delete all data spilled to disk after we've restarted/crashed. It will be
4375  * recreated when the respective slots are reused.
 *
 * Scans the pg_replslot directory, skipping "." and ".." and every entry
 * whose name ReplicationSlotValidateName() rejects (rejections are logged
 * at DEBUG2).
 *
 * NOTE(review): the call made for each remaining entry is not visible in
 * this listing.
4376  */
4377 void
4379 {
4380  DIR *logical_dir;
4381  struct dirent *logical_de;
4382 
4383  logical_dir = AllocateDir("pg_replslot");
4384  while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4385  {
4386  if (strcmp(logical_de->d_name, ".") == 0 ||
4387  strcmp(logical_de->d_name, "..") == 0)
4388  continue;
4389 
4390  /* if it cannot be a slot, skip the directory */
4391  if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4392  continue;
4393 
4394  /*
4395  * ok, has to be a surviving logical slot, iterate and delete
4396  * everything starting with xid-*
4397  */
4399  }
4400  FreeDir(logical_dir);
4401 }
4402 
4403 /* ---------------------------------------
4404  * toast reassembly support
4405  * ---------------------------------------
4406  */
4407 
4408 /*
4409  * Initialize per tuple toast reconstruction support.
 *
 * Creates txn->toast_hash, which must not exist yet. Keys are Oids, entries
 * are ReorderBufferToastEnt structs, the table lives in rb->context and is
 * created with an initial size hint of 5.
 *
 * NOTE(review): the flags argument of hash_create() is not visible in this
 * listing.
4410  */
4411 static void
4413 {
4414  HASHCTL hash_ctl;
4415 
4416  Assert(txn->toast_hash == NULL);
4417 
4418  hash_ctl.keysize = sizeof(Oid);
4419  hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4420  hash_ctl.hcxt = rb->context;
4421  txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4423 }
4424 
4425 /*
4426  * Per toast-chunk handling for toast reconstruction
4427  *
4428  * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4429  * toasted Datum comes along.
 *
 * The chunk is taken from change->data.tp.newtuple: attribute 1 is read as
 * the chunk id, attribute 2 as the chunk sequence number and attribute 3
 * as the chunk data. Chunks are collected per chunk id in txn->toast_hash,
 * which is created on first use. Sequence numbers must start at 0 and
 * increase by one per chunk; anything else raises ERROR, as does chunk
 * data that is neither a plain nor a short-header varlena. The change
 * itself is linked onto the entry's chunk list.
4430  */
4431 static void
4433  Relation relation, ReorderBufferChange *change)
4434 {
4435  ReorderBufferToastEnt *ent;
4436  ReorderBufferTupleBuf *newtup;
4437  bool found;
4438  int32 chunksize;
4439  bool isnull;
4440  Pointer chunk;
4441  TupleDesc desc = RelationGetDescr(relation);
4442  Oid chunk_id;
4443  int32 chunk_seq;
4444 
4445  if (txn->toast_hash == NULL)
4446  ReorderBufferToastInitHash(rb, txn);
4447 
4448  Assert(IsToastRelation(relation));
4449 
4450  newtup = change->data.tp.newtuple;
4451  chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull));
4452  Assert(!isnull);
4453  chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull));
4454  Assert(!isnull);
4455 
4456  ent = (ReorderBufferToastEnt *)
4457  hash_search(txn->toast_hash,
4458  (void *) &chunk_id,
4459  HASH_ENTER,
4460  &found);
4461 
4462  if (!found)
4463  {
 /* first chunk seen for this chunk id: initialize the new entry */
4464  Assert(ent->chunk_id == chunk_id);
4465  ent->num_chunks = 0;
4466  ent->last_chunk_seq = 0;
4467  ent->size = 0;
4468  ent->reconstructed = NULL;
4469  dlist_init(&ent->chunks);
4470 
4471  if (chunk_seq != 0)
4472  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4473  chunk_seq, chunk_id);
4474  }
4475  else if (found && chunk_seq != ent->last_chunk_seq + 1)
4476  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4477  chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4478 
4479  chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull));
4480  Assert(!isnull);
4481 
4482  /* calculate size so we can allocate the right size at once later */
4483  if (!VARATT_IS_EXTENDED(chunk))
4484  chunksize = VARSIZE(chunk) - VARHDRSZ;
4485  else if (VARATT_IS_SHORT(chunk))
4486  /* could happen due to heap_form_tuple doing its thing */
4487  chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4488  else
4489  elog(ERROR, "unexpected type of toast chunk");
4490 
4491  ent->size += chunksize;
4492  ent->last_chunk_seq = chunk_seq;
4493  ent->num_chunks++;
4494  dlist_push_tail(&ent->chunks, &change->node);
4495 }
4496 
4497 /*
4498  * Rejigger change->newtuple to point to in-memory toast tuples instead of
4499  * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
4500  *
4501  * We cannot replace unchanged toast tuples though, so those will still point
4502  * to on-disk toast data.
4503  *
4504  * While updating the existing change with detoasted tuple data, we need to
4505  * update the memory accounting info, because the change size will differ.
4506  * Otherwise the accounting may get out of sync, triggering serialization
4507  * at unexpected times.
4508  *
4509  * We simply subtract size of the change before rejiggering the tuple, and
4510  * then adding the new size. This makes it look like the change was removed
4511  * and then added back, except it only tweaks the accounting info.
4512  *
4513  * In particular it can't trigger serialization, which would be pointless
4514  * anyway as it happens during commit processing right before handing
4515  * the change to the output plugin.
4516  */
4517 static void
4519  Relation relation, ReorderBufferChange *change)
4520 {
4521  TupleDesc desc;
4522  int natt;
4523  Datum *attrs;
4524  bool *isnull;
4525  bool *free;
4526  HeapTuple tmphtup;
4527  Relation toast_rel;
4528  TupleDesc toast_desc;
4529  MemoryContext oldcontext;
4530  ReorderBufferTupleBuf *newtup;
4531 
4532  /* no toast tuples changed */
4533  if (txn->toast_hash == NULL)
4534  return;
4535 
4536  /*
4537  * We're going to modify the size of the change, so to make sure the
4538  * accounting is correct we'll make it look like we're removing the change
4539  * now (with the old size), and then re-add it at the end.
4540  */
4541  ReorderBufferChangeMemoryUpdate(rb, change, false);
4542 
4543  oldcontext = MemoryContextSwitchTo(rb->context);
4544 
4545  /* we should only have toast tuples in an INSERT or UPDATE */
4546  Assert(change->data.tp.newtuple);
4547 
4548  desc = RelationGetDescr(relation);
4549 
4550  toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4551  if (!RelationIsValid(toast_rel))
4552  elog(ERROR, "could not open relation with OID %u",
4553  relation->rd_rel->reltoastrelid);
4554 
4555  toast_desc = RelationGetDescr(toast_rel);
4556 
4557  /* should we allocate from stack instead? */
4558  attrs = palloc0(sizeof(Datum) * desc->natts);
4559  isnull = palloc0(sizeof(bool) * desc->natts);
4560  free = palloc0(sizeof(bool) * desc->natts);
4561 
4562  newtup = change->data.tp.newtuple;
4563 
4564  heap_deform_tuple(&newtup->tuple, desc, attrs, isnull);
4565 
4566  for (natt = 0; natt < desc->natts; natt++)
4567  {
4568  Form_pg_attribute attr = TupleDescAttr(desc, natt);
4569  ReorderBufferToastEnt *ent;
4570  struct varlena *varlena;
4571 
4572  /* va_rawsize is the size of the original datum -- including header */
4573  struct varatt_external toast_pointer;
4574  struct varatt_indirect redirect_pointer;
4575  struct varlena *new_datum = NULL;
4576  struct varlena *reconstructed;
4577  dlist_iter it;
4578  Size data_done = 0;
4579 
4580  /* system columns aren't toasted */
4581  if (attr->attnum < 0)
4582  continue;
4583 
4584  if (attr->attisdropped)
4585  continue;
4586 
4587  /* not a varlena datatype */
4588  if (attr->attlen != -1)
4589  continue;
4590 
4591  /* no data */
4592  if (isnull[natt])
4593  continue;
4594 
4595  /* ok, we know we have a toast datum */
4596  varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4597 
4598  /* no need to do anything if the tuple isn't external */
4599  if (!VARATT_IS_EXTERNAL(varlena))
4600  continue;
4601 
4602  VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4603 
4604  /*
4605  * Check whether the toast tuple changed, replace if so.
4606  */
4607  ent = (ReorderBufferToastEnt *)
4608  hash_search(txn->toast_hash,
4609  (void *) &toast_pointer.va_valueid,
4610  HASH_FIND,
4611  NULL);
4612  if (ent == NULL)
4613  continue;
4614 
4615  new_datum =
4616  (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
4617 
4618  free[natt] = true;
4619 
4620  reconstructed = palloc0(toast_pointer.va_rawsize);
4621 
4622  ent->reconstructed = reconstructed;
4623 
4624  /* stitch toast tuple back together from its parts */
4625  dlist_foreach(it, &ent->chunks)
4626  {
4627  bool isnull;
4628  ReorderBufferChange *cchange;
4629  ReorderBufferTupleBuf *ctup;
4630  Pointer chunk;
4631 
4632  cchange = dlist_container(ReorderBufferChange, node, it.cur);
4633  ctup = cchange->data.tp.newtuple;
4634  chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull));
4635 
4636  Assert(!isnull);
4637  Assert(!VARATT_IS_EXTERNAL(chunk));
4638  Assert(!VARATT_IS_SHORT(chunk));
4639 
4640  memcpy(VARDATA(reconstructed) + data_done,
4641  VARDATA(chunk),
4642  VARSIZE(chunk) - VARHDRSZ);
4643  data_done += VARSIZE(chunk) - VARHDRSZ;
4644  }
4645  Assert(data_done == toast_pointer.va_extsize);
4646 
4647  /* make sure its marked as compressed or not */
4648  if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4649  SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4650  else
4651  SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4652 
4653  memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4654  redirect_pointer.pointer = reconstructed;
4655 
4657  memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4658  sizeof(redirect_pointer));
4659 
4660  attrs[natt] = PointerGetDatum(new_datum);
4661  }
4662 
4663  /*
4664  * Build tuple in separate memory & copy tuple back into the tuplebuf
4665  * passed to the output plugin. We can't directly heap_fill_tuple() into
4666  * the tuplebuf because attrs[] will point back into the current content.
4667  */
4668  tmphtup = heap_form_tuple(desc, attrs, isnull);
4669  Assert(newtup->tuple.t_len <= MaxHeapTupleSize);
4670  Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data);
4671 
4672  memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len);
4673  newtup->tuple.t_len = tmphtup->t_len;
4674 
4675  /*
4676  * free resources we won't further need, more persistent stuff will be
4677  * free'd in ReorderBufferToastReset().
4678  */
4679  RelationClose(toast_rel);
4680  pfree(tmphtup);
4681  for (natt = 0; natt < desc->natts; natt++)
4682  {
4683  if (free[natt])
4684  pfree(DatumGetPointer(attrs[natt]));
4685  }
4686  pfree(attrs);
4687  pfree(free);
4688  pfree(isnull);
4689 
4690  MemoryContextSwitchTo(oldcontext);
4691 
4692  /* now add the change back, with the correct size */
4693  ReorderBufferChangeMemoryUpdate(rb, change, true);
4694 }
4695 
4696 /*
4697  * Free all resources allocated for toast reconstruction.
4698  */
     /*
      * Walks every entry of txn->toast_hash: pfree's the entry's reconstructed
      * buffer when one was set, unlinks each queued chunk change from the
      * entry's list and hands it to ReorderBufferReturnChange(), then destroys
      * the hash table and resets txn->toast_hash to NULL. Does nothing when the
      * transaction has no toast hash.
      *
      * NOTE(review): this listing skips original line 4700, so the function
      * name and parameter list are not visible; the body uses rb and txn.
      */
4699 static void
4701 {
4702  HASH_SEQ_STATUS hstat;
4703  ReorderBufferToastEnt *ent;
4704 
4705  if (txn->toast_hash == NULL)
4706  return;
4707 
4708  /* sequentially walk over the hash and free everything */
4709  hash_seq_init(&hstat, txn->toast_hash);
4710  while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4711  {
4712  dlist_mutable_iter it;
4713 
      /* reconstructed is only set once a value has been stitched together */
4714  if (ent->reconstructed != NULL)
4715  pfree(ent->reconstructed);
4716 
      /* the _modify iterator is used because nodes are deleted while iterating */
4717  dlist_foreach_modify(it, &ent->chunks)
4718  {
      /*
       * NOTE(review): this listing skips original line 4720, which holds
       * the initializer of change -- presumably a dlist_container()
       * lookup on the iterator; confirm against the upstream file.
       */
4719  ReorderBufferChange *change =
4721 
4722  dlist_delete(&change->node);
4723  ReorderBufferReturnChange(rb, change, true);
4724  }
4725  }
4726 
      /* drop the table itself and mark the transaction as having no toast state */
4727  hash_destroy(txn->toast_hash);
4728  txn->toast_hash = NULL;
4729 }
4730 
4731 
4732 /* ---------------------------------------
4733  * Visibility support for logical decoding
4734  *
4735  *
4736  * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4737  * always rely on stored cmin/cmax values because of two scenarios:
4738  *
4739  * * A tuple got changed multiple times during a single transaction and thus
4740  * has got a combocid. Combocids are only valid for the duration of a
4741  * single transaction.
4742  * * A tuple with a cmin but no cmax (and thus no combocid) got
4743  * deleted/updated in another transaction than the one which created it
4744  * which we are looking at right now. As only one of cmin, cmax or combocid
4745  * is actually stored in the heap we don't have access to the value we
4746  * need anymore.
4747  *
4748  * To resolve those problems we have a per-transaction hash of (cmin,
4749  * cmax) tuples keyed by (relfilenode, ctid) which contains the actual
4750  * (cmin, cmax) values. That also takes care of combocids by simply
4751  * not caring about them at all. As we have the real cmin/cmax values
4752  * combocids aren't interesting.
4753  *
4754  * As we only care about catalog tuples here the overhead of this
4755  * hashtable should be acceptable.
4756  *
4757  * Heap rewrites complicate this a bit, check rewriteheap.c for
4758  * details.
4759  * -------------------------------------------------------------------------
4760  */
4761 
4762 /* struct for sorting mapping files by LSN efficiently */
     /*
      * One instance describes a single mapping file queued for application:
      * its file name plus the LSN used as the sort key.
      *
      * NOTE(review): this listing skips original lines 4765 and 4767. Code
      * elsewhere in this file reads and writes an "lsn" member (f->lsn,
      * a->lsn), so line 4765 presumably declares it, and line 4767 presumably
      * closes the typedef -- confirm against the upstream file.
      */
4763 typedef struct RewriteMappingFile
4764 {
      /* name of the mapping file; filled from a directory entry's d_name */
4766  char fname[MAXPGPATH];
4768 
4769 #ifdef NOT_USED
     /*
      * Debugging aid, compiled only when NOT_USED is defined: logs every entry
      * of tuplecid_data at DEBUG3 level.
      *
      * NOTE(review): this listing skips original lines 4774 and 4783-4784.
      * Line 4774 presumably declares ent. The format string has seven %u
      * conversions but only five arguments are visible, so the two missing
      * lines presumably supply the "tid" values -- confirm against upstream.
      */
4770 static void
4771 DisplayMapping(HTAB *tuplecid_data)
4772 {
4773  HASH_SEQ_STATUS hstat;
4775 
4776  hash_seq_init(&hstat, tuplecid_data);
4777  while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
4778  {
4779  elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
4780  ent->key.relnode.dbNode,
4781  ent->key.relnode.spcNode,
4782  ent->key.relnode.relNode,
4785  ent->cmin,
4786  ent->cmax
4787  );
4788  }
4789 }
4790 #endif
4791 
4792 /*
4793  * Apply a single mapping file to tuplecid_data.
4794  *
4795  * The mapping file has to have been verified to be a) committed b) for our
4796  * transaction c) applied in LSN order.
4797  */
     /*
      * The file "pg_logical/mappings/<fname>" is read as a sequence of
      * fixed-size LogicalRewriteMappingData records. For each record whose old
      * (node, tid) already has an entry in tuplecid_data, an entry for the new
      * (node, tid) is created and given the old entry's cmin, cmax and
      * combocid. Records without an existing old entry are ignored.
      *
      * Raises ERROR if the file cannot be opened, read, or closed, or if a
      * read returns a partial record.
      *
      * relid is not referenced in the lines visible here.
      *
      * NOTE(review): this listing skips original lines 4804, 4810, 4815-4816,
      * 4824, 4826, 4830, 4837 and 4888. The declarations of map, key and ent,
      * which the visible code uses, must be among them -- confirm against the
      * upstream file.
      */
4798 static void
4799 ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
4800 {
4801  char path[MAXPGPATH];
4802  int fd;
4803  int readBytes;
4805 
4806  sprintf(path, "pg_logical/mappings/%s", fname);
4807  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
4808  if (fd < 0)
4809  ereport(ERROR,
4811  errmsg("could not open file \"%s\": %m", path)));
4812 
      /* one iteration per record; the loop ends on a zero-length read */
4813  while (true)
4814  {
4817  ReorderBufferTupleCidEnt *new_ent;
4818  bool found;
4819 
4820  /* be careful about padding */
4821  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
4822 
4823  /* read all mappings till the end of the file */
4825  readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
4827 
4828  if (readBytes < 0)
4829  ereport(ERROR,
4831  errmsg("could not read file \"%s\": %m",
4832  path)));
4833  else if (readBytes == 0) /* EOF */
4834  break;
4835  else if (readBytes != sizeof(LogicalRewriteMappingData))
4836  ereport(ERROR,
4838  errmsg("could not read from file \"%s\": read %d instead of %d bytes",
4839  path, readBytes,
4840  (int32) sizeof(LogicalRewriteMappingData))));
4841 
      /* first look the tuple up under its pre-rewrite location */
4842  key.relnode = map.old_node;
4843  ItemPointerCopy(&map.old_tid,
4844  &key.tid);
4845 
4846 
4847  ent = (ReorderBufferTupleCidEnt *)
4848  hash_search(tuplecid_data,
4849  (void *) &key,
4850  HASH_FIND,
4851  NULL);
4852 
4853  /* no existing mapping, no need to update */
4854  if (!ent)
4855  continue;
4856 
      /* reuse key for the post-rewrite location and create or find its entry */
4857  key.relnode = map.new_node;
4858  ItemPointerCopy(&map.new_tid,
4859  &key.tid);
4860 
4861  new_ent = (ReorderBufferTupleCidEnt *)
4862  hash_search(tuplecid_data,
4863  (void *) &key,
4864  HASH_ENTER,
4865  &found);
4866 
4867  if (found)
4868  {
4869  /*
4870  * Make sure the existing mapping makes sense. We sometimes update
4871  * old records that did not yet have a cmax (e.g. pg_class' own
4872  * entry while rewriting it) during rewrites, so allow that.
4873  */
4874  Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
4875  Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
4876  }
4877  else
4878  {
4879  /* update mapping */
4880  new_ent->cmin = ent->cmin;
4881  new_ent->cmax = ent->cmax;
4882  new_ent->combocid = ent->combocid;
4883  }
4884  }
4885 
4886  if (CloseTransientFile(fd) != 0)
4887  ereport(ERROR,
4889  errmsg("could not close file \"%s\": %m", path)));
4890 }
4891 
4892 
4893 /*
4894  * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
4895  */
     /*
      * Returns true when bsearch() finds xid among the num elements of xip,
      * comparing with xidComparator; the array therefore has to be sorted in
      * the order xidComparator defines.
      *
      * NOTE(review): this listing skips original line 4897 (the function name
      * and parameter list). The cross-reference index at the end of this
      * listing gives the signature as
      * TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num).
      */
4896 static bool
4898 {
4899  return bsearch(&xid, xip, num,
4900  sizeof(TransactionId), xidComparator) != NULL;
4901 }
4902 
4903 /*
4904  * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
4905  */
     /*
      * Returns -1, 1 or 0 when a's lsn is respectively lower than, higher
      * than, or equal to b's lsn, which yields ascending LSN order.
      *
      * NOTE(review): this listing skips original lines 4909-4910, which
      * presumably declare a and b by extracting the RewriteMappingFile
      * pointers from the two list cells -- confirm against the upstream file.
      */
4906 static int
4907 file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
4908 {
4911 
4912  if (a->lsn < b->lsn)
4913  return -1;
4914  else if (a->lsn > b->lsn)
4915  return 1;
4916  return 0;
4917 }
4918 
4919 /*
4920  * Apply any existing logical remapping files if there are any targeted at our
4921  * transaction for relid.
4922  */
     /*
      * Scans the directory "pg_logical/mappings", keeps the "map-" files that
      * match our database and relid, were created by a committed transaction,
      * and whose mapped xid is found in snapshot->subxip. The kept files are
      * sorted by LSN and applied one by one with ApplyLogicalMappingFile().
      *
      * Raises ERROR when a "map-" file name cannot be parsed.
      *
      * NOTE(review): this listing skips original line 4924 (function name and
      * parameter list). The body uses tuplecid_data, relid and snapshot, and
      * the visible call site passes (tuplecid_data, htup->t_tableOid,
      * snapshot). Original line 4988, presumably the declaration of f inside
      * the final loop, is also missing -- confirm against the upstream file.
      */
4923 static void
4925 {
4926  DIR *mapping_dir;
4927  struct dirent *mapping_de;
4928  List *files = NIL;
4929  ListCell *file;
      /* shared relations are matched against InvalidOid, others against our database */
4930  Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
4931 
4932  mapping_dir = AllocateDir("pg_logical/mappings");
4933  while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
4934  {
4935  Oid f_dboid;
4936  Oid f_relid;
4937  TransactionId f_mapped_xid;
4938  TransactionId f_create_xid;
4939  XLogRecPtr f_lsn;
4940  uint32 f_hi,
4941  f_lo;
4942  RewriteMappingFile *f;
4943 
4944  if (strcmp(mapping_de->d_name, ".") == 0 ||
4945  strcmp(mapping_de->d_name, "..") == 0)
4946  continue;
4947 
4948  /* Ignore files that aren't ours */
4949  if (strncmp(mapping_de->d_name, "map-", 4) != 0)
4950  continue;
4951 
      /* the name must yield all six fields, otherwise it is treated as corrupt */
4952  if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
4953  &f_dboid, &f_relid, &f_hi, &f_lo,
4954  &f_mapped_xid, &f_create_xid) != 6)
4955  elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
4956 
      /* combine the two 32-bit halves from the file name into one 64-bit LSN */
4957  f_lsn = ((uint64) f_hi) << 32 | f_lo;
4958 
4959  /* mapping for another database */
4960  if (f_dboid != dboid)
4961  continue;
4962 
4963  /* mapping for another relation */
4964  if (f_relid != relid)
4965  continue;
4966 
4967  /* did the creating transaction abort? */
4968  if (!TransactionIdDidCommit(f_create_xid))
4969  continue;
4970 
4971  /* not for our transaction */
4972  if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
4973  continue;
4974 
4975  /* ok, relevant, queue for apply */
4976  f = palloc(sizeof(RewriteMappingFile));
4977  f->lsn = f_lsn;
      /*
       * NOTE(review): unbounded copy into fname[MAXPGPATH]; presumably safe
       * because directory entry names are shorter than that -- confirm.
       */
4978  strcpy(f->fname, mapping_de->d_name);
4979  files = lappend(files, f);
4980  }
4981  FreeDir(mapping_dir);
4982 
4983  /* sort files so we apply them in LSN order */
4984  list_sort(files, file_sort_by_lsn);
4985 
      /* apply each queued file, then release its descriptor */
4986  foreach(file, files)
4987  {
4989 
4990  elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
4991  snapshot->subxip[0]);
4992  ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
4993  pfree(f);
4994  }
4995 }
4996 
4997 /*
4998  * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
4999  * combocids.
5000  */
     /*
      * Returns true when an entry for the tuple is found in tuplecid_data; in
      * that case *cmin and *cmax are filled in, each only if its pointer is
      * non-NULL. Returns false when tuplecid_data is NULL, or when no entry
      * exists even after applying the logical mapping files once.
      *
      * The lookup key is built from the relfilenode reported by
      * BufferGetTag() for buffer plus the tuple's t_self.
      *
      * NOTE(review): this listing skips original line 5002 (function name and
      * leading parameter; the body reads tuplecid_data, which is presumably
      * that parameter) and lines 5007-5008 (presumably the declarations of
      * key and ent) -- confirm against the upstream file.
      */
5001 bool
5003  Snapshot snapshot,
5004  HeapTuple htup, Buffer buffer,
5005  CommandId *cmin, CommandId *cmax)
5006 {
5009  ForkNumber forkno;
5010  BlockNumber blockno;
      /* set once the mapping files have been applied, so that happens at most once */
5011  bool updated_mapping = false;
5012 
5013  /*
5014  * Return unresolved if tuplecid_data is not valid. That's because when
5015  * streaming in-progress transactions we may run into tuples with the CID
5016  * before actually decoding them. Think e.g. about INSERT followed by
5017  * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5018  * INSERT. So in such cases, we assume the CID is from the future
5019  * command.
5020  */
5021  if (tuplecid_data == NULL)
5022  return false;
5023 
5024  /* be careful about padding */
5025  memset(&key, 0, sizeof(key));
5026 
5027  Assert(!BufferIsLocal(buffer));
5028 
5029  /*
5030  * get relfilenode from the buffer, no convenient way to access it other
5031  * than that.
5032  */
5033  BufferGetTag(buffer, &key.relnode, &forkno, &blockno);
5034 
5035  /* tuples can only be in the main fork */
5036  Assert(forkno == MAIN_FORKNUM);
5037  Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5038 
5039  ItemPointerCopy(&htup->t_self,
5040  &key.tid);
5041 
      /* re-entered at most once, after the mapping files have been applied */
5042 restart:
5043  ent = (ReorderBufferTupleCidEnt *)
5044  hash_search(tuplecid_data,
5045  (void *) &key,
5046  HASH_FIND,
5047  NULL);
5048 
5049  /*
5050  * failed to find a mapping, check whether the table was rewritten and
5051  * apply mapping if so, but only do that once - there can be no new
5052  * mappings while we are in here since we have to hold a lock on the
5053  * relation.
5054  */
5055  if (ent == NULL && !updated_mapping)
5056  {
5057  UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5058  /* now check but don't update for a mapping again */
5059  updated_mapping = true;
5060  goto restart;
5061  }
5062  else if (ent == NULL)
5063  return false;
5064 
      /* both output pointers are optional */
5065  if (cmin)
5066  *cmin = ent->cmin;
5067  if (cmax)
5068  *cmax = ent->cmax;
5069  return true;
5070 }
static void ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
XLogRecPtr first_lsn
bool ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
#define NIL
Definition: pg_list.h:65
uint32 CommandId
Definition: c.h:589
TimestampTz commit_time
struct ReorderBufferToastEnt ReorderBufferToastEnt
struct TXNEntryFile TXNEntryFile
void AbortCurrentTransaction(void)
Definition: xact.c:3212
ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]
#define SizeofHeapTupleHeader
Definition: htup_details.h:184
bool IsToastRelation(Relation relation)
Definition: catalog.c:138
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:862
void ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, Snapshot snapshot, XLogRecPtr lsn, bool transactional, const char *prefix, Size message_size, const char *message)
#define relpathperm(rnode, forknum)
Definition: relpath.h:83
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
Snapshot base_snapshot
ReorderBufferApplyChangeCB apply_change
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:212
#define AllocSetContextCreate
Definition: memutils.h:170
HeapTupleData * HeapTuple
Definition: htup.h:71
#define rbtxn_prepared(txn)
void * private_data
dlist_node base_snapshot_node
#define DEBUG1
Definition: elog.h:25
dlist_node * cur
Definition: ilist.h:180
static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, Relation relation, ReorderBufferChange *change)
RepOriginId origin_id
void StartupReorderBuffer(void)
#define VARDATA(PTR)
Definition: postgres.h:302
static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
#define fastgetattr(tup, attnum, tupleDesc, isnull)
Definition: htup_details.h:712
static int32 next
Definition: blutils.c:219
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1465
#define RBTXN_HAS_TOAST_INSERT
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define HASH_CONTEXT
Definition: hsearch.h:102
struct ReorderBufferChange::@99::@101 truncate
#define HASH_ELEM
Definition: hsearch.h:95
int wal_segment_size
Definition: xlog.c:118
void ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:524
uint32 TransactionId
Definition: c.h:575
void ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, TXNEntryFile *file, XLogSegNo *segno)
bool copied
Definition: snapshot.h:185
bool SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
Definition: snapbuild.c:404
void ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn, TimestampTz commit_time, RepOriginId origin_id, XLogRecPtr origin_lsn)
#define RBTXN_IS_STREAMED
MemoryContext hcxt
Definition: hsearch.h:86
#define DatumGetInt32(X)
Definition: postgres.h:472
int sqlerrcode
Definition: elog.h:378
#define RelationGetDescr(relation)
Definition: rel.h:483
static ReorderBufferTXN * ReorderBufferLargestTXN(ReorderBuffer *rb)
static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, ReorderBufferTXN *subtxn)
#define DEBUG3
Definition: elog.h:23
SnapBuildState SnapBuildCurrentState(SnapBuild *builder)
Definition: snapbuild.c:395
#define write(a, b, c)
Definition: win32.h:14
#define VARHDRSZ_SHORT
Definition: postgres.h:268
TransactionId by_txn_last_xid
#define VARSIZE(PTR)
Definition: postgres.h:303
ErrorData * CopyErrorData(void)
Definition: elog.c:1565
#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr)
Definition: detoast.h:32
int64 TimestampTz
Definition: timestamp.h:39
struct ReorderBufferChange::@99::@102 msg
#define PointerGetDatum(X)
Definition: postgres.h:556
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
ReorderBufferTXN * txn
static bool TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
#define VARHDRSZ
Definition: c.h:615
#define dlist_foreach(iter, lhead)
Definition: ilist.h:507
ReorderBufferStreamAbortCB stream_abort
XLogRecPtr current_restart_decoding_lsn
#define DatumGetObjectId(X)
Definition: postgres.h:500
char * pstrdup(const char *in)
Definition: mcxt.c:1187
#define rbtxn_has_incomplete_tuple(txn)
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2696
static ReorderBufferTXN * ReorderBufferGetTXN(ReorderBuffer *rb)
static void ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, Snapshot snapshot_now, CommandId command_id, XLogRecPtr last_lsn, ReorderBufferChange *specinsert)
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition: ilist.h:317
struct ReorderBufferTXN * txn
Definition: reorderbuffer.h:87
Oid RelidByRelfilenode(Oid reltablespace, Oid relfilenode)
struct ReorderBufferChange::@99::@103 tuplecid
void ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
void ReorderBufferFree(ReorderBuffer *rb)
#define rbtxn_is_serialized_clear(txn)
uint16 RepOriginId
Definition: xlogdefs.h:58
Size entrysize
Definition: hsearch.h:76
CommandId command_id
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:338
Snapshot snapshot_now
struct cursor * cur
Definition: ecpg.c:28
char fname[MAXPGPATH]
int32 va_rawsize
Definition: postgres.h:69
bool IsTransactionOrTransactionBlock(void)
Definition: xact.c:4703
void ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn, XLogRecPtr end_lsn, TimestampTz commit_time, RepOriginId origin_id, XLogRecPtr origin_lsn, char *gid, bool is_commit)
#define INFO
Definition: elog.h:33
MemoryContext SlabContextCreate(MemoryContext parent, const char *name, Size blockSize, Size chunkSize)
Definition: slab.c:174
void binaryheap_replace_first(binaryheap *heap, Datum d)
Definition: binaryheap.c:204
void ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, RelFileNode node, ItemPointerData tid, CommandId cmin, CommandId cmax, CommandId combocid)
static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
uint32 BlockNumber
Definition: block.h:31
void TeardownHistoricSnapshot(bool is_error)
Definition: snapmgr.c:2047
ReorderBufferCommitCB commit
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: heaptuple.c:1020
ReorderBufferChange * ReorderBufferGetChange(ReorderBuffer *rb)
#define RelationIsLogicallyLogged(relation)
Definition: rel.h:636
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:954
static int64 files
Definition: pg_checksums.c:34
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:125
ReplicationSlotPersistentData data
Definition: slot.h:143
struct ReorderBufferTupleCidKey ReorderBufferTupleCidKey
struct SnapshotData * Snapshot
Definition: snapshot.h:121
Form_pg_class rd_rel
Definition: rel.h:110
unsigned int Oid
Definition: postgres_ext.h:31
XLogRecPtr base_snapshot_lsn
ReorderBufferStreamCommitCB stream_commit
Definition: dirent.h:9
uint32 regd_count
Definition: snapshot.h:205
enum ReorderBufferChangeType action
Definition: reorderbuffer.h:84
void binaryheap_add_unordered(binaryheap *heap, Datum d)
Definition: binaryheap.c:110
MemoryContext change_context
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define VARDATA_EXTERNAL(PTR)
Definition: postgres.h:310
#define PG_BINARY
Definition: c.h:1259
void FlushErrorState(void)
Definition: elog.c:1659
XLogRecPtr origin_lsn
static void ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
signed int int32
Definition: c.h:417
#define FirstCommandId
Definition: c.h:591
XLogRecPtr EndRecPtr
Definition: xlogreader.h:176
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
void ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
#define RBTXN_SKIPPED_PREPARE
HeapTupleHeader t_data
Definition: htup.h:68
#define VARATT_IS_EXTERNAL(PTR)
Definition: postgres.h:313
#define sprintf
Definition: port.h:217
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:174
#define rbtxn_is_streamed(txn)
static dlist_node * dlist_next_node(dlist_head *head, dlist_node *node)
Definition: ilist.h:421
Definition: dynahash.c:219
ReorderBufferStreamMessageCB stream_message
#define RBTXN_HAS_CATALOG_CHANGES
#define ReorderBufferTupleBufData(p)
Definition: reorderbuffer.h:38
#define dlist_container(type, membername, ptr)
Definition: ilist.h:477
void pfree(void *pointer)
Definition: mcxt.c:1057
char * Pointer
Definition: c.h:406
void FreeErrorData(ErrorData *edata)
Definition: elog.c:1621
static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
Definition: dirent.c:25
#define ERROR
Definition: elog.h:45
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2404
static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
#define rbtxn_has_toast_insert(txn)
#define SLAB_LARGE_BLOCK_SIZE
Definition: memutils.h:222
#define VARATT_IS_SHORT(PTR)
Definition: postgres.h:326
#define RelationIsValid(relation)
Definition: rel.h:430
dlist_head changes
void ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
dlist_head txns_by_base_snapshot_lsn
Datum binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:159
#define IsSpecInsert(action)
ReorderBufferStreamPrepareCB stream_prepare
#define MAXPGPATH
ItemPointerData t_self
Definition: htup.h:65
ReorderBufferTupleCidKey key
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:192
#define DEBUG2
Definition: elog.h:24
static bool ReorderBufferCanStream(ReorderBuffer *rb)
TransactionId GetCurrentTransactionId(void)
Definition: xact.c:438
void ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, SharedInvalidationMessage *invalidations)
uint32 t_len
Definition: htup.h:64
#define MaxHeapTupleSize
Definition: htup_details.h:560
void ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
struct varlena * reconstructed
void RollbackAndReleaseCurrentSubTransaction(void)
Definition: xact.c:4514
#define SET_VARTAG_EXTERNAL(PTR, tag)
Definition: postgres.h:333
uint64 XLogSegNo
Definition: xlogdefs.h:41
int errcode_for_file_access(void)
Definition: elog.c:727
HeapTupleData tuple
Definition: reorderbuffer.h:29
struct SnapshotData SnapshotData
TransactionId GetCurrentTransactionIdIfAny(void)
Definition: xact.c:455
static ReorderBufferTXN * ReorderBufferLargestTopTXN(ReorderBuffer *rb)
static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, XLogSegNo segno)
#define InvalidTransactionId
Definition: transam.h:31
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:349
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:193
bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
unsigned int uint32
Definition: c.h:429
XLogRecPtr final_lsn
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2615
#define RBTXN_IS_SUBXACT
Oid t_tableOid
Definition: htup.h:66
static int file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
ReorderBufferBeginCB begin_prepare
struct ReorderBufferTXN * toptxn
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1512
void RelationClose(Relation relation)
Definition: relcache.c:2123
TransactionId xmin
Definition: snapshot.h:157
MemoryContext CurrentMemoryContext
Definition: mcxt.c:38
static void dlist_delete(dlist_node *node)
Definition: ilist.h:358
#define INDIRECT_POINTER_SIZE
Definition: detoast.h:44
ReorderBufferMessageCB message
ReorderBufferStreamChangeCB stream_change
#define AssertArg(condition)
Definition: c.h:794
int bh_size
Definition: binaryheap.h:32
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:300
TransactionId * xip
Definition: snapshot.h:168
static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, ReorderBufferTXN *txn, CommandId cid)
#define VARSIZE_SHORT(PTR)
Definition: postgres.h:305
ForkNumber
Definition: relpath.h:40
RepOriginId origin_id
Definition: reorderbuffer.h:89
ReorderBufferStreamTruncateCB stream_truncate
List * lappend(List *list, void *datum)
Definition: list.c:321
#define rbtxn_has_spec_insert(txn)
static HTAB * tuplecid_data
Definition: snapmgr.c:116
int CloseTransientFile(int fd)
Definition: fd.c:2581
static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
TransactionId CheckXidAlive
Definition: xact.c:95
struct ReorderBufferChange::@99::@104 inval
struct ReorderBufferTupleCidEnt ReorderBufferTupleCidEnt
void ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid, TransactionId subxid, XLogRecPtr lsn)
static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, char *change)
Definition: