PostgreSQL Source Code  git master
reorderbuffer.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * reorderbuffer.c
4  * PostgreSQL logical replay/reorder buffer management
5  *
6  *
7  * Copyright (c) 2012-2024, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/logical/reorderbuffer.c
12  *
13  * NOTES
14  * This module gets handed individual pieces of transactions in the order
15  * they are written to the WAL and is responsible for reassembling them into
16  * toplevel transaction sized pieces. When a transaction is completely
17  * reassembled - signaled by reading the transaction commit record - it
18  * will then call the output plugin (cf. ReorderBufferCommit()) with the
19  * individual changes. The output plugins rely on snapshots built by
20  * snapbuild.c which hands them to us.
21  *
22  * Transactions and subtransactions/savepoints in postgres are not
23  * immediately linked to each other from outside the performing
24  * backend. Only at commit/abort (or special xact_assignment records) they
25  * are linked together. Which means that we will have to splice together a
26  * toplevel transaction from its subtransactions. To do that efficiently we
27  * build a binary heap indexed by the smallest current lsn of the individual
28  * subtransactions' changestreams. As the individual streams are inherently
29  * ordered by LSN - since that is where we build them from - the transaction
30  * can easily be reassembled by always using the subtransaction with the
31  * smallest current LSN from the heap.
32  *
33  * In order to cope with large transactions - which can be several times as
34  * big as the available memory - this module supports spooling the contents
35  * of large transactions to disk. When the transaction is replayed the
36  * contents of individual (sub-)transactions will be read from disk in
37  * chunks.
38  *
39  * This module also has to deal with reassembling toast records from the
40  * individual chunks stored in WAL. When a new (or initial) version of a
41  * tuple is stored in WAL it will always be preceded by the toast chunks
42  * emitted for the columns stored out of line. Within a single toplevel
43  * transaction there will be no other data carrying records between a row's
44  * toast chunks and the row data itself. See ReorderBufferToast* for
45  * details.
46  *
47  * ReorderBuffer uses two special memory context types - SlabContext for
48  * allocations of fixed-length structures (changes and transactions), and
49  * GenerationContext for the variable-length transaction data (allocated
50  * and freed in groups with similar lifespans).
51  *
52  * To limit the amount of memory used by decoded changes, we track memory
53  * used at the reorder buffer level (i.e. total amount of memory), and for
54  * each transaction. When the total amount of used memory exceeds the
55  * limit, the transaction consuming the most memory is then serialized to
56  * disk.
57  *
58  * Only decoded changes are evicted from memory (spilled to disk), not the
59  * transaction records. The number of toplevel transactions is limited,
60  * but a transaction with many subtransactions may still consume significant
61  * amounts of memory. However, the transaction records are fairly small and
62  * are not included in the memory limit.
63  *
64  * The current eviction algorithm is very simple - the transaction is
65  * picked merely by size, while it might be useful to also consider age
66  * (LSN) of the changes for example. With the new Generational memory
67  * allocator, evicting the oldest changes would make it more likely the
68  * memory gets actually freed.
69  *
70  * We use a max-heap with transaction size as the key to efficiently find
71  * the largest transaction. We update the max-heap whenever the memory
72  * counter is updated; however transactions with size 0 are not stored in
73  * the heap, because they have no changes to evict.
74  *
75  * We still rely on max_changes_in_memory when loading serialized changes
76  * back into memory. At that point we can't use the memory limit directly
77  * as we load the subxacts independently. One option to deal with this
78  * would be to count the subxacts, and allow each to allocate 1/N of the
79  * memory limit. That however does not seem very appealing, because with
80  * many subtransactions it may easily cause thrashing (short cycles of
81  * deserializing and applying very few changes). We probably should give
82  * a bit more memory to the oldest subtransactions, because it's likely
83  * they are the source for the next sequence of changes.
84  *
85  * -------------------------------------------------------------------------
86  */
87 #include "postgres.h"
88 
89 #include <unistd.h>
90 #include <sys/stat.h>
91 
92 #include "access/detoast.h"
93 #include "access/heapam.h"
94 #include "access/rewriteheap.h"
95 #include "access/transam.h"
96 #include "access/xact.h"
97 #include "access/xlog_internal.h"
98 #include "catalog/catalog.h"
99 #include "common/int.h"
100 #include "lib/binaryheap.h"
101 #include "miscadmin.h"
102 #include "pgstat.h"
103 #include "replication/logical.h"
105 #include "replication/slot.h"
106 #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107 #include "storage/bufmgr.h"
108 #include "storage/fd.h"
109 #include "storage/sinval.h"
110 #include "utils/builtins.h"
111 #include "utils/memutils.h"
112 #include "utils/rel.h"
113 #include "utils/relfilenumbermap.h"
114 
115 /* entry for a hash table we use to map from xid to our transaction state */
117 {
121 
122 /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
124 {
128 
130 {
134  CommandId combocid; /* just for debugging */
136 
137 /* Virtual file descriptor with file offset tracking */
138 typedef struct TXNEntryFile
139 {
140  File vfd; /* -1 when the file is closed */
141  off_t curOffset; /* offset for next write or read. Reset to 0
142  * when vfd is opened. */
144 
145 /* k-way in-order change iteration support structures */
147 {
154 
156 {
162 
163 /* toast datastructures */
164 typedef struct ReorderBufferToastEnt
165 {
166  Oid chunk_id; /* toast_table.chunk_id */
167  int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
168  * have seen */
169  Size num_chunks; /* number of chunks we've already seen */
170  Size size; /* combined size of chunks seen */
171  dlist_head chunks; /* linked list of chunks */
172  struct varlena *reconstructed; /* reconstructed varlena now pointed to in
173  * main tup */
175 
176 /* Disk serialization support datastructures */
178 {
181  /* data follows */
183 
/*
 * Classification helpers for a change's 'action' value.
 *
 * IsSpecInsert: true only for REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT,
 * i.e. a speculative insert.
 */
184 #define IsSpecInsert(action) \
185 ( \
186  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
187 )
/*
 * IsSpecConfirmOrAbort: true for either the speculative-insert confirm or
 * the speculative-insert abort action. Note that 'action' is evaluated
 * more than once, so it should not have side effects.
 */
188 #define IsSpecConfirmOrAbort(action) \
189 ( \
190  (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
191  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
192 )
/*
 * IsInsertOrUpdate: true for a plain INSERT, an UPDATE, or a speculative
 * insert (INTERNAL_SPEC_INSERT). 'action' is evaluated more than once here
 * too.
 */
193 #define IsInsertOrUpdate(action) \
194 ( \
195  (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
196  ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
197  ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
198 )
199 
200 /*
201  * Maximum number of changes kept in memory, per transaction. After that,
202  * changes are spooled to disk.
203  *
204  * The current value should be sufficient to decode the entire transaction
205  * without hitting disk in OLTP workloads, while starting to spool to disk in
206  * other workloads reasonably fast.
207  *
208  * At some point in the future it probably makes sense to have a more elaborate
209  * resource management here, but it's not entirely clear what that would look
210  * like.
211  */
213 static const Size max_changes_in_memory = 4096; /* XXX for restore only */
214 
215 /* GUC variable */
217 
218 /* ---------------------------------------
219  * primary reorderbuffer support routines
220  * ---------------------------------------
221  */
225  TransactionId xid, bool create, bool *is_new,
226  XLogRecPtr lsn, bool create_as_top);
228  ReorderBufferTXN *subtxn);
229 
230 static void AssertTXNLsnOrder(ReorderBuffer *rb);
231 
232 /* ---------------------------------------
233  * support functions for lsn-order iterating over the ->changes of a
234  * transaction and its subtransactions
235  *
236  * used for iteration over the k-way heap merge of a transaction and its
237  * subtransactions
238  * ---------------------------------------
239  */
241  ReorderBufferIterTXNState *volatile *iter_state);
246 
247 /*
248  * ---------------------------------------
249  * Disk serialization support functions
250  * ---------------------------------------
251  */
255  int fd, ReorderBufferChange *change);
257  TXNEntryFile *file, XLogSegNo *segno);
259  char *data);
262  bool txn_prepared);
263 static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
264 static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
265  TransactionId xid, XLogSegNo segno);
266 static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
267 
268 static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
270  ReorderBufferTXN *txn, CommandId cid);
271 
272 /*
273  * ---------------------------------------
274  * Streaming support functions
275  * ---------------------------------------
276  */
277 static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
278 static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
281 
282 /* ---------------------------------------
283  * toast reassembly support
284  * ---------------------------------------
285  */
289  Relation relation, ReorderBufferChange *change);
291  Relation relation, ReorderBufferChange *change);
292 
293 /*
294  * ---------------------------------------
295  * memory accounting
296  * ---------------------------------------
297  */
300  ReorderBufferChange *change,
301  ReorderBufferTXN *txn,
302  bool addition, Size sz);
303 
304 /*
305  * Allocate a new ReorderBuffer and clean out any old serialized state from
306  * prior ReorderBuffer instances for the same slot.
307  */
310 {
311  ReorderBuffer *buffer;
312  HASHCTL hash_ctl;
313  MemoryContext new_ctx;
314 
315  Assert(MyReplicationSlot != NULL);
316 
317  /* allocate memory in own context, to have better accountability */
319  "ReorderBuffer",
321 
322  buffer =
323  (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
324 
325  memset(&hash_ctl, 0, sizeof(hash_ctl));
326 
327  buffer->context = new_ctx;
328 
329  buffer->change_context = SlabContextCreate(new_ctx,
330  "Change",
332  sizeof(ReorderBufferChange));
333 
334  buffer->txn_context = SlabContextCreate(new_ctx,
335  "TXN",
337  sizeof(ReorderBufferTXN));
338 
339  /*
340  * XXX the allocation sizes used below pre-date generation context's block
341  * growing code. These values should likely be benchmarked and set to
342  * more suitable values.
343  */
344  buffer->tup_context = GenerationContextCreate(new_ctx,
345  "Tuples",
349 
350  hash_ctl.keysize = sizeof(TransactionId);
351  hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
352  hash_ctl.hcxt = buffer->context;
353 
354  buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
356 
358  buffer->by_txn_last_txn = NULL;
359 
360  buffer->outbuf = NULL;
361  buffer->outbufsize = 0;
362  buffer->size = 0;
363 
364  /* txn_heap is ordered by transaction size */
366 
367  buffer->spillTxns = 0;
368  buffer->spillCount = 0;
369  buffer->spillBytes = 0;
370  buffer->streamTxns = 0;
371  buffer->streamCount = 0;
372  buffer->streamBytes = 0;
373  buffer->totalTxns = 0;
374  buffer->totalBytes = 0;
375 
377 
378  dlist_init(&buffer->toplevel_by_lsn);
380  dclist_init(&buffer->catchange_txns);
381 
382  /*
383  * Ensure there's no stale data from prior uses of this slot, in case some
384  * prior exit avoided calling ReorderBufferFree. Failure to do this can
385  * produce duplicated txns, and it's very cheap if there's nothing there.
386  */
388 
389  return buffer;
390 }
391 
392 /*
393  * Free a ReorderBuffer
394  */
395 void
397 {
399 
400  /*
401  * We free separately allocated data by entirely scrapping reorderbuffer's
402  * memory context.
403  */
405 
406  /* Free disk space used by unconsumed reorder buffers */
408 }
409 
410 /*
411  * Get an unused, possibly preallocated, ReorderBufferTXN.
412  */
413 static ReorderBufferTXN *
415 {
416  ReorderBufferTXN *txn;
417 
418  txn = (ReorderBufferTXN *)
420 
421  memset(txn, 0, sizeof(ReorderBufferTXN));
422 
423  dlist_init(&txn->changes);
424  dlist_init(&txn->tuplecids);
425  dlist_init(&txn->subtxns);
426 
427  /* InvalidCommandId is not zero, so set it explicitly */
429  txn->output_plugin_private = NULL;
430 
431  return txn;
432 }
433 
434 /*
435  * Free a ReorderBufferTXN.
436  */
437 static void
439 {
440  /* clean the lookup cache if we were cached (quite likely) */
441  if (rb->by_txn_last_xid == txn->xid)
442  {
444  rb->by_txn_last_txn = NULL;
445  }
446 
447  /* free data that's contained */
448 
449  if (txn->gid != NULL)
450  {
451  pfree(txn->gid);
452  txn->gid = NULL;
453  }
454 
455  if (txn->tuplecid_hash != NULL)
456  {
458  txn->tuplecid_hash = NULL;
459  }
460 
461  if (txn->invalidations)
462  {
463  pfree(txn->invalidations);
464  txn->invalidations = NULL;
465  }
466 
467  /* Reset the toast hash */
468  ReorderBufferToastReset(rb, txn);
469 
470  pfree(txn);
471 }
472 
473 /*
474  * Get a fresh ReorderBufferChange.
475  */
478 {
479  ReorderBufferChange *change;
480 
481  change = (ReorderBufferChange *)
483 
484  memset(change, 0, sizeof(ReorderBufferChange));
485  return change;
486 }
487 
488 /*
489  * Free a ReorderBufferChange and update memory accounting, if requested.
490  */
491 void
493  bool upd_mem)
494 {
495  /* update memory accounting info */
496  if (upd_mem)
497  ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
498  ReorderBufferChangeSize(change));
499 
500  /* free contained data */
501  switch (change->action)
502  {
507  if (change->data.tp.newtuple)
508  {
509  ReorderBufferReturnTupleBuf(change->data.tp.newtuple);
510  change->data.tp.newtuple = NULL;
511  }
512 
513  if (change->data.tp.oldtuple)
514  {
515  ReorderBufferReturnTupleBuf(change->data.tp.oldtuple);
516  change->data.tp.oldtuple = NULL;
517  }
518  break;
520  if (change->data.msg.prefix != NULL)
521  pfree(change->data.msg.prefix);
522  change->data.msg.prefix = NULL;
523  if (change->data.msg.message != NULL)
524  pfree(change->data.msg.message);
525  change->data.msg.message = NULL;
526  break;
528  if (change->data.inval.invalidations)
529  pfree(change->data.inval.invalidations);
530  change->data.inval.invalidations = NULL;
531  break;
533  if (change->data.snapshot)
534  {
535  ReorderBufferFreeSnap(rb, change->data.snapshot);
536  change->data.snapshot = NULL;
537  }
538  break;
539  /* no data in addition to the struct itself */
541  if (change->data.truncate.relids != NULL)
542  {
543  ReorderBufferReturnRelids(rb, change->data.truncate.relids);
544  change->data.truncate.relids = NULL;
545  }
546  break;
551  break;
552  }
553 
554  pfree(change);
555 }
556 
557 /*
558  * Get a fresh HeapTuple fitting a tuple of size tuple_len (excluding header
559  * overhead).
560  */
561 HeapTuple
563 {
564  HeapTuple tuple;
565  Size alloc_len;
566 
567  alloc_len = tuple_len + SizeofHeapTupleHeader;
568 
570  HEAPTUPLESIZE + alloc_len);
571  tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
572 
573  return tuple;
574 }
575 
576 /*
577  * Free a HeapTuple returned by ReorderBufferGetTupleBuf().
578  */
579 void
581 {
582  pfree(tuple);
583 }
584 
585 /*
586  * Get an array for relids of truncated relations.
587  *
588  * We use the global memory context (for the whole reorder buffer), because
589  * none of the existing ones seems like a good match (some are SLAB, so we
590  * can't use those, and tup_context is meant for tuple data, not relids). We
591  * could add yet another context, but that seems like overkill - TRUNCATE is
592  * not a particularly common operation, so it does not seem worth it.
593  */
594 Oid *
596 {
597  Oid *relids;
598  Size alloc_len;
599 
600  alloc_len = sizeof(Oid) * nrelids;
601 
602  relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
603 
604  return relids;
605 }
606 
607 /*
608  * Free an array of relids.
609  */
610 void
612 {
613  pfree(relids);
614 }
615 
616 /*
617  * Return the ReorderBufferTXN from the given buffer, specified by Xid.
618  * If create is true, and a transaction doesn't already exist, create it
619  * (with the given LSN, and as top transaction if that's specified);
620  * when this happens, is_new is set to true.
621  */
622 static ReorderBufferTXN *
624  bool *is_new, XLogRecPtr lsn, bool create_as_top)
625 {
626  ReorderBufferTXN *txn;
628  bool found;
629 
631 
632  /*
633  * Check the one-entry lookup cache first
634  */
636  rb->by_txn_last_xid == xid)
637  {
638  txn = rb->by_txn_last_txn;
639 
640  if (txn != NULL)
641  {
642  /* found it, and it's valid */
643  if (is_new)
644  *is_new = false;
645  return txn;
646  }
647 
648  /*
649  * cached as non-existent, and asked not to create? Then nothing else
650  * to do.
651  */
652  if (!create)
653  return NULL;
654  /* otherwise fall through to create it */
655  }
656 
657  /*
658  * We get here if the cache wasn't hit, or if it yielded "does-not-exist"
659  * and we want to create an entry.
660  */
661 
662  /* search the lookup table */
663  ent = (ReorderBufferTXNByIdEnt *)
664  hash_search(rb->by_txn,
665  &xid,
666  create ? HASH_ENTER : HASH_FIND,
667  &found);
668  if (found)
669  txn = ent->txn;
670  else if (create)
671  {
672  /* initialize the new entry, if creation was requested */
673  Assert(ent != NULL);
674  Assert(lsn != InvalidXLogRecPtr);
675 
676  ent->txn = ReorderBufferGetTXN(rb);
677  ent->txn->xid = xid;
678  txn = ent->txn;
679  txn->first_lsn = lsn;
681 
682  if (create_as_top)
683  {
684  dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
685  AssertTXNLsnOrder(rb);
686  }
687  }
688  else
689  txn = NULL; /* not found and not asked to create */
690 
691  /* update cache */
692  rb->by_txn_last_xid = xid;
693  rb->by_txn_last_txn = txn;
694 
695  if (is_new)
696  *is_new = !found;
697 
698  Assert(!create || txn != NULL);
699  return txn;
700 }
701 
702 /*
703  * Record the partial change for the streaming of in-progress transactions. We
704  * can stream only complete changes so if we have a partial change like toast
705  * table insert or speculative insert then we mark such a 'txn' so that it
706  * can't be streamed. We also ensure that if the changes in such a 'txn' can
707  * be streamed and are above logical_decoding_work_mem threshold then we stream
708  * them as soon as we have a complete change.
709  */
710 static void
712  ReorderBufferChange *change,
713  bool toast_insert)
714 {
715  ReorderBufferTXN *toptxn;
716 
717  /*
718  * The partial changes need to be processed only while streaming
719  * in-progress transactions.
720  */
721  if (!ReorderBufferCanStream(rb))
722  return;
723 
724  /* Get the top transaction. */
725  toptxn = rbtxn_get_toptxn(txn);
726 
727  /*
728  * Indicate a partial change for toast inserts. The change will be
729  * considered as complete once we get the insert or update on the main
730  * table and we are sure that the pending toast chunks are not required
731  * anymore.
732  *
733  * If we allow streaming when there are pending toast chunks then such
734  * chunks won't be released till the insert (multi_insert) is complete and
735  * we expect the txn to have streamed all changes after streaming. This
736  * restriction is mainly to ensure the correctness of streamed
737  * transactions and it doesn't seem worth lifting such a restriction
738  * just to allow this case because anyway we will stream the transaction
739  * once such an insert is complete.
740  */
741  if (toast_insert)
743  else if (rbtxn_has_partial_change(toptxn) &&
744  IsInsertOrUpdate(change->action) &&
745  change->data.tp.clear_toast_afterwards)
747 
748  /*
749  * Indicate a partial change for speculative inserts. The change will be
750  * considered as complete once we get the speculative confirm or abort
751  * token.
752  */
753  if (IsSpecInsert(change->action))
755  else if (rbtxn_has_partial_change(toptxn) &&
756  IsSpecConfirmOrAbort(change->action))
758 
759  /*
760  * Stream the transaction if it is serialized before and the changes are
761  * now complete in the top-level transaction.
762  *
763  * The reason for doing the streaming of such a transaction as soon as we
764  * get the complete change for it is that previously it would have reached
765  * the memory threshold and wouldn't get streamed because of incomplete
766  * changes. Delaying such transactions would increase apply lag for them.
767  */
769  !(rbtxn_has_partial_change(toptxn)) &&
770  rbtxn_is_serialized(txn) &&
772  ReorderBufferStreamTXN(rb, toptxn);
773 }
774 
775 /*
776  * Queue a change into a transaction so it can be replayed upon commit or will be
777  * streamed when we reach logical_decoding_work_mem threshold.
778  */
779 void
781  ReorderBufferChange *change, bool toast_insert)
782 {
783  ReorderBufferTXN *txn;
784 
785  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
786 
787  /*
788  * While streaming the previous changes we have detected that the
789  * transaction is aborted. So there is no point in collecting further
790  * changes for it.
791  */
792  if (txn->concurrent_abort)
793  {
794  /*
795  * We don't need to update memory accounting for this change as we
796  * have not added it to the queue yet.
797  */
798  ReorderBufferReturnChange(rb, change, false);
799  return;
800  }
801 
802  /*
803  * The changes that are sent downstream are considered streamable. We
804  * remember such transactions so that only those will later be considered
805  * for streaming.
806  */
807  if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
813  {
814  ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
815 
817  }
818 
819  change->lsn = lsn;
820  change->txn = txn;
821 
822  Assert(InvalidXLogRecPtr != lsn);
823  dlist_push_tail(&txn->changes, &change->node);
824  txn->nentries++;
825  txn->nentries_mem++;
826 
827  /* update memory accounting information */
828  ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
829  ReorderBufferChangeSize(change));
830 
831  /* process partial change */
832  ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
833 
834  /* check the memory limits and evict something if needed */
836 }
837 
838 /*
839  * A transactional message is queued to be processed upon commit and a
840  * non-transactional message gets processed immediately.
841  */
842 void
844  Snapshot snap, XLogRecPtr lsn,
845  bool transactional, const char *prefix,
846  Size message_size, const char *message)
847 {
848  if (transactional)
849  {
850  MemoryContext oldcontext;
851  ReorderBufferChange *change;
852 
854 
855  /*
856  * We don't expect snapshots for transactional changes - we'll use the
857  * snapshot derived later during apply (unless the change gets
858  * skipped).
859  */
860  Assert(!snap);
861 
862  oldcontext = MemoryContextSwitchTo(rb->context);
863 
864  change = ReorderBufferGetChange(rb);
866  change->data.msg.prefix = pstrdup(prefix);
867  change->data.msg.message_size = message_size;
868  change->data.msg.message = palloc(message_size);
869  memcpy(change->data.msg.message, message, message_size);
870 
871  ReorderBufferQueueChange(rb, xid, lsn, change, false);
872 
873  MemoryContextSwitchTo(oldcontext);
874  }
875  else
876  {
877  ReorderBufferTXN *txn = NULL;
878  volatile Snapshot snapshot_now = snap;
879 
880  /* Non-transactional changes require a valid snapshot. */
881  Assert(snapshot_now);
882 
883  if (xid != InvalidTransactionId)
884  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
885 
886  /* setup snapshot to allow catalog access */
887  SetupHistoricSnapshot(snapshot_now, NULL);
888  PG_TRY();
889  {
890  rb->message(rb, txn, lsn, false, prefix, message_size, message);
891 
893  }
894  PG_CATCH();
895  {
897  PG_RE_THROW();
898  }
899  PG_END_TRY();
900  }
901 }
902 
903 /*
904  * AssertTXNLsnOrder
905  * Verify LSN ordering of transaction lists in the reorderbuffer
906  *
907  * Other LSN-related invariants are checked too.
908  *
909  * No-op if assertions are not in use.
910  */
911 static void
913 {
914 #ifdef USE_ASSERT_CHECKING
916  dlist_iter iter;
917  XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
918  XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
919 
920  /*
921  * Skip the verification if we haven't yet reached the LSN at which we
922  * start decoding the contents of transactions. Until we reach that LSN,
923  * we could have transactions that don't yet have the association between
924  * the top-level transaction and subtransaction, and consequently have
925  * the same LSN. We don't guarantee this association until we try to
926  * decode the actual contents of a transaction. The ordering of the records
927  * prior to the start_decoding_at LSN should have been checked before the
928  * restart.
929  */
931  return;
932 
933  dlist_foreach(iter, &rb->toplevel_by_lsn)
934  {
936  iter.cur);
937 
938  /* start LSN must be set */
939  Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
940 
941  /* If there is an end LSN, it must not be lower than the start LSN */
942  if (cur_txn->end_lsn != InvalidXLogRecPtr)
943  Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
944 
945  /* Current initial LSN must be strictly higher than previous */
946  if (prev_first_lsn != InvalidXLogRecPtr)
947  Assert(prev_first_lsn < cur_txn->first_lsn);
948 
949  /* known-as-subtxn txns must not be listed */
950  Assert(!rbtxn_is_known_subxact(cur_txn));
951 
952  prev_first_lsn = cur_txn->first_lsn;
953  }
954 
956  {
958  base_snapshot_node,
959  iter.cur);
960 
961  /* base snapshot (and its LSN) must be set */
962  Assert(cur_txn->base_snapshot != NULL);
964 
965  /* current LSN must be strictly higher than previous */
966  if (prev_base_snap_lsn != InvalidXLogRecPtr)
967  Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
968 
969  /* known-as-subtxn txns must not be listed */
970  Assert(!rbtxn_is_known_subxact(cur_txn));
971 
972  prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
973  }
974 #endif
975 }
976 
977 /*
978  * AssertChangeLsnOrder
979  *
980  * Check ordering of changes in the (sub)transaction.
981  */
982 static void
984 {
985 #ifdef USE_ASSERT_CHECKING
986  dlist_iter iter;
987  XLogRecPtr prev_lsn = txn->first_lsn;
988 
989  dlist_foreach(iter, &txn->changes)
990  {
991  ReorderBufferChange *cur_change;
992 
993  cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
994 
996  Assert(cur_change->lsn != InvalidXLogRecPtr);
997  Assert(txn->first_lsn <= cur_change->lsn);
998 
999  if (txn->end_lsn != InvalidXLogRecPtr)
1000  Assert(cur_change->lsn <= txn->end_lsn);
1001 
1002  Assert(prev_lsn <= cur_change->lsn);
1003 
1004  prev_lsn = cur_change->lsn;
1005  }
1006 #endif
1007 }
1008 
1009 /*
1010  * ReorderBufferGetOldestTXN
1011  * Return oldest transaction in reorderbuffer
1012  */
1015 {
1016  ReorderBufferTXN *txn;
1017 
1018  AssertTXNLsnOrder(rb);
1019 
1020  if (dlist_is_empty(&rb->toplevel_by_lsn))
1021  return NULL;
1022 
1024 
1027  return txn;
1028 }
1029 
1030 /*
1031  * ReorderBufferGetOldestXmin
1032  * Return oldest Xmin in reorderbuffer
1033  *
1034  * Returns oldest possibly running Xid from the point of view of snapshots
1035  * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1036  * there are none.
1037  *
1038  * Since snapshots are assigned monotonically, this equals the Xmin of the
1039  * base snapshot with minimal base_snapshot_lsn.
1040  */
1043 {
1044  ReorderBufferTXN *txn;
1045 
1046  AssertTXNLsnOrder(rb);
1047 
1049  return InvalidTransactionId;
1050 
1051  txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1053  return txn->base_snapshot->xmin;
1054 }
1055 
1056 void
1058 {
1059  rb->current_restart_decoding_lsn = ptr;
1060 }
1061 
1062 /*
1063  * ReorderBufferAssignChild
1064  *
1065  * Make note that we know that subxid is a subtransaction of xid, seen as of
1066  * the given lsn.
1067  */
1068 void
1070  TransactionId subxid, XLogRecPtr lsn)
1071 {
1072  ReorderBufferTXN *txn;
1073  ReorderBufferTXN *subtxn;
1074  bool new_top;
1075  bool new_sub;
1076 
1077  txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1078  subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1079 
1080  if (!new_sub)
1081  {
1082  if (rbtxn_is_known_subxact(subtxn))
1083  {
1084  /* already associated, nothing to do */
1085  return;
1086  }
1087  else
1088  {
1089  /*
1090  * We already saw this transaction, but initially added it to the
1091  * list of top-level txns. Now that we know it's not top-level,
1092  * remove it from there.
1093  */
1094  dlist_delete(&subtxn->node);
1095  }
1096  }
1097 
1098  subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1099  subtxn->toplevel_xid = xid;
1100  Assert(subtxn->nsubtxns == 0);
1101 
1102  /* set the reference to top-level transaction */
1103  subtxn->toptxn = txn;
1104 
1105  /* add to subtransaction list */
1106  dlist_push_tail(&txn->subtxns, &subtxn->node);
1107  txn->nsubtxns++;
1108 
1109  /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1110  ReorderBufferTransferSnapToParent(txn, subtxn);
1111 
1112  /* Verify LSN-ordering invariant */
1113  AssertTXNLsnOrder(rb);
1114 }
1115 
1116 /*
1117  * ReorderBufferTransferSnapToParent
1118  * Transfer base snapshot from subtxn to top-level txn, if needed
1119  *
1120  * This is done if the top-level txn doesn't have a base snapshot, or if the
1121  * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1122  * snapshot's LSN. This can happen if there are no changes in the toplevel
1123  * txn but there are some in the subtxn, or the first change in the subtxn
1124  * has an earlier LSN than the first change in the top-level txn and we
1125  * learned about their kinship only now.
1126  *
1127  * The subtransaction's snapshot is cleared regardless of the transfer
1128  * happening, since it's not needed anymore in either case.
1129  *
1130  * We do this as soon as we become aware of their kinship, to avoid queueing
1131  * extra snapshots to txns known-as-subtxns -- only top-level txns will
1132  * receive further snapshots.
1133  */
1134 static void
1136  ReorderBufferTXN *subtxn)
1137 {
1138  Assert(subtxn->toplevel_xid == txn->xid);
1139 
1140  if (subtxn->base_snapshot != NULL)
1141  {
1142  if (txn->base_snapshot == NULL ||
1143  subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1144  {
1145  /*
1146  * If the toplevel transaction already has a base snapshot but
1147  * it's newer than the subxact's, purge it.
1148  */
1149  if (txn->base_snapshot != NULL)
1150  {
1153  }
1154 
1155  /*
1156  * The snapshot is now the top transaction's; transfer it, and
1157  * adjust the list position of the top transaction in the list by
1158  * moving it to where the subtransaction is.
1159  */
1160  txn->base_snapshot = subtxn->base_snapshot;
1161  txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1163  &txn->base_snapshot_node);
1164 
1165  /*
1166  * The subtransaction doesn't have a snapshot anymore (so it
1167  * mustn't be in the list.)
1168  */
1169  subtxn->base_snapshot = NULL;
1171  dlist_delete(&subtxn->base_snapshot_node);
1172  }
1173  else
1174  {
1175  /* Base snap of toplevel is fine, so subxact's is not needed */
1177  dlist_delete(&subtxn->base_snapshot_node);
1178  subtxn->base_snapshot = NULL;
1180  }
1181  }
1182 }
1183 
1184 /*
1185  * Associate a subtransaction with its toplevel transaction at commit
1186  * time. There may be no further changes added after this.
      *
      * subxid identifies the subtransaction and xid its toplevel transaction.
      * commit_lsn and end_lsn are stored as the subtransaction's final_lsn and
      * end_lsn. If no ReorderBufferTXN is found for subxid, nothing is recorded
      * and the function returns without linking anything.
1187  */
1188 void
1190  TransactionId subxid, XLogRecPtr commit_lsn,
1191  XLogRecPtr end_lsn)
1192 {
1193  ReorderBufferTXN *subtxn;
1194 
1195  subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1196  InvalidXLogRecPtr, false);
1197 
1198  /*
1199  * No need to do anything if that subtxn didn't contain any changes
1200  */
1201  if (!subtxn)
1202  return;
1203 
      /* Remember where the subtransaction's commit was found. */
1204  subtxn->final_lsn = commit_lsn;
1205  subtxn->end_lsn = end_lsn;
1206 
1207  /*
1208  * Assign this subxact as a child of the toplevel xact (no-op if already
1209  * done.)
1210  */
1211  ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1212 }
1213 
1214 
1215 /*
1216  * Support for efficiently iterating over a transaction's and its
1217  * subtransactions' changes.
1218  *
1219  * We do this by doing a k-way merge between transactions/subtransactions. For that
1220  * we model the current heads of the different transactions as a binary heap
1221  * so we easily know which (sub-)transaction has the change with the smallest
1222  * lsn next.
1223  *
1224  * We assume the changes in individual transactions are already sorted by LSN.
1225  */
1226 
1227 /*
1228  * Binary heap comparison function.
      *
      * a and b are Datums holding int32 offsets into state->entries; the
      * entries are ordered by their current lsn.
      *
      * Note the inverted result: 1 is returned when a's LSN is the smaller one,
      * -1 when it is the larger one, and 0 when both are equal.
      * NOTE(review): presumably this makes the heap surface the entry with the
      * smallest LSN first -- confirm against the binaryheap implementation.
1229  */
1230 static int
1232 {
1234  XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1235  XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1236 
1237  if (pos_a < pos_b)
1238  return 1;
1239  else if (pos_a == pos_b)
1240  return 0;
1241  return -1;
1242 }
1243 
1244 /*
1245  * Allocate & initialize an iterator which iterates in lsn order over a
1246  * transaction and all its subtransactions.
1247  *
1248  * Note: The iterator state is returned through the iter_state parameter rather
1249  * than the function's return value. This is because the state gets cleaned up
1250  * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1251  * back the state even if this function throws an exception.
      *
      * *iter_state is set to NULL on entry and is only pointed at the new state
      * once its entries and heap have been initialized.
      *
      * The work is done in two passes over txn and txn->subtxns: the first
      * counts the transactions that have changes (to size the state and the
      * heap), the second fills one entry per such transaction with its first
      * change.
1252  */
1253 static void
1255  ReorderBufferIterTXNState *volatile *iter_state)
1256 {
1257  Size nr_txns = 0;
1259  dlist_iter cur_txn_i;
1260  int32 off;
1261 
1262  *iter_state = NULL;
1263 
1264  /* Check ordering of changes in the toplevel transaction. */
1265  AssertChangeLsnOrder(txn);
1266 
1267  /*
1268  * Calculate the size of our heap: one element for every transaction that
1269  * contains changes. (Besides the transactions already in the reorder
1270  * buffer, we count the one we were directly passed.)
1271  */
1272  if (txn->nentries > 0)
1273  nr_txns++;
1274 
1275  dlist_foreach(cur_txn_i, &txn->subtxns)
1276  {
1277  ReorderBufferTXN *cur_txn;
1278 
1279  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1280 
1281  /* Check ordering of changes in this subtransaction. */
1282  AssertChangeLsnOrder(cur_txn);
1283 
1284  if (cur_txn->nentries > 0)
1285  nr_txns++;
1286  }
1287 
1288  /* allocate iteration state */
1291  sizeof(ReorderBufferIterTXNState) +
1292  sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1293 
1294  state->nr_txns = nr_txns;
1295  dlist_init(&state->old_change);
1296 
      /* Start every entry with no open spill file (vfd -1) and segment 0. */
1297  for (off = 0; off < state->nr_txns; off++)
1298  {
1299  state->entries[off].file.vfd = -1;
1300  state->entries[off].segno = 0;
1301  }
1302 
1303  /* allocate heap */
1304  state->heap = binaryheap_allocate(state->nr_txns,
1306  state);
1307 
1308  /* Now that the state fields are initialized, it is safe to return it. */
1309  *iter_state = state;
1310 
1311  /*
1312  * Now insert items into the binary heap, in an unordered fashion. (We
1313  * will run a heap assembly step at the end; this is more efficient.)
1314  */
1315 
1316  off = 0;
1317 
1318  /* add toplevel transaction if it contains changes */
1319  if (txn->nentries > 0)
1320  {
1321  ReorderBufferChange *cur_change;
1322 
      /*
       * If part of the transaction was already spilled, spill the rest as
       * well and then load the first batch of changes back from disk.
       */
1323  if (rbtxn_is_serialized(txn))
1324  {
1325  /* serialize remaining changes */
1326  ReorderBufferSerializeTXN(rb, txn);
1327  ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1328  &state->entries[off].segno);
1329  }
1330 
      /* The entry starts at the first change in the in-memory list. */
1331  cur_change = dlist_head_element(ReorderBufferChange, node,
1332  &txn->changes);
1333 
1334  state->entries[off].lsn = cur_change->lsn;
1335  state->entries[off].change = cur_change;
1336  state->entries[off].txn = txn;
1337 
1339  }
1340 
1341  /* add subtransactions if they contain changes */
1342  dlist_foreach(cur_txn_i, &txn->subtxns)
1343  {
1344  ReorderBufferTXN *cur_txn;
1345 
1346  cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1347 
1348  if (cur_txn->nentries > 0)
1349  {
1350  ReorderBufferChange *cur_change;
1351 
      /* Same spill-then-restore handling as for the toplevel txn. */
1352  if (rbtxn_is_serialized(cur_txn))
1353  {
1354  /* serialize remaining changes */
1355  ReorderBufferSerializeTXN(rb, cur_txn);
1356  ReorderBufferRestoreChanges(rb, cur_txn,
1357  &state->entries[off].file,
1358  &state->entries[off].segno);
1359  }
1360  cur_change = dlist_head_element(ReorderBufferChange, node,
1361  &cur_txn->changes);
1362 
1363  state->entries[off].lsn = cur_change->lsn;
1364  state->entries[off].change = cur_change;
1365  state->entries[off].txn = cur_txn;
1366 
1368  }
1369  }
1370 
1371  /* assemble a valid binary heap */
1372  binaryheap_build(state->heap);
1373 }
1374 
1375 /*
1376  * Return the next change when iterating over a transaction and its
1377  * subtransactions.
1378  *
1379  * Returns NULL when no further changes exist.
      *
      * The entry at the top of the heap supplies the change to return. Before
      * returning, that entry is advanced to the transaction's next change,
      * taken either from the in-memory list or, once that is exhausted, from
      * changes restored from disk.
      *
      * When changes have to be restored from disk, the change being returned is
      * first unlinked from the transaction and parked on state->old_change; it
      * is freed at the start of the following call.
1380  */
1381 static ReorderBufferChange *
1383 {
1384  ReorderBufferChange *change;
1386  int32 off;
1387 
1388  /* nothing there anymore */
1389  if (state->heap->bh_size == 0)
1390  return NULL;
1391 
      /* The heap stores offsets into state->entries. */
1392  off = DatumGetInt32(binaryheap_first(state->heap));
1393  entry = &state->entries[off];
1394 
1395  /* free memory we might have "leaked" in the previous *Next call */
1396  if (!dlist_is_empty(&state->old_change))
1397  {
1398  change = dlist_container(ReorderBufferChange, node,
1399  dlist_pop_head_node(&state->old_change));
1400  ReorderBufferReturnChange(rb, change, true);
1401  Assert(dlist_is_empty(&state->old_change));
1402  }
1403 
      /* This is the change handed back to the caller on every path below. */
1404  change = entry->change;
1405 
1406  /*
1407  * update heap with information about which transaction has the next
1408  * relevant change in LSN order
1409  */
1410 
1411  /* there are in-memory changes */
1412  if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1413  {
1414  dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1415  ReorderBufferChange *next_change =
1417 
1418  /* txn stays the same */
1419  state->entries[off].lsn = next_change->lsn;
1420  state->entries[off].change = next_change;
1421 
1423  return change;
1424  }
1425 
1426  /* try to load changes from disk */
      /* nentries differing from nentries_mem means not everything is in memory */
1427  if (entry->txn->nentries != entry->txn->nentries_mem)
1428  {
1429  /*
1430  * Ugly: restoring changes will reuse *Change records, thus delete the
1431  * current one from the per-tx list and only free it in the next call.
1432  */
1433  dlist_delete(&change->node);
1434  dlist_push_tail(&state->old_change, &change->node);
1435 
1436  /*
1437  * Update the total bytes processed by the txn for which we are
1438  * releasing the current set of changes and restoring the new set of
1439  * changes.
1440  */
1441  rb->totalBytes += entry->txn->size;
1442  if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1443  &state->entries[off].segno))
1444  {
1445  /* successfully restored changes from disk */
1446  ReorderBufferChange *next_change =
1448  &entry->txn->changes);
1449 
1450  elog(DEBUG2, "restored %u/%u changes from disk",
1451  (uint32) entry->txn->nentries_mem,
1452  (uint32) entry->txn->nentries);
1453 
1454  Assert(entry->txn->nentries_mem);
1455  /* txn stays the same */
1456  state->entries[off].lsn = next_change->lsn;
1457  state->entries[off].change = next_change;
1459 
1460  return change;
1461  }
1462  }
1463 
1464  /* ok, no changes there anymore, remove */
1466 
1467  return change;
1468 }
1469 
1470 /*
1471  * Deallocate the iterator
      *
      * Closes any spill files still open in the iterator's entries, frees the
      * change that may still be parked on state->old_change, and then releases
      * the heap and the state itself.
1472  */
1473 static void
1476 {
1477  int32 off;
1478 
      /* A vfd of -1 means the entry has no open file. */
1479  for (off = 0; off < state->nr_txns; off++)
1480  {
1481  if (state->entries[off].file.vfd != -1)
1482  FileClose(state->entries[off].file.vfd);
1483  }
1484 
1485  /* free memory we might have "leaked" in the last *Next call */
1486  if (!dlist_is_empty(&state->old_change))
1487  {
1488  ReorderBufferChange *change;
1489 
1490  change = dlist_container(ReorderBufferChange, node,
1491  dlist_pop_head_node(&state->old_change));
1492  ReorderBufferReturnChange(rb, change, true);
1493  Assert(dlist_is_empty(&state->old_change));
1494  }
1495 
1496  binaryheap_free(state->heap);
1497  pfree(state);
1498 }
1499 
1500 /*
1501  * Cleanup the contents of a transaction, usually after the transaction
1502  * committed or aborted.
      *
      * Subtransactions are cleaned up first (by recursing), then the
      * transaction's own changes and tuplecids are returned, it is unlinked
      * from its containing list and from rb->by_txn, any spilled data is
      * removed, and finally the ReorderBufferTXN itself is returned. txn must
      * not be used after this call.
1503  */
1504 static void
1506 {
1507  bool found;
1508  dlist_mutable_iter iter;
1509 
1510  /* cleanup subtransactions & their changes */
1511  dlist_foreach_modify(iter, &txn->subtxns)
1512  {
1513  ReorderBufferTXN *subtxn;
1514 
1515  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1516 
1517  /*
1518  * Subtransactions are always associated to the toplevel TXN, even if
1519  * they originally were happening inside another subtxn, so we won't
1520  * ever recurse more than one level deep here.
1521  */
1522  Assert(rbtxn_is_known_subxact(subtxn));
1523  Assert(subtxn->nsubtxns == 0);
1524 
1525  ReorderBufferCleanupTXN(rb, subtxn);
1526  }
1527 
1528  /* cleanup changes in the txn */
1529  dlist_foreach_modify(iter, &txn->changes)
1530  {
1531  ReorderBufferChange *change;
1532 
1533  change = dlist_container(ReorderBufferChange, node, iter.cur);
1534 
1535  /* Check we're not mixing changes from different transactions. */
1536  Assert(change->txn == txn);
1537 
1538  ReorderBufferReturnChange(rb, change, false);
1539  }
1540 
1541  /*
1542  * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1543  * They are always stored in the toplevel transaction.
1544  */
1545  dlist_foreach_modify(iter, &txn->tuplecids)
1546  {
1547  ReorderBufferChange *change;
1548 
1549  change = dlist_container(ReorderBufferChange, node, iter.cur);
1550 
1551  /* Check we're not mixing changes from different transactions. */
1552  Assert(change->txn == txn);
1554 
1555  ReorderBufferReturnChange(rb, change, true);
1556  }
1557 
1558  /*
1559  * Cleanup the base snapshot, if set.
1560  */
1561  if (txn->base_snapshot != NULL)
1562  {
1565  }
1566 
1567  /*
1568  * Cleanup the snapshot for the last streamed run.
1569  */
1570  if (txn->snapshot_now != NULL)
1571  {
1572  Assert(rbtxn_is_streamed(txn));
1574  }
1575 
1576  /*
1577  * Remove TXN from its containing lists.
1578  *
1579  * Note: if txn is known as subxact, we are deleting the TXN from its
1580  * parent's list of known subxacts; this leaves the parent's
1581  * subtransaction count too high, but we don't care. Otherwise, we are
1582  * deleting the TXN from the LSN-ordered list of toplevel TXNs. We remove
1583  * the TXN from the list of catalog modifying transactions as well.
1584  */
1585  dlist_delete(&txn->node);
1586  if (rbtxn_has_catalog_changes(txn))
1588 
1589  /* now remove reference from buffer */
1590  hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1591  Assert(found);
1592 
1593  /* remove entries spilled to disk */
1594  if (rbtxn_is_serialized(txn))
1595  ReorderBufferRestoreCleanup(rb, txn);
1596 
1597  /* Update the memory counter */
1598  ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, txn->size);
1599 
1600  /* deallocate */
1601  ReorderBufferReturnTXN(rb, txn);
1602 }
1603 
1604 /*
1605  * Discard changes from a transaction (and subtransactions), either after
1606  * streaming or decoding them at PREPARE. Keep the remaining info -
1607  * transactions, tuplecids, invalidations and snapshots.
1608  *
1609  * We additionally remove tuplecids after decoding the transaction at prepare
1610  * time as we only need to perform invalidation at rollback or commit prepared.
1611  *
1612  * 'txn_prepared' indicates that we have decoded the transaction at prepare
1613  * time.
      *
      * Unlike a full cleanup, the ReorderBufferTXN itself is kept: only its
      * changes, its tuplecid hash and any spilled data are released, and its
      * entry counters are reset to zero.
1614  */
1615 static void
1617 {
1618  dlist_mutable_iter iter;
1619 
1620  /* cleanup subtransactions & their changes */
1621  dlist_foreach_modify(iter, &txn->subtxns)
1622  {
1623  ReorderBufferTXN *subtxn;
1624 
1625  subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1626 
1627  /*
1628  * Subtransactions are always associated to the toplevel TXN, even if
1629  * they originally were happening inside another subtxn, so we won't
1630  * ever recurse more than one level deep here.
1631  */
1632  Assert(rbtxn_is_known_subxact(subtxn));
1633  Assert(subtxn->nsubtxns == 0);
1634 
1635  ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1636  }
1637 
1638  /* cleanup changes in the txn */
1639  dlist_foreach_modify(iter, &txn->changes)
1640  {
1641  ReorderBufferChange *change;
1642 
1643  change = dlist_container(ReorderBufferChange, node, iter.cur);
1644 
1645  /* Check we're not mixing changes from different transactions. */
1646  Assert(change->txn == txn);
1647 
1648  /* remove the change from its containing list */
1649  dlist_delete(&change->node);
1650 
1651  ReorderBufferReturnChange(rb, change, false);
1652  }
1653 
1654  /* Update the memory counter */
1655  ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, txn->size);
1656 
1657  /*
1658  * Mark the transaction as streamed.
1659  *
1660  * The top-level transaction is always marked as streamed, even if it
1661  * does not contain any changes (that is, when all the changes are in
1662  * subtransactions).
1663  *
1664  * For subtransactions, we only mark them as streamed when there are
1665  * changes in them.
1666  *
1667  * We do it this way because of aborts - we don't want to send aborts for
1668  * XIDs the downstream is not aware of. And of course, it always knows
1669  * about the toplevel xact (we send the XID in all messages), but we never
1670  * stream XIDs of empty subxacts.
      *
      * Nothing is marked as streamed when we were called for a transaction
      * decoded at prepare time.
1671  */
1672  if ((!txn_prepared) && (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0)))
1673  txn->txn_flags |= RBTXN_IS_STREAMED;
1674 
1675  if (txn_prepared)
1676  {
1677  /*
1678  * If this is a prepared txn, cleanup the tuplecids we stored for
1679  * decoding catalog snapshot access. They are always stored in the
1680  * toplevel transaction.
1681  */
1682  dlist_foreach_modify(iter, &txn->tuplecids)
1683  {
1684  ReorderBufferChange *change;
1685 
1686  change = dlist_container(ReorderBufferChange, node, iter.cur);
1687 
1688  /* Check we're not mixing changes from different transactions. */
1689  Assert(change->txn == txn);
1691 
1692  /* Remove the change from its containing list. */
1693  dlist_delete(&change->node);
1694 
1695  ReorderBufferReturnChange(rb, change, true);
1696  }
1697  }
1698 
1699  /*
1700  * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1701  * memory. We could also keep the hash table and update it with new ctid
1702  * values, but this seems simpler and good enough for now.
1703  */
1704  if (txn->tuplecid_hash != NULL)
1705  {
1707  txn->tuplecid_hash = NULL;
1708  }
1709 
1710  /* If this txn is serialized then clean the disk space. */
1711  if (rbtxn_is_serialized(txn))
1712  {
1713  ReorderBufferRestoreCleanup(rb, txn);
1714  txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1715 
1716  /*
1717  * We set this flag to indicate if the transaction is ever serialized.
1718  * We need this to accurately update the stats as otherwise the same
1719  * transaction can be counted as serialized multiple times.
1720  */
1722  }
1723 
1724  /* also reset the number of entries in the transaction */
1725  txn->nentries_mem = 0;
1726  txn->nentries = 0;
1727 }
1728 
1729 /*
1730  * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1731  * HeapTupleSatisfiesHistoricMVCC.
      *
      * The hash is stored in txn->tuplecid_hash and filled from the changes on
      * txn->tuplecids. The first change seen for a key supplies cmin, cmax and
      * combocid; later changes for the same key only update cmax.
1732  */
1733 static void
1735 {
1736  dlist_iter iter;
1737  HASHCTL hash_ctl;
1738 
1740  return;
1741 
1742  hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1743  hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1744  hash_ctl.hcxt = rb->context;
1745 
1746  /*
1747  * create the hash with the exact number of to-be-stored tuplecids from
1748  * the start
1749  */
1750  txn->tuplecid_hash =
1751  hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1753 
1754  dlist_foreach(iter, &txn->tuplecids)
1755  {
1758  bool found;
1759  ReorderBufferChange *change;
1760 
1761  change = dlist_container(ReorderBufferChange, node, iter.cur);
1762 
1764 
1765  /* be careful about padding */
      /* the key is zeroed in full first, so any padding bytes are zero too */
1766  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1767 
1768  key.rlocator = change->data.tuplecid.locator;
1769 
1770  ItemPointerCopy(&change->data.tuplecid.tid,
1771  &key.tid);
1772 
1773  ent = (ReorderBufferTupleCidEnt *)
1774  hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1775  if (!found)
1776  {
      /* first time we see this tuple: take all values from the change */
1777  ent->cmin = change->data.tuplecid.cmin;
1778  ent->cmax = change->data.tuplecid.cmax;
1779  ent->combocid = change->data.tuplecid.combocid;
1780  }
1781  else
1782  {
1783  /*
1784  * Maybe we already saw this tuple before in this transaction, but
1785  * if so it must have the same cmin.
1786  */
1787  Assert(ent->cmin == change->data.tuplecid.cmin);
1788 
1789  /*
1790  * cmax may be initially invalid, but once set it can only grow,
1791  * and never become invalid again.
1792  */
1793  Assert((ent->cmax == InvalidCommandId) ||
1794  ((change->data.tuplecid.cmax != InvalidCommandId) &&
1795  (change->data.tuplecid.cmax > ent->cmax)));
1796  ent->cmax = change->data.tuplecid.cmax;
1797  }
1798  }
1799 }
1800 
1801 /*
1802  * Copy a provided snapshot so we can modify it privately. This is needed so
1803  * that catalog modifying transactions can look into intermediate catalog
1804  * states.
      *
      * The copy is a single allocation in rb->context: the SnapshotData is
      * followed by the xip array and then the subxip array. subxip is rebuilt
      * from txn (its own xid plus the xids on txn->subtxns) and sorted, and
      * curcid is set to cid.
      *
      * The result is marked as copied, with active_count 1 and regd_count 0.
1805  */
1806 static Snapshot
1808  ReorderBufferTXN *txn, CommandId cid)
1809 {
1810  Snapshot snap;
1811  dlist_iter iter;
1812  int i = 0;
1813  Size size;
1814 
      /* room for the struct, orig_snap's xip, and txn's xid plus its subxids */
1815  size = sizeof(SnapshotData) +
1816  sizeof(TransactionId) * orig_snap->xcnt +
1817  sizeof(TransactionId) * (txn->nsubtxns + 1);
1818 
1819  snap = MemoryContextAllocZero(rb->context, size);
1820  memcpy(snap, orig_snap, sizeof(SnapshotData));
1821 
1822  snap->copied = true;
1823  snap->active_count = 1; /* mark as active so nobody frees it */
1824  snap->regd_count = 0;
      /* xip lives directly behind the struct in the same allocation */
1825  snap->xip = (TransactionId *) (snap + 1);
1826 
1827  memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1828 
1829  /*
1830  * snap->subxip contains all txids that belong to our transaction which we
1831  * need to check via cmin/cmax. That's why we store the toplevel
1832  * transaction in there as well.
1833  */
1834  snap->subxip = snap->xip + snap->xcnt;
1835  snap->subxip[i++] = txn->xid;
1836 
1837  /*
1838  * The subtransaction counter isn't decreased when subtransactions abort,
1839  * so count manually. Since txn->nsubtxns is an upper boundary it is safe
1840  * to have used it for the allocation above.
1841  */
1842  snap->subxcnt = 1;
1843 
1844  dlist_foreach(iter, &txn->subtxns)
1845  {
1846  ReorderBufferTXN *sub_txn;
1847 
1848  sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1849  snap->subxip[i++] = sub_txn->xid;
1850  snap->subxcnt++;
1851  }
1852 
1853  /* sort so we can bsearch() later */
1854  qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1855 
1856  /* store the specified current CommandId */
1857  snap->curcid = cid;
1858 
1859  return snap;
1860 }
1861 
1862 /*
1863  * Free a previously ReorderBufferCopySnap'ed snapshot
      *
      * A snapshot flagged as copied is released with pfree(); a snapshot
      * without that flag takes the else branch instead.
1864  */
1865 static void
1867 {
1868  if (snap->copied)
1869  pfree(snap);
1870  else
1872 }
1873 
1874 /*
1875  * If the transaction was (partially) streamed, we need to prepare or commit
1876  * it in a 'streamed' way. That is, we first stream the remaining part of the
1877  * transaction, and then invoke the stream_prepare or stream_commit callback,
1878  * as appropriate.
      *
      * A prepared transaction is only truncated afterwards (changes and
      * tuplecids removed); any other transaction is cleaned up completely.
1879  */
1880 static void
1882 {
1883  /* we should only call this for previously streamed transactions */
1884  Assert(rbtxn_is_streamed(txn));
1885 
      /* send whatever has not been streamed yet */
1886  ReorderBufferStreamTXN(rb, txn);
1887 
1888  if (rbtxn_prepared(txn))
1889  {
1890  /*
1891  * Note, we send stream prepare even if a concurrent abort is
1892  * detected. See DecodePrepare for more information.
1893  */
1894  rb->stream_prepare(rb, txn, txn->final_lsn);
1895 
1896  /*
1897  * This is a PREPARED transaction, part of a two-phase commit. The
1898  * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1899  * just truncate txn by removing changes and tuplecids.
1900  */
1901  ReorderBufferTruncateTXN(rb, txn, true);
1902  /* Reset the CheckXidAlive */
1904  }
1905  else
1906  {
1907  rb->stream_commit(rb, txn, txn->final_lsn);
1908  ReorderBufferCleanupTXN(rb, txn);
1909  }
1910 }
1911 
1912 /*
1913  * Set xid to detect concurrent aborts.
1914  *
1915  * While streaming an in-progress transaction or decoding a prepared
1916  * transaction there is a possibility that the (sub)transaction might get
1917  * aborted concurrently. In such a case, if the (sub)transaction has a catalog
1918  * update, we might decode the tuple using the wrong catalog version. For
1919  * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1920  * the transaction 501 updates the catalog tuple and after that we will have
1921  * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1922  * aborted and some other transaction, say 502, updates the same catalog tuple
1923  * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1924  * problem is that when we try to decode the tuple inserted/updated in 501
1925  * after the catalog update, we will see the catalog tuple with (xmin: 500,
1926  * xmax: 502) as visible because it will consider that the tuple is deleted by
1927  * xid 502 which is not visible to our snapshot. And when we try to
1928  * decode with that catalog tuple, it can lead to a wrong result or a crash.
1929  * So, it is necessary to detect concurrent aborts to allow streaming of
1930  * in-progress transactions or decoding of prepared transactions.
1931  *
1932  * For detecting the concurrent abort we set CheckXidAlive to the xid of the
1933  * current (sub)transaction to which this change belongs. And, during
1934  * catalog scan we can check the status of the xid and if it is aborted we will
1935  * report a specific error so that we can stop streaming the current transaction
1936  * and discard the already streamed changes on such an error. We might have
1937  * already streamed some of the changes for the aborted (sub)transaction, but
1938  * that is fine because when we decode the abort we will stream an abort message
1939  * to truncate the changes in the subscriber. Similarly, for prepared
1940  * transactions, we stop decoding if a concurrent abort is detected and then
1941  * rollback the changes when rollback prepared is encountered. See
1942  * DecodePrepare.
1943  */
1944 static inline void
1946 {
1947  /*
1948  * If the input transaction id is already set as a CheckXidAlive then
1949  * nothing to do.
1950  */
1952  return;
1953 
1954  /*
1955  * Set up CheckXidAlive if the xid is not committed yet. We don't check if
1956  * the xid is aborted. That will happen during catalog access.
1957  */
1958  if (!TransactionIdDidCommit(xid))
1959  CheckXidAlive = xid;
1960  else
1962 }
1963 
1964 /*
1965  * Helper function for ReorderBufferProcessTXN for applying change.
      *
      * Passes the change on relation to the stream_change callback when
      * streaming is true, and to the apply_change callback otherwise.
1966  */
1967 static inline void
1969  Relation relation, ReorderBufferChange *change,
1970  bool streaming)
1971 {
1972  if (streaming)
1973  rb->stream_change(rb, txn, relation, change);
1974  else
1975  rb->apply_change(rb, txn, relation, change);
1976 }
1977 
1978 /*
1979  * Helper function for ReorderBufferProcessTXN for applying the truncate.
      *
      * Passes the nrelations entries of relations, together with the change,
      * to the stream_truncate callback when streaming is true, and to the
      * apply_truncate callback otherwise.
1980  */
1981 static inline void
1983  int nrelations, Relation *relations,
1984  ReorderBufferChange *change, bool streaming)
1985 {
1986  if (streaming)
1987  rb->stream_truncate(rb, txn, nrelations, relations, change);
1988  else
1989  rb->apply_truncate(rb, txn, nrelations, relations, change);
1990 }
1991 
1992 /*
1993  * Helper function for ReorderBufferProcessTXN for applying the message.
      *
      * Passes the message carried by the change (prefix, size and payload),
      * along with the change's LSN, to the stream_message callback when
      * streaming is true, and to the message callback otherwise. Both callbacks
      * receive the same arguments, with the fourth one always being true.
1994  */
1995 static inline void
1997  ReorderBufferChange *change, bool streaming)
1998 {
1999  if (streaming)
2000  rb->stream_message(rb, txn, change->lsn, true,
2001  change->data.msg.prefix,
2002  change->data.msg.message_size,
2003  change->data.msg.message);
2004  else
2005  rb->message(rb, txn, change->lsn, true,
2006  change->data.msg.prefix,
2007  change->data.msg.message_size,
2008  change->data.msg.message);
2009 }
2010 
2011 /*
2012  * Function to store the command id and snapshot at the end of the current
2013  * stream so that we can reuse the same while sending the next stream.
      *
      * command_id is saved in txn->command_id. snapshot_now is saved in
      * txn->snapshot_now as is when it is already a copy; otherwise a private
      * copy is made with ReorderBufferCopySnap and saved instead.
2014  */
2015 static inline void
2017  Snapshot snapshot_now, CommandId command_id)
2018 {
2019  txn->command_id = command_id;
2020 
2021  /* Avoid copying if it's already copied. */
2022  if (snapshot_now->copied)
2023  txn->snapshot_now = snapshot_now;
2024  else
2025  txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2026  txn, command_id);
2027 }
2028 
2029 /*
2030  * Helper function for ReorderBufferProcessTXN to handle the concurrent
2031  * abort of the streaming transaction. This resets the TXN such that it
2032  * can be used to stream the remaining data of the transaction being processed.
2033  * This can happen when the subtransaction is aborted and we still want to
2034  * continue processing the main or other subtransactions' data.
      *
      * specinsert, if not NULL, is a pending speculative-insert change that is
      * returned here. last_lsn is passed to the stream_stop callback, and
      * snapshot_now/command_id are saved in txn for the next streaming run.
2035  */
2036 static void
2038  Snapshot snapshot_now,
2039  CommandId command_id,
2040  XLogRecPtr last_lsn,
2041  ReorderBufferChange *specinsert)
2042 {
2043  /* Discard the changes that we just streamed */
2045 
2046  /* Free all resources allocated for toast reconstruction */
2047  ReorderBufferToastReset(rb, txn);
2048 
2049  /* Return the spec insert change if it is not NULL */
2050  if (specinsert != NULL)
2051  {
2052  ReorderBufferReturnChange(rb, specinsert, true);
      /* only clears this function's copy of the pointer, not the caller's */
2053  specinsert = NULL;
2054  }
2055 
2056  /*
2057  * For the streaming case, stop the stream and remember the command ID and
2058  * snapshot for the streaming run.
2059  */
2060  if (rbtxn_is_streamed(txn))
2061  {
2062  rb->stream_stop(rb, txn, last_lsn);
2063  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2064  }
2065 }
2066 
2067 /*
2068  * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2069  *
2070  * Send data of a transaction (and its subtransactions) to the
2071  * output plugin. We iterate over the top and subtransactions (using a k-way
2072  * merge) and replay the changes in lsn order.
2073  *
2074  * If streaming is true then data will be sent using stream API.
2075  *
2076  * Note: "volatile" markers on some parameters are to avoid trouble with
2077  * PG_TRY inside the function.
2078  */
2079 static void
2081  XLogRecPtr commit_lsn,
2082  volatile Snapshot snapshot_now,
2083  volatile CommandId command_id,
2084  bool streaming)
2085 {
2086  bool using_subtxn;
2088  ReorderBufferIterTXNState *volatile iterstate = NULL;
2089  volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2090  ReorderBufferChange *volatile specinsert = NULL;
2091  volatile bool stream_started = false;
2092  ReorderBufferTXN *volatile curtxn = NULL;
2093 
2094  /* build data to be able to lookup the CommandIds of catalog tuples */
2096 
2097  /* setup the initial snapshot */
2098  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2099 
2100  /*
2101  * Decoding needs access to syscaches et al., which in turn use
2102  * heavyweight locks and such. Thus we need to have enough state around to
2103  * keep track of those. The easiest way is to simply use a transaction
2104  * internally. That also allows us to easily enforce that nothing writes
2105  * to the database by checking for xid assignments.
2106  *
2107  * When we're called via the SQL SRF there's already a transaction
2108  * started, so start an explicit subtransaction there.
2109  */
2110  using_subtxn = IsTransactionOrTransactionBlock();
2111 
2112  PG_TRY();
2113  {
2114  ReorderBufferChange *change;
2115  int changes_count = 0; /* used to accumulate the number of
2116  * changes */
2117 
2118  if (using_subtxn)
2119  BeginInternalSubTransaction(streaming ? "stream" : "replay");
2120  else
2122 
2123  /*
2124  * We only need to send begin/begin-prepare for non-streamed
2125  * transactions.
2126  */
2127  if (!streaming)
2128  {
2129  if (rbtxn_prepared(txn))
2130  rb->begin_prepare(rb, txn);
2131  else
2132  rb->begin(rb, txn);
2133  }
2134 
2135  ReorderBufferIterTXNInit(rb, txn, &iterstate);
2136  while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2137  {
2138  Relation relation = NULL;
2139  Oid reloid;
2140 
2142 
2143  /*
2144  * We can't call start stream callback before processing first
2145  * change.
2146  */
2147  if (prev_lsn == InvalidXLogRecPtr)
2148  {
2149  if (streaming)
2150  {
2151  txn->origin_id = change->origin_id;
2152  rb->stream_start(rb, txn, change->lsn);
2153  stream_started = true;
2154  }
2155  }
2156 
2157  /*
2158  * Enforce correct ordering of changes, merged from multiple
2159  * subtransactions. The changes may have the same LSN due to
2160  * MULTI_INSERT xlog records.
2161  */
2162  Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2163 
2164  prev_lsn = change->lsn;
2165 
2166  /*
2167  * Set the current xid to detect concurrent aborts. This is
2168  * required for the cases when we decode the changes before the
2169  * COMMIT record is processed.
2170  */
2171  if (streaming || rbtxn_prepared(change->txn))
2172  {
2173  curtxn = change->txn;
2174  SetupCheckXidLive(curtxn->xid);
2175  }
2176 
2177  switch (change->action)
2178  {
2180 
2181  /*
2182  * Confirmation for speculative insertion arrived. Simply
2183  * use as a normal record. It'll be cleaned up at the end
2184  * of INSERT processing.
2185  */
2186  if (specinsert == NULL)
2187  elog(ERROR, "invalid ordering of speculative insertion changes");
2188  Assert(specinsert->data.tp.oldtuple == NULL);
2189  change = specinsert;
2191 
2192  /* intentionally fall through */
2196  Assert(snapshot_now);
2197 
2198  reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2199  change->data.tp.rlocator.relNumber);
2200 
2201  /*
2202  * Mapped catalog tuple without data, emitted while
2203  * catalog table was in the process of being rewritten. We
2204  * can fail to look up the relfilenumber, because the
2205  * relmapper has no "historic" view, in contrast to the
2206  * normal catalog during decoding. Thus repeated rewrites
2207  * can cause a lookup failure. That's OK because we do not
2208  * decode catalog changes anyway. Normally such tuples
2209  * would be skipped over below, but we can't identify
2210  * whether the table should be logically logged without
2211  * mapping the relfilenumber to the oid.
2212  */
2213  if (reloid == InvalidOid &&
2214  change->data.tp.newtuple == NULL &&
2215  change->data.tp.oldtuple == NULL)
2216  goto change_done;
2217  else if (reloid == InvalidOid)
2218  elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2219  relpathperm(change->data.tp.rlocator,
2220  MAIN_FORKNUM));
2221 
2222  relation = RelationIdGetRelation(reloid);
2223 
2224  if (!RelationIsValid(relation))
2225  elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2226  reloid,
2227  relpathperm(change->data.tp.rlocator,
2228  MAIN_FORKNUM));
2229 
2230  if (!RelationIsLogicallyLogged(relation))
2231  goto change_done;
2232 
2233  /*
2234  * Ignore temporary heaps created during DDL unless the
2235  * plugin has asked for them.
2236  */
2237  if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2238  goto change_done;
2239 
2240  /*
2241  * For now ignore sequence changes entirely. Most of the
2242  * time they don't log changes using records we
2243  * understand, so it doesn't make sense to handle the few
2244  * cases we do.
2245  */
2246  if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2247  goto change_done;
2248 
2249  /* user-triggered change */
2250  if (!IsToastRelation(relation))
2251  {
2252  ReorderBufferToastReplace(rb, txn, relation, change);
2253  ReorderBufferApplyChange(rb, txn, relation, change,
2254  streaming);
2255 
2256  /*
2257  * Only clear reassembled toast chunks if we're sure
2258  * they're not required anymore. The creator of the
2259  * tuple tells us.
2260  */
2261  if (change->data.tp.clear_toast_afterwards)
2262  ReorderBufferToastReset(rb, txn);
2263  }
2264  /* we're not interested in toast deletions */
2265  else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2266  {
2267  /*
2268  * Need to reassemble the full toasted Datum in
2269  * memory. To ensure the chunks don't get reused
2270  * until we're done, remove it from the list of this
2271  * transaction's changes. Otherwise it will get
2272  * freed/reused while restoring spooled data from
2273  * disk.
2274  */
2275  Assert(change->data.tp.newtuple != NULL);
2276 
2277  dlist_delete(&change->node);
2278  ReorderBufferToastAppendChunk(rb, txn, relation,
2279  change);
2280  }
2281 
2282  change_done:
2283 
2284  /*
2285  * If speculative insertion was confirmed, the record
2286  * isn't needed anymore.
2287  */
2288  if (specinsert != NULL)
2289  {
2290  ReorderBufferReturnChange(rb, specinsert, true);
2291  specinsert = NULL;
2292  }
2293 
2294  if (RelationIsValid(relation))
2295  {
2296  RelationClose(relation);
2297  relation = NULL;
2298  }
2299  break;
2300 
2302 
2303  /*
2304  * Speculative insertions are dealt with by delaying the
2305  * processing of the insert until the confirmation record
2306  * arrives. For that we simply unlink the record from the
2307  * chain, so it does not get freed/reused while restoring
2308  * spooled data from disk.
2309  *
2310  * This is safe in the face of concurrent catalog changes
2311  * because the relevant relation can't be changed between
2312  * speculative insertion and confirmation due to
2313  * CheckTableNotInUse() and locking.
2314  */
2315 
2316  /* clear out a pending (and thus failed) speculation */
2317  if (specinsert != NULL)
2318  {
2319  ReorderBufferReturnChange(rb, specinsert, true);
2320  specinsert = NULL;
2321  }
2322 
2323  /* and memorize the pending insertion */
2324  dlist_delete(&change->node);
2325  specinsert = change;
2326  break;
2327 
2329 
2330  /*
2331  * Abort for speculative insertion arrived. So cleanup the
2332  * specinsert tuple and toast hash.
2333  *
2334  * Note that we get the spec abort change for each toast
2335  * entry but we need to perform the cleanup only the first
2336  * time we get it for the main table.
2337  */
2338  if (specinsert != NULL)
2339  {
2340  /*
2341  * We must clean the toast hash before processing a
2342  * completely new tuple to avoid confusion about the
2343  * previous tuple's toast chunks.
2344  */
2345  Assert(change->data.tp.clear_toast_afterwards);
2346  ReorderBufferToastReset(rb, txn);
2347 
2348  /* We don't need this record anymore. */
2349  ReorderBufferReturnChange(rb, specinsert, true);
2350  specinsert = NULL;
2351  }
2352  break;
2353 
2355  {
2356  int i;
2357  int nrelids = change->data.truncate.nrelids;
2358  int nrelations = 0;
2359  Relation *relations;
2360 
2361  relations = palloc0(nrelids * sizeof(Relation));
2362  for (i = 0; i < nrelids; i++)
2363  {
2364  Oid relid = change->data.truncate.relids[i];
2365  Relation rel;
2366 
2367  rel = RelationIdGetRelation(relid);
2368 
2369  if (!RelationIsValid(rel))
2370  elog(ERROR, "could not open relation with OID %u", relid);
2371 
2372  if (!RelationIsLogicallyLogged(rel))
2373  continue;
2374 
2375  relations[nrelations++] = rel;
2376  }
2377 
2378  /* Apply the truncate. */
2379  ReorderBufferApplyTruncate(rb, txn, nrelations,
2380  relations, change,
2381  streaming);
2382 
2383  for (i = 0; i < nrelations; i++)
2384  RelationClose(relations[i]);
2385 
2386  break;
2387  }
2388 
2390  ReorderBufferApplyMessage(rb, txn, change, streaming);
2391  break;
2392 
2394  /* Execute the invalidation messages locally */
2395  ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2396  change->data.inval.invalidations);
2397  break;
2398 
2400  /* get rid of the old */
2401  TeardownHistoricSnapshot(false);
2402 
2403  if (snapshot_now->copied)
2404  {
2405  ReorderBufferFreeSnap(rb, snapshot_now);
2406  snapshot_now =
2407  ReorderBufferCopySnap(rb, change->data.snapshot,
2408  txn, command_id);
2409  }
2410 
2411  /*
2412  * Restored from disk, need to be careful not to double
2413  * free. We could introduce refcounting for that, but for
2414  * now this seems infrequent enough not to care.
2415  */
2416  else if (change->data.snapshot->copied)
2417  {
2418  snapshot_now =
2419  ReorderBufferCopySnap(rb, change->data.snapshot,
2420  txn, command_id);
2421  }
2422  else
2423  {
2424  snapshot_now = change->data.snapshot;
2425  }
2426 
2427  /* and continue with the new one */
2428  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2429  break;
2430 
2432  Assert(change->data.command_id != InvalidCommandId);
2433 
2434  if (command_id < change->data.command_id)
2435  {
2436  command_id = change->data.command_id;
2437 
2438  if (!snapshot_now->copied)
2439  {
2440  /* we don't use the global one anymore */
2441  snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2442  txn, command_id);
2443  }
2444 
2445  snapshot_now->curcid = command_id;
2446 
2447  TeardownHistoricSnapshot(false);
2448  SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2449  }
2450 
2451  break;
2452 
2454  elog(ERROR, "tuplecid value in changequeue");
2455  break;
2456  }
2457 
2458  /*
2459  * It is possible that the data is not sent to downstream for a
2460  * long time either because the output plugin filtered it or there
2461  * is a DDL that generates a lot of data that is not processed by
2462  * the plugin. So, in such cases, the downstream can timeout. To
2463  * avoid that we try to send a keepalive message if required.
2464  * Trying to send a keepalive message after every change has some
2465  * overhead, but testing showed there is no noticeable overhead if
2466  * we do it after every ~100 changes.
2467  */
2468 #define CHANGES_THRESHOLD 100
2469 
2470  if (++changes_count >= CHANGES_THRESHOLD)
2471  {
2472  rb->update_progress_txn(rb, txn, change->lsn);
2473  changes_count = 0;
2474  }
2475  }
2476 
2477  /* speculative insertion record must be freed by now */
2478  Assert(!specinsert);
2479 
2480  /* clean up the iterator */
2481  ReorderBufferIterTXNFinish(rb, iterstate);
2482  iterstate = NULL;
2483 
2484  /*
2485  * Update total transaction count and total bytes processed by the
2486  * transaction and its subtransactions. Ensure to not count the
2487  * streamed transaction multiple times.
2488  *
2489  * Note that the statistics computation has to be done after
2490  * ReorderBufferIterTXNFinish as it releases the serialized change
2491  * which we have already accounted in ReorderBufferIterTXNNext.
2492  */
2493  if (!rbtxn_is_streamed(txn))
2494  rb->totalTxns++;
2495 
2496  rb->totalBytes += txn->total_size;
2497 
2498  /*
2499  * Done with current changes, send the last message for this set of
2500  * changes depending upon streaming mode.
2501  */
2502  if (streaming)
2503  {
2504  if (stream_started)
2505  {
2506  rb->stream_stop(rb, txn, prev_lsn);
2507  stream_started = false;
2508  }
2509  }
2510  else
2511  {
2512  /*
2513  * Call either PREPARE (for two-phase transactions) or COMMIT (for
2514  * regular ones).
2515  */
2516  if (rbtxn_prepared(txn))
2517  rb->prepare(rb, txn, commit_lsn);
2518  else
2519  rb->commit(rb, txn, commit_lsn);
2520  }
2521 
2522  /* this is just a sanity check against bad output plugin behaviour */
2524  elog(ERROR, "output plugin used XID %u",
2526 
2527  /*
2528  * Remember the command ID and snapshot for the next set of changes in
2529  * streaming mode.
2530  */
2531  if (streaming)
2532  ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2533  else if (snapshot_now->copied)
2534  ReorderBufferFreeSnap(rb, snapshot_now);
2535 
2536  /* cleanup */
2537  TeardownHistoricSnapshot(false);
2538 
2539  /*
2540  * Aborting the current (sub-)transaction as a whole has the right
2541  * semantics. We want all locks acquired in here to be released, not
2542  * reassigned to the parent, and we do not want any database access
2543  * to have persistent effects.
2544  */
2546 
2547  /* make sure there's no cache pollution */
2549 
2550  if (using_subtxn)
2552 
2553  /*
2554  * We are here due to one of the four reasons: 1. Decoding an
2555  * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2556  * prepared txn that was (partially) streamed. 4. Decoding a committed
2557  * txn.
2558  *
2559  * For 1, we allow truncation of txn data by removing the changes
2560  * already streamed but still keeping other things like invalidations,
2561  * snapshot, and tuplecids. For 2 and 3, we indicate
2562  * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2563  * data as the entire transaction has been decoded except for commit.
2564  * For 4, as the entire txn has been decoded, we can fully clean up
2565  * the TXN reorder buffer.
2566  */
2567  if (streaming || rbtxn_prepared(txn))
2568  {
2570  /* Reset the CheckXidAlive */
2572  }
2573  else
2574  ReorderBufferCleanupTXN(rb, txn);
2575  }
2576  PG_CATCH();
2577  {
2578  MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2579  ErrorData *errdata = CopyErrorData();
2580 
2581  /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2582  if (iterstate)
2583  ReorderBufferIterTXNFinish(rb, iterstate);
2584 
2586 
2587  /*
2588  * Force cache invalidation to happen outside of a valid transaction
2589  * to prevent catalog access as we just caught an error.
2590  */
2592 
2593  /* make sure there's no cache pollution */
2595  txn->invalidations);
2596 
2597  if (using_subtxn)
2599 
2600  /*
2601  * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2602  * abort of the (sub)transaction we are streaming or preparing. We
2603  * need to do the cleanup and return gracefully on this error, see
2604  * SetupCheckXidLive.
2605  *
2606  * This error code can be thrown by one of the callbacks we call
2607  * during decoding so we need to ensure that we return gracefully only
2608  * when we are sending the data in streaming mode and the streaming is
2609  * not finished yet or when we are sending the data out on a PREPARE
2610  * during a two-phase commit.
2611  */
2612  if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2613  (stream_started || rbtxn_prepared(txn)))
2614  {
2615  /* curtxn must be set for streaming or prepared transactions */
2616  Assert(curtxn);
2617 
2618  /* Cleanup the temporary error state. */
2619  FlushErrorState();
2620  FreeErrorData(errdata);
2621  errdata = NULL;
2622  curtxn->concurrent_abort = true;
2623 
2624  /* Reset the TXN so that it is allowed to stream remaining data. */
2625  ReorderBufferResetTXN(rb, txn, snapshot_now,
2626  command_id, prev_lsn,
2627  specinsert);
2628  }
2629  else
2630  {
2631  ReorderBufferCleanupTXN(rb, txn);
2632  MemoryContextSwitchTo(ecxt);
2633  PG_RE_THROW();
2634  }
2635  }
2636  PG_END_TRY();
2637 }
2638 
2639 /*
2640  * Perform the replay of a transaction and its non-aborted subtransactions.
2641  *
2642  * Subtransactions previously have to be processed by
2643  * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2644  * transaction with ReorderBufferAssignChild.
2645  *
2646  * This interface is called once a prepare or toplevel commit is read for both
2647  * streamed as well as non-streamed transactions.
      *
      * The LSNs, timestamp and replication origin passed in are stored in 'txn'
      * before anything else happens. Then exactly one of three paths is taken:
      *  - txn was already (partially) streamed: ReorderBufferStreamCommit()
      *    finishes it;
      *  - txn has no base snapshot: there is nothing to decode, and the txn is
      *    cleaned up unless it is a prepared transaction;
      *  - otherwise: ReorderBufferProcessTXN() replays the changes, starting
      *    from the base snapshot with command id FirstCommandId and passing
      *    false as its final argument.
      *
      * 'xid' is not referenced in the visible body.
      *
      * NOTE(review): the line that carries the function name and its first
      * parameter (listing line 2650) is missing from this copy; callers in this
      * file pass the ReorderBufferTXN as the first argument.
2648  */
2649 static void
2651  ReorderBuffer *rb, TransactionId xid,
2652  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2653  TimestampTz commit_time,
2654  RepOriginId origin_id, XLogRecPtr origin_lsn)
2655 {
2656  Snapshot snapshot_now;
2657  CommandId command_id = FirstCommandId;
2658 
      /* Stash the commit/prepare metadata in the txn itself. */
2659  txn->final_lsn = commit_lsn;
2660  txn->end_lsn = end_lsn;
2661  txn->xact_time.commit_time = commit_time;
2662  txn->origin_id = origin_id;
2663  txn->origin_lsn = origin_lsn;
2664 
2665  /*
2666  * If the transaction was (partially) streamed, we need to commit it in a
2667  * 'streamed' way. That is, we first stream the remaining part of the
2668  * transaction, and then invoke stream_commit message.
2669  *
2670  * Called after everything (origin ID, LSN, ...) is stored in the
2671  * transaction to avoid passing that information directly.
2672  */
2673  if (rbtxn_is_streamed(txn))
2674  {
2675  ReorderBufferStreamCommit(rb, txn);
2676  return;
2677  }
2678 
2679  /*
2680  * If this transaction has no snapshot, it didn't make any changes to the
2681  * database, so there's nothing to decode. Note that
2682  * ReorderBufferCommitChild will have transferred any snapshots from
2683  * subtransactions if there were any.
2684  */
2685  if (txn->base_snapshot == NULL)
2686  {
2687  Assert(txn->ninvalidations == 0);
2688 
2689  /*
2690  * Removing this txn before a commit might result in the computation
2691  * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2692  */
2693  if (!rbtxn_prepared(txn))
2694  ReorderBufferCleanupTXN(rb, txn);
2695  return;
2696  }
2697 
      /* Decoding starts out with the transaction's base snapshot. */
2698  snapshot_now = txn->base_snapshot;
2699 
2700  /* Process and send the changes to output plugin. */
2701  ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2702  command_id, false);
2703 }
2704 
2705 /*
2706  * Commit a transaction.
2707  *
2708  * See comments for ReorderBufferReplay().
      *
      * Looks up the transaction for 'xid'; if the lookup yields NULL the
      * transaction is unknown and nothing is done. Otherwise the transaction and
      * all commit metadata are forwarded unchanged to ReorderBufferReplay().
      *
      * NOTE(review): the line with the function name and leading parameters
      * (listing line 2711) is missing from this copy; the body uses 'rb' and
      * 'xid', which must be declared there.
2709  */
2710 void
2712  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2713  TimestampTz commit_time,
2714  RepOriginId origin_id, XLogRecPtr origin_lsn)
2715 {
2716  ReorderBufferTXN *txn;
2717 
2718  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2719  false);
2720 
2721  /* unknown transaction, nothing to replay */
2722  if (txn == NULL)
2723  return;
2724 
2725  ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2726  origin_id, origin_lsn);
2727 }
2728 
2729 /*
2730  * Record the prepare information for a transaction.
      *
      * Stores the prepare LSN (as final_lsn), end LSN, prepare timestamp and
      * replication origin in the transaction found for 'xid'.
      *
      * Returns false if no transaction is known for 'xid' (nothing is stored),
      * true once the information has been recorded.
      *
      * NOTE(review): the line with the function name and leading parameters
      * (listing line 2733) is missing from this copy; the body uses 'rb' and
      * 'xid', which must be declared there.
2731  */
2732 bool
2734  XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2735  TimestampTz prepare_time,
2736  RepOriginId origin_id, XLogRecPtr origin_lsn)
2737 {
2738  ReorderBufferTXN *txn;
2739 
2740  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2741 
2742  /* unknown transaction, nothing to do */
2743  if (txn == NULL)
2744  return false;
2745 
2746  /*
2747  * Remember the prepare information to be later used by commit prepared in
2748  * case we skip doing prepare.
2749  */
2750  txn->final_lsn = prepare_lsn;
2751  txn->end_lsn = end_lsn;
2752  txn->xact_time.prepare_time = prepare_time;
2753  txn->origin_id = origin_id;
2754  txn->origin_lsn = origin_lsn;
2755 
2756  return true;
2757 }
2758 
2759 /*
      * Remember that we have skipped prepare.
      *
      * Looks up the transaction for 'xid' and returns immediately if it is
      * unknown.
      *
      * NOTE(review): listing lines 2761 (function name and parameters) and 2771
      * (the only statement after the NULL check) are missing from this copy.
      * Presumably line 2771 is what actually records the skipped prepare on the
      * txn - confirm against the original file.
      */
2760 void
2762 {
2763  ReorderBufferTXN *txn;
2764 
2765  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2766 
2767  /* unknown transaction, nothing to do */
2768  if (txn == NULL)
2769  return;
2770 
      /* NOTE(review): statement at listing line 2771 is absent here. */
2772 }
2773 
2774 /*
2775  * Prepare a two-phase transaction.
2776  *
2777  * See comments for ReorderBufferReplay().
      *
      * Marks the transaction found for 'xid' with RBTXN_PREPARE, attaches a copy
      * of 'gid', and replays it using the prepare information already stored in
      * the txn (final_lsn, end_lsn, prepare_time, origin). Does nothing if the
      * transaction is unknown.
      *
      * NOTE(review): listing lines 2780 (function name and leading parameters)
      * and 2796 (the statement following the "prepare info must have been
      * updated" comment, presumably a sanity check) are missing from this copy.
2778  */
2779 void
2781  char *gid)
2782 {
2783  ReorderBufferTXN *txn;
2784 
2785  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2786  false);
2787 
2788  /* unknown transaction, nothing to replay */
2789  if (txn == NULL)
2790  return;
2791 
2792  txn->txn_flags |= RBTXN_PREPARE;
2793  txn->gid = pstrdup(gid);
2794 
2795  /* The prepare info must have been updated in txn by now. */
2797 
2798  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2799  txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2800 
2801  /*
2802  * We send the prepare for the concurrently aborted xacts so that later
2803  * when rollback prepared is decoded and sent, the downstream should be
2804  * able to rollback such a xact. See comments atop DecodePrepare.
2805  *
2806  * Note, for the concurrent_abort + streaming case a stream_prepare was
2807  * already sent within the ReorderBufferReplay call above.
      *
      * txn is still dereferenced here, after the replay; the no-snapshot
      * path of ReorderBufferReplay() only cleans up transactions that are
      * not prepared.
2808  */
2809  if (txn->concurrent_abort && !rbtxn_is_streamed(txn))
2810  rb->prepare(rb, txn, txn->final_lsn);
2811 }
2812 
2813 /*
2814  * This is used to handle COMMIT/ROLLBACK PREPARED.
      *
      * 'is_commit' selects between the two: true invokes the commit_prepared
      * callback, false invokes rollback_prepared. For a commit whose stored
      * final_lsn is below 'two_phase_at', the transaction is first replayed as a
      * prepare using the prepare information saved in the txn. In all cases the
      * txn is updated with the commit information and cleaned up at the end.
      * Does nothing if the transaction is unknown.
      *
      * NOTE(review): listing lines 2817 (function name and leading parameters),
      * 2859 (statement after the "prepare info must have been updated" comment)
      * and 2884 (first line of the call whose last argument is
      * txn->invalidations) are missing from this copy.
2815  */
2816 void
2818  XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2819  XLogRecPtr two_phase_at,
2820  TimestampTz commit_time, RepOriginId origin_id,
2821  XLogRecPtr origin_lsn, char *gid, bool is_commit)
2822 {
2823  ReorderBufferTXN *txn;
2824  XLogRecPtr prepare_end_lsn;
2825  TimestampTz prepare_time;
2826 
2827  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
2828 
2829  /* unknown transaction, nothing to do */
2830  if (txn == NULL)
2831  return;
2832 
2833  /*
2834  * By this time the txn has the prepare record information, remember it to
2835  * be later used for rollback.
      *
      * These copies are needed because end_lsn and xact_time are overwritten
      * with the commit values further down.
2836  */
2837  prepare_end_lsn = txn->end_lsn;
2838  prepare_time = txn->xact_time.prepare_time;
2839 
2840  /* add the gid in the txn */
2841  txn->gid = pstrdup(gid);
2842 
2843  /*
2844  * It is possible that this transaction is not decoded at prepare time
2845  * either because by that time we didn't have a consistent snapshot, or
2846  * two_phase was not enabled, or it was decoded earlier but we have
2847  * restarted. We only need to send the prepare if it was not decoded
2848  * earlier. We don't need to decode the xact for aborts if it is not done
2849  * already.
2850  */
2851  if ((txn->final_lsn < two_phase_at) && is_commit)
2852  {
2853  txn->txn_flags |= RBTXN_PREPARE;
2854 
2855  /*
2856  * The prepare info must have been updated in txn even if we skip
2857  * prepare.
2858  */
2860 
2861  /*
2862  * By this time the txn has the prepare record information and it is
2863  * important to use that so that downstream gets the accurate
2864  * information. If instead, we have passed commit information here
2865  * then downstream can behave as it has already replayed commit
2866  * prepared after the restart.
2867  */
2868  ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2869  txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2870  }
2871 
      /* From here on the txn describes the COMMIT/ROLLBACK PREPARED record. */
2872  txn->final_lsn = commit_lsn;
2873  txn->end_lsn = end_lsn;
2874  txn->xact_time.commit_time = commit_time;
2875  txn->origin_id = origin_id;
2876  txn->origin_lsn = origin_lsn;
2877 
2878  if (is_commit)
2879  rb->commit_prepared(rb, txn, commit_lsn);
2880  else
2881  rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2882 
2883  /* cleanup: make sure there's no cache pollution */
2885  txn->invalidations);
2886  ReorderBufferCleanupTXN(rb, txn);
2887 }
2888 
2889 /*
2890  * Abort a transaction that possibly has previous changes. Needs to be first
2891  * called for subtransactions and then for the toplevel xid.
2892  *
2893  * NB: Transactions handled here have to have actively aborted (i.e. have
2894  * produced an abort record). Implicitly aborted transactions are handled via
2895  * ReorderBufferAbortOld(); transactions we're just not interested in, but
2896  * which have committed are handled in ReorderBufferForget().
2897  *
2898  * This function purges this transaction and its contents from memory and
2899  * disk.
      *
      * The abort timestamp is recorded in the txn and, for streamed
      * transactions, the stream_abort callback is invoked before the cleanup.
      * Does nothing if the transaction is unknown.
      *
      * NOTE(review): listing lines 2902 (function name and leading parameters;
      * the body uses 'rb', 'xid' and 'lsn') and 2928 (first line of the call
      * guarded by "ninvalidations > 0") are missing from this copy.
2900  */
2901 void
2903  TimestampTz abort_time)
2904 {
2905  ReorderBufferTXN *txn;
2906 
2907  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2908  false);
2909 
2910  /* unknown, nothing to remove */
2911  if (txn == NULL)
2912  return;
2913 
2914  txn->xact_time.abort_time = abort_time;
2915 
2916  /* For streamed transactions notify the remote node about the abort. */
2917  if (rbtxn_is_streamed(txn))
2918  {
2919  rb->stream_abort(rb, txn, lsn);
2920 
2921  /*
2922  * We might have decoded changes for this transaction that could load
2923  * the cache as per the current transaction's view (consider DDL's
2924  * happened in this transaction). We don't want the decoding of future
2925  * transactions to use those cache entries so execute invalidations.
2926  */
2927  if (txn->ninvalidations > 0)
2929  txn->invalidations);
2930  }
2931 
2932  /* cosmetic... */
2933  txn->final_lsn = lsn;
2934 
2935  /* remove potential on-disk data, and deallocate */
2936  ReorderBufferCleanupTXN(rb, txn);
2937 }
2938 
2939 /*
2940  * Abort all transactions that aren't actually running anymore because the
2941  * server restarted.
2942  *
2943  * NB: These really have to be transactions that have aborted due to a server
2944  * crash/immediate restart, as we don't deal with invalidations here.
      *
      * Each visited transaction whose xid precedes 'oldestRunningXid' is cleaned
      * up (after a stream_abort callback if it was streamed); the function
      * returns at the first transaction that does not precede it.
      *
      * NOTE(review): listing lines 2947 (function name and parameters; the body
      * uses 'rb' and 'oldestRunningXid') and 2958 (the loop header that drives
      * the mutable iterator 'it') are missing from this copy.
2945  */
2946 void
2948 {
2949  dlist_mutable_iter it;
2950 
2951  /*
2952  * Iterate through all (potential) toplevel TXNs and abort all that are
2953  * older than what possibly can be running. Once we've found the first
2954  * that is alive we stop, there might be some that acquired an xid earlier
2955  * but started writing later, but it's unlikely and they will be cleaned
2956  * up in a later call to this function.
2957  */
2959  {
2960  ReorderBufferTXN *txn;
2961 
2962  txn = dlist_container(ReorderBufferTXN, node, it.cur);
2963 
2964  if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2965  {
2966  elog(DEBUG2, "aborting old transaction %u", txn->xid);
2967 
2968  /* Notify the remote node about the crash/immediate restart. */
2969  if (rbtxn_is_streamed(txn))
2970  rb->stream_abort(rb, txn, InvalidXLogRecPtr);
2971 
2972  /* remove potential on-disk data, and deallocate this tx */
2973  ReorderBufferCleanupTXN(rb, txn);
2974  }
2975  else
2976  return;
2977  }
2978 }
2979 
2980 /*
2981  * Forget the contents of a transaction if we aren't interested in its
2982  * contents. Needs to be first called for subtransactions and then for the
2983  * toplevel xid.
2984  *
2985  * This is significantly different to ReorderBufferAbort() because
2986  * transactions that have committed need to be treated differently from aborted
2987  * ones since they may have modified the catalog.
2988  *
2989  * Note that this is only allowed to be called in the moment a transaction
2990  * commit has just been read, not earlier; otherwise later records referring
2991  * to this xid might re-create the transaction incompletely.
      *
      * Does nothing if the transaction is unknown. The transaction must not have
      * been streamed (asserted).
      *
      * NOTE(review): listing lines 2994 (function name and parameters; the body
      * uses 'rb', 'xid' and 'lsn') and 3017 (first line of the call that
      * processes txn->invalidations) are missing from this copy.
2992  */
2993 void
2995 {
2996  ReorderBufferTXN *txn;
2997 
2998  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2999  false);
3000 
3001  /* unknown, nothing to forget */
3002  if (txn == NULL)
3003  return;
3004 
3005  /* this transaction mustn't be streamed */
3006  Assert(!rbtxn_is_streamed(txn));
3007 
3008  /* cosmetic... */
3009  txn->final_lsn = lsn;
3010 
3011  /*
3012  * Process cache invalidation messages if there are any. Even if we're not
3013  * interested in the transaction's contents, it could have manipulated the
3014  * catalog and we need to update the caches according to that.
3015  */
3016  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3018  txn->invalidations);
3019  else
3020  Assert(txn->ninvalidations == 0);
3021 
3022  /* remove potential on-disk data, and deallocate */
3023  ReorderBufferCleanupTXN(rb, txn);
3024 }
3025 
3026 /*
3027  * Invalidate cache for those transactions that need to be skipped just in case
3028  * catalogs were manipulated as part of the transaction.
3029  *
3030  * Note that this is a special-purpose function for prepared transactions where
3031  * we don't want to clean up the TXN even when we decide to skip it. See
3032  * DecodePrepare.
      *
      * Unlike ReorderBufferForget(), the visible body neither updates final_lsn
      * nor calls ReorderBufferCleanupTXN(). Does nothing if the transaction is
      * unknown.
      *
      * NOTE(review): listing lines 3035 (function name and parameters; the body
      * uses 'rb' and 'xid') and 3052 (first line of the call that processes
      * txn->invalidations) are missing from this copy.
3033  */
3034 void
3036 {
3037  ReorderBufferTXN *txn;
3038 
3039  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3040  false);
3041 
3042  /* unknown, nothing to do */
3043  if (txn == NULL)
3044  return;
3045 
3046  /*
3047  * Process cache invalidation messages if there are any. Even if we're not
3048  * interested in the transaction's contents, it could have manipulated the
3049  * catalog and we need to update the caches according to that.
3050  */
3051  if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3053  txn->invalidations);
3054  else
3055  Assert(txn->ninvalidations == 0);
3056 }
3057 
3058 
3059 /*
3060  * Execute invalidations happening outside the context of a decoded
3061  * transaction. That currently happens either for xid-less commits
3062  * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3063  * transactions (via ReorderBufferForget()).
      *
      * Each of the 'ninvalidations' messages in 'invalidations' is applied with
      * LocalExecuteInvalidationMessage(). When called inside a transaction or
      * transaction block, the work is bracketed by an internal subtransaction
      * named "replay".
      *
      * NOTE(review): listing lines 3066 (function name and first parameter),
      * 3082 and 3088 (the statements controlled by the second and third
      * "if (use_subtxn)") are missing from this copy. Going by the comment
      * above line 3081, line 3082 presumably ends the transaction state before
      * the messages are executed - confirm against the original file.
3064  */
3065 void
3067  SharedInvalidationMessage *invalidations)
3068 {
3069  bool use_subtxn = IsTransactionOrTransactionBlock();
3070  int i;
3071 
3072  if (use_subtxn)
3073  BeginInternalSubTransaction("replay");
3074 
3075  /*
3076  * Force invalidations to happen outside of a valid transaction - that way
3077  * entries will just be marked as invalid without accessing the catalog.
3078  * That's advantageous because we don't need to setup the full state
3079  * necessary for catalog access.
3080  */
3081  if (use_subtxn)
3083 
3084  for (i = 0; i < ninvalidations; i++)
3085  LocalExecuteInvalidationMessage(&invalidations[i]);
3086 
3087  if (use_subtxn)
3089 }
3090 
3091 /*
3092  * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3093  * least once for every xid in XLogRecord->xl_xid (other places in records
3094  * may, but do not have to be passed through here).
3095  *
3096  * Reorderbuffer keeps some data structures about transactions in LSN order,
3097  * for efficiency. To do that it has to know about when transactions are seen
3098  * first in the WAL. As many types of records are not actually interesting for
3099  * logical decoding, they do not necessarily pass through here.
      *
      * The result of the ReorderBufferTXNByXid() lookup is discarded; the call
      * is made only for its effect on the reorder buffer. An invalid xid is
      * ignored.
      *
      * NOTE(review): the line with the function name and parameters (listing
      * line 3102) is missing from this copy; the body uses 'rb', 'xid' and
      * 'lsn'.
3100  */
3101 void
3103 {
3104  /* many records won't have an xid assigned, centralize check here */
3105  if (xid != InvalidTransactionId)
3106  ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3107 }
3108 
3109 /*
3110  * Add a new snapshot to this transaction that may only be used after lsn 'lsn'
3111  * because the previous snapshot doesn't describe the catalog correctly for
3112  * following rows.
      *
      * The snapshot is attached to a change record which is then queued for
      * 'xid' at 'lsn' via ReorderBufferQueueChange().
      *
      * NOTE(review): listing lines 3115 (function name and leading parameters),
      * 3118 (where 'change' is declared/obtained) and 3121 (a statement between
      * setting the snapshot and queueing, presumably setting the change's
      * action) are missing from this copy.
3113  */
3114 void
3116  XLogRecPtr lsn, Snapshot snap)
3117 {
3119 
3120  change->data.snapshot = snap;
3122 
3123  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3124 }
3125 
3126 /*
3127  * Set up the transaction's base snapshot.
3128  *
3129  * If we know that xid is a subtransaction, set the base snapshot on the
3130  * top-level transaction instead.
      *
      * 'snap' must not be NULL, and the transaction that ends up receiving the
      * snapshot must not already have one (both asserted). The snapshot and the
      * LSN it was set at are stored in the txn.
      *
      * NOTE(review): listing lines 3133 (function name and leading parameters)
      * and 3153 (a statement after base_snapshot_lsn is set) are missing from
      * this copy. 'is_new' is filled in by the lookup but not read in the
      * visible body.
3131  */
3132 void
3134  XLogRecPtr lsn, Snapshot snap)
3135 {
3136  ReorderBufferTXN *txn;
3137  bool is_new;
3138 
3139  Assert(snap != NULL);
3140 
3141  /*
3142  * Fetch the transaction to operate on. If we know it's a subtransaction,
3143  * operate on its top-level transaction instead.
3144  */
3145  txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3146  if (rbtxn_is_known_subxact(txn))
3147  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3148  NULL, InvalidXLogRecPtr, false);
3149  Assert(txn->base_snapshot == NULL);
3150 
3151  txn->base_snapshot = snap;
3152  txn->base_snapshot_lsn = lsn;
3154 
3155  AssertTXNLsnOrder(rb);
3156 }
3157 
3158 /*
3159  * Access the catalog with this CommandId at this point in the changestream.
3160  *
3161  * May only be called for command ids > 1
      *
      * The command id is stored in a change record which is then queued for
      * 'xid' at 'lsn' via ReorderBufferQueueChange().
      *
      * NOTE(review): listing lines 3164 (function name and leading parameters),
      * 3167 (where 'change' is declared/obtained) and 3170 (a statement between
      * setting the command id and queueing, presumably setting the change's
      * action) are missing from this copy.
3162  */
3163 void
3165  XLogRecPtr lsn, CommandId cid)
3166 {
3168 
3169  change->data.command_id = cid;
3171 
3172  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3173 }
3174 
3175 /*
3176  * Update memory counters to account for the new or removed change.
3177  *
3178  * We update two counters - in the reorder buffer, and in the transaction
3179  * containing the change. The reorder buffer counter allows us to quickly
3180  * decide if we reached the memory limit, the transaction counter allows
3181  * us to quickly pick the largest transaction for eviction.
3182  *
3183  * Either txn or change must be non-NULL at least. We update the memory
3184  * counter of txn if it's non-NULL, otherwise change->txn.
3185  *
3186  * When streaming is enabled, we need to update the toplevel transaction
3187  * counters instead - we don't really care about subtransactions as we
3188  * can't stream them individually anyway, and we only pick toplevel
3189  * transactions for eviction. So only toplevel transactions matter.
      *
      * 'addition' selects whether 'sz' bytes are added to or subtracted from
      * the counters. A size of zero, or a change whose action is
      * REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID, leaves everything untouched.
      * Besides txn->size and rb->size, the top-level transaction's total_size
      * is adjusted, and the txn is repositioned in rb->txn_heap; a txn whose
      * size is zero is not kept in the heap.
      *
      * NOTE(review): the line with the function name and first parameter
      * (listing line 3192) is missing from this copy; the body uses 'rb'.
3190  */
3191 static void
3193  ReorderBufferChange *change,
3194  ReorderBufferTXN *txn,
3195  bool addition, Size sz)
3196 {
3197  ReorderBufferTXN *toptxn;
3198 
3199  Assert(txn || change);
3200 
3201  /*
3202  * Ignore tuple CID changes, because those are not evicted when reaching
3203  * memory limit. So we just don't count them, because it might easily
3204  * trigger a pointless attempt to spill.
3205  */
3206  if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3207  return;
3208 
      /* Nothing to account for; also avoids touching the heap needlessly. */
3209  if (sz == 0)
3210  return;
3211 
3212  if (txn == NULL)
3213  txn = change->txn;
3214  Assert(txn != NULL);
3215 
3216  /*
3217  * Update the total size in top level as well. This is later used to
3218  * compute the decoding stats.
3219  */
3220  toptxn = rbtxn_get_toptxn(txn);
3221 
3222  if (addition)
3223  {
3224  Size oldsize = txn->size;
3225 
3226  txn->size += sz;
3227  rb->size += sz;
3228 
3229  /* Update the total size in the top transaction. */
3230  toptxn->total_size += sz;
3231 
3232  /* Update the max-heap */
      /* A txn that had size zero is not in the heap yet, so only add it. */
3233  if (oldsize != 0)
3234  pairingheap_remove(rb->txn_heap, &txn->txn_node);
3235  pairingheap_add(rb->txn_heap, &txn->txn_node);
3236  }
3237  else
3238  {
3239  Assert((rb->size >= sz) && (txn->size >= sz));
3240  txn->size -= sz;
3241  rb->size -= sz;
3242 
3243  /* Update the total size in the top transaction. */
3244  toptxn->total_size -= sz;
3245 
3246  /* Update the max-heap */
      /* Re-insert only if the txn still holds memory after the removal. */
3247  pairingheap_remove(rb->txn_heap, &txn->txn_node);
3248  if (txn->size != 0)
3249  pairingheap_add(rb->txn_heap, &txn->txn_node);
3250  }
3251 
3252  Assert(txn->size <= rb->size);
3253 }
3254 
3255 /*
3256  * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3257  *
3258  * We do not include this change type in memory accounting, because we
3259  * keep CIDs in a separate list and do not evict them when reaching
3260  * the memory limit.
      *
      * The mapping is stored in a change record that is appended to the
      * transaction's own tuplecids list (not queued through
      * ReorderBufferQueueChange()), and txn->ntuplecids is incremented.
      *
      * NOTE(review): listing lines 3263 (function name and leading parameters;
      * the body uses 'rb' and 'xid'), 3268 (where 'change' is
      * declared/obtained) and 3280 (a statement after change->txn is set,
      * presumably setting the change's action) are missing from this copy.
3261  */
3262 void
3264  XLogRecPtr lsn, RelFileLocator locator,
3265  ItemPointerData tid, CommandId cmin,
3266  CommandId cmax, CommandId combocid)
3267 {
3269  ReorderBufferTXN *txn;
3270 
3271  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3272 
3273  change->data.tuplecid.locator = locator;
3274  change->data.tuplecid.tid = tid;
3275  change->data.tuplecid.cmin = cmin;
3276  change->data.tuplecid.cmax = cmax;
3277  change->data.tuplecid.combocid = combocid;
3278  change->lsn = lsn;
3279  change->txn = txn;
3281 
3282  dlist_push_tail(&txn->tuplecids, &change->node);
3283  txn->ntuplecids++;
3284 }
3285 
3286 /*
3287  * Accumulate the invalidations for executing them later.
3288  *
3289  * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3290  * accumulates all the invalidation messages in the toplevel transaction, if
3291  * available, otherwise in the current transaction, as well as in the form of
3292  * a change in the reorder buffer. We need to record them as a change
3293  * so that we can execute only the required invalidations instead of executing
3294  * all the invalidations on each CommandId increment. We also need to
3295  * accumulate these in the txn buffer because in some cases where we skip
3296  * processing the transaction (see ReorderBufferForget), we need to execute
3297  * all the invalidations together.
      *
      * 'nmsgs' must be greater than zero (asserted). The messages are copied
      * twice: once into the (top-level) txn's invalidations array and once into
      * a new change that is queued for 'xid' at 'lsn'. All allocations in this
      * function are made while rb->context is the current memory context.
      *
      * NOTE(review): listing lines 3300 (function name and leading parameters),
      * 3302 (the final parameter, which the body calls 'msgs'), 3325 and
      * 3332-3333 (the left-hand sides/allocation calls that set
      * txn->invalidations in the two branches) and 3342 (a statement right
      * after the change is obtained, presumably setting its action) are missing
      * from this copy.
3298  */
3299 void
3301  XLogRecPtr lsn, Size nmsgs,
3303 {
3304  ReorderBufferTXN *txn;
3305  MemoryContext oldcontext;
3306  ReorderBufferChange *change;
3307 
3308  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3309 
3310  oldcontext = MemoryContextSwitchTo(rb->context);
3311 
3312  /*
3313  * Collect all the invalidations under the top transaction, if available,
3314  * so that we can execute them all together. See comments atop this
3315  * function.
3316  */
3317  txn = rbtxn_get_toptxn(txn);
3318 
3319  Assert(nmsgs > 0);
3320 
3321  /* Accumulate invalidations. */
3322  if (txn->ninvalidations == 0)
3323  {
      /* First batch for this txn: start a fresh array. */
3324  txn->ninvalidations = nmsgs;
3326  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3327  memcpy(txn->invalidations, msgs,
3328  sizeof(SharedInvalidationMessage) * nmsgs);
3329  }
3330  else
3331  {
      /* Later batch: append the new messages after the existing ones. */
3334  (txn->ninvalidations + nmsgs));
3335 
3336  memcpy(txn->invalidations + txn->ninvalidations, msgs,
3337  nmsgs * sizeof(SharedInvalidationMessage));
3338  txn->ninvalidations += nmsgs;
3339  }
3340 
      /* Also record this batch as a change of its own, with a private copy. */
3341  change = ReorderBufferGetChange(rb);
3343  change->data.inval.ninvalidations = nmsgs;
3344  change->data.inval.invalidations = (SharedInvalidationMessage *)
3345  palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3346  memcpy(change->data.inval.invalidations, msgs,
3347  sizeof(SharedInvalidationMessage) * nmsgs);
3348 
3349  ReorderBufferQueueChange(rb, xid, lsn, change, false);
3350 
3351  MemoryContextSwitchTo(oldcontext);
3352 }
3353 
3354 /*
3355  * Apply all invalidations we know. Possibly we only need parts at this point
3356  * in the changestream but we don't know which those are.
      *
      * Visits each of the nmsgs entries once, in order.
3357  */
3358 static void
3360 {
3361  int i;
3362 
      /* NOTE(review): the statement forming the loop body is not visible here */
3363  for (i = 0; i < nmsgs; i++)
3365 }
3366 
3367 /*
3368  * Mark a transaction as containing catalog changes
      *
      * The transaction is looked up by xid. If it is a subtransaction, its
      * top-level transaction is handled as well. Both steps are skipped for a
      * transaction that already reports catalog changes, so repeated calls are
      * harmless.
3369  */
3370 void
3372  XLogRecPtr lsn)
3373 {
3374  ReorderBufferTXN *txn;
3375 
3376  txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3377 
      /* only act the first time; presumably sets the flag - body not visible here */
3378  if (!rbtxn_has_catalog_changes(txn))
3379  {
3382  }
3383 
3384  /*
3385  * Mark top-level transaction as having catalog changes too if one of its
3386  * children has so that the ReorderBufferBuildTupleCidHash can
3387  * conveniently check just top-level transaction and decide whether to
3388  * build the hash table or not.
3389  */
3390  if (rbtxn_is_subtxn(txn))
3391  {
3392  ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3393 
3394  if (!rbtxn_has_catalog_changes(toptxn))
3395  {
3398  }
3399  }
3400 }
3401 
3402 /*
3403  * Return palloc'ed array of the transactions that have changed catalogs.
3404  * The returned array is sorted in xidComparator order.
3405  *
3406  * The caller must free the returned array when done with it.
      *
      * Returns NULL (and allocates nothing) when rb->catchange_txns is empty.
      * The function does not report the array length; it equals the number of
      * entries in rb->catchange_txns (asserted below).
3407  */
3408 TransactionId *
3410 {
3411  dlist_iter iter;
3412  TransactionId *xids = NULL;
3413  size_t xcnt = 0;
3414 
3415  /* Quick return if the list is empty */
3416  if (dclist_count(&rb->catchange_txns) == 0)
3417  return NULL;
3418 
3419  /* Initialize XID array */
3420  xids = (TransactionId *) palloc(sizeof(TransactionId) *
      /* collect the xid of every transaction on the list, in list order */
3422  dclist_foreach(iter, &rb->catchange_txns)
3423  {
3425  catchange_node,
3426  iter.cur);
3427 
3429 
3430  xids[xcnt++] = txn->xid;
3431  }
3432 
      /* list order is not the required output order, so sort explicitly */
3433  qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3434 
3435  Assert(xcnt == dclist_count(&rb->catchange_txns));
3436  return xids;
3437 }
3438 
3439 /*
3440  * Query whether a transaction is already *known* to contain catalog
3441  * changes. This can be wrong until directly before the commit!
      *
      * Returns false when no transaction is found for xid; otherwise returns
      * the transaction's current catalog-changes state.
3442  */
3443 bool
3445 {
3446  ReorderBufferTXN *txn;
3447 
      /* pure lookup; the 'false' flags presumably suppress creation - TODO confirm */
3448  txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3449  false);
3450  if (txn == NULL)
3451  return false;
3452 
3453  return rbtxn_has_catalog_changes(txn);
3454 }
3455 
3456 /*
3457  * ReorderBufferXidHasBaseSnapshot
3458  * Have we already set the base snapshot for the given txn/subtxn?
      *
      * Returns false if the transaction is not known. For a known
      * subtransaction the answer is taken from its top-level transaction,
      * located via toplevel_xid.
3459  */
3460 bool
3462 {
3463  ReorderBufferTXN *txn;
3464 
3465  txn = ReorderBufferTXNByXid(rb, xid, false,
3466  NULL, InvalidXLogRecPtr, false);
3467 
3468  /* transaction isn't known yet, ergo no snapshot */
3469  if (txn == NULL)
3470  return false;
3471 
3472  /* a known subtxn? operate on top-level txn instead */
      /*
      * NOTE(review): the result of this second lookup is dereferenced below
      * without a NULL check; presumably a known subxact always has its
      * top-level transaction registered - confirm.
      */
3473  if (rbtxn_is_known_subxact(txn))
3474  txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3475  NULL, InvalidXLogRecPtr, false);
3476 
3477  return txn->base_snapshot != NULL;
3478 }
3479 
3480 
3481 /*
3482  * ---------------------------------------
3483  * Disk serialization support
3484  * ---------------------------------------
3485  */
3486 
3487 /*
3488  * Ensure the IO buffer is >= sz.
      *
      * rb->outbuf is allocated in rb->context on first use (outbufsize is zero)
      * and enlarged with repalloc when it is too small; it is never shrunk.
      * Because growing may move the buffer, callers must re-read rb->outbuf
      * after calling this rather than keep an older pointer.
3489  */
3490 static void
3492 {
3493  if (!rb->outbufsize)
3494  {
3495  rb->outbuf = MemoryContextAlloc(rb->context, sz);
3496  rb->outbufsize = sz;
3497  }
3498  else if (rb->outbufsize < sz)
3499  {
3500  rb->outbuf = repalloc(rb->outbuf, sz);
3501  rb->outbufsize = sz;
3502  }
3503 }
3504 
3505 
3506 /*
      * Compare two transactions by size
      *
      * Returns -1, 1 or 0 when the first transaction's size is respectively
      * smaller than, larger than or equal to the second one's.
      */
3507 static int
3509 {
3512 
3513  if (ta->size < tb->size)
3514  return -1;
3515  if (ta->size > tb->size)
3516  return 1;
3517  return 0;
3518 }
3519 
3520 /*
3521  * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
      *
      * The result is asserted to be non-NULL, to have a size greater than zero
      * and to be no larger than the buffer's total accounted size.
3522  */
3523 static ReorderBufferTXN *
3525 {
3526  ReorderBufferTXN *largest;
3527 
3528  /* Get the largest transaction from the max-heap */
3529  largest = pairingheap_container(ReorderBufferTXN, txn_node,
3531 
3532  Assert(largest);
3533  Assert(largest->size > 0);
3534  Assert(largest->size <= rb->size);
3535 
3536  return largest;
3537 }
3538 
3539 /*
3540  * Find the largest streamable toplevel transaction to evict (by streaming).
3541  *
3542  * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3543  * should give us the same transaction (because we don't update memory account
3544  * for subtransaction with streaming, so it's always 0). But we can simply
3545  * iterate over the limited number of toplevel transactions that have a base
3546  * snapshot. There is no use of selecting a transaction that doesn't have base
3547  * snapshot because we don't decode such transactions. Also, we do not select
3548  * the transaction which doesn't have any streamable change.
3549  *
3550  * Note that, we skip transactions that contain incomplete changes. There
3551  * is a scope of optimization here such that we can select the largest
3552  * transaction which has incomplete changes. But that will make the code and
3553  * design quite complex and that might not be worth the benefit. If we plan to
3554  * stream the transactions that contain incomplete changes then we need to
3555  * find a way to partially stream/truncate the transaction changes in-memory
3556  * and build a mechanism to partially truncate the spilled files.
3557  * Additionally, whenever we partially stream the transaction we need to
3558  * maintain the last streamed lsn and next time we need to restore from that
3559  * segment and the offset in WAL. As we stream the changes from the top
3560  * transaction and restore them subtransaction wise, we need to even remember
3561  * the subxact from where we streamed the last change.
      *
      * Returns NULL when no transaction qualifies. Candidates are compared by
      * total_size; on a tie the one encountered first is kept.
3562  */
3563 static ReorderBufferTXN *
3565 {
3566  dlist_iter iter;
3567  Size largest_size = 0;
3568  ReorderBufferTXN *largest = NULL;
3569 
3570  /* Find the largest top-level transaction having a base snapshot. */
3572  {
3573  ReorderBufferTXN *txn;
3574 
3575  txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3576 
3577  /* must not be a subtxn */
3579  /* base_snapshot must be set */
3580  Assert(txn->base_snapshot != NULL);
3581 
      /* strictly larger than the best so far, non-empty, and fully decodable */
3582  if ((largest == NULL || txn->total_size > largest_size) &&
3583  (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)) &&
3585  {
3586  largest = txn;
3587  largest_size = txn->total_size;
3588  }
3589  }
3590 
3591  return largest;
3592 }
3593 
3594 /*
3595  * Check whether the logical_decoding_work_mem limit was reached, and if yes
3596  * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3597  * disk or send to the output plugin until we reach under the memory limit.
3598  *
3599  * If debug_logical_replication_streaming is set to "immediate", stream or
3600  * serialize the changes immediately.
3601  *
3602  * XXX At this point we select the transactions until we reach under the memory
3603  * limit, but we might also adapt a more elaborate eviction strategy - for example
3604  * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3605  * limit.
      *
      * logical_decoding_work_mem is multiplied by 1024 before being compared
      * with rb->size, i.e. the setting is in kilobytes and rb->size in bytes.
3606  */
3607 static void
3609 {
3610  ReorderBufferTXN *txn;
3611 
3612  /*
3613  * Bail out if debug_logical_replication_streaming is buffered and we
3614  * haven't exceeded the memory limit.
3615  */
3617  rb->size < logical_decoding_work_mem * 1024L)
3618  return;
3619 
3620  /*
3621  * If debug_logical_replication_streaming is immediate, loop until there's
3622  * no change. Otherwise, loop until we reach under the memory limit. One
3623  * might think that just by evicting the largest (sub)transaction we will
3624  * come under the memory limit based on assumption that the selected
3625  * transaction is at least as large as the most recent change (which
3626  * caused us to go over the memory limit). However, that is not true
3627  * because a user can reduce the logical_decoding_work_mem to a smaller
3628  * value before the most recent change.
3629  */
3630  while (rb->size >= logical_decoding_work_mem * 1024L ||
3632  rb->size > 0))
3633  {
3634  /*
3635  * Pick the largest transaction and evict it from memory by streaming,
3636  * if possible. Otherwise, spill to disk.
3637  */
3639  (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3640  {
3641  /* we know there has to be one, because the size is not zero */
3642  Assert(txn && rbtxn_is_toptxn(txn));
3643  Assert(txn->total_size > 0);
3644  Assert(rb->size >= txn->total_size);
3645 
3646  ReorderBufferStreamTXN(rb, txn);
3647  }
3648  else
3649  {
3650  /*
3651  * Pick the largest transaction (or subtransaction) and evict it
3652  * from memory by serializing it to disk.
3653  */
3654  txn = ReorderBufferLargestTXN(rb);
3655 
3656  /* we know there has to be one, because the size is not zero */
3657  Assert(txn);
3658  Assert(txn->size > 0);
3659  Assert(rb->size >= txn->size);
3660 
3661  ReorderBufferSerializeTXN(rb, txn);
3662  }
3663 
3664  /*
3665  * After eviction, the transaction should have no entries in memory,
3666  * and should use 0 bytes for changes.
3667  */
3668  Assert(txn->size == 0);
3669  Assert(txn->nentries_mem == 0);
3670  }
3671 
3672  /* We must be under the memory limit now. */
3673  Assert(rb->size < logical_decoding_work_mem * 1024L);
3674 
3675 }
3676 
3677 /*
3678  * Spill data of a large transaction (and its subtransactions) to disk.
      *
      * Subtransactions are spilled first, by recursion. Then each in-memory
      * change of txn is written to the spill file of the WAL segment that
      * contains the change's LSN, removed from txn->changes and released.
      * Afterwards the memory accounting is reduced by the size txn had on
      * entry, spill statistics are updated if at least one change was written,
      * and nentries_mem is reset to zero.
3679  */
3680 static void
3682 {
3683  dlist_iter subtxn_i;
3684  dlist_mutable_iter change_i;
3685  int fd = -1;
3686  XLogSegNo curOpenSegNo = 0;
3687  Size spilled = 0;
      /* remember the size before the changes are released below */
3688  Size size = txn->size;
3689 
3690  elog(DEBUG2, "spill %u changes in XID %u to disk",
3691  (uint32) txn->nentries_mem, txn->xid);
3692 
3693  /* do the same to all child TXs */
3694  dlist_foreach(subtxn_i, &txn->subtxns)
3695  {
3696  ReorderBufferTXN *subtxn;
3697 
3698  subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3699  ReorderBufferSerializeTXN(rb, subtxn);
3700  }
3701 
3702  /* serialize changestream */
3703  dlist_foreach_modify(change_i, &txn->changes)
3704  {
3705  ReorderBufferChange *change;
3706 
3707  change = dlist_container(ReorderBufferChange, node, change_i.cur);
3708 
3709  /*
3710  * store in segment in which it belongs by start lsn, don't split over
3711  * multiple segments tho
3712  */
      /* (re)open a file whenever none is open or the change's segment differs */
3713  if (fd == -1 ||
3714  !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3715  {
3716  char path[MAXPGPATH];
3717 
3718  if (fd != -1)
3720 
3721  XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3722 
3723  /*
3724  * No need to care about TLIs here, only used during a single run,
3725  * so each LSN only maps to a specific WAL record.
3726  */
3728  curOpenSegNo);
3729 
3730  /* open segment, create it if necessary */
      /* O_APPEND: a later spill of the same segment adds to the same file */
3731  fd = OpenTransientFile(path,
3732  O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3733 
3734  if (fd < 0)
3735  ereport(ERROR,
3737  errmsg("could not open file \"%s\": %m", path)));
3738  }
3739 
      /* write it out, then drop the in-memory copy */
3740  ReorderBufferSerializeChange(rb, txn, fd, change);
3741  dlist_delete(&change->node);
3742  ReorderBufferReturnChange(rb, change, false);
3743 
3744  spilled++;
3745  }
3746 
3747  /* Update the memory counter */
3748  ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
3749 
3750  /* update the statistics iff we have spilled anything */
3751  if (spilled)
3752  {
3753  rb->spillCount += 1;
3754  rb->spillBytes += size;
3755 
3756  /* don't consider already serialized transactions */
3757  rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3758 
3759  /* update the decoding stats */
3761  }
3762 
3763  Assert(spilled == txn->nentries_mem);
3764  Assert(dlist_is_empty(&txn->changes));
3765  txn->nentries_mem = 0;
3767 
3768  if (fd != -1)
3770 }
3771 
3772 /*
3773  * Serialize individual change to disk.
      *
      * The record is assembled in rb->outbuf: a ReorderBufferDiskChange, whose
      * embedded change is a byte copy of *change, followed by an action-specific
      * payload (tuples, message, invalidations, snapshot or truncated relation
      * OIDs). ondisk->size is set to the total record length, fixed part
      * included, and the record is written to fd with one write() call. A
      * failed or short write raises an ERROR. Finally txn->final_lsn is advanced
      * to the change's LSN if that is larger.
3774  */
3775 static void
3777  int fd, ReorderBufferChange *change)
3778 {
3779  ReorderBufferDiskChange *ondisk;
      /* running total of the record length; payload sizes are added per case */
3780  Size sz = sizeof(ReorderBufferDiskChange);
3781 
3783 
3784  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3785  memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3786 
3787  switch (change->action)
3788  {
3789  /* fall through these, they're all similar enough */
3794  {
      /* payload: [HeapTupleData + old tuple body] [HeapTupleData + new tuple body] */
3795  char *data;
3796  HeapTuple oldtup,
3797  newtup;
3798  Size oldlen = 0;
3799  Size newlen = 0;
3800 
3801  oldtup = change->data.tp.oldtuple;
3802  newtup = change->data.tp.newtuple;
3803 
3804  if (oldtup)
3805  {
3806  sz += sizeof(HeapTupleData);
3807  oldlen = oldtup->t_len;
3808  sz += oldlen;
3809  }
3810 
3811  if (newtup)
3812  {
3813  sz += sizeof(HeapTupleData);
3814  newlen = newtup->t_len;
3815  sz += newlen;
3816  }
3817 
3818  /* make sure we have enough space */
3820 
3821  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3822  /* might have been reallocated above */
3823  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3824 
3825  if (oldlen)
3826  {
3827  memcpy(data, oldtup, sizeof(HeapTupleData));
3828  data += sizeof(HeapTupleData);
3829 
3830  memcpy(data, oldtup->t_data, oldlen);
3831  data += oldlen;
3832  }
3833 
3834  if (newlen)
3835  {
3836  memcpy(data, newtup, sizeof(HeapTupleData));
3837  data += sizeof(HeapTupleData);
3838 
3839  memcpy(data, newtup->t_data, newlen);
3840  data += newlen;
3841  }
3842  break;
3843  }
3845  {
      /* payload: prefix length, prefix bytes (incl. NUL), message length, message bytes */
3846  char *data;
3847  Size prefix_size = strlen(change->data.msg.prefix) + 1;
3848 
3849  sz += prefix_size + change->data.msg.message_size +
3850  sizeof(Size) + sizeof(Size);
3852 
3853  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3854 
3855  /* might have been reallocated above */
3856  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3857 
3858  /* write the prefix including the size */
3859  memcpy(data, &prefix_size, sizeof(Size));
3860  data += sizeof(Size);
3861  memcpy(data, change->data.msg.prefix,
3862  prefix_size);
3863  data += prefix_size;
3864 
3865  /* write the message including the size */
3866  memcpy(data, &change->data.msg.message_size, sizeof(Size));
3867  data += sizeof(Size);
3868  memcpy(data, change->data.msg.message,
3869  change->data.msg.message_size);
3870  data += change->data.msg.message_size;
3871 
3872  break;
3873  }
3875  {
      /* payload: the array of ninvalidations invalidation messages */
3876  char *data;
3877  Size inval_size = sizeof(SharedInvalidationMessage) *
3878  change->data.inval.ninvalidations;
3879 
3880  sz += inval_size;
3881 
3883  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3884 
3885  /* might have been reallocated above */
3886  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3887  memcpy(data, change->data.inval.invalidations, inval_size);
3888  data += inval_size;
3889 
3890  break;
3891  }
3893  {
      /* payload: SnapshotData, then xcnt xip entries, then subxcnt subxip entries */
3894  Snapshot snap;
3895  char *data;
3896 
3897  snap = change->data.snapshot;
3898 
3899  sz += sizeof(SnapshotData) +
3900  sizeof(TransactionId) * snap->xcnt +
3901  sizeof(TransactionId) * snap->subxcnt;
3902 
3903  /* make sure we have enough space */
3905  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3906  /* might have been reallocated above */
3907  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3908 
3909  memcpy(data, snap, sizeof(SnapshotData));
3910  data += sizeof(SnapshotData);
3911 
3912  if (snap->xcnt)
3913  {
3914  memcpy(data, snap->xip,
3915  sizeof(TransactionId) * snap->xcnt);
3916  data += sizeof(TransactionId) * snap->xcnt;
3917  }
3918 
3919  if (snap->subxcnt)
3920  {
3921  memcpy(data, snap->subxip,
3922  sizeof(TransactionId) * snap->subxcnt);
3923  data += sizeof(TransactionId) * snap->subxcnt;
3924  }
3925  break;
3926  }
3928  {
      /* payload: the array of nrelids relation OIDs */
3929  Size size;
3930  char *data;
3931 
3932  /* account for the OIDs of truncated relations */
3933  size = sizeof(Oid) * change->data.truncate.nrelids;
3934  sz += size;
3935 
3936  /* make sure we have enough space */
3938 
3939  data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3940  /* might have been reallocated above */
3941  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3942 
3943  memcpy(data, change->data.truncate.relids, size);
3944  data += size;
3945 
3946  break;
3947  }
3952  /* ReorderBufferChange contains everything important */
3953  break;
3954  }
3955 
      /* total record length, fixed part included */
3956  ondisk->size = sz;
3957 
      /* clear errno so a short write that leaves it untouched can be detected */
3958  errno = 0;
3959  pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
3960  if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3961  {
      /* preserve errno across the cleanup that follows */
3962  int save_errno = errno;
3963 
3965 
3966  /* if write didn't set errno, assume problem is no disk space */
3967  errno = save_errno ? save_errno : ENOSPC;
3968  ereport(ERROR,
3970  errmsg("could not write to data file for XID %u: %m",
3971  txn->xid)));
3972  }
3974 
3975  /*
3976  * Keep the transaction's final_lsn up to date with each change we send to
3977  * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3978  * only do this on commit and abort records, but that doesn't work if a
3979  * system crash leaves a transaction without its abort record).
3980  *
3981  * Make sure not to move it backwards.
3982  */
3983  if (txn->final_lsn < change->lsn)
3984  txn->final_lsn = change->lsn;
3985 
3986  Assert(ondisk->change.action == change->action);
3987 }
3988 
3989 /*
      * Returns true, if the output plugin supports streaming, false, otherwise.
      *
      * The answer is simply the decoding context's streaming flag.
      */
3990 static inline bool
3992 {
3994 
3995  return ctx->streaming;
3996 }
3997 
3998 /*
      * Returns true, if the streaming can be started now, false, otherwise.
      *
      * Streaming may start only when the early-exit check below passes,
      * ReorderBufferCanStream() is true, and the snapshot builder does not ask
      * for changes at the reader's current record position to be skipped.
      */
3999 static inline bool
4001 {
4003  SnapBuild *builder = ctx->snapshot_builder;
4004 
4005  /* We can't start streaming unless a consistent state is reached. */
4007  return false;
4008 
4009  /*
4010  * We can't start streaming immediately even if the streaming is enabled
4011  * because we previously decoded this transaction and now just are
4012  * restarting.
4013  */
4014  if (ReorderBufferCanStream(rb) &&
4015  !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
4016  return true;
4017 
4018  return false;
4019 }
4020 
4021 /*
4022  * Send data of a large transaction (and its subtransactions) to the
4023  * output plugin, but using the stream API.
      *
      * txn must be a top-level transaction (asserted). On the first streaming
      * run the working snapshot is copied from the base snapshot with
      * FirstCommandId; on later runs it is copied from the snapshot and command
      * id saved in txn. Returns without streaming anything if, on the first
      * run, the transaction has no base snapshot. Stream statistics are updated
      * only after the changes have been processed.
4024  */
4025 static void
4027 {
4028  Snapshot snapshot_now;
4029  CommandId command_id;
4030  Size stream_bytes;
4031  bool txn_is_streamed;
4032 
4033  /* We can never reach here for a subtransaction. */
4034  Assert(rbtxn_is_toptxn(txn));
4035 
4036  /*
4037  * We can't make any assumptions about base snapshot here, similar to what
4038  * ReorderBufferCommit() does. That relies on base_snapshot getting
4039  * transferred from subxact in ReorderBufferCommitChild(), but that was
4040  * not yet called as the transaction is in-progress.
4041  *
4042  * So just walk the subxacts and use the same logic here. But we only need
4043  * to do that once, when the transaction is streamed for the first time.
4044  * After that we need to reuse the snapshot from the previous run.
4045  *
4046  * Unlike DecodeCommit which adds xids of all the subtransactions in
4047  * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4048  * we do add them to subxip array instead via ReorderBufferCopySnap. This
4049  * allows the catalog changes made in subtransactions decoded till now to
4050  * be visible.
4051  */
4052  if (txn->snapshot_now == NULL)
4053  {
4054  dlist_iter subxact_i;
4055 
4056  /* make sure this transaction is streamed for the first time */
4057  Assert(!rbtxn_is_streamed(txn));
4058 
4059  /* at the beginning we should have invalid command ID */
4061 
4062  dlist_foreach(subxact_i, &txn->subtxns)
4063  {
4064  ReorderBufferTXN *subtxn;
4065 
4066  subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4067  ReorderBufferTransferSnapToParent(txn, subtxn);
4068  }
4069 
4070  /*
4071  * If this transaction has no snapshot, it didn't make any changes to
4072  * the database till now, so there's nothing to decode.
4073  */
4074  if (txn->base_snapshot == NULL)
4075  {
4076  Assert(txn->ninvalidations == 0);
4077  return;
4078  }
4079 
4080  command_id = FirstCommandId;
4081  snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4082  txn, command_id);
4083  }
4084  else
4085  {
4086  /* the transaction must have been already streamed */
4087  Assert(rbtxn_is_streamed(txn));
4088 
4089  /*
4090  * Nah, we already have snapshot from the previous streaming run. We
4091  * assume new subxacts can't move the LSN backwards, and so can't beat
4092  * the LSN condition in the previous branch (so no need to walk
4093  * through subxacts again). In fact, we must not do that as we may be
4094  * using snapshot half-way through the subxact.
4095  */
4096  command_id = txn->command_id;
4097 
4098  /*
4099  * We can't use txn->snapshot_now directly because after the last
4100  * streaming run, we might have got some new sub-transactions. So we
4101  * need to add them to the snapshot.
4102  */
4103  snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4104  txn, command_id);
4105 
4106  /* Free the previously copied snapshot. */
4107  Assert(txn->snapshot_now->copied);
4109  txn->snapshot_now = NULL;
4110  }
4111 
4112  /*
4113  * Remember this information to be used later to update stats. We can't
4114  * update the stats here as an error while processing the changes would
4115  * lead to the accumulation of stats even though we haven't streamed all
4116  * the changes.
4117  */
4118  txn_is_streamed = rbtxn_is_streamed(txn);
4119  stream_bytes = txn->total_size;
4120 
4121  /* Process and send the changes to output plugin. */
4122  ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4123  command_id, true);
4124 
      /* one more streaming run, accounted with the size captured beforehand */
4125  rb->streamCount += 1;
4126  rb->streamBytes += stream_bytes;
4127 
4128  /* Don't consider already streamed transaction. */
4129  rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4130 
4131  /* update the decoding stats */
4133 
4134  Assert(dlist_is_empty(&txn->changes));
4135  Assert(txn->nentries == 0);
4136  Assert(txn->nentries_mem == 0);
4137 }
4138 
4139 /*
4140  * Size of a change in memory.
      *
      * Starts from sizeof(ReorderBufferChange) and adds the action-specific
      * payload: tuple headers and bodies, message prefix and body plus two Size
      * fields, the invalidation array, the snapshot with its xip and subxip
      * arrays, or the truncated relation OIDs. The amounts added are the same
      * ones the serialization code adds for its on-disk payload.
4141  */
4142 static Size
4144 {
4145  Size sz = sizeof(ReorderBufferChange);
4146 
4147  switch (change->action)
4148  {
4149  /* fall through these, they're all similar enough */
4154  {
4155  HeapTuple oldtup,
4156  newtup;
4157  Size oldlen = 0;
4158  Size newlen = 0;
4159 
4160  oldtup = change->data.tp.oldtuple;
4161  newtup = change->data.tp.newtuple;
4162 
4163  if (oldtup)
4164  {
4165  sz += sizeof(HeapTupleData);
4166  oldlen = oldtup->t_len;
4167  sz += oldlen;
4168  }
4169 
4170  if (newtup)
4171  {
4172  sz += sizeof(HeapTupleData);
4173  newlen = newtup->t_len;
4174  sz += newlen;
4175  }
4176 
4177  break;
4178  }
4180  {
      /* prefix is counted with its terminating NUL */
4181  Size prefix_size = strlen(change->data.msg.prefix) + 1;
4182 
4183  sz += prefix_size + change->data.msg.message_size +
4184  sizeof(Size) + sizeof(Size);
4185 
4186  break;
4187  }
4189  {
4190  sz += sizeof(SharedInvalidationMessage) *
4191  change->data.inval.ninvalidations;
4192  break;
4193  }
4195  {
4196  Snapshot snap;
4197 
4198  snap = change->data.snapshot;
4199 
4200  sz += sizeof(SnapshotData) +
4201  sizeof(TransactionId) * snap->xcnt +
4202  sizeof(TransactionId) * snap->subxcnt;
4203 
4204  break;
4205  }
4207  {
4208  sz += sizeof(Oid) * change->data.truncate.nrelids;
4209 
4210  break;
4211  }
4216  /* ReorderBufferChange contains everything important */
4217  break;
4218  }
4219 
4220  return sz;
4221 }
4222 
4223 
4224 /*
4225  * Restore a number of changes spilled to disk back into memory.
      *
      * All changes currently on txn->changes are removed first. Records are
      * then read from the spill files, segment by segment, until
      * max_changes_in_memory changes have been restored or the segment holding
      * txn->final_lsn has been passed. The open file (file->vfd, -1 when none
      * is open), its read offset (file->curOffset) and *segno carry the read
      * position over to the next call. A *segno of zero means "start at the
      * segment of txn->first_lsn".
      *
      * Returns the number of changes restored by this call.
4226  */
4227 static Size
4229  TXNEntryFile *file, XLogSegNo *segno)
4230 {
4231  Size restored = 0;
4232  XLogSegNo last_segno;
4233  dlist_mutable_iter cleanup_iter;
4234  File *fd = &file->vfd;
4235 
4238 
4239  /* free current entries, so we have memory for more */
4240  dlist_foreach_modify(cleanup_iter, &txn->changes)
4241  {
4243  dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4244 
4245  dlist_delete(&cleanup->node);
4247  }
4248  txn->nentries_mem = 0;
4249  Assert(dlist_is_empty(&txn->changes));
4250 
      /* nothing was spilled beyond the segment containing final_lsn */
4251  XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4252 
4253  while (restored < max_changes_in_memory && *segno <= last_segno)
4254  {
4255  int readBytes;
4256  ReorderBufferDiskChange *ondisk;
4257 
4259 
4260  if (*fd == -1)
4261  {
4262  char path[MAXPGPATH];
4263 
4264  /* first time in */
4265  if (*segno == 0)
4266  XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4267 
4268  Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4269 
4270  /*
4271  * No need to care about TLIs here, only used during a single run,
4272  * so each LSN only maps to a specific WAL record.
4273  */
4275  *segno);
4276 
4277  *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4278 
4279  /* No harm in resetting the offset even in case of failure */
4280  file->curOffset = 0;
4281 
      /* a missing file just means nothing was spilled for this segment */
4282  if (*fd < 0 && errno == ENOENT)
4283  {
4284  *fd = -1;
4285  (*segno)++;
4286  continue;
4287  }
4288  else if (*fd < 0)
4289  ereport(ERROR,
4291  errmsg("could not open file \"%s\": %m",
4292  path)));
4293  }
4294 
4295  /*
4296  * Read the statically sized part of a change which has information
4297  * about the total size. If we couldn't read a record, we're at the
4298  * end of this file.
4299  */
4301  readBytes = FileRead(file->vfd, rb->outbuf,
4302  sizeof(ReorderBufferDiskChange),
4303  file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4304 
4305  /* eof */
4306  if (readBytes == 0)
4307  {
4308  FileClose(*fd);
4309  *fd = -1;
4310  (*segno)++;
4311  continue;
4312  }
4313  else if (readBytes < 0)
4314  ereport(ERROR,
4316  errmsg("could not read from reorderbuffer spill file: %m")));
4317  else if (readBytes != sizeof(ReorderBufferDiskChange))
4318  ereport(ERROR,
4320  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4321  readBytes,
4322  (uint32) sizeof(ReorderBufferDiskChange))));
4323 
4324  file->curOffset += readBytes;
4325 
4326  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4327 
      /*
      * NOTE(review): ondisk->size is read below as already including the
      * fixed part, so the amount requested here looks larger than strictly
      * needed - harmless, but confirm.
      */
4329  sizeof(ReorderBufferDiskChange) + ondisk->size);
      /* the buffer may have moved, so fetch the pointer again */
4330  ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4331 
      /* read the variable-length remainder right behind the fixed part */
4332  readBytes = FileRead(file->vfd,
4333  rb->outbuf + sizeof(ReorderBufferDiskChange),
4334  ondisk->size - sizeof(ReorderBufferDiskChange),
4335  file->curOffset,
4336  WAIT_EVENT_REORDER_BUFFER_READ);
4337 
4338  if (readBytes < 0)
4339  ereport(ERROR,
4341  errmsg("could not read from reorderbuffer spill file: %m")));
4342  else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4343  ereport(ERROR,
4345  errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4346  readBytes,
4347  (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4348 
4349  file->curOffset += readBytes;
4350 
4351  /*
4352  * ok, read a full change from disk, now restore it into proper
4353  * in-memory format
4354  */
4355  ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4356  restored++;
4357  }
4358 
4359  return restored;
4360 }
4361 
4362 /*
4363  * Convert change from its on-disk format to in-memory format and queue it onto
4364  * the TXN's ->changes list.
4365  *
4366  * Note: although "data" is declared char*, at entry it points to a
4367  * maxalign'd buffer, making it safe in most of this function to assume
4368  * that the pointed-to data is suitably aligned for direct access.
      *
      * The fixed part is copied byte for byte, so pointer members of the
      * restored change still hold the values they had when the change was
      * written out. Each case below replaces them with newly allocated memory
      * filled from the payload following the fixed part. The restored change is
      * appended to txn->changes, nentries_mem is incremented and the memory
      * accounting is updated.
4369  */
4370 static void
4372  char *data)
4373 {
4374  ReorderBufferDiskChange *ondisk;
4375  ReorderBufferChange *change;
4376 
4377  ondisk = (ReorderBufferDiskChange *) data;
4378 
4379  change = ReorderBufferGetChange(rb);
4380 
4381  /* copy static part */
4382  memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4383 
      /* from here on "data" walks through the variable-length payload */
4384  data += sizeof(ReorderBufferDiskChange);
4385 
4386  /* restore individual stuff */
4387  switch (change->action)
4388  {
4389  /* fall through these, they're all similar enough */
      /* the stale pointer is only used as a "tuple was present" flag */
4394  if (change->data.tp.oldtuple)
4395  {
4396  uint32 tuplelen = ((HeapTuple) data)->t_len;
4397 
4398  change->data.tp.oldtuple =
4400 
4401  /* restore ->tuple */
4402  memcpy(change->data.tp.oldtuple, data,
4403  sizeof(HeapTupleData));
4404  data += sizeof(HeapTupleData);
4405 
4406  /* reset t_data pointer into the new tuplebuf */
4407  change->data.tp.oldtuple->t_data =
4408  (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4409 
4410  /* restore tuple data itself */
4411  memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
4412  data += tuplelen;
4413  }
4414 
4415  if (change->data.tp.newtuple)
4416  {
4417  /* here, data might not be suitably aligned! */
4418  uint32 tuplelen;
4419 
      /* hence t_len is fetched with memcpy rather than through a cast */
4420  memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4421  sizeof(uint32));
4422 
4423  change->data.tp.newtuple =
4425 
4426  /* restore ->tuple */
4427  memcpy(change->data.tp.newtuple, data,
4428  sizeof(HeapTupleData));
4429  data += sizeof(HeapTupleData);
4430 
4431  /* reset t_data pointer into the new tuplebuf */
4432  change->data.tp.newtuple->t_data =
4433  (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4434 
4435  /* restore tuple data itself */
4436  memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
4437  data += tuplelen;
4438  }
4439 
4440  break;
4442  {
4443  Size prefix_size;
4444 
4445  /* read prefix */
4446  memcpy(&prefix_size, data, sizeof(Size));
4447  data += sizeof(Size);
4448  change->data.msg.prefix = MemoryContextAlloc(rb->context,
4449  prefix_size);
4450  memcpy(change->data.msg.prefix, data, prefix_size);
4451  Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4452  data += prefix_size;
4453 
4454  /* read the message */
4455  memcpy(&change->data.msg.message_size, data, sizeof(Size));
4456  data += sizeof(Size);
4457  change->data.msg.message = MemoryContextAlloc(rb->context,
4458  change->data.msg.message_size);
4459  memcpy(change->data.msg.message, data,
4460  change->data.msg.message_size);
4461  data += change->data.msg.message_size;
4462 
4463  break;
4464  }
4466  {
      /* ninvalidations itself came back with the fixed part */
4467  Size inval_size = sizeof(SharedInvalidationMessage) *
4468  change->data.inval.ninvalidations;
4469 
4470  change->data.inval.invalidations =
4471  MemoryContextAlloc(rb->context, inval_size);
4472 
4473  /* read the message */
4474  memcpy(change->data.inval.invalidations, data, inval_size);
4475 
4476  break;
4477  }
4479  {
4480  Snapshot oldsnap;
4481  Snapshot newsnap;
4482  Size size;
4483 
4484  oldsnap = (Snapshot) data;
4485 
4486  size = sizeof(SnapshotData) +
4487  sizeof(TransactionId) * oldsnap->xcnt +
4488  sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4489 
4491 
4492  newsnap = change->data.snapshot;
4493 
      /* one copy brings over the struct and both xid arrays behind it */
4494  memcpy(newsnap, data, size);
      /* point xip just past the struct and subxip just past the xip entries */
4495  newsnap->xip = (TransactionId *)
4496  (((char *) newsnap) + sizeof(SnapshotData));
4497  newsnap->subxip = newsnap->xip + newsnap->xcnt;
4498  newsnap->copied = true;
4499  break;
4500  }
4501  /* the base struct contains all the data, easy peasy */
4503  {
4504  Oid *relids;
4505 
4506  relids = ReorderBufferGetRelids(rb,
4507  change->data.truncate.nrelids);
4508  memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4509  change->data.truncate.relids = relids;
4510 
4511  break;
4512  }
4517  break;
4518  }
4519 
4520  dlist_push_tail(&txn->changes, &change->node);
4521  txn->nentries_mem++;
4522 
4523  /*
4524  * Update memory accounting for the restored change. We need to do this
4525  * although we don't check the memory limit when restoring the changes in
4526  * this branch (we only do that when initially queueing the changes after
4527  * decoding), because we will release the changes later, and that will
4528  * update the accounting too (subtracting the size from the counters). And
4529  * we don't want to underflow there.
4530  */
4531  ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4532  ReorderBufferChangeSize(change));
4533 }
4534 
4535 /*
4536  * Remove all on-disk data stored for the passed-in transaction.
 *
 * The WAL segment numbers containing txn->first_lsn and txn->final_lsn are
 * computed, and one unlink() is issued per segment number in that inclusive
 * range. A file that does not exist (ENOENT) is skipped silently; any other
 * unlink() failure is reported at ERROR.
4537  */
4538 static void
/* NOTE(review): the line carrying the function name and parameter list is absent from this listing. */
4540 {
4541  XLogSegNo first;
4542  XLogSegNo cur;
4543  XLogSegNo last;
4544 
4547 
 /* map both ends of the transaction's LSN range to WAL segment numbers */
4548  XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4549  XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4550 
4551  /* iterate over all possible filenames, and delete them */
4552  for (cur = first; cur <= last; cur++)
4553  {
4554  char path[MAXPGPATH];
4555 
 /* NOTE(review): the statement that fills in `path` for segment `cur` is absent from this listing. */
 /* a missing file is fine: not every segment in the range need have a spill file */
4557  if (unlink(path) != 0 && errno != ENOENT)
4558  ereport(ERROR,
4560  errmsg("could not remove file \"%s\": %m", path)));
4561  }
4562 }
4563 
4564 /*
4565  * Remove any leftover serialized reorder buffers from a slot directory after a
4566  * prior crash or decoding session exit.
 *
 * Every entry of pg_replslot/<slotname> whose name starts with "xid" is
 * unlinked; a failing unlink() is reported at ERROR. If the path exists but
 * is not a directory, the function returns without doing anything.
4567  */
4568 static void
/* NOTE(review): the line carrying the function name and parameter list is absent from this listing; the body uses a string `slotname`. */
4570 {
4571  DIR *spill_dir;
4572  struct dirent *spill_de;
4573  struct stat statbuf;
4574  char path[MAXPGPATH * 2 + 12];
4575 
 /* NOTE(review): unbounded sprintf(); the later formatting into the same buffer uses snprintf(path, sizeof(path), ...) */
4576  sprintf(path, "pg_replslot/%s", slotname);
4577 
4578  /* we're only handling directories here, skip if it's not ours */
 /* (a failing lstat() does not return here; it falls through to AllocateDir()) */
4579  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4580  return;
4581 
4582  spill_dir = AllocateDir(path);
 /*
  * NOTE(review): `path` is handed to ReadDirExtended() as the directory
  * name on every iteration, but the loop body overwrites it with a file
  * path. Confirm that is acceptable for whatever ReadDirExtended() does
  * with that argument.
  */
4583  while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4584  {
4585  /* only look at names that can be ours */
4586  if (strncmp(spill_de->d_name, "xid", 3) == 0)
4587  {
4588  snprintf(path, sizeof(path),
4589  "pg_replslot/%s/%s", slotname,
4590  spill_de->d_name);
4591 
4592  if (unlink(path) != 0)
4593  ereport(ERROR,
4595  errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4596  path, slotname)));
4597  }
4598  }
4599  FreeDir(spill_dir);
4600 }
4601 
4602 /*
4603  * Given a replication slot, transaction ID and segment number, fill in the
4604  * corresponding spill file into 'path', which is a caller-owned buffer of size
4605  * at least MAXPGPATH.
 *
 * The file name has the form
 * pg_replslot/<slot>/xid-<xid>-lsn-<hi>-<lo>.spill, where the LSN is the
 * start of WAL segment 'segno' (offset 0). Output is limited to MAXPGPATH
 * bytes by snprintf().
4606  */
4607 static void
/* NOTE(review): the first line of the parameter list (function name and leading parameters) is absent from this listing. */
4609  XLogSegNo segno)
4610 {
4611  XLogRecPtr recptr;
4612 
 /* convert the segment number into the LSN of the segment's first byte */
4613  XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4614 
4615  snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
 /* NOTE(review): the argument supplying the slot name for the first %s is absent from this listing. */
4617  xid, LSN_FORMAT_ARGS(recptr));
4618 }
4619 
4620 /*
4621  * Delete all data spilled to disk after we've restarted/crashed. It will be
4622  * recreated when the respective slots are reused.
 *
 * Walks the entries of pg_replslot, skipping "." and ".." as well as any
 * name that ReplicationSlotValidateName() rejects.
4623  */
4624 void
/* NOTE(review): the line carrying the function name and parameter list is absent from this listing. */
4626 {
4627  DIR *logical_dir;
4628  struct dirent *logical_de;
4629 
4630  logical_dir = AllocateDir("pg_replslot");
4631  while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4632  {
4633  if (strcmp(logical_de->d_name, ".") == 0 ||
4634  strcmp(logical_de->d_name, "..") == 0)
4635  continue;
4636 
4637  /* if it cannot be a slot, skip the directory */
4638  if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4639  continue;
4640 
4641  /*
4642  * ok, has to be a surviving logical slot, iterate and delete
4643  * everything starting with xid-*
4644  */
 /* NOTE(review): the statement performing that per-slot cleanup is absent from this listing. */
4646  }
4647  FreeDir(logical_dir);
4648 }
4649 
4650 /* ---------------------------------------
4651  * toast reassembly support
4652  * ---------------------------------------
4653  */
4654 
4655 /*
4656  * Initialize per tuple toast reconstruction support.
 *
 * Creates txn->toast_hash, a hash table keyed by Oid whose entries are
 * ReorderBufferToastEnt structs, allocated in rb->context. The table must
 * not exist yet (asserted).
4657  */
4658 static void
/* NOTE(review): the line carrying the function name and parameter list is absent from this listing; the body uses `rb` and `txn`. */
4660 {
4661  HASHCTL hash_ctl;
4662 
4663  Assert(txn->toast_hash == NULL);
4664 
4665  hash_ctl.keysize = sizeof(Oid);
4666  hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4667  hash_ctl.hcxt = rb->context;
 /* 5 is the initial element-count hint passed to hash_create() */
4668  txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
 /* NOTE(review): the final argument (the hash flags) and the closing of this call are absent from this listing. */
4670 }
4671 
4672 /*
4673  * Per toast-chunk handling for toast reconstruction
4674  *
4675  * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4676  * toasted Datum comes along.
 *
 * 'relation' must be a toast relation (asserted) and 'change' must carry the
 * chunk in change->data.tp.newtuple. Attributes 1, 2 and 3 of that tuple are
 * read as the chunk id (Oid), the chunk sequence number (int32) and the
 * chunk data (varlena). Chunks of one chunk id must arrive in sequence
 * starting at 0; anything else raises ERROR. On success 'change' is linked
 * onto the entry's chunk list.
4677  */
4678 static void
/* NOTE(review): the first line of the parameter list (function name and leading parameters) is absent from this listing; the body uses `rb` and `txn`. */
4680  Relation relation, ReorderBufferChange *change)
4681 {
4682  ReorderBufferToastEnt *ent;
4683  HeapTuple newtup;
4684  bool found;
4685  int32 chunksize;
4686  bool isnull;
4687  Pointer chunk;
4688  TupleDesc desc = RelationGetDescr(relation);
4689  Oid chunk_id;
4690  int32 chunk_seq;
4691 
 /* the hash is created lazily, on the first toast chunk of the transaction */
4692  if (txn->toast_hash == NULL)
4693  ReorderBufferToastInitHash(rb, txn);
4694 
4695  Assert(IsToastRelation(relation));
4696 
4697  newtup = change->data.tp.newtuple;
4698  chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
4699  Assert(!isnull);
4700  chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
4701  Assert(!isnull);
4702 
4703  ent = (ReorderBufferToastEnt *)
4704  hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
4705 
4706  if (!found)
4707  {
 /* fresh entry: initialize the bookkeeping and insist on sequence 0 */
4708  Assert(ent->chunk_id == chunk_id);
4709  ent->num_chunks = 0;
4710  ent->last_chunk_seq = 0;
4711  ent->size = 0;
4712  ent->reconstructed = NULL;
4713  dlist_init(&ent->chunks);
4714 
4715  if (chunk_seq != 0)
4716  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4717  chunk_seq, chunk_id);
4718  }
 /* NOTE(review): `found` is necessarily true in this else-branch, so the test on it is redundant. */
4719  else if (found && chunk_seq != ent->last_chunk_seq + 1)
4720  elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4721  chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4722 
4723  chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
4724  Assert(!isnull);
4725 
4726  /* calculate size so we can allocate the right size at once later */
4727  if (!VARATT_IS_EXTENDED(chunk))
4728  chunksize = VARSIZE(chunk) - VARHDRSZ;
4729  else if (VARATT_IS_SHORT(chunk))
4730  /* could happen due to heap_form_tuple doing its thing */
4731  chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4732  else
4733  elog(ERROR, "unexpected type of toast chunk");
4734 
 /* accumulate the payload size and remember where we are in the sequence */
4735  ent->size += chunksize;
4736  ent->last_chunk_seq = chunk_seq;
4737  ent->num_chunks++;
4738  dlist_push_tail(&ent->chunks, &change->node);
4739 }
4740 
4741 /*
4742  * Rejigger change->newtuple to point to in-memory toast tuples instead of
4743  * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
4744  *
4745  * We cannot replace unchanged toast tuples though, so those will still point
4746  * to on-disk toast data.
4747  *
4748  * While updating the existing change with detoasted tuple data, we need to
4749  * update the memory accounting info, because the change size will differ.
4750  * Otherwise the accounting may get out of sync, triggering serialization
4751  * at unexpected times.
4752  *
4753  * We simply subtract size of the change before rejiggering the tuple, and
4754  * then add the new size. This makes it look like the change was removed
4755  * and then added back, except it only tweaks the accounting info.
4756  *
4757  * In particular it can't trigger serialization, which would be pointless
4758  * anyway as it happens during commit processing right before handing
4759  * the change to the output plugin.
 *
 * Does nothing when the transaction has no toast hash. Raises ERROR if the
 * toast relation of 'relation' cannot be opened. All work allocations are
 * made with rb->context as the current memory context.
4760  */
4761 static void
/* NOTE(review): the first line of the parameter list (function name and leading parameters) is absent from this listing; the body uses `rb` and `txn`. */
4763  Relation relation, ReorderBufferChange *change)
4764 {
4765  TupleDesc desc;
4766  int natt;
4767  Datum *attrs;
4768  bool *isnull;
 /* per-attribute flag: attrs[natt] was replaced by a pointer allocated here and must be pfree'd below */
 /* NOTE(review): this local shares its name with the C library's free(). */
4769  bool *free;
4770  HeapTuple tmphtup;
4771  Relation toast_rel;
4772  TupleDesc toast_desc;
4773  MemoryContext oldcontext;
4774  HeapTuple newtup;
4775  Size old_size;
4776 
4777  /* no toast tuples changed */
4778  if (txn->toast_hash == NULL)
4779  return;
4780 
4781  /*
4782  * We're going to modify the size of the change. So, to make sure the
4783  * accounting is correct we record the current change size and then after
4784  * re-computing the change we'll subtract the recorded size and then
4785  * re-add the new change size at the end. We don't immediately subtract
4786  * the old size because if there is any error before we add the new size,
4787  * we will release the changes and that will update the accounting info
4788  * (subtracting the size from the counters). And we don't want to
4789  * underflow there.
4790  */
4791  old_size = ReorderBufferChangeSize(change);
4792 
4793  oldcontext = MemoryContextSwitchTo(rb->context);
4794 
4795  /* we should only have toast tuples in an INSERT or UPDATE */
4796  Assert(change->data.tp.newtuple);
4797 
4798  desc = RelationGetDescr(relation);
4799 
4800  toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4801  if (!RelationIsValid(toast_rel))
4802  elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
4803  relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
4804 
4805  toast_desc = RelationGetDescr(toast_rel);
4806 
4807  /* should we allocate from stack instead? */
4808  attrs = palloc0(sizeof(Datum) * desc->natts);
4809  isnull = palloc0(sizeof(bool) * desc->natts);
4810  free = palloc0(sizeof(bool) * desc->natts);
4811 
4812  newtup = change->data.tp.newtuple;
4813 
4814  heap_deform_tuple(newtup, desc, attrs, isnull);
4815 
 /* examine each attribute; only non-null varlena columns with changed toast data are rewritten */
4816  for (natt = 0; natt < desc->natts; natt++)
4817  {
4818  Form_pg_attribute attr = TupleDescAttr(desc, natt);
4819  ReorderBufferToastEnt *ent;
4820  struct varlena *varlena;
4821 
4822  /* va_rawsize is the size of the original datum -- including header */
4823  struct varatt_external toast_pointer;
4824  struct varatt_indirect redirect_pointer;
4825  struct varlena *new_datum = NULL;
4826  struct varlena *reconstructed;
4827  dlist_iter it;
4828  Size data_done = 0;
4829 
4830  /* system columns aren't toasted */
4831  if (attr->attnum < 0)
4832  continue;
4833 
4834  if (attr->attisdropped)
4835  continue;
4836 
4837  /* not a varlena datatype */
4838  if (attr->attlen != -1)
4839  continue;
4840 
4841  /* no data */
4842  if (isnull[natt])
4843  continue;
4844 
4845  /* ok, we know we have a toast datum */
4846  varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4847 
4848  /* no need to do anything if the tuple isn't external */
 /* NOTE(review): the `if` condition guarding the following `continue` is absent from this listing. */
4850  continue;
4851 
4852  VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4853 
4854  /*
4855  * Check whether the toast tuple changed, replace if so.
4856  */
4857  ent = (ReorderBufferToastEnt *)
4858  hash_search(txn->toast_hash,
4859  &toast_pointer.va_valueid,
4860  HASH_FIND,
4861  NULL);
4862  if (ent == NULL)
4863  continue;
4864 
4865  new_datum =
 /* NOTE(review): the right-hand side of this assignment is absent from this listing. */
4867 
4868  free[natt] = true;
4869 
4870  reconstructed = palloc0(toast_pointer.va_rawsize);
4871 
 /* remembered on the hash entry; it is released by the toast-reset code, not in this function */
4872  ent->reconstructed = reconstructed;
4873 
4874  /* stitch toast tuple back together from its parts */
4875  dlist_foreach(it, &ent->chunks)
4876  {
4877  bool cisnull;
4878  ReorderBufferChange *cchange;
4879  HeapTuple ctup;
4880  Pointer chunk;
4881 
4882  cchange = dlist_container(ReorderBufferChange, node, it.cur);
4883  ctup = cchange->data.tp.newtuple;
4884  chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
4885 
4886  Assert(!cisnull);
 /* NOTE(review): two source lines are absent from this listing at this point. */
4889 
 /* append this chunk's payload (without its varlena header) after what was copied so far */
4890  memcpy(VARDATA(reconstructed) + data_done,
4891  VARDATA(chunk),
4892  VARSIZE(chunk) - VARHDRSZ);
4893  data_done += VARSIZE(chunk) - VARHDRSZ;
4894  }
4895  Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
4896 
4897  /* make sure its marked as compressed or not */
4898  if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4899  SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4900  else
4901  SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4902 
 /* build an indirect pointer referring to the reconstructed datum */
4903  memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4904  redirect_pointer.pointer = reconstructed;
4905 
 /* NOTE(review): one source line is absent from this listing at this point. */
4907  memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4908  sizeof(redirect_pointer));
4909 
4910  attrs[natt] = PointerGetDatum(new_datum);
4911  }
4912 
4913  /*
4914  * Build tuple in separate memory & copy tuple back into the tuplebuf
4915  * passed to the output plugin. We can't directly heap_fill_tuple() into
4916  * the tuplebuf because attrs[] will point back into the current content.
4917  */
4918  tmphtup = heap_form_tuple(desc, attrs, isnull);
4919  Assert(newtup->t_len <= MaxHeapTupleSize);
4920  Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
4921 
4922  memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
4923  newtup->t_len = tmphtup->t_len;
4924 
4925  /*
4926  * free resources we won't further need, more persistent stuff will be
4927  * free'd in ReorderBufferToastReset().
4928  */
4929  RelationClose(toast_rel);
4930  pfree(tmphtup);
4931  for (natt = 0; natt < desc->natts; natt++)
4932  {
4933  if (free[natt])
4934  pfree(DatumGetPointer(attrs[natt]));
4935  }
4936  pfree(attrs);
4937  pfree(free);
4938  pfree(isnull);
4939 
4940  MemoryContextSwitchTo(oldcontext);
4941 
4942  /* subtract the old change size */
4943  ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
4944  /* now add the change back, with the correct size */
4945  ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4946  ReorderBufferChangeSize(change));
4947 }
4948 
4949 /*
4950  * Free all resources allocated for toast reconstruction.
 *
 * For every entry in txn->toast_hash: frees the reconstructed datum, if any,
 * and unlinks and returns every queued chunk change. The hash table itself
 * is then destroyed and txn->toast_hash reset to NULL. Does nothing if the
 * hash was never created.
4951  */
4952 static void
/* NOTE(review): the line carrying the function name and parameter list is absent from this listing; the body uses `rb` and `txn`. */
4954 {
4955  HASH_SEQ_STATUS hstat;
4956  ReorderBufferToastEnt *ent;
4957 
4958  if (txn->toast_hash == NULL)
4959  return;
4960 
4961  /* sequentially walk over the hash and free everything */
4962  hash_seq_init(&hstat, txn->toast_hash);
4963  while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4964  {
4965  dlist_mutable_iter it;
4966 
4967  if (ent->reconstructed != NULL)
4968  pfree(ent->reconstructed);
4969 
 /* the modify-safe iterator is needed because each node is deleted while walking the list */
4970  dlist_foreach_modify(it, &ent->chunks)
4971  {
4972  ReorderBufferChange *change =
 /* NOTE(review): the initializer expression for `change` is absent from this listing. */
4974 
4975  dlist_delete(&change->node);
4976  ReorderBufferReturnChange(rb, change, true);
4977  }
4978  }
4979 
4980  hash_destroy(txn->toast_hash);
4981  txn->toast_hash = NULL;
4982 }
4983 
4984 
4985 /* ---------------------------------------
4986  * Visibility support for logical decoding
4987  *
4988  *
4989  * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4990  * always rely on stored cmin/cmax values because of two scenarios:
4991  *
4992  * * A tuple got changed multiple times during a single transaction and thus
4993  * has got a combo CID. Combo CIDs are only valid for the duration of a
4994  * single transaction.
4995  * * A tuple with a cmin but no cmax (and thus no combo CID) got
4996  * deleted/updated in another transaction than the one which created it
4997  * which we are looking at right now. As only one of cmin, cmax or combo CID
4998  * is actually stored in the heap we don't have access to the value we
4999  * need anymore.
5000  *
5001  * To resolve those problems we have a per-transaction hash of (cmin,
5002  * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5003  * (cmin, cmax) values. That also takes care of combo CIDs by simply
5004  * not caring about them at all. As we have the real cmin/cmax values
5005  * combo CIDs aren't interesting.
5006  *
5007  * As we only care about catalog tuples here the overhead of this
5008  * hashtable should be acceptable.
5009  *
5010  * Heap rewrites complicate this a bit, check rewriteheap.c for
5011  * details.
5012  * -------------------------------------------------------------------------
5013  */
5014 
5015 /* struct for sorting mapping files by LSN efficiently */
5016 typedef struct RewriteMappingFile
5017 {
5021 
5022 #ifdef NOT_USED
/*
 * Debugging aid, compiled only when NOT_USED is defined: logs every entry
 * of tuplecid_data at DEBUG3.
 */
5023 static void
5024 DisplayMapping(HTAB *tuplecid_data)
5025 {
5026  HASH_SEQ_STATUS hstat;
 /* NOTE(review): the declaration of `ent` is absent from this listing. */
5028 
5029  hash_seq_init(&hstat, tuplecid_data);
5030  while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
5031  {
5032  elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
5033  ent->key.rlocator.dbOid,
5034  ent->key.rlocator.spcOid,
5035  ent->key.rlocator.relNumber,
 /* NOTE(review): the two arguments for the "tid: %u/%u" part are absent from this listing. */
5038  ent->cmin,
5039  ent->cmax
5040  );
5041  }
5042 }
5043 #endif
5044 
5045 /*
5046  * Apply a single mapping file to tuplecid_data.
5047  *
5048  * The mapping file has to have been verified to be a) committed b) for our
5049  * transaction c) applied in LSN order.
5050  */
5051 static void
5052 ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
5053 {
5054  char path[MAXPGPATH];
5055  int fd;
5056  int readBytes;
5058 
5059  sprintf(path, "pg_logical/mappings/%s", fname);
5060  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
5061  if (fd < 0)
5062  ereport(ERROR,
5064  errmsg("could not open file \"%s\": %m", path)));
5065 
5066  while (true)
5067  {
5070  ReorderBufferTupleCidEnt *new_ent;
5071  bool found;
5072 
5073  /* be careful about padding */
5074  memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5075 
5076  /* read all mappings till the end of the file */
5077  pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
5078  readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5080 
5081  if (readBytes < 0)
5082  ereport(ERROR,
5084  errmsg("could not read file \"%s\": %m",
5085  path)));
5086  else if (readBytes == 0) /* EOF */
5087  break;
5088  else if (readBytes != sizeof(LogicalRewriteMappingData))
5089  ereport(ERROR,
5091  errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5092  path, readBytes,
5093  (int32) sizeof(LogicalRewriteMappingData))));
5094 
5095  key.rlocator = map.old_locator;
5096  ItemPointerCopy(&map.old_tid,
5097  &key.tid);
5098 
5099 
5100  ent = (ReorderBufferTupleCidEnt *)
5102 
5103  /* no existing mapping, no need to update */
5104  if (!ent)
5105  continue;
5106 
5107  key.rlocator = map.new_locator;
5108  ItemPointerCopy(&map.new_tid,
5109  &key.tid);
5110 
5111  new_ent = (ReorderBufferTupleCidEnt *)
5113 
5114  if (found)
5115  {
5116  /*
5117  * Make sure the existing mapping makes sense. We sometime update
5118  * old records that did not yet have a cmax (e.g. pg_class' own
5119  * entry while rewriting it) during rewrites, so allow that.
5120  */
5121  Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5122  Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5123  }
5124  else
5125  {
5126  /* update mapping */
5127  new_ent->cmin = ent->cmin;
5128  new_ent->cmax = ent->cmax;
5129  new_ent->combocid = ent->combocid;
5130  }
5131  }
5132 
5133  if (CloseTransientFile(fd) != 0)
5134  ereport(ERROR,
5136  errmsg("could not close file \"%s\": %m", path)));
5137 }
5138 
5139 
5140 /*
5141  * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
 *
 * 'num' is the number of elements in 'xip'. The lookup is a bsearch() using
 * xidComparator, so 'xip' must be sorted consistently with that comparator.
 * Returns true if a matching element exists.
5142  */
5143 static bool
/* NOTE(review): the line carrying the function name and parameter list is absent from this listing. */
5145 {
5146  return bsearch(&xid, xip, num,
5147  sizeof(TransactionId), xidComparator) != NULL;
5148 }
5149 
5150 /*
5151  * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
 *
 * Returns the result of pg_cmp_u64() on the two files' lsn fields.
5152  */
5153 static int
5154 file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5155 {
 /* NOTE(review): the declarations of `a` and `b` (derived from the two list cells) are absent from this listing. */
5158 
5159  return pg_cmp_u64(a->lsn, b->lsn);
5160 }
5161 
5162 /*
5163  * Apply any existing logical remapping files if there are any targeted at our
5164  * transaction for relid.
 *
 * Scans pg_logical/mappings for files named "map-*", parses the database
 * OID, relation OID, LSN, mapped xid and creating xid out of each file name,
 * and keeps those that match our database and 'relid', whose creating
 * transaction committed, and whose mapped xid is in snapshot->subxip. The
 * selected files are then sorted by LSN and processed in that order. A file
 * name that does not parse raises ERROR.
5165  */
5166 static void
/* NOTE(review): the line carrying the function name and parameter list is absent from this listing; the body uses `relid` and `snapshot`. */
5168 {
5169  DIR *mapping_dir;
5170  struct dirent *mapping_de;
5171  List *files = NIL;
5172  ListCell *file;
 /* shared relations are matched against InvalidOid instead of our database's OID */
5173  Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5174 
5175  mapping_dir = AllocateDir("pg_logical/mappings");
5176  while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
5177  {
5178  Oid f_dboid;
5179  Oid f_relid;
5180  TransactionId f_mapped_xid;
5181  TransactionId f_create_xid;
5182  XLogRecPtr f_lsn;
5183  uint32 f_hi,
5184  f_lo;
5185  RewriteMappingFile *f;
5186 
5187  if (strcmp(mapping_de->d_name, ".") == 0 ||
5188  strcmp(mapping_de->d_name, "..") == 0)
5189  continue;
5190 
5191  /* Ignore files that aren't ours */
5192  if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5193  continue;
5194 
5195  if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5196  &f_dboid, &f_relid, &f_hi, &f_lo,
5197  &f_mapped_xid, &f_create_xid) != 6)
5198  elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5199 
 /* reassemble the 64-bit LSN from its two 32-bit halves */
5200  f_lsn = ((uint64) f_hi) << 32 | f_lo;
5201 
5202  /* mapping for another database */
5203  if (f_dboid != dboid)
5204  continue;
5205 
5206  /* mapping for another relation */
5207  if (f_relid != relid)
5208  continue;
5209 
5210  /* did the creating transaction abort? */
5211  if (!TransactionIdDidCommit(f_create_xid))
5212  continue;
5213 
5214  /* not for our transaction */
5215  if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5216  continue;
5217 
5218  /* ok, relevant, queue for apply */
5219  f = palloc(sizeof(RewriteMappingFile));
5220  f->lsn = f_lsn;
 /* NOTE(review): unbounded strcpy(); the size of `fname` is not visible in this listing -- confirm d_name always fits. */
5221  strcpy(f->fname, mapping_de->d_name);
5222  files = lappend(files, f);
5223  }
5224  FreeDir(mapping_dir);
5225 
5226  /* sort files so we apply them in LSN order */
5227  list_sort(files, file_sort_by_lsn);
5228 
5229  foreach(file, files)
5230  {
 /* NOTE(review): the declaration of the per-iteration `f` is absent from this listing. */
5232 
5233  elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5234  snapshot->subxip[0]);
 /* NOTE(review): one statement is absent from this listing here, presumably the one applying the file. */
5236  pfree(f);
5237  }
5238 }
5239 
5240 /*
5241  * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5242  * combo CIDs.
 *
 * The lookup key is built from the relfilelocator of 'buffer' (which must
 * be a shared, main-fork buffer holding the tuple's block) and the tuple's
 * t_self.
 *
 * Returns true and stores the values through 'cmin' / 'cmax' (each may be
 * NULL, in which case it is not written) when an entry is found. Returns
 * false when 'tuplecid_data' is NULL or when no entry exists even after one
 * retry.
5243  */
5244 bool
/* NOTE(review): the first line of the parameter list (function name and leading parameter) is absent from this listing; the body uses `tuplecid_data`. */
5246  Snapshot snapshot,
5247  HeapTuple htup, Buffer buffer,
5248  CommandId *cmin, CommandId *cmax)
5249 {
 /* NOTE(review): the declarations of `key` and `ent` are absent from this listing. */
5252  ForkNumber forkno;
5253  BlockNumber blockno;
5254  bool updated_mapping = false;
5255 
5256  /*
5257  * Return unresolved if tuplecid_data is not valid. That's because when
5258  * streaming in-progress transactions we may run into tuples with the CID
5259  * before actually decoding them. Think e.g. about INSERT followed by
5260  * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5261  * INSERT. So in such cases, we assume the CID is from the future
5262  * command.
5263  */
5264  if (tuplecid_data == NULL)
5265  return false;
5266 
5267  /* be careful about padding */
5268  memset(&key, 0, sizeof(key));
5269 
5270  Assert(!BufferIsLocal(buffer));
5271 
5272  /*
5273  * get relfilelocator from the buffer, no convenient way to access it
5274  * other than that.
5275  */
5276  BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5277 
5278  /* tuples can only be in the main fork */
5279  Assert(forkno == MAIN_FORKNUM);
5280  Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5281 
5282  ItemPointerCopy(&htup->t_self,
5283  &key.tid);
5284 
5285 restart:
5286  ent = (ReorderBufferTupleCidEnt *)
 /* NOTE(review): the lookup expression assigned to `ent` is absent from this listing. */
5288 
5289  /*
5290  * failed to find a mapping, check whether the table was rewritten and
5291  * apply mapping if so, but only do that once - there can be no new
5292  * mappings while we are in here since we have to hold a lock on the
5293  * relation.
5294  */
5295  if (ent == NULL && !updated_mapping)
5296  {
 /* NOTE(review): the statement that applies the rewrite mappings is absent from this listing. */
5298  /* now check but don't update for a mapping again */
5299  updated_mapping = true;
5300  goto restart;
5301  }
5302  else if (ent == NULL)
5303  return false;
5304 
 /* either output pointer may be NULL if the caller does not need that value */
5305  if (cmin)
5306  *cmin = ent->cmin;
5307  if (cmax)
5308  *cmax = ent->cmax;
5309  return true;
5310 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
uint32 BlockNumber
Definition: block.h:31
static int32 next
Definition: blutils.c:221
static void cleanup(void)
Definition: bootstrap.c:682
int Buffer
Definition: buf.h:23
#define BufferIsLocal(buffer)
Definition: buf.h:37
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:3734
#define NameStr(name)
Definition: c.h:746
#define InvalidCommandId
Definition: c.h:669
unsigned int uint32
Definition: c.h:506
signed int int32
Definition: c.h:494
char * Pointer
Definition: c.h:483
#define VARHDRSZ
Definition: c.h:692
#define Assert(condition)
Definition: c.h:858
#define PG_BINARY
Definition: c.h:1273
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:398
#define FirstCommandId
Definition: c.h:668
uint32 CommandId
Definition: c.h:666
uint32 TransactionId
Definition: c.h:652
size_t Size
Definition: c.h:605
bool IsToastRelation(Relation relation)
Definition: catalog.c:145
bool IsSharedRelation(Oid relationId)
Definition: catalog.c:243
int64 TimestampTz
Definition: timestamp.h:39
#define INDIRECT_POINTER_SIZE
Definition: detoast.h:34
#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr)
Definition: detoast.h:22
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:865
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1395
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385
struct cursor * cur
Definition: ecpg.c:28
void FreeErrorData(ErrorData *edata)
Definition: elog.c:1785
int errcode_for_file_access(void)
Definition: elog.c:880
void FlushErrorState(void)
Definition: elog.c:1834
int errmsg(const char *fmt,...)
Definition: elog.c:1070
ErrorData * CopyErrorData(void)
Definition: elog.c:1729
#define PG_RE_THROW()
Definition: elog.h:411
#define DEBUG3
Definition: elog.h:28
#define PG_TRY(...)
Definition: elog.h:370
#define DEBUG2
Definition: elog.h:29
#define PG_END_TRY(...)
Definition: elog.h:395
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:380
#define elog(elevel,...)
Definition: elog.h:224
#define INFO
Definition: elog.h:34
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2909
int FreeDir(DIR *dir)
Definition: fd.c:2961
int CloseTransientFile(int fd)
Definition: fd.c:2809
void FileClose(File file)
Definition: fd.c:1978
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1575
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2924
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2633
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2843
static ssize_t FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.h:196
int File
Definition: fd.h:51
MemoryContext GenerationContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: generation.c:160
Oid MyDatabaseId
Definition: globals.c:91
uint64 chunk
#define free(a)
Definition: header.h:65
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: heaptuple.c:1116
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition: heaptuple.c:1345
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
#define HEAPTUPLESIZE
Definition: htup.h:73
HeapTupleData * HeapTuple
Definition: htup.h:71
struct HeapTupleData HeapTupleData
HeapTupleHeaderData * HeapTupleHeader
Definition: htup.h:23
#define SizeofHeapTupleHeader
Definition: htup_details.h:185
#define MaxHeapTupleSize
Definition: htup_details.h:558
static Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: htup_details.h:749
#define dlist_foreach(iter, lhead)
Definition: ilist.h:623
static void dlist_init(dlist_head *head)
Definition: ilist.h:314
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
static bool dlist_has_next(const dlist_head *head, const dlist_node *node)
Definition: ilist.h:503
static void dclist_push_tail(dclist_head *head, dlist_node *node)
Definition: ilist.h:709
static void dlist_insert_before(dlist_node *before, dlist_node *node)
Definition: ilist.h:393
#define dlist_head_element(type, membername, lhead)
Definition: ilist.h:603
static void dlist_delete(dlist_node *node)
Definition: ilist.h:405
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static dlist_node * dlist_next_node(dlist_head *head, dlist_node *node)
Definition: ilist.h:537
static dlist_node * dlist_pop_head_node(dlist_head *head)
Definition: ilist.h:450
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:640
static bool dlist_is_empty(const dlist_head *head)
Definition: ilist.h:336
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition: ilist.h:364
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
static void dclist_init(dclist_head *head)
Definition: ilist.h:671
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
static int pg_cmp_u64(uint64 a, uint64 b)
Definition: int.h:501
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
Definition: inval.c:705
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
void list_sort(List *list, list_sort_comparator cmp)
Definition: list.c:1674
List * lappend(List *list, void *datum)
Definition: list.c:339
void UpdateDecodingStats(LogicalDecodingContext *ctx)
Definition: logical.c:1931
char * pstrdup(const char *in)
Definition: mcxt.c:1695
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc0(Size size)
Definition: mcxt.c:1346
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1214
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1540
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1180
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
void * palloc(Size size)
Definition: mcxt.c:1316
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define SLAB_DEFAULT_BLOCK_SIZE
Definition: memutils.h:189
#define SLAB_LARGE_BLOCK_SIZE
Definition: memutils.h:190
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void pairingheap_remove(pairingheap *heap, pairingheap_node *node)
Definition: pairingheap.c:170
void pairingheap_add(pairingheap *heap, pairingheap_node *node)
Definition: pairingheap.c:112
pairingheap_node * pairingheap_first(pairingheap *heap)
Definition: pairingheap.c:130
pairingheap * pairingheap_allocate(pairingheap_comparator compare, void *arg)
Definition: pairingheap.c:42
#define pairingheap_container(type, membername, ptr)
Definition: pairingheap.h:43
#define pairingheap_const_container(type, membername, ptr)
Definition: pairingheap.h:51
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:209
void * arg
#define MAXPGPATH
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
#define sprintf
Definition: port.h:240
#define snprintf
Definition: port.h:238
#define qsort(a, b, c, d)
Definition: port.h:449
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static Oid DatumGetObjectId(Datum X)
Definition: postgres.h:242
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
tree context
Definition: radixtree.h:1833
MemoryContextSwitchTo(old_ctx)
#define RelationIsLogicallyLogged(relation)
Definition: rel.h:701