PostgreSQL Source Code  git master
snapbuild.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * snapbuild.c
4  *
5  * Infrastructure for building historic catalog snapshots based on contents
6  * of the WAL, for the purpose of decoding heapam.c style values in the
7  * WAL.
8  *
9  * NOTES:
10  *
11  * We build snapshots which can *only* be used to read catalog contents and we
12  * do so by reading and interpreting the WAL stream. The aim is to build a
13  * snapshot that behaves the same as a freshly taken MVCC snapshot would have
14  * at the time the XLogRecord was generated.
15  *
16  * To build the snapshots we reuse the infrastructure built for Hot
17  * Standby. The in-memory snapshots we build look different than HS' because
18  * we have different needs. To successfully decode data from the WAL we only
19  * need to access catalog tables and (sys|rel|cat)cache, not the actual user
20  * tables since the data we decode is wholly contained in the WAL
21  * records. Also, our snapshots need to be different in comparison to normal
22  * MVCC ones because in contrast to those we cannot fully rely on the clog and
23  * pg_subtrans for information about committed transactions because they might
24  * commit in the future from the POV of the WAL entry we're currently
25  * decoding. This definition has the advantage that we only need to prevent
26  * removal of catalog rows, while normal table's rows can still be
27  * removed. This is achieved by using the replication slot mechanism.
28  *
29  * As the percentage of transactions modifying the catalog normally is fairly
30  * small in comparison to ones only manipulating user data, we keep track of
31  * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
32  * track of all running transactions like it's done in a normal snapshot. Note
33  * that we're generally only looking at transactions that have acquired an
34  * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
35  * that we consider committed, everything else is considered aborted/in
36  * progress. That also allows us not to care about subtransactions before they
37  * have committed which means this module, in contrast to HS, doesn't have to
38  * care about suboverflowed subtransactions and similar.
39  *
40  * One complexity of doing this is that to e.g. handle mixed DDL/DML
41  * transactions we need Snapshots that see intermediate versions of the
42  * catalog in a transaction. During normal operation this is achieved by using
43  * CommandIds/cmin/cmax. The problem with that however is that for space
44  * efficiency reasons, the cmin and cmax are not included in WAL records. We
45  * cannot read the cmin/cmax from the tuple itself, either, because it is
46  * reset on crash recovery. Even if we could, we could not decode combocids
47  * which are only tracked in the original backend's memory. To work around
48  * that, heapam writes an extra WAL record (XLOG_HEAP2_NEW_CID) every time a
49  * catalog row is modified, which includes the cmin and cmax of the
50  * tuple. During decoding, we insert the ctid->(cmin,cmax) mappings into the
51  * reorder buffer, and use them at visibility checks instead of the cmin/cmax
52  * on the tuple itself. Check the reorderbuffer.c's comment above
53  * ResolveCminCmaxDuringDecoding() for details.
54  *
55  * To facilitate all this we need our own visibility routine, as the normal
56  * ones are optimized for different usecases.
57  *
58  * To replace the normal catalog snapshots with decoding ones use the
59  * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
60  *
61  *
62  *
63  * The snapbuild machinery is starting up in several stages, as illustrated
64  * by the following graph describing the SnapBuild->state transitions:
65  *
66  *          +-------------------------+
67  *     +----|          START          |-------------+
68  *     |    +-------------------------+             |
69  *     |                 |                          |
70  *     |                 |                          |
71  *     |         running_xacts #1                   |
72  *     |                 |                          |
73  *     |                 |                          |
74  *     |                 v                          |
75  *     |    +-------------------------+             v
76  *     |    |    BUILDING_SNAPSHOT    |------------>|
77  *     |    +-------------------------+             |
78  *     |                 |                          |
79  *     |                 |                          |
80  *     | running_xacts #2, xacts from #1 finished   |
81  *     |                 |                          |
82  *     |                 |                          |
83  *     |                 v                          |
84  *     |    +-------------------------+             v
85  *     |    |      FULL_SNAPSHOT      |------------>|
86  *     |    +-------------------------+             |
87  *     |                 |                          |
88  * running_xacts         |                   saved snapshot
89  * with zero xacts       |               at running_xacts's lsn
90  *     |                 |                          |
91  *     | running_xacts with xacts from #2 finished  |
92  *     |                 |                          |
93  *     |                 v                          |
94  *     |    +-------------------------+             |
95  *     +--->|  SNAPBUILD_CONSISTENT   |<------------+
96  *          +-------------------------+
97  *
98  * Initially the machinery is in the START stage. When an xl_running_xacts
99  * record is read that is sufficiently new (above the safe xmin horizon),
100  * there's a state transition. If there were no running xacts when the
101  * xl_running_xacts record was generated, we'll directly go into CONSISTENT
102  * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full
103  * snapshot means that all transactions that start henceforth can be decoded
104  * in their entirety, but transactions that started previously can't. In
105  * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
106  * running transactions have committed or aborted.
107  *
108  * Only transactions that commit after CONSISTENT state has been reached will
109  * be replayed, even though they might have started while still in
110  * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
111  * changes have been exported, but all the following ones will be. That point
112  * is a convenient point to initialize replication from, which is why we
113  * export a snapshot at that point, which *can* be used to read normal data.
114  *
115  * Copyright (c) 2012-2023, PostgreSQL Global Development Group
116  *
117  * IDENTIFICATION
118  * src/backend/replication/logical/snapbuild.c
119  *
120  *-------------------------------------------------------------------------
121  */
122 
123 #include "postgres.h"
124 
125 #include <sys/stat.h>
126 #include <unistd.h>
127 
128 #include "access/heapam_xlog.h"
129 #include "access/transam.h"
130 #include "access/xact.h"
131 #include "common/file_utils.h"
132 #include "miscadmin.h"
133 #include "pgstat.h"
134 #include "replication/logical.h"
136 #include "replication/snapbuild.h"
137 #include "storage/block.h" /* debugging output */
138 #include "storage/fd.h"
139 #include "storage/lmgr.h"
140 #include "storage/proc.h"
141 #include "storage/procarray.h"
142 #include "storage/standby.h"
143 #include "utils/builtins.h"
144 #include "utils/memutils.h"
145 #include "utils/snapmgr.h"
146 #include "utils/snapshot.h"
147 
148 /*
149  * This struct contains the current state of the snapshot building
150  * machinery. Besides a forward declaration in the header, it is not exposed
151  * to the public, so we can easily change its contents.
152  */
153 struct SnapBuild
154 {
155  /* how far are we along building our first full snapshot */
157 
158  /* private memory context used to allocate memory for this module. */
160 
161  /* all transactions < this have committed/aborted */
163 
164  /* all transactions >= this are uncommitted */
166 
167  /*
168  * Don't replay commits from an LSN < this LSN. This can be set externally
169  * but it will also be advanced (never retreat) from within snapbuild.c.
170  */
172 
173  /*
174  * LSN at which two-phase decoding was enabled or LSN at which we found a
175  * consistent point at the time of slot creation.
176  *
177  * The prepared transactions, that were skipped because previously
178  * two-phase was not enabled or are not covered by initial snapshot, need
179  * to be sent later along with commit prepared and they must be before
180  * this point.
181  */
183 
184  /*
185  * Don't start decoding WAL until the "xl_running_xacts" information
186  * indicates there are no running xids with an xid smaller than this.
187  */
189 
190  /* Indicates if we are building full snapshot or just catalog one. */
192 
193  /*
194  * Snapshot that's valid to see the catalog state seen at this moment.
195  */
197 
198  /*
199  * LSN of the last location we are sure a snapshot has been serialized to.
200  */
202 
203  /*
204  * The reorderbuffer we need to update with usable snapshots et al.
205  */
207 
208  /*
209  * TransactionId at which the next phase of initial snapshot building will
210  * happen. InvalidTransactionId if not known (i.e. SNAPBUILD_START), or
211  * when no next phase necessary (SNAPBUILD_CONSISTENT).
212  */
214 
215  /*
216  * Array of transactions which could have catalog changes that committed
217  * between xmin and xmax.
218  */
219  struct
220  {
221  /* number of committed transactions */
222  size_t xcnt;
223 
224  /* available space for committed transactions */
225  size_t xcnt_space;
226 
227  /*
228  * Until we reach a CONSISTENT state, we record commits of all
229  * transactions, not just the catalog changing ones. Record when that
230  * changes so we know we cannot export a snapshot safely anymore.
231  */
233 
234  /*
235  * Array of committed transactions that have modified the catalog.
236  *
237  * As this array is frequently modified we do *not* keep it in
238  * xidComparator order. Instead we sort the array when building &
239  * distributing a snapshot.
240  *
241  * TODO: It's unclear whether that reasoning has much merit. Every
242  * time we add something here after becoming consistent will also
243  * require distributing a snapshot. Storing them sorted would
244  * potentially also make it easier to purge (but more complicated wrt
245  * wraparound?). Should be improved if sorting while building the
246  * snapshot shows up in profiles.
247  */
250 
251  /*
252  * Array of transactions and subtransactions that had modified catalogs
253  * and were running when the snapshot was serialized.
254  *
255  * We normally rely on some WAL record types such as HEAP2_NEW_CID to know
256  * if the transaction has changed the catalog. But it could happen that
257  * the logical decoding decodes only the commit record of the transaction
258  * after restoring the previously serialized snapshot in which case we
259  * will miss adding the xid to the snapshot and end up looking at the
260  * catalogs with the wrong snapshot.
261  *
262  * Now to avoid the above problem, we serialize the transactions that had
263  * modified the catalogs and are still running at the time of snapshot
264  * serialization. We fill this array while restoring the snapshot and then
265  * refer to it while decoding a commit to check whether the xact has modified
266  * the catalog. We discard this array once all the xids in the list have become
267  * too old to matter. See SnapBuildPurgeOlderTxn for details.
268  */
269  struct
270  {
271  /* number of transactions */
272  size_t xcnt;
273 
274  /* This array must be sorted in xidComparator order */
277 };
278 
279 /*
280  * Starting a transaction -- which we need to do while exporting a snapshot --
281  * removes knowledge about the previously used resowner, so we save it here.
282  */
284 static bool ExportInProgress = false;
285 
286 /* ->committed and ->catchange manipulation */
287 static void SnapBuildPurgeOlderTxn(SnapBuild *builder);
288 
289 /* snapshot building/manipulation/distribution functions */
290 static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder);
291 
292 static void SnapBuildFreeSnapshot(Snapshot snap);
293 
294 static void SnapBuildSnapIncRefcount(Snapshot snap);
295 
297 
298 static inline bool SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid,
299  uint32 xinfo);
300 
301 /* xlog reading helper functions for SnapBuildProcessRunningXacts */
302 static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
303 static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff);
304 
305 /* serialization functions */
306 static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
307 static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
308 static void SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path);
309 
310 /*
311  * Allocate a new snapshot builder.
312  *
313  * xmin_horizon is the xid >= which we can be sure no catalog rows have been
314  * removed, start_lsn is the LSN >= which we want to replay commits.
315  */
316 SnapBuild *
318  TransactionId xmin_horizon,
319  XLogRecPtr start_lsn,
320  bool need_full_snapshot,
321  XLogRecPtr two_phase_at)
322 {
323  MemoryContext context;
324  MemoryContext oldcontext;
325  SnapBuild *builder;
326 
327  /* allocate memory in own context, to have better accountability */
329  "snapshot builder context",
331  oldcontext = MemoryContextSwitchTo(context);
332 
333  builder = palloc0(sizeof(SnapBuild));
334 
335  builder->state = SNAPBUILD_START;
336  builder->context = context;
337  builder->reorder = reorder;
338  /* Other struct members initialized by zeroing via palloc0 above */
339 
340  builder->committed.xcnt = 0;
341  builder->committed.xcnt_space = 128; /* arbitrary number */
342  builder->committed.xip =
343  palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
344  builder->committed.includes_all_transactions = true;
345 
346  builder->catchange.xcnt = 0;
347  builder->catchange.xip = NULL;
348 
349  builder->initial_xmin_horizon = xmin_horizon;
350  builder->start_decoding_at = start_lsn;
351  builder->building_full_snapshot = need_full_snapshot;
352  builder->two_phase_at = two_phase_at;
353 
354  MemoryContextSwitchTo(oldcontext);
355 
356  return builder;
357 }
358 
359 /*
360  * Free a snapshot builder.
361  */
362 void
364 {
365  MemoryContext context = builder->context;
366 
367  /* free snapshot explicitly, that contains some error checking */
368  if (builder->snapshot != NULL)
369  {
371  builder->snapshot = NULL;
372  }
373 
374  /* other resources are deallocated via memory context deletion */
375  MemoryContextDelete(context);
376 }
377 
378 /*
379  * Free an unreferenced snapshot that has previously been built by us.
380  */
381 static void
383 {
384  /* make sure we don't get passed an external snapshot */
386 
387  /* make sure nobody modified our snapshot */
388  Assert(snap->curcid == FirstCommandId);
389  Assert(!snap->suboverflowed);
390  Assert(!snap->takenDuringRecovery);
391  Assert(snap->regd_count == 0);
392 
393  /* slightly more likely, so it's checked even without c-asserts */
394  if (snap->copied)
395  elog(ERROR, "cannot free a copied snapshot");
396 
397  if (snap->active_count)
398  elog(ERROR, "cannot free an active snapshot");
399 
400  pfree(snap);
401 }
402 
403 /*
404  * In which state of snapshot building are we?
405  */
408 {
409  return builder->state;
410 }
411 
412 /*
413  * Return the LSN at which the two-phase decoding was first enabled.
414  */
417 {
418  return builder->two_phase_at;
419 }
420 
421 /*
422  * Set the LSN at which two-phase decoding is enabled.
423  */
424 void
426 {
427  builder->two_phase_at = ptr;
428 }
429 
430 /*
431  * Should the transaction ending at 'ptr' be skipped, i.e. not be decoded?
432  */
433 bool
435 {
436  return ptr < builder->start_decoding_at;
437 }
438 
439 /*
440  * Increase refcount of a snapshot.
441  *
442  * This is used when handing out a snapshot to some external resource or when
443  * adding a Snapshot as builder->snapshot.
444  */
445 static void
447 {
448  snap->active_count++;
449 }
450 
451 /*
452  * Decrease refcount of a snapshot and free if the refcount reaches zero.
453  *
454  * Externally visible, so that external resources that have been handed an
455  * IncRef'ed Snapshot can adjust its refcount easily.
456  */
457 void
459 {
460  /* make sure we don't get passed an external snapshot */
462 
463  /* make sure nobody modified our snapshot */
464  Assert(snap->curcid == FirstCommandId);
465  Assert(!snap->suboverflowed);
466  Assert(!snap->takenDuringRecovery);
467 
468  Assert(snap->regd_count == 0);
469 
470  Assert(snap->active_count > 0);
471 
472  /* slightly more likely, so it's checked even without casserts */
473  if (snap->copied)
474  elog(ERROR, "cannot free a copied snapshot");
475 
476  snap->active_count--;
477  if (snap->active_count == 0)
478  SnapBuildFreeSnapshot(snap);
479 }
480 
481 /*
482  * Build a new snapshot, based on currently committed catalog-modifying
483  * transactions.
484  *
485  * In-progress transactions with catalog access are *not* allowed to modify
486  * these snapshots; they have to copy them and fill in appropriate ->curcid
487  * and ->subxip/subxcnt values.
488  */
489 static Snapshot
491 {
492  Snapshot snapshot;
493  Size ssize;
494 
495  Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
496 
497  ssize = sizeof(SnapshotData)
498  + sizeof(TransactionId) * builder->committed.xcnt
499  + sizeof(TransactionId) * 1 /* toplevel xid */ ;
500 
501  snapshot = MemoryContextAllocZero(builder->context, ssize);
502 
504 
505  /*
506  * We misuse the original meaning of SnapshotData's xip and subxip fields
507  * to make them more fitting for our needs.
508  *
509  * In the 'xip' array we store transactions that have to be treated as
510  * committed. Since we will only ever look at tuples from transactions
511  * that have modified the catalog it's more efficient to store those few
512  * that exist between xmin and xmax (frequently there are none).
513  *
514  * Snapshots that are used in transactions that have modified the catalog
515  * also use the 'subxip' array to store their toplevel xid and all the
516  * subtransaction xids so we can recognize when we need to treat rows as
517  * visible that are not in xip but still need to be visible. Subxip only
518  * gets filled when the snapshot is copied into the context of a
519  * catalog modifying transaction since we otherwise share a snapshot
520  * between transactions. As long as a txn hasn't modified the catalog it
521  * doesn't need to treat any uncommitted rows as visible, so there is no
522  * need for those xids.
523  *
524  * Both arrays are qsort'ed so that we can use bsearch() on them.
525  */
526  Assert(TransactionIdIsNormal(builder->xmin));
527  Assert(TransactionIdIsNormal(builder->xmax));
528 
529  snapshot->xmin = builder->xmin;
530  snapshot->xmax = builder->xmax;
531 
532  /* store all transactions to be treated as committed by this snapshot */
533  snapshot->xip =
534  (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
535  snapshot->xcnt = builder->committed.xcnt;
536  memcpy(snapshot->xip,
537  builder->committed.xip,
538  builder->committed.xcnt * sizeof(TransactionId));
539 
540  /* sort so we can bsearch() */
541  qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
542 
543  /*
544  * Initially, subxip is empty, i.e. it's a snapshot to be used by
545  * transactions that don't modify the catalog. Will be filled by
546  * ReorderBufferCopySnap() if necessary.
547  */
548  snapshot->subxcnt = 0;
549  snapshot->subxip = NULL;
550 
551  snapshot->suboverflowed = false;
552  snapshot->takenDuringRecovery = false;
553  snapshot->copied = false;
554  snapshot->curcid = FirstCommandId;
555  snapshot->active_count = 0;
556  snapshot->regd_count = 0;
557  snapshot->snapXactCompletionCount = 0;
558 
559  return snapshot;
560 }
561 
562 /*
563  * Build the initial slot snapshot and convert it to a normal snapshot that
564  * is understood by HeapTupleSatisfiesMVCC.
565  *
566  * The snapshot will be usable directly in current transaction or exported
567  * for loading in different transaction.
568  */
569 Snapshot
571 {
572  Snapshot snap;
573  TransactionId xid;
574  TransactionId safeXid;
575  TransactionId *newxip;
576  int newxcnt = 0;
577 
579  Assert(builder->building_full_snapshot);
580 
581  /* don't allow older snapshots */
582  InvalidateCatalogSnapshot(); /* about to overwrite MyProc->xmin */
584  elog(ERROR, "cannot build an initial slot snapshot when snapshots exist");
586 
587  if (builder->state != SNAPBUILD_CONSISTENT)
588  elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state");
589 
590  if (!builder->committed.includes_all_transactions)
591  elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore");
592 
593  /* so we don't overwrite the existing value */
595  elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid");
596 
597  snap = SnapBuildBuildSnapshot(builder);
598 
599  /*
600  * We know that snap->xmin is alive, enforced by the logical xmin
601  * mechanism. Due to that we can do this without locks, we're only
602  * changing our own value.
603  *
604  * Building an initial snapshot is expensive and an unenforced xmin
605  * horizon would have bad consequences, therefore always double-check that
606  * the horizon is enforced.
607  */
608  LWLockAcquire(ProcArrayLock, LW_SHARED);
609  safeXid = GetOldestSafeDecodingTransactionId(false);
610  LWLockRelease(ProcArrayLock);
611 
612  if (TransactionIdFollows(safeXid, snap->xmin))
613  elog(ERROR, "cannot build an initial slot snapshot as oldest safe xid %u follows snapshot's xmin %u",
614  safeXid, snap->xmin);
615 
616  MyProc->xmin = snap->xmin;
617 
618  /* allocate in transaction context */
619  newxip = (TransactionId *)
621 
622  /*
623  * snapbuild.c builds snapshots in an "inverted" manner, which means it
624  * stores committed transactions in ->xip, not ones in progress. Build a
625  * classical snapshot by marking all non-committed transactions as
626  * in-progress. This can be expensive.
627  */
628  for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
629  {
630  void *test;
631 
632  /*
633  * Check whether transaction committed using the decoding snapshot
634  * meaning of ->xip.
635  */
636  test = bsearch(&xid, snap->xip, snap->xcnt,
637  sizeof(TransactionId), xidComparator);
638 
639  if (test == NULL)
640  {
641  if (newxcnt >= GetMaxSnapshotXidCount())
642  ereport(ERROR,
644  errmsg("initial slot snapshot too large")));
645 
646  newxip[newxcnt++] = xid;
647  }
648 
650  }
651 
652  /* adjust remaining snapshot fields as needed */
654  snap->xcnt = newxcnt;
655  snap->xip = newxip;
656 
657  return snap;
658 }
659 
660 /*
661  * Export a snapshot so it can be set in another session with SET TRANSACTION
662  * SNAPSHOT.
663  *
664  * For that we need to start a transaction in the current backend as the
665  * importing side checks whether the source transaction is still open to make
666  * sure the xmin horizon hasn't advanced since then.
667  */
668 const char *
670 {
671  Snapshot snap;
672  char *snapname;
673 
675  elog(ERROR, "cannot export a snapshot from within a transaction");
676 
678  elog(ERROR, "can only export one snapshot at a time");
679 
681  ExportInProgress = true;
682 
684 
685  /* There doesn't seem to be a nice API to set these */
687  XactReadOnly = true;
688 
689  snap = SnapBuildInitialSnapshot(builder);
690 
691  /*
692  * now that we've built a plain snapshot, make it active and use the
693  * normal mechanisms for exporting it
694  */
695  snapname = ExportSnapshot(snap);
696 
697  ereport(LOG,
698  (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID",
699  "exported logical decoding snapshot: \"%s\" with %u transaction IDs",
700  snap->xcnt,
701  snapname, snap->xcnt)));
702  return snapname;
703 }
704 
705 /*
706  * Ensure there is a snapshot and if not build one for current transaction.
707  */
708 Snapshot
710 {
711  Assert(builder->state == SNAPBUILD_CONSISTENT);
712 
713  /* only build a new snapshot if we don't have a prebuilt one */
714  if (builder->snapshot == NULL)
715  {
716  builder->snapshot = SnapBuildBuildSnapshot(builder);
717  /* increase refcount for the snapshot builder */
719  }
720 
721  return builder->snapshot;
722 }
723 
724 /*
725  * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
726  * any. Aborts the previously started transaction and resets the resource
727  * owner back to its original value.
728  */
729 void
731 {
732  ResourceOwner tmpResOwner;
733 
734  /* nothing exported, that is the usual case */
735  if (!ExportInProgress)
736  return;
737 
738  if (!IsTransactionState())
739  elog(ERROR, "clearing exported snapshot in wrong transaction state");
740 
741  /*
742  * AbortCurrentTransaction() takes care of resetting the snapshot state,
743  * so remember SavedResourceOwnerDuringExport.
744  */
745  tmpResOwner = SavedResourceOwnerDuringExport;
746 
747  /* make sure nothing could have ever happened */
749 
750  CurrentResourceOwner = tmpResOwner;
751 }
752 
753 /*
754  * Clear snapshot export state during transaction abort.
755  */
756 void
758 {
760  ExportInProgress = false;
761 }
762 
763 /*
764  * Handle the effects of a single heap change, appropriate to the current state
765  * of the snapshot builder and returns whether changes made at (xid, lsn) can
766  * be decoded.
767  */
768 bool
770 {
771  /*
772  * We can't handle data in transactions if we haven't built a snapshot
773  * yet, so don't store them.
774  */
775  if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
776  return false;
777 
778  /*
779  * No point in keeping track of changes in transactions that we don't have
780  * enough information about to decode. This means that they started before
781  * we got into the SNAPBUILD_FULL_SNAPSHOT state.
782  */
783  if (builder->state < SNAPBUILD_CONSISTENT &&
784  TransactionIdPrecedes(xid, builder->next_phase_at))
785  return false;
786 
787  /*
788  * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
789  * be needed to decode the change we're currently processing.
790  */
791  if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
792  {
793  /* only build a new snapshot if we don't have a prebuilt one */
794  if (builder->snapshot == NULL)
795  {
796  builder->snapshot = SnapBuildBuildSnapshot(builder);
797  /* increase refcount for the snapshot builder */
799  }
800 
801  /*
802  * Increase refcount for the transaction we're handing the snapshot
803  * out to.
804  */
806  ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
807  builder->snapshot);
808  }
809 
810  return true;
811 }
812 
813 /*
814  * Do CommandId/combo CID handling after reading an xl_heap_new_cid record.
815  * This implies that a transaction has done some form of write to system
816  * catalogs.
817  */
818 void
820  XLogRecPtr lsn, xl_heap_new_cid *xlrec)
821 {
822  CommandId cid;
823 
824  /*
825  * we only log new_cid's if a catalog tuple was modified, so mark the
826  * transaction as containing catalog modifications
827  */
828  ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
829 
830  ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
831  xlrec->target_locator, xlrec->target_tid,
832  xlrec->cmin, xlrec->cmax,
833  xlrec->combocid);
834 
835  /* figure out new command id */
836  if (xlrec->cmin != InvalidCommandId &&
837  xlrec->cmax != InvalidCommandId)
838  cid = Max(xlrec->cmin, xlrec->cmax);
839  else if (xlrec->cmax != InvalidCommandId)
840  cid = xlrec->cmax;
841  else if (xlrec->cmin != InvalidCommandId)
842  cid = xlrec->cmin;
843  else
844  {
845  cid = InvalidCommandId; /* silence compiler */
846  elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
847  }
848 
849  ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
850 }
851 
852 /*
853  * Add a new Snapshot to all transactions we're decoding that currently are
854  * in-progress so they can see new catalog contents made by the transaction
855  * that just committed. This is necessary because those in-progress
856  * transactions will use the new catalog's contents from here on (at the very
857  * least everything they do needs to be compatible with newer catalog
858  * contents).
859  */
860 static void
862 {
863  dlist_iter txn_i;
864  ReorderBufferTXN *txn;
865 
866  /*
867  * Iterate through all toplevel transactions. This can include
868  * subtransactions which we just don't yet know to be that, but that's
869  * fine, they will just get an unnecessary snapshot queued.
870  */
871  dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
872  {
873  txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
874 
876 
877  /*
878  * If we don't have a base snapshot yet, there are no changes in this
879  * transaction which in turn implies we don't yet need a snapshot at
880  * all. We'll add a snapshot when the first change gets queued.
881  *
882  * NB: This works correctly even for subtransactions because
883  * ReorderBufferAssignChild() takes care to transfer the base snapshot
884  * to the top-level transaction, and while iterating the changequeue
885  * we'll get the change from the subtxn.
886  */
887  if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
888  continue;
889 
890  /*
891  * We don't need to add snapshot to prepared transactions as they
892  * should not see the new catalog contents.
893  */
894  if (rbtxn_prepared(txn) || rbtxn_skip_prepared(txn))
895  continue;
896 
897  elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
898  txn->xid, LSN_FORMAT_ARGS(lsn));
899 
900  /*
901  * increase the snapshot's refcount for the transaction we are handing
902  * it out to
903  */
905  ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
906  builder->snapshot);
907  }
908 }
909 
910 /*
911  * Keep track of a new catalog changing transaction that has committed.
912  */
913 static void
915 {
917 
918  if (builder->committed.xcnt == builder->committed.xcnt_space)
919  {
920  builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
921 
922  elog(DEBUG1, "increasing space for committed transactions to %u",
923  (uint32) builder->committed.xcnt_space);
924 
925  builder->committed.xip = repalloc(builder->committed.xip,
926  builder->committed.xcnt_space * sizeof(TransactionId));
927  }
928 
929  /*
930  * TODO: It might make sense to keep the array sorted here instead of
931  * doing it every time we build a new snapshot. On the other hand this
932  * gets called repeatedly when a transaction with subtransactions commits.
933  */
934  builder->committed.xip[builder->committed.xcnt++] = xid;
935 }
936 
937 /*
938  * Remove knowledge about transactions we treat as committed or containing catalog
939  * changes that are smaller than ->xmin. Those won't ever get checked via
940  * the ->committed or ->catchange array, respectively. The committed xids will
941  * get checked via the clog machinery.
942  *
943  * We can ideally remove the transaction from catchange array once it is
944  * finished (committed/aborted) but that could be costly as we need to maintain
945  * the xids order in the array.
946  */
947 static void
949 {
950  int off;
951  TransactionId *workspace;
952  int surviving_xids = 0;
953 
954  /* not ready yet */
955  if (!TransactionIdIsNormal(builder->xmin))
956  return;
957 
958  /* TODO: Neater algorithm than just copying and iterating? */
959  workspace =
960  MemoryContextAlloc(builder->context,
961  builder->committed.xcnt * sizeof(TransactionId));
962 
963  /* copy xids that still are interesting to workspace */
964  for (off = 0; off < builder->committed.xcnt; off++)
965  {
966  if (NormalTransactionIdPrecedes(builder->committed.xip[off],
967  builder->xmin))
968  ; /* remove */
969  else
970  workspace[surviving_xids++] = builder->committed.xip[off];
971  }
972 
973  /* copy workspace back to persistent state */
974  memcpy(builder->committed.xip, workspace,
975  surviving_xids * sizeof(TransactionId));
976 
977  elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
978  (uint32) builder->committed.xcnt, (uint32) surviving_xids,
979  builder->xmin, builder->xmax);
980  builder->committed.xcnt = surviving_xids;
981 
982  pfree(workspace);
983 
984  /*
985  * Purge xids in ->catchange as well. The purged array must also be sorted
986  * in xidComparator order.
987  */
988  if (builder->catchange.xcnt > 0)
989  {
990  /*
991  * Since catchange.xip is sorted, we find the lower bound of xids that
992  * are still interesting.
993  */
994  for (off = 0; off < builder->catchange.xcnt; off++)
995  {
996  if (TransactionIdFollowsOrEquals(builder->catchange.xip[off],
997  builder->xmin))
998  break;
999  }
1000 
1001  surviving_xids = builder->catchange.xcnt - off;
1002 
1003  if (surviving_xids > 0)
1004  {
1005  memmove(builder->catchange.xip, &(builder->catchange.xip[off]),
1006  surviving_xids * sizeof(TransactionId));
1007  }
1008  else
1009  {
1010  pfree(builder->catchange.xip);
1011  builder->catchange.xip = NULL;
1012  }
1013 
1014  elog(DEBUG3, "purged catalog modifying transactions from %u to %u, xmin: %u, xmax: %u",
1015  (uint32) builder->catchange.xcnt, (uint32) surviving_xids,
1016  builder->xmin, builder->xmax);
1017  builder->catchange.xcnt = surviving_xids;
1018  }
1019 }
1020 
1021 /*
1022  * Handle everything that needs to be done when a transaction commits
 *
 * 'xid' is the committing top-level transaction and 'subxacts' holds its
 * 'nsubxacts' subtransaction ids; 'lsn' is compared against and used to
 * advance builder->start_decoding_at. 'xinfo' is handed unchanged to
 * SnapBuildXidHasCatalogChanges() to decide whether an xid modified the
 * catalog. Xids that have to be tracked are appended with
 * SnapBuildAddCommittedTxn(); builder->xmax may then be moved to the largest
 * such xid and advanced with TransactionIdAdvance(). If any of them modified
 * the catalog and the builder has reached at least SNAPBUILD_FULL_SNAPSHOT,
 * a new snapshot is built with SnapBuildBuildSnapshot().
 *
 * NOTE(review): the numbering embedded in this extract skips 1025, 1163,
 * 1170, 1176 and 1179, so the line carrying the function name and first
 * parameters, as well as several statements, appear to be missing here.
 * Confirm against the upstream file before relying on this text.
1023  */
1024 void
1026  int nsubxacts, TransactionId *subxacts, uint32 xinfo)
1027 {
1028  int nxact;
1029 
 /* set when a new historic snapshot has to be built (catalog was modified) */
1030  bool needs_snapshot = false;
 /* set when the top-level xid has to be tracked in ->committed */
1031  bool needs_timetravel = false;
 /* set when at least one subtransaction modified the catalog */
1032  bool sub_needs_timetravel = false;
1033 
 /* largest xid tracked by this commit; starts out as the top-level xid */
1034  TransactionId xmax = xid;
1035 
1036  /*
1037  * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor
1038  * will they be part of a snapshot. So we don't need to record anything.
1039  */
1040  if (builder->state == SNAPBUILD_START ||
1041  (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
1042  TransactionIdPrecedes(xid, builder->next_phase_at)))
1043  {
1044  /* ensure that only commits after this are getting replayed */
1045  if (builder->start_decoding_at <= lsn)
1046  builder->start_decoding_at = lsn + 1;
1047  return;
1048  }
1049 
1050  if (builder->state < SNAPBUILD_CONSISTENT)
1051  {
1052  /* ensure that only commits after this are getting replayed */
1053  if (builder->start_decoding_at <= lsn)
1054  builder->start_decoding_at = lsn + 1;
1055 
1056  /*
1057  * If building an exportable snapshot, force xid to be tracked, even
1058  * if the transaction didn't modify the catalog.
1059  */
1060  if (builder->building_full_snapshot)
1061  {
1062  needs_timetravel = true;
1063  }
1064  }
1065 
1066  for (nxact = 0; nxact < nsubxacts; nxact++)
1067  {
1068  TransactionId subxid = subxacts[nxact];
1069 
1070  /*
1071  * Add subtransaction to base snapshot if catalog modifying, we don't
1072  * distinguish to toplevel transactions there.
1073  */
1074  if (SnapBuildXidHasCatalogChanges(builder, subxid, xinfo))
1075  {
1076  sub_needs_timetravel = true;
1077  needs_snapshot = true;
1078 
1079  elog(DEBUG1, "found subtransaction %u:%u with catalog changes",
1080  xid, subxid);
1081 
1082  SnapBuildAddCommittedTxn(builder, subxid);
1083 
1084  if (NormalTransactionIdFollows(subxid, xmax))
1085  xmax = subxid;
1086  }
1087 
1088  /*
1089  * If we're forcing timetravel we also need visibility information
1090  * about subtransaction, so keep track of subtransaction's state, even
1091  * if not catalog modifying. Don't need to distribute a snapshot in
1092  * that case.
1093  */
1094  else if (needs_timetravel)
1095  {
1096  SnapBuildAddCommittedTxn(builder, subxid);
1097  if (NormalTransactionIdFollows(subxid, xmax))
1098  xmax = subxid;
1099  }
1100  }
1101 
1102  /* if top-level modified catalog, it'll need a snapshot */
1103  if (SnapBuildXidHasCatalogChanges(builder, xid, xinfo))
1104  {
1105  elog(DEBUG2, "found top level transaction %u, with catalog changes",
1106  xid);
1107  needs_snapshot = true;
1108  needs_timetravel = true;
1109  SnapBuildAddCommittedTxn(builder, xid);
1110  }
1111  else if (sub_needs_timetravel)
1112  {
1113  /* track toplevel txn as well, subxact alone isn't meaningful */
1114  elog(DEBUG2, "forced transaction %u to do timetravel due to one of its subtransactions",
1115  xid);
1116  needs_timetravel = true;
1117  SnapBuildAddCommittedTxn(builder, xid);
1118  }
1119  else if (needs_timetravel)
1120  {
1121  elog(DEBUG2, "forced transaction %u to do timetravel", xid);
1122 
1123  SnapBuildAddCommittedTxn(builder, xid);
1124  }
1125 
1126  if (!needs_timetravel)
1127  {
1128  /* record that we cannot export a general snapshot anymore */
1129  builder->committed.includes_all_transactions = false;
1130  }
1131 
 /* every path above that sets needs_snapshot also ends up setting needs_timetravel */
1132  Assert(!needs_snapshot || needs_timetravel);
1133 
1134  /*
1135  * Adjust xmax of the snapshot builder, we only do that for committed,
1136  * catalog modifying, transactions, everything else isn't interesting for
1137  * us since we'll never look at the respective rows.
1138  */
1139  if (needs_timetravel &&
1140  (!TransactionIdIsValid(builder->xmax) ||
1141  TransactionIdFollowsOrEquals(xmax, builder->xmax)))
1142  {
1143  builder->xmax = xmax;
1144  TransactionIdAdvance(builder->xmax);
1145  }
1146 
1147  /* if there's any reason to build a historic snapshot, do so now */
1148  if (needs_snapshot)
1149  {
1150  /*
1151  * If we haven't built a complete snapshot yet there's no need to hand
1152  * it out, it wouldn't (and couldn't) be used anyway.
1153  */
1154  if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
1155  return;
1156 
1157  /*
1158  * Decrease the snapshot builder's refcount of the old snapshot, note
1159  * that it still will be used if it has been handed out to the
1160  * reorderbuffer earlier.
1161  */
1162  if (builder->snapshot)
 /*
  * NOTE(review): the statement guarded by this "if" (presumably the
  * refcount decrement described above) is missing from this extract; as
  * shown, the "if" would instead guard the assignment below. Confirm
  * against upstream.
  */
1164 
1165  builder->snapshot = SnapBuildBuildSnapshot(builder);
1166 
1167  /* we might need to execute invalidations, add snapshot */
1168  if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
1169  {
1171  ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
1172  builder->snapshot);
1173  }
1174 
1175  /* refcount of the snapshot builder for the new snapshot */
1177 
1178  /* add a new catalog snapshot to all currently running transactions */
1180  }
1181 }
1182 
1183 /*
1184  * Check the reorder buffer and the snapshot to see if the given transaction has
1185  * modified catalogs.
1186  */
1187 static inline bool
1189  uint32 xinfo)
1190 {
1191  if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
1192  return true;
1193 
1194  /*
1195  * The transactions that have changed catalogs must have invalidation
1196  * info.
1197  */
1198  if (!(xinfo & XACT_XINFO_HAS_INVALS))
1199  return false;
1200 
1201  /* Check the catchange XID array */
1202  return ((builder->catchange.xcnt > 0) &&
1203  (bsearch(&xid, builder->catchange.xip, builder->catchange.xcnt,
1204  sizeof(TransactionId), xidComparator) != NULL));
1205 }
1206 
1207 /* -----------------------------------
1208  * Snapshot building functions dealing with xlog records
1209  * -----------------------------------
1210  */
1211 
1212 /*
1213  * Process a running xacts record, and use its information to first build a
1214  * historic snapshot and later to release resources that aren't needed
1215  * anymore.
 *
 * 'lsn' is passed on to SnapBuildFindSnapshot()/SnapBuildSerialize() and to
 * LogicalIncreaseXminForSlot(); of 'running' this function itself only reads
 * ->oldestRunningXid. Returns early, without any cleanup, when
 * SnapBuildFindSnapshot() returns false.
 *
 * NOTE(review): the numbering embedded in this extract skips 1218, 1296 and
 * 1303-1305, so the line carrying the function name and parameters and the
 * statements that report the restart position appear to be missing here.
 * Confirm against the upstream file.
1216  */
1217 void
1219 {
1220  ReorderBufferTXN *txn;
1221  TransactionId xmin;
1222 
1223  /*
1224  * If we're not consistent yet, inspect the record to see whether it
1225  * allows to get closer to being consistent. If we are consistent, dump
1226  * our snapshot so others or we, after a restart, can use it.
1227  */
1228  if (builder->state < SNAPBUILD_CONSISTENT)
1229  {
1230  /* returns false if there's no point in performing cleanup just yet */
1231  if (!SnapBuildFindSnapshot(builder, lsn, running))
1232  return;
1233  }
1234  else
1235  SnapBuildSerialize(builder, lsn);
1236 
1237  /*
1238  * Update range of interesting xids based on the running xacts
1239  * information. We don't increase ->xmax using it, because once we are in
1240  * a consistent state we can do that ourselves and much more efficiently
1241  * so, because we only need to do it for catalog transactions since we
1242  * only ever look at those.
1243  *
1244  * NB: We only increase xmax when a catalog modifying transaction commits
1245  * (see SnapBuildCommitTxn). Because of this, xmax can be lower than
1246  * xmin, which looks odd but is correct and actually more efficient, since
1247  * we hit fast paths in heapam_visibility.c.
1248  */
1249  builder->xmin = running->oldestRunningXid;
1250 
1251  /* Remove transactions we don't need to keep track of anymore */
1252  SnapBuildPurgeOlderTxn(builder);
1253 
1254  /*
1255  * Advance the xmin limit for the current replication slot, to allow
1256  * vacuum to clean up the tuples this slot has been protecting.
1257  *
1258  * The reorderbuffer might have an xmin among the currently running
1259  * snapshots; use it if so. If not, we need only consider the snapshots
1260  * we'll produce later, which can't be less than the oldest running xid in
1261  * the record we're reading now.
1262  */
1263  xmin = ReorderBufferGetOldestXmin(builder->reorder);
1264  if (xmin == InvalidTransactionId)
1265  xmin = running->oldestRunningXid;
1266  elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
1267  builder->xmin, builder->xmax, running->oldestRunningXid, xmin);
1268  LogicalIncreaseXminForSlot(lsn, xmin);
1269 
1270  /*
1271  * Also tell the slot where we can restart decoding from. We don't want to
1272  * do that after every commit because changing that implies an fsync of
1273  * the logical slot's state file, so we only do it every time we see a
1274  * running xacts record.
1275  *
1276  * Do so by looking for the oldest in progress transaction (determined by
1277  * the first LSN of any of its relevant records). Every transaction
1278  * remembers the last location we stored the snapshot to disk before its
1279  * beginning. That point is where we can restart from.
1280  */
1281 
1282  /*
1283  * Can't know about a serialized snapshot's location if we're not
1284  * consistent.
1285  */
1286  if (builder->state < SNAPBUILD_CONSISTENT)
1287  return;
1288 
1289  txn = ReorderBufferGetOldestTXN(builder->reorder);
1290 
1291  /*
1292  * oldest ongoing txn might have started when we didn't yet serialize
1293  * anything because we hadn't reached a consistent state yet.
1294  */
1295  if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
 /*
  * NOTE(review): the statement belonging to this "if" is missing from this
  * extract (presumably it reports txn->restart_decoding_lsn as the restart
  * position); confirm against upstream.
  */
1297 
1298  /*
1299  * No in-progress transaction, can reuse the last serialized snapshot if
1300  * we have one.
1301  */
1302  else if (txn == NULL &&
 /*
  * NOTE(review): the rest of this condition and the start of the call that
  * takes builder->last_serialized_snapshot are missing from this extract.
  */
1306  builder->last_serialized_snapshot);
1307 }
1308 
1309 
1310 /*
1311  * Build the start of a snapshot that's capable of decoding the catalog.
1312  *
1313  * Helper function for SnapBuildProcessRunningXacts() while we're not yet
1314  * consistent.
1315  *
1316  * Returns true if there is a point in performing internal maintenance/cleanup
1317  * using the xl_running_xacts record.
 *
 * Concretely, as far as visible here: false is returned when jumping
 * straight to CONSISTENT because no transactions were running (case a) and
 * when SnapBuildRestore() succeeded (case b); every other path, including the
 * "xmin horizon too low" one, returns true.
 *
 * NOTE(review): the numbering embedded in this extract skips 1320,
 * 1349-1350, 1390, 1421, 1454, 1478 and 1482, so the line carrying the
 * function name and parameters, parts of several conditions and some
 * statements appear to be missing here. Confirm against the upstream file.
1318  */
1319 static bool
1321 {
1322  /* ---
1323  * Build catalog decoding snapshot incrementally using information about
1324  * the currently running transactions. There are several ways to do that:
1325  *
1326  * a) There were no running transactions when the xl_running_xacts record
1327  * was inserted, jump to CONSISTENT immediately. We might find such a
1328  * state while waiting on c)'s sub-states.
1329  *
1330  * b) This (in a previous run) or another decoding slot serialized a
1331  * snapshot to disk that we can use. Can't use this method for the
1332  * initial snapshot when slot is being created and needs full snapshot
1333  * for export or direct use, as that snapshot will only contain catalog
1334  * modifying transactions.
1335  *
1336  * c) First incrementally build a snapshot for catalog tuples
1337  * (BUILDING_SNAPSHOT), that requires all, already in-progress,
1338  * transactions to finish. Every transaction starting after that
1339  * (FULL_SNAPSHOT state), has enough information to be decoded. But
1340  * for older running transactions no viable snapshot exists yet, so
1341  * CONSISTENT will only be reached once all of those have finished.
1342  * ---
1343  */
1344 
1345  /*
1346  * xl_running_xacts record is older than what we can use, we might not
1347  * have all necessary catalog rows anymore.
1348  */
 /*
  * NOTE(review): the beginning of this "if" condition is missing from this
  * extract; judging by the message below it compares
  * running->oldestRunningXid with builder->initial_xmin_horizon.
  */
1351  builder->initial_xmin_horizon))
1352  {
1353  ereport(DEBUG1,
1354  (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
1355  LSN_FORMAT_ARGS(lsn)),
1356  errdetail_internal("initial xmin horizon of %u vs the snapshot's %u",
1357  builder->initial_xmin_horizon, running->oldestRunningXid)));
1358 
1359 
1360  SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon);
1361 
1362  return true;
1363  }
1364 
1365  /*
1366  * a) No transactions were running, we can jump to consistent.
1367  *
1368  * This is not affected by races around xl_running_xacts, because we can
1369  * miss transaction commits, but currently not transactions starting.
1370  *
1371  * NB: We might have already started to incrementally assemble a snapshot,
1372  * so we need to be careful to deal with that.
1373  */
1374  if (running->oldestRunningXid == running->nextXid)
1375  {
1376  if (builder->start_decoding_at == InvalidXLogRecPtr ||
1377  builder->start_decoding_at <= lsn)
1378  /* can decode everything after this */
1379  builder->start_decoding_at = lsn + 1;
1380 
1381  /* As no transactions were running xmin/xmax can be trivially set. */
1382  builder->xmin = running->nextXid; /* < are finished */
1383  builder->xmax = running->nextXid; /* >= are running */
1384 
1385  /* so we can safely use the faster comparisons */
1386  Assert(TransactionIdIsNormal(builder->xmin));
1387  Assert(TransactionIdIsNormal(builder->xmax));
1388 
1389  builder->state = SNAPBUILD_CONSISTENT;
1391 
1392  ereport(LOG,
1393  (errmsg("logical decoding found consistent point at %X/%X",
1394  LSN_FORMAT_ARGS(lsn)),
1395  errdetail("There are no running transactions.")));
1396 
1397  return false;
1398  }
1399  /* b) valid on disk state and not building full snapshot */
1400  else if (!builder->building_full_snapshot &&
1401  SnapBuildRestore(builder, lsn))
1402  {
1403  /* there won't be any state to cleanup */
1404  return false;
1405  }
1406 
1407  /*
1408  * c) transition from START to BUILDING_SNAPSHOT.
1409  *
1410  * In START state, and a xl_running_xacts record with running xacts is
1411  * encountered. In that case, switch to BUILDING_SNAPSHOT state, and
1412  * record xl_running_xacts->nextXid. Once all running xacts have finished
1413  * (i.e. they're all >= nextXid), we have a complete catalog snapshot. It
1414  * might look that we could use xl_running_xacts's ->xids information to
1415  * get there quicker, but that is problematic because transactions marked
1416  * as running, might already have inserted their commit record - it's
1417  * infeasible to change that with locking.
1418  */
1419  else if (builder->state == SNAPBUILD_START)
1420  {
 /*
  * NOTE(review): no assignment to builder->state is visible in this branch
  * of the extract (embedded line 1421 is skipped); presumably the switch to
  * BUILDING_SNAPSHOT belongs here. Confirm against upstream.
  */
1422  builder->next_phase_at = running->nextXid;
1423 
1424  /*
1425  * Start with an xmin/xmax that's correct for future, when all the
1426  * currently running transactions have finished. We'll update both
1427  * while waiting for the pending transactions to finish.
1428  */
1429  builder->xmin = running->nextXid; /* < are finished */
1430  builder->xmax = running->nextXid; /* >= are running */
1431 
1432  /* so we can safely use the faster comparisons */
1433  Assert(TransactionIdIsNormal(builder->xmin));
1434  Assert(TransactionIdIsNormal(builder->xmax));
1435 
1436  ereport(LOG,
1437  (errmsg("logical decoding found initial starting point at %X/%X",
1438  LSN_FORMAT_ARGS(lsn)),
1439  errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1440  running->xcnt, running->nextXid)));
1441 
1442  SnapBuildWaitSnapshot(running, running->nextXid);
1443  }
1444 
1445  /*
1446  * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT.
1447  *
1448  * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid
1449  * is >= than nextXid from when we switched to BUILDING_SNAPSHOT. This
1450  * means all transactions starting afterwards have enough information to
1451  * be decoded. Switch to FULL_SNAPSHOT.
1452  */
1453  else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
 /* NOTE(review): the first line of the comparison call is missing from this extract. */
1455  running->oldestRunningXid))
1456  {
1457  builder->state = SNAPBUILD_FULL_SNAPSHOT;
1458  builder->next_phase_at = running->nextXid;
1459 
1460  ereport(LOG,
1461  (errmsg("logical decoding found initial consistent point at %X/%X",
1462  LSN_FORMAT_ARGS(lsn)),
1463  errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1464  running->xcnt, running->nextXid)));
1465 
1466  SnapBuildWaitSnapshot(running, running->nextXid);
1467  }
1468 
1469  /*
1470  * c) transition from FULL_SNAPSHOT to CONSISTENT.
1471  *
1472  * In FULL_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid is
1473  * >= than nextXid from when we switched to FULL_SNAPSHOT. This means all
1474  * transactions that are currently in progress have a catalog snapshot,
1475  * and all their changes have been collected. Switch to CONSISTENT.
1476  */
1477  else if (builder->state == SNAPBUILD_FULL_SNAPSHOT &&
 /* NOTE(review): the first line of the comparison call is missing from this extract. */
1479  running->oldestRunningXid))
1480  {
1481  builder->state = SNAPBUILD_CONSISTENT;
1483 
1484  ereport(LOG,
1485  (errmsg("logical decoding found consistent point at %X/%X",
1486  LSN_FORMAT_ARGS(lsn)),
1487  errdetail("There are no old transactions anymore.")));
1488  }
1489 
1490  /*
1491  * We already started to track running xacts and need to wait for all
1492  * in-progress ones to finish. We fall through to the normal processing of
1493  * records so incremental cleanup can be performed.
1494  */
1495  return true;
1496 }
1497 
1498 /* ---
1499  * Iterate through xids in record, wait for all older than the cutoff to
1500  * finish. Then, if possible, log a new xl_running_xacts record.
1501  *
1502  * This isn't required for the correctness of decoding, but to:
1503  * a) allow isolationtester to notice that we're currently waiting for
1504  * something.
1505  * b) log a new xl_running_xacts record where it'd be helpful, without having
1506  * to wait for bgwriter or checkpointer.
 *
 * The xids are taken from running->xids[0 .. running->xcnt - 1]; the callers
 * visible in this file pass 'running' and 'cutoff', in that order.
 *
 * NOTE(review): the numbering embedded in this extract skips 1510, 1523 and
 * 1540, so the line carrying the function name and parameters, the condition
 * guarding the "waiting for ourselves" error and the body of the final "if"
 * appear to be missing here. Confirm against the upstream file.
1507  * ---
1508  */
1509 static void
1511 {
1512  int off;
1513 
1514  for (off = 0; off < running->xcnt; off++)
1515  {
1516  TransactionId xid = running->xids[off];
1517 
1518  /*
1519  * Upper layers should prevent that we ever need to wait on ourselves.
1520  * Check anyway, since failing to do so would either result in an
1521  * endless wait or an Assert() failure.
1522  */
 /*
  * NOTE(review): the "if" that guards this elog() is missing from this
  * extract; as shown, the error would be raised unconditionally.
  */
1524  elog(ERROR, "waiting for ourselves");
1525 
 /*
  * Skip xids following the cutoff. NOTE(review): presumably
  * TransactionIdFollows() is a strict comparison, in which case an xid
  * equal to the cutoff is still waited for - confirm.
  */
1526  if (TransactionIdFollows(xid, cutoff))
1527  continue;
1528 
1529  XactLockTableWait(xid, NULL, NULL, XLTW_None);
1530  }
1531 
1532  /*
1533  * All transactions we needed to finish finished - try to ensure there is
1534  * another xl_running_xacts record in a timely manner, without having to
1535  * wait for bgwriter or checkpointer to log one. During recovery we can't
1536  * enforce that, so we'll have to wait.
1537  */
1538  if (!RecoveryInProgress())
1539  {
 /* NOTE(review): the statement that logs the new record is missing from this extract. */
1541  }
1542 }
1543 
1544 /* -----------------------------------
1545  * Snapshot serialization support
1546  * -----------------------------------
1547  */
1548 
1549 /*
1550  * We store current state of struct SnapBuild on disk in the following manner:
1551  *
1552  * struct SnapBuildOnDisk;
1553  * TransactionId * committed.xcnt; (*not xcnt_space*)
1554  * TransactionId * catchange.xcnt;
1555  *
 * NOTE(review): the numbering embedded in this extract skips 1562-1563,
 * 1568, 1570, 1573 and 1576, i.e. every member declaration and the closing
 * line of the typedef are missing here. Other code in this file accesses
 * members named magic, checksum, version, length and builder; take their
 * types and exact order from the upstream file.
1556  */
1557 typedef struct SnapBuildOnDisk
1558 {
1559  /* first part of this struct needs to be version independent */
1560 
1561  /* data not covered by checksum */
1564 
1565  /* data covered by checksum */
1566 
1567  /* version, in case we want to support pg_upgrade */
1569  /* how large is the on disk data, excluding the constant sized part */
1571 
1572  /* version dependent part */
1574 
1575  /* variable amount of TransactionIds follows */
1577 
/*
 * Size of the leading part of the struct that precedes the 'builder' member;
 * the restore code reads exactly this many bytes first.
 */
1578 #define SnapBuildOnDiskConstantSize \
1579  offsetof(SnapBuildOnDisk, builder)
/*
 * Size of the leading part that precedes the 'version' member; checksum
 * computation starts at this offset into the struct.
 */
1580 #define SnapBuildOnDiskNotChecksummedSize \
1581  offsetof(SnapBuildOnDisk, version)
1582 
/* values written to, and checked against, the magic and version members */
1583 #define SNAPBUILD_MAGIC 0x51A1E001
1584 #define SNAPBUILD_VERSION 5
1585 
1586 /*
1587  * Store/Load a snapshot from disk, depending on the snapshot builder's state.
1588  *
1589  * Supposed to be used by external (i.e. not snapbuild.c) code that just read
1590  * a record that's a potential location for a serialized snapshot.
 *
 * While the builder has not reached SNAPBUILD_CONSISTENT a restore from the
 * snapshot stored for 'lsn' is attempted (the result of SnapBuildRestore()
 * is ignored); once consistent, the current state is serialized for 'lsn'
 * instead.
 *
 * NOTE(review): the numbering embedded in this extract skips 1593, the line
 * carrying the function name and parameters; the body uses 'builder' and
 * 'lsn'. Confirm the signature against the upstream file.
1591  */
1592 void
1594 {
1595  if (builder->state < SNAPBUILD_CONSISTENT)
1596  SnapBuildRestore(builder, lsn);
1597  else
1598  SnapBuildSerialize(builder, lsn);
1599 }
1600 
1601 /*
1602  * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
1603  * been done by another decoding process.
 *
 * Does nothing unless the builder is in SNAPBUILD_CONSISTENT state. The
 * state is written to a pid-specific temporary file under
 * pg_logical/snapshots, fsynced, and then renamed to its final,
 * LSN-derived name. On success, or when a file for this LSN already exists,
 * builder->last_serialized_snapshot is set to 'lsn'. Failures are reported
 * via ereport(ERROR).
 *
 * NOTE(review): the numbering embedded in this extract skips 1606, 1622,
 * 1633, 1653, 1695, 1715, 1757, 1766, 1771, 1774, 1792, 1795, 1798, 1802,
 * 1814 and 1832, so the line carrying the function name and parameters, parts
 * of several expressions and some statements appear to be missing here.
 * Confirm against the upstream file.
1604  */
1605 static void
1607 {
1608  Size needed_length;
1609  SnapBuildOnDisk *ondisk = NULL;
1610  TransactionId *catchange_xip = NULL;
1611  MemoryContext old_ctx;
1612  size_t catchange_xcnt;
1613  char *ondisk_c;
1614  int fd;
1615  char tmppath[MAXPGPATH];
1616  char path[MAXPGPATH];
1617  int ret;
1618  struct stat stat_buf;
1619  Size sz;
1620 
1621  Assert(lsn != InvalidXLogRecPtr);
 /* NOTE(review): the start of the assertion continued below is missing from this extract. */
1623  builder->last_serialized_snapshot <= lsn);
1624 
1625  /*
1626  * no point in serializing if we cannot continue to work immediately after
1627  * restoring the snapshot
1628  */
1629  if (builder->state < SNAPBUILD_CONSISTENT)
1630  return;
1631 
1632  /* consistent snapshots have no next phase */
1634 
1635  /*
1636  * We identify snapshots by the LSN they are valid for. We don't need to
1637  * include timelines in the name as each LSN maps to exactly one timeline
1638  * unless the user used pg_resetwal or similar. If a user did so, there's
1639  * no hope continuing to decode anyway.
1640  */
1641  sprintf(path, "pg_logical/snapshots/%X-%X.snap",
1642  LSN_FORMAT_ARGS(lsn));
1643 
1644  /*
1645  * first check whether some other backend already has written the snapshot
1646  * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
1647  * as a valid state. Everything else is an unexpected error.
1648  */
1649  ret = stat(path, &stat_buf);
1650 
1651  if (ret != 0 && errno != ENOENT)
1652  ereport(ERROR,
1654  errmsg("could not stat file \"%s\": %m", path)));
1655 
1656  else if (ret == 0)
1657  {
1658  /*
1659  * somebody else has already serialized to this point, don't overwrite
1660  * but remember location, so we don't need to read old data again.
1661  *
1662  * To be sure it has been synced to disk after the rename() from the
1663  * tempfile filename to the real filename, we just repeat the fsync.
1664  * That ought to be cheap because in most scenarios it should already
1665  * be safely on disk.
1666  */
1667  fsync_fname(path, false);
1668  fsync_fname("pg_logical/snapshots", true);
1669 
1670  builder->last_serialized_snapshot = lsn;
 /* ondisk and catchange_xip are still NULL here, so "out" frees nothing */
1671  goto out;
1672  }
1673 
1674  /*
1675  * there is an obvious race condition here between the time we stat(2) the
1676  * file and us writing the file. But we rename the file into place
1677  * atomically and all files created need to contain the same data anyway,
1678  * so this is perfectly fine, although a bit of a resource waste. Locking
1679  * seems like pointless complication.
1680  */
1681  elog(DEBUG1, "serializing snapshot to %s", path);
1682 
1683  /* to make sure only we will write to this tempfile, include pid */
1684  sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%d.tmp",
1685  LSN_FORMAT_ARGS(lsn), MyProcPid);
1686 
1687  /*
1688  * Unlink the temporary file if it already exists. It must be a leftover
1689  * from before a crash/error, since we won't enter this function twice from
1690  * within a single decoding slot/backend and the temporary file contains
1691  * the pid of the current process.
1692  */
1693  if (unlink(tmppath) != 0 && errno != ENOENT)
1694  ereport(ERROR,
1696  errmsg("could not remove file \"%s\": %m", tmppath)));
1697 
 /* allocations below happen in the builder's memory context */
1698  old_ctx = MemoryContextSwitchTo(builder->context);
1699 
1700  /* Get the catalog modifying transactions that are yet not committed */
1701  catchange_xip = ReorderBufferGetCatalogChangesXacts(builder->reorder);
1702  catchange_xcnt = dclist_count(&builder->reorder->catchange_txns);
1703 
 /* fixed-size struct followed by the committed and the catchange xid arrays */
1704  needed_length = sizeof(SnapBuildOnDisk) +
1705  sizeof(TransactionId) * (builder->committed.xcnt + catchange_xcnt);
1706 
1707  ondisk_c = palloc0(needed_length);
1708  ondisk = (SnapBuildOnDisk *) ondisk_c;
1709  ondisk->magic = SNAPBUILD_MAGIC;
1710  ondisk->version = SNAPBUILD_VERSION;
1711  ondisk->length = needed_length;
1712  INIT_CRC32C(ondisk->checksum);
 /* NOTE(review): the length argument of this COMP_CRC32C() call is missing from this extract. */
1713  COMP_CRC32C(ondisk->checksum,
1714  ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
 /* from here on ondisk_c is the write cursor for the trailing xid arrays */
1716  ondisk_c += sizeof(SnapBuildOnDisk);
1717 
1718  memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
1719  /* NULL-ify memory-only data */
1720  ondisk->builder.context = NULL;
1721  ondisk->builder.snapshot = NULL;
1722  ondisk->builder.reorder = NULL;
1723  ondisk->builder.committed.xip = NULL;
1724  ondisk->builder.catchange.xip = NULL;
1725  /* update catchange only on disk data */
1726  ondisk->builder.catchange.xcnt = catchange_xcnt;
1727 
1728  COMP_CRC32C(ondisk->checksum,
1729  &ondisk->builder,
1730  sizeof(SnapBuild));
1731 
1732  /* copy committed xacts */
1733  if (builder->committed.xcnt > 0)
1734  {
1735  sz = sizeof(TransactionId) * builder->committed.xcnt;
1736  memcpy(ondisk_c, builder->committed.xip, sz);
1737  COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1738  ondisk_c += sz;
1739  }
1740 
1741  /* copy catalog modifying xacts */
1742  if (catchange_xcnt > 0)
1743  {
1744  sz = sizeof(TransactionId) * catchange_xcnt;
1745  memcpy(ondisk_c, catchange_xip, sz);
1746  COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1747  ondisk_c += sz;
1748  }
1749 
1750  FIN_CRC32C(ondisk->checksum);
1751 
1752  /* we have valid data now, open tempfile and write it there */
1753  fd = OpenTransientFile(tmppath,
1754  O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
1755  if (fd < 0)
1756  ereport(ERROR,
1758  errmsg("could not open file \"%s\": %m", tmppath)));
1759 
 /* cleared so that a short write which leaves errno untouched can be detected below */
1760  errno = 0;
1761  pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE);
1762  if ((write(fd, ondisk, needed_length)) != needed_length)
1763  {
1764  int save_errno = errno;
1765 
1767 
1768  /* if write didn't set errno, assume problem is no disk space */
1769  errno = save_errno ? save_errno : ENOSPC;
1770  ereport(ERROR,
1772  errmsg("could not write to file \"%s\": %m", tmppath)));
1773  }
1775 
1776  /*
1777  * fsync the file before renaming so that even if we crash after this we
1778  * have either a fully valid file or nothing.
1779  *
1780  * It's safe to just ERROR on fsync() here because we'll retry the whole
1781  * operation including the writes.
1782  *
1783  * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
1784  * some noticeable overhead since it's performed synchronously during
1785  * decoding?
1786  */
1787  pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC);
1788  if (pg_fsync(fd) != 0)
1789  {
1790  int save_errno = errno;
1791 
1793  errno = save_errno;
1794  ereport(ERROR,
1796  errmsg("could not fsync file \"%s\": %m", tmppath)));
1797  }
1799 
1800  if (CloseTransientFile(fd) != 0)
1801  ereport(ERROR,
1803  errmsg("could not close file \"%s\": %m", tmppath)));
1804 
1805  fsync_fname("pg_logical/snapshots", true);
1806 
1807  /*
1808  * We may overwrite the work from some other backend, but that's ok, our
1809  * snapshot is valid as well, we'll just have done some superfluous work.
1810  */
1811  if (rename(tmppath, path) != 0)
1812  {
1813  ereport(ERROR,
1815  errmsg("could not rename file \"%s\" to \"%s\": %m",
1816  tmppath, path)));
1817  }
1818 
1819  /* make sure we persist */
1820  fsync_fname(path, false);
1821  fsync_fname("pg_logical/snapshots", true);
1822 
1823  /*
1824  * Now there's no way we can lose the dumped state anymore, remember this
1825  * as a serialization point.
1826  */
1827  builder->last_serialized_snapshot = lsn;
1828 
1829  MemoryContextSwitchTo(old_ctx);
1830 
1831 out:
 /*
  * NOTE(review): the start of the call that takes
  * builder->last_serialized_snapshot is missing from this extract.
  */
1833  builder->last_serialized_snapshot);
1834  /* be tidy */
1835  if (ondisk)
1836  pfree(ondisk);
1837  if (catchange_xip)
1838  pfree(catchange_xip);
1839 }
1840 
1841 /*
1842  * Restore a snapshot into 'builder' if previously one has been stored at the
1843  * location indicated by 'lsn'. Returns true if successful, false otherwise.
1844  */
1845 static bool
1847 {
1848  SnapBuildOnDisk ondisk;
1849  int fd;
1850  char path[MAXPGPATH];
1851  Size sz;
1852  pg_crc32c checksum;
1853 
1854  /* no point in loading a snapshot if we're already there */
1855  if (builder->state == SNAPBUILD_CONSISTENT)
1856  return false;
1857 
1858  sprintf(path, "pg_logical/snapshots/%X-%X.snap",
1859  LSN_FORMAT_ARGS(lsn));
1860 
1861  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
1862 
1863  if (fd < 0 && errno == ENOENT)
1864  return false;
1865  else if (fd < 0)
1866  ereport(ERROR,
1868  errmsg("could not open file \"%s\": %m", path)));
1869 
1870  /* ----
1871  * Make sure the snapshot had been stored safely to disk, that's normally
1872  * cheap.
1873  * Note that we do not need PANIC here, nobody will be able to use the
1874  * slot without fsyncing, and saving it won't succeed without an fsync()
1875  * either...
1876  * ----
1877  */
1878  fsync_fname(path, false);
1879  fsync_fname("pg_logical/snapshots", true);
1880 
1881 
1882  /* read statically sized portion of snapshot */
1883  SnapBuildRestoreContents(fd, (char *) &ondisk, SnapBuildOnDiskConstantSize, path);
1884 
1885  if (ondisk.magic != SNAPBUILD_MAGIC)
1886  ereport(ERROR,
1888  errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u",
1889  path, ondisk.magic, SNAPBUILD_MAGIC)));
1890 
1891  if (ondisk.version != SNAPBUILD_VERSION)
1892  ereport(ERROR,
1894  errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u",
1895  path, ondisk.version, SNAPBUILD_VERSION)));
1896 
1897  INIT_CRC32C(checksum);
1898  COMP_CRC32C(checksum,
1899  ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize,
1901 
1902  /* read SnapBuild */
1903  SnapBuildRestoreContents(fd, (char *) &ondisk.builder, sizeof(SnapBuild), path);
1904  COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild));
1905 
1906  /* restore committed xacts information */
1907  if (ondisk.builder.committed.xcnt > 0)
1908  {
1909  sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
1910  ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz);
1911  SnapBuildRestoreContents(fd, (char *) ondisk.builder.committed.xip, sz, path);
1912  COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz);
1913  }
1914 
1915  /* restore catalog modifying xacts information */
1916  if (ondisk.builder.catchange.xcnt > 0)
1917  {
1918  sz = sizeof(TransactionId) * ondisk.builder.catchange.xcnt;
1919  ondisk.builder.catchange.xip = MemoryContextAllocZero(builder->context, sz);
1920  SnapBuildRestoreContents(fd, (char *) ondisk.builder.catchange.xip, sz, path);
1921  COMP_CRC32C(checksum, ondisk.builder.catchange.xip, sz);
1922  }
1923 
1924  if (CloseTransientFile(fd) != 0)
1925  ereport(ERROR,
1927  errmsg("could not close file \"%s\": %m", path)));
1928 
1929  FIN_CRC32C(checksum);
1930 
1931  /* verify checksum of what we've read */
1932  if (!EQ_CRC32C(checksum, ondisk.checksum))
1933  ereport(ERROR,
1935  errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u",
1936  path, checksum, ondisk.checksum)));
1937 
1938  /*
1939  * ok, we now have a sensible snapshot here, figure out if it has more
1940  * information than we have.
1941  */
1942 
1943  /*
1944  * We are only interested in consistent snapshots for now, comparing
1945  * whether one incomplete snapshot is more "advanced" seems to be
1946  * unnecessarily complex.
1947  */
1948  if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
1949  goto snapshot_not_interesting;
1950 
1951  /*
1952  * Don't use a snapshot that requires an xmin that we cannot guarantee to
1953  * be available.
1954  */
1956  goto snapshot_not_interesting;
1957 
1958  /*
1959  * Consistent snapshots have no next phase. Reset next_phase_at as it is
1960  * possible that an old value may remain.
1961  */
1964 
1965  /* ok, we think the snapshot is sensible, copy over everything important */
1966  builder->xmin = ondisk.builder.xmin;
1967  builder->xmax = ondisk.builder.xmax;
1968  builder->state = ondisk.builder.state;
1969 
1970  builder->committed.xcnt = ondisk.builder.committed.xcnt;
1971  /* We only allocated/stored xcnt, not xcnt_space xids ! */
1972  /* don't overwrite preallocated xip, if we don't have anything here */
1973  if (builder->committed.xcnt > 0)
1974  {
1975  pfree(builder->committed.xip);
1976  builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
1977  builder->committed.xip = ondisk.builder.committed.xip;
1978  }
1979  ondisk.builder.committed.xip = NULL;
1980 
1981  /* set catalog modifying transactions */
1982  if (builder->catchange.xip)
1983  pfree(builder->catchange.xip);
1984  builder->catchange.xcnt = ondisk.builder.catchange.xcnt;
1985  builder->catchange.xip = ondisk.builder.catchange.xip;
1986  ondisk.builder.catchange.xip = NULL;
1987 
1988  /* our snapshot is not interesting anymore, build a new one */
1989  if (builder->snapshot != NULL)
1990  {
1992  }
1993  builder->snapshot = SnapBuildBuildSnapshot(builder);
1995 
1996  ReorderBufferSetRestartPoint(builder->reorder, lsn);
1997 
1998  Assert(builder->state == SNAPBUILD_CONSISTENT);
1999 
2000  ereport(LOG,
2001  (errmsg("logical decoding found consistent point at %X/%X",
2002  LSN_FORMAT_ARGS(lsn)),
2003  errdetail("Logical decoding will begin using saved snapshot.")));
2004  return true;
2005 
2006 snapshot_not_interesting:
2007  if (ondisk.builder.committed.xip != NULL)
2008  pfree(ondisk.builder.committed.xip);
2009  if (ondisk.builder.catchange.xip != NULL)
2010  pfree(ondisk.builder.catchange.xip);
2011  return false;
2012 }
2013 
2014 /*
2015  * Read the contents of the serialized snapshot to 'dest'.
2016  */
2017 static void
2018 SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path)
2019 {
2020  int readBytes;
2021 
2022  pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
2023  readBytes = read(fd, dest, size);
2025  if (readBytes != size)
2026  {
2027  int save_errno = errno;
2028 
2030 
2031  if (readBytes < 0)
2032  {
2033  errno = save_errno;
2034  ereport(ERROR,
2036  errmsg("could not read file \"%s\": %m", path)));
2037  }
2038  else
2039  ereport(ERROR,
2041  errmsg("could not read file \"%s\": read %d of %zu",
2042  path, readBytes, size)));
2043  }
2044 }
2045 
2046 /*
2047  * Remove all serialized snapshots that are not required anymore because no
2048  * slot can need them. This doesn't actually have to run during a checkpoint,
2049  * but it's a convenient point to schedule this.
2050  *
2051  * NB: We run this during checkpoints even if logical decoding is disabled so
2052  * we cleanup old slots at some point after it got disabled.
2053  */
2054 void
2056 {
2057  XLogRecPtr cutoff;
2058  XLogRecPtr redo;
2059  DIR *snap_dir;
2060  struct dirent *snap_de;
2061  char path[MAXPGPATH + 21];
2062 
2063  /*
2064  * We start off with a minimum of the last redo pointer. No new
2065  * replication slot will start before that, so that's a safe upper bound
2066  * for removal.
2067  */
2068  redo = GetRedoRecPtr();
2069 
2070  /* now check for the restart ptrs from existing slots */
2072 
2073  /* don't start earlier than the restart lsn */
2074  if (redo < cutoff)
2075  cutoff = redo;
2076 
2077  snap_dir = AllocateDir("pg_logical/snapshots");
2078  while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
2079  {
2080  uint32 hi;
2081  uint32 lo;
2082  XLogRecPtr lsn;
2083  PGFileType de_type;
2084 
2085  if (strcmp(snap_de->d_name, ".") == 0 ||
2086  strcmp(snap_de->d_name, "..") == 0)
2087  continue;
2088 
2089  snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name);
2090  de_type = get_dirent_type(path, snap_de, false, DEBUG1);
2091 
2092  if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_REG)
2093  {
2094  elog(DEBUG1, "only regular files expected: %s", path);
2095  continue;
2096  }
2097 
2098  /*
2099  * temporary filenames from SnapBuildSerialize() include the LSN and
2100  * everything but are postfixed by .$pid.tmp. We can just remove them
2101  * the same as other files because there can be none that are
2102  * currently being written that are older than cutoff.
2103  *
2104  * We just log a message if a file doesn't fit the pattern, it's
2105  * probably some editors lock/state file or similar...
2106  */
2107  if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
2108  {
2109  ereport(LOG,
2110  (errmsg("could not parse file name \"%s\"", path)));
2111  continue;
2112  }
2113 
2114  lsn = ((uint64) hi) << 32 | lo;
2115 
2116  /* check whether we still need it */
2117  if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
2118  {
2119  elog(DEBUG1, "removing snapbuild snapshot %s", path);
2120 
2121  /*
2122  * It's not particularly harmful, though strange, if we can't
2123  * remove the file here. Don't prevent the checkpoint from
2124  * completing, that'd be a cure worse than the disease.
2125  */
2126  if (unlink(path) < 0)
2127  {
2128  ereport(LOG,
2130  errmsg("could not remove file \"%s\": %m",
2131  path)));
2132  continue;
2133  }
2134  }
2135  }
2136  FreeDir(snap_dir);
2137 }
#define InvalidCommandId
Definition: c.h:658
unsigned int uint32
Definition: c.h:495
#define Max(x, y)
Definition: c.h:987
#define PG_BINARY
Definition: c.h:1283
#define FirstCommandId
Definition: c.h:657
uint32 CommandId
Definition: c.h:655
uint32 TransactionId
Definition: c.h:641
size_t Size
Definition: c.h:594
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1179
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1229
int errcode_for_file_access(void)
Definition: elog.c:881
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define LOG
Definition: elog.h:31
#define DEBUG3
Definition: elog.h:28
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2879
int FreeDir(DIR *dir)
Definition: fd.c:2931
int CloseTransientFile(int fd)
Definition: fd.c:2779
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:733
int pg_fsync(int fd)
Definition: fd.c:386
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2603
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2813
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:525
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
int MyProcPid
Definition: globals.c:44
#define dlist_foreach(iter, lhead)
Definition: ilist.h:623
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
Assert(fmt[strlen(fmt) - 1] !='\n')
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:668
@ XLTW_None
Definition: lmgr.h:26
void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn)
Definition: logical.c:1743
void LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin)
Definition: logical.c:1675
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1195
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1808
@ LW_SHARED
Definition: lwlock.h:117
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc0(Size size)
Definition: mcxt.c:1257
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1064
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1476
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1021
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:403
void * palloc(Size size)
Definition: mcxt.c:1226
#define AllocSetContextCreate
Definition: memutils.h:126
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:150
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
uint32 pg_crc32c
Definition: pg_crc32c.h:38
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:98
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:103
#define ERRCODE_T_R_SERIALIZATION_FAILURE
Definition: pgbench.c:76
#define sprintf
Definition: port.h:240
#define snprintf
Definition: port.h:238
#define qsort(a, b, c, d)
Definition: port.h:445
static void test(void)
static int fd(const char *x, int i)
Definition: preproc-init.c:105
TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly)
Definition: procarray.c:2909
int GetMaxSnapshotXidCount(void)
Definition: procarray.c:2050
TransactionId * ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
void ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
void ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, CommandId cid)
void ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, RelFileLocator locator, ItemPointerData tid, CommandId cmin, CommandId cmax, CommandId combocid)
void ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
TransactionId ReorderBufferGetOldestXmin(ReorderBuffer *rb)
ReorderBufferTXN * ReorderBufferGetOldestTXN(ReorderBuffer *rb)
bool ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
void ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
void ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
#define rbtxn_prepared(txn)
#define rbtxn_skip_prepared(txn)
ResourceOwner CurrentResourceOwner
Definition: resowner.c:164
XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void)
Definition: slot.c:972
static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
Definition: snapbuild.c:1606
void SnapBuildSnapDecRefcount(Snapshot snap)
Definition: snapbuild.c:458
#define SNAPBUILD_VERSION
Definition: snapbuild.c:1584
bool SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
Definition: snapbuild.c:434
void SnapBuildResetExportedSnapshotState(void)
Definition: snapbuild.c:757
void SnapBuildSetTwoPhaseAt(SnapBuild *builder, XLogRecPtr ptr)
Definition: snapbuild.c:425
static void SnapBuildSnapIncRefcount(Snapshot snap)
Definition: snapbuild.c:446
bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
Definition: snapbuild.c:769
const char * SnapBuildExportSnapshot(SnapBuild *builder)
Definition: snapbuild.c:669
XLogRecPtr SnapBuildGetTwoPhaseAt(SnapBuild *builder)
Definition: snapbuild.c:416
SnapBuildState SnapBuildCurrentState(SnapBuild *builder)
Definition: snapbuild.c:407
#define SnapBuildOnDiskNotChecksummedSize
Definition: snapbuild.c:1580
void FreeSnapshotBuilder(SnapBuild *builder)
Definition: snapbuild.c:363
void CheckPointSnapBuild(void)
Definition: snapbuild.c:2055
static void SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
Definition: snapbuild.c:914
SnapBuild * AllocateSnapshotBuilder(ReorderBuffer *reorder, TransactionId xmin_horizon, XLogRecPtr start_lsn, bool need_full_snapshot, XLogRecPtr two_phase_at)
Definition: snapbuild.c:317
#define SNAPBUILD_MAGIC
Definition: snapbuild.c:1583
static bool SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid, uint32 xinfo)
Definition: snapbuild.c:1188
Snapshot SnapBuildGetOrBuildSnapshot(SnapBuild *builder)
Definition: snapbuild.c:709
Snapshot SnapBuildInitialSnapshot(SnapBuild *builder)
Definition: snapbuild.c:570
static ResourceOwner SavedResourceOwnerDuringExport
Definition: snapbuild.c:283
void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
Definition: snapbuild.c:1593
void SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, int nsubxacts, TransactionId *subxacts, uint32 xinfo)
Definition: snapbuild.c:1025
static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff)
Definition: snapbuild.c:1510
static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder)
Definition: snapbuild.c:490
void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn, xl_heap_new_cid *xlrec)
Definition: snapbuild.c:819
void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
Definition: snapbuild.c:1218
static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
Definition: snapbuild.c:861
void SnapBuildClearExportedSnapshot(void)
Definition: snapbuild.c:730
static void SnapBuildFreeSnapshot(Snapshot snap)
Definition: snapbuild.c:382
static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
Definition: snapbuild.c:1320
static bool ExportInProgress
Definition: snapbuild.c:284
struct SnapBuildOnDisk SnapBuildOnDisk
static void SnapBuildPurgeOlderTxn(SnapBuild *builder)
Definition: snapbuild.c:948
#define SnapBuildOnDiskConstantSize
Definition: snapbuild.c:1578
static void SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path)
Definition: snapbuild.c:2018
static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
Definition: snapbuild.c:1846
SnapBuildState
Definition: snapbuild.h:19
@ SNAPBUILD_START
Definition: snapbuild.h:23
@ SNAPBUILD_BUILDING_SNAPSHOT
Definition: snapbuild.h:29
@ SNAPBUILD_FULL_SNAPSHOT
Definition: snapbuild.h:39
@ SNAPBUILD_CONSISTENT
Definition: snapbuild.h:46
char * ExportSnapshot(Snapshot snapshot)
Definition: snapmgr.c:1102
bool HistoricSnapshotActive(void)
Definition: snapmgr.c:1678
bool HaveRegisteredOrActiveSnapshot(void)
Definition: snapmgr.c:1630
void InvalidateCatalogSnapshot(void)
Definition: snapmgr.c:429
struct SnapshotData SnapshotData
@ SNAPSHOT_MVCC
Definition: snapshot.h:50
@ SNAPSHOT_HISTORIC_MVCC
Definition: snapshot.h:109
PGPROC * MyProc
Definition: proc.c:66
XLogRecPtr LogStandbySnapshot(void)
Definition: standby.c:1287
Definition: dirent.c:26
TransactionId xmin
Definition: proc.h:178
XLogRecPtr restart_decoding_lsn
TransactionId xid
dclist_head catchange_txns
dlist_head toplevel_by_lsn
XLogRecPtr current_restart_decoding_lsn
SnapBuild builder
Definition: snapbuild.c:1573
pg_crc32c checksum
Definition: snapbuild.c:1563
XLogRecPtr start_decoding_at
Definition: snapbuild.c:171
SnapBuildState state
Definition: snapbuild.c:156
TransactionId xmin
Definition: snapbuild.c:162
TransactionId initial_xmin_horizon
Definition: snapbuild.c:188
struct SnapBuild::@16 committed
TransactionId xmax
Definition: snapbuild.c:165
TransactionId * xip
Definition: snapbuild.c:248
Snapshot snapshot
Definition: snapbuild.c:196
XLogRecPtr two_phase_at
Definition: snapbuild.c:182
bool building_full_snapshot
Definition: snapbuild.c:191
TransactionId next_phase_at
Definition: snapbuild.c:213
struct SnapBuild::@17 catchange
size_t xcnt
Definition: snapbuild.c:222
XLogRecPtr last_serialized_snapshot
Definition: snapbuild.c:201
size_t xcnt_space
Definition: snapbuild.c:225
bool includes_all_transactions
Definition: snapbuild.c:232
MemoryContext context
Definition: snapbuild.c:159
ReorderBuffer * reorder
Definition: snapbuild.c:206
TransactionId xmin
Definition: snapshot.h:157
int32 subxcnt
Definition: snapshot.h:181
bool copied
Definition: snapshot.h:185
uint32 regd_count
Definition: snapshot.h:205
uint32 active_count
Definition: snapshot.h:204
CommandId curcid
Definition: snapshot.h:187
uint32 xcnt
Definition: snapshot.h:169
TransactionId * subxip
Definition: snapshot.h:180
uint64 snapXactCompletionCount
Definition: snapshot.h:216
TransactionId xmax
Definition: snapshot.h:158
SnapshotType snapshot_type
Definition: snapshot.h:144
TransactionId * xip
Definition: snapshot.h:168
bool suboverflowed
Definition: snapshot.h:182
bool takenDuringRecovery
Definition: snapshot.h:184
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
dlist_node * cur
Definition: ilist.h:179
CommandId cmin
Definition: heapam_xlog.h:380
CommandId combocid
Definition: heapam_xlog.h:382
ItemPointerData target_tid
Definition: heapam_xlog.h:388
TransactionId top_xid
Definition: heapam_xlog.h:379
CommandId cmax
Definition: heapam_xlog.h:381
RelFileLocator target_locator
Definition: heapam_xlog.h:387
TransactionId oldestRunningXid
Definition: standbydefs.h:53
TransactionId xids[FLEXIBLE_ARRAY_MEMBER]
Definition: standbydefs.h:56
TransactionId nextXid
Definition: standbydefs.h:52
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:299
bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition: transam.c:314
bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:329
#define InvalidTransactionId
Definition: transam.h:31
#define NormalTransactionIdPrecedes(id1, id2)
Definition: transam.h:147
#define NormalTransactionIdFollows(id1, id2)
Definition: transam.h:152
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TransactionIdAdvance(dest)
Definition: transam.h:91
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:88
static void pgstat_report_wait_end(void)
Definition: wait_event.h:104
#define stat
Definition: win32_port.h:284
bool IsTransactionOrTransactionBlock(void)
Definition: xact.c:4834
bool XactReadOnly
Definition: xact.c:82
bool IsTransactionState(void)
Definition: xact.c:378
void StartTransactionCommand(void)
Definition: xact.c:2937
int XactIsoLevel
Definition: xact.c:79
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition: xact.c:926
void AbortCurrentTransaction(void)
Definition: xact.c:3305
#define XACT_REPEATABLE_READ
Definition: xact.h:38
#define XACT_XINFO_HAS_INVALS
Definition: xact.h:191
int xidComparator(const void *arg1, const void *arg2)
Definition: xid.c:138
bool RecoveryInProgress(void)
Definition: xlog.c:6039
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6142
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28