PostgreSQL Source Code  git master
snapbuild.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * snapbuild.c
4  *
5  * Infrastructure for building historic catalog snapshots based on contents
6  * of the WAL, for the purpose of decoding heapam.c style values in the
7  * WAL.
8  *
9  * NOTES:
10  *
11  * We build snapshots which can *only* be used to read catalog contents and we
12  * do so by reading and interpreting the WAL stream. The aim is to build a
13  * snapshot that behaves the same as a freshly taken MVCC snapshot would have
14  * at the time the XLogRecord was generated.
15  *
16  * To build the snapshots we reuse the infrastructure built for Hot
17  * Standby. The in-memory snapshots we build look different than HS' because
18  * we have different needs. To successfully decode data from the WAL we only
19  * need to access catalog tables and (sys|rel|cat)cache, not the actual user
20  * tables since the data we decode is wholly contained in the WAL
21  * records. Also, our snapshots need to be different in comparison to normal
22  * MVCC ones because in contrast to those we cannot fully rely on the clog and
23  * pg_subtrans for information about committed transactions because they might
24  * commit in the future from the POV of the WAL entry we're currently
25  * decoding. This definition has the advantage that we only need to prevent
26  * removal of catalog rows, while normal table's rows can still be
27  * removed. This is achieved by using the replication slot mechanism.
28  *
29  * As the percentage of transactions modifying the catalog normally is fairly
30  * small in comparison to ones only manipulating user data, we keep track of
31  * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
32  * track of all running transactions like it's done in a normal snapshot. Note
33  * that we're generally only looking at transactions that have acquired an
34  * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
35  * that we consider committed, everything else is considered aborted/in
36  * progress. That also allows us not to care about subtransactions before they
37  * have committed which means this module, in contrast to HS, doesn't have to
38  * care about suboverflowed subtransactions and similar.
39  *
40  * One complexity of doing this is that to e.g. handle mixed DDL/DML
41  * transactions we need Snapshots that see intermediate versions of the
42  * catalog in a transaction. During normal operation this is achieved by using
43  * CommandIds/cmin/cmax. The problem with that however is that for space
44  * efficiency reasons only one value of that is stored
45  * (cf. combocid.c). Since combo CIDs are only available in memory we log
46  * additional information which allows us to get the original (cmin, cmax)
47  * pair during visibility checks. Check the reorderbuffer.c's comment above
48  * ResolveCminCmaxDuringDecoding() for details.
49  *
50  * To facilitate all this we need our own visibility routine, as the normal
51  * ones are optimized for different usecases.
52  *
53  * To replace the normal catalog snapshots with decoding ones use the
54  * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
55  *
56  *
57  *
58  * The snapbuild machinery is starting up in several stages, as illustrated
59  * by the following graph describing the SnapBuild->state transitions:
60  *
61  * +-------------------------+
62  * +----| START |-------------+
63  * | +-------------------------+ |
64  * | | |
65  * | | |
66  * | running_xacts #1 |
67  * | | |
68  * | | |
69  * | v |
70  * | +-------------------------+ v
71  * | | BUILDING_SNAPSHOT |------------>|
72  * | +-------------------------+ |
73  * | | |
74  * | | |
75  * | running_xacts #2, xacts from #1 finished |
76  * | | |
77  * | | |
78  * | v |
79  * | +-------------------------+ v
80  * | | FULL_SNAPSHOT |------------>|
81  * | +-------------------------+ |
82  * | | |
83  * running_xacts | saved snapshot
84  * with zero xacts | at running_xacts's lsn
85  * | | |
86  * | running_xacts with xacts from #2 finished |
87  * | | |
88  * | v |
89  * | +-------------------------+ |
90  * +--->|SNAPBUILD_CONSISTENT |<------------+
91  * +-------------------------+
92  *
93  * Initially the machinery is in the START stage. When an xl_running_xacts
94  * record is read that is sufficiently new (above the safe xmin horizon),
95  * there's a state transition. If there were no running xacts when the
96  * xl_running_xacts record was generated, we'll directly go into CONSISTENT
97  * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full
98  * snapshot means that all transactions that start henceforth can be decoded
99  * in their entirety, but transactions that started previously can't. In
100  * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
101  * running transactions have committed or aborted.
102  *
103  * Only transactions that commit after CONSISTENT state has been reached will
104  * be replayed, even though they might have started while still in
105  * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
106  * changes have been exported, but all the following ones will be. That point
107  * is a convenient point to initialize replication from, which is why we
108  * export a snapshot at that point, which *can* be used to read normal data.
109  *
110  * Copyright (c) 2012-2023, PostgreSQL Global Development Group
111  *
112  * IDENTIFICATION
113  * src/backend/replication/logical/snapbuild.c
114  *
115  *-------------------------------------------------------------------------
116  */
117 
118 #include "postgres.h"
119 
120 #include <sys/stat.h>
121 #include <unistd.h>
122 
123 #include "access/heapam_xlog.h"
124 #include "access/transam.h"
125 #include "access/xact.h"
126 #include "common/file_utils.h"
127 #include "miscadmin.h"
128 #include "pgstat.h"
129 #include "replication/logical.h"
131 #include "replication/snapbuild.h"
132 #include "storage/block.h" /* debugging output */
133 #include "storage/fd.h"
134 #include "storage/lmgr.h"
135 #include "storage/proc.h"
136 #include "storage/procarray.h"
137 #include "storage/standby.h"
138 #include "utils/builtins.h"
139 #include "utils/memutils.h"
140 #include "utils/snapmgr.h"
141 #include "utils/snapshot.h"
142 
143 /*
144  * This struct contains the current state of the snapshot building
145  * machinery. Besides a forward declaration in the header, it is not exposed
146  * to the public, so we can easily change its contents.
     *
     * NOTE(review): this listing has gaps in its line numbering inside the
     * struct; the member declarations that belong to most of the comments
     * below are not visible here. Member names given as "presumably ->name"
     * are taken from how the functions in this file access the builder and
     * should be confirmed against the complete source.
147  */
148 struct SnapBuild
149 {
150  /* how far are we along building our first full snapshot (presumably ->state) */
152 
153  /* private memory context used to allocate memory for this module (presumably ->context) */
155 
156  /* all transactions < this have committed/aborted (presumably ->xmin) */
158 
159  /* all transactions >= this are uncommitted (presumably ->xmax) */
161 
162  /*
163  * Don't replay commits from an LSN < this LSN. This can be set externally
164  * but it will also be advanced (never retreat) from within snapbuild.c.
     * (presumably ->start_decoding_at)
165  */
167 
168  /*
169  * LSN at which two-phase decoding was enabled or LSN at which we found a
170  * consistent point at the time of slot creation.
171  *
172  * The prepared transactions, that were skipped because previously
173  * two-phase was not enabled or are not covered by initial snapshot, need
174  * to be sent later along with commit prepared and they must be before
175  * this point. (presumably ->two_phase_at)
176  */
178 
179  /*
180  * Don't start decoding WAL until the "xl_running_xacts" information
181  * indicates there are no running xids with an xid smaller than this.
     * (presumably ->initial_xmin_horizon)
182  */
184 
185  /* Indicates if we are building full snapshot or just catalog one (presumably ->building_full_snapshot). */
187 
188  /*
189  * Snapshot that's valid to see the catalog state seen at this moment.
     * (presumably ->snapshot)
190  */
192 
193  /*
194  * LSN of the last location we are sure a snapshot has been serialized to.
195  */
197 
198  /*
199  * The reorderbuffer we need to update with usable snapshots et al.
     * (presumably ->reorder)
200  */
202 
203  /*
204  * TransactionId at which the next phase of initial snapshot building will
205  * happen. InvalidTransactionId if not known (i.e. SNAPBUILD_START), or
206  * when no next phase necessary (SNAPBUILD_CONSISTENT).
     * (presumably ->next_phase_at)
207  */
209 
210  /*
211  * Array of transactions which could have catalog changes that committed
212  * between xmin and xmax.
213  */
214  struct
215  {
216  /* number of committed transactions */
217  size_t xcnt;
218 
219  /* available space for committed transactions */
220  size_t xcnt_space;
221 
222  /*
223  * Until we reach a CONSISTENT state, we record commits of all
224  * transactions, not just the catalog changing ones. Record when that
225  * changes so we know we cannot export a snapshot safely anymore.
     * (presumably ->committed.includes_all_transactions)
226  */
228 
229  /*
230  * Array of committed transactions that have modified the catalog.
231  *
232  * As this array is frequently modified we do *not* keep it in
233  * xidComparator order. Instead we sort the array when building &
234  * distributing a snapshot.
235  *
236  * TODO: It's unclear whether that reasoning has much merit. Every
237  * time we add something here after becoming consistent will also
238  * require distributing a snapshot. Storing them sorted would
239  * potentially also make it easier to purge (but more complicated wrt
240  * wraparound?). Should be improved if sorting while building the
241  * snapshot shows up in profiles.
     * (presumably ->committed.xip)
242  */
245 
246  /*
247  * Array of transactions and subtransactions that had modified catalogs
248  * and were running when the snapshot was serialized.
249  *
250  * We normally rely on some WAL record types such as HEAP2_NEW_CID to know
251  * if the transaction has changed the catalog. But it could happen that
252  * the logical decoding decodes only the commit record of the transaction
253  * after restoring the previously serialized snapshot in which case we
254  * will miss adding the xid to the snapshot and end up looking at the
255  * catalogs with the wrong snapshot.
256  *
257  * Now to avoid the above problem, we serialize the transactions that had
258  * modified the catalogs and are still running at the time of snapshot
259  * serialization. We fill this array while restoring the snapshot and then
260  * refer to it while decoding commit to check whether the xact has modified
261  * the catalog. We discard entries from this array once their xids precede
262  * ->xmin and thus no longer matter. See SnapBuildPurgeOlderTxn for details.
263  */
264  struct
265  {
266  /* number of transactions */
267  size_t xcnt;
268 
269  /* This array must be sorted in xidComparator order (presumably ->catchange.xip) */
272 };
273 
274 /*
275  * Starting a transaction -- which we need to do while exporting a snapshot --
276  * removes knowledge about the previously used resowner, so we save it here.
     *
     * NOTE(review): listing line 278 is missing here. Code further down in this
     * file reads SavedResourceOwnerDuringExport, so that variable is presumably
     * declared on the missing line.
277  */
    /*
     * ExportInProgress is set to true by the snapshot export path and back to
     * false by the export-state reset function; the "clear exported snapshot"
     * function returns immediately while it is false.
     */
279 static bool ExportInProgress = false;
280 
281 /* ->committed and ->catchange manipulation */
282 static void SnapBuildPurgeOlderTxn(SnapBuild *builder);
283 
284 /* snapshot building/manipulation/distribution functions */
285 static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder);
286 
287 static void SnapBuildFreeSnapshot(Snapshot snap);
288 
289 static void SnapBuildSnapIncRefcount(Snapshot snap);
290 
    /* NOTE(review): listing line 291 (another forward declaration, presumably) is missing. */
292 
293 static inline bool SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid,
294  uint32 xinfo);
295 
296 /* xlog reading helper functions for SnapBuildProcessRunningXacts */
297 static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
298 static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff);
299 
300 /* serialization functions */
301 static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
302 static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
303 static void SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path);
304 
305 /*
306  * Allocate a new snapshot builder.
307  *
308  * xmin_horizon is the xid >= which we can be sure no catalog rows have been
309  * removed, start_lsn is the LSN >= we want to replay commits.
     *
     * need_full_snapshot is stored as ->building_full_snapshot and two_phase_at
     * as ->two_phase_at. The builder starts out in SNAPBUILD_START with an
     * empty ->committed array (room for 128 xids) and no ->catchange array.
     *
     * The builder and its ->committed.xip array are allocated while the
     * builder's own memory context is current; the caller's context is
     * restored before returning.
     *
     * NOTE(review): listing lines 312, 323 and 325 are missing. They presumably
     * hold the function name with the 'reorder' parameter (the body assigns it
     * to ->reorder) and the rest of the memory-context creation call.
310  */
311 SnapBuild *
313  TransactionId xmin_horizon,
314  XLogRecPtr start_lsn,
315  bool need_full_snapshot,
316  XLogRecPtr two_phase_at)
317 {
318  MemoryContext context;
319  MemoryContext oldcontext;
320  SnapBuild *builder;
321 
322  /* allocate memory in own context, to have better accountability */
324  "snapshot builder context",
326  oldcontext = MemoryContextSwitchTo(context);
327 
328  builder = palloc0(sizeof(SnapBuild));
329 
330  builder->state = SNAPBUILD_START;
331  builder->context = context;
332  builder->reorder = reorder;
333  /* Other struct members initialized by zeroing via palloc0 above */
334 
335  builder->committed.xcnt = 0;
336  builder->committed.xcnt_space = 128; /* arbitrary number */
337  builder->committed.xip =
338  palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
    /* until told otherwise, every commit is recorded, not only catalog-modifying ones */
339  builder->committed.includes_all_transactions = true;
340 
341  builder->catchange.xcnt = 0;
342  builder->catchange.xip = NULL;
343 
344  builder->initial_xmin_horizon = xmin_horizon;
345  builder->start_decoding_at = start_lsn;
346  builder->building_full_snapshot = need_full_snapshot;
347  builder->two_phase_at = two_phase_at;
348 
349  MemoryContextSwitchTo(oldcontext);
350 
351  return builder;
352 }
353 
354 /*
355  * Free a snapshot builder.
     *
     * Releases the builder's current snapshot (if any) and then deletes the
     * builder's memory context, which takes everything else with it.
     *
     * NOTE(review): listing lines 358 (function name/parameters) and 365 (the
     * statement that actually releases ->snapshot) are missing here.
356  */
357 void
359 {
    /*
     * Copy the context pointer first: the allocator in this file palloc0's the
     * builder while this context is current, so 'builder' must not be touched
     * once the context has been deleted.
     */
360  MemoryContext context = builder->context;
361 
362  /* free snapshot explicitly, that contains some error checking */
363  if (builder->snapshot != NULL)
364  {
366  builder->snapshot = NULL;
367  }
368 
369  /* other resources are deallocated via memory context reset */
370  MemoryContextDelete(context);
371 }
372 
373 /*
374  * Free an unreferenced snapshot that has previously been built by us.
     *
     * Raises an ERROR (in all builds) if the snapshot is marked as copied or
     * still has a non-zero active_count; otherwise the snapshot is pfree'd.
     * The remaining sanity checks are assertions only.
     *
     * NOTE(review): listing lines 377 (function name/parameter) and 380 (the
     * check belonging to the "external snapshot" comment) are missing here.
375  */
376 static void
378 {
379  /* make sure we don't get passed an external snapshot */
381 
382  /* make sure nobody modified our snapshot */
383  Assert(snap->curcid == FirstCommandId);
384  Assert(!snap->suboverflowed);
385  Assert(!snap->takenDuringRecovery);
386  Assert(snap->regd_count == 0);
387 
388  /* slightly more likely, so it's checked even without c-asserts */
389  if (snap->copied)
390  elog(ERROR, "cannot free a copied snapshot");
391 
    /* a snapshot that is still referenced must not be freed */
392  if (snap->active_count)
393  elog(ERROR, "cannot free an active snapshot");
394 
395  pfree(snap);
396 }
397 
398 /*
399  * In which state of snapshot building are we?
     *
     * Simple accessor: returns builder->state unchanged.
     *
     * NOTE(review): listing lines 401-402 (return type, function name and
     * parameter) are missing here.
400  */
403 {
404  return builder->state;
405 }
406 
407 /*
408  * Return the LSN at which the two-phase decoding was first enabled.
     *
     * Simple accessor: returns builder->two_phase_at unchanged.
     *
     * NOTE(review): listing lines 410-411 (return type, function name and
     * parameter) are missing here.
409  */
412 {
413  return builder->two_phase_at;
414 }
415 
416 /*
417  * Set the LSN at which two-phase decoding is enabled.
     *
     * Overwrites builder->two_phase_at with 'ptr'; no validation is done here.
     *
     * NOTE(review): listing line 420 (function name and parameters) is missing
     * here.
418  */
419 void
421 {
422  builder->two_phase_at = ptr;
423 }
424 
425 /*
426  * Should the contents of the transaction ending at 'ptr' be skipped, i.e.
     * *not* be decoded?
     *
     * Returns true when 'ptr' lies before builder->start_decoding_at (commits
     * from an LSN below that point are not replayed), false otherwise. Note the
     * polarity: true means "skip", not "decode".
     *
     * NOTE(review): listing line 429 (function name and parameters) is missing
     * here.
427  */
428 bool
430 {
431  return ptr < builder->start_decoding_at;
432 }
433 
434 /*
435  * Increase refcount of a snapshot.
436  *
437  * This is used when handing out a snapshot to some external resource or when
438  * adding a Snapshot as builder->snapshot.
     *
     * The reference count lives in snap->active_count.
     *
     * NOTE(review): listing line 441 (function name and parameter) is missing
     * here.
439  */
440 static void
442 {
443  snap->active_count++;
444 }
445 
446 /*
447  * Decrease refcount of a snapshot and free if the refcount reaches zero.
448  *
449  * Externally visible, so that external resources that have been handed an
450  * IncRef'ed Snapshot can adjust its refcount easily.
     *
     * Raises an ERROR if the snapshot is marked as copied. The reference count
     * is snap->active_count; when it drops to zero the snapshot is released via
     * SnapBuildFreeSnapshot().
     *
     * NOTE(review): listing lines 453 (function name/parameter) and 456 (the
     * check belonging to the "external snapshot" comment) are missing here.
451  */
452 void
454 {
455  /* make sure we don't get passed an external snapshot */
457 
458  /* make sure nobody modified our snapshot */
459  Assert(snap->curcid == FirstCommandId);
460  Assert(!snap->suboverflowed);
461  Assert(!snap->takenDuringRecovery);
462 
463  Assert(snap->regd_count == 0);
464 
    /* there must be a reference left to drop */
465  Assert(snap->active_count > 0);
466 
467  /* slightly more likely, so it's checked even without casserts */
468  if (snap->copied)
469  elog(ERROR, "cannot free a copied snapshot");
470 
471  snap->active_count--;
472  if (snap->active_count == 0)
473  SnapBuildFreeSnapshot(snap);
474 }
475 
476 /*
477  * Build a new snapshot, based on currently committed catalog-modifying
478  * transactions.
479  *
480  * In-progress transactions with catalog access are *not* allowed to modify
481  * these snapshots; they have to copy them and fill in appropriate ->curcid
482  * and ->subxip/subxcnt values.
     *
     * The snapshot is allocated zeroed in builder->context and is returned with
     * active_count == 0, i.e. the caller has to take a reference itself.
     * Requires builder->state >= SNAPBUILD_FULL_SNAPSHOT and normal
     * builder->xmin/xmax (assertions only).
     *
     * NOTE(review): listing lines 485 (function name/parameter) and 498 (a
     * statement right after the allocation) are missing here.
483  */
484 static Snapshot
486 {
487  Snapshot snapshot;
488  Size ssize;
489 
490  Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
491 
    /*
     * One allocation holds the SnapshotData header, the xip array with one
     * slot per committed xid, and room for one more TransactionId.
     */
492  ssize = sizeof(SnapshotData)
493  + sizeof(TransactionId) * builder->committed.xcnt
494  + sizeof(TransactionId) * 1 /* toplevel xid */ ;
495 
496  snapshot = MemoryContextAllocZero(builder->context, ssize);
497 
499 
500  /*
501  * We misuse the original meaning of SnapshotData's xip and subxip fields
502  * to make them more fitting for our needs.
503  *
504  * In the 'xip' array we store transactions that have to be treated as
505  * committed. Since we will only ever look at tuples from transactions
506  * that have modified the catalog it's more efficient to store those few
507  * that exist between xmin and xmax (frequently there are none).
508  *
509  * Snapshots that are used in transactions that have modified the catalog
510  * also use the 'subxip' array to store their toplevel xid and all the
511  * subtransaction xids so we can recognize when we need to treat rows as
512  * visible that are not in xip but still need to be visible. Subxip only
513  * gets filled when the transaction is copied into the context of a
514  * catalog modifying transaction since we otherwise share a snapshot
515  * between transactions. As long as a txn hasn't modified the catalog it
516  * doesn't need to treat any uncommitted rows as visible, so there is no
517  * need for those xids.
518  *
519  * Both arrays are qsort'ed so that we can use bsearch() on them.
520  */
521  Assert(TransactionIdIsNormal(builder->xmin));
522  Assert(TransactionIdIsNormal(builder->xmax));
523 
524  snapshot->xmin = builder->xmin;
525  snapshot->xmax = builder->xmax;
526 
527  /* store all transactions to be treated as committed by this snapshot */
    /* xip points just past the SnapshotData header, inside the same allocation */
528  snapshot->xip =
529  (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
530  snapshot->xcnt = builder->committed.xcnt;
531  memcpy(snapshot->xip,
532  builder->committed.xip,
533  builder->committed.xcnt * sizeof(TransactionId));
534 
535  /* sort so we can bsearch() */
    /* only the copy is sorted; builder->committed.xip itself stays unsorted */
536  qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
537 
538  /*
539  * Initially, subxip is empty, i.e. it's a snapshot to be used by
540  * transactions that don't modify the catalog. Will be filled by
541  * ReorderBufferCopySnap() if necessary.
542  */
543  snapshot->subxcnt = 0;
544  snapshot->subxip = NULL;
545 
546  snapshot->suboverflowed = false;
547  snapshot->takenDuringRecovery = false;
548  snapshot->copied = false;
549  snapshot->curcid = FirstCommandId;
550  snapshot->active_count = 0;
551  snapshot->regd_count = 0;
552  snapshot->snapXactCompletionCount = 0;
553 
554  return snapshot;
555 }
556 
557 /*
558  * Build the initial slot snapshot and convert it to a normal snapshot that
559  * is understood by HeapTupleSatisfiesMVCC.
560  *
561  * The snapshot will be usable directly in current transaction or exported
562  * for loading in different transaction.
     *
     * Raises an ERROR if the builder has not reached SNAPBUILD_CONSISTENT, if
     * not all transactions are tracked any more, if the oldest safe decoding
     * xid follows the snapshot's xmin, or if more in-progress xids would be
     * needed than GetMaxSnapshotXidCount() allows. As a side effect MyProc->xmin
     * is set to the snapshot's xmin.
     *
     * The returned snapshot is the one built by SnapBuildBuildSnapshot() with
     * its xip/xcnt replaced by the list of xids in [xmin, xmax) that were *not*
     * recorded as committed.
     *
     * NOTE(review): several listing lines are missing in this function (565,
     * 573, 578, 580, 589, 615, 638, 644, 648), among them the function name,
     * the conditions guarding two of the elog() calls, the allocation of
     * 'newxip', the step that advances 'xid' in the loop, and one of the final
     * snapshot field adjustments.
563  */
564 Snapshot
566 {
567  Snapshot snap;
568  TransactionId xid;
569  TransactionId safeXid;
570  TransactionId *newxip;
571  int newxcnt = 0;
572 
574  Assert(builder->building_full_snapshot);
575 
576  /* don't allow older snapshots */
577  InvalidateCatalogSnapshot(); /* about to overwrite MyProc->xmin */
579  elog(ERROR, "cannot build an initial slot snapshot when snapshots exist");
581 
582  if (builder->state != SNAPBUILD_CONSISTENT)
583  elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state");
584 
585  if (!builder->committed.includes_all_transactions)
586  elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore");
587 
588  /* so we don't overwrite the existing value */
590  elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid");
591 
592  snap = SnapBuildBuildSnapshot(builder);
593 
594  /*
595  * We know that snap->xmin is alive, enforced by the logical xmin
596  * mechanism. Due to that we can do this without locks, we're only
597  * changing our own value.
598  *
599  * Building an initial snapshot is expensive and an unenforced xmin
600  * horizon would have bad consequences, therefore always double-check that
601  * the horizon is enforced.
602  */
603  LWLockAcquire(ProcArrayLock, LW_SHARED);
604  safeXid = GetOldestSafeDecodingTransactionId(false);
605  LWLockRelease(ProcArrayLock);
606 
607  if (TransactionIdFollows(safeXid, snap->xmin))
608  elog(ERROR, "cannot build an initial slot snapshot as oldest safe xid %u follows snapshot's xmin %u",
609  safeXid, snap->xmin);
610 
611  MyProc->xmin = snap->xmin;
612 
613  /* allocate in transaction context */
614  newxip = (TransactionId *)
616 
617  /*
618  * snapbuild.c builds transactions in an "inverted" manner, which means it
619  * stores committed transactions in ->xip, not ones in progress. Build a
620  * classical snapshot by marking all non-committed transactions as
621  * in-progress. This can be expensive.
622  */
623  for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
624  {
625  void *test;
626 
627  /*
628  * Check whether transaction committed using the decoding snapshot
629  * meaning of ->xip.
630  */
631  test = bsearch(&xid, snap->xip, snap->xcnt,
632  sizeof(TransactionId), xidComparator);
633 
    /* not found among the committed xids: treat it as in progress */
634  if (test == NULL)
635  {
636  if (newxcnt >= GetMaxSnapshotXidCount())
637  ereport(ERROR,
639  errmsg("initial slot snapshot too large")));
640 
641  newxip[newxcnt++] = xid;
642  }
643 
645  }
646 
647  /* adjust remaining snapshot fields as needed */
649  snap->xcnt = newxcnt;
650  snap->xip = newxip;
651 
652  return snap;
653 }
654 
655 /*
656  * Export a snapshot so it can be set in another session with SET TRANSACTION
657  * SNAPSHOT.
658  *
659  * For that we need to start a transaction in the current backend as the
660  * importing side checks whether the source transaction is still open to make
661  * sure the xmin horizon hasn't advanced since then.
     *
     * Returns the name handed back by ExportSnapshot(). Marks an export as in
     * progress (ExportInProgress) and forces XactReadOnly to true; the export is
     * also reported at LOG level together with the number of xids in the
     * snapshot.
     *
     * NOTE(review): listing lines 664, 669, 672, 675, 678 and 681 are missing
     * here: the function name, the conditions guarding the two elog() calls,
     * and the statements around starting the transaction.
662  */
663 const char *
665 {
666  Snapshot snap;
667  char *snapname;
668 
670  elog(ERROR, "cannot export a snapshot from within a transaction");
671 
673  elog(ERROR, "can only export one snapshot at a time");
674 
676  ExportInProgress = true;
677 
679 
680  /* There doesn't seem to be a nice API to set these */
682  XactReadOnly = true;
683 
684  snap = SnapBuildInitialSnapshot(builder);
685 
686  /*
687  * now that we've built a plain snapshot, make it active and use the
688  * normal mechanisms for exporting it
689  */
690  snapname = ExportSnapshot(snap);
691 
692  ereport(LOG,
693  (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID",
694  "exported logical decoding snapshot: \"%s\" with %u transaction IDs",
695  snap->xcnt,
696  snapname, snap->xcnt)));
697  return snapname;
698 }
699 
700 /*
701  * Ensure there is a snapshot and if not build one for current transaction.
     *
     * Returns builder->snapshot, building it first via SnapBuildBuildSnapshot()
     * when none is cached. Requires the builder to be in SNAPBUILD_CONSISTENT
     * state (assertion only).
     *
     * NOTE(review): listing lines 704 (function name/parameter) and 713 (the
     * statement belonging to the "increase refcount" comment) are missing here.
702  */
703 Snapshot
705 {
706  Assert(builder->state == SNAPBUILD_CONSISTENT);
707 
708  /* only build a new snapshot if we don't have a prebuilt one */
709  if (builder->snapshot == NULL)
710  {
711  builder->snapshot = SnapBuildBuildSnapshot(builder);
712  /* increase refcount for the snapshot builder */
714  }
715 
716  return builder->snapshot;
717 }
718 
719 /*
720  * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
721  * any. Aborts the previously started transaction and resets the resource
722  * owner back to its original value.
     *
     * Does nothing when no export is in progress. Raises an ERROR when an
     * export is in progress but we are not in a transaction state.
     *
     * NOTE(review): listing lines 725 (function name/parameters) and 743 (the
     * statement belonging to the "make sure nothing could have ever happened"
     * comment) are missing here.
723  */
724 void
726 {
727  ResourceOwner tmpResOwner;
728 
729  /* nothing exported, that is the usual case */
730  if (!ExportInProgress)
731  return;
732 
733  if (!IsTransactionState())
734  elog(ERROR, "clearing exported snapshot in wrong transaction state");
735 
736  /*
737  * AbortCurrentTransaction() takes care of resetting the snapshot state,
738  * so remember SavedResourceOwnerDuringExport.
739  */
740  tmpResOwner = SavedResourceOwnerDuringExport;
741 
742  /* make sure nothing could have ever happened */
744 
    /* reinstate the resource owner that was saved when the export started */
745  CurrentResourceOwner = tmpResOwner;
746 }
747 
748 /*
749  * Clear snapshot export state during transaction abort.
     *
     * Marks that no snapshot export is in progress any more.
     *
     * NOTE(review): listing lines 752 (function name/parameters) and 754 (a
     * statement preceding the flag reset) are missing here.
750  */
751 void
753 {
755  ExportInProgress = false;
756 }
757 
758 /*
759  * Handle the effects of a single heap change, appropriate to the current state
760  * of the snapshot builder and returns whether changes made at (xid, lsn) can
761  * be decoded.
     *
     * Returns false while the builder is below SNAPBUILD_FULL_SNAPSHOT, and also
     * while it is below SNAPBUILD_CONSISTENT if 'xid' precedes
     * builder->next_phase_at. Otherwise returns true, after making sure the
     * reorderbuffer has a base snapshot for 'xid' (building and caching
     * builder->snapshot first if none exists yet).
     *
     * NOTE(review): listing lines 764 (function name/parameters), 793 and 800
     * (the statements belonging to the two "increase refcount" comments) are
     * missing here.
762  */
763 bool
765 {
766  /*
767  * We can't handle data in transactions if we haven't built a snapshot
768  * yet, so don't store them.
769  */
770  if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
771  return false;
772 
773  /*
774  * No point in keeping track of changes in transactions that we don't have
775  * enough information about to decode. This means that they started before
776  * we got into the SNAPBUILD_FULL_SNAPSHOT state.
777  */
778  if (builder->state < SNAPBUILD_CONSISTENT &&
779  TransactionIdPrecedes(xid, builder->next_phase_at))
780  return false;
781 
782  /*
783  * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
784  * be needed to decode the change we're currently processing.
785  */
786  if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
787  {
788  /* only build a new snapshot if we don't have a prebuilt one */
789  if (builder->snapshot == NULL)
790  {
791  builder->snapshot = SnapBuildBuildSnapshot(builder);
792  /* increase refcount for the snapshot builder */
794  }
795 
796  /*
797  * Increase refcount for the transaction we're handing the snapshot
798  * out to.
799  */
801  ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
802  builder->snapshot);
803  }
804 
805  return true;
806 }
807 
808 /*
809  * Do CommandId/combo CID handling after reading an xl_heap_new_cid record.
810  * This implies that a transaction has done some form of write to system
811  * catalogs.
     *
     * Marks 'xid' as having catalog changes in the reorderbuffer, forwards the
     * record's (cmin, cmax, combocid) information for xlrec->top_xid, and then
     * registers a new command id for 'xid'. Raises an ERROR if the record
     * carries neither a valid cmin nor a valid cmax.
     *
     * NOTE(review): listing line 814 (function name and leading parameters) is
     * missing here.
812  */
813 void
815  XLogRecPtr lsn, xl_heap_new_cid *xlrec)
816 {
817  CommandId cid;
818 
819  /*
820  * we only log new_cid's if a catalog tuple was modified, so mark the
821  * transaction as containing catalog modifications
822  */
823  ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
824 
825  ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
826  xlrec->target_locator, xlrec->target_tid,
827  xlrec->cmin, xlrec->cmax,
828  xlrec->combocid);
829 
830  /* figure out new command id */
    /* use whichever of cmin/cmax is valid; if both are, take the larger one */
831  if (xlrec->cmin != InvalidCommandId &&
832  xlrec->cmax != InvalidCommandId)
833  cid = Max(xlrec->cmin, xlrec->cmax);
834  else if (xlrec->cmax != InvalidCommandId)
835  cid = xlrec->cmax;
836  else if (xlrec->cmin != InvalidCommandId)
837  cid = xlrec->cmin;
838  else
839  {
840  cid = InvalidCommandId; /* silence compiler */
841  elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
842  }
843 
    /* the command id handed to the reorderbuffer is one past the one found above */
844  ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
845 }
846 
847 /*
848  * Add a new Snapshot to all transactions we're decoding that currently are
849  * in-progress so they can see new catalog contents made by the transaction
850  * that just committed. This is necessary because those in-progress
851  * transactions will use the new catalog's contents from here on (at the very
852  * least everything they do needs to be compatible with newer catalog
853  * contents).
     *
     * Walks builder->reorder->toplevel_by_lsn and queues builder->snapshot at
     * 'lsn' for every transaction that already has a base snapshot and is not
     * prepared (or skipped as prepared).
     *
     * NOTE(review): listing lines 856 (function name/parameters), 870 (a
     * statement at the top of the loop body) and 899 (the statement belonging
     * to the "increase the snapshot's refcount" comment) are missing here.
854  */
855 static void
857 {
858  dlist_iter txn_i;
859  ReorderBufferTXN *txn;
860 
861  /*
862  * Iterate through all toplevel transactions. This can include
863  * subtransactions which we just don't yet know to be that, but that's
864  * fine, they will just get an unnecessary snapshot queued.
865  */
866  dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
867  {
868  txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
869 
871 
872  /*
873  * If we don't have a base snapshot yet, there are no changes in this
874  * transaction which in turn implies we don't yet need a snapshot at
875  * all. We'll add a snapshot when the first change gets queued.
876  *
877  * NB: This works correctly even for subtransactions because
878  * ReorderBufferAssignChild() takes care to transfer the base snapshot
879  * to the top-level transaction, and while iterating the changequeue
880  * we'll get the change from the subtxn.
881  */
882  if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
883  continue;
884 
885  /*
886  * We don't need to add snapshot to prepared transactions as they
887  * should not see the new catalog contents.
888  */
889  if (rbtxn_prepared(txn) || rbtxn_skip_prepared(txn))
890  continue;
891 
892  elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
893  txn->xid, LSN_FORMAT_ARGS(lsn));
894 
895  /*
896  * increase the snapshot's refcount for the transaction we are handing
897  * it out to
898  */
900  ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
901  builder->snapshot);
902  }
903 }
904 
905 /*
906  * Keep track of a new catalog changing transaction that has committed.
     *
     * Appends 'xid' to builder->committed.xip, enlarging the array first when
     * it is full. The array is not kept sorted here.
     *
     * NOTE(review): listing lines 909 (function name/parameters) and 911 (a
     * statement at the top of the body) are missing here.
907  */
908 static void
910 {
912 
    /* array full: grow to twice the current capacity plus one */
913  if (builder->committed.xcnt == builder->committed.xcnt_space)
914  {
915  builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
916 
917  elog(DEBUG1, "increasing space for committed transactions to %u",
918  (uint32) builder->committed.xcnt_space);
919 
920  builder->committed.xip = repalloc(builder->committed.xip,
921  builder->committed.xcnt_space * sizeof(TransactionId));
922  }
923 
924  /*
925  * TODO: It might make sense to keep the array sorted here instead of
926  * doing it every time we build a new snapshot. On the other hand this
927  * gets called repeatedly when a transaction with subtransactions commits.
928  */
929  builder->committed.xip[builder->committed.xcnt++] = xid;
930 }
931 
932 /*
933  * Remove knowledge about transactions we treat as committed or containing catalog
934  * changes that are smaller than ->xmin. Those won't ever get checked via
935  * the ->committed or ->catchange array, respectively. The committed xids will
936  * get checked via the clog machinery.
937  *
938  * We can ideally remove the transaction from catchange array once it is
939  * finished (committed/aborted) but that could be costly as we need to maintain
940  * the xids order in the array.
     *
     * Does nothing while builder->xmin is not yet a normal transaction id.
     * Surviving ->committed entries keep their relative order; ->catchange is
     * trimmed from the front and its array is freed (and set to NULL) once no
     * entry survives.
     *
     * NOTE(review): listing line 943 (function name/parameter) is missing here.
941  */
942 static void
944 {
945  int off;
946  TransactionId *workspace;
947  int surviving_xids = 0;
948 
949  /* not ready yet */
950  if (!TransactionIdIsNormal(builder->xmin))
951  return;
952 
953  /* TODO: Neater algorithm than just copying and iterating? */
    /* temporary array sized for the worst case, i.e. every xid survives */
954  workspace =
955  MemoryContextAlloc(builder->context,
956  builder->committed.xcnt * sizeof(TransactionId));
957 
958  /* copy xids that still are interesting to workspace */
959  for (off = 0; off < builder->committed.xcnt; off++)
960  {
961  if (NormalTransactionIdPrecedes(builder->committed.xip[off],
962  builder->xmin))
963  ; /* remove */
964  else
965  workspace[surviving_xids++] = builder->committed.xip[off];
966  }
967 
968  /* copy workspace back to persistent state */
969  memcpy(builder->committed.xip, workspace,
970  surviving_xids * sizeof(TransactionId));
971 
    /* log before overwriting xcnt so that the old count is still available */
972  elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
973  (uint32) builder->committed.xcnt, (uint32) surviving_xids,
974  builder->xmin, builder->xmax);
975  builder->committed.xcnt = surviving_xids;
976 
977  pfree(workspace);
978 
979  /*
980  * Purge xids in ->catchange as well. The purged array must also be sorted
981  * in xidComparator order.
982  */
983  if (builder->catchange.xcnt > 0)
984  {
985  /*
986  * Since catchange.xip is sorted, we find the lower bound of xids that
987  * are still interesting.
988  */
989  for (off = 0; off < builder->catchange.xcnt; off++)
990  {
991  if (TransactionIdFollowsOrEquals(builder->catchange.xip[off],
992  builder->xmin))
993  break;
994  }
995 
    /* everything from index 'off' onwards is kept */
996  surviving_xids = builder->catchange.xcnt - off;
997 
998  if (surviving_xids > 0)
999  {
    /* source and destination overlap, hence memmove rather than memcpy */
1000  memmove(builder->catchange.xip, &(builder->catchange.xip[off]),
1001  surviving_xids * sizeof(TransactionId));
1002  }
1003  else
1004  {
1005  pfree(builder->catchange.xip);
1006  builder->catchange.xip = NULL;
1007  }
1008 
1009  elog(DEBUG3, "purged catalog modifying transactions from %u to %u, xmin: %u, xmax: %u",
1010  (uint32) builder->catchange.xcnt, (uint32) surviving_xids,
1011  builder->xmin, builder->xmax);
1012  builder->catchange.xcnt = surviving_xids;
1013  }
1014 }
1015 
1016 /*
1017  * Handle everything that needs to be done when a transaction commits
1018  */
1019 void
1021  int nsubxacts, TransactionId *subxacts, uint32 xinfo)
1022 {
1023  int nxact;
1024 
1025  bool needs_snapshot = false;
1026  bool needs_timetravel = false;
1027  bool sub_needs_timetravel = false;
1028 
1029  TransactionId xmax = xid;
1030 
1031  /*
1032  * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor
1033  * will they be part of a snapshot. So we don't need to record anything.
1034  */
1035  if (builder->state == SNAPBUILD_START ||
1036  (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
1037  TransactionIdPrecedes(xid, builder->next_phase_at)))
1038  {
1039  /* ensure that only commits after this are getting replayed */
1040  if (builder->start_decoding_at <= lsn)
1041  builder->start_decoding_at = lsn + 1;
1042  return;
1043  }
1044 
1045  if (builder->state < SNAPBUILD_CONSISTENT)
1046  {
1047  /* ensure that only commits after this are getting replayed */
1048  if (builder->start_decoding_at <= lsn)
1049  builder->start_decoding_at = lsn + 1;
1050 
1051  /*
1052  * If building an exportable snapshot, force xid to be tracked, even
1053  * if the transaction didn't modify the catalog.
1054  */
1055  if (builder->building_full_snapshot)
1056  {
1057  needs_timetravel = true;
1058  }
1059  }
1060 
1061  for (nxact = 0; nxact < nsubxacts; nxact++)
1062  {
1063  TransactionId subxid = subxacts[nxact];
1064 
1065  /*
1066  * Add the subtransaction to the base snapshot if it modified the catalog;
1067  * we don't distinguish it from toplevel transactions there.
1068  */
1069  if (SnapBuildXidHasCatalogChanges(builder, subxid, xinfo))
1070  {
1071  sub_needs_timetravel = true;
1072  needs_snapshot = true;
1073 
1074  elog(DEBUG1, "found subtransaction %u:%u with catalog changes",
1075  xid, subxid);
1076 
1077  SnapBuildAddCommittedTxn(builder, subxid);
1078 
1079  if (NormalTransactionIdFollows(subxid, xmax))
1080  xmax = subxid;
1081  }
1082 
1083  /*
1084  * If we're forcing timetravel we also need visibility information
1085  * about subtransaction, so keep track of subtransaction's state, even
1086  * if not catalog modifying. Don't need to distribute a snapshot in
1087  * that case.
1088  */
1089  else if (needs_timetravel)
1090  {
1091  SnapBuildAddCommittedTxn(builder, subxid);
1092  if (NormalTransactionIdFollows(subxid, xmax))
1093  xmax = subxid;
1094  }
1095  }
1096 
1097  /* if top-level modified catalog, it'll need a snapshot */
1098  if (SnapBuildXidHasCatalogChanges(builder, xid, xinfo))
1099  {
1100  elog(DEBUG2, "found top level transaction %u, with catalog changes",
1101  xid);
1102  needs_snapshot = true;
1103  needs_timetravel = true;
1104  SnapBuildAddCommittedTxn(builder, xid);
1105  }
1106  else if (sub_needs_timetravel)
1107  {
1108  /* track toplevel txn as well, subxact alone isn't meaningful */
1109  elog(DEBUG2, "forced transaction %u to do timetravel due to one of its subtransactions",
1110  xid);
1111  needs_timetravel = true;
1112  SnapBuildAddCommittedTxn(builder, xid);
1113  }
1114  else if (needs_timetravel)
1115  {
1116  elog(DEBUG2, "forced transaction %u to do timetravel", xid);
1117 
1118  SnapBuildAddCommittedTxn(builder, xid);
1119  }
1120 
1121  if (!needs_timetravel)
1122  {
1123  /* record that we cannot export a general snapshot anymore */
1124  builder->committed.includes_all_transactions = false;
1125  }
1126 
1127  Assert(!needs_snapshot || needs_timetravel);
1128 
1129  /*
1130  * Adjust xmax of the snapshot builder, we only do that for committed,
1131  * catalog modifying, transactions, everything else isn't interesting for
1132  * us since we'll never look at the respective rows.
1133  */
1134  if (needs_timetravel &&
1135  (!TransactionIdIsValid(builder->xmax) ||
1136  TransactionIdFollowsOrEquals(xmax, builder->xmax)))
1137  {
1138  builder->xmax = xmax;
1139  TransactionIdAdvance(builder->xmax);
1140  }
1141 
1142  /* if there's any reason to build a historic snapshot, do so now */
1143  if (needs_snapshot)
1144  {
1145  /*
1146  * If we haven't built a complete snapshot yet there's no need to hand
1147  * it out, it wouldn't (and couldn't) be used anyway.
1148  */
1149  if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
1150  return;
1151 
1152  /*
1153  * Decrease the snapshot builder's refcount of the old snapshot, note
1154  * that it still will be used if it has been handed out to the
1155  * reorderbuffer earlier.
1156  */
1157  if (builder->snapshot)
1159 
1160  builder->snapshot = SnapBuildBuildSnapshot(builder);
1161 
1162  /* we might need to execute invalidations, add snapshot */
1163  if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
1164  {
1166  ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
1167  builder->snapshot);
1168  }
1169 
1170  /* refcount of the snapshot builder for the new snapshot */
1172 
1173  /* add a new catalog snapshot to all currently running transactions */
1175  }
1176 }
1177 
1178 /*
1179  * Return true if the reorder buffer reports catalog changes for xid, or if
1180  * xinfo has XACT_XINFO_HAS_INVALS set and xid is found in ->catchange.
1181  */
1182 static inline bool
1184  uint32 xinfo)
1185 {
1186  if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
1187  return true;
1188 
1189  /*
1190  * Transactions that have changed catalogs must have invalidation info,
1191  * so without that flag there is no need to search the catchange array.
1192  */
1193  if (!(xinfo & XACT_XINFO_HAS_INVALS))
1194  return false;
1195 
1196  /* Binary-search the catchange XID array (requires xidComparator order) */
1197  return ((builder->catchange.xcnt > 0) &&
1198  (bsearch(&xid, builder->catchange.xip, builder->catchange.xcnt,
1199  sizeof(TransactionId), xidComparator) != NULL));
1200 }
1201 
1202 /* -----------------------------------
1203  * Snapshot building functions dealing with xlog records
1204  * -----------------------------------
1205  */
1206 
1207 /*
1208  * Process a running xacts record, and use its information to first build a
1209  * historic snapshot and later to release resources that aren't needed
1210  * anymore.
1211  */
1212 void
1214 {
1215  ReorderBufferTXN *txn;
1216  TransactionId xmin;
1217 
1218  /*
1219  * If we're not consistent yet, inspect the record to see whether it
1220  * allows us to get closer to being consistent. If we are consistent, dump
1221  * our snapshot so that others, or we after a restart, can use it.
1222  */
1223  if (builder->state < SNAPBUILD_CONSISTENT)
1224  {
1225  /* returns false if there's no point in performing cleanup just yet */
1226  if (!SnapBuildFindSnapshot(builder, lsn, running))
1227  return;
1228  }
1229  else
1230  SnapBuildSerialize(builder, lsn);
1231 
1232  /*
1233  * Update range of interesting xids based on the running xacts
1234  * information. We don't increase ->xmax using it, because once we are in
1235  * a consistent state we can do that ourselves and much more efficiently
1236  * so, because we only need to do it for catalog transactions since we
1237  * only ever look at those.
1238  *
1239  * NB: We only increase xmax when a catalog modifying transaction commits
1240  * (see SnapBuildCommitTxn). Because of this, xmax can be lower than
1241  * xmin, which looks odd but is correct and actually more efficient, since
1242  * we hit fast paths in heapam_visibility.c.
1243  */
1244  builder->xmin = running->oldestRunningXid;
1245 
1246  /* Remove transactions we don't need to keep track of anymore */
1247  SnapBuildPurgeOlderTxn(builder);
1248 
1249  /*
1250  * Advance the xmin limit for the current replication slot, to allow
1251  * vacuum to clean up the tuples this slot has been protecting.
1252  *
1253  * The reorderbuffer might have an xmin among the currently running
1254  * snapshots; use it if so. If not, we need only consider the snapshots
1255  * we'll produce later, which can't be less than the oldest running xid in
1256  * the record we're reading now.
1257  */
1258  xmin = ReorderBufferGetOldestXmin(builder->reorder);
1259  if (xmin == InvalidTransactionId)
1260  xmin = running->oldestRunningXid;
1261  elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
1262  builder->xmin, builder->xmax, running->oldestRunningXid, xmin);
1263  LogicalIncreaseXminForSlot(lsn, xmin);
1264 
1265  /*
1266  * Also tell the slot where we can restart decoding from. We don't want to
1267  * do that after every commit because changing that implies an fsync of
1268  * the logical slot's state file, so we only do it every time we see a
1269  * running xacts record.
1270  *
1271  * Do so by looking for the oldest in progress transaction (determined by
1272  * the first LSN of any of its relevant records). Every transaction
1273  * remembers the last location we stored the snapshot to disk before its
1274  * beginning. That point is where we can restart from.
1275  */
1276 
1277  /*
1278  * Can't know about a serialized snapshot's location if we're not
1279  * consistent.
1280  */
1281  if (builder->state < SNAPBUILD_CONSISTENT)
1282  return;
1283 
1284  txn = ReorderBufferGetOldestTXN(builder->reorder);
1285 
1286  /*
1287  * oldest ongoing txn might have started when we didn't yet serialize
1288  * anything because we hadn't reached a consistent state yet.
1289  */
1290  if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
1292 
1293  /*
1294  * No in-progress transaction, can reuse the last serialized snapshot if
1295  * we have one.
1296  */
1297  else if (txn == NULL &&
1301  builder->last_serialized_snapshot);
1302 }
1303 
1304 
1305 /*
1306  * Build the start of a snapshot that's capable of decoding the catalog.
1307  *
1308  * Helper function for SnapBuildProcessRunningXacts() while we're not yet
1309  * consistent.
1310  *
1311  * Returns true if there is a point in performing internal maintenance/cleanup
1312  * using the xl_running_xacts record.
1313  */
1314 static bool
1316 {
1317  /* ---
1318  * Build catalog decoding snapshot incrementally using information about
1319  * the currently running transactions. There are several ways to do that:
1320  *
1321  * a) There were no running transactions when the xl_running_xacts record
1322  * was inserted, jump to CONSISTENT immediately. We might find such a
1323  * state while waiting on c)'s sub-states.
1324  *
1325  * b) This (in a previous run) or another decoding slot serialized a
1326  * snapshot to disk that we can use. Can't use this method for the
1327  * initial snapshot when slot is being created and needs full snapshot
1328  * for export or direct use, as that snapshot will only contain catalog
1329  * modifying transactions.
1330  *
1331  * c) First incrementally build a snapshot for catalog tuples
1332  * (BUILDING_SNAPSHOT), that requires all, already in-progress,
1333  * transactions to finish. Every transaction starting after that
1334  * (FULL_SNAPSHOT state), has enough information to be decoded. But
1335  * for older running transactions no viable snapshot exists yet, so
1336  * CONSISTENT will only be reached once all of those have finished.
1337  * ---
1338  */
1339 
1340  /*
1341  * xl_running_xacts record is older than what we can use, we might not have
1342  * all necessary catalog rows anymore.
1343  */
1346  builder->initial_xmin_horizon))
1347  {
1348  ereport(DEBUG1,
1349  (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
1350  LSN_FORMAT_ARGS(lsn)),
1351  errdetail_internal("initial xmin horizon of %u vs the snapshot's %u",
1352  builder->initial_xmin_horizon, running->oldestRunningXid)));
1353 
1354 
1355  SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon);
1356 
1357  return true;
1358  }
1359 
1360  /*
1361  * a) No transactions were running, so we can jump to consistent.
1362  *
1363  * This is not affected by races around xl_running_xacts, because we can
1364  * miss transaction commits, but currently not transactions starting.
1365  *
1366  * NB: We might have already started to incrementally assemble a snapshot,
1367  * so we need to be careful to deal with that.
1368  */
1369  if (running->oldestRunningXid == running->nextXid)
1370  {
1371  if (builder->start_decoding_at == InvalidXLogRecPtr ||
1372  builder->start_decoding_at <= lsn)
1373  /* can decode everything after this */
1374  builder->start_decoding_at = lsn + 1;
1375 
1376  /* As no transactions were running xmin/xmax can be trivially set. */
1377  builder->xmin = running->nextXid; /* < are finished */
1378  builder->xmax = running->nextXid; /* >= are running */
1379 
1380  /* so we can safely use the faster comparisons */
1381  Assert(TransactionIdIsNormal(builder->xmin));
1382  Assert(TransactionIdIsNormal(builder->xmax));
1383 
1384  builder->state = SNAPBUILD_CONSISTENT;
1386 
1387  ereport(LOG,
1388  (errmsg("logical decoding found consistent point at %X/%X",
1389  LSN_FORMAT_ARGS(lsn)),
1390  errdetail("There are no running transactions.")));
1391 
1392  return false;
1393  }
1394  /* b) valid on disk state and not building full snapshot */
1395  else if (!builder->building_full_snapshot &&
1396  SnapBuildRestore(builder, lsn))
1397  {
1398  /* there won't be any state to cleanup */
1399  return false;
1400  }
1401 
1402  /*
1403  * c) transition from START to BUILDING_SNAPSHOT.
1404  *
1405  * In START state, and a xl_running_xacts record with running xacts is
1406  * encountered. In that case, switch to BUILDING_SNAPSHOT state, and
1407  * record xl_running_xacts->nextXid. Once all running xacts have finished
1408  * (i.e. they're all >= nextXid), we have a complete catalog snapshot. It
1409  * might look that we could use xl_running_xacts's ->xids information to
1410  * get there quicker, but that is problematic because transactions marked
1411  * as running, might already have inserted their commit record - it's
1412  * infeasible to change that with locking.
1413  */
1414  else if (builder->state == SNAPBUILD_START)
1415  {
1417  builder->next_phase_at = running->nextXid;
1418 
1419  /*
1420  * Start with an xmin/xmax that's correct for future, when all the
1421  * currently running transactions have finished. We'll update both
1422  * while waiting for the pending transactions to finish.
1423  */
1424  builder->xmin = running->nextXid; /* < are finished */
1425  builder->xmax = running->nextXid; /* >= are running */
1426 
1427  /* so we can safely use the faster comparisons */
1428  Assert(TransactionIdIsNormal(builder->xmin));
1429  Assert(TransactionIdIsNormal(builder->xmax));
1430 
1431  ereport(LOG,
1432  (errmsg("logical decoding found initial starting point at %X/%X",
1433  LSN_FORMAT_ARGS(lsn)),
1434  errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1435  running->xcnt, running->nextXid)));
1436 
1437  SnapBuildWaitSnapshot(running, running->nextXid);
1438  }
1439 
1440  /*
1441  * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT.
1442  *
1443  * We are in BUILDING_SNAPSHOT state, and this xl_running_xacts'
1444  * oldestRunningXid is >= the nextXid from when we switched to
1445  * BUILDING_SNAPSHOT. This means all transactions starting afterwards have
1446  * enough information to be decoded. Switch to FULL_SNAPSHOT.
1447  */
1448  else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
1450  running->oldestRunningXid))
1451  {
1452  builder->state = SNAPBUILD_FULL_SNAPSHOT;
1453  builder->next_phase_at = running->nextXid;
1454 
1455  ereport(LOG,
1456  (errmsg("logical decoding found initial consistent point at %X/%X",
1457  LSN_FORMAT_ARGS(lsn)),
1458  errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1459  running->xcnt, running->nextXid)));
1460 
1461  SnapBuildWaitSnapshot(running, running->nextXid);
1462  }
1463 
1464  /*
1465  * c) transition from FULL_SNAPSHOT to CONSISTENT.
1466  *
1467  * We are in FULL_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid
1468  * is >= the nextXid from when we switched to FULL_SNAPSHOT. This means all
1469  * transactions that are currently in progress have a catalog snapshot,
1470  * and all their changes have been collected. Switch to CONSISTENT.
1471  */
1472  else if (builder->state == SNAPBUILD_FULL_SNAPSHOT &&
1474  running->oldestRunningXid))
1475  {
1476  builder->state = SNAPBUILD_CONSISTENT;
1478 
1479  ereport(LOG,
1480  (errmsg("logical decoding found consistent point at %X/%X",
1481  LSN_FORMAT_ARGS(lsn)),
1482  errdetail("There are no old transactions anymore.")));
1483  }
1484 
1485  /*
1486  * We already started to track running xacts and need to wait for all
1487  * in-progress ones to finish. We fall through to the normal processing of
1488  * records so incremental cleanup can be performed.
1489  */
1490  return true;
1491 }
1492 
1493 /* ---
1494  * Iterate through xids in record, wait for all older than the cutoff to
1495  * finish. Then, if possible, log a new xl_running_xacts record.
1496  *
1497  * This isn't required for the correctness of decoding, but to:
1498  * a) allow isolationtester to notice that we're currently waiting for
1499  * something.
1500  * b) log a new xl_running_xacts record where it'd be helpful, without having
1501  * to wait for bgwriter or checkpointer.
1502  * ---
1503  */
1504 static void
1506 {
1507  int off;
1508 
1509  for (off = 0; off < running->xcnt; off++)
1510  {
1511  TransactionId xid = running->xids[off];
1512 
1513  /*
1514  * Upper layers should prevent that we ever need to wait on ourselves.
1515  * Check anyway, since failing to do so would either result in an
1516  * endless wait or an Assert() failure.
1517  */
1519  elog(ERROR, "waiting for ourselves");
1520 
1521  if (TransactionIdFollows(xid, cutoff))
1522  continue;
1523 
1524  XactLockTableWait(xid, NULL, NULL, XLTW_None);
1525  }
1526 
1527  /*
1528  * All transactions we needed to finish finished - try to ensure there is
1529  * another xl_running_xacts record in a timely manner, without having to
1530  * wait for bgwriter or checkpointer to log one. During recovery we can't
1531  * enforce that, so we'll have to wait.
1532  */
1533  if (!RecoveryInProgress())
1534  {
1536  }
1537 }
1538 
1539 /* -----------------------------------
1540  * Snapshot serialization support
1541  * -----------------------------------
1542  */
1543 
1544 /*
1545  * We store current state of struct SnapBuild on disk in the following manner:
1546  *
1547  * struct SnapBuildOnDisk;
1548  * TransactionId * committed.xcnt; (*not xcnt_space*)
1549  * TransactionId * catchange.xcnt;
1550  *
1551  */
1552 typedef struct SnapBuildOnDisk
1553 {
1554  /* first part of this struct needs to be version independent */
1555 
1556  /* data not covered by checksum */
1559 
1560  /* data covered by checksum */
1561 
1562  /* version, in case we want to support pg_upgrade */
1564  /* how large is the on disk data, excluding the constant sized part */
1566 
1567  /* version dependent part */
1569 
1570  /* variable amount of TransactionIds follows */
1572 
1573 #define SnapBuildOnDiskConstantSize \
1574  offsetof(SnapBuildOnDisk, builder) /* bytes that precede the 'builder' member */
1575 #define SnapBuildOnDiskNotChecksummedSize \
1576  offsetof(SnapBuildOnDisk, version) /* bytes that precede 'version'; checksumming starts at this offset */
1577 
1578 #define SNAPBUILD_MAGIC 0x51A1E001 /* written to, and checked against, the file's magic field */
1579 #define SNAPBUILD_VERSION 5 /* on-disk format version; any other version is rejected on restore */
1580 
1581 /*
1582  * Restore a serialized snapshot for 'lsn' while the builder is not yet
1583  * consistent; once it is consistent, serialize its state at 'lsn' instead.
1584  * Supposed to be used by external (i.e. not snapbuild.c) code that just read
1585  * a record that's a potential location for a serialized snapshot.
1586  */
1587 void
1589 {
1590  if (builder->state < SNAPBUILD_CONSISTENT)
1591  SnapBuildRestore(builder, lsn); /* return value is not checked here */
1592  else
1593  SnapBuildSerialize(builder, lsn);
1594 }
1595 
1596 /*
1597  * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
1598  * been done by another decoding process.
1599  */
1600 static void
1602 {
1603  Size needed_length;
1604  SnapBuildOnDisk *ondisk = NULL;
1605  TransactionId *catchange_xip = NULL;
1606  MemoryContext old_ctx;
1607  size_t catchange_xcnt;
1608  char *ondisk_c;
1609  int fd;
1610  char tmppath[MAXPGPATH];
1611  char path[MAXPGPATH];
1612  int ret;
1613  struct stat stat_buf;
1614  Size sz;
1615 
1616  Assert(lsn != InvalidXLogRecPtr);
1618  builder->last_serialized_snapshot <= lsn);
1619 
1620  /*
1621  * no point in serializing if we cannot continue to work immediately after
1622  * restoring the snapshot
1623  */
1624  if (builder->state < SNAPBUILD_CONSISTENT)
1625  return;
1626 
1627  /* consistent snapshots have no next phase */
1629 
1630  /*
1631  * We identify snapshots by the LSN they are valid for. We don't need to
1632  * include timelines in the name as each LSN maps to exactly one timeline
1633  * unless the user used pg_resetwal or similar. If a user did so, there's
1634  * no hope continuing to decode anyway.
1635  */
1636  sprintf(path, "pg_logical/snapshots/%X-%X.snap",
1637  LSN_FORMAT_ARGS(lsn));
1638 
1639  /*
1640  * first check whether some other backend already has written the snapshot
1641  * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
1642  * as a valid state. Everything else is an unexpected error.
1643  */
1644  ret = stat(path, &stat_buf);
1645 
1646  if (ret != 0 && errno != ENOENT)
1647  ereport(ERROR,
1649  errmsg("could not stat file \"%s\": %m", path)));
1650 
1651  else if (ret == 0)
1652  {
1653  /*
1654  * somebody else has already serialized to this point, don't overwrite
1655  * but remember location, so we don't need to read old data again.
1656  *
1657  * To be sure it has been synced to disk after the rename() from the
1658  * tempfile filename to the real filename, we just repeat the fsync.
1659  * That ought to be cheap because in most scenarios it should already
1660  * be safely on disk.
1661  */
1662  fsync_fname(path, false);
1663  fsync_fname("pg_logical/snapshots", true);
1664 
1665  builder->last_serialized_snapshot = lsn;
1666  goto out;
1667  }
1668 
1669  /*
1670  * there is an obvious race condition here between the time we stat(2) the
1671  * file and us writing the file. But we rename the file into place
1672  * atomically and all files created need to contain the same data anyway,
1673  * so this is perfectly fine, although a bit of a resource waste. Locking
1674  * seems like pointless complication.
1675  */
1676  elog(DEBUG1, "serializing snapshot to %s", path);
1677 
1678  /* to make sure only we will write to this tempfile, include pid */
1679  sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%d.tmp",
1680  LSN_FORMAT_ARGS(lsn), MyProcPid);
1681 
1682  /*
1683  * Unlink the temporary file if it already exists. It can only be a leftover
1684  * from before a crash/error, since we won't enter this function twice from
1685  * within a single decoding slot/backend and the temporary file's name
1686  * contains the pid of the current process.
1687  */
1688  if (unlink(tmppath) != 0 && errno != ENOENT)
1689  ereport(ERROR,
1691  errmsg("could not remove file \"%s\": %m", tmppath)));
1692 
1693  old_ctx = MemoryContextSwitchTo(builder->context);
1694 
1695  /* Get the catalog modifying transactions that are not yet committed */
1696  catchange_xip = ReorderBufferGetCatalogChangesXacts(builder->reorder);
1697  catchange_xcnt = dclist_count(&builder->reorder->catchange_txns);
1698 
1699  needed_length = sizeof(SnapBuildOnDisk) +
1700  sizeof(TransactionId) * (builder->committed.xcnt + catchange_xcnt);
1701 
1702  ondisk_c = palloc0(needed_length);
1703  ondisk = (SnapBuildOnDisk *) ondisk_c;
1704  ondisk->magic = SNAPBUILD_MAGIC;
1705  ondisk->version = SNAPBUILD_VERSION;
1706  ondisk->length = needed_length;
1707  INIT_CRC32C(ondisk->checksum);
1708  COMP_CRC32C(ondisk->checksum,
1709  ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
1711  ondisk_c += sizeof(SnapBuildOnDisk);
1712 
1713  memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
1714  /* NULL-ify memory-only data */
1715  ondisk->builder.context = NULL;
1716  ondisk->builder.snapshot = NULL;
1717  ondisk->builder.reorder = NULL;
1718  ondisk->builder.committed.xip = NULL;
1719  ondisk->builder.catchange.xip = NULL;
1720  /* update catchange only on disk data */
1721  ondisk->builder.catchange.xcnt = catchange_xcnt;
1722 
1723  COMP_CRC32C(ondisk->checksum,
1724  &ondisk->builder,
1725  sizeof(SnapBuild));
1726 
1727  /* copy committed xacts */
1728  if (builder->committed.xcnt > 0)
1729  {
1730  sz = sizeof(TransactionId) * builder->committed.xcnt;
1731  memcpy(ondisk_c, builder->committed.xip, sz);
1732  COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1733  ondisk_c += sz;
1734  }
1735 
1736  /* copy catalog modifying xacts */
1737  if (catchange_xcnt > 0)
1738  {
1739  sz = sizeof(TransactionId) * catchange_xcnt;
1740  memcpy(ondisk_c, catchange_xip, sz);
1741  COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1742  ondisk_c += sz;
1743  }
1744 
1745  FIN_CRC32C(ondisk->checksum);
1746 
1747  /* we have valid data now, open tempfile and write it there */
1748  fd = OpenTransientFile(tmppath,
1749  O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
1750  if (fd < 0)
1751  ereport(ERROR,
1753  errmsg("could not open file \"%s\": %m", tmppath)));
1754 
1755  errno = 0;
1757  if ((write(fd, ondisk, needed_length)) != needed_length)
1758  {
1759  int save_errno = errno;
1760 
1762 
1763  /* if write didn't set errno, assume problem is no disk space */
1764  errno = save_errno ? save_errno : ENOSPC;
1765  ereport(ERROR,
1767  errmsg("could not write to file \"%s\": %m", tmppath)));
1768  }
1770 
1771  /*
1772  * fsync the file before renaming so that even if we crash after this we
1773  * have either a fully valid file or nothing.
1774  *
1775  * It's safe to just ERROR on fsync() here because we'll retry the whole
1776  * operation including the writes.
1777  *
1778  * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
1779  * some noticeable overhead since it's performed synchronously during
1780  * decoding?
1781  */
1783  if (pg_fsync(fd) != 0)
1784  {
1785  int save_errno = errno;
1786 
1788  errno = save_errno;
1789  ereport(ERROR,
1791  errmsg("could not fsync file \"%s\": %m", tmppath)));
1792  }
1794 
1795  if (CloseTransientFile(fd) != 0)
1796  ereport(ERROR,
1798  errmsg("could not close file \"%s\": %m", tmppath)));
1799 
1800  fsync_fname("pg_logical/snapshots", true);
1801 
1802  /*
1803  * We may overwrite the work from some other backend, but that's ok, our
1804  * snapshot is valid as well, we'll just have done some superfluous work.
1805  */
1806  if (rename(tmppath, path) != 0)
1807  {
1808  ereport(ERROR,
1810  errmsg("could not rename file \"%s\" to \"%s\": %m",
1811  tmppath, path)));
1812  }
1813 
1814  /* make sure we persist */
1815  fsync_fname(path, false);
1816  fsync_fname("pg_logical/snapshots", true);
1817 
1818  /*
1819  * Now there's no way we can lose the dumped state anymore, remember this
1820  * as a serialization point.
1821  */
1822  builder->last_serialized_snapshot = lsn;
1823 
1824  MemoryContextSwitchTo(old_ctx);
1825 
1826 out:
1828  builder->last_serialized_snapshot);
1829  /* be tidy */
1830  if (ondisk)
1831  pfree(ondisk);
1832  if (catchange_xip)
1833  pfree(catchange_xip);
1834 }
1835 
1836 /*
1837  * Restore a snapshot into 'builder' if previously one has been stored at the
1838  * location indicated by 'lsn'. Returns true if successful, false otherwise.
1839  */
1840 static bool
1842 {
1843  SnapBuildOnDisk ondisk;
1844  int fd;
1845  char path[MAXPGPATH];
1846  Size sz;
1847  pg_crc32c checksum;
1848 
1849  /* no point in loading a snapshot if we're already there */
1850  if (builder->state == SNAPBUILD_CONSISTENT)
1851  return false;
1852 
1853  sprintf(path, "pg_logical/snapshots/%X-%X.snap",
1854  LSN_FORMAT_ARGS(lsn));
1855 
1856  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
1857 
1858  if (fd < 0 && errno == ENOENT)
1859  return false;
1860  else if (fd < 0)
1861  ereport(ERROR,
1863  errmsg("could not open file \"%s\": %m", path)));
1864 
1865  /* ----
1866  * Make sure the snapshot has been stored safely to disk; that's normally
1867  * cheap.
1868  * Note that we do not need PANIC here, nobody will be able to use the
1869  * slot without fsyncing, and saving it won't succeed without an fsync()
1870  * either...
1871  * ----
1872  */
1873  fsync_fname(path, false);
1874  fsync_fname("pg_logical/snapshots", true);
1875 
1876 
1877  /* read statically sized portion of snapshot */
1878  SnapBuildRestoreContents(fd, (char *) &ondisk, SnapBuildOnDiskConstantSize, path);
1879 
1880  if (ondisk.magic != SNAPBUILD_MAGIC)
1881  ereport(ERROR,
1883  errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u",
1884  path, ondisk.magic, SNAPBUILD_MAGIC)));
1885 
1886  if (ondisk.version != SNAPBUILD_VERSION)
1887  ereport(ERROR,
1889  errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u",
1890  path, ondisk.version, SNAPBUILD_VERSION)));
1891 
1892  INIT_CRC32C(checksum);
1893  COMP_CRC32C(checksum,
1894  ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize,
1896 
1897  /* read SnapBuild */
1898  SnapBuildRestoreContents(fd, (char *) &ondisk.builder, sizeof(SnapBuild), path);
1899  COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild));
1900 
1901  /* restore committed xacts information */
1902  if (ondisk.builder.committed.xcnt > 0)
1903  {
1904  sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
1905  ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz);
1906  SnapBuildRestoreContents(fd, (char *) ondisk.builder.committed.xip, sz, path);
1907  COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz);
1908  }
1909 
1910  /* restore catalog modifying xacts information */
1911  if (ondisk.builder.catchange.xcnt > 0)
1912  {
1913  sz = sizeof(TransactionId) * ondisk.builder.catchange.xcnt;
1914  ondisk.builder.catchange.xip = MemoryContextAllocZero(builder->context, sz);
1915  SnapBuildRestoreContents(fd, (char *) ondisk.builder.catchange.xip, sz, path);
1916  COMP_CRC32C(checksum, ondisk.builder.catchange.xip, sz);
1917  }
1918 
1919  if (CloseTransientFile(fd) != 0)
1920  ereport(ERROR,
1922  errmsg("could not close file \"%s\": %m", path)));
1923 
1924  FIN_CRC32C(checksum);
1925 
1926  /* verify checksum of what we've read */
1927  if (!EQ_CRC32C(checksum, ondisk.checksum))
1928  ereport(ERROR,
1930  errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u",
1931  path, checksum, ondisk.checksum)));
1932 
1933  /*
1934  * ok, we now have a sensible snapshot here, figure out if it has more
1935  * information than we have.
1936  */
1937 
1938  /*
1939  * We are only interested in consistent snapshots for now, comparing
1940  * whether one incomplete snapshot is more "advanced" seems to be
1941  * unnecessarily complex.
1942  */
1943  if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
1944  goto snapshot_not_interesting;
1945 
1946  /*
1947  * Don't use a snapshot that requires an xmin that we cannot guarantee to
1948  * be available.
1949  */
1951  goto snapshot_not_interesting;
1952 
1953  /* consistent snapshots have no next phase */
1955 
1956  /* ok, we think the snapshot is sensible, copy over everything important */
1957  builder->xmin = ondisk.builder.xmin;
1958  builder->xmax = ondisk.builder.xmax;
1959  builder->state = ondisk.builder.state;
1960 
1961  builder->committed.xcnt = ondisk.builder.committed.xcnt;
1962  /* We only allocated/stored xcnt, not xcnt_space xids ! */
1963  /* don't overwrite preallocated xip, if we don't have anything here */
1964  if (builder->committed.xcnt > 0)
1965  {
1966  pfree(builder->committed.xip);
1967  builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
1968  builder->committed.xip = ondisk.builder.committed.xip;
1969  }
1970  ondisk.builder.committed.xip = NULL;
1971 
1972  /* set catalog modifying transactions */
1973  if (builder->catchange.xip)
1974  pfree(builder->catchange.xip);
1975  builder->catchange.xcnt = ondisk.builder.catchange.xcnt;
1976  builder->catchange.xip = ondisk.builder.catchange.xip;
1977  ondisk.builder.catchange.xip = NULL;
1978 
1979  /* our snapshot is not interesting anymore, build a new one */
1980  if (builder->snapshot != NULL)
1981  {
1983  }
1984  builder->snapshot = SnapBuildBuildSnapshot(builder);
1986 
1987  ReorderBufferSetRestartPoint(builder->reorder, lsn);
1988 
1989  Assert(builder->state == SNAPBUILD_CONSISTENT);
1990 
1991  ereport(LOG,
1992  (errmsg("logical decoding found consistent point at %X/%X",
1993  LSN_FORMAT_ARGS(lsn)),
1994  errdetail("Logical decoding will begin using saved snapshot.")));
1995  return true;
1996 
1997 snapshot_not_interesting:
1998  if (ondisk.builder.committed.xip != NULL)
1999  pfree(ondisk.builder.committed.xip);
2000  if (ondisk.builder.catchange.xip != NULL)
2001  pfree(ondisk.builder.catchange.xip);
2002  return false;
2003 }
2004 
2005 /*
2006  * Read the contents of the serialized snapshot to 'dest'.
2007  */
2008 static void
2009 SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path)
2010 {
2011  int readBytes;
2012 
2014  readBytes = read(fd, dest, size);
2016  if (readBytes != size)
2017  {
2018  int save_errno = errno;
2019 
2021 
2022  if (readBytes < 0)
2023  {
2024  errno = save_errno;
2025  ereport(ERROR,
2027  errmsg("could not read file \"%s\": %m", path)));
2028  }
2029  else
2030  ereport(ERROR,
2032  errmsg("could not read file \"%s\": read %d of %zu",
2033  path, readBytes, sizeof(SnapBuild))));
2034  }
2035 }
2036 
2037 /*
2038  * Remove all serialized snapshots that are not required anymore because no
2039  * slot can need them. This doesn't actually have to run during a checkpoint,
2040  * but it's a convenient point to schedule this.
2041  *
2042  * NB: We run this during checkpoints even if logical decoding is disabled so
2043  * we cleanup old slots at some point after it got disabled.
2044  */
2045 void
2047 {
2048  XLogRecPtr cutoff;
2049  XLogRecPtr redo;
2050  DIR *snap_dir;
2051  struct dirent *snap_de;
2052  char path[MAXPGPATH + 21];
2053 
2054  /*
2055  * We start off with a minimum of the last redo pointer. No new
2056  * replication slot will start before that, so that's a safe upper bound
2057  * for removal.
2058  */
2059  redo = GetRedoRecPtr();
2060 
2061  /* now check for the restart ptrs from existing slots */
2063 
2064  /* don't start earlier than the restart lsn */
2065  if (redo < cutoff)
2066  cutoff = redo;
2067 
2068  snap_dir = AllocateDir("pg_logical/snapshots");
2069  while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
2070  {
2071  uint32 hi;
2072  uint32 lo;
2073  XLogRecPtr lsn;
2074  PGFileType de_type;
2075 
2076  if (strcmp(snap_de->d_name, ".") == 0 ||
2077  strcmp(snap_de->d_name, "..") == 0)
2078  continue;
2079 
2080  snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name);
2081  de_type = get_dirent_type(path, snap_de, false, DEBUG1);
2082 
2083  if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_REG)
2084  {
2085  elog(DEBUG1, "only regular files expected: %s", path);
2086  continue;
2087  }
2088 
2089  /*
2090  * temporary filenames from SnapBuildSerialize() include the LSN and
2091  * everything but are postfixed by .$pid.tmp. We can just remove them
2092  * the same as other files because there can be none that are
2093  * currently being written that are older than cutoff.
2094  *
2095  * We just log a message if a file doesn't fit the pattern, it's
2096  * probably some editors lock/state file or similar...
2097  */
2098  if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
2099  {
2100  ereport(LOG,
2101  (errmsg("could not parse file name \"%s\"", path)));
2102  continue;
2103  }
2104 
2105  lsn = ((uint64) hi) << 32 | lo;
2106 
2107  /* check whether we still need it */
2108  if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
2109  {
2110  elog(DEBUG1, "removing snapbuild snapshot %s", path);
2111 
2112  /*
2113  * It's not particularly harmful, though strange, if we can't
2114  * remove the file here. Don't prevent the checkpoint from
2115  * completing, that'd be a cure worse than the disease.
2116  */
2117  if (unlink(path) < 0)
2118  {
2119  ereport(LOG,
2121  errmsg("could not remove file \"%s\": %m",
2122  path)));
2123  continue;
2124  }
2125  }
2126  }
2127  FreeDir(snap_dir);
2128 }
#define InvalidCommandId
Definition: c.h:653
unsigned int uint32
Definition: c.h:490
#define Max(x, y)
Definition: c.h:982
#define PG_BINARY
Definition: c.h:1260
#define FirstCommandId
Definition: c.h:652
uint32 CommandId
Definition: c.h:650
uint32 TransactionId
Definition: c.h:636
size_t Size
Definition: c.h:589
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1179
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1229
int errcode_for_file_access(void)
Definition: elog.c:881
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define LOG
Definition: elog.h:31
#define DEBUG3
Definition: elog.h:28
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2710
int FreeDir(DIR *dir)
Definition: fd.c:2762
int CloseTransientFile(int fd)
Definition: fd.c:2610
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:662
int pg_fsync(int fd)
Definition: fd.c:356
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2434
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2644
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:406
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
int MyProcPid
Definition: globals.c:44
#define dlist_foreach(iter, lhead)
Definition: ilist.h:623
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
Assert(fmt[strlen(fmt) - 1] !='\n')
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:668
@ XLTW_None
Definition: lmgr.h:26
void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn)
Definition: logical.c:1714
void LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin)
Definition: logical.c:1646
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1195
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1803
@ LW_SHARED
Definition: lwlock.h:116
void pfree(void *pointer)
Definition: mcxt.c:1436
void * palloc0(Size size)
Definition: mcxt.c:1241
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1048
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1456
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1005
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:387
void * palloc(Size size)
Definition: mcxt.c:1210
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:153
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
uint32 pg_crc32c
Definition: pg_crc32c.h:38
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:89
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:94
#define ERRCODE_T_R_SERIALIZATION_FAILURE
Definition: pgbench.c:76
#define sprintf
Definition: port.h:240
#define snprintf
Definition: port.h:238
#define qsort(a, b, c, d)
Definition: port.h:445
static void test(void)
static int fd(const char *x, int i)
Definition: preproc-init.c:105
TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly)
Definition: procarray.c:2992
int GetMaxSnapshotXidCount(void)
Definition: procarray.c:2098
TransactionId * ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
void ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
void ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, CommandId cid)
void ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, RelFileLocator locator, ItemPointerData tid, CommandId cmin, CommandId cmax, CommandId combocid)
void ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
TransactionId ReorderBufferGetOldestXmin(ReorderBuffer *rb)
ReorderBufferTXN * ReorderBufferGetOldestTXN(ReorderBuffer *rb)
bool ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
void ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Snapshot snap)
void ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
#define rbtxn_prepared(txn)
#define rbtxn_skip_prepared(txn)
ResourceOwner CurrentResourceOwner
Definition: resowner.c:146
XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void)
Definition: slot.c:935
static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
Definition: snapbuild.c:1601
void SnapBuildSnapDecRefcount(Snapshot snap)
Definition: snapbuild.c:453
#define SNAPBUILD_VERSION
Definition: snapbuild.c:1579
bool SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
Definition: snapbuild.c:429
void SnapBuildResetExportedSnapshotState(void)
Definition: snapbuild.c:752
void SnapBuildSetTwoPhaseAt(SnapBuild *builder, XLogRecPtr ptr)
Definition: snapbuild.c:420
static void SnapBuildSnapIncRefcount(Snapshot snap)
Definition: snapbuild.c:441
bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
Definition: snapbuild.c:764
const char * SnapBuildExportSnapshot(SnapBuild *builder)
Definition: snapbuild.c:664
XLogRecPtr SnapBuildGetTwoPhaseAt(SnapBuild *builder)
Definition: snapbuild.c:411
SnapBuildState SnapBuildCurrentState(SnapBuild *builder)
Definition: snapbuild.c:402
#define SnapBuildOnDiskNotChecksummedSize
Definition: snapbuild.c:1575
void FreeSnapshotBuilder(SnapBuild *builder)
Definition: snapbuild.c:358
void CheckPointSnapBuild(void)
Definition: snapbuild.c:2046
static void SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
Definition: snapbuild.c:909
SnapBuild * AllocateSnapshotBuilder(ReorderBuffer *reorder, TransactionId xmin_horizon, XLogRecPtr start_lsn, bool need_full_snapshot, XLogRecPtr two_phase_at)
Definition: snapbuild.c:312
#define SNAPBUILD_MAGIC
Definition: snapbuild.c:1578
static bool SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid, uint32 xinfo)
Definition: snapbuild.c:1183
Snapshot SnapBuildGetOrBuildSnapshot(SnapBuild *builder)
Definition: snapbuild.c:704
Snapshot SnapBuildInitialSnapshot(SnapBuild *builder)
Definition: snapbuild.c:565
static ResourceOwner SavedResourceOwnerDuringExport
Definition: snapbuild.c:278
void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
Definition: snapbuild.c:1588
void SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, int nsubxacts, TransactionId *subxacts, uint32 xinfo)
Definition: snapbuild.c:1020
static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff)
Definition: snapbuild.c:1505
static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder)
Definition: snapbuild.c:485
void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn, xl_heap_new_cid *xlrec)
Definition: snapbuild.c:814
void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
Definition: snapbuild.c:1213
static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
Definition: snapbuild.c:856
void SnapBuildClearExportedSnapshot(void)
Definition: snapbuild.c:725
static void SnapBuildFreeSnapshot(Snapshot snap)
Definition: snapbuild.c:377
static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
Definition: snapbuild.c:1315
static bool ExportInProgress
Definition: snapbuild.c:279
struct SnapBuildOnDisk SnapBuildOnDisk
static void SnapBuildPurgeOlderTxn(SnapBuild *builder)
Definition: snapbuild.c:943
#define SnapBuildOnDiskConstantSize
Definition: snapbuild.c:1573
static void SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path)
Definition: snapbuild.c:2009
static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
Definition: snapbuild.c:1841
SnapBuildState
Definition: snapbuild.h:19
@ SNAPBUILD_START
Definition: snapbuild.h:23
@ SNAPBUILD_BUILDING_SNAPSHOT
Definition: snapbuild.h:29
@ SNAPBUILD_FULL_SNAPSHOT
Definition: snapbuild.h:39
@ SNAPBUILD_CONSISTENT
Definition: snapbuild.h:46
char * ExportSnapshot(Snapshot snapshot)
Definition: snapmgr.c:1125
bool HistoricSnapshotActive(void)
Definition: snapmgr.c:2103
bool HaveRegisteredOrActiveSnapshot(void)
Definition: snapmgr.c:1641
void InvalidateCatalogSnapshot(void)
Definition: snapmgr.c:457
struct SnapshotData SnapshotData
@ SNAPSHOT_MVCC
Definition: snapshot.h:50
@ SNAPSHOT_HISTORIC_MVCC
Definition: snapshot.h:109
PGPROC * MyProc
Definition: proc.c:66
XLogRecPtr LogStandbySnapshot(void)
Definition: standby.c:1264
Definition: dirent.c:26
TransactionId xmin
Definition: proc.h:178
XLogRecPtr restart_decoding_lsn
TransactionId xid
dclist_head catchange_txns
dlist_head toplevel_by_lsn
XLogRecPtr current_restart_decoding_lsn
SnapBuild builder
Definition: snapbuild.c:1568
pg_crc32c checksum
Definition: snapbuild.c:1558
XLogRecPtr start_decoding_at
Definition: snapbuild.c:166
SnapBuildState state
Definition: snapbuild.c:151
TransactionId xmin
Definition: snapbuild.c:157
TransactionId initial_xmin_horizon
Definition: snapbuild.c:183
struct SnapBuild::@16 committed
TransactionId xmax
Definition: snapbuild.c:160
TransactionId * xip
Definition: snapbuild.c:243
Snapshot snapshot
Definition: snapbuild.c:191
XLogRecPtr two_phase_at
Definition: snapbuild.c:177
bool building_full_snapshot
Definition: snapbuild.c:186
TransactionId next_phase_at
Definition: snapbuild.c:208
struct SnapBuild::@17 catchange
size_t xcnt
Definition: snapbuild.c:217
XLogRecPtr last_serialized_snapshot
Definition: snapbuild.c:196
size_t xcnt_space
Definition: snapbuild.c:220
bool includes_all_transactions
Definition: snapbuild.c:227
MemoryContext context
Definition: snapbuild.c:154
ReorderBuffer * reorder
Definition: snapbuild.c:201
TransactionId xmin
Definition: snapshot.h:157
int32 subxcnt
Definition: snapshot.h:181
bool copied
Definition: snapshot.h:185
uint32 regd_count
Definition: snapshot.h:205
uint32 active_count
Definition: snapshot.h:204
CommandId curcid
Definition: snapshot.h:187
uint32 xcnt
Definition: snapshot.h:169
TransactionId * subxip
Definition: snapshot.h:180
uint64 snapXactCompletionCount
Definition: snapshot.h:216
TransactionId xmax
Definition: snapshot.h:158
SnapshotType snapshot_type
Definition: snapshot.h:144
TransactionId * xip
Definition: snapshot.h:168
bool suboverflowed
Definition: snapshot.h:182
bool takenDuringRecovery
Definition: snapshot.h:184
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
dlist_node * cur
Definition: ilist.h:179
CommandId cmin
Definition: heapam_xlog.h:376
CommandId combocid
Definition: heapam_xlog.h:378
ItemPointerData target_tid
Definition: heapam_xlog.h:384
TransactionId top_xid
Definition: heapam_xlog.h:375
CommandId cmax
Definition: heapam_xlog.h:377
RelFileLocator target_locator
Definition: heapam_xlog.h:383
TransactionId oldestRunningXid
Definition: standbydefs.h:53
TransactionId xids[FLEXIBLE_ARRAY_MEMBER]
Definition: standbydefs.h:56
TransactionId nextXid
Definition: standbydefs.h:52
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:299
bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition: transam.c:314
bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:329
#define InvalidTransactionId
Definition: transam.h:31
#define NormalTransactionIdPrecedes(id1, id2)
Definition: transam.h:147
#define NormalTransactionIdFollows(id1, id2)
Definition: transam.h:152
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TransactionIdAdvance(dest)
Definition: transam.h:91
@ WAIT_EVENT_SNAPBUILD_SYNC
Definition: wait_event.h:215
@ WAIT_EVENT_SNAPBUILD_WRITE
Definition: wait_event.h:216
@ WAIT_EVENT_SNAPBUILD_READ
Definition: wait_event.h:214
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:271
static void pgstat_report_wait_end(void)
Definition: wait_event.h:287
#define stat
Definition: win32_port.h:286
bool IsTransactionOrTransactionBlock(void)
Definition: xact.c:4841
bool XactReadOnly
Definition: xact.c:82
bool IsTransactionState(void)
Definition: xact.c:378
void StartTransactionCommand(void)
Definition: xact.c:2944
int XactIsoLevel
Definition: xact.c:79
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition: xact.c:926
void AbortCurrentTransaction(void)
Definition: xact.c:3312
#define XACT_REPEATABLE_READ
Definition: xact.h:38
#define XACT_XINFO_HAS_INVALS
Definition: xact.h:191
int xidComparator(const void *arg1, const void *arg2)
Definition: xid.c:138
bool RecoveryInProgress(void)
Definition: xlog.c:5908
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6011
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28