PostgreSQL Source Code  git master
twophase.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * twophase.c
4  * Two-phase commit support functions.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/access/transam/twophase.c
11  *
12  * NOTES
13  * Each global transaction is associated with a global transaction
14  * identifier (GID). The client assigns a GID to a postgres
15  * transaction with the PREPARE TRANSACTION command.
16  *
17  * We keep all active global transactions in a shared memory array.
18  * When the PREPARE TRANSACTION command is issued, the GID is
19  * reserved for the transaction in the array. This is done before
20  * a WAL entry is made, because the reservation checks for duplicate
21  * GIDs and aborts the transaction if there already is a global
22  * transaction in prepared state with the same GID.
23  *
24  * A global transaction (gxact) also has a dummy PGPROC; this is what keeps
25  * the XID considered running by TransactionIdIsInProgress. It is also
26  * convenient as a PGPROC to hook the gxact's locks to.
27  *
28  * Information to recover prepared transactions in case of crash is
29  * now stored in WAL for the common case. In some cases there will be
30  * an extended period between preparing a GXACT and commit/abort, in
31  * which case we need to separately record prepared transaction data
32  * in permanent storage. This includes locking information, pending
33  * notifications etc. All that state information is written to the
34  * per-transaction state file in the pg_twophase directory.
35  * All prepared transactions will be written prior to shutdown.
36  *
37  * The life cycle of the state data is as follows:
38  *
39  * * On PREPARE TRANSACTION backend writes state data only to the WAL and
40  * stores pointer to the start of the WAL record in
41  * gxact->prepare_start_lsn.
42  * * If COMMIT occurs before checkpoint then backend reads data from WAL
43  * using prepare_start_lsn.
44  * * On checkpoint, state data is copied to files in the pg_twophase
45  * directory and fsynced.
46  * * If COMMIT happens after checkpoint then backend reads state data from
47  * files
48  *
49  * During replay and replication, TwoPhaseState also holds information
50  * about active prepared transactions that haven't been moved to disk yet.
51  *
52  * Replay of twophase records happens by the following rules:
53  *
54  * * At the beginning of recovery, pg_twophase is scanned once, filling
55  * TwoPhaseState with entries marked with gxact->inredo and
56  * gxact->ondisk. Two-phase file data older than the XID horizon of
57  * the redo position are discarded.
58  * * On PREPARE redo, the transaction is added to TwoPhaseState->prepXacts.
59  * gxact->inredo is set to true for such entries.
60  * * On Checkpoint we iterate through TwoPhaseState->prepXacts entries
61  * that have gxact->inredo set and are behind the redo_horizon. We
62  * save them to disk and then switch gxact->ondisk to true.
63  * * On COMMIT/ABORT we delete the entry from TwoPhaseState->prepXacts.
64  * If gxact->ondisk is true, the corresponding entry from the disk
65  * is additionally deleted.
66  * * RecoverPreparedTransactions(), StandbyRecoverPreparedTransactions()
67  * and PrescanPreparedTransactions() have been modified to go through
68  * gxact->inredo entries that have not made it to disk.
69  *
70  *-------------------------------------------------------------------------
71  */
72 #include "postgres.h"
73 
74 #include <fcntl.h>
75 #include <sys/stat.h>
76 #include <time.h>
77 #include <unistd.h>
78 
79 #include "access/commit_ts.h"
80 #include "access/htup_details.h"
81 #include "access/subtrans.h"
82 #include "access/transam.h"
83 #include "access/twophase.h"
84 #include "access/twophase_rmgr.h"
85 #include "access/xact.h"
86 #include "access/xlog.h"
87 #include "access/xloginsert.h"
88 #include "access/xlogreader.h"
89 #include "access/xlogrecovery.h"
90 #include "access/xlogutils.h"
91 #include "catalog/pg_type.h"
92 #include "catalog/storage.h"
93 #include "funcapi.h"
94 #include "miscadmin.h"
95 #include "pg_trace.h"
96 #include "pgstat.h"
97 #include "replication/origin.h"
98 #include "replication/syncrep.h"
99 #include "storage/fd.h"
100 #include "storage/ipc.h"
101 #include "storage/md.h"
102 #include "storage/predicate.h"
103 #include "storage/proc.h"
104 #include "storage/procarray.h"
105 #include "utils/builtins.h"
106 #include "utils/memutils.h"
107 #include "utils/timestamp.h"
108 
109 /*
110  * Directory where Two-phase commit files reside within PGDATA
111  */
112 #define TWOPHASE_DIR "pg_twophase"
113 
114 /* GUC variable, can't be changed after startup */
116 
117 /*
118  * This struct describes one global transaction that is in prepared state
119  * or attempting to become prepared.
120  *
121  * The lifecycle of a global transaction is:
122  *
123  * 1. After checking that the requested GID is not in use, set up an entry in
124  * the TwoPhaseState->prepXacts array with the correct GID and valid = false,
125  * and mark it as locked by my backend.
126  *
127  * 2. After successfully completing prepare, set valid = true and enter the
128  * referenced PGPROC into the global ProcArray.
129  *
130  * 3. To begin COMMIT PREPARED or ROLLBACK PREPARED, check that the entry is
131  * valid and not locked, then mark the entry as locked by storing my current
132  * proc number into locking_backend. This prevents concurrent attempts to
133  * commit or rollback the same prepared xact.
134  *
135  * 4. On completion of COMMIT PREPARED or ROLLBACK PREPARED, remove the entry
136  * from the ProcArray and the TwoPhaseState->prepXacts array and return it to
137  * the freelist.
138  *
139  * Note that if the preparing transaction fails between steps 1 and 2, the
140  * entry must be removed so that the GID and the GlobalTransaction struct
141  * can be reused. See AtAbort_Twophase().
142  *
143  * typedef struct GlobalTransactionData *GlobalTransaction appears in
144  * twophase.h
145  */
146 
147 typedef struct GlobalTransactionData
148 {
149  GlobalTransaction next; /* list link for free list */
150  int pgprocno; /* ID of associated dummy PGPROC */
151  TimestampTz prepared_at; /* time of preparation */
152 
153  /*
154  * Note that we need to keep track of two LSNs for each GXACT. We keep
155  * track of the start LSN because this is the address we must use to read
156  * state data back from WAL when committing a prepared GXACT. We keep
157  * track of the end LSN because that is the LSN we need to wait for prior
158  * to commit.
159  */
160  XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */
161  XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */
162  TransactionId xid; /* The GXACT id */
163 
164  Oid owner; /* ID of user that executed the xact */
165  ProcNumber locking_backend; /* backend currently working on the xact */
166  bool valid; /* true if PGPROC entry is in proc array */
167  bool ondisk; /* true if prepare state file is on disk */
168  bool inredo; /* true if entry was added via xlog_redo */
169  char gid[GIDSIZE]; /* The GID assigned to the prepared xact */
171 
172 /*
173  * Two Phase Commit shared state. Access to this struct is protected
174  * by TwoPhaseStateLock.
175  */
176 typedef struct TwoPhaseStateData
177 {
178  /* Head of linked list of free GlobalTransactionData structs */
180 
181  /* Number of valid prepXacts entries. */
183 
184  /* There are max_prepared_xacts items in this array */
187 
189 
190 /*
191  * Global transaction entry currently locked by us, if any. Note that any
192  * access to the entry pointed to by this variable must be protected by
193  * TwoPhaseStateLock, though obviously the pointer itself doesn't need to be
194  * (since it's just local memory).
195  */
197 
198 static bool twophaseExitRegistered = false;
199 
201  int nchildren,
202  TransactionId *children,
203  int nrels,
204  RelFileLocator *rels,
205  int nstats,
206  xl_xact_stats_item *stats,
207  int ninvalmsgs,
208  SharedInvalidationMessage *invalmsgs,
209  bool initfileinval,
210  const char *gid);
212  int nchildren,
213  TransactionId *children,
214  int nrels,
215  RelFileLocator *rels,
216  int nstats,
217  xl_xact_stats_item *stats,
218  const char *gid);
219 static void ProcessRecords(char *bufptr, TransactionId xid,
220  const TwoPhaseCallback callbacks[]);
221 static void RemoveGXact(GlobalTransaction gxact);
222 
223 static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len);
224 static char *ProcessTwoPhaseBuffer(TransactionId xid,
225  XLogRecPtr prepare_start_lsn,
226  bool fromdisk, bool setParent, bool setNextXid);
228  const char *gid, TimestampTz prepared_at, Oid owner,
229  Oid databaseid);
230 static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning);
231 static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
232 
233 /*
234  * Initialization of shared memory
235  */
236 Size
238 {
239  Size size;
240 
241  /* Need the fixed struct, the array of pointers, and the GTD structs */
242  size = offsetof(TwoPhaseStateData, prepXacts);
244  sizeof(GlobalTransaction)));
245  size = MAXALIGN(size);
247  sizeof(GlobalTransactionData)));
248 
249  return size;
250 }
251 
252 void
254 {
255  bool found;
256 
257  TwoPhaseState = ShmemInitStruct("Prepared Transaction Table",
259  &found);
260  if (!IsUnderPostmaster)
261  {
262  GlobalTransaction gxacts;
263  int i;
264 
265  Assert(!found);
266  TwoPhaseState->freeGXacts = NULL;
268 
269  /*
270  * Initialize the linked list of free GlobalTransactionData structs
271  */
272  gxacts = (GlobalTransaction)
273  ((char *) TwoPhaseState +
274  MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) +
276  for (i = 0; i < max_prepared_xacts; i++)
277  {
278  /* insert into linked list */
279  gxacts[i].next = TwoPhaseState->freeGXacts;
280  TwoPhaseState->freeGXacts = &gxacts[i];
281 
282  /* associate it with a PGPROC assigned by InitProcGlobal */
284  }
285  }
286  else
287  Assert(found);
288 }
289 
290 /*
291  * Exit hook to unlock the global transaction entry we're working on.
292  */
293 static void
295 {
296  /* same logic as abort */
298 }
299 
300 /*
301  * Abort hook to unlock the global transaction entry we're working on.
302  */
303 void
305 {
306  if (MyLockedGxact == NULL)
307  return;
308 
309  /*
310  * What to do with the locked global transaction entry? If we were in the
311  * process of preparing the transaction, but haven't written the WAL
312  * record and state file yet, the transaction must not be considered as
313  * prepared. Likewise, if we are in the process of finishing an
314  * already-prepared transaction, and fail after having already written the
315  * 2nd phase commit or rollback record to the WAL, the transaction should
316  * not be considered as prepared anymore. In those cases, just remove the
317  * entry from shared memory.
318  *
319  * Otherwise, the entry must be left in place so that the transaction can
320  * be finished later, so just unlock it.
321  *
322  * If we abort during prepare, after having written the WAL record, we
323  * might not have transferred all locks and other state to the prepared
324  * transaction yet. Likewise, if we abort during commit or rollback,
325  * after having written the WAL record, we might not have released all the
326  * resources held by the transaction yet. In those cases, the in-memory
327  * state can be wrong, but it's too late to back out.
328  */
329  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
330  if (!MyLockedGxact->valid)
332  else
334  LWLockRelease(TwoPhaseStateLock);
335 
336  MyLockedGxact = NULL;
337 }
338 
339 /*
340  * This is called after we have finished transferring state to the prepared
341  * PGPROC entry.
342  */
343 void
345 {
/*
 * NOTE(review): the function name line and the statement executed while
 * TwoPhaseStateLock is held are absent from this listing.  Presumably the
 * missing statement releases this backend's claim on MyLockedGxact --
 * confirm against the original source.
 */
346  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
348  LWLockRelease(TwoPhaseStateLock);
349 
/* This backend no longer has a global transaction entry locked. */
350  MyLockedGxact = NULL;
351 }
352 
353 
354 /*
355  * MarkAsPreparing
356  * Reserve the GID for the given transaction.
357  */
359 MarkAsPreparing(TransactionId xid, const char *gid,
360  TimestampTz prepared_at, Oid owner, Oid databaseid)
361 {
362  GlobalTransaction gxact;
363  int i;
364 
365  if (strlen(gid) >= GIDSIZE)
366  ereport(ERROR,
367  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
368  errmsg("transaction identifier \"%s\" is too long",
369  gid)));
370 
371  /* fail immediately if feature is disabled */
372  if (max_prepared_xacts == 0)
373  ereport(ERROR,
374  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
375  errmsg("prepared transactions are disabled"),
376  errhint("Set \"max_prepared_transactions\" to a nonzero value.")));
377 
378  /* on first call, register the exit hook */
380  {
382  twophaseExitRegistered = true;
383  }
384 
385  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
386 
387  /* Check for conflicting GID */
388  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
389  {
390  gxact = TwoPhaseState->prepXacts[i];
391  if (strcmp(gxact->gid, gid) == 0)
392  {
393  ereport(ERROR,
395  errmsg("transaction identifier \"%s\" is already in use",
396  gid)));
397  }
398  }
399 
400  /* Get a free gxact from the freelist */
401  if (TwoPhaseState->freeGXacts == NULL)
402  ereport(ERROR,
403  (errcode(ERRCODE_OUT_OF_MEMORY),
404  errmsg("maximum number of prepared transactions reached"),
405  errhint("Increase \"max_prepared_transactions\" (currently %d).",
407  gxact = TwoPhaseState->freeGXacts;
408  TwoPhaseState->freeGXacts = gxact->next;
409 
410  MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid);
411 
412  gxact->ondisk = false;
413 
414  /* And insert it into the active array */
417 
418  LWLockRelease(TwoPhaseStateLock);
419 
420  return gxact;
421 }
422 
423 /*
424  * MarkAsPreparingGuts
425  *
426  * This uses a gxact struct and puts it into the active array.
427  * NOTE: this is also used when reloading a gxact after a crash; so avoid
428  * assuming that we can use very much backend context.
429  *
430  * Note: This function should be called with appropriate locks held.
431  */
432 static void
434  TimestampTz prepared_at, Oid owner, Oid databaseid)
435 {
436  PGPROC *proc;
437  int i;
438 
439  Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
440 
441  Assert(gxact != NULL);
442  proc = GetPGProcByNumber(gxact->pgprocno);
443 
444  /* Initialize the PGPROC entry */
445  MemSet(proc, 0, sizeof(PGPROC));
446  dlist_node_init(&proc->links);
449  {
450  /* clone VXID, for TwoPhaseGetXidByVirtualXID() to find */
451  proc->vxid.lxid = MyProc->vxid.lxid;
452  proc->vxid.procNumber = MyProcNumber;
453  }
454  else
455  {
457  /* GetLockConflicts() uses this to specify a wait on the XID */
458  proc->vxid.lxid = xid;
460  }
461  proc->xid = xid;
462  Assert(proc->xmin == InvalidTransactionId);
463  proc->delayChkptFlags = 0;
464  proc->statusFlags = 0;
465  proc->pid = 0;
466  proc->databaseId = databaseid;
467  proc->roleId = owner;
468  proc->tempNamespaceId = InvalidOid;
469  proc->isBackgroundWorker = false;
471  proc->lwWaitMode = 0;
472  proc->waitLock = NULL;
473  proc->waitProcLock = NULL;
474  pg_atomic_init_u64(&proc->waitStart, 0);
475  for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
476  dlist_init(&proc->myProcLocks[i]);
477  /* subxid data must be filled later by GXactLoadSubxactData */
478  proc->subxidStatus.overflowed = false;
479  proc->subxidStatus.count = 0;
480 
481  gxact->prepared_at = prepared_at;
482  gxact->xid = xid;
483  gxact->owner = owner;
484  gxact->locking_backend = MyProcNumber;
485  gxact->valid = false;
486  gxact->inredo = false;
487  strcpy(gxact->gid, gid);
488 
489  /*
490  * Remember that we have this GlobalTransaction entry locked for us. If we
491  * abort after this, we must release it.
492  */
493  MyLockedGxact = gxact;
494 }
495 
496 /*
497  * GXactLoadSubxactData
498  *
499  * If the transaction being persisted had any subtransactions, this must
500  * be called before MarkAsPrepared() to load information into the dummy
501  * PGPROC.
502  */
503 static void
505  TransactionId *children)
506 {
/*
 * NOTE(review): the first line of the signature (function name and the
 * leading parameters) is absent from this listing; the body uses "gxact"
 * and "nsubxacts", which are presumably declared there.
 *
 * Copies up to PGPROC_MAX_CACHED_SUBXIDS entries of children[] into the
 * PGPROC associated with gxact and records how many were stored.
 */
507  PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
508 
509  /* We need no extra lock since the GXACT isn't valid yet */
510  if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS)
511  {
/* more children than fit: flag the overflow and keep only the first ones */
512  proc->subxidStatus.overflowed = true;
513  nsubxacts = PGPROC_MAX_CACHED_SUBXIDS;
514  }
515  if (nsubxacts > 0)
516  {
517  memcpy(proc->subxids.xids, children,
518  nsubxacts * sizeof(TransactionId));
519  proc->subxidStatus.count = nsubxacts;
520  }
521 }
522 
523 /*
524  * MarkAsPrepared
525  * Mark the GXACT as fully valid, and enter it into the global ProcArray.
526  *
527  * lock_held indicates whether caller already holds TwoPhaseStateLock.
528  */
529 static void
530 MarkAsPrepared(GlobalTransaction gxact, bool lock_held)
531 {
532  /* Lock here may be overkill, but I'm not convinced of that ... */
/* The lock is taken and released here only when the caller does not hold it. */
533  if (!lock_held)
534  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
/* The entry must not already be marked valid. */
535  Assert(!gxact->valid);
536  gxact->valid = true;
537  if (!lock_held)
538  LWLockRelease(TwoPhaseStateLock);
539 
540  /*
541  * Put it into the global ProcArray so TransactionIdIsInProgress considers
542  * the XID as still running.
543  */
/*
 * NOTE(review): the statement that performs the ProcArray insertion
 * described above is absent from this listing -- confirm against the
 * original source.
 */
545 }
546 
547 /*
548  * LockGXact
549  * Locate the prepared transaction and mark it busy for COMMIT or ROLLBACK.
 *
 * gid is matched with strcmp() against the gid of each valid entry in
 * TwoPhaseState->prepXacts.  user must equal the entry's owner unless
 * superuser_arg(user) is true.
 *
 * On success the entry's locking_backend is set to MyProcNumber, the entry
 * is remembered in MyLockedGxact, and the entry is returned.  Raises ERROR
 * if the entry is already locked, if the permission check fails, if the
 * entry belongs to a database other than MyDatabaseId, or if no valid entry
 * has the given gid.
550  */
551 static GlobalTransaction
552 LockGXact(const char *gid, Oid user)
553 {
554  int i;
555 
556  /* on first call, register the exit hook */
/*
 * NOTE(review): the condition guarding this block and the hook registration
 * call inside it are absent from this listing; presumably the condition
 * tests twophaseExitRegistered -- confirm against the original source.
 */
558  {
560  twophaseExitRegistered = true;
561  }
562 
563  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
564 
565  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
566  {
/*
 * NOTE(review): the declaration of "gxact" is absent from this listing;
 * presumably it is TwoPhaseState->prepXacts[i] -- confirm.
 */
568  PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
569 
570  /* Ignore not-yet-valid GIDs */
571  if (!gxact->valid)
572  continue;
573  if (strcmp(gxact->gid, gid) != 0)
574  continue;
575 
/*
 * NOTE(review): the ereport(ERROR) calls below are issued while
 * TwoPhaseStateLock is still held; presumably error cleanup releases it --
 * confirm.
 */
576  /* Found it, but has someone else got it locked? */
577  if (gxact->locking_backend != INVALID_PROC_NUMBER)
578  ereport(ERROR,
579  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
580  errmsg("prepared transaction with identifier \"%s\" is busy",
581  gid)));
582 
/* Only the owner of the prepared transaction or a superuser may finish it. */
583  if (user != gxact->owner && !superuser_arg(user))
584  ereport(ERROR,
585  (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
586  errmsg("permission denied to finish prepared transaction"),
587  errhint("Must be superuser or the user that prepared the transaction.")));
588 
589  /*
590  * Note: it probably would be possible to allow committing from
591  * another database; but at the moment NOTIFY is known not to work and
592  * there may be some other issues as well. Hence disallow until
593  * someone gets motivated to make it work.
594  */
595  if (MyDatabaseId != proc->databaseId)
596  ereport(ERROR,
597  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
598  errmsg("prepared transaction belongs to another database"),
599  errhint("Connect to the database where the transaction was prepared to finish it.")));
600 
601  /* OK for me to lock it */
602  gxact->locking_backend = MyProcNumber;
603  MyLockedGxact = gxact;
604 
605  LWLockRelease(TwoPhaseStateLock);
606 
607  return gxact;
608  }
609 
/* No valid entry carries the requested GID. */
610  LWLockRelease(TwoPhaseStateLock);
611 
612  ereport(ERROR,
613  (errcode(ERRCODE_UNDEFINED_OBJECT),
614  errmsg("prepared transaction with identifier \"%s\" does not exist",
615  gid)));
616 
617  /* NOTREACHED */
618  return NULL;
619 }
620 
621 /*
622  * RemoveGXact
623  * Remove the prepared transaction from the shared memory array.
624  *
625  * NB: caller should have already removed it from ProcArray
626  */
627 static void
629 {
/*
 * NOTE(review): the function name line is absent from this listing; the body
 * operates on a variable "gxact" that is presumably the sole parameter.
 *
 * Searches TwoPhaseState->prepXacts for gxact, returns it to the free list
 * when found, and raises ERROR when it is not present in the array.
 */
630  int i;
631 
/* Caller must already hold TwoPhaseStateLock in exclusive mode. */
632  Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
633 
634  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
635  {
636  if (gxact == TwoPhaseState->prepXacts[i])
637  {
638  /* remove from the active array */
/*
 * NOTE(review): the statements that actually remove the entry from the
 * active array are absent from this listing -- confirm against the original
 * source.
 */
641 
642  /* and put it back in the freelist */
643  gxact->next = TwoPhaseState->freeGXacts;
644  TwoPhaseState->freeGXacts = gxact;
645 
646  return;
647  }
648  }
649 
650  elog(ERROR, "failed to find %p in GlobalTransaction array", gxact);
651 }
652 
653 /*
654  * Returns an array of all prepared transactions for the user-level
655  * function pg_prepared_xact.
656  *
657  * The returned array and all its elements are copies of internal data
658  * structures, to minimize the time we need to hold the TwoPhaseStateLock.
659  *
660  * WARNING -- we return even those transactions that are not fully prepared
661  * yet. The caller should filter them out if he doesn't want them.
662  *
663  * The returned array is palloc'd.
664  */
665 static int
667 {
668  GlobalTransaction array;
669  int num;
670  int i;
671 
672  LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
673 
674  if (TwoPhaseState->numPrepXacts == 0)
675  {
676  LWLockRelease(TwoPhaseStateLock);
677 
678  *gxacts = NULL;
679  return 0;
680  }
681 
683  array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num);
684  *gxacts = array;
685  for (i = 0; i < num; i++)
686  memcpy(array + i, TwoPhaseState->prepXacts[i],
687  sizeof(GlobalTransactionData));
688 
689  LWLockRelease(TwoPhaseStateLock);
690 
691  return num;
692 }
693 
694 
695 /* Working status for pg_prepared_xact */
696 typedef struct
697 {
699  int ngxacts;
700  int currIdx;
701 } Working_State;
702 
703 /*
704  * pg_prepared_xact
705  * Produce a view with one row per prepared transaction.
706  *
707  * This function is here so we don't have to export the
708  * GlobalTransactionData struct definition.
709  */
710 Datum
712 {
713  FuncCallContext *funcctx;
714  Working_State *status;
715 
716  if (SRF_IS_FIRSTCALL())
717  {
718  TupleDesc tupdesc;
719  MemoryContext oldcontext;
720 
721  /* create a function context for cross-call persistence */
722  funcctx = SRF_FIRSTCALL_INIT();
723 
724  /*
725  * Switch to memory context appropriate for multiple function calls
726  */
727  oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
728 
729  /* build tupdesc for result tuples */
730  /* this had better match pg_prepared_xacts view in system_views.sql */
731  tupdesc = CreateTemplateTupleDesc(5);
732  TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction",
733  XIDOID, -1, 0);
734  TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid",
735  TEXTOID, -1, 0);
736  TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared",
737  TIMESTAMPTZOID, -1, 0);
738  TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid",
739  OIDOID, -1, 0);
740  TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid",
741  OIDOID, -1, 0);
742 
743  funcctx->tuple_desc = BlessTupleDesc(tupdesc);
744 
745  /*
746  * Collect all the 2PC status information that we will format and send
747  * out as a result set.
748  */
749  status = (Working_State *) palloc(sizeof(Working_State));
750  funcctx->user_fctx = (void *) status;
751 
752  status->ngxacts = GetPreparedTransactionList(&status->array);
753  status->currIdx = 0;
754 
755  MemoryContextSwitchTo(oldcontext);
756  }
757 
758  funcctx = SRF_PERCALL_SETUP();
759  status = (Working_State *) funcctx->user_fctx;
760 
761  while (status->array != NULL && status->currIdx < status->ngxacts)
762  {
763  GlobalTransaction gxact = &status->array[status->currIdx++];
764  PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
765  Datum values[5] = {0};
766  bool nulls[5] = {0};
767  HeapTuple tuple;
768  Datum result;
769 
770  if (!gxact->valid)
771  continue;
772 
773  /*
774  * Form tuple with appropriate data.
775  */
776 
777  values[0] = TransactionIdGetDatum(proc->xid);
778  values[1] = CStringGetTextDatum(gxact->gid);
780  values[3] = ObjectIdGetDatum(gxact->owner);
781  values[4] = ObjectIdGetDatum(proc->databaseId);
782 
783  tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
784  result = HeapTupleGetDatum(tuple);
785  SRF_RETURN_NEXT(funcctx, result);
786  }
787 
788  SRF_RETURN_DONE(funcctx);
789 }
790 
791 /*
792  * TwoPhaseGetGXact
793  * Get the GlobalTransaction struct for a prepared transaction
794  * specified by XID
795  *
796  * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
797  * caller had better hold it.
798  */
799 static GlobalTransaction
800 TwoPhaseGetGXact(TransactionId xid, bool lock_held)
801 {
802  GlobalTransaction result = NULL;
803  int i;
804 
805  static TransactionId cached_xid = InvalidTransactionId;
806  static GlobalTransaction cached_gxact = NULL;
807 
808  Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock));
809 
810  /*
811  * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called
812  * repeatedly for the same XID. We can save work with a simple cache.
813  */
814  if (xid == cached_xid)
815  return cached_gxact;
816 
817  if (!lock_held)
818  LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
819 
820  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
821  {
823 
824  if (gxact->xid == xid)
825  {
826  result = gxact;
827  break;
828  }
829  }
830 
831  if (!lock_held)
832  LWLockRelease(TwoPhaseStateLock);
833 
834  if (result == NULL) /* should not happen */
835  elog(ERROR, "failed to find GlobalTransaction for xid %u", xid);
836 
837  cached_xid = xid;
838  cached_gxact = result;
839 
840  return result;
841 }
842 
843 /*
844  * TwoPhaseGetXidByVirtualXID
845  * Lookup VXID among xacts prepared since last startup.
846  *
847  * (This won't find recovered xacts.) If more than one matches, return any
848  * and set "have_more" to true. To witness multiple matches, a single
849  * proc number must consume 2^32 LXIDs, with no intervening database restart.
850  */
853  bool *have_more)
854 {
/*
 * NOTE(review): the return type, function name and leading parameter(s) are
 * absent from this listing, as are the declarations of "result" and "gxact"
 * used below.  Presumably "vxid" is a VirtualTransactionId parameter and
 * "result" starts out as InvalidTransactionId -- confirm against the
 * original source.
 *
 * NOTE(review): in the visible lines *have_more is only ever set to true;
 * presumably the caller initializes it to false -- confirm.
 */
855  int i;
857 
859  LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
860 
861  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
862  {
864  PGPROC *proc;
865  VirtualTransactionId proc_vxid;
866 
/* Entries that are not yet valid are skipped. */
867  if (!gxact->valid)
868  continue;
869  proc = GetPGProcByNumber(gxact->pgprocno);
870  GET_VXID_FROM_PGPROC(proc_vxid, *proc);
871  if (VirtualTransactionIdEquals(vxid, proc_vxid))
872  {
873  /*
874  * Startup process sets proc->vxid.procNumber to
875  * INVALID_PROC_NUMBER.
876  */
877  Assert(!gxact->inredo);
878 
/* A second match: report it through have_more and stop scanning. */
879  if (result != InvalidTransactionId)
880  {
881  *have_more = true;
882  break;
883  }
884  result = gxact->xid;
885  }
886  }
887 
888  LWLockRelease(TwoPhaseStateLock);
889 
890  return result;
891 }
892 
893 /*
894  * TwoPhaseGetDummyProcNumber
895  * Get the dummy proc number for prepared transaction specified by XID
896  *
897  * Dummy proc numbers are similar to proc numbers of real backends. They
898  * start at MaxBackends, and are unique across all currently active real
899  * backends and prepared transactions. If lock_held is set to true,
900  * TwoPhaseStateLock will not be taken, so the caller had better hold it.
901  */
904 {
/*
 * NOTE(review): the return type and declarator of this function are absent
 * from this listing; the body uses "xid" and "lock_held", which are
 * presumably its parameters -- confirm against the original source.
 *
 * Looks up the entry via TwoPhaseGetGXact() and returns its pgprocno.
 */
905  GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
906 
907  return gxact->pgprocno;
908 }
909 
910 /*
911  * TwoPhaseGetDummyProc
912  * Get the PGPROC that represents a prepared transaction specified by XID
913  *
914  * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
915  * caller had better hold it.
916  */
917 PGPROC *
919 {
920  GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
921 
922  return GetPGProcByNumber(gxact->pgprocno);
923 }
924 
925 /************************************************************************/
926 /* State file support */
927 /************************************************************************/
928 
929 /*
930  * Compute the FullTransactionId for the given TransactionId.
931  *
932  * The wrap logic is safe here because the span of active xids cannot exceed one
933  * epoch at any given time.
934  */
935 static inline FullTransactionId
937 {
/*
 * NOTE(review): the function name and parameter list are absent from this
 * listing; the body uses "xid", presumably a TransactionId parameter.
 */
938  FullTransactionId nextFullXid;
939  TransactionId nextXid;
940  uint32 epoch;
941 
943 
/* Read the next-XID counter under XidGenLock in shared mode. */
944  LWLockAcquire(XidGenLock, LW_SHARED);
945  nextFullXid = TransamVariables->nextXid;
946  LWLockRelease(XidGenLock);
947 
/* Split the counter into its 32-bit XID part and its epoch part. */
948  nextXid = XidFromFullTransactionId(nextFullXid);
949  epoch = EpochFromFullTransactionId(nextFullXid);
950  if (unlikely(xid > nextXid))
951  {
952  /* Wraparound occurred, must be from a prev epoch. */
953  Assert(epoch > 0);
954  epoch--;
955  }
956 
/*
 * NOTE(review): the return statement is absent from this listing; presumably
 * it combines "epoch" and "xid" into a FullTransactionId -- confirm against
 * the original source.
 */
958 }
959 
960 static inline int
962 {
964 
965  return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X",
968 }
969 
970 /*
971  * 2PC state file format:
972  *
973  * 1. TwoPhaseFileHeader
974  * 2. TransactionId[] (subtransactions)
975  * 3. RelFileLocator[] (files to be deleted at commit)
976  * 4. RelFileLocator[] (files to be deleted at abort)
977  * 5. SharedInvalidationMessage[] (inval messages to be sent at commit)
978  * 6. TwoPhaseRecordOnDisk
979  * 7. ...
980  * 8. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID)
981  * 9. checksum (CRC-32C)
982  *
983  * Each segment except the final checksum is MAXALIGN'd.
984  */
985 
986 /*
987  * Header for a 2PC state file
988  */
989 #define TWOPHASE_MAGIC 0x57F94534 /* format identifier */
990 
992 
993 /*
994  * Header for each record in a state file
995  *
996  * NOTE: len counts only the rmgr data, not the TwoPhaseRecordOnDisk header.
997  * The rmgr data will be stored starting on a MAXALIGN boundary.
998  */
999 typedef struct TwoPhaseRecordOnDisk
1000 {
1001  uint32 len; /* length of rmgr data */
1002  TwoPhaseRmgrId rmid; /* resource manager for this record */
1003  uint16 info; /* flag bits for use by rmgr */
1005 
1006 /*
1007  * During prepare, the state file is assembled in memory before writing it
1008  * to WAL and the actual state file. We use a chain of StateFileChunk blocks
1009  * for that.
1010  */
1011 typedef struct StateFileChunk
1012 {
1013  char *data;
1017 
1018 static struct xllist
1019 {
1020  StateFileChunk *head; /* first data block in the chain */
1021  StateFileChunk *tail; /* last block in chain */
1023  uint32 bytes_free; /* free bytes left in tail block */
1024  uint32 total_len; /* total data bytes in chain */
1026 
1027 
1028 /*
1029  * Append a block of data to records data structure.
1030  *
1031  * NB: each block is padded to a MAXALIGN multiple. This must be
1032  * accounted for when the file is later read!
1033  *
1034  * The data is copied, so the caller is free to modify it afterwards.
1035  */
1036 static void
1038 {
/*
 * NOTE(review): the function name and parameter list are absent from this
 * listing; the body uses "data" and "len", presumably its parameters.
 */
/* Space is accounted for in MAXALIGN'd units. */
1039  uint32 padlen = MAXALIGN(len);
1040 
1041  if (padlen > records.bytes_free)
1042  {
/* Not enough room left in the tail chunk: chain a new one. */
1043  records.tail->next = palloc0(sizeof(StateFileChunk));
/*
 * NOTE(review): the statement advancing records.tail to the new chunk is
 * absent from this listing -- confirm against the original source.
 */
1045  records.tail->len = 0;
1046  records.tail->next = NULL;
1047  records.num_chunks++;
1048 
/* The new chunk provides at least 512 bytes, or more if this item needs it. */
1049  records.bytes_free = Max(padlen, 512);
/*
 * NOTE(review): the allocation of the new chunk's data buffer is absent from
 * this listing -- confirm against the original source.
 */
1051  }
1052 
/*
 * Only len bytes are copied, but the bookkeeping below advances by padlen,
 * so any padding bytes are not written by this function.
 */
1053  memcpy(((char *) records.tail->data) + records.tail->len, data, len);
1054  records.tail->len += padlen;
1055  records.bytes_free -= padlen;
1056  records.total_len += padlen;
1057 }
1058 
1059 /*
1060  * Start preparing a state file.
1061  *
1062  * Initializes data structure and inserts the 2PC file header record.
1063  */
1064 void
1066 {
      /*
       * NOTE(review): this listing skips inner lines 1065, 1083, 1085, 1104
       * and 1108. The signature (which must supply gxact), two statements of
       * the list initialization, the start of the call that fills invalmsgs
       * and hdr.initfileinval (presumably also hdr.ninvalmsgs), and one
       * origin-related header assignment are therefore not visible --
       * confirm against the original file.
       */
1067  PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
1068  TransactionId xid = gxact->xid;
1069  TwoPhaseFileHeader hdr;
1070  TransactionId *children;
1071  RelFileLocator *commitrels;
1072  RelFileLocator *abortrels;
1073  xl_xact_stats_item *abortstats = NULL;
1074  xl_xact_stats_item *commitstats = NULL;
1075  SharedInvalidationMessage *invalmsgs;
1076 
1077  /* Initialize linked list */
1078  records.head = palloc0(sizeof(StateFileChunk));
1079  records.head->len = 0;
1080  records.head->next = NULL;
1081 
      /* The first chunk must be able to hold the whole file header. */
1082  records.bytes_free = Max(sizeof(TwoPhaseFileHeader), 512);
1084 
1086  records.num_chunks = 1;
1087 
1088  records.total_len = 0;
1089 
1090  /* Create header */
1091  hdr.magic = TWOPHASE_MAGIC;
1092  hdr.total_len = 0; /* EndPrepare will fill this in */
1093  hdr.xid = xid;
1094  hdr.database = proc->databaseId;
1095  hdr.prepared_at = gxact->prepared_at;
1096  hdr.owner = gxact->owner;
      /* Each getter returns a count and hands back its array via the pointer. */
1097  hdr.nsubxacts = xactGetCommittedChildren(&children);
1098  hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels);
1099  hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels);
1100  hdr.ncommitstats =
1101  pgstat_get_transactional_drops(true, &commitstats);
1102  hdr.nabortstats =
1103  pgstat_get_transactional_drops(false, &abortstats);
1105  &hdr.initfileinval);
1106  hdr.gidlen = strlen(gxact->gid) + 1; /* Include '\0' */
1107  /* EndPrepare will fill the origin data, if necessary */
1109  hdr.origin_timestamp = 0;
1110 
      /* The header goes first, immediately followed by the NUL-terminated GID. */
1111  save_state_data(&hdr, sizeof(TwoPhaseFileHeader));
1112  save_state_data(gxact->gid, hdr.gidlen);
1113 
1114  /*
1115  * Add the additional info about subxacts, deletable files and cache
1116  * invalidation messages.
      *
      * Every array below is pfree'd right after it has been saved, except
      * children, which is not freed in this function.
1117  */
1118  if (hdr.nsubxacts > 0)
1119  {
1120  save_state_data(children, hdr.nsubxacts * sizeof(TransactionId));
1121  /* While we have the child-xact data, stuff it in the gxact too */
1122  GXactLoadSubxactData(gxact, hdr.nsubxacts, children);
1123  }
1124  if (hdr.ncommitrels > 0)
1125  {
1126  save_state_data(commitrels, hdr.ncommitrels * sizeof(RelFileLocator));
1127  pfree(commitrels);
1128  }
1129  if (hdr.nabortrels > 0)
1130  {
1131  save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileLocator));
1132  pfree(abortrels);
1133  }
1134  if (hdr.ncommitstats > 0)
1135  {
1136  save_state_data(commitstats,
1137  hdr.ncommitstats * sizeof(xl_xact_stats_item));
1138  pfree(commitstats);
1139  }
1140  if (hdr.nabortstats > 0)
1141  {
1142  save_state_data(abortstats,
1143  hdr.nabortstats * sizeof(xl_xact_stats_item));
1144  pfree(abortstats);
1145  }
1146  if (hdr.ninvalmsgs > 0)
1147  {
1148  save_state_data(invalmsgs,
1149  hdr.ninvalmsgs * sizeof(SharedInvalidationMessage));
1150  pfree(invalmsgs);
1151  }
1152 }
1153 
1154 /*
1155  * Finish preparing state data and writing it to WAL.
1156  */
1157 void
1159 {
      /*
       * NOTE(review): this listing skips inner lines 1158, 1165, 1174,
       * 1178-1179, 1205, 1207, 1209-1210, 1216, 1223, 1232 and 1253. Not
       * visible here: the signature (which must supply gxact), the call that
       * appends the end sentinel, the second half of the replorigin test and
       * the body of the first "if (replorigin)", the statements that
       * presumably enter the critical section and set the checkpoint-delay
       * flag, one statement before XLogInsert, the origin-advance call, the
       * statement storing the record's start location, and the statement that
       * clears the delay flag. Confirm all of these against the original file
       * before relying on this block.
       */
1160  TwoPhaseFileHeader *hdr;
1161  StateFileChunk *record;
1162  bool replorigin;
1163 
1164  /* Add the end sentinel to the list of 2PC records */
1166  NULL, 0);
1167 
1168  /* Go back and fill in total_len in the file header record */
      /* The header was the first thing saved, so it sits at the start of the head chunk. */
1169  hdr = (TwoPhaseFileHeader *) records.head->data;
1170  Assert(hdr->magic == TWOPHASE_MAGIC);
      /* The stored length also counts the CRC that follows the data in a state file. */
1171  hdr->total_len = records.total_len + sizeof(pg_crc32c);
1172 
1173  replorigin = (replorigin_session_origin != InvalidRepOriginId &&
1175 
1176  if (replorigin)
1177  {
1180  }
1181 
1182  /*
1183  * If the data size exceeds MaxAllocSize, we won't be able to read it in
1184  * ReadTwoPhaseFile. Check for that now, rather than fail in the case
1185  * where we write data to file and then re-read at commit time.
1186  */
1187  if (hdr->total_len > MaxAllocSize)
1188  ereport(ERROR,
1189  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1190  errmsg("two-phase state file maximum length exceeded")));
1191 
1192  /*
1193  * Now writing 2PC state data to WAL. We let the WAL's CRC protection
1194  * cover us, so no need to calculate a separate CRC.
1195  *
1196  * We have to set DELAY_CHKPT_START here, too; otherwise a checkpoint
1197  * starting immediately after the WAL record is inserted could complete
1198  * without fsync'ing our state file. (This is essentially the same kind
1199  * of race condition as the COMMIT-to-clog-write case that
1200  * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.)
1201  *
1202  * We save the PREPARE record's location in the gxact for later use by
1203  * CheckPointTwoPhase.
1204  */
1206 
1208 
1211 
      /* Register every chunk of the list, in order, as data of a single WAL record. */
1212  XLogBeginInsert();
1213  for (record = records.head; record != NULL; record = record->next)
1214  XLogRegisterData(record->data, record->len);
1215 
1217 
1218  gxact->prepare_end_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE);
1219 
1220  if (replorigin)
1221  {
1222  /* Move LSNs forward for this replication origin */
1224  gxact->prepare_end_lsn);
1225  }
1226 
1227  XLogFlush(gxact->prepare_end_lsn);
1228 
1229  /* If we crash now, we have prepared: WAL replay will fix things */
1230 
1231  /* Store record's start location to read that later on Commit */
1233 
1234  /*
1235  * Mark the prepared transaction as valid. As soon as xact.c marks MyProc
1236  * as not running our XID (which it will do immediately after this
1237  * function returns), others can commit/rollback the xact.
1238  *
1239  * NB: a side effect of this is to make a dummy ProcArray entry for the
1240  * prepared XID. This must happen before we clear the XID from MyProc /
1241  * ProcGlobal->xids[], else there is a window where the XID is not running
1242  * according to TransactionIdIsInProgress, and onlookers would be entitled
1243  * to assume the xact crashed. Instead we have a window where the same
1244  * XID appears twice in ProcArray, which is OK.
1245  */
1246  MarkAsPrepared(gxact, false);
1247 
1248  /*
1249  * Now we can mark ourselves as out of the commit critical section: a
1250  * checkpoint starting after this will certainly see the gxact as a
1251  * candidate for fsyncing.
1252  */
1254 
1255  /*
1256  * Remember that we have this GlobalTransaction entry locked for us. If
1257  * we crash after this point, it's too late to abort, but we must unlock
1258  * it so that the prepared transaction can be committed or rolled back.
1259  */
1260  MyLockedGxact = gxact;
1261 
1262  END_CRIT_SECTION();
1263 
1264  /*
1265  * Wait for synchronous replication, if required.
1266  *
1267  * Note that at this stage we have marked the prepare, but still show as
1268  * running in the procarray (twice!) and continue to hold locks.
1269  */
1270  SyncRepWaitForLSN(gxact->prepare_end_lsn, false);
1271 
      /* Reset the chunk list; the chunks themselves are not pfree'd here. */
1272  records.tail = records.head = NULL;
1273  records.num_chunks = 0;
1274 }
1275 
1276 /*
1277  * Register a 2PC record to be written to state file.
1278  */
1279 void
1281  const void *data, uint32 len)
1282 {
      /*
       * NOTE(review): this listing skips inner lines 1280 and 1290: the
       * function name with its leading parameters (rmid and info, judging by
       * the assignments below) and the statement guarded by "if (len > 0)",
       * presumably the save_state_data() call for the payload -- confirm
       * against the original file.
       */
1283  TwoPhaseRecordOnDisk record;
1284 
1285  record.rmid = rmid;
1286  record.info = info;
1287  record.len = len;
      /* The fixed-size record header is always stored, even when there is no payload. */
1288  save_state_data(&record, sizeof(TwoPhaseRecordOnDisk));
1289  if (len > 0)
1291 }
1292 
1293 
1294 /*
1295  * Read and validate the state file for xid.
1296  *
1297  * If it looks OK (has a valid magic number and CRC), return the palloc'd
1298  * contents of the file, issuing an error when finding corrupted data. If
1299  * missing_ok is true, which indicates that missing files can be safely
1300  * ignored, then return NULL. This state can be reached when doing recovery.
1301  */
1302 static char *
1303 ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
1304 {
      /*
       * NOTE(review): this listing skips inner lines 1324, 1336, 1342, 1344,
       * 1353, 1368, 1376, 1380, 1386, 1392 and 1404. Most of them look like
       * the first argument line of an ereport() call; 1342 is presumably the
       * upper-bound half of the size check and 1376 a statement after the
       * read (likely the matching wait-end report) -- confirm against the
       * original file.
       */
1305  char path[MAXPGPATH];
1306  char *buf;
1307  TwoPhaseFileHeader *hdr;
1308  int fd;
1309  struct stat stat;
1310  uint32 crc_offset;
1311  pg_crc32c calc_crc,
1312  file_crc;
1313  int r;
1314 
1315  TwoPhaseFilePath(path, xid);
1316 
1317  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
1318  if (fd < 0)
1319  {
      /* Only a missing file is tolerated, and only when the caller allows it. */
1320  if (missing_ok && errno == ENOENT)
1321  return NULL;
1322 
1323  ereport(ERROR,
1325  errmsg("could not open file \"%s\": %m", path)));
1326  }
1327 
1328  /*
1329  * Check file length. We can determine a lower bound pretty easily. We
1330  * set an upper bound to avoid palloc() failure on a corrupt file, though
1331  * we can't guarantee that we won't get an out of memory error anyway,
1332  * even on a valid file.
1333  */
1334  if (fstat(fd, &stat))
1335  ereport(ERROR,
1337  errmsg("could not stat file \"%s\": %m", path)));
1338 
      /* Smallest valid file: aligned header + one aligned record header + CRC. */
1339  if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
1340  MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
1341  sizeof(pg_crc32c)) ||
1343  ereport(ERROR,
1345  errmsg_plural("incorrect size of file \"%s\": %lld byte",
1346  "incorrect size of file \"%s\": %lld bytes",
1347  (long long int) stat.st_size, path,
1348  (long long int) stat.st_size)));
1349 
      /* The CRC occupies the last bytes of the file and must start MAXALIGN'd. */
1350  crc_offset = stat.st_size - sizeof(pg_crc32c);
1351  if (crc_offset != MAXALIGN(crc_offset))
1352  ereport(ERROR,
1354  errmsg("incorrect alignment of CRC offset for file \"%s\"",
1355  path)));
1356 
1357  /*
1358  * OK, slurp in the file.
1359  */
1360  buf = (char *) palloc(stat.st_size);
1361 
1362  pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_READ);
1363  r = read(fd, buf, stat.st_size);
1364  if (r != stat.st_size)
1365  {
      /* Distinguish a failed read (errno is meaningful) from a short read. */
1366  if (r < 0)
1367  ereport(ERROR,
1369  errmsg("could not read file \"%s\": %m", path)));
1370  else
1371  ereport(ERROR,
1372  (errmsg("could not read file \"%s\": read %d of %lld",
1373  path, r, (long long int) stat.st_size)));
1374  }
1375 
1377 
1378  if (CloseTransientFile(fd) != 0)
1379  ereport(ERROR,
1381  errmsg("could not close file \"%s\": %m", path)));
1382 
      /* Validate the header: magic number first, then the recorded total length. */
1383  hdr = (TwoPhaseFileHeader *) buf;
1384  if (hdr->magic != TWOPHASE_MAGIC)
1385  ereport(ERROR,
1387  errmsg("invalid magic number stored in file \"%s\"",
1388  path)));
1389 
1390  if (hdr->total_len != stat.st_size)
1391  ereport(ERROR,
1393  errmsg("invalid size stored in file \"%s\"",
1394  path)));
1395 
      /* The CRC covers everything in the file that precedes the stored CRC. */
1396  INIT_CRC32C(calc_crc);
1397  COMP_CRC32C(calc_crc, buf, crc_offset);
1398  FIN_CRC32C(calc_crc);
1399 
1400  file_crc = *((pg_crc32c *) (buf + crc_offset));
1401 
1402  if (!EQ_CRC32C(calc_crc, file_crc))
1403  ereport(ERROR,
1405  errmsg("calculated CRC checksum does not match value stored in file \"%s\"",
1406  path)));
1407 
      /* Ownership of the palloc'd buffer passes to the caller. */
1408  return buf;
1409 }
1410 
1411 
1412 /*
1413  * Reads 2PC data from xlog. During checkpoint this data will be moved to
1414  * twophase files and ReadTwoPhaseFile should be used instead.
1415  *
1416  * Note clearly that this function can access WAL during normal operation,
1417  * similarly to the way WALSender or Logical Decoding would do.
1418  */
1419 static void
1421 {
      /*
       * NOTE(review): this listing skips inner lines 1420, 1423, 1426, 1444,
       * 1449, 1455, 1457, 1462 and 1467. Not visible here: the signature
       * (lsn, buf and len are used below), the declaration and allocation
       * call for xlogreader, the first argument lines of several ereport()
       * calls, the second half of the record-type test, the statement
       * guarded by "if (len != NULL)", and one statement after the copy
       * (presumably the release of the reader) -- confirm against the
       * original file.
       */
1422  XLogRecord *record;
1424  char *errormsg;
1425 
1427  XL_ROUTINE(.page_read = &read_local_xlog_page,
1428  .segment_open = &wal_segment_open,
1429  .segment_close = &wal_segment_close),
1430  NULL);
1431  if (!xlogreader)
1432  ereport(ERROR,
1433  (errcode(ERRCODE_OUT_OF_MEMORY),
1434  errmsg("out of memory"),
1435  errdetail("Failed while allocating a WAL reading processor.")));
1436 
      /* Position the reader at lsn and fetch the single record found there. */
1437  XLogBeginRead(xlogreader, lsn);
1438  record = XLogReadRecord(xlogreader, &errormsg);
1439 
1440  if (record == NULL)
1441  {
      /* Include the reader's own message when it supplied one. */
1442  if (errormsg)
1443  ereport(ERROR,
1445  errmsg("could not read two-phase state from WAL at %X/%X: %s",
1446  LSN_FORMAT_ARGS(lsn), errormsg)));
1447  else
1448  ereport(ERROR,
1450  errmsg("could not read two-phase state from WAL at %X/%X",
1451  LSN_FORMAT_ARGS(lsn))));
1452  }
1453 
1454  if (XLogRecGetRmid(xlogreader) != RM_XACT_ID ||
1456  ereport(ERROR,
1458  errmsg("expected two-phase state data is not present in WAL at %X/%X",
1459  LSN_FORMAT_ARGS(lsn))));
1460 
1461  if (len != NULL)
1463 
      /* Hand back a palloc'd copy of the record's data; the caller must pfree it. */
1464  *buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader));
1465  memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader));
1466 
1468 }
1469 
1470 
1471 /*
1472  * Confirms an xid is prepared, during recovery
1473  */
1474 bool
1476 {
      /*
       * NOTE(review): this listing skips inner lines 1475 and 1481: the
       * signature (which must supply xid) and one statement ahead of the
       * max_prepared_xacts test -- confirm against the original file.
       */
1477  char *buf;
1478  TwoPhaseFileHeader *hdr;
1479  bool result;
1480 
1482 
1483  if (max_prepared_xacts <= 0)
1484  return false; /* nothing to do */
1485 
1486  /* Read and validate file */
      /* missing_ok is true, so a missing state file yields NULL, not an error. */
1487  buf = ReadTwoPhaseFile(xid, true);
1488  if (buf == NULL)
1489  return false;
1490 
1491  /* Check header also */
1492  hdr = (TwoPhaseFileHeader *) buf;
1493  result = TransactionIdEquals(hdr->xid, xid);
1494  pfree(buf);
1495 
1496  return result;
1497 }
1498 
1499 /*
1500  * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED
1501  */
1502 void
1503 FinishPreparedTransaction(const char *gid, bool isCommit)
1504 {
      /*
       * NOTE(review): this listing skips inner lines 1575, 1583, 1638, 1641,
       * 1654, 1656 and 1680. Not visible here: the first lines of the two
       * calls that write the commit / abort WAL record, the statements
       * guarded by the two "if (hdr->initfileinval)" tests, the two
       * callback-processing statements under "And now do the callbacks", and
       * one statement just before the final pfree -- confirm against the
       * original file.
       */
1505  GlobalTransaction gxact;
1506  PGPROC *proc;
1507  TransactionId xid;
1508  char *buf;
1509  char *bufptr;
1510  TwoPhaseFileHeader *hdr;
1511  TransactionId latestXid;
1512  TransactionId *children;
1513  RelFileLocator *commitrels;
1514  RelFileLocator *abortrels;
1515  RelFileLocator *delrels;
1516  int ndelrels;
1517  xl_xact_stats_item *commitstats;
1518  xl_xact_stats_item *abortstats;
1519  SharedInvalidationMessage *invalmsgs;
1520 
1521  /*
1522  * Validate the GID, and lock the GXACT to ensure that two backends do not
1523  * try to commit the same GID at once.
1524  */
1525  gxact = LockGXact(gid, GetUserId());
1526  proc = GetPGProcByNumber(gxact->pgprocno);
1527  xid = gxact->xid;
1528 
1529  /*
1530  * Read and validate 2PC state data. State data will typically be stored
1531  * in WAL files if the LSN is after the last checkpoint record, or moved
1532  * to disk if for some reason they have lived for a long time.
1533  */
1534  if (gxact->ondisk)
1535  buf = ReadTwoPhaseFile(xid, false);
1536  else
1537  XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
1538 
1539 
1540  /*
1541  * Disassemble the header area
      *
      * The sections follow the header in the order StartPrepare saved them,
      * each padded to a MAXALIGN boundary. All the array pointers set here
      * point into buf, so buf must stay allocated until the end of this
      * function.
1542  */
1543  hdr = (TwoPhaseFileHeader *) buf;
1544  Assert(TransactionIdEquals(hdr->xid, xid));
1545  bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
1546  bufptr += MAXALIGN(hdr->gidlen);
1547  children = (TransactionId *) bufptr;
1548  bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
1549  commitrels = (RelFileLocator *) bufptr;
1550  bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileLocator));
1551  abortrels = (RelFileLocator *) bufptr;
1552  bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileLocator));
1553  commitstats = (xl_xact_stats_item *) bufptr;
1554  bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item));
1555  abortstats = (xl_xact_stats_item *) bufptr;
1556  bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item));
1557  invalmsgs = (SharedInvalidationMessage *) bufptr;
1558  bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
1559 
1560  /* compute latestXid among all children */
1561  latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children);
1562 
1563  /* Prevent cancel/die interrupt while cleaning up */
1564  HOLD_INTERRUPTS();
1565 
1566  /*
1567  * The order of operations here is critical: make the XLOG entry for
1568  * commit or abort, then mark the transaction committed or aborted in
1569  * pg_xact, then remove its PGPROC from the global ProcArray (which means
1570  * TransactionIdIsInProgress will stop saying the prepared xact is in
1571  * progress), then run the post-commit or post-abort callbacks. The
1572  * callbacks will release the locks the transaction held.
1573  */
1574  if (isCommit)
1576  hdr->nsubxacts, children,
1577  hdr->ncommitrels, commitrels,
1578  hdr->ncommitstats,
1579  commitstats,
1580  hdr->ninvalmsgs, invalmsgs,
1581  hdr->initfileinval, gid);
1582  else
1584  hdr->nsubxacts, children,
1585  hdr->nabortrels, abortrels,
1586  hdr->nabortstats,
1587  abortstats,
1588  gid);
1589 
1590  ProcArrayRemove(proc, latestXid);
1591 
1592  /*
1593  * In case we fail while running the callbacks, mark the gxact invalid so
1594  * no one else will try to commit/rollback, and so it will be recycled if
1595  * we fail after this point. It is still locked by our backend so it
1596  * won't go away yet.
1597  *
1598  * (We assume it's safe to do this without taking TwoPhaseStateLock.)
1599  */
1600  gxact->valid = false;
1601 
1602  /*
1603  * We have to remove any files that were supposed to be dropped. For
1604  * consistency with the regular xact.c code paths, must do this before
1605  * releasing locks, so do it before running the callbacks.
1606  *
1607  * NB: this code knows that we couldn't be dropping any temp rels ...
1608  */
      /* Pick the relation list that matches the outcome being applied. */
1609  if (isCommit)
1610  {
1611  delrels = commitrels;
1612  ndelrels = hdr->ncommitrels;
1613  }
1614  else
1615  {
1616  delrels = abortrels;
1617  ndelrels = hdr->nabortrels;
1618  }
1619 
1620  /* Make sure files supposed to be dropped are dropped */
1621  DropRelationFiles(delrels, ndelrels, false);
1622 
1623  if (isCommit)
1624  pgstat_execute_transactional_drops(hdr->ncommitstats, commitstats, false);
1625  else
1626  pgstat_execute_transactional_drops(hdr->nabortstats, abortstats, false);
1627 
1628  /*
1629  * Handle cache invalidation messages.
1630  *
1631  * Relcache init file invalidation requires processing both before and
1632  * after we send the SI messages, only when committing. See
1633  * AtEOXact_Inval().
1634  */
1635  if (isCommit)
1636  {
1637  if (hdr->initfileinval)
1639  SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs);
1640  if (hdr->initfileinval)
1642  }
1643 
1644  /*
1645  * Acquire the two-phase lock. We want to work on the two-phase callbacks
1646  * while holding it to avoid potential conflicts with other transactions
1647  * attempting to use the same GID, so the lock is released once the shared
1648  * memory state is cleared.
1649  */
1650  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
1651 
1652  /* And now do the callbacks */
1653  if (isCommit)
1655  else
1657 
1658  PredicateLockTwoPhaseFinish(xid, isCommit);
1659 
1660  /* Clear shared memory state */
1661  RemoveGXact(gxact);
1662 
1663  /*
1664  * Release the lock as all callbacks are called and shared memory cleanup
1665  * is done.
1666  */
1667  LWLockRelease(TwoPhaseStateLock);
1668 
1669  /* Count the prepared xact as committed or aborted */
1670  AtEOXact_PgStat(isCommit, false);
1671 
1672  /*
1673  * And now we can clean up any files we may have left.
1674  */
      /* NOTE(review): gxact->ondisk is read here after RemoveGXact(gxact) above -- confirm the entry is still readable at this point. */
1675  if (gxact->ondisk)
1676  RemoveTwoPhaseFile(xid, true);
1677 
1678  MyLockedGxact = NULL;
1679 
1681 
1682  pfree(buf);
1683 }
1684 
1685 /*
1686  * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record.
1687  */
1688 static void
1689 ProcessRecords(char *bufptr, TransactionId xid,
1690  const TwoPhaseCallback callbacks[])
1691 {
1692  for (;;)
1693  {
1694  TwoPhaseRecordOnDisk *record = (TwoPhaseRecordOnDisk *) bufptr;
1695 
1696  Assert(record->rmid <= TWOPHASE_RM_MAX_ID);
1697  if (record->rmid == TWOPHASE_RM_END_ID)
1698  break;
1699 
1700  bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk));
1701 
1702  if (callbacks[record->rmid] != NULL)
1703  callbacks[record->rmid] (xid, record->info,
1704  (void *) bufptr, record->len);
1705 
1706  bufptr += MAXALIGN(record->len);
1707  }
1708 }
1709 
1710 /*
1711  * Remove the 2PC file for the specified XID.
1712  *
1713  * If giveWarning is false, do not complain about file-not-present;
1714  * this is an expected case during WAL replay.
1715  */
1716 static void
1717 RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
1718 {
      /*
       * NOTE(review): this listing skips inner line 1725, apparently the first
       * argument line of the ereport() call -- confirm against the original
       * file.
       */
1719  char path[MAXPGPATH];
1720 
1721  TwoPhaseFilePath(path, xid);
      /*
       * A failed unlink is reported as a WARNING only. ENOENT is reported too
       * unless giveWarning is false.
       */
1722  if (unlink(path))
1723  if (errno != ENOENT || giveWarning)
1724  ereport(WARNING,
1726  errmsg("could not remove file \"%s\": %m", path)));
1727 }
1728 
1729 /*
1730  * Recreates a state file. This is used in WAL replay and during
1731  * checkpoint creation.
1732  *
1733  * Note: content and len don't include CRC.
1734  */
1735 static void
1736 RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
1737 {
      /*
       * NOTE(review): this listing skips inner lines 1753, 1765, 1774, 1777,
       * 1786, 1788 and 1792. Most look like the first argument line of an
       * ereport() call; 1777 and 1788 are presumably the wait-end reports
       * matching the two pgstat_report_wait_start() calls -- confirm against
       * the original file.
       */
1738  char path[MAXPGPATH];
1739  pg_crc32c statefile_crc;
1740  int fd;
1741 
1742  /* Recompute CRC */
1743  INIT_CRC32C(statefile_crc);
1744  COMP_CRC32C(statefile_crc, content, len);
1745  FIN_CRC32C(statefile_crc);
1746 
1747  TwoPhaseFilePath(path, xid);
1748 
      /* O_TRUNC: any existing file for this xid is overwritten from scratch. */
1749  fd = OpenTransientFile(path,
1750  O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY);
1751  if (fd < 0)
1752  ereport(ERROR,
1754  errmsg("could not recreate file \"%s\": %m", path)));
1755 
1756  /* Write content and CRC */
      /* errno is cleared first so that a short write which leaves it untouched can be told apart. */
1757  errno = 0;
1758  pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_WRITE);
1759  if (write(fd, content, len) != len)
1760  {
1761  /* if write didn't set errno, assume problem is no disk space */
1762  if (errno == 0)
1763  errno = ENOSPC;
1764  ereport(ERROR,
1766  errmsg("could not write file \"%s\": %m", path)));
1767  }
      /* The CRC is appended directly after the content. */
1768  if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c))
1769  {
1770  /* if write didn't set errno, assume problem is no disk space */
1771  if (errno == 0)
1772  errno = ENOSPC;
1773  ereport(ERROR,
1775  errmsg("could not write file \"%s\": %m", path)));
1776  }
1778 
1779  /*
1780  * We must fsync the file because the end-of-replay checkpoint will not do
1781  * so, there being no GXACT in shared memory yet to tell it to.
1782  */
1783  pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_SYNC);
1784  if (pg_fsync(fd) != 0)
1785  ereport(ERROR,
1787  errmsg("could not fsync file \"%s\": %m", path)));
1789 
1790  if (CloseTransientFile(fd) != 0)
1791  ereport(ERROR,
1793  errmsg("could not close file \"%s\": %m", path)));
1794 }
1795 
1796 /*
1797  * CheckPointTwoPhase -- handle 2PC component of checkpointing.
1798  *
1799  * We must fsync the state file of any GXACT that is valid or has been
1800  * generated during redo and has a PREPARE LSN <= the checkpoint's redo
1801  * horizon. (If the gxact isn't valid yet, has not been generated in
1802  * redo, or has a later LSN, this checkpoint is not responsible for
1803  * fsyncing it.)
1804  *
1805  * This is deliberately run as late as possible in the checkpoint sequence,
1806  * because GXACTs ordinarily have short lifespans, and so it is quite
1807  * possible that GXACTs that were valid at checkpoint start will no longer
1808  * exist if we wait a little bit. With typical checkpoint settings this
1809  * will be about 3 minutes for an online checkpoint, so as a result we
1810  * expect that there will be no GXACTs that need to be copied to disk.
1811  *
1812  * If a GXACT remains valid across multiple checkpoints, it will already
1813  * be on disk so we don't bother to repeat that write.
1814  */
1815 void
1817 {
      /*
       * NOTE(review): this listing skips inner lines 1816, 1849, 1858 and
       * 1861-1862. Not visible here: the signature (which must supply
       * redo_horizon), the declaration of gxact inside the loop, the call
       * that fills buf and len (presumably a read of the PREPARE record from
       * WAL), and two statements after "gxact->ondisk = true" -- confirm
       * against the original file.
       */
1818  int i;
1819  int serialized_xacts = 0;
1820 
1821  if (max_prepared_xacts <= 0)
1822  return; /* nothing to do */
1823 
1824  TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();
1825 
1826  /*
1827  * We are expecting there to be zero GXACTs that need to be copied to
1828  * disk, so we perform all I/O while holding TwoPhaseStateLock for
1829  * simplicity. This prevents any new xacts from preparing while this
1830  * occurs, which shouldn't be a problem since the presence of long-lived
1831  * prepared xacts indicates the transaction manager isn't active.
1832  *
1833  * It would also be possible to move the I/O out of the lock, but then on
1834  * every error we would have to check whether somebody committed our
1835  * transaction in a different backend. Let's leave this optimization for
1836  * the future, in case somebody finds that this place is a bottleneck.
1837  *
1838  * Note that it isn't possible for there to be a GXACT that has a
1839  * prepare_end_lsn prior to the last checkpoint and yet is marked invalid,
1840  * because of the efforts with delayChkptFlags.
1841  */
1842  LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
1843  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
1844  {
1845  /*
1846  * Note that we are using gxact not PGPROC so this works in recovery
1847  * also
1848  */
1850 
      /*
       * Copy to disk only entries that are valid or came from redo, are not
       * on disk yet, and were prepared at or before the redo horizon.
       */
1851  if ((gxact->valid || gxact->inredo) &&
1852  !gxact->ondisk &&
1853  gxact->prepare_end_lsn <= redo_horizon)
1854  {
1855  char *buf;
1856  int len;
1857 
1859  RecreateTwoPhaseFile(gxact->xid, buf, len);
1860  gxact->ondisk = true;
1863  pfree(buf);
1864  serialized_xacts++;
1865  }
1866  }
1867  LWLockRelease(TwoPhaseStateLock);
1868 
1869  /*
1870  * Flush unconditionally the parent directory to make any information
1871  * durable on disk. Two-phase files could have been removed and those
1872  * removals need to be made persistent as well as any files newly created
1873  * previously since the last checkpoint.
1874  */
1875  fsync_fname(TWOPHASE_DIR, true);
1876 
1877  TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
1878 
      /* NOTE(review): serialized_xacts is an int but is formatted with %u below. */
1879  if (log_checkpoints && serialized_xacts > 0)
1880  ereport(LOG,
1881  (errmsg_plural("%u two-phase state file was written "
1882  "for a long-running prepared transaction",
1883  "%u two-phase state files were written "
1884  "for long-running prepared transactions",
1885  serialized_xacts,
1886  serialized_xacts)));
1887 }
1888 
1889 /*
1890  * restoreTwoPhaseData
1891  *
1892  * Scan pg_twophase and fill TwoPhaseState depending on the on-disk data.
1893  * This is called once at the beginning of recovery, saving any extra
1894  * lookups in the future. Two-phase files that are newer than the
1895  * minimum XID horizon are discarded on the way.
1896  */
1897 void
1899 {
      /*
       * NOTE(review): this listing skips inner lines 1898, 1917 and
       * 1922-1923. Not visible here: the signature, the start of the call
       * that produces buf (its trailing arguments "true, false, false" are
       * visible), and the statements that consume buf after the NULL check --
       * confirm against the original file.
       */
1900  DIR *cldir;
1901  struct dirent *clde;
1902 
1903  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
1904  cldir = AllocateDir(TWOPHASE_DIR);
1905  while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
1906  {
      /* Only names made of exactly 16 upper-case hex digits are considered. */
1907  if (strlen(clde->d_name) == 16 &&
1908  strspn(clde->d_name, "0123456789ABCDEF") == 16)
1909  {
1910  TransactionId xid;
1911  FullTransactionId fxid;
1912  char *buf;
1913 
      /* The file name is a 64-bit full transaction id written in hex. */
1914  fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16));
1915  xid = XidFromFullTransactionId(fxid);
1916 
1918  true, false, false);
1919  if (buf == NULL)
1920  continue;
1921 
1924  }
1925  }
1926  LWLockRelease(TwoPhaseStateLock);
1927  FreeDir(cldir);
1928 }
1929 
1930 /*
1931  * PrescanPreparedTransactions
1932  *
1933  * Scan the shared memory entries of TwoPhaseState and determine the range
1934  * of valid XIDs present. This is run during database startup, after we
1935  * have completed reading WAL. TransamVariables->nextXid has been set to
1936  * one more than the highest XID for which evidence exists in WAL.
1937  *
1938  * We throw away any prepared xacts with main XID beyond nextXid --- if any
1939  * are present, it suggests that the DBA has done a PITR recovery to an
1940  * earlier point in time without cleaning out pg_twophase. We dare not
1941  * try to recover such prepared xacts since they likely depend on database
1942  * state that doesn't exist now.
1943  *
1944  * However, we will advance nextXid beyond any subxact XIDs belonging to
1945  * valid prepared xacts. We need to do this since subxact commit doesn't
1946  * write a WAL entry, and so there might be no evidence in WAL of those
1947  * subxact XIDs.
1948  *
1949  * On corrupted two-phase files, fail immediately. Keeping broken entries
1950  * around and letting replay continue would harm the system; a new backup
1951  * should be rolled in instead.
1952  *
1953  * Our other responsibility is to determine and return the oldest valid XID
1954  * among the prepared xacts (if none, return TransamVariables->nextXid).
1955  * This is needed to synchronize pg_subtrans startup properly.
1956  *
1957  * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all
1958  * top-level xids is stored in *xids_p. The number of entries in the array
1959  * is returned in *nxids_p.
1960  */
1963 {
      /*
       * NOTE(review): this listing skips inner lines 1961-1962, 1964 and
       * 1977. Not visible here: the return type and signature (xids_p and
       * nxids_p are used below; the function returns a TransactionId value),
       * the declaration of nextXid, and the declaration of gxact inside the
       * loop -- confirm against the original file.
       */
1965  TransactionId origNextXid = XidFromFullTransactionId(nextXid);
1966  TransactionId result = origNextXid;
1967  TransactionId *xids = NULL;
1968  int nxids = 0;
1969  int allocsize = 0;
1970  int i;
1971 
1972  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
1973  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
1974  {
1975  TransactionId xid;
1976  char *buf;
1978 
1979  Assert(gxact->inredo);
1980 
1981  xid = gxact->xid;
1982 
      /* setParent = false, setNextXid = true for this pass. */
1983  buf = ProcessTwoPhaseBuffer(xid,
1984  gxact->prepare_start_lsn,
1985  gxact->ondisk, false, true);
1986 
      /* NULL means there is nothing to account for with this entry. */
1987  if (buf == NULL)
1988  continue;
1989 
1990  /*
1991  * OK, we think this file is valid. Incorporate xid into the
1992  * running-minimum result.
1993  */
1994  if (TransactionIdPrecedes(xid, result))
1995  result = xid;
1996 
1997  if (xids_p)
1998  {
      /* Grow the output array on demand: 10 entries first, then doubling. */
1999  if (nxids == allocsize)
2000  {
2001  if (nxids == 0)
2002  {
2003  allocsize = 10;
2004  xids = palloc(allocsize * sizeof(TransactionId));
2005  }
2006  else
2007  {
2008  allocsize = allocsize * 2;
2009  xids = repalloc(xids, allocsize * sizeof(TransactionId));
2010  }
2011  }
2012  xids[nxids++] = xid;
2013  }
2014 
2015  pfree(buf);
2016  }
2017  LWLockRelease(TwoPhaseStateLock);
2018 
      /* nxids_p is written whenever xids_p is non-NULL, so callers must pass both or neither. */
2019  if (xids_p)
2020  {
2021  *xids_p = xids;
2022  *nxids_p = nxids;
2023  }
2024 
2025  return result;
2026 }
2027 
2028 /*
2029  * StandbyRecoverPreparedTransactions
2030  *
2031  * Scan the shared memory entries of TwoPhaseState and setup all the required
2032  * information to allow standby queries to treat prepared transactions as still
2033  * active.
2034  *
2035  * This is never called at the end of recovery - we use
2036  * RecoverPreparedTransactions() at that point.
2037  *
2038  * The lack of SubTransSetParent() calls here is by design;
2039  * those calls are made by RecoverPreparedTransactions() at the end of recovery
2040  * for those xacts that need this.
2041  */
2042 void
2044 {
      /*
       * NOTE(review): this listing skips inner lines 2043 and 2052: the
       * signature and the declaration of gxact inside the loop -- confirm
       * against the original file.
       */
2045  int i;
2046 
2047  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
2048  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
2049  {
2050  TransactionId xid;
2051  char *buf;
2053 
2054  Assert(gxact->inredo);
2055 
2056  xid = gxact->xid;
2057 
      /*
       * Called with setParent = false and setNextXid = false. The returned
       * buffer is not examined here; it is only freed.
       */
2058  buf = ProcessTwoPhaseBuffer(xid,
2059  gxact->prepare_start_lsn,
2060  gxact->ondisk, false, false);
2061  if (buf != NULL)
2062  pfree(buf);
2063  }
2064  LWLockRelease(TwoPhaseStateLock);
2065 }
2066 
2067 /*
2068  * RecoverPreparedTransactions
2069  *
2070  * Scan the shared memory entries of TwoPhaseState and reload the state for
2071  * each prepared transaction (reacquire locks, etc).
2072  *
2073  * This is run at the end of recovery, but before we allow backends to write
2074  * WAL.
2075  *
2076  * At the end of recovery the way we take snapshots will change. We now need
2077  * to mark all running transactions with their full SubTransSetParent() info
2078  * to allow normal snapshots to work correctly if snapshots overflow.
2079  * We do this here because by definition prepared transactions are the only
2080  * type of write transaction still running, so this is necessary and
2081  * complete.
2082  */
2083 void
2085 {
      /*
       * NOTE(review): this listing skips inner lines 2084, 2093, 2151 and
       * 2165. Not visible here: the signature, the declaration of gxact
       * inside the loop, the statement under "Recover other state (notably
       * locks) using resource managers", and the statement that clears
       * MyLockedGxact -- confirm against the original file.
       */
2086  int i;
2087 
2088  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
2089  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
2090  {
2091  TransactionId xid;
2092  char *buf;
2094  char *bufptr;
2095  TwoPhaseFileHeader *hdr;
2096  TransactionId *subxids;
2097  const char *gid;
2098 
2099  xid = gxact->xid;
2100 
2101  /*
2102  * Reconstruct subtrans state for the transaction --- needed because
2103  * pg_subtrans is not preserved over a restart. Note that we are
2104  * linking all the subtransactions directly to the top-level XID;
2105  * there may originally have been a more complex hierarchy, but
2106  * there's no need to restore that exactly. It's possible that
2107  * SubTransSetParent has been set before, if the prepared transaction
2108  * generated xid assignment records.
2109  */
2110  buf = ProcessTwoPhaseBuffer(xid,
2111  gxact->prepare_start_lsn,
2112  gxact->ondisk, true, false);
      /* TwoPhaseStateLock is still held at this point, so the loop simply continues. */
2113  if (buf == NULL)
2114  continue;
2115 
2116  ereport(LOG,
2117  (errmsg("recovering prepared transaction %u from shared memory", xid)));
2118 
      /*
       * Walk the MAXALIGN-padded sections that follow the header. Only the
       * GID and the subxact array are used below; the other sections are
       * stepped over.
       */
2119  hdr = (TwoPhaseFileHeader *) buf;
2120  Assert(TransactionIdEquals(hdr->xid, xid));
2121  bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
2122  gid = (const char *) bufptr;
2123  bufptr += MAXALIGN(hdr->gidlen);
2124  subxids = (TransactionId *) bufptr;
2125  bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
2126  bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileLocator));
2127  bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileLocator));
2128  bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item));
2129  bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item));
2130  bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
2131 
2132  /*
2133  * Recreate its GXACT and dummy PGPROC. But, check whether it was
2134  * added in redo and already has a shmem entry for it.
2135  */
2136  MarkAsPreparingGuts(gxact, xid, gid,
2137  hdr->prepared_at,
2138  hdr->owner, hdr->database);
2139 
2140  /* recovered, so reset the flag for entries generated by redo */
2141  gxact->inredo = false;
2142 
2143  GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids);
2144  MarkAsPrepared(gxact, true);
2145 
      /* The lock is dropped for the per-transaction work below and re-taken at the bottom of the loop. */
2146  LWLockRelease(TwoPhaseStateLock);
2147 
2148  /*
2149  * Recover other state (notably locks) using resource managers.
2150  */
2152 
2153  /*
2154  * Release locks held by the standby process after we process each
2155  * prepared transaction. As a result, we don't need too many
2156  * additional locks at any one time.
2157  */
2158  if (InHotStandby)
2159  StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids);
2160 
2161  /*
2162  * We're done with recovering this transaction. Clear MyLockedGxact,
2163  * like we do in PrepareTransaction() during normal operation.
2164  */
2166 
2167  pfree(buf);
2168 
      /* Re-acquire the lock before the loop condition reads TwoPhaseState again. */
2169  LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
2170  }
2171 
2172  LWLockRelease(TwoPhaseStateLock);
2173 }
2174 
2175 /*
2176  * ProcessTwoPhaseBuffer
2177  *
2178  * Given a transaction id, read its two-phase state data either from its
2179  * state file on disk (fromdisk is true) or from WAL at the provided "prepare_start_lsn".
2180  *
2181  * If setParent is true, set up subtransaction parent linkages.
2182  *
2183  * If setNextXid is true, set TransamVariables->nextXid to the newest
2184  * value scanned.
      *
      * The caller must hold TwoPhaseStateLock in exclusive mode, and must pass
      * a valid prepare_start_lsn when fromdisk is false.
      *
      * Returns NULL, after emitting a WARNING and removing the state file or
      * the in-memory entry, when the transaction is found to be already
      * processed or when xid follows or equals the next XID. Raises ERROR if
      * the xid stored in the header does not match. Otherwise returns the
      * state data buffer; the callers visible in this file pfree() it when
      * they are done.
2185  */
2186 static char *
2188  XLogRecPtr prepare_start_lsn,
2189  bool fromdisk,
2190  bool setParent, bool setNextXid)
2191 {
2193  TransactionId origNextXid = XidFromFullTransactionId(nextXid);
2194  TransactionId *subxids;
2195  char *buf;
2196  TwoPhaseFileHeader *hdr;
2197  int i;
2198 
2199  Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
2200 
2201  if (!fromdisk)
2202  Assert(prepare_start_lsn != InvalidXLogRecPtr);
2203 
2204  /* Already processed? */
2206  {
2207  if (fromdisk)
2208  {
2209  ereport(WARNING,
2210  (errmsg("removing stale two-phase state file for transaction %u",
2211  xid)));
2212  RemoveTwoPhaseFile(xid, true);
2213  }
2214  else
2215  {
2216  ereport(WARNING,
2217  (errmsg("removing stale two-phase state from memory for transaction %u",
2218  xid)));
2219  PrepareRedoRemove(xid, true);
2220  }
2221  return NULL;
2222  }
2223 
2224  /* Reject XID if too new */
2225  if (TransactionIdFollowsOrEquals(xid, origNextXid))
2226  {
2227  if (fromdisk)
2228  {
2229  ereport(WARNING,
2230  (errmsg("removing future two-phase state file for transaction %u",
2231  xid)));
2232  RemoveTwoPhaseFile(xid, true);
2233  }
2234  else
2235  {
2236  ereport(WARNING,
2237  (errmsg("removing future two-phase state from memory for transaction %u",
2238  xid)));
2239  PrepareRedoRemove(xid, true);
2240  }
2241  return NULL;
2242  }
2243 
2244  if (fromdisk)
2245  {
2246  /* Read and validate file */
2247  buf = ReadTwoPhaseFile(xid, false);
2248  }
2249  else
2250  {
2251  /* Read xlog data */
2252  XlogReadTwoPhaseData(prepare_start_lsn, &buf, NULL);
2253  }
2254 
2255  /* Deconstruct header; the xid stored in it must match the one requested */
2256  hdr = (TwoPhaseFileHeader *) buf;
2257  if (!TransactionIdEquals(hdr->xid, xid))
2258  {
2259  if (fromdisk)
2260  ereport(ERROR,
2262  errmsg("corrupted two-phase state file for transaction %u",
2263  xid)));
2264  else
2265  ereport(ERROR,
2267  errmsg("corrupted two-phase state in memory for transaction %u",
2268  xid)));
2269  }
2270 
2271  /*
2272  * Examine subtransaction XIDs ... they should all follow main XID, and
2273  * they may force us to advance nextXid.
      *
      * The subxid array is located after the MAXALIGN'd header and the
      * MAXALIGN'd GID.
2274  */
2275  subxids = (TransactionId *) (buf +
2276  MAXALIGN(sizeof(TwoPhaseFileHeader)) +
2277  MAXALIGN(hdr->gidlen));
2278  for (i = 0; i < hdr->nsubxacts; i++)
2279  {
2280  TransactionId subxid = subxids[i];
2281 
2282  Assert(TransactionIdFollows(subxid, xid));
2283 
2284  /* update nextXid if needed */
2285  if (setNextXid)
2287 
2288  if (setParent)
2289  SubTransSetParent(subxid, xid);
2290  }
2291 
2292  return buf;
2293 }
2294 
2295 
2296 /*
2297  * RecordTransactionCommitPrepared
2298  *
2299  * This is basically the same as RecordTransactionCommit (q.v. if you change
2300  * this function): in particular, we must set DELAY_CHKPT_START to avoid a
2301  * race condition.
2302  *
2303  * We know the transaction made at least one XLOG entry (its PREPARE),
2304  * so it is never possible to optimize out the commit record.
      *
      * The commit record is written with XactLogCommitRecord() and flushed
      * with XLogFlush() before the transaction tree (xid plus the nchildren
      * XIDs in children) is marked committed in pg_xact. Once the critical
      * section has ended, we wait for synchronous replication of the commit
      * record. The rels, stats, invalmsgs, initfileinval and gid arguments are
      * passed through to XactLogCommitRecord().
2305  */
2306 static void
2308  int nchildren,
2309  TransactionId *children,
2310  int nrels,
2311  RelFileLocator *rels,
2312  int nstats,
2313  xl_xact_stats_item *stats,
2314  int ninvalmsgs,
2315  SharedInvalidationMessage *invalmsgs,
2316  bool initfileinval,
2317  const char *gid)
2318 {
2319  XLogRecPtr recptr;
2320  TimestampTz committs = GetCurrentTimestamp();
2321  bool replorigin;
2322 
2323  /*
2324  * Are we using the replication origins feature? Or, in other words, are
2325  * we replaying remote actions?
2326  */
2327  replorigin = (replorigin_session_origin != InvalidRepOriginId &&
2329 
2331 
2332  /* See notes in RecordTransactionCommit */
2335 
2336  /*
2337  * Emit the XLOG commit record. Note that we mark 2PC commits as
2338  * potentially having AccessExclusiveLocks since we don't know whether or
2339  * not they do.
2340  */
2341  recptr = XactLogCommitRecord(committs,
2342  nchildren, children, nrels, rels,
2343  nstats, stats,
2344  ninvalmsgs, invalmsgs,
2345  initfileinval,
2347  xid, gid);
2348 
2349 
2350  if (replorigin)
2351  /* Move LSNs forward for this replication origin */
2353  XactLastRecEnd);
2354 
2355  /*
2356  * Record commit timestamp. The value comes from plain commit timestamp
2357  * if replorigin is not enabled, or replorigin already set a value for us
2358  * in replorigin_session_origin_timestamp otherwise.
2359  *
2360  * We don't need to WAL-log anything here, as the commit record written
2361  * above already contains the data.
2362  */
2363  if (!replorigin || replorigin_session_origin_timestamp == 0)
2365 
2366  TransactionTreeSetCommitTsData(xid, nchildren, children,
2369 
2370  /*
2371  * We don't currently try to sleep before flush here ... nor is there any
2372  * support for async commit of a prepared xact (the very idea is probably
2373  * a contradiction).
2374  */
2375 
2376  /* Flush XLOG to disk */
2377  XLogFlush(recptr);
2378 
2379  /* Mark the transaction committed in pg_xact */
2380  TransactionIdCommitTree(xid, nchildren, children);
2381 
2382  /* Checkpoint can proceed now */
2384 
2385  END_CRIT_SECTION();
2386 
2387  /*
2388  * Wait for synchronous replication, if required.
2389  *
2390  * Note that at this stage we have marked clog, but still show as running
2391  * in the procarray and continue to hold locks.
2392  */
2393  SyncRepWaitForLSN(recptr, true);
2394 }
2395 
2396 /*
2397  * RecordTransactionAbortPrepared
2398  *
2399  * This is basically the same as RecordTransactionAbort.
2400  *
2401  * We know the transaction made at least one XLOG entry (its PREPARE),
2402  * so it is never possible to optimize out the abort record.
      *
      * PANICs if pg_xact already shows xid as committed. Otherwise the abort
      * record is written and flushed, the transaction tree (xid plus the
      * nchildren XIDs in children) is marked aborted, and, once the critical
      * section has ended, we wait for synchronous replication of the record.
2403  */
2404 static void
2406  int nchildren,
2407  TransactionId *children,
2408  int nrels,
2409  RelFileLocator *rels,
2410  int nstats,
2411  xl_xact_stats_item *stats,
2412  const char *gid)
2413 {
2414  XLogRecPtr recptr;
2415  bool replorigin;
2416 
2417  /*
2418  * Are we using the replication origins feature? Or, in other words, are
2419  * we replaying remote actions?
2420  */
2421  replorigin = (replorigin_session_origin != InvalidRepOriginId &&
2423 
2424  /*
2425  * Catch the scenario where we aborted partway through
2426  * RecordTransactionCommitPrepared ...
2427  */
2428  if (TransactionIdDidCommit(xid))
2429  elog(PANIC, "cannot abort transaction %u, it was already committed",
2430  xid);
2431 
2433 
2434  /*
2435  * Emit the XLOG abort record. Note that we mark 2PC aborts as
2436  * potentially having AccessExclusiveLocks since we don't know whether or
2437  * not they do.
2438  */
2440  nchildren, children,
2441  nrels, rels,
2442  nstats, stats,
2444  xid, gid);
2445 
2446  if (replorigin)
2447  /* Move LSNs forward for this replication origin */
2449  XactLastRecEnd);
2450 
2451  /* Always flush, since we're about to remove the 2PC state file */
2452  XLogFlush(recptr);
2453 
2454  /*
2455  * Mark the transaction aborted in clog. This is not absolutely necessary
2456  * but we may as well do it while we are here.
2457  */
2458  TransactionIdAbortTree(xid, nchildren, children);
2459 
2460  END_CRIT_SECTION();
2461 
2462  /*
2463  * Wait for synchronous replication, if required.
2464  *
2465  * Note that at this stage we have marked clog, but still show as running
2466  * in the procarray and continue to hold locks.
2467  */
2468  SyncRepWaitForLSN(recptr, false);
2469 }
2470 
2471 /*
2472  * PrepareRedoAdd
2473  *
2474  * Store pointers to the start/end of the WAL record along with the xid in
2475  * a gxact entry in shared memory TwoPhaseState structure. If caller
2476  * specifies InvalidXLogRecPtr as WAL location to fetch the two-phase
2477  * data, the entry is marked as located on disk.
      *
      * buf points to the two-phase state data: a TwoPhaseFileHeader followed,
      * after MAXALIGN padding, by the GID string. The caller must hold
      * TwoPhaseStateLock in exclusive mode.
      *
      * When start_lsn is valid and a state file for the transaction already
      * exists, the function returns without adding an entry. It raises ERROR
      * when no free gxact slot is left. If origin_id is a valid replication
      * origin, its progress is advanced using the header's origin_lsn and
      * end_lsn.
2478  */
2479 void
2480 PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
2481  XLogRecPtr end_lsn, RepOriginId origin_id)
2482 {
2484  char *bufptr;
2485  const char *gid;
2486  GlobalTransaction gxact;
2487 
2488  Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
2490 
      /* The GID is stored right after the MAXALIGN'd header */
2491  bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
2492  gid = (const char *) bufptr;
2493 
2494  /*
2495  * Reserve the GID for the given transaction in the redo code path.
2496  *
2497  * This creates a gxact struct and puts it into the active array.
2498  *
2499  * In redo, this struct is mainly used to track PREPARE/COMMIT entries in
2500  * shared memory. Hence, we only fill up the bare minimum contents here.
2501  * The gxact also gets marked with gxact->inredo set to true to indicate
2502  * that it got added in the redo phase.
2503  */
2504 
2505  /*
2506  * In the event of a crash while a checkpoint was running, it may be
2507  * possible that some two-phase data found its way to disk while its
2508  * corresponding record needs to be replayed in the follow-up recovery. As
2509  * the 2PC data was on disk, it has already been restored at the beginning
2510  * of recovery with restoreTwoPhaseData(), so skip this record to avoid
2511  * duplicates in TwoPhaseState. If a consistent state has been reached,
2512  * the record is added to TwoPhaseState and it should have no
2513  * corresponding file in pg_twophase.
2514  */
2515  if (!XLogRecPtrIsInvalid(start_lsn))
2516  {
2517  char path[MAXPGPATH];
2518 
2519  TwoPhaseFilePath(path, hdr->xid);
2520 
2521  if (access(path, F_OK) == 0)
2522  {
2524  (errmsg("could not recover two-phase state file for transaction %u",
2525  hdr->xid),
2526  errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.",
2527  LSN_FORMAT_ARGS(start_lsn))));
2528  return;
2529  }
2530 
      /* A missing file is the expected case; report any other failure */
2531  if (errno != ENOENT)
2532  ereport(ERROR,
2534  errmsg("could not access file \"%s\": %m", path)));
2535  }
2536 
2537  /* Get a free gxact from the freelist */
2538  if (TwoPhaseState->freeGXacts == NULL)
2539  ereport(ERROR,
2540  (errcode(ERRCODE_OUT_OF_MEMORY),
2541  errmsg("maximum number of prepared transactions reached"),
2542  errhint("Increase \"max_prepared_transactions\" (currently %d).",
2543  max_prepared_xacts)));
2544  gxact = TwoPhaseState->freeGXacts;
2545  TwoPhaseState->freeGXacts = gxact->next;
2546 
2547  gxact->prepared_at = hdr->prepared_at;
2548  gxact->prepare_start_lsn = start_lsn;
2549  gxact->prepare_end_lsn = end_lsn;
2550  gxact->xid = hdr->xid;
2551  gxact->owner = hdr->owner;
2553  gxact->valid = false;
2554  gxact->ondisk = XLogRecPtrIsInvalid(start_lsn);
2555  gxact->inredo = true; /* yes, added in redo */
2556  strcpy(gxact->gid, gid);
2557 
2558  /* And insert it into the active array */
2561 
2562  if (origin_id != InvalidRepOriginId)
2563  {
2564  /* recover apply progress */
2565  replorigin_advance(origin_id, hdr->origin_lsn, end_lsn,
2566  false /* backward */ , false /* WAL */ );
2567  }
2568 
2569  elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid);
2570 }
2571 
2572 /*
2573  * PrepareRedoRemove
2574  *
2575  * Remove the corresponding gxact entry from TwoPhaseState. Also remove
2576  * the 2PC file if a prepared transaction was saved via an earlier checkpoint.
2577  *
2578  * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState
2579  * is updated.
      *
      * Does nothing if no entry in TwoPhaseState matches xid. giveWarning is
      * passed through to RemoveTwoPhaseFile() when the entry is marked as
      * being on disk.
2580  */
2581 void
2582 PrepareRedoRemove(TransactionId xid, bool giveWarning)
2583 {
2584  GlobalTransaction gxact = NULL;
2585  int i;
2586  bool found = false;
2587 
2588  Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
2590 
      /* Search the active array for the entry carrying this xid */
2591  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
2592  {
2593  gxact = TwoPhaseState->prepXacts[i];
2594 
2595  if (gxact->xid == xid)
2596  {
2597  Assert(gxact->inredo);
2598  found = true;
2599  break;
2600  }
2601  }
2602 
2603  /*
2604  * Just leave if there is nothing, this is expected during WAL replay.
2605  */
2606  if (!found)
2607  return;
2608 
2609  /*
2610  * And now we can clean up any files we may have left.
2611  */
2612  elog(DEBUG2, "removing 2PC data for transaction %u", xid);
2613  if (gxact->ondisk)
2614  RemoveTwoPhaseFile(xid, giveWarning);
2615  RemoveGXact(gxact);
2616 }
2617 
2618 /*
2619  * LookupGXact
2620  * Check if the prepared transaction with the given GID, lsn and timestamp
2621  * exists.
2622  *
2623  * Note that we always compare with the LSN where prepare ends because that is
2624  * what is stored as origin_lsn in the 2PC file.
2625  *
2626  * This function is primarily used to check if the prepared transaction
2627  * received from the upstream (remote node) already exists. Checking only GID
2628  * is not sufficient because a different prepared xact with the same GID can
2629  * exist on the same node. So, we are ensuring to match origin_lsn and
2630  * origin_timestamp of prepared xact to avoid the possibility of a match of
2631  * prepared xact from two different nodes.
      *
      * Returns true if a valid prepared transaction has the given GID and its
      * stored origin_lsn and origin_timestamp equal prepare_end_lsn and
      * origin_prepare_timestamp; returns false otherwise. TwoPhaseStateLock is
      * taken in shared mode for the duration of the search.
2632  */
2633 bool
2634 LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn,
2635  TimestampTz origin_prepare_timestamp)
2636 {
2637  int i;
2638  bool found = false;
2639 
2640  LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
2641  for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
2642  {
2644 
2645  /* Ignore not-yet-valid GIDs. */
2646  if (gxact->valid && strcmp(gxact->gid, gid) == 0)
2647  {
2648  char *buf;
2649  TwoPhaseFileHeader *hdr;
2650 
2651  /*
2652  * We are not expecting collisions of GXACTs (same gid) between
2653  * publisher and subscribers, so we perform all I/O while holding
2654  * TwoPhaseStateLock for simplicity.
2655  *
2656  * To move the I/O out of the lock, we need to ensure that no
2657  * other backend commits the prepared xact in the meantime. We can
2658  * do this optimization if we encounter many collisions in GID
2659  * between publisher and subscriber.
2660  */
2661  if (gxact->ondisk)
2662  buf = ReadTwoPhaseFile(gxact->xid, false);
2663  else
2664  {
2665  Assert(gxact->prepare_start_lsn);
2666  XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
2667  }
2668 
2669  hdr = (TwoPhaseFileHeader *) buf;
2670 
      /* GID matched; the origin LSN and timestamp must match as well */
2671  if (hdr->origin_lsn == prepare_end_lsn &&
2672  hdr->origin_timestamp == origin_prepare_timestamp)
2673  {
2674  found = true;
2675  pfree(buf);
2676  break;
2677  }
2678 
      /* Not the transaction we are looking for; release and keep scanning */
2679  pfree(buf);
2680  }
2681  }
2682  LWLockRelease(TwoPhaseStateLock);
2683  return found;
2684 }
static void pg_atomic_init_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:448
int16 AttrNumber
Definition: attnum.h:21
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1654
static Datum values[MAXATTR]
Definition: bootstrap.c:152
#define CStringGetTextDatum(s)
Definition: builtins.h:97
unsigned short uint16
Definition: c.h:505
unsigned int uint32
Definition: c.h:506
#define MAXALIGN(LEN)
Definition: c.h:811
#define Max(x, y)
Definition: c.h:998
#define Assert(condition)
Definition: c.h:858
#define PG_BINARY
Definition: c.h:1273
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:398
#define strtou64(str, endptr, base)
Definition: c.h:1298
#define unlikely(x)
Definition: c.h:311
#define MemSet(start, val, len)
Definition: c.h:1020
uint32 TransactionId
Definition: c.h:652
size_t Size
Definition: c.h:605
void TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids, TransactionId *subxids, TimestampTz timestamp, RepOriginId nodeid)
Definition: commit_ts.c:141
int64 TimestampTz
Definition: timestamp.h:39
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1180
int errcode_for_file_access(void)
Definition: elog.c:880
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:857
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
TupleDesc BlessTupleDesc(TupleDesc tupdesc)
Definition: execTuples.c:2158
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2909
int FreeDir(DIR *dir)
Definition: fd.c:2961
int CloseTransientFile(int fd)
Definition: fd.c:2809
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:756
int pg_fsync(int fd)
Definition: fd.c:386
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2633
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2843
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
#define SRF_IS_FIRSTCALL()
Definition: funcapi.h:304
#define SRF_PERCALL_SETUP()
Definition: funcapi.h:308
#define SRF_RETURN_NEXT(_funcctx, _result)
Definition: funcapi.h:310
#define SRF_FIRSTCALL_INIT()
Definition: funcapi.h:306
static Datum HeapTupleGetDatum(const HeapTupleData *tuple)
Definition: funcapi.h:230
#define SRF_RETURN_DONE(_funcctx)
Definition: funcapi.h:328
ProcNumber MyProcNumber
Definition: globals.c:87
bool IsUnderPostmaster
Definition: globals.c:117
bool IsPostmasterEnvironment
Definition: globals.c:116
Oid MyDatabaseId
Definition: globals.c:91
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: heaptuple.c:1116
static void dlist_init(dlist_head *head)
Definition: ilist.h:314
static void dlist_node_init(dlist_node *node)
Definition: ilist.h:325
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
int xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs, bool *RelcacheInitFileInval)
Definition: inval.c:882
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:337
int i
Definition: isn.c:73
#define VirtualTransactionIdIsValid(vxid)
Definition: lock.h:67
#define GET_VXID_FROM_PGPROC(vxid_dst, proc)
Definition: lock.h:77
#define LocalTransactionIdIsValid(lxid)
Definition: lock.h:66
#define VirtualTransactionIdEquals(vxid1, vxid2)
Definition: lock.h:71
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1895
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1170
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1939
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1783
@ LW_WS_NOT_WAITING
Definition: lwlock.h:30
#define NUM_LOCK_PARTITIONS
Definition: lwlock.h:97
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc0(Size size)
Definition: mcxt.c:1346
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1540
void * palloc(Size size)
Definition: mcxt.c:1316
void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
Definition: md.c:1446
#define MaxAllocSize
Definition: memutils.h:40
#define RESUME_INTERRUPTS()
Definition: miscadmin.h:135
#define AmStartupProcess()
Definition: miscadmin.h:382
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define HOLD_INTERRUPTS()
Definition: miscadmin.h:133
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
Oid GetUserId(void)
Definition: miscinit.c:514
TimestampTz replorigin_session_origin_timestamp
Definition: origin.c:157
void replorigin_session_advance(XLogRecPtr remote_commit, XLogRecPtr local_commit)
Definition: origin.c:1219
RepOriginId replorigin_session_origin
Definition: origin.c:155
void replorigin_advance(RepOriginId node, XLogRecPtr remote_commit, XLogRecPtr local_commit, bool go_backward, bool wal_log)
Definition: origin.c:888
XLogRecPtr replorigin_session_origin_lsn
Definition: origin.c:156
#define DoNotReplicateId
Definition: origin.h:34
#define InvalidRepOriginId
Definition: origin.h:33
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
uint32 pg_crc32c
Definition: pg_crc32c.h:38
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:98
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:103
const void size_t len
const void * data
while(p+4<=pend)
static char * user
Definition: pg_regress.c:120
static char * buf
Definition: pg_test_fsync.c:73
void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_item *items, bool is_redo)
Definition: pgstat_xact.c:312
void AtEOXact_PgStat(bool isCommit, bool parallel)
Definition: pgstat_xact.c:40
int pgstat_get_transactional_drops(bool isCommit, xl_xact_stats_item **items)
Definition: pgstat_xact.c:270
#define snprintf
Definition: port.h:238
static Datum TransactionIdGetDatum(TransactionId X)
Definition: postgres.h:272
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
Definition: predicate.c:4867
static int fd(const char *x, int i)
Definition: preproc-init.c:105
short access
Definition: preproc-type.c:36
#define GetPGProcByNumber(n)
Definition: proc.h:428
#define PGPROC_MAX_CACHED_SUBXIDS
Definition: proc.h:39
#define GetNumberFromPGProc(proc)
Definition: proc.h:429
#define DELAY_CHKPT_START
Definition: proc.h:114
@ PROC_WAIT_STATUS_OK
Definition: proc.h:119
void ProcArrayAdd(PGPROC *proc)
Definition: procarray.c:468
void ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
Definition: procarray.c:565
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
MemoryContextSwitchTo(old_ctx)
void RelationCacheInitFilePostInvalidate(void)
Definition: relcache.c:6788
void RelationCacheInitFilePreInvalidate(void)
Definition: relcache.c:6763
Size add_size(Size s1, Size s2)
Definition: shmem.c:493
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
Size mul_size(Size s1, Size s2)
Definition: shmem.c:510
void SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, int n)
Definition: sinval.c:47
static pg_noinline void Size size
Definition: slab.c:607
PGPROC * MyProc
Definition: proc.c:66
PGPROC * PreparedXactProcs
Definition: proc.c:80
void StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
Definition: standby.c:1091
int smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
Definition: storage.c:852
#define ERRCODE_DUPLICATE_OBJECT
Definition: streamutil.c:32
Definition: dirent.c:26
void * user_fctx
Definition: funcapi.h:82
MemoryContext multi_call_memory_ctx
Definition: funcapi.h:101
TupleDesc tuple_desc
Definition: funcapi.h:112
TimestampTz prepared_at
Definition: twophase.c:151
TransactionId xid
Definition: twophase.c:162
XLogRecPtr prepare_start_lsn
Definition: twophase.c:160
XLogRecPtr prepare_end_lsn
Definition: twophase.c:161
GlobalTransaction next
Definition: twophase.c:149
ProcNumber locking_backend
Definition: twophase.c:165
char gid[GIDSIZE]
Definition: twophase.c:169
Definition: proc.h:157
TransactionId xmin
Definition: proc.h:173
LocalTransactionId lxid
Definition: proc.h:196
PROCLOCK * waitProcLock
Definition: proc.h:229
uint8 lwWaitMode
Definition: proc.h:220
uint8 statusFlags
Definition: proc.h:238
Oid databaseId
Definition: proc.h:203
pg_atomic_uint64 waitStart
Definition: proc.h:233
ProcNumber procNumber
Definition: proc.h:191
int pid
Definition: proc.h:178
bool isBackgroundWorker
Definition: proc.h:209
XidCacheStatus subxidStatus
Definition: proc.h:259
LOCK * waitLock
Definition: proc.h:228
TransactionId xid
Definition: proc.h:168
struct XidCache subxids
Definition: proc.h:261
int delayChkptFlags
Definition: proc.h:236
struct PGPROC::@117 vxid
dlist_head myProcLocks[NUM_LOCK_PARTITIONS]
Definition: proc.h:257
Oid roleId
Definition: proc.h:204
ProcWaitStatus waitStatus
Definition: proc.h:163
Oid tempNamespaceId
Definition: proc.h:206
dlist_node links
Definition: proc.h:159
uint8 lwWaiting
Definition: proc.h:219
struct StateFileChunk * next
Definition: twophase.c:1015
FullTransactionId nextXid
Definition: transam.h:220
TwoPhaseRmgrId rmid
Definition: twophase.c:1002
GlobalTransaction freeGXacts
Definition: twophase.c:179
GlobalTransaction prepXacts[FLEXIBLE_ARRAY_MEMBER]
Definition: twophase.c:185
GlobalTransaction array
Definition: twophase.c:698
bool overflowed
Definition: proc.h:46
uint8 count
Definition: proc.h:44
TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS]
Definition: proc.h:51
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:273
TimestampTz prepared_at
Definition: xact.h:352
int32 nabortrels
Definition: xact.h:356
int32 ninvalmsgs
Definition: xact.h:359
bool initfileinval
Definition: xact.h:360
int32 ncommitstats
Definition: xact.h:357
TimestampTz origin_timestamp
Definition: xact.h:363
uint16 gidlen
Definition: xact.h:361
uint32 total_len
Definition: xact.h:349
int32 nabortstats
Definition: xact.h:358
Oid database
Definition: xact.h:351
XLogRecPtr origin_lsn
Definition: xact.h:362
uint32 magic
Definition: xact.h:348
int32 ncommitrels
Definition: xact.h:355
TransactionId xid
Definition: xact.h:350
int32 nsubxacts
Definition: xact.h:354
uint32 total_len
Definition: twophase.c:1024
uint32 num_chunks
Definition: twophase.c:1022
StateFileChunk * head
Definition: twophase.c:1020
StateFileChunk * tail
Definition: twophase.c:1021
uint32 bytes_free
Definition: twophase.c:1023
void SubTransSetParent(TransactionId xid, TransactionId parent)
Definition: subtrans.c:85
bool superuser_arg(Oid roleid)
Definition: superuser.c:56
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:148
TransactionId TransactionIdLatest(TransactionId mainxid, int nxids, const TransactionId *xids)
Definition: transam.c:345
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:126
void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids)
Definition: transam.c:240
void TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids)
Definition: transam.c:270
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
bool TransactionIdDidAbort(TransactionId transactionId)
Definition: transam.c:188
bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition: transam.c:314
bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:329
#define InvalidTransactionId
Definition: transam.h:31
#define EpochFromFullTransactionId(x)
Definition: transam.h:47
static FullTransactionId FullTransactionIdFromU64(uint64 value)
Definition: transam.h:81
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
static FullTransactionId FullTransactionIdFromEpochAndXid(uint32 epoch, TransactionId xid)
Definition: transam.h:71
TupleDesc CreateTemplateTupleDesc(int natts)
Definition: tupdesc.c:67
void TupleDescInitEntry(TupleDesc desc, AttrNumber attributeNumber, const char *attributeName, Oid oidtypeid, int32 typmod, int attdim)
Definition: tupdesc.c:651
static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
Definition: twophase.c:1420
void RecoverPreparedTransactions(void)
Definition: twophase.c:2084
static bool twophaseExitRegistered
Definition: twophase.c:198
void restoreTwoPhaseData(void)
Definition: twophase.c:1898
bool LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, TimestampTz origin_prepare_timestamp)
Definition: twophase.c:2634
Size TwoPhaseShmemSize(void)
Definition: twophase.c:237
#define TWOPHASE_DIR
Definition: twophase.c:112
static void RecordTransactionAbortPrepared(TransactionId xid, int nchildren, TransactionId *children, int nrels, RelFileLocator *rels, int nstats, xl_xact_stats_item *stats, const char *gid)
Definition: twophase.c:2405
void RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, const void *data, uint32 len)
Definition: twophase.c:1280
int max_prepared_xacts
Definition: twophase.c:115
static FullTransactionId AdjustToFullTransactionId(TransactionId xid)
Definition: twophase.c:936
static void RecordTransactionCommitPrepared(TransactionId xid, int nchildren, TransactionId *children, int nrels, RelFileLocator *rels, int nstats, xl_xact_stats_item *stats, int ninvalmsgs, SharedInvalidationMessage *invalmsgs, bool initfileinval, const char *gid)
Definition: twophase.c:2307
static void RemoveGXact(GlobalTransaction gxact)
Definition: twophase.c:628
struct TwoPhaseStateData TwoPhaseStateData
static GlobalTransaction MyLockedGxact
Definition: twophase.c:196
static TwoPhaseStateData * TwoPhaseState
Definition: twophase.c:188
static void ProcessRecords(char *bufptr, TransactionId xid, const TwoPhaseCallback callbacks[])
Definition: twophase.c:1689
void AtAbort_Twophase(void)
Definition: twophase.c:304
static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid)
Definition: twophase.c:433
struct GlobalTransactionData GlobalTransactionData
static void save_state_data(const void *data, uint32 len)
Definition: twophase.c:1037
#define TWOPHASE_MAGIC
Definition: twophase.c:989
void FinishPreparedTransaction(const char *gid, bool isCommit)
Definition: twophase.c:1503
struct TwoPhaseRecordOnDisk TwoPhaseRecordOnDisk
TransactionId TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, bool *have_more)
Definition: twophase.c:852
static void GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, TransactionId *children)
Definition: twophase.c:504
void PrepareRedoRemove(TransactionId xid, bool giveWarning)
Definition: twophase.c:2582
Datum pg_prepared_xact(PG_FUNCTION_ARGS)
Definition: twophase.c:711
void EndPrepare(GlobalTransaction gxact)
Definition: twophase.c:1158
static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
Definition: twophase.c:1717
TransactionId PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
Definition: twophase.c:1962
static char * ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
Definition: twophase.c:1303
void StartPrepare(GlobalTransaction gxact)
Definition: twophase.c:1065
static int GetPreparedTransactionList(GlobalTransaction *gxacts)
Definition: twophase.c:666
PGPROC * TwoPhaseGetDummyProc(TransactionId xid, bool lock_held)
Definition: twophase.c:918
ProcNumber TwoPhaseGetDummyProcNumber(TransactionId xid, bool lock_held)
Definition: twophase.c:903
void TwoPhaseShmemInit(void)
Definition: twophase.c:253
void PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn, RepOriginId origin_id)
Definition: twophase.c:2480
static int TwoPhaseFilePath(char *path, TransactionId xid)
Definition: twophase.c:961
static GlobalTransaction TwoPhaseGetGXact(TransactionId xid, bool lock_held)
Definition: twophase.c:800
void StandbyRecoverPreparedTransactions(void)
Definition: twophase.c:2043
static void AtProcExit_Twophase(int code, Datum arg)
Definition: twophase.c:294
static char * ProcessTwoPhaseBuffer(TransactionId xid, XLogRecPtr prepare_start_lsn, bool fromdisk, bool setParent, bool setNextXid)
Definition: twophase.c:2187
static void MarkAsPrepared(GlobalTransaction gxact, bool lock_held)
Definition: twophase.c:530
void PostPrepare_Twophase(void)
Definition: twophase.c:344
xl_xact_prepare TwoPhaseFileHeader
Definition: twophase.c:991
void CheckPointTwoPhase(XLogRecPtr redo_horizon)
Definition: twophase.c:1816
struct StateFileChunk StateFileChunk
bool StandbyTransactionIdIsPrepared(TransactionId xid)
Definition: twophase.c:1475
static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
Definition: twophase.c:1736
GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid)
Definition: twophase.c:359
static GlobalTransaction LockGXact(const char *gid, Oid user)
Definition: twophase.c:552
static struct xllist records
struct GlobalTransactionData * GlobalTransaction
Definition: twophase.h:26
const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID+1]
Definition: twophase_rmgr.c:33
const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID+1]
Definition: twophase_rmgr.c:24
const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID+1]
Definition: twophase_rmgr.c:42
#define TWOPHASE_RM_MAX_ID
Definition: twophase_rmgr.h:29
uint8 TwoPhaseRmgrId
Definition: twophase_rmgr.h:19
#define TWOPHASE_RM_END_ID
Definition: twophase_rmgr.h:24
void(* TwoPhaseCallback)(TransactionId xid, uint16 info, void *recdata, uint32 len)
Definition: twophase_rmgr.h:17
static Datum TimestampTzGetDatum(TimestampTz X)
Definition: timestamp.h:52
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
TransamVariablesData * TransamVariables
Definition: varsup.c:34
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:82
static void pgstat_report_wait_end(void)
Definition: wait_event.h:98
#define fstat
Definition: win32_port.h:283
static const unsigned __int64 epoch
XLogRecPtr XactLogCommitRecord(TimestampTz commit_time, int nsubxacts, TransactionId *subxacts, int nrels, RelFileLocator *rels, int ndroppedstats, xl_xact_stats_item *droppedstats, int nmsgs, SharedInvalidationMessage *msgs, bool relcacheInval, int xactflags, TransactionId twophase_xid, const char *twophase_gid)
Definition: xact.c:5752
int xactGetCommittedChildren(TransactionId **ptr)
Definition: xact.c:5728
int MyXactFlags
Definition: xact.c:134
XLogRecPtr XactLogAbortRecord(TimestampTz abort_time, int nsubxacts, TransactionId *subxacts, int nrels, RelFileLocator *rels, int ndroppedstats, xl_xact_stats_item *droppedstats, int xactflags, TransactionId twophase_xid, const char *twophase_gid)
Definition: xact.c:5924
#define XLOG_XACT_PREPARE
Definition: xact.h:170
#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK
Definition: xact.h:108
#define XLOG_XACT_OPMASK
Definition: xact.h:179
#define GIDSIZE
Definition: xact.h:31
XLogRecPtr ProcLastRecPtr
Definition: xlog.c:253
bool RecoveryInProgress(void)
Definition: xlog.c:6290
XLogRecPtr XactLastRecEnd
Definition: xlog.c:254
int wal_segment_size
Definition: xlog.c:143
bool log_checkpoints
Definition: xlog.c:129
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2791
#define XLOG_INCLUDE_ORIGIN
Definition: xlog.h:152
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint16 RepOriginId
Definition: xlogdefs.h:65
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
void XLogRegisterData(char *data, uint32 len)
Definition: xloginsert.c:364
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogSetRecordFlags(uint8 flags)
Definition: xloginsert.c:456
void XLogBeginInsert(void)
Definition: xloginsert.c:149
void XLogEnsureRecordSpace(int max_block_id, int ndatas)
Definition: xloginsert.c:175
XLogRecord * XLogReadRecord(XLogReaderState *state, char **errormsg)
Definition: xlogreader.c:389
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:161
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:106
void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr)
Definition: xlogreader.c:231
#define XLogRecGetDataLen(decoder)
Definition: xlogreader.h:416
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:411
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
bool reachedConsistency
Definition: xlogrecovery.c:294
static XLogReaderState * xlogreader
Definition: xlogrecovery.c:188
void wal_segment_close(XLogReaderState *state)
Definition: xlogutils.c:842
void wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p)
Definition: xlogutils.c:817
int read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
Definition: xlogutils.c:861
#define InHotStandby
Definition: xlogutils.h:57