PostgreSQL Source Code  git master
standby.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * standby.c
4  * Misc functions used in Hot Standby mode.
5  *
6  * All functions for handling RM_STANDBY_ID, which relate to
7  * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8  * Plus conflict recovery processing.
9  *
10  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
11  * Portions Copyright (c) 1994, Regents of the University of California
12  *
13  * IDENTIFICATION
14  * src/backend/storage/ipc/standby.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 #include "access/transam.h"
20 #include "access/twophase.h"
21 #include "access/xact.h"
22 #include "access/xlog.h"
23 #include "access/xloginsert.h"
24 #include "miscadmin.h"
25 #include "pgstat.h"
26 #include "storage/bufmgr.h"
27 #include "storage/lmgr.h"
28 #include "storage/proc.h"
29 #include "storage/procarray.h"
30 #include "storage/sinvaladt.h"
31 #include "storage/standby.h"
32 #include "utils/hsearch.h"
33 #include "utils/memutils.h"
34 #include "utils/ps_status.h"
35 #include "utils/timeout.h"
36 #include "utils/timestamp.h"
37 
38 /* User-settable GUC parameters */
40 int max_standby_archive_delay = 30 * 1000;
42 
44 
46  ProcSignalReason reason,
47  uint32 wait_event_info,
48  bool report_waiting);
51 static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
52 
53 /*
54  * Keep track of all the locks owned by a given transaction.
55  */
56 typedef struct RecoveryLockListsEntry
57 {
61 
62 /*
63  * InitRecoveryTransactionEnvironment
64  * Initialize tracking of our primary's in-progress transactions.
65  *
66  * We need to issue shared invalidations and hold locks. Holding locks
67  * means others may want to wait on us, so we need to make a lock table
68  * vxact entry like a real transaction. We could create and delete
69  * lock table entries for each transaction but it's simpler just to create
70  * one permanent entry and leave it there all the time. Locks are then
71  * acquired and released as needed. Yes, this means you can see the
72  * Startup process in pg_locks once we have run this.
73  */
74 void
76 {
78  HASHCTL hash_ctl;
79 
80  /*
81  * Initialize the hash table for tracking the list of locks held by each
82  * transaction.
83  */
84  memset(&hash_ctl, 0, sizeof(hash_ctl));
85  hash_ctl.keysize = sizeof(TransactionId);
86  hash_ctl.entrysize = sizeof(RecoveryLockListsEntry);
87  RecoveryLockLists = hash_create("RecoveryLockLists",
88  64,
89  &hash_ctl,
91 
92  /*
93  * Initialize shared invalidation management for Startup process, being
94  * careful to register ourselves as a sendOnly process so we don't need to
95  * read messages, nor will we get signaled when the queue starts filling
96  * up.
97  */
99 
100  /*
101  * Lock a virtual transaction id for Startup process.
102  *
103  * We need to do GetNextLocalTransactionId() because
104  * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
105  * manager doesn't like that at all.
106  *
107  * Note that we don't need to run XactLockTableInsert() because nobody
108  * needs to wait on xids. That sounds a little strange, but table locks
109  * are held by vxids and row level locks are held by xids. All queries
110  * hold AccessShareLocks so never block while we write or lock new rows.
111  */
112  vxid.backendId = MyBackendId;
115 
117 }
118 
119 /*
120  * ShutdownRecoveryTransactionEnvironment
121  * Shut down transaction tracking
122  *
123  * Prepare to switch from hot standby mode to normal operation. Shut down
124  * recovery-time transaction tracking.
125  */
126 void
128 {
129  /* Mark all tracked in-progress transactions as finished. */
131 
132  /* Release all locks the tracked transactions were holding */
134 
135  /* Destroy the hash table of locks. */
136  hash_destroy(RecoveryLockLists);
137  RecoveryLockLists = NULL;
138 
139  /* Cleanup our VirtualTransaction */
141 }
142 
143 
144 /*
145  * -----------------------------------------------------
146  * Standby wait timers and backend cancel logic
147  * -----------------------------------------------------
148  */
149 
150 /*
151  * Determine the cutoff time at which we want to start canceling conflicting
152  * transactions. Returns zero (a time safely in the past) if we are willing
153  * to wait forever.
154  */
155 static TimestampTz
157 {
158  TimestampTz rtime;
159  bool fromStream;
160 
161  /*
162  * The cutoff time is the last WAL data receipt time plus the appropriate
163  * delay variable. Delay of -1 means wait forever.
164  */
165  GetXLogReceiptTime(&rtime, &fromStream);
166  if (fromStream)
167  {
169  return 0; /* wait forever */
171  }
172  else
173  {
175  return 0; /* wait forever */
177  }
178 }
179 
180 #define STANDBY_INITIAL_WAIT_US 1000
182 
183 /*
184  * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
185  * We wait here for a while then return. If we decide we can't wait any
186  * more then we return true; if we can wait some more we return false.
187  */
188 static bool
190 {
191  TimestampTz ltime;
192 
194 
195  /* Are we past the limit time? */
196  ltime = GetStandbyLimitTime();
197  if (ltime && GetCurrentTimestamp() >= ltime)
198  return true;
199 
200  /*
201  * Sleep a bit (this is essential to avoid busy-waiting).
202  */
203  pgstat_report_wait_start(wait_event_info);
206 
207  /*
208  * Progressively increase the sleep times, but not to more than 1s, since
209  * pg_usleep isn't interruptible on some platforms.
210  */
211  standbyWait_us *= 2;
212  if (standbyWait_us > 1000000)
213  standbyWait_us = 1000000;
214 
215  return false;
216 }
217 
218 /*
219  * This is the main executioner for any query backend that conflicts with
220  * recovery processing. Judgement has already been passed on it within
221  * a specific rmgr. Here we just issue the orders to the procs. The procs
222  * then throw the required error as instructed.
223  *
224  * If report_waiting is true, "waiting" is reported in PS display if necessary.
225  * If the caller has already reported that, report_waiting should be false.
226  * Otherwise, "waiting" is reported twice unexpectedly.
227  */
228 static void
230  ProcSignalReason reason, uint32 wait_event_info,
231  bool report_waiting)
232 {
233  TimestampTz waitStart = 0;
234  char *new_status;
235 
236  /* Fast exit, to avoid a kernel call if there's no work to be done. */
237  if (!VirtualTransactionIdIsValid(*waitlist))
238  return;
239 
240  if (report_waiting)
241  waitStart = GetCurrentTimestamp();
242  new_status = NULL; /* we haven't changed the ps display */
243 
244  while (VirtualTransactionIdIsValid(*waitlist))
245  {
246  /* reset standbyWait_us for each xact we wait for */
248 
249  /* wait until the virtual xid is gone */
250  while (!VirtualXactLock(*waitlist, false))
251  {
252  /*
253  * Report via ps if we have been waiting for more than 500 msec
254  * (should that be configurable?)
255  */
256  if (update_process_title && new_status == NULL && report_waiting &&
258  500))
259  {
260  const char *old_status;
261  int len;
262 
263  old_status = get_ps_display(&len);
264  new_status = (char *) palloc(len + 8 + 1);
265  memcpy(new_status, old_status, len);
266  strcpy(new_status + len, " waiting");
267  set_ps_display(new_status);
268  new_status[len] = '\0'; /* truncate off " waiting" */
269  }
270 
271  /* Is it time to kill it? */
272  if (WaitExceedsMaxStandbyDelay(wait_event_info))
273  {
274  pid_t pid;
275 
276  /*
277  * Now find out who to throw out of the balloon.
278  */
280  pid = CancelVirtualTransaction(*waitlist, reason);
281 
282  /*
283  * Wait a little bit for it to die so that we avoid flooding
284  * an unresponsive backend when system is heavily loaded.
285  */
286  if (pid != 0)
287  pg_usleep(5000L);
288  }
289  }
290 
291  /* The virtual transaction is gone now, wait for the next one */
292  waitlist++;
293  }
294 
295  /* Reset ps display if we changed it */
296  if (new_status)
297  {
298  set_ps_display(new_status);
299  pfree(new_status);
300  }
301 }
302 
303 void
305 {
306  VirtualTransactionId *backends;
307 
308  /*
309  * If we get passed InvalidTransactionId then we are a little surprised,
310  * but it is theoretically possible in normal running. It also happens
311  * when replaying already applied WAL records after a standby crash or
312  * restart, or when replaying an XLOG_HEAP2_VISIBLE record that marks as
313  * frozen a page which was already all-visible. If latestRemovedXid is
314  * invalid then there is no conflict. That rule applies across all record
315  * types that suffer from this conflict.
316  */
317  if (!TransactionIdIsValid(latestRemovedXid))
318  return;
319 
320  backends = GetConflictingVirtualXIDs(latestRemovedXid,
321  node.dbNode);
322 
326  true);
327 }
328 
329 void
331 {
332  VirtualTransactionId *temp_file_users;
333 
334  /*
335  * Standby users may be currently using this tablespace for their
336  * temporary files. We only care about current users because
337  * temp_tablespaces parameter will just ignore tablespaces that no longer
338  * exist.
339  *
340  * Ask everybody to cancel their queries immediately so we can ensure no
341  * temp files remain and we can remove the tablespace. Nuke the entire
342  * site from orbit, it's the only way to be sure.
343  *
344  * XXX: We could work out the pids of active backends using this
345  * tablespace by examining the temp filenames in the directory. We would
346  * then convert the pids into VirtualXIDs before attempting to cancel
347  * them.
348  *
349  * We don't wait for commit because drop tablespace is non-transactional.
350  */
352  InvalidOid);
356  true);
357 }
358 
359 void
361 {
362  /*
363  * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
364  * only waits for transactions and completely idle sessions would block
365  * us. This is rare enough that we do this as simply as possible: no wait,
366  * just force them off immediately.
367  *
368  * No locking is required here because we already acquired
369  * AccessExclusiveLock. Anybody trying to connect while we do this will
370  * block during InitPostgres() and then disconnect when they see the
371  * database has been removed.
372  */
373  while (CountDBBackends(dbid) > 0)
374  {
376 
377  /*
378  * Wait awhile for them to die so that we avoid flooding an
379  * unresponsive backend when system is heavily loaded.
380  */
381  pg_usleep(10000);
382  }
383 }
384 
385 /*
386  * ResolveRecoveryConflictWithLock is called from ProcSleep()
387  * to resolve conflicts with other backends holding relation locks.
388  *
389  * The WaitLatch sleep normally done in ProcSleep()
390  * (when not InHotStandby) is performed here, for code clarity.
391  *
392  * We either resolve conflicts immediately or set a timeout to wake us at
393  * the limit of our patience.
394  *
395  * Resolve conflicts by canceling all backends holding a conflicting
396  * lock. As we are already queued to be granted the lock, no new lock
397  * requests conflicting with ours will be granted in the meantime.
398  *
399  * Deadlocks involving the Startup process and an ordinary backend process
400  * will be detected by the deadlock detector within the ordinary backend.
401  */
402 void
404 {
405  TimestampTz ltime;
406 
408 
409  ltime = GetStandbyLimitTime();
410 
411  if (GetCurrentTimestamp() >= ltime)
412  {
413  /*
414  * We're already behind, so clear a path as quickly as possible.
415  */
416  VirtualTransactionId *backends;
417 
418  backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
419 
420  /*
421  * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
422  * "waiting" in PS display by disabling its argument report_waiting
423  * because the caller, WaitOnLock(), has already reported that.
424  */
427  PG_WAIT_LOCK | locktag.locktag_type,
428  false);
429  }
430  else
431  {
432  /*
433  * Wait (or wait again) until ltime
434  */
435  EnableTimeoutParams timeouts[1];
436 
437  timeouts[0].id = STANDBY_LOCK_TIMEOUT;
438  timeouts[0].type = TMPARAM_AT;
439  timeouts[0].fin_time = ltime;
440  enable_timeouts(timeouts, 1);
441  }
442 
443  /* Wait to be signaled by the release of the Relation Lock */
445 
446  /*
447  * Clear any timeout requests established above. We assume here that the
448  * Startup process doesn't have any other outstanding timeouts than those
449  * used by this function. If that stops being true, we could cancel the
450  * timeouts individually, but that'd be slower.
451  */
452  disable_all_timeouts(false);
453 }
454 
455 /*
456  * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
457  * to resolve conflicts with other backends holding buffer pins.
458  *
459  * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
460  * (when not InHotStandby) is performed here, for code clarity.
461  *
462  * We either resolve conflicts immediately or set a timeout to wake us at
463  * the limit of our patience.
464  *
465  * Resolve conflicts by sending a PROCSIG signal to all backends to check if
466  * they hold one of the buffer pins that is blocking Startup process. If so,
467  * those backends will take an appropriate error action, ERROR or FATAL.
468  *
469  * We also must check for deadlocks. Deadlocks occur because if queries
470  * wait on a lock, that must be behind an AccessExclusiveLock, which can only
471  * be cleared if the Startup process replays a transaction completion record.
472  * If Startup process is also waiting then that is a deadlock. The deadlock
473  * can occur if the query is waiting and then the Startup sleeps, or if
474  * Startup is sleeping and the query waits on a lock. We protect against
475  * only the former sequence here, the latter sequence is checked prior to
476  * the query sleeping, in CheckRecoveryConflictDeadlock().
477  *
478  * Deadlocks are extremely rare, and relatively expensive to check for,
479  * so we don't do a deadlock check right away ... only if we have had to wait
480  * at least deadlock_timeout.
481  */
482 void
484 {
485  TimestampTz ltime;
486 
488 
489  ltime = GetStandbyLimitTime();
490 
491  if (ltime == 0)
492  {
493  /*
494  * We're willing to wait forever for conflicts, so set timeout for
495  * deadlock check only
496  */
498  }
499  else if (GetCurrentTimestamp() >= ltime)
500  {
501  /*
502  * We're already behind, so clear a path as quickly as possible.
503  */
505  }
506  else
507  {
508  /*
509  * Wake up at ltime, and check for deadlocks as well if we will be
510  * waiting longer than deadlock_timeout
511  */
512  EnableTimeoutParams timeouts[2];
513 
514  timeouts[0].id = STANDBY_TIMEOUT;
515  timeouts[0].type = TMPARAM_AT;
516  timeouts[0].fin_time = ltime;
517  timeouts[1].id = STANDBY_DEADLOCK_TIMEOUT;
518  timeouts[1].type = TMPARAM_AFTER;
519  timeouts[1].delay_ms = DeadlockTimeout;
520  enable_timeouts(timeouts, 2);
521  }
522 
523  /* Wait to be signaled by UnpinBuffer() */
525 
526  /*
527  * Clear any timeout requests established above. We assume here that the
528  * Startup process doesn't have any other timeouts than what this function
529  * uses. If that stops being true, we could cancel the timeouts
530  * individually, but that'd be slower.
531  */
532  disable_all_timeouts(false);
533 }
534 
535 static void
537 {
540 
541  /*
542  * We send signal to all backends to ask them if they are holding the
543  * buffer pin which is delaying the Startup process. We must not set the
544  * conflict flag yet, since most backends will be innocent. Let the
545  * SIGUSR1 handling in each backend decide their own fate.
546  */
547  CancelDBBackends(InvalidOid, reason, false);
548 }
549 
550 /*
551  * In Hot Standby perform early deadlock detection. We abort the lock
552  * wait if we are about to sleep while holding the buffer pin that Startup
553  * process is waiting for.
554  *
555  * Note: this code is pessimistic, because there is no way for it to
556  * determine whether an actual deadlock condition is present: the lock we
557  * need to wait for might be unrelated to any held by the Startup process.
558  * Sooner or later, this mechanism should get ripped out in favor of somehow
559  * accounting for buffer locks in DeadLockCheck(). However, errors here
560  * seem to be very low-probability in practice, so for now it's not worth
561  * the trouble.
562  */
563 void
565 {
566  Assert(!InRecovery); /* do not call in Startup process */
567 
569  return;
570 
571  /*
572  * Error message should match ProcessInterrupts() but we avoid calling
573  * that because we aren't handling an interrupt at this point. Note that
574  * we only cancel the current transaction here, so if we are in a
575  * subtransaction and the pin is held by a parent, then the Startup
576  * process will continue to wait even though we have avoided deadlock.
577  */
578  ereport(ERROR,
579  (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
580  errmsg("canceling statement due to conflict with recovery"),
581  errdetail("User transaction caused buffer deadlock with recovery.")));
582 }
583 
584 
585 /* --------------------------------
586  * timeout handler routines
587  * --------------------------------
588  */
589 
590 /*
591  * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT
592  * occurs before STANDBY_TIMEOUT. Send out a request for hot-standby
593  * backends to check themselves for deadlocks.
594  */
595 void
597 {
599 }
600 
601 /*
602  * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
603  * Send out a request to release conflicting buffer pins unconditionally,
604  * so we can press ahead with applying changes in recovery.
605  */
606 void
608 {
609  /* forget any pending STANDBY_DEADLOCK_TIMEOUT request */
611 
613 }
614 
615 /*
616  * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
617  * This doesn't need to do anything, simply waking up is enough.
618  */
619 void
621 {
622 }
623 
624 /*
625  * -----------------------------------------------------
626  * Locking in Recovery Mode
627  * -----------------------------------------------------
628  *
629  * All locks are held by the Startup process using a single virtual
630  * transaction. This implementation is both simpler and in some senses,
631  * more correct. The locks held mean "some original transaction held
632  * this lock, so query access is not allowed at this time". So the Startup
633  * process is the proxy by which the original locks are implemented.
634  *
635  * We only keep track of AccessExclusiveLocks, which are only ever held by
636  * one transaction on one relation.
637  *
638  * We keep a hash table of lists of locks in local memory keyed by xid,
639  * RecoveryLockLists, so we can keep track of the various entries made by
640  * the Startup process's virtual xid in the shared lock table.
641  *
642  * List elements use type xl_standby_lock, since the WAL record type exactly
643  * matches the information that we need to keep track of.
644  *
645  * We use session locks rather than normal locks so we don't need
646  * ResourceOwners.
647  */
648 
649 
650 void
652 {
653  RecoveryLockListsEntry *entry;
654  xl_standby_lock *newlock;
655  LOCKTAG locktag;
656  bool found;
657 
658  /* Already processed? */
659  if (!TransactionIdIsValid(xid) ||
660  TransactionIdDidCommit(xid) ||
662  return;
663 
665  "adding recovery lock: db %u rel %u", dbOid, relOid);
666 
667  /* dbOid is InvalidOid when we are locking a shared relation. */
668  Assert(OidIsValid(relOid));
669 
670  /* Create a new list for this xid, if we don't have one already. */
671  entry = hash_search(RecoveryLockLists, &xid, HASH_ENTER, &found);
672  if (!found)
673  {
674  entry->xid = xid;
675  entry->locks = NIL;
676  }
677 
678  newlock = palloc(sizeof(xl_standby_lock));
679  newlock->xid = xid;
680  newlock->dbOid = dbOid;
681  newlock->relOid = relOid;
682  entry->locks = lappend(entry->locks, newlock);
683 
684  SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
685 
686  (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
687 }
688 
689 static void
691 {
692  while (locks)
693  {
694  xl_standby_lock *lock = (xl_standby_lock *) linitial(locks);
695  LOCKTAG locktag;
696 
698  "releasing recovery lock: xid %u db %u rel %u",
699  lock->xid, lock->dbOid, lock->relOid);
700  SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
701  if (!LockRelease(&locktag, AccessExclusiveLock, true))
702  {
703  elog(LOG,
704  "RecoveryLockLists contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
705  lock->xid, lock->dbOid, lock->relOid);
706  Assert(false);
707  }
708  pfree(lock);
709  locks = list_delete_first(locks);
710  }
711 }
712 
713 static void
715 {
716  RecoveryLockListsEntry *entry;
717 
718  if (TransactionIdIsValid(xid))
719  {
720  if ((entry = hash_search(RecoveryLockLists, &xid, HASH_FIND, NULL)))
721  {
723  hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
724  }
725  }
726  else
728 }
729 
730 /*
731  * Release locks for a transaction tree, starting at xid down, from
732  * RecoveryLockLists.
733  *
734  * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
735  * to remove any AccessExclusiveLocks requested by a transaction.
736  */
737 void
739 {
740  int i;
741 
742  StandbyReleaseLocks(xid);
743 
744  for (i = 0; i < nsubxids; i++)
745  StandbyReleaseLocks(subxids[i]);
746 }
747 
748 /*
749  * Called at end of recovery and when we see a shutdown checkpoint.
750  */
751 void
753 {
755  RecoveryLockListsEntry *entry;
756 
757  elog(trace_recovery(DEBUG2), "release all standby locks");
758 
759  hash_seq_init(&status, RecoveryLockLists);
760  while ((entry = hash_seq_search(&status)))
761  {
763  hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
764  }
765 }
766 
767 /*
768  * StandbyReleaseOldLocks
769  * Release standby locks held by top-level XIDs that aren't running,
770  * as long as they're not prepared transactions.
771  */
772 void
774 {
776  RecoveryLockListsEntry *entry;
777 
778  hash_seq_init(&status, RecoveryLockLists);
779  while ((entry = hash_seq_search(&status)))
780  {
782 
783  /* Skip if prepared transaction. */
785  continue;
786 
787  /* Skip if >= oldxid. */
788  if (!TransactionIdPrecedes(entry->xid, oldxid))
789  continue;
790 
791  /* Remove all locks and hash table entry. */
793  hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
794  }
795 }
796 
797 /*
798  * --------------------------------------------------------------------
799  * Recovery handling for Rmgr RM_STANDBY_ID
800  *
801  * These record types will only be created if XLogStandbyInfoActive()
802  * --------------------------------------------------------------------
803  */
804 
805 void
807 {
808  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
809 
810  /* Backup blocks are not used in standby records */
811  Assert(!XLogRecHasAnyBlockRefs(record));
812 
813  /* Do nothing if we're not in hot standby mode */
815  return;
816 
817  if (info == XLOG_STANDBY_LOCK)
818  {
819  xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
820  int i;
821 
822  for (i = 0; i < xlrec->nlocks; i++)
824  xlrec->locks[i].dbOid,
825  xlrec->locks[i].relOid);
826  }
827  else if (info == XLOG_RUNNING_XACTS)
828  {
829  xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
830  RunningTransactionsData running;
831 
832  running.xcnt = xlrec->xcnt;
833  running.subxcnt = xlrec->subxcnt;
834  running.subxid_overflow = xlrec->subxid_overflow;
835  running.nextXid = xlrec->nextXid;
836  running.latestCompletedXid = xlrec->latestCompletedXid;
837  running.oldestRunningXid = xlrec->oldestRunningXid;
838  running.xids = xlrec->xids;
839 
840  ProcArrayApplyRecoveryInfo(&running);
841  }
842  else if (info == XLOG_INVALIDATIONS)
843  {
844  xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
845 
847  xlrec->nmsgs,
848  xlrec->relcacheInitFileInval,
849  xlrec->dbId,
850  xlrec->tsId);
851  }
852  else
853  elog(PANIC, "standby_redo: unknown op code %u", info);
854 }
855 
856 /*
857  * Log details of the current snapshot to WAL. This allows the snapshot state
858  * to be reconstructed on the standby and for logical decoding.
859  *
860  * This is used for Hot Standby as follows:
861  *
862  * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
863  * start from a shutdown checkpoint because we know nothing was running
864  * at that time and our recovery snapshot is known empty. In the more
865  * typical case of an online checkpoint we need to jump through a few
866  * hoops to get a correct recovery snapshot and this requires a two or
867  * sometimes a three stage process.
868  *
869  * The initial snapshot must contain all running xids and all current
870  * AccessExclusiveLocks at a point in time on the standby. Assembling
871  * that information while the server is running requires many and
872  * various LWLocks, so we choose to derive that information piece by
873  * piece and then re-assemble that info on the standby. When that
874  * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
875  *
876  * Since locking on the primary when we derive the information is not
877  * strict, we note that there is a time window between the derivation and
878  * writing to WAL of the derived information. That allows race conditions
879  * that we must resolve, since xids and locks may enter or leave the
880  * snapshot during that window. This creates the issue that an xid or
881  * lock may start *after* the snapshot has been derived yet *before* the
882  * snapshot is logged in the running xacts WAL record. We resolve this by
883  * starting to accumulate changes at a point just prior to when we derive
884  * the snapshot on the primary, then ignore duplicates when we later apply
885  * the snapshot from the running xacts record. This is implemented during
886  * CreateCheckpoint() where we use the logical checkpoint location as
887  * our starting point and then write the running xacts record immediately
888  * before writing the main checkpoint WAL record. Since we always start
889  * up from a checkpoint and are immediately at our starting point, we
890  * unconditionally move to STANDBY_INITIALIZED. After this point we
891  * must do 4 things:
892  * * move shared nextXid forwards as we see new xids
893  * * extend the clog and subtrans with each new xid
894  * * keep track of uncommitted known assigned xids
895  * * keep track of uncommitted AccessExclusiveLocks
896  *
897  * When we see a commit/abort we must remove known assigned xids and locks
898  * from the completing transaction. Attempted removals that cannot locate
899  * an entry are expected and must not cause an error when we are in state
900  * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
901  * KnownAssignedXidsRemove().
902  *
903  * Later, when we apply the running xact data we must be careful to ignore
904  * transactions already committed, since those commits raced ahead when
905  * making WAL entries.
906  *
907  * The loose timing also means that locks may be recorded that have a
908  * zero xid, since xids are removed from procs before locks are removed.
909  * So we must prune the lock list down to ensure we hold locks only for
910  * currently running xids, performed by StandbyReleaseOldLocks().
911  * Zero xids should no longer be possible, but we may be replaying WAL
912  * from a time when they were possible.
913  *
914  * For logical decoding only the running xacts information is needed;
915  * there's no need to look at the locking information, but it's logged anyway,
916  * as there's no independent knob to just enable logical decoding. For
917  * details of how this is used, check snapbuild.c's introductory comment.
918  *
919  *
920  * Returns the RecPtr of the last inserted record.
921  */
924 {
925  XLogRecPtr recptr;
926  RunningTransactions running;
928  int nlocks;
929 
931 
932  /*
933  * Get details of any AccessExclusiveLocks being held at the moment.
934  */
935  locks = GetRunningTransactionLocks(&nlocks);
936  if (nlocks > 0)
937  LogAccessExclusiveLocks(nlocks, locks);
938  pfree(locks);
939 
940  /*
941  * Log details of all in-progress transactions. This should be the last
942  * record we write, because standby will open up when it sees this.
943  */
944  running = GetRunningTransactionData();
945 
946  /*
947  * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
948  * For Hot Standby this can be done before inserting the WAL record
949  * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
950  * the clog. For logical decoding, though, the lock can't be released
951  * early because the clog might be "in the future" from the POV of the
952  * historic snapshot. This would allow for situations where we're waiting
953  * for the end of a transaction listed in the xl_running_xacts record
954  * which, according to the WAL, has committed before the xl_running_xacts
955  * record. Fortunately this routine isn't executed frequently, and it's
956  * only a shared lock.
957  */
959  LWLockRelease(ProcArrayLock);
960 
961  recptr = LogCurrentRunningXacts(running);
962 
963  /* Release lock if we kept it longer ... */
965  LWLockRelease(ProcArrayLock);
966 
967  /* GetRunningTransactionData() acquired XidGenLock, we must release it */
968  LWLockRelease(XidGenLock);
969 
970  return recptr;
971 }
972 
973 /*
974  * Record an enhanced snapshot of running transactions into WAL.
975  *
976  * The definitions of RunningTransactionsData and xl_running_xacts are
977  * similar. We keep them separate because xl_running_xacts is a
978  * contiguous chunk of memory and never exists fully until it is assembled in
979  * WAL. The inserted records are marked as not being important for durability,
980  * to avoid triggering superfluous checkpoint / archiving activity.
981  */
982 static XLogRecPtr
984 {
985  xl_running_xacts xlrec;
986  XLogRecPtr recptr;
987 
988  xlrec.xcnt = CurrRunningXacts->xcnt;
989  xlrec.subxcnt = CurrRunningXacts->subxcnt;
990  xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
991  xlrec.nextXid = CurrRunningXacts->nextXid;
992  xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
993  xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
994 
995  /* Header */
996  XLogBeginInsert();
998  XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
999 
1000  /* array of TransactionIds */
1001  if (xlrec.xcnt > 0)
1002  XLogRegisterData((char *) CurrRunningXacts->xids,
1003  (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
1004 
1005  recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
1006 
1007  if (CurrRunningXacts->subxid_overflow)
1009  "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
1010  CurrRunningXacts->xcnt,
1011  (uint32) (recptr >> 32), (uint32) recptr,
1012  CurrRunningXacts->oldestRunningXid,
1013  CurrRunningXacts->latestCompletedXid,
1014  CurrRunningXacts->nextXid);
1015  else
1017  "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
1018  CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
1019  (uint32) (recptr >> 32), (uint32) recptr,
1020  CurrRunningXacts->oldestRunningXid,
1021  CurrRunningXacts->latestCompletedXid,
1022  CurrRunningXacts->nextXid);
1023 
1024  /*
1025  * Ensure running_xacts information is synced to disk not too far in the
1026  * future. We don't want to stall anything though (i.e. use XLogFlush()),
1027  * so we let the wal writer do it during normal operation.
1028  * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
1029  * and nudge the WALWriter into action if sleeping. Check
1030  * XLogBackgroundFlush() for details why a record might not be flushed
1031  * without it.
1032  */
1033  XLogSetAsyncXactLSN(recptr);
1034 
1035  return recptr;
1036 }
1037 
1038 /*
1039  * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
1040  * logged, as described in backend/storage/lmgr/README.
1041  */
1042 static void
1044 {
1045  xl_standby_locks xlrec;
1046 
1047  xlrec.nlocks = nlocks;
1048 
1049  XLogBeginInsert();
1050  XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
1051  XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
1053 
1054  (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
1055 }
1056 
1057 /*
1058  * Individual logging of AccessExclusiveLocks for use during LockAcquire()
1059  */
1060 void
1062 {
1063  xl_standby_lock xlrec;
1064 
1065  xlrec.xid = GetCurrentTransactionId();
1066 
1067  xlrec.dbOid = dbOid;
1068  xlrec.relOid = relOid;
1069 
1070  LogAccessExclusiveLocks(1, &xlrec);
1072 }
1073 
1074 /*
1075  * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
1076  */
1077 void
1079 {
1080  /*
1081  * Ensure that a TransactionId has been assigned to this transaction, for
1082  * two reasons, both related to lock release on the standby. First, we
1083  * must assign an xid so that RecordTransactionCommit() and
1084  * RecordTransactionAbort() do not optimise away the transaction
1085  * completion record which recovery relies upon to release locks. It's a
1086  * hack, but for a corner case not worth adding code for into the main
1087  * commit path. Second, we must assign an xid before the lock is recorded
1088  * in shared memory, otherwise a concurrently executing
1089  * GetRunningTransactionLocks() might see a lock associated with an
1090  * InvalidTransactionId which we later assert cannot happen.
1091  */
1092  (void) GetCurrentTransactionId();
1093 }
1094 
1095 /*
1096  * Emit WAL for invalidations. This currently is only used for commits without
1097  * an xid but which contain invalidations.
1098  */
1099 void
1101  bool relcacheInitFileInval)
1102 {
1103  xl_invalidations xlrec;
1104 
1105  /* prepare record */
1106  memset(&xlrec, 0, sizeof(xlrec));
1107  xlrec.dbId = MyDatabaseId;
1108  xlrec.tsId = MyDatabaseTableSpace;
1109  xlrec.relcacheInitFileInval = relcacheInitFileInval;
1110  xlrec.nmsgs = nmsgs;
1111 
1112  /* perform insertion */
1113  XLogBeginInsert();
1114  XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations);
1115  XLogRegisterData((char *) msgs,
1116  nmsgs * sizeof(SharedInvalidationMessage));
1117  XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
1118 }
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
Definition: standby.c:1043
static bool WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
Definition: standby.c:189
void ProcArrayApplyRecoveryInfo(RunningTransactions running)
Definition: procarray.c:985
static void StandbyReleaseLockList(List *locks)
Definition: standby.c:690
#define NIL
Definition: pg_list.h:65
void ResolveRecoveryConflictWithLock(LOCKTAG locktag)
Definition: standby.c:403
TransactionId oldestRunningXid
Definition: standby.h:76
pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
Definition: procarray.c:3270
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:827
#define PG_WAIT_LOCK
Definition: pgstat.h:785
static TimestampTz GetStandbyLimitTime(void)
Definition: standby.c:156
TimeoutId id
Definition: timeout.h:54
int CountDBBackends(Oid databaseid)
Definition: procarray.c:3371
void StandbyTimeoutHandler(void)
Definition: standby.c:607
int max_standby_archive_delay
Definition: standby.c:40
static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, ProcSignalReason reason, uint32 wait_event_info, bool report_waiting)
Definition: standby.c:229
BackendId MyBackendId
Definition: globals.c:81
LockAcquireResult LockAcquire(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock, bool dontWait)
Definition: lock.c:747
void VirtualXactLockTableCleanup(void)
Definition: lock.c:4414
static HTAB * RecoveryLockLists
Definition: standby.c:43
#define HASH_ELEM
Definition: hsearch.h:85
uint32 TransactionId
Definition: c.h:520
void SharedInvalBackendInit(bool sendOnly)
Definition: sinvaladt.c:257
bool update_process_title
Definition: ps_status.c:36
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1578
static void StandbyReleaseLocks(TransactionId xid)
Definition: standby.c:714
int64 TimestampTz
Definition: timestamp.h:39
TimeoutType type
Definition: timeout.h:55
int wal_level
Definition: xlog.c:107
int vacuum_defer_cleanup_age
Definition: standby.c:39
bool InRecovery
Definition: xlog.c:205
VirtualTransactionId * GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
Definition: procarray.c:3196
#define XLOG_INVALIDATIONS
Definition: standbydefs.h:36
static int standbyWait_us
Definition: standby.c:181
unsigned char uint8
Definition: c.h:372
#define XLOG_STANDBY_LOCK
Definition: standbydefs.h:34
Definition: lock.h:164
Size entrysize
Definition: hsearch.h:72
void LogAccessExclusiveLock(Oid dbOid, Oid relOid)
Definition: standby.c:1061
#define InHotStandby
Definition: xlog.h:74
int errcode(int sqlerrcode)
Definition: elog.c:610
TransactionId * xids
Definition: standby.h:79
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:919
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:125
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
LocalTransactionId localTransactionId
Definition: lock.h:65
#define DEBUG4
Definition: elog.h:22
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1677
#define OidIsValid(objectId)
Definition: c.h:651
#define PANIC
Definition: elog.h:53
xl_standby_lock * GetRunningTransactionLocks(int *nlocks)
Definition: lock.c:3933
void ExpireAllKnownAssignedTransactionIds(void)
Definition: procarray.c:4291
TransactionId xid
Definition: standby.c:58
Oid MyDatabaseTableSpace
Definition: globals.c:87
int trace_recovery(int trace_level)
Definition: elog.c:3543
TransactionId latestCompletedXid
Definition: standby.h:77
#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK
Definition: xact.h:107
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1812
bool VirtualXactLock(VirtualTransactionId vxid, bool wait)
Definition: lock.c:4461
void set_ps_display(const char *activity)
Definition: ps_status.c:349
TransactionId xids[FLEXIBLE_ARRAY_MEMBER]
Definition: standbydefs.h:56
void pg_usleep(long microsec)
Definition: signal.c:53
Definition: dynahash.c:218
void enable_timeouts(const EnableTimeoutParams *timeouts, int count)
Definition: timeout.c:483
void pfree(void *pointer)
Definition: mcxt.c:1057
#define XLogRecGetData(decoder)
Definition: xlogreader.h:310
void disable_all_timeouts(bool keep_indicators)
Definition: timeout.c:598
#define linitial(l)
Definition: pg_list.h:174
XLogRecPtr LogStandbySnapshot(void)
Definition: standby.c:923
static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
Definition: standby.c:983
#define ERROR
Definition: elog.h:43
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:483
#define STANDBY_INITIAL_WAIT_US
Definition: standby.c:180
LocalTransactionId GetNextLocalTransactionId(void)
Definition: sinvaladt.c:766
TransactionId latestCompletedXid
Definition: standbydefs.h:54
void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
Definition: procarray.c:3432
#define DEBUG2
Definition: elog.h:24
TransactionId GetCurrentTransactionId(void)
Definition: xact.c:438
void VirtualXactLockTableInsert(VirtualTransactionId vxid)
Definition: lock.c:4391
#define SET_LOCKTAG_RELATION(locktag, dboid, reloid)
Definition: lock.h:181
const char * get_ps_display(int *displen)
Definition: ps_status.c:430
void standby_redo(XLogReaderState *record)
Definition: standby.c:806
bool relcacheInitFileInval
Definition: standbydefs.h:67
#define MinSizeOfInvalidations
Definition: standbydefs.h:72
void LogAccessExclusiveLockPrepare(void)
Definition: standby.c:1078
int errdetail(const char *fmt,...)
Definition: elog.c:957
#define InvalidTransactionId
Definition: transam.h:31
bool StandbyTransactionIdIsPrepared(TransactionId xid)
Definition: twophase.c:1369
void StandbyLockTimeoutHandler(void)
Definition: standby.c:620
void StandbyDeadLockHandler(void)
Definition: standby.c:596
void StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
Definition: standby.c:738
unsigned int uint32
Definition: c.h:374
void CheckRecoveryConflictDeadlock(void)
Definition: standby.c:564
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1386
static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
Definition: standby.c:536
void XLogSetRecordFlags(uint8 flags)
Definition: xloginsert.c:404
bool TransactionIdDidAbort(TransactionId transactionId)
Definition: transam.c:181
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:305
#define MinSizeOfXactRunningXacts
Definition: standby.h:55
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:300
List * lappend(List *list, void *datum)
Definition: list.c:321
void StandbyReleaseAllLocks(void)
Definition: standby.c:752
int MyXactFlags
Definition: xact.c:132
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1796
TransactionId xid
Definition: lockdefs.h:54
#define VirtualTransactionIdIsValid(vxid)
Definition: lock.h:70
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:330
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:422
#define HASH_BLOBS
Definition: hsearch.h:86
HTAB * hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
Definition: dynahash.c:326
Oid MyDatabaseId
Definition: globals.c:85
Size keysize
Definition: hsearch.h:71
#define XLogStandbyInfoActive()
Definition: xlog.h:205
#define InvalidOid
Definition: postgres_ext.h:36
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:56
VirtualTransactionId * GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
Definition: lock.c:2912
#define ereport(elevel,...)
Definition: elog.h:144
void LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, bool relcacheInitFileInval)
Definition: standby.c:1100
void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs, int nmsgs, bool RelcacheInitFileInval, Oid dbid, Oid tsid)
Definition: inval.c:887
void ResolveRecoveryConflictWithTablespace(Oid tsid)
Definition: standby.c:330
void XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
Definition: xlog.c:2683
void ResolveRecoveryConflictWithDatabase(Oid dbid)
Definition: standby.c:360
#define PG_WAIT_BUFFER_PIN
Definition: pgstat.h:786
void enable_timeout_after(TimeoutId id, int delay_ms)
Definition: timeout.c:435
uint8 locktag_type
Definition: lock.h:170
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void InitRecoveryTransactionEnvironment(void)
Definition: standby.c:75
#define Assert(condition)
Definition: c.h:745
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
#define XLOG_MARK_UNIMPORTANT
Definition: xlog.h:239
BackendId backendId
Definition: lock.h:64
SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]
Definition: standbydefs.h:69
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:3918
TimestampTz fin_time
Definition: timeout.h:57
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1362
TransactionId nextXid
Definition: standbydefs.h:52
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1401
xl_standby_lock locks[FLEXIBLE_ARRAY_MEMBER]
Definition: standbydefs.h:41
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1391
struct RecoveryLockListsEntry RecoveryLockListsEntry
#define AccessExclusiveLock
Definition: lockdefs.h:45
TransactionId nextXid
Definition: standby.h:75
void * palloc(Size size)
Definition: mcxt.c:950
int errmsg(const char *fmt,...)
Definition: elog.c:824
void StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
Definition: standby.c:651
#define elog(elevel,...)
Definition: elog.h:214
int i
bool LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
Definition: lock.c:1975
void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node)
Definition: standby.c:304
ProcSignalReason
Definition: procsignal.h:30
TransactionId oldestRunningXid
Definition: standbydefs.h:53
#define XLOG_RUNNING_XACTS
Definition: standbydefs.h:35
RunningTransactions GetRunningTransactionData(void)
Definition: procarray.c:2554
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:312
int DeadlockTimeout
Definition: proc.c:60
void ShutdownRecoveryTransactionEnvironment(void)
Definition: standby.c:127
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
Definition: xlog.c:6220
void disable_timeout(TimeoutId id, bool keep_indicator)
Definition: timeout.c:532
#define TransactionIdIsValid(xid)
Definition: transam.h:41
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
void XLogBeginInsert(void)
Definition: xloginsert.c:123
Definition: pg_list.h:50
HotStandbyState standbyState
Definition: xlog.c:208
#define offsetof(type, field)
Definition: c.h:668
void StandbyReleaseOldLocks(TransactionId oldxid)
Definition: standby.c:773
int max_standby_streaming_delay
Definition: standby.c:41
List * list_delete_first(List *list)
Definition: list.c:860