PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
standby.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * standby.c
4  * Misc functions used in Hot Standby mode.
5  *
6  * All functions for handling RM_STANDBY_ID, which relate to
7  * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8  * Plus conflict recovery processing.
9  *
10  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
11  * Portions Copyright (c) 1994, Regents of the University of California
12  *
13  * IDENTIFICATION
14  * src/backend/storage/ipc/standby.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 #include "access/transam.h"
20 #include "access/twophase.h"
21 #include "access/xact.h"
22 #include "access/xlog.h"
23 #include "access/xloginsert.h"
24 #include "miscadmin.h"
25 #include "pgstat.h"
26 #include "storage/bufmgr.h"
27 #include "storage/lmgr.h"
28 #include "storage/proc.h"
29 #include "storage/procarray.h"
30 #include "storage/sinvaladt.h"
31 #include "storage/standby.h"
32 #include "utils/ps_status.h"
33 #include "utils/timeout.h"
34 #include "utils/timestamp.h"
35 
36 /* User-settable GUC parameters */
38 int max_standby_archive_delay = 30 * 1000;
40 
42 
44  ProcSignalReason reason);
47 static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
48 
49 
50 /*
51  * InitRecoveryTransactionEnvironment
52  * Initialize tracking of in-progress transactions in master
53  *
54  * We need to issue shared invalidations and hold locks. Holding locks
55  * means others may want to wait on us, so we need to make a lock table
56  * vxact entry like a real transaction. We could create and delete
57  * lock table entries for each transaction but it's simpler just to create
58  * one permanent entry and leave it there all the time. Locks are then
59  * acquired and released as needed. Yes, this means you can see the
60  * Startup process in pg_locks once we have run this.
61  */
62 void
64 {
66 
67  /*
68  * Initialize shared invalidation management for Startup process, being
69  * careful to register ourselves as a sendOnly process so we don't need to
70  * read messages, nor will we get signalled when the queue starts filling
71  * up.
72  */
74 
75  /*
76  * Lock a virtual transaction id for Startup process.
77  *
78  * We need to do GetNextLocalTransactionId() because
79  * SharedInvalBackendInit() leaves localTransactionid invalid and the lock
80  * manager doesn't like that at all.
81  *
82  * Note that we don't need to run XactLockTableInsert() because nobody
83  * needs to wait on xids. That sounds a little strange, but table locks
84  * are held by vxids and row level locks are held by xids. All queries
85  * hold AccessShareLocks so never block while we write or lock new rows.
86  */
87  vxid.backendId = MyBackendId;
90 
92 }
93 
94 /*
95  * ShutdownRecoveryTransactionEnvironment
96  * Shut down transaction tracking
97  *
98  * Prepare to switch from hot standby mode to normal operation. Shut down
99  * recovery-time transaction tracking.
100  */
101 void
103 {
104  /* Mark all tracked in-progress transactions as finished. */
106 
107  /* Release all locks the tracked transactions were holding */
109 
110  /* Cleanup our VirtualTransaction */
112 }
113 
114 
115 /*
116  * -----------------------------------------------------
117  * Standby wait timers and backend cancel logic
118  * -----------------------------------------------------
119  */
120 
121 /*
122  * Determine the cutoff time at which we want to start canceling conflicting
123  * transactions. Returns zero (a time safely in the past) if we are willing
124  * to wait forever.
125  */
126 static TimestampTz
128 {
129  TimestampTz rtime;
130  bool fromStream;
131 
132  /*
133  * The cutoff time is the last WAL data receipt time plus the appropriate
134  * delay variable. Delay of -1 means wait forever.
135  */
136  GetXLogReceiptTime(&rtime, &fromStream);
137  if (fromStream)
138  {
140  return 0; /* wait forever */
142  }
143  else
144  {
146  return 0; /* wait forever */
148  }
149 }
150 
151 #define STANDBY_INITIAL_WAIT_US 1000
153 
154 /*
155  * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
156  * We wait here for a while then return. Returns true once the limit time
157  * has passed (we can't wait any more), or false if we can wait some more.
158  */
159 static bool
161 {
162  TimestampTz ltime;
163 
165 
166  /* Are we past the limit time? (A zero ltime means wait forever.) */
167  ltime = GetStandbyLimitTime();
168  if (ltime && GetCurrentTimestamp() >= ltime)
169  return true;
170 
171  /*
172  * Sleep a bit (this is essential to avoid busy-waiting).
173  */
175 
176  /*
177  * Progressively double the sleep time, but cap it at 1s (1000000 us),
178  * since pg_usleep isn't interruptible on some platforms.
179  */
180  standbyWait_us *= 2;
181  if (standbyWait_us > 1000000)
182  standbyWait_us = 1000000;
183 
184  return false;
185 }
186 
187 /*
188  * This is the main executioner for any query backend that conflicts with
189  * recovery processing. Judgement has already been passed on it within
190  * a specific rmgr. Here we just issue the orders to the procs. The procs
191  * then throw the required error as instructed.
192  */
193 static void
195  ProcSignalReason reason)
196 {
197  TimestampTz waitStart;
198  char *new_status;
199 
200  /* Fast exit, to avoid a kernel call if there's no work to be done. */
201  if (!VirtualTransactionIdIsValid(*waitlist))
202  return;
203 
204  waitStart = GetCurrentTimestamp();
205  new_status = NULL; /* we haven't changed the ps display */
206 
207  while (VirtualTransactionIdIsValid(*waitlist))
208  {
209  /* reset standbyWait_us for each xact we wait for */
211 
212  /* wait until the virtual xid is gone */
213  while (!VirtualXactLock(*waitlist, false))
214  {
215  /*
216  * Report via ps if we have been waiting for more than 500 msec
217  * (should that be configurable?)
218  */
219  if (update_process_title && new_status == NULL &&
221  500))
222  {
223  const char *old_status;
224  int len;
225 
226  old_status = get_ps_display(&len);
227  new_status = (char *) palloc(len + 8 + 1);
228  memcpy(new_status, old_status, len);
229  strcpy(new_status + len, " waiting");
230  set_ps_display(new_status, false);
231  new_status[len] = '\0'; /* truncate off " waiting" */
232  }
233 
234  /* Is it time to kill it? */
236  {
237  pid_t pid;
238 
239  /*
240  * Now find out who to throw out of the balloon.
241  */
243  pid = CancelVirtualTransaction(*waitlist, reason);
244 
245  /*
246  * Wait a little bit for it to die so that we avoid flooding
247  * an unresponsive backend when system is heavily loaded.
248  */
249  if (pid != 0)
250  pg_usleep(5000L);
251  }
252  }
253 
254  /* The virtual transaction is gone now, wait for the next one */
255  waitlist++;
256  }
257 
258  /* Reset ps display if we changed it */
259  if (new_status)
260  {
261  set_ps_display(new_status, false);
262  pfree(new_status);
263  }
264 }
265 
266 void
268 {
269  VirtualTransactionId *backends;
270 
271  /*
272  * If we get passed InvalidTransactionId then we are a little surprised,
273  * but it is theoretically possible in normal running. It also happens
274  * when replaying already applied WAL records after a standby crash or
275  * restart, or when replaying an XLOG_HEAP2_VISIBLE record that marks as
276  * frozen a page which was already all-visible. If latestRemovedXid is
277  * invalid then there is no conflict. That rule applies across all record
278  * types that suffer from this conflict.
279  */
280  if (!TransactionIdIsValid(latestRemovedXid))
281  return;
282 
283  backends = GetConflictingVirtualXIDs(latestRemovedXid,
284  node.dbNode);
285 
288 }
289 
290 void
292 {
293  VirtualTransactionId *temp_file_users;
294 
295  /*
296  * Standby users may be currently using this tablespace for their
297  * temporary files. We only care about current users because
298  * temp_tablespace parameter will just ignore tablespaces that no longer
299  * exist.
300  *
301  * Ask everybody to cancel their queries immediately so we can ensure no
302  * temp files remain and we can remove the tablespace. Nuke the entire
303  * site from orbit, it's the only way to be sure.
304  *
305  * XXX: We could work out the pids of active backends using this
306  * tablespace by examining the temp filenames in the directory. We would
307  * then convert the pids into VirtualXIDs before attempting to cancel
308  * them.
309  *
310  * We don't wait for commit because drop tablespace is non-transactional.
311  */
313  InvalidOid);
316 }
317 
318 void
320 {
321  /*
322  * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
323  * only waits for transactions and completely idle sessions would block
324  * us. This is rare enough that we do this as simply as possible: no wait,
325  * just force them off immediately.
326  *
327  * No locking is required here because we already acquired
328  * AccessExclusiveLock. Anybody trying to connect while we do this will
329  * block during InitPostgres() and then disconnect when they see the
330  * database has been removed.
331  */
332  while (CountDBBackends(dbid) > 0)
333  {
335 
336  /*
337  * Wait awhile for them to die so that we avoid flooding an
338  * unresponsive backend when system is heavily loaded.
339  */
340  pg_usleep(10000);
341  }
342 }
343 
344 /*
345  * ResolveRecoveryConflictWithLock is called from ProcSleep()
346  * to resolve conflicts with other backends holding relation locks.
347  *
348  * The WaitLatch sleep normally done in ProcSleep()
349  * (when not InHotStandby) is performed here, for code clarity.
350  *
351  * We either resolve conflicts immediately or set a timeout to wake us at
352  * the limit of our patience.
353  *
354  * Resolve conflicts by canceling all backends holding a conflicting
355  * lock. As we are already queued to be granted the lock, no new lock
356  * requests conflicting with ours will be granted in the meantime.
357  *
358  * Deadlocks involving the Startup process and an ordinary backend process
359  * will be detected by the deadlock detector within the ordinary backend.
360  */
361 void
363 {
364  TimestampTz ltime;
365 
367 
368  ltime = GetStandbyLimitTime();
369 
370  if (ltime != 0 && GetCurrentTimestamp() >= ltime)
371  {
372  /*
373  * Past a finite limit (ltime of 0 means wait forever): clear a path now.
374  */
375  VirtualTransactionId *backends;
376 
377  backends = GetLockConflicts(&locktag, AccessExclusiveLock);
380  }
381  else if (ltime != 0)
382  {
383  /*
384  * Wait (or wait again) until ltime; with no limit we arm no timeout.
385  */
386  EnableTimeoutParams timeouts[1];
387 
388  timeouts[0].id = STANDBY_LOCK_TIMEOUT;
389  timeouts[0].type = TMPARAM_AT;
390  timeouts[0].fin_time = ltime;
391  enable_timeouts(timeouts, 1);
392  }
393 
394  /* Wait to be signaled by the release of the Relation Lock */
396 
397  /*
398  * Clear any timeout requests established above. We assume here that the
399  * Startup process doesn't have any other outstanding timeouts than those
400  * used by this function. If that stops being true, we could cancel the
401  * timeouts individually, but that'd be slower.
402  */
403  disable_all_timeouts(false);
404 }
405 
406 /*
407  * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
408  * to resolve conflicts with other backends holding buffer pins.
409  *
410  * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
411  * (when not InHotStandby) is performed here, for code clarity.
412  *
413  * We either resolve conflicts immediately or set a timeout to wake us at
414  * the limit of our patience.
415  *
416  * Resolve conflicts by sending a PROCSIG signal to all backends to check if
417  * they hold one of the buffer pins that is blocking Startup process. If so,
418  * those backends will take an appropriate error action, ERROR or FATAL.
419  *
420  * We also must check for deadlocks. Deadlocks occur because if queries
421  * wait on a lock, that must be behind an AccessExclusiveLock, which can only
422  * be cleared if the Startup process replays a transaction completion record.
423  * If Startup process is also waiting then that is a deadlock. The deadlock
424  * can occur if the query is waiting and then the Startup sleeps, or if
425  * Startup is sleeping and the query waits on a lock. We protect against
426  * only the former sequence here, the latter sequence is checked prior to
427  * the query sleeping, in CheckRecoveryConflictDeadlock().
428  *
429  * Deadlocks are extremely rare, and relatively expensive to check for,
430  * so we don't do a deadlock check right away ... only if we have had to wait
431  * at least deadlock_timeout.
432  */
433 void
435 {
436  TimestampTz ltime;
437 
439 
440  ltime = GetStandbyLimitTime();
441 
442  if (ltime == 0)
443  {
444  /*
445  * We're willing to wait forever for conflicts, so set timeout for
446  * deadlock check only
447  */
449  }
450  else if (GetCurrentTimestamp() >= ltime)
451  {
452  /*
453  * We're already behind, so clear a path as quickly as possible.
454  */
456  }
457  else
458  {
459  /*
460  * Wake up at ltime, and check for deadlocks as well if we will be
461  * waiting longer than deadlock_timeout
462  */
463  EnableTimeoutParams timeouts[2];
464 
465  timeouts[0].id = STANDBY_TIMEOUT;
466  timeouts[0].type = TMPARAM_AT;
467  timeouts[0].fin_time = ltime;
468  timeouts[1].id = STANDBY_DEADLOCK_TIMEOUT;
469  timeouts[1].type = TMPARAM_AFTER;
470  timeouts[1].delay_ms = DeadlockTimeout;
471  enable_timeouts(timeouts, 2);
472  }
473 
474  /* Wait to be signaled by UnpinBuffer() */
476 
477  /*
478  * Clear any timeout requests established above. We assume here that the
479  * Startup process doesn't have any other timeouts than what this function
480  * uses. If that stops being true, we could cancel the timeouts
481  * individually, but that'd be slower.
482  */
483  disable_all_timeouts(false);
484 }
485 
486 static void
488 {
491 
492  /*
493  * We send signal to all backends to ask them if they are holding the
494  * buffer pin which is delaying the Startup process. We must not set the
495  * conflict flag yet, since most backends will be innocent. Let the
496  * SIGUSR1 handling in each backend decide their own fate.
497  */
498  CancelDBBackends(InvalidOid, reason, false);
499 }
500 
501 /*
502  * In Hot Standby perform early deadlock detection. We abort the lock
503  * wait if we are about to sleep while holding the buffer pin that Startup
504  * process is waiting for.
505  *
506  * Note: this code is pessimistic, because there is no way for it to
507  * determine whether an actual deadlock condition is present: the lock we
508  * need to wait for might be unrelated to any held by the Startup process.
509  * Sooner or later, this mechanism should get ripped out in favor of somehow
510  * accounting for buffer locks in DeadLockCheck(). However, errors here
511  * seem to be very low-probability in practice, so for now it's not worth
512  * the trouble.
513  */
514 void
516 {
517  Assert(!InRecovery); /* do not call in Startup process */
518 
520  return;
521 
522  /*
523  * Error message should match ProcessInterrupts() but we avoid calling
524  * that because we aren't handling an interrupt at this point. Note that
525  * we only cancel the current transaction here, so if we are in a
526  * subtransaction and the pin is held by a parent, then the Startup
527  * process will continue to wait even though we have avoided deadlock.
528  */
529  ereport(ERROR,
530  (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
531  errmsg("canceling statement due to conflict with recovery"),
532  errdetail("User transaction caused buffer deadlock with recovery.")));
533 }
534 
535 
536 /* --------------------------------
537  * timeout handler routines
538  * --------------------------------
539  */
540 
541 /*
542  * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT
543  * occurs before STANDBY_TIMEOUT. Send out a request for hot-standby
544  * backends to check themselves for deadlocks.
545  */
546 void
548 {
550 }
551 
552 /*
553  * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
554  * Send out a request to release conflicting buffer pins unconditionally,
555  * so we can press ahead with applying changes in recovery.
556  */
557 void
559 {
560  /* forget any pending STANDBY_DEADLOCK_TIMEOUT request */
562 
564 }
565 
566 /*
567  * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
568  * This doesn't need to do anything, simply waking up is enough.
569  */
570 void
572 {
573 }
574 
575 /*
576  * -----------------------------------------------------
577  * Locking in Recovery Mode
578  * -----------------------------------------------------
579  *
580  * All locks are held by the Startup process using a single virtual
581  * transaction. This implementation is both simpler and in some senses,
582  * more correct. The locks held mean "some original transaction held
583  * this lock, so query access is not allowed at this time". So the Startup
584  * process is the proxy by which the original locks are implemented.
585  *
586  * We only keep track of AccessExclusiveLocks, which are only ever held by
587  * one transaction on one relation.
588  *
589  * We keep a single dynamically expandable list of locks in local memory,
590  * RecoveryLockList, so we can keep track of the various entries made by
591  * the Startup process's virtual xid in the shared lock table.
592  *
593  * We record the lock against the top-level xid, rather than individual
594  * subtransaction xids. This means AccessExclusiveLocks held by aborted
595  * subtransactions are not released as early as possible on standbys.
596  *
597  * List elements use type xl_standby_lock, since the WAL record type exactly
598  * matches the information that we need to keep track of.
599  *
600  * We use session locks rather than normal locks so we don't need
601  * ResourceOwners.
602  */
603 
604 
605 void
607 {
608  xl_standby_lock *newlock;
609  LOCKTAG locktag;
610 
611  /* Already processed? */
612  if (!TransactionIdIsValid(xid) ||
613  TransactionIdDidCommit(xid) ||
615  return;
616 
618  "adding recovery lock: db %u rel %u", dbOid, relOid);
619 
620  /* dbOid is InvalidOid when we are locking a shared relation. */
621  Assert(OidIsValid(relOid));
622 
623  newlock = palloc(sizeof(xl_standby_lock));
624  newlock->xid = xid;
625  newlock->dbOid = dbOid;
626  newlock->relOid = relOid;
627  RecoveryLockList = lappend(RecoveryLockList, newlock);
628 
629  SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
630 
631  LockAcquireExtended(&locktag, AccessExclusiveLock, true, false, false);
632 }
633 
634 static void
636 {
637  ListCell *cell,
638  *prev,
639  *next;
640 
641  /*
642  * Release all matching locks and remove them from list
643  */
644  prev = NULL;
645  for (cell = list_head(RecoveryLockList); cell; cell = next)
646  {
647  xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
648 
649  next = lnext(cell);
650 
651  if (!TransactionIdIsValid(xid) || lock->xid == xid)
652  {
653  LOCKTAG locktag;
654 
656  "releasing recovery lock: xid %u db %u rel %u",
657  lock->xid, lock->dbOid, lock->relOid);
658  SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
659  if (!LockRelease(&locktag, AccessExclusiveLock, true))
660  elog(LOG,
661  "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
662  lock->xid, lock->dbOid, lock->relOid);
663 
664  RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
665  pfree(lock);
666  }
667  else
668  prev = cell;
669  }
670 }
671 
672 /*
673  * Release locks for a transaction tree, starting at xid down, from
674  * RecoveryLockList.
675  *
676  * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
677  * to remove any AccessExclusiveLocks requested by a transaction.
678  */
679 void
681 {
682  int i;
683 
684  StandbyReleaseLocks(xid);
685 
686  for (i = 0; i < nsubxids; i++)
687  StandbyReleaseLocks(subxids[i]);
688 }
689 
690 /*
691  * Called at end of recovery and when we see a shutdown checkpoint.
692  */
693 void
695 {
696  ListCell *cell,
697  *prev,
698  *next;
699  LOCKTAG locktag;
700 
701  elog(trace_recovery(DEBUG2), "release all standby locks");
702 
703  prev = NULL;
704  for (cell = list_head(RecoveryLockList); cell; cell = next)
705  {
706  xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
707 
708  next = lnext(cell);
709 
711  "releasing recovery lock: xid %u db %u rel %u",
712  lock->xid, lock->dbOid, lock->relOid);
713  SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
714  if (!LockRelease(&locktag, AccessExclusiveLock, true))
715  elog(LOG,
716  "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
717  lock->xid, lock->dbOid, lock->relOid);
718  RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
719  pfree(lock);
720  }
721 }
722 
723 /*
724  * StandbyReleaseOldLocks
725  * Release standby locks held by top-level XIDs that aren't running (not
726  * listed in xids[0 .. nxids-1]), as long as they're not prepared transactions.
727  */
728 void
730 {
731  ListCell *cell,
732  *prev,
733  *next;
734  LOCKTAG locktag;
735 
736  prev = NULL;
737  for (cell = list_head(RecoveryLockList); cell; cell = next)
738  {
739  xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
740  bool remove = false;
741 
742  next = lnext(cell);
743 
745 
747  remove = false;
748  else
749  {
750  int i;
751  bool found = false;
752 
753  for (i = 0; i < nxids; i++)
754  {
755  if (lock->xid == xids[i])
756  {
757  found = true;
758  break;
759  }
760  }
761 
762  /*
763  * If it's not a running transaction, remove it.
764  */
765  if (!found)
766  remove = true;
767  }
768 
769  if (remove)
770  {
772  "releasing recovery lock: xid %u db %u rel %u",
773  lock->xid, lock->dbOid, lock->relOid);
774  SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
775  if (!LockRelease(&locktag, AccessExclusiveLock, true))
776  elog(LOG,
777  "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
778  lock->xid, lock->dbOid, lock->relOid);
779  RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
780  pfree(lock);
781  }
782  else
783  prev = cell;
784  }
785 }
786 
787 /*
788  * --------------------------------------------------------------------
789  * Recovery handling for Rmgr RM_STANDBY_ID
790  *
791  * These record types will only be created if XLogStandbyInfoActive()
792  * --------------------------------------------------------------------
793  */
794 
795 void
797 {
798  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
799 
800  /* Backup blocks are not used in standby records */
801  Assert(!XLogRecHasAnyBlockRefs(record));
802 
803  /* Do nothing if we're not in hot standby mode */
805  return;
806 
807  if (info == XLOG_STANDBY_LOCK)
808  {
809  xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
810  int i;
811 
812  for (i = 0; i < xlrec->nlocks; i++)
814  xlrec->locks[i].dbOid,
815  xlrec->locks[i].relOid);
816  }
817  else if (info == XLOG_RUNNING_XACTS)
818  {
819  xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
820  RunningTransactionsData running;
821 
822  running.xcnt = xlrec->xcnt;
823  running.subxcnt = xlrec->subxcnt;
824  running.subxid_overflow = xlrec->subxid_overflow;
825  running.nextXid = xlrec->nextXid;
826  running.latestCompletedXid = xlrec->latestCompletedXid;
827  running.oldestRunningXid = xlrec->oldestRunningXid;
828  running.xids = xlrec->xids;
829 
830  ProcArrayApplyRecoveryInfo(&running);
831  }
832  else if (info == XLOG_INVALIDATIONS)
833  {
834  xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
835 
837  xlrec->nmsgs,
838  xlrec->relcacheInitFileInval,
839  xlrec->dbId,
840  xlrec->tsId);
841  }
842  else
843  elog(PANIC, "standby_redo: unknown op code %u", info);
844 }
845 
846 /*
847  * Log details of the current snapshot to WAL. This allows the snapshot state
848  * to be reconstructed on the standby and for logical decoding.
849  *
850  * This is used for Hot Standby as follows:
851  *
852  * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
853  * start from a shutdown checkpoint because we know nothing was running
854  * at that time and our recovery snapshot is known empty. In the more
855  * typical case of an online checkpoint we need to jump through a few
856  * hoops to get a correct recovery snapshot and this requires a two or
857  * sometimes a three stage process.
858  *
859  * The initial snapshot must contain all running xids and all current
860  * AccessExclusiveLocks at a point in time on the standby. Assembling
861  * that information while the server is running requires many and
862  * various LWLocks, so we choose to derive that information piece by
863  * piece and then re-assemble that info on the standby. When that
864  * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
865  *
866  * Since locking on the primary when we derive the information is not
867  * strict, we note that there is a time window between the derivation and
868  * writing to WAL of the derived information. That allows race conditions
869  * that we must resolve, since xids and locks may enter or leave the
870  * snapshot during that window. This creates the issue that an xid or
871  * lock may start *after* the snapshot has been derived yet *before* the
872  * snapshot is logged in the running xacts WAL record. We resolve this by
873  * starting to accumulate changes at a point just prior to when we derive
874  * the snapshot on the primary, then ignore duplicates when we later apply
875  * the snapshot from the running xacts record. This is implemented during
876  * CreateCheckpoint() where we use the logical checkpoint location as
877  * our starting point and then write the running xacts record immediately
878  * before writing the main checkpoint WAL record. Since we always start
879  * up from a checkpoint and are immediately at our starting point, we
880  * unconditionally move to STANDBY_INITIALIZED. After this point we
881  * must do 4 things:
882  * * move shared nextXid forwards as we see new xids
883  * * extend the clog and subtrans with each new xid
884  * * keep track of uncommitted known assigned xids
885  * * keep track of uncommitted AccessExclusiveLocks
886  *
887  * When we see a commit/abort we must remove known assigned xids and locks
888  * from the completing transaction. Attempted removals that cannot locate
889  * an entry are expected and must not cause an error when we are in state
890  * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
891  * KnownAssignedXidsRemove().
892  *
893  * Later, when we apply the running xact data we must be careful to ignore
894  * transactions already committed, since those commits raced ahead when
895  * making WAL entries.
896  *
897  * The loose timing also means that locks may be recorded that have a
898  * zero xid, since xids are removed from procs before locks are removed.
899  * So we must prune the lock list down to ensure we hold locks only for
900  * currently running xids, performed by StandbyReleaseOldLocks().
901  * Zero xids should no longer be possible, but we may be replaying WAL
902  * from a time when they were possible.
903  *
904  * For logical decoding only the running xacts information is needed;
905  * there's no need to look at the locking information, but it's logged anyway,
906  * as there's no independent knob to just enable logical decoding. For
907  * details of how this is used, check snapbuild.c's introductory comment.
908  *
909  *
910  * Returns the RecPtr of the last inserted record.
911  */
914 {
915  XLogRecPtr recptr;
916  RunningTransactions running;
917  xl_standby_lock *locks;
918  int nlocks;
919 
921 
922  /*
923  * Get details of any AccessExclusiveLocks being held at the moment.
924  */
925  locks = GetRunningTransactionLocks(&nlocks);
926  if (nlocks > 0)
927  LogAccessExclusiveLocks(nlocks, locks);
928  pfree(locks);
929 
930  /*
931  * Log details of all in-progress transactions. This should be the last
932  * record we write, because standby will open up when it sees this.
933  */
934  running = GetRunningTransactionData();
935 
936  /*
937  * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
938  * For Hot Standby this can be done before inserting the WAL record
939  * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
940  * the clog. For logical decoding, though, the lock can't be released
941  * early because the clog might be "in the future" from the POV of the
942  * historic snapshot. This would allow for situations where we're waiting
943  * for the end of a transaction listed in the xl_running_xacts record
944  * which, according to the WAL, has committed before the xl_running_xacts
945  * record. Fortunately this routine isn't executed frequently, and it's
946  * only a shared lock.
947  */
949  LWLockRelease(ProcArrayLock);
950 
951  recptr = LogCurrentRunningXacts(running);
952 
953  /* Release lock if we kept it longer ... */
955  LWLockRelease(ProcArrayLock);
956 
957  /* GetRunningTransactionData() acquired XidGenLock, we must release it */
958  LWLockRelease(XidGenLock);
959 
960  return recptr;
961 }
962 
963 /*
964  * Record an enhanced snapshot of running transactions into WAL.
965  *
966  * The definitions of RunningTransactionsData and xl_running_xacts are
967  * similar. We keep them separate because xl_running_xacts is a
968  * contiguous chunk of memory and never exists fully until it is assembled in
969  * WAL. The inserted records are marked as not being important for durability,
970  * to avoid triggering superfluous checkpoint / archiving activity.
971  */
972 static XLogRecPtr
974 {
975  xl_running_xacts xlrec;
976  XLogRecPtr recptr;
977 
978  xlrec.xcnt = CurrRunningXacts->xcnt;
979  xlrec.subxcnt = CurrRunningXacts->subxcnt;
980  xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
981  xlrec.nextXid = CurrRunningXacts->nextXid;
982  xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
983  xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
984 
985  /* Header */
986  XLogBeginInsert();
988  XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
989 
990  /* array of TransactionIds */
991  if (xlrec.xcnt > 0)
992  XLogRegisterData((char *) CurrRunningXacts->xids,
993  (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
994 
995  recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
996 
997  if (CurrRunningXacts->subxid_overflow)
999  "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
1000  CurrRunningXacts->xcnt,
1001  (uint32) (recptr >> 32), (uint32) recptr,
1002  CurrRunningXacts->oldestRunningXid,
1003  CurrRunningXacts->latestCompletedXid,
1004  CurrRunningXacts->nextXid);
1005  else
1007  "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
1008  CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
1009  (uint32) (recptr >> 32), (uint32) recptr,
1010  CurrRunningXacts->oldestRunningXid,
1011  CurrRunningXacts->latestCompletedXid,
1012  CurrRunningXacts->nextXid);
1013 
1014  /*
1015  * Ensure running_xacts information is synced to disk not too far in the
1016  * future. We don't want to stall anything though (i.e. use XLogFlush()),
1017  * so we let the wal writer do it during normal operation.
1018  * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
1019  * and nudge the WALWriter into action if sleeping. Check
1020  * XLogBackgroundFlush() for details why a record might not be flushed
1021  * without it.
1022  */
1023  XLogSetAsyncXactLSN(recptr);
1024 
1025  return recptr;
1026 }
1027 
1028 /*
1029  * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
1030  * logged, as described in backend/storage/lmgr/README.
1031  */
1032 static void
1034 {
1035  xl_standby_locks xlrec;
1036 
1037  xlrec.nlocks = nlocks;
1038 
1039  XLogBeginInsert();
1040  XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
1041  XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
1043 
1044  (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
1045 }
1046 
1047 /*
1048  * Individual logging of AccessExclusiveLocks for use during LockAcquire()
1049  */
1050 void
1052 {
1053  xl_standby_lock xlrec;
1054 
1055  xlrec.xid = GetTopTransactionId();
1056 
1057  /*
1058  * Decode the locktag back to the original values, to avoid sending lots
1059  * of empty bytes with every message. See lock.h to check how a locktag
1060  * is defined for LOCKTAG_RELATION
1061  */
1062  xlrec.dbOid = dbOid;
1063  xlrec.relOid = relOid;
1064 
1065  LogAccessExclusiveLocks(1, &xlrec);
1066 }
1067 
1068 /*
1069  * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
1070  */
1071 void
1073 {
1074  /*
1075  * Ensure that a TransactionId has been assigned to this transaction, for
1076  * two reasons, both related to lock release on the standby. First, we
1077  * must assign an xid so that RecordTransactionCommit() and
1078  * RecordTransactionAbort() do not optimise away the transaction
1079  * completion record which recovery relies upon to release locks. It's a
1080  * hack, but for a corner case not worth adding code for into the main
1081  * commit path. Second, we must assign an xid before the lock is recorded
1082  * in shared memory, otherwise a concurrently executing
1083  * GetRunningTransactionLocks() might see a lock associated with an
1084  * InvalidTransactionId which we later assert cannot happen.
1085  */
1086  (void) GetTopTransactionId();
1087 }
1088 
1089 /*
1090  * Emit WAL for invalidations. This currently is only used for commits without
1091  * an xid but which contain invalidations.
1092  */
1093 void
1095  bool relcacheInitFileInval)
1096 {
1097  xl_invalidations xlrec;
1098 
1099  /* prepare record */
1100  memset(&xlrec, 0, sizeof(xlrec));
1101  xlrec.dbId = MyDatabaseId;
1102  xlrec.tsId = MyDatabaseTableSpace;
1103  xlrec.relcacheInitFileInval = relcacheInitFileInval;
1104  xlrec.nmsgs = nmsgs;
1105 
1106  /* perform insertion */
1107  XLogBeginInsert();
1108  XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations);
1109  XLogRegisterData((char *) msgs,
1110  nmsgs * sizeof(SharedInvalidationMessage));
1111  XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
1112 }
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
Definition: standby.c:1033
void ProcArrayApplyRecoveryInfo(RunningTransactions running)
Definition: procarray.c:664
void ResolveRecoveryConflictWithLock(LOCKTAG locktag)
Definition: standby.c:362
static List * RecoveryLockList
Definition: standby.c:41
TransactionId oldestRunningXid
Definition: standby.h:76
pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
Definition: procarray.c:2620
#define PG_WAIT_LOCK
Definition: pgstat.h:719
static TimestampTz GetStandbyLimitTime(void)
Definition: standby.c:127
TimeoutId id
Definition: timeout.h:54
int CountDBBackends(Oid databaseid)
Definition: procarray.c:2722
void StandbyTimeoutHandler(void)
Definition: standby.c:558
int max_standby_archive_delay
Definition: standby.c:38
BackendId MyBackendId
Definition: globals.c:72
void VirtualXactLockTableCleanup(void)
Definition: lock.c:4244
static int32 next
Definition: blutils.c:210
uint32 TransactionId
Definition: c.h:393
void SharedInvalBackendInit(bool sendOnly)
Definition: sinvaladt.c:258
bool update_process_title
Definition: ps_status.c:35
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1688
static void StandbyReleaseLocks(TransactionId xid)
Definition: standby.c:635
TimeoutType type
Definition: timeout.h:55
int wal_level
Definition: xlog.c:102
int vacuum_defer_cleanup_age
Definition: standby.c:37
bool InRecovery
Definition: xlog.c:190
VirtualTransactionId * GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
Definition: procarray.c:2545
#define XLOG_INVALIDATIONS
Definition: standbydefs.h:36
static int standbyWait_us
Definition: standby.c:152
unsigned char uint8
Definition: c.h:263
#define XLOG_STANDBY_LOCK
Definition: standbydefs.h:34
Definition: lock.h:179
void set_ps_display(const char *activity, bool force)
Definition: ps_status.c:326
void LogAccessExclusiveLock(Oid dbOid, Oid relOid)
Definition: standby.c:1051
#define InHotStandby
Definition: xlog.h:74
int errcode(int sqlerrcode)
Definition: elog.c:575
TransactionId * xids
Definition: standby.h:79
TransactionId GetTopTransactionId(void)
Definition: xact.c:388
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:125
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
LocalTransactionId localTransactionId
Definition: lock.h:66
#define DEBUG4
Definition: elog.h:22
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1821
#define OidIsValid(objectId)
Definition: c.h:533
#define PANIC
Definition: elog.h:53
xl_standby_lock * GetRunningTransactionLocks(int *nlocks)
Definition: lock.c:3762
void ExpireAllKnownAssignedTransactionIds(void)
Definition: procarray.c:3254
Oid MyDatabaseTableSpace
Definition: globals.c:78
int trace_recovery(int trace_level)
Definition: elog.c:3753
TransactionId latestCompletedXid
Definition: standby.h:77
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1714
bool VirtualXactLock(VirtualTransactionId vxid, bool wait)
Definition: lock.c:4291
TransactionId xids[FLEXIBLE_ARRAY_MEMBER]
Definition: standbydefs.h:56
void pg_usleep(long microsec)
Definition: signal.c:53
double TimestampTz
Definition: timestamp.h:51
void enable_timeouts(const EnableTimeoutParams *timeouts, int count)
Definition: timeout.c:476
void pfree(void *pointer)
Definition: mcxt.c:992
#define XLogRecGetData(decoder)
Definition: xlogreader.h:202
void disable_all_timeouts(bool keep_indicators)
Definition: timeout.c:596
static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, ProcSignalReason reason)
Definition: standby.c:194
XLogRecPtr LogStandbySnapshot(void)
Definition: standby.c:913
static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
Definition: standby.c:973
#define ERROR
Definition: elog.h:43
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:434
#define STANDBY_INITIAL_WAIT_US
Definition: standby.c:151
LocalTransactionId GetNextLocalTransactionId(void)
Definition: sinvaladt.c:769
TransactionId latestCompletedXid
Definition: standbydefs.h:54
void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
Definition: procarray.c:2783
#define DEBUG2
Definition: elog.h:24
void VirtualXactLockTableInsert(VirtualTransactionId vxid)
Definition: lock.c:4221
#define SET_LOCKTAG_RELATION(locktag, dboid, reloid)
Definition: lock.h:194
const char * get_ps_display(int *displen)
Definition: ps_status.c:405
void standby_redo(XLogReaderState *record)
Definition: standby.c:796
bool relcacheInitFileInval
Definition: standbydefs.h:67
#define MinSizeOfInvalidations
Definition: standbydefs.h:72
void LogAccessExclusiveLockPrepare(void)
Definition: standby.c:1072
int errdetail(const char *fmt,...)
Definition: elog.c:873
#define InvalidTransactionId
Definition: transam.h:31
bool StandbyTransactionIdIsPrepared(TransactionId xid)
Definition: twophase.c:1295
static ListCell * list_head(const List *l)
Definition: pg_list.h:77
void StandbyLockTimeoutHandler(void)
Definition: standby.c:571
void StandbyDeadLockHandler(void)
Definition: standby.c:547
void StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
Definition: standby.c:680
unsigned int uint32
Definition: c.h:265
void CheckRecoveryConflictDeadlock(void)
Definition: standby.c:515
static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
Definition: standby.c:487
void XLogSetRecordFlags(uint8 flags)
Definition: xloginsert.c:397
#define lnext(lc)
Definition: pg_list.h:105
#define ereport(elevel, rest)
Definition: elog.h:122
bool TransactionIdDidAbort(TransactionId transactionId)
Definition: transam.c:181
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:198
#define MinSizeOfXactRunningXacts
Definition: standby.h:55
List * lappend(List *list, void *datum)
Definition: list.c:128
void StandbyReleaseAllLocks(void)
Definition: standby.c:694
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1739
TransactionId xid
Definition: lockdefs.h:51
#define VirtualTransactionIdIsValid(vxid)
Definition: lock.h:72
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
List * list_delete_cell(List *list, ListCell *cell, ListCell *prev)
Definition: list.c:528
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
Oid MyDatabaseId
Definition: globals.c:76
#define XLogStandbyInfoActive()
Definition: xlog.h:159
void StandbyReleaseOldLocks(int nxids, TransactionId *xids)
Definition: standby.c:729
#define InvalidOid
Definition: postgres_ext.h:36
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:80
void LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, bool relcacheInitFileInval)
Definition: standby.c:1094
void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs, int nmsgs, bool RelcacheInitFileInval, Oid dbid, Oid tsid)
Definition: inval.c:863
void ResolveRecoveryConflictWithTablespace(Oid tsid)
Definition: standby.c:291
void XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
Definition: xlog.c:2594
void ResolveRecoveryConflictWithDatabase(Oid dbid)
Definition: standby.c:319
#define PG_WAIT_BUFFER_PIN
Definition: pgstat.h:720
void enable_timeout_after(TimeoutId id, int delay_ms)
Definition: timeout.c:428
uint8 locktag_type
Definition: lock.h:185
#define NULL
Definition: c.h:226
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void InitRecoveryTransactionEnvironment(void)
Definition: standby.c:63
#define Assert(condition)
Definition: c.h:670
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
#define lfirst(lc)
Definition: pg_list.h:106
#define XLOG_MARK_UNIMPORTANT
Definition: xlog.h:193
BackendId backendId
Definition: lock.h:65
SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]
Definition: standbydefs.h:69
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:3675
TimestampTz fin_time
Definition: timeout.h:57
TransactionId nextXid
Definition: standbydefs.h:52
xl_standby_lock locks[FLEXIBLE_ARRAY_MEMBER]
Definition: standbydefs.h:41
#define AccessExclusiveLock
Definition: lockdefs.h:46
VirtualTransactionId * GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode)
Definition: lock.c:2745
TransactionId nextXid
Definition: standby.h:75
void * palloc(Size size)
Definition: mcxt.c:891
int errmsg(const char *fmt,...)
Definition: elog.c:797
void StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
Definition: standby.c:606
static bool WaitExceedsMaxStandbyDelay(void)
Definition: standby.c:160
int i
bool LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
Definition: lock.c:1818
void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node)
Definition: standby.c:267
ProcSignalReason
Definition: procsignal.h:30
TransactionId oldestRunningXid
Definition: standbydefs.h:53
#define XLOG_RUNNING_XACTS
Definition: standbydefs.h:35
RunningTransactions GetRunningTransactionData(void)
Definition: procarray.c:1925
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:204
int DeadlockTimeout
Definition: proc.c:60
void ShutdownRecoveryTransactionEnvironment(void)
Definition: standby.c:102
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:97
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
Definition: xlog.c:6070
#define elog
Definition: elog.h:219
void disable_timeout(TimeoutId id, bool keep_indicator)
Definition: timeout.c:525
#define TransactionIdIsValid(xid)
Definition: transam.h:41
LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock, bool dontWait, bool reportMemoryError)
Definition: lock.c:701
void XLogBeginInsert(void)
Definition: xloginsert.c:120
Definition: pg_list.h:45
HotStandbyState standbyState
Definition: xlog.c:193
#define offsetof(type, field)
Definition: c.h:550
int max_standby_streaming_delay
Definition: standby.c:39