PostgreSQL Source Code  git master
syncrep.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * syncrep.c
4  *
5  * Synchronous replication is new as of PostgreSQL 9.1.
6  *
7  * If requested, transaction commits wait until their commit LSNs are
8  * acknowledged by the synchronous standbys.
9  *
10  * This module contains the code for waiting and release of backends.
11  * All code in this module executes on the primary. The core streaming
12  * replication transport remains within WALreceiver/WALsender modules.
13  *
14  * The essence of this design is that it isolates all logic about
15  * waiting/releasing onto the primary. The primary defines which standbys
16  * it wishes to wait for. The standbys are completely unaware of the
17  * durability requirements of transactions on the primary, reducing the
18  * complexity of the code and streamlining both standby operations and
19  * network bandwidth because there is no requirement to ship
20  * per-transaction state information.
21  *
22  * Replication is either synchronous or not synchronous (async). If it is
23  * async, we just fastpath out of here. If it is sync, then we wait for
24  * the write, flush or apply location on the standby before releasing
25  * the waiting backend. Further complexity in that interaction is
26  * expected in later releases.
27  *
28  * The best performing way to manage the waiting backends is to have a
29  * single ordered queue of waiting backends, so that we can avoid
30  * searching through all waiters each time we receive a reply.
31  *
32  * In 9.5 or before only a single standby could be considered as
33  * synchronous. In 9.6 we support multiple priority-based synchronous
34  * standbys. In 10.0 multiple quorum-based synchronous standbys are also
35  * supported. The number of synchronous standbys that transactions
36  * must wait for replies from is specified in synchronous_standby_names.
37  * This parameter also specifies a list of standby names and the method
38  * (FIRST and ANY) to choose synchronous standbys from the listed ones.
39  *
40  * The method FIRST specifies a priority-based synchronous replication
41  * and makes transaction commits wait until their WAL records are
42  * replicated to the requested number of synchronous standbys chosen based
43  * on their priorities. The standbys whose names appear earlier in the list
44  * are given higher priority and will be considered as synchronous.
45  * Other standby servers appearing later in this list represent potential
46  * synchronous standbys. If any of the current synchronous standbys
47  * disconnects for whatever reason, it will be replaced immediately with
48  * the next-highest-priority standby.
49  *
50  * The method ANY specifies a quorum-based synchronous replication
51  * and makes transaction commits wait until their WAL records are
52  * replicated to at least the requested number of synchronous standbys
53  * in the list. All the standbys appearing in the list are considered as
54  * candidates for quorum synchronous standbys.
55  *
56  * If neither FIRST nor ANY is specified, FIRST is used as the method.
57  * This is for backward compatibility with 9.6 or before where only a
58  * priority-based sync replication was supported.
59  *
60  * Before the standbys chosen from synchronous_standby_names can
61  * become the synchronous standbys they must have caught up with
62  * the primary; that may take some time. Once caught up,
63  * the standbys which are considered as synchronous at that moment
64  * will release waiters from the queue.
65  *
66  * Portions Copyright (c) 2010-2020, PostgreSQL Global Development Group
67  *
68  * IDENTIFICATION
69  * src/backend/replication/syncrep.c
70  *
71  *-------------------------------------------------------------------------
72  */
73 #include "postgres.h"
74 
75 #include <unistd.h>
76 
77 #include "access/xact.h"
78 #include "miscadmin.h"
79 #include "pgstat.h"
80 #include "replication/syncrep.h"
81 #include "replication/walsender.h"
83 #include "storage/pmsignal.h"
84 #include "storage/proc.h"
85 #include "tcop/tcopprot.h"
86 #include "utils/builtins.h"
87 #include "utils/ps_status.h"
88 
89 /* User-settable parameters for sync rep */
91 
/* True if SyncRepStandbyNames is non-NULL and not the empty string. */
92 #define SyncStandbysDefined() \
93  (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0')
94 
/*
 * When true, the next time this walsender finds that it is managing a sync
 * standby it logs an announcement and then clears the flag.
 */
95 static bool announce_next_takeover = true;
96 
99 
/* Prototypes for file-local (static) helper functions. */
100 static void SyncRepQueueInsert(int mode);
101 static void SyncRepCancelWait(void);
102 static int SyncRepWakeQueue(bool all, int mode);
103 
104 static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr,
105  XLogRecPtr *flushPtr,
106  XLogRecPtr *applyPtr,
107  bool *am_sync);
108 static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
109  XLogRecPtr *flushPtr,
110  XLogRecPtr *applyPtr,
111  List *sync_standbys);
112 static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr,
113  XLogRecPtr *flushPtr,
114  XLogRecPtr *applyPtr,
115  List *sync_standbys, uint8 nth);
116 static int SyncRepGetStandbyPriority(void);
117 static List *SyncRepGetSyncStandbysPriority(bool *am_sync);
118 static List *SyncRepGetSyncStandbysQuorum(bool *am_sync);
119 static int cmp_lsn(const void *a, const void *b);
120 
/* Queue-order checker, declared only when assertion checking is compiled in. */
121 #ifdef USE_ASSERT_CHECKING
122 static bool SyncRepQueueIsOrderedByLSN(int mode);
123 #endif
124 
125 /*
126  * ===========================================================
127  * Synchronous Replication functions for normal user backends
128  * ===========================================================
129  */
130 
131 /*
132  * Wait for synchronous replication, if requested by user.
133  *
134  * Initially backends start in state SYNC_REP_NOT_WAITING and then
135  * change that state to SYNC_REP_WAITING before adding ourselves
136  * to the wait queue. During SyncRepWakeQueue() a WALSender changes
137  * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
138  * This backend then resets its state to SYNC_REP_NOT_WAITING.
139  *
140  * 'lsn' represents the LSN to wait for. 'commit' indicates whether this LSN
141  * represents a commit record. If it doesn't, then we wait only for the WAL
142  * to be flushed if synchronous_commit is set to the higher level of
143  * remote_apply, because only commit records provide apply feedback.
 *
 * Returns immediately, without queueing, if sync rep was not requested or
 * if the shared LSN for the chosen wait mode already covers 'lsn'.
144  */
145 void
146 SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
147 {
148  char *new_status = NULL;
149  const char *old_status;
150  int mode;
151 
152  /*
153  * This should be called while holding interrupts during a transaction
154  * commit to prevent the follow-up shared memory queue cleanups from
155  * being influenced by external interruptions.
156  */
158 
159  /* Cap the level for anything other than commit to remote flush only. */
160  if (commit)
161  mode = SyncRepWaitMode;
162  else
164 
165  /*
166  * Fast exit if user has not requested sync replication.
167  */
168  if (!SyncRepRequested())
169  return;
170 
172  Assert(WalSndCtl != NULL);
173 
174  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
176 
177  /*
178  * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
179  * set. See SyncRepUpdateSyncStandbysDefined.
180  *
181  * Also check that the standby hasn't already replied. Unlikely race
182  * condition but we'll be fetching that cache line anyway so it's likely
183  * to be a low cost check.
184  */
186  lsn <= WalSndCtl->lsn[mode])
187  {
188  LWLockRelease(SyncRepLock);
189  return;
190  }
191 
192  /*
193  * Set our waitLSN so WALSender will know when to wake us, and add
194  * ourselves to the queue.
195  */
196  MyProc->waitLSN = lsn;
198  SyncRepQueueInsert(mode);
199  Assert(SyncRepQueueIsOrderedByLSN(mode));
200  LWLockRelease(SyncRepLock);
201 
202  /* Alter ps display to show waiting for sync rep. */
204  {
205  int len;
206 
207  old_status = get_ps_display(&len);
 /*
  * The appended " waiting for %X/%X" suffix is at most 30 characters
  * (13 + 8 + 1 + 8), so 32 extra bytes plus the terminator suffice.
  */
208  new_status = (char *) palloc(len + 32 + 1);
209  memcpy(new_status, old_status, len);
210  sprintf(new_status + len, " waiting for %X/%X",
211  (uint32) (lsn >> 32), (uint32) lsn);
212  set_ps_display(new_status);
213  new_status[len] = '\0'; /* truncate off " waiting ..." */
214  }
215 
216  /*
217  * Wait for specified LSN to be confirmed.
218  *
219  * Each proc has its own wait latch, so we perform a normal latch
220  * check/wait loop here.
221  */
222  for (;;)
223  {
224  int rc;
225 
226  /* Must reset the latch before testing state. */
228 
229  /*
230  * Acquiring the lock is not needed, the latch ensures proper
231  * barriers. If it looks like we're done, we must really be done,
232  * because once walsender changes the state to SYNC_REP_WAIT_COMPLETE,
233  * it will never update it again, so we can't be seeing a stale value
234  * in that case.
235  */
237  break;
238 
239  /*
240  * If a wait for synchronous replication is pending, we can neither
241  * acknowledge the commit nor raise ERROR or FATAL. The latter would
242  * lead the client to believe that the transaction aborted, which is
243  * not true: it's already committed locally. The former is no good
244  * either: the client has requested synchronous replication, and is
245  * entitled to assume that an acknowledged commit is also replicated,
246  * which might not be true. So in this case we issue a WARNING (which
247  * some clients may be able to interpret) and shut off further output.
248  * We do NOT reset ProcDiePending, so that the process will die after
249  * the commit is cleaned up.
250  */
251  if (ProcDiePending)
252  {
254  (errcode(ERRCODE_ADMIN_SHUTDOWN),
255  errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
256  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
259  break;
260  }
261 
262  /*
263  * It's unclear what to do if a query cancel interrupt arrives. We
264  * can't actually abort at this point, but ignoring the interrupt
265  * altogether is not helpful, so we just terminate the wait with a
266  * suitable warning.
267  */
268  if (QueryCancelPending)
269  {
270  QueryCancelPending = false;
272  (errmsg("canceling wait for synchronous replication due to user request"),
273  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
275  break;
276  }
277 
278  /*
279  * Wait on latch. Any condition that should wake us up will set the
280  * latch, so no need for timeout.
281  */
284 
285  /*
286  * If the postmaster dies, we'll probably never get an acknowledgment,
287  * because all the wal sender processes will exit. So just bail out.
288  */
289  if (rc & WL_POSTMASTER_DEATH)
290  {
291  ProcDiePending = true;
294  break;
295  }
296  }
297 
298  /*
299  * WalSender has checked our LSN and has removed us from queue. Clean up
300  * state and leave. It's OK to reset these shared memory fields without
301  * holding SyncRepLock, because any walsenders will ignore us anyway when
302  * we're not on the queue. We need a read barrier to make sure we see the
303  * changes to the queue link (this might be unnecessary without
304  * assertions, but better safe than sorry).
305  */
306  pg_read_barrier();
309  MyProc->waitLSN = 0;
310 
311  if (new_status)
312  {
313  /* Reset ps display */
314  set_ps_display(new_status);
315  pfree(new_status);
316  }
317 }
318 
319 /*
320  * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant.
321  *
322  * Usually we will go at tail of queue, though it's possible that we arrive
323  * here out of order, so start at tail and work back to insertion point.
324  */
325 static void
327 {
328  PGPROC *proc;
329 
330  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
331  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
333  offsetof(PGPROC, syncRepLinks));
334 
335  while (proc)
336  {
337  /*
338  * Stop at the queue element that we should insert after, to ensure
339  * the queue stays ordered by LSN: the first element, walking back
 * from the tail, whose waitLSN is strictly less than ours.
340  */
341  if (proc->waitLSN < MyProc->waitLSN)
342  break;
343 
344  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
345  &(proc->syncRepLinks),
346  offsetof(PGPROC, syncRepLinks));
347  }
348 
349  if (proc)
351  else
353 }
354 
355 /*
356  * Cancel any wait currently in progress. SyncRepLock is taken in exclusive
 * mode here and released again before returning.
357  */
358 static void
360 {
361  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
365  LWLockRelease(SyncRepLock);
366 }
367 
/*
 * Backend-exit cleanup for sync rep queue membership. An unlocked test
 * serves as a fast path; only if it suggests we may still be queued do we
 * take SyncRepLock and recheck.
 */
368 void
370 {
371  /*
372  * First check if we are removed from the queue without the lock to not
373  * slow down backend exit.
374  */
376  {
377  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
378 
379  /* maybe we have just been removed, so recheck */
382 
383  LWLockRelease(SyncRepLock);
384  }
385 }
386 
387 /*
388  * ===========================================================
389  * Synchronous Replication functions for wal sender processes
390  * ===========================================================
391  */
392 
393 /*
394  * Take any action required to initialise sync rep state from config
395  * data. Called at WALSender startup and after each SIGHUP.
396  */
397 void
399 {
400  int priority;
401 
402  /*
403  * Determine if we are a potential sync standby and remember the result
404  * for handling replies from standby.
405  */
406  priority = SyncRepGetStandbyPriority();
407  if (MyWalSnd->sync_standby_priority != priority)
408  {
 /* Publish the changed priority; the field is written under SyncRepLock. */
409  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
410  MyWalSnd->sync_standby_priority = priority;
411  LWLockRelease(SyncRepLock);
412  ereport(DEBUG1,
413  (errmsg("standby \"%s\" now has synchronous standby priority %u",
414  application_name, priority)));
415  }
416 }
417 
418 /*
419  * Update the LSNs on each queue based upon our latest state. This
420  * implements a simple policy of first-valid-sync-standby-releases-waiter.
421  *
422  * Other policies are possible, which would change what we do here and
423  * perhaps also which information we store as well.
424  */
425 void
427 {
428  volatile WalSndCtlData *walsndctl = WalSndCtl;
429  XLogRecPtr writePtr;
430  XLogRecPtr flushPtr;
431  XLogRecPtr applyPtr;
432  bool got_recptr;
433  bool am_sync;
434  int numwrite = 0;
435  int numflush = 0;
436  int numapply = 0;
437 
438  /*
439  * If this WALSender is serving a standby that is not on the list of
440  * potential sync standbys then we have nothing to do. If we are still
441  * starting up, still running base backup or the current flush position is
442  * still invalid, then leave quickly also. Streaming or stopping WAL
443  * senders are allowed to release waiters.
444  */
445  if (MyWalSnd->sync_standby_priority == 0 ||
449  {
450  announce_next_takeover = true;
451  return;
452  }
453 
454  /*
455  * We're a potential sync standby. Release waiters if there are enough
456  * sync standbys and we are considered as sync.
457  */
458  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
459 
460  /*
461  * Check whether we are a sync standby or not, and calculate the synced
462  * positions among all sync standbys.
463  */
464  got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync);
465 
466  /*
467  * If we are managing a sync standby, though we weren't prior to this,
468  * then announce we are now a sync standby.
469  */
470  if (announce_next_takeover && am_sync)
471  {
472  announce_next_takeover = false;
473 
474  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
475  ereport(LOG,
476  (errmsg("standby \"%s\" is now a synchronous standby with priority %u",
478  else
479  ereport(LOG,
480  (errmsg("standby \"%s\" is now a candidate for quorum synchronous standby",
481  application_name)));
482  }
483 
484  /*
485  * If the number of sync standbys is less than requested or we aren't
486  * managing a sync standby then just leave.
487  */
488  if (!got_recptr || !am_sync)
489  {
490  LWLockRelease(SyncRepLock);
 /* Re-arm the announcement only if we are not currently sync. */
491  announce_next_takeover = !am_sync;
492  return;
493  }
494 
495  /*
496  * Set the lsn first so that when we wake backends they will release up to
497  * this location. Each shared LSN is only ever moved forward here.
498  */
499  if (walsndctl->lsn[SYNC_REP_WAIT_WRITE] < writePtr)
500  {
501  walsndctl->lsn[SYNC_REP_WAIT_WRITE] = writePtr;
502  numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE);
503  }
504  if (walsndctl->lsn[SYNC_REP_WAIT_FLUSH] < flushPtr)
505  {
506  walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = flushPtr;
507  numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH);
508  }
509  if (walsndctl->lsn[SYNC_REP_WAIT_APPLY] < applyPtr)
510  {
511  walsndctl->lsn[SYNC_REP_WAIT_APPLY] = applyPtr;
512  numapply = SyncRepWakeQueue(false, SYNC_REP_WAIT_APPLY);
513  }
514 
515  LWLockRelease(SyncRepLock);
516 
517  elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X",
518  numwrite, (uint32) (writePtr >> 32), (uint32) writePtr,
519  numflush, (uint32) (flushPtr >> 32), (uint32) flushPtr,
520  numapply, (uint32) (applyPtr >> 32), (uint32) applyPtr);
521 }
522 
523 /*
524  * Calculate the synced Write, Flush and Apply positions among sync standbys.
525  *
526  * The caller must hold SyncRepLock.
527  *
528  * Return false if the number of sync standbys is less than
529  * synchronous_standby_names specifies. Otherwise return true and
530  * store the positions into *writePtr, *flushPtr and *applyPtr.
 * When false is returned the three positions are left as InvalidXLogRecPtr.
531  *
532  * On return, *am_sync is set to true if this walsender is connecting to
533  * sync standby. Otherwise it's set to false.
534  */
535 static bool
537  XLogRecPtr *applyPtr, bool *am_sync)
538 {
539  List *sync_standbys;
540 
541  Assert(LWLockHeldByMe(SyncRepLock));
542 
543  *writePtr = InvalidXLogRecPtr;
544  *flushPtr = InvalidXLogRecPtr;
545  *applyPtr = InvalidXLogRecPtr;
546  *am_sync = false;
547 
548  /* Get standbys that are considered as synchronous at this moment */
549  sync_standbys = SyncRepGetSyncStandbys(am_sync);
550 
551  /*
552  * Quick exit if we are not managing a sync standby, there is no sync
553  * rep configuration, or there are not enough synchronous standbys.
554  */
555  if (!(*am_sync) ||
556  SyncRepConfig == NULL ||
557  list_length(sync_standbys) < SyncRepConfig->num_sync)
558  {
559  list_free(sync_standbys);
560  return false;
561  }
562 
563  /*
564  * In a priority-based sync replication, the synced positions are the
565  * oldest ones among sync standbys. In a quorum-based, they are the Nth
566  * latest ones.
567  *
568  * SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest
569  * positions. But we use SyncRepGetOldestSyncRecPtr() for that calculation
570  * because it's a bit more efficient.
571  *
572  * XXX If the numbers of current and requested sync standbys are the same,
573  * we can use SyncRepGetOldestSyncRecPtr() to calculate the synced
574  * positions even in a quorum-based sync replication.
575  */
576  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
577  {
578  SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr,
579  sync_standbys);
580  }
581  else
582  {
583  SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr,
584  sync_standbys, SyncRepConfig->num_sync);
585  }
586 
587  list_free(sync_standbys);
588  return true;
589 }
590 
591 /*
592  * Calculate the oldest Write, Flush and Apply positions among sync standbys.
 *
 * *writePtr, *flushPtr and *applyPtr are used as running minimums, with an
 * invalid value meaning "nothing seen yet", so the caller is expected to
 * pass them in as InvalidXLogRecPtr.
593  */
594 static void
596  XLogRecPtr *applyPtr, List *sync_standbys)
597 {
598  ListCell *cell;
599 
600  /*
601  * Scan through all sync standbys and calculate the oldest Write, Flush
602  * and Apply positions.
603  */
604  foreach(cell, sync_standbys)
605  {
606  WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
608  XLogRecPtr flush;
609  XLogRecPtr apply;
610 
 /* Copy the three positions while holding this walsender's spinlock. */
611  SpinLockAcquire(&walsnd->mutex);
612  write = walsnd->write;
613  flush = walsnd->flush;
614  apply = walsnd->apply;
615  SpinLockRelease(&walsnd->mutex);
616 
617  if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write)
618  *writePtr = write;
619  if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush)
620  *flushPtr = flush;
621  if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply)
622  *applyPtr = apply;
623  }
624 }
625 
626 /*
627  * Calculate the Nth latest Write, Flush and Apply positions among sync
628  * standbys.
 *
 * 'nth' is 1-based: element nth - 1 of each descending-sorted array is
 * returned, so it must be between 1 and list_length(sync_standbys).
 * The three position kinds are sorted independently of each other.
629  */
630 static void
632  XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
633 {
634  ListCell *cell;
635  XLogRecPtr *write_array;
636  XLogRecPtr *flush_array;
637  XLogRecPtr *apply_array;
638  int len;
639  int i = 0;
640 
641  len = list_length(sync_standbys);
642  write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
643  flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
644  apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
645 
646  foreach(cell, sync_standbys)
647  {
648  WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
649 
650  SpinLockAcquire(&walsnd->mutex);
651  write_array[i] = walsnd->write;
652  flush_array[i] = walsnd->flush;
653  apply_array[i] = walsnd->apply;
654  SpinLockRelease(&walsnd->mutex);
655 
656  i++;
657  }
658 
659  /* Sort each array in descending order */
660  qsort(write_array, len, sizeof(XLogRecPtr), cmp_lsn);
661  qsort(flush_array, len, sizeof(XLogRecPtr), cmp_lsn);
662  qsort(apply_array, len, sizeof(XLogRecPtr), cmp_lsn);
663 
664  /* Get Nth latest Write, Flush, Apply positions */
665  *writePtr = write_array[nth - 1];
666  *flushPtr = flush_array[nth - 1];
667  *applyPtr = apply_array[nth - 1];
668 
669  pfree(write_array);
670  pfree(flush_array);
671  pfree(apply_array);
672 }
673 
674 /*
675  * Compare lsn in order to sort array in descending order.
676  */
677 static int
678 cmp_lsn(const void *a, const void *b)
679 {
680  XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
681  XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
682 
683  if (lsn1 > lsn2)
684  return -1;
685  else if (lsn1 == lsn2)
686  return 0;
687  else
688  return 1;
689 }
690 
691 /*
692  * Return the list of sync standbys, or NIL if no sync standby is connected.
693  *
694  * The caller must hold SyncRepLock.
695  *
696  * On return, *am_sync is set to true if this walsender is connecting to
697  * sync standby. Otherwise it's set to false. am_sync may be NULL, in
 * which case it is not written by this function.
698  */
699 List *
701 {
702  Assert(LWLockHeldByMe(SyncRepLock));
703 
704  /* Set default result */
705  if (am_sync != NULL)
706  *am_sync = false;
707 
708  /* Quick exit if sync replication is not requested */
709  if (SyncRepConfig == NULL)
710  return NIL;
711 
712  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ?
715 }
716 
717 /*
718  * Return the list of all the candidates for quorum sync standbys,
719  * or NIL if no such standby is connected. The list holds integer
 * indexes into WalSndCtl->walsnds.
720  *
721  * The caller must hold SyncRepLock. This function must be called only in
722  * a quorum-based sync replication.
723  *
724  * On return, *am_sync is set to true if this walsender is connecting to
725  * sync standby. Otherwise it is left untouched here; am_sync may be NULL.
726  */
727 static List *
729 {
730  List *result = NIL;
731  int i;
732  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
733  * rearrangement */
734 
735  Assert(SyncRepConfig->syncrep_method == SYNC_REP_QUORUM);
736 
737  for (i = 0; i < max_wal_senders; i++)
738  {
739  XLogRecPtr flush;
741  int pid;
742 
743  walsnd = &WalSndCtl->walsnds[i];
744 
 /* Snapshot the fields we test below under the walsender's spinlock. */
745  SpinLockAcquire(&walsnd->mutex);
746  pid = walsnd->pid;
747  flush = walsnd->flush;
748  state = walsnd->state;
749  SpinLockRelease(&walsnd->mutex);
750 
751  /* Must be active */
752  if (pid == 0)
753  continue;
754 
755  /* Must be streaming or stopping */
756  if (state != WALSNDSTATE_STREAMING &&
757  state != WALSNDSTATE_STOPPING)
758  continue;
759 
760  /* Must be synchronous */
761  if (walsnd->sync_standby_priority == 0)
762  continue;
763 
764  /* Must have a valid flush position */
765  if (XLogRecPtrIsInvalid(flush))
766  continue;
767 
768  /*
769  * Consider this standby as a candidate for quorum sync standbys and
770  * append it to the result.
771  */
772  result = lappend_int(result, i);
773  if (am_sync != NULL && walsnd == MyWalSnd)
774  *am_sync = true;
775  }
776 
777  return result;
778 }
779 
780 /*
781  * Return the list of sync standbys chosen based on their priorities,
782  * or NIL if no sync standby is connected.
783  *
784  * If there are multiple standbys with the same priority,
785  * the first one found is selected preferentially.
786  *
787  * The caller must hold SyncRepLock. This function must be called only in
788  * a priority-based sync replication.
789  *
790  * On return, *am_sync is set to true if this walsender is connecting to
791  * sync standby. Otherwise it is left untouched here; am_sync may be NULL.
792  */
793 static List *
795 {
796  List *result = NIL;
797  List *pending = NIL;
798  int lowest_priority;
799  int next_highest_priority;
800  int this_priority;
801  int priority;
802  int i;
803  bool am_in_pending = false;
804  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
805  * rearrangement */
806 
807  Assert(SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY);
808 
809  lowest_priority = SyncRepConfig->nmembers;
810  next_highest_priority = lowest_priority + 1;
811 
812  /*
813  * Find the sync standbys which have the highest priority (i.e., 1). Also
814  * store all the other potential sync standbys into the pending list, in
815  * order to scan it later and find other sync standbys from it quickly.
816  */
817  for (i = 0; i < max_wal_senders; i++)
818  {
819  XLogRecPtr flush;
821  int pid;
822 
823  walsnd = &WalSndCtl->walsnds[i];
824 
825  SpinLockAcquire(&walsnd->mutex);
826  pid = walsnd->pid;
827  flush = walsnd->flush;
828  state = walsnd->state;
829  SpinLockRelease(&walsnd->mutex);
830 
831  /* Must be active */
832  if (pid == 0)
833  continue;
834 
835  /* Must be streaming or stopping */
836  if (state != WALSNDSTATE_STREAMING &&
837  state != WALSNDSTATE_STOPPING)
838  continue;
839 
840  /* Must be synchronous */
841  this_priority = walsnd->sync_standby_priority;
842  if (this_priority == 0)
843  continue;
844 
845  /* Must have a valid flush position */
846  if (XLogRecPtrIsInvalid(flush))
847  continue;
848 
849  /*
850  * If the priority is equal to 1, consider this standby as sync and
851  * append it to the result. Otherwise append this standby to the
852  * pending list to check if it's actually sync or not later.
853  */
854  if (this_priority == 1)
855  {
856  result = lappend_int(result, i);
857  if (am_sync != NULL && walsnd == MyWalSnd)
858  *am_sync = true;
859  if (list_length(result) == SyncRepConfig->num_sync)
860  {
861  list_free(pending);
862  return result; /* Exit if got enough sync standbys */
863  }
864  }
865  else
866  {
867  pending = lappend_int(pending, i);
868  if (am_sync != NULL && walsnd == MyWalSnd)
869  am_in_pending = true;
870 
871  /*
872  * Track the highest priority among the standbys in the pending
873  * list, in order to use it as the starting priority for later
874  * scan of the list. This is useful to find quickly the sync
875  * standbys from the pending list later because we can skip
876  * unnecessary scans for the unused priorities.
877  */
878  if (this_priority < next_highest_priority)
879  next_highest_priority = this_priority;
880  }
881  }
882 
883  /*
884  * Consider all pending standbys as sync if the number of them plus
885  * already-found sync ones does not exceed what the configuration requests.
886  */
887  if (list_length(result) + list_length(pending) <= SyncRepConfig->num_sync)
888  {
889  /*
890  * Set *am_sync to true if this walsender is in the pending list
891  * because all pending standbys are considered as sync.
892  */
893  if (am_sync != NULL && !(*am_sync))
894  *am_sync = am_in_pending;
895 
896  result = list_concat(result, pending);
897  list_free(pending);
898  return result;
899  }
900 
901  /*
902  * Find the sync standbys from the pending list, taking one priority
 * value per pass in increasing numeric order.
903  */
904  priority = next_highest_priority;
905  while (priority <= lowest_priority)
906  {
907  ListCell *cell;
908 
 /* Reset; recomputed below from the entries that stay in pending. */
909  next_highest_priority = lowest_priority + 1;
910 
911  foreach(cell, pending)
912  {
913  i = lfirst_int(cell);
914  walsnd = &WalSndCtl->walsnds[i];
915 
916  this_priority = walsnd->sync_standby_priority;
917  if (this_priority == priority)
918  {
919  result = lappend_int(result, i);
920  if (am_sync != NULL && walsnd == MyWalSnd)
921  *am_sync = true;
922 
923  /*
924  * We should always exit here after the scan of pending list
925  * starts because we know that the list has enough elements to
926  * reach SyncRepConfig->num_sync.
927  */
928  if (list_length(result) == SyncRepConfig->num_sync)
929  {
930  list_free(pending);
931  return result; /* Exit if got enough sync standbys */
932  }
933 
934  /*
935  * Remove the entry for this sync standby from the list to
936  * prevent us from looking at the same entry again.
937  */
938  pending = foreach_delete_current(pending, cell);
939 
940  continue; /* don't adjust next_highest_priority */
941  }
942 
943  if (this_priority < next_highest_priority)
944  next_highest_priority = this_priority;
945  }
946 
947  priority = next_highest_priority;
948  }
949 
950  /* never reached, but keep compiler quiet */
951  Assert(false);
952  return result;
953 }
954 
955 /*
956  * Check if we are in the list of sync standbys, and if so, determine
957  * priority sequence. Return priority if set, or zero to indicate that
958  * we are not a potential sync standby.
959  *
960  * Compare the parameter SyncRepStandbyNames against the application_name
961  * for this WALSender, or allow any name if we find a wildcard "*".
 * Names are compared case-insensitively; the priority is the 1-based
 * position of the first matching entry.
962  */
963 static int
965 {
966  const char *standby_name;
967  int priority;
968  bool found = false;
969 
970  /*
971  * Since synchronous cascade replication is not allowed, we always set the
972  * priority of cascading walsender to zero.
973  */
975  return 0;
976 
977  if (!SyncStandbysDefined() || SyncRepConfig == NULL)
978  return 0;
979 
 /*
  * member_names is walked as consecutive NUL-terminated strings: each
  * step skips strlen() + 1 bytes, for nmembers entries.
  */
980  standby_name = SyncRepConfig->member_names;
981  for (priority = 1; priority <= SyncRepConfig->nmembers; priority++)
982  {
983  if (pg_strcasecmp(standby_name, application_name) == 0 ||
984  strcmp(standby_name, "*") == 0)
985  {
986  found = true;
987  break;
988  }
989  standby_name += strlen(standby_name) + 1;
990  }
991 
992  if (!found)
993  return 0;
994 
995  /*
996  * In quorum-based sync replication, all the standbys in the list have the
997  * same priority, one.
998  */
999  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ? priority : 1;
1000 }
1001 
1002 /*
1003  * Walk the specified queue from head. Set the state of any backends that
1004  * need to be woken, remove them from the queue, and then wake them.
1005  * Pass all = true to wake whole queue; otherwise, just wake up to
1006  * the walsender's LSN.
1007  *
1008  * The caller must hold SyncRepLock in exclusive mode.
 *
 * Returns the number of backends that were woken.
1009  */
1010 static int
1011 SyncRepWakeQueue(bool all, int mode)
1012 {
1013  volatile WalSndCtlData *walsndctl = WalSndCtl;
1014  PGPROC *proc = NULL;
1015  PGPROC *thisproc = NULL;
1016  int numprocs = 0;
1017 
1018  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
1019  Assert(LWLockHeldByMeInMode(SyncRepLock, LW_EXCLUSIVE));
1020  Assert(SyncRepQueueIsOrderedByLSN(mode));
1021 
1022  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1024  offsetof(PGPROC, syncRepLinks));
1025 
1026  while (proc)
1027  {
1028  /*
1029  * Assume the queue is ordered by LSN, so the first waiter beyond the
 * shared LSN for this mode ends the scan.
1030  */
1031  if (!all && walsndctl->lsn[mode] < proc->waitLSN)
1032  return numprocs;
1033 
1034  /*
1035  * Move to next proc, so we can delete thisproc from the queue.
1036  * thisproc is valid, proc may be NULL after this.
1037  */
1038  thisproc = proc;
1039  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1040  &(proc->syncRepLinks),
1041  offsetof(PGPROC, syncRepLinks));
1042 
1043  /*
1044  * Remove thisproc from queue.
1045  */
1046  SHMQueueDelete(&(thisproc->syncRepLinks));
1047 
1048  /*
1049  * SyncRepWaitForLSN() reads syncRepState without holding the lock, so
1050  * make sure that it sees the queue link being removed before the
1051  * syncRepState change.
1052  */
1053  pg_write_barrier();
1054 
1055  /*
1056  * Set state to complete; see SyncRepWaitForLSN() for discussion of
1057  * the various states.
1058  */
1060 
1061  /*
1062  * Wake only when we have set state and removed from queue.
1063  */
1064  SetLatch(&(thisproc->procLatch));
1065 
1066  numprocs++;
1067  }
1068 
1069  return numprocs;
1070 }
1071 
1072 /*
1073  * The checkpointer calls this as needed to update the shared
1074  * sync_standbys_defined flag, so that backends don't remain permanently wedged
1075  * if synchronous_standby_names is unset. It's safe to check the current value
1076  * without the lock, because it's only ever updated by one process. But we
1077  * must take the lock to change it.
1078  */
1079 void
1081 {
1082  bool sync_standbys_defined = SyncStandbysDefined();
1083 
1084  if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
1085  {
1086  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
1087 
1088  /*
1089  * If synchronous_standby_names has been reset to empty, it's futile
1090  * for backends to continue waiting. Since the user no longer wants
1091  * synchronous replication, we'd better wake them up.
1092  */
1093  if (!sync_standbys_defined)
1094  {
1095  int i;
1096 
1097  for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
1098  SyncRepWakeQueue(true, i);
1099  }
1100 
1101  /*
1102  * Only allow people to join the queue when there are synchronous
1103  * standbys defined. Without this interlock, there's a race
1104  * condition: we might wake up all the current waiters; then, some
1105  * backend that hasn't yet reloaded its config might go to sleep on
1106  * the queue (and never wake up). This prevents that.
1107  */
1108  WalSndCtl->sync_standbys_defined = sync_standbys_defined;
1109 
1110  LWLockRelease(SyncRepLock);
1111  }
1112 }
1113 
#ifdef USE_ASSERT_CHECKING
/*
 * Assertion helper: report whether the wait queue for the given mode is
 * strictly increasing in waitLSN from head to tail.
 *
 * Returns false as soon as an entry's waitLSN is less than or equal to the
 * previous entry's (so duplicates count as a violation), true otherwise.
 * An empty queue is considered ordered.
 */
static bool
SyncRepQueueIsOrderedByLSN(int mode)
{
	PGPROC	   *proc = NULL;
	XLogRecPtr	lastLSN;

	Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);

	lastLSN = 0;

	/* Start at the head of the queue for this wait mode. */
	proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
								   &(WalSndCtl->SyncRepQueue[mode]),
								   offsetof(PGPROC, syncRepLinks));

	while (proc)
	{
		/*
		 * Check the queue is ordered by LSN and that multiple procs don't
		 * have matching LSNs
		 */
		if (proc->waitLSN <= lastLSN)
			return false;

		lastLSN = proc->waitLSN;

		proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
									   &(proc->syncRepLinks),
									   offsetof(PGPROC, syncRepLinks));
	}

	return true;
}
#endif
1148 
1149 /*
1150  * ===========================================================
1151  * Synchronous Replication functions executed by any process
1152  * ===========================================================
1153  */
1154 
1155 bool
1156 check_synchronous_standby_names(char **newval, void **extra, GucSource source)
1157 {
1158  if (*newval != NULL && (*newval)[0] != '\0')
1159  {
1160  int parse_rc;
1161  SyncRepConfigData *pconf;
1162 
1163  /* Reset communication variables to ensure a fresh start */
1164  syncrep_parse_result = NULL;
1165  syncrep_parse_error_msg = NULL;
1166 
1167  /* Parse the synchronous_standby_names string */
1168  syncrep_scanner_init(*newval);
1169  parse_rc = syncrep_yyparse();
1171 
1172  if (parse_rc != 0 || syncrep_parse_result == NULL)
1173  {
1174  GUC_check_errcode(ERRCODE_SYNTAX_ERROR);
1177  else
1178  GUC_check_errdetail("synchronous_standby_names parser failed");
1179  return false;
1180  }
1181 
1182  if (syncrep_parse_result->num_sync <= 0)
1183  {
1184  GUC_check_errmsg("number of synchronous standbys (%d) must be greater than zero",
1186  return false;
1187  }
1188 
1189  /* GUC extra value must be malloc'd, not palloc'd */
1190  pconf = (SyncRepConfigData *)
1192  if (pconf == NULL)
1193  return false;
1195 
1196  *extra = (void *) pconf;
1197 
1198  /*
1199  * We need not explicitly clean up syncrep_parse_result. It, and any
1200  * other cruft generated during parsing, will be freed when the
1201  * current memory context is deleted. (This code is generally run in
1202  * a short-lived context used for config file processing, so that will
1203  * not be very long.)
1204  */
1205  }
1206  else
1207  *extra = NULL;
1208 
1209  return true;
1210 }
1211 
1212 void
1213 assign_synchronous_standby_names(const char *newval, void *extra)
1214 {
1215  SyncRepConfig = (SyncRepConfigData *) extra;
1216 }
1217 
1218 void
1220 {
1221  switch (newval)
1222  {
1225  break;
1228  break;
1231  break;
1232  default:
1234  break;
1235  }
1236 }
void syncrep_scanner_finish(void)
#define NIL
Definition: pg_list.h:65
void SyncRepUpdateSyncStandbysDefined(void)
Definition: syncrep.c:1080
XLogRecPtr write
static PgChecksumMode mode
Definition: pg_checksums.c:61
volatile uint32 InterruptHoldoffCount
Definition: globals.c:36
void assign_synchronous_standby_names(const char *newval, void *extra)
Definition: syncrep.c:1213
Pointer SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:164
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
static List * SyncRepGetSyncStandbysPriority(bool *am_sync)
Definition: syncrep.c:794
bool LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
Definition: lwlock.c:1861
static void SyncRepCancelWait(void)
Definition: syncrep.c:359
volatile sig_atomic_t QueryCancelPending
Definition: globals.c:31
#define DEBUG1
Definition: elog.h:25
void syncrep_scanner_init(const char *query_string)
static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, bool *am_sync)
Definition: syncrep.c:536
#define DEBUG3
Definition: elog.h:23
#define SYNC_REP_PRIORITY
Definition: syncrep.h:36
bool LWLockHeldByMe(LWLock *l)
Definition: lwlock.c:1843
bool update_process_title
Definition: ps_status.c:36
#define write(a, b, c)
Definition: win32.h:14
#define GUC_check_errdetail
Definition: guc.h:415
static void SyncRepQueueInsert(int mode)
Definition: syncrep.c:326
PGPROC * MyProc
Definition: proc.c:67
char * syncrep_parse_error_msg
#define SYNC_REP_WAITING
Definition: syncrep.h:32
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:146
uint8 syncrep_method
Definition: syncrep.h:51
#define Min(x, y)
Definition: c.h:920
unsigned char uint8
Definition: c.h:365
#define SYNC_REP_NOT_WAITING
Definition: syncrep.h:31
WalSndCtlData * WalSndCtl
Definition: walsender.c:109
static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
Definition: syncrep.c:631
List * list_concat(List *list1, const List *list2)
Definition: list.c:515
int errcode(int sqlerrcode)
Definition: elog.c:610
#define GUC_check_errmsg
Definition: guc.h:411
WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER]
List * SyncRepGetSyncStandbys(bool *am_sync)
Definition: syncrep.c:700
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define LOG
Definition: elog.h:26
void SetLatch(Latch *latch)
Definition: latch.c:457
SyncRepConfigData * SyncRepConfig
Definition: syncrep.c:97
void GUC_check_errcode(int sqlerrcode)
Definition: guc.c:10920
static int SyncRepWakeQueue(bool all, int mode)
Definition: syncrep.c:1011
void ResetLatch(Latch *latch)
Definition: latch.c:540
slock_t mutex
Latch procLatch
Definition: proc.h:104
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:365
GucSource
Definition: guc.h:105
#define malloc(a)
Definition: header.h:50
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1727
#define foreach_delete_current(lst, cell)
Definition: pg_list.h:368
void set_ps_display(const char *activity)
Definition: ps_status.c:349
#define sprintf
Definition: port.h:195
#define SpinLockAcquire(lock)
Definition: spin.h:62
XLogRecPtr flush
void pfree(void *pointer)
Definition: mcxt.c:1056
static int cmp_lsn(const void *a, const void *b)
Definition: syncrep.c:678
#define SYNC_REP_NO_WAIT
Definition: syncrep.h:23
#define SYNC_REP_WAIT_APPLY
Definition: syncrep.h:26
#define lfirst_int(lc)
Definition: pg_list.h:191
void SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem)
Definition: shmqueue.c:108
static bool announce_next_takeover
Definition: syncrep.c:95
Definition: dest.h:89
const char * get_ps_display(int *displen)
Definition: ps_status.c:430
WalSndState state
void assign_synchronous_commit(int newval, void *extra)
Definition: syncrep.c:1219
#define SyncStandbysDefined()
Definition: syncrep.c:92
int errdetail(const char *fmt,...)
Definition: elog.c:957
unsigned int uint32
Definition: c.h:367
SHM_QUEUE SyncRepQueue[NUM_SYNC_REP_WAIT_MODE]
void SyncRepInitConfig(void)
Definition: syncrep.c:398
#define SYNC_REP_WAIT_FLUSH
Definition: syncrep.h:25
List * lappend_int(List *list, int datum)
Definition: list.c:339
#define SyncRepRequested()
Definition: syncrep.h:19
int max_wal_senders
Definition: walsender.c:121
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define WARNING
Definition: elog.h:40
#define SpinLockRelease(lock)
Definition: spin.h:64
char * SyncRepStandbyNames
Definition: syncrep.c:90
#define WL_POSTMASTER_DEATH
Definition: latch.h:128
Pointer SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:145
WalSnd * MyWalSnd
Definition: walsender.c:112
static List * SyncRepGetSyncStandbysQuorum(bool *am_sync)
Definition: syncrep.c:728
#define ereport(elevel,...)
Definition: elog.h:144
#define SYNC_REP_WAIT_COMPLETE
Definition: syncrep.h:33
#define SYNC_REP_QUORUM
Definition: syncrep.h:37
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
char member_names[FLEXIBLE_ARRAY_MEMBER]
Definition: syncrep.h:54
int sync_standby_priority
volatile sig_atomic_t ProcDiePending
Definition: globals.c:32
Definition: regguts.h:298
#define pg_read_barrier()
Definition: atomics.h:158
bool SHMQueueIsDetached(const SHM_QUEUE *queue)
Definition: shmqueue.c:47
SyncRepConfigData * syncrep_parse_result
static int list_length(const List *l)
Definition: pg_list.h:169
#define newval
bool check_synchronous_standby_names(char **newval, void **extra, GucSource source)
Definition: syncrep.c:1156
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1123
SHM_QUEUE syncRepLinks
Definition: proc.h:155
void SyncRepCleanupAtProcExit(void)
Definition: syncrep.c:369
WalSndState
XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE]
char * application_name
Definition: guc.c:568
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:824
void list_free(List *list)
Definition: list.c:1376
#define elog(elevel,...)
Definition: elog.h:214
int i
#define pg_write_barrier()
Definition: atomics.h:159
static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, List *sync_standbys)
Definition: syncrep.c:595
void SHMQueueDelete(SHM_QUEUE *queue)
Definition: shmqueue.c:68
struct Latch * MyLatch
Definition: globals.c:54
int syncrep_yyparse(void)
#define NUM_SYNC_REP_WAIT_MODE
Definition: syncrep.h:28
#define qsort(a, b, c, d)
Definition: port.h:479
int syncRepState
Definition: proc.h:154
CommandDest whereToSendOutput
Definition: postgres.c:91
XLogRecPtr apply
Definition: proc.h:95
Definition: pg_list.h:50
static int SyncRepWaitMode
Definition: syncrep.c:98
#define WL_LATCH_SET
Definition: latch.h:124
void SyncRepReleaseWaiters(void)
Definition: syncrep.c:426
XLogRecPtr waitLSN
Definition: proc.h:153
#define offsetof(type, field)
Definition: c.h:661
bool am_cascading_walsender
Definition: walsender.c:116
#define SYNC_REP_WAIT_WRITE
Definition: syncrep.h:24
static int SyncRepGetStandbyPriority(void)
Definition: syncrep.c:964