PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
syncrep.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * syncrep.c
4  *
5  * Synchronous replication is new as of PostgreSQL 9.1.
6  *
7  * If requested, transaction commits wait until their commit LSNs are
8  * acknowledged by the synchronous standbys.
9  *
10  * This module contains the code for waiting and release of backends.
11  * All code in this module executes on the primary. The core streaming
12  * replication transport remains within WALreceiver/WALsender modules.
13  *
14  * The essence of this design is that it isolates all logic about
15  * waiting/releasing onto the primary. The primary defines which standbys
16  * it wishes to wait for. The standbys are completely unaware of the
17  * durability requirements of transactions on the primary, reducing the
18  * complexity of the code and streamlining both standby operations and
19  * network bandwidth because there is no requirement to ship
20  * per-transaction state information.
21  *
22  * Replication is either synchronous or not synchronous (async). If it is
23  * async, we just fastpath out of here. If it is sync, then we wait for
24  * the write, flush or apply location on the standby before releasing
25  * the waiting backend. Further complexity in that interaction is
26  * expected in later releases.
27  *
28  * The best performing way to manage the waiting backends is to have a
29  * single ordered queue of waiting backends, so that we can avoid
30  * searching through all waiters each time we receive a reply.
31  *
32  * In 9.5 and earlier, only a single standby could be considered
33  * synchronous. 9.6 added support for multiple synchronous standbys chosen
34  * by priority, and 10.0 added quorum-based selection among multiple
35  * synchronous standbys. The number of synchronous standbys that transactions
36  * must wait for replies from is specified in synchronous_standby_names.
37  * This parameter also specifies a list of standby names and the method
38  * (FIRST and ANY) to choose synchronous standbys from the listed ones.
39  *
40  * The method FIRST specifies a priority-based synchronous replication
41  * and makes transaction commits wait until their WAL records are
42  * replicated to the requested number of synchronous standbys chosen based
43  * on their priorities. The standbys whose names appear earlier in the list
44  * are given higher priority and will be considered as synchronous.
45  * Other standby servers appearing later in this list represent potential
46  * synchronous standbys. If any of the current synchronous standbys
47  * disconnects for whatever reason, it will be replaced immediately with
48  * the next-highest-priority standby.
49  *
50  * The method ANY specifies a quorum-based synchronous replication
51  * and makes transaction commits wait until their WAL records are
52  * replicated to at least the requested number of synchronous standbys
53  * in the list. All the standbys appearing in the list are considered as
54  * candidates for quorum synchronous standbys.
55  *
56  * If neither FIRST nor ANY is specified, FIRST is used as the method.
57  * This is for backward compatibility with 9.6 or before where only a
58  * priority-based sync replication was supported.
59  *
60  * Before the standbys chosen from synchronous_standby_names can
61  * become the synchronous standbys they must have caught up with
62  * the primary; that may take some time. Once caught up,
63  * the standbys which are considered as synchronous at that moment
64  * will release waiters from the queue.
65  *
66  * Portions Copyright (c) 2010-2017, PostgreSQL Global Development Group
67  *
68  * IDENTIFICATION
69  * src/backend/replication/syncrep.c
70  *
71  *-------------------------------------------------------------------------
72  */
73 #include "postgres.h"
74 
75 #include <unistd.h>
76 
77 #include "access/xact.h"
78 #include "miscadmin.h"
79 #include "pgstat.h"
80 #include "replication/syncrep.h"
81 #include "replication/walsender.h"
83 #include "storage/pmsignal.h"
84 #include "storage/proc.h"
85 #include "tcop/tcopprot.h"
86 #include "utils/builtins.h"
87 #include "utils/ps_status.h"
88 
89 /* User-settable parameters for sync rep */
91 
92 #define SyncStandbysDefined() \
93  (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0')
94 
95 static bool announce_next_takeover = true;
96 
99 
100 static void SyncRepQueueInsert(int mode);
101 static void SyncRepCancelWait(void);
102 static int SyncRepWakeQueue(bool all, int mode);
103 
104 static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr,
105  XLogRecPtr *flushPtr,
106  XLogRecPtr *applyPtr,
107  bool *am_sync);
108 static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
109  XLogRecPtr *flushPtr,
110  XLogRecPtr *applyPtr,
111  List *sync_standbys);
112 static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr,
113  XLogRecPtr *flushPtr,
114  XLogRecPtr *applyPtr,
115  List *sync_standbys, uint8 nth);
116 static int SyncRepGetStandbyPriority(void);
117 static List *SyncRepGetSyncStandbysPriority(bool *am_sync);
118 static List *SyncRepGetSyncStandbysQuorum(bool *am_sync);
119 static int cmp_lsn(const void *a, const void *b);
120 
121 #ifdef USE_ASSERT_CHECKING
122 static bool SyncRepQueueIsOrderedByLSN(int mode);
123 #endif
124 
125 /*
126  * ===========================================================
127  * Synchronous Replication functions for normal user backends
128  * ===========================================================
129  */
130 
131 /*
132  * Wait for synchronous replication, if requested by user.
133  *
134  * Initially backends start in state SYNC_REP_NOT_WAITING and then
135  * change that state to SYNC_REP_WAITING before adding ourselves
136  * to the wait queue. During SyncRepWakeQueue() a WALSender changes
137  * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
138  * This backend then resets its state to SYNC_REP_NOT_WAITING.
139  *
140  * 'lsn' represents the LSN to wait for. 'commit' indicates whether this LSN
141  * represents a commit record. If it doesn't, then we wait only for the WAL
142  * to be flushed if synchronous_commit is set to the higher level of
143  * remote_apply, because only commit records provide apply feedback.
144  */
145 void
146 SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
147 {
148  char *new_status = NULL;
149  const char *old_status;
150  int mode;
151 
152  /* Cap the level for anything other than commit to remote flush only. */
153  if (commit)
154  mode = SyncRepWaitMode;
155  else
157 
158  /*
159  * Fast exit if user has not requested sync replication, or there are no
160  * sync replication standby names defined. Note that those standbys don't
161  * need to be connected.
162  */
164  return;
165 
167  Assert(WalSndCtl != NULL);
168 
169  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
171 
172  /*
173  * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
174  * set. See SyncRepUpdateSyncStandbysDefined.
175  *
176  * Also check that the standby hasn't already replied. Unlikely race
177  * condition but we'll be fetching that cache line anyway so it's likely
178  * to be a low cost check.
179  */
181  lsn <= WalSndCtl->lsn[mode])
182  {
183  LWLockRelease(SyncRepLock);
184  return;
185  }
186 
187  /*
188  * Set our waitLSN so WALSender will know when to wake us, and add
189  * ourselves to the queue.
190  */
191  MyProc->waitLSN = lsn;
193  SyncRepQueueInsert(mode);
194  Assert(SyncRepQueueIsOrderedByLSN(mode));
195  LWLockRelease(SyncRepLock);
196 
197  /* Alter ps display to show waiting for sync rep. */
199  {
200  int len;
201 
202  old_status = get_ps_display(&len);
203  new_status = (char *) palloc(len + 32 + 1);
204  memcpy(new_status, old_status, len);
205  sprintf(new_status + len, " waiting for %X/%X",
206  (uint32) (lsn >> 32), (uint32) lsn);
207  set_ps_display(new_status, false);
208  new_status[len] = '\0'; /* truncate off " waiting ..." */
209  }
210 
211  /*
212  * Wait for specified LSN to be confirmed.
213  *
214  * Each proc has its own wait latch, so we perform a normal latch
215  * check/wait loop here.
216  */
217  for (;;)
218  {
219  /* Must reset the latch before testing state. */
221 
222  /*
223  * Acquiring the lock is not needed, the latch ensures proper
224  * barriers. If it looks like we're done, we must really be done,
225  * because once walsender changes the state to SYNC_REP_WAIT_COMPLETE,
226  * it will never update it again, so we can't be seeing a stale value
227  * in that case.
228  */
230  break;
231 
232  /*
233  * If a wait for synchronous replication is pending, we can neither
234  * acknowledge the commit nor raise ERROR or FATAL. The latter would
235  * lead the client to believe that the transaction aborted, which is
236  * not true: it's already committed locally. The former is no good
237  * either: the client has requested synchronous replication, and is
238  * entitled to assume that an acknowledged commit is also replicated,
239  * which might not be true. So in this case we issue a WARNING (which
240  * some clients may be able to interpret) and shut off further output.
241  * We do NOT reset ProcDiePending, so that the process will die after
242  * the commit is cleaned up.
243  */
244  if (ProcDiePending)
245  {
247  (errcode(ERRCODE_ADMIN_SHUTDOWN),
248  errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
249  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
252  break;
253  }
254 
255  /*
256  * It's unclear what to do if a query cancel interrupt arrives. We
257  * can't actually abort at this point, but ignoring the interrupt
258  * altogether is not helpful, so we just terminate the wait with a
259  * suitable warning.
260  */
261  if (QueryCancelPending)
262  {
263  QueryCancelPending = false;
265  (errmsg("canceling wait for synchronous replication due to user request"),
266  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
268  break;
269  }
270 
271  /*
272  * If the postmaster dies, we'll probably never get an
273  * acknowledgement, because all the wal sender processes will exit. So
274  * just bail out.
275  */
276  if (!PostmasterIsAlive())
277  {
278  ProcDiePending = true;
281  break;
282  }
283 
284  /*
285  * Wait on latch. Any condition that should wake us up will set the
286  * latch, so no need for timeout.
287  */
290  }
291 
292  /*
293  * WalSender has checked our LSN and has removed us from queue. Clean up
294  * state and leave. It's OK to reset these shared memory fields without
295  * holding SyncRepLock, because any walsenders will ignore us anyway when
296  * we're not on the queue. We need a read barrier to make sure we see the
297  * changes to the queue link (this might be unnecessary without
298  * assertions, but better safe than sorry).
299  */
300  pg_read_barrier();
303  MyProc->waitLSN = 0;
304 
305  if (new_status)
306  {
307  /* Reset ps display */
308  set_ps_display(new_status, false);
309  pfree(new_status);
310  }
311 }
312 
313 /*
314  * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant.
315  *
316  * Usually we will go at tail of queue, though it's possible that we arrive
317  * here out of order, so start at tail and work back to insertion point.
318  */
319 static void
321 {
322  PGPROC *proc;
323 
324  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
325  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
326  &(WalSndCtl->SyncRepQueue[mode]),
327  offsetof(PGPROC, syncRepLinks));
328 
329  while (proc)
330  {
331  /*
332  * Stop at the queue element that we should insert after, to ensure the
333  * queue is ordered by LSN.
334  */
335  if (proc->waitLSN < MyProc->waitLSN)
336  break;
337 
338  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
339  &(proc->syncRepLinks),
340  offsetof(PGPROC, syncRepLinks));
341  }
342 
343  if (proc)
345  else
347 }
348 
349 /*
350  * Acquire SyncRepLock and cancel any wait currently in progress.
351  */
352 static void
354 {
355  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
359  LWLockRelease(SyncRepLock);
360 }
361 
362 void
364 {
366  {
367  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
369  LWLockRelease(SyncRepLock);
370  }
371 }
372 
373 /*
374  * ===========================================================
375  * Synchronous Replication functions for wal sender processes
376  * ===========================================================
377  */
378 
379 /*
380  * Take any action required to initialise sync rep state from config
381  * data. Called at WALSender startup and after each SIGHUP.
382  */
383 void
385 {
386  int priority;
387 
388  /*
389  * Determine if we are a potential sync standby and remember the result
390  * for handling replies from standby.
391  */
392  priority = SyncRepGetStandbyPriority();
393  if (MyWalSnd->sync_standby_priority != priority)
394  {
395  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
396  MyWalSnd->sync_standby_priority = priority;
397  LWLockRelease(SyncRepLock);
398  ereport(DEBUG1,
399  (errmsg("standby \"%s\" now has synchronous standby priority %u",
400  application_name, priority)));
401  }
402 }
403 
404 /*
405  * Update the LSNs on each queue based upon our latest state. This
406  * implements a simple policy of first-valid-sync-standby-releases-waiter.
407  *
408  * Other policies are possible, which would change what we do here and
409  * perhaps also which information we store as well.
410  */
411 void
413 {
414  volatile WalSndCtlData *walsndctl = WalSndCtl;
415  XLogRecPtr writePtr;
416  XLogRecPtr flushPtr;
417  XLogRecPtr applyPtr;
418  bool got_recptr;
419  bool am_sync;
420  int numwrite = 0;
421  int numflush = 0;
422  int numapply = 0;
423 
424  /*
425  * If this WALSender is serving a standby that is not on the list of
426  * potential sync standbys then we have nothing to do. If we are still
427  * starting up, still running base backup or the current flush position is
428  * still invalid, then leave quickly also.
429  */
430  if (MyWalSnd->sync_standby_priority == 0 ||
433  {
434  announce_next_takeover = true;
435  return;
436  }
437 
438  /*
439  * We're a potential sync standby. Release waiters if there are enough
440  * sync standbys and we are considered as sync.
441  */
442  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
443 
444  /*
445  * Check whether we are a sync standby or not, and calculate the synced
446  * positions among all sync standbys.
447  */
448  got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync);
449 
450  /*
451  * If we are managing a sync standby, though we weren't prior to this,
452  * then announce we are now a sync standby.
453  */
454  if (announce_next_takeover && am_sync)
455  {
456  announce_next_takeover = false;
457 
458  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
459  ereport(LOG,
460  (errmsg("standby \"%s\" is now a synchronous standby with priority %u",
462  else
463  ereport(LOG,
464  (errmsg("standby \"%s\" is now a candidate for quorum synchronous standby",
465  application_name)));
466  }
467 
468  /*
469  * If the number of sync standbys is less than requested or we aren't
470  * managing a sync standby then just leave.
471  */
472  if (!got_recptr || !am_sync)
473  {
474  LWLockRelease(SyncRepLock);
475  announce_next_takeover = !am_sync;
476  return;
477  }
478 
479  /*
480  * Set the lsn first so that when we wake backends they will release up to
481  * this location.
482  */
483  if (walsndctl->lsn[SYNC_REP_WAIT_WRITE] < writePtr)
484  {
485  walsndctl->lsn[SYNC_REP_WAIT_WRITE] = writePtr;
486  numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE);
487  }
488  if (walsndctl->lsn[SYNC_REP_WAIT_FLUSH] < flushPtr)
489  {
490  walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = flushPtr;
491  numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH);
492  }
493  if (walsndctl->lsn[SYNC_REP_WAIT_APPLY] < applyPtr)
494  {
495  walsndctl->lsn[SYNC_REP_WAIT_APPLY] = applyPtr;
496  numapply = SyncRepWakeQueue(false, SYNC_REP_WAIT_APPLY);
497  }
498 
499  LWLockRelease(SyncRepLock);
500 
501  elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X",
502  numwrite, (uint32) (writePtr >> 32), (uint32) writePtr,
503  numflush, (uint32) (flushPtr >> 32), (uint32) flushPtr,
504  numapply, (uint32) (applyPtr >> 32), (uint32) applyPtr);
505 }
506 
507 /*
508  * Calculate the synced Write, Flush and Apply positions among sync standbys.
509  *
510  * Return false if the number of sync standbys is less than
511  * synchronous_standby_names specifies. Otherwise return true and
512  * store the positions into *writePtr, *flushPtr and *applyPtr.
513  *
514  * On return, *am_sync is set to true if this walsender is connecting to
515  * sync standby. Otherwise it's set to false.
516  */
517 static bool
519  XLogRecPtr *applyPtr, bool *am_sync)
520 {
521  List *sync_standbys;
522 
523  *writePtr = InvalidXLogRecPtr;
524  *flushPtr = InvalidXLogRecPtr;
525  *applyPtr = InvalidXLogRecPtr;
526  *am_sync = false;
527 
528  /* Get standbys that are considered as synchronous at this moment */
529  sync_standbys = SyncRepGetSyncStandbys(am_sync);
530 
531  /*
532  * Quick exit if we are not managing a sync standby or there are not
533  * enough synchronous standbys.
534  */
535  if (!(*am_sync) ||
536  SyncRepConfig == NULL ||
537  list_length(sync_standbys) < SyncRepConfig->num_sync)
538  {
539  list_free(sync_standbys);
540  return false;
541  }
542 
543  /*
544  * In a priority-based sync replication, the synced positions are the
545  * oldest ones among sync standbys. In a quorum-based, they are the Nth
546  * latest ones.
547  *
548  * SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest
549  * positions. But we use SyncRepGetOldestSyncRecPtr() for that calculation
550  * because it's a bit more efficient.
551  *
552  * XXX If the numbers of current and requested sync standbys are the same,
553  * we can use SyncRepGetOldestSyncRecPtr() to calculate the synced
554  * positions even in a quorum-based sync replication.
555  */
556  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
557  {
558  SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr,
559  sync_standbys);
560  }
561  else
562  {
563  SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr,
564  sync_standbys, SyncRepConfig->num_sync);
565  }
566 
567  list_free(sync_standbys);
568  return true;
569 }
570 
571 /*
572  * Calculate the oldest Write, Flush and Apply positions among sync standbys.
573  */
574 static void
576  XLogRecPtr *applyPtr, List *sync_standbys)
577 {
578  ListCell *cell;
579 
580  /*
581  * Scan through all sync standbys and calculate the oldest Write, Flush
582  * and Apply positions.
583  */
584  foreach(cell, sync_standbys)
585  {
586  WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
588  XLogRecPtr flush;
589  XLogRecPtr apply;
590 
591  SpinLockAcquire(&walsnd->mutex);
592  write = walsnd->write;
593  flush = walsnd->flush;
594  apply = walsnd->apply;
595  SpinLockRelease(&walsnd->mutex);
596 
597  if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write)
598  *writePtr = write;
599  if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush)
600  *flushPtr = flush;
601  if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply)
602  *applyPtr = apply;
603  }
604 }
605 
606 /*
607  * Calculate the Nth latest Write, Flush and Apply positions among sync
608  * standbys.
609  */
610 static void
612  XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
613 {
614  ListCell *cell;
615  XLogRecPtr *write_array;
616  XLogRecPtr *flush_array;
617  XLogRecPtr *apply_array;
618  int len;
619  int i = 0;
620 
621  len = list_length(sync_standbys);
622  write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
623  flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
624  apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
625 
626  foreach(cell, sync_standbys)
627  {
628  WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
629 
630  SpinLockAcquire(&walsnd->mutex);
631  write_array[i] = walsnd->write;
632  flush_array[i] = walsnd->flush;
633  apply_array[i] = walsnd->apply;
634  SpinLockRelease(&walsnd->mutex);
635 
636  i++;
637  }
638 
639  /* Sort each array in descending order */
640  qsort(write_array, len, sizeof(XLogRecPtr), cmp_lsn);
641  qsort(flush_array, len, sizeof(XLogRecPtr), cmp_lsn);
642  qsort(apply_array, len, sizeof(XLogRecPtr), cmp_lsn);
643 
644  /* Get Nth latest Write, Flush, Apply positions */
645  *writePtr = write_array[nth - 1];
646  *flushPtr = flush_array[nth - 1];
647  *applyPtr = apply_array[nth - 1];
648 
649  pfree(write_array);
650  pfree(flush_array);
651  pfree(apply_array);
652 }
653 
654 /*
655  * Compare lsn in order to sort array in descending order.
656  */
657 static int
658 cmp_lsn(const void *a, const void *b)
659 {
660  XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
661  XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
662 
663  if (lsn1 > lsn2)
664  return -1;
665  else if (lsn1 == lsn2)
666  return 0;
667  else
668  return 1;
669 }
670 
671 /*
672  * Return the list of sync standbys, or NIL if no sync standby is connected.
673  *
674  * The caller must hold SyncRepLock.
675  *
676  * On return, *am_sync is set to true if this walsender is connecting to
677  * sync standby. Otherwise it's set to false.
678  */
679 List *
681 {
682  /* Set default result */
683  if (am_sync != NULL)
684  *am_sync = false;
685 
686  /* Quick exit if sync replication is not requested */
687  if (SyncRepConfig == NULL)
688  return NIL;
689 
690  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ?
693 }
694 
695 /*
696  * Return the list of all the candidates for quorum sync standbys,
697  * or NIL if no such standby is connected.
698  *
699  * The caller must hold SyncRepLock. This function must be called only in
700  * a quorum-based sync replication.
701  *
702  * On return, *am_sync is set to true if this walsender is connected to
703  * a sync standby. Otherwise it's set to false.
704  */
705 static List *
707 {
708  List *result = NIL;
709  int i;
710  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
711  * rearrangement */
712 
713  Assert(SyncRepConfig->syncrep_method == SYNC_REP_QUORUM);
714 
715  for (i = 0; i < max_wal_senders; i++)
716  {
717  XLogRecPtr flush;
719  int pid;
720 
721  walsnd = &WalSndCtl->walsnds[i];
722 
723  SpinLockAcquire(&walsnd->mutex);
724  pid = walsnd->pid;
725  flush = walsnd->flush;
726  state = walsnd->state;
727  SpinLockRelease(&walsnd->mutex);
728 
729  /* Must be active */
730  if (pid == 0)
731  continue;
732 
733  /* Must be streaming */
734  if (state != WALSNDSTATE_STREAMING)
735  continue;
736 
737  /* Must be synchronous */
738  if (walsnd->sync_standby_priority == 0)
739  continue;
740 
741  /* Must have a valid flush position */
742  if (XLogRecPtrIsInvalid(flush))
743  continue;
744 
745  /*
746  * Consider this standby as a candidate for quorum sync standbys and
747  * append it to the result.
748  */
749  result = lappend_int(result, i);
750  if (am_sync != NULL && walsnd == MyWalSnd)
751  *am_sync = true;
752  }
753 
754  return result;
755 }
756 
757 /*
758  * Return the list of sync standbys chosen based on their priorities,
759  * or NIL if no sync standby is connected.
760  *
761  * If there are multiple standbys with the same priority,
762  * the first one found is selected preferentially.
763  *
764  * The caller must hold SyncRepLock. This function must be called only in
765  * a priority-based sync replication.
766  *
767  * On return, *am_sync is set to true if this walsender is connected to
768  * a sync standby. Otherwise it's set to false.
769  */
770 static List *
772 {
773  List *result = NIL;
774  List *pending = NIL;
775  int lowest_priority;
776  int next_highest_priority;
777  int this_priority;
778  int priority;
779  int i;
780  bool am_in_pending = false;
781  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
782  * rearrangement */
783 
784  Assert(SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY);
785 
786  lowest_priority = SyncRepConfig->nmembers;
787  next_highest_priority = lowest_priority + 1;
788 
789  /*
790  * Find the sync standbys which have the highest priority (i.e., 1). Also
791  * store all the other potential sync standbys into the pending list, in
792  * order to scan it later and find other sync standbys from it quickly.
793  */
794  for (i = 0; i < max_wal_senders; i++)
795  {
796  XLogRecPtr flush;
798  int pid;
799 
800  walsnd = &WalSndCtl->walsnds[i];
801 
802  SpinLockAcquire(&walsnd->mutex);
803  pid = walsnd->pid;
804  flush = walsnd->flush;
805  state = walsnd->state;
806  SpinLockRelease(&walsnd->mutex);
807 
808  /* Must be active */
809  if (pid == 0)
810  continue;
811 
812  /* Must be streaming */
813  if (state != WALSNDSTATE_STREAMING)
814  continue;
815 
816  /* Must be synchronous */
817  this_priority = walsnd->sync_standby_priority;
818  if (this_priority == 0)
819  continue;
820 
821  /* Must have a valid flush position */
822  if (XLogRecPtrIsInvalid(flush))
823  continue;
824 
825  /*
826  * If the priority is equal to 1, consider this standby as sync and
827  * append it to the result. Otherwise append this standby to the
828  * pending list to check if it's actually sync or not later.
829  */
830  if (this_priority == 1)
831  {
832  result = lappend_int(result, i);
833  if (am_sync != NULL && walsnd == MyWalSnd)
834  *am_sync = true;
835  if (list_length(result) == SyncRepConfig->num_sync)
836  {
837  list_free(pending);
838  return result; /* Exit if got enough sync standbys */
839  }
840  }
841  else
842  {
843  pending = lappend_int(pending, i);
844  if (am_sync != NULL && walsnd == MyWalSnd)
845  am_in_pending = true;
846 
847  /*
848  * Track the highest priority among the standbys in the pending
849  * list, in order to use it as the starting priority for later
850  * scan of the list. This is useful to find quickly the sync
851  * standbys from the pending list later because we can skip
852  * unnecessary scans for the unused priorities.
853  */
854  if (this_priority < next_highest_priority)
855  next_highest_priority = this_priority;
856  }
857  }
858 
859  /*
860  * Consider all pending standbys as sync if the number of them plus
861  * already-found sync ones does not exceed what the configuration requests.
862  */
863  if (list_length(result) + list_length(pending) <= SyncRepConfig->num_sync)
864  {
865  bool needfree = (result != NIL && pending != NIL);
866 
867  /*
868  * Set *am_sync to true if this walsender is in the pending list
869  * because all pending standbys are considered as sync.
870  */
871  if (am_sync != NULL && !(*am_sync))
872  *am_sync = am_in_pending;
873 
874  result = list_concat(result, pending);
875  if (needfree)
876  pfree(pending);
877  return result;
878  }
879 
880  /*
881  * Find the sync standbys from the pending list.
882  */
883  priority = next_highest_priority;
884  while (priority <= lowest_priority)
885  {
886  ListCell *cell;
887  ListCell *prev = NULL;
888  ListCell *next;
889 
890  next_highest_priority = lowest_priority + 1;
891 
892  for (cell = list_head(pending); cell != NULL; cell = next)
893  {
894  i = lfirst_int(cell);
895  walsnd = &WalSndCtl->walsnds[i];
896 
897  next = lnext(cell);
898 
899  this_priority = walsnd->sync_standby_priority;
900  if (this_priority == priority)
901  {
902  result = lappend_int(result, i);
903  if (am_sync != NULL && walsnd == MyWalSnd)
904  *am_sync = true;
905 
906  /*
907  * We should always exit here after the scan of pending list
908  * starts because we know that the list has enough elements to
909  * reach SyncRepConfig->num_sync.
910  */
911  if (list_length(result) == SyncRepConfig->num_sync)
912  {
913  list_free(pending);
914  return result; /* Exit if got enough sync standbys */
915  }
916 
917  /*
918  * Remove the entry for this sync standby from the list to
919  * prevent us from looking at the same entry again.
920  */
921  pending = list_delete_cell(pending, cell, prev);
922 
923  continue;
924  }
925 
926  if (this_priority < next_highest_priority)
927  next_highest_priority = this_priority;
928 
929  prev = cell;
930  }
931 
932  priority = next_highest_priority;
933  }
934 
935  /* never reached, but keep compiler quiet */
936  Assert(false);
937  return result;
938 }
939 
940 /*
941  * Check if we are in the list of sync standbys, and if so, determine
942  * priority sequence. Return priority if set, or zero to indicate that
943  * we are not a potential sync standby.
944  *
945  * Compare the parameter SyncRepStandbyNames against the application_name
946  * for this WALSender, or allow any name if we find a wildcard "*".
947  */
948 static int
950 {
951  const char *standby_name;
952  int priority;
953  bool found = false;
954 
955  /*
956  * Since synchronous cascade replication is not allowed, we always set the
957  * priority of cascading walsender to zero.
958  */
960  return 0;
961 
962  if (!SyncStandbysDefined() || SyncRepConfig == NULL)
963  return 0;
964 
965  standby_name = SyncRepConfig->member_names;
966  for (priority = 1; priority <= SyncRepConfig->nmembers; priority++)
967  {
968  if (pg_strcasecmp(standby_name, application_name) == 0 ||
969  strcmp(standby_name, "*") == 0)
970  {
971  found = true;
972  break;
973  }
974  standby_name += strlen(standby_name) + 1;
975  }
976 
977  if (!found)
978  return 0;
979 
980  /*
981  * In quorum-based sync replication, all the standbys in the list have the
982  * same priority, one.
983  */
984  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ? priority : 1;
985 }
986 
987 /*
988  * Walk the specified queue from head. Set the state of any backends that
989  * need to be woken, remove them from the queue, and then wake them.
990  * Pass all = true to wake whole queue; otherwise, just wake up to
991  * the walsender's LSN.
992  *
993  * Must hold SyncRepLock.
994  */
995 static int
996 SyncRepWakeQueue(bool all, int mode)
997 {
998  volatile WalSndCtlData *walsndctl = WalSndCtl;
999  PGPROC *proc = NULL;
1000  PGPROC *thisproc = NULL;
1001  int numprocs = 0;
1002 
1003  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
1004  Assert(SyncRepQueueIsOrderedByLSN(mode));
1005 
1006  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1007  &(WalSndCtl->SyncRepQueue[mode]),
1008  offsetof(PGPROC, syncRepLinks));
1009 
1010  while (proc)
1011  {
1012  /*
1013  * Assume the queue is ordered by LSN
1014  */
1015  if (!all && walsndctl->lsn[mode] < proc->waitLSN)
1016  return numprocs;
1017 
1018  /*
1019  * Move to next proc, so we can delete thisproc from the queue.
1020  * thisproc is valid, proc may be NULL after this.
1021  */
1022  thisproc = proc;
1023  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1024  &(proc->syncRepLinks),
1025  offsetof(PGPROC, syncRepLinks));
1026 
1027  /*
1028  * Remove thisproc from queue.
1029  */
1030  SHMQueueDelete(&(thisproc->syncRepLinks));
1031 
1032  /*
1033  * SyncRepWaitForLSN() reads syncRepState without holding the lock, so
1034  * make sure that it sees the queue link being removed before the
1035  * syncRepState change.
1036  */
1037  pg_write_barrier();
1038 
1039  /*
1040  * Set state to complete; see SyncRepWaitForLSN() for discussion of
1041  * the various states.
1042  */
1044 
1045  /*
1046  * Wake only when we have set state and removed from queue.
1047  */
1048  SetLatch(&(thisproc->procLatch));
1049 
1050  numprocs++;
1051  }
1052 
1053  return numprocs;
1054 }
1055 
1056 /*
1057  * The checkpointer calls this as needed to update the shared
1058  * sync_standbys_defined flag, so that backends don't remain permanently wedged
1059  * if synchronous_standby_names is unset. It's safe to check the current value
1060  * without the lock, because it's only ever updated by one process. But we
1061  * must take the lock to change it.
      *
      * NOTE(review): the line holding the function name and parameter list
      * (1064) is absent from this listing. The cross-reference index further
      * down this page gives it as SyncRepUpdateSyncStandbysDefined(void) --
      * confirm against the original source.
1062  */
1063 void
1065 {
1066  bool sync_standbys_defined = SyncStandbysDefined();
1067 
1068  if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
1069  {
1070  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
1071 
1072  /*
1073  * If synchronous_standby_names has been reset to empty, it's futile
1074  * for backends to continue waiting. Since the user no longer
1075  * wants synchronous replication, we'd better wake them up.
1076  */
1077  if (!sync_standbys_defined)
1078  {
1079  int i;
1080 
1081  for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
1082  SyncRepWakeQueue(true, i);
1083  }
1084 
1085  /*
1086  * Only allow people to join the queue when there are synchronous
1087  * standbys defined. Without this interlock, there's a race
1088  * condition: we might wake up all the current waiters; then, some
1089  * backend that hasn't yet reloaded its config might go to sleep on
1090  * the queue (and never wake up). This prevents that.
1091  */
1092  WalSndCtl->sync_standbys_defined = sync_standbys_defined;
1093 
1094  LWLockRelease(SyncRepLock);
1095  }
1096 }
1097 
1098 #ifdef USE_ASSERT_CHECKING
/*
 * Assertion helper, compiled only under USE_ASSERT_CHECKING: verify that the
 * wait queue for the given mode is in strictly increasing waitLSN order.
 *
 * Walks WalSndCtl->SyncRepQueue[mode] from the head and returns false as
 * soon as an entry's waitLSN is not greater than the previous entry's (the
 * first entry is compared against 0). Returns true if the walk finishes
 * without finding such an entry.
 */
1099 static bool
1100 SyncRepQueueIsOrderedByLSN(int mode)
1101 {
1102  PGPROC *proc = NULL;
1103  XLogRecPtr lastLSN;
1104 
1105  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
1106 
1107  lastLSN = 0;
1108 
1109  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1110  &(WalSndCtl->SyncRepQueue[mode]),
1111  offsetof(PGPROC, syncRepLinks));
1112 
1113  while (proc)
1114  {
1115  /*
1116  * Check the queue is ordered by LSN and that multiple procs don't
1117  * have matching LSNs
1118  */
1119  if (proc->waitLSN <= lastLSN)
1120  return false;
1121 
1122  lastLSN = proc->waitLSN;
1123 
1124  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1125  &(proc->syncRepLinks),
1126  offsetof(PGPROC, syncRepLinks));
1127  }
1128 
1129  return true;
1130 }
1131 #endif
1132 
1133 /*
1134  * ===========================================================
1135  * Synchronous Replication functions executed by any process
1136  * ===========================================================
1137  */
1138 
/*
 * Validate a proposed synchronous_standby_names value.
 *
 * A NULL or empty *newval is accepted as-is, with *extra set to NULL.
 * Otherwise the string is handed to the syncrep scanner/parser. The function
 * returns false if parsing fails or yields no result, if the parsed num_sync
 * is not greater than zero, or if pconf is NULL. On success pconf is stored
 * in *extra and true is returned.
 *
 * NOTE(review): several statements are absent from this listing (the
 * numbering skips 1148-1149, 1154, 1159-1160, 1169, 1175 and 1178),
 * including the right-hand side of the pconf assignment and the arguments
 * of the GUC_check_errmsg() call. Consult the original source before relying
 * on the body as shown.
 */
1139 bool
1140 check_synchronous_standby_names(char **newval, void **extra, GucSource source)
1141 {
1142  if (*newval != NULL && (*newval)[0] != '\0')
1143  {
1144  int parse_rc;
1145  SyncRepConfigData *pconf;
1146 
1147  /* Reset communication variables to ensure a fresh start */
1150 
1151  /* Parse the synchronous_standby_names string */
1152  syncrep_scanner_init(*newval);
1153  parse_rc = syncrep_yyparse();
1155 
1156  if (parse_rc != 0 || syncrep_parse_result == NULL)
1157  {
1158  GUC_check_errcode(ERRCODE_SYNTAX_ERROR);
1161  else
1162  GUC_check_errdetail("synchronous_standby_names parser failed");
1163  return false;
1164  }
1165 
1166  if (syncrep_parse_result->num_sync <= 0)
1167  {
1168  GUC_check_errmsg("number of synchronous standbys (%d) must be greater than zero",
1170  return false;
1171  }
1172 
1173  /* GUC extra value must be malloc'd, not palloc'd */
1174  pconf = (SyncRepConfigData *)
1176  if (pconf == NULL)
1177  return false;
1179 
1180  *extra = (void *) pconf;
1181 
1182  /*
1183  * We need not explicitly clean up syncrep_parse_result. It, and any
1184  * other cruft generated during parsing, will be freed when the
1185  * current memory context is deleted. (This code is generally run in
1186  * a short-lived context used for config file processing, so that will
1187  * not be very long.)
1188  */
1189  }
1190  else
1191  *extra = NULL;
1192 
1193  return true;
1194 }
1195 
/*
 * Install 'extra' as the current SyncRepConfig. 'newval' is not used here.
 *
 * NOTE(review): presumably 'extra' is the SyncRepConfigData pointer (or NULL)
 * that check_synchronous_standby_names() stored in *extra -- confirm against
 * the GUC machinery that calls both functions.
 */
1196 void
1197 assign_synchronous_standby_names(const char *newval, void *extra)
1198 {
1199  SyncRepConfig = (SyncRepConfigData *) extra;
1200 }
1201 
/*
 * Dispatch on newval: three cases plus a default are visible, and each one
 * ends in a break.
 *
 * NOTE(review): this listing is incomplete. The line with the function name
 * and parameters (1203), the three case labels, and the statement inside
 * each case are all absent (the numbering skips 1203, 1207-1208, 1210-1211,
 * 1213-1214 and 1217). The cross-reference index further down this page gives
 * the signature as assign_synchronous_commit(int newval, void *extra) --
 * confirm the signature and the per-case behavior against the original source.
 */
1202 void
1204 {
1205  switch (newval)
1206  {
1209  break;
1212  break;
1215  break;
1216  default:
1218  break;
1219  }
1220 }
void syncrep_scanner_finish(void)
#define NIL
Definition: pg_list.h:69
void SyncRepUpdateSyncStandbysDefined(void)
Definition: syncrep.c:1064
XLogRecPtr write
void assign_synchronous_standby_names(const char *newval, void *extra)
Definition: syncrep.c:1197
Pointer SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:164
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
static List * SyncRepGetSyncStandbysPriority(bool *am_sync)
Definition: syncrep.c:771
static void SyncRepCancelWait(void)
Definition: syncrep.c:353
#define DEBUG1
Definition: elog.h:25
void syncrep_scanner_init(const char *query_string)
static int32 next
Definition: blutils.c:210
static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, bool *am_sync)
Definition: syncrep.c:518
#define DEBUG3
Definition: elog.h:23
#define SYNC_REP_PRIORITY
Definition: syncrep.h:36
bool update_process_title
Definition: ps_status.c:35
#define write(a, b, c)
Definition: win32.h:14
#define GUC_check_errdetail
Definition: guc.h:407
static void SyncRepQueueInsert(int mode)
Definition: syncrep.c:320
PGPROC * MyProc
Definition: proc.c:67
char * syncrep_parse_error_msg
#define SYNC_REP_WAITING
Definition: syncrep.h:32
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:146
uint8 syncrep_method
Definition: syncrep.h:51
#define Min(x, y)
Definition: c.h:807
unsigned char uint8
Definition: c.h:266
void set_ps_display(const char *activity, bool force)
Definition: ps_status.c:326
#define SYNC_REP_NOT_WAITING
Definition: syncrep.h:31
WalSndCtlData * WalSndCtl
Definition: walsender.c:108
static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
Definition: syncrep.c:611
int errcode(int sqlerrcode)
Definition: elog.c:575
#define GUC_check_errmsg
Definition: guc.h:403
List * list_concat(List *list1, List *list2)
Definition: list.c:321
return result
Definition: formatting.c:1633
WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER]
List * SyncRepGetSyncStandbys(bool *am_sync)
Definition: syncrep.c:680
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void ResetLatch(volatile Latch *latch)
Definition: latch.c:497
#define LOG
Definition: elog.h:26
SyncRepConfigData * SyncRepConfig
Definition: syncrep.c:97
volatile bool QueryCancelPending
Definition: globals.c:30
void GUC_check_errcode(int sqlerrcode)
Definition: guc.c:9784
static int SyncRepWakeQueue(bool all, int mode)
Definition: syncrep.c:996
slock_t mutex
Latch procLatch
Definition: proc.h:103
GucSource
Definition: guc.h:105
#define malloc(a)
Definition: header.h:50
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1715
#define SpinLockAcquire(lock)
Definition: spin.h:62
XLogRecPtr flush
int WaitLatch(volatile Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:336
void pfree(void *pointer)
Definition: mcxt.c:950
static int cmp_lsn(const void *a, const void *b)
Definition: syncrep.c:658
#define SYNC_REP_NO_WAIT
Definition: syncrep.h:23
#define SYNC_REP_WAIT_APPLY
Definition: syncrep.h:26
#define lfirst_int(lc)
Definition: pg_list.h:107
void SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem)
Definition: shmqueue.c:108
static bool announce_next_takeover
Definition: syncrep.c:95
Definition: dest.h:88
const char * get_ps_display(int *displen)
Definition: ps_status.c:405
WalSndState state
void assign_synchronous_commit(int newval, void *extra)
Definition: syncrep.c:1203
#define SyncStandbysDefined()
Definition: syncrep.c:92
bool PostmasterIsAlive(void)
Definition: pmsignal.c:272
int errdetail(const char *fmt,...)
Definition: elog.c:873
static ListCell * list_head(const List *l)
Definition: pg_list.h:77
unsigned int uint32
Definition: c.h:268
SHM_QUEUE SyncRepQueue[NUM_SYNC_REP_WAIT_MODE]
void SyncRepInitConfig(void)
Definition: syncrep.c:384
#define lnext(lc)
Definition: pg_list.h:105
#define ereport(elevel, rest)
Definition: elog.h:122
#define SYNC_REP_WAIT_FLUSH
Definition: syncrep.h:25
List * lappend_int(List *list, int datum)
Definition: list.c:146
#define SyncRepRequested()
Definition: syncrep.h:19
int max_wal_senders
Definition: walsender.c:120
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define WARNING
Definition: elog.h:40
#define SpinLockRelease(lock)
Definition: spin.h:64
List * list_delete_cell(List *list, ListCell *cell, ListCell *prev)
Definition: list.c:528
char * SyncRepStandbyNames
Definition: syncrep.c:90
#define WL_POSTMASTER_DEATH
Definition: latch.h:128
Pointer SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:145
WalSnd * MyWalSnd
Definition: walsender.c:111
static List * SyncRepGetSyncStandbysQuorum(bool *am_sync)
Definition: syncrep.c:706
#define SYNC_REP_WAIT_COMPLETE
Definition: syncrep.h:33
#define SYNC_REP_QUORUM
Definition: syncrep.h:37
void SetLatch(volatile Latch *latch)
Definition: latch.c:414
#define NULL
Definition: c.h:229
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:676
char member_names[FLEXIBLE_ARRAY_MEMBER]
Definition: syncrep.h:54
int sync_standby_priority
Definition: regguts.h:298
#define pg_read_barrier()
Definition: atomics.h:161
bool SHMQueueIsDetached(const SHM_QUEUE *queue)
Definition: shmqueue.c:47
SyncRepConfigData * syncrep_parse_result
static int list_length(const List *l)
Definition: pg_list.h:89
#define newval
bool check_synchronous_standby_names(char **newval, void **extra, GucSource source)
Definition: syncrep.c:1140
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1111
SHM_QUEUE syncRepLinks
Definition: proc.h:149
void SyncRepCleanupAtProcExit(void)
Definition: syncrep.c:363
WalSndState
XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE]
volatile bool ProcDiePending
Definition: globals.c:31
char * application_name
Definition: guc.c:469
void * palloc(Size size)
Definition: mcxt.c:849
int errmsg(const char *fmt,...)
Definition: elog.c:797
void list_free(List *list)
Definition: list.c:1133
int i
#define pg_write_barrier()
Definition: atomics.h:162
static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, List *sync_standbys)
Definition: syncrep.c:575
void SHMQueueDelete(SHM_QUEUE *queue)
Definition: shmqueue.c:68
struct Latch * MyLatch
Definition: globals.c:52
int syncrep_yyparse(void)
#define elog
Definition: elog.h:219
#define NUM_SYNC_REP_WAIT_MODE
Definition: syncrep.h:28
#define qsort(a, b, c, d)
Definition: port.h:443
int syncRepState
Definition: proc.h:148
CommandDest whereToSendOutput
Definition: postgres.c:88
XLogRecPtr apply
Definition: proc.h:94
Definition: pg_list.h:45
static int SyncRepWaitMode
Definition: syncrep.c:98
#define WL_LATCH_SET
Definition: latch.h:124
void SyncRepReleaseWaiters(void)
Definition: syncrep.c:412
XLogRecPtr waitLSN
Definition: proc.h:147
#define offsetof(type, field)
Definition: c.h:555
bool am_cascading_walsender
Definition: walsender.c:115
#define SYNC_REP_WAIT_WRITE
Definition: syncrep.h:24
static int SyncRepGetStandbyPriority(void)
Definition: syncrep.c:949