PostgreSQL Source Code  git master
syncrep.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * syncrep.c
4  *
5  * Synchronous replication is new as of PostgreSQL 9.1.
6  *
7  * If requested, transaction commits wait until their commit LSNs are
8  * acknowledged by the synchronous standbys.
9  *
10  * This module contains the code for waiting and release of backends.
11  * All code in this module executes on the primary. The core streaming
12  * replication transport remains within WALreceiver/WALsender modules.
13  *
14  * The essence of this design is that it isolates all logic about
15  * waiting/releasing onto the primary. The primary defines which standbys
16  * it wishes to wait for. The standbys are completely unaware of the
17  * durability requirements of transactions on the primary, reducing the
18  * complexity of the code and streamlining both standby operations and
19  * network bandwidth because there is no requirement to ship
20  * per-transaction state information.
21  *
22  * Replication is either synchronous or not synchronous (async). If it is
23  * async, we just fastpath out of here. If it is sync, then we wait for
24  * the write, flush or apply location on the standby before releasing
25  * the waiting backend. Further complexity in that interaction is
26  * expected in later releases.
27  *
28  * The best performing way to manage the waiting backends is to have a
29  * single ordered queue of waiting backends, so that we can avoid
30  * searching through all waiters each time we receive a reply.
31  *
32  * In 9.5 or before only a single standby could be considered as
33  * synchronous. In 9.6 we support a priority-based multiple synchronous
34  * standbys. In 10.0 a quorum-based multiple synchronous standbys is also
35  * supported. The number of synchronous standbys that transactions
36  * must wait for replies from is specified in synchronous_standby_names.
37  * This parameter also specifies a list of standby names and the method
38  * (FIRST and ANY) to choose synchronous standbys from the listed ones.
39  *
40  * The method FIRST specifies a priority-based synchronous replication
41  * and makes transaction commits wait until their WAL records are
42  * replicated to the requested number of synchronous standbys chosen based
43  * on their priorities. The standbys whose names appear earlier in the list
44  * are given higher priority and will be considered as synchronous.
45  * Other standby servers appearing later in this list represent potential
46  * synchronous standbys. If any of the current synchronous standbys
47  * disconnects for whatever reason, it will be replaced immediately with
48  * the next-highest-priority standby.
49  *
50  * The method ANY specifies a quorum-based synchronous replication
51  * and makes transaction commits wait until their WAL records are
52  * replicated to at least the requested number of synchronous standbys
53  * in the list. All the standbys appearing in the list are considered as
54  * candidates for quorum synchronous standbys.
55  *
56  * If neither FIRST nor ANY is specified, FIRST is used as the method.
57  * This is for backward compatibility with 9.6 or before where only a
58  * priority-based sync replication was supported.
59  *
60  * Before the standbys chosen from synchronous_standby_names can
61  * become the synchronous standbys they must have caught up with
62  * the primary; that may take some time. Once caught up,
63  * the standbys which are considered as synchronous at that moment
64  * will release waiters from the queue.
65  *
66  * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
67  *
68  * IDENTIFICATION
69  * src/backend/replication/syncrep.c
70  *
71  *-------------------------------------------------------------------------
72  */
73 #include "postgres.h"
74 
75 #include <unistd.h>
76 
77 #include "access/xact.h"
78 #include "common/int.h"
79 #include "miscadmin.h"
80 #include "pgstat.h"
81 #include "replication/syncrep.h"
82 #include "replication/walsender.h"
84 #include "storage/proc.h"
85 #include "tcop/tcopprot.h"
86 #include "utils/guc_hooks.h"
87 #include "utils/ps_status.h"
88 
89 /* User-settable parameters for sync rep */
91 
/* Evaluates to true when SyncRepStandbyNames points at a non-empty string. */
#define SyncStandbysDefined() \
	(SyncRepStandbyNames != NULL && *SyncRepStandbyNames != '\0')
94 
95 static bool announce_next_takeover = true;
96 
99 
100 static void SyncRepQueueInsert(int mode);
101 static void SyncRepCancelWait(void);
102 static int SyncRepWakeQueue(bool all, int mode);
103 
104 static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr,
105  XLogRecPtr *flushPtr,
106  XLogRecPtr *applyPtr,
107  bool *am_sync);
108 static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
109  XLogRecPtr *flushPtr,
110  XLogRecPtr *applyPtr,
111  SyncRepStandbyData *sync_standbys,
112  int num_standbys);
113 static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr,
114  XLogRecPtr *flushPtr,
115  XLogRecPtr *applyPtr,
116  SyncRepStandbyData *sync_standbys,
117  int num_standbys,
118  uint8 nth);
119 static int SyncRepGetStandbyPriority(void);
120 static int standby_priority_comparator(const void *a, const void *b);
121 static int cmp_lsn(const void *a, const void *b);
122 
123 #ifdef USE_ASSERT_CHECKING
124 static bool SyncRepQueueIsOrderedByLSN(int mode);
125 #endif
126 
127 /*
128  * ===========================================================
129  * Synchronous Replication functions for normal user backends
130  * ===========================================================
131  */
132 
133 /*
134  * Wait for synchronous replication, if requested by user.
135  *
136  * Initially backends start in state SYNC_REP_NOT_WAITING and then
137  * change that state to SYNC_REP_WAITING before adding ourselves
138  * to the wait queue. During SyncRepWakeQueue() a WALSender changes
139  * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
140  * This backend then resets its state to SYNC_REP_NOT_WAITING.
141  *
142  * 'lsn' represents the LSN to wait for. 'commit' indicates whether this LSN
143  * represents a commit record. If it doesn't, then we wait only for the WAL
144  * to be flushed if synchronous_commit is set to the higher level of
145  * remote_apply, because only commit records provide apply feedback.
146  */
147 void
148 SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
149 {
150  int mode;
151 
152  /*
153  * This should be called while holding interrupts during a transaction
154  * commit to prevent the follow-up shared memory queue cleanups to be
155  * influenced by external interruptions.
156  */
158 
159  /*
160  * Fast exit if user has not requested sync replication, or there are no
161  * sync replication standby names defined.
162  *
163  * Since this routine gets called every commit time, it's important to
164  * exit quickly if sync replication is not requested. So we check
165  * WalSndCtl->sync_standbys_defined flag without the lock and exit
166  * immediately if it's false. If it's true, we need to check it again
167  * later while holding the lock, to check the flag and operate the sync
168  * rep queue atomically. This is necessary to avoid the race condition
169  * described in SyncRepUpdateSyncStandbysDefined(). On the other hand, if
170  * it's false, the lock is not necessary because we don't touch the queue.
171  */
172  if (!SyncRepRequested() ||
173  !((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined)
174  return;
175 
176  /* Cap the level for anything other than commit to remote flush only. */
177  if (commit)
179  else
181 
183  Assert(WalSndCtl != NULL);
184 
185  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
187 
188  /*
189  * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
190  * set. See SyncRepUpdateSyncStandbysDefined.
191  *
192  * Also check that the standby hasn't already replied. Unlikely race
193  * condition but we'll be fetching that cache line anyway so it's likely
194  * to be a low cost check.
195  */
197  lsn <= WalSndCtl->lsn[mode])
198  {
199  LWLockRelease(SyncRepLock);
200  return;
201  }
202 
203  /*
204  * Set our waitLSN so WALSender will know when to wake us, and add
205  * ourselves to the queue.
206  */
207  MyProc->waitLSN = lsn;
210  Assert(SyncRepQueueIsOrderedByLSN(mode));
211  LWLockRelease(SyncRepLock);
212 
213  /* Alter ps display to show waiting for sync rep. */
215  {
216  char buffer[32];
217 
218  sprintf(buffer, "waiting for %X/%X", LSN_FORMAT_ARGS(lsn));
219  set_ps_display_suffix(buffer);
220  }
221 
222  /*
223  * Wait for specified LSN to be confirmed.
224  *
225  * Each proc has its own wait latch, so we perform a normal latch
226  * check/wait loop here.
227  */
228  for (;;)
229  {
230  int rc;
231 
232  /* Must reset the latch before testing state. */
234 
235  /*
236  * Acquiring the lock is not needed, the latch ensures proper
237  * barriers. If it looks like we're done, we must really be done,
238  * because once walsender changes the state to SYNC_REP_WAIT_COMPLETE,
239  * it will never update it again, so we can't be seeing a stale value
240  * in that case.
241  */
243  break;
244 
245  /*
246  * If a wait for synchronous replication is pending, we can neither
247  * acknowledge the commit nor raise ERROR or FATAL. The latter would
248  * lead the client to believe that the transaction aborted, which is
249  * not true: it's already committed locally. The former is no good
250  * either: the client has requested synchronous replication, and is
251  * entitled to assume that an acknowledged commit is also replicated,
252  * which might not be true. So in this case we issue a WARNING (which
253  * some clients may be able to interpret) and shut off further output.
254  * We do NOT reset ProcDiePending, so that the process will die after
255  * the commit is cleaned up.
256  */
257  if (ProcDiePending)
258  {
260  (errcode(ERRCODE_ADMIN_SHUTDOWN),
261  errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
262  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
265  break;
266  }
267 
268  /*
269  * It's unclear what to do if a query cancel interrupt arrives. We
270  * can't actually abort at this point, but ignoring the interrupt
271  * altogether is not helpful, so we just terminate the wait with a
272  * suitable warning.
273  */
274  if (QueryCancelPending)
275  {
276  QueryCancelPending = false;
278  (errmsg("canceling wait for synchronous replication due to user request"),
279  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
281  break;
282  }
283 
284  /*
285  * Wait on latch. Any condition that should wake us up will set the
286  * latch, so no need for timeout.
287  */
289  WAIT_EVENT_SYNC_REP);
290 
291  /*
292  * If the postmaster dies, we'll probably never get an acknowledgment,
293  * because all the wal sender processes will exit. So just bail out.
294  */
295  if (rc & WL_POSTMASTER_DEATH)
296  {
297  ProcDiePending = true;
300  break;
301  }
302  }
303 
304  /*
305  * WalSender has checked our LSN and has removed us from queue. Clean up
306  * state and leave. It's OK to reset these shared memory fields without
307  * holding SyncRepLock, because any walsenders will ignore us anyway when
308  * we're not on the queue. We need a read barrier to make sure we see the
309  * changes to the queue link (this might be unnecessary without
310  * assertions, but better safe than sorry).
311  */
312  pg_read_barrier();
315  MyProc->waitLSN = 0;
316 
317  /* reset ps display to remove the suffix */
320 }
321 
322 /*
323  * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant.
324  *
325  * Usually we will go at tail of queue, though it's possible that we arrive
326  * here out of order, so start at tail and work back to insertion point.
327  */
328 static void
330 {
331  dlist_head *queue;
332  dlist_iter iter;
333 
335  queue = &WalSndCtl->SyncRepQueue[mode];
336 
337  dlist_reverse_foreach(iter, queue)
338  {
339  PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur);
340 
341  /*
342  * Stop at the queue element that we should insert after to ensure the
343  * queue is ordered by LSN.
344  */
345  if (proc->waitLSN < MyProc->waitLSN)
346  {
348  return;
349  }
350  }
351 
352  /*
353  * If we get here, the list was either empty, or this process needs to be
354  * at the head.
355  */
357 }
358 
359 /*
360  * Acquire SyncRepLock and cancel any wait currently in progress.
361  */
362 static void
364 {
365  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
369  LWLockRelease(SyncRepLock);
370 }
371 
372 void
374 {
375  /*
376  * First check if we are removed from the queue without the lock to not
377  * slow down backend exit.
378  */
380  {
381  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
382 
383  /* maybe we have just been removed, so recheck */
386 
387  LWLockRelease(SyncRepLock);
388  }
389 }
390 
391 /*
392  * ===========================================================
393  * Synchronous Replication functions for wal sender processes
394  * ===========================================================
395  */
396 
397 /*
398  * Take any action required to initialise sync rep state from config
399  * data. Called at WALSender startup and after each SIGHUP.
400  */
401 void
403 {
404  int priority;
405 
406  /*
407  * Determine if we are a potential sync standby and remember the result
408  * for handling replies from standby.
409  */
410  priority = SyncRepGetStandbyPriority();
411  if (MyWalSnd->sync_standby_priority != priority)
412  {
414  MyWalSnd->sync_standby_priority = priority;
416 
417  ereport(DEBUG1,
418  (errmsg_internal("standby \"%s\" now has synchronous standby priority %d",
419  application_name, priority)));
420  }
421 }
422 
423 /*
424  * Update the LSNs on each queue based upon our latest state. This
425  * implements a simple policy of first-valid-sync-standby-releases-waiter.
426  *
427  * Other policies are possible, which would change what we do here and
428  * perhaps also which information we store as well.
429  */
430 void
432 {
433  volatile WalSndCtlData *walsndctl = WalSndCtl;
434  XLogRecPtr writePtr;
435  XLogRecPtr flushPtr;
436  XLogRecPtr applyPtr;
437  bool got_recptr;
438  bool am_sync;
439  int numwrite = 0;
440  int numflush = 0;
441  int numapply = 0;
442 
443  /*
444  * If this WALSender is serving a standby that is not on the list of
445  * potential sync standbys then we have nothing to do. If we are still
446  * starting up, still running base backup or the current flush position is
447  * still invalid, then leave quickly also. Streaming or stopping WAL
448  * senders are allowed to release waiters.
449  */
450  if (MyWalSnd->sync_standby_priority == 0 ||
454  {
455  announce_next_takeover = true;
456  return;
457  }
458 
459  /*
460  * We're a potential sync standby. Release waiters if there are enough
461  * sync standbys and we are considered as sync.
462  */
463  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
464 
465  /*
466  * Check whether we are a sync standby or not, and calculate the synced
467  * positions among all sync standbys. (Note: although this step does not
468  * of itself require holding SyncRepLock, it seems like a good idea to do
469  * it after acquiring the lock. This ensures that the WAL pointers we use
470  * to release waiters are newer than any previous execution of this
471  * routine used.)
472  */
473  got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync);
474 
475  /*
476  * If we are managing a sync standby, though we weren't prior to this,
477  * then announce we are now a sync standby.
478  */
479  if (announce_next_takeover && am_sync)
480  {
481  announce_next_takeover = false;
482 
484  ereport(LOG,
485  (errmsg("standby \"%s\" is now a synchronous standby with priority %d",
487  else
488  ereport(LOG,
489  (errmsg("standby \"%s\" is now a candidate for quorum synchronous standby",
490  application_name)));
491  }
492 
493  /*
494  * If the number of sync standbys is less than requested or we aren't
495  * managing a sync standby then just leave.
496  */
497  if (!got_recptr || !am_sync)
498  {
499  LWLockRelease(SyncRepLock);
500  announce_next_takeover = !am_sync;
501  return;
502  }
503 
504  /*
505  * Set the lsn first so that when we wake backends they will release up to
506  * this location.
507  */
508  if (walsndctl->lsn[SYNC_REP_WAIT_WRITE] < writePtr)
509  {
510  walsndctl->lsn[SYNC_REP_WAIT_WRITE] = writePtr;
511  numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE);
512  }
513  if (walsndctl->lsn[SYNC_REP_WAIT_FLUSH] < flushPtr)
514  {
515  walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = flushPtr;
516  numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH);
517  }
518  if (walsndctl->lsn[SYNC_REP_WAIT_APPLY] < applyPtr)
519  {
520  walsndctl->lsn[SYNC_REP_WAIT_APPLY] = applyPtr;
521  numapply = SyncRepWakeQueue(false, SYNC_REP_WAIT_APPLY);
522  }
523 
524  LWLockRelease(SyncRepLock);
525 
526  elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X",
527  numwrite, LSN_FORMAT_ARGS(writePtr),
528  numflush, LSN_FORMAT_ARGS(flushPtr),
529  numapply, LSN_FORMAT_ARGS(applyPtr));
530 }
531 
532 /*
533  * Calculate the synced Write, Flush and Apply positions among sync standbys.
534  *
535  * Return false if the number of sync standbys is less than
536  * synchronous_standby_names specifies. Otherwise return true and
537  * store the positions into *writePtr, *flushPtr and *applyPtr.
538  *
539  * On return, *am_sync is set to true if this walsender is connected to
540  * a sync standby. Otherwise it's set to false.
541  */
542 static bool
544  XLogRecPtr *applyPtr, bool *am_sync)
545 {
546  SyncRepStandbyData *sync_standbys;
547  int num_standbys;
548  int i;
549 
550  /* Initialize default results */
551  *writePtr = InvalidXLogRecPtr;
552  *flushPtr = InvalidXLogRecPtr;
553  *applyPtr = InvalidXLogRecPtr;
554  *am_sync = false;
555 
556  /* Quick out if not even configured to be synchronous */
557  if (SyncRepConfig == NULL)
558  return false;
559 
560  /* Get standbys that are considered as synchronous at this moment */
561  num_standbys = SyncRepGetCandidateStandbys(&sync_standbys);
562 
563  /* Am I among the candidate sync standbys? */
564  for (i = 0; i < num_standbys; i++)
565  {
566  if (sync_standbys[i].is_me)
567  {
568  *am_sync = true;
569  break;
570  }
571  }
572 
573  /*
574  * Nothing more to do if we are not managing a sync standby or there are
575  * not enough synchronous standbys.
576  */
577  if (!(*am_sync) ||
578  num_standbys < SyncRepConfig->num_sync)
579  {
580  pfree(sync_standbys);
581  return false;
582  }
583 
584  /*
585  * In a priority-based sync replication, the synced positions are the
586  * oldest ones among sync standbys. In a quorum-based, they are the Nth
587  * latest ones.
588  *
589  * SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest
590  * positions. But we use SyncRepGetOldestSyncRecPtr() for that calculation
591  * because it's a bit more efficient.
592  *
593  * XXX If the numbers of current and requested sync standbys are the same,
594  * we can use SyncRepGetOldestSyncRecPtr() to calculate the synced
595  * positions even in a quorum-based sync replication.
596  */
598  {
599  SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr,
600  sync_standbys, num_standbys);
601  }
602  else
603  {
604  SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr,
605  sync_standbys, num_standbys,
607  }
608 
609  pfree(sync_standbys);
610  return true;
611 }
612 
613 /*
614  * Calculate the oldest Write, Flush and Apply positions among sync standbys.
615  */
616 static void
618  XLogRecPtr *flushPtr,
619  XLogRecPtr *applyPtr,
620  SyncRepStandbyData *sync_standbys,
621  int num_standbys)
622 {
623  int i;
624 
625  /*
626  * Scan through all sync standbys and calculate the oldest Write, Flush
627  * and Apply positions. We assume *writePtr et al were initialized to
628  * InvalidXLogRecPtr.
629  */
630  for (i = 0; i < num_standbys; i++)
631  {
632  XLogRecPtr write = sync_standbys[i].write;
633  XLogRecPtr flush = sync_standbys[i].flush;
634  XLogRecPtr apply = sync_standbys[i].apply;
635 
636  if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write)
637  *writePtr = write;
638  if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush)
639  *flushPtr = flush;
640  if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply)
641  *applyPtr = apply;
642  }
643 }
644 
645 /*
646  * Calculate the Nth latest Write, Flush and Apply positions among sync
647  * standbys.
648  */
649 static void
651  XLogRecPtr *flushPtr,
652  XLogRecPtr *applyPtr,
653  SyncRepStandbyData *sync_standbys,
654  int num_standbys,
655  uint8 nth)
656 {
657  XLogRecPtr *write_array;
658  XLogRecPtr *flush_array;
659  XLogRecPtr *apply_array;
660  int i;
661 
662  /* Should have enough candidates, or somebody messed up */
663  Assert(nth > 0 && nth <= num_standbys);
664 
665  write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
666  flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
667  apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
668 
669  for (i = 0; i < num_standbys; i++)
670  {
671  write_array[i] = sync_standbys[i].write;
672  flush_array[i] = sync_standbys[i].flush;
673  apply_array[i] = sync_standbys[i].apply;
674  }
675 
676  /* Sort each array in descending order */
677  qsort(write_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
678  qsort(flush_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
679  qsort(apply_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
680 
681  /* Get Nth latest Write, Flush, Apply positions */
682  *writePtr = write_array[nth - 1];
683  *flushPtr = flush_array[nth - 1];
684  *applyPtr = apply_array[nth - 1];
685 
686  pfree(write_array);
687  pfree(flush_array);
688  pfree(apply_array);
689 }
690 
691 /*
692  * Compare lsn in order to sort array in descending order.
693  */
694 static int
695 cmp_lsn(const void *a, const void *b)
696 {
697  XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
698  XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
699 
700  return pg_cmp_u64(lsn2, lsn1);
701 }
702 
703 /*
704  * Return data about walsenders that are candidates to be sync standbys.
705  *
706  * *standbys is set to a palloc'd array of structs of per-walsender data,
707  * and the number of valid entries (candidate sync senders) is returned.
708  * (This might be more or fewer than num_sync; caller must check.)
709  */
710 int
712 {
713  int i;
714  int n;
715 
716  /* Create result array */
717  *standbys = (SyncRepStandbyData *)
719 
720  /* Quick exit if sync replication is not requested */
721  if (SyncRepConfig == NULL)
722  return 0;
723 
724  /* Collect raw data from shared memory */
725  n = 0;
726  for (i = 0; i < max_wal_senders; i++)
727  {
728  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
729  * rearrangement */
730  SyncRepStandbyData *stby;
731  WalSndState state; /* not included in SyncRepStandbyData */
732 
733  walsnd = &WalSndCtl->walsnds[i];
734  stby = *standbys + n;
735 
736  SpinLockAcquire(&walsnd->mutex);
737  stby->pid = walsnd->pid;
738  state = walsnd->state;
739  stby->write = walsnd->write;
740  stby->flush = walsnd->flush;
741  stby->apply = walsnd->apply;
743  SpinLockRelease(&walsnd->mutex);
744 
745  /* Must be active */
746  if (stby->pid == 0)
747  continue;
748 
749  /* Must be streaming or stopping */
750  if (state != WALSNDSTATE_STREAMING &&
752  continue;
753 
754  /* Must be synchronous */
755  if (stby->sync_standby_priority == 0)
756  continue;
757 
758  /* Must have a valid flush position */
759  if (XLogRecPtrIsInvalid(stby->flush))
760  continue;
761 
762  /* OK, it's a candidate */
763  stby->walsnd_index = i;
764  stby->is_me = (walsnd == MyWalSnd);
765  n++;
766  }
767 
768  /*
769  * In quorum mode, we return all the candidates. In priority mode, if we
770  * have too many candidates then return only the num_sync ones of highest
771  * priority.
772  */
774  n > SyncRepConfig->num_sync)
775  {
776  /* Sort by priority ... */
777  qsort(*standbys, n, sizeof(SyncRepStandbyData),
779  /* ... then report just the first num_sync ones */
780  n = SyncRepConfig->num_sync;
781  }
782 
783  return n;
784 }
785 
786 /*
787  * qsort comparator to sort SyncRepStandbyData entries by priority
788  */
789 static int
790 standby_priority_comparator(const void *a, const void *b)
791 {
792  const SyncRepStandbyData *sa = (const SyncRepStandbyData *) a;
793  const SyncRepStandbyData *sb = (const SyncRepStandbyData *) b;
794 
795  /* First, sort by increasing priority value */
796  if (sa->sync_standby_priority != sb->sync_standby_priority)
797  return sa->sync_standby_priority - sb->sync_standby_priority;
798 
799  /*
800  * We might have equal priority values; arbitrarily break ties by position
801  * in the WalSnd array. (This is utterly bogus, since that is arrival
802  * order dependent, but there are regression tests that rely on it.)
803  */
804  return sa->walsnd_index - sb->walsnd_index;
805 }
806 
807 
808 /*
809  * Check if we are in the list of sync standbys, and if so, determine
810  * priority sequence. Return priority if set, or zero to indicate that
811  * we are not a potential sync standby.
812  *
813  * Compare the parameter SyncRepStandbyNames against the application_name
814  * for this WALSender, or allow any name if we find a wildcard "*".
815  */
816 static int
818 {
819  const char *standby_name;
820  int priority;
821  bool found = false;
822 
823  /*
824  * Since synchronous cascade replication is not allowed, we always set the
825  * priority of cascading walsender to zero.
826  */
828  return 0;
829 
830  if (!SyncStandbysDefined() || SyncRepConfig == NULL)
831  return 0;
832 
833  standby_name = SyncRepConfig->member_names;
834  for (priority = 1; priority <= SyncRepConfig->nmembers; priority++)
835  {
836  if (pg_strcasecmp(standby_name, application_name) == 0 ||
837  strcmp(standby_name, "*") == 0)
838  {
839  found = true;
840  break;
841  }
842  standby_name += strlen(standby_name) + 1;
843  }
844 
845  if (!found)
846  return 0;
847 
848  /*
849  * In quorum-based sync replication, all the standbys in the list have the
850  * same priority, one.
851  */
852  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ? priority : 1;
853 }
854 
855 /*
856  * Walk the specified queue from head. Set the state of any backends that
857  * need to be woken, remove them from the queue, and then wake them.
858  * Pass all = true to wake whole queue; otherwise, just wake up to
859  * the walsender's LSN.
860  *
861  * The caller must hold SyncRepLock in exclusive mode.
862  */
863 static int
864 SyncRepWakeQueue(bool all, int mode)
865 {
866  volatile WalSndCtlData *walsndctl = WalSndCtl;
867  int numprocs = 0;
868  dlist_mutable_iter iter;
869 
872  Assert(SyncRepQueueIsOrderedByLSN(mode));
873 
875  {
876  PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur);
877 
878  /*
879  * Assume the queue is ordered by LSN
880  */
881  if (!all && walsndctl->lsn[mode] < proc->waitLSN)
882  return numprocs;
883 
884  /*
885  * Remove from queue.
886  */
888 
889  /*
890  * SyncRepWaitForLSN() reads syncRepState without holding the lock, so
891  * make sure that it sees the queue link being removed before the
892  * syncRepState change.
893  */
895 
896  /*
897  * Set state to complete; see SyncRepWaitForLSN() for discussion of
898  * the various states.
899  */
901 
902  /*
903  * Wake only when we have set state and removed from queue.
904  */
905  SetLatch(&(proc->procLatch));
906 
907  numprocs++;
908  }
909 
910  return numprocs;
911 }
912 
913 /*
914  * The checkpointer calls this as needed to update the shared
915  * sync_standbys_defined flag, so that backends don't remain permanently wedged
916  * if synchronous_standby_names is unset. It's safe to check the current value
917  * without the lock, because it's only ever updated by one process. But we
918  * must take the lock to change it.
919  */
920 void
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 921) is absent from this copy.  Comments elsewhere in the
 * file refer to this routine as SyncRepUpdateSyncStandbysDefined() -- confirm
 * against the real source before relying on this text.
 */
922 {
/* Local view of the setting, taken once and compared with the shared flag. */
923  bool sync_standbys_defined = SyncStandbysDefined();
924 
/* Unlocked comparison; the lock is taken only when the flag must change. */
925  if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
926  {
927  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
928 
929  /*
930  * If synchronous_standby_names has been reset to empty, it's futile
931  * for backends to continue waiting. Since the user no longer wants
932  * synchronous replication, we'd better wake them up.
933  */
934  if (!sync_standbys_defined)
935  {
936  int i;
937 
/* Wake every waiter (all = true) on each of the wait-mode queues. */
938  for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
939  SyncRepWakeQueue(true, i);
940  }
941 
942  /*
943  * Only allow people to join the queue when there are synchronous
944  * standbys defined. Without this interlock, there's a race
945  * condition: we might wake up all the current waiters; then, some
946  * backend that hasn't yet reloaded its config might go to sleep on
947  * the queue (and never wake up). This prevents that.
948  */
/* Published after the wake-up above, while SyncRepLock is still held. */
949  WalSndCtl->sync_standbys_defined = sync_standbys_defined;
950 
951  LWLockRelease(SyncRepLock);
952  }
953 }
954 
955 #ifdef USE_ASSERT_CHECKING
956 static bool
957 SyncRepQueueIsOrderedByLSN(int mode)
958 {
959  XLogRecPtr lastLSN;
960  dlist_iter iter;
961 
963 
964  lastLSN = 0;
965 
967  {
968  PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur);
969 
970  /*
971  * Check the queue is ordered by LSN and that multiple procs don't
972  * have matching LSNs
973  */
974  if (proc->waitLSN <= lastLSN)
975  return false;
976 
977  lastLSN = proc->waitLSN;
978  }
979 
980  return true;
981 }
982 #endif
983 
984 /*
985  * ===========================================================
986  * Synchronous Replication functions executed by any process
987  * ===========================================================
988  */
989 
990 bool
992 {
993  if (*newval != NULL && (*newval)[0] != '\0')
994  {
995  int parse_rc;
996  SyncRepConfigData *pconf;
997 
998  /* Reset communication variables to ensure a fresh start */
999  syncrep_parse_result = NULL;
1000  syncrep_parse_error_msg = NULL;
1001 
1002  /* Parse the synchronous_standby_names string */
1004  parse_rc = syncrep_yyparse();
1006 
1007  if (parse_rc != 0 || syncrep_parse_result == NULL)
1008  {
1009  GUC_check_errcode(ERRCODE_SYNTAX_ERROR);
1012  else
1013  GUC_check_errdetail("synchronous_standby_names parser failed");
1014  return false;
1015  }
1016 
1017  if (syncrep_parse_result->num_sync <= 0)
1018  {
1019  GUC_check_errmsg("number of synchronous standbys (%d) must be greater than zero",
1021  return false;
1022  }
1023 
1024  /* GUC extra value must be guc_malloc'd, not palloc'd */
1025  pconf = (SyncRepConfigData *)
1027  if (pconf == NULL)
1028  return false;
1030 
1031  *extra = (void *) pconf;
1032 
1033  /*
1034  * We need not explicitly clean up syncrep_parse_result. It, and any
1035  * other cruft generated during parsing, will be freed when the
1036  * current memory context is deleted. (This code is generally run in
1037  * a short-lived context used for config file processing, so that will
1038  * not be very long.)
1039  */
1040  }
1041  else
1042  *extra = NULL;
1043 
1044  return true;
1045 }
1046 
1047 void
1048 assign_synchronous_standby_names(const char *newval, void *extra)
1049 {
1050  SyncRepConfig = (SyncRepConfigData *) extra;
1051 }
1052 
1053 void
1055 {
1056  switch (newval)
1057  {
1060  break;
1063  break;
1066  break;
1067  default:
1069  break;
1070  }
1071 }
#define pg_read_barrier()
Definition: atomics.h:151
#define pg_write_barrier()
Definition: atomics.h:152
#define Min(x, y)
Definition: c.h:1004
#define Assert(condition)
Definition: c.h:858
unsigned char uint8
Definition: c.h:504
@ DestNone
Definition: dest.h:87
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1159
int errdetail(const char *fmt,...)
Definition: elog.c:1205
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define LOG
Definition: elog.h:31
#define DEBUG3
Definition: elog.h:28
#define WARNING
Definition: elog.h:36
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
volatile uint32 InterruptHoldoffCount
Definition: globals.c:41
volatile sig_atomic_t QueryCancelPending
Definition: globals.c:31
struct Latch * MyLatch
Definition: globals.c:60
volatile sig_atomic_t ProcDiePending
Definition: globals.c:32
void GUC_check_errcode(int sqlerrcode)
Definition: guc.c:6734
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:640
#define newval
#define GUC_check_errmsg
Definition: guc.h:444
#define GUC_check_errdetail
Definition: guc.h:448
GucSource
Definition: guc.h:108
char * application_name
Definition: guc_tables.c:546
static void dlist_insert_after(dlist_node *after, dlist_node *node)
Definition: ilist.h:381
#define dlist_foreach(iter, lhead)
Definition: ilist.h:623
static void dlist_delete_thoroughly(dlist_node *node)
Definition: ilist.h:416
static bool dlist_node_is_detached(const dlist_node *node)
Definition: ilist.h:525
#define dlist_reverse_foreach(iter, lhead)
Definition: ilist.h:654
static void dlist_push_head(dlist_head *head, dlist_node *node)
Definition: ilist.h:347
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:640
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
static int pg_cmp_u64(uint64 a, uint64 b)
Definition: int.h:501
#define write(a, b, c)
Definition: win32.h:14
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int i
Definition: isn.c:73
void SetLatch(Latch *latch)
Definition: latch.c:632
void ResetLatch(Latch *latch)
Definition: latch.c:724
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:517
#define WL_LATCH_SET
Definition: latch.h:127
#define WL_POSTMASTER_DEATH
Definition: latch.h:131
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1170
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1939
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1783
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc(Size size)
Definition: mcxt.c:1316
static PgChecksumMode mode
Definition: pg_checksums.c:56
static rewind_source * source
Definition: pg_rewind.c:89
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define sprintf
Definition: port.h:240
#define qsort(a, b, c, d)
Definition: port.h:449
CommandDest whereToSendOutput
Definition: postgres.c:90
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:421
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:369
bool update_process_title
Definition: ps_status.c:29
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
PGPROC * MyProc
Definition: proc.c:66
Definition: proc.h:157
XLogRecPtr waitLSN
Definition: proc.h:248
dlist_node syncRepLinks
Definition: proc.h:250
int syncRepState
Definition: proc.h:249
Latch procLatch
Definition: proc.h:165
uint8 syncrep_method
Definition: syncrep.h:68
char member_names[FLEXIBLE_ARRAY_MEMBER]
Definition: syncrep.h:71
int sync_standby_priority
Definition: syncrep.h:49
XLogRecPtr apply
Definition: syncrep.h:48
XLogRecPtr write
Definition: syncrep.h:46
XLogRecPtr flush
Definition: syncrep.h:47
XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE]
WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER]
dlist_head SyncRepQueue[NUM_SYNC_REP_WAIT_MODE]
slock_t mutex
XLogRecPtr flush
WalSndState state
XLogRecPtr write
int sync_standby_priority
XLogRecPtr apply
dlist_node * cur
Definition: ilist.h:179
dlist_node * cur
Definition: ilist.h:200
Definition: regguts.h:323
static int SyncRepWaitMode
Definition: syncrep.c:98
void SyncRepInitConfig(void)
Definition: syncrep.c:402
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:148
static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, bool *am_sync)
Definition: syncrep.c:543
static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, SyncRepStandbyData *sync_standbys, int num_standbys, uint8 nth)
Definition: syncrep.c:650
void assign_synchronous_commit(int newval, void *extra)
Definition: syncrep.c:1054
void assign_synchronous_standby_names(const char *newval, void *extra)
Definition: syncrep.c:1048
static int standby_priority_comparator(const void *a, const void *b)
Definition: syncrep.c:790
static int SyncRepWakeQueue(bool all, int mode)
Definition: syncrep.c:864
SyncRepConfigData * SyncRepConfig
Definition: syncrep.c:97
int SyncRepGetCandidateStandbys(SyncRepStandbyData **standbys)
Definition: syncrep.c:711
void SyncRepReleaseWaiters(void)
Definition: syncrep.c:431
void SyncRepUpdateSyncStandbysDefined(void)
Definition: syncrep.c:921
static bool announce_next_takeover
Definition: syncrep.c:95
static int SyncRepGetStandbyPriority(void)
Definition: syncrep.c:817
char * SyncRepStandbyNames
Definition: syncrep.c:90
static void SyncRepQueueInsert(int mode)
Definition: syncrep.c:329
static void SyncRepCancelWait(void)
Definition: syncrep.c:363
bool check_synchronous_standby_names(char **newval, void **extra, GucSource source)
Definition: syncrep.c:991
static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, SyncRepStandbyData *sync_standbys, int num_standbys)
Definition: syncrep.c:617
void SyncRepCleanupAtProcExit(void)
Definition: syncrep.c:373
static int cmp_lsn(const void *a, const void *b)
Definition: syncrep.c:695
#define SyncStandbysDefined()
Definition: syncrep.c:92
#define SYNC_REP_PRIORITY
Definition: syncrep.h:35
#define NUM_SYNC_REP_WAIT_MODE
Definition: syncrep.h:27
void syncrep_scanner_init(const char *str)
#define SyncRepRequested()
Definition: syncrep.h:18
#define SYNC_REP_NO_WAIT
Definition: syncrep.h:22
void syncrep_scanner_finish(void)
#define SYNC_REP_WAIT_WRITE
Definition: syncrep.h:23
#define SYNC_REP_WAITING
Definition: syncrep.h:31
int syncrep_yyparse(void)
PGDLLIMPORT SyncRepConfigData * syncrep_parse_result
#define SYNC_REP_WAIT_COMPLETE
Definition: syncrep.h:32
#define SYNC_REP_WAIT_FLUSH
Definition: syncrep.h:24
PGDLLIMPORT char * syncrep_parse_error_msg
#define SYNC_REP_NOT_WAITING
Definition: syncrep.h:30
#define SYNC_REP_WAIT_APPLY
Definition: syncrep.h:25
WalSnd * MyWalSnd
Definition: walsender.c:112
int max_wal_senders
Definition: walsender.c:121
bool am_cascading_walsender
Definition: walsender.c:116
WalSndCtlData * WalSndCtl
Definition: walsender.c:109
WalSndState
@ WALSNDSTATE_STREAMING
@ WALSNDSTATE_STOPPING
@ SYNCHRONOUS_COMMIT_REMOTE_WRITE
Definition: xact.h:72
@ SYNCHRONOUS_COMMIT_REMOTE_APPLY
Definition: xact.h:75
@ SYNCHRONOUS_COMMIT_REMOTE_FLUSH
Definition: xact.h:74
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28