PostgreSQL Source Code  git master
syncrep.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * syncrep.c
4  *
5  * Synchronous replication is new as of PostgreSQL 9.1.
6  *
7  * If requested, transaction commits wait until their commit LSNs are
8  * acknowledged by the synchronous standbys.
9  *
10  * This module contains the code for waiting and release of backends.
11  * All code in this module executes on the primary. The core streaming
12  * replication transport remains within WALreceiver/WALsender modules.
13  *
14  * The essence of this design is that it isolates all logic about
15  * waiting/releasing onto the primary. The primary defines which standbys
16  * it wishes to wait for. The standbys are completely unaware of the
17  * durability requirements of transactions on the primary, reducing the
18  * complexity of the code and streamlining both standby operations and
19  * network bandwidth because there is no requirement to ship
20  * per-transaction state information.
21  *
22  * Replication is either synchronous or not synchronous (async). If it is
23  * async, we just fastpath out of here. If it is sync, then we wait for
24  * the write, flush or apply location on the standby before releasing
25  * the waiting backend. Further complexity in that interaction is
26  * expected in later releases.
27  *
28  * The best performing way to manage the waiting backends is to have a
29  * single ordered queue of waiting backends, so that we can avoid
30  * searching through all the waiters each time we receive a reply.
31  *
32  * In 9.5 or before only a single standby could be considered as
33  * synchronous. In 9.6 we support multiple priority-based synchronous
34  * standbys. In 10.0 multiple quorum-based synchronous standbys are also
35  * supported. The number of synchronous standbys that transactions
36  * must wait for replies from is specified in synchronous_standby_names.
37  * This parameter also specifies a list of standby names and the method
38  * (FIRST and ANY) to choose synchronous standbys from the listed ones.
39  *
40  * The method FIRST specifies a priority-based synchronous replication
41  * and makes transaction commits wait until their WAL records are
42  * replicated to the requested number of synchronous standbys chosen based
43  * on their priorities. The standbys whose names appear earlier in the list
44  * are given higher priority and will be considered as synchronous.
45  * Other standby servers appearing later in this list represent potential
46  * synchronous standbys. If any of the current synchronous standbys
47  * disconnects for whatever reason, it will be replaced immediately with
48  * the next-highest-priority standby.
49  *
50  * The method ANY specifies a quorum-based synchronous replication
51  * and makes transaction commits wait until their WAL records are
52  * replicated to at least the requested number of synchronous standbys
53  * in the list. All the standbys appearing in the list are considered as
54  * candidates for quorum synchronous standbys.
55  *
56  * If neither FIRST nor ANY is specified, FIRST is used as the method.
57  * This is for backward compatibility with 9.6 or before where only a
58  * priority-based sync replication was supported.
59  *
60  * Before the standbys chosen from synchronous_standby_names can
61  * become the synchronous standbys they must have caught up with
62  * the primary; that may take some time. Once caught up,
63  * the standbys which are considered as synchronous at that moment
64  * will release waiters from the queue.
65  *
66  * Portions Copyright (c) 2010-2020, PostgreSQL Global Development Group
67  *
68  * IDENTIFICATION
69  * src/backend/replication/syncrep.c
70  *
71  *-------------------------------------------------------------------------
72  */
73 #include "postgres.h"
74 
75 #include <unistd.h>
76 
77 #include "access/xact.h"
78 #include "miscadmin.h"
79 #include "pgstat.h"
80 #include "replication/syncrep.h"
81 #include "replication/walsender.h"
83 #include "storage/pmsignal.h"
84 #include "storage/proc.h"
85 #include "tcop/tcopprot.h"
86 #include "utils/builtins.h"
87 #include "utils/ps_status.h"
88 
89 /* User-settable parameters for sync rep */
91 
92 #define SyncStandbysDefined() \
93  (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0')
94 
95 static bool announce_next_takeover = true;
96 
99 
100 static void SyncRepQueueInsert(int mode);
101 static void SyncRepCancelWait(void);
102 static int SyncRepWakeQueue(bool all, int mode);
103 
104 static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr,
105  XLogRecPtr *flushPtr,
106  XLogRecPtr *applyPtr,
107  bool *am_sync);
108 static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
109  XLogRecPtr *flushPtr,
110  XLogRecPtr *applyPtr,
111  SyncRepStandbyData *sync_standbys,
112  int num_standbys);
113 static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr,
114  XLogRecPtr *flushPtr,
115  XLogRecPtr *applyPtr,
116  SyncRepStandbyData *sync_standbys,
117  int num_standbys,
118  uint8 nth);
119 static int SyncRepGetStandbyPriority(void);
120 static int standby_priority_comparator(const void *a, const void *b);
121 static int cmp_lsn(const void *a, const void *b);
122 
123 #ifdef USE_ASSERT_CHECKING
124 static bool SyncRepQueueIsOrderedByLSN(int mode);
125 #endif
126 
127 /*
128  * ===========================================================
129  * Synchronous Replication functions for normal user backends
130  * ===========================================================
131  */
132 
133 /*
134  * Wait for synchronous replication, if requested by user.
135  *
136  * Initially backends start in state SYNC_REP_NOT_WAITING and then
137  * change that state to SYNC_REP_WAITING before adding ourselves
138  * to the wait queue. During SyncRepWakeQueue() a WALSender changes
139  * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
140  * This backend then resets its state to SYNC_REP_NOT_WAITING.
141  *
142  * 'lsn' represents the LSN to wait for. 'commit' indicates whether this LSN
143  * represents a commit record. If it doesn't, then we wait only for the WAL
144  * to be flushed if synchronous_commit is set to the higher level of
145  * remote_apply, because only commit records provide apply feedback.
146  */
147 void
148 SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
149 {
150  char *new_status = NULL;
151  const char *old_status;
152  int mode;
153 
154  /*
155  * This should be called while holding interrupts during a transaction
156  * commit to prevent the follow-up shared memory queue cleanups from
157  * being influenced by external interruptions.
158  */
160 
161  /*
162  * Fast exit if user has not requested sync replication, or there are no
163  * sync replication standby names defined.
164  *
165  * Since this routine gets called every commit time, it's important to
166  * exit quickly if sync replication is not requested. So we check
167  * WalSndCtl->sync_standbys_defined flag without the lock and exit
168  * immediately if it's false. If it's true, we need to check it again later
169  * while holding the lock, to check the flag and operate the sync rep
170  * queue atomically. This is necessary to avoid the race condition
171  * described in SyncRepUpdateSyncStandbysDefined(). On the other
172  * hand, if it's false, the lock is not necessary because we don't touch
173  * the queue.
174  */
175  if (!SyncRepRequested() ||
176  !((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined)
177  return;
178 
179  /* Cap the level for anything other than commit to remote flush only. */
180  if (commit)
181  mode = SyncRepWaitMode;
182  else
184 
186  Assert(WalSndCtl != NULL);
187 
188  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
190 
191  /*
192  * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
193  * set. See SyncRepUpdateSyncStandbysDefined.
194  *
195  * Also check that the standby hasn't already replied. Unlikely race
196  * condition but we'll be fetching that cache line anyway so it's likely
197  * to be a low cost check.
198  */
199  if (!WalSndCtl->sync_standbys_defined ||
200  lsn <= WalSndCtl->lsn[mode])
201  {
202  LWLockRelease(SyncRepLock);
203  return;
204  }
205 
206  /*
207  * Set our waitLSN so WALSender will know when to wake us, and add
208  * ourselves to the queue.
209  */
210  MyProc->waitLSN = lsn;
212  SyncRepQueueInsert(mode);
213  Assert(SyncRepQueueIsOrderedByLSN(mode));
214  LWLockRelease(SyncRepLock);
215 
216  /* Alter ps display to show waiting for sync rep. */
218  {
219  int len;
220 
221  old_status = get_ps_display(&len);
222  new_status = (char *) palloc(len + 32 + 1);
223  memcpy(new_status, old_status, len);
224  sprintf(new_status + len, " waiting for %X/%X",
225  (uint32) (lsn >> 32), (uint32) lsn);
226  set_ps_display(new_status);
227  new_status[len] = '\0'; /* truncate off " waiting ..." */
228  }
229 
230  /*
231  * Wait for specified LSN to be confirmed.
232  *
233  * Each proc has its own wait latch, so we perform a normal latch
234  * check/wait loop here.
235  */
236  for (;;)
237  {
238  int rc;
239 
240  /* Must reset the latch before testing state. */
242 
243  /*
244  * Acquiring the lock is not needed, the latch ensures proper
245  * barriers. If it looks like we're done, we must really be done,
246  * because once walsender changes the state to SYNC_REP_WAIT_COMPLETE,
247  * it will never update it again, so we can't be seeing a stale value
248  * in that case.
249  */
251  break;
252 
253  /*
254  * If a wait for synchronous replication is pending, we can neither
255  * acknowledge the commit nor raise ERROR or FATAL. The latter would
256  * lead the client to believe that the transaction aborted, which is
257  * not true: it's already committed locally. The former is no good
258  * either: the client has requested synchronous replication, and is
259  * entitled to assume that an acknowledged commit is also replicated,
260  * which might not be true. So in this case we issue a WARNING (which
261  * some clients may be able to interpret) and shut off further output.
262  * We do NOT reset ProcDiePending, so that the process will die after
263  * the commit is cleaned up.
264  */
265  if (ProcDiePending)
266  {
268  (errcode(ERRCODE_ADMIN_SHUTDOWN),
269  errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
270  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
273  break;
274  }
275 
276  /*
277  * It's unclear what to do if a query cancel interrupt arrives. We
278  * can't actually abort at this point, but ignoring the interrupt
279  * altogether is not helpful, so we just terminate the wait with a
280  * suitable warning.
281  */
282  if (QueryCancelPending)
283  {
284  QueryCancelPending = false;
286  (errmsg("canceling wait for synchronous replication due to user request"),
287  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
289  break;
290  }
291 
292  /*
293  * Wait on latch. Any condition that should wake us up will set the
294  * latch, so no need for timeout.
295  */
298 
299  /*
300  * If the postmaster dies, we'll probably never get an acknowledgment,
301  * because all the wal sender processes will exit. So just bail out.
302  */
303  if (rc & WL_POSTMASTER_DEATH)
304  {
305  ProcDiePending = true;
308  break;
309  }
310  }
311 
312  /*
313  * WalSender has checked our LSN and has removed us from queue. Clean up
314  * state and leave. It's OK to reset these shared memory fields without
315  * holding SyncRepLock, because any walsenders will ignore us anyway when
316  * we're not on the queue. We need a read barrier to make sure we see the
317  * changes to the queue link (this might be unnecessary without
318  * assertions, but better safe than sorry).
319  */
320  pg_read_barrier();
323  MyProc->waitLSN = 0;
324 
325  if (new_status)
326  {
327  /* Reset ps display */
328  set_ps_display(new_status);
329  pfree(new_status);
330  }
331 }
332 
333 /*
334  * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant.
335  *
336  * Usually we will go at tail of queue, though it's possible that we arrive
337  * here out of order, so start at tail and work back to insertion point.
338  */
339 static void
341 {
342  PGPROC *proc;
343 
344  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
345  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
347  offsetof(PGPROC, syncRepLinks));
348 
349  while (proc)
350  {
351  /*
352  * Stop at the queue element that we should insert after, to ensure
353  * the queue is ordered by LSN.
354  */
355  if (proc->waitLSN < MyProc->waitLSN)
356  break;
357 
358  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
359  &(proc->syncRepLinks),
360  offsetof(PGPROC, syncRepLinks));
361  }
362 
363  if (proc)
365  else
367 }
368 
369 /*
370  * Acquire SyncRepLock and cancel any wait currently in progress.
371  */
372 static void
374 {
375  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
379  LWLockRelease(SyncRepLock);
380 }
381 
382 void
384 {
385  /*
386  * First check if we are removed from the queue without the lock to not
387  * slow down backend exit.
388  */
390  {
391  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
392 
393  /* maybe we have just been removed, so recheck */
396 
397  LWLockRelease(SyncRepLock);
398  }
399 }
400 
401 /*
402  * ===========================================================
403  * Synchronous Replication functions for wal sender processes
404  * ===========================================================
405  */
406 
407 /*
408  * Take any action required to initialise sync rep state from config
409  * data. Called at WALSender startup and after each SIGHUP.
410  */
411 void
413 {
414  int priority;
415 
416  /*
417  * Determine if we are a potential sync standby and remember the result
418  * for handling replies from standby.
419  */
420  priority = SyncRepGetStandbyPriority();
421  if (MyWalSnd->sync_standby_priority != priority)
422  {
424  MyWalSnd->sync_standby_priority = priority;
426 
427  ereport(DEBUG1,
428  (errmsg("standby \"%s\" now has synchronous standby priority %u",
429  application_name, priority)));
430  }
431 }
432 
433 /*
434  * Update the LSNs on each queue based upon our latest state. This
435  * implements a simple policy of first-valid-sync-standby-releases-waiter.
436  *
437  * Other policies are possible, which would change what we do here and
438  * perhaps also which information we store as well.
439  */
440 void
442 {
443  volatile WalSndCtlData *walsndctl = WalSndCtl;
444  XLogRecPtr writePtr;
445  XLogRecPtr flushPtr;
446  XLogRecPtr applyPtr;
447  bool got_recptr;
448  bool am_sync;
449  int numwrite = 0;
450  int numflush = 0;
451  int numapply = 0;
452 
453  /*
454  * If this WALSender is serving a standby that is not on the list of
455  * potential sync standbys then we have nothing to do. If we are still
456  * starting up, still running base backup or the current flush position is
457  * still invalid, then leave quickly also. Streaming or stopping WAL
458  * senders are allowed to release waiters.
459  */
460  if (MyWalSnd->sync_standby_priority == 0 ||
464  {
465  announce_next_takeover = true;
466  return;
467  }
468 
469  /*
470  * We're a potential sync standby. Release waiters if there are enough
471  * sync standbys and we are considered as sync.
472  */
473  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
474 
475  /*
476  * Check whether we are a sync standby or not, and calculate the synced
477  * positions among all sync standbys. (Note: although this step does not
478  * of itself require holding SyncRepLock, it seems like a good idea to do
479  * it after acquiring the lock. This ensures that the WAL pointers we use
480  * to release waiters are newer than any previous execution of this
481  * routine used.)
482  */
483  got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync);
484 
485  /*
486  * If we are managing a sync standby, though we weren't prior to this,
487  * then announce we are now a sync standby.
488  */
489  if (announce_next_takeover && am_sync)
490  {
491  announce_next_takeover = false;
492 
493  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
494  ereport(LOG,
495  (errmsg("standby \"%s\" is now a synchronous standby with priority %u",
497  else
498  ereport(LOG,
499  (errmsg("standby \"%s\" is now a candidate for quorum synchronous standby",
500  application_name)));
501  }
502 
503  /*
504  * If the number of sync standbys is less than requested or we aren't
505  * managing a sync standby then just leave.
506  */
507  if (!got_recptr || !am_sync)
508  {
509  LWLockRelease(SyncRepLock);
510  announce_next_takeover = !am_sync;
511  return;
512  }
513 
514  /*
515  * Set the lsn first so that when we wake backends they will release up to
516  * this location.
517  */
518  if (walsndctl->lsn[SYNC_REP_WAIT_WRITE] < writePtr)
519  {
520  walsndctl->lsn[SYNC_REP_WAIT_WRITE] = writePtr;
521  numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE);
522  }
523  if (walsndctl->lsn[SYNC_REP_WAIT_FLUSH] < flushPtr)
524  {
525  walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = flushPtr;
526  numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH);
527  }
528  if (walsndctl->lsn[SYNC_REP_WAIT_APPLY] < applyPtr)
529  {
530  walsndctl->lsn[SYNC_REP_WAIT_APPLY] = applyPtr;
531  numapply = SyncRepWakeQueue(false, SYNC_REP_WAIT_APPLY);
532  }
533 
534  LWLockRelease(SyncRepLock);
535 
536  elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X",
537  numwrite, (uint32) (writePtr >> 32), (uint32) writePtr,
538  numflush, (uint32) (flushPtr >> 32), (uint32) flushPtr,
539  numapply, (uint32) (applyPtr >> 32), (uint32) applyPtr);
540 }
541 
542 /*
543  * Calculate the synced Write, Flush and Apply positions among sync standbys.
544  *
545  * Return false if the number of sync standbys is less than
546  * synchronous_standby_names specifies. Otherwise return true and
547  * store the positions into *writePtr, *flushPtr and *applyPtr.
548  *
549  * On return, *am_sync is set to true if this walsender is connecting to
550  * sync standby. Otherwise it's set to false.
551  */
552 static bool
554  XLogRecPtr *applyPtr, bool *am_sync)
555 {
556  SyncRepStandbyData *sync_standbys;
557  int num_standbys;
558  int i;
559 
560  /* Initialize default results */
561  *writePtr = InvalidXLogRecPtr;
562  *flushPtr = InvalidXLogRecPtr;
563  *applyPtr = InvalidXLogRecPtr;
564  *am_sync = false;
565 
566  /* Quick out if not even configured to be synchronous */
567  if (SyncRepConfig == NULL)
568  return false;
569 
570  /* Get standbys that are considered as synchronous at this moment */
571  num_standbys = SyncRepGetCandidateStandbys(&sync_standbys);
572 
573  /* Am I among the candidate sync standbys? */
574  for (i = 0; i < num_standbys; i++)
575  {
576  if (sync_standbys[i].is_me)
577  {
578  *am_sync = true;
579  break;
580  }
581  }
582 
583  /*
584  * Nothing more to do if we are not managing a sync standby or there are
585  * not enough synchronous standbys.
586  */
587  if (!(*am_sync) ||
588  num_standbys < SyncRepConfig->num_sync)
589  {
590  pfree(sync_standbys);
591  return false;
592  }
593 
594  /*
595  * In a priority-based sync replication, the synced positions are the
596  * oldest ones among sync standbys. In a quorum-based, they are the Nth
597  * latest ones.
598  *
599  * SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest
600  * positions. But we use SyncRepGetOldestSyncRecPtr() for that calculation
601  * because it's a bit more efficient.
602  *
603  * XXX If the numbers of current and requested sync standbys are the same,
604  * we can use SyncRepGetOldestSyncRecPtr() to calculate the synced
605  * positions even in a quorum-based sync replication.
606  */
607  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
608  {
609  SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr,
610  sync_standbys, num_standbys);
611  }
612  else
613  {
614  SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr,
615  sync_standbys, num_standbys,
616  SyncRepConfig->num_sync);
617  }
618 
619  pfree(sync_standbys);
620  return true;
621 }
622 
623 /*
624  * Calculate the oldest Write, Flush and Apply positions among sync standbys.
625  */
626 static void
628  XLogRecPtr *flushPtr,
629  XLogRecPtr *applyPtr,
630  SyncRepStandbyData *sync_standbys,
631  int num_standbys)
632 {
633  int i;
634 
635  /*
636  * Scan through all sync standbys and calculate the oldest Write, Flush
637  * and Apply positions. We assume *writePtr et al were initialized to
638  * InvalidXLogRecPtr.
639  */
640  for (i = 0; i < num_standbys; i++)
641  {
642  XLogRecPtr write = sync_standbys[i].write;
643  XLogRecPtr flush = sync_standbys[i].flush;
644  XLogRecPtr apply = sync_standbys[i].apply;
645 
646  if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write)
647  *writePtr = write;
648  if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush)
649  *flushPtr = flush;
650  if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply)
651  *applyPtr = apply;
652  }
653 }
654 
655 /*
656  * Calculate the Nth latest Write, Flush and Apply positions among sync
657  * standbys.
658  */
659 static void
661  XLogRecPtr *flushPtr,
662  XLogRecPtr *applyPtr,
663  SyncRepStandbyData *sync_standbys,
664  int num_standbys,
665  uint8 nth)
666 {
667  XLogRecPtr *write_array;
668  XLogRecPtr *flush_array;
669  XLogRecPtr *apply_array;
670  int i;
671 
672  /* Should have enough candidates, or somebody messed up */
673  Assert(nth > 0 && nth <= num_standbys);
674 
675  write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
676  flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
677  apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
678 
679  for (i = 0; i < num_standbys; i++)
680  {
681  write_array[i] = sync_standbys[i].write;
682  flush_array[i] = sync_standbys[i].flush;
683  apply_array[i] = sync_standbys[i].apply;
684  }
685 
686  /* Sort each array in descending order */
687  qsort(write_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
688  qsort(flush_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
689  qsort(apply_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
690 
691  /* Get Nth latest Write, Flush, Apply positions */
692  *writePtr = write_array[nth - 1];
693  *flushPtr = flush_array[nth - 1];
694  *applyPtr = apply_array[nth - 1];
695 
696  pfree(write_array);
697  pfree(flush_array);
698  pfree(apply_array);
699 }
700 
/*
 * qsort comparator for XLogRecPtr values, sorting in descending order.
 *
 * a, b: pointers to the two XLogRecPtr values to compare.
 *
 * Returns -1 if *a is greater than *b, 0 if they are equal, and 1 if *a is
 * less than *b, so that larger LSNs sort first.  The values are compared
 * directly rather than subtracted, since a difference of two LSNs need not
 * fit in an int.
 */
static int
cmp_lsn(const void *a, const void *b)
{
    XLogRecPtr  lsn1 = *((const XLogRecPtr *) a);
    XLogRecPtr  lsn2 = *((const XLogRecPtr *) b);

    if (lsn1 > lsn2)
        return -1;
    else if (lsn1 == lsn2)
        return 0;
    else
        return 1;
}
717 
718 /*
719  * Return data about walsenders that are candidates to be sync standbys.
720  *
721  * *standbys is set to a palloc'd array of structs of per-walsender data,
722  * and the number of valid entries (candidate sync senders) is returned.
723  * (This might be more or fewer than num_sync; caller must check.)
724  */
725 int
727 {
728  int i;
729  int n;
730 
731  /* Create result array */
732  *standbys = (SyncRepStandbyData *)
734 
735  /* Quick exit if sync replication is not requested */
736  if (SyncRepConfig == NULL)
737  return 0;
738 
739  /* Collect raw data from shared memory */
740  n = 0;
741  for (i = 0; i < max_wal_senders; i++)
742  {
743  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
744  * rearrangement */
745  SyncRepStandbyData *stby;
746  WalSndState state; /* not included in SyncRepStandbyData */
747 
748  walsnd = &WalSndCtl->walsnds[i];
749  stby = *standbys + n;
750 
751  SpinLockAcquire(&walsnd->mutex);
752  stby->pid = walsnd->pid;
753  state = walsnd->state;
754  stby->write = walsnd->write;
755  stby->flush = walsnd->flush;
756  stby->apply = walsnd->apply;
758  SpinLockRelease(&walsnd->mutex);
759 
760  /* Must be active */
761  if (stby->pid == 0)
762  continue;
763 
764  /* Must be streaming or stopping */
765  if (state != WALSNDSTATE_STREAMING &&
766  state != WALSNDSTATE_STOPPING)
767  continue;
768 
769  /* Must be synchronous */
770  if (stby->sync_standby_priority == 0)
771  continue;
772 
773  /* Must have a valid flush position */
774  if (XLogRecPtrIsInvalid(stby->flush))
775  continue;
776 
777  /* OK, it's a candidate */
778  stby->walsnd_index = i;
779  stby->is_me = (walsnd == MyWalSnd);
780  n++;
781  }
782 
783  /*
784  * In quorum mode, we return all the candidates. In priority mode, if we
785  * have too many candidates then return only the num_sync ones of highest
786  * priority.
787  */
788  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY &&
789  n > SyncRepConfig->num_sync)
790  {
791  /* Sort by priority ... */
792  qsort(*standbys, n, sizeof(SyncRepStandbyData),
794  /* ... then report just the first num_sync ones */
795  n = SyncRepConfig->num_sync;
796  }
797 
798  return n;
799 }
800 
801 /*
802  * qsort comparator to sort SyncRepStandbyData entries by priority
803  */
804 static int
805 standby_priority_comparator(const void *a, const void *b)
806 {
807  const SyncRepStandbyData *sa = (const SyncRepStandbyData *) a;
808  const SyncRepStandbyData *sb = (const SyncRepStandbyData *) b;
809 
810  /* First, sort by increasing priority value */
813 
814  /*
815  * We might have equal priority values; arbitrarily break ties by position
816  * in the WALSnd array. (This is utterly bogus, since that is arrival
817  * order dependent, but there are regression tests that rely on it.)
818  */
819  return sa->walsnd_index - sb->walsnd_index;
820 }
821 
822 
823 /*
824  * Check if we are in the list of sync standbys, and if so, determine
825  * priority sequence. Return priority if set, or zero to indicate that
826  * we are not a potential sync standby.
827  *
828  * Compare the parameter SyncRepStandbyNames against the application_name
829  * for this WALSender, or allow any name if we find a wildcard "*".
830  */
831 static int
833 {
834  const char *standby_name;
835  int priority;
836  bool found = false;
837 
838  /*
839  * Since synchronous cascade replication is not allowed, we always set the
840  * priority of cascading walsender to zero.
841  */
843  return 0;
844 
845  if (!SyncStandbysDefined() || SyncRepConfig == NULL)
846  return 0;
847 
848  standby_name = SyncRepConfig->member_names;
849  for (priority = 1; priority <= SyncRepConfig->nmembers; priority++)
850  {
851  if (pg_strcasecmp(standby_name, application_name) == 0 ||
852  strcmp(standby_name, "*") == 0)
853  {
854  found = true;
855  break;
856  }
857  standby_name += strlen(standby_name) + 1;
858  }
859 
860  if (!found)
861  return 0;
862 
863  /*
864  * In quorum-based sync replication, all the standbys in the list have the
865  * same priority, one.
866  */
867  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ? priority : 1;
868 }
869 
870 /*
871  * Walk the specified queue from head. Set the state of any backends that
872  * need to be woken, remove them from the queue, and then wake them.
873  * Pass all = true to wake whole queue; otherwise, just wake up to
874  * the walsender's LSN.
875  *
876  * The caller must hold SyncRepLock in exclusive mode.
877  */
878 static int
879 SyncRepWakeQueue(bool all, int mode)
880 {
881  volatile WalSndCtlData *walsndctl = WalSndCtl;
882  PGPROC *proc = NULL;
883  PGPROC *thisproc = NULL;
884  int numprocs = 0;
885 
886  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
888  Assert(SyncRepQueueIsOrderedByLSN(mode));
889 
890  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
892  offsetof(PGPROC, syncRepLinks));
893 
894  while (proc)
895  {
896  /*
897  * Assume the queue is ordered by LSN
898  */
899  if (!all && walsndctl->lsn[mode] < proc->waitLSN)
900  return numprocs;
901 
902  /*
903  * Move to next proc, so we can delete thisproc from the queue.
904  * thisproc is valid, proc may be NULL after this.
905  */
906  thisproc = proc;
907  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
908  &(proc->syncRepLinks),
909  offsetof(PGPROC, syncRepLinks));
910 
911  /*
912  * Remove thisproc from queue.
913  */
914  SHMQueueDelete(&(thisproc->syncRepLinks));
915 
916  /*
917  * SyncRepWaitForLSN() reads syncRepState without holding the lock, so
918  * make sure that it sees the queue link being removed before the
919  * syncRepState change.
920  */
922 
923  /*
924  * Set state to complete; see SyncRepWaitForLSN() for discussion of
925  * the various states.
926  */
928 
929  /*
930  * Wake only when we have set state and removed from queue.
931  */
932  SetLatch(&(thisproc->procLatch));
933 
934  numprocs++;
935  }
936 
937  return numprocs;
938 }
939 
940 /*
941  * The checkpointer calls this as needed to update the shared
942  * sync_standbys_defined flag, so that backends don't remain permanently wedged
943  * if synchronous_standby_names is unset. It's safe to check the current value
944  * without the lock, because it's only ever updated by one process. But we
945  * must take the lock to change it.
946  */
947 void
949 {
950  bool sync_standbys_defined = SyncStandbysDefined();
951 
952  if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
953  {
954  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
955 
956  /*
957  * If synchronous_standby_names has been reset to empty, it's futile
958  * for backends to continue waiting. Since the user no longer wants
959  * synchronous replication, we'd better wake them up.
960  */
961  if (!sync_standbys_defined)
962  {
963  int i;
964 
965  for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
966  SyncRepWakeQueue(true, i);
967  }
968 
969  /*
970  * Only allow people to join the queue when there are synchronous
971  * standbys defined. Without this interlock, there's a race
972  * condition: we might wake up all the current waiters; then, some
973  * backend that hasn't yet reloaded its config might go to sleep on
974  * the queue (and never wake up). This prevents that.
975  */
976  WalSndCtl->sync_standbys_defined = sync_standbys_defined;
977 
978  LWLockRelease(SyncRepLock);
979  }
980 }
981 
982 #ifdef USE_ASSERT_CHECKING
983 static bool
984 SyncRepQueueIsOrderedByLSN(int mode)
985 {
986  PGPROC *proc = NULL;
987  XLogRecPtr lastLSN;
988 
989  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
990 
991  lastLSN = 0;
992 
993  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
995  offsetof(PGPROC, syncRepLinks));
996 
997  while (proc)
998  {
999  /*
1000  * Check the queue is ordered by LSN and that multiple procs don't
1001  * have matching LSNs
1002  */
1003  if (proc->waitLSN <= lastLSN)
1004  return false;
1005 
1006  lastLSN = proc->waitLSN;
1007 
1008  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1009  &(proc->syncRepLinks),
1010  offsetof(PGPROC, syncRepLinks));
1011  }
1012 
1013  return true;
1014 }
1015 #endif
1016 
1017 /*
1018  * ===========================================================
1019  * Synchronous Replication functions executed by any process
1020  * ===========================================================
1021  */
1022 
1023 bool
1024 check_synchronous_standby_names(char **newval, void **extra, GucSource source)
1025 {
1026  if (*newval != NULL && (*newval)[0] != '\0')
1027  {
1028  int parse_rc;
1029  SyncRepConfigData *pconf;
1030 
1031  /* Reset communication variables to ensure a fresh start */
1032  syncrep_parse_result = NULL;
1033  syncrep_parse_error_msg = NULL;
1034 
1035  /* Parse the synchronous_standby_names string */
1036  syncrep_scanner_init(*newval);
1037  parse_rc = syncrep_yyparse();
1039 
1040  if (parse_rc != 0 || syncrep_parse_result == NULL)
1041  {
1042  GUC_check_errcode(ERRCODE_SYNTAX_ERROR);
1045  else
1046  GUC_check_errdetail("synchronous_standby_names parser failed");
1047  return false;
1048  }
1049 
1050  if (syncrep_parse_result->num_sync <= 0)
1051  {
1052  GUC_check_errmsg("number of synchronous standbys (%d) must be greater than zero",
1054  return false;
1055  }
1056 
1057  /* GUC extra value must be malloc'd, not palloc'd */
1058  pconf = (SyncRepConfigData *)
1060  if (pconf == NULL)
1061  return false;
1063 
1064  *extra = (void *) pconf;
1065 
1066  /*
1067  * We need not explicitly clean up syncrep_parse_result. It, and any
1068  * other cruft generated during parsing, will be freed when the
1069  * current memory context is deleted. (This code is generally run in
1070  * a short-lived context used for config file processing, so that will
1071  * not be very long.)
1072  */
1073  }
1074  else
1075  *extra = NULL;
1076 
1077  return true;
1078 }
1079 
1080 void
1081 assign_synchronous_standby_names(const char *newval, void *extra)
1082 {
1083  SyncRepConfig = (SyncRepConfigData *) extra;
1084 }
1085 
1086 void
1088 {
1089  switch (newval)
1090  {
1093  break;
1096  break;
1099  break;
1100  default:
1102  break;
1103  }
1104 }
void syncrep_scanner_finish(void)
void SyncRepUpdateSyncStandbysDefined(void)
Definition: syncrep.c:948
XLogRecPtr write
static PgChecksumMode mode
Definition: pg_checksums.c:61
volatile uint32 InterruptHoldoffCount
Definition: globals.c:36
void assign_synchronous_standby_names(const char *newval, void *extra)
Definition: syncrep.c:1081
Pointer SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:164
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
bool LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
Definition: lwlock.c:1946
static void SyncRepCancelWait(void)
Definition: syncrep.c:373
volatile sig_atomic_t QueryCancelPending
Definition: globals.c:31
#define DEBUG1
Definition: elog.h:25
void syncrep_scanner_init(const char *query_string)
static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, bool *am_sync)
Definition: syncrep.c:553
#define DEBUG3
Definition: elog.h:23
#define SYNC_REP_PRIORITY
Definition: syncrep.h:36
bool update_process_title
Definition: ps_status.c:36
#define write(a, b, c)
Definition: win32.h:14
#define GUC_check_errdetail
Definition: guc.h:415
static void SyncRepQueueInsert(int mode)
Definition: syncrep.c:340
PGPROC * MyProc
Definition: proc.c:67
char * syncrep_parse_error_msg
#define SYNC_REP_WAITING
Definition: syncrep.h:32
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:148
uint8 syncrep_method
Definition: syncrep.h:69
#define Min(x, y)
Definition: c.h:927
unsigned char uint8
Definition: c.h:372
#define SYNC_REP_NOT_WAITING
Definition: syncrep.h:31
WalSndCtlData * WalSndCtl
Definition: walsender.c:109
int errcode(int sqlerrcode)
Definition: elog.c:610
#define GUC_check_errmsg
Definition: guc.h:411
WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER]
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define LOG
Definition: elog.h:26
void SetLatch(Latch *latch)
Definition: latch.c:505
SyncRepConfigData * SyncRepConfig
Definition: syncrep.c:97
int sync_standby_priority
Definition: syncrep.h:50
void GUC_check_errcode(int sqlerrcode)
Definition: guc.c:10928
static int SyncRepWakeQueue(bool all, int mode)
Definition: syncrep.c:879
void ResetLatch(Latch *latch)
Definition: latch.c:588
slock_t mutex
Latch procLatch
Definition: proc.h:121
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:390
GucSource
Definition: guc.h:105
#define malloc(a)
Definition: header.h:50
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1812
void set_ps_display(const char *activity)
Definition: ps_status.c:349
#define sprintf
Definition: port.h:195
#define SpinLockAcquire(lock)
Definition: spin.h:62
XLogRecPtr flush
void pfree(void *pointer)
Definition: mcxt.c:1057
static int cmp_lsn(const void *a, const void *b)
Definition: syncrep.c:705
static int standby_priority_comparator(const void *a, const void *b)
Definition: syncrep.c:805
XLogRecPtr flush
Definition: syncrep.h:48
#define SYNC_REP_NO_WAIT
Definition: syncrep.h:23
#define SYNC_REP_WAIT_APPLY
Definition: syncrep.h:26
void SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem)
Definition: shmqueue.c:108
static bool announce_next_takeover
Definition: syncrep.c:95
static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, SyncRepStandbyData *sync_standbys, int num_standbys, uint8 nth)
Definition: syncrep.c:660
XLogRecPtr write
Definition: syncrep.h:47
Definition: dest.h:89
const char * get_ps_display(int *displen)
Definition: ps_status.c:430
WalSndState state
XLogRecPtr apply
Definition: syncrep.h:49
void assign_synchronous_commit(int newval, void *extra)
Definition: syncrep.c:1087
#define SyncStandbysDefined()
Definition: syncrep.c:92
int errdetail(const char *fmt,...)
Definition: elog.c:957
unsigned int uint32
Definition: c.h:374
SHM_QUEUE SyncRepQueue[NUM_SYNC_REP_WAIT_MODE]
void SyncRepInitConfig(void)
Definition: syncrep.c:412
#define SYNC_REP_WAIT_FLUSH
Definition: syncrep.h:25
#define SyncRepRequested()
Definition: syncrep.h:19
int max_wal_senders
Definition: walsender.c:121
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define WARNING
Definition: elog.h:40
#define SpinLockRelease(lock)
Definition: spin.h:64
char * SyncRepStandbyNames
Definition: syncrep.c:90
#define WL_POSTMASTER_DEATH
Definition: latch.h:128
Pointer SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:145
WalSnd * MyWalSnd
Definition: walsender.c:112
#define ereport(elevel,...)
Definition: elog.h:144
#define SYNC_REP_WAIT_COMPLETE
Definition: syncrep.h:33
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:745
char member_names[FLEXIBLE_ARRAY_MEMBER]
Definition: syncrep.h:72
int sync_standby_priority
volatile sig_atomic_t ProcDiePending
Definition: globals.c:32
static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, SyncRepStandbyData *sync_standbys, int num_standbys)
Definition: syncrep.c:627
Definition: regguts.h:298
#define pg_read_barrier()
Definition: atomics.h:158
bool SHMQueueIsDetached(const SHM_QUEUE *queue)
Definition: shmqueue.c:47
SyncRepConfigData * syncrep_parse_result
#define newval
bool check_synchronous_standby_names(char **newval, void **extra, GucSource source)
Definition: syncrep.c:1024
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1208
SHM_QUEUE syncRepLinks
Definition: proc.h:189
void SyncRepCleanupAtProcExit(void)
Definition: syncrep.c:383
WalSndState
XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE]
int SyncRepGetCandidateStandbys(SyncRepStandbyData **standbys)
Definition: syncrep.c:726
char * application_name
Definition: guc.c:563
void * palloc(Size size)
Definition: mcxt.c:950
int errmsg(const char *fmt,...)
Definition: elog.c:824
#define elog(elevel,...)
Definition: elog.h:214
int i
#define pg_write_barrier()
Definition: atomics.h:159
void SHMQueueDelete(SHM_QUEUE *queue)
Definition: shmqueue.c:68
struct Latch * MyLatch
Definition: globals.c:54
int syncrep_yyparse(void)
#define NUM_SYNC_REP_WAIT_MODE
Definition: syncrep.h:28
#define qsort(a, b, c, d)
Definition: port.h:475
int syncRepState
Definition: proc.h:188
CommandDest whereToSendOutput
Definition: postgres.c:91
XLogRecPtr apply
Definition: proc.h:112
static int SyncRepWaitMode
Definition: syncrep.c:98
#define WL_LATCH_SET
Definition: latch.h:124
void SyncRepReleaseWaiters(void)
Definition: syncrep.c:441
XLogRecPtr waitLSN
Definition: proc.h:187
#define offsetof(type, field)
Definition: c.h:668
bool am_cascading_walsender
Definition: walsender.c:116
#define SYNC_REP_WAIT_WRITE
Definition: syncrep.h:24
static int SyncRepGetStandbyPriority(void)
Definition: syncrep.c:832