PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
syncrep.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * syncrep.c
4  *
5  * Synchronous replication is new as of PostgreSQL 9.1.
6  *
7  * If requested, transaction commits wait until their commit LSNs are
8  * acknowledged by the synchronous standbys.
9  *
10  * This module contains the code for waiting and release of backends.
11  * All code in this module executes on the primary. The core streaming
12  * replication transport remains within WALreceiver/WALsender modules.
13  *
14  * The essence of this design is that it isolates all logic about
15  * waiting/releasing onto the primary. The primary defines which standbys
16  * it wishes to wait for. The standbys are completely unaware of the
17  * durability requirements of transactions on the primary, reducing the
18  * complexity of the code and streamlining both standby operations and
19  * network bandwidth because there is no requirement to ship
20  * per-transaction state information.
21  *
22  * Replication is either synchronous or not synchronous (async). If it is
23  * async, we just fastpath out of here. If it is sync, then we wait for
24  * the write, flush or apply location on the standby before releasing
25  * the waiting backend. Further complexity in that interaction is
26  * expected in later releases.
27  *
28  * The best performing way to manage the waiting backends is to have a
29  * single ordered queue of waiting backends, so that we can avoid
30  * searching through all the waiters each time we receive a reply.
31  *
32  * In 9.5 or before only a single standby could be considered as
33  * synchronous. In 9.6 we support a priority-based multiple synchronous
34  * standbys. In 10.0 a quorum-based multiple synchronous standbys is also
35  * supported. The number of synchronous standbys that transactions
36  * must wait for replies from is specified in synchronous_standby_names.
37  * This parameter also specifies a list of standby names and the method
38  * (FIRST and ANY) to choose synchronous standbys from the listed ones.
39  *
40  * The method FIRST specifies a priority-based synchronous replication
41  * and makes transaction commits wait until their WAL records are
42  * replicated to the requested number of synchronous standbys chosen based
43  * on their priorities. The standbys whose names appear earlier in the list
44  * are given higher priority and will be considered as synchronous.
45  * Other standby servers appearing later in this list represent potential
46  * synchronous standbys. If any of the current synchronous standbys
47  * disconnects for whatever reason, it will be replaced immediately with
48  * the next-highest-priority standby.
49  *
50  * The method ANY specifies a quorum-based synchronous replication
51  * and makes transaction commits wait until their WAL records are
52  * replicated to at least the requested number of synchronous standbys
53  * in the list. All the standbys appearing in the list are considered as
54  * candidates for quorum synchronous standbys.
55  *
56  * Before the standbys chosen from synchronous_standby_names can
57  * become the synchronous standbys they must have caught up with
58  * the primary; that may take some time. Once caught up,
59  * the standbys which are considered as synchronous at that moment
60  * will release waiters from the queue.
61  *
62  * Portions Copyright (c) 2010-2017, PostgreSQL Global Development Group
63  *
64  * IDENTIFICATION
65  * src/backend/replication/syncrep.c
66  *
67  *-------------------------------------------------------------------------
68  */
69 #include "postgres.h"
70 
71 #include <unistd.h>
72 
73 #include "access/xact.h"
74 #include "miscadmin.h"
75 #include "pgstat.h"
76 #include "replication/syncrep.h"
77 #include "replication/walsender.h"
79 #include "storage/pmsignal.h"
80 #include "storage/proc.h"
81 #include "tcop/tcopprot.h"
82 #include "utils/builtins.h"
83 #include "utils/ps_status.h"
84 
85 /* User-settable parameters for sync rep */
87 
88 #define SyncStandbysDefined() \
89  (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0')
90 
91 static bool announce_next_takeover = true;
92 
95 
96 static void SyncRepQueueInsert(int mode);
97 static void SyncRepCancelWait(void);
98 static int SyncRepWakeQueue(bool all, int mode);
99 
100 static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr,
101  XLogRecPtr *flushPtr,
102  XLogRecPtr *applyPtr,
103  bool *am_sync);
104 static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
105  XLogRecPtr *flushPtr,
106  XLogRecPtr *applyPtr,
107  List *sync_standbys);
108 static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr,
109  XLogRecPtr *flushPtr,
110  XLogRecPtr *applyPtr,
111  List *sync_standbys, uint8 nth);
112 static int SyncRepGetStandbyPriority(void);
113 static List *SyncRepGetSyncStandbysPriority(bool *am_sync);
114 static List *SyncRepGetSyncStandbysQuorum(bool *am_sync);
115 static int cmp_lsn(const void *a, const void *b);
116 
117 #ifdef USE_ASSERT_CHECKING
118 static bool SyncRepQueueIsOrderedByLSN(int mode);
119 #endif
120 
121 /*
122  * ===========================================================
123  * Synchronous Replication functions for normal user backends
124  * ===========================================================
125  */
126 
127 /*
128  * Wait for synchronous replication, if requested by user.
129  *
130  * Initially backends start in state SYNC_REP_NOT_WAITING and then
131  * change that state to SYNC_REP_WAITING before adding ourselves
132  * to the wait queue. During SyncRepWakeQueue() a WALSender changes
133  * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
134  * This backend then resets its state to SYNC_REP_NOT_WAITING.
135  *
136  * 'lsn' represents the LSN to wait for. 'commit' indicates whether this LSN
137  * represents a commit record. If it doesn't, then we wait only for the WAL
138  * to be flushed if synchronous_commit is set to the higher level of
139  * remote_apply, because only commit records provide apply feedback.
140  */
141 void
142 SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
143 {
144  char *new_status = NULL;
145  const char *old_status;
146  int mode;
147 
148  /* Cap the level for anything other than commit to remote flush only. */
149  if (commit)
150  mode = SyncRepWaitMode;
151  else
153 
154  /*
155  * Fast exit if user has not requested sync replication, or there are no
156  * sync replication standby names defined. Note that those standbys don't
157  * need to be connected.
158  */
160  return;
161 
163  Assert(WalSndCtl != NULL);
164 
165  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
167 
168  /*
169  * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
170  * set. See SyncRepUpdateSyncStandbysDefined.
171  *
172  * Also check that the standby hasn't already replied. Unlikely race
173  * condition but we'll be fetching that cache line anyway so it's likely
174  * to be a low cost check.
175  */
177  lsn <= WalSndCtl->lsn[mode])
178  {
179  LWLockRelease(SyncRepLock);
180  return;
181  }
182 
183  /*
184  * Set our waitLSN so WALSender will know when to wake us, and add
185  * ourselves to the queue.
186  */
187  MyProc->waitLSN = lsn;
189  SyncRepQueueInsert(mode);
190  Assert(SyncRepQueueIsOrderedByLSN(mode));
191  LWLockRelease(SyncRepLock);
192 
193  /* Alter ps display to show waiting for sync rep. */
195  {
196  int len;
197 
198  old_status = get_ps_display(&len);
199  new_status = (char *) palloc(len + 32 + 1);
200  memcpy(new_status, old_status, len);
201  sprintf(new_status + len, " waiting for %X/%X",
202  (uint32) (lsn >> 32), (uint32) lsn);
203  set_ps_display(new_status, false);
204  new_status[len] = '\0'; /* truncate off " waiting ..." */
205  }
206 
207  /*
208  * Wait for specified LSN to be confirmed.
209  *
210  * Each proc has its own wait latch, so we perform a normal latch
211  * check/wait loop here.
212  */
213  for (;;)
214  {
215  /* Must reset the latch before testing state. */
217 
218  /*
219  * Acquiring the lock is not needed, the latch ensures proper
220  * barriers. If it looks like we're done, we must really be done,
221  * because once walsender changes the state to SYNC_REP_WAIT_COMPLETE,
222  * it will never update it again, so we can't be seeing a stale value
223  * in that case.
224  */
226  break;
227 
228  /*
229  * If a wait for synchronous replication is pending, we can neither
230  * acknowledge the commit nor raise ERROR or FATAL. The latter would
231  * lead the client to believe that the transaction aborted, which is
232  * not true: it's already committed locally. The former is no good
233  * either: the client has requested synchronous replication, and is
234  * entitled to assume that an acknowledged commit is also replicated,
235  * which might not be true. So in this case we issue a WARNING (which
236  * some clients may be able to interpret) and shut off further output.
237  * We do NOT reset ProcDiePending, so that the process will die after
238  * the commit is cleaned up.
239  */
240  if (ProcDiePending)
241  {
243  (errcode(ERRCODE_ADMIN_SHUTDOWN),
244  errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
245  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
248  break;
249  }
250 
251  /*
252  * It's unclear what to do if a query cancel interrupt arrives. We
253  * can't actually abort at this point, but ignoring the interrupt
254  * altogether is not helpful, so we just terminate the wait with a
255  * suitable warning.
256  */
257  if (QueryCancelPending)
258  {
259  QueryCancelPending = false;
261  (errmsg("canceling wait for synchronous replication due to user request"),
262  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
264  break;
265  }
266 
267  /*
268  * If the postmaster dies, we'll probably never get an
269  * acknowledgement, because all the wal sender processes will exit. So
270  * just bail out.
271  */
272  if (!PostmasterIsAlive())
273  {
274  ProcDiePending = true;
277  break;
278  }
279 
280  /*
281  * Wait on latch. Any condition that should wake us up will set the
282  * latch, so no need for timeout.
283  */
286  }
287 
288  /*
289  * WalSender has checked our LSN and has removed us from queue. Clean up
290  * state and leave. It's OK to reset these shared memory fields without
291  * holding SyncRepLock, because any walsenders will ignore us anyway when
292  * we're not on the queue.
293  */
296  MyProc->waitLSN = 0;
297 
298  if (new_status)
299  {
300  /* Reset ps display */
301  set_ps_display(new_status, false);
302  pfree(new_status);
303  }
304 }
305 
306 /*
307  * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant.
308  *
309  * Usually we will go at tail of queue, though it's possible that we arrive
310  * here out of order, so start at tail and work back to insertion point.
311  */
312 static void
314 {
315  PGPROC *proc;
316 
317  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
318  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
319  &(WalSndCtl->SyncRepQueue[mode]),
320  offsetof(PGPROC, syncRepLinks));
321 
322  while (proc)
323  {
324  /*
325  * Stop at the queue element that we should insert after to ensure the
326  * queue is ordered by LSN.
327  */
328  if (proc->waitLSN < MyProc->waitLSN)
329  break;
330 
331  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
332  &(proc->syncRepLinks),
333  offsetof(PGPROC, syncRepLinks));
334  }
335 
336  if (proc)
338  else
340 }
341 
342 /*
343  * Acquire SyncRepLock and cancel any wait currently in progress.
344  */
345 static void
347 {
348  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
352  LWLockRelease(SyncRepLock);
353 }
354 
355 void
357 {
359  {
360  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
362  LWLockRelease(SyncRepLock);
363  }
364 }
365 
366 /*
367  * ===========================================================
368  * Synchronous Replication functions for wal sender processes
369  * ===========================================================
370  */
371 
372 /*
373  * Take any action required to initialise sync rep state from config
374  * data. Called at WALSender startup and after each SIGHUP.
375  */
376 void
378 {
379  int priority;
380 
381  /*
382  * Determine if we are a potential sync standby and remember the result
383  * for handling replies from standby.
384  */
385  priority = SyncRepGetStandbyPriority();
386  if (MyWalSnd->sync_standby_priority != priority)
387  {
388  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
389  MyWalSnd->sync_standby_priority = priority;
390  LWLockRelease(SyncRepLock);
391  ereport(DEBUG1,
392  (errmsg("standby \"%s\" now has synchronous standby priority %u",
393  application_name, priority)));
394  }
395 }
396 
397 /*
398  * Update the LSNs on each queue based upon our latest state. This
399  * implements a simple policy of first-valid-sync-standby-releases-waiter.
400  *
401  * Other policies are possible, which would change what we do here and
402  * perhaps also which information we store as well.
403  */
404 void
406 {
407  volatile WalSndCtlData *walsndctl = WalSndCtl;
408  XLogRecPtr writePtr;
409  XLogRecPtr flushPtr;
410  XLogRecPtr applyPtr;
411  bool got_recptr;
412  bool am_sync;
413  int numwrite = 0;
414  int numflush = 0;
415  int numapply = 0;
416 
417  /*
418  * If this WALSender is serving a standby that is not on the list of
419  * potential sync standbys then we have nothing to do. If we are still
420  * starting up, still running base backup or the current flush position is
421  * still invalid, then leave quickly also.
422  */
423  if (MyWalSnd->sync_standby_priority == 0 ||
426  {
427  announce_next_takeover = true;
428  return;
429  }
430 
431  /*
432  * We're a potential sync standby. Release waiters if there are enough
433  * sync standbys and we are considered as sync.
434  */
435  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
436 
437  /*
438  * Check whether we are a sync standby or not, and calculate the synced
439  * positions among all sync standbys.
440  */
441  got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync);
442 
443  /*
444  * If we are managing a sync standby, though we weren't prior to this,
445  * then announce we are now a sync standby.
446  */
447  if (announce_next_takeover && am_sync)
448  {
449  announce_next_takeover = false;
450 
451  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
452  ereport(LOG,
453  (errmsg("standby \"%s\" is now a synchronous standby with priority %u",
455  else
456  ereport(LOG,
457  (errmsg("standby \"%s\" is now a candidate for quorum synchronous standby",
458  application_name)));
459  }
460 
461  /*
462  * If the number of sync standbys is less than requested or we aren't
463  * managing a sync standby then just leave.
464  */
465  if (!got_recptr || !am_sync)
466  {
467  LWLockRelease(SyncRepLock);
468  announce_next_takeover = !am_sync;
469  return;
470  }
471 
472  /*
473  * Set the lsn first so that when we wake backends they will release up to
474  * this location.
475  */
476  if (walsndctl->lsn[SYNC_REP_WAIT_WRITE] < writePtr)
477  {
478  walsndctl->lsn[SYNC_REP_WAIT_WRITE] = writePtr;
479  numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE);
480  }
481  if (walsndctl->lsn[SYNC_REP_WAIT_FLUSH] < flushPtr)
482  {
483  walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = flushPtr;
484  numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH);
485  }
486  if (walsndctl->lsn[SYNC_REP_WAIT_APPLY] < applyPtr)
487  {
488  walsndctl->lsn[SYNC_REP_WAIT_APPLY] = applyPtr;
489  numapply = SyncRepWakeQueue(false, SYNC_REP_WAIT_APPLY);
490  }
491 
492  LWLockRelease(SyncRepLock);
493 
494  elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X",
495  numwrite, (uint32) (writePtr >> 32), (uint32) writePtr,
496  numflush, (uint32) (flushPtr >> 32), (uint32) flushPtr,
497  numapply, (uint32) (applyPtr >> 32), (uint32) applyPtr);
498 }
499 
500 /*
501  * Calculate the synced Write, Flush and Apply positions among sync standbys.
502  *
503  * Return false if the number of sync standbys is less than
504  * synchronous_standby_names specifies. Otherwise return true and
505  * store the positions into *writePtr, *flushPtr and *applyPtr.
506  *
507  * On return, *am_sync is set to true if this walsender is connecting to
508  * sync standby. Otherwise it's set to false.
509  */
510 static bool
512  XLogRecPtr *applyPtr, bool *am_sync)
513 {
514  List *sync_standbys;
515 
516  *writePtr = InvalidXLogRecPtr;
517  *flushPtr = InvalidXLogRecPtr;
518  *applyPtr = InvalidXLogRecPtr;
519  *am_sync = false;
520 
521  /* Get standbys that are considered as synchronous at this moment */
522  sync_standbys = SyncRepGetSyncStandbys(am_sync);
523 
524  /*
525  * Quick exit if we are not managing a sync standby or there are not
526  * enough synchronous standbys.
527  */
528  if (!(*am_sync) ||
529  SyncRepConfig == NULL ||
530  list_length(sync_standbys) < SyncRepConfig->num_sync)
531  {
532  list_free(sync_standbys);
533  return false;
534  }
535 
536  /*
537  * In a priority-based sync replication, the synced positions are the
538  * oldest ones among sync standbys. In a quorum-based, they are the Nth
539  * latest ones.
540  *
541  * SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest positions.
542  * But we use SyncRepGetOldestSyncRecPtr() for that calculation because
543  * it's a bit more efficient.
544  *
545  * XXX If the numbers of current and requested sync standbys are the same,
546  * we can use SyncRepGetOldestSyncRecPtr() to calculate the synced
547  * positions even in a quorum-based sync replication.
548  */
549  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
550  {
551  SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr,
552  sync_standbys);
553  }
554  else
555  {
556  SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr,
557  sync_standbys, SyncRepConfig->num_sync);
558  }
559 
560  list_free(sync_standbys);
561  return true;
562 }
563 
564 /*
565  * Calculate the oldest Write, Flush and Apply positions among sync standbys.
566  */
567 static void
569  XLogRecPtr *applyPtr, List *sync_standbys)
570 {
571  ListCell *cell;
572 
573  /*
574  * Scan through all sync standbys and calculate the oldest
575  * Write, Flush and Apply positions.
576  */
577  foreach (cell, sync_standbys)
578  {
579  WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
581  XLogRecPtr flush;
582  XLogRecPtr apply;
583 
584  SpinLockAcquire(&walsnd->mutex);
585  write = walsnd->write;
586  flush = walsnd->flush;
587  apply = walsnd->apply;
588  SpinLockRelease(&walsnd->mutex);
589 
590  if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write)
591  *writePtr = write;
592  if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush)
593  *flushPtr = flush;
594  if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply)
595  *applyPtr = apply;
596  }
597 }
598 
599 /*
600  * Calculate the Nth latest Write, Flush and Apply positions among sync
601  * standbys.
602  */
603 static void
605  XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
606 {
607  ListCell *cell;
608  XLogRecPtr *write_array;
609  XLogRecPtr *flush_array;
610  XLogRecPtr *apply_array;
611  int len;
612  int i = 0;
613 
614  len = list_length(sync_standbys);
615  write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
616  flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
617  apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
618 
619  foreach (cell, sync_standbys)
620  {
621  WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
622 
623  SpinLockAcquire(&walsnd->mutex);
624  write_array[i] = walsnd->write;
625  flush_array[i] = walsnd->flush;
626  apply_array[i] = walsnd->apply;
627  SpinLockRelease(&walsnd->mutex);
628 
629  i++;
630  }
631 
632  qsort(write_array, len, sizeof(XLogRecPtr), cmp_lsn);
633  qsort(flush_array, len, sizeof(XLogRecPtr), cmp_lsn);
634  qsort(apply_array, len, sizeof(XLogRecPtr), cmp_lsn);
635 
636  /* Get Nth latest Write, Flush, Apply positions */
637  *writePtr = write_array[nth - 1];
638  *flushPtr = flush_array[nth - 1];
639  *applyPtr = apply_array[nth - 1];
640 
641  pfree(write_array);
642  pfree(flush_array);
643  pfree(apply_array);
644 }
645 
646 /*
647  * Compare lsn in order to sort array in descending order.
648  */
649 static int
650 cmp_lsn(const void *a, const void *b)
651 {
652  XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
653  XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
654 
655  if (lsn1 > lsn2)
656  return -1;
657  else if (lsn1 == lsn2)
658  return 0;
659  else
660  return 1;
661 }
662 
663 /*
664  * Return the list of sync standbys, or NIL if no sync standby is connected.
665  *
666  * The caller must hold SyncRepLock.
667  *
668  * On return, *am_sync is set to true if this walsender is connecting to
669  * sync standby. Otherwise it's set to false.
670  */
671 List *
673 {
674  /* Set default result */
675  if (am_sync != NULL)
676  *am_sync = false;
677 
678  /* Quick exit if sync replication is not requested */
679  if (SyncRepConfig == NULL)
680  return NIL;
681 
682  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ?
685 }
686 
687 /*
688  * Return the list of all the candidates for quorum sync standbys,
689  * or NIL if no such standby is connected.
690  *
691  * The caller must hold SyncRepLock. This function must be called only in
692  * a quorum-based sync replication.
693  *
694  * On return, *am_sync is set to true if this walsender is connecting to
695  * sync standby. Otherwise it's set to false.
696  */
697 static List *
699 {
700  List *result = NIL;
701  int i;
702  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
703  * rearrangement */
704 
705  Assert(SyncRepConfig->syncrep_method == SYNC_REP_QUORUM);
706 
707  for (i = 0; i < max_wal_senders; i++)
708  {
709  walsnd = &WalSndCtl->walsnds[i];
710 
711  /* Must be active */
712  if (walsnd->pid == 0)
713  continue;
714 
715  /* Must be streaming */
716  if (walsnd->state != WALSNDSTATE_STREAMING)
717  continue;
718 
719  /* Must be synchronous */
720  if (walsnd->sync_standby_priority == 0)
721  continue;
722 
723  /* Must have a valid flush position */
724  if (XLogRecPtrIsInvalid(walsnd->flush))
725  continue;
726 
727  /*
728  * Consider this standby as a candidate for quorum sync standbys
729  * and append it to the result.
730  */
731  result = lappend_int(result, i);
732  if (am_sync != NULL && walsnd == MyWalSnd)
733  *am_sync = true;
734  }
735 
736  return result;
737 }
738 
739 /*
740  * Return the list of sync standbys chosen based on their priorities,
741  * or NIL if no sync standby is connected.
742  *
743  * If there are multiple standbys with the same priority,
744  * the first one found is selected preferentially.
745  *
746  * The caller must hold SyncRepLock. This function must be called only in
747  * a priority-based sync replication.
748  *
749  * On return, *am_sync is set to true if this walsender is connecting to
750  * sync standby. Otherwise it's set to false.
751  */
752 static List *
754 {
755  List *result = NIL;
756  List *pending = NIL;
757  int lowest_priority;
758  int next_highest_priority;
759  int this_priority;
760  int priority;
761  int i;
762  bool am_in_pending = false;
763  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
764  * rearrangement */
765 
766  Assert(SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY);
767 
768  lowest_priority = SyncRepConfig->nmembers;
769  next_highest_priority = lowest_priority + 1;
770 
771  /*
772  * Find the sync standbys which have the highest priority (i.e, 1). Also
773  * store all the other potential sync standbys into the pending list, in
774  * order to scan it later and find other sync standbys from it quickly.
775  */
776  for (i = 0; i < max_wal_senders; i++)
777  {
778  walsnd = &WalSndCtl->walsnds[i];
779 
780  /* Must be active */
781  if (walsnd->pid == 0)
782  continue;
783 
784  /* Must be streaming */
785  if (walsnd->state != WALSNDSTATE_STREAMING)
786  continue;
787 
788  /* Must be synchronous */
789  this_priority = walsnd->sync_standby_priority;
790  if (this_priority == 0)
791  continue;
792 
793  /* Must have a valid flush position */
794  if (XLogRecPtrIsInvalid(walsnd->flush))
795  continue;
796 
797  /*
798  * If the priority is equal to 1, consider this standby as sync and
799  * append it to the result. Otherwise append this standby to the
800  * pending list to check if it's actually sync or not later.
801  */
802  if (this_priority == 1)
803  {
804  result = lappend_int(result, i);
805  if (am_sync != NULL && walsnd == MyWalSnd)
806  *am_sync = true;
807  if (list_length(result) == SyncRepConfig->num_sync)
808  {
809  list_free(pending);
810  return result; /* Exit if got enough sync standbys */
811  }
812  }
813  else
814  {
815  pending = lappend_int(pending, i);
816  if (am_sync != NULL && walsnd == MyWalSnd)
817  am_in_pending = true;
818 
819  /*
820  * Track the highest priority among the standbys in the pending
821  * list, in order to use it as the starting priority for later
822  * scan of the list. This is useful to find quickly the sync
823  * standbys from the pending list later because we can skip
824  * unnecessary scans for the unused priorities.
825  */
826  if (this_priority < next_highest_priority)
827  next_highest_priority = this_priority;
828  }
829  }
830 
831  /*
832  * Consider all pending standbys as sync if the number of them plus
833  * already-found sync ones is lower than the configuration requests.
834  */
835  if (list_length(result) + list_length(pending) <= SyncRepConfig->num_sync)
836  {
837  bool needfree = (result != NIL && pending != NIL);
838 
839  /*
840  * Set *am_sync to true if this walsender is in the pending list
841  * because all pending standbys are considered as sync.
842  */
843  if (am_sync != NULL && !(*am_sync))
844  *am_sync = am_in_pending;
845 
846  result = list_concat(result, pending);
847  if (needfree)
848  pfree(pending);
849  return result;
850  }
851 
852  /*
853  * Find the sync standbys from the pending list.
854  */
855  priority = next_highest_priority;
856  while (priority <= lowest_priority)
857  {
858  ListCell *cell;
859  ListCell *prev = NULL;
860  ListCell *next;
861 
862  next_highest_priority = lowest_priority + 1;
863 
864  for (cell = list_head(pending); cell != NULL; cell = next)
865  {
866  i = lfirst_int(cell);
867  walsnd = &WalSndCtl->walsnds[i];
868 
869  next = lnext(cell);
870 
871  this_priority = walsnd->sync_standby_priority;
872  if (this_priority == priority)
873  {
874  result = lappend_int(result, i);
875  if (am_sync != NULL && walsnd == MyWalSnd)
876  *am_sync = true;
877 
878  /*
879  * We should always exit here after the scan of pending list
880  * starts because we know that the list has enough elements to
881  * reach SyncRepConfig->num_sync.
882  */
883  if (list_length(result) == SyncRepConfig->num_sync)
884  {
885  list_free(pending);
886  return result; /* Exit if got enough sync standbys */
887  }
888 
889  /*
890  * Remove the entry for this sync standby from the list to
891  * prevent us from looking at the same entry again.
892  */
893  pending = list_delete_cell(pending, cell, prev);
894 
895  continue;
896  }
897 
898  if (this_priority < next_highest_priority)
899  next_highest_priority = this_priority;
900 
901  prev = cell;
902  }
903 
904  priority = next_highest_priority;
905  }
906 
907  /* never reached, but keep compiler quiet */
908  Assert(false);
909  return result;
910 }
911 
912 /*
913  * Check if we are in the list of sync standbys, and if so, determine
914  * priority sequence. Return priority if set, or zero to indicate that
915  * we are not a potential sync standby.
916  *
917  * Compare the parameter SyncRepStandbyNames against the application_name
918  * for this WALSender, or allow any name if we find a wildcard "*".
919  */
920 static int
922 {
923  const char *standby_name;
924  int priority;
925  bool found = false;
926 
927  /*
928  * Since synchronous cascade replication is not allowed, we always set the
929  * priority of cascading walsender to zero.
930  */
932  return 0;
933 
934  if (!SyncStandbysDefined() || SyncRepConfig == NULL)
935  return 0;
936 
937  standby_name = SyncRepConfig->member_names;
938  for (priority = 1; priority <= SyncRepConfig->nmembers; priority++)
939  {
940  if (pg_strcasecmp(standby_name, application_name) == 0 ||
941  strcmp(standby_name, "*") == 0)
942  {
943  found = true;
944  break;
945  }
946  standby_name += strlen(standby_name) + 1;
947  }
948 
949  return (found ? priority : 0);
950 }
951 
952 /*
953  * Walk the specified queue from head. Set the state of any backends that
954  * need to be woken, remove them from the queue, and then wake them.
955  * Pass all = true to wake whole queue; otherwise, just wake up to
956  * the walsender's LSN.
957  *
958  * Must hold SyncRepLock.
959  */
960 static int
961 SyncRepWakeQueue(bool all, int mode)
962 {
963  volatile WalSndCtlData *walsndctl = WalSndCtl;
964  PGPROC *proc = NULL;
965  PGPROC *thisproc = NULL;
966  int numprocs = 0;
967 
968  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
969  Assert(SyncRepQueueIsOrderedByLSN(mode));
970 
971  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
972  &(WalSndCtl->SyncRepQueue[mode]),
973  offsetof(PGPROC, syncRepLinks));
974 
975  while (proc)
976  {
977  /*
978  * Assume the queue is ordered by LSN
979  */
980  if (!all && walsndctl->lsn[mode] < proc->waitLSN)
981  return numprocs;
982 
983  /*
984  * Move to next proc, so we can delete thisproc from the queue.
985  * thisproc is valid, proc may be NULL after this.
986  */
987  thisproc = proc;
988  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
989  &(proc->syncRepLinks),
990  offsetof(PGPROC, syncRepLinks));
991 
992  /*
993  * Set state to complete; see SyncRepWaitForLSN() for discussion of
994  * the various states.
995  */
997 
998  /*
999  * Remove thisproc from queue.
1000  */
1001  SHMQueueDelete(&(thisproc->syncRepLinks));
1002 
1003  /*
1004  * Wake only when we have set state and removed from queue.
1005  */
1006  SetLatch(&(thisproc->procLatch));
1007 
1008  numprocs++;
1009  }
1010 
1011  return numprocs;
1012 }
1013 
1014 /*
1015  * The checkpointer calls this as needed to update the shared
1016  * sync_standbys_defined flag, so that backends don't remain permanently wedged
1017  * if synchronous_standby_names is unset. It's safe to check the current value
1018  * without the lock, because it's only ever updated by one process. But we
1019  * must take the lock to change it.
1020  */
1021 void
1023 {
1024  bool sync_standbys_defined = SyncStandbysDefined();
1025 
1026  if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
1027  {
1028  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
1029 
1030  /*
1031  * If synchronous_standby_names has been reset to empty, it's futile
1032  * for backends to continue to waiting. Since the user no longer
1033  * wants synchronous replication, we'd better wake them up.
1034  */
1035  if (!sync_standbys_defined)
1036  {
1037  int i;
1038 
1039  for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
1040  SyncRepWakeQueue(true, i);
1041  }
1042 
1043  /*
1044  * Only allow people to join the queue when there are synchronous
1045  * standbys defined. Without this interlock, there's a race
1046  * condition: we might wake up all the current waiters; then, some
1047  * backend that hasn't yet reloaded its config might go to sleep on
1048  * the queue (and never wake up). This prevents that.
1049  */
1050  WalSndCtl->sync_standbys_defined = sync_standbys_defined;
1051 
1052  LWLockRelease(SyncRepLock);
1053  }
1054 }
1055 
1056 #ifdef USE_ASSERT_CHECKING
1057 static bool
1058 SyncRepQueueIsOrderedByLSN(int mode)
1059 {
1060  PGPROC *proc = NULL;
1061  XLogRecPtr lastLSN;
1062 
1063  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
1064 
1065  lastLSN = 0;
1066 
1067  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1068  &(WalSndCtl->SyncRepQueue[mode]),
1069  offsetof(PGPROC, syncRepLinks));
1070 
1071  while (proc)
1072  {
1073  /*
1074  * Check the queue is ordered by LSN and that multiple procs don't
1075  * have matching LSNs
1076  */
1077  if (proc->waitLSN <= lastLSN)
1078  return false;
1079 
1080  lastLSN = proc->waitLSN;
1081 
1082  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1083  &(proc->syncRepLinks),
1084  offsetof(PGPROC, syncRepLinks));
1085  }
1086 
1087  return true;
1088 }
1089 #endif
1090 
1091 /*
1092  * ===========================================================
1093  * Synchronous Replication functions executed by any process
1094  * ===========================================================
1095  */
1096 
1097 bool
1098 check_synchronous_standby_names(char **newval, void **extra, GucSource source)
1099 {
1100  if (*newval != NULL && (*newval)[0] != '\0')
1101  {
1102  int parse_rc;
1103  SyncRepConfigData *pconf;
1104 
1105  /* Reset communication variables to ensure a fresh start */
1108 
1109  /* Parse the synchronous_standby_names string */
1110  syncrep_scanner_init(*newval);
1111  parse_rc = syncrep_yyparse();
1113 
1114  if (parse_rc != 0 || syncrep_parse_result == NULL)
1115  {
1116  GUC_check_errcode(ERRCODE_SYNTAX_ERROR);
1119  else
1120  GUC_check_errdetail("synchronous_standby_names parser failed");
1121  return false;
1122  }
1123 
1124  if (syncrep_parse_result->num_sync <= 0)
1125  {
1126  GUC_check_errmsg("number of synchronous standbys (%d) must be greater than zero",
1128  return false;
1129  }
1130 
1131  /* GUC extra value must be malloc'd, not palloc'd */
1132  pconf = (SyncRepConfigData *)
1134  if (pconf == NULL)
1135  return false;
1137 
1138  *extra = (void *) pconf;
1139 
1140  /*
1141  * We need not explicitly clean up syncrep_parse_result. It, and any
1142  * other cruft generated during parsing, will be freed when the
1143  * current memory context is deleted. (This code is generally run in
1144  * a short-lived context used for config file processing, so that will
1145  * not be very long.)
1146  */
1147  }
1148  else
1149  *extra = NULL;
1150 
1151  return true;
1152 }
1153 
1154 void
1155 assign_synchronous_standby_names(const char *newval, void *extra)
1156 {
1157  SyncRepConfig = (SyncRepConfigData *) extra;
1158 }
1159 
1160 void
1162 {
1163  switch (newval)
1164  {
1167  break;
1170  break;
1173  break;
1174  default:
1176  break;
1177  }
1178 }
void syncrep_scanner_finish(void)
#define NIL
Definition: pg_list.h:69
void SyncRepUpdateSyncStandbysDefined(void)
Definition: syncrep.c:1022
XLogRecPtr write
void assign_synchronous_standby_names(const char *newval, void *extra)
Definition: syncrep.c:1155
Pointer SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:164
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
static List * SyncRepGetSyncStandbysPriority(bool *am_sync)
Definition: syncrep.c:753
static void SyncRepCancelWait(void)
Definition: syncrep.c:346
#define DEBUG1
Definition: elog.h:25
void syncrep_scanner_init(const char *query_string)
static int32 next
Definition: blutils.c:210
static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, bool *am_sync)
Definition: syncrep.c:511
#define DEBUG3
Definition: elog.h:23
#define SYNC_REP_PRIORITY
Definition: syncrep.h:36
bool update_process_title
Definition: ps_status.c:35
#define write(a, b, c)
Definition: win32.h:19
#define GUC_check_errdetail
Definition: guc.h:407
static void SyncRepQueueInsert(int mode)
Definition: syncrep.c:313
PGPROC * MyProc
Definition: proc.c:67
char * syncrep_parse_error_msg
#define SYNC_REP_WAITING
Definition: syncrep.h:32
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:142
uint8 syncrep_method
Definition: syncrep.h:51
#define Min(x, y)
Definition: c.h:802
unsigned char uint8
Definition: c.h:263
void set_ps_display(const char *activity, bool force)
Definition: ps_status.c:326
#define SYNC_REP_NOT_WAITING
Definition: syncrep.h:31
WalSndCtlData * WalSndCtl
Definition: walsender.c:100
static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
Definition: syncrep.c:604
int errcode(int sqlerrcode)
Definition: elog.c:575
#define GUC_check_errmsg
Definition: guc.h:403
List * list_concat(List *list1, List *list2)
Definition: list.c:321
WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER]
List * SyncRepGetSyncStandbys(bool *am_sync)
Definition: syncrep.c:672
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void ResetLatch(volatile Latch *latch)
Definition: latch.c:461
#define LOG
Definition: elog.h:26
SyncRepConfigData * SyncRepConfig
Definition: syncrep.c:93
volatile bool QueryCancelPending
Definition: globals.c:30
void GUC_check_errcode(int sqlerrcode)
Definition: guc.c:9726
static int SyncRepWakeQueue(bool all, int mode)
Definition: syncrep.c:961
slock_t mutex
Latch procLatch
Definition: proc.h:93
GucSource
Definition: guc.h:105
#define malloc(a)
Definition: header.h:45
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1714
#define SpinLockAcquire(lock)
Definition: spin.h:62
XLogRecPtr flush
int WaitLatch(volatile Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:300
void pfree(void *pointer)
Definition: mcxt.c:992
static int cmp_lsn(const void *a, const void *b)
Definition: syncrep.c:650
#define SYNC_REP_NO_WAIT
Definition: syncrep.h:23
#define SYNC_REP_WAIT_APPLY
Definition: syncrep.h:26
#define lfirst_int(lc)
Definition: pg_list.h:107
void SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem)
Definition: shmqueue.c:108
static bool announce_next_takeover
Definition: syncrep.c:91
Definition: dest.h:88
const char * get_ps_display(int *displen)
Definition: ps_status.c:405
WalSndState state
void assign_synchronous_commit(int newval, void *extra)
Definition: syncrep.c:1161
#define SyncStandbysDefined()
Definition: syncrep.c:88
bool PostmasterIsAlive(void)
Definition: pmsignal.c:272
int errdetail(const char *fmt,...)
Definition: elog.c:873
static ListCell * list_head(const List *l)
Definition: pg_list.h:77
unsigned int uint32
Definition: c.h:265
SHM_QUEUE SyncRepQueue[NUM_SYNC_REP_WAIT_MODE]
void SyncRepInitConfig(void)
Definition: syncrep.c:377
#define lnext(lc)
Definition: pg_list.h:105
#define ereport(elevel, rest)
Definition: elog.h:122
#define SYNC_REP_WAIT_FLUSH
Definition: syncrep.h:25
List * lappend_int(List *list, int datum)
Definition: list.c:146
#define SyncRepRequested()
Definition: syncrep.h:19
int max_wal_senders
Definition: walsender.c:112
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define WARNING
Definition: elog.h:40
#define SpinLockRelease(lock)
Definition: spin.h:64
List * list_delete_cell(List *list, ListCell *cell, ListCell *prev)
Definition: list.c:528
char * SyncRepStandbyNames
Definition: syncrep.c:86
#define WL_POSTMASTER_DEATH
Definition: latch.h:128
Pointer SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:145
WalSnd * MyWalSnd
Definition: walsender.c:103
static List * SyncRepGetSyncStandbysQuorum(bool *am_sync)
Definition: syncrep.c:698
#define SYNC_REP_WAIT_COMPLETE
Definition: syncrep.h:33
#define SYNC_REP_QUORUM
Definition: syncrep.h:37
void SetLatch(volatile Latch *latch)
Definition: latch.c:379
#define NULL
Definition: c.h:226
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:671
char member_names[FLEXIBLE_ARRAY_MEMBER]
Definition: syncrep.h:54
int sync_standby_priority
bool SHMQueueIsDetached(const SHM_QUEUE *queue)
Definition: shmqueue.c:47
SyncRepConfigData * syncrep_parse_result
static int list_length(const List *l)
Definition: pg_list.h:89
#define newval
bool check_synchronous_standby_names(char **newval, void **extra, GucSource source)
Definition: syncrep.c:1098
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1110
SHM_QUEUE syncRepLinks
Definition: proc.h:139
void SyncRepCleanupAtProcExit(void)
Definition: syncrep.c:356
XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE]
volatile bool ProcDiePending
Definition: globals.c:31
char * application_name
Definition: guc.c:471
void * palloc(Size size)
Definition: mcxt.c:891
int errmsg(const char *fmt,...)
Definition: elog.c:797
void list_free(List *list)
Definition: list.c:1133
int i
static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, List *sync_standbys)
Definition: syncrep.c:568
void SHMQueueDelete(SHM_QUEUE *queue)
Definition: shmqueue.c:68
struct Latch * MyLatch
Definition: globals.c:51
int syncrep_yyparse(void)
#define elog
Definition: elog.h:219
#define NUM_SYNC_REP_WAIT_MODE
Definition: syncrep.h:28
#define qsort(a, b, c, d)
Definition: port.h:440
int syncRepState
Definition: proc.h:138
CommandDest whereToSendOutput
Definition: postgres.c:86
XLogRecPtr apply
Definition: proc.h:84
Definition: pg_list.h:45
static int SyncRepWaitMode
Definition: syncrep.c:94
#define WL_LATCH_SET
Definition: latch.h:124
void SyncRepReleaseWaiters(void)
Definition: syncrep.c:405
XLogRecPtr waitLSN
Definition: proc.h:137
#define offsetof(type, field)
Definition: c.h:551
bool am_cascading_walsender
Definition: walsender.c:107
#define SYNC_REP_WAIT_WRITE
Definition: syncrep.h:24
static int SyncRepGetStandbyPriority(void)
Definition: syncrep.c:921