PostgreSQL Source Code  git master
syncrep.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * syncrep.c
4  *
5  * Synchronous replication is new as of PostgreSQL 9.1.
6  *
7  * If requested, transaction commits wait until their commit LSNs are
8  * acknowledged by the synchronous standbys.
9  *
10  * This module contains the code for waiting and release of backends.
11  * All code in this module executes on the primary. The core streaming
12  * replication transport remains within WALreceiver/WALsender modules.
13  *
14  * The essence of this design is that it isolates all logic about
15  * waiting/releasing onto the primary. The primary defines which standbys
16  * it wishes to wait for. The standbys are completely unaware of the
17  * durability requirements of transactions on the primary, reducing the
18  * complexity of the code and streamlining both standby operations and
19  * network bandwidth because there is no requirement to ship
20  * per-transaction state information.
21  *
22  * Replication is either synchronous or not synchronous (async). If it is
23  * async, we just fastpath out of here. If it is sync, then we wait for
24  * the write, flush or apply location on the standby before releasing
25  * the waiting backend. Further complexity in that interaction is
26  * expected in later releases.
27  *
28  * The best performing way to manage the waiting backends is to have a
29  * single ordered queue of waiting backends, so that we can avoid
30  * searching through all the waiters each time we receive a reply.
31  *
32  * In 9.5 or before only a single standby could be considered as
33  * synchronous. In 9.6 we support a priority-based multiple synchronous
34  * standbys. In 10.0 a quorum-based multiple synchronous standbys is also
35  * supported. The number of synchronous standbys that transactions
36  * must wait for replies from is specified in synchronous_standby_names.
37  * This parameter also specifies a list of standby names and the method
38  * (FIRST and ANY) to choose synchronous standbys from the listed ones.
39  *
40  * The method FIRST specifies a priority-based synchronous replication
41  * and makes transaction commits wait until their WAL records are
42  * replicated to the requested number of synchronous standbys chosen based
43  * on their priorities. The standbys whose names appear earlier in the list
44  * are given higher priority and will be considered as synchronous.
45  * Other standby servers appearing later in this list represent potential
46  * synchronous standbys. If any of the current synchronous standbys
47  * disconnects for whatever reason, it will be replaced immediately with
48  * the next-highest-priority standby.
49  *
50  * The method ANY specifies a quorum-based synchronous replication
51  * and makes transaction commits wait until their WAL records are
52  * replicated to at least the requested number of synchronous standbys
53  * in the list. All the standbys appearing in the list are considered as
54  * candidates for quorum synchronous standbys.
55  *
56  * If neither FIRST nor ANY is specified, FIRST is used as the method.
57  * This is for backward compatibility with 9.6 or before where only a
58  * priority-based sync replication was supported.
59  *
60  * Before the standbys chosen from synchronous_standby_names can
61  * become the synchronous standbys they must have caught up with
62  * the primary; that may take some time. Once caught up,
63  * the standbys which are considered as synchronous at that moment
64  * will release waiters from the queue.
65  *
66  * Portions Copyright (c) 2010-2018, PostgreSQL Global Development Group
67  *
68  * IDENTIFICATION
69  * src/backend/replication/syncrep.c
70  *
71  *-------------------------------------------------------------------------
72  */
73 #include "postgres.h"
74 
75 #include <unistd.h>
76 
77 #include "access/xact.h"
78 #include "miscadmin.h"
79 #include "pgstat.h"
80 #include "replication/syncrep.h"
81 #include "replication/walsender.h"
83 #include "storage/pmsignal.h"
84 #include "storage/proc.h"
85 #include "tcop/tcopprot.h"
86 #include "utils/builtins.h"
87 #include "utils/ps_status.h"
88 
89 /* User-settable parameters for sync rep */
91 
92 #define SyncStandbysDefined() \
93  (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0')
94 
95 static bool announce_next_takeover = true;
96 
99 
100 static void SyncRepQueueInsert(int mode);
101 static void SyncRepCancelWait(void);
102 static int SyncRepWakeQueue(bool all, int mode);
103 
104 static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr,
105  XLogRecPtr *flushPtr,
106  XLogRecPtr *applyPtr,
107  bool *am_sync);
108 static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
109  XLogRecPtr *flushPtr,
110  XLogRecPtr *applyPtr,
111  List *sync_standbys);
112 static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr,
113  XLogRecPtr *flushPtr,
114  XLogRecPtr *applyPtr,
115  List *sync_standbys, uint8 nth);
116 static int SyncRepGetStandbyPriority(void);
117 static List *SyncRepGetSyncStandbysPriority(bool *am_sync);
118 static List *SyncRepGetSyncStandbysQuorum(bool *am_sync);
119 static int cmp_lsn(const void *a, const void *b);
120 
121 #ifdef USE_ASSERT_CHECKING
122 static bool SyncRepQueueIsOrderedByLSN(int mode);
123 #endif
124 
125 /*
126  * ===========================================================
127  * Synchronous Replication functions for normal user backends
128  * ===========================================================
129  */
130 
131 /*
132  * Wait for synchronous replication, if requested by user.
133  *
134  * Initially backends start in state SYNC_REP_NOT_WAITING and then
135  * change that state to SYNC_REP_WAITING before adding ourselves
136  * to the wait queue. During SyncRepWakeQueue() a WALSender changes
137  * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
138  * This backend then resets its state to SYNC_REP_NOT_WAITING.
139  *
140  * 'lsn' represents the LSN to wait for. 'commit' indicates whether this LSN
141  * represents a commit record. If it doesn't, then we wait only for the WAL
142  * to be flushed if synchronous_commit is set to the higher level of
143  * remote_apply, because only commit records provide apply feedback.
144  */
145 void
146 SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
147 {
148  char *new_status = NULL;
149  const char *old_status;
150  int mode;
151 
152  /* Cap the level for anything other than commit to remote flush only. */
153  if (commit)
154  mode = SyncRepWaitMode;
155  else
157 
158  /*
159  * Fast exit if user has not requested sync replication.
160  */
161  if (!SyncRepRequested())
162  return;
163 
165  Assert(WalSndCtl != NULL);
166 
167  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
169 
170  /*
171  * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
172  * set. See SyncRepUpdateSyncStandbysDefined.
173  *
174  * Also check that the standby hasn't already replied. Unlikely race
175  * condition but we'll be fetching that cache line anyway so it's likely
176  * to be a low cost check.
177  */
179  lsn <= WalSndCtl->lsn[mode])
180  {
181  LWLockRelease(SyncRepLock);
182  return;
183  }
184 
185  /*
186  * Set our waitLSN so WALSender will know when to wake us, and add
187  * ourselves to the queue.
188  */
189  MyProc->waitLSN = lsn;
191  SyncRepQueueInsert(mode);
192  Assert(SyncRepQueueIsOrderedByLSN(mode));
193  LWLockRelease(SyncRepLock);
194 
195  /* Alter ps display to show waiting for sync rep. */
197  {
198  int len;
199 
200  old_status = get_ps_display(&len);
201  new_status = (char *) palloc(len + 32 + 1);
202  memcpy(new_status, old_status, len);
203  sprintf(new_status + len, " waiting for %X/%X",
204  (uint32) (lsn >> 32), (uint32) lsn);
205  set_ps_display(new_status, false);
206  new_status[len] = '\0'; /* truncate off " waiting ..." */
207  }
208 
209  /*
210  * Wait for specified LSN to be confirmed.
211  *
212  * Each proc has its own wait latch, so we perform a normal latch
213  * check/wait loop here.
214  */
215  for (;;)
216  {
217  /* Must reset the latch before testing state. */
219 
220  /*
221  * Acquiring the lock is not needed, the latch ensures proper
222  * barriers. If it looks like we're done, we must really be done,
223  * because once walsender changes the state to SYNC_REP_WAIT_COMPLETE,
224  * it will never update it again, so we can't be seeing a stale value
225  * in that case.
226  */
228  break;
229 
230  /*
231  * If a wait for synchronous replication is pending, we can neither
232  * acknowledge the commit nor raise ERROR or FATAL. The latter would
233  * lead the client to believe that the transaction aborted, which is
234  * not true: it's already committed locally. The former is no good
235  * either: the client has requested synchronous replication, and is
236  * entitled to assume that an acknowledged commit is also replicated,
237  * which might not be true. So in this case we issue a WARNING (which
238  * some clients may be able to interpret) and shut off further output.
239  * We do NOT reset ProcDiePending, so that the process will die after
240  * the commit is cleaned up.
241  */
242  if (ProcDiePending)
243  {
245  (errcode(ERRCODE_ADMIN_SHUTDOWN),
246  errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
247  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
250  break;
251  }
252 
253  /*
254  * It's unclear what to do if a query cancel interrupt arrives. We
255  * can't actually abort at this point, but ignoring the interrupt
256  * altogether is not helpful, so we just terminate the wait with a
257  * suitable warning.
258  */
259  if (QueryCancelPending)
260  {
261  QueryCancelPending = false;
263  (errmsg("canceling wait for synchronous replication due to user request"),
264  errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
266  break;
267  }
268 
269  /*
270  * If the postmaster dies, we'll probably never get an
271  * acknowledgement, because all the wal sender processes will exit. So
272  * just bail out.
273  */
274  if (!PostmasterIsAlive())
275  {
276  ProcDiePending = true;
279  break;
280  }
281 
282  /*
283  * Wait on latch. Any condition that should wake us up will set the
284  * latch, so no need for timeout.
285  */
288  }
289 
290  /*
291  * WalSender has checked our LSN and has removed us from queue. Clean up
292  * state and leave. It's OK to reset these shared memory fields without
293  * holding SyncRepLock, because any walsenders will ignore us anyway when
294  * we're not on the queue. We need a read barrier to make sure we see the
295  * changes to the queue link (this might be unnecessary without
296  * assertions, but better safe than sorry).
297  */
298  pg_read_barrier();
301  MyProc->waitLSN = 0;
302 
303  if (new_status)
304  {
305  /* Reset ps display */
306  set_ps_display(new_status, false);
307  pfree(new_status);
308  }
309 }
310 
311 /*
312  * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant.
313  *
314  * Usually we will go at tail of queue, though it's possible that we arrive
315  * here out of order, so start at tail and work back to insertion point.
316  */
317 static void
319 {
320  PGPROC *proc;
321 
322  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
323  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
324  &(WalSndCtl->SyncRepQueue[mode]),
325  offsetof(PGPROC, syncRepLinks));
326 
327  while (proc)
328  {
329  /*
330  * Stop at the queue element that we should after to ensure the queue
331  * is ordered by LSN.
332  */
333  if (proc->waitLSN < MyProc->waitLSN)
334  break;
335 
336  proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
337  &(proc->syncRepLinks),
338  offsetof(PGPROC, syncRepLinks));
339  }
340 
341  if (proc)
343  else
345 }
346 
347 /*
348  * Acquire SyncRepLock and cancel any wait currently in progress.
349  */
350 static void
352 {
353  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
357  LWLockRelease(SyncRepLock);
358 }
359 
360 void
362 {
364  {
365  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
367  LWLockRelease(SyncRepLock);
368  }
369 }
370 
371 /*
372  * ===========================================================
373  * Synchronous Replication functions for wal sender processes
374  * ===========================================================
375  */
376 
377 /*
378  * Take any action required to initialise sync rep state from config
379  * data. Called at WALSender startup and after each SIGHUP.
380  */
381 void
383 {
384  int priority;
385 
386  /*
387  * Determine if we are a potential sync standby and remember the result
388  * for handling replies from standby.
389  */
390  priority = SyncRepGetStandbyPriority();
391  if (MyWalSnd->sync_standby_priority != priority)
392  {
393  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
394  MyWalSnd->sync_standby_priority = priority;
395  LWLockRelease(SyncRepLock);
396  ereport(DEBUG1,
397  (errmsg("standby \"%s\" now has synchronous standby priority %u",
398  application_name, priority)));
399  }
400 }
401 
402 /*
403  * Update the LSNs on each queue based upon our latest state. This
404  * implements a simple policy of first-valid-sync-standby-releases-waiter.
405  *
406  * Other policies are possible, which would change what we do here and
407  * perhaps also which information we store as well.
408  */
409 void
411 {
412  volatile WalSndCtlData *walsndctl = WalSndCtl;
413  XLogRecPtr writePtr;
414  XLogRecPtr flushPtr;
415  XLogRecPtr applyPtr;
416  bool got_recptr;
417  bool am_sync;
418  int numwrite = 0;
419  int numflush = 0;
420  int numapply = 0;
421 
422  /*
423  * If this WALSender is serving a standby that is not on the list of
424  * potential sync standbys then we have nothing to do. If we are still
425  * starting up, still running base backup or the current flush position is
426  * still invalid, then leave quickly also.
427  */
428  if (MyWalSnd->sync_standby_priority == 0 ||
431  {
432  announce_next_takeover = true;
433  return;
434  }
435 
436  /*
437  * We're a potential sync standby. Release waiters if there are enough
438  * sync standbys and we are considered as sync.
439  */
440  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
441 
442  /*
443  * Check whether we are a sync standby or not, and calculate the synced
444  * positions among all sync standbys.
445  */
446  got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync);
447 
448  /*
449  * If we are managing a sync standby, though we weren't prior to this,
450  * then announce we are now a sync standby.
451  */
452  if (announce_next_takeover && am_sync)
453  {
454  announce_next_takeover = false;
455 
456  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
457  ereport(LOG,
458  (errmsg("standby \"%s\" is now a synchronous standby with priority %u",
460  else
461  ereport(LOG,
462  (errmsg("standby \"%s\" is now a candidate for quorum synchronous standby",
463  application_name)));
464  }
465 
466  /*
467  * If the number of sync standbys is less than requested or we aren't
468  * managing a sync standby then just leave.
469  */
470  if (!got_recptr || !am_sync)
471  {
472  LWLockRelease(SyncRepLock);
473  announce_next_takeover = !am_sync;
474  return;
475  }
476 
477  /*
478  * Set the lsn first so that when we wake backends they will release up to
479  * this location.
480  */
481  if (walsndctl->lsn[SYNC_REP_WAIT_WRITE] < writePtr)
482  {
483  walsndctl->lsn[SYNC_REP_WAIT_WRITE] = writePtr;
484  numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE);
485  }
486  if (walsndctl->lsn[SYNC_REP_WAIT_FLUSH] < flushPtr)
487  {
488  walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = flushPtr;
489  numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH);
490  }
491  if (walsndctl->lsn[SYNC_REP_WAIT_APPLY] < applyPtr)
492  {
493  walsndctl->lsn[SYNC_REP_WAIT_APPLY] = applyPtr;
494  numapply = SyncRepWakeQueue(false, SYNC_REP_WAIT_APPLY);
495  }
496 
497  LWLockRelease(SyncRepLock);
498 
499  elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X",
500  numwrite, (uint32) (writePtr >> 32), (uint32) writePtr,
501  numflush, (uint32) (flushPtr >> 32), (uint32) flushPtr,
502  numapply, (uint32) (applyPtr >> 32), (uint32) applyPtr);
503 }
504 
505 /*
506  * Calculate the synced Write, Flush and Apply positions among sync standbys.
507  *
508  * Return false if the number of sync standbys is less than
509  * synchronous_standby_names specifies. Otherwise return true and
510  * store the positions into *writePtr, *flushPtr and *applyPtr.
511  *
512  * On return, *am_sync is set to true if this walsender is connecting to
513  * sync standby. Otherwise it's set to false.
514  */
515 static bool
517  XLogRecPtr *applyPtr, bool *am_sync)
518 {
519  List *sync_standbys;
520 
521  *writePtr = InvalidXLogRecPtr;
522  *flushPtr = InvalidXLogRecPtr;
523  *applyPtr = InvalidXLogRecPtr;
524  *am_sync = false;
525 
526  /* Get standbys that are considered as synchronous at this moment */
527  sync_standbys = SyncRepGetSyncStandbys(am_sync);
528 
529  /*
530  * Quick exit if we are not managing a sync standby or there are not
531  * enough synchronous standbys.
532  */
533  if (!(*am_sync) ||
534  SyncRepConfig == NULL ||
535  list_length(sync_standbys) < SyncRepConfig->num_sync)
536  {
537  list_free(sync_standbys);
538  return false;
539  }
540 
541  /*
542  * In a priority-based sync replication, the synced positions are the
543  * oldest ones among sync standbys. In a quorum-based, they are the Nth
544  * latest ones.
545  *
546  * SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest
547  * positions. But we use SyncRepGetOldestSyncRecPtr() for that calculation
548  * because it's a bit more efficient.
549  *
550  * XXX If the numbers of current and requested sync standbys are the same,
551  * we can use SyncRepGetOldestSyncRecPtr() to calculate the synced
552  * positions even in a quorum-based sync replication.
553  */
554  if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
555  {
556  SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr,
557  sync_standbys);
558  }
559  else
560  {
561  SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr,
562  sync_standbys, SyncRepConfig->num_sync);
563  }
564 
565  list_free(sync_standbys);
566  return true;
567 }
568 
569 /*
570  * Calculate the oldest Write, Flush and Apply positions among sync standbys.
571  */
572 static void
574  XLogRecPtr *applyPtr, List *sync_standbys)
575 {
576  ListCell *cell;
577 
578  /*
579  * Scan through all sync standbys and calculate the oldest Write, Flush
580  * and Apply positions.
581  */
582  foreach(cell, sync_standbys)
583  {
584  WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
586  XLogRecPtr flush;
587  XLogRecPtr apply;
588 
589  SpinLockAcquire(&walsnd->mutex);
590  write = walsnd->write;
591  flush = walsnd->flush;
592  apply = walsnd->apply;
593  SpinLockRelease(&walsnd->mutex);
594 
595  if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write)
596  *writePtr = write;
597  if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush)
598  *flushPtr = flush;
599  if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply)
600  *applyPtr = apply;
601  }
602 }
603 
604 /*
605  * Calculate the Nth latest Write, Flush and Apply positions among sync
606  * standbys.
607  */
608 static void
610  XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
611 {
612  ListCell *cell;
613  XLogRecPtr *write_array;
614  XLogRecPtr *flush_array;
615  XLogRecPtr *apply_array;
616  int len;
617  int i = 0;
618 
619  len = list_length(sync_standbys);
620  write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
621  flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
622  apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
623 
624  foreach(cell, sync_standbys)
625  {
626  WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
627 
628  SpinLockAcquire(&walsnd->mutex);
629  write_array[i] = walsnd->write;
630  flush_array[i] = walsnd->flush;
631  apply_array[i] = walsnd->apply;
632  SpinLockRelease(&walsnd->mutex);
633 
634  i++;
635  }
636 
637  /* Sort each array in descending order */
638  qsort(write_array, len, sizeof(XLogRecPtr), cmp_lsn);
639  qsort(flush_array, len, sizeof(XLogRecPtr), cmp_lsn);
640  qsort(apply_array, len, sizeof(XLogRecPtr), cmp_lsn);
641 
642  /* Get Nth latest Write, Flush, Apply positions */
643  *writePtr = write_array[nth - 1];
644  *flushPtr = flush_array[nth - 1];
645  *applyPtr = apply_array[nth - 1];
646 
647  pfree(write_array);
648  pfree(flush_array);
649  pfree(apply_array);
650 }
651 
652 /*
653  * Compare lsn in order to sort array in descending order.
654  */
655 static int
656 cmp_lsn(const void *a, const void *b)
657 {
658  XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
659  XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
660 
661  if (lsn1 > lsn2)
662  return -1;
663  else if (lsn1 == lsn2)
664  return 0;
665  else
666  return 1;
667 }
668 
669 /*
670  * Return the list of sync standbys, or NIL if no sync standby is connected.
671  *
672  * The caller must hold SyncRepLock.
673  *
674  * On return, *am_sync is set to true if this walsender is connecting to
675  * sync standby. Otherwise it's set to false.
676  */
677 List *
679 {
680  /* Set default result */
681  if (am_sync != NULL)
682  *am_sync = false;
683 
684  /* Quick exit if sync replication is not requested */
685  if (SyncRepConfig == NULL)
686  return NIL;
687 
688  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ?
691 }
692 
693 /*
694  * Return the list of all the candidates for quorum sync standbys,
695  * or NIL if no such standby is connected.
696  *
697  * The caller must hold SyncRepLock. This function must be called only in
698  * a quorum-based sync replication.
699  *
700  * On return, *am_sync is set to true if this walsender is connecting to
701  * sync standby. Otherwise it's set to false.
702  */
703 static List *
705 {
706  List *result = NIL;
707  int i;
708  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
709  * rearrangement */
710 
711  Assert(SyncRepConfig->syncrep_method == SYNC_REP_QUORUM);
712 
713  for (i = 0; i < max_wal_senders; i++)
714  {
715  XLogRecPtr flush;
717  int pid;
718 
719  walsnd = &WalSndCtl->walsnds[i];
720 
721  SpinLockAcquire(&walsnd->mutex);
722  pid = walsnd->pid;
723  flush = walsnd->flush;
724  state = walsnd->state;
725  SpinLockRelease(&walsnd->mutex);
726 
727  /* Must be active */
728  if (pid == 0)
729  continue;
730 
731  /* Must be streaming */
732  if (state != WALSNDSTATE_STREAMING)
733  continue;
734 
735  /* Must be synchronous */
736  if (walsnd->sync_standby_priority == 0)
737  continue;
738 
739  /* Must have a valid flush position */
740  if (XLogRecPtrIsInvalid(flush))
741  continue;
742 
743  /*
744  * Consider this standby as a candidate for quorum sync standbys and
745  * append it to the result.
746  */
747  result = lappend_int(result, i);
748  if (am_sync != NULL && walsnd == MyWalSnd)
749  *am_sync = true;
750  }
751 
752  return result;
753 }
754 
755 /*
756  * Return the list of sync standbys chosen based on their priorities,
757  * or NIL if no sync standby is connected.
758  *
759  * If there are multiple standbys with the same priority,
760  * the first one found is selected preferentially.
761  *
762  * The caller must hold SyncRepLock. This function must be called only in
763  * a priority-based sync replication.
764  *
765  * On return, *am_sync is set to true if this walsender is connecting to
766  * sync standby. Otherwise it's set to false.
767  */
768 static List *
770 {
771  List *result = NIL;
772  List *pending = NIL;
773  int lowest_priority;
774  int next_highest_priority;
775  int this_priority;
776  int priority;
777  int i;
778  bool am_in_pending = false;
779  volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
780  * rearrangement */
781 
782  Assert(SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY);
783 
784  lowest_priority = SyncRepConfig->nmembers;
785  next_highest_priority = lowest_priority + 1;
786 
787  /*
788  * Find the sync standbys which have the highest priority (i.e, 1). Also
789  * store all the other potential sync standbys into the pending list, in
790  * order to scan it later and find other sync standbys from it quickly.
791  */
792  for (i = 0; i < max_wal_senders; i++)
793  {
794  XLogRecPtr flush;
796  int pid;
797 
798  walsnd = &WalSndCtl->walsnds[i];
799 
800  SpinLockAcquire(&walsnd->mutex);
801  pid = walsnd->pid;
802  flush = walsnd->flush;
803  state = walsnd->state;
804  SpinLockRelease(&walsnd->mutex);
805 
806  /* Must be active */
807  if (pid == 0)
808  continue;
809 
810  /* Must be streaming */
811  if (state != WALSNDSTATE_STREAMING)
812  continue;
813 
814  /* Must be synchronous */
815  this_priority = walsnd->sync_standby_priority;
816  if (this_priority == 0)
817  continue;
818 
819  /* Must have a valid flush position */
820  if (XLogRecPtrIsInvalid(flush))
821  continue;
822 
823  /*
824  * If the priority is equal to 1, consider this standby as sync and
825  * append it to the result. Otherwise append this standby to the
826  * pending list to check if it's actually sync or not later.
827  */
828  if (this_priority == 1)
829  {
830  result = lappend_int(result, i);
831  if (am_sync != NULL && walsnd == MyWalSnd)
832  *am_sync = true;
833  if (list_length(result) == SyncRepConfig->num_sync)
834  {
835  list_free(pending);
836  return result; /* Exit if got enough sync standbys */
837  }
838  }
839  else
840  {
841  pending = lappend_int(pending, i);
842  if (am_sync != NULL && walsnd == MyWalSnd)
843  am_in_pending = true;
844 
845  /*
846  * Track the highest priority among the standbys in the pending
847  * list, in order to use it as the starting priority for later
848  * scan of the list. This is useful to find quickly the sync
849  * standbys from the pending list later because we can skip
850  * unnecessary scans for the unused priorities.
851  */
852  if (this_priority < next_highest_priority)
853  next_highest_priority = this_priority;
854  }
855  }
856 
857  /*
858  * Consider all pending standbys as sync if the number of them plus
859  * already-found sync ones is lower than the configuration requests.
860  */
861  if (list_length(result) + list_length(pending) <= SyncRepConfig->num_sync)
862  {
863  bool needfree = (result != NIL && pending != NIL);
864 
865  /*
866  * Set *am_sync to true if this walsender is in the pending list
867  * because all pending standbys are considered as sync.
868  */
869  if (am_sync != NULL && !(*am_sync))
870  *am_sync = am_in_pending;
871 
872  result = list_concat(result, pending);
873  if (needfree)
874  pfree(pending);
875  return result;
876  }
877 
878  /*
879  * Find the sync standbys from the pending list.
880  */
881  priority = next_highest_priority;
882  while (priority <= lowest_priority)
883  {
884  ListCell *cell;
885  ListCell *prev = NULL;
886  ListCell *next;
887 
888  next_highest_priority = lowest_priority + 1;
889 
890  for (cell = list_head(pending); cell != NULL; cell = next)
891  {
892  i = lfirst_int(cell);
893  walsnd = &WalSndCtl->walsnds[i];
894 
895  next = lnext(cell);
896 
897  this_priority = walsnd->sync_standby_priority;
898  if (this_priority == priority)
899  {
900  result = lappend_int(result, i);
901  if (am_sync != NULL && walsnd == MyWalSnd)
902  *am_sync = true;
903 
904  /*
905  * We should always exit here after the scan of pending list
906  * starts because we know that the list has enough elements to
907  * reach SyncRepConfig->num_sync.
908  */
909  if (list_length(result) == SyncRepConfig->num_sync)
910  {
911  list_free(pending);
912  return result; /* Exit if got enough sync standbys */
913  }
914 
915  /*
916  * Remove the entry for this sync standby from the list to
917  * prevent us from looking at the same entry again.
918  */
919  pending = list_delete_cell(pending, cell, prev);
920 
921  continue;
922  }
923 
924  if (this_priority < next_highest_priority)
925  next_highest_priority = this_priority;
926 
927  prev = cell;
928  }
929 
930  priority = next_highest_priority;
931  }
932 
933  /* never reached, but keep compiler quiet */
934  Assert(false);
935  return result;
936 }
937 
938 /*
939  * Check if we are in the list of sync standbys, and if so, determine
940  * priority sequence. Return priority if set, or zero to indicate that
941  * we are not a potential sync standby.
942  *
943  * Compare the parameter SyncRepStandbyNames against the application_name
944  * for this WALSender, or allow any name if we find a wildcard "*".
945  */
946 static int
948 {
949  const char *standby_name;
950  int priority;
951  bool found = false;
952 
953  /*
954  * Since synchronous cascade replication is not allowed, we always set the
955  * priority of cascading walsender to zero.
956  */
958  return 0;
959 
960  if (!SyncStandbysDefined() || SyncRepConfig == NULL)
961  return 0;
962 
963  standby_name = SyncRepConfig->member_names;
964  for (priority = 1; priority <= SyncRepConfig->nmembers; priority++)
965  {
966  if (pg_strcasecmp(standby_name, application_name) == 0 ||
967  strcmp(standby_name, "*") == 0)
968  {
969  found = true;
970  break;
971  }
972  standby_name += strlen(standby_name) + 1;
973  }
974 
975  if (!found)
976  return 0;
977 
978  /*
979  * In quorum-based sync replication, all the standbys in the list have the
980  * same priority, one.
981  */
982  return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ? priority : 1;
983 }
984 
985 /*
986  * Walk the specified queue from head. Set the state of any backends that
987  * need to be woken, remove them from the queue, and then wake them.
988  * Pass all = true to wake whole queue; otherwise, just wake up to
989  * the walsender's LSN.
990  *
991  * Must hold SyncRepLock.
992  */
993 static int
994 SyncRepWakeQueue(bool all, int mode)
995 {
996  volatile WalSndCtlData *walsndctl = WalSndCtl;
997  PGPROC *proc = NULL;
998  PGPROC *thisproc = NULL;
999  int numprocs = 0;
1000 
1001  Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
1002  Assert(SyncRepQueueIsOrderedByLSN(mode));
1003 
1004  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1005  &(WalSndCtl->SyncRepQueue[mode]),
1006  offsetof(PGPROC, syncRepLinks));
1007 
1008  while (proc)
1009  {
1010  /*
1011  * Assume the queue is ordered by LSN
1012  */
1013  if (!all && walsndctl->lsn[mode] < proc->waitLSN)
1014  return numprocs;
1015 
1016  /*
1017  * Move to next proc, so we can delete thisproc from the queue.
1018  * thisproc is valid, proc may be NULL after this.
1019  */
1020  thisproc = proc;
1021  proc = (PGPROC *) SHMQueueNext(&(WalSndCtl->SyncRepQueue[mode]),
1022  &(proc->syncRepLinks),
1023  offsetof(PGPROC, syncRepLinks));
1024 
1025  /*
1026  * Remove thisproc from queue.
1027  */
1028  SHMQueueDelete(&(thisproc->syncRepLinks));
1029 
1030  /*
1031  * SyncRepWaitForLSN() reads syncRepState without holding the lock, so
1032  * make sure that it sees the queue link being removed before the
1033  * syncRepState change.
1034  */
1035  pg_write_barrier();
1036 
1037  /*
1038  * Set state to complete; see SyncRepWaitForLSN() for discussion of
1039  * the various states.
1040  */
1042 
1043  /*
1044  * Wake only when we have set state and removed from queue.
1045  */
1046  SetLatch(&(thisproc->procLatch));
1047 
1048  numprocs++;
1049  }
1050 
1051  return numprocs;
1052 }
1053 
1054 /*
1055  * The checkpointer calls this as needed to update the shared
1056  * sync_standbys_defined flag, so that backends don't remain permanently wedged
1057  * if synchronous_standby_names is unset. It's safe to check the current value
1058  * without the lock, because it's only ever updated by one process. But we
1059  * must take the lock to change it.
1060  */
1061 void
1063 {
1064  bool sync_standbys_defined = SyncStandbysDefined();
1065 
1066  if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
1067  {
1068  LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
1069 
1070  /*
1071  * If synchronous_standby_names has been reset to empty, it's futile
1072  * for backends to continue to waiting. Since the user no longer
1073  * wants synchronous replication, we'd better wake them up.
1074  */
1075  if (!sync_standbys_defined)
1076  {
1077  int i;
1078 
1079  for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
1080  SyncRepWakeQueue(true, i);
1081  }
1082 
1083  /*
1084  * Only allow people to join the queue when there are synchronous
1085  * standbys defined. Without this interlock, there's a race
1086  * condition: we might wake up all the current waiters; then, some
1087  * backend that hasn't yet reloaded its config might go to sleep on
1088  * the queue (and never wake up). This prevents that.
1089  */
1090  WalSndCtl->sync_standbys_defined = sync_standbys_defined;
1091 
1092  LWLockRelease(SyncRepLock);
1093  }
1094 }
1095 
#ifdef USE_ASSERT_CHECKING
/*
 * Assert-build sanity check: walk the wait queue for "mode" from its head
 * and report whether the waiters' LSNs are strictly increasing.
 *
 * Returns false as soon as a waiter's LSN is less than or equal to that of
 * its predecessor (so duplicate LSNs also fail), true otherwise; an empty
 * queue is trivially ordered.
 */
static bool
SyncRepQueueIsOrderedByLSN(int mode)
{
	SHM_QUEUE  *queue;
	PGPROC	   *waiter;
	XLogRecPtr	prevLSN = 0;

	Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);

	queue = &(WalSndCtl->SyncRepQueue[mode]);

	for (waiter = (PGPROC *) SHMQueueNext(queue, queue,
										  offsetof(PGPROC, syncRepLinks));
		 waiter != NULL;
		 waiter = (PGPROC *) SHMQueueNext(queue, &(waiter->syncRepLinks),
										  offsetof(PGPROC, syncRepLinks)))
	{
		/* Each waiter must be strictly ahead of the one before it. */
		if (waiter->waitLSN <= prevLSN)
			return false;

		prevLSN = waiter->waitLSN;
	}

	return true;
}
#endif
1130 
1131 /*
1132  * ===========================================================
1133  * Synchronous Replication functions executed by any process
1134  * ===========================================================
1135  */
1136 
1137 bool
1138 check_synchronous_standby_names(char **newval, void **extra, GucSource source)
1139 {
1140  if (*newval != NULL && (*newval)[0] != '\0')
1141  {
1142  int parse_rc;
1143  SyncRepConfigData *pconf;
1144 
1145  /* Reset communication variables to ensure a fresh start */
1146  syncrep_parse_result = NULL;
1147  syncrep_parse_error_msg = NULL;
1148 
1149  /* Parse the synchronous_standby_names string */
1150  syncrep_scanner_init(*newval);
1151  parse_rc = syncrep_yyparse();
1153 
1154  if (parse_rc != 0 || syncrep_parse_result == NULL)
1155  {
1156  GUC_check_errcode(ERRCODE_SYNTAX_ERROR);
1159  else
1160  GUC_check_errdetail("synchronous_standby_names parser failed");
1161  return false;
1162  }
1163 
1164  if (syncrep_parse_result->num_sync <= 0)
1165  {
1166  GUC_check_errmsg("number of synchronous standbys (%d) must be greater than zero",
1168  return false;
1169  }
1170 
1171  /* GUC extra value must be malloc'd, not palloc'd */
1172  pconf = (SyncRepConfigData *)
1174  if (pconf == NULL)
1175  return false;
1177 
1178  *extra = (void *) pconf;
1179 
1180  /*
1181  * We need not explicitly clean up syncrep_parse_result. It, and any
1182  * other cruft generated during parsing, will be freed when the
1183  * current memory context is deleted. (This code is generally run in
1184  * a short-lived context used for config file processing, so that will
1185  * not be very long.)
1186  */
1187  }
1188  else
1189  *extra = NULL;
1190 
1191  return true;
1192 }
1193 
1194 void
1195 assign_synchronous_standby_names(const char *newval, void *extra)
1196 {
1197  SyncRepConfig = (SyncRepConfigData *) extra;
1198 }
1199 
1200 void
1202 {
1203  switch (newval)
1204  {
1207  break;
1210  break;
1213  break;
1214  default:
1216  break;
1217  }
1218 }
void syncrep_scanner_finish(void)
#define NIL
Definition: pg_list.h:69
void SyncRepUpdateSyncStandbysDefined(void)
Definition: syncrep.c:1062
XLogRecPtr write
void assign_synchronous_standby_names(const char *newval, void *extra)
Definition: syncrep.c:1195
Pointer SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:164
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
static List * SyncRepGetSyncStandbysPriority(bool *am_sync)
Definition: syncrep.c:769
static void SyncRepCancelWait(void)
Definition: syncrep.c:351
#define DEBUG1
Definition: elog.h:25
void syncrep_scanner_init(const char *query_string)
static int32 next
Definition: blutils.c:211
static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, bool *am_sync)
Definition: syncrep.c:516
#define DEBUG3
Definition: elog.h:23
#define SYNC_REP_PRIORITY
Definition: syncrep.h:36
bool update_process_title
Definition: ps_status.c:35
#define write(a, b, c)
Definition: win32.h:14
#define GUC_check_errdetail
Definition: guc.h:410
static void SyncRepQueueInsert(int mode)
Definition: syncrep.c:318
PGPROC * MyProc
Definition: proc.c:67
char * syncrep_parse_error_msg
#define SYNC_REP_WAITING
Definition: syncrep.h:32
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:146
uint8 syncrep_method
Definition: syncrep.h:51
#define Min(x, y)
Definition: c.h:857
unsigned char uint8
Definition: c.h:323
void set_ps_display(const char *activity, bool force)
Definition: ps_status.c:326
#define SYNC_REP_NOT_WAITING
Definition: syncrep.h:31
WalSndCtlData * WalSndCtl
Definition: walsender.c:109
static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
Definition: syncrep.c:609
int errcode(int sqlerrcode)
Definition: elog.c:575
#define GUC_check_errmsg
Definition: guc.h:406
List * list_concat(List *list1, List *list2)
Definition: list.c:321
WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER]
List * SyncRepGetSyncStandbys(bool *am_sync)
Definition: syncrep.c:678
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void ResetLatch(volatile Latch *latch)
Definition: latch.c:497
#define LOG
Definition: elog.h:26
SyncRepConfigData * SyncRepConfig
Definition: syncrep.c:97
volatile bool QueryCancelPending
Definition: globals.c:31
void GUC_check_errcode(int sqlerrcode)
Definition: guc.c:10048
static int SyncRepWakeQueue(bool all, int mode)
Definition: syncrep.c:994
slock_t mutex
Latch procLatch
Definition: proc.h:104
GucSource
Definition: guc.h:105
#define malloc(a)
Definition: header.h:50
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1725
#define SpinLockAcquire(lock)
Definition: spin.h:62
XLogRecPtr flush
int WaitLatch(volatile Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:336
void pfree(void *pointer)
Definition: mcxt.c:1031
static int cmp_lsn(const void *a, const void *b)
Definition: syncrep.c:656
#define SYNC_REP_NO_WAIT
Definition: syncrep.h:23
#define SYNC_REP_WAIT_APPLY
Definition: syncrep.h:26
#define lfirst_int(lc)
Definition: pg_list.h:107
void SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem)
Definition: shmqueue.c:108
static bool announce_next_takeover
Definition: syncrep.c:95
Definition: dest.h:88
const char * get_ps_display(int *displen)
Definition: ps_status.c:405
WalSndState state
void assign_synchronous_commit(int newval, void *extra)
Definition: syncrep.c:1201
#define SyncStandbysDefined()
Definition: syncrep.c:92
bool PostmasterIsAlive(void)
Definition: pmsignal.c:272
int errdetail(const char *fmt,...)
Definition: elog.c:873
static ListCell * list_head(const List *l)
Definition: pg_list.h:77
unsigned int uint32
Definition: c.h:325
SHM_QUEUE SyncRepQueue[NUM_SYNC_REP_WAIT_MODE]
void SyncRepInitConfig(void)
Definition: syncrep.c:382
#define lnext(lc)
Definition: pg_list.h:105
#define ereport(elevel, rest)
Definition: elog.h:122
#define SYNC_REP_WAIT_FLUSH
Definition: syncrep.h:25
List * lappend_int(List *list, int datum)
Definition: list.c:146
#define SyncRepRequested()
Definition: syncrep.h:19
int max_wal_senders
Definition: walsender.c:121
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define WARNING
Definition: elog.h:40
#define SpinLockRelease(lock)
Definition: spin.h:64
List * list_delete_cell(List *list, ListCell *cell, ListCell *prev)
Definition: list.c:528
char * SyncRepStandbyNames
Definition: syncrep.c:90
#define WL_POSTMASTER_DEATH
Definition: latch.h:128
Pointer SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
Definition: shmqueue.c:145
WalSnd * MyWalSnd
Definition: walsender.c:112
static List * SyncRepGetSyncStandbysQuorum(bool *am_sync)
Definition: syncrep.c:704
#define SYNC_REP_WAIT_COMPLETE
Definition: syncrep.h:33
#define SYNC_REP_QUORUM
Definition: syncrep.h:37
void SetLatch(volatile Latch *latch)
Definition: latch.c:414
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:699
char member_names[FLEXIBLE_ARRAY_MEMBER]
Definition: syncrep.h:54
int sync_standby_priority
Definition: regguts.h:298
#define pg_read_barrier()
Definition: atomics.h:161
bool SHMQueueIsDetached(const SHM_QUEUE *queue)
Definition: shmqueue.c:47
SyncRepConfigData * syncrep_parse_result
static int list_length(const List *l)
Definition: pg_list.h:89
#define newval
bool check_synchronous_standby_names(char **newval, void **extra, GucSource source)
Definition: syncrep.c:1138
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1121
SHM_QUEUE syncRepLinks
Definition: proc.h:150
void SyncRepCleanupAtProcExit(void)
Definition: syncrep.c:361
WalSndState
XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE]
volatile bool ProcDiePending
Definition: globals.c:32
char * application_name
Definition: guc.c:472
void * palloc(Size size)
Definition: mcxt.c:924
int errmsg(const char *fmt,...)
Definition: elog.c:797
void list_free(List *list)
Definition: list.c:1133
int i
#define pg_write_barrier()
Definition: atomics.h:162
static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, List *sync_standbys)
Definition: syncrep.c:573
void SHMQueueDelete(SHM_QUEUE *queue)
Definition: shmqueue.c:68
struct Latch * MyLatch
Definition: globals.c:53
int syncrep_yyparse(void)
#define elog
Definition: elog.h:219
#define NUM_SYNC_REP_WAIT_MODE
Definition: syncrep.h:28
#define qsort(a, b, c, d)
Definition: port.h:421
int syncRepState
Definition: proc.h:149
CommandDest whereToSendOutput
Definition: postgres.c:90
XLogRecPtr apply
Definition: proc.h:95
Definition: pg_list.h:45
static int SyncRepWaitMode
Definition: syncrep.c:98
#define WL_LATCH_SET
Definition: latch.h:124
void SyncRepReleaseWaiters(void)
Definition: syncrep.c:410
XLogRecPtr waitLSN
Definition: proc.h:148
#define offsetof(type, field)
Definition: c.h:622
bool am_cascading_walsender
Definition: walsender.c:116
#define SYNC_REP_WAIT_WRITE
Definition: syncrep.h:24
static int SyncRepGetStandbyPriority(void)
Definition: syncrep.c:947