PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
syncrep.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * syncrep.c
4 *
5 * Synchronous replication is new as of PostgreSQL 9.1.
6 *
7 * If requested, transaction commits wait until their commit LSNs are
8 * acknowledged by the synchronous standbys.
9 *
10 * This module contains the code for waiting and release of backends.
11 * All code in this module executes on the primary. The core streaming
12 * replication transport remains within WALreceiver/WALsender modules.
13 *
14 * The essence of this design is that it isolates all logic about
15 * waiting/releasing onto the primary. The primary defines which standbys
16 * it wishes to wait for. The standbys are completely unaware of the
17 * durability requirements of transactions on the primary, reducing the
18 * complexity of the code and streamlining both standby operations and
19 * network bandwidth because there is no requirement to ship
20 * per-transaction state information.
21 *
22 * Replication is either synchronous or not synchronous (async). If it is
23 * async, we just fastpath out of here. If it is sync, then we wait for
24 * the write, flush or apply location on the standby before releasing
25 * the waiting backend. Further complexity in that interaction is
26 * expected in later releases.
27 *
28 * The best performing way to manage the waiting backends is to have a
29 * single ordered queue of waiting backends, so that we can avoid
30 * searching through all the waiters each time we receive a reply.
31 *
32 * In 9.5 or before only a single standby could be considered as
33 * synchronous. In 9.6 we support priority-based multiple synchronous
34 * standbys. In 10.0 quorum-based multiple synchronous standbys are also
35 * supported. The number of synchronous standbys that transactions
36 * must wait for replies from is specified in synchronous_standby_names.
37 * This parameter also specifies a list of standby names and the method
38 * (FIRST and ANY) to choose synchronous standbys from the listed ones.
39 *
40 * The method FIRST specifies a priority-based synchronous replication
41 * and makes transaction commits wait until their WAL records are
42 * replicated to the requested number of synchronous standbys chosen based
43 * on their priorities. The standbys whose names appear earlier in the list
44 * are given higher priority and will be considered as synchronous.
45 * Other standby servers appearing later in this list represent potential
46 * synchronous standbys. If any of the current synchronous standbys
47 * disconnects for whatever reason, it will be replaced immediately with
48 * the next-highest-priority standby.
49 *
50 * The method ANY specifies a quorum-based synchronous replication
51 * and makes transaction commits wait until their WAL records are
52 * replicated to at least the requested number of synchronous standbys
53 * in the list. All the standbys appearing in the list are considered as
54 * candidates for quorum synchronous standbys.
55 *
56 * If neither FIRST nor ANY is specified, FIRST is used as the method.
57 * This is for backward compatibility with 9.6 or before where only a
58 * priority-based sync replication was supported.
59 *
60 * Before the standbys chosen from synchronous_standby_names can
61 * become the synchronous standbys they must have caught up with
62 * the primary; that may take some time. Once caught up,
63 * the standbys which are considered as synchronous at that moment
64 * will release waiters from the queue.
65 *
66 * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
67 *
68 * IDENTIFICATION
69 * src/backend/replication/syncrep.c
70 *
71 *-------------------------------------------------------------------------
72 */
73#include "postgres.h"
74
75#include <unistd.h>
76
77#include "access/xact.h"
78#include "common/int.h"
79#include "miscadmin.h"
80#include "pgstat.h"
81#include "replication/syncrep.h"
84#include "storage/proc.h"
85#include "tcop/tcopprot.h"
86#include "utils/guc_hooks.h"
87#include "utils/ps_status.h"
88
89/* User-settable parameters for sync rep */
91
92#define SyncStandbysDefined() \
93 (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0')
94
95static bool announce_next_takeover = true;
96
99
100static void SyncRepQueueInsert(int mode);
101static void SyncRepCancelWait(void);
102static int SyncRepWakeQueue(bool all, int mode);
103
104static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr,
105 XLogRecPtr *flushPtr,
106 XLogRecPtr *applyPtr,
107 bool *am_sync);
108static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
109 XLogRecPtr *flushPtr,
110 XLogRecPtr *applyPtr,
111 SyncRepStandbyData *sync_standbys,
112 int num_standbys);
113static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr,
114 XLogRecPtr *flushPtr,
115 XLogRecPtr *applyPtr,
116 SyncRepStandbyData *sync_standbys,
117 int num_standbys,
118 uint8 nth);
119static int SyncRepGetStandbyPriority(void);
120static int standby_priority_comparator(const void *a, const void *b);
121static int cmp_lsn(const void *a, const void *b);
122
123#ifdef USE_ASSERT_CHECKING
124static bool SyncRepQueueIsOrderedByLSN(int mode);
125#endif
126
127/*
128 * ===========================================================
129 * Synchronous Replication functions for normal user backends
130 * ===========================================================
131 */
132
133/*
134 * Wait for synchronous replication, if requested by user.
135 *
136 * Initially backends start in state SYNC_REP_NOT_WAITING and then
137 * change that state to SYNC_REP_WAITING before adding ourselves
138 * to the wait queue. During SyncRepWakeQueue() a WALSender changes
139 * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
140 * This backend then resets its state to SYNC_REP_NOT_WAITING.
141 *
142 * 'lsn' represents the LSN to wait for. 'commit' indicates whether this LSN
143 * represents a commit record. If it doesn't, then we wait only for the WAL
144 * to be flushed if synchronous_commit is set to the higher level of
145 * remote_apply, because only commit records provide apply feedback.
146 */
147void
149{
150 int mode;
151
152 /*
153 * This should be called while holding interrupts during a transaction
154 * commit to prevent the follow-up shared memory queue cleanups to be
155 * influenced by external interruptions.
156 */
158
159 /*
160 * Fast exit if user has not requested sync replication, or there are no
161 * sync replication standby names defined.
162 *
163 * Since this routine gets called every commit time, it's important to
164 * exit quickly if sync replication is not requested. So we check
165 * WalSndCtl->sync_standbys_defined flag without the lock and exit
166 * immediately if it's false. If it's true, we need to check it again
167 * later while holding the lock, to check the flag and operate the sync
168 * rep queue atomically. This is necessary to avoid the race condition
169 * described in SyncRepUpdateSyncStandbysDefined(). On the other hand, if
170 * it's false, the lock is not necessary because we don't touch the queue.
171 */
172 if (!SyncRepRequested() ||
173 !((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined)
174 return;
175
176 /* Cap the level for anything other than commit to remote flush only. */
177 if (commit)
179 else
181
183 Assert(WalSndCtl != NULL);
184
185 LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
187
188 /*
189 * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
190 * set. See SyncRepUpdateSyncStandbysDefined.
191 *
192 * Also check that the standby hasn't already replied. Unlikely race
193 * condition but we'll be fetching that cache line anyway so it's likely
194 * to be a low cost check.
195 */
197 lsn <= WalSndCtl->lsn[mode])
198 {
199 LWLockRelease(SyncRepLock);
200 return;
201 }
202
203 /*
204 * Set our waitLSN so WALSender will know when to wake us, and add
205 * ourselves to the queue.
206 */
207 MyProc->waitLSN = lsn;
210 Assert(SyncRepQueueIsOrderedByLSN(mode));
211 LWLockRelease(SyncRepLock);
212
213 /* Alter ps display to show waiting for sync rep. */
215 {
216 char buffer[32];
217
218 sprintf(buffer, "waiting for %X/%X", LSN_FORMAT_ARGS(lsn));
219 set_ps_display_suffix(buffer);
220 }
221
222 /*
223 * Wait for specified LSN to be confirmed.
224 *
225 * Each proc has its own wait latch, so we perform a normal latch
226 * check/wait loop here.
227 */
228 for (;;)
229 {
230 int rc;
231
232 /* Must reset the latch before testing state. */
234
235 /*
236 * Acquiring the lock is not needed, the latch ensures proper
237 * barriers. If it looks like we're done, we must really be done,
238 * because once walsender changes the state to SYNC_REP_WAIT_COMPLETE,
239 * it will never update it again, so we can't be seeing a stale value
240 * in that case.
241 */
243 break;
244
245 /*
246 * If a wait for synchronous replication is pending, we can neither
247 * acknowledge the commit nor raise ERROR or FATAL. The latter would
248 * lead the client to believe that the transaction aborted, which is
249 * not true: it's already committed locally. The former is no good
250 * either: the client has requested synchronous replication, and is
251 * entitled to assume that an acknowledged commit is also replicated,
252 * which might not be true. So in this case we issue a WARNING (which
253 * some clients may be able to interpret) and shut off further output.
254 * We do NOT reset ProcDiePending, so that the process will die after
255 * the commit is cleaned up.
256 */
257 if (ProcDiePending)
258 {
260 (errcode(ERRCODE_ADMIN_SHUTDOWN),
261 errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
262 errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
265 break;
266 }
267
268 /*
269 * It's unclear what to do if a query cancel interrupt arrives. We
270 * can't actually abort at this point, but ignoring the interrupt
271 * altogether is not helpful, so we just terminate the wait with a
272 * suitable warning.
273 */
275 {
276 QueryCancelPending = false;
278 (errmsg("canceling wait for synchronous replication due to user request"),
279 errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
281 break;
282 }
283
284 /*
285 * Wait on latch. Any condition that should wake us up will set the
286 * latch, so no need for timeout.
287 */
289 WAIT_EVENT_SYNC_REP);
290
291 /*
292 * If the postmaster dies, we'll probably never get an acknowledgment,
293 * because all the wal sender processes will exit. So just bail out.
294 */
295 if (rc & WL_POSTMASTER_DEATH)
296 {
297 ProcDiePending = true;
300 break;
301 }
302 }
303
304 /*
305 * WalSender has checked our LSN and has removed us from queue. Clean up
306 * state and leave. It's OK to reset these shared memory fields without
307 * holding SyncRepLock, because any walsenders will ignore us anyway when
308 * we're not on the queue. We need a read barrier to make sure we see the
309 * changes to the queue link (this might be unnecessary without
310 * assertions, but better safe than sorry).
311 */
315 MyProc->waitLSN = 0;
316
317 /* reset ps display to remove the suffix */
320}
321
322/*
323 * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant.
324 *
325 * Usually we will go at tail of queue, though it's possible that we arrive
326 * here out of order, so start at tail and work back to insertion point.
327 */
328static void
330{
331 dlist_head *queue;
332 dlist_iter iter;
333
335 queue = &WalSndCtl->SyncRepQueue[mode];
336
337 dlist_reverse_foreach(iter, queue)
338 {
339 PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur);
340
341 /*
342 * Stop at the queue element that we should insert after to ensure the
343 * queue is ordered by LSN.
344 */
345 if (proc->waitLSN < MyProc->waitLSN)
346 {
348 return;
349 }
350 }
351
352 /*
353 * If we get here, the list was either empty, or this process needs to be
354 * at the head.
355 */
357}
358
359/*
360 * Acquire SyncRepLock and cancel any wait currently in progress.
361 */
362static void
364{
365 LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
369 LWLockRelease(SyncRepLock);
370}
371
372void
374{
375 /*
376 * First check if we are removed from the queue without the lock to not
377 * slow down backend exit.
378 */
380 {
381 LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
382
383 /* maybe we have just been removed, so recheck */
386
387 LWLockRelease(SyncRepLock);
388 }
389}
390
391/*
392 * ===========================================================
393 * Synchronous Replication functions for wal sender processes
394 * ===========================================================
395 */
396
397/*
398 * Take any action required to initialise sync rep state from config
399 * data. Called at WALSender startup and after each SIGHUP.
400 */
401void
403{
404 int priority;
405
406 /*
407 * Determine if we are a potential sync standby and remember the result
408 * for handling replies from standby.
409 */
410 priority = SyncRepGetStandbyPriority();
411 if (MyWalSnd->sync_standby_priority != priority)
412 {
416
418 (errmsg_internal("standby \"%s\" now has synchronous standby priority %d",
419 application_name, priority)));
420 }
421}
422
423/*
424 * Update the LSNs on each queue based upon our latest state. This
425 * implements a simple policy of first-valid-sync-standby-releases-waiter.
426 *
427 * Other policies are possible, which would change what we do here and
428 * perhaps also which information we store as well.
429 */
430void
432{
433 volatile WalSndCtlData *walsndctl = WalSndCtl;
434 XLogRecPtr writePtr;
435 XLogRecPtr flushPtr;
436 XLogRecPtr applyPtr;
437 bool got_recptr;
438 bool am_sync;
439 int numwrite = 0;
440 int numflush = 0;
441 int numapply = 0;
442
443 /*
444 * If this WALSender is serving a standby that is not on the list of
445 * potential sync standbys then we have nothing to do. If we are still
446 * starting up, still running base backup or the current flush position is
447 * still invalid, then leave quickly also. Streaming or stopping WAL
448 * senders are allowed to release waiters.
449 */
454 {
456 return;
457 }
458
459 /*
460 * We're a potential sync standby. Release waiters if there are enough
461 * sync standbys and we are considered as sync.
462 */
463 LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
464
465 /*
466 * Check whether we are a sync standby or not, and calculate the synced
467 * positions among all sync standbys. (Note: although this step does not
468 * of itself require holding SyncRepLock, it seems like a good idea to do
469 * it after acquiring the lock. This ensures that the WAL pointers we use
470 * to release waiters are newer than any previous execution of this
471 * routine used.)
472 */
473 got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync);
474
475 /*
476 * If we are managing a sync standby, though we weren't prior to this,
477 * then announce we are now a sync standby.
478 */
479 if (announce_next_takeover && am_sync)
480 {
482
484 ereport(LOG,
485 (errmsg("standby \"%s\" is now a synchronous standby with priority %d",
487 else
488 ereport(LOG,
489 (errmsg("standby \"%s\" is now a candidate for quorum synchronous standby",
491 }
492
493 /*
494 * If the number of sync standbys is less than requested or we aren't
495 * managing a sync standby then just leave.
496 */
497 if (!got_recptr || !am_sync)
498 {
499 LWLockRelease(SyncRepLock);
500 announce_next_takeover = !am_sync;
501 return;
502 }
503
504 /*
505 * Set the lsn first so that when we wake backends they will release up to
506 * this location.
507 */
508 if (walsndctl->lsn[SYNC_REP_WAIT_WRITE] < writePtr)
509 {
510 walsndctl->lsn[SYNC_REP_WAIT_WRITE] = writePtr;
511 numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE);
512 }
513 if (walsndctl->lsn[SYNC_REP_WAIT_FLUSH] < flushPtr)
514 {
515 walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = flushPtr;
516 numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH);
517 }
518 if (walsndctl->lsn[SYNC_REP_WAIT_APPLY] < applyPtr)
519 {
520 walsndctl->lsn[SYNC_REP_WAIT_APPLY] = applyPtr;
521 numapply = SyncRepWakeQueue(false, SYNC_REP_WAIT_APPLY);
522 }
523
524 LWLockRelease(SyncRepLock);
525
526 elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X",
527 numwrite, LSN_FORMAT_ARGS(writePtr),
528 numflush, LSN_FORMAT_ARGS(flushPtr),
529 numapply, LSN_FORMAT_ARGS(applyPtr));
530}
531
532/*
533 * Calculate the synced Write, Flush and Apply positions among sync standbys.
534 *
535 * Return false if the number of sync standbys is less than
536 * synchronous_standby_names specifies. Otherwise return true and
537 * store the positions into *writePtr, *flushPtr and *applyPtr.
538 *
539 * On return, *am_sync is set to true if this walsender is connected to a
540 * sync standby. Otherwise it's set to false.
541 */
542static bool
544 XLogRecPtr *applyPtr, bool *am_sync)
545{
546 SyncRepStandbyData *sync_standbys;
547 int num_standbys;
548 int i;
549
550 /* Initialize default results */
551 *writePtr = InvalidXLogRecPtr;
552 *flushPtr = InvalidXLogRecPtr;
553 *applyPtr = InvalidXLogRecPtr;
554 *am_sync = false;
555
556 /* Quick out if not even configured to be synchronous */
557 if (SyncRepConfig == NULL)
558 return false;
559
560 /* Get standbys that are considered as synchronous at this moment */
561 num_standbys = SyncRepGetCandidateStandbys(&sync_standbys);
562
563 /* Am I among the candidate sync standbys? */
564 for (i = 0; i < num_standbys; i++)
565 {
566 if (sync_standbys[i].is_me)
567 {
568 *am_sync = true;
569 break;
570 }
571 }
572
573 /*
574 * Nothing more to do if we are not managing a sync standby or there are
575 * not enough synchronous standbys.
576 */
577 if (!(*am_sync) ||
578 num_standbys < SyncRepConfig->num_sync)
579 {
580 pfree(sync_standbys);
581 return false;
582 }
583
584 /*
585 * In a priority-based sync replication, the synced positions are the
586 * oldest ones among sync standbys. In a quorum-based, they are the Nth
587 * latest ones.
588 *
589 * SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest
590 * positions. But we use SyncRepGetOldestSyncRecPtr() for that calculation
591 * because it's a bit more efficient.
592 *
593 * XXX If the numbers of current and requested sync standbys are the same,
594 * we can use SyncRepGetOldestSyncRecPtr() to calculate the synced
595 * positions even in a quorum-based sync replication.
596 */
598 {
599 SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr,
600 sync_standbys, num_standbys);
601 }
602 else
603 {
604 SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr,
605 sync_standbys, num_standbys,
607 }
608
609 pfree(sync_standbys);
610 return true;
611}
612
613/*
614 * Calculate the oldest Write, Flush and Apply positions among sync standbys.
615 */
616static void
618 XLogRecPtr *flushPtr,
619 XLogRecPtr *applyPtr,
620 SyncRepStandbyData *sync_standbys,
621 int num_standbys)
622{
623 int i;
624
625 /*
626 * Scan through all sync standbys and calculate the oldest Write, Flush
627 * and Apply positions. We assume *writePtr et al were initialized to
628 * InvalidXLogRecPtr.
629 */
630 for (i = 0; i < num_standbys; i++)
631 {
632 XLogRecPtr write = sync_standbys[i].write;
633 XLogRecPtr flush = sync_standbys[i].flush;
634 XLogRecPtr apply = sync_standbys[i].apply;
635
636 if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write)
637 *writePtr = write;
638 if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush)
639 *flushPtr = flush;
640 if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply)
641 *applyPtr = apply;
642 }
643}
644
645/*
646 * Calculate the Nth latest Write, Flush and Apply positions among sync
647 * standbys.
648 */
649static void
651 XLogRecPtr *flushPtr,
652 XLogRecPtr *applyPtr,
653 SyncRepStandbyData *sync_standbys,
654 int num_standbys,
655 uint8 nth)
656{
657 XLogRecPtr *write_array;
658 XLogRecPtr *flush_array;
659 XLogRecPtr *apply_array;
660 int i;
661
662 /* Should have enough candidates, or somebody messed up */
663 Assert(nth > 0 && nth <= num_standbys);
664
665 write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
666 flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
667 apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys);
668
669 for (i = 0; i < num_standbys; i++)
670 {
671 write_array[i] = sync_standbys[i].write;
672 flush_array[i] = sync_standbys[i].flush;
673 apply_array[i] = sync_standbys[i].apply;
674 }
675
676 /* Sort each array in descending order */
677 qsort(write_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
678 qsort(flush_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
679 qsort(apply_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn);
680
681 /* Get Nth latest Write, Flush, Apply positions */
682 *writePtr = write_array[nth - 1];
683 *flushPtr = flush_array[nth - 1];
684 *applyPtr = apply_array[nth - 1];
685
686 pfree(write_array);
687 pfree(flush_array);
688 pfree(apply_array);
689}
690
691/*
692 * Compare lsn in order to sort array in descending order.
693 */
694static int
695cmp_lsn(const void *a, const void *b)
696{
697 XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
698 XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
699
700 return pg_cmp_u64(lsn2, lsn1);
701}
702
703/*
704 * Return data about walsenders that are candidates to be sync standbys.
705 *
706 * *standbys is set to a palloc'd array of structs of per-walsender data,
707 * and the number of valid entries (candidate sync senders) is returned.
708 * (This might be more or fewer than num_sync; caller must check.)
709 */
710int
712{
713 int i;
714 int n;
715
716 /* Create result array */
717 *standbys = (SyncRepStandbyData *)
719
720 /* Quick exit if sync replication is not requested */
721 if (SyncRepConfig == NULL)
722 return 0;
723
724 /* Collect raw data from shared memory */
725 n = 0;
726 for (i = 0; i < max_wal_senders; i++)
727 {
728 volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
729 * rearrangement */
730 SyncRepStandbyData *stby;
731 WalSndState state; /* not included in SyncRepStandbyData */
732
733 walsnd = &WalSndCtl->walsnds[i];
734 stby = *standbys + n;
735
736 SpinLockAcquire(&walsnd->mutex);
737 stby->pid = walsnd->pid;
738 state = walsnd->state;
739 stby->write = walsnd->write;
740 stby->flush = walsnd->flush;
741 stby->apply = walsnd->apply;
743 SpinLockRelease(&walsnd->mutex);
744
745 /* Must be active */
746 if (stby->pid == 0)
747 continue;
748
749 /* Must be streaming or stopping */
752 continue;
753
754 /* Must be synchronous */
755 if (stby->sync_standby_priority == 0)
756 continue;
757
758 /* Must have a valid flush position */
759 if (XLogRecPtrIsInvalid(stby->flush))
760 continue;
761
762 /* OK, it's a candidate */
763 stby->walsnd_index = i;
764 stby->is_me = (walsnd == MyWalSnd);
765 n++;
766 }
767
768 /*
769 * In quorum mode, we return all the candidates. In priority mode, if we
770 * have too many candidates then return only the num_sync ones of highest
771 * priority.
772 */
775 {
776 /* Sort by priority ... */
777 qsort(*standbys, n, sizeof(SyncRepStandbyData),
779 /* ... then report just the first num_sync ones */
781 }
782
783 return n;
784}
785
786/*
787 * qsort comparator to sort SyncRepStandbyData entries by priority
788 */
789static int
790standby_priority_comparator(const void *a, const void *b)
791{
792 const SyncRepStandbyData *sa = (const SyncRepStandbyData *) a;
793 const SyncRepStandbyData *sb = (const SyncRepStandbyData *) b;
794
795 /* First, sort by increasing priority value */
796 if (sa->sync_standby_priority != sb->sync_standby_priority)
797 return sa->sync_standby_priority - sb->sync_standby_priority;
798
799 /*
800 * We might have equal priority values; arbitrarily break ties by position
801 * in the WalSnd array. (This is utterly bogus, since that is arrival
802 * order dependent, but there are regression tests that rely on it.)
803 */
804 return sa->walsnd_index - sb->walsnd_index;
805}
806
807
808/*
809 * Check if we are in the list of sync standbys, and if so, determine
810 * priority sequence. Return priority if set, or zero to indicate that
811 * we are not a potential sync standby.
812 *
813 * Compare the parameter SyncRepStandbyNames against the application_name
814 * for this WALSender, or allow any name if we find a wildcard "*".
815 */
816static int
818{
819 const char *standby_name;
820 int priority;
821 bool found = false;
822
823 /*
824 * Since synchronous cascade replication is not allowed, we always set the
825 * priority of cascading walsender to zero.
826 */
828 return 0;
829
830 if (!SyncStandbysDefined() || SyncRepConfig == NULL)
831 return 0;
832
833 standby_name = SyncRepConfig->member_names;
834 for (priority = 1; priority <= SyncRepConfig->nmembers; priority++)
835 {
836 if (pg_strcasecmp(standby_name, application_name) == 0 ||
837 strcmp(standby_name, "*") == 0)
838 {
839 found = true;
840 break;
841 }
842 standby_name += strlen(standby_name) + 1;
843 }
844
845 if (!found)
846 return 0;
847
848 /*
849 * In quorum-based sync replication, all the standbys in the list have the
850 * same priority, one.
851 */
852 return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ? priority : 1;
853}
854
855/*
856 * Walk the specified queue from head. Set the state of any backends that
857 * need to be woken, remove them from the queue, and then wake them.
858 * Pass all = true to wake whole queue; otherwise, just wake up to
859 * the walsender's LSN.
860 *
861 * The caller must hold SyncRepLock in exclusive mode.
862 */
863static int
864SyncRepWakeQueue(bool all, int mode)
865{
866 volatile WalSndCtlData *walsndctl = WalSndCtl;
867 int numprocs = 0;
869
872 Assert(SyncRepQueueIsOrderedByLSN(mode));
873
875 {
876 PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur);
877
878 /*
879 * Assume the queue is ordered by LSN
880 */
881 if (!all && walsndctl->lsn[mode] < proc->waitLSN)
882 return numprocs;
883
884 /*
885 * Remove from queue.
886 */
888
889 /*
890 * SyncRepWaitForLSN() reads syncRepState without holding the lock, so
891 * make sure that it sees the queue link being removed before the
892 * syncRepState change.
893 */
895
896 /*
897 * Set state to complete; see SyncRepWaitForLSN() for discussion of
898 * the various states.
899 */
901
902 /*
903 * Wake only when we have set state and removed from queue.
904 */
905 SetLatch(&(proc->procLatch));
906
907 numprocs++;
908 }
909
910 return numprocs;
911}
912
913/*
914 * The checkpointer calls this as needed to update the shared
915 * sync_standbys_defined flag, so that backends don't remain permanently wedged
916 * if synchronous_standby_names is unset. It's safe to check the current value
917 * without the lock, because it's only ever updated by one process. But we
918 * must take the lock to change it.
919 */
920void
922{
923 bool sync_standbys_defined = SyncStandbysDefined();
924
925 if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
926 {
927 LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
928
929 /*
930 * If synchronous_standby_names has been reset to empty, it's futile
931 * for backends to continue waiting. Since the user no longer wants
932 * synchronous replication, we'd better wake them up.
933 */
934 if (!sync_standbys_defined)
935 {
936 int i;
937
938 for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
939 SyncRepWakeQueue(true, i);
940 }
941
942 /*
943 * Only allow people to join the queue when there are synchronous
944 * standbys defined. Without this interlock, there's a race
945 * condition: we might wake up all the current waiters; then, some
946 * backend that hasn't yet reloaded its config might go to sleep on
947 * the queue (and never wake up). This prevents that.
948 */
949 WalSndCtl->sync_standbys_defined = sync_standbys_defined;
950
951 LWLockRelease(SyncRepLock);
952 }
953}
954
955#ifdef USE_ASSERT_CHECKING
956static bool
957SyncRepQueueIsOrderedByLSN(int mode)
958{
959 XLogRecPtr lastLSN;
960 dlist_iter iter;
961
963
964 lastLSN = 0;
965
967 {
968 PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur);
969
970 /*
971 * Check the queue is ordered by LSN and that multiple procs don't
972 * have matching LSNs
973 */
974 if (proc->waitLSN <= lastLSN)
975 return false;
976
977 lastLSN = proc->waitLSN;
978 }
979
980 return true;
981}
982#endif
983
984/*
985 * ===========================================================
986 * Synchronous Replication functions executed by any process
987 * ===========================================================
988 */
989
990bool
992{
993 if (*newval != NULL && (*newval)[0] != '\0')
994 {
995 yyscan_t scanner;
996 int parse_rc;
997 SyncRepConfigData *pconf;
998
999 /* Reset communication variables to ensure a fresh start */
1000 syncrep_parse_result = NULL;
1002
1003 /* Parse the synchronous_standby_names string */
1004 syncrep_scanner_init(*newval, &scanner);
1005 parse_rc = syncrep_yyparse(scanner);
1006 syncrep_scanner_finish(scanner);
1007
1008 if (parse_rc != 0 || syncrep_parse_result == NULL)
1009 {
1010 GUC_check_errcode(ERRCODE_SYNTAX_ERROR);
1013 else
1014 GUC_check_errdetail("\"%s\" parser failed.",
1015 "synchronous_standby_names");
1016 return false;
1017 }
1018
1020 {
1021 GUC_check_errmsg("number of synchronous standbys (%d) must be greater than zero",
1023 return false;
1024 }
1025
1026 /* GUC extra value must be guc_malloc'd, not palloc'd */
1027 pconf = (SyncRepConfigData *)
1029 if (pconf == NULL)
1030 return false;
1032
1033 *extra = pconf;
1034
1035 /*
1036 * We need not explicitly clean up syncrep_parse_result. It, and any
1037 * other cruft generated during parsing, will be freed when the
1038 * current memory context is deleted. (This code is generally run in
1039 * a short-lived context used for config file processing, so that will
1040 * not be very long.)
1041 */
1042 }
1043 else
1044 *extra = NULL;
1045
1046 return true;
1047}
1048
1049void
1051{
1052 SyncRepConfig = (SyncRepConfigData *) extra;
1053}
1054
1055void
1057{
1058 switch (newval)
1059 {
1062 break;
1065 break;
1068 break;
1069 default:
1071 break;
1072 }
1073}
#define pg_read_barrier()
Definition: atomics.h:156
#define pg_write_barrier()
Definition: atomics.h:157
#define Min(x, y)
Definition: c.h:958
uint8_t uint8
Definition: c.h:483
#define Assert(condition)
Definition: c.h:812
void * yyscan_t
Definition: cubedata.h:67
@ DestNone
Definition: dest.h:87
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define DEBUG3
Definition: elog.h:28
#define WARNING
Definition: elog.h:36
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
volatile uint32 InterruptHoldoffCount
Definition: globals.c:42
volatile sig_atomic_t QueryCancelPending
Definition: globals.c:32
struct Latch * MyLatch
Definition: globals.c:62
volatile sig_atomic_t ProcDiePending
Definition: globals.c:33
void GUC_check_errcode(int sqlerrcode)
Definition: guc.c:6785
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:638
#define newval
#define GUC_check_errmsg
Definition: guc.h:472
#define GUC_check_errdetail
Definition: guc.h:476
GucSource
Definition: guc.h:108
char * application_name
Definition: guc_tables.c:543
static void dlist_insert_after(dlist_node *after, dlist_node *node)
Definition: ilist.h:381
#define dlist_foreach(iter, lhead)
Definition: ilist.h:623
static void dlist_delete_thoroughly(dlist_node *node)
Definition: ilist.h:416
static bool dlist_node_is_detached(const dlist_node *node)
Definition: ilist.h:525
#define dlist_reverse_foreach(iter, lhead)
Definition: ilist.h:654
static void dlist_push_head(dlist_head *head, dlist_node *node)
Definition: ilist.h:347
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:640
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
static int pg_cmp_u64(uint64 a, uint64 b)
Definition: int.h:664
#define write(a, b, c)
Definition: win32.h:14
int b
Definition: isn.c:69
int a
Definition: isn.c:68
int i
Definition: isn.c:72
void SetLatch(Latch *latch)
Definition: latch.c:632
void ResetLatch(Latch *latch)
Definition: latch.c:724
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:517
#define WL_LATCH_SET
Definition: latch.h:127
#define WL_POSTMASTER_DEATH
Definition: latch.h:131
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1937
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
static PgChecksumMode mode
Definition: pg_checksums.c:55
static rewind_source * source
Definition: pg_rewind.c:89
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define sprintf
Definition: port.h:240
#define qsort(a, b, c, d)
Definition: port.h:447
CommandDest whereToSendOutput
Definition: postgres.c:90
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:423
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:371
bool update_process_title
Definition: ps_status.c:31
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59
PGPROC * MyProc
Definition: proc.c:66
Definition: proc.h:162
XLogRecPtr waitLSN
Definition: proc.h:252
dlist_node syncRepLinks
Definition: proc.h:254
int syncRepState
Definition: proc.h:253
Latch procLatch
Definition: proc.h:169
uint8 syncrep_method
Definition: syncrep.h:68
char member_names[FLEXIBLE_ARRAY_MEMBER]
Definition: syncrep.h:71
int sync_standby_priority
Definition: syncrep.h:49
XLogRecPtr apply
Definition: syncrep.h:48
XLogRecPtr write
Definition: syncrep.h:46
XLogRecPtr flush
Definition: syncrep.h:47
XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE]
WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER]
dlist_head SyncRepQueue[NUM_SYNC_REP_WAIT_MODE]
slock_t mutex
XLogRecPtr flush
WalSndState state
XLogRecPtr write
int sync_standby_priority
XLogRecPtr apply
dlist_node * cur
Definition: ilist.h:179
dlist_node * cur
Definition: ilist.h:200
Definition: regguts.h:323
static int SyncRepWaitMode
Definition: syncrep.c:98
void SyncRepInitConfig(void)
Definition: syncrep.c:402
void SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
Definition: syncrep.c:148
static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, bool *am_sync)
Definition: syncrep.c:543
static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, SyncRepStandbyData *sync_standbys, int num_standbys, uint8 nth)
Definition: syncrep.c:650
void assign_synchronous_commit(int newval, void *extra)
Definition: syncrep.c:1056
void assign_synchronous_standby_names(const char *newval, void *extra)
Definition: syncrep.c:1050
static int standby_priority_comparator(const void *a, const void *b)
Definition: syncrep.c:790
static int SyncRepWakeQueue(bool all, int mode)
Definition: syncrep.c:864
SyncRepConfigData * SyncRepConfig
Definition: syncrep.c:97
int SyncRepGetCandidateStandbys(SyncRepStandbyData **standbys)
Definition: syncrep.c:711
void SyncRepReleaseWaiters(void)
Definition: syncrep.c:431
void SyncRepUpdateSyncStandbysDefined(void)
Definition: syncrep.c:921
static bool announce_next_takeover
Definition: syncrep.c:95
static int SyncRepGetStandbyPriority(void)
Definition: syncrep.c:817
char * SyncRepStandbyNames
Definition: syncrep.c:90
static void SyncRepQueueInsert(int mode)
Definition: syncrep.c:329
static void SyncRepCancelWait(void)
Definition: syncrep.c:363
bool check_synchronous_standby_names(char **newval, void **extra, GucSource source)
Definition: syncrep.c:991
static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, SyncRepStandbyData *sync_standbys, int num_standbys)
Definition: syncrep.c:617
void SyncRepCleanupAtProcExit(void)
Definition: syncrep.c:373
static int cmp_lsn(const void *a, const void *b)
Definition: syncrep.c:695
#define SyncStandbysDefined()
Definition: syncrep.c:92
#define SYNC_REP_PRIORITY
Definition: syncrep.h:35
#define NUM_SYNC_REP_WAIT_MODE
Definition: syncrep.h:27
#define SyncRepRequested()
Definition: syncrep.h:18
#define SYNC_REP_NO_WAIT
Definition: syncrep.h:22
#define SYNC_REP_WAIT_WRITE
Definition: syncrep.h:23
#define SYNC_REP_WAITING
Definition: syncrep.h:31
PGDLLIMPORT SyncRepConfigData * syncrep_parse_result
#define SYNC_REP_WAIT_COMPLETE
Definition: syncrep.h:32
#define SYNC_REP_WAIT_FLUSH
Definition: syncrep.h:24
PGDLLIMPORT char * syncrep_parse_error_msg
#define SYNC_REP_NOT_WAITING
Definition: syncrep.h:30
#define SYNC_REP_WAIT_APPLY
Definition: syncrep.h:25
int syncrep_yyparse(yyscan_t yyscanner)
void syncrep_scanner_finish(yyscan_t yyscanner)
void syncrep_scanner_init(const char *str, yyscan_t *yyscannerp)
WalSnd * MyWalSnd
Definition: walsender.c:112
int max_wal_senders
Definition: walsender.c:121
bool am_cascading_walsender
Definition: walsender.c:116
WalSndCtlData * WalSndCtl
Definition: walsender.c:109
WalSndState
@ WALSNDSTATE_STREAMING
@ WALSNDSTATE_STOPPING
@ SYNCHRONOUS_COMMIT_REMOTE_WRITE
Definition: xact.h:72
@ SYNCHRONOUS_COMMIT_REMOTE_APPLY
Definition: xact.h:75
@ SYNCHRONOUS_COMMIT_REMOTE_FLUSH
Definition: xact.h:74
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28