/*-------------------------------------------------------------------------
 * launcher.c
 *	   PostgreSQL logical replication worker launcher process
 *
 * Copyright (c) 2016-2020, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/replication/logical/launcher.c
 *
 * NOTES
 *	  This module contains the logical replication worker launcher which
 *	  uses the background worker infrastructure to start the logical
 *	  replication workers for every enabled subscription.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/htup.h"
#include "access/htup_details.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_subscription_rel.h"
#include "funcapi.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgworker.h"
#include "postmaster/fork_process.h"
#include "postmaster/interrupt.h"
#include "postmaster/postmaster.h"
#include "replication/logicallauncher.h"
#include "replication/logicalworker.h"
#include "replication/slot.h"
#include "replication/walreceiver.h"
#include "replication/worker_internal.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/memutils.h"
#include "utils/pg_lsn.h"
#include "utils/ps_status.h"
#include "utils/snapmgr.h"
#include "utils/timeout.h"

/* max sleep time between cycles (3min) */
#define DEFAULT_NAPTIME_PER_CYCLE 180000L

int			max_logical_replication_workers = 4;
int			max_sync_workers_per_subscription = 2;

LogicalRepWorker *MyLogicalRepWorker = NULL;

typedef struct LogicalRepCtxStruct
{
	/* Supervisor process. */
	pid_t		launcher_pid;

	/* Background workers. */
	LogicalRepWorker workers[FLEXIBLE_ARRAY_MEMBER];
} LogicalRepCtxStruct;

LogicalRepCtxStruct *LogicalRepCtx;

typedef struct LogicalRepWorkerId
{
	Oid			subid;
	Oid			relid;
} LogicalRepWorkerId;

typedef struct StopWorkersData
{
	int			nestDepth;		/* Sub-transaction nest level */
	List	   *workers;		/* List of LogicalRepWorkerId */
	struct StopWorkersData *parent; /* This need not be an immediate
									 * subtransaction parent */
} StopWorkersData;

/*
 * Stack of StopWorkersData elements.  Each stack element contains the workers
 * to be stopped for that subtransaction.
 */
static StopWorkersData *on_commit_stop_workers = NULL;

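/*
 * A hypothetical sketch (not from the original file): if stops were
 * requested at subtransaction nest depths 1 and 3 but not 2, the stack
 * holds two elements, { nestDepth = 3 } -> { nestDepth = 1 }.  Levels that
 * requested no stops have no element, which is why "parent" need not be
 * the immediate subtransaction parent.
 */
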
static void ApplyLauncherWakeup(void);
static void logicalrep_launcher_onexit(int code, Datum arg);
static void logicalrep_worker_onexit(int code, Datum arg);
static void logicalrep_worker_detach(void);
static void logicalrep_worker_cleanup(LogicalRepWorker *worker);

static bool on_commit_launcher_wakeup = false;

Datum		pg_stat_get_subscription(PG_FUNCTION_ARGS);

/*
 * Load the list of subscriptions.
 *
 * Only the fields interesting for worker start/stop functions are filled for
 * each subscription.
 */
static List *
get_subscription_list(void)
{
	List	   *res = NIL;
	Relation	rel;
	TableScanDesc scan;
	HeapTuple	tup;
	MemoryContext resultcxt;

	/* This is the context that we will allocate our output data in */
	resultcxt = CurrentMemoryContext;

	/*
	 * Start a transaction so we can access pg_subscription, and get a
	 * snapshot.  We don't have a use for the snapshot itself, but we're
	 * interested in the secondary effect that it sets RecentGlobalXmin.
	 * (This is critical for anything that reads heap pages, because HOT may
	 * decide to prune them even if the process doesn't attempt to modify any
	 * tuples.)
	 */
	StartTransactionCommand();
	(void) GetTransactionSnapshot();

	rel = table_open(SubscriptionRelationId, AccessShareLock);
	scan = table_beginscan_catalog(rel, 0, NULL);

	while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
	{
		Form_pg_subscription subform = (Form_pg_subscription) GETSTRUCT(tup);
		Subscription *sub;
		MemoryContext oldcxt;

		/*
		 * Allocate our results in the caller's context, not the
		 * transaction's.  We do this inside the loop, and restore the
		 * original context at the end, so that leaky things like
		 * heap_getnext() are not called in a potentially long-lived context.
		 */
		oldcxt = MemoryContextSwitchTo(resultcxt);

		sub = (Subscription *) palloc0(sizeof(Subscription));
		sub->oid = subform->oid;
		sub->dbid = subform->subdbid;
		sub->owner = subform->subowner;
		sub->enabled = subform->subenabled;
		sub->name = pstrdup(NameStr(subform->subname));
		/* We don't fill fields we are not interested in. */

		res = lappend(res, sub);
		MemoryContextSwitchTo(oldcxt);
	}

	table_endscan(scan);
	table_close(rel, AccessShareLock);

	CommitTransactionCommand();

	return res;
}

/*
 * Wait for a background worker to start up and attach to the shmem context.
 *
 * This is only needed for cleaning up the shared memory in case the worker
 * fails to attach.
 */
static void
WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
							   uint16 generation,
							   BackgroundWorkerHandle *handle)
{
	BgwHandleStatus status;
	int			rc;

	for (;;)
	{
		pid_t		pid;

		CHECK_FOR_INTERRUPTS();

		LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);

		/* Worker either died or has started; no need to do anything. */
		if (!worker->in_use || worker->proc)
		{
			LWLockRelease(LogicalRepWorkerLock);
			return;
		}

		LWLockRelease(LogicalRepWorkerLock);

		/* Check if worker has died before attaching, and clean up after it. */
		status = GetBackgroundWorkerPid(handle, &pid);

		if (status == BGWH_STOPPED)
		{
			LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);
			/* Ensure that this was indeed the worker we waited for. */
			if (generation == worker->generation)
				logicalrep_worker_cleanup(worker);
			LWLockRelease(LogicalRepWorkerLock);
			return;
		}

		/*
		 * We need timeout because we generally don't get notified via latch
		 * about the worker attach.  But we don't expect to have to wait long.
		 */
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
					   10L, WAIT_EVENT_BGWORKER_STARTUP);

		if (rc & WL_LATCH_SET)
		{
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS();
		}
	}
}

/*
 * Walks the workers array and searches for one that matches given
 * subscription id and relid.
 */
LogicalRepWorker *
logicalrep_worker_find(Oid subid, Oid relid, bool only_running)
{
	int			i;
	LogicalRepWorker *res = NULL;

	Assert(LWLockHeldByMe(LogicalRepWorkerLock));

	/* Search for attached worker for a given subscription id. */
	for (i = 0; i < max_logical_replication_workers; i++)
	{
		LogicalRepWorker *w = &LogicalRepCtx->workers[i];

		if (w->in_use && w->subid == subid && w->relid == relid &&
			(!only_running || w->proc))
		{
			res = w;
			break;
		}
	}

	return res;
}
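
/*
 * Typical call pattern, shown as a sketch: the caller must hold
 * LogicalRepWorkerLock across the lookup and any use of the returned
 * pointer.  logicalrep_worker_wakeup() below does exactly this:
 *
 *	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
 *	worker = logicalrep_worker_find(subid, relid, true);
 *	if (worker)
 *		logicalrep_worker_wakeup_ptr(worker);
 *	LWLockRelease(LogicalRepWorkerLock);
 */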

/*
 * Similar to logicalrep_worker_find(), but returns a list of all workers
 * for the subscription, instead of just one.
 */
List *
logicalrep_workers_find(Oid subid, bool only_running)
{
	int			i;
	List	   *res = NIL;

	Assert(LWLockHeldByMe(LogicalRepWorkerLock));

	/* Search for attached worker for a given subscription id. */
	for (i = 0; i < max_logical_replication_workers; i++)
	{
		LogicalRepWorker *w = &LogicalRepCtx->workers[i];

		if (w->in_use && w->subid == subid && (!only_running || w->proc))
			res = lappend(res, w);
	}

	return res;
}

/*
 * Start new apply background worker, if possible.
 */
void
logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid,
						 Oid relid)
{
	BackgroundWorker bgw;
	BackgroundWorkerHandle *bgw_handle;
	uint16		generation;
	int			i;
	int			slot = 0;
	LogicalRepWorker *worker = NULL;
	int			nsyncworkers;
	TimestampTz now;

	ereport(DEBUG1,
			(errmsg("starting logical replication worker for subscription \"%s\"",
					subname)));

	/* Report this after the initial starting message for consistency. */
	if (max_replication_slots == 0)
		ereport(ERROR,
				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
				 errmsg("cannot start logical replication workers when max_replication_slots = 0")));

	/*
	 * We need to do the modification of the shared memory under lock so that
	 * we have a consistent view.
	 */
	LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);

retry:
	/* Find unused worker slot. */
	for (i = 0; i < max_logical_replication_workers; i++)
	{
		LogicalRepWorker *w = &LogicalRepCtx->workers[i];

		if (!w->in_use)
		{
			worker = w;
			slot = i;
			break;
		}
	}

	nsyncworkers = logicalrep_sync_worker_count(subid);

	now = GetCurrentTimestamp();

	/*
	 * If we didn't find a free slot, try to do garbage collection.  The
	 * reason we do this is because if some worker failed to start up and its
	 * parent has crashed while waiting, the in_use state was never cleared.
	 */
	if (worker == NULL || nsyncworkers >= max_sync_workers_per_subscription)
	{
		bool		did_cleanup = false;

		for (i = 0; i < max_logical_replication_workers; i++)
		{
			LogicalRepWorker *w = &LogicalRepCtx->workers[i];

			/*
			 * If the worker was marked in use but didn't manage to attach in
			 * time, clean it up.
			 */
			if (w->in_use && !w->proc &&
				TimestampDifferenceExceeds(w->launch_time, now,
										   wal_receiver_timeout))
			{
				elog(WARNING,
					 "logical replication worker for subscription %u took too long to start; canceled",
					 w->subid);

				logicalrep_worker_cleanup(w);
				did_cleanup = true;
			}
		}

		if (did_cleanup)
			goto retry;
	}

	/*
	 * If we reached the sync worker limit per subscription, just exit
	 * silently as we might get here because of an otherwise harmless race
	 * condition.
	 */
	if (nsyncworkers >= max_sync_workers_per_subscription)
	{
		LWLockRelease(LogicalRepWorkerLock);
		return;
	}

	/*
	 * However if there are no more free worker slots, inform user about it
	 * before exiting.
	 */
	if (worker == NULL)
	{
		LWLockRelease(LogicalRepWorkerLock);
		ereport(WARNING,
				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
				 errmsg("out of logical replication worker slots"),
				 errhint("You might need to increase max_logical_replication_workers.")));
		return;
	}

	/* Prepare the worker slot. */
	worker->launch_time = now;
	worker->in_use = true;
	worker->generation++;
	worker->proc = NULL;
	worker->dbid = dbid;
	worker->userid = userid;
	worker->subid = subid;
	worker->relid = relid;
	worker->relstate = SUBREL_STATE_UNKNOWN;
	worker->relstate_lsn = InvalidXLogRecPtr;
	worker->last_lsn = InvalidXLogRecPtr;
	TIMESTAMP_NOBEGIN(worker->last_send_time);
	TIMESTAMP_NOBEGIN(worker->last_recv_time);
	worker->reply_lsn = InvalidXLogRecPtr;
	TIMESTAMP_NOBEGIN(worker->reply_time);

	/* Before releasing lock, remember generation for future identification. */
	generation = worker->generation;

	LWLockRelease(LogicalRepWorkerLock);

	/* Register the new dynamic worker. */
	memset(&bgw, 0, sizeof(bgw));
	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
		BGWORKER_BACKEND_DATABASE_CONNECTION;
	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyWorkerMain");
	if (OidIsValid(relid))
		snprintf(bgw.bgw_name, BGW_MAXLEN,
				 "logical replication worker for subscription %u sync %u", subid, relid);
	else
		snprintf(bgw.bgw_name, BGW_MAXLEN,
				 "logical replication worker for subscription %u", subid);
	snprintf(bgw.bgw_type, BGW_MAXLEN, "logical replication worker");

	bgw.bgw_restart_time = BGW_NEVER_RESTART;
	bgw.bgw_notify_pid = MyProcPid;
	bgw.bgw_main_arg = Int32GetDatum(slot);

	if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
	{
		/* Failed to start worker, so clean up the worker slot. */
		LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);
		Assert(generation == worker->generation);
		logicalrep_worker_cleanup(worker);
		LWLockRelease(LogicalRepWorkerLock);

		ereport(WARNING,
				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
				 errmsg("out of background worker slots"),
				 errhint("You might need to increase max_worker_processes.")));
		return;
	}

	/* Now wait until it attaches. */
	WaitForReplicationWorkerAttach(worker, generation, bgw_handle);
}

/*
 * Stop the logical replication worker for subid/relid, if any, and wait until
 * it detaches from the slot.
 */
void
logicalrep_worker_stop(Oid subid, Oid relid)
{
	LogicalRepWorker *worker;
	uint16		generation;

	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);

	worker = logicalrep_worker_find(subid, relid, false);

	/* No worker, nothing to do. */
	if (!worker)
	{
		LWLockRelease(LogicalRepWorkerLock);
		return;
	}

	/*
	 * Remember which generation was our worker so we can check if what we see
	 * is still the same one.
	 */
	generation = worker->generation;

	/*
	 * If we found a worker but it does not have proc set then it is still
	 * starting up; wait for it to finish starting and then kill it.
	 */
	while (worker->in_use && !worker->proc)
	{
		int			rc;

		LWLockRelease(LogicalRepWorkerLock);

		/* Wait a bit --- we don't expect to have to wait long. */
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
					   10L, WAIT_EVENT_BGWORKER_STARTUP);

		if (rc & WL_LATCH_SET)
		{
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS();
		}

		/* Recheck worker status. */
		LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);

		/*
		 * Check whether the worker slot is no longer used, which would mean
		 * that the worker has exited, or whether the worker generation is
		 * different, meaning that a different worker has taken the slot.
		 */
		if (!worker->in_use || worker->generation != generation)
		{
			LWLockRelease(LogicalRepWorkerLock);
			return;
		}

		/* Worker has assigned proc, so it has started. */
		if (worker->proc)
			break;
	}

	/* Now terminate the worker ... */
	kill(worker->proc->pid, SIGTERM);

	/* ... and wait for it to die. */
	for (;;)
	{
		int			rc;

		/* is it gone? */
		if (!worker->proc || worker->generation != generation)
			break;

		LWLockRelease(LogicalRepWorkerLock);

		/* Wait a bit --- we don't expect to have to wait long. */
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
					   10L, WAIT_EVENT_BGWORKER_SHUTDOWN);

		if (rc & WL_LATCH_SET)
		{
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS();
		}

		LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
	}

	LWLockRelease(LogicalRepWorkerLock);
}

/*
 * Request worker for specified sub/rel to be stopped on commit.
 */
void
logicalrep_worker_stop_at_commit(Oid subid, Oid relid)
{
	int			nestDepth = GetCurrentTransactionNestLevel();
	LogicalRepWorkerId *wid;
	MemoryContext oldctx;

	/* Make sure we store the info in context that survives until commit. */
	oldctx = MemoryContextSwitchTo(TopTransactionContext);

	/* Check that previous transactions were properly cleaned up. */
	Assert(on_commit_stop_workers == NULL ||
		   nestDepth >= on_commit_stop_workers->nestDepth);

	/*
	 * Push a new stack element if we don't already have one for the current
	 * nestDepth.
	 */
	if (on_commit_stop_workers == NULL ||
		nestDepth > on_commit_stop_workers->nestDepth)
	{
		StopWorkersData *newdata = palloc(sizeof(StopWorkersData));

		newdata->nestDepth = nestDepth;
		newdata->workers = NIL;
		newdata->parent = on_commit_stop_workers;
		on_commit_stop_workers = newdata;
	}

	/*
	 * Finally add a new worker into the worker list of the current
	 * subtransaction.
	 */
	wid = palloc(sizeof(LogicalRepWorkerId));
	wid->subid = subid;
	wid->relid = relid;
	on_commit_stop_workers->workers =
		lappend(on_commit_stop_workers->workers, wid);

	MemoryContextSwitchTo(oldctx);
}
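
/*
 * Deferred stopping is intended for subscription DDL that must not kill
 * workers in mid-transaction (for example, ALTER SUBSCRIPTION ... REFRESH
 * PUBLICATION when relations are removed from the subscription); the
 * queued stops are executed by AtEOXact_ApplyLauncher() at commit.
 */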

/*
 * Wake up (using latch) any logical replication worker for specified sub/rel.
 */
void
logicalrep_worker_wakeup(Oid subid, Oid relid)
{
	LogicalRepWorker *worker;

	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);

	worker = logicalrep_worker_find(subid, relid, true);

	if (worker)
		logicalrep_worker_wakeup_ptr(worker);

	LWLockRelease(LogicalRepWorkerLock);
}

/*
 * Wake up (using latch) the specified logical replication worker.
 *
 * Caller must hold lock, else worker->proc could change under us.
 */
void
logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker)
{
	Assert(LWLockHeldByMe(LogicalRepWorkerLock));

	SetLatch(&worker->proc->procLatch);
}

/*
 * Attach to a slot.
 */
void
logicalrep_worker_attach(int slot)
{
	/* Block concurrent access. */
	LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);

	Assert(slot >= 0 && slot < max_logical_replication_workers);
	MyLogicalRepWorker = &LogicalRepCtx->workers[slot];

	if (!MyLogicalRepWorker->in_use)
	{
		LWLockRelease(LogicalRepWorkerLock);
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("logical replication worker slot %d is empty, cannot attach",
						slot)));
	}

	if (MyLogicalRepWorker->proc)
	{
		LWLockRelease(LogicalRepWorkerLock);
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("logical replication worker slot %d is already used by "
						"another worker, cannot attach", slot)));
	}

	MyLogicalRepWorker->proc = MyProc;
	before_shmem_exit(logicalrep_worker_onexit, (Datum) 0);

	LWLockRelease(LogicalRepWorkerLock);
}
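
/*
 * Sketch of the expected sequence on the worker side: the launcher passes
 * the slot index via bgw_main_arg (see logicalrep_worker_launch() above),
 * and the worker entry point is expected to attach with something like
 *
 *	int			worker_slot = DatumGetInt32(main_arg);
 *
 *	logicalrep_worker_attach(worker_slot);
 *
 * (ApplyWorkerMain in worker.c does this.)
 */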

/*
 * Detach the worker (cleans up the worker info).
 */
static void
logicalrep_worker_detach(void)
{
	/* Block concurrent access. */
	LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);

	logicalrep_worker_cleanup(MyLogicalRepWorker);

	LWLockRelease(LogicalRepWorkerLock);
}

/*
 * Clean up worker info.
 */
static void
logicalrep_worker_cleanup(LogicalRepWorker *worker)
{
	Assert(LWLockHeldByMeInMode(LogicalRepWorkerLock, LW_EXCLUSIVE));

	worker->in_use = false;
	worker->proc = NULL;
	worker->dbid = InvalidOid;
	worker->userid = InvalidOid;
	worker->subid = InvalidOid;
	worker->relid = InvalidOid;
}

/*
 * Cleanup function for logical replication launcher.
 *
 * Called on logical replication launcher exit.
 */
static void
logicalrep_launcher_onexit(int code, Datum arg)
{
	LogicalRepCtx->launcher_pid = 0;
}

/*
 * Cleanup function.
 *
 * Called on logical replication worker exit.
 */
static void
logicalrep_worker_onexit(int code, Datum arg)
{
	/* Disconnect gracefully from the remote side. */
	if (wrconn)
		walrcv_disconnect(wrconn);

	logicalrep_worker_detach();

	ApplyLauncherWakeup();
}

/*
 * Count the number of registered (not necessarily running) sync workers
 * for a subscription.
 */
int
logicalrep_sync_worker_count(Oid subid)
{
	int			i;
	int			res = 0;

	Assert(LWLockHeldByMe(LogicalRepWorkerLock));

	/* Search for attached worker for a given subscription id. */
	for (i = 0; i < max_logical_replication_workers; i++)
	{
		LogicalRepWorker *w = &LogicalRepCtx->workers[i];

		if (w->subid == subid && OidIsValid(w->relid))
			res++;
	}

	return res;
}

/*
 * ApplyLauncherShmemSize
 *		Compute space needed for replication launcher shared memory
 */
Size
ApplyLauncherShmemSize(void)
{
	Size		size;

	/*
	 * Need the fixed struct and the array of LogicalRepWorker.
	 */
	size = sizeof(LogicalRepCtxStruct);
	size = MAXALIGN(size);
	size = add_size(size, mul_size(max_logical_replication_workers,
								   sizeof(LogicalRepWorker)));
	return size;
}

/*
 * ApplyLauncherRegister
 *		Register a background worker running the logical replication launcher.
 */
void
ApplyLauncherRegister(void)
{
	BackgroundWorker bgw;

	if (max_logical_replication_workers == 0)
		return;

	memset(&bgw, 0, sizeof(bgw));
	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
		BGWORKER_BACKEND_DATABASE_CONNECTION;
	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyLauncherMain");
	snprintf(bgw.bgw_name, BGW_MAXLEN,
			 "logical replication launcher");
	snprintf(bgw.bgw_type, BGW_MAXLEN,
			 "logical replication launcher");
	bgw.bgw_restart_time = 5;
	bgw.bgw_notify_pid = 0;
	bgw.bgw_main_arg = (Datum) 0;

	RegisterBackgroundWorker(&bgw);
}

/*
 * ApplyLauncherShmemInit
 *		Allocate and initialize replication launcher shared memory
 */
void
ApplyLauncherShmemInit(void)
{
	bool		found;

	LogicalRepCtx = (LogicalRepCtxStruct *)
		ShmemInitStruct("Logical Replication Launcher Data",
						ApplyLauncherShmemSize(),
						&found);

	if (!found)
	{
		int			slot;

		memset(LogicalRepCtx, 0, ApplyLauncherShmemSize());

		/* Initialize memory and spin locks for each worker slot. */
		for (slot = 0; slot < max_logical_replication_workers; slot++)
		{
			LogicalRepWorker *worker = &LogicalRepCtx->workers[slot];

			memset(worker, 0, sizeof(LogicalRepWorker));
			SpinLockInit(&worker->relmutex);
		}
	}
}

/*
 * Check whether current transaction has manipulated logical replication
 * workers.
 */
bool
XactManipulatesLogicalReplicationWorkers(void)
{
	return (on_commit_stop_workers != NULL);
}

/*
 * Wakeup the launcher on commit if requested.
 */
void
AtEOXact_ApplyLauncher(bool isCommit)
{
	Assert(on_commit_stop_workers == NULL ||
		   (on_commit_stop_workers->nestDepth == 1 &&
			on_commit_stop_workers->parent == NULL));

	if (isCommit)
	{
		ListCell   *lc;

		if (on_commit_stop_workers != NULL)
		{
			List	   *workers = on_commit_stop_workers->workers;

			foreach(lc, workers)
			{
				LogicalRepWorkerId *wid = lfirst(lc);

				logicalrep_worker_stop(wid->subid, wid->relid);
			}
		}

		if (on_commit_launcher_wakeup)
			ApplyLauncherWakeup();
	}

	/*
	 * No need to pfree on_commit_stop_workers.  It was allocated in
	 * transaction memory context, which is going to be cleaned soon.
	 */
	on_commit_stop_workers = NULL;
	on_commit_launcher_wakeup = false;
}

/*
 * On commit, merge the current on_commit_stop_workers list into the
 * immediate parent, if present.
 * On rollback, discard the current on_commit_stop_workers list.
 * Pop out the stack.
 */
void
AtEOSubXact_ApplyLauncher(bool isCommit, int nestDepth)
{
	StopWorkersData *parent;

	/* Exit immediately if there's no work to do at this level. */
	if (on_commit_stop_workers == NULL ||
		on_commit_stop_workers->nestDepth < nestDepth)
		return;

	Assert(on_commit_stop_workers->nestDepth == nestDepth);

	parent = on_commit_stop_workers->parent;

	if (isCommit)
	{
		/*
		 * If the upper stack element is not an immediate parent
		 * subtransaction, just decrement the notional nesting depth without
		 * doing any real work.  Else, we need to merge the current workers
		 * list into the parent.
		 */
		if (!parent || parent->nestDepth < nestDepth - 1)
		{
			on_commit_stop_workers->nestDepth--;
			return;
		}

		parent->workers =
			list_concat(parent->workers, on_commit_stop_workers->workers);
	}
	else
	{
		/*
		 * Abandon everything that was done at this nesting level.  Explicitly
		 * free memory to avoid a transaction-lifespan leak.
		 */
		list_free_deep(on_commit_stop_workers->workers);
	}

	/*
	 * We have taken care of the current subtransaction workers list for both
	 * abort or commit.  So we are ready to pop the stack.
	 */
	pfree(on_commit_stop_workers);
	on_commit_stop_workers = parent;
}
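
/*
 * Worked example (hypothetical): stops were requested at nest depths 1 and
 * 3.  When depth 3 commits, the parent element has nestDepth 1 < 3 - 1, so
 * the top element's nestDepth is merely decremented to 2.  When depth 2
 * then commits, parent->nestDepth == nestDepth - 1, so the workers lists
 * are concatenated and the stack is popped.
 */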

/*
 * Request wakeup of the launcher on commit of the transaction.
 *
 * This is used to signal the launcher to stop sleeping and process the
 * subscriptions when the current transaction commits.  Should be used when
 * a new tuple was added to the pg_subscription catalog.
 */
void
ApplyLauncherWakeupAtCommit(void)
{
	if (!on_commit_launcher_wakeup)
		on_commit_launcher_wakeup = true;
}
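
/*
 * For example, CREATE SUBSCRIPTION inserts a pg_subscription tuple and
 * then requests this wakeup (see subscriptioncmds.c), so the launcher
 * starts the new apply worker promptly instead of waiting out the nap
 * time.
 */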

static void
ApplyLauncherWakeup(void)
{
	if (LogicalRepCtx->launcher_pid != 0)
		kill(LogicalRepCtx->launcher_pid, SIGUSR1);
}

/*
 * Main loop for the apply launcher process.
 */
void
ApplyLauncherMain(Datum main_arg)
{
	TimestampTz last_start_time = 0;

	ereport(DEBUG1,
			(errmsg("logical replication launcher started")));

	before_shmem_exit(logicalrep_launcher_onexit, (Datum) 0);

	Assert(LogicalRepCtx->launcher_pid == 0);
	LogicalRepCtx->launcher_pid = MyProcPid;

	/* Establish signal handlers. */
	pqsignal(SIGHUP, SignalHandlerForConfigReload);
	pqsignal(SIGTERM, die);
	BackgroundWorkerUnblockSignals();

	/*
	 * Establish connection to nailed catalogs (we only ever access
	 * pg_subscription).
	 */
	BackgroundWorkerInitializeConnection(NULL, NULL, 0);

	/* Enter main loop */
	for (;;)
	{
		int			rc;
		List	   *sublist;
		ListCell   *lc;
		MemoryContext subctx;
		MemoryContext oldctx;
		TimestampTz now;
		long		wait_time = DEFAULT_NAPTIME_PER_CYCLE;

		CHECK_FOR_INTERRUPTS();

		now = GetCurrentTimestamp();

		/* Limit the start retry to once a wal_retrieve_retry_interval */
		if (TimestampDifferenceExceeds(last_start_time, now,
									   wal_retrieve_retry_interval))
		{
			/* Use temporary context for the database list and worker info. */
			subctx = AllocSetContextCreate(TopMemoryContext,
										   "Logical Replication Launcher sublist",
										   ALLOCSET_DEFAULT_SIZES);
			oldctx = MemoryContextSwitchTo(subctx);

			/* Search for subscriptions to start or stop. */
			sublist = get_subscription_list();

			/* Start the missing workers for enabled subscriptions. */
			foreach(lc, sublist)
			{
				Subscription *sub = (Subscription *) lfirst(lc);
				LogicalRepWorker *w;

				if (!sub->enabled)
					continue;

				LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
				w = logicalrep_worker_find(sub->oid, InvalidOid, false);
				LWLockRelease(LogicalRepWorkerLock);

				if (w == NULL)
				{
					last_start_time = now;
					wait_time = wal_retrieve_retry_interval;

					logicalrep_worker_launch(sub->dbid, sub->oid, sub->name,
											 sub->owner, InvalidOid);
				}
			}

			/* Switch back to original memory context. */
			MemoryContextSwitchTo(oldctx);
			/* Clean the temporary memory. */
			MemoryContextDelete(subctx);
		}
		else
		{
			/*
			 * The wait in the previous cycle was interrupted less than
			 * wal_retrieve_retry_interval after the last worker was started.
			 * This usually means the worker crashed, so wait another
			 * wal_retrieve_retry_interval before retrying.
			 */
			wait_time = wal_retrieve_retry_interval;
		}

		/* Wait for more work. */
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
					   wait_time,
					   WAIT_EVENT_LOGICAL_LAUNCHER_MAIN);

		if (rc & WL_LATCH_SET)
		{
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS();
		}

		if (ConfigReloadPending)
		{
			ConfigReloadPending = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
	}

	/* Not reachable */
}

/*
 * Is current process the logical replication launcher?
 */
bool
IsLogicalLauncher(void)
{
	return LogicalRepCtx->launcher_pid == MyProcPid;
}

/*
 * Returns state of the subscriptions.
 */
Datum
pg_stat_get_subscription(PG_FUNCTION_ARGS)
{
#define PG_STAT_GET_SUBSCRIPTION_COLS	8
	Oid			subid = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0);
	int			i;
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	TupleDesc	tupdesc;
	Tuplestorestate *tupstore;
	MemoryContext per_query_ctx;
	MemoryContext oldcontext;

	/* check to see if caller supports us returning a tuplestore */
	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("set-valued function called in context that cannot accept a set")));
	if (!(rsinfo->allowedModes & SFRM_Materialize))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("materialize mode required, but it is not allowed in this context")));

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
	oldcontext = MemoryContextSwitchTo(per_query_ctx);

	tupstore = tuplestore_begin_heap(true, false, work_mem);
	rsinfo->returnMode = SFRM_Materialize;
	rsinfo->setResult = tupstore;
	rsinfo->setDesc = tupdesc;

	MemoryContextSwitchTo(oldcontext);

	/* Make sure we get consistent view of the workers. */
	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);

	for (i = 0; i < max_logical_replication_workers; i++)
	{
		/* for each row */
		Datum		values[PG_STAT_GET_SUBSCRIPTION_COLS];
		bool		nulls[PG_STAT_GET_SUBSCRIPTION_COLS];
		int			worker_pid;
		LogicalRepWorker worker;

		memcpy(&worker, &LogicalRepCtx->workers[i],
			   sizeof(LogicalRepWorker));
		if (!worker.proc || !IsBackendPid(worker.proc->pid))
			continue;

		if (OidIsValid(subid) && worker.subid != subid)
			continue;

		worker_pid = worker.proc->pid;

		MemSet(values, 0, sizeof(values));
		MemSet(nulls, 0, sizeof(nulls));

		values[0] = ObjectIdGetDatum(worker.subid);
		if (OidIsValid(worker.relid))
			values[1] = ObjectIdGetDatum(worker.relid);
		else
			nulls[1] = true;
		values[2] = Int32GetDatum(worker_pid);
		if (XLogRecPtrIsInvalid(worker.last_lsn))
			nulls[3] = true;
		else
			values[3] = LSNGetDatum(worker.last_lsn);
		if (worker.last_send_time == 0)
			nulls[4] = true;
		else
			values[4] = TimestampTzGetDatum(worker.last_send_time);
		if (worker.last_recv_time == 0)
			nulls[5] = true;
		else
			values[5] = TimestampTzGetDatum(worker.last_recv_time);
		if (XLogRecPtrIsInvalid(worker.reply_lsn))
			nulls[6] = true;
		else
			values[6] = LSNGetDatum(worker.reply_lsn);
		if (worker.reply_time == 0)
			nulls[7] = true;
		else
			values[7] = TimestampTzGetDatum(worker.reply_time);

		tuplestore_putvalues(tupstore, tupdesc, values, nulls);

		/*
		 * If only a single subscription was requested, and we found it,
		 * break.
		 */
		if (OidIsValid(subid))
			break;
	}

	LWLockRelease(LogicalRepWorkerLock);

	/* clean up and return the tuplestore */
	tuplestore_donestoring(tupstore);

	return (Datum) 0;
}
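
/*
 * Usage sketch: this function backs the pg_stat_subscription system view,
 * so worker state is normally read with something like
 *
 *	SELECT subid, relid, pid, received_lsn, latest_end_lsn
 *	FROM pg_stat_subscription;
 *
 * (column list abbreviated; see system_views.sql for the full view).
 */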