PostgreSQL Source Code  git master
slot.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * slot.c
4  * Replication slot management.
5  *
6  *
7  * Copyright (c) 2012-2020, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/slot.c
12  *
13  * NOTES
14  *
15  * Replication slots are used to keep state about replication streams
16  * originating from this cluster. Their primary purpose is to prevent the
17  * premature removal of WAL or of old tuple versions in a manner that would
18  * interfere with replication; they are also useful for monitoring purposes.
19  * Slots need to be permanent (to allow restarts), crash-safe, and allocatable
20  * on standbys (to support cascading setups). The requirement that slots be
21  * usable on standbys precludes storing them in the system catalogs.
22  *
23  * Each replication slot gets its own directory inside the $PGDATA/pg_replslot
24  * directory. Inside that directory the state file will contain the slot's
25  * own data. Additional data can be stored alongside that file if required.
26  * While the server is running, the state data is also cached in memory for
27  * efficiency.
28  *
29  * ReplicationSlotAllocationLock must be taken in exclusive mode to allocate
30  * or free a slot. ReplicationSlotControlLock must be taken in shared mode
31  * to iterate over the slots, and in exclusive mode to change the in_use flag
32  * of a slot. The remaining data in each slot is protected by its mutex.
33  *
34  *-------------------------------------------------------------------------
35  */
36 
37 #include "postgres.h"
38 
39 #include <unistd.h>
40 #include <sys/stat.h>
41 
42 #include "access/transam.h"
43 #include "access/xlog_internal.h"
44 #include "common/string.h"
45 #include "miscadmin.h"
46 #include "pgstat.h"
47 #include "replication/slot.h"
48 #include "storage/fd.h"
49 #include "storage/proc.h"
50 #include "storage/procarray.h"
51 #include "utils/builtins.h"
52 
53 /*
54  * Replication slot on-disk data structure.
55  */
56 typedef struct ReplicationSlotOnDisk
57 {
58  /* first part of this struct needs to be version independent */
59 
60  /* data not covered by checksum */
63 
64  /* data covered by checksum */
67 
68  /*
69  * The actual data in the slot that follows can differ based on the above
70  * 'version'.
71  */
72 
75 
/*
 * Sizes of the individual regions of ReplicationSlotOnDisk.
 *
 * Every expansion is wrapped in parentheses so that the macro yields the
 * intended value when it is embedded in a larger expression (for example as
 * an operand of a multiplication or of a subtraction).  The "SnapBuildOnDisk"
 * names are kept unchanged because code outside this block may refer to them.
 */

/* size of version independent data */
#define ReplicationSlotOnDiskConstantSize \
	(offsetof(ReplicationSlotOnDisk, slotdata))
/* size of the part of the slot not covered by the checksum */
#define SnapBuildOnDiskNotChecksummedSize \
	(offsetof(ReplicationSlotOnDisk, version))
/* size of the part covered by the checksum */
#define SnapBuildOnDiskChecksummedSize \
	(sizeof(ReplicationSlotOnDisk) - SnapBuildOnDiskNotChecksummedSize)
/* size of the slot data that is version dependent */
#define ReplicationSlotOnDiskV2Size \
	(sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskConstantSize)
88 
89 #define SLOT_MAGIC 0x1051CA1 /* format identifier */
90 #define SLOT_VERSION 2 /* version for new files */
91 
92 /* Control array for replication slot management */
94 
95 /* My backend's replication slot in the shared memory array */
97 
98 /* GUCs */
99 int max_replication_slots = 0; /* the maximum number of replication
100  * slots */
101 
102 static ReplicationSlot *SearchNamedReplicationSlot(const char *name);
104  const char *name, SlotAcquireBehavior behavior);
105 static void ReplicationSlotDropAcquired(void);
106 static void ReplicationSlotDropPtr(ReplicationSlot *slot);
107 
108 /* internal persistency functions */
109 static void RestoreSlotFromDisk(const char *name);
110 static void CreateSlotOnDisk(ReplicationSlot *slot);
111 static void SaveSlotToPath(ReplicationSlot *slot, const char *path, int elevel);
112 
113 /*
114  * Report shared-memory space needed by ReplicationSlotsShmemInit.
115  */
116 Size
118 {
119  Size size = 0;
120 
121  if (max_replication_slots == 0)
122  return size;
123 
124  size = offsetof(ReplicationSlotCtlData, replication_slots);
125  size = add_size(size,
127 
128  return size;
129 }
130 
131 /*
132  * Allocate and initialize shared memory for replication slots.
133  */
134 void
136 {
137  bool found;
138 
139  if (max_replication_slots == 0)
140  return;
141 
142  ReplicationSlotCtl = (ReplicationSlotCtlData *)
143  ShmemInitStruct("ReplicationSlot Ctl", ReplicationSlotsShmemSize(),
144  &found);
145 
146  if (!found)
147  {
148  int i;
149 
150  /* First time through, so initialize */
151  MemSet(ReplicationSlotCtl, 0, ReplicationSlotsShmemSize());
152 
153  for (i = 0; i < max_replication_slots; i++)
154  {
155  ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[i];
156 
157  /* everything else is zeroed by the memset above */
158  SpinLockInit(&slot->mutex);
162  }
163  }
164 }
165 
166 /*
167  * Check whether the passed slot name is valid and report errors at elevel.
168  *
169  * Slot names may consist of [a-z0-9_]{1,NAMEDATALEN-1} which should allow
170  * the name to be used as a directory name on every supported OS.
171  *
172  * Returns whether the directory name is valid or not if elevel < ERROR.
173  */
174 bool
176 {
177  const char *cp;
178 
179  if (strlen(name) == 0)
180  {
181  ereport(elevel,
182  (errcode(ERRCODE_INVALID_NAME),
183  errmsg("replication slot name \"%s\" is too short",
184  name)));
185  return false;
186  }
187 
188  if (strlen(name) >= NAMEDATALEN)
189  {
190  ereport(elevel,
191  (errcode(ERRCODE_NAME_TOO_LONG),
192  errmsg("replication slot name \"%s\" is too long",
193  name)));
194  return false;
195  }
196 
197  for (cp = name; *cp; cp++)
198  {
199  if (!((*cp >= 'a' && *cp <= 'z')
200  || (*cp >= '0' && *cp <= '9')
201  || (*cp == '_')))
202  {
203  ereport(elevel,
204  (errcode(ERRCODE_INVALID_NAME),
205  errmsg("replication slot name \"%s\" contains invalid character",
206  name),
207  errhint("Replication slot names may only contain lower case letters, numbers, and the underscore character.")));
208  return false;
209  }
210  }
211  return true;
212 }
213 
214 /*
215  * Create a new replication slot and mark it as used by this backend.
216  *
217  * name: Name of the slot
218  * db_specific: logical decoding is db specific; if the slot is going to
219  * be used for that pass true, otherwise false.
 * persistency: stored unchanged as the new slot's data.persistency.
 *
 * An ERROR is raised if an in-use slot with the same name already exists
 * or if every entry of the slot array is already in use.
 *
 * On return the chosen slot has in_use set, its active_pid is MyProcPid,
 * and MyReplicationSlot points at it.
220  */
221 void
222 ReplicationSlotCreate(const char *name, bool db_specific,
223  ReplicationSlotPersistency persistency)
224 {
225  ReplicationSlot *slot = NULL;
226  int i;
227 
 /* this backend must not already have a slot acquired */
228  Assert(MyReplicationSlot == NULL);
229 
231 
232  /*
233  * If some other backend ran this code concurrently with us, we'd likely
234  * both allocate the same slot, and that would be bad. We'd also be at
235  * risk of missing a name collision. Also, we don't want to try to create
236  * a new slot while somebody's busy cleaning up an old one, because we
237  * might both be monkeying with the same directory.
238  */
239  LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
240 
241  /*
242  * Check for name collision, and identify an allocatable slot. We need to
243  * hold ReplicationSlotControlLock in shared mode for this, so that nobody
244  * else can change the in_use flags while we're looking at them.
245  */
246  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
247  for (i = 0; i < max_replication_slots; i++)
248  {
249  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
250 
251  if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
252  ereport(ERROR,
254  errmsg("replication slot \"%s\" already exists", name)));
 /* remember the first free entry, but keep scanning for name collisions */
255  if (!s->in_use && slot == NULL)
256  slot = s;
257  }
258  LWLockRelease(ReplicationSlotControlLock);
259 
260  /* If all slots are in use, we're out of luck. */
261  if (slot == NULL)
262  ereport(ERROR,
263  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
264  errmsg("all replication slots are in use"),
265  errhint("Free one or increase max_replication_slots.")));
266 
267  /*
268  * Since this slot is not in use, nobody should be looking at any part of
269  * it other than the in_use field unless they're trying to allocate it.
270  * And since we hold ReplicationSlotAllocationLock, nobody except us can
271  * be doing that. So it's safe to initialize the slot.
272  */
273  Assert(!slot->in_use);
274  Assert(slot->active_pid == 0);
275 
276  /* first initialize persistent data */
277  memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData));
278  namestrcpy(&slot->data.name, name);
279  slot->data.database = db_specific ? MyDatabaseId : InvalidOid;
280  slot->data.persistency = persistency;
281 
282  /* and then data only present in shared memory */
283  slot->just_dirtied = false;
284  slot->dirty = false;
291 
292  /*
293  * Create the slot on disk. We haven't actually marked the slot allocated
294  * yet, so no special cleanup is required if this errors out.
295  */
296  CreateSlotOnDisk(slot);
297 
298  /*
299  * We need to briefly prevent any other backend from iterating over the
300  * slots while we flip the in_use flag. We also need to set the active
301  * flag while holding the ControlLock as otherwise a concurrent
302  * ReplicationSlotAcquire() could acquire the slot as well.
303  */
304  LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
305 
306  slot->in_use = true;
307 
308  /* We can now mark the slot active, and that makes it our slot. */
309  SpinLockAcquire(&slot->mutex);
310  Assert(slot->active_pid == 0);
311  slot->active_pid = MyProcPid;
312  SpinLockRelease(&slot->mutex);
313  MyReplicationSlot = slot;
314 
315  LWLockRelease(ReplicationSlotControlLock);
316 
317  /*
318  * Now that the slot has been marked as in_use and active, it's safe to
319  * let somebody else try to allocate a slot.
320  */
321  LWLockRelease(ReplicationSlotAllocationLock);
322 
323  /* Let everybody know we've modified this slot */
325 }
326 
327 /*
328  * Search for the named replication slot.
329  *
330  * Return the replication slot if found, otherwise NULL.
331  *
332  * The caller must hold ReplicationSlotControlLock in shared mode.
333  */
334 static ReplicationSlot *
336 {
337  int i;
338  ReplicationSlot *slot = NULL;
339 
340  Assert(LWLockHeldByMeInMode(ReplicationSlotControlLock,
341  LW_SHARED));
342 
343  for (i = 0; i < max_replication_slots; i++)
344  {
345  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
346 
347  if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
348  {
349  slot = s;
350  break;
351  }
352  }
353 
354  return slot;
355 }
356 
357 /*
358  * Find a previously created slot and mark it as used by this process.
359  *
360  * The return value is only useful if behavior is SAB_Inquire, in which
361  * it's zero if we successfully acquired the slot, -1 if the slot no longer
362  * exists, or the PID of the owning process otherwise. If behavior is
363  * SAB_Error, then trying to acquire an owned slot is an error.
364  * If SAB_Block, we sleep until the slot is released by the owning process.
365  */
366 int
368 {
369  return ReplicationSlotAcquireInternal(NULL, name, behavior);
370 }
371 
372 /*
373  * Mark the specified slot as used by this process.
374  *
375  * Only one of slot and name can be specified.
376  * If slot == NULL, search for the slot with the given name.
377  *
378  * See comments about the return value in ReplicationSlotAcquire().
379  */
380 static int
382  SlotAcquireBehavior behavior)
383 {
384  ReplicationSlot *s;
385  int active_pid;
386 
387  AssertArg((slot == NULL) ^ (name == NULL));
388 
389 retry:
390  Assert(MyReplicationSlot == NULL);
391 
392  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
393 
394  /*
395  * Search for the slot with the specified name if the slot to acquire is
396  * not given. If the slot is not found, we either return -1 or error out.
397  */
398  s = slot ? slot : SearchNamedReplicationSlot(name);
399  if (s == NULL || !s->in_use)
400  {
401  LWLockRelease(ReplicationSlotControlLock);
402 
403  if (behavior == SAB_Inquire)
404  return -1;
405  ereport(ERROR,
406  (errcode(ERRCODE_UNDEFINED_OBJECT),
407  errmsg("replication slot \"%s\" does not exist",
408  name ? name : NameStr(slot->data.name))));
409  }
410 
411  /*
412  * This is the slot we want; check if it's active under some other
413  * process. In single user mode, we don't need this check.
414  */
415  if (IsUnderPostmaster)
416  {
417  /*
418  * Get ready to sleep on the slot in case it is active if SAB_Block.
419  * (We may end up not sleeping, but we don't want to do this while
420  * holding the spinlock.)
421  */
422  if (behavior == SAB_Block)
424 
425  SpinLockAcquire(&s->mutex);
426  if (s->active_pid == 0)
427  s->active_pid = MyProcPid;
428  active_pid = s->active_pid;
429  SpinLockRelease(&s->mutex);
430  }
431  else
432  active_pid = MyProcPid;
433  LWLockRelease(ReplicationSlotControlLock);
434 
435  /*
436  * If we found the slot but it's already active in another process, we
437  * either error out, return the PID of the owning process, or retry
438  * after a short wait, as caller specified.
439  */
440  if (active_pid != MyProcPid)
441  {
442  if (behavior == SAB_Error)
443  ereport(ERROR,
444  (errcode(ERRCODE_OBJECT_IN_USE),
445  errmsg("replication slot \"%s\" is active for PID %d",
446  NameStr(s->data.name), active_pid)));
447  else if (behavior == SAB_Inquire)
448  return active_pid;
449 
450  /* Wait here until we get signaled, and then restart */
454  goto retry;
455  }
456  else if (behavior == SAB_Block)
457  ConditionVariableCancelSleep(); /* no sleep needed after all */
458 
459  /* Let everybody know we've modified this slot */
461 
462  /* We made this slot active, so it's ours now. */
463  MyReplicationSlot = s;
464 
465  /* success */
466  return 0;
467 }
468 
469 /*
470  * Release the replication slot that this backend considers itself to own.
471  *
472  * This or another backend can re-acquire the slot later.
473  * Resources this slot requires will be preserved.
474  */
475 void
477 {
479 
480  Assert(slot != NULL && slot->active_pid != 0);
481 
482  if (slot->data.persistency == RS_EPHEMERAL)
483  {
484  /*
485  * Delete the slot. There is no !PANIC case where this is allowed to
486  * fail, all that may happen is an incomplete cleanup of the on-disk
487  * data.
488  */
490  }
491 
492  /*
493  * If slot needed to temporarily restrain both data and catalog xmin to
494  * create the catalog snapshot, remove that temporary constraint.
495  * Snapshots can only be exported while the initial snapshot is still
496  * acquired.
497  */
498  if (!TransactionIdIsValid(slot->data.xmin) &&
500  {
501  SpinLockAcquire(&slot->mutex);
503  SpinLockRelease(&slot->mutex);
505  }
506 
507  if (slot->data.persistency == RS_PERSISTENT)
508  {
509  /*
510  * Mark persistent slot inactive. We're not freeing it, just
511  * disconnecting, but wake up others that may be waiting for it.
512  */
513  SpinLockAcquire(&slot->mutex);
514  slot->active_pid = 0;
515  SpinLockRelease(&slot->mutex);
517  }
518 
519  MyReplicationSlot = NULL;
520 
521  /* might not have been set when we've been a plain slot */
522  LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
525  LWLockRelease(ProcArrayLock);
526 }
527 
528 /*
529  * Cleanup all temporary slots created in current session.
530  */
531 void
533 {
534  int i;
535 
536  Assert(MyReplicationSlot == NULL);
537 
538 restart:
539  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
540  for (i = 0; i < max_replication_slots; i++)
541  {
542  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
543 
544  if (!s->in_use)
545  continue;
546 
547  SpinLockAcquire(&s->mutex);
548  if (s->active_pid == MyProcPid)
549  {
551  SpinLockRelease(&s->mutex);
552  LWLockRelease(ReplicationSlotControlLock); /* avoid deadlock */
553 
555 
557  goto restart;
558  }
559  else
560  SpinLockRelease(&s->mutex);
561  }
562 
563  LWLockRelease(ReplicationSlotControlLock);
564 }
565 
566 /*
567  * Permanently drop replication slot identified by the passed in name.
 *
 * name: name of the slot to drop
 * nowait: if true the slot is acquired with SAB_Error, so a slot owned by
 * another process is reported as an error; if false it is acquired with
 * SAB_Block, so we sleep until the owning process releases the slot.
568  */
569 void
570 ReplicationSlotDrop(const char *name, bool nowait)
571 {
 /* this backend must not already have a slot acquired */
572  Assert(MyReplicationSlot == NULL);
573 
 /* the return value is only useful for SAB_Inquire, so it is discarded */
574  (void) ReplicationSlotAcquire(name, nowait ? SAB_Error : SAB_Block);
575 
577 }
578 
579 /*
580  * Permanently drop the currently acquired replication slot.
581  */
582 static void
584 {
586 
587  Assert(MyReplicationSlot != NULL);
588 
589  /* slot isn't acquired anymore */
590  MyReplicationSlot = NULL;
591 
593 }
594 
595 /*
596  * Permanently drop the replication slot which will be released by the point
597  * this function returns.
598  */
599 static void
601 {
602  char path[MAXPGPATH];
603  char tmppath[MAXPGPATH];
604 
605  /*
606  * If some other backend ran this code concurrently with us, we might try
607  * to delete a slot with a certain name while someone else was trying to
608  * create a slot with the same name.
609  */
610  LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
611 
612  /* Generate pathnames. */
613  sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
614  sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
615 
616  /*
617  * Rename the slot directory on disk, so that we'll no longer recognize
618  * this as a valid slot. Note that if this fails, we've got to mark the
619  * slot inactive before bailing out. If we're dropping an ephemeral or a
620  * temporary slot, we better never fail hard as the caller won't expect
621  * the slot to survive and this might get called during error handling.
622  */
623  if (rename(path, tmppath) == 0)
624  {
625  /*
626  * We need to fsync() the directory we just renamed and its parent to
627  * make sure that our changes are on disk in a crash-safe fashion. If
628  * fsync() fails, we can't be sure whether the changes are on disk or
629  * not. For now, we handle that by panicking;
630  * StartupReplicationSlots() will try to straighten it out after
631  * restart.
632  */
634  fsync_fname(tmppath, true);
635  fsync_fname("pg_replslot", true);
637  }
638  else
639  {
640  bool fail_softly = slot->data.persistency != RS_PERSISTENT;
641 
642  SpinLockAcquire(&slot->mutex);
643  slot->active_pid = 0;
644  SpinLockRelease(&slot->mutex);
645 
646  /* wake up anyone waiting on this slot */
648 
649  ereport(fail_softly ? WARNING : ERROR,
651  errmsg("could not rename file \"%s\" to \"%s\": %m",
652  path, tmppath)));
653  }
654 
655  /*
656  * The slot is definitely gone. Lock out concurrent scans of the array
657  * long enough to kill it. It's OK to clear the active PID here without
658  * grabbing the mutex because nobody else can be scanning the array here,
659  * and nobody can be attached to this slot and thus access it without
660  * scanning the array.
661  *
662  * Also wake up processes waiting for it.
663  */
664  LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
665  slot->active_pid = 0;
666  slot->in_use = false;
667  LWLockRelease(ReplicationSlotControlLock);
669 
670  /*
671  * Slot is dead and doesn't prevent resource removal anymore, recompute
672  * limits.
673  */
676 
677  /*
678  * If removing the directory fails, the worst thing that will happen is
679  * that the user won't be able to create a new slot with the same name
680  * until the next server restart. We warn about it, but that's all.
681  */
682  if (!rmtree(tmppath, true))
684  (errmsg("could not remove directory \"%s\"", tmppath)));
685 
686  /*
687  * We release this at the very end, so that nobody starts trying to create
688  * a slot while we're still cleaning up the detritus of the old one.
689  */
690  LWLockRelease(ReplicationSlotAllocationLock);
691 }
692 
693 /*
694  * Serialize the currently acquired slot's state from memory to disk, thereby
695  * guaranteeing the current state will survive a crash.
696  */
697 void
699 {
700  char path[MAXPGPATH];
701 
702  Assert(MyReplicationSlot != NULL);
703 
704  sprintf(path, "pg_replslot/%s", NameStr(MyReplicationSlot->data.name));
705  SaveSlotToPath(MyReplicationSlot, path, ERROR);
706 }
707 
708 /*
709  * Signal that it would be useful if the currently acquired slot would be
710  * flushed out to disk.
711  *
712  * Note that the actual flush to disk can be delayed for a long time; if
713  * it is required for correctness, explicitly do a ReplicationSlotSave().
714  */
715 void
717 {
719 
720  Assert(MyReplicationSlot != NULL);
721 
722  SpinLockAcquire(&slot->mutex);
723  MyReplicationSlot->just_dirtied = true;
724  MyReplicationSlot->dirty = true;
725  SpinLockRelease(&slot->mutex);
726 }
727 
728 /*
729  * Convert a slot that's marked as RS_EPHEMERAL to a RS_PERSISTENT slot,
730  * guaranteeing it will be there after an eventual crash.
731  */
732 void
734 {
736 
737  Assert(slot != NULL);
739 
740  SpinLockAcquire(&slot->mutex);
742  SpinLockRelease(&slot->mutex);
743 
746 }
747 
748 /*
749  * Compute the oldest xmin across all slots and store it in the ProcArray.
750  *
751  * If already_locked is true, ProcArrayLock has already been acquired
752  * exclusively.
753  */
754 void
756 {
757  int i;
759  TransactionId agg_catalog_xmin = InvalidTransactionId;
760 
761  Assert(ReplicationSlotCtl != NULL);
762 
763  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
764 
765  for (i = 0; i < max_replication_slots; i++)
766  {
767  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
768  TransactionId effective_xmin;
769  TransactionId effective_catalog_xmin;
770 
771  if (!s->in_use)
772  continue;
773 
774  SpinLockAcquire(&s->mutex);
775  effective_xmin = s->effective_xmin;
776  effective_catalog_xmin = s->effective_catalog_xmin;
777  SpinLockRelease(&s->mutex);
778 
779  /* check the data xmin */
780  if (TransactionIdIsValid(effective_xmin) &&
781  (!TransactionIdIsValid(agg_xmin) ||
782  TransactionIdPrecedes(effective_xmin, agg_xmin)))
783  agg_xmin = effective_xmin;
784 
785  /* check the catalog xmin */
786  if (TransactionIdIsValid(effective_catalog_xmin) &&
787  (!TransactionIdIsValid(agg_catalog_xmin) ||
788  TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin)))
789  agg_catalog_xmin = effective_catalog_xmin;
790  }
791 
792  LWLockRelease(ReplicationSlotControlLock);
793 
794  ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked);
795 }
796 
797 /*
798  * Compute the oldest restart LSN across all slots and inform xlog module.
799  *
800  * Note: while max_slot_wal_keep_size is theoretically relevant for this
801  * purpose, we don't try to account for that, because this module doesn't
802  * know what to compare against.
803  */
804 void
806 {
807  int i;
808  XLogRecPtr min_required = InvalidXLogRecPtr;
809 
810  Assert(ReplicationSlotCtl != NULL);
811 
812  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
813  for (i = 0; i < max_replication_slots; i++)
814  {
815  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
816  XLogRecPtr restart_lsn;
817 
818  if (!s->in_use)
819  continue;
820 
821  SpinLockAcquire(&s->mutex);
822  restart_lsn = s->data.restart_lsn;
823  SpinLockRelease(&s->mutex);
824 
825  if (restart_lsn != InvalidXLogRecPtr &&
826  (min_required == InvalidXLogRecPtr ||
827  restart_lsn < min_required))
828  min_required = restart_lsn;
829  }
830  LWLockRelease(ReplicationSlotControlLock);
831 
832  XLogSetReplicationSlotMinimumLSN(min_required);
833 }
834 
835 /*
836  * Compute the oldest WAL LSN required by *logical* decoding slots.
837  *
838  * Returns InvalidXLogRecPtr if logical decoding is disabled or no logical
839  * slots exist.
840  *
841  * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it
842  * ignores physical replication slots.
843  *
844  * The results aren't required frequently, so we don't maintain a precomputed
845  * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin().
846  */
849 {
850  XLogRecPtr result = InvalidXLogRecPtr;
851  int i;
852 
853  if (max_replication_slots <= 0)
854  return InvalidXLogRecPtr;
855 
856  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
857 
858  for (i = 0; i < max_replication_slots; i++)
859  {
860  ReplicationSlot *s;
861  XLogRecPtr restart_lsn;
862 
863  s = &ReplicationSlotCtl->replication_slots[i];
864 
865  /* cannot change while ReplicationSlotControlLock is held */
866  if (!s->in_use)
867  continue;
868 
869  /* we're only interested in logical slots */
870  if (!SlotIsLogical(s))
871  continue;
872 
873  /* read once, it's ok if it increases while we're checking */
874  SpinLockAcquire(&s->mutex);
875  restart_lsn = s->data.restart_lsn;
876  SpinLockRelease(&s->mutex);
877 
878  if (restart_lsn == InvalidXLogRecPtr)
879  continue;
880 
881  if (result == InvalidXLogRecPtr ||
882  restart_lsn < result)
883  result = restart_lsn;
884  }
885 
886  LWLockRelease(ReplicationSlotControlLock);
887 
888  return result;
889 }
890 
891 /*
892  * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the
893  * passed database oid.
894  *
895  * Returns true if there are any slots referencing the database. *nslots will
896  * be set to the absolute number of slots in the database, *nactive to ones
897  * currently active.
898  */
899 bool
900 ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
901 {
902  int i;
903 
904  *nslots = *nactive = 0;
905 
906  if (max_replication_slots <= 0)
907  return false;
908 
909  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
910  for (i = 0; i < max_replication_slots; i++)
911  {
912  ReplicationSlot *s;
913 
914  s = &ReplicationSlotCtl->replication_slots[i];
915 
916  /* cannot change while ReplicationSlotCtlLock is held */
917  if (!s->in_use)
918  continue;
919 
920  /* only logical slots are database specific, skip */
921  if (!SlotIsLogical(s))
922  continue;
923 
924  /* not our database, skip */
925  if (s->data.database != dboid)
926  continue;
927 
928  /* count slots with spinlock held */
929  SpinLockAcquire(&s->mutex);
930  (*nslots)++;
931  if (s->active_pid != 0)
932  (*nactive)++;
933  SpinLockRelease(&s->mutex);
934  }
935  LWLockRelease(ReplicationSlotControlLock);
936 
937  if (*nslots > 0)
938  return true;
939  return false;
940 }
941 
942 /*
943  * ReplicationSlotsDropDBSlots -- Drop all db-specific slots relating to the
944  * passed database oid. The caller should hold an exclusive lock on the
945  * pg_database oid for the database to prevent creation of new slots on the db
946  * or replay from existing slots.
947  *
948  * Another session that concurrently acquires an existing slot on the target DB
949  * (most likely to drop it) may cause this function to ERROR. If that happens
950  * it may have dropped some but not all slots.
951  *
952  * This routine isn't as efficient as it could be - but we don't drop
953  * databases often, especially databases with lots of slots.
954  */
955 void
957 {
958  int i;
959 
960  if (max_replication_slots <= 0)
961  return;
962 
963 restart:
964  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
965  for (i = 0; i < max_replication_slots; i++)
966  {
967  ReplicationSlot *s;
968  char *slotname;
969  int active_pid;
970 
971  s = &ReplicationSlotCtl->replication_slots[i];
972 
973  /* cannot change while ReplicationSlotControlLock is held */
974  if (!s->in_use)
975  continue;
976 
977  /* only logical slots are database specific, skip */
978  if (!SlotIsLogical(s))
979  continue;
980 
981  /* not our database, skip */
982  if (s->data.database != dboid)
983  continue;
984 
985  /* acquire slot, so ReplicationSlotDropAcquired can be reused */
986  SpinLockAcquire(&s->mutex);
987  /* can't change while ReplicationSlotControlLock is held */
988  slotname = NameStr(s->data.name);
989  active_pid = s->active_pid;
990  if (active_pid == 0)
991  {
992  MyReplicationSlot = s;
993  s->active_pid = MyProcPid;
994  }
995  SpinLockRelease(&s->mutex);
996 
997  /*
998  * Even though we hold an exclusive lock on the database object a
999  * logical slot for that DB can still be active, e.g. if it's
1000  * concurrently being dropped by a backend connected to another DB.
1001  *
1002  * That's fairly unlikely in practice, so we'll just bail out.
1003  */
1004  if (active_pid)
1005  ereport(ERROR,
1006  (errcode(ERRCODE_OBJECT_IN_USE),
1007  errmsg("replication slot \"%s\" is active for PID %d",
1008  slotname, active_pid)));
1009 
1010  /*
1011  * To avoid duplicating ReplicationSlotDropAcquired() and to avoid
1012  * holding ReplicationSlotControlLock over filesystem operations,
1013  * release ReplicationSlotControlLock and use
1014  * ReplicationSlotDropAcquired.
1015  *
1016  * As that means the set of slots could change, restart scan from the
1017  * beginning each time we release the lock.
1018  */
1019  LWLockRelease(ReplicationSlotControlLock);
1021  goto restart;
1022  }
1023  LWLockRelease(ReplicationSlotControlLock);
1024 }
1025 
1026 
1027 /*
1028  * Check whether the server's configuration supports using replication
1029  * slots.
1030  */
1031 void
1033 {
1034  /*
1035  * NB: Adding a new requirement likely means that RestoreSlotFromDisk()
1036  * needs the same check.
1037  */
1038 
1039  if (max_replication_slots == 0)
1040  ereport(ERROR,
1041  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1042  errmsg("replication slots can only be used if max_replication_slots > 0")));
1043 
1045  ereport(ERROR,
1046  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1047  errmsg("replication slots can only be used if wal_level >= replica")));
1048 }
1049 
1050 /*
1051  * Reserve WAL for the currently active slot.
1052  *
1053  * Compute and set restart_lsn in a manner that's appropriate for the type of
1054  * the slot and concurrency safe.
1055  */
1056 void
1058 {
1060 
1061  Assert(slot != NULL);
1063 
1064  /*
1065  * The replication slot mechanism is used to prevent removal of required
1066  * WAL. As there is no interlock between this routine and checkpoints, WAL
1067  * segments could concurrently be removed when a now stale return value of
1068  * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that
1069  * this happens we'll just retry.
1070  */
1071  while (true)
1072  {
1073  XLogSegNo segno;
1074  XLogRecPtr restart_lsn;
1075 
1076  /*
1077  * For logical slots log a standby snapshot and start logical decoding
1078  * at exactly that position. That allows the slot to start up more
1079  * quickly.
1080  *
1081  * That's not needed (or indeed helpful) for physical slots as they'll
1082  * start replay at the last logged checkpoint anyway. Instead return
1083  * the location of the last redo LSN. While that slightly increases
1084  * the chance that we have to retry, it's where a base backup has to
1085  * start replay at.
1086  */
1087  if (!RecoveryInProgress() && SlotIsLogical(slot))
1088  {
1089  XLogRecPtr flushptr;
1090 
1091  /* start at current insert position */
1092  restart_lsn = GetXLogInsertRecPtr();
1093  SpinLockAcquire(&slot->mutex);
1094  slot->data.restart_lsn = restart_lsn;
1095  SpinLockRelease(&slot->mutex);
1096 
1097  /* make sure we have enough information to start */
1098  flushptr = LogStandbySnapshot();
1099 
1100  /* and make sure it's fsynced to disk */
1101  XLogFlush(flushptr);
1102  }
1103  else
1104  {
1105  restart_lsn = GetRedoRecPtr();
1106  SpinLockAcquire(&slot->mutex);
1107  slot->data.restart_lsn = restart_lsn;
1108  SpinLockRelease(&slot->mutex);
1109  }
1110 
1111  /* prevent WAL removal as fast as possible */
1113 
1114  /*
1115  * If all required WAL is still there, great, otherwise retry. The
1116  * slot should prevent further removal of WAL, unless there's a
1117  * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
1118  * the new restart_lsn above, so normally we should never need to loop
1119  * more than twice.
1120  */
1122  if (XLogGetLastRemovedSegno() < segno)
1123  break;
1124  }
1125 }
1126 
1127 /*
1128  * Mark any slot that points to an LSN older than the given segment
1129  * as invalid; it requires WAL that's about to be removed.
1130  *
 * The slot array is scanned while holding ReplicationSlotControlLock in
 * shared mode. A slot is a candidate when it is in use and its restart_lsn
 * is valid but lies before the start of segment oldestSegno. For each
 * candidate the control lock is released, the slot is acquired (sending
 * SIGTERM to the process that currently owns it, if any), the invalidation
 * is logged, and the scan is restarted from the beginning.
 *
1131  * NB - this runs as part of checkpoint, so avoid raising errors if possible.
1132  */
1133 void
1135 {
1136  XLogRecPtr oldestLSN;
1137 
 /* oldestLSN is the position at offset 0 of segment oldestSegno */
1138  XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
1139 
1140 restart:
1141  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1142  for (int i = 0; i < max_replication_slots; i++)
1143  {
1144  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1145  XLogRecPtr restart_lsn = InvalidXLogRecPtr;
1146  NameData slotname;
1147  int wspid;
1148  int last_signaled_pid = 0;
1149 
1150  if (!s->in_use)
1151  continue;
1152 
 /* copy the name and restart_lsn while holding the slot's mutex */
1153  SpinLockAcquire(&s->mutex);
1154  slotname = s->data.name;
1155  restart_lsn = s->data.restart_lsn;
1156  SpinLockRelease(&s->mutex);
1157 
 /* skip slots without a restart_lsn, or whose restart_lsn is new enough */
1158  if (XLogRecPtrIsInvalid(restart_lsn) || restart_lsn >= oldestLSN)
1159  continue;
1160  LWLockRelease(ReplicationSlotControlLock);
1162 
1163  /* Get ready to sleep on the slot in case it is active */
1165 
1166  for (;;)
1167  {
1168  /*
1169  * Try to mark this slot as used by this process.
1170  *
1171  * Note that ReplicationSlotAcquireInternal(SAB_Inquire)
1172  * should not cancel the prepared condition variable
1173  * if this slot is active in another process, because in that case
1174  * we have to wait on that CV later, until the process owning
1175  * the slot has been terminated.
1176  */
1177  wspid = ReplicationSlotAcquireInternal(s, NULL, SAB_Inquire);
1178 
1179  /*
1180  * Exit the loop if we successfully acquired the slot or
1181  * the slot was dropped while we were waiting for the owning process
1182  * to be terminated. For example, the latter case is likely to
1183  * happen when the slot is temporary because it's automatically
1184  * dropped by the termination of the owning process.
1185  */
1186  if (wspid <= 0)
1187  break;
1188 
1189  /*
1190  * Signal to terminate the process that owns the slot.
1191  *
1192  * There is a race condition where another process may acquire
1193  * the slot after the process using it was terminated and before
1194  * this process acquires it. To handle this case, we signal again
1195  * if the owning process's PID differs from the one last signaled.
1196  *
1197  * XXX This logic assumes that the same PID is not reused
1198  * very quickly.
1199  */
1200  if (last_signaled_pid != wspid)
1201  {
1202  ereport(LOG,
1203  (errmsg("terminating process %d because replication slot \"%s\" is too far behind",
1204  wspid, NameStr(slotname))));
1205  (void) kill(wspid, SIGTERM);
1206  last_signaled_pid = wspid;
1207  }
1208 
1211  }
1213 
1214  /*
1215  * Do nothing here and start from scratch if the slot has
1216  * already been dropped.
1217  */
1218  if (wspid == -1)
1219  goto restart;
1220 
1221  ereport(LOG,
1222  (errmsg("invalidating slot \"%s\" because its restart_lsn %X/%X exceeds max_slot_wal_keep_size",
1223  NameStr(slotname),
1224  (uint32) (restart_lsn >> 32),
1225  (uint32) restart_lsn)));
1226 
1227  SpinLockAcquire(&s->mutex);
1230  SpinLockRelease(&s->mutex);
1231 
1232  /* Make sure the invalidated state persists across server restart */
1236 
1237  /* if we did anything, start from scratch */
1238  goto restart;
1239  }
1240  LWLockRelease(ReplicationSlotControlLock);
1241 }
1242 
1243 /*
1244  * Flush all replication slots to disk.
1245  *
1246  * This needn't actually be part of a checkpoint, but it's a convenient
1247  * location.
 *
 * Every in-use slot is handed to SaveSlotToPath() with elevel LOG, so a
 * failure to write one slot is only logged. SaveSlotToPath() itself returns
 * without writing anything for slots that are not marked dirty.
1248  */
1249 void
1251 {
1252  int i;
1253 
1254  elog(DEBUG1, "performing replication slot checkpoint");
1255 
1256  /*
1257  * Prevent any slot from being created/dropped while we're active. As we
1258  * explicitly do *not* want to block iterating over replication_slots or
1259  * acquiring a slot we cannot take the control lock - but that's OK,
1260  * because holding ReplicationSlotAllocationLock is strictly stronger, and
1261  * enough to guarantee that nobody can change the in_use bits on us.
 * Shared mode suffices here, because allocating or freeing a slot
 * requires this lock in exclusive mode.
1262  */
1263  LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED);
1264 
1265  for (i = 0; i < max_replication_slots; i++)
1266  {
1267  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1268  char path[MAXPGPATH];
1269 
1270  if (!s->in_use)
1271  continue;
1272 
1273  /* save the slot to disk, locking is handled in SaveSlotToPath() */
1274  sprintf(path, "pg_replslot/%s", NameStr(s->data.name));
1275  SaveSlotToPath(s, path, LOG);
1276  }
1277  LWLockRelease(ReplicationSlotAllocationLock);
1278 }
1279 
1280 /*
1281  * Load all replication slots from disk into memory at server startup. This
1282  * needs to be run before we start crash recovery.
 *
 * Each directory below pg_replslot is examined: directories whose name ends
 * in ".tmp" are removed, all others are passed to RestoreSlotFromDisk().
1283  */
1284 void
1286 {
1287  DIR *replication_dir;
1288  struct dirent *replication_de;
1289 
1290  elog(DEBUG1, "starting up replication slots");
1291 
1292  /* restore all slots by iterating over all on-disk entries */
1293  replication_dir = AllocateDir("pg_replslot");
1294  while ((replication_de = ReadDir(replication_dir, "pg_replslot")) != NULL)
1295  {
1296  struct stat statbuf;
1297  char path[MAXPGPATH + 12];
1298 
1299  if (strcmp(replication_de->d_name, ".") == 0 ||
1300  strcmp(replication_de->d_name, "..") == 0)
1301  continue;
1302 
1303  snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name);
1304 
1305  /*
 * We only ever create directories here, so skip anything that is not
 * one (it isn't ours). Note that an entry is not skipped when lstat()
 * itself fails.
 */
1306  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
1307  continue;
1308 
1309  /* we crashed while a slot was being set up or deleted, clean up */
1310  if (pg_str_endswith(replication_de->d_name, ".tmp"))
1311  {
1312  if (!rmtree(path, true))
1313  {
1314  ereport(WARNING,
1315  (errmsg("could not remove directory \"%s\"",
1316  path)));
1317  continue;
1318  }
 /* make the removal of the leftover directory durable */
1319  fsync_fname("pg_replslot", true);
1320  continue;
1321  }
1322 
1323  /* looks like a slot in a normal state, restore */
1324  RestoreSlotFromDisk(replication_de->d_name);
1325  }
1326  FreeDir(replication_dir);
1327 
1328  /* no slots can exist when max_replication_slots <= 0, so we're done */
1329  if (max_replication_slots <= 0)
1330  return;
1331 
1332  /* Now that we have recovered all the data, compute replication xmin */
1335 }
1336 
1337 /* ----
1338  * Manipulation of on-disk state of replication slots
1339  *
1340  * NB: none of the routines below should take any notice whether a slot is the
1341  * current one or not, that's all handled a layer above.
1342  * ----
1343  */
/*
 * Create the on-disk representation of a slot.
 *
 * The slot's directory is first built as pg_replslot/<name>.tmp and the
 * state file is written into it; only then is the directory renamed to
 * pg_replslot/<name> and the result fsynced. Failures up to and including
 * the rename are reported at ERROR level.
 */
1344 static void
1346 {
1347  char tmppath[MAXPGPATH];
1348  char path[MAXPGPATH];
1349  struct stat st;
1350 
1351  /*
1352  * No need to take out the io_in_progress_lock, nobody else can see this
1353  * slot yet, so nobody else will write. We're reusing SaveSlotToPath, which
1354  * takes out the lock; if we took the lock here, we'd deadlock.
1355  */
1356 
1357  sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
1358  sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
1359 
1360  /*
1361  * It's just barely possible that some previous effort to create or drop a
1362  * slot with this name left a temp directory lying around. If that seems
1363  * to be the case, try to remove it. If the rmtree() fails, we'll error
1364  * out at the MakePGDirectory() below, so we don't bother checking
1365  * success.
1366  */
1367  if (stat(tmppath, &st) == 0 && S_ISDIR(st.st_mode))
1368  rmtree(tmppath, true);
1369 
1370  /* Create and fsync the temporary slot directory. */
1371  if (MakePGDirectory(tmppath) < 0)
1372  ereport(ERROR,
1374  errmsg("could not create directory \"%s\": %m",
1375  tmppath)));
1376  fsync_fname(tmppath, true);
1377 
1378  /* Write the actual state file. */
1379  slot->dirty = true; /* signal that we really need to write */
1380  SaveSlotToPath(slot, tmppath, ERROR);
1381 
1382  /* Rename the directory into place. */
1383  if (rename(tmppath, path) != 0)
1384  ereport(ERROR,
1386  errmsg("could not rename file \"%s\" to \"%s\": %m",
1387  tmppath, path)));
1388 
1389  /*
1390  * If we failed now - really unlikely - we wouldn't know whether this slot
1391  * would persist after an OS crash or not - so, force a restart. The
1392  * restart would try to fsync this again till it works.
1393  */
1395 
 /* fsync the renamed slot directory, then its parent directory */
1396  fsync_fname(path, true);
1397  fsync_fname("pg_replslot", true);
1398 
1399  END_CRIT_SECTION();
1400 }
1401 
1402 /*
1403  * Shared functionality between saving and creating a replication slot.
 *
 * Writes a copy of slot->data, together with magic, version and CRC, to
 * "<dir>/state.tmp", fsyncs and closes it, renames it to "<dir>/state", and
 * then fsyncs the state file, "dir" and pg_replslot. Nothing is written if
 * the slot is not marked dirty.
 *
 * Failures up to and including the rename are reported via ereport(elevel).
 * If that call returns, the function returns early and leaves the slot's
 * dirty flag set.
1404  */
1405 static void
1406 SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
1407 {
1408  char tmppath[MAXPGPATH];
1409  char path[MAXPGPATH];
1410  int fd;
1412  bool was_dirty;
1413 
1414  /*
 * First check whether there's something to write out. just_dirtied is
 * reset here so that the check at the end of this function can tell
 * whether the slot was dirtied again while we were writing.
 */
1415  SpinLockAcquire(&slot->mutex);
1416  was_dirty = slot->dirty;
1417  slot->just_dirtied = false;
1418  SpinLockRelease(&slot->mutex);
1419 
1420  /* and don't do anything if there's nothing to write */
1421  if (!was_dirty)
1422  return;
1423 
1425 
1426  /* silence valgrind :( */
1427  memset(&cp, 0, sizeof(ReplicationSlotOnDisk));
1428 
1429  sprintf(tmppath, "%s/state.tmp", dir);
1430  sprintf(path, "%s/state", dir);
1431 
 /* O_EXCL: creation fails if a state.tmp file already exists */
1432  fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
1433  if (fd < 0)
1434  {
1435  /*
1436  * If not an ERROR, then release the lock before returning. In case
1437  * of an ERROR, the error recovery path automatically releases the
1438  * lock, but no harm in explicitly releasing even in that case. Note
1439  * that LWLockRelease() could affect errno.
1440  */
1441  int save_errno = errno;
1442 
1444  errno = save_errno;
1445  ereport(elevel,
1447  errmsg("could not create file \"%s\": %m",
1448  tmppath)));
1449  return;
1450  }
1451 
1452  cp.magic = SLOT_MAGIC;
1453  INIT_CRC32C(cp.checksum);
1454  cp.version = SLOT_VERSION;
1456 
 /* take a consistent copy of the persistent data under the slot's mutex */
1457  SpinLockAcquire(&slot->mutex);
1458 
1459  memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData));
1460 
1461  SpinLockRelease(&slot->mutex);
1462 
1463  COMP_CRC32C(cp.checksum,
1464  (char *) (&cp) + SnapBuildOnDiskNotChecksummedSize,
1466  FIN_CRC32C(cp.checksum);
1467 
 /* clear errno so a short write that doesn't set it can be detected */
1468  errno = 0;
1470  if ((write(fd, &cp, sizeof(cp))) != sizeof(cp))
1471  {
1472  int save_errno = errno;
1473 
1475  CloseTransientFile(fd);
1477 
1478  /* if write didn't set errno, assume problem is no disk space */
1479  errno = save_errno ? save_errno : ENOSPC;
1480  ereport(elevel,
1482  errmsg("could not write to file \"%s\": %m",
1483  tmppath)));
1484  return;
1485  }
1487 
1488  /* fsync the temporary file */
1490  if (pg_fsync(fd) != 0)
1491  {
1492  int save_errno = errno;
1493 
1495  CloseTransientFile(fd);
1497  errno = save_errno;
1498  ereport(elevel,
1500  errmsg("could not fsync file \"%s\": %m",
1501  tmppath)));
1502  return;
1503  }
1505 
1506  if (CloseTransientFile(fd) != 0)
1507  {
1508  int save_errno = errno;
1509 
1511  errno = save_errno;
1512  ereport(elevel,
1514  errmsg("could not close file \"%s\": %m",
1515  tmppath)));
1516  return;
1517  }
1518 
1519  /* rename to permanent file, fsync file and directory */
1520  if (rename(tmppath, path) != 0)
1521  {
1522  int save_errno = errno;
1523 
1525  errno = save_errno;
1526  ereport(elevel,
1528  errmsg("could not rename file \"%s\" to \"%s\": %m",
1529  tmppath, path)));
1530  return;
1531  }
1532 
1533  /*
1534  * Check CreateSlotOnDisk() for the reasoning of using a critical section.
1535  */
1537 
1538  fsync_fname(path, false);
1539  fsync_fname(dir, true);
1540  fsync_fname("pg_replslot", true);
1541 
1542  END_CRIT_SECTION();
1543 
1544  /*
1545  * Successfully wrote, unset dirty bit, unless somebody dirtied again
1546  * already.
1547  */
1548  SpinLockAcquire(&slot->mutex);
1549  if (!slot->just_dirtied)
1550  slot->dirty = false;
1551  SpinLockRelease(&slot->mutex);
1552 
1554 }
1555 
1556 /*
1557  * Load a single slot from disk into memory.
 *
 * "name" is the name of the slot's directory below pg_replslot. Any leftover
 * state.tmp file is removed, then the state file is fsynced, read and
 * validated (magic, version, length and CRC). I/O and validation failures are
 * reported at PANIC level. The validated data is copied into the first unused
 * entry of the slot array. A wal_level too low for the slot, or the absence
 * of an unused entry, is reported at FATAL level.
1558  */
1559 static void
1561 {
1563  int i;
1564  char slotdir[MAXPGPATH + 12];
1565  char path[MAXPGPATH + 22];
1566  int fd;
1567  bool restored = false;
1568  int readBytes;
1570 
1571  /* no need to lock here, no concurrent access allowed yet */
1572 
1573  /* delete temp file if it exists */
1574  sprintf(slotdir, "pg_replslot/%s", name);
1575  sprintf(path, "%s/state.tmp", slotdir);
1576  if (unlink(path) < 0 && errno != ENOENT)
1577  ereport(PANIC,
1579  errmsg("could not remove file \"%s\": %m", path)));
1580 
1581  sprintf(path, "%s/state", slotdir);
1582 
1583  elog(DEBUG1, "restoring replication slot from \"%s\"", path);
1584 
1585  /* on some operating systems fsyncing a file requires O_RDWR */
1586  fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1587 
1588  /*
1589  * We do not need to handle a missing state file gracefully, as the slot
1590  * directory is rename()d into place only after the state file was fsync()ed.
1591  */
1592  if (fd < 0)
1593  ereport(PANIC,
1595  errmsg("could not open file \"%s\": %m", path)));
1596 
1597  /*
1598  * Sync state file before we're reading from it. We might have crashed
1599  * while it wasn't synced yet and we shouldn't continue on that basis.
1600  */
1602  if (pg_fsync(fd) != 0)
1603  ereport(PANIC,
1605  errmsg("could not fsync file \"%s\": %m",
1606  path)));
1608 
1609  /* Also sync the parent directory */
1611  fsync_fname(slotdir, true);
1612  END_CRIT_SECTION();
1613 
1614  /* read part of statefile that's guaranteed to be version independent */
1616  readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize);
1618  if (readBytes != ReplicationSlotOnDiskConstantSize)
1619  {
1620  if (readBytes < 0)
1621  ereport(PANIC,
1623  errmsg("could not read file \"%s\": %m", path)));
1624  else
1625  ereport(PANIC,
1627  errmsg("could not read file \"%s\": read %d of %zu",
1628  path, readBytes,
1630  }
1631 
1632  /* verify magic */
1633  if (cp.magic != SLOT_MAGIC)
1634  ereport(PANIC,
1636  errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u",
1637  path, cp.magic, SLOT_MAGIC)));
1638 
1639  /* verify version */
1640  if (cp.version != SLOT_VERSION)
1641  ereport(PANIC,
1643  errmsg("replication slot file \"%s\" has unsupported version %u",
1644  path, cp.version)));
1645 
1646  /* boundary check on length */
1648  ereport(PANIC,
1650  errmsg("replication slot file \"%s\" has corrupted length %u",
1651  path, cp.length)));
1652 
1653  /* Now that we know the size, read the remaining cp.length bytes */
1655  readBytes = read(fd,
1656  (char *) &cp + ReplicationSlotOnDiskConstantSize,
1657  cp.length);
1659  if (readBytes != cp.length)
1660  {
1661  if (readBytes < 0)
1662  ereport(PANIC,
1664  errmsg("could not read file \"%s\": %m", path)));
1665  else
1666  ereport(PANIC,
1668  errmsg("could not read file \"%s\": read %d of %zu",
1669  path, readBytes, (Size) cp.length)));
1670  }
1671 
1672  if (CloseTransientFile(fd) != 0)
1673  ereport(PANIC,
1675  errmsg("could not close file \"%s\": %m", path)));
1676 
1677  /* now verify the CRC */
1678  INIT_CRC32C(checksum);
1679  COMP_CRC32C(checksum,
1680  (char *) &cp + SnapBuildOnDiskNotChecksummedSize,
1682  FIN_CRC32C(checksum);
1683 
1684  if (!EQ_CRC32C(checksum, cp.checksum))
1685  ereport(PANIC,
1686  (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u",
1687  path, checksum, cp.checksum)));
1688 
1689  /*
1690  * If we crashed with an ephemeral slot active, don't restore but delete
1691  * it.
1692  */
1694  {
1695  if (!rmtree(slotdir, true))
1696  {
1697  ereport(WARNING,
1698  (errmsg("could not remove directory \"%s\"",
1699  slotdir)));
1700  }
1701  fsync_fname("pg_replslot", true);
1702  return;
1703  }
1704 
1705  /*
1706  * Verify that requirements for the specific slot type are met. That's
1707  * important because if these aren't met we're not guaranteed to retain
1708  * all the necessary resources for the slot.
1709  *
1710  * NB: We have to do so *after* the above checks for ephemeral slots,
1711  * because otherwise a slot that shouldn't exist anymore could prevent
1712  * restarts.
1713  *
1714  * NB: Changing the requirements here also requires adapting
1715  * CheckSlotRequirements() and CheckLogicalDecodingRequirements().
1716  */
1718  ereport(FATAL,
1719  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1720  errmsg("logical replication slot \"%s\" exists, but wal_level < logical",
1721  NameStr(cp.slotdata.name)),
1722  errhint("Change wal_level to be logical or higher.")));
1723  else if (wal_level < WAL_LEVEL_REPLICA)
1724  ereport(FATAL,
1725  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1726  errmsg("physical replication slot \"%s\" exists, but wal_level < replica",
1727  NameStr(cp.slotdata.name)),
1728  errhint("Change wal_level to be replica or higher.")));
1729 
1730  /* nothing can be active yet, don't lock anything */
1731  for (i = 0; i < max_replication_slots; i++)
1732  {
1733  ReplicationSlot *slot;
1734 
1735  slot = &ReplicationSlotCtl->replication_slots[i];
1736 
 /* look for the first entry that is not yet in use */
1737  if (slot->in_use)
1738  continue;
1739 
1740  /* restore the entire set of persistent data */
1741  memcpy(&slot->data, &cp.slotdata,
1743 
1744  /* initialize in memory state */
1745  slot->effective_xmin = cp.slotdata.xmin;
1747 
1752 
1753  slot->in_use = true;
1754  slot->active_pid = 0;
1755 
1756  restored = true;
1757  break;
1758  }
1759 
 /* every entry of the slot array was already in use */
1760  if (!restored)
1761  ereport(FATAL,
1762  (errmsg("too many replication slots active before shutdown"),
1763  errhint("Increase max_replication_slots and try again.")));
1764 }
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
static void RestoreSlotFromDisk(const char *name)
Definition: slot.c:1560
ReplicationSlotCtlData * ReplicationSlotCtl
Definition: slot.c:93
void CheckSlotRequirements(void)
Definition: slot.c:1032
TransactionId candidate_catalog_xmin
Definition: slot.h:158
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
bool LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
Definition: lwlock.c:1946
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:40
int errhint(const char *fmt,...)
Definition: elog.c:1071
Size ReplicationSlotsShmemSize(void)
Definition: slot.c:117
#define PROC_IN_LOGICAL_DECODING
Definition: proc.h:57
int wal_segment_size
Definition: xlog.c:117
uint32 TransactionId
Definition: c.h:520
bool pg_str_endswith(const char *str, const char *end)
Definition: string.c:31
void namestrcpy(Name name, const char *str)
Definition: name.c:233
#define write(a, b, c)
Definition: win32.h:14
PGPROC * MyProc
Definition: proc.c:67
#define SLOT_MAGIC
Definition: slot.c:89
uint32 pg_crc32c
Definition: pg_crc32c.h:38
int wal_level
Definition: xlog.c:107
#define SpinLockInit(lock)
Definition: spin.h:60
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:633
void ReplicationSlotCreate(const char *name, bool db_specific, ReplicationSlotPersistency persistency)
Definition: slot.c:222
ReplicationSlotPersistency persistency
Definition: slot.h:61
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
void ConditionVariableBroadcast(ConditionVariable *cv)
int errcode(int sqlerrcode)
Definition: elog.c:610
PROC_HDR * ProcGlobal
Definition: proc.c:79
#define MemSet(start, val, len)
Definition: c.h:949
#define kill(pid, sig)
Definition: win32_port.h:426
void ReplicationSlotSave(void)
Definition: slot.c:698
#define SnapBuildOnDiskNotChecksummedSize
Definition: slot.c:80
static void ReplicationSlotDropPtr(ReplicationSlot *slot)
Definition: slot.c:600
ReplicationSlotPersistentData data
Definition: slot.h:143
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
bool RecoveryInProgress(void)
Definition: xlog.c:8074
Definition: dirent.h:9
#define PANIC
Definition: elog.h:53
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2845
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define PG_BINARY
Definition: c.h:1211
static void CreateSlotOnDisk(ReplicationSlot *slot)
Definition: slot.c:1345
void ReplicationSlotsShmemInit(void)
Definition: slot.c:135
static void SaveSlotToPath(ReplicationSlot *slot, const char *path, int elevel)
Definition: slot.c:1406
void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
Definition: xlog.c:2724
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1812
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
#define NAMEDATALEN
#define sprintf
Definition: port.h:195
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:175
#define SpinLockAcquire(lock)
Definition: spin.h:62
void ConditionVariableInit(ConditionVariable *cv)
XLogSegNo XLogGetLastRemovedSegno(void)
Definition: xlog.c:3957
void ReplicationSlotReserveWal(void)
Definition: slot.c:1057
static void ReplicationSlotDropAcquired(void)
Definition: slot.c:583
void ReplicationSlotsComputeRequiredLSN(void)
Definition: slot.c:805
ReplicationSlotPersistentData slotdata
Definition: slot.c:73
static ReplicationSlot * SearchNamedReplicationSlot(const char *name)
Definition: slot.c:335
void ConditionVariableCancelSleep(void)
XLogRecPtr LogStandbySnapshot(void)
Definition: standby.c:923
Definition: dirent.c:25
#define ERROR
Definition: elog.h:43
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2372
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:392
#define FATAL
Definition: elog.h:52
XLogRecPtr GetXLogInsertRecPtr(void)
Definition: xlog.c:11507
#define MAXPGPATH
Definition: slot.h:43
void ReplicationSlotPersist(void)
Definition: slot.c:733
TransactionId effective_xmin
Definition: slot.h:139
Definition: c.h:616
XLogRecPtr candidate_restart_valid
Definition: slot.h:160
void StartupReplicationSlots(void)
Definition: slot.c:1285
bool IsUnderPostmaster
Definition: globals.c:109
uint64 XLogSegNo
Definition: xlogdefs.h:41
SlotAcquireBehavior
Definition: slot.h:40
int errcode_for_file_access(void)
Definition: elog.c:633
XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void)
Definition: slot.c:848
TransactionId catalog_xmin
Definition: slot.h:77
#define InvalidTransactionId
Definition: transam.h:31
unsigned int uint32
Definition: c.h:374
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2583
void ReplicationSlotRelease(void)
Definition: slot.c:476
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1386
TransactionId xmin
Definition: slot.h:69
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define SlotIsLogical(slot)
Definition: slot.h:165
#define AssertArg(condition)
Definition: c.h:747
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:300
pg_crc32c checksum
Definition: slot.c:62
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:45
struct ReplicationSlotOnDisk ReplicationSlotOnDisk
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:745
int CloseTransientFile(int fd)
Definition: fd.c:2549
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define WARNING
Definition: elog.h:40
#define stat(a, b)
Definition: win32_port.h:255
bool rmtree(const char *path, bool rmtopdir)
Definition: rmtree.c:42
bool in_use
Definition: slot.h:119
static int elevel
Definition: vacuumlazy.c:333
#define SpinLockRelease(lock)
Definition: spin.h:64
Size mul_size(Size s1, Size s2)
Definition: shmem.c:515
bool just_dirtied
Definition: slot.h:125
Size add_size(Size s1, Size s2)
Definition: shmem.c:498
TransactionId effective_catalog_xmin
Definition: slot.h:140
Oid MyDatabaseId
Definition: globals.c:85
#define SLOT_VERSION
Definition: slot.c:90
#define InvalidOid
Definition: postgres_ext.h:36
#define ereport(elevel,...)
Definition: elog.h:144
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3580
ReplicationSlot * MyReplicationSlot
Definition: slot.c:96
int max_replication_slots
Definition: slot.c:99
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
#define XLogSegNoOffsetToRecPtr(segno, offset, wal_segsz_bytes, dest)
#define ReplicationSlotOnDiskV2Size
Definition: slot.c:86
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:745
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2649
XLogRecPtr restart_lsn
Definition: slot.h:80
#define ReplicationSlotOnDiskConstantSize
Definition: slot.c:77
size_t Size
Definition: c.h:473
uint32 version
Definition: slot.c:65
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1362
#define SnapBuildOnDiskChecksummedSize
Definition: slot.c:83
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1208
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:8366
static int ReplicationSlotAcquireInternal(ReplicationSlot *slot, const char *name, SlotAcquireBehavior behavior)
Definition: slot.c:381
ConditionVariable active_cv
Definition: slot.h:149
const char * name
Definition: encode.c:561
bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
Definition: slot.c:900
#define S_ISDIR(m)
Definition: win32_port.h:296
#define lstat(path, sb)
Definition: win32_port.h:244
XLogRecPtr candidate_xmin_lsn
Definition: slot.h:159
void ReplicationSlotDrop(const char *name, bool nowait)
Definition: slot.c:570
ReplicationSlotPersistency
Definition: slot.h:32
int errmsg(const char *fmt,...)
Definition: elog.c:824
int ReplicationSlotAcquire(const char *name, SlotAcquireBehavior behavior)
Definition: slot.c:367
pid_t active_pid
Definition: slot.h:122
void InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
Definition: slot.c:1134
#define elog(elevel,...)
Definition: elog.h:214
int i
Definition: slot.h:42
int pgxactoff
Definition: proc.h:139
uint8 * vacuumFlags
Definition: proc.h:321
#define NameStr(name)
Definition: c.h:622
void ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, bool already_locked)
Definition: procarray.c:3708
void ReplicationSlotCleanup(void)
Definition: slot.c:532
uint8 vacuumFlags
Definition: proc.h:178
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
int pg_fsync(int fd)
Definition: fd.c:346
ReplicationSlot replication_slots[1]
Definition: slot.h:176
char d_name[MAX_PATH]
Definition: dirent.h:15
XLogRecPtr invalidated_at
Definition: slot.h:83
slock_t mutex
Definition: slot.h:116
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:89
#define ERRCODE_DUPLICATE_OBJECT
Definition: streamutil.c:32
void CheckPointReplicationSlots(void)
Definition: slot.c:1250
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:94
#define snprintf
Definition: port.h:193
void ReplicationSlotsDropDBSlots(Oid dboid)
Definition: slot.c:956
void ReplicationSlotsComputeRequiredXmin(bool already_locked)
Definition: slot.c:755
bool dirty
Definition: slot.h:126
#define read(a, b, c)
Definition: win32.h:13
int FreeDir(DIR *dir)
Definition: fd.c:2701
XLogRecPtr candidate_restart_lsn
Definition: slot.h:161
#define offsetof(type, field)
Definition: c.h:668
void ReplicationSlotMarkDirty(void)
Definition: slot.c:716
LWLock io_in_progress_lock
Definition: slot.h:146
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)