PostgreSQL Source Code  git master
slot.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * slot.c
4  * Replication slot management.
5  *
6  *
7  * Copyright (c) 2012-2022, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/slot.c
12  *
13  * NOTES
14  *
15  * Replication slots are used to keep state about replication streams
16  * originating from this cluster. Their primary purpose is to prevent the
17  * premature removal of WAL or of old tuple versions in a manner that would
18  * interfere with replication; they are also useful for monitoring purposes.
19  * Slots need to be permanent (to allow restarts), crash-safe, and allocatable
20  * on standbys (to support cascading setups). The requirement that slots be
21  * usable on standbys precludes storing them in the system catalogs.
22  *
23  * Each replication slot gets its own directory inside the $PGDATA/pg_replslot
24  * directory. Inside that directory the state file will contain the slot's
25  * own data. Additional data can be stored alongside that file if required.
26  * While the server is running, the state data is also cached in memory for
27  * efficiency.
28  *
29  * ReplicationSlotAllocationLock must be taken in exclusive mode to allocate
30  * or free a slot. ReplicationSlotControlLock must be taken in shared mode
31  * to iterate over the slots, and in exclusive mode to change the in_use flag
32  * of a slot. The remaining data in each slot is protected by its mutex.
33  *
34  *-------------------------------------------------------------------------
35  */
36 
37 #include "postgres.h"
38 
39 #include <unistd.h>
40 #include <sys/stat.h>
41 
42 #include "access/transam.h"
43 #include "access/xlog_internal.h"
44 #include "common/file_utils.h"
45 #include "common/string.h"
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "replication/slot.h"
49 #include "storage/fd.h"
50 #include "storage/ipc.h"
51 #include "storage/proc.h"
52 #include "storage/procarray.h"
53 #include "utils/builtins.h"
54 
55 /*
56  * Replication slot on-disk data structure.
57  */
58 typedef struct ReplicationSlotOnDisk
59 {
60  /* first part of this struct needs to be version independent */
61 
62  /* data not covered by checksum */
65 
66  /* data covered by checksum */
69 
70  /*
71  * The actual data in the slot that follows can differ based on the above
72  * 'version'.
73  */
74 
77 
/*
 * Size bookkeeping for the on-disk slot layout.
 *
 * Every expansion is fully parenthesized so that each macro behaves as a
 * single operand when it is embedded in a larger expression (for example
 * "2 * ReplicationSlotOnDiskV2Size" or "x - ReplicationSlotOnDiskV2Size").
 */

/* size of version independent data */
#define ReplicationSlotOnDiskConstantSize \
	(offsetof(ReplicationSlotOnDisk, slotdata))
/* size of the part of the slot not covered by the checksum */
#define ReplicationSlotOnDiskNotChecksummedSize \
	(offsetof(ReplicationSlotOnDisk, version))
/* size of the part covered by the checksum */
#define ReplicationSlotOnDiskChecksummedSize \
	(sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskNotChecksummedSize)
/* size of the slot data that is version dependent */
#define ReplicationSlotOnDiskV2Size \
	(sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskConstantSize)

#define SLOT_MAGIC		0x1051CA1	/* format identifier */
#define SLOT_VERSION	2		/* version for new files */
93 
94 /* Control array for replication slot management */
96 
97 /* My backend's replication slot in the shared memory array */
99 
100 /* GUC variable */
101 int max_replication_slots = 10; /* the maximum number of replication
102  * slots */
103 
104 static void ReplicationSlotShmemExit(int code, Datum arg);
105 static void ReplicationSlotDropAcquired(void);
106 static void ReplicationSlotDropPtr(ReplicationSlot *slot);
107 
108 /* internal persistency functions */
109 static void RestoreSlotFromDisk(const char *name);
110 static void CreateSlotOnDisk(ReplicationSlot *slot);
111 static void SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel);
112 
113 /*
114  * Report shared-memory space needed by ReplicationSlotsShmemInit.
115  */
116 Size
118 {
119  Size size = 0;
120 
121  if (max_replication_slots == 0)
122  return size;
123 
124  size = offsetof(ReplicationSlotCtlData, replication_slots);
125  size = add_size(size,
127 
128  return size;
129 }
130 
131 /*
132  * Allocate and initialize shared memory for replication slots.
133  */
134 void
136 {
137  bool found;
138 
139  if (max_replication_slots == 0)
140  return;
141 
143  ShmemInitStruct("ReplicationSlot Ctl", ReplicationSlotsShmemSize(),
144  &found);
145 
146  if (!found)
147  {
148  int i;
149 
150  /* First time through, so initialize */
152 
153  for (i = 0; i < max_replication_slots; i++)
154  {
156 
157  /* everything else is zeroed by the memset above */
158  SpinLockInit(&slot->mutex);
162  }
163  }
164 }
165 
166 /*
167  * Register the callback for replication slot cleanup and releasing.
168  */
169 void
171 {
173 }
174 
175 /*
176  * Release and cleanup replication slots.
177  */
178 static void
180 {
181  /* Make sure active replication slots are released */
182  if (MyReplicationSlot != NULL)
184 
185  /* Also cleanup all the temporary slots. */
187 }
188 
189 /*
190  * Check whether the passed slot name is valid and report errors at elevel.
191  *
192  * Slot names may consist out of [a-z0-9_]{1,NAMEDATALEN-1} which should allow
193  * the name to be used as a directory name on every supported OS.
194  *
195  * Returns whether the directory name is valid or not if elevel < ERROR.
196  */
197 bool
198 ReplicationSlotValidateName(const char *name, int elevel)
199 {
200  const char *cp;
201 
202  if (strlen(name) == 0)
203  {
204  ereport(elevel,
205  (errcode(ERRCODE_INVALID_NAME),
206  errmsg("replication slot name \"%s\" is too short",
207  name)));
208  return false;
209  }
210 
211  if (strlen(name) >= NAMEDATALEN)
212  {
213  ereport(elevel,
214  (errcode(ERRCODE_NAME_TOO_LONG),
215  errmsg("replication slot name \"%s\" is too long",
216  name)));
217  return false;
218  }
219 
220  for (cp = name; *cp; cp++)
221  {
222  if (!((*cp >= 'a' && *cp <= 'z')
223  || (*cp >= '0' && *cp <= '9')
224  || (*cp == '_')))
225  {
226  ereport(elevel,
227  (errcode(ERRCODE_INVALID_NAME),
228  errmsg("replication slot name \"%s\" contains invalid character",
229  name),
230  errhint("Replication slot names may only contain lower case letters, numbers, and the underscore character.")));
231  return false;
232  }
233  }
234  return true;
235 }
236 
237 /*
238  * Create a new replication slot and mark it as used by this backend.
239  *
240  * name: Name of the slot
241  * db_specific: logical decoding is db specific; if the slot is going to
242  * be used for that pass true, otherwise false.
243  * two_phase: Allows decoding of prepared transactions. We allow this option
244  * to be enabled only at slot creation time. If we allowed this option
245  * to be changed during decoding, it is quite possible that we would skip the
246  * prepare the first time because the option was not enabled. The next time
247  * changes are fetched, with two_phase now enabled, the prepare could be
248  * skipped again because by then the start decoding point has moved. So the
249  * user would only get the commit prepared.
250  */
251 void
252 ReplicationSlotCreate(const char *name, bool db_specific,
253  ReplicationSlotPersistency persistency, bool two_phase)
254 {
255  ReplicationSlot *slot = NULL;
256  int i;
257 
258  Assert(MyReplicationSlot == NULL);
259 
261 
262  /*
263  * If some other backend ran this code concurrently with us, we'd likely
264  * both allocate the same slot, and that would be bad. We'd also be at
265  * risk of missing a name collision. Also, we don't want to try to create
266  * a new slot while somebody's busy cleaning up an old one, because we
267  * might both be monkeying with the same directory.
268  */
269  LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
270 
271  /*
272  * Check for name collision, and identify an allocatable slot. We need to
273  * hold ReplicationSlotControlLock in shared mode for this, so that nobody
274  * else can change the in_use flags while we're looking at them.
275  */
276  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
277  for (i = 0; i < max_replication_slots; i++)
278  {
280 
281  if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
282  ereport(ERROR,
284  errmsg("replication slot \"%s\" already exists", name)));
285  if (!s->in_use && slot == NULL)
286  slot = s;
287  }
288  LWLockRelease(ReplicationSlotControlLock);
289 
290  /* If all slots are in use, we're out of luck. */
291  if (slot == NULL)
292  ereport(ERROR,
293  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
294  errmsg("all replication slots are in use"),
295  errhint("Free one or increase max_replication_slots.")));
296 
297  /*
298  * Since this slot is not in use, nobody should be looking at any part of
299  * it other than the in_use field unless they're trying to allocate it.
300  * And since we hold ReplicationSlotAllocationLock, nobody except us can
301  * be doing that. So it's safe to initialize the slot.
302  */
303  Assert(!slot->in_use);
304  Assert(slot->active_pid == 0);
305 
306  /* first initialize persistent data */
307  memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData));
308  namestrcpy(&slot->data.name, name);
309  slot->data.database = db_specific ? MyDatabaseId : InvalidOid;
310  slot->data.persistency = persistency;
311  slot->data.two_phase = two_phase;
313 
314  /* and then data only present in shared memory */
315  slot->just_dirtied = false;
316  slot->dirty = false;
323 
324  /*
325  * Create the slot on disk. We haven't actually marked the slot allocated
326  * yet, so no special cleanup is required if this errors out.
327  */
328  CreateSlotOnDisk(slot);
329 
330  /*
331  * We need to briefly prevent any other backend from iterating over the
332  * slots while we flip the in_use flag. We also need to set the active
333  * flag while holding the ControlLock as otherwise a concurrent
334  * ReplicationSlotAcquire() could acquire the slot as well.
335  */
336  LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
337 
338  slot->in_use = true;
339 
340  /* We can now mark the slot active, and that makes it our slot. */
341  SpinLockAcquire(&slot->mutex);
342  Assert(slot->active_pid == 0);
343  slot->active_pid = MyProcPid;
344  SpinLockRelease(&slot->mutex);
345  MyReplicationSlot = slot;
346 
347  LWLockRelease(ReplicationSlotControlLock);
348 
349  /*
350  * Create statistics entry for the new logical slot. We don't collect any
351  * stats for physical slots, so no need to create an entry for the same.
352  * See ReplicationSlotDropPtr for why we need to do this before releasing
353  * ReplicationSlotAllocationLock.
354  */
355  if (SlotIsLogical(slot))
357 
358  /*
359  * Now that the slot has been marked as in_use and active, it's safe to
360  * let somebody else try to allocate a slot.
361  */
362  LWLockRelease(ReplicationSlotAllocationLock);
363 
364  /* Let everybody know we've modified this slot */
366 }
367 
368 /*
369  * Search for the named replication slot.
370  *
371  * Return the replication slot if found, otherwise NULL.
372  */
374 SearchNamedReplicationSlot(const char *name, bool need_lock)
375 {
376  int i;
377  ReplicationSlot *slot = NULL;
378 
379  if (need_lock)
380  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
381 
382  for (i = 0; i < max_replication_slots; i++)
383  {
385 
386  if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
387  {
388  slot = s;
389  break;
390  }
391  }
392 
393  if (need_lock)
394  LWLockRelease(ReplicationSlotControlLock);
395 
396  return slot;
397 }
398 
399 /*
400  * Return the index of the replication slot in
401  * ReplicationSlotCtl->replication_slots.
402  *
403  * This is mainly useful to have an efficient key for storing replication slot
404  * stats.
405  */
406 int
408 {
410  slot < ReplicationSlotCtl->replication_slots + max_replication_slots);
411 
412  return slot - ReplicationSlotCtl->replication_slots;
413 }
414 
415 /*
416  * If the slot at 'index' is unused, return false. Otherwise 'name' is set to
417  * the slot's name and true is returned.
418  *
419  * This likely is only useful for pgstat_replslot.c during shutdown, in other
420  * cases there are obvious TOCTOU issues.
421  */
422 bool
424 {
425  ReplicationSlot *slot;
426  bool found;
427 
429 
430  /*
431  * Ensure that the slot cannot be dropped while we copy the name. Don't
432  * need the spinlock as the name of an existing slot cannot change.
433  */
434  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
435  found = slot->in_use;
436  if (slot->in_use)
437  namestrcpy(name, NameStr(slot->data.name));
438  LWLockRelease(ReplicationSlotControlLock);
439 
440  return found;
441 }
442 
443 /*
444  * Find a previously created slot and mark it as used by this process.
445  *
446  * An error is raised if nowait is true and the slot is currently in use. If
447  * nowait is false, we sleep until the slot is released by the owning process.
448  */
449 void
450 ReplicationSlotAcquire(const char *name, bool nowait)
451 {
452  ReplicationSlot *s;
453  int active_pid;
454 
455  Assert(name != NULL);
456 
457 retry:
458  Assert(MyReplicationSlot == NULL);
459 
460  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
461 
462  /*
463  * Search for the slot with the specified name. If the slot is not
464  * found, we error out.
465  */
466  s = SearchNamedReplicationSlot(name, false);
467  if (s == NULL || !s->in_use)
468  {
469  LWLockRelease(ReplicationSlotControlLock);
470 
471  ereport(ERROR,
472  (errcode(ERRCODE_UNDEFINED_OBJECT),
473  errmsg("replication slot \"%s\" does not exist",
474  name)));
475  }
476 
477  /*
478  * This is the slot we want; check if it's active under some other
479  * process. In single user mode, we don't need this check.
480  */
481  if (IsUnderPostmaster)
482  {
483  /*
484  * Get ready to sleep on the slot in case it is active. (We may end
485  * up not sleeping, but we don't want to do this while holding the
486  * spinlock.)
487  */
488  if (!nowait)
490 
491  SpinLockAcquire(&s->mutex);
492  if (s->active_pid == 0)
493  s->active_pid = MyProcPid;
494  active_pid = s->active_pid;
495  SpinLockRelease(&s->mutex);
496  }
497  else
498  active_pid = MyProcPid;
499  LWLockRelease(ReplicationSlotControlLock);
500 
501  /*
502  * If we found the slot but it's already active in another process, we
503  * wait until the owning process signals us that it's been released, or
504  * error out.
505  */
506  if (active_pid != MyProcPid)
507  {
508  if (!nowait)
509  {
510  /* Wait here until we get signaled, and then restart */
514  goto retry;
515  }
516 
517  ereport(ERROR,
518  (errcode(ERRCODE_OBJECT_IN_USE),
519  errmsg("replication slot \"%s\" is active for PID %d",
520  NameStr(s->data.name), active_pid)));
521  }
522  else if (!nowait)
523  ConditionVariableCancelSleep(); /* no sleep needed after all */
524 
525  /* Let everybody know we've modified this slot */
527 
528  /* We made this slot active, so it's ours now. */
529  MyReplicationSlot = s;
530 
531  /*
532  * The call to pgstat_acquire_replslot() protects against stats for a
533  * different slot, from before a restart or such, being present during
534  * pgstat_report_replslot().
535  */
536  if (SlotIsLogical(s))
538 }
539 
540 /*
541  * Release the replication slot that this backend considers itself to own.
542  *
543  * This or another backend can re-acquire the slot later.
544  * Resources this slot requires will be preserved.
545  */
546 void
548 {
550 
551  Assert(slot != NULL && slot->active_pid != 0);
552 
553  if (slot->data.persistency == RS_EPHEMERAL)
554  {
555  /*
556  * Delete the slot. There is no !PANIC case where this is allowed to
557  * fail, all that may happen is an incomplete cleanup of the on-disk
558  * data.
559  */
561  }
562 
563  /*
564  * If slot needed to temporarily restrain both data and catalog xmin to
565  * create the catalog snapshot, remove that temporary constraint.
566  * Snapshots can only be exported while the initial snapshot is still
567  * acquired.
568  */
569  if (!TransactionIdIsValid(slot->data.xmin) &&
571  {
572  SpinLockAcquire(&slot->mutex);
574  SpinLockRelease(&slot->mutex);
576  }
577 
578  if (slot->data.persistency == RS_PERSISTENT)
579  {
580  /*
581  * Mark persistent slot inactive. We're not freeing it, just
582  * disconnecting, but wake up others that may be waiting for it.
583  */
584  SpinLockAcquire(&slot->mutex);
585  slot->active_pid = 0;
586  SpinLockRelease(&slot->mutex);
588  }
589 
590  MyReplicationSlot = NULL;
591 
592  /* might not have been set when we've been a plain slot */
593  LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
596  LWLockRelease(ProcArrayLock);
597 }
598 
599 /*
600  * Cleanup all temporary slots created in current session.
601  */
602 void
604 {
605  int i;
606 
607  Assert(MyReplicationSlot == NULL);
608 
609 restart:
610  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
611  for (i = 0; i < max_replication_slots; i++)
612  {
614 
615  if (!s->in_use)
616  continue;
617 
618  SpinLockAcquire(&s->mutex);
619  if (s->active_pid == MyProcPid)
620  {
622  SpinLockRelease(&s->mutex);
623  LWLockRelease(ReplicationSlotControlLock); /* avoid deadlock */
624 
626 
628  goto restart;
629  }
630  else
631  SpinLockRelease(&s->mutex);
632  }
633 
634  LWLockRelease(ReplicationSlotControlLock);
635 }
636 
637 /*
638  * Permanently drop replication slot identified by the passed in name.
639  */
640 void
641 ReplicationSlotDrop(const char *name, bool nowait)
642 {
643  Assert(MyReplicationSlot == NULL);
644 
645  ReplicationSlotAcquire(name, nowait);
646 
648 }
649 
650 /*
651  * Permanently drop the currently acquired replication slot.
652  */
653 static void
655 {
657 
658  Assert(MyReplicationSlot != NULL);
659 
660  /* slot isn't acquired anymore */
661  MyReplicationSlot = NULL;
662 
664 }
665 
666 /*
667  * Permanently drop the replication slot which will be released by the point
668  * this function returns.
669  */
670 static void
672 {
673  char path[MAXPGPATH];
674  char tmppath[MAXPGPATH];
675 
676  /*
677  * If some other backend ran this code concurrently with us, we might try
678  * to delete a slot with a certain name while someone else was trying to
679  * create a slot with the same name.
680  */
681  LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
682 
683  /* Generate pathnames. */
684  sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
685  sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
686 
687  /*
688  * Rename the slot directory on disk, so that we'll no longer recognize
689  * this as a valid slot. Note that if this fails, we've got to mark the
690  * slot inactive before bailing out. If we're dropping an ephemeral or a
691  * temporary slot, we better never fail hard as the caller won't expect
692  * the slot to survive and this might get called during error handling.
693  */
694  if (rename(path, tmppath) == 0)
695  {
696  /*
697  * We need to fsync() the directory we just renamed and its parent to
698  * make sure that our changes are on disk in a crash-safe fashion. If
699  * fsync() fails, we can't be sure whether the changes are on disk or
700  * not. For now, we handle that by panicking;
701  * StartupReplicationSlots() will try to straighten it out after
702  * restart.
703  */
705  fsync_fname(tmppath, true);
706  fsync_fname("pg_replslot", true);
708  }
709  else
710  {
711  bool fail_softly = slot->data.persistency != RS_PERSISTENT;
712 
713  SpinLockAcquire(&slot->mutex);
714  slot->active_pid = 0;
715  SpinLockRelease(&slot->mutex);
716 
717  /* wake up anyone waiting on this slot */
719 
720  ereport(fail_softly ? WARNING : ERROR,
722  errmsg("could not rename file \"%s\" to \"%s\": %m",
723  path, tmppath)));
724  }
725 
726  /*
727  * The slot is definitely gone. Lock out concurrent scans of the array
728  * long enough to kill it. It's OK to clear the active PID here without
729  * grabbing the mutex because nobody else can be scanning the array here,
730  * and nobody can be attached to this slot and thus access it without
731  * scanning the array.
732  *
733  * Also wake up processes waiting for it.
734  */
735  LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
736  slot->active_pid = 0;
737  slot->in_use = false;
738  LWLockRelease(ReplicationSlotControlLock);
740 
741  /*
742  * Slot is dead and doesn't prevent resource removal anymore, recompute
743  * limits.
744  */
747 
748  /*
749  * If removing the directory fails, the worst thing that will happen is
750  * that the user won't be able to create a new slot with the same name
751  * until the next server restart. We warn about it, but that's all.
752  */
753  if (!rmtree(tmppath, true))
755  (errmsg("could not remove directory \"%s\"", tmppath)));
756 
757  /*
758  * Drop the statistics entry for the replication slot. Do this while
759  * holding ReplicationSlotAllocationLock so that we don't drop a
760  * statistics entry for another slot with the same name just created in
761  * another session.
762  */
763  if (SlotIsLogical(slot))
764  pgstat_drop_replslot(slot);
765 
766  /*
767  * We release this at the very end, so that nobody starts trying to create
768  * a slot while we're still cleaning up the detritus of the old one.
769  */
770  LWLockRelease(ReplicationSlotAllocationLock);
771 }
772 
773 /*
774  * Serialize the currently acquired slot's state from memory to disk, thereby
775  * guaranteeing the current state will survive a crash.
776  */
777 void
779 {
780  char path[MAXPGPATH];
781 
782  Assert(MyReplicationSlot != NULL);
783 
784  sprintf(path, "pg_replslot/%s", NameStr(MyReplicationSlot->data.name));
786 }
787 
788 /*
789  * Signal that it would be useful if the currently acquired slot would be
790  * flushed out to disk.
791  *
792  * Note that the actual flush to disk can be delayed for a long time; if
793  * required for correctness, explicitly do a ReplicationSlotSave().
794  */
795 void
797 {
799 
800  Assert(MyReplicationSlot != NULL);
801 
802  SpinLockAcquire(&slot->mutex);
804  MyReplicationSlot->dirty = true;
805  SpinLockRelease(&slot->mutex);
806 }
807 
808 /*
809  * Convert a slot that's marked as RS_EPHEMERAL to a RS_PERSISTENT slot,
810  * guaranteeing it will be there after an eventual crash.
811  */
812 void
814 {
816 
817  Assert(slot != NULL);
819 
820  SpinLockAcquire(&slot->mutex);
822  SpinLockRelease(&slot->mutex);
823 
826 }
827 
828 /*
829  * Compute the oldest xmin across all slots and store it in the ProcArray.
830  *
831  * If already_locked is true, ProcArrayLock has already been acquired
832  * exclusively.
833  */
834 void
836 {
837  int i;
839  TransactionId agg_catalog_xmin = InvalidTransactionId;
840 
841  Assert(ReplicationSlotCtl != NULL);
842 
843  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
844 
845  for (i = 0; i < max_replication_slots; i++)
846  {
848  TransactionId effective_xmin;
849  TransactionId effective_catalog_xmin;
850  bool invalidated;
851 
852  if (!s->in_use)
853  continue;
854 
855  SpinLockAcquire(&s->mutex);
856  effective_xmin = s->effective_xmin;
857  effective_catalog_xmin = s->effective_catalog_xmin;
858  invalidated = (!XLogRecPtrIsInvalid(s->data.invalidated_at) &&
860  SpinLockRelease(&s->mutex);
861 
862  /* invalidated slots need not apply */
863  if (invalidated)
864  continue;
865 
866  /* check the data xmin */
867  if (TransactionIdIsValid(effective_xmin) &&
868  (!TransactionIdIsValid(agg_xmin) ||
869  TransactionIdPrecedes(effective_xmin, agg_xmin)))
870  agg_xmin = effective_xmin;
871 
872  /* check the catalog xmin */
873  if (TransactionIdIsValid(effective_catalog_xmin) &&
874  (!TransactionIdIsValid(agg_catalog_xmin) ||
875  TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin)))
876  agg_catalog_xmin = effective_catalog_xmin;
877  }
878 
879  LWLockRelease(ReplicationSlotControlLock);
880 
881  ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked);
882 }
883 
884 /*
885  * Compute the oldest restart LSN across all slots and inform xlog module.
886  *
887  * Note: while max_slot_wal_keep_size is theoretically relevant for this
888  * purpose, we don't try to account for that, because this module doesn't
889  * know what to compare against.
890  */
891 void
893 {
894  int i;
895  XLogRecPtr min_required = InvalidXLogRecPtr;
896 
897  Assert(ReplicationSlotCtl != NULL);
898 
899  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
900  for (i = 0; i < max_replication_slots; i++)
901  {
903  XLogRecPtr restart_lsn;
904 
905  if (!s->in_use)
906  continue;
907 
908  SpinLockAcquire(&s->mutex);
909  restart_lsn = s->data.restart_lsn;
910  SpinLockRelease(&s->mutex);
911 
912  if (restart_lsn != InvalidXLogRecPtr &&
913  (min_required == InvalidXLogRecPtr ||
914  restart_lsn < min_required))
915  min_required = restart_lsn;
916  }
917  LWLockRelease(ReplicationSlotControlLock);
918 
919  XLogSetReplicationSlotMinimumLSN(min_required);
920 }
921 
922 /*
923  * Compute the oldest WAL LSN required by *logical* decoding slots.
924  *
925  * Returns InvalidXLogRecPtr if logical decoding is disabled or no logical
926  * slots exist.
927  *
928  * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it
929  * ignores physical replication slots.
930  *
931  * The results aren't required frequently, so we don't maintain a precomputed
932  * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin().
933  */
936 {
937  XLogRecPtr result = InvalidXLogRecPtr;
938  int i;
939 
940  if (max_replication_slots <= 0)
941  return InvalidXLogRecPtr;
942 
943  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
944 
945  for (i = 0; i < max_replication_slots; i++)
946  {
947  ReplicationSlot *s;
948  XLogRecPtr restart_lsn;
949 
951 
952  /* cannot change while ReplicationSlotControlLock is held */
953  if (!s->in_use)
954  continue;
955 
956  /* we're only interested in logical slots */
957  if (!SlotIsLogical(s))
958  continue;
959 
960  /* read once, it's ok if it increases while we're checking */
961  SpinLockAcquire(&s->mutex);
962  restart_lsn = s->data.restart_lsn;
963  SpinLockRelease(&s->mutex);
964 
965  if (restart_lsn == InvalidXLogRecPtr)
966  continue;
967 
968  if (result == InvalidXLogRecPtr ||
969  restart_lsn < result)
970  result = restart_lsn;
971  }
972 
973  LWLockRelease(ReplicationSlotControlLock);
974 
975  return result;
976 }
977 
978 /*
979  * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the
980  * passed database oid.
981  *
982  * Returns true if there are any slots referencing the database. *nslots will
983  * be set to the absolute number of slots in the database, *nactive to ones
984  * currently active.
985  */
986 bool
987 ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
988 {
989  int i;
990 
991  *nslots = *nactive = 0;
992 
993  if (max_replication_slots <= 0)
994  return false;
995 
996  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
997  for (i = 0; i < max_replication_slots; i++)
998  {
999  ReplicationSlot *s;
1000 
1002 
1003  /* cannot change while ReplicationSlotCtlLock is held */
1004  if (!s->in_use)
1005  continue;
1006 
1007  /* only logical slots are database specific, skip */
1008  if (!SlotIsLogical(s))
1009  continue;
1010 
1011  /* not our database, skip */
1012  if (s->data.database != dboid)
1013  continue;
1014 
1015  /* count slots with spinlock held */
1016  SpinLockAcquire(&s->mutex);
1017  (*nslots)++;
1018  if (s->active_pid != 0)
1019  (*nactive)++;
1020  SpinLockRelease(&s->mutex);
1021  }
1022  LWLockRelease(ReplicationSlotControlLock);
1023 
1024  if (*nslots > 0)
1025  return true;
1026  return false;
1027 }
1028 
1029 /*
1030  * ReplicationSlotsDropDBSlots -- Drop all db-specific slots relating to the
1031  * passed database oid. The caller should hold an exclusive lock on the
1032  * pg_database oid for the database to prevent creation of new slots on the db
1033  * or replay from existing slots.
1034  *
1035  * Another session that concurrently acquires an existing slot on the target DB
1036  * (most likely to drop it) may cause this function to ERROR. If that happens
1037  * it may have dropped some but not all slots.
1038  *
1039  * This routine isn't as efficient as it could be - but we don't drop
1040  * databases often, especially databases with lots of slots.
1041  */
1042 void
1044 {
1045  int i;
1046 
1047  if (max_replication_slots <= 0)
1048  return;
1049 
1050 restart:
1051  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1052  for (i = 0; i < max_replication_slots; i++)
1053  {
1054  ReplicationSlot *s;
1055  char *slotname;
1056  int active_pid;
1057 
1059 
1060  /* cannot change while ReplicationSlotControlLock is held */
1061  if (!s->in_use)
1062  continue;
1063 
1064  /* only logical slots are database specific, skip */
1065  if (!SlotIsLogical(s))
1066  continue;
1067 
1068  /* not our database, skip */
1069  if (s->data.database != dboid)
1070  continue;
1071 
1072  /* acquire slot, so ReplicationSlotDropAcquired can be reused */
1073  SpinLockAcquire(&s->mutex);
1074  /* can't change while ReplicationSlotControlLock is held */
1075  slotname = NameStr(s->data.name);
1076  active_pid = s->active_pid;
1077  if (active_pid == 0)
1078  {
1079  MyReplicationSlot = s;
1080  s->active_pid = MyProcPid;
1081  }
1082  SpinLockRelease(&s->mutex);
1083 
1084  /*
1085  * Even though we hold an exclusive lock on the database object a
1086  * logical slot for that DB can still be active, e.g. if it's
1087  * concurrently being dropped by a backend connected to another DB.
1088  *
1089  * That's fairly unlikely in practice, so we'll just bail out.
1090  */
1091  if (active_pid)
1092  ereport(ERROR,
1093  (errcode(ERRCODE_OBJECT_IN_USE),
1094  errmsg("replication slot \"%s\" is active for PID %d",
1095  slotname, active_pid)));
1096 
1097  /*
1098  * To avoid duplicating ReplicationSlotDropAcquired() and to avoid
1099  * holding ReplicationSlotControlLock over filesystem operations,
1100  * release ReplicationSlotControlLock and use
1101  * ReplicationSlotDropAcquired.
1102  *
1103  * As that means the set of slots could change, restart scan from the
1104  * beginning each time we release the lock.
1105  */
1106  LWLockRelease(ReplicationSlotControlLock);
1108  goto restart;
1109  }
1110  LWLockRelease(ReplicationSlotControlLock);
1111 }
1112 
1113 
1114 /*
1115  * Check whether the server's configuration supports using replication
1116  * slots.
1117  */
1118 void
1120 {
1121  /*
1122  * NB: Adding a new requirement likely means that RestoreSlotFromDisk()
1123  * needs the same check.
1124  */
1125 
1126  if (max_replication_slots == 0)
1127  ereport(ERROR,
1128  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1129  errmsg("replication slots can only be used if max_replication_slots > 0")));
1130 
1132  ereport(ERROR,
1133  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1134  errmsg("replication slots can only be used if wal_level >= replica")));
1135 }
1136 
1137 /*
1138  * Check whether the user has privilege to use replication slots.
1139  */
1140 void
1142 {
1143  if (!superuser() && !has_rolreplication(GetUserId()))
1144  ereport(ERROR,
1145  (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
1146  errmsg("must be superuser or replication role to use replication slots")));
1147 }
1148 
1149 /*
1150  * Reserve WAL for the currently active slot.
1151  *
1152  * Compute and set restart_lsn in a manner that's appropriate for the type of
1153  * the slot and concurrency safe.
 *
 * The chosen restart_lsn is stored in slot->data.restart_lsn under the
 * slot's spinlock; the function loops until the WAL segment it depends on is
 * newer than the last removed segment.
 *
 * NOTE(review): this listing omits embedded lines 1156 (function name),
 * 1158 (presumably the declaration of "slot"), 1161, 1211 (the statement
 * that follows the "prevent WAL removal" comment) and 1220 (presumably the
 * computation of "segno"). Confirm against the real source before relying
 * on the text below.
1154  */
1155 void
1157 {
1159 
1160  Assert(slot != NULL);
1162 
1163  /*
1164  * The replication slot mechanism is used to prevent removal of required
1165  * WAL. As there is no interlock between this routine and checkpoints, WAL
1166  * segments could concurrently be removed when a now stale return value of
1167  * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that
1168  * this happens we'll just retry.
1169  */
1170  while (true)
1171  {
1172  XLogSegNo segno;
1173  XLogRecPtr restart_lsn;
1174 
1175  /*
1176  * For logical slots log a standby snapshot and start logical decoding
1177  * at exactly that position. That allows the slot to start up more
1178  * quickly.
1179  *
1180  * That's not needed (or indeed helpful) for physical slots as they'll
1181  * start replay at the last logged checkpoint anyway. Instead return
1182  * the location of the last redo LSN. While that slightly increases
1183  * the chance that we have to retry, it's where a base backup has to
1184  * start replay at.
1185  */
1186  if (!RecoveryInProgress() && SlotIsLogical(slot))
1187  {
1188  XLogRecPtr flushptr;
1189 
1190  /* start at current insert position */
1191  restart_lsn = GetXLogInsertRecPtr();
1192  SpinLockAcquire(&slot->mutex);
1193  slot->data.restart_lsn = restart_lsn;
1194  SpinLockRelease(&slot->mutex);
1195 
1196  /* make sure we have enough information to start */
1197  flushptr = LogStandbySnapshot();
1198 
1199  /* and make sure it's fsynced to disk */
1200  XLogFlush(flushptr);
1201  }
1202  else
1203  {
 /* physical slot, or still in recovery: start at the last redo pointer */
1204  restart_lsn = GetRedoRecPtr();
1205  SpinLockAcquire(&slot->mutex);
1206  slot->data.restart_lsn = restart_lsn;
1207  SpinLockRelease(&slot->mutex);
1208  }
1209 
1210  /* prevent WAL removal as fast as possible */
1212 
1213  /*
1214  * If all required WAL is still there, great, otherwise retry. The
1215  * slot should prevent further removal of WAL, unless there's a
1216  * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
1217  * the new restart_lsn above, so normally we should never need to loop
1218  * more than twice.
1219  */
 /* exit once segno is newer than the last segment already removed */
1221  if (XLogGetLastRemovedSegno() < segno)
1222  break;
1223  }
1224 }
1225 
1226 /*
1227  * Helper for InvalidateObsoleteReplicationSlots -- acquires the given slot
1228  * and marks it invalid, if necessary and possible.
1229  *
1230  * Returns whether ReplicationSlotControlLock was released in the interim (and
1231  * in that case we're not holding the lock at return, otherwise we are).
1232  *
1233  * Sets *invalidated true if the slot was invalidated. (Untouched otherwise.)
1234  *
1235  * This is inherently racy, because we release the LWLock
1236  * for syscalls, so caller must restart if we return true.
 *
 * "s" is the slot to examine and "oldestLSN" the threshold: a slot is left
 * alone when its restart_lsn is invalid or >= oldestLSN. The caller must
 * hold ReplicationSlotControlLock in shared mode on entry (asserted at the
 * top of each loop iteration).
 *
 * NOTE(review): this listing omits embedded lines 1239 (first line of the
 * signature), 1296, 1311 (the statement after the "Prepare the sleep"
 * comment), 1343-1344 (the wait itself) and 1368-1370 (the statements after
 * the "persists across server restart" comment). Confirm against the real
 * source.
1237  */
1238 static bool
1240  bool *invalidated)
1241 {
1242  int last_signaled_pid = 0;
1243  bool released_lock = false;
1244 
1245  for (;;)
1246  {
1247  XLogRecPtr restart_lsn;
1248  NameData slotname;
1249  int active_pid = 0;
1250 
1251  Assert(LWLockHeldByMeInMode(ReplicationSlotControlLock, LW_SHARED));
1252 
1253  if (!s->in_use)
1254  {
 /*
 * If we released the lock on an earlier iteration we re-acquired it
 * before looping; drop it again so that returning true always means
 * "lock not held".
 */
1255  if (released_lock)
1256  LWLockRelease(ReplicationSlotControlLock);
1257  break;
1258  }
1259 
1260  /*
1261  * Check if the slot needs to be invalidated. If it needs to be
1262  * invalidated, and is not currently acquired, acquire it and mark it
1263  * as having been invalidated. We do this with the spinlock held to
1264  * avoid race conditions -- for example the restart_lsn could move
1265  * forward, or the slot could be dropped.
1266  */
1267  SpinLockAcquire(&s->mutex);
1268 
1269  restart_lsn = s->data.restart_lsn;
1270 
1271  /*
1272  * If the slot is already invalid or is fresh enough, we don't need to
1273  * do anything.
1274  */
1275  if (XLogRecPtrIsInvalid(restart_lsn) || restart_lsn >= oldestLSN)
1276  {
1277  SpinLockRelease(&s->mutex);
 /* same lock-contract fixup as in the !in_use case above */
1278  if (released_lock)
1279  LWLockRelease(ReplicationSlotControlLock);
1280  break;
1281  }
1282 
 /* copy what we need for logging/signalling while the spinlock is held */
1283  slotname = s->data.name;
1284  active_pid = s->active_pid;
1285 
1286  /*
1287  * If the slot can be acquired, do so and mark it invalidated
1288  * immediately. Otherwise we'll signal the owning process, below, and
1289  * retry.
1290  */
1291  if (active_pid == 0)
1292  {
1293  MyReplicationSlot = s;
1294  s->active_pid = MyProcPid;
1295  s->data.invalidated_at = restart_lsn;
1297 
1298  /* Let caller know */
1299  *invalidated = true;
1300  }
1301 
1302  SpinLockRelease(&s->mutex);
1303 
1304  if (active_pid != 0)
1305  {
1306  /*
1307  * Prepare the sleep on the slot's condition variable before
1308  * releasing the lock, to close a possible race condition if the
1309  * slot is released before the sleep below.
1310  */
1312 
1313  LWLockRelease(ReplicationSlotControlLock);
1314  released_lock = true;
1315 
1316  /*
1317  * Signal to terminate the process that owns the slot, if we
1318  * haven't already signalled it. (Avoidance of repeated
1319  * signalling is the only reason for there to be a loop in this
1320  * routine; otherwise we could rely on caller's restart loop.)
1321  *
1322  * There is the race condition that other process may own the slot
1323  * after its current owner process is terminated and before this
1324  * process owns it. To handle that, we signal only if the PID of
1325  * the owning process has changed from the previous time. (This
1326  * logic assumes that the same PID is not reused very quickly.)
1327  */
1328  if (last_signaled_pid != active_pid)
1329  {
1330  ereport(LOG,
1331  errmsg("terminating process %d to release replication slot \"%s\"",
1332  active_pid, NameStr(slotname)),
1333  errdetail("The slot's restart_lsn %X/%X exceeds the limit by %llu bytes.",
1334  LSN_FORMAT_ARGS(restart_lsn),
1335  (unsigned long long) (oldestLSN - restart_lsn)),
1336  errhint("You might need to increase max_slot_wal_keep_size."));
1337 
 /* result deliberately ignored; we re-check the slot after waiting */
1338  (void) kill(active_pid, SIGTERM);
1339  last_signaled_pid = active_pid;
1340  }
1341 
1342  /* Wait until the slot is released. */
1345 
1346  /*
1347  * Re-acquire lock and start over; we expect to invalidate the
1348  * slot next time (unless another process acquires the slot in the
1349  * meantime).
1350  */
1351  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1352  continue;
1353  }
1354  else
1355  {
1356  /*
1357  * We hold the slot now and have already invalidated it; flush it
1358  * to ensure that state persists.
1359  *
1360  * Don't want to hold ReplicationSlotControlLock across file
1361  * system operations, so release it now but be sure to tell caller
1362  * to restart from scratch.
1363  */
1364  LWLockRelease(ReplicationSlotControlLock);
1365  released_lock = true;
1366 
1367  /* Make sure the invalidated state persists across server restart */
1371 
1372  ereport(LOG,
1373  errmsg("invalidating obsolete replication slot \"%s\"",
1374  NameStr(slotname)),
1375  errdetail("The slot's restart_lsn %X/%X exceeds the limit by %llu bytes.",
1376  LSN_FORMAT_ARGS(restart_lsn),
1377  (unsigned long long) (oldestLSN - restart_lsn)),
1378  errhint("You might need to increase max_slot_wal_keep_size."));
1379 
1380  /* done with this slot for now */
1381  break;
1382  }
1383  }
1384 
 /* the return value must agree with the actual lock state */
1385  Assert(released_lock == !LWLockHeldByMe(ReplicationSlotControlLock));
1386 
1387  return released_lock;
1388 }
1389 
1390 /*
1391  * Mark any slot that points to an LSN older than the given segment
1392  * as invalid; it requires WAL that's about to be removed.
1393  *
1394  * Returns true if any slot was invalidated.
1395  *
1396  * NB - this runs as part of checkpoint, so avoid raising errors if possible.
 *
 * oldestSegno is converted to the LSN at offset 0 of that segment, and each
 * in-use slot is checked against it via InvalidatePossiblyObsoleteSlot().
 *
 * NOTE(review): this listing omits embedded lines 1399 (function name and
 * parameter list), 1410 (presumably the declaration of "s" for slot i) and
 * 1428-1429 (the statements that recalculate the resource limits). Confirm
 * against the real source.
1397  */
1398 bool
1400 {
1401  XLogRecPtr oldestLSN;
1402  bool invalidated = false;
1403 
1404  XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
1405 
1406 restart:
1407  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1408  for (int i = 0; i < max_replication_slots; i++)
1409  {
1411 
1412  if (!s->in_use)
1413  continue;
1414 
 /*
 * A true result means the helper released ReplicationSlotControlLock,
 * so the scan restarts from the top and re-acquires it.
 */
1415  if (InvalidatePossiblyObsoleteSlot(s, oldestLSN, &invalidated))
1416  {
1417  /* if the lock was released, start from scratch */
1418  goto restart;
1419  }
1420  }
1421  LWLockRelease(ReplicationSlotControlLock);
1422 
1423  /*
1424  * If any slots have been invalidated, recalculate the resource limits.
1425  */
1426  if (invalidated)
1427  {
1430  }
1431 
1432  return invalidated;
1433 }
1434 
1435 /*
1436  * Flush all replication slots to disk.
1437  *
1438  * This needn't actually be part of a checkpoint, but it's a convenient
1439  * location.
 *
 * Each in-use slot is written with SaveSlotToPath() at elevel LOG, so a
 * failure to save one slot is logged rather than raised as an error.
 *
 * NOTE(review): this listing omits embedded lines 1442 (function name) and
 * 1459 (presumably the declaration of "s" for slot i). Confirm against the
 * real source.
1440  */
1441 void
1443 {
1444  int i;
1445 
1446  elog(DEBUG1, "performing replication slot checkpoint");
1447 
1448  /*
1449  * Prevent any slot from being created/dropped while we're active. As we
1450  * explicitly do *not* want to block iterating over replication_slots or
1451  * acquiring a slot we cannot take the control lock - but that's OK,
1452  * because holding ReplicationSlotAllocationLock is strictly stronger, and
1453  * enough to guarantee that nobody can change the in_use bits on us.
1454  */
1455  LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED);
1456 
1457  for (i = 0; i < max_replication_slots; i++)
1458  {
1460  char path[MAXPGPATH];
1461 
1462  if (!s->in_use)
1463  continue;
1464 
1465  /* save the slot to disk, locking is handled in SaveSlotToPath() */
1466  sprintf(path, "pg_replslot/%s", NameStr(s->data.name));
1467  SaveSlotToPath(s, path, LOG);
1468  }
1469  LWLockRelease(ReplicationSlotAllocationLock);
1470 }
1471 
1472 /*
1473  * Load all replication slots from disk into memory at server startup. This
1474  * needs to be run before we start crash recovery.
 *
 * Scans pg_replslot: non-directory entries are skipped, directories whose
 * name ends in ".tmp" are removed, and every other directory is handed to
 * RestoreSlotFromDisk().
 *
 * NOTE(review): this listing omits embedded lines 1477 (function name) and
 * 1526-1527 (the statements following the "compute replication xmin"
 * comment). Confirm against the real source.
1475  */
1476 void
1478 {
1479  DIR *replication_dir;
1480  struct dirent *replication_de;
1481 
1482  elog(DEBUG1, "starting up replication slots");
1483 
1484  /* restore all slots by iterating over all on-disk entries */
1485  replication_dir = AllocateDir("pg_replslot");
1486  while ((replication_de = ReadDir(replication_dir, "pg_replslot")) != NULL)
1487  {
1488  char path[MAXPGPATH + 12];
1489  PGFileType de_type;
1490 
1491  if (strcmp(replication_de->d_name, ".") == 0 ||
1492  strcmp(replication_de->d_name, "..") == 0)
1493  continue;
1494 
1495  snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name);
1496  de_type = get_dirent_type(path, replication_de, false, DEBUG1);
1497 
1498  /* we're only creating directories here, skip if it's not ours */
 /* note: PGFILETYPE_ERROR entries are not skipped; they fall through */
1499  if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_DIR)
1500  continue;
1501 
1502  /* we crashed while a slot was being setup or deleted, clean up */
1503  if (pg_str_endswith(replication_de->d_name, ".tmp"))
1504  {
1505  if (!rmtree(path, true))
1506  {
1507  ereport(WARNING,
1508  (errmsg("could not remove directory \"%s\"",
1509  path)));
1510  continue;
1511  }
 /* persist the removal by syncing the parent directory */
1512  fsync_fname("pg_replslot", true);
1513  continue;
1514  }
1515 
1516  /* looks like a slot in a normal state, restore */
1517  RestoreSlotFromDisk(replication_de->d_name);
1518  }
1519  FreeDir(replication_dir);
1520 
1521  /* currently no slots exist, we're done. */
1522  if (max_replication_slots <= 0)
1523  return;
1524 
1525  /* Now that we have recovered all the data, compute replication xmin */
1528 }
1529 
1530 /* ----
1531  * Manipulation of on-disk state of replication slots
1532  *
1533  * NB: none of the routines below should take any notice whether a slot is the
1534  * current one or not, that's all handled a layer above.
1535  * ----
1536  */
/*
 * Create the on-disk state for a new slot.
 *
 * Builds pg_replslot/<name>.tmp, writes the state file into it via
 * SaveSlotToPath() at elevel ERROR, renames the directory to
 * pg_replslot/<name>, and then fsyncs the new directory and pg_replslot.
 * Failure to create or rename the directory raises ERROR.
 *
 * NOTE(review): this listing omits embedded lines 1538 (function name and
 * parameter list), 1566 and 1578 (inside the two ereport calls) and 1587
 * (presumably the start of the critical section that END_CRIT_SECTION()
 * closes). Confirm against the real source.
 */
1537 static void
1539 {
1540  char tmppath[MAXPGPATH];
1541  char path[MAXPGPATH];
1542  struct stat st;
1543 
1544  /*
1545  * No need to take out the io_in_progress_lock, nobody else can see this
1546  * slot yet, so nobody else will write. We're reusing SaveSlotToPath which
1547  * takes out the lock, if we'd take the lock here, we'd deadlock.
1548  */
1549 
1550  sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
1551  sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
1552 
1553  /*
1554  * It's just barely possible that some previous effort to create or drop a
1555  * slot with this name left a temp directory lying around. If that seems
1556  * to be the case, try to remove it. If the rmtree() fails, we'll error
1557  * out at the MakePGDirectory() below, so we don't bother checking
1558  * success.
1559  */
1560  if (stat(tmppath, &st) == 0 && S_ISDIR(st.st_mode))
1561  rmtree(tmppath, true);
1562 
1563  /* Create and fsync the temporary slot directory. */
1564  if (MakePGDirectory(tmppath) < 0)
1565  ereport(ERROR,
1567  errmsg("could not create directory \"%s\": %m",
1568  tmppath)));
1569  fsync_fname(tmppath, true);
1570 
1571  /* Write the actual state file. */
1572  slot->dirty = true; /* signal that we really need to write */
1573  SaveSlotToPath(slot, tmppath, ERROR);
1574 
1575  /* Rename the directory into place. */
1576  if (rename(tmppath, path) != 0)
1577  ereport(ERROR,
1579  errmsg("could not rename file \"%s\" to \"%s\": %m",
1580  tmppath, path)));
1581 
1582  /*
1583  * If we'd now fail - really unlikely - we wouldn't know whether this slot
1584  * would persist after an OS crash or not - so, force a restart. The
1585  * restart would try to fsync this again till it works.
1586  */
1588 
1589  fsync_fname(path, true);
1590  fsync_fname("pg_replslot", true);
1591 
1592  END_CRIT_SECTION();
1593 }
1594 
1595 /*
1596  * Shared functionality between saving and creating a replication slot.
 *
 * Writes the slot's persistent data to "<dir>/state.tmp", fsyncs and closes
 * it, renames it to "<dir>/state", and fsyncs the file, "dir" and
 * pg_replslot. Does nothing if the slot is not dirty.
 *
 * slot   - slot whose data is written
 * dir    - directory that receives the state file
 * elevel - level passed to ereport() on I/O failure; when ereport() returns
 *          (elevel below ERROR) the function returns early and leaves the
 *          dirty flag untouched.
 *
 * NOTE(review): the embedded numbering shows many lines dropped from this
 * listing (e.g. 1604, 1617, 1636, 1639, 1648, 1658, 1662, 1667-1669, 1729,
 * 1746). Presumably these include the declaration of "cp", the
 * io_in_progress_lock acquire/release calls the comments refer to, the
 * errcode arguments, wait-event reporting and the start of the critical
 * section. Confirm against the real source.
1597  */
1598 static void
1599 SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
1600 {
1601  char tmppath[MAXPGPATH];
1602  char path[MAXPGPATH];
1603  int fd;
1605  bool was_dirty;
1606 
1607  /* first check whether there's something to write out */
1608  SpinLockAcquire(&slot->mutex);
1609  was_dirty = slot->dirty;
 /* reset so that a re-dirtying during the write is noticed at the end */
1610  slot->just_dirtied = false;
1611  SpinLockRelease(&slot->mutex);
1612 
1613  /* and don't do anything if there's nothing to write */
1614  if (!was_dirty)
1615  return;
1616 
1618 
1619  /* silence valgrind :( */
1620  memset(&cp, 0, sizeof(ReplicationSlotOnDisk));
1621 
1622  sprintf(tmppath, "%s/state.tmp", dir);
1623  sprintf(path, "%s/state", dir);
1624 
1625  fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
1626  if (fd < 0)
1627  {
1628  /*
1629  * If not an ERROR, then release the lock before returning. In case
1630  * of an ERROR, the error recovery path automatically releases the
1631  * lock, but no harm in explicitly releasing even in that case. Note
1632  * that LWLockRelease() could affect errno.
1633  */
1634  int save_errno = errno;
1635 
1637  errno = save_errno;
1638  ereport(elevel,
1640  errmsg("could not create file \"%s\": %m",
1641  tmppath)));
1642  return;
1643  }
1644 
1645  cp.magic = SLOT_MAGIC;
1646  INIT_CRC32C(cp.checksum);
1647  cp.version = SLOT_VERSION;
1649 
 /* snapshot the persistent data under the spinlock */
1650  SpinLockAcquire(&slot->mutex);
1651 
1652  memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData));
1653 
1654  SpinLockRelease(&slot->mutex);
1655 
1656  COMP_CRC32C(cp.checksum,
1657  (char *) (&cp) + ReplicationSlotOnDiskNotChecksummedSize,
1659  FIN_CRC32C(cp.checksum);
1660 
 /* clear errno so a short write that leaves it unset can be detected */
1661  errno = 0;
1663  if ((write(fd, &cp, sizeof(cp))) != sizeof(cp))
1664  {
1665  int save_errno = errno;
1666 
1670 
1671  /* if write didn't set errno, assume problem is no disk space */
1672  errno = save_errno ? save_errno : ENOSPC;
1673  ereport(elevel,
1675  errmsg("could not write to file \"%s\": %m",
1676  tmppath)));
1677  return;
1678  }
1680 
1681  /* fsync the temporary file */
1683  if (pg_fsync(fd) != 0)
1684  {
1685  int save_errno = errno;
1686 
1690  errno = save_errno;
1691  ereport(elevel,
1693  errmsg("could not fsync file \"%s\": %m",
1694  tmppath)));
1695  return;
1696  }
1698 
1699  if (CloseTransientFile(fd) != 0)
1700  {
1701  int save_errno = errno;
1702 
1704  errno = save_errno;
1705  ereport(elevel,
1707  errmsg("could not close file \"%s\": %m",
1708  tmppath)));
1709  return;
1710  }
1711 
1712  /* rename to permanent file, fsync file and directory */
1713  if (rename(tmppath, path) != 0)
1714  {
1715  int save_errno = errno;
1716 
1718  errno = save_errno;
1719  ereport(elevel,
1721  errmsg("could not rename file \"%s\" to \"%s\": %m",
1722  tmppath, path)));
1723  return;
1724  }
1725 
1726  /*
1727  * Check CreateSlotOnDisk() for the reasoning of using a critical section.
1728  */
1730 
1731  fsync_fname(path, false);
1732  fsync_fname(dir, true);
1733  fsync_fname("pg_replslot", true);
1734 
1735  END_CRIT_SECTION();
1736 
1737  /*
1738  * Successfully wrote, unset dirty bit, unless somebody dirtied again
1739  * already.
1740  */
1741  SpinLockAcquire(&slot->mutex);
1742  if (!slot->just_dirtied)
1743  slot->dirty = false;
1744  SpinLockRelease(&slot->mutex);
1745 
1747 }
1748 
1749 /*
1750  * Load a single slot from disk into memory.
 *
 * "name" is the slot's directory name under pg_replslot. The state file is
 * fsynced, read and validated (magic, version, length, CRC); any failure in
 * that phase is reported at PANIC. A slot whose wal_level requirement is
 * not met, or for which no free in-memory slot exists, is reported at FATAL.
 *
 * NOTE(review): the embedded numbering shows many lines dropped from this
 * listing (e.g. 1753, 1755, 1771, 1787, 1794, 1803, 1840, 1873-1874, 1886,
 * 1910, 1928, 1935, 1939-1944). Presumably these include the function
 * name, the declaration of "cp", errcode arguments, wait-event reporting,
 * the start of the critical section, and several conditions and
 * assignments. Confirm against the real source.
1751  */
1752 static void
1754 {
1756  int i;
1757  char slotdir[MAXPGPATH + 12];
1758  char path[MAXPGPATH + 22];
1759  int fd;
1760  bool restored = false;
1761  int readBytes;
1762  pg_crc32c checksum;
1763 
1764  /* no need to lock here, no concurrent access allowed yet */
1765 
1766  /* delete temp file if it exists */
1767  sprintf(slotdir, "pg_replslot/%s", name);
1768  sprintf(path, "%s/state.tmp", slotdir);
1769  if (unlink(path) < 0 && errno != ENOENT)
1770  ereport(PANIC,
1772  errmsg("could not remove file \"%s\": %m", path)));
1773 
1774  sprintf(path, "%s/state", slotdir);
1775 
1776  elog(DEBUG1, "restoring replication slot from \"%s\"", path);
1777 
1778  /* on some operating systems fsyncing a file requires O_RDWR */
1779  fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1780 
1781  /*
1782  * We do not need to handle this as we are rename()ing the directory into
1783  * place only after we fsync()ed the state file.
1784  */
1785  if (fd < 0)
1786  ereport(PANIC,
1788  errmsg("could not open file \"%s\": %m", path)));
1789 
1790  /*
1791  * Sync state file before we're reading from it. We might have crashed
1792  * while it wasn't synced yet and we shouldn't continue on that basis.
1793  */
1795  if (pg_fsync(fd) != 0)
1796  ereport(PANIC,
1798  errmsg("could not fsync file \"%s\": %m",
1799  path)));
1801 
1802  /* Also sync the parent directory */
1804  fsync_fname(slotdir, true);
1805  END_CRIT_SECTION();
1806 
1807  /* read part of statefile that's guaranteed to be version independent */
1809  readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize);
1811  if (readBytes != ReplicationSlotOnDiskConstantSize)
1812  {
 /* negative means read() failed; otherwise it was a short read */
1813  if (readBytes < 0)
1814  ereport(PANIC,
1816  errmsg("could not read file \"%s\": %m", path)));
1817  else
1818  ereport(PANIC,
1820  errmsg("could not read file \"%s\": read %d of %zu",
1821  path, readBytes,
1823  }
1824 
1825  /* verify magic */
1826  if (cp.magic != SLOT_MAGIC)
1827  ereport(PANIC,
1829  errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u",
1830  path, cp.magic, SLOT_MAGIC)));
1831 
1832  /* verify version */
1833  if (cp.version != SLOT_VERSION)
1834  ereport(PANIC,
1836  errmsg("replication slot file \"%s\" has unsupported version %u",
1837  path, cp.version)));
1838 
1839  /* boundary check on length */
1841  ereport(PANIC,
1843  errmsg("replication slot file \"%s\" has corrupted length %u",
1844  path, cp.length)));
1845 
1846  /* Now that we know the size, read the entire file */
1848  readBytes = read(fd,
1849  (char *) &cp + ReplicationSlotOnDiskConstantSize,
1850  cp.length);
1852  if (readBytes != cp.length)
1853  {
1854  if (readBytes < 0)
1855  ereport(PANIC,
1857  errmsg("could not read file \"%s\": %m", path)));
1858  else
1859  ereport(PANIC,
1861  errmsg("could not read file \"%s\": read %d of %zu",
1862  path, readBytes, (Size) cp.length)));
1863  }
1864 
1865  if (CloseTransientFile(fd) != 0)
1866  ereport(PANIC,
1868  errmsg("could not close file \"%s\": %m", path)));
1869 
1870  /* now verify the CRC */
1871  INIT_CRC32C(checksum);
1872  COMP_CRC32C(checksum,
1875  FIN_CRC32C(checksum);
1876 
1877  if (!EQ_CRC32C(checksum, cp.checksum))
1878  ereport(PANIC,
1879  (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u",
1880  path, checksum, cp.checksum)));
1881 
1882  /*
1883  * If we crashed with an ephemeral slot active, don't restore but delete
1884  * it.
1885  */
1887  {
1888  if (!rmtree(slotdir, true))
1889  {
1890  ereport(WARNING,
1891  (errmsg("could not remove directory \"%s\"",
1892  slotdir)));
1893  }
1894  fsync_fname("pg_replslot", true);
1895  return;
1896  }
1897 
1898  /*
1899  * Verify that requirements for the specific slot type are met. That's
1900  * important because if these aren't met we're not guaranteed to retain
1901  * all the necessary resources for the slot.
1902  *
1903  * NB: We have to do so *after* the above checks for ephemeral slots,
1904  * because otherwise a slot that shouldn't exist anymore could prevent
1905  * restarts.
1906  *
1907  * NB: Changing the requirements here also requires adapting
1908  * CheckSlotRequirements() and CheckLogicalDecodingRequirements().
1909  */
1911  ereport(FATAL,
1912  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1913  errmsg("logical replication slot \"%s\" exists, but wal_level < logical",
1914  NameStr(cp.slotdata.name)),
1915  errhint("Change wal_level to be logical or higher.")));
1916  else if (wal_level < WAL_LEVEL_REPLICA)
1917  ereport(FATAL,
1918  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1919  errmsg("physical replication slot \"%s\" exists, but wal_level < replica",
1920  NameStr(cp.slotdata.name)),
1921  errhint("Change wal_level to be replica or higher.")));
1922 
1923  /* nothing can be active yet, don't lock anything */
 /* use the first in-memory slot that is not yet in use */
1924  for (i = 0; i < max_replication_slots; i++)
1925  {
1926  ReplicationSlot *slot;
1927 
1929 
1930  if (slot->in_use)
1931  continue;
1932 
1933  /* restore the entire set of persistent data */
1934  memcpy(&slot->data, &cp.slotdata,
1936 
1937  /* initialize in memory state */
1938  slot->effective_xmin = cp.slotdata.xmin;
1940 
1945 
1946  slot->in_use = true;
1947  slot->active_pid = 0;
1948 
1949  restored = true;
1950  break;
1951  }
1952 
 /* every in-memory slot was already taken */
1953  if (!restored)
1954  ereport(FATAL,
1955  (errmsg("too many replication slots active before shutdown"),
1956  errhint("Increase max_replication_slots and try again.")));
1957 }
#define NameStr(name)
Definition: c.h:682
unsigned int uint32
Definition: c.h:442
#define PG_BINARY
Definition: c.h:1209
#define MemSet(start, val, len)
Definition: c.h:953
uint32 TransactionId
Definition: c.h:588
size_t Size
Definition: c.h:541
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableCancelSleep(void)
int errcode_for_file_access(void)
Definition: elog.c:881
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errhint(const char *fmt,...)
Definition: elog.c:1316
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
const char * name
Definition: encode.c:561
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2709
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3713
int FreeDir(DIR *dir)
Definition: fd.c:2761
int CloseTransientFile(int fd)
Definition: fd.c:2609
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:662
int pg_fsync(int fd)
Definition: fd.c:356
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2433
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2643
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:406
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
int MyProcPid
Definition: globals.c:44
bool IsUnderPostmaster
Definition: globals.c:113
Oid MyDatabaseId
Definition: globals.c:89
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:333
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1918
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1194
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1962
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1802
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:729
@ LWTRANCHE_REPLICATION_SLOT_IO
Definition: lwlock.h:187
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
Oid GetUserId(void)
Definition: miscinit.c:497
bool has_rolreplication(Oid roleid)
Definition: miscinit.c:694
void namestrcpy(Name name, const char *str)
Definition: name.c:233
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define NAMEDATALEN
#define MAXPGPATH
uint32 pg_crc32c
Definition: pg_crc32c.h:38
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:89
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:94
static bool two_phase
void pgstat_create_replslot(ReplicationSlot *slot)
void pgstat_acquire_replslot(ReplicationSlot *slot)
void pgstat_drop_replslot(ReplicationSlot *slot)
#define sprintf
Definition: port.h:240
#define snprintf
Definition: port.h:238
uintptr_t Datum
Definition: postgres.h:412
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define PROC_IN_LOGICAL_DECODING
Definition: proc.h:60
void ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, bool already_locked)
Definition: procarray.c:3902
bool rmtree(const char *path, bool rmtopdir)
Definition: rmtree.c:42
Size add_size(Size s1, Size s2)
Definition: shmem.c:502
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:396
Size mul_size(Size s1, Size s2)
Definition: shmem.c:519
ReplicationSlot * SearchNamedReplicationSlot(const char *name, bool need_lock)
Definition: slot.c:374
int ReplicationSlotIndex(ReplicationSlot *slot)
Definition: slot.c:407
#define ReplicationSlotOnDiskChecksummedSize
Definition: slot.c:85
void ReplicationSlotCleanup(void)
Definition: slot.c:603
void ReplicationSlotMarkDirty(void)
Definition: slot.c:796
void ReplicationSlotReserveWal(void)
Definition: slot.c:1156
bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
Definition: slot.c:987
void ReplicationSlotAcquire(const char *name, bool nowait)
Definition: slot.c:450
void ReplicationSlotsDropDBSlots(Oid dboid)
Definition: slot.c:1043
#define ReplicationSlotOnDiskNotChecksummedSize
Definition: slot.c:82
bool InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
Definition: slot.c:1399
XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void)
Definition: slot.c:935
void ReplicationSlotsComputeRequiredXmin(bool already_locked)
Definition: slot.c:835
static void RestoreSlotFromDisk(const char *name)
Definition: slot.c:1753
void ReplicationSlotPersist(void)
Definition: slot.c:813
ReplicationSlot * MyReplicationSlot
Definition: slot.c:98
static void SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
Definition: slot.c:1599
void ReplicationSlotDrop(const char *name, bool nowait)
Definition: slot.c:641
void ReplicationSlotSave(void)
Definition: slot.c:778
static void CreateSlotOnDisk(ReplicationSlot *slot)
Definition: slot.c:1538
#define ReplicationSlotOnDiskV2Size
Definition: slot.c:88
void CheckSlotPermissions(void)
Definition: slot.c:1141
bool ReplicationSlotName(int index, Name name)
Definition: slot.c:423
void ReplicationSlotsShmemInit(void)
Definition: slot.c:135
void ReplicationSlotRelease(void)
Definition: slot.c:547
int max_replication_slots
Definition: slot.c:101
ReplicationSlotCtlData * ReplicationSlotCtl
Definition: slot.c:95
#define SLOT_VERSION
Definition: slot.c:92
struct ReplicationSlotOnDisk ReplicationSlotOnDisk
void CheckPointReplicationSlots(void)
Definition: slot.c:1442
void ReplicationSlotsComputeRequiredLSN(void)
Definition: slot.c:892
void ReplicationSlotInitialize(void)
Definition: slot.c:170
static void ReplicationSlotDropPtr(ReplicationSlot *slot)
Definition: slot.c:671
void StartupReplicationSlots(void)
Definition: slot.c:1477
void ReplicationSlotCreate(const char *name, bool db_specific, ReplicationSlotPersistency persistency, bool two_phase)
Definition: slot.c:252
void CheckSlotRequirements(void)
Definition: slot.c:1119
#define SLOT_MAGIC
Definition: slot.c:91
static bool InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN, bool *invalidated)
Definition: slot.c:1239
static void ReplicationSlotDropAcquired(void)
Definition: slot.c:654
#define ReplicationSlotOnDiskConstantSize
Definition: slot.c:79
Size ReplicationSlotsShmemSize(void)
Definition: slot.c:117
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:198
static void ReplicationSlotShmemExit(int code, Datum arg)
Definition: slot.c:179
ReplicationSlotPersistency
Definition: slot.h:34
@ RS_PERSISTENT
Definition: slot.h:35
@ RS_EPHEMERAL
Definition: slot.h:36
@ RS_TEMPORARY
Definition: slot.h:37
#define SlotIsLogical(slot)
Definition: slot.h:169
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
PGPROC * MyProc
Definition: proc.c:68
PROC_HDR * ProcGlobal
Definition: proc.c:80
XLogRecPtr LogStandbySnapshot(void)
Definition: standby.c:1272
#define ERRCODE_DUPLICATE_OBJECT
Definition: streamutil.c:32
bool pg_str_endswith(const char *str, const char *end)
Definition: string.c:32
Definition: dirent.c:26
uint8 statusFlags
Definition: proc.h:233
int pgxactoff
Definition: proc.h:188
uint8 * statusFlags
Definition: proc.h:377
ReplicationSlot replication_slots[1]
Definition: slot.h:180
uint32 version
Definition: slot.c:67
ReplicationSlotPersistentData slotdata
Definition: slot.c:75
pg_crc32c checksum
Definition: slot.c:64
TransactionId xmin
Definition: slot.h:62
XLogRecPtr two_phase_at
Definition: slot.h:90
TransactionId catalog_xmin
Definition: slot.h:70
XLogRecPtr restart_lsn
Definition: slot.h:73
XLogRecPtr invalidated_at
Definition: slot.h:76
ReplicationSlotPersistency persistency
Definition: slot.h:54
XLogRecPtr candidate_xmin_lsn
Definition: slot.h:163
TransactionId effective_catalog_xmin
Definition: slot.h:144
slock_t mutex
Definition: slot.h:120
XLogRecPtr candidate_restart_valid
Definition: slot.h:164
pid_t active_pid
Definition: slot.h:126
bool in_use
Definition: slot.h:123
TransactionId effective_xmin
Definition: slot.h:143
bool just_dirtied
Definition: slot.h:129
XLogRecPtr candidate_restart_lsn
Definition: slot.h:165
LWLock io_in_progress_lock
Definition: slot.h:150
ConditionVariable active_cv
Definition: slot.h:153
TransactionId candidate_catalog_xmin
Definition: slot.h:162
bool dirty
Definition: slot.h:130
ReplicationSlotPersistentData data
Definition: slot.h:147
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: type.h:95
Definition: c.h:677
unsigned short st_mode
Definition: win32_port.h:270
bool superuser(void)
Definition: superuser.c:46
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:273
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41
@ WAIT_EVENT_REPLICATION_SLOT_READ
Definition: wait_event.h:203
@ WAIT_EVENT_REPLICATION_SLOT_WRITE
Definition: wait_event.h:206
@ WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC
Definition: wait_event.h:204
@ WAIT_EVENT_REPLICATION_SLOT_SYNC
Definition: wait_event.h:205
@ WAIT_EVENT_REPLICATION_SLOT_DROP
Definition: wait_event.h:125
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:268
static void pgstat_report_wait_end(void)
Definition: wait_event.h:284
#define stat
Definition: win32_port.h:286
#define S_ISDIR(m)
Definition: win32_port.h:327
#define kill(pid, sig)
Definition: win32_port.h:482
bool RecoveryInProgress(void)
Definition: xlog.c:5912
XLogSegNo XLogGetLastRemovedSegno(void)
Definition: xlog.c:3461
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6015
int wal_level
Definition: xlog.c:134
int wal_segment_size
Definition: xlog.c:146
void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
Definition: xlog.c:2398
XLogRecPtr GetXLogInsertRecPtr(void)
Definition: xlog.c:8858
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2512
@ WAL_LEVEL_REPLICA
Definition: xlog.h:70
@ WAL_LEVEL_LOGICAL
Definition: xlog.h:71
#define XLogSegNoOffsetToRecPtr(segno, offset, wal_segsz_bytes, dest)
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint64 XLogSegNo
Definition: xlogdefs.h:48