slot.c
1 /*-------------------------------------------------------------------------
2  *
3  * slot.c
4  * Replication slot management.
5  *
6  *
7  * Copyright (c) 2012-2021, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/slot.c
12  *
13  * NOTES
14  *
15  * Replication slots are used to keep state about replication streams
16  * originating from this cluster. Their primary purpose is to prevent the
17  * premature removal of WAL or of old tuple versions in a manner that would
18  * interfere with replication; they are also useful for monitoring purposes.
19  * Slots need to be permanent (to allow restarts), crash-safe, and allocatable
20  * on standbys (to support cascading setups). The requirement that slots be
21  * usable on standbys precludes storing them in the system catalogs.
22  *
23  * Each replication slot gets its own directory inside the $PGDATA/pg_replslot
24  * directory. Inside that directory the state file will contain the slot's
25  * own data. Additional data can be stored alongside that file if required.
26  * While the server is running, the state data is also cached in memory for
27  * efficiency.
28  *
29  * ReplicationSlotAllocationLock must be taken in exclusive mode to allocate
30  * or free a slot. ReplicationSlotControlLock must be taken in shared mode
31  * to iterate over the slots, and in exclusive mode to change the in_use flag
32  * of a slot. The remaining data in each slot is protected by its mutex.
33  *
34  *-------------------------------------------------------------------------
35  */
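/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the layout described above amounts to one directory per slot, e.g.
 *
 *     $PGDATA/pg_replslot/my_slot/state      <- serialized ReplicationSlotOnDisk
 *     $PGDATA/pg_replslot/my_slot.tmp/       <- transient directory used while
 *                                               creating or dropping "my_slot"
 *
 * "my_slot" is just an example name; the ".tmp" suffix and the "state" file
 * name follow from CreateSlotOnDisk() and SaveSlotToPath() below.
 */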
36 
37 #include "postgres.h"
38 
39 #include <unistd.h>
40 #include <sys/stat.h>
41 
42 #include "access/transam.h"
43 #include "access/xlog_internal.h"
44 #include "common/string.h"
45 #include "miscadmin.h"
46 #include "pgstat.h"
47 #include "replication/slot.h"
48 #include "storage/fd.h"
49 #include "storage/proc.h"
50 #include "storage/procarray.h"
51 #include "utils/builtins.h"
52 
53 /*
54  * Replication slot on-disk data structure.
55  */
56 typedef struct ReplicationSlotOnDisk
57 {
58  /* first part of this struct needs to be version independent */
59 
60  /* data not covered by checksum */
61  uint32 magic;
62  pg_crc32c checksum;
63 
64  /* data covered by checksum */
65  uint32 version;
66  uint32 length;
67 
68  /*
69  * The actual data in the slot that follows can differ based on the above
70  * 'version'.
71  */
72 
73  ReplicationSlotPersistentData slotdata;
74 } ReplicationSlotOnDisk;
75 
76 /* size of version independent data */
77 #define ReplicationSlotOnDiskConstantSize \
78  offsetof(ReplicationSlotOnDisk, slotdata)
79 /* size of the part of the slot not covered by the checksum */
80 #define SnapBuildOnDiskNotChecksummedSize \
81  offsetof(ReplicationSlotOnDisk, version)
82 /* size of the part covered by the checksum */
83 #define SnapBuildOnDiskChecksummedSize \
84  sizeof(ReplicationSlotOnDisk) - SnapBuildOnDiskNotChecksummedSize
85 /* size of the slot data that is version dependent */
86 #define ReplicationSlotOnDiskV2Size \
87  sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskConstantSize
88 
89 #define SLOT_MAGIC 0x1051CA1 /* format identifier */
90 #define SLOT_VERSION 2 /* version for new files */
91 
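/*
 * Illustrative sketch (editorial addition): given the struct and macros
 * above, a version-2 "state" file is laid out roughly as
 *
 *     uint32    magic      -- SLOT_MAGIC, not covered by the checksum
 *     pg_crc32c checksum   -- CRC of everything from 'version' onward
 *     uint32    version    -- SLOT_VERSION
 *     uint32    length     -- size of the version-dependent part
 *     ReplicationSlotPersistentData slotdata
 *
 * i.e. the SnapBuildOnDiskChecksummedSize bytes starting at 'version' are
 * what SaveSlotToPath() feeds into COMP_CRC32C().
 */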
92 /* Control array for replication slot management */
93 ReplicationSlotCtlData *ReplicationSlotCtl = NULL;
94 
95 /* My backend's replication slot in the shared memory array */
96 ReplicationSlot *MyReplicationSlot = NULL;
97 
98 /* GUCs */
99 int max_replication_slots = 0; /* the maximum number of replication
100  * slots */
101 
102 static void ReplicationSlotDropAcquired(void);
103 static void ReplicationSlotDropPtr(ReplicationSlot *slot);
104 
105 /* internal persistency functions */
106 static void RestoreSlotFromDisk(const char *name);
107 static void CreateSlotOnDisk(ReplicationSlot *slot);
108 static void SaveSlotToPath(ReplicationSlot *slot, const char *path, int elevel);
109 
110 /*
111  * Report shared-memory space needed by ReplicationSlotsShmemInit.
112  */
113 Size
114 ReplicationSlotsShmemSize(void)
115 {
116  Size size = 0;
117 
118  if (max_replication_slots == 0)
119  return size;
120 
121  size = offsetof(ReplicationSlotCtlData, replication_slots);
122  size = add_size(size,
123  mul_size(max_replication_slots, sizeof(ReplicationSlot)));
124 
125  return size;
126 }
127 
128 /*
129  * Allocate and initialize shared memory for replication slots.
130  */
131 void
132 ReplicationSlotsShmemInit(void)
133 {
134  bool found;
135 
136  if (max_replication_slots == 0)
137  return;
138 
139  ReplicationSlotCtl = (ReplicationSlotCtlData *)
140  ShmemInitStruct("ReplicationSlot Ctl", ReplicationSlotsShmemSize(),
141  &found);
142 
143  if (!found)
144  {
145  int i;
146 
147  /* First time through, so initialize */
148  MemSet(ReplicationSlotCtl, 0, ReplicationSlotsShmemSize());
149 
150  for (i = 0; i < max_replication_slots; i++)
151  {
152  ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[i];
153 
154  /* everything else is zeroed by the memset above */
155  SpinLockInit(&slot->mutex);
156  LWLockInitialize(&slot->io_in_progress_lock,
157  LWTRANCHE_REPLICATION_SLOT_IO);
158  ConditionVariableInit(&slot->active_cv);
159  }
160  }
161 }
162 
163 /*
164  * Check whether the passed slot name is valid and report errors at elevel.
165  *
166  * Slot names may consist of [a-z0-9_]{1,NAMEDATALEN-1}, which should allow
167  * the name to be used as a directory name on every supported OS.
168  *
169  * Returns whether the directory name is valid or not if elevel < ERROR.
170  */
171 bool
172 ReplicationSlotValidateName(const char *name, int elevel)
173 {
174  const char *cp;
175 
176  if (strlen(name) == 0)
177  {
178  ereport(elevel,
179  (errcode(ERRCODE_INVALID_NAME),
180  errmsg("replication slot name \"%s\" is too short",
181  name)));
182  return false;
183  }
184 
185  if (strlen(name) >= NAMEDATALEN)
186  {
187  ereport(elevel,
188  (errcode(ERRCODE_NAME_TOO_LONG),
189  errmsg("replication slot name \"%s\" is too long",
190  name)));
191  return false;
192  }
193 
194  for (cp = name; *cp; cp++)
195  {
196  if (!((*cp >= 'a' && *cp <= 'z')
197  || (*cp >= '0' && *cp <= '9')
198  || (*cp == '_')))
199  {
200  ereport(elevel,
201  (errcode(ERRCODE_INVALID_NAME),
202  errmsg("replication slot name \"%s\" contains invalid character",
203  name),
204  errhint("Replication slot names may only contain lower case letters, numbers, and the underscore character.")));
205  return false;
206  }
207  }
208  return true;
209 }
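/*
 * Editorial example (not part of the original file): under the rules above,
 * "slot_1" and "pg_basebackup_42" are accepted, while "" (too short),
 * "MySlot" (upper-case letter), and "slot-1" (hyphen) are rejected with
 * ERRCODE_INVALID_NAME, and a name of NAMEDATALEN or more characters is
 * rejected with ERRCODE_NAME_TOO_LONG.
 */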
210 
211 /*
212  * Create a new replication slot and mark it as used by this backend.
213  *
214  * name: Name of the slot
215  * db_specific: logical decoding is db specific; if the slot is going to
216  * be used for that pass true, otherwise false.
217  * two_phase: Allows decoding of prepared transactions. We allow this option
218  * to be enabled only at slot creation time. If it could be changed while
219  * decoding is already in progress, it would be possible to skip a PREPARE
220  * the first time around because the option was not yet enabled, and then,
221  * once the option is enabled, to skip it again because the decoding start
222  * point has moved past it by that time. The user would then only ever see
223  * the COMMIT PREPARED.
224  */
225 void
226 ReplicationSlotCreate(const char *name, bool db_specific,
227  ReplicationSlotPersistency persistency, bool two_phase)
228 {
229  ReplicationSlot *slot = NULL;
230  int i;
231 
232  Assert(MyReplicationSlot == NULL);
233 
234  ReplicationSlotValidateName(name, ERROR);
235 
236  /*
237  * If some other backend ran this code concurrently with us, we'd likely
238  * both allocate the same slot, and that would be bad. We'd also be at
239  * risk of missing a name collision. Also, we don't want to try to create
240  * a new slot while somebody's busy cleaning up an old one, because we
241  * might both be monkeying with the same directory.
242  */
243  LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
244 
245  /*
246  * Check for name collision, and identify an allocatable slot. We need to
247  * hold ReplicationSlotControlLock in shared mode for this, so that nobody
248  * else can change the in_use flags while we're looking at them.
249  */
250  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
251  for (i = 0; i < max_replication_slots; i++)
252  {
253  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
254 
255  if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
256  ereport(ERROR,
257  (errcode(ERRCODE_DUPLICATE_OBJECT),
258  errmsg("replication slot \"%s\" already exists", name)));
259  if (!s->in_use && slot == NULL)
260  slot = s;
261  }
262  LWLockRelease(ReplicationSlotControlLock);
263 
264  /* If all slots are in use, we're out of luck. */
265  if (slot == NULL)
266  ereport(ERROR,
267  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
268  errmsg("all replication slots are in use"),
269  errhint("Free one or increase max_replication_slots.")));
270 
271  /*
272  * Since this slot is not in use, nobody should be looking at any part of
273  * it other than the in_use field unless they're trying to allocate it.
274  * And since we hold ReplicationSlotAllocationLock, nobody except us can
275  * be doing that. So it's safe to initialize the slot.
276  */
277  Assert(!slot->in_use);
278  Assert(slot->active_pid == 0);
279 
280  /* first initialize persistent data */
281  memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData));
282  namestrcpy(&slot->data.name, name);
283  slot->data.database = db_specific ? MyDatabaseId : InvalidOid;
284  slot->data.persistency = persistency;
285  slot->data.two_phase = two_phase;
287 
288  /* and then data only present in shared memory */
289  slot->just_dirtied = false;
290  slot->dirty = false;
291  slot->effective_xmin = InvalidTransactionId;
292  slot->effective_catalog_xmin = InvalidTransactionId;
293  slot->candidate_catalog_xmin = InvalidTransactionId;
294  slot->candidate_xmin_lsn = InvalidXLogRecPtr;
295  slot->candidate_restart_valid = InvalidXLogRecPtr;
296  slot->candidate_restart_lsn = InvalidXLogRecPtr;
297 
298  /*
299  * Create the slot on disk. We haven't actually marked the slot allocated
300  * yet, so no special cleanup is required if this errors out.
301  */
302  CreateSlotOnDisk(slot);
303 
304  /*
305  * We need to briefly prevent any other backend from iterating over the
306  * slots while we flip the in_use flag. We also need to set the active
307  * flag while holding the ControlLock as otherwise a concurrent
308  * ReplicationSlotAcquire() could acquire the slot as well.
309  */
310  LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
311 
312  slot->in_use = true;
313 
314  /* We can now mark the slot active, and that makes it our slot. */
315  SpinLockAcquire(&slot->mutex);
316  Assert(slot->active_pid == 0);
317  slot->active_pid = MyProcPid;
318  SpinLockRelease(&slot->mutex);
319  MyReplicationSlot = slot;
320 
321  LWLockRelease(ReplicationSlotControlLock);
322 
323  /*
324  * Create statistics entry for the new logical slot. We don't collect any
325  * stats for physical slots, so no need to create an entry for the same.
326  * See ReplicationSlotDropPtr for why we need to do this before releasing
327  * ReplicationSlotAllocationLock.
328  */
329  if (SlotIsLogical(slot))
330  pgstat_report_replslot_create(NameStr(slot->data.name));
331 
332  /*
333  * Now that the slot has been marked as in_use and active, it's safe to
334  * let somebody else try to allocate a slot.
335  */
336  LWLockRelease(ReplicationSlotAllocationLock);
337 
338  /* Let everybody know we've modified this slot */
339  ConditionVariableBroadcast(&slot->active_cv);
340 }
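/*
 * Editorial sketch of a typical caller (simplified and illustrative only;
 * the real callers live in slotfuncs.c and walsender.c):
 *
 *     CheckSlotPermissions();
 *     CheckSlotRequirements();
 *     ReplicationSlotCreate("my_slot", false, RS_EPHEMERAL, false);
 *     ReplicationSlotReserveWal();
 *     ReplicationSlotPersist();       -- flip RS_EPHEMERAL to RS_PERSISTENT
 *     ReplicationSlotRelease();
 *
 * An ephemeral slot that is released before ReplicationSlotPersist() is
 * called gets dropped again in ReplicationSlotRelease() below.
 */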
341 
342 /*
343  * Search for the named replication slot.
344  *
345  * Return the replication slot if found, otherwise NULL.
346  */
347 ReplicationSlot *
348 SearchNamedReplicationSlot(const char *name, bool need_lock)
349 {
350  int i;
351  ReplicationSlot *slot = NULL;
352 
353  if (need_lock)
354  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
355 
356  for (i = 0; i < max_replication_slots; i++)
357  {
358  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
359 
360  if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
361  {
362  slot = s;
363  break;
364  }
365  }
366 
367  if (need_lock)
368  LWLockRelease(ReplicationSlotControlLock);
369 
370  return slot;
371 }
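/*
 * Editorial note: need_lock only says whether this routine should take
 * ReplicationSlotControlLock itself; a caller that already holds the lock
 * (as ReplicationSlotAcquire() below does) passes false:
 *
 *     LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 *     slot = SearchNamedReplicationSlot("my_slot", false);
 *     ...
 *     LWLockRelease(ReplicationSlotControlLock);
 */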
372 
373 /*
374  * Find a previously created slot and mark it as used by this process.
375  *
376  * An error is raised if nowait is true and the slot is currently in use. If
377  * nowait is false, we sleep until the slot is released by the owning process.
378  */
379 void
380 ReplicationSlotAcquire(const char *name, bool nowait)
381 {
382  ReplicationSlot *s;
383  int active_pid;
384 
385  AssertArg(name != NULL);
386 
387 retry:
388  Assert(MyReplicationSlot == NULL);
389 
390  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
391 
392  /*
393  * Search for the slot with the specified name. If the slot is not found or
394  * not in use, error out.
395  */
396  s = SearchNamedReplicationSlot(name, false);
397  if (s == NULL || !s->in_use)
398  {
399  LWLockRelease(ReplicationSlotControlLock);
400 
401  ereport(ERROR,
402  (errcode(ERRCODE_UNDEFINED_OBJECT),
403  errmsg("replication slot \"%s\" does not exist",
404  name)));
405  }
406 
407  /*
408  * This is the slot we want; check if it's active under some other
409  * process. In single user mode, we don't need this check.
410  */
411  if (IsUnderPostmaster)
412  {
413  /*
414  * Get ready to sleep on the slot in case it is active. (We may end
415  * up not sleeping, but we don't want to do this while holding the
416  * spinlock.)
417  */
418  if (!nowait)
419  ConditionVariablePrepareToSleep(&s->active_cv);
420 
421  SpinLockAcquire(&s->mutex);
422  if (s->active_pid == 0)
423  s->active_pid = MyProcPid;
424  active_pid = s->active_pid;
425  SpinLockRelease(&s->mutex);
426  }
427  else
428  active_pid = MyProcPid;
429  LWLockRelease(ReplicationSlotControlLock);
430 
431  /*
432  * If we found the slot but it's already active in another process, we
433  * wait until the owning process signals us that it's been released, or
434  * error out.
435  */
436  if (active_pid != MyProcPid)
437  {
438  if (!nowait)
439  {
440  /* Wait here until we get signaled, and then restart */
441  ConditionVariableSleep(&s->active_cv,
442  WAIT_EVENT_REPLICATION_SLOT_DROP);
443  ConditionVariableCancelSleep();
444  goto retry;
445  }
446 
447  ereport(ERROR,
448  (errcode(ERRCODE_OBJECT_IN_USE),
449  errmsg("replication slot \"%s\" is active for PID %d",
450  NameStr(s->data.name), active_pid)));
451  }
452  else if (!nowait)
453  ConditionVariableCancelSleep(); /* no sleep needed after all */
454 
455  /* Let everybody know we've modified this slot */
456  ConditionVariableBroadcast(&s->active_cv);
457 
458  /* We made this slot active, so it's ours now. */
459  MyReplicationSlot = s;
460 }
461 
462 /*
463  * Release the replication slot that this backend considers to own.
464  *
465  * This or another backend can re-acquire the slot later.
466  * Resources this slot requires will be preserved.
467  */
468 void
469 ReplicationSlotRelease(void)
470 {
471  ReplicationSlot *slot = MyReplicationSlot;
472 
473  Assert(slot != NULL && slot->active_pid != 0);
474 
475  if (slot->data.persistency == RS_EPHEMERAL)
476  {
477  /*
478  * Delete the slot. There is no !PANIC case where this is allowed to
479  * fail, all that may happen is an incomplete cleanup of the on-disk
480  * data.
481  */
482  ReplicationSlotDropAcquired();
483  }
484 
485  /*
486  * If slot needed to temporarily restrain both data and catalog xmin to
487  * create the catalog snapshot, remove that temporary constraint.
488  * Snapshots can only be exported while the initial snapshot is still
489  * acquired.
490  */
491  if (!TransactionIdIsValid(slot->data.xmin) &&
492  TransactionIdIsValid(slot->effective_xmin))
493  {
494  SpinLockAcquire(&slot->mutex);
495  slot->effective_xmin = InvalidTransactionId;
496  SpinLockRelease(&slot->mutex);
497  ReplicationSlotsComputeRequiredXmin(false);
498  }
499 
500  if (slot->data.persistency == RS_PERSISTENT)
501  {
502  /*
503  * Mark persistent slot inactive. We're not freeing it, just
504  * disconnecting, but wake up others that may be waiting for it.
505  */
506  SpinLockAcquire(&slot->mutex);
507  slot->active_pid = 0;
508  SpinLockRelease(&slot->mutex);
509  ConditionVariableBroadcast(&slot->active_cv);
510  }
511 
512  MyReplicationSlot = NULL;
513 
514  /* might not have been set when we've been a plain slot */
515  LWLockAcquire(ProcArrayLock, LW_SHARED);
516  MyProc->statusFlags &= ~PROC_IN_LOGICAL_DECODING;
517  ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
518  LWLockRelease(ProcArrayLock);
519 }
520 
521 /*
522  * Cleanup all temporary slots created in current session.
523  */
524 void
525 ReplicationSlotCleanup(void)
526 {
527  int i;
528 
529  Assert(MyReplicationSlot == NULL);
530 
531 restart:
532  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
533  for (i = 0; i < max_replication_slots; i++)
534  {
535  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
536 
537  if (!s->in_use)
538  continue;
539 
540  SpinLockAcquire(&s->mutex);
541  if (s->active_pid == MyProcPid)
542  {
543  Assert(s->data.persistency == RS_TEMPORARY);
544  SpinLockRelease(&s->mutex);
545  LWLockRelease(ReplicationSlotControlLock); /* avoid deadlock */
546 
547  ReplicationSlotDropPtr(s);
548 
549  ConditionVariableBroadcast(&s->active_cv);
550  goto restart;
551  }
552  else
553  SpinLockRelease(&s->mutex);
554  }
555 
556  LWLockRelease(ReplicationSlotControlLock);
557 }
558 
559 /*
560  * Permanently drop replication slot identified by the passed in name.
561  */
562 void
563 ReplicationSlotDrop(const char *name, bool nowait)
564 {
565  Assert(MyReplicationSlot == NULL);
566 
567  ReplicationSlotAcquire(name, nowait);
568 
569  ReplicationSlotDropAcquired();
570 }
571 
572 /*
573  * Permanently drop the currently acquired replication slot.
574  */
575 static void
576 ReplicationSlotDropAcquired(void)
577 {
578  ReplicationSlot *slot = MyReplicationSlot;
579 
580  Assert(MyReplicationSlot != NULL);
581 
582  /* slot isn't acquired anymore */
583  MyReplicationSlot = NULL;
584 
585  ReplicationSlotDropPtr(slot);
586 }
587 
588 /*
589  * Permanently drop the replication slot which will be released by the point
590  * this function returns.
591  */
592 static void
593 ReplicationSlotDropPtr(ReplicationSlot *slot)
594 {
595  char path[MAXPGPATH];
596  char tmppath[MAXPGPATH];
597 
598  /*
599  * If some other backend ran this code concurrently with us, we might try
600  * to delete a slot with a certain name while someone else was trying to
601  * create a slot with the same name.
602  */
603  LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
604 
605  /* Generate pathnames. */
606  sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
607  sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
608 
609  /*
610  * Rename the slot directory on disk, so that we'll no longer recognize
611  * this as a valid slot. Note that if this fails, we've got to mark the
612  * slot inactive before bailing out. If we're dropping an ephemeral or a
613  * temporary slot, we better never fail hard as the caller won't expect
614  * the slot to survive and this might get called during error handling.
615  */
616  if (rename(path, tmppath) == 0)
617  {
618  /*
619  * We need to fsync() the directory we just renamed and its parent to
620  * make sure that our changes are on disk in a crash-safe fashion. If
621  * fsync() fails, we can't be sure whether the changes are on disk or
622  * not. For now, we handle that by panicking;
623  * StartupReplicationSlots() will try to straighten it out after
624  * restart.
625  */
626  START_CRIT_SECTION();
627  fsync_fname(tmppath, true);
628  fsync_fname("pg_replslot", true);
629  END_CRIT_SECTION();
630  }
631  else
632  {
633  bool fail_softly = slot->data.persistency != RS_PERSISTENT;
634 
635  SpinLockAcquire(&slot->mutex);
636  slot->active_pid = 0;
637  SpinLockRelease(&slot->mutex);
638 
639  /* wake up anyone waiting on this slot */
640  ConditionVariableBroadcast(&slot->active_cv);
641 
642  ereport(fail_softly ? WARNING : ERROR,
643  (errcode_for_file_access(),
644  errmsg("could not rename file \"%s\" to \"%s\": %m",
645  path, tmppath)));
646  }
647 
648  /*
649  * The slot is definitely gone. Lock out concurrent scans of the array
650  * long enough to kill it. It's OK to clear the active PID here without
651  * grabbing the mutex because nobody else can be scanning the array here,
652  * and nobody can be attached to this slot and thus access it without
653  * scanning the array.
654  *
655  * Also wake up processes waiting for it.
656  */
657  LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
658  slot->active_pid = 0;
659  slot->in_use = false;
660  LWLockRelease(ReplicationSlotControlLock);
661  ConditionVariableBroadcast(&slot->active_cv);
662 
663  /*
664  * Slot is dead and doesn't prevent resource removal anymore, recompute
665  * limits.
666  */
667  ReplicationSlotsComputeRequiredXmin(false);
668  ReplicationSlotsComputeRequiredLSN();
669 
670  /*
671  * If removing the directory fails, the worst thing that will happen is
672  * that the user won't be able to create a new slot with the same name
673  * until the next server restart. We warn about it, but that's all.
674  */
675  if (!rmtree(tmppath, true))
676  ereport(WARNING,
677  (errmsg("could not remove directory \"%s\"", tmppath)));
678 
679  /*
680  * Send a message to drop the replication slot to the stats collector.
681  * Since there is no guarantee of the order of message transfer on a UDP
682  * connection, it's possible that a message for creating a new slot
683  * reaches before a message for removing the old slot. We send the drop
684  * and create messages while holding ReplicationSlotAllocationLock to
685  * reduce that possibility. If the messages reached in reverse, we would
686  * lose one statistics update message. But the next update message will
687  * create the statistics for the replication slot.
688  *
689  * XXX If the messages for creating and dropping a slot of the same name
690  * get lost and the create happens before (auto)vacuum cleans up the dead
691  * slot, the stats will be accumulated into the old slot. One can imagine
692  * having OIDs for each slot to avoid the accumulation of stats but that
693  * doesn't seem worth doing as in practice this won't happen frequently.
694  */
695  if (SlotIsLogical(slot))
696  pgstat_report_replslot_drop(NameStr(slot->data.name));
697 
698  /*
699  * We release this at the very end, so that nobody starts trying to create
700  * a slot while we're still cleaning up the detritus of the old one.
701  */
702  LWLockRelease(ReplicationSlotAllocationLock);
703 }
704 
705 /*
706  * Serialize the currently acquired slot's state from memory to disk, thereby
707  * guaranteeing the current state will survive a crash.
708  */
709 void
710 ReplicationSlotSave(void)
711 {
712  char path[MAXPGPATH];
713 
714  Assert(MyReplicationSlot != NULL);
715 
716  sprintf(path, "pg_replslot/%s", NameStr(MyReplicationSlot->data.name));
717  SaveSlotToPath(MyReplicationSlot, path, ERROR);
718 }
719 
720 /*
721  * Signal that it would be useful if the currently acquired slot would be
722  * flushed out to disk.
723  *
724  * Note that the actual flush to disk can be delayed for a long time; if
725  * required for correctness, explicitly do a ReplicationSlotSave().
726  */
727 void
728 ReplicationSlotMarkDirty(void)
729 {
730  ReplicationSlot *slot = MyReplicationSlot;
731 
732  Assert(MyReplicationSlot != NULL);
733 
734  SpinLockAcquire(&slot->mutex);
735  MyReplicationSlot->just_dirtied = true;
736  MyReplicationSlot->dirty = true;
737  SpinLockRelease(&slot->mutex);
738 }
739 
740 /*
741  * Convert a slot that's marked as RS_EPHEMERAL to a RS_PERSISTENT slot,
742  * guaranteeing it will be there after an eventual crash.
743  */
744 void
745 ReplicationSlotPersist(void)
746 {
747  ReplicationSlot *slot = MyReplicationSlot;
748 
749  Assert(slot != NULL);
750  Assert(slot->data.persistency != RS_PERSISTENT);
751 
752  SpinLockAcquire(&slot->mutex);
753  slot->data.persistency = RS_PERSISTENT;
754  SpinLockRelease(&slot->mutex);
755 
756  ReplicationSlotMarkDirty();
757  ReplicationSlotSave();
758 }
759 
760 /*
761  * Compute the oldest xmin across all slots and store it in the ProcArray.
762  *
763  * If already_locked is true, ProcArrayLock has already been acquired
764  * exclusively.
765  */
766 void
767 ReplicationSlotsComputeRequiredXmin(bool already_locked)
768 {
769  int i;
770  TransactionId agg_xmin = InvalidTransactionId;
771  TransactionId agg_catalog_xmin = InvalidTransactionId;
772 
773  Assert(ReplicationSlotCtl != NULL);
774 
775  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
776 
777  for (i = 0; i < max_replication_slots; i++)
778  {
779  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
780  TransactionId effective_xmin;
781  TransactionId effective_catalog_xmin;
782 
783  if (!s->in_use)
784  continue;
785 
786  SpinLockAcquire(&s->mutex);
787  effective_xmin = s->effective_xmin;
788  effective_catalog_xmin = s->effective_catalog_xmin;
789  SpinLockRelease(&s->mutex);
790 
791  /* check the data xmin */
792  if (TransactionIdIsValid(effective_xmin) &&
793  (!TransactionIdIsValid(agg_xmin) ||
794  TransactionIdPrecedes(effective_xmin, agg_xmin)))
795  agg_xmin = effective_xmin;
796 
797  /* check the catalog xmin */
798  if (TransactionIdIsValid(effective_catalog_xmin) &&
799  (!TransactionIdIsValid(agg_catalog_xmin) ||
800  TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin)))
801  agg_catalog_xmin = effective_catalog_xmin;
802  }
803 
804  LWLockRelease(ReplicationSlotControlLock);
805 
806  ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked);
807 }
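/*
 * Editorial example with made-up values: if three in-use slots report
 * effective_xmin {740, invalid, 725} and effective_catalog_xmin
 * {invalid, 700, 710}, the loop above ends with agg_xmin = 725 and
 * agg_catalog_xmin = 700 -- invalid values are ignored, otherwise the
 * oldest value (per TransactionIdPrecedes) wins before being handed to
 * ProcArraySetReplicationSlotXmin().
 */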
808 
809 /*
810  * Compute the oldest restart LSN across all slots and inform xlog module.
811  *
812  * Note: while max_slot_wal_keep_size is theoretically relevant for this
813  * purpose, we don't try to account for that, because this module doesn't
814  * know what to compare against.
815  */
816 void
817 ReplicationSlotsComputeRequiredLSN(void)
818 {
819  int i;
820  XLogRecPtr min_required = InvalidXLogRecPtr;
821 
822  Assert(ReplicationSlotCtl != NULL);
823 
824  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
825  for (i = 0; i < max_replication_slots; i++)
826  {
827  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
828  XLogRecPtr restart_lsn;
829 
830  if (!s->in_use)
831  continue;
832 
833  SpinLockAcquire(&s->mutex);
834  restart_lsn = s->data.restart_lsn;
835  SpinLockRelease(&s->mutex);
836 
837  if (restart_lsn != InvalidXLogRecPtr &&
838  (min_required == InvalidXLogRecPtr ||
839  restart_lsn < min_required))
840  min_required = restart_lsn;
841  }
842  LWLockRelease(ReplicationSlotControlLock);
843 
844  XLogSetReplicationSlotMinimumLSN(min_required);
845 }
846 
847 /*
848  * Compute the oldest WAL LSN required by *logical* decoding slots.
849  *
850  * Returns InvalidXLogRecPtr if logical decoding is disabled or no logical
851  * slots exist.
852  *
853  * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it
854  * ignores physical replication slots.
855  *
856  * The results aren't required frequently, so we don't maintain a precomputed
857  * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin().
858  */
859 XLogRecPtr
860 ReplicationSlotsComputeLogicalRestartLSN(void)
861 {
862  XLogRecPtr result = InvalidXLogRecPtr;
863  int i;
864 
865  if (max_replication_slots <= 0)
866  return InvalidXLogRecPtr;
867 
868  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
869 
870  for (i = 0; i < max_replication_slots; i++)
871  {
872  ReplicationSlot *s;
873  XLogRecPtr restart_lsn;
874 
875  s = &ReplicationSlotCtl->replication_slots[i];
876 
877  /* cannot change while ReplicationSlotCtlLock is held */
878  if (!s->in_use)
879  continue;
880 
881  /* we're only interested in logical slots */
882  if (!SlotIsLogical(s))
883  continue;
884 
885  /* read once, it's ok if it increases while we're checking */
886  SpinLockAcquire(&s->mutex);
887  restart_lsn = s->data.restart_lsn;
888  SpinLockRelease(&s->mutex);
889 
890  if (restart_lsn == InvalidXLogRecPtr)
891  continue;
892 
893  if (result == InvalidXLogRecPtr ||
894  restart_lsn < result)
895  result = restart_lsn;
896  }
897 
898  LWLockRelease(ReplicationSlotControlLock);
899 
900  return result;
901 }
902 
903 /*
904  * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the
905  * passed database oid.
906  *
907  * Returns true if there are any slots referencing the database. *nslots will
908  * be set to the absolute number of slots in the database, *nactive to ones
909  * currently active.
910  */
911 bool
912 ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
913 {
914  int i;
915 
916  *nslots = *nactive = 0;
917 
918  if (max_replication_slots <= 0)
919  return false;
920 
921  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
922  for (i = 0; i < max_replication_slots; i++)
923  {
924  ReplicationSlot *s;
925 
926  s = &ReplicationSlotCtl->replication_slots[i];
927 
928  /* cannot change while ReplicationSlotCtlLock is held */
929  if (!s->in_use)
930  continue;
931 
932  /* only logical slots are database specific, skip */
933  if (!SlotIsLogical(s))
934  continue;
935 
936  /* not our database, skip */
937  if (s->data.database != dboid)
938  continue;
939 
940  /* count slots with spinlock held */
941  SpinLockAcquire(&s->mutex);
942  (*nslots)++;
943  if (s->active_pid != 0)
944  (*nactive)++;
945  SpinLockRelease(&s->mutex);
946  }
947  LWLockRelease(ReplicationSlotControlLock);
948 
949  if (*nslots > 0)
950  return true;
951  return false;
952 }
953 
954 /*
955  * ReplicationSlotsDropDBSlots -- Drop all db-specific slots relating to the
956  * passed database oid. The caller should hold an exclusive lock on the
957  * pg_database oid for the database to prevent creation of new slots on the db
958  * or replay from existing slots.
959  *
960  * Another session that concurrently acquires an existing slot on the target DB
961  * (most likely to drop it) may cause this function to ERROR. If that happens
962  * it may have dropped some but not all slots.
963  *
964  * This routine isn't as efficient as it could be - but we don't drop
965  * databases often, especially databases with lots of slots.
966  */
967 void
968 ReplicationSlotsDropDBSlots(Oid dboid)
969 {
970  int i;
971 
972  if (max_replication_slots <= 0)
973  return;
974 
975 restart:
976  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
977  for (i = 0; i < max_replication_slots; i++)
978  {
979  ReplicationSlot *s;
980  char *slotname;
981  int active_pid;
982 
983  s = &ReplicationSlotCtl->replication_slots[i];
984 
985  /* cannot change while ReplicationSlotCtlLock is held */
986  if (!s->in_use)
987  continue;
988 
989  /* only logical slots are database specific, skip */
990  if (!SlotIsLogical(s))
991  continue;
992 
993  /* not our database, skip */
994  if (s->data.database != dboid)
995  continue;
996 
997  /* acquire slot, so ReplicationSlotDropAcquired can be reused */
998  SpinLockAcquire(&s->mutex);
999  /* can't change while ReplicationSlotControlLock is held */
1000  slotname = NameStr(s->data.name);
1001  active_pid = s->active_pid;
1002  if (active_pid == 0)
1003  {
1004  MyReplicationSlot = s;
1005  s->active_pid = MyProcPid;
1006  }
1007  SpinLockRelease(&s->mutex);
1008 
1009  /*
1010  * Even though we hold an exclusive lock on the database object a
1011  * logical slot for that DB can still be active, e.g. if it's
1012  * concurrently being dropped by a backend connected to another DB.
1013  *
1014  * That's fairly unlikely in practice, so we'll just bail out.
1015  */
1016  if (active_pid)
1017  ereport(ERROR,
1018  (errcode(ERRCODE_OBJECT_IN_USE),
1019  errmsg("replication slot \"%s\" is active for PID %d",
1020  slotname, active_pid)));
1021 
1022  /*
1023  * To avoid duplicating ReplicationSlotDropAcquired() and to avoid
1024  * holding ReplicationSlotControlLock over filesystem operations,
1025  * release ReplicationSlotControlLock and use
1026  * ReplicationSlotDropAcquired.
1027  *
1028  * As that means the set of slots could change, restart scan from the
1029  * beginning each time we release the lock.
1030  */
1031  LWLockRelease(ReplicationSlotControlLock);
1032  ReplicationSlotDropAcquired();
1033  goto restart;
1034  }
1035  LWLockRelease(ReplicationSlotControlLock);
1036 }
1037 
1038 
1039 /*
1040  * Check whether the server's configuration supports using replication
1041  * slots.
1042  */
1043 void
1044 CheckSlotRequirements(void)
1045 {
1046  /*
1047  * NB: Adding a new requirement likely means that RestoreSlotFromDisk()
1048  * needs the same check.
1049  */
1050 
1051  if (max_replication_slots == 0)
1052  ereport(ERROR,
1053  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1054  errmsg("replication slots can only be used if max_replication_slots > 0")));
1055 
1056  if (wal_level < WAL_LEVEL_REPLICA)
1057  ereport(ERROR,
1058  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1059  errmsg("replication slots can only be used if wal_level >= replica")));
1060 }
1061 
1062 /*
1063  * Check whether the user has privilege to use replication slots.
1064  */
1065 void
1066 CheckSlotPermissions(void)
1067 {
1068  if (!superuser() && !has_rolreplication(GetUserId()))
1069  ereport(ERROR,
1070  (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
1071  errmsg("must be superuser or replication role to use replication slots")));
1072 }
1073 
1074 /*
1075  * Reserve WAL for the currently active slot.
1076  *
1077  * Compute and set restart_lsn in a manner that's appropriate for the type of
1078  * the slot and concurrency safe.
1079  */
1080 void
1081 ReplicationSlotReserveWal(void)
1082 {
1083  ReplicationSlot *slot = MyReplicationSlot;
1084 
1085  Assert(slot != NULL);
1086  Assert(slot->data.restart_lsn == InvalidXLogRecPtr);
1087 
1088  /*
1089  * The replication slot mechanism is used to prevent removal of required
1090  * WAL. As there is no interlock between this routine and checkpoints, WAL
1091  * segments could concurrently be removed when a now stale return value of
1092  * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that
1093  * this happens we'll just retry.
1094  */
1095  while (true)
1096  {
1097  XLogSegNo segno;
1098  XLogRecPtr restart_lsn;
1099 
1100  /*
1101  * For logical slots log a standby snapshot and start logical decoding
1102  * at exactly that position. That allows the slot to start up more
1103  * quickly.
1104  *
1105  * That's not needed (or indeed helpful) for physical slots as they'll
1106  * start replay at the last logged checkpoint anyway. Instead return
1107  * the location of the last redo LSN. While that slightly increases
1108  * the chance that we have to retry, it's where a base backup has to
1109  * start replay at.
1110  */
1111  if (!RecoveryInProgress() && SlotIsLogical(slot))
1112  {
1113  XLogRecPtr flushptr;
1114 
1115  /* start at current insert position */
1116  restart_lsn = GetXLogInsertRecPtr();
1117  SpinLockAcquire(&slot->mutex);
1118  slot->data.restart_lsn = restart_lsn;
1119  SpinLockRelease(&slot->mutex);
1120 
1121  /* make sure we have enough information to start */
1122  flushptr = LogStandbySnapshot();
1123 
1124  /* and make sure it's fsynced to disk */
1125  XLogFlush(flushptr);
1126  }
1127  else
1128  {
1129  restart_lsn = GetRedoRecPtr();
1130  SpinLockAcquire(&slot->mutex);
1131  slot->data.restart_lsn = restart_lsn;
1132  SpinLockRelease(&slot->mutex);
1133  }
1134 
1135  /* prevent WAL removal as fast as possible */
1136  ReplicationSlotsComputeRequiredLSN();
1137 
1138  /*
1139  * If all required WAL is still there, great, otherwise retry. The
1140  * slot should prevent further removal of WAL, unless there's a
1141  * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
1142  * the new restart_lsn above, so normally we should never need to loop
1143  * more than twice.
1144  */
1145  XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
1146  if (XLogGetLastRemovedSegno() < segno)
1147  break;
1148  }
1149 }
1150 
1151 /*
1152  * Helper for InvalidateObsoleteReplicationSlots -- acquires the given slot
1155  * and marks it invalid, if necessary and possible.
1154  *
1155  * Returns whether ReplicationSlotControlLock was released in the interim (and
1156  * in that case we're not holding the lock at return, otherwise we are).
1157  *
1158  * Sets *invalidated true if the slot was invalidated. (Untouched otherwise.)
1159  *
1160  * This is inherently racy, because we release the LWLock
1161  * for syscalls, so caller must restart if we return true.
1162  */
1163 static bool
1164 InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
1165  bool *invalidated)
1166 {
1167  int last_signaled_pid = 0;
1168  bool released_lock = false;
1169 
1170  for (;;)
1171  {
1172  XLogRecPtr restart_lsn;
1173  NameData slotname;
1174  int active_pid = 0;
1175 
1176  Assert(LWLockHeldByMeInMode(ReplicationSlotControlLock, LW_SHARED));
1177 
1178  if (!s->in_use)
1179  {
1180  if (released_lock)
1181  LWLockRelease(ReplicationSlotControlLock);
1182  break;
1183  }
1184 
1185  /*
1186  * Check if the slot needs to be invalidated. If it needs to be
1187  * invalidated, and is not currently acquired, acquire it and mark it
1188  * as having been invalidated. We do this with the spinlock held to
1189  * avoid race conditions -- for example the restart_lsn could move
1190  * forward, or the slot could be dropped.
1191  */
1192  SpinLockAcquire(&s->mutex);
1193 
1194  restart_lsn = s->data.restart_lsn;
1195 
1196  /*
1197  * If the slot is already invalid or is fresh enough, we don't need to
1198  * do anything.
1199  */
1200  if (XLogRecPtrIsInvalid(restart_lsn) || restart_lsn >= oldestLSN)
1201  {
1202  SpinLockRelease(&s->mutex);
1203  if (released_lock)
1204  LWLockRelease(ReplicationSlotControlLock);
1205  break;
1206  }
1207 
1208  slotname = s->data.name;
1209  active_pid = s->active_pid;
1210 
1211  /*
1212  * If the slot can be acquired, do so and mark it invalidated
1213  * immediately. Otherwise we'll signal the owning process, below, and
1214  * retry.
1215  */
1216  if (active_pid == 0)
1217  {
1218  MyReplicationSlot = s;
1219  s->active_pid = MyProcPid;
1220  s->data.invalidated_at = restart_lsn;
1221  s->data.restart_lsn = InvalidXLogRecPtr;
1222 
1223  /* Let caller know */
1224  *invalidated = true;
1225  }
1226 
1227  SpinLockRelease(&s->mutex);
1228 
1229  if (active_pid != 0)
1230  {
1231  /*
1232  * Prepare the sleep on the slot's condition variable before
1233  * releasing the lock, to close a possible race condition if the
1234  * slot is released before the sleep below.
1235  */
1236  ConditionVariablePrepareToSleep(&s->active_cv);
1237 
1238  LWLockRelease(ReplicationSlotControlLock);
1239  released_lock = true;
1240 
1241  /*
1242  * Signal to terminate the process that owns the slot, if we
1243  * haven't already signalled it. (Avoidance of repeated
1244  * signalling is the only reason for there to be a loop in this
1245  * routine; otherwise we could rely on caller's restart loop.)
1246  *
1247  * There is the race condition that other process may own the slot
1248  * after its current owner process is terminated and before this
1249  * process owns it. To handle that, we signal only if the PID of
1250  * the owning process has changed from the previous time. (This
1251  * logic assumes that the same PID is not reused very quickly.)
1252  */
1253  if (last_signaled_pid != active_pid)
1254  {
1255  ereport(LOG,
1256  (errmsg("terminating process %d to release replication slot \"%s\"",
1257  active_pid, NameStr(slotname))));
1258 
1259  (void) kill(active_pid, SIGTERM);
1260  last_signaled_pid = active_pid;
1261  }
1262 
1263  /* Wait until the slot is released. */
1264  ConditionVariableSleep(&s->active_cv,
1265  WAIT_EVENT_REPLICATION_SLOT_DROP);
1266 
1267  /*
1268  * Re-acquire lock and start over; we expect to invalidate the
1269  * slot next time (unless another process acquires the slot in the
1270  * meantime).
1271  */
1272  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1273  continue;
1274  }
1275  else
1276  {
1277  /*
1278  * We hold the slot now and have already invalidated it; flush it
1279  * to ensure that state persists.
1280  *
1281  * Don't want to hold ReplicationSlotControlLock across file
1282  * system operations, so release it now but be sure to tell caller
1283  * to restart from scratch.
1284  */
1285  LWLockRelease(ReplicationSlotControlLock);
1286  released_lock = true;
1287 
1288  /* Make sure the invalidated state persists across server restart */
1289  ReplicationSlotMarkDirty();
1290  ReplicationSlotSave();
1291  ReplicationSlotRelease();
1292 
1293  ereport(LOG,
1294  (errmsg("invalidating slot \"%s\" because its restart_lsn %X/%X exceeds max_slot_wal_keep_size",
1295  NameStr(slotname),
1296  LSN_FORMAT_ARGS(restart_lsn))));
1297 
1298  /* done with this slot for now */
1299  break;
1300  }
1301  }
1302 
1303  Assert(released_lock == !LWLockHeldByMe(ReplicationSlotControlLock));
1304 
1305  return released_lock;
1306 }
1307 
1308 /*
1309  * Mark any slot that points to an LSN older than the given segment
1310  * as invalid; it requires WAL that's about to be removed.
1311  *
1312  * Returns true if any slot was invalidated.
1313  *
1314  * NB - this runs as part of checkpoint, so avoid raising errors if possible.
1315  */
1316 bool
1317 InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
1318 {
1319  XLogRecPtr oldestLSN;
1320  bool invalidated = false;
1321 
1322  XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
1323 
1324 restart:
1325  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1326  for (int i = 0; i < max_replication_slots; i++)
1327  {
1328  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1329 
1330  if (!s->in_use)
1331  continue;
1332 
1333  if (InvalidatePossiblyObsoleteSlot(s, oldestLSN, &invalidated))
1334  {
1335  /* if the lock was released, start from scratch */
1336  goto restart;
1337  }
1338  }
1339  LWLockRelease(ReplicationSlotControlLock);
1340 
1341  /*
1342  * If any slots have been invalidated, recalculate the resource limits.
1343  */
1344  if (invalidated)
1345  {
1346  ReplicationSlotsComputeRequiredXmin(false);
1347  ReplicationSlotsComputeRequiredLSN();
1348  }
1349 
1350  return invalidated;
1351 }
1352 
1353 /*
1354  * Flush all replication slots to disk.
1355  *
1356  * This needn't actually be part of a checkpoint, but it's a convenient
1357  * location.
1358  */
1359 void
1360 CheckPointReplicationSlots(void)
1361 {
1362  int i;
1363 
1364  elog(DEBUG1, "performing replication slot checkpoint");
1365 
1366  /*
1367  * Prevent any slot from being created/dropped while we're active. As we
1368  * explicitly do *not* want to block iterating over replication_slots or
1369  * acquiring a slot we cannot take the control lock - but that's OK,
1370  * because holding ReplicationSlotAllocationLock is strictly stronger, and
1371  * enough to guarantee that nobody can change the in_use bits on us.
1372  */
1373  LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED);
1374 
1375  for (i = 0; i < max_replication_slots; i++)
1376  {
1377  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1378  char path[MAXPGPATH];
1379 
1380  if (!s->in_use)
1381  continue;
1382 
1383  /* save the slot to disk, locking is handled in SaveSlotToPath() */
1384  sprintf(path, "pg_replslot/%s", NameStr(s->data.name));
1385  SaveSlotToPath(s, path, LOG);
1386  }
1387  LWLockRelease(ReplicationSlotAllocationLock);
1388 }
1389 
1390 /*
1391  * Load all replication slots from disk into memory at server startup. This
1392  * needs to be run before we start crash recovery.
1393  */
1394 void
1395 StartupReplicationSlots(void)
1396 {
1397  DIR *replication_dir;
1398  struct dirent *replication_de;
1399 
1400  elog(DEBUG1, "starting up replication slots");
1401 
1402  /* restore all slots by iterating over all on-disk entries */
1403  replication_dir = AllocateDir("pg_replslot");
1404  while ((replication_de = ReadDir(replication_dir, "pg_replslot")) != NULL)
1405  {
1406  struct stat statbuf;
1407  char path[MAXPGPATH + 12];
1408 
1409  if (strcmp(replication_de->d_name, ".") == 0 ||
1410  strcmp(replication_de->d_name, "..") == 0)
1411  continue;
1412 
1413  snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name);
1414 
1415  /* we're only creating directories here, skip if it's not ours */
1416  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
1417  continue;
1418 
1419  /* we crashed while a slot was being setup or deleted, clean up */
1420  if (pg_str_endswith(replication_de->d_name, ".tmp"))
1421  {
1422  if (!rmtree(path, true))
1423  {
1424  ereport(WARNING,
1425  (errmsg("could not remove directory \"%s\"",
1426  path)));
1427  continue;
1428  }
1429  fsync_fname("pg_replslot", true);
1430  continue;
1431  }
1432 
1433  /* looks like a slot in a normal state, restore */
1434  RestoreSlotFromDisk(replication_de->d_name);
1435  }
1436  FreeDir(replication_dir);
1437 
1438  /* currently no slots exist, we're done. */
1439  if (max_replication_slots <= 0)
1440  return;
1441 
1442  /* Now that we have recovered all the data, compute replication xmin */
1443  ReplicationSlotsComputeRequiredXmin(false);
1444  ReplicationSlotsComputeRequiredLSN();
1445 }
1446 
1447 /* ----
1448  * Manipulation of on-disk state of replication slots
1449  *
1450  * NB: none of the routines below should take any notice whether a slot is the
1451  * current one or not, that's all handled a layer above.
1452  * ----
1453  */
1454 static void
1455 CreateSlotOnDisk(ReplicationSlot *slot)
1456 {
1457  char tmppath[MAXPGPATH];
1458  char path[MAXPGPATH];
1459  struct stat st;
1460 
1461  /*
1462  * No need to take out the io_in_progress_lock, nobody else can see this
1463  * slot yet, so nobody else will write. We're reusing SaveSlotToPath which
1464  * takes out the lock, if we'd take the lock here, we'd deadlock.
1465  */
1466 
1467  sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
1468  sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
1469 
1470  /*
1471  * It's just barely possible that some previous effort to create or drop a
1472  * slot with this name left a temp directory lying around. If that seems
1473  * to be the case, try to remove it. If the rmtree() fails, we'll error
1474  * out at the MakePGDirectory() below, so we don't bother checking
1475  * success.
1476  */
1477  if (stat(tmppath, &st) == 0 && S_ISDIR(st.st_mode))
1478  rmtree(tmppath, true);
1479 
1480  /* Create and fsync the temporary slot directory. */
1481  if (MakePGDirectory(tmppath) < 0)
1482  ereport(ERROR,
1483  (errcode_for_file_access(),
1484  errmsg("could not create directory \"%s\": %m",
1485  tmppath)));
1486  fsync_fname(tmppath, true);
1487 
1488  /* Write the actual state file. */
1489  slot->dirty = true; /* signal that we really need to write */
1490  SaveSlotToPath(slot, tmppath, ERROR);
1491 
1492  /* Rename the directory into place. */
1493  if (rename(tmppath, path) != 0)
1494  ereport(ERROR,
1495  (errcode_for_file_access(),
1496  errmsg("could not rename file \"%s\" to \"%s\": %m",
1497  tmppath, path)));
1498 
1499  /*
1500  * If we'd now fail - really unlikely - we wouldn't know whether this slot
1501  * would persist after an OS crash or not - so, force a restart. The
1502  * restart would try to fsync this again till it works.
1503  */
1504  START_CRIT_SECTION();
1505 
1506  fsync_fname(path, true);
1507  fsync_fname("pg_replslot", true);
1508 
1509  END_CRIT_SECTION();
1510 }
1511 
1512 /*
1513  * Shared functionality between saving and creating a replication slot.
1514  */
1515 static void
1516 SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
1517 {
1518  char tmppath[MAXPGPATH];
1519  char path[MAXPGPATH];
1520  int fd;
1521  ReplicationSlotOnDisk cp;
1522  bool was_dirty;
1523 
1524  /* first check whether there's something to write out */
1525  SpinLockAcquire(&slot->mutex);
1526  was_dirty = slot->dirty;
1527  slot->just_dirtied = false;
1528  SpinLockRelease(&slot->mutex);
1529 
1530  /* and don't do anything if there's nothing to write */
1531  if (!was_dirty)
1532  return;
1533 
1534  LWLockAcquire(&slot->io_in_progress_lock, LW_EXCLUSIVE);
1535 
1536  /* silence valgrind :( */
1537  memset(&cp, 0, sizeof(ReplicationSlotOnDisk));
1538 
1539  sprintf(tmppath, "%s/state.tmp", dir);
1540  sprintf(path, "%s/state", dir);
1541 
1542  fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
1543  if (fd < 0)
1544  {
1545  /*
1546  * If not an ERROR, then release the lock before returning. In case
1547  * of an ERROR, the error recovery path automatically releases the
1548  * lock, but no harm in explicitly releasing even in that case. Note
1549  * that LWLockRelease() could affect errno.
1550  */
1551  int save_errno = errno;
1552 
1553  LWLockRelease(&slot->io_in_progress_lock);
1554  errno = save_errno;
1555  ereport(elevel,
1556  (errcode_for_file_access(),
1557  errmsg("could not create file \"%s\": %m",
1558  tmppath)));
1559  return;
1560  }
1561 
1562  cp.magic = SLOT_MAGIC;
1563  INIT_CRC32C(cp.checksum);
1564  cp.version = SLOT_VERSION;
1565  cp.length = ReplicationSlotOnDiskV2Size;
1566 
1567  SpinLockAcquire(&slot->mutex);
1568 
1569  memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData));
1570 
1571  SpinLockRelease(&slot->mutex);
1572 
1573  COMP_CRC32C(cp.checksum,
1574  (char *) (&cp) + SnapBuildOnDiskNotChecksummedSize,
1575  SnapBuildOnDiskChecksummedSize);
1576  FIN_CRC32C(cp.checksum);
1577 
1578  errno = 0;
1579  pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE);
1580  if ((write(fd, &cp, sizeof(cp))) != sizeof(cp))
1581  {
1582  int save_errno = errno;
1583 
1584  pgstat_report_wait_end();
1585  CloseTransientFile(fd);
1586  LWLockRelease(&slot->io_in_progress_lock);
1587 
1588  /* if write didn't set errno, assume problem is no disk space */
1589  errno = save_errno ? save_errno : ENOSPC;
1590  ereport(elevel,
1591  (errcode_for_file_access(),
1592  errmsg("could not write to file \"%s\": %m",
1593  tmppath)));
1594  return;
1595  }
1596  pgstat_report_wait_end();
1597 
1598  /* fsync the temporary file */
1599  pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_SYNC);
1600  if (pg_fsync(fd) != 0)
1601  {
1602  int save_errno = errno;
1603 
1604  pgstat_report_wait_end();
1605  CloseTransientFile(fd);
1606  LWLockRelease(&slot->io_in_progress_lock);
1607  errno = save_errno;
1608  ereport(elevel,
1609  (errcode_for_file_access(),
1610  errmsg("could not fsync file \"%s\": %m",
1611  tmppath)));
1612  return;
1613  }
1614  pgstat_report_wait_end();
1615 
1616  if (CloseTransientFile(fd) != 0)
1617  {
1618  int save_errno = errno;
1619 
1620  LWLockRelease(&slot->io_in_progress_lock);
1621  errno = save_errno;
1622  ereport(elevel,
1623  (errcode_for_file_access(),
1624  errmsg("could not close file \"%s\": %m",
1625  tmppath)));
1626  return;
1627  }
1628 
1629  /* rename to permanent file, fsync file and directory */
1630  if (rename(tmppath, path) != 0)
1631  {
1632  int save_errno = errno;
1633 
1634  LWLockRelease(&slot->io_in_progress_lock);
1635  errno = save_errno;
1636  ereport(elevel,
1637  (errcode_for_file_access(),
1638  errmsg("could not rename file \"%s\" to \"%s\": %m",
1639  tmppath, path)));
1640  return;
1641  }
1642 
1643  /*
1644  * Check CreateSlotOnDisk() for the reasoning of using a critical section.
1645  */
1646  START_CRIT_SECTION();
1647 
1648  fsync_fname(path, false);
1649  fsync_fname(dir, true);
1650  fsync_fname("pg_replslot", true);
1651 
1652  END_CRIT_SECTION();
1653 
1654  /*
1655  * Successfully wrote, unset dirty bit, unless somebody dirtied again
1656  * already.
1657  */
1658  SpinLockAcquire(&slot->mutex);
1659  if (!slot->just_dirtied)
1660  slot->dirty = false;
1661  SpinLockRelease(&slot->mutex);
1662 
1663  LWLockRelease(&slot->io_in_progress_lock);
1664 }
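/*
 * Editorial summary (no additional code): SaveSlotToPath() makes the state
 * durable with the usual write-then-rename dance:
 *
 *     write()  pg_replslot/<slot>/state.tmp
 *     fsync()  state.tmp
 *     rename() state.tmp -> state
 *     fsync()  state, the slot directory, and pg_replslot
 *
 * so a crash at any point leaves either the old or the new state file in
 * place, never a torn one.
 */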
1665 
1666 /*
1667  * Load a single slot from disk into memory.
1668  */
1669 static void
1670 RestoreSlotFromDisk(const char *name)
1671 {
1672  ReplicationSlotOnDisk cp;
1673  int i;
1674  char slotdir[MAXPGPATH + 12];
1675  char path[MAXPGPATH + 22];
1676  int fd;
1677  bool restored = false;
1678  int readBytes;
1679  pg_crc32c checksum;
1680 
1681  /* no need to lock here, no concurrent access allowed yet */
1682 
1683  /* delete temp file if it exists */
1684  sprintf(slotdir, "pg_replslot/%s", name);
1685  sprintf(path, "%s/state.tmp", slotdir);
1686  if (unlink(path) < 0 && errno != ENOENT)
1687  ereport(PANIC,
1688  (errcode_for_file_access(),
1689  errmsg("could not remove file \"%s\": %m", path)));
1690 
1691  sprintf(path, "%s/state", slotdir);
1692 
1693  elog(DEBUG1, "restoring replication slot from \"%s\"", path);
1694 
1695  /* on some operating systems fsyncing a file requires O_RDWR */
1696  fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1697 
1698  /*
1699  * We do not need to handle this as we are rename()ing the directory into
1700  * place only after we fsync()ed the state file.
1701  */
1702  if (fd < 0)
1703  ereport(PANIC,
1704  (errcode_for_file_access(),
1705  errmsg("could not open file \"%s\": %m", path)));
1706 
1707  /*
1708  * Sync state file before we're reading from it. We might have crashed
1709  * while it wasn't synced yet and we shouldn't continue on that basis.
1710  */
1711  pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC);
1712  if (pg_fsync(fd) != 0)
1713  ereport(PANIC,
1714  (errcode_for_file_access(),
1715  errmsg("could not fsync file \"%s\": %m",
1716  path)));
1717  pgstat_report_wait_end();
1718 
1719  /* Also sync the parent directory */
1720  START_CRIT_SECTION();
1721  fsync_fname(slotdir, true);
1722  END_CRIT_SECTION();
1723 
1724  /* read part of statefile that's guaranteed to be version independent */
1725  pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ);
1726  readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize);
1727  pgstat_report_wait_end();
1728  if (readBytes != ReplicationSlotOnDiskConstantSize)
1729  {
1730  if (readBytes < 0)
1731  ereport(PANIC,
1732  (errcode_for_file_access(),
1733  errmsg("could not read file \"%s\": %m", path)));
1734  else
1735  ereport(PANIC,
1736  (errcode(ERRCODE_DATA_CORRUPTED),
1737  errmsg("could not read file \"%s\": read %d of %zu",
1738  path, readBytes,
1739  (Size) ReplicationSlotOnDiskConstantSize)));
1740  }
1741 
1742  /* verify magic */
1743  if (cp.magic != SLOT_MAGIC)
1744  ereport(PANIC,
1745  (errcode(ERRCODE_DATA_CORRUPTED),
1746  errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u",
1747  path, cp.magic, SLOT_MAGIC)));
1748 
1749  /* verify version */
1750  if (cp.version != SLOT_VERSION)
1751  ereport(PANIC,
1752  (errcode(ERRCODE_DATA_CORRUPTED),
1753  errmsg("replication slot file \"%s\" has unsupported version %u",
1754  path, cp.version)));
1755 
1756  /* boundary check on length */
1757  if (cp.length != ReplicationSlotOnDiskV2Size)
1758  ereport(PANIC,
1759  (errcode(ERRCODE_DATA_CORRUPTED),
1760  errmsg("replication slot file \"%s\" has corrupted length %u",
1761  path, cp.length)));
1762 
1763  /* Now that we know the size, read the entire file */
1764  pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ);
1765  readBytes = read(fd,
1766  (char *) &cp + ReplicationSlotOnDiskConstantSize,
1767  cp.length);
1768  pgstat_report_wait_end();
1769  if (readBytes != cp.length)
1770  {
1771  if (readBytes < 0)
1772  ereport(PANIC,
1773  (errcode_for_file_access(),
1774  errmsg("could not read file \"%s\": %m", path)));
1775  else
1776  ereport(PANIC,
1777  (errcode(ERRCODE_DATA_CORRUPTED),
1778  errmsg("could not read file \"%s\": read %d of %zu",
1779  path, readBytes, (Size) cp.length)));
1780  }
1781 
1782  if (CloseTransientFile(fd) != 0)
1783  ereport(PANIC,
1784  (errcode_for_file_access(),
1785  errmsg("could not close file \"%s\": %m", path)));
1786 
1787  /* now verify the CRC */
1788  INIT_CRC32C(checksum);
1789  COMP_CRC32C(checksum,
1790  (char *) &cp + SnapBuildOnDiskNotChecksummedSize,
1791  SnapBuildOnDiskChecksummedSize);
1792  FIN_CRC32C(checksum);
1793 
1794  if (!EQ_CRC32C(checksum, cp.checksum))
1795  ereport(PANIC,
1796  (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u",
1797  path, checksum, cp.checksum)));
1798 
1799  /*
1800  * If we crashed with an ephemeral slot active, don't restore but delete
1801  * it.
1802  */
1803  if (cp.slotdata.persistency == RS_EPHEMERAL)
1804  {
1805  if (!rmtree(slotdir, true))
1806  {
1807  ereport(WARNING,
1808  (errmsg("could not remove directory \"%s\"",
1809  slotdir)));
1810  }
1811  fsync_fname("pg_replslot", true);
1812  return;
1813  }
1814 
1815  /*
1816  * Verify that requirements for the specific slot type are met. That's
1817  * important because if these aren't met we're not guaranteed to retain
1818  * all the necessary resources for the slot.
1819  *
1820  * NB: We have to do so *after* the above checks for ephemeral slots,
1821  * because otherwise a slot that shouldn't exist anymore could prevent
1822  * restarts.
1823  *
1824  * NB: Changing the requirements here also requires adapting
1825  * CheckSlotRequirements() and CheckLogicalDecodingRequirements().
1826  */
1827  if (cp.slotdata.database != InvalidOid && wal_level < WAL_LEVEL_LOGICAL)
1828  ereport(FATAL,
1829  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1830  errmsg("logical replication slot \"%s\" exists, but wal_level < logical",
1831  NameStr(cp.slotdata.name)),
1832  errhint("Change wal_level to be logical or higher.")));
1833  else if (wal_level < WAL_LEVEL_REPLICA)
1834  ereport(FATAL,
1835  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1836  errmsg("physical replication slot \"%s\" exists, but wal_level < replica",
1837  NameStr(cp.slotdata.name)),
1838  errhint("Change wal_level to be replica or higher.")));
1839 
1840  /* nothing can be active yet, don't lock anything */
1841  for (i = 0; i < max_replication_slots; i++)
1842  {
1843  ReplicationSlot *slot;
1844 
1845  slot = &ReplicationSlotCtl->replication_slots[i];
1846 
1847  if (slot->in_use)
1848  continue;
1849 
1850  /* restore the entire set of persistent data */
1851  memcpy(&slot->data, &cp.slotdata,
1852  sizeof(ReplicationSlotPersistentData));
1853 
1854  /* initialize in memory state */
1855  slot->effective_xmin = cp.slotdata.xmin;
1856  slot->effective_catalog_xmin = cp.slotdata.catalog_xmin;
1857 
1858  slot->candidate_catalog_xmin = InvalidTransactionId;
1859  slot->candidate_xmin_lsn = InvalidXLogRecPtr;
1860  slot->candidate_restart_lsn = InvalidXLogRecPtr;
1861  slot->candidate_restart_valid = InvalidXLogRecPtr;
1862 
1863  slot->in_use = true;
1864  slot->active_pid = 0;
1865 
1866  restored = true;
1867  break;
1868  }
1869 
1870  if (!restored)
1871  ereport(FATAL,
1872  (errmsg("too many replication slots active before shutdown"),
1873  errhint("Increase max_replication_slots and try again.")));
1874 }