PostgreSQL Source Code  git master
slot.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * slot.c
4  * Replication slot management.
5  *
6  *
7  * Copyright (c) 2012-2021, PostgreSQL Global Development Group
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/replication/slot.c
12  *
13  * NOTES
14  *
15  * Replication slots are used to keep state about replication streams
16  * originating from this cluster. Their primary purpose is to prevent the
17  * premature removal of WAL or of old tuple versions in a manner that would
18  * interfere with replication; they are also useful for monitoring purposes.
19  * Slots need to be permanent (to allow restarts), crash-safe, and allocatable
20  * on standbys (to support cascading setups). The requirement that slots be
21  * usable on standbys precludes storing them in the system catalogs.
22  *
23  * Each replication slot gets its own directory inside the $PGDATA/pg_replslot
24  * directory. Inside that directory the state file will contain the slot's
25  * own data. Additional data can be stored alongside that file if required.
26  * While the server is running, the state data is also cached in memory for
27  * efficiency.
28  *
29  * ReplicationSlotAllocationLock must be taken in exclusive mode to allocate
30  * or free a slot. ReplicationSlotControlLock must be taken in shared mode
31  * to iterate over the slots, and in exclusive mode to change the in_use flag
32  * of a slot. The remaining data in each slot is protected by its mutex.
33  *
34  *-------------------------------------------------------------------------
35  */
36 
37 #include "postgres.h"
38 
39 #include <unistd.h>
40 #include <sys/stat.h>
41 
42 #include "access/transam.h"
43 #include "access/xlog_internal.h"
44 #include "common/string.h"
45 #include "miscadmin.h"
46 #include "pgstat.h"
47 #include "replication/slot.h"
48 #include "storage/fd.h"
49 #include "storage/proc.h"
50 #include "storage/procarray.h"
51 #include "utils/builtins.h"
52 
53 /*
54  * Replication slot on-disk data structure.
55  */
56 typedef struct ReplicationSlotOnDisk
57 {
58  /* first part of this struct needs to be version independent */
59 
60  /* data not covered by checksum */
63 
64  /* data covered by checksum */
67 
68  /*
69  * The actual data in the slot that follows can differ based on the above
70  * 'version'.
71  */
72 
75 
/*
 * Sizes of the individual regions of ReplicationSlotOnDisk.
 *
 * Every replacement list is fully parenthesized so that each macro behaves
 * as a single operand wherever it is used (e.g. "2 * FooSize" or
 * "len - FooSize"); without the parentheses the subtraction inside the
 * macro would combine with the surrounding operators.
 *
 * NOTE(review): the "SnapBuild" prefix on two of these names does not match
 * the struct they describe; the names are kept unchanged here because code
 * outside this block may reference them.
 */

/* size of version independent data */
#define ReplicationSlotOnDiskConstantSize \
	(offsetof(ReplicationSlotOnDisk, slotdata))
/* size of the part of the slot not covered by the checksum */
#define SnapBuildOnDiskNotChecksummedSize \
	(offsetof(ReplicationSlotOnDisk, version))
/* size of the part covered by the checksum */
#define SnapBuildOnDiskChecksummedSize \
	(sizeof(ReplicationSlotOnDisk) - SnapBuildOnDiskNotChecksummedSize)
/* size of the slot data that is version dependent */
#define ReplicationSlotOnDiskV2Size \
	(sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskConstantSize)
88 
89 #define SLOT_MAGIC 0x1051CA1 /* format identifier */
90 #define SLOT_VERSION 2 /* version for new files */
91 
92 /* Control array for replication slot management */
94 
95 /* My backend's replication slot in the shared memory array */
97 
98 /* GUCs */
99 int max_replication_slots = 0; /* the maximum number of replication
100  * slots */
101 
103  const char *name, SlotAcquireBehavior behavior);
104 static void ReplicationSlotDropAcquired(void);
105 static void ReplicationSlotDropPtr(ReplicationSlot *slot);
106 
107 /* internal persistency functions */
108 static void RestoreSlotFromDisk(const char *name);
109 static void CreateSlotOnDisk(ReplicationSlot *slot);
110 static void SaveSlotToPath(ReplicationSlot *slot, const char *path, int elevel);
111 
112 /*
113  * Report shared-memory space needed by ReplicationSlotsShmemInit.
114  */
115 Size
117 {
118  Size size = 0;
119 
120  if (max_replication_slots == 0)
121  return size;
122 
123  size = offsetof(ReplicationSlotCtlData, replication_slots);
124  size = add_size(size,
126 
127  return size;
128 }
129 
130 /*
131  * Allocate and initialize shared memory for replication slots.
132  */
133 void
135 {
136  bool found;
137 
138  if (max_replication_slots == 0)
139  return;
140 
141  ReplicationSlotCtl = (ReplicationSlotCtlData *)
142  ShmemInitStruct("ReplicationSlot Ctl", ReplicationSlotsShmemSize(),
143  &found);
144 
145  if (!found)
146  {
147  int i;
148 
149  /* First time through, so initialize */
150  MemSet(ReplicationSlotCtl, 0, ReplicationSlotsShmemSize());
151 
152  for (i = 0; i < max_replication_slots; i++)
153  {
154  ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[i];
155 
156  /* everything else is zeroed by the memset above */
157  SpinLockInit(&slot->mutex);
161  }
162  }
163 }
164 
165 /*
166  * Check whether the passed slot name is valid and report errors at elevel.
167  *
168  * Slot names may consist of [a-z0-9_]{1,NAMEDATALEN-1} which should allow
169  * the name to be used as a directory name on every supported OS.
170  *
171  * Returns whether the slot name is valid or not if elevel < ERROR.
172  */
173 bool
175 {
176  const char *cp;
177 
178  if (strlen(name) == 0)
179  {
180  ereport(elevel,
181  (errcode(ERRCODE_INVALID_NAME),
182  errmsg("replication slot name \"%s\" is too short",
183  name)));
184  return false;
185  }
186 
187  if (strlen(name) >= NAMEDATALEN)
188  {
189  ereport(elevel,
190  (errcode(ERRCODE_NAME_TOO_LONG),
191  errmsg("replication slot name \"%s\" is too long",
192  name)));
193  return false;
194  }
195 
196  for (cp = name; *cp; cp++)
197  {
198  if (!((*cp >= 'a' && *cp <= 'z')
199  || (*cp >= '0' && *cp <= '9')
200  || (*cp == '_')))
201  {
202  ereport(elevel,
203  (errcode(ERRCODE_INVALID_NAME),
204  errmsg("replication slot name \"%s\" contains invalid character",
205  name),
206  errhint("Replication slot names may only contain lower case letters, numbers, and the underscore character.")));
207  return false;
208  }
209  }
210  return true;
211 }
212 
213 /*
214  * Create a new replication slot and mark it as used by this backend.
215  *
216  * name: Name of the slot
217  * db_specific: logical decoding is db specific; if the slot is going to
218  * be used for that pass true, otherwise false.
219  * two_phase: Allows decoding of prepared transactions. We allow this option
220  * to be enabled only at the slot creation time. If we allow this option
221  * to be changed during decoding then it is quite possible that we skip
222  * prepare first time because this option was not enabled. Now next time
223  * during getting changes, if the two_phase option is enabled it can skip
224  * prepare because by that time start decoding point has been moved. So the
225  * user will only get commit prepared.
226  */
227 void
228 ReplicationSlotCreate(const char *name, bool db_specific,
229  ReplicationSlotPersistency persistency, bool two_phase)
230 {
231  ReplicationSlot *slot = NULL;
232  int i;
233 
234  Assert(MyReplicationSlot == NULL);
235 
237 
238  /*
239  * If some other backend ran this code concurrently with us, we'd likely
240  * both allocate the same slot, and that would be bad. We'd also be at
241  * risk of missing a name collision. Also, we don't want to try to create
242  * a new slot while somebody's busy cleaning up an old one, because we
243  * might both be monkeying with the same directory.
244  */
245  LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
246 
247  /*
248  * Check for name collision, and identify an allocatable slot. We need to
249  * hold ReplicationSlotControlLock in shared mode for this, so that nobody
250  * else can change the in_use flags while we're looking at them.
251  */
252  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
253  for (i = 0; i < max_replication_slots; i++)
254  {
255  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
256 
257  if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
258  ereport(ERROR,
260  errmsg("replication slot \"%s\" already exists", name)));
261  if (!s->in_use && slot == NULL)
262  slot = s;
263  }
264  LWLockRelease(ReplicationSlotControlLock);
265 
266  /* If all slots are in use, we're out of luck. */
267  if (slot == NULL)
268  ereport(ERROR,
269  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
270  errmsg("all replication slots are in use"),
271  errhint("Free one or increase max_replication_slots.")));
272 
273  /*
274  * Since this slot is not in use, nobody should be looking at any part of
275  * it other than the in_use field unless they're trying to allocate it.
276  * And since we hold ReplicationSlotAllocationLock, nobody except us can
277  * be doing that. So it's safe to initialize the slot.
278  */
279  Assert(!slot->in_use);
280  Assert(slot->active_pid == 0);
281 
282  /* first initialize persistent data */
283  memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData));
284  namestrcpy(&slot->data.name, name);
285  slot->data.database = db_specific ? MyDatabaseId : InvalidOid;
286  slot->data.persistency = persistency;
287  slot->data.two_phase = two_phase;
288 
289  /* and then data only present in shared memory */
290  slot->just_dirtied = false;
291  slot->dirty = false;
298 
299  /*
300  * Create the slot on disk. We haven't actually marked the slot allocated
301  * yet, so no special cleanup is required if this errors out.
302  */
303  CreateSlotOnDisk(slot);
304 
305  /*
306  * We need to briefly prevent any other backend from iterating over the
307  * slots while we flip the in_use flag. We also need to set the active
308  * flag while holding the ControlLock as otherwise a concurrent
309  * ReplicationSlotAcquire() could acquire the slot as well.
310  */
311  LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
312 
313  slot->in_use = true;
314 
315  /* We can now mark the slot active, and that makes it our slot. */
316  SpinLockAcquire(&slot->mutex);
317  Assert(slot->active_pid == 0);
318  slot->active_pid = MyProcPid;
319  SpinLockRelease(&slot->mutex);
320  MyReplicationSlot = slot;
321 
322  LWLockRelease(ReplicationSlotControlLock);
323 
324  /*
325  * Create statistics entry for the new logical slot. We don't collect any
326  * stats for physical slots, so no need to create an entry for the same.
327  * See ReplicationSlotDropPtr for why we need to do this before releasing
328  * ReplicationSlotAllocationLock.
329  */
330  if (SlotIsLogical(slot))
331  pgstat_report_replslot(NameStr(slot->data.name), 0, 0, 0, 0, 0, 0);
332 
333  /*
334  * Now that the slot has been marked as in_use and active, it's safe to
335  * let somebody else try to allocate a slot.
336  */
337  LWLockRelease(ReplicationSlotAllocationLock);
338 
339  /* Let everybody know we've modified this slot */
341 }
342 
343 /*
344  * Search for the named replication slot.
345  *
346  * Return the replication slot if found, otherwise NULL.
347  *
348  * The caller must hold ReplicationSlotControlLock in shared mode.
349  */
352 {
353  int i;
354  ReplicationSlot *slot = NULL;
355 
356  Assert(LWLockHeldByMeInMode(ReplicationSlotControlLock,
357  LW_SHARED));
358 
359  for (i = 0; i < max_replication_slots; i++)
360  {
361  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
362 
363  if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
364  {
365  slot = s;
366  break;
367  }
368  }
369 
370  return slot;
371 }
372 
373 /*
374  * Find a previously created slot and mark it as used by this process.
375  *
376  * The return value is only useful if behavior is SAB_Inquire, in which
377  * it's zero if we successfully acquired the slot, -1 if the slot no longer
378  * exists, or the PID of the owning process otherwise. If behavior is
379  * SAB_Error, then trying to acquire an owned slot is an error.
380  * If SAB_Block, we sleep until the slot is released by the owning process.
381  */
382 int
384 {
385  return ReplicationSlotAcquireInternal(NULL, name, behavior);
386 }
387 
388 /*
389  * Mark the specified slot as used by this process.
390  *
391  * Only one of slot and name can be specified.
392  * If slot == NULL, search for the slot with the given name.
393  *
394  * See comments about the return value in ReplicationSlotAcquire().
395  */
396 static int
398  SlotAcquireBehavior behavior)
399 {
400  ReplicationSlot *s;
401  int active_pid;
402 
403  AssertArg((slot == NULL) ^ (name == NULL));
404 
405 retry:
406  Assert(MyReplicationSlot == NULL);
407 
408  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
409 
410  /*
411  * Search for the slot with the specified name if the slot to acquire is
412  * not given. If the slot is not found, we either return -1 or error out.
413  */
414  s = slot ? slot : SearchNamedReplicationSlot(name);
415  if (s == NULL || !s->in_use)
416  {
417  LWLockRelease(ReplicationSlotControlLock);
418 
419  if (behavior == SAB_Inquire)
420  return -1;
421  ereport(ERROR,
422  (errcode(ERRCODE_UNDEFINED_OBJECT),
423  errmsg("replication slot \"%s\" does not exist",
424  name ? name : NameStr(slot->data.name))));
425  }
426 
427  /*
428  * This is the slot we want; check if it's active under some other
429  * process. In single user mode, we don't need this check.
430  */
431  if (IsUnderPostmaster)
432  {
433  /*
434  * Get ready to sleep on the slot in case it is active if SAB_Block.
435  * (We may end up not sleeping, but we don't want to do this while
436  * holding the spinlock.)
437  */
438  if (behavior == SAB_Block)
440 
441  SpinLockAcquire(&s->mutex);
442  if (s->active_pid == 0)
443  s->active_pid = MyProcPid;
444  active_pid = s->active_pid;
445  SpinLockRelease(&s->mutex);
446  }
447  else
448  active_pid = MyProcPid;
449  LWLockRelease(ReplicationSlotControlLock);
450 
451  /*
452  * If we found the slot but it's already active in another process, we
453  * either error out, return the PID of the owning process, or retry
454  * after a short wait, as caller specified.
455  */
456  if (active_pid != MyProcPid)
457  {
458  if (behavior == SAB_Error)
459  ereport(ERROR,
460  (errcode(ERRCODE_OBJECT_IN_USE),
461  errmsg("replication slot \"%s\" is active for PID %d",
462  NameStr(s->data.name), active_pid)));
463  else if (behavior == SAB_Inquire)
464  return active_pid;
465 
466  /* Wait here until we get signaled, and then restart */
470  goto retry;
471  }
472  else if (behavior == SAB_Block)
473  ConditionVariableCancelSleep(); /* no sleep needed after all */
474 
475  /* Let everybody know we've modified this slot */
477 
478  /* We made this slot active, so it's ours now. */
479  MyReplicationSlot = s;
480 
481  /* success */
482  return 0;
483 }
484 
485 /*
486  * Release the replication slot that this backend considers itself to own.
487  *
488  * This or another backend can re-acquire the slot later.
489  * Resources this slot requires will be preserved.
490  */
491 void
493 {
495 
496  Assert(slot != NULL && slot->active_pid != 0);
497 
498  if (slot->data.persistency == RS_EPHEMERAL)
499  {
500  /*
501  * Delete the slot. There is no !PANIC case where this is allowed to
502  * fail, all that may happen is an incomplete cleanup of the on-disk
503  * data.
504  */
506  }
507 
508  /*
509  * If slot needed to temporarily restrain both data and catalog xmin to
510  * create the catalog snapshot, remove that temporary constraint.
511  * Snapshots can only be exported while the initial snapshot is still
512  * acquired.
513  */
514  if (!TransactionIdIsValid(slot->data.xmin) &&
516  {
517  SpinLockAcquire(&slot->mutex);
519  SpinLockRelease(&slot->mutex);
521  }
522 
523  if (slot->data.persistency == RS_PERSISTENT)
524  {
525  /*
526  * Mark persistent slot inactive. We're not freeing it, just
527  * disconnecting, but wake up others that may be waiting for it.
528  */
529  SpinLockAcquire(&slot->mutex);
530  slot->active_pid = 0;
531  SpinLockRelease(&slot->mutex);
533  }
534 
535  MyReplicationSlot = NULL;
536 
537  /* might not have been set when we've been a plain slot */
538  LWLockAcquire(ProcArrayLock, LW_SHARED);
541  LWLockRelease(ProcArrayLock);
542 }
543 
544 /*
545  * Cleanup all temporary slots created in current session.
546  */
547 void
549 {
550  int i;
551 
552  Assert(MyReplicationSlot == NULL);
553 
554 restart:
555  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
556  for (i = 0; i < max_replication_slots; i++)
557  {
558  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
559 
560  if (!s->in_use)
561  continue;
562 
563  SpinLockAcquire(&s->mutex);
564  if (s->active_pid == MyProcPid)
565  {
567  SpinLockRelease(&s->mutex);
568  LWLockRelease(ReplicationSlotControlLock); /* avoid deadlock */
569 
571 
573  goto restart;
574  }
575  else
576  SpinLockRelease(&s->mutex);
577  }
578 
579  LWLockRelease(ReplicationSlotControlLock);
580 }
581 
582 /*
583  * Permanently drop replication slot identified by the passed in name.
584  */
585 void
586 ReplicationSlotDrop(const char *name, bool nowait)
587 {
588  Assert(MyReplicationSlot == NULL);
589 
590  (void) ReplicationSlotAcquire(name, nowait ? SAB_Error : SAB_Block);
591 
593 }
594 
595 /*
596  * Permanently drop the currently acquired replication slot.
597  */
598 static void
600 {
602 
603  Assert(MyReplicationSlot != NULL);
604 
605  /* slot isn't acquired anymore */
606  MyReplicationSlot = NULL;
607 
609 }
610 
611 /*
612  * Permanently drop the replication slot which will be released by the point
613  * this function returns.
614  */
615 static void
617 {
618  char path[MAXPGPATH];
619  char tmppath[MAXPGPATH];
620 
621  /*
622  * If some other backend ran this code concurrently with us, we might try
623  * to delete a slot with a certain name while someone else was trying to
624  * create a slot with the same name.
625  */
626  LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
627 
628  /* Generate pathnames. */
629  sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
630  sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
631 
632  /*
633  * Rename the slot directory on disk, so that we'll no longer recognize
634  * this as a valid slot. Note that if this fails, we've got to mark the
635  * slot inactive before bailing out. If we're dropping an ephemeral or a
636  * temporary slot, we better never fail hard as the caller won't expect
637  * the slot to survive and this might get called during error handling.
638  */
639  if (rename(path, tmppath) == 0)
640  {
641  /*
642  * We need to fsync() the directory we just renamed and its parent to
643  * make sure that our changes are on disk in a crash-safe fashion. If
644  * fsync() fails, we can't be sure whether the changes are on disk or
645  * not. For now, we handle that by panicking;
646  * StartupReplicationSlots() will try to straighten it out after
647  * restart.
648  */
650  fsync_fname(tmppath, true);
651  fsync_fname("pg_replslot", true);
653  }
654  else
655  {
656  bool fail_softly = slot->data.persistency != RS_PERSISTENT;
657 
658  SpinLockAcquire(&slot->mutex);
659  slot->active_pid = 0;
660  SpinLockRelease(&slot->mutex);
661 
662  /* wake up anyone waiting on this slot */
664 
665  ereport(fail_softly ? WARNING : ERROR,
667  errmsg("could not rename file \"%s\" to \"%s\": %m",
668  path, tmppath)));
669  }
670 
671  /*
672  * The slot is definitely gone. Lock out concurrent scans of the array
673  * long enough to kill it. It's OK to clear the active PID here without
674  * grabbing the mutex because nobody else can be scanning the array here,
675  * and nobody can be attached to this slot and thus access it without
676  * scanning the array.
677  *
678  * Also wake up processes waiting for it.
679  */
680  LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
681  slot->active_pid = 0;
682  slot->in_use = false;
683  LWLockRelease(ReplicationSlotControlLock);
685 
686  /*
687  * Slot is dead and doesn't prevent resource removal anymore, recompute
688  * limits.
689  */
692 
693  /*
694  * If removing the directory fails, the worst thing that will happen is
695  * that the user won't be able to create a new slot with the same name
696  * until the next server restart. We warn about it, but that's all.
697  */
698  if (!rmtree(tmppath, true))
700  (errmsg("could not remove directory \"%s\"", tmppath)));
701 
702  /*
703  * Send a message to drop the replication slot to the stats collector.
704  * Since there is no guarantee of the order of message transfer on a UDP
705  * connection, it's possible that a message for creating a new slot
706  * reaches before a message for removing the old slot. We send the drop
707  * and create messages while holding ReplicationSlotAllocationLock to
708  * reduce that possibility. If the messages reached in reverse, we would
709  * lose one statistics update message. But the next update message will
710  * create the statistics for the replication slot.
711  */
712  if (SlotIsLogical(slot))
714 
715  /*
716  * We release this at the very end, so that nobody starts trying to create
717  * a slot while we're still cleaning up the detritus of the old one.
718  */
719  LWLockRelease(ReplicationSlotAllocationLock);
720 }
721 
722 /*
723  * Serialize the currently acquired slot's state from memory to disk, thereby
724  * guaranteeing the current state will survive a crash.
725  */
726 void
728 {
729  char path[MAXPGPATH];
730 
731  Assert(MyReplicationSlot != NULL);
732 
733  sprintf(path, "pg_replslot/%s", NameStr(MyReplicationSlot->data.name));
734  SaveSlotToPath(MyReplicationSlot, path, ERROR);
735 }
736 
737 /*
738  * Signal that it would be useful if the currently acquired slot would be
739  * flushed out to disk.
740  *
741  * Note that the actual flush to disk can be delayed for a long time; if
742  * required for correctness, explicitly do a ReplicationSlotSave().
743  */
744 void
746 {
748 
749  Assert(MyReplicationSlot != NULL);
750 
751  SpinLockAcquire(&slot->mutex);
752  MyReplicationSlot->just_dirtied = true;
753  MyReplicationSlot->dirty = true;
754  SpinLockRelease(&slot->mutex);
755 }
756 
757 /*
758  * Convert a slot that's marked as RS_EPHEMERAL to a RS_PERSISTENT slot,
759  * guaranteeing it will be there after an eventual crash.
760  */
761 void
763 {
765 
766  Assert(slot != NULL);
768 
769  SpinLockAcquire(&slot->mutex);
771  SpinLockRelease(&slot->mutex);
772 
775 }
776 
777 /*
778  * Compute the oldest xmin across all slots and store it in the ProcArray.
779  *
780  * If already_locked is true, ProcArrayLock has already been acquired
781  * exclusively.
782  */
783 void
785 {
786  int i;
788  TransactionId agg_catalog_xmin = InvalidTransactionId;
789 
790  Assert(ReplicationSlotCtl != NULL);
791 
792  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
793 
794  for (i = 0; i < max_replication_slots; i++)
795  {
796  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
797  TransactionId effective_xmin;
798  TransactionId effective_catalog_xmin;
799 
800  if (!s->in_use)
801  continue;
802 
803  SpinLockAcquire(&s->mutex);
804  effective_xmin = s->effective_xmin;
805  effective_catalog_xmin = s->effective_catalog_xmin;
806  SpinLockRelease(&s->mutex);
807 
808  /* check the data xmin */
809  if (TransactionIdIsValid(effective_xmin) &&
810  (!TransactionIdIsValid(agg_xmin) ||
811  TransactionIdPrecedes(effective_xmin, agg_xmin)))
812  agg_xmin = effective_xmin;
813 
814  /* check the catalog xmin */
815  if (TransactionIdIsValid(effective_catalog_xmin) &&
816  (!TransactionIdIsValid(agg_catalog_xmin) ||
817  TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin)))
818  agg_catalog_xmin = effective_catalog_xmin;
819  }
820 
821  LWLockRelease(ReplicationSlotControlLock);
822 
823  ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked);
824 }
825 
826 /*
827  * Compute the oldest restart LSN across all slots and inform xlog module.
828  *
829  * Note: while max_slot_wal_keep_size is theoretically relevant for this
830  * purpose, we don't try to account for that, because this module doesn't
831  * know what to compare against.
832  */
833 void
835 {
836  int i;
837  XLogRecPtr min_required = InvalidXLogRecPtr;
838 
839  Assert(ReplicationSlotCtl != NULL);
840 
841  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
842  for (i = 0; i < max_replication_slots; i++)
843  {
844  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
845  XLogRecPtr restart_lsn;
846 
847  if (!s->in_use)
848  continue;
849 
850  SpinLockAcquire(&s->mutex);
851  restart_lsn = s->data.restart_lsn;
852  SpinLockRelease(&s->mutex);
853 
854  if (restart_lsn != InvalidXLogRecPtr &&
855  (min_required == InvalidXLogRecPtr ||
856  restart_lsn < min_required))
857  min_required = restart_lsn;
858  }
859  LWLockRelease(ReplicationSlotControlLock);
860 
861  XLogSetReplicationSlotMinimumLSN(min_required);
862 }
863 
864 /*
865  * Compute the oldest WAL LSN required by *logical* decoding slots.
866  *
867  * Returns InvalidXLogRecPtr if logical decoding is disabled or no logical
868  * slots exist.
869  *
870  * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it
871  * ignores physical replication slots.
872  *
873  * The results aren't required frequently, so we don't maintain a precomputed
874  * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin().
875  */
878 {
879  XLogRecPtr result = InvalidXLogRecPtr;
880  int i;
881 
882  if (max_replication_slots <= 0)
883  return InvalidXLogRecPtr;
884 
885  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
886 
887  for (i = 0; i < max_replication_slots; i++)
888  {
889  ReplicationSlot *s;
890  XLogRecPtr restart_lsn;
891 
892  s = &ReplicationSlotCtl->replication_slots[i];
893 
894  /* cannot change while ReplicationSlotControlLock is held */
895  if (!s->in_use)
896  continue;
897 
898  /* we're only interested in logical slots */
899  if (!SlotIsLogical(s))
900  continue;
901 
902  /* read once, it's ok if it increases while we're checking */
903  SpinLockAcquire(&s->mutex);
904  restart_lsn = s->data.restart_lsn;
905  SpinLockRelease(&s->mutex);
906 
907  if (restart_lsn == InvalidXLogRecPtr)
908  continue;
909 
910  if (result == InvalidXLogRecPtr ||
911  restart_lsn < result)
912  result = restart_lsn;
913  }
914 
915  LWLockRelease(ReplicationSlotControlLock);
916 
917  return result;
918 }
919 
920 /*
921  * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the
922  * passed database oid.
923  *
924  * Returns true if there are any slots referencing the database. *nslots will
925  * be set to the absolute number of slots in the database, *nactive to ones
926  * currently active.
927  */
928 bool
929 ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
930 {
931  int i;
932 
933  *nslots = *nactive = 0;
934 
935  if (max_replication_slots <= 0)
936  return false;
937 
938  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
939  for (i = 0; i < max_replication_slots; i++)
940  {
941  ReplicationSlot *s;
942 
943  s = &ReplicationSlotCtl->replication_slots[i];
944 
945  /* cannot change while ReplicationSlotCtlLock is held */
946  if (!s->in_use)
947  continue;
948 
949  /* only logical slots are database specific, skip */
950  if (!SlotIsLogical(s))
951  continue;
952 
953  /* not our database, skip */
954  if (s->data.database != dboid)
955  continue;
956 
957  /* count slots with spinlock held */
958  SpinLockAcquire(&s->mutex);
959  (*nslots)++;
960  if (s->active_pid != 0)
961  (*nactive)++;
962  SpinLockRelease(&s->mutex);
963  }
964  LWLockRelease(ReplicationSlotControlLock);
965 
966  if (*nslots > 0)
967  return true;
968  return false;
969 }
970 
971 /*
972  * ReplicationSlotsDropDBSlots -- Drop all db-specific slots relating to the
973  * passed database oid. The caller should hold an exclusive lock on the
974  * pg_database oid for the database to prevent creation of new slots on the db
975  * or replay from existing slots.
976  *
977  * Another session that concurrently acquires an existing slot on the target DB
978  * (most likely to drop it) may cause this function to ERROR. If that happens
979  * it may have dropped some but not all slots.
980  *
981  * This routine isn't as efficient as it could be - but we don't drop
982  * databases often, especially databases with lots of slots.
983  */
984 void
986 {
987  int i;
988 
989  if (max_replication_slots <= 0)
990  return;
991 
992 restart:
993  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
994  for (i = 0; i < max_replication_slots; i++)
995  {
996  ReplicationSlot *s;
997  char *slotname;
998  int active_pid;
999 
1000  s = &ReplicationSlotCtl->replication_slots[i];
1001 
1002  /* cannot change while ReplicationSlotControlLock is held */
1003  if (!s->in_use)
1004  continue;
1005 
1006  /* only logical slots are database specific, skip */
1007  if (!SlotIsLogical(s))
1008  continue;
1009 
1010  /* not our database, skip */
1011  if (s->data.database != dboid)
1012  continue;
1013 
1014  /* acquire slot, so ReplicationSlotDropAcquired can be reused */
1015  SpinLockAcquire(&s->mutex);
1016  /* can't change while ReplicationSlotControlLock is held */
1017  slotname = NameStr(s->data.name);
1018  active_pid = s->active_pid;
1019  if (active_pid == 0)
1020  {
1021  MyReplicationSlot = s;
1022  s->active_pid = MyProcPid;
1023  }
1024  SpinLockRelease(&s->mutex);
1025 
1026  /*
1027  * Even though we hold an exclusive lock on the database object a
1028  * logical slot for that DB can still be active, e.g. if it's
1029  * concurrently being dropped by a backend connected to another DB.
1030  *
1031  * That's fairly unlikely in practice, so we'll just bail out.
1032  */
1033  if (active_pid)
1034  ereport(ERROR,
1035  (errcode(ERRCODE_OBJECT_IN_USE),
1036  errmsg("replication slot \"%s\" is active for PID %d",
1037  slotname, active_pid)));
1038 
1039  /*
1040  * To avoid duplicating ReplicationSlotDropAcquired() and to avoid
1041  * holding ReplicationSlotControlLock over filesystem operations,
1042  * release ReplicationSlotControlLock and use
1043  * ReplicationSlotDropAcquired.
1044  *
1045  * As that means the set of slots could change, restart scan from the
1046  * beginning each time we release the lock.
1047  */
1048  LWLockRelease(ReplicationSlotControlLock);
1050  goto restart;
1051  }
1052  LWLockRelease(ReplicationSlotControlLock);
1053 }
1054 
1055 
1056 /*
1057  * Check whether the server's configuration supports using replication
1058  * slots.
1059  */
1060 void
1062 {
1063  /*
1064  * NB: Adding a new requirement likely means that RestoreSlotFromDisk()
1065  * needs the same check.
1066  */
1067 
1068  if (max_replication_slots == 0)
1069  ereport(ERROR,
1070  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1071  errmsg("replication slots can only be used if max_replication_slots > 0")));
1072 
1074  ereport(ERROR,
1075  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1076  errmsg("replication slots can only be used if wal_level >= replica")));
1077 }
1078 
1079 /*
1080  * Reserve WAL for the currently active slot.
1081  *
1082  * Compute and set restart_lsn in a manner that's appropriate for the type of
1083  * the slot and concurrency safe.
      *
      * The chosen value is stored into slot->data.restart_lsn while holding the
      * slot's spinlock. The whole computation is repeated until
      * XLogGetLastRemovedSegno() is below "segno" (presumably the segment that
      * contains the new restart_lsn -- confirm), i.e. until the WAL we just
      * reserved has not been removed underneath us.
1084  */
1085 void
1087 {
1089 
1090  Assert(slot != NULL);
1092 
1093  /*
1094  * The replication slot mechanism is used to prevent removal of required
1095  * WAL. As there is no interlock between this routine and checkpoints, WAL
1096  * segments could concurrently be removed when a now stale return value of
1097  * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that
1098  * this happens we'll just retry.
1099  */
1100  while (true)
1101  {
1102  XLogSegNo segno;
1103  XLogRecPtr restart_lsn;
1104 
1105  /*
1106  * For logical slots log a standby snapshot and start logical decoding
1107  * at exactly that position. That allows the slot to start up more
1108  * quickly.
1109  *
1110  * That's not needed (or indeed helpful) for physical slots as they'll
1111  * start replay at the last logged checkpoint anyway. Instead return
1112  * the location of the last redo LSN. While that slightly increases
1113  * the chance that we have to retry, it's where a base backup has to
1114  * start replay at.
1115  */
1116  if (!RecoveryInProgress() && SlotIsLogical(slot))
1117  {
1118  XLogRecPtr flushptr;
1119 
1120  /* start at current insert position */
1121  restart_lsn = GetXLogInsertRecPtr();
      /* publish the new value under the slot's spinlock */
1122  SpinLockAcquire(&slot->mutex);
1123  slot->data.restart_lsn = restart_lsn;
1124  SpinLockRelease(&slot->mutex);
1125 
1126  /* make sure we have enough information to start */
1127  flushptr = LogStandbySnapshot();
1128 
1129  /* and make sure it's fsynced to disk */
1130  XLogFlush(flushptr);
1131  }
1132  else
1133  {
      /*
       * Either the slot is not logical or we are in recovery: use the
       * last redo pointer, again storing it under the slot's spinlock.
       */
1134  restart_lsn = GetRedoRecPtr();
1135  SpinLockAcquire(&slot->mutex);
1136  slot->data.restart_lsn = restart_lsn;
1137  SpinLockRelease(&slot->mutex);
1138  }
1139 
1140  /* prevent WAL removal as fast as possible */
1142 
1143  /*
1144  * If all required WAL is still there, great, otherwise retry. The
1145  * slot should prevent further removal of WAL, unless there's a
1146  * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
1147  * the new restart_lsn above, so normally we should never need to loop
1148  * more than twice.
1149  */
1151  if (XLogGetLastRemovedSegno() < segno)
1152  break;
1153  }
1154 }
1155 
1156 /*
1157  * Mark any slot that points to an LSN older than the given segment
1158  * as invalid; it requires WAL that's about to be removed.
1159  *
1160  * NB - this runs as part of checkpoint, so avoid raising errors if possible.
      *
      * The slot array is scanned while holding ReplicationSlotControlLock in
      * shared mode. The lock is released before a candidate slot is worked
      * on, and afterwards the scan starts over from the first slot
      * ("goto restart"). The function returns once a full pass finds no
      * in-use slot whose restart_lsn is valid and older than the start of
      * segment oldestSegno.
1161  */
1162 void
1164 {
1165  XLogRecPtr oldestLSN;
1166 
      /* oldestLSN is the position at offset 0 of segment oldestSegno */
1167  XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
1168 
1169 restart:
1170  LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1171  for (int i = 0; i < max_replication_slots; i++)
1172  {
1173  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1174  XLogRecPtr restart_lsn = InvalidXLogRecPtr;
1175  NameData slotname;
1176  int wspid;
1177  int last_signaled_pid = 0;
1178 
1179  if (!s->in_use)
1180  continue;
1181 
      /* copy the fields we need while holding the slot's spinlock */
1182  SpinLockAcquire(&s->mutex);
1183  slotname = s->data.name;
1184  restart_lsn = s->data.restart_lsn;
1185  SpinLockRelease(&s->mutex);
1186 
      /* skip slots without a restart_lsn and slots not behind oldestLSN */
1187  if (XLogRecPtrIsInvalid(restart_lsn) || restart_lsn >= oldestLSN)
1188  continue;
1189  LWLockRelease(ReplicationSlotControlLock);
1191 
1192  /* Get ready to sleep on the slot in case it is active */
1194 
1195  for (;;)
1196  {
1197  /*
1198  * Try to mark this slot as used by this process.
1199  *
1200  * Note that ReplicationSlotAcquireInternal(SAB_Inquire) should
1201  * not cancel the prepared condition variable if this slot is
1202  * active in another process, because in that case we later have
1203  * to wait on that CV for the process owning the slot to be
1204  * terminated.
1205  */
1206  wspid = ReplicationSlotAcquireInternal(s, NULL, SAB_Inquire);
1207 
1208  /*
1209  * Exit the loop if we successfully acquired the slot or
1210  * the slot was dropped while we were waiting for the owning
1211  * process to be terminated. The latter case is likely to
1212  * happen when the slot is temporary, because such a slot is
1213  * automatically dropped when its owning process terminates.
1214  */
1215  if (wspid <= 0)
1216  break;
1217 
1218  /*
1219  * Signal to terminate the process that owns the slot.
1220  *
1221  * There is a race condition where another process may acquire
1222  * the slot after the process using it was terminated and before
1223  * this process acquires it. To handle this case, we signal again
1224  * whenever the owning PID differs from the one last signaled.
1225  *
1226  * XXX This logic assumes that the same PID is not reused
1227  * very quickly.
1228  */
1229  if (last_signaled_pid != wspid)
1230  {
1231  ereport(LOG,
1232  (errmsg("terminating process %d because replication slot \"%s\" is too far behind",
1233  wspid, NameStr(slotname))));
1234  (void) kill(wspid, SIGTERM);
1235  last_signaled_pid = wspid;
1236  }
1237 
1240  }
1242 
1243  /*
1244  * Do nothing here and start from scratch if the slot has
1245  * already been dropped.
1246  */
1247  if (wspid == -1)
1248  goto restart;
1249 
1250  ereport(LOG,
1251  (errmsg("invalidating slot \"%s\" because its restart_lsn %X/%X exceeds max_slot_wal_keep_size",
1252  NameStr(slotname),
1253  LSN_FORMAT_ARGS(restart_lsn))));
1254 
1255  SpinLockAcquire(&s->mutex);
1258  SpinLockRelease(&s->mutex);
1259 
1260  /* Make sure the invalidated state persists across server restart */
1264 
1265  /* if we did anything, start from scratch */
1266  goto restart;
1267  }
1268  LWLockRelease(ReplicationSlotControlLock);
1269 }
1270 
1271 /*
1272  * Flush all replication slots to disk.
1273  *
1274  * This needn't actually be part of a checkpoint, but it's a convenient
1275  * location.
      *
      * Every in-use slot is handed to SaveSlotToPath() together with its
      * directory "pg_replslot/<slot name>" and elevel LOG, so failures that
      * SaveSlotToPath() reports through its elevel argument are emitted at
      * LOG level. SaveSlotToPath() itself returns early for slots that are
      * not marked dirty.
1276  */
1277 void
1279 {
1280  int i;
1281 
1282  elog(DEBUG1, "performing replication slot checkpoint");
1283 
1284  /*
1285  * Prevent any slot from being created/dropped while we're active. As we
1286  * explicitly do *not* want to block iterating over replication_slots or
1287  * acquiring a slot we cannot take the control lock - but that's OK,
1288  * because holding ReplicationSlotAllocationLock is strictly stronger, and
1289  * enough to guarantee that nobody can change the in_use bits on us.
1290  */
1291  LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED);
1292 
1293  for (i = 0; i < max_replication_slots; i++)
1294  {
1295  ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1296  char path[MAXPGPATH];
1297 
      /* unused array entries have nothing to save */
1298  if (!s->in_use)
1299  continue;
1300 
1301  /* save the slot to disk, locking is handled in SaveSlotToPath() */
1302  sprintf(path, "pg_replslot/%s", NameStr(s->data.name));
1303  SaveSlotToPath(s, path, LOG);
1304  }
1305  LWLockRelease(ReplicationSlotAllocationLock);
1306 }
1307 
1308 /*
1309  * Load all replication slots from disk into memory at server startup. This
1310  * needs to be run before we start crash recovery.
      *
      * Every entry of the pg_replslot directory other than "." and ".." is
      * examined: entries that lstat() identifies as non-directories are
      * skipped, directories whose name ends in ".tmp" are removed, and
      * everything else is passed to RestoreSlotFromDisk().
1311  */
1312 void
1314 {
1315  DIR *replication_dir;
1316  struct dirent *replication_de;
1317 
1318  elog(DEBUG1, "starting up replication slots");
1319 
1320  /* restore all slots by iterating over all on-disk entries */
1321  replication_dir = AllocateDir("pg_replslot");
1322  while ((replication_de = ReadDir(replication_dir, "pg_replslot")) != NULL)
1323  {
1324  struct stat statbuf;
      /* NOTE(review): the extra 12 bytes presumably cover "pg_replslot/" -- confirm */
1325  char path[MAXPGPATH + 12];
1326 
1327  if (strcmp(replication_de->d_name, ".") == 0 ||
1328  strcmp(replication_de->d_name, "..") == 0)
1329  continue;
1330 
1331  snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name);
1332 
1333  /*
       * We're only creating directories here, skip if it's not ours.
       * Note that an entry for which lstat() fails is not skipped by
       * this test and is processed by the code below.
       */
1334  if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
1335  continue;
1336 
1337  /* we crashed while a slot was being set up or deleted, clean up */
1338  if (pg_str_endswith(replication_de->d_name, ".tmp"))
1339  {
1340  if (!rmtree(path, true))
1341  {
1342  ereport(WARNING,
1343  (errmsg("could not remove directory \"%s\"",
1344  path)));
1345  continue;
1346  }
      /* the removal succeeded; fsync the parent directory */
1347  fsync_fname("pg_replslot", true);
1348  continue;
1349  }
1350 
1351  /* looks like a slot in a normal state, restore */
1352  RestoreSlotFromDisk(replication_de->d_name);
1353  }
1354  FreeDir(replication_dir);
1355 
1356  /* currently no slots exist, we're done. */
1357  if (max_replication_slots <= 0)
1358  return;
1359 
1360  /* Now that we have recovered all the data, compute replication xmin */
1363 }
1364 
1365 /* ----
1366  * Manipulation of on-disk state of replication slots
1367  *
1368  * NB: none of the routines below should take any notice whether a slot is the
1369  * current one or not, that's all handled a layer above.
1370  * ----
1371  */
     /*
      * Create the on-disk directory and initial state file for a slot.
      *
      * The state file is first written (via SaveSlotToPath()) into a temporary
      * directory "pg_replslot/<slot name>.tmp", which is then renamed to
      * "pg_replslot/<slot name>". Finally the renamed directory and
      * pg_replslot itself are fsynced. Failure to create the temporary
      * directory or to rename it is reported with ereport(ERROR).
      */
1372 static void
1374 {
1375  char tmppath[MAXPGPATH];
1376  char path[MAXPGPATH];
1377  struct stat st;
1378 
1379  /*
1380  * No need to take out the io_in_progress_lock, nobody else can see this
1381  * slot yet, so nobody else will write. We're reusing SaveSlotToPath which
1382  * takes out the lock, if we'd take the lock here, we'd deadlock.
1383  */
1384 
1385  sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
1386  sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
1387 
1388  /*
1389  * It's just barely possible that some previous effort to create or drop a
1390  * slot with this name left a temp directory lying around. If that seems
1391  * to be the case, try to remove it. If the rmtree() fails, we'll error
1392  * out at the MakePGDirectory() below, so we don't bother checking
1393  * success.
1394  */
1395  if (stat(tmppath, &st) == 0 && S_ISDIR(st.st_mode))
1396  rmtree(tmppath, true);
1397 
1398  /* Create and fsync the temporary slot directory. */
1399  if (MakePGDirectory(tmppath) < 0)
1400  ereport(ERROR,
1402  errmsg("could not create directory \"%s\": %m",
1403  tmppath)));
1404  fsync_fname(tmppath, true);
1405 
1406  /* Write the actual state file. */
1407  slot->dirty = true; /* signal that we really need to write */
1408  SaveSlotToPath(slot, tmppath, ERROR);
1409 
1410  /* Rename the directory into place. */
1411  if (rename(tmppath, path) != 0)
1412  ereport(ERROR,
1414  errmsg("could not rename file \"%s\" to \"%s\": %m",
1415  tmppath, path)));
1416 
1417  /*
1418  * If we'd now fail - really unlikely - we wouldn't know whether this slot
1419  * would persist after an OS crash or not - so, force a restart. The
1420  * restart would try to fsync this again till it works.
1421  */
1423 
      /* make the rename durable: sync the slot directory and its parent */
1424  fsync_fname(path, true);
1425  fsync_fname("pg_replslot", true);
1426 
1427  END_CRIT_SECTION();
1428 }
1429 
1430 /*
1431  * Shared functionality between saving and creating a replication slot.
      *
      * Writes the persistent data of "slot" to the file "<dir>/state". The data
      * is first written to "<dir>/state.tmp", which is fsynced, closed and then
      * renamed into place; afterwards the state file, "dir" and pg_replslot are
      * fsynced. Nothing is written unless the slot is marked dirty.
      *
      * Failures up to and including the rename are reported with
      * ereport(elevel, ...). If that call returns (i.e. elevel does not abort
      * execution), the function returns immediately and leaves the slot's dirty
      * flag set.
1432  */
1433 static void
1434 SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
1435 {
1436  char tmppath[MAXPGPATH];
1437  char path[MAXPGPATH];
1438  int fd;
1440  bool was_dirty;
1441 
1442  /* first check whether there's something to write out */
1443  SpinLockAcquire(&slot->mutex);
1444  was_dirty = slot->dirty;
      /* reset, so that we can tell at the end whether the slot got dirtied again */
1445  slot->just_dirtied = false;
1446  SpinLockRelease(&slot->mutex);
1447 
1448  /* and don't do anything if there's nothing to write */
1449  if (!was_dirty)
1450  return;
1451 
1453 
1454  /* silence valgrind :( */
1455  memset(&cp, 0, sizeof(ReplicationSlotOnDisk));
1456 
1457  sprintf(tmppath, "%s/state.tmp", dir);
1458  sprintf(path, "%s/state", dir);
1459 
      /* O_EXCL makes creation fail if a state.tmp file already exists */
1460  fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
1461  if (fd < 0)
1462  {
1463  /*
1464  * If not an ERROR, then release the lock before returning. In case
1465  * of an ERROR, the error recovery path automatically releases the
1466  * lock, but no harm in explicitly releasing even in that case. Note
1467  * that LWLockRelease() could affect errno.
1468  */
1469  int save_errno = errno;
1470 
1472  errno = save_errno;
1473  ereport(elevel,
1475  errmsg("could not create file \"%s\": %m",
1476  tmppath)));
1477  return;
1478  }
1479 
      /* fill in the fixed header fields of the on-disk image */
1480  cp.magic = SLOT_MAGIC;
1481  INIT_CRC32C(cp.checksum);
1482  cp.version = SLOT_VERSION;
1484 
      /* take a consistent copy of the persistent data under the spinlock */
1485  SpinLockAcquire(&slot->mutex);
1486 
1487  memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData));
1488 
1489  SpinLockRelease(&slot->mutex);
1490 
      /* the checksum starts SnapBuildOnDiskNotChecksummedSize bytes into cp */
1491  COMP_CRC32C(cp.checksum,
1492  (char *) (&cp) + SnapBuildOnDiskNotChecksummedSize,
1494  FIN_CRC32C(cp.checksum);
1495 
      /* clear errno so a short write that leaves it unset can be detected */
1496  errno = 0;
1498  if ((write(fd, &cp, sizeof(cp))) != sizeof(cp))
1499  {
1500  int save_errno = errno;
1501 
1503  CloseTransientFile(fd);
1505 
1506  /* if write didn't set errno, assume problem is no disk space */
1507  errno = save_errno ? save_errno : ENOSPC;
1508  ereport(elevel,
1510  errmsg("could not write to file \"%s\": %m",
1511  tmppath)));
1512  return;
1513  }
1515 
1516  /* fsync the temporary file */
1518  if (pg_fsync(fd) != 0)
1519  {
1520  int save_errno = errno;
1521 
1523  CloseTransientFile(fd);
1525  errno = save_errno;
1526  ereport(elevel,
1528  errmsg("could not fsync file \"%s\": %m",
1529  tmppath)));
1530  return;
1531  }
1533 
1534  if (CloseTransientFile(fd) != 0)
1535  {
1536  int save_errno = errno;
1537 
1539  errno = save_errno;
1540  ereport(elevel,
1542  errmsg("could not close file \"%s\": %m",
1543  tmppath)));
1544  return;
1545  }
1546 
1547  /* rename to permanent file, fsync file and directory */
1548  if (rename(tmppath, path) != 0)
1549  {
1550  int save_errno = errno;
1551 
1553  errno = save_errno;
1554  ereport(elevel,
1556  errmsg("could not rename file \"%s\" to \"%s\": %m",
1557  tmppath, path)));
1558  return;
1559  }
1560 
1561  /*
1562  * Check CreateSlotOnDisk() for the reasoning of using a critical section.
1563  */
1565 
1566  fsync_fname(path, false);
1567  fsync_fname(dir, true);
1568  fsync_fname("pg_replslot", true);
1569 
1570  END_CRIT_SECTION();
1571 
1572  /*
1573  * Successfully wrote, unset dirty bit, unless somebody dirtied again
1574  * already.
1575  */
1576  SpinLockAcquire(&slot->mutex);
1577  if (!slot->just_dirtied)
1578  slot->dirty = false;
1579  SpinLockRelease(&slot->mutex);
1580 
1582 }
1583 
1584 /*
1585  * Load a single slot from disk into memory.
      *
      * "name" is the name of the slot's directory below pg_replslot. The state
      * file "pg_replslot/<name>/state" is fsynced, read and validated (magic
      * number, version, length, CRC) and its persistent data is then copied
      * into the first entry of the slot array that is not in use.
      *
      * Problems accessing or validating the state file are reported at PANIC
      * level. A slot that is incompatible with the current wal_level, or for
      * which no free array entry is left, is reported at FATAL level.
1586  */
1587 static void
1589 {
1591  int i;
1592  char slotdir[MAXPGPATH + 12];
1593  char path[MAXPGPATH + 22];
1594  int fd;
1595  bool restored = false;
1596  int readBytes;
1598 
1599  /* no need to lock here, no concurrent access allowed yet */
1600 
1601  /* delete temp file if it exists */
1602  sprintf(slotdir, "pg_replslot/%s", name);
1603  sprintf(path, "%s/state.tmp", slotdir);
      /* a missing temp file (ENOENT) is the normal case and not an error */
1604  if (unlink(path) < 0 && errno != ENOENT)
1605  ereport(PANIC,
1607  errmsg("could not remove file \"%s\": %m", path)));
1608 
1609  sprintf(path, "%s/state", slotdir);
1610 
1611  elog(DEBUG1, "restoring replication slot from \"%s\"", path);
1612 
1613  /* on some operating systems fsyncing a file requires O_RDWR */
1614  fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1615 
1616  /*
1617  * We do not need to handle this as we are rename()ing the directory into
1618  * place only after we fsync()ed the state file.
1619  */
1620  if (fd < 0)
1621  ereport(PANIC,
1623  errmsg("could not open file \"%s\": %m", path)));
1624 
1625  /*
1626  * Sync state file before we're reading from it. We might have crashed
1627  * while it wasn't synced yet and we shouldn't continue on that basis.
1628  */
1630  if (pg_fsync(fd) != 0)
1631  ereport(PANIC,
1633  errmsg("could not fsync file \"%s\": %m",
1634  path)));
1636 
1637  /* Also sync the parent directory */
1639  fsync_fname(slotdir, true);
1640  END_CRIT_SECTION();
1641 
1642  /* read part of statefile that's guaranteed to be version independent */
1644  readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize);
1646  if (readBytes != ReplicationSlotOnDiskConstantSize)
1647  {
      /* distinguish a failed read (errno set) from a short read */
1648  if (readBytes < 0)
1649  ereport(PANIC,
1651  errmsg("could not read file \"%s\": %m", path)));
1652  else
1653  ereport(PANIC,
1655  errmsg("could not read file \"%s\": read %d of %zu",
1656  path, readBytes,
1658  }
1659 
1660  /* verify magic */
1661  if (cp.magic != SLOT_MAGIC)
1662  ereport(PANIC,
1664  errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u",
1665  path, cp.magic, SLOT_MAGIC)));
1666 
1667  /* verify version */
1668  if (cp.version != SLOT_VERSION)
1669  ereport(PANIC,
1671  errmsg("replication slot file \"%s\" has unsupported version %u",
1672  path, cp.version)));
1673 
1674  /* boundary check on length */
1676  ereport(PANIC,
1678  errmsg("replication slot file \"%s\" has corrupted length %u",
1679  path, cp.length)));
1680 
1681  /* Now that we know the size, read the entire file */
1683  readBytes = read(fd,
1684  (char *) &cp + ReplicationSlotOnDiskConstantSize,
1685  cp.length);
1687  if (readBytes != cp.length)
1688  {
1689  if (readBytes < 0)
1690  ereport(PANIC,
1692  errmsg("could not read file \"%s\": %m", path)));
1693  else
1694  ereport(PANIC,
1696  errmsg("could not read file \"%s\": read %d of %zu",
1697  path, readBytes, (Size) cp.length)));
1698  }
1699 
1700  if (CloseTransientFile(fd) != 0)
1701  ereport(PANIC,
1703  errmsg("could not close file \"%s\": %m", path)));
1704 
1705  /* now verify the CRC */
1706  INIT_CRC32C(checksum);
1707  COMP_CRC32C(checksum,
1708  (char *) &cp + SnapBuildOnDiskNotChecksummedSize,
1710  FIN_CRC32C(checksum);
1711 
1712  if (!EQ_CRC32C(checksum, cp.checksum))
1713  ereport(PANIC,
1714  (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u",
1715  path, checksum, cp.checksum)));
1716 
1717  /*
1718  * If we crashed with an ephemeral slot active, don't restore but delete
1719  * it.
1720  */
1722  {
      /* failure to remove the directory is only a WARNING; we return anyway */
1723  if (!rmtree(slotdir, true))
1724  {
1725  ereport(WARNING,
1726  (errmsg("could not remove directory \"%s\"",
1727  slotdir)));
1728  }
1729  fsync_fname("pg_replslot", true);
1730  return;
1731  }
1732 
1733  /*
1734  * Verify that requirements for the specific slot type are met. That's
1735  * important because if these aren't met we're not guaranteed to retain
1736  * all the necessary resources for the slot.
1737  *
1738  * NB: We have to do so *after* the above checks for ephemeral slots,
1739  * because otherwise a slot that shouldn't exist anymore could prevent
1740  * restarts.
1741  *
1742  * NB: Changing the requirements here also requires adapting
1743  * CheckSlotRequirements() and CheckLogicalDecodingRequirements().
1744  */
1746  ereport(FATAL,
1747  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1748  errmsg("logical replication slot \"%s\" exists, but wal_level < logical",
1749  NameStr(cp.slotdata.name)),
1750  errhint("Change wal_level to be logical or higher.")));
1751  else if (wal_level < WAL_LEVEL_REPLICA)
1752  ereport(FATAL,
1753  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1754  errmsg("physical replication slot \"%s\" exists, but wal_level < replica",
1755  NameStr(cp.slotdata.name)),
1756  errhint("Change wal_level to be replica or higher.")));
1757 
1758  /* nothing can be active yet, don't lock anything */
1759  for (i = 0; i < max_replication_slots; i++)
1760  {
1761  ReplicationSlot *slot;
1762 
1763  slot = &ReplicationSlotCtl->replication_slots[i];
1764 
      /* look for the first array entry that is still free */
1765  if (slot->in_use)
1766  continue;
1767 
1768  /* restore the entire set of persistent data */
1769  memcpy(&slot->data, &cp.slotdata,
1771 
1772  /* initialize in memory state */
1773  slot->effective_xmin = cp.slotdata.xmin;
1775 
1780 
      /* mark the entry as taken, with no process attached to it */
1781  slot->in_use = true;
1782  slot->active_pid = 0;
1783 
1784  restored = true;
1785  break;
1786  }
1787 
      /* every array entry was already in use: the slot could not be restored */
1788  if (!restored)
1789  ereport(FATAL,
1790  (errmsg("too many replication slots active before shutdown"),
1791  errhint("Increase max_replication_slots and try again.")));
1792 }
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
static void RestoreSlotFromDisk(const char *name)
Definition: slot.c:1588
ReplicationSlotCtlData * ReplicationSlotCtl
Definition: slot.c:93
void CheckSlotRequirements(void)
Definition: slot.c:1061
TransactionId candidate_catalog_xmin
Definition: slot.h:171
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
bool LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
Definition: lwlock.c:1942
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:41
int errhint(const char *fmt,...)
Definition: elog.c:1152
Size ReplicationSlotsShmemSize(void)
Definition: slot.c:116
#define PROC_IN_LOGICAL_DECODING
Definition: proc.h:61
int wal_segment_size
Definition: xlog.c:118
uint32 TransactionId
Definition: c.h:587
bool pg_str_endswith(const char *str, const char *end)
Definition: string.c:31
void namestrcpy(Name name, const char *str)
Definition: name.c:233
#define write(a, b, c)
Definition: win32.h:14
PGPROC * MyProc
Definition: proc.c:68
#define SLOT_MAGIC
Definition: slot.c:89
uint32 pg_crc32c
Definition: pg_crc32c.h:38
int wal_level
Definition: xlog.c:108
#define SpinLockInit(lock)
Definition: spin.h:60
#define END_CRIT_SECTION()
Definition: miscadmin.h:135
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:661
ReplicationSlotPersistency persistency
Definition: slot.h:62
#define START_CRIT_SECTION()
Definition: miscadmin.h:133
void ConditionVariableBroadcast(ConditionVariable *cv)
int errcode(int sqlerrcode)
Definition: elog.c:694
PROC_HDR * ProcGlobal
Definition: proc.c:80
#define MemSet(start, val, len)
Definition: c.h:1008
#define kill(pid, sig)
Definition: win32_port.h:454
void ReplicationSlotCreate(const char *name, bool db_specific, ReplicationSlotPersistency persistency, bool two_phase)
Definition: slot.c:228
void ReplicationSlotSave(void)
Definition: slot.c:727
uint8 statusFlags
Definition: proc.h:189
#define SnapBuildOnDiskNotChecksummedSize
Definition: slot.c:80
static void ReplicationSlotDropPtr(ReplicationSlot *slot)
Definition: slot.c:616
ReplicationSlotPersistentData data
Definition: slot.h:156
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
bool RecoveryInProgress(void)
Definition: xlog.c:8132
Definition: dirent.h:9
#define PANIC
Definition: elog.h:55
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2850
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define PG_BINARY
Definition: c.h:1271
static void CreateSlotOnDisk(ReplicationSlot *slot)
Definition: slot.c:1373
void ReplicationSlotsShmemInit(void)
Definition: slot.c:134
static void SaveSlotToPath(ReplicationSlot *slot, const char *path, int elevel)
Definition: slot.c:1434
void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
Definition: xlog.c:2732
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1808
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
#define NAMEDATALEN
#define sprintf
Definition: port.h:218
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:174
#define SpinLockAcquire(lock)
Definition: spin.h:62
void ConditionVariableInit(ConditionVariable *cv)
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
XLogSegNo XLogGetLastRemovedSegno(void)
Definition: xlog.c:3977
void ReplicationSlotReserveWal(void)
Definition: slot.c:1086
static void ReplicationSlotDropAcquired(void)
Definition: slot.c:599
void ReplicationSlotsComputeRequiredLSN(void)
Definition: slot.c:834
ReplicationSlot * SearchNamedReplicationSlot(const char *name)
Definition: slot.c:351
ReplicationSlotPersistentData slotdata
Definition: slot.c:73
void ConditionVariableCancelSleep(void)
XLogRecPtr LogStandbySnapshot(void)
Definition: standby.c:1205
Definition: dirent.c:25
#define ERROR
Definition: elog.h:45
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2404
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:396
#define FATAL
Definition: elog.h:54
XLogRecPtr GetXLogInsertRecPtr(void)
Definition: xlog.c:11615
#define MAXPGPATH
Definition: slot.h:44
void ReplicationSlotPersist(void)
Definition: slot.c:762
TransactionId effective_xmin
Definition: slot.h:152
Definition: c.h:675
XLogRecPtr candidate_restart_valid
Definition: slot.h:173
void StartupReplicationSlots(void)
Definition: slot.c:1313
bool IsUnderPostmaster
Definition: globals.c:110
void pgstat_report_replslot(const char *slotname, int spilltxns, int spillcount, int spillbytes, int streamtxns, int streamcount, int streambytes)
Definition: pgstat.c:1776
uint64 XLogSegNo
Definition: xlogdefs.h:48
SlotAcquireBehavior
Definition: slot.h:41
int errcode_for_file_access(void)
Definition: elog.c:717
XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void)
Definition: slot.c:877
TransactionId catalog_xmin
Definition: slot.h:78
#define InvalidTransactionId
Definition: transam.h:31
unsigned int uint32
Definition: c.h:441
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2615
void ReplicationSlotRelease(void)
Definition: slot.c:492
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1512
TransactionId xmin
Definition: slot.h:70
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define SlotIsLogical(slot)
Definition: slot.h:178
#define AssertArg(condition)
Definition: c.h:806
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:300
pg_crc32c checksum
Definition: slot.c:62
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:45
struct ReplicationSlotOnDisk ReplicationSlotOnDisk
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:743
int CloseTransientFile(int fd)
Definition: fd.c:2581
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define WARNING
Definition: elog.h:40
bool rmtree(const char *path, bool rmtopdir)
Definition: rmtree.c:42
bool in_use
Definition: slot.h:132
static int elevel
Definition: vacuumlazy.c:333
#define SpinLockRelease(lock)
Definition: spin.h:64
Size mul_size(Size s1, Size s2)
Definition: shmem.c:519
bool just_dirtied
Definition: slot.h:138
Size add_size(Size s1, Size s2)
Definition: shmem.c:502
TransactionId effective_catalog_xmin
Definition: slot.h:153
unsigned short st_mode
Definition: win32_port.h:260
Oid MyDatabaseId
Definition: globals.c:86
#define SLOT_VERSION
Definition: slot.c:90
#define InvalidOid
Definition: postgres_ext.h:36
#define ereport(elevel,...)
Definition: elog.h:155
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3612
ReplicationSlot * MyReplicationSlot
Definition: slot.c:96
int max_replication_slots
Definition: slot.c:99
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
#define XLogSegNoOffsetToRecPtr(segno, offset, wal_segsz_bytes, dest)
#define ReplicationSlotOnDiskV2Size
Definition: slot.c:86
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:804
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2681
XLogRecPtr restart_lsn
Definition: slot.h:81
#define ReplicationSlotOnDiskConstantSize
Definition: slot.c:77
size_t Size
Definition: c.h:540
uint32 version
Definition: slot.c:65
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1488
#define SnapBuildOnDiskChecksummedSize
Definition: slot.c:83
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1206
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:8424
static int ReplicationSlotAcquireInternal(ReplicationSlot *slot, const char *name, SlotAcquireBehavior behavior)
Definition: slot.c:397
ConditionVariable active_cv
Definition: slot.h:162
const char * name
Definition: encode.c:515
bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
Definition: slot.c:929
#define S_ISDIR(m)
Definition: win32_port.h:316
#define lstat(path, sb)
Definition: win32_port.h:276
XLogRecPtr candidate_xmin_lsn
Definition: slot.h:172
void ReplicationSlotDrop(const char *name, bool nowait)
Definition: slot.c:586
void pgstat_report_replslot_drop(const char *slotname)
Definition: pgstat.c:1803
ReplicationSlotPersistency
Definition: slot.h:33
int errmsg(const char *fmt,...)
Definition: elog.c:905
int ReplicationSlotAcquire(const char *name, SlotAcquireBehavior behavior)
Definition: slot.c:383
pid_t active_pid
Definition: slot.h:135
void InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
Definition: slot.c:1163
#define elog(elevel,...)
Definition: elog.h:227
int i
Definition: slot.h:43
int pgxactoff
Definition: proc.h:148
#define NameStr(name)
Definition: c.h:681
void ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, bool already_locked)
Definition: procarray.c:3797
void ReplicationSlotCleanup(void)
Definition: slot.c:548
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:100
int pg_fsync(int fd)
Definition: fd.c:347
ReplicationSlot replication_slots[1]
Definition: slot.h:189
char d_name[MAX_PATH]
Definition: dirent.h:15
uint8 * statusFlags
Definition: proc.h:333
XLogRecPtr invalidated_at
Definition: slot.h:84
slock_t mutex
Definition: slot.h:129
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:89
#define ERRCODE_DUPLICATE_OBJECT
Definition: streamutil.c:32
void CheckPointReplicationSlots(void)
Definition: slot.c:1278
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:94
#define snprintf
Definition: port.h:216
void ReplicationSlotsDropDBSlots(Oid dboid)
Definition: slot.c:985
void ReplicationSlotsComputeRequiredXmin(bool already_locked)
Definition: slot.c:784
bool dirty
Definition: slot.h:139
#define read(a, b, c)
Definition: win32.h:13
int FreeDir(DIR *dir)
Definition: fd.c:2733
XLogRecPtr candidate_restart_lsn
Definition: slot.h:174
#define offsetof(type, field)
Definition: c.h:727
void ReplicationSlotMarkDirty(void)
Definition: slot.c:745
#define stat
Definition: win32_port.h:275
LWLock io_in_progress_lock
Definition: slot.h:159
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)