PostgreSQL Source Code git master
sysv_sema.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 *
 * sysv_sema.c
 *	  Implement PGSemaphores using SysV semaphore facilities
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/port/sysv_sema.c
 *
 *-------------------------------------------------------------------------
 */
15#include "postgres.h"
16
17#include <signal.h>
18#include <unistd.h>
19#include <sys/file.h>
20#include <sys/ipc.h>
21#include <sys/sem.h>
22#include <sys/stat.h>
23
24#include "miscadmin.h"
25#include "storage/ipc.h"
26#include "storage/pg_sema.h"
27#include "storage/shmem.h"
28
29
/*
 * A PGSemaphoreData names one SysV semaphore: the semaphore set that holds
 * it, plus its index within that set.  These are the two values handed to
 * semop()/semctl() by the routines in this file.
 */
typedef struct PGSemaphoreData
{
	int			semId;			/* semaphore set identifier */
	int			semNum;			/* semaphore number within set */
} PGSemaphoreData;
35
/*
 * union semun is the type of the fourth argument we pass to semctl().
 * Declare it ourselves when the build configuration has not defined
 * HAVE_UNION_SEMUN.
 */
#ifndef HAVE_UNION_SEMUN
union semun
{
	int			val;
	struct semid_ds *buf;
	unsigned short *array;
};
#endif
44
45typedef key_t IpcSemaphoreKey; /* semaphore key passed to semget(2) */
46typedef int IpcSemaphoreId; /* semaphore ID returned by semget(2) */
47
/*
 * SEMAS_PER_SET is the number of useful semaphores in each semaphore set
 * we allocate.  It must be *less than* your kernel's SEMMSL (max semaphores
 * per set) parameter, which is often around 25.  (Less than, because we
 * allocate one extra sema in each set for identification purposes.)
 */
#define SEMAS_PER_SET	16

#define IPCProtection	(0600)	/* access/modify by user only */

#define PGSemaMagic		537		/* must be less than SEMVMX */
59
60
61static PGSemaphore sharedSemas; /* array of PGSemaphoreData in shared memory */
62static int numSharedSemas; /* number of PGSemaphoreDatas used so far */
63static int maxSharedSemas; /* allocated size of PGSemaphoreData array */
64static IpcSemaphoreId *mySemaSets; /* IDs of sema sets acquired so far */
65static int numSemaSets; /* number of sema sets acquired so far */
66static int maxSemaSets; /* allocated size of mySemaSets array */
67static IpcSemaphoreKey nextSemaKey; /* next key to try using */
68static int nextSemaNumber; /* next free sem num in last sema set */
69
70
72 int numSems, bool retry_ok);
73static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum,
74 int value);
75static void IpcSemaphoreKill(IpcSemaphoreId semId);
76static int IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum);
77static pid_t IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum);
79static void ReleaseSemaphores(int status, Datum arg);
80
81
82/*
83 * InternalIpcSemaphoreCreate
84 *
85 * Attempt to create a new semaphore set with the specified key.
86 * Will fail (return -1) if such a set already exists.
87 *
88 * If we fail with a failure code other than collision-with-existing-set,
89 * print out an error and abort. Other types of errors suggest nonrecoverable
90 * problems.
91 *
92 * Unfortunately, it's sometimes hard to tell whether errors are
93 * nonrecoverable. Our caller keeps track of whether continuing to retry
94 * is sane or not; if not, we abort on failure regardless of the errno.
95 */
96static IpcSemaphoreId
98{
99 int semId;
100
101 semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection);
102
103 if (semId < 0)
104 {
105 int saved_errno = errno;
106
107 /*
108 * Fail quietly if error suggests a collision with an existing set and
109 * our caller has not lost patience.
110 *
111 * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
112 * we could get a permission violation instead. On some platforms
113 * EINVAL will be reported if the existing set has too few semaphores.
114 * Also, EIDRM might occur if an old set is slated for destruction but
115 * not gone yet.
116 *
117 * EINVAL is the key reason why we need the caller-level loop limit,
118 * as it can also mean that the platform's SEMMSL is less than
119 * numSems, and that condition can't be fixed by trying another key.
120 */
121 if (retry_ok &&
122 (saved_errno == EEXIST
123 || saved_errno == EACCES
124 || saved_errno == EINVAL
125#ifdef EIDRM
126 || saved_errno == EIDRM
127#endif
128 ))
129 return -1;
130
131 /*
132 * Else complain and abort
133 */
135 (errmsg("could not create semaphores: %m"),
136 errdetail("Failed system call was semget(%lu, %d, 0%o).",
137 (unsigned long) semKey, numSems,
139 (saved_errno == ENOSPC) ?
140 errhint("This error does *not* mean that you have run out of disk space. "
141 "It occurs when either the system limit for the maximum number of "
142 "semaphore sets (SEMMNI), or the system wide maximum number of "
143 "semaphores (SEMMNS), would be exceeded. You need to raise the "
144 "respective kernel parameter. Alternatively, reduce PostgreSQL's "
145 "consumption of semaphores by reducing its \"max_connections\" parameter.\n"
146 "The PostgreSQL documentation contains more information about "
147 "configuring your system for PostgreSQL.") : 0));
148 }
149
150 return semId;
151}
152
153/*
154 * Initialize a semaphore to the specified value.
155 */
156static void
158{
159 union semun semun;
160
161 semun.val = value;
162 if (semctl(semId, semNum, SETVAL, semun) < 0)
163 {
164 int saved_errno = errno;
165
167 (errmsg_internal("semctl(%d, %d, SETVAL, %d) failed: %m",
168 semId, semNum, value),
169 (saved_errno == ERANGE) ?
170 errhint("You possibly need to raise your kernel's SEMVMX value to be at least "
171 "%d. Look into the PostgreSQL documentation for details.",
172 value) : 0));
173 }
174}
175
176/*
177 * IpcSemaphoreKill(semId) - removes a semaphore set
178 */
179static void
181{
182 union semun semun;
183
184 semun.val = 0; /* unused, but keep compiler quiet */
185
186 if (semctl(semId, 0, IPC_RMID, semun) < 0)
187 elog(LOG, "semctl(%d, 0, IPC_RMID, ...) failed: %m", semId);
188}
189
190/* Get the current value (semval) of the semaphore */
191static int
193{
194 union semun dummy; /* for Solaris */
195
196 dummy.val = 0; /* unused */
197
198 return semctl(semId, semNum, GETVAL, dummy);
199}
200
201/* Get the PID of the last process to do semop() on the semaphore */
202static pid_t
204{
205 union semun dummy; /* for Solaris */
206
207 dummy.val = 0; /* unused */
208
209 return semctl(semId, semNum, GETPID, dummy);
210}
211
212
213/*
214 * Create a semaphore set with the given number of useful semaphores
215 * (an additional sema is actually allocated to serve as identifier).
216 * Dead Postgres sema sets are recycled if found, but we do not fail
217 * upon collision with non-Postgres sema sets.
218 *
219 * The idea here is to detect and re-use keys that may have been assigned
220 * by a crashed postmaster or backend.
221 */
222static IpcSemaphoreId
224{
225 int num_tries = 0;
226 IpcSemaphoreId semId;
227 union semun semun;
228 PGSemaphoreData mysema;
229
230 /* Loop till we find a free IPC key */
231 for (nextSemaKey++;; nextSemaKey++, num_tries++)
232 {
233 pid_t creatorPID;
234
235 /*
236 * Try to create new semaphore set. Give up after trying 1000
237 * distinct IPC keys.
238 */
240 num_tries < 1000);
241 if (semId >= 0)
242 break; /* successful create */
243
244 /* See if it looks to be leftover from a dead Postgres process */
245 semId = semget(nextSemaKey, numSems + 1, 0);
246 if (semId < 0)
247 continue; /* failed: must be some other app's */
249 continue; /* sema belongs to a non-Postgres app */
250
251 /*
252 * If the creator PID is my own PID or does not belong to any extant
253 * process, it's safe to zap it.
254 */
255 creatorPID = IpcSemaphoreGetLastPID(semId, numSems);
256 if (creatorPID <= 0)
257 continue; /* oops, GETPID failed */
258 if (creatorPID != getpid())
259 {
260 if (kill(creatorPID, 0) == 0 || errno != ESRCH)
261 continue; /* sema belongs to a live process */
262 }
263
264 /*
265 * The sema set appears to be from a dead Postgres process, or from a
266 * previous cycle of life in this same process. Zap it, if possible.
267 * This probably shouldn't fail, but if it does, assume the sema set
268 * belongs to someone else after all, and continue quietly.
269 */
270 semun.val = 0; /* unused, but keep compiler quiet */
271 if (semctl(semId, 0, IPC_RMID, semun) < 0)
272 continue;
273
274 /*
275 * Now try again to create the sema set.
276 */
278 if (semId >= 0)
279 break; /* successful create */
280
281 /*
282 * Can only get here if some other process managed to create the same
283 * sema key before we did. Let him have that one, loop around to try
284 * next key.
285 */
286 }
287
288 /*
289 * OK, we created a new sema set. Mark it as created by this process. We
290 * do this by setting the spare semaphore to PGSemaMagic-1 and then
291 * incrementing it with semop(). That leaves it with value PGSemaMagic
292 * and sempid referencing this process.
293 */
295 mysema.semId = semId;
296 mysema.semNum = numSems;
297 PGSemaphoreUnlock(&mysema);
298
299 return semId;
300}
301
302
303/*
304 * Report amount of shared memory needed for semaphores
305 */
306Size
308{
309 return mul_size(maxSemas, sizeof(PGSemaphoreData));
310}
311
312/*
313 * PGReserveSemaphores --- initialize semaphore support
314 *
315 * This is called during postmaster start or shared memory reinitialization.
316 * It should do whatever is needed to be able to support up to maxSemas
317 * subsequent PGSemaphoreCreate calls. Also, if any system resources
318 * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit
319 * callback to release them.
320 *
321 * In the SysV implementation, we acquire semaphore sets on-demand; the
322 * maxSemas parameter is just used to size the arrays. There is an array
323 * of PGSemaphoreData structs in shared memory, and a postmaster-local array
324 * with one entry per SysV semaphore set, which we use for releasing the
325 * semaphore sets when done. (This design ensures that postmaster shutdown
326 * doesn't rely on the contents of shared memory, which a failed backend might
327 * have clobbered.)
328 */
329void
331{
332 struct stat statbuf;
333
334 /*
335 * We use the data directory's inode number to seed the search for free
336 * semaphore keys. This minimizes the odds of collision with other
337 * postmasters, while maximizing the odds that we will detect and clean up
338 * semaphores left over from a crashed postmaster in our own directory.
339 */
340 if (stat(DataDir, &statbuf) < 0)
343 errmsg("could not stat data directory \"%s\": %m",
344 DataDir)));
345
348 numSharedSemas = 0;
349 maxSharedSemas = maxSemas;
350
351 maxSemaSets = (maxSemas + SEMAS_PER_SET - 1) / SEMAS_PER_SET;
354 if (mySemaSets == NULL)
355 elog(PANIC, "out of memory");
356 numSemaSets = 0;
357 nextSemaKey = statbuf.st_ino;
358 nextSemaNumber = SEMAS_PER_SET; /* force sema set alloc on 1st call */
359
361}
362
363/*
364 * Release semaphores at shutdown or shmem reinitialization
365 *
366 * (called as an on_shmem_exit callback, hence funny argument list)
367 */
368static void
370{
371 int i;
372
373 for (i = 0; i < numSemaSets; i++)
376}
377
378/*
379 * PGSemaphoreCreate
380 *
381 * Allocate a PGSemaphore structure with initial count 1
382 */
385{
386 PGSemaphore sema;
387
388 /* Can't do this in a backend, because static state is postmaster's */
390
392 {
393 /* Time to allocate another semaphore set */
395 elog(PANIC, "too many semaphores created");
397 numSemaSets++;
398 nextSemaNumber = 0;
399 }
400 /* Use the next shared PGSemaphoreData */
402 elog(PANIC, "too many semaphores created");
403 sema = &sharedSemas[numSharedSemas++];
404 /* Assign the next free semaphore in the current set */
405 sema->semId = mySemaSets[numSemaSets - 1];
406 sema->semNum = nextSemaNumber++;
407 /* Initialize it to count 1 */
408 IpcSemaphoreInitialize(sema->semId, sema->semNum, 1);
409
410 return sema;
411}
412
413/*
414 * PGSemaphoreReset
415 *
416 * Reset a previously-initialized PGSemaphore to have count 0
417 */
418void
420{
421 IpcSemaphoreInitialize(sema->semId, sema->semNum, 0);
422}
423
424/*
425 * PGSemaphoreLock
426 *
427 * Lock a semaphore (decrement count), blocking if count would be < 0
428 */
429void
431{
432 int errStatus;
433 struct sembuf sops;
434
435 sops.sem_op = -1; /* decrement */
436 sops.sem_flg = 0;
437 sops.sem_num = sema->semNum;
438
439 /*
440 * Note: if errStatus is -1 and errno == EINTR then it means we returned
441 * from the operation prematurely because we were sent a signal. So we
442 * try and lock the semaphore again.
443 *
444 * We used to check interrupts here, but that required servicing
445 * interrupts directly from signal handlers. Which is hard to do safely
446 * and portably.
447 */
448 do
449 {
450 errStatus = semop(sema->semId, &sops, 1);
451 } while (errStatus < 0 && errno == EINTR);
452
453 if (errStatus < 0)
454 elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
455}
456
457/*
458 * PGSemaphoreUnlock
459 *
460 * Unlock a semaphore (increment count)
461 */
462void
464{
465 int errStatus;
466 struct sembuf sops;
467
468 sops.sem_op = 1; /* increment */
469 sops.sem_flg = 0;
470 sops.sem_num = sema->semNum;
471
472 /*
473 * Note: if errStatus is -1 and errno == EINTR then it means we returned
474 * from the operation prematurely because we were sent a signal. So we
475 * try and unlock the semaphore again. Not clear this can really happen,
476 * but might as well cope.
477 */
478 do
479 {
480 errStatus = semop(sema->semId, &sops, 1);
481 } while (errStatus < 0 && errno == EINTR);
482
483 if (errStatus < 0)
484 elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
485}
486
487/*
488 * PGSemaphoreTryLock
489 *
490 * Lock a semaphore only if able to do so without blocking
491 */
492bool
494{
495 int errStatus;
496 struct sembuf sops;
497
498 sops.sem_op = -1; /* decrement */
499 sops.sem_flg = IPC_NOWAIT; /* but don't block */
500 sops.sem_num = sema->semNum;
501
502 /*
503 * Note: if errStatus is -1 and errno == EINTR then it means we returned
504 * from the operation prematurely because we were sent a signal. So we
505 * try and lock the semaphore again.
506 */
507 do
508 {
509 errStatus = semop(sema->semId, &sops, 1);
510 } while (errStatus < 0 && errno == EINTR);
511
512 if (errStatus < 0)
513 {
514 /* Expect EAGAIN or EWOULDBLOCK (platform-dependent) */
515#ifdef EAGAIN
516 if (errno == EAGAIN)
517 return false; /* failed to lock it */
518#endif
519#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
520 if (errno == EWOULDBLOCK)
521 return false; /* failed to lock it */
522#endif
523 /* Otherwise we got trouble */
524 elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
525 }
526
527 return true;
528}
size_t Size
Definition: c.h:613
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errcode_for_file_access(void)
Definition: elog.c:886
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define PANIC
Definition: elog.h:42
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
bool IsUnderPostmaster
Definition: globals.c:120
char * DataDir
Definition: globals.c:71
Assert(PointerIsAligned(start, uint64))
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
static struct @171 value
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
int i
Definition: isn.c:77
void * arg
struct PGSemaphoreData * PGSemaphore
Definition: pg_sema.h:34
static int numSems
Definition: posix_sema.c:66
uint64_t Datum
Definition: postgres.h:70
Size mul_size(Size s1, Size s2)
Definition: shmem.c:510
void * ShmemAlloc(Size size)
Definition: shmem.c:154
_ino_t st_ino
Definition: win32_port.h:257
Size PGSemaphoreShmemSize(int maxSemas)
Definition: sysv_sema.c:307
#define PGSemaMagic
Definition: sysv_sema.c:58
static IpcSemaphoreId InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems, bool retry_ok)
Definition: sysv_sema.c:97
void PGSemaphoreUnlock(PGSemaphore sema)
Definition: sysv_sema.c:463
struct PGSemaphoreData PGSemaphoreData
static pid_t IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum)
Definition: sysv_sema.c:203
static int IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum)
Definition: sysv_sema.c:192
key_t IpcSemaphoreKey
Definition: sysv_sema.c:45
int IpcSemaphoreId
Definition: sysv_sema.c:46
void PGReserveSemaphores(int maxSemas)
Definition: sysv_sema.c:330
static void IpcSemaphoreKill(IpcSemaphoreId semId)
Definition: sysv_sema.c:180
static int maxSharedSemas
Definition: sysv_sema.c:63
#define SEMAS_PER_SET
Definition: sysv_sema.c:54
void PGSemaphoreReset(PGSemaphore sema)
Definition: sysv_sema.c:419
void PGSemaphoreLock(PGSemaphore sema)
Definition: sysv_sema.c:430
#define IPCProtection
Definition: sysv_sema.c:56
static IpcSemaphoreKey nextSemaKey
Definition: sysv_sema.c:67
static int numSemaSets
Definition: sysv_sema.c:65
bool PGSemaphoreTryLock(PGSemaphore sema)
Definition: sysv_sema.c:493
static int nextSemaNumber
Definition: sysv_sema.c:68
static PGSemaphore sharedSemas
Definition: sysv_sema.c:61
static int maxSemaSets
Definition: sysv_sema.c:66
PGSemaphore PGSemaphoreCreate(void)
Definition: sysv_sema.c:384
static int numSharedSemas
Definition: sysv_sema.c:62
static void ReleaseSemaphores(int status, Datum arg)
Definition: sysv_sema.c:369
static IpcSemaphoreId * mySemaSets
Definition: sysv_sema.c:64
static IpcSemaphoreId IpcSemaphoreCreate(int numSems)
Definition: sysv_sema.c:223
static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value)
Definition: sysv_sema.c:157
int val
Definition: sysv_sema.c:39
struct semid_ds * buf
Definition: sysv_sema.c:40
unsigned short * array
Definition: sysv_sema.c:41
#define stat
Definition: win32_port.h:274
#define SETVAL
Definition: win32_port.h:108
#define EINTR
Definition: win32_port.h:364
#define EWOULDBLOCK
Definition: win32_port.h:370
#define IPC_NOWAIT
Definition: win32_port.h:97
#define IPC_RMID
Definition: win32_port.h:93
#define GETPID
Definition: win32_port.h:109
#define kill(pid, sig)
Definition: win32_port.h:493
long key_t
Definition: win32_port.h:237
#define IPC_EXCL
Definition: win32_port.h:95
#define IPC_CREAT
Definition: win32_port.h:94
#define EIDRM
Definition: win32_port.h:102
#define GETVAL
Definition: win32_port.h:107
#define EAGAIN
Definition: win32_port.h:362