PostgreSQL Source Code  git master
latch.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * latch.c
4  * Routines for inter-process latches
5  *
6  * The poll() implementation uses the so-called self-pipe trick to overcome the
7  * race condition involved with poll() and setting a global flag in the signal
8  * handler. When a latch is set and the current process is waiting for it, the
9  * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe.
10  * A signal by itself doesn't interrupt poll() on all platforms, and even on
11  * platforms where it does, a signal that arrives just before the poll() call
12  * does not prevent poll() from entering sleep. An incoming byte on a pipe
13  * however reliably interrupts the sleep, and causes poll() to return
14  * immediately even if the signal arrives before poll() begins.
15  *
16  * The epoll() implementation overcomes the race with a different technique: it
17  * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We
18  * don't need to register a signal handler or create our own self-pipe. We
19  * assume that any system that has Linux epoll() also has Linux signalfd().
20  *
21  * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL.
22  *
23  * The Windows implementation uses Windows events that are inherited by all
24  * postmaster child processes. There's no need for the self-pipe trick there.
25  *
26  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  * IDENTIFICATION
30  * src/backend/storage/ipc/latch.c
31  *
32  *-------------------------------------------------------------------------
33  */
34 #include "postgres.h"
35 
36 #include <fcntl.h>
37 #include <limits.h>
38 #include <signal.h>
39 #include <unistd.h>
40 #ifdef HAVE_SYS_EPOLL_H
41 #include <sys/epoll.h>
42 #endif
43 #ifdef HAVE_SYS_EVENT_H
44 #include <sys/event.h>
45 #endif
46 #ifdef HAVE_SYS_SIGNALFD_H
47 #include <sys/signalfd.h>
48 #endif
49 #ifdef HAVE_POLL_H
50 #include <poll.h>
51 #endif
52 
53 #include "libpq/pqsignal.h"
54 #include "miscadmin.h"
55 #include "pgstat.h"
56 #include "port/atomics.h"
57 #include "portability/instr_time.h"
58 #include "postmaster/postmaster.h"
59 #include "storage/fd.h"
60 #include "storage/ipc.h"
61 #include "storage/latch.h"
62 #include "storage/pmsignal.h"
63 #include "storage/shmem.h"
64 #include "utils/memutils.h"
65 
66 /*
67  * Select the fd readiness primitive to use. Normally the "most modern"
68  * primitive supported by the OS will be used, but for testing it can be
69  * useful to manually specify the used primitive. If desired, just add a
70  * define somewhere before this block.
71  */
72 #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
73  defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
74 /* don't overwrite manual choice */
75 #elif defined(HAVE_SYS_EPOLL_H)
76 #define WAIT_USE_EPOLL
77 #elif defined(HAVE_KQUEUE)
78 #define WAIT_USE_KQUEUE
79 #elif defined(HAVE_POLL)
80 #define WAIT_USE_POLL
81 #elif WIN32
82 #define WAIT_USE_WIN32
83 #else
84 #error "no wait set implementation available"
85 #endif
86 
87 /*
88  * By default, we use a self-pipe with poll() and a signalfd with epoll(), if
89  * available. For testing the choice can also be manually specified.
90  */
91 #if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL)
92 #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
93 /* don't overwrite manual choice */
94 #elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H)
95 #define WAIT_USE_SIGNALFD
96 #else
97 #define WAIT_USE_SELF_PIPE
98 #endif
99 #endif
100 
101 /* typedef in latch.h */
103 {
104  int nevents; /* number of registered events */
105  int nevents_space; /* maximum number of events in this set */
106 
107  /*
108  * Array, of nevents_space length, storing the definition of events this
109  * set is waiting for.
110  */
112 
113  /*
114  * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
115  * said latch, and latch_pos the offset in the ->events array. This is
116  * useful because we check the state of the latch before performing doing
117  * syscalls related to waiting.
118  */
121 
122  /*
123  * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
124  * is set so that we'll exit immediately if postmaster death is detected,
125  * instead of returning.
126  */
128 
129 #if defined(WAIT_USE_EPOLL)
130  int epoll_fd;
131  /* epoll_wait returns events in a user provided arrays, allocate once */
132  struct epoll_event *epoll_ret_events;
133 #elif defined(WAIT_USE_KQUEUE)
134  int kqueue_fd;
135  /* kevent returns events in a user provided arrays, allocate once */
136  struct kevent *kqueue_ret_events;
137  bool report_postmaster_not_running;
138 #elif defined(WAIT_USE_POLL)
139  /* poll expects events to be waited on every poll() call, prepare once */
140  struct pollfd *pollfds;
141 #elif defined(WAIT_USE_WIN32)
142 
143  /*
144  * Array of windows events. The first element always contains
145  * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
146  * event->pos + 1).
147  */
148  HANDLE *handles;
149 #endif
150 };
151 
152 /* A common WaitEventSet used to implement WaitLatch() */
154 
155 /* The position of the latch in LatchWaitSet. */
156 #define LatchWaitSetLatchPos 0
157 
158 #ifndef WIN32
159 /* Are we currently in WaitLatch? The signal handler would like to know. */
160 static volatile sig_atomic_t waiting = false;
161 #endif
162 
163 #ifdef WAIT_USE_SIGNALFD
164 /* On Linux, we'll receive SIGURG via a signalfd file descriptor. */
165 static int signal_fd = -1;
166 #endif
167 
168 #ifdef WAIT_USE_SELF_PIPE
169 /* Read and write ends of the self-pipe */
170 static int selfpipe_readfd = -1;
171 static int selfpipe_writefd = -1;
172 
173 /* Process owning the self-pipe --- needed for checking purposes */
174 static int selfpipe_owner_pid = 0;
175 
176 /* Private function prototypes */
177 static void latch_sigurg_handler(SIGNAL_ARGS);
178 static void sendSelfPipeByte(void);
179 #endif
180 
181 #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
182 static void drain(void);
183 #endif
184 
185 #if defined(WAIT_USE_EPOLL)
186 static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
187 #elif defined(WAIT_USE_KQUEUE)
188 static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
189 #elif defined(WAIT_USE_POLL)
190 static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
191 #elif defined(WAIT_USE_WIN32)
192 static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
193 #endif
194 
195 static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
196  WaitEvent *occurred_events, int nevents);
197 
198 /*
199  * Initialize the process-local latch infrastructure.
200  *
201  * This must be called once during startup of any process that can wait on
202  * latches, before it issues any InitLatch() or OwnLatch() calls.
203  */
204 void
206 {
207 #if defined(WAIT_USE_SELF_PIPE)
208  int pipefd[2];
209 
210  if (IsUnderPostmaster)
211  {
212  /*
213  * We might have inherited connections to a self-pipe created by the
214  * postmaster. It's critical that child processes create their own
215  * self-pipes, of course, and we really want them to close the
216  * inherited FDs for safety's sake.
217  */
218  if (selfpipe_owner_pid != 0)
219  {
220  /* Assert we go through here but once in a child process */
222  /* Release postmaster's pipe FDs; ignore any error */
223  (void) close(selfpipe_readfd);
224  (void) close(selfpipe_writefd);
225  /* Clean up, just for safety's sake; we'll set these below */
227  selfpipe_owner_pid = 0;
228  /* Keep fd.c's accounting straight */
231  }
232  else
233  {
234  /*
235  * Postmaster didn't create a self-pipe ... or else we're in an
236  * EXEC_BACKEND build, in which case it doesn't matter since the
237  * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
238  * fd.c won't have state to clean up, either.
239  */
240  Assert(selfpipe_readfd == -1);
241  }
242  }
243  else
244  {
245  /* In postmaster or standalone backend, assert we do this but once */
246  Assert(selfpipe_readfd == -1);
248  }
249 
250  /*
251  * Set up the self-pipe that allows a signal handler to wake up the
252  * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
253  * that SetLatch won't block if the event has already been set many times
254  * filling the kernel buffer. Make the read-end non-blocking too, so that
255  * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
256  * Also, make both FDs close-on-exec, since we surely do not want any
257  * child processes messing with them.
258  */
259  if (pipe(pipefd) < 0)
260  elog(FATAL, "pipe() failed: %m");
261  if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
262  elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
263  if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
264  elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
265  if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
266  elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
267  if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
268  elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
269 
270  selfpipe_readfd = pipefd[0];
271  selfpipe_writefd = pipefd[1];
273 
274  /* Tell fd.c about these two long-lived FDs */
277 
279 #endif
280 
281 #ifdef WAIT_USE_SIGNALFD
282  sigset_t signalfd_mask;
283 
284  if (IsUnderPostmaster)
285  {
286  /*
287  * It would probably be safe to re-use the inherited signalfd since
288  * signalfds only see the current process's pending signals, but it
289  * seems less surprising to close it and create our own.
290  */
291  if (signal_fd != -1)
292  {
293  /* Release postmaster's signal FD; ignore any error */
294  (void) close(signal_fd);
295  signal_fd = -1;
297  }
298  }
299 
300  /* Block SIGURG, because we'll receive it through a signalfd. */
301  sigaddset(&UnBlockSig, SIGURG);
302 
303  /* Set up the signalfd to receive SIGURG notifications. */
304  sigemptyset(&signalfd_mask);
305  sigaddset(&signalfd_mask, SIGURG);
306  signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC);
307  if (signal_fd < 0)
308  elog(FATAL, "signalfd() failed");
310 #endif
311 
312 #ifdef WAIT_USE_KQUEUE
313  /* Ignore SIGURG, because we'll receive it via kqueue. */
314  pqsignal(SIGURG, SIG_IGN);
315 #endif
316 }
317 
318 void
320 {
321  int latch_pos PG_USED_FOR_ASSERTS_ONLY;
322 
323  Assert(LatchWaitSet == NULL);
324 
325  /* Set up the WaitEventSet used by WaitLatch(). */
328  MyLatch, NULL);
329  if (IsUnderPostmaster)
331  PGINVALID_SOCKET, NULL, NULL);
332 
333  Assert(latch_pos == LatchWaitSetLatchPos);
334 }
335 
336 void
338 {
339 #if defined(WAIT_USE_POLL)
340  pqsignal(SIGURG, SIG_IGN);
341 #endif
342 
343  if (LatchWaitSet)
344  {
346  LatchWaitSet = NULL;
347  }
348 
349 #if defined(WAIT_USE_SELF_PIPE)
352  selfpipe_readfd = -1;
353  selfpipe_writefd = -1;
355 #endif
356 
357 #if defined(WAIT_USE_SIGNALFD)
358  close(signal_fd);
359  signal_fd = -1;
360 #endif
361 }
362 
363 /*
364  * Initialize a process-local latch.
365  */
366 void
368 {
369  latch->is_set = false;
370  latch->maybe_sleeping = false;
371  latch->owner_pid = MyProcPid;
372  latch->is_shared = false;
373 
374 #if defined(WAIT_USE_SELF_PIPE)
375  /* Assert InitializeLatchSupport has been called in this process */
377 #elif defined(WAIT_USE_SIGNALFD)
378  /* Assert InitializeLatchSupport has been called in this process */
379  Assert(signal_fd >= 0);
380 #elif defined(WAIT_USE_WIN32)
381  latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
382  if (latch->event == NULL)
383  elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
384 #endif /* WIN32 */
385 }
386 
387 /*
388  * Initialize a shared latch that can be set from other processes. The latch
389  * is initially owned by no-one; use OwnLatch to associate it with the
390  * current process.
391  *
392  * InitSharedLatch needs to be called in postmaster before forking child
393  * processes, usually right after allocating the shared memory block
394  * containing the latch with ShmemInitStruct. (The Unix implementation
395  * doesn't actually require that, but the Windows one does.) Because of
396  * this restriction, we have no concurrency issues to worry about here.
397  *
398  * Note that other handles created in this module are never marked as
399  * inheritable. Thus we do not need to worry about cleaning up child
400  * process references to postmaster-private latches or WaitEventSets.
401  */
402 void
404 {
405 #ifdef WIN32
406  SECURITY_ATTRIBUTES sa;
407 
408  /*
409  * Set up security attributes to specify that the events are inherited.
410  */
411  ZeroMemory(&sa, sizeof(sa));
412  sa.nLength = sizeof(sa);
413  sa.bInheritHandle = TRUE;
414 
415  latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
416  if (latch->event == NULL)
417  elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
418 #endif
419 
420  latch->is_set = false;
421  latch->maybe_sleeping = false;
422  latch->owner_pid = 0;
423  latch->is_shared = true;
424 }
425 
426 /*
427  * Associate a shared latch with the current process, allowing it to
428  * wait on the latch.
429  *
430  * Although there is a sanity check for latch-already-owned, we don't do
431  * any sort of locking here, meaning that we could fail to detect the error
432  * if two processes try to own the same latch at about the same time. If
433  * there is any risk of that, caller must provide an interlock to prevent it.
434  */
435 void
437 {
438  int owner_pid;
439 
440  /* Sanity checks */
441  Assert(latch->is_shared);
442 
443 #if defined(WAIT_USE_SELF_PIPE)
444  /* Assert InitializeLatchSupport has been called in this process */
446 #elif defined(WAIT_USE_SIGNALFD)
447  /* Assert InitializeLatchSupport has been called in this process */
448  Assert(signal_fd >= 0);
449 #endif
450 
451  owner_pid = latch->owner_pid;
452  if (owner_pid != 0)
453  elog(PANIC, "latch already owned by PID %d", owner_pid);
454 
455  latch->owner_pid = MyProcPid;
456 }
457 
458 /*
459  * Disown a shared latch currently owned by the current process.
460  */
461 void
463 {
464  Assert(latch->is_shared);
465  Assert(latch->owner_pid == MyProcPid);
466 
467  latch->owner_pid = 0;
468 }
469 
470 /*
471  * Wait for a given latch to be set, or for postmaster death, or until timeout
472  * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
473  * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
474  * function returns immediately.
475  *
476  * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
477  * is given. Although it is declared as "long", we don't actually support
478  * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
479  * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
480  *
481  * The latch must be owned by the current process, ie. it must be a
482  * process-local latch initialized with InitLatch, or a shared latch
483  * associated with the current process by calling OwnLatch.
484  *
485  * Returns bit mask indicating which condition(s) caused the wake-up. Note
486  * that if multiple wake-up conditions are true, there is no guarantee that
487  * we return all of them in one call, but we will return at least one.
488  */
489 int
490 WaitLatch(Latch *latch, int wakeEvents, long timeout,
491  uint32 wait_event_info)
492 {
493  WaitEvent event;
494 
495  /* Postmaster-managed callers must handle postmaster death somehow. */
497  (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
498  (wakeEvents & WL_POSTMASTER_DEATH));
499 
500  /*
501  * Some callers may have a latch other than MyLatch, or no latch at all,
502  * or want to handle postmaster death differently. It's cheap to assign
503  * those, so just do it every time.
504  */
505  if (!(wakeEvents & WL_LATCH_SET))
506  latch = NULL;
509  ((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
510 
512  (wakeEvents & WL_TIMEOUT) ? timeout : -1,
513  &event, 1,
514  wait_event_info) == 0)
515  return WL_TIMEOUT;
516  else
517  return event.events;
518 }
519 
520 /*
521  * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
522  * conditions.
523  *
524  * When waiting on a socket, EOF and error conditions always cause the socket
525  * to be reported as readable/writable/connected, so that the caller can deal
526  * with the condition.
527  *
528  * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
529  * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
530  * return value if the postmaster dies. The latter is useful for rare cases
531  * where some behavior other than immediate exit is needed.
532  *
533  * NB: These days this is just a wrapper around the WaitEventSet API. When
534  * using a latch very frequently, consider creating a longer living
535  * WaitEventSet instead; that's more efficient.
536  */
537 int
538 WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
539  long timeout, uint32 wait_event_info)
540 {
541  int ret = 0;
542  int rc;
543  WaitEvent event;
545 
546  if (wakeEvents & WL_TIMEOUT)
547  Assert(timeout >= 0);
548  else
549  timeout = -1;
550 
551  if (wakeEvents & WL_LATCH_SET)
553  latch, NULL);
554 
555  /* Postmaster-managed callers must handle postmaster death somehow. */
557  (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
558  (wakeEvents & WL_POSTMASTER_DEATH));
559 
560  if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
562  NULL, NULL);
563 
564  if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
566  NULL, NULL);
567 
568  if (wakeEvents & WL_SOCKET_MASK)
569  {
570  int ev;
571 
572  ev = wakeEvents & WL_SOCKET_MASK;
573  AddWaitEventToSet(set, ev, sock, NULL, NULL);
574  }
575 
576  rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
577 
578  if (rc == 0)
579  ret |= WL_TIMEOUT;
580  else
581  {
582  ret |= event.events & (WL_LATCH_SET |
585  }
586 
587  FreeWaitEventSet(set);
588 
589  return ret;
590 }
591 
592 /*
593  * Sets a latch and wakes up anyone waiting on it.
594  *
595  * This is cheap if the latch is already set, otherwise not so much.
596  *
597  * NB: when calling this in a signal handler, be sure to save and restore
598  * errno around it. (That's standard practice in most signal handlers, of
599  * course, but we used to omit it in handlers that only set a flag.)
600  *
601  * NB: this function is called from critical sections and signal handlers so
602  * throwing an error is not a good idea.
603  */
604 void
606 {
607 #ifndef WIN32
608  pid_t owner_pid;
609 #else
610  HANDLE handle;
611 #endif
612 
613  /*
614  * The memory barrier has to be placed here to ensure that any flag
615  * variables possibly changed by this process have been flushed to main
616  * memory, before we check/set is_set.
617  */
619 
620  /* Quick exit if already set */
621  if (latch->is_set)
622  return;
623 
624  latch->is_set = true;
625 
627  if (!latch->maybe_sleeping)
628  return;
629 
630 #ifndef WIN32
631 
632  /*
633  * See if anyone's waiting for the latch. It can be the current process if
634  * we're in a signal handler. We use the self-pipe or SIGURG to ourselves
635  * to wake up WaitEventSetWaitBlock() without races in that case. If it's
636  * another process, send a signal.
637  *
638  * Fetch owner_pid only once, in case the latch is concurrently getting
639  * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
640  * guaranteed to be true! In practice, the effective range of pid_t fits
641  * in a 32 bit integer, and so should be atomic. In the worst case, we
642  * might end up signaling the wrong process. Even then, you're very
643  * unlucky if a process with that bogus pid exists and belongs to
644  * Postgres; and PG database processes should handle excess SIGUSR1
645  * interrupts without a problem anyhow.
646  *
647  * Another sort of race condition that's possible here is for a new
648  * process to own the latch immediately after we look, so we don't signal
649  * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
650  * the standard coding convention of waiting at the bottom of their loops,
651  * not the top, so that they'll correctly process latch-setting events
652  * that happen before they enter the loop.
653  */
654  owner_pid = latch->owner_pid;
655  if (owner_pid == 0)
656  return;
657  else if (owner_pid == MyProcPid)
658  {
659 #if defined(WAIT_USE_SELF_PIPE)
660  if (waiting)
662 #else
663  if (waiting)
664  kill(MyProcPid, SIGURG);
665 #endif
666  }
667  else
668  kill(owner_pid, SIGURG);
669 
670 #else
671 
672  /*
673  * See if anyone's waiting for the latch. It can be the current process if
674  * we're in a signal handler.
675  *
676  * Use a local variable here just in case somebody changes the event field
677  * concurrently (which really should not happen).
678  */
679  handle = latch->event;
680  if (handle)
681  {
682  SetEvent(handle);
683 
684  /*
685  * Note that we silently ignore any errors. We might be in a signal
686  * handler or other critical path where it's not safe to call elog().
687  */
688  }
689 #endif
690 }
691 
692 /*
693  * Clear the latch. Calling WaitLatch after this will sleep, unless
694  * the latch is set again before the WaitLatch call.
695  */
696 void
698 {
699  /* Only the owner should reset the latch */
700  Assert(latch->owner_pid == MyProcPid);
701  Assert(latch->maybe_sleeping == false);
702 
703  latch->is_set = false;
704 
705  /*
706  * Ensure that the write to is_set gets flushed to main memory before we
707  * examine any flag variables. Otherwise a concurrent SetLatch might
708  * falsely conclude that it needn't signal us, even though we have missed
709  * seeing some flag updates that SetLatch was supposed to inform us of.
710  */
712 }
713 
714 /*
715  * Create a WaitEventSet with space for nevents different events to wait for.
716  *
717  * These events can then be efficiently waited upon together, using
718  * WaitEventSetWait().
719  */
720 WaitEventSet *
721 CreateWaitEventSet(MemoryContext context, int nevents)
722 {
723  WaitEventSet *set;
724  char *data;
725  Size sz = 0;
726 
727  /*
728  * Use MAXALIGN size/alignment to guarantee that later uses of memory are
729  * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
730  * platforms, but earlier allocations like WaitEventSet and WaitEvent
731  * might not be sized to guarantee that when purely using sizeof().
732  */
733  sz += MAXALIGN(sizeof(WaitEventSet));
734  sz += MAXALIGN(sizeof(WaitEvent) * nevents);
735 
736 #if defined(WAIT_USE_EPOLL)
737  sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
738 #elif defined(WAIT_USE_KQUEUE)
739  sz += MAXALIGN(sizeof(struct kevent) * nevents);
740 #elif defined(WAIT_USE_POLL)
741  sz += MAXALIGN(sizeof(struct pollfd) * nevents);
742 #elif defined(WAIT_USE_WIN32)
743  /* need space for the pgwin32_signal_event */
744  sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
745 #endif
746 
747  data = (char *) MemoryContextAllocZero(context, sz);
748 
749  set = (WaitEventSet *) data;
750  data += MAXALIGN(sizeof(WaitEventSet));
751 
752  set->events = (WaitEvent *) data;
753  data += MAXALIGN(sizeof(WaitEvent) * nevents);
754 
755 #if defined(WAIT_USE_EPOLL)
756  set->epoll_ret_events = (struct epoll_event *) data;
757  data += MAXALIGN(sizeof(struct epoll_event) * nevents);
758 #elif defined(WAIT_USE_KQUEUE)
759  set->kqueue_ret_events = (struct kevent *) data;
760  data += MAXALIGN(sizeof(struct kevent) * nevents);
761 #elif defined(WAIT_USE_POLL)
762  set->pollfds = (struct pollfd *) data;
763  data += MAXALIGN(sizeof(struct pollfd) * nevents);
764 #elif defined(WAIT_USE_WIN32)
765  set->handles = (HANDLE) data;
766  data += MAXALIGN(sizeof(HANDLE) * nevents);
767 #endif
768 
769  set->latch = NULL;
770  set->nevents_space = nevents;
771  set->exit_on_postmaster_death = false;
772 
773 #if defined(WAIT_USE_EPOLL)
774  if (!AcquireExternalFD())
775  {
776  /* treat this as though epoll_create1 itself returned EMFILE */
777  elog(ERROR, "epoll_create1 failed: %m");
778  }
779  set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
780  if (set->epoll_fd < 0)
781  {
783  elog(ERROR, "epoll_create1 failed: %m");
784  }
785 #elif defined(WAIT_USE_KQUEUE)
786  if (!AcquireExternalFD())
787  {
788  /* treat this as though kqueue itself returned EMFILE */
789  elog(ERROR, "kqueue failed: %m");
790  }
791  set->kqueue_fd = kqueue();
792  if (set->kqueue_fd < 0)
793  {
795  elog(ERROR, "kqueue failed: %m");
796  }
797  if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
798  {
799  int save_errno = errno;
800 
801  close(set->kqueue_fd);
803  errno = save_errno;
804  elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
805  }
806  set->report_postmaster_not_running = false;
807 #elif defined(WAIT_USE_WIN32)
808 
809  /*
810  * To handle signals while waiting, we need to add a win32 specific event.
811  * We accounted for the additional event at the top of this routine. See
812  * port/win32/signal.c for more details.
813  *
814  * Note: pgwin32_signal_event should be first to ensure that it will be
815  * reported when multiple events are set. We want to guarantee that
816  * pending signals are serviced.
817  */
818  set->handles[0] = pgwin32_signal_event;
819  StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
820 #endif
821 
822  return set;
823 }
824 
825 /*
826  * Free a previously created WaitEventSet.
827  *
828  * Note: preferably, this shouldn't have to free any resources that could be
829  * inherited across an exec(). If it did, we'd likely leak those resources in
830  * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC
831  * when the FD is created. For the Windows case, we assume that the handles
832  * involved are non-inheritable.
833  */
834 void
836 {
837 #if defined(WAIT_USE_EPOLL)
838  close(set->epoll_fd);
840 #elif defined(WAIT_USE_KQUEUE)
841  close(set->kqueue_fd);
843 #elif defined(WAIT_USE_WIN32)
844  WaitEvent *cur_event;
845 
846  for (cur_event = set->events;
847  cur_event < (set->events + set->nevents);
848  cur_event++)
849  {
850  if (cur_event->events & WL_LATCH_SET)
851  {
852  /* uses the latch's HANDLE */
853  }
854  else if (cur_event->events & WL_POSTMASTER_DEATH)
855  {
856  /* uses PostmasterHandle */
857  }
858  else
859  {
860  /* Clean up the event object we created for the socket */
861  WSAEventSelect(cur_event->fd, NULL, 0);
862  WSACloseEvent(set->handles[cur_event->pos + 1]);
863  }
864  }
865 #endif
866 
867  pfree(set);
868 }
869 
870 /*
871  * Free a previously created WaitEventSet in a child process after a fork().
872  */
873 void
875 {
876 #if defined(WAIT_USE_EPOLL)
877  close(set->epoll_fd);
879 #elif defined(WAIT_USE_KQUEUE)
880  /* kqueues are not normally inherited by child processes */
882 #endif
883 
884  pfree(set);
885 }
886 
887 /* ---
888  * Add an event to the set. Possible events are:
889  * - WL_LATCH_SET: Wait for the latch to be set
890  * - WL_POSTMASTER_DEATH: Wait for postmaster to die
891  * - WL_SOCKET_READABLE: Wait for socket to become readable,
892  * can be combined in one event with other WL_SOCKET_* events
893  * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
894  * can be combined with other WL_SOCKET_* events
895  * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
896  * can be combined with other WL_SOCKET_* events (on non-Windows
897  * platforms, this is the same as WL_SOCKET_WRITEABLE)
898  * - WL_SOCKET_ACCEPT: Wait for new connection to a server socket,
899  * can be combined with other WL_SOCKET_* events (on non-Windows
900  * platforms, this is the same as WL_SOCKET_READABLE)
901  * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
902  * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
903  *
904  * Returns the offset in WaitEventSet->events (starting from 0), which can be
905  * used to modify previously added wait events using ModifyWaitEvent().
906  *
907  * In the WL_LATCH_SET case the latch must be owned by the current process,
908  * i.e. it must be a process-local latch initialized with InitLatch, or a
909  * shared latch associated with the current process by calling OwnLatch.
910  *
911  * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED/ACCEPT cases, EOF and error
912  * conditions cause the socket to be reported as readable/writable/connected,
913  * so that the caller can deal with the condition.
914  *
915  * The user_data pointer specified here will be set for the events returned
916  * by WaitEventSetWait(), allowing to easily associate additional data with
917  * events.
918  */
919 int
921  void *user_data)
922 {
923  WaitEvent *event;
924 
925  /* not enough space */
926  Assert(set->nevents < set->nevents_space);
927 
928  if (events == WL_EXIT_ON_PM_DEATH)
929  {
930  events = WL_POSTMASTER_DEATH;
931  set->exit_on_postmaster_death = true;
932  }
933 
934  if (latch)
935  {
936  if (latch->owner_pid != MyProcPid)
937  elog(ERROR, "cannot wait on a latch owned by another process");
938  if (set->latch)
939  elog(ERROR, "cannot wait on more than one latch");
940  if ((events & WL_LATCH_SET) != WL_LATCH_SET)
941  elog(ERROR, "latch events only support being set");
942  }
943  else
944  {
945  if (events & WL_LATCH_SET)
946  elog(ERROR, "cannot wait on latch without a specified latch");
947  }
948 
949  /* waiting for socket readiness without a socket indicates a bug */
950  if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
951  elog(ERROR, "cannot wait on socket event without a socket");
952 
953  event = &set->events[set->nevents];
954  event->pos = set->nevents++;
955  event->fd = fd;
956  event->events = events;
957  event->user_data = user_data;
958 #ifdef WIN32
959  event->reset = false;
960 #endif
961 
962  if (events == WL_LATCH_SET)
963  {
964  set->latch = latch;
965  set->latch_pos = event->pos;
966 #if defined(WAIT_USE_SELF_PIPE)
967  event->fd = selfpipe_readfd;
968 #elif defined(WAIT_USE_SIGNALFD)
969  event->fd = signal_fd;
970 #else
971  event->fd = PGINVALID_SOCKET;
972 #ifdef WAIT_USE_EPOLL
973  return event->pos;
974 #endif
975 #endif
976  }
977  else if (events == WL_POSTMASTER_DEATH)
978  {
979 #ifndef WIN32
981 #endif
982  }
983 
984  /* perform wait primitive specific initialization, if needed */
985 #if defined(WAIT_USE_EPOLL)
986  WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
987 #elif defined(WAIT_USE_KQUEUE)
988  WaitEventAdjustKqueue(set, event, 0);
989 #elif defined(WAIT_USE_POLL)
990  WaitEventAdjustPoll(set, event);
991 #elif defined(WAIT_USE_WIN32)
992  WaitEventAdjustWin32(set, event);
993 #endif
994 
995  return event->pos;
996 }
997 
998 /*
999  * Change the event mask and, in the WL_LATCH_SET case, the latch associated
1000  * with the WaitEvent. The latch may be changed to NULL to disable the latch
1001  * temporarily, and then set back to a latch later.
1002  *
1003  * 'pos' is the id returned by AddWaitEventToSet.
1004  */
1005 void
1006 ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
1007 {
1008  WaitEvent *event;
1009 #if defined(WAIT_USE_KQUEUE)
1010  int old_events;
1011 #endif
1012 
1013  Assert(pos < set->nevents);
1014 
1015  event = &set->events[pos];
1016 #if defined(WAIT_USE_KQUEUE)
1017  old_events = event->events;
1018 #endif
1019 
1020  /*
1021  * If neither the event mask nor the associated latch changes, return
1022  * early. That's an important optimization for some sockets, where
1023  * ModifyWaitEvent is frequently used to switch from waiting for reads to
1024  * waiting on writes.
1025  */
1026  if (events == event->events &&
1027  (!(event->events & WL_LATCH_SET) || set->latch == latch))
1028  return;
1029 
1030  if (event->events & WL_LATCH_SET &&
1031  events != event->events)
1032  {
1033  elog(ERROR, "cannot modify latch event");
1034  }
1035 
1036  if (event->events & WL_POSTMASTER_DEATH)
1037  {
1038  elog(ERROR, "cannot modify postmaster death event");
1039  }
1040 
1041  /* FIXME: validate event mask */
1042  event->events = events;
1043 
1044  if (events == WL_LATCH_SET)
1045  {
1046  if (latch && latch->owner_pid != MyProcPid)
1047  elog(ERROR, "cannot wait on a latch owned by another process");
1048  set->latch = latch;
1049 
1050  /*
1051  * On Unix, we don't need to modify the kernel object because the
1052  * underlying pipe (if there is one) is the same for all latches so we
1053  * can return immediately. On Windows, we need to update our array of
1054  * handles, but we leave the old one in place and tolerate spurious
1055  * wakeups if the latch is disabled.
1056  */
1057 #if defined(WAIT_USE_WIN32)
1058  if (!latch)
1059  return;
1060 #else
1061  return;
1062 #endif
1063  }
1064 
1065 #if defined(WAIT_USE_EPOLL)
1066  WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
1067 #elif defined(WAIT_USE_KQUEUE)
1068  WaitEventAdjustKqueue(set, event, old_events);
1069 #elif defined(WAIT_USE_POLL)
1070  WaitEventAdjustPoll(set, event);
1071 #elif defined(WAIT_USE_WIN32)
1072  WaitEventAdjustWin32(set, event);
1073 #endif
1074 }
1075 
1076 #if defined(WAIT_USE_EPOLL)
1077 /*
1078  * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
1079  */
1080 static void
1081 WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
1082 {
1083  struct epoll_event epoll_ev;
1084  int rc;
1085 
1086  /* pointer to our event, returned by epoll_wait */
1087  epoll_ev.data.ptr = event;
1088  /* always wait for errors */
1089  epoll_ev.events = EPOLLERR | EPOLLHUP;
1090 
1091  /* prepare pollfd entry once */
1092  if (event->events == WL_LATCH_SET)
1093  {
1094  Assert(set->latch != NULL);
1095  epoll_ev.events |= EPOLLIN;
1096  }
1097  else if (event->events == WL_POSTMASTER_DEATH)
1098  {
1099  epoll_ev.events |= EPOLLIN;
1100  }
1101  else
1102  {
1103  Assert(event->fd != PGINVALID_SOCKET);
1104  Assert(event->events & (WL_SOCKET_READABLE |
1106  WL_SOCKET_CLOSED));
1107 
1108  if (event->events & WL_SOCKET_READABLE)
1109  epoll_ev.events |= EPOLLIN;
1110  if (event->events & WL_SOCKET_WRITEABLE)
1111  epoll_ev.events |= EPOLLOUT;
1112  if (event->events & WL_SOCKET_CLOSED)
1113  epoll_ev.events |= EPOLLRDHUP;
1114  }
1115 
1116  /*
1117  * Even though unused, we also pass epoll_ev as the data argument if
1118  * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
1119  * requiring that, and actually it makes the code simpler...
1120  */
1121  rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
1122 
1123  if (rc < 0)
1124  ereport(ERROR,
1126  errmsg("%s() failed: %m",
1127  "epoll_ctl")));
1128 }
1129 #endif
1130 
1131 #if defined(WAIT_USE_POLL)
1132 static void
1134 {
1135  struct pollfd *pollfd = &set->pollfds[event->pos];
1136 
1137  pollfd->revents = 0;
1138  pollfd->fd = event->fd;
1139 
1140  /* prepare pollfd entry once */
1141  if (event->events == WL_LATCH_SET)
1142  {
1143  Assert(set->latch != NULL);
1144  pollfd->events = POLLIN;
1145  }
1146  else if (event->events == WL_POSTMASTER_DEATH)
1147  {
1148  pollfd->events = POLLIN;
1149  }
1150  else
1151  {
1152  Assert(event->events & (WL_SOCKET_READABLE |
1154  WL_SOCKET_CLOSED));
1155  pollfd->events = 0;
1156  if (event->events & WL_SOCKET_READABLE)
1157  pollfd->events |= POLLIN;
1158  if (event->events & WL_SOCKET_WRITEABLE)
1159  pollfd->events |= POLLOUT;
1160 #ifdef POLLRDHUP
1161  if (event->events & WL_SOCKET_CLOSED)
1162  pollfd->events |= POLLRDHUP;
1163 #endif
1164  }
1165 
1166  Assert(event->fd != PGINVALID_SOCKET);
1167 }
1168 #endif
1169 
1170 #if defined(WAIT_USE_KQUEUE)
1171 
1172 /*
1173  * On most BSD family systems, the udata member of struct kevent is of type
1174  * void *, so we could directly convert to/from WaitEvent *. Unfortunately,
1175  * NetBSD has it as intptr_t, so here we wallpaper over that difference with
1176  * an lvalue cast.
1177  */
1178 #define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
1179 
1180 static inline void
1181 WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
1182  WaitEvent *event)
1183 {
1184  k_ev->ident = event->fd;
1185  k_ev->filter = filter;
1186  k_ev->flags = action;
1187  k_ev->fflags = 0;
1188  k_ev->data = 0;
1189  AccessWaitEvent(k_ev) = event;
1190 }
1191 
1192 static inline void
1193 WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
1194 {
1195  /* For now postmaster death can only be added, not removed. */
1196  k_ev->ident = PostmasterPid;
1197  k_ev->filter = EVFILT_PROC;
1198  k_ev->flags = EV_ADD;
1199  k_ev->fflags = NOTE_EXIT;
1200  k_ev->data = 0;
1201  AccessWaitEvent(k_ev) = event;
1202 }
1203 
1204 static inline void
1205 WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event)
1206 {
1207  /* For now latch can only be added, not removed. */
1208  k_ev->ident = SIGURG;
1209  k_ev->filter = EVFILT_SIGNAL;
1210  k_ev->flags = EV_ADD;
1211  k_ev->fflags = 0;
1212  k_ev->data = 0;
1213  AccessWaitEvent(k_ev) = event;
1214 }
1215 
1216 /*
1217  * old_events is the previous event mask, used to compute what has changed.
1218  */
1219 static void
1220 WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
1221 {
1222  int rc;
1223  struct kevent k_ev[2];
1224  int count = 0;
1225  bool new_filt_read = false;
1226  bool old_filt_read = false;
1227  bool new_filt_write = false;
1228  bool old_filt_write = false;
1229 
1230  if (old_events == event->events)
1231  return;
1232 
1233  Assert(event->events != WL_LATCH_SET || set->latch != NULL);
1234  Assert(event->events == WL_LATCH_SET ||
1235  event->events == WL_POSTMASTER_DEATH ||
1236  (event->events & (WL_SOCKET_READABLE |
1238  WL_SOCKET_CLOSED)));
1239 
1240  if (event->events == WL_POSTMASTER_DEATH)
1241  {
1242  /*
1243  * Unlike all the other implementations, we detect postmaster death
1244  * using process notification instead of waiting on the postmaster
1245  * alive pipe.
1246  */
1247  WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
1248  }
1249  else if (event->events == WL_LATCH_SET)
1250  {
1251  /* We detect latch wakeup using a signal event. */
1252  WaitEventAdjustKqueueAddLatch(&k_ev[count++], event);
1253  }
1254  else
1255  {
1256  /*
1257  * We need to compute the adds and deletes required to get from the
1258  * old event mask to the new event mask, since kevent treats readable
1259  * and writable as separate events.
1260  */
1261  if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
1262  old_filt_read = true;
1263  if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
1264  new_filt_read = true;
1265  if (old_events & WL_SOCKET_WRITEABLE)
1266  old_filt_write = true;
1267  if (event->events & WL_SOCKET_WRITEABLE)
1268  new_filt_write = true;
1269  if (old_filt_read && !new_filt_read)
1270  WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
1271  event);
1272  else if (!old_filt_read && new_filt_read)
1273  WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
1274  event);
1275  if (old_filt_write && !new_filt_write)
1276  WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
1277  event);
1278  else if (!old_filt_write && new_filt_write)
1279  WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
1280  event);
1281  }
1282 
1283  /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
1284  if (count == 0)
1285  return;
1286 
1287  Assert(count <= 2);
1288 
1289  rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
1290 
1291  /*
1292  * When adding the postmaster's pid, we have to consider that it might
1293  * already have exited and perhaps even been replaced by another process
1294  * with the same pid. If so, we have to defer reporting this as an event
1295  * until the next call to WaitEventSetWaitBlock().
1296  */
1297 
1298  if (rc < 0)
1299  {
1300  if (event->events == WL_POSTMASTER_DEATH &&
1301  (errno == ESRCH || errno == EACCES))
1302  set->report_postmaster_not_running = true;
1303  else
1304  ereport(ERROR,
1306  errmsg("%s() failed: %m",
1307  "kevent")));
1308  }
1309  else if (event->events == WL_POSTMASTER_DEATH &&
1310  PostmasterPid != getppid() &&
1311  !PostmasterIsAlive())
1312  {
1313  /*
1314  * The extra PostmasterIsAliveInternal() check prevents false alarms
1315  * on systems that give a different value for getppid() while being
1316  * traced by a debugger.
1317  */
1318  set->report_postmaster_not_running = true;
1319  }
1320 }
1321 
1322 #endif
1323 
1324 #if defined(WAIT_USE_WIN32)
1325 static void
1326 WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
1327 {
1328  HANDLE *handle = &set->handles[event->pos + 1];
1329 
1330  if (event->events == WL_LATCH_SET)
1331  {
1332  Assert(set->latch != NULL);
1333  *handle = set->latch->event;
1334  }
1335  else if (event->events == WL_POSTMASTER_DEATH)
1336  {
1337  *handle = PostmasterHandle;
1338  }
1339  else
1340  {
1341  int flags = FD_CLOSE; /* always check for errors/EOF */
1342 
1343  if (event->events & WL_SOCKET_READABLE)
1344  flags |= FD_READ;
1345  if (event->events & WL_SOCKET_WRITEABLE)
1346  flags |= FD_WRITE;
1347  if (event->events & WL_SOCKET_CONNECTED)
1348  flags |= FD_CONNECT;
1349  if (event->events & WL_SOCKET_ACCEPT)
1350  flags |= FD_ACCEPT;
1351 
1352  if (*handle == WSA_INVALID_EVENT)
1353  {
1354  *handle = WSACreateEvent();
1355  if (*handle == WSA_INVALID_EVENT)
1356  elog(ERROR, "failed to create event for socket: error code %d",
1357  WSAGetLastError());
1358  }
1359  if (WSAEventSelect(event->fd, *handle, flags) != 0)
1360  elog(ERROR, "failed to set up event for socket: error code %d",
1361  WSAGetLastError());
1362 
1363  Assert(event->fd != PGINVALID_SOCKET);
1364  }
1365 }
1366 #endif
1367 
1368 /*
1369  * Wait for events added to the set to happen, or until the timeout is
1370  * reached. At most nevents occurred events are returned.
1371  *
1372  * If timeout = -1, block until an event occurs; if 0, check sockets for
1373  * readiness, but don't block; if > 0, block for at most timeout milliseconds.
1374  *
1375  * Returns the number of events occurred, or 0 if the timeout was reached.
1376  *
1377  * Returned events will have the fd, pos, user_data fields set to the
1378  * values associated with the registered event.
1379  */
1380 int
1381 WaitEventSetWait(WaitEventSet *set, long timeout,
1382  WaitEvent *occurred_events, int nevents,
1383  uint32 wait_event_info)
1384 {
1385  int returned_events = 0;
1387  instr_time cur_time;
1388  long cur_timeout = -1;
1389 
1390  Assert(nevents > 0);
1391 
1392  /*
1393  * Initialize timeout if requested. We must record the current time so
1394  * that we can determine the remaining timeout if interrupted.
1395  */
1396  if (timeout >= 0)
1397  {
1399  Assert(timeout >= 0 && timeout <= INT_MAX);
1400  cur_timeout = timeout;
1401  }
1402  else
1404 
1405  pgstat_report_wait_start(wait_event_info);
1406 
1407 #ifndef WIN32
1408  waiting = true;
1409 #else
1410  /* Ensure that signals are serviced even if latch is already set */
1412 #endif
1413  while (returned_events == 0)
1414  {
1415  int rc;
1416 
1417  /*
1418  * Check if the latch is set already. If so, leave the loop
1419  * immediately, avoid blocking again. We don't attempt to report any
1420  * other events that might also be satisfied.
1421  *
1422  * If someone sets the latch between this and the
1423  * WaitEventSetWaitBlock() below, the setter will write a byte to the
1424  * pipe (or signal us and the signal handler will do that), and the
1425  * readiness routine will return immediately.
1426  *
1427  * On unix, If there's a pending byte in the self pipe, we'll notice
1428  * whenever blocking. Only clearing the pipe in that case avoids
1429  * having to drain it every time WaitLatchOrSocket() is used. Should
1430  * the pipe-buffer fill up we're still ok, because the pipe is in
1431  * nonblocking mode. It's unlikely for that to happen, because the
1432  * self pipe isn't filled unless we're blocking (waiting = true), or
1433  * from inside a signal handler in latch_sigurg_handler().
1434  *
1435  * On windows, we'll also notice if there's a pending event for the
1436  * latch when blocking, but there's no danger of anything filling up,
1437  * as "Setting an event that is already set has no effect.".
1438  *
1439  * Note: we assume that the kernel calls involved in latch management
1440  * will provide adequate synchronization on machines with weak memory
1441  * ordering, so that we cannot miss seeing is_set if a notification
1442  * has already been queued.
1443  */
1444  if (set->latch && !set->latch->is_set)
1445  {
1446  /* about to sleep on a latch */
1447  set->latch->maybe_sleeping = true;
1449  /* and recheck */
1450  }
1451 
1452  if (set->latch && set->latch->is_set)
1453  {
1454  occurred_events->fd = PGINVALID_SOCKET;
1455  occurred_events->pos = set->latch_pos;
1456  occurred_events->user_data =
1457  set->events[set->latch_pos].user_data;
1458  occurred_events->events = WL_LATCH_SET;
1459  occurred_events++;
1460  returned_events++;
1461 
1462  /* could have been set above */
1463  set->latch->maybe_sleeping = false;
1464 
1465  break;
1466  }
1467 
1468  /*
1469  * Wait for events using the readiness primitive chosen at the top of
1470  * this file. If -1 is returned, a timeout has occurred, if 0 we have
1471  * to retry, everything >= 1 is the number of returned events.
1472  */
1473  rc = WaitEventSetWaitBlock(set, cur_timeout,
1474  occurred_events, nevents);
1475 
1476  if (set->latch)
1477  {
1478  Assert(set->latch->maybe_sleeping);
1479  set->latch->maybe_sleeping = false;
1480  }
1481 
1482  if (rc == -1)
1483  break; /* timeout occurred */
1484  else
1485  returned_events = rc;
1486 
1487  /* If we're not done, update cur_timeout for next iteration */
1488  if (returned_events == 0 && timeout >= 0)
1489  {
1490  INSTR_TIME_SET_CURRENT(cur_time);
1491  INSTR_TIME_SUBTRACT(cur_time, start_time);
1492  cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
1493  if (cur_timeout <= 0)
1494  break;
1495  }
1496  }
1497 #ifndef WIN32
1498  waiting = false;
1499 #endif
1500 
1502 
1503  return returned_events;
1504 }
1505 
1506 
1507 #if defined(WAIT_USE_EPOLL)
1508 
1509 /*
1510  * Wait using linux's epoll_wait(2).
1511  *
1512  * This is the preferable wait method, as several readiness notifications are
1513  * delivered, without having to iterate through all of set->events. The return
1514  * epoll_event struct contain a pointer to our events, making association
1515  * easy.
1516  */
1517 static inline int
1518 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1519  WaitEvent *occurred_events, int nevents)
1520 {
1521  int returned_events = 0;
1522  int rc;
1523  WaitEvent *cur_event;
1524  struct epoll_event *cur_epoll_event;
1525 
1526  /* Sleep */
1527  rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1528  Min(nevents, set->nevents_space), cur_timeout);
1529 
1530  /* Check return code */
1531  if (rc < 0)
1532  {
1533  /* EINTR is okay, otherwise complain */
1534  if (errno != EINTR)
1535  {
1536  waiting = false;
1537  ereport(ERROR,
1539  errmsg("%s() failed: %m",
1540  "epoll_wait")));
1541  }
1542  return 0;
1543  }
1544  else if (rc == 0)
1545  {
1546  /* timeout exceeded */
1547  return -1;
1548  }
1549 
1550  /*
1551  * At least one event occurred, iterate over the returned epoll events
1552  * until they're either all processed, or we've returned all the events
1553  * the caller desired.
1554  */
1555  for (cur_epoll_event = set->epoll_ret_events;
1556  cur_epoll_event < (set->epoll_ret_events + rc) &&
1557  returned_events < nevents;
1558  cur_epoll_event++)
1559  {
1560  /* epoll's data pointer is set to the associated WaitEvent */
1561  cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1562 
1563  occurred_events->pos = cur_event->pos;
1564  occurred_events->user_data = cur_event->user_data;
1565  occurred_events->events = 0;
1566 
1567  if (cur_event->events == WL_LATCH_SET &&
1568  cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1569  {
1570  /* Drain the signalfd. */
1571  drain();
1572 
1573  if (set->latch && set->latch->is_set)
1574  {
1575  occurred_events->fd = PGINVALID_SOCKET;
1576  occurred_events->events = WL_LATCH_SET;
1577  occurred_events++;
1578  returned_events++;
1579  }
1580  }
1581  else if (cur_event->events == WL_POSTMASTER_DEATH &&
1582  cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1583  {
1584  /*
1585  * We expect an EPOLLHUP when the remote end is closed, but
1586  * because we don't expect the pipe to become readable or to have
1587  * any errors either, treat those cases as postmaster death, too.
1588  *
1589  * Be paranoid about a spurious event signaling the postmaster as
1590  * being dead. There have been reports about that happening with
1591  * older primitives (select(2) to be specific), and a spurious
1592  * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1593  * cost much.
1594  */
1596  {
1597  if (set->exit_on_postmaster_death)
1598  proc_exit(1);
1599  occurred_events->fd = PGINVALID_SOCKET;
1600  occurred_events->events = WL_POSTMASTER_DEATH;
1601  occurred_events++;
1602  returned_events++;
1603  }
1604  }
1605  else if (cur_event->events & (WL_SOCKET_READABLE |
1608  {
1609  Assert(cur_event->fd != PGINVALID_SOCKET);
1610 
1611  if ((cur_event->events & WL_SOCKET_READABLE) &&
1612  (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1613  {
1614  /* data available in socket, or EOF */
1615  occurred_events->events |= WL_SOCKET_READABLE;
1616  }
1617 
1618  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1619  (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1620  {
1621  /* writable, or EOF */
1622  occurred_events->events |= WL_SOCKET_WRITEABLE;
1623  }
1624 
1625  if ((cur_event->events & WL_SOCKET_CLOSED) &&
1626  (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
1627  {
1628  /* remote peer shut down, or error */
1629  occurred_events->events |= WL_SOCKET_CLOSED;
1630  }
1631 
1632  if (occurred_events->events != 0)
1633  {
1634  occurred_events->fd = cur_event->fd;
1635  occurred_events++;
1636  returned_events++;
1637  }
1638  }
1639  }
1640 
1641  return returned_events;
1642 }
1643 
1644 #elif defined(WAIT_USE_KQUEUE)
1645 
1646 /*
1647  * Wait using kevent(2) on BSD-family systems and macOS.
1648  *
1649  * For now this mirrors the epoll code, but in future it could modify the fd
1650  * set in the same call to kevent as it uses for waiting instead of doing that
1651  * with separate system calls.
1652  */
1653 static int
1654 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1655  WaitEvent *occurred_events, int nevents)
1656 {
1657  int returned_events = 0;
1658  int rc;
1659  WaitEvent *cur_event;
1660  struct kevent *cur_kqueue_event;
1661  struct timespec timeout;
1662  struct timespec *timeout_p;
1663 
1664  if (cur_timeout < 0)
1665  timeout_p = NULL;
1666  else
1667  {
1668  timeout.tv_sec = cur_timeout / 1000;
1669  timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
1670  timeout_p = &timeout;
1671  }
1672 
1673  /*
1674  * Report postmaster events discovered by WaitEventAdjustKqueue() or an
1675  * earlier call to WaitEventSetWait().
1676  */
1677  if (unlikely(set->report_postmaster_not_running))
1678  {
1679  if (set->exit_on_postmaster_death)
1680  proc_exit(1);
1681  occurred_events->fd = PGINVALID_SOCKET;
1682  occurred_events->events = WL_POSTMASTER_DEATH;
1683  return 1;
1684  }
1685 
1686  /* Sleep */
1687  rc = kevent(set->kqueue_fd, NULL, 0,
1688  set->kqueue_ret_events,
1689  Min(nevents, set->nevents_space),
1690  timeout_p);
1691 
1692  /* Check return code */
1693  if (rc < 0)
1694  {
1695  /* EINTR is okay, otherwise complain */
1696  if (errno != EINTR)
1697  {
1698  waiting = false;
1699  ereport(ERROR,
1701  errmsg("%s() failed: %m",
1702  "kevent")));
1703  }
1704  return 0;
1705  }
1706  else if (rc == 0)
1707  {
1708  /* timeout exceeded */
1709  return -1;
1710  }
1711 
1712  /*
1713  * At least one event occurred, iterate over the returned kqueue events
1714  * until they're either all processed, or we've returned all the events
1715  * the caller desired.
1716  */
1717  for (cur_kqueue_event = set->kqueue_ret_events;
1718  cur_kqueue_event < (set->kqueue_ret_events + rc) &&
1719  returned_events < nevents;
1720  cur_kqueue_event++)
1721  {
1722  /* kevent's udata points to the associated WaitEvent */
1723  cur_event = AccessWaitEvent(cur_kqueue_event);
1724 
1725  occurred_events->pos = cur_event->pos;
1726  occurred_events->user_data = cur_event->user_data;
1727  occurred_events->events = 0;
1728 
1729  if (cur_event->events == WL_LATCH_SET &&
1730  cur_kqueue_event->filter == EVFILT_SIGNAL)
1731  {
1732  if (set->latch && set->latch->is_set)
1733  {
1734  occurred_events->fd = PGINVALID_SOCKET;
1735  occurred_events->events = WL_LATCH_SET;
1736  occurred_events++;
1737  returned_events++;
1738  }
1739  }
1740  else if (cur_event->events == WL_POSTMASTER_DEATH &&
1741  cur_kqueue_event->filter == EVFILT_PROC &&
1742  (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
1743  {
1744  /*
1745  * The kernel will tell this kqueue object only once about the
1746  * exit of the postmaster, so let's remember that for next time so
1747  * that we provide level-triggered semantics.
1748  */
1749  set->report_postmaster_not_running = true;
1750 
1751  if (set->exit_on_postmaster_death)
1752  proc_exit(1);
1753  occurred_events->fd = PGINVALID_SOCKET;
1754  occurred_events->events = WL_POSTMASTER_DEATH;
1755  occurred_events++;
1756  returned_events++;
1757  }
1758  else if (cur_event->events & (WL_SOCKET_READABLE |
1761  {
1762  Assert(cur_event->fd >= 0);
1763 
1764  if ((cur_event->events & WL_SOCKET_READABLE) &&
1765  (cur_kqueue_event->filter == EVFILT_READ))
1766  {
1767  /* readable, or EOF */
1768  occurred_events->events |= WL_SOCKET_READABLE;
1769  }
1770 
1771  if ((cur_event->events & WL_SOCKET_CLOSED) &&
1772  (cur_kqueue_event->filter == EVFILT_READ) &&
1773  (cur_kqueue_event->flags & EV_EOF))
1774  {
1775  /* the remote peer has shut down */
1776  occurred_events->events |= WL_SOCKET_CLOSED;
1777  }
1778 
1779  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1780  (cur_kqueue_event->filter == EVFILT_WRITE))
1781  {
1782  /* writable, or EOF */
1783  occurred_events->events |= WL_SOCKET_WRITEABLE;
1784  }
1785 
1786  if (occurred_events->events != 0)
1787  {
1788  occurred_events->fd = cur_event->fd;
1789  occurred_events++;
1790  returned_events++;
1791  }
1792  }
1793  }
1794 
1795  return returned_events;
1796 }
1797 
1798 #elif defined(WAIT_USE_POLL)
1799 
1800 /*
1801  * Wait using poll(2).
1802  *
1803  * This allows to receive readiness notifications for several events at once,
1804  * but requires iterating through all of set->pollfds.
1805  */
1806 static inline int
1807 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1808  WaitEvent *occurred_events, int nevents)
1809 {
1810  int returned_events = 0;
1811  int rc;
1812  WaitEvent *cur_event;
1813  struct pollfd *cur_pollfd;
1814 
1815  /* Sleep */
1816  rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1817 
1818  /* Check return code */
1819  if (rc < 0)
1820  {
1821  /* EINTR is okay, otherwise complain */
1822  if (errno != EINTR)
1823  {
1824  waiting = false;
1825  ereport(ERROR,
1827  errmsg("%s() failed: %m",
1828  "poll")));
1829  }
1830  return 0;
1831  }
1832  else if (rc == 0)
1833  {
1834  /* timeout exceeded */
1835  return -1;
1836  }
1837 
1838  for (cur_event = set->events, cur_pollfd = set->pollfds;
1839  cur_event < (set->events + set->nevents) &&
1840  returned_events < nevents;
1841  cur_event++, cur_pollfd++)
1842  {
1843  /* no activity on this FD, skip */
1844  if (cur_pollfd->revents == 0)
1845  continue;
1846 
1847  occurred_events->pos = cur_event->pos;
1848  occurred_events->user_data = cur_event->user_data;
1849  occurred_events->events = 0;
1850 
1851  if (cur_event->events == WL_LATCH_SET &&
1852  (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1853  {
1854  /* There's data in the self-pipe, clear it. */
1855  drain();
1856 
1857  if (set->latch && set->latch->is_set)
1858  {
1859  occurred_events->fd = PGINVALID_SOCKET;
1860  occurred_events->events = WL_LATCH_SET;
1861  occurred_events++;
1862  returned_events++;
1863  }
1864  }
1865  else if (cur_event->events == WL_POSTMASTER_DEATH &&
1866  (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1867  {
1868  /*
1869  * We expect an POLLHUP when the remote end is closed, but because
1870  * we don't expect the pipe to become readable or to have any
1871  * errors either, treat those cases as postmaster death, too.
1872  *
1873  * Be paranoid about a spurious event signaling the postmaster as
1874  * being dead. There have been reports about that happening with
1875  * older primitives (select(2) to be specific), and a spurious
1876  * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1877  * cost much.
1878  */
1880  {
1881  if (set->exit_on_postmaster_death)
1882  proc_exit(1);
1883  occurred_events->fd = PGINVALID_SOCKET;
1884  occurred_events->events = WL_POSTMASTER_DEATH;
1885  occurred_events++;
1886  returned_events++;
1887  }
1888  }
1889  else if (cur_event->events & (WL_SOCKET_READABLE |
1892  {
1893  int errflags = POLLHUP | POLLERR | POLLNVAL;
1894 
1895  Assert(cur_event->fd >= PGINVALID_SOCKET);
1896 
1897  if ((cur_event->events & WL_SOCKET_READABLE) &&
1898  (cur_pollfd->revents & (POLLIN | errflags)))
1899  {
1900  /* data available in socket, or EOF */
1901  occurred_events->events |= WL_SOCKET_READABLE;
1902  }
1903 
1904  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1905  (cur_pollfd->revents & (POLLOUT | errflags)))
1906  {
1907  /* writeable, or EOF */
1908  occurred_events->events |= WL_SOCKET_WRITEABLE;
1909  }
1910 
1911 #ifdef POLLRDHUP
1912  if ((cur_event->events & WL_SOCKET_CLOSED) &&
1913  (cur_pollfd->revents & (POLLRDHUP | errflags)))
1914  {
1915  /* remote peer closed, or error */
1916  occurred_events->events |= WL_SOCKET_CLOSED;
1917  }
1918 #endif
1919 
1920  if (occurred_events->events != 0)
1921  {
1922  occurred_events->fd = cur_event->fd;
1923  occurred_events++;
1924  returned_events++;
1925  }
1926  }
1927  }
1928  return returned_events;
1929 }
1930 
1931 #elif defined(WAIT_USE_WIN32)
1932 
1933 /*
1934  * Wait using Windows' WaitForMultipleObjects(). Each call only "consumes" one
1935  * event, so we keep calling until we've filled up our output buffer to match
1936  * the behavior of the other implementations.
1937  *
1938  * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273
1939  */
1940 static inline int
1941 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1942  WaitEvent *occurred_events, int nevents)
1943 {
1944  int returned_events = 0;
1945  DWORD rc;
1946  WaitEvent *cur_event;
1947 
1948  /* Reset any wait events that need it */
1949  for (cur_event = set->events;
1950  cur_event < (set->events + set->nevents);
1951  cur_event++)
1952  {
1953  if (cur_event->reset)
1954  {
1955  WaitEventAdjustWin32(set, cur_event);
1956  cur_event->reset = false;
1957  }
1958 
1959  /*
1960  * Windows does not guarantee to log an FD_WRITE network event
1961  * indicating that more data can be sent unless the previous send()
1962  * failed with WSAEWOULDBLOCK. While our caller might well have made
1963  * such a call, we cannot assume that here. Therefore, if waiting for
1964  * write-ready, force the issue by doing a dummy send(). If the dummy
1965  * send() succeeds, assume that the socket is in fact write-ready, and
1966  * return immediately. Also, if it fails with something other than
1967  * WSAEWOULDBLOCK, return a write-ready indication to let our caller
1968  * deal with the error condition.
1969  */
1970  if (cur_event->events & WL_SOCKET_WRITEABLE)
1971  {
1972  char c;
1973  WSABUF buf;
1974  DWORD sent;
1975  int r;
1976 
1977  buf.buf = &c;
1978  buf.len = 0;
1979 
1980  r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
1981  if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
1982  {
1983  occurred_events->pos = cur_event->pos;
1984  occurred_events->user_data = cur_event->user_data;
1985  occurred_events->events = WL_SOCKET_WRITEABLE;
1986  occurred_events->fd = cur_event->fd;
1987  return 1;
1988  }
1989  }
1990  }
1991 
1992  /*
1993  * Sleep.
1994  *
1995  * Need to wait for ->nevents + 1, because signal handle is in [0].
1996  */
1997  rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
1998  cur_timeout);
1999 
2000  /* Check return code */
2001  if (rc == WAIT_FAILED)
2002  elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
2003  GetLastError());
2004  else if (rc == WAIT_TIMEOUT)
2005  {
2006  /* timeout exceeded */
2007  return -1;
2008  }
2009 
2010  if (rc == WAIT_OBJECT_0)
2011  {
2012  /* Service newly-arrived signals */
2014  return 0; /* retry */
2015  }
2016 
2017  /*
2018  * With an offset of one, due to the always present pgwin32_signal_event,
2019  * the handle offset directly corresponds to a wait event.
2020  */
2021  cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
2022 
2023  for (;;)
2024  {
2025  int next_pos;
2026  int count;
2027 
2028  occurred_events->pos = cur_event->pos;
2029  occurred_events->user_data = cur_event->user_data;
2030  occurred_events->events = 0;
2031 
2032  if (cur_event->events == WL_LATCH_SET)
2033  {
2034  /*
2035  * We cannot use set->latch->event to reset the fired event if we
2036  * aren't waiting on this latch now.
2037  */
2038  if (!ResetEvent(set->handles[cur_event->pos + 1]))
2039  elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
2040 
2041  if (set->latch && set->latch->is_set)
2042  {
2043  occurred_events->fd = PGINVALID_SOCKET;
2044  occurred_events->events = WL_LATCH_SET;
2045  occurred_events++;
2046  returned_events++;
2047  }
2048  }
2049  else if (cur_event->events == WL_POSTMASTER_DEATH)
2050  {
2051  /*
2052  * Postmaster apparently died. Since the consequences of falsely
2053  * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we
2054  * take the trouble to positively verify this with
2055  * PostmasterIsAlive(), even though there is no known reason to
2056  * think that the event could be falsely set on Windows.
2057  */
2059  {
2060  if (set->exit_on_postmaster_death)
2061  proc_exit(1);
2062  occurred_events->fd = PGINVALID_SOCKET;
2063  occurred_events->events = WL_POSTMASTER_DEATH;
2064  occurred_events++;
2065  returned_events++;
2066  }
2067  }
2068  else if (cur_event->events & WL_SOCKET_MASK)
2069  {
2070  WSANETWORKEVENTS resEvents;
2071  HANDLE handle = set->handles[cur_event->pos + 1];
2072 
2073  Assert(cur_event->fd);
2074 
2075  occurred_events->fd = cur_event->fd;
2076 
2077  ZeroMemory(&resEvents, sizeof(resEvents));
2078  if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
2079  elog(ERROR, "failed to enumerate network events: error code %d",
2080  WSAGetLastError());
2081  if ((cur_event->events & WL_SOCKET_READABLE) &&
2082  (resEvents.lNetworkEvents & FD_READ))
2083  {
2084  /* data available in socket */
2085  occurred_events->events |= WL_SOCKET_READABLE;
2086 
2087  /*------
2088  * WaitForMultipleObjects doesn't guarantee that a read event
2089  * will be returned if the latch is set at the same time. Even
2090  * if it did, the caller might drop that event expecting it to
2091  * reoccur on next call. So, we must force the event to be
2092  * reset if this WaitEventSet is used again in order to avoid
2093  * an indefinite hang.
2094  *
2095  * Refer
2096  * https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
2097  * for the behavior of socket events.
2098  *------
2099  */
2100  cur_event->reset = true;
2101  }
2102  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
2103  (resEvents.lNetworkEvents & FD_WRITE))
2104  {
2105  /* writeable */
2106  occurred_events->events |= WL_SOCKET_WRITEABLE;
2107  }
2108  if ((cur_event->events & WL_SOCKET_CONNECTED) &&
2109  (resEvents.lNetworkEvents & FD_CONNECT))
2110  {
2111  /* connected */
2112  occurred_events->events |= WL_SOCKET_CONNECTED;
2113  }
2114  if ((cur_event->events & WL_SOCKET_ACCEPT) &&
2115  (resEvents.lNetworkEvents & FD_ACCEPT))
2116  {
2117  /* incoming connection could be accepted */
2118  occurred_events->events |= WL_SOCKET_ACCEPT;
2119  }
2120  if (resEvents.lNetworkEvents & FD_CLOSE)
2121  {
2122  /* EOF/error, so signal all caller-requested socket flags */
2123  occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
2124  }
2125 
2126  if (occurred_events->events != 0)
2127  {
2128  occurred_events++;
2129  returned_events++;
2130  }
2131  }
2132 
2133  /* Is the output buffer full? */
2134  if (returned_events == nevents)
2135  break;
2136 
2137  /* Have we run out of possible events? */
2138  next_pos = cur_event->pos + 1;
2139  if (next_pos == set->nevents)
2140  break;
2141 
2142  /*
2143  * Poll the rest of the event handles in the array starting at
2144  * next_pos being careful to skip over the initial signal handle too.
2145  * This time we use a zero timeout.
2146  */
2147  count = set->nevents - next_pos;
2148  rc = WaitForMultipleObjects(count,
2149  set->handles + 1 + next_pos,
2150  false,
2151  0);
2152 
2153  /*
2154  * We don't distinguish between errors and WAIT_TIMEOUT here because
2155  * we already have events to report.
2156  */
2157  if (rc < WAIT_OBJECT_0 || rc >= WAIT_OBJECT_0 + count)
2158  break;
2159 
2160  /* We have another event to decode. */
2161  cur_event = &set->events[next_pos + (rc - WAIT_OBJECT_0)];
2162  }
2163 
2164  return returned_events;
2165 }
2166 #endif
2167 
2168 /*
2169  * Return whether the current build options can report WL_SOCKET_CLOSED.
2170  */
2171 bool
2173 {
2174 #if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
2175  defined(WAIT_USE_EPOLL) || \
2176  defined(WAIT_USE_KQUEUE)
2177  return true;
2178 #else
2179  return false;
2180 #endif
2181 }
2182 
2183 /*
2184  * Get the number of wait events registered in a given WaitEventSet.
2185  */
2186 int
2188 {
2189  return set->nevents;
2190 }
2191 
2192 #if defined(WAIT_USE_SELF_PIPE)
2193 
2194 /*
2195  * SetLatch uses SIGURG to wake up the process waiting on the latch.
2196  *
2197  * Wake up WaitLatch, if we're waiting.
2198  */
2199 static void
2201 {
2202  int save_errno = errno;
2203 
2204  if (waiting)
2205  sendSelfPipeByte();
2206 
2207  errno = save_errno;
2208 }
2209 
2210 /* Send one byte to the self-pipe, to wake up WaitLatch */
2211 static void
2213 {
2214  int rc;
2215  char dummy = 0;
2216 
2217 retry:
2218  rc = write(selfpipe_writefd, &dummy, 1);
2219  if (rc < 0)
2220  {
2221  /* If interrupted by signal, just retry */
2222  if (errno == EINTR)
2223  goto retry;
2224 
2225  /*
2226  * If the pipe is full, we don't need to retry, the data that's there
2227  * already is enough to wake up WaitLatch.
2228  */
2229  if (errno == EAGAIN || errno == EWOULDBLOCK)
2230  return;
2231 
2232  /*
2233  * Oops, the write() failed for some other reason. We might be in a
2234  * signal handler, so it's not safe to elog(). We have no choice but
2235  * silently ignore the error.
2236  */
2237  return;
2238  }
2239 }
2240 
2241 #endif
2242 
2243 #if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
2244 
2245 /*
2246  * Read all available data from self-pipe or signalfd.
2247  *
2248  * Note: this is only called when waiting = true. If it fails and doesn't
2249  * return, it must reset that flag first (though ideally, this will never
2250  * happen).
2251  */
2252 static void
2253 drain(void)
2254 {
2255  char buf[1024];
2256  int rc;
2257  int fd;
2258 
2259 #ifdef WAIT_USE_SELF_PIPE
2260  fd = selfpipe_readfd;
2261 #else
2262  fd = signal_fd;
2263 #endif
2264 
2265  for (;;)
2266  {
2267  rc = read(fd, buf, sizeof(buf));
2268  if (rc < 0)
2269  {
2270  if (errno == EAGAIN || errno == EWOULDBLOCK)
2271  break; /* the descriptor is empty */
2272  else if (errno == EINTR)
2273  continue; /* retry */
2274  else
2275  {
2276  waiting = false;
2277 #ifdef WAIT_USE_SELF_PIPE
2278  elog(ERROR, "read() on self-pipe failed: %m");
2279 #else
2280  elog(ERROR, "read() on signalfd failed: %m");
2281 #endif
2282  }
2283  }
2284  else if (rc == 0)
2285  {
2286  waiting = false;
2287 #ifdef WAIT_USE_SELF_PIPE
2288  elog(ERROR, "unexpected EOF on self-pipe");
2289 #else
2290  elog(ERROR, "unexpected EOF on signalfd");
2291 #endif
2292  }
2293  else if (rc < sizeof(buf))
2294  {
2295  /* we successfully drained the pipe; no need to read() again */
2296  break;
2297  }
2298  /* else buffer wasn't big enough, so read again */
2299  }
2300 }
2301 
2302 #endif
#define pg_memory_barrier()
Definition: atomics.h:140
sigset_t UnBlockSig
Definition: pqsignal.c:22
unsigned int uint32
Definition: c.h:495
#define Min(x, y)
Definition: c.h:993
#define MAXALIGN(LEN)
Definition: c.h:800
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:171
#define SIGNAL_ARGS
Definition: c.h:1355
#define unlikely(x)
Definition: c.h:300
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:927
size_t Size
Definition: c.h:594
int errcode_for_socket_access(void)
Definition: elog.c:952
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define FATAL
Definition: elog.h:41
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
void ReleaseExternalFD(void)
Definition: fd.c:1191
bool AcquireExternalFD(void)
Definition: fd.c:1138
void ReserveExternalFD(void)
Definition: fd.c:1173
pid_t PostmasterPid
Definition: globals.c:99
int MyProcPid
Definition: globals.c:44
bool IsUnderPostmaster
Definition: globals.c:113
struct Latch * MyLatch
Definition: globals.c:58
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:122
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:181
#define INSTR_TIME_GET_MILLISEC(t)
Definition: instr_time.h:191
#define INSTR_TIME_SET_ZERO(t)
Definition: instr_time.h:172
#define close(a)
Definition: win32.h:12
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
void proc_exit(int code)
Definition: ipc.c:104
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
static void latch_sigurg_handler(SIGNAL_ARGS)
Definition: latch.c:2200
static void sendSelfPipeByte(void)
Definition: latch.c:2212
void InitializeLatchWaitSet(void)
Definition: latch.c:319
int WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, long timeout, uint32 wait_event_info)
Definition: latch.c:538
#define LatchWaitSetLatchPos
Definition: latch.c:156
static int selfpipe_readfd
Definition: latch.c:170
void OwnLatch(Latch *latch)
Definition: latch.c:436
void DisownLatch(Latch *latch)
Definition: latch.c:462
void FreeWaitEventSetAfterFork(WaitEventSet *set)
Definition: latch.c:874
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
Definition: latch.c:1133
static int selfpipe_owner_pid
Definition: latch.c:174
static int selfpipe_writefd
Definition: latch.c:171
int GetNumRegisteredWaitEvents(WaitEventSet *set)
Definition: latch.c:2187
WaitEventSet * CreateWaitEventSet(MemoryContext context, int nevents)
Definition: latch.c:721
void InitSharedLatch(Latch *latch)
Definition: latch.c:403
void InitializeLatchSupport(void)
Definition: latch.c:205
static WaitEventSet * LatchWaitSet
Definition: latch.c:153
void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
Definition: latch.c:1006
static int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, WaitEvent *occurred_events, int nevents)
Definition: latch.c:1807
void SetLatch(Latch *latch)
Definition: latch.c:605
void ShutdownLatchSupport(void)
Definition: latch.c:337
bool WaitEventSetCanReportClosed(void)
Definition: latch.c:2172
void InitLatch(Latch *latch)
Definition: latch.c:367
int AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, void *user_data)
Definition: latch.c:920
int WaitEventSetWait(WaitEventSet *set, long timeout, WaitEvent *occurred_events, int nevents, uint32 wait_event_info)
Definition: latch.c:1381
static void drain(void)
Definition: latch.c:2253
static volatile sig_atomic_t waiting
Definition: latch.c:160
void FreeWaitEventSet(WaitEventSet *set)
Definition: latch.c:835
void ResetLatch(Latch *latch)
Definition: latch.c:697
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:490
#define WL_SOCKET_READABLE
Definition: latch.h:126
#define WL_SOCKET_ACCEPT
Definition: latch.h:142
#define WL_TIMEOUT
Definition: latch.h:128
#define WL_SOCKET_CLOSED
Definition: latch.h:137
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:130
#define WL_LATCH_SET
Definition: latch.h:125
#define WL_SOCKET_CONNECTED
Definition: latch.h:135
#define WL_POSTMASTER_DEATH
Definition: latch.h:129
#define WL_SOCKET_WRITEABLE
Definition: latch.h:127
#define WL_SOCKET_MASK
Definition: latch.h:144
Assert(fmt[strlen(fmt) - 1] !='\n')
void pfree(void *pointer)
Definition: mcxt.c:1456
MemoryContext TopMemoryContext
Definition: mcxt.c:141
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1064
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
#define InvalidPid
Definition: miscadmin.h:32
const void * data
static time_t start_time
Definition: pg_ctl.c:94
static char * buf
Definition: pg_test_fsync.c:67
bool PostmasterIsAliveInternal(void)
Definition: pmsignal.c:376
#define PostmasterIsAlive()
Definition: pmsignal.h:102
pqsigfunc pqsignal(int signo, pqsigfunc func)
int pgsocket
Definition: port.h:29
#define PGINVALID_SOCKET
Definition: port.h:31
int postmaster_alive_fds[2]
Definition: postmaster.c:575
#define POSTMASTER_FD_WATCH
Definition: postmaster.h:46
char * c
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void pgwin32_dispatch_queued_signals(void)
Definition: signal.c:120
HANDLE pgwin32_signal_event
Definition: signal.c:27
Definition: latch.h:111
sig_atomic_t is_set
Definition: latch.h:112
sig_atomic_t maybe_sleeping
Definition: latch.h:113
bool is_shared
Definition: latch.h:114
int owner_pid
Definition: latch.h:115
Latch * latch
Definition: latch.c:119
bool exit_on_postmaster_death
Definition: latch.c:127
int nevents
Definition: latch.c:104
int latch_pos
Definition: latch.c:120
int nevents_space
Definition: latch.c:105
WaitEvent * events
Definition: latch.c:111
struct pollfd * pollfds
Definition: latch.c:140
pgsocket fd
Definition: latch.h:154
int pos
Definition: latch.h:152
void * user_data
Definition: latch.h:155
uint32 events
Definition: latch.h:153
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:88
static void pgstat_report_wait_end(void)
Definition: wait_event.h:104
#define EINTR
Definition: win32_port.h:374
#define EWOULDBLOCK
Definition: win32_port.h:380
#define kill(pid, sig)
Definition: win32_port.h:485
#define SIG_IGN
Definition: win32_port.h:165
#define EAGAIN
Definition: win32_port.h:372