/* PostgreSQL source code: src/backend/storage/ipc/latch.c (doxygen page chrome removed) */
1 /*-------------------------------------------------------------------------
2  *
3  * latch.c
4  * Routines for inter-process latches
5  *
6  * The Unix implementation uses the so-called self-pipe trick to overcome the
7  * race condition involved with poll() (or epoll_wait() on linux) and setting
8  * a global flag in the signal handler. When a latch is set and the current
9  * process is waiting for it, the signal handler wakes up the poll() in
10  * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
11  * poll() on all platforms, and even on platforms where it does, a signal that
12  * arrives just before the poll() call does not prevent poll() from entering
13  * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
14  * and causes poll() to return immediately even if the signal arrives before
15  * poll() begins.
16  *
17  * When SetLatch is called from the same process that owns the latch,
18  * SetLatch writes the byte directly to the pipe. If it's owned by another
19  * process, SIGUSR1 is sent and the signal handler in the waiting process
20  * writes the byte to the pipe on behalf of the signaling process.
21  *
22  * The Windows implementation uses Windows events that are inherited by all
23  * postmaster child processes. There's no need for the self-pipe trick there.
24  *
25  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
26  * Portions Copyright (c) 1994, Regents of the University of California
27  *
28  * IDENTIFICATION
29  * src/backend/storage/ipc/latch.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34 
35 #include <fcntl.h>
36 #include <limits.h>
37 #include <signal.h>
38 #include <unistd.h>
39 #ifdef HAVE_SYS_EPOLL_H
40 #include <sys/epoll.h>
41 #endif
42 #ifdef HAVE_POLL_H
43 #include <poll.h>
44 #endif
45 
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "port/atomics.h"
49 #include "portability/instr_time.h"
50 #include "postmaster/postmaster.h"
51 #include "storage/ipc.h"
52 #include "storage/latch.h"
53 #include "storage/pmsignal.h"
54 #include "storage/shmem.h"
55 
56 /*
57  * Select the fd readiness primitive to use. Normally the "most modern"
58  * primitive supported by the OS will be used, but for testing it can be
59  * useful to manually specify the used primitive. If desired, just add a
60  * define somewhere before this block.
61  */
62 #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
63  defined(WAIT_USE_WIN32)
64 /* don't overwrite manual choice */
65 #elif defined(HAVE_SYS_EPOLL_H)
66 #define WAIT_USE_EPOLL
67 #elif defined(HAVE_POLL)
68 #define WAIT_USE_POLL
69 #elif WIN32
70 #define WAIT_USE_WIN32
71 #else
72 #error "no wait set implementation available"
73 #endif
74 
75 /* typedef in latch.h */
77 {
78  int nevents; /* number of registered events */
79  int nevents_space; /* maximum number of events in this set */
80 
81  /*
82  * Array, of nevents_space length, storing the definition of events this
83  * set is waiting for.
84  */
86 
87  /*
88  * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
89  * said latch, and latch_pos the offset in the ->events array. This is
90  * useful because we check the state of the latch before performing doing
91  * syscalls related to waiting.
92  */
94  int latch_pos;
95 
96  /*
97  * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
98  * is set so that we'll exit immediately if postmaster death is detected,
99  * instead of returning.
100  */
102 
103 #if defined(WAIT_USE_EPOLL)
104  int epoll_fd;
105  /* epoll_wait returns events in a user provided arrays, allocate once */
106  struct epoll_event *epoll_ret_events;
107 #elif defined(WAIT_USE_POLL)
108  /* poll expects events to be waited on every poll() call, prepare once */
109  struct pollfd *pollfds;
110 #elif defined(WAIT_USE_WIN32)
111 
112  /*
113  * Array of windows events. The first element always contains
114  * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
115  * event->pos + 1).
116  */
117  HANDLE *handles;
118 #endif
119 };
120 
121 #ifndef WIN32
122 /* Are we currently in WaitLatch? The signal handler would like to know. */
123 static volatile sig_atomic_t waiting = false;
124 
125 /* Read and write ends of the self-pipe */
126 static int selfpipe_readfd = -1;
127 static int selfpipe_writefd = -1;
128 
129 /* Process owning the self-pipe --- needed for checking purposes */
130 static int selfpipe_owner_pid = 0;
131 
132 /* Private function prototypes */
133 static void sendSelfPipeByte(void);
134 static void drainSelfPipe(void);
135 #endif /* WIN32 */
136 
137 #if defined(WAIT_USE_EPOLL)
138 static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
139 #elif defined(WAIT_USE_POLL)
140 static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
141 #elif defined(WAIT_USE_WIN32)
142 static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
143 #endif
144 
145 static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
146  WaitEvent *occurred_events, int nevents);
147 
148 /*
149  * Initialize the process-local latch infrastructure.
150  *
151  * This must be called once during startup of any process that can wait on
152  * latches, before it issues any InitLatch() or OwnLatch() calls.
153  */
154 void
156 {
157 #ifndef WIN32
158  int pipefd[2];
159 
160  if (IsUnderPostmaster)
161  {
162  /*
163  * We might have inherited connections to a self-pipe created by the
164  * postmaster. It's critical that child processes create their own
165  * self-pipes, of course, and we really want them to close the
166  * inherited FDs for safety's sake.
167  */
168  if (selfpipe_owner_pid != 0)
169  {
170  /* Assert we go through here but once in a child process */
172  /* Release postmaster's pipe FDs; ignore any error */
173  (void) close(selfpipe_readfd);
174  (void) close(selfpipe_writefd);
175  /* Clean up, just for safety's sake; we'll set these below */
177  selfpipe_owner_pid = 0;
178  }
179  else
180  {
181  /*
182  * Postmaster didn't create a self-pipe ... or else we're in an
183  * EXEC_BACKEND build, in which case it doesn't matter since the
184  * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
185  */
186  Assert(selfpipe_readfd == -1);
187  }
188  }
189  else
190  {
191  /* In postmaster or standalone backend, assert we do this but once */
192  Assert(selfpipe_readfd == -1);
194  }
195 
196  /*
197  * Set up the self-pipe that allows a signal handler to wake up the
198  * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
199  * that SetLatch won't block if the event has already been set many times
200  * filling the kernel buffer. Make the read-end non-blocking too, so that
201  * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
202  * Also, make both FDs close-on-exec, since we surely do not want any
203  * child processes messing with them.
204  */
205  if (pipe(pipefd) < 0)
206  elog(FATAL, "pipe() failed: %m");
207  if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
208  elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
209  if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
210  elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
211  if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
212  elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
213  if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
214  elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
215 
216  selfpipe_readfd = pipefd[0];
217  selfpipe_writefd = pipefd[1];
219 #else
220  /* currently, nothing to do here for Windows */
221 #endif
222 }
223 
224 /*
225  * Initialize a process-local latch.
226  */
227 void
229 {
230  latch->is_set = false;
231  latch->owner_pid = MyProcPid;
232  latch->is_shared = false;
233 
234 #ifndef WIN32
235  /* Assert InitializeLatchSupport has been called in this process */
237 #else
238  latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
239  if (latch->event == NULL)
240  elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
241 #endif /* WIN32 */
242 }
243 
244 /*
245  * Initialize a shared latch that can be set from other processes. The latch
246  * is initially owned by no-one; use OwnLatch to associate it with the
247  * current process.
248  *
249  * InitSharedLatch needs to be called in postmaster before forking child
250  * processes, usually right after allocating the shared memory block
251  * containing the latch with ShmemInitStruct. (The Unix implementation
252  * doesn't actually require that, but the Windows one does.) Because of
253  * this restriction, we have no concurrency issues to worry about here.
254  *
255  * Note that other handles created in this module are never marked as
256  * inheritable. Thus we do not need to worry about cleaning up child
257  * process references to postmaster-private latches or WaitEventSets.
258  */
259 void
261 {
262 #ifdef WIN32
263  SECURITY_ATTRIBUTES sa;
264 
265  /*
266  * Set up security attributes to specify that the events are inherited.
267  */
268  ZeroMemory(&sa, sizeof(sa));
269  sa.nLength = sizeof(sa);
270  sa.bInheritHandle = TRUE;
271 
272  latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
273  if (latch->event == NULL)
274  elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
275 #endif
276 
277  latch->is_set = false;
278  latch->owner_pid = 0;
279  latch->is_shared = true;
280 }
281 
282 /*
283  * Associate a shared latch with the current process, allowing it to
284  * wait on the latch.
285  *
286  * Although there is a sanity check for latch-already-owned, we don't do
287  * any sort of locking here, meaning that we could fail to detect the error
288  * if two processes try to own the same latch at about the same time. If
289  * there is any risk of that, caller must provide an interlock to prevent it.
290  *
291  * In any process that calls OwnLatch(), make sure that
292  * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
293  * as shared latches use SIGUSR1 for inter-process communication.
294  */
295 void
297 {
298  /* Sanity checks */
299  Assert(latch->is_shared);
300 
301 #ifndef WIN32
302  /* Assert InitializeLatchSupport has been called in this process */
304 #endif
305 
306  if (latch->owner_pid != 0)
307  elog(ERROR, "latch already owned");
308 
309  latch->owner_pid = MyProcPid;
310 }
311 
312 /*
313  * Disown a shared latch currently owned by the current process.
314  */
315 void
317 {
318  Assert(latch->is_shared);
319  Assert(latch->owner_pid == MyProcPid);
320 
321  latch->owner_pid = 0;
322 }
323 
324 /*
325  * Wait for a given latch to be set, or for postmaster death, or until timeout
326  * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
327  * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
328  * function returns immediately.
329  *
330  * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
331  * is given. Although it is declared as "long", we don't actually support
332  * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
333  * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
334  *
335  * The latch must be owned by the current process, ie. it must be a
336  * process-local latch initialized with InitLatch, or a shared latch
337  * associated with the current process by calling OwnLatch.
338  *
339  * Returns bit mask indicating which condition(s) caused the wake-up. Note
340  * that if multiple wake-up conditions are true, there is no guarantee that
341  * we return all of them in one call, but we will return at least one.
342  */
343 int
344 WaitLatch(Latch *latch, int wakeEvents, long timeout,
345  uint32 wait_event_info)
346 {
347  return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
348  wait_event_info);
349 }
350 
351 /*
352  * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
353  * conditions.
354  *
355  * When waiting on a socket, EOF and error conditions always cause the socket
356  * to be reported as readable/writable/connected, so that the caller can deal
357  * with the condition.
358  *
359  * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
360  * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
361  * return value if the postmaster dies. The latter is useful for rare cases
362  * where some behavior other than immediate exit is needed.
363  *
364  * NB: These days this is just a wrapper around the WaitEventSet API. When
365  * using a latch very frequently, consider creating a longer living
366  * WaitEventSet instead; that's more efficient.
367  */
368 int
369 WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
370  long timeout, uint32 wait_event_info)
371 {
372  int ret = 0;
373  int rc;
374  WaitEvent event;
376 
377  if (wakeEvents & WL_TIMEOUT)
378  Assert(timeout >= 0);
379  else
380  timeout = -1;
381 
382  if (wakeEvents & WL_LATCH_SET)
383  AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
384  latch, NULL);
385 
386  /* Postmaster-managed callers must handle postmaster death somehow. */
388  (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
389  (wakeEvents & WL_POSTMASTER_DEATH));
390 
391  if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
392  AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
393  NULL, NULL);
394 
395  if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
396  AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
397  NULL, NULL);
398 
399  if (wakeEvents & WL_SOCKET_MASK)
400  {
401  int ev;
402 
403  ev = wakeEvents & WL_SOCKET_MASK;
404  AddWaitEventToSet(set, ev, sock, NULL, NULL);
405  }
406 
407  rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
408 
409  if (rc == 0)
410  ret |= WL_TIMEOUT;
411  else
412  {
413  ret |= event.events & (WL_LATCH_SET |
414  WL_POSTMASTER_DEATH |
416  }
417 
418  FreeWaitEventSet(set);
419 
420  return ret;
421 }
422 
423 /*
424  * Sets a latch and wakes up anyone waiting on it.
425  *
426  * This is cheap if the latch is already set, otherwise not so much.
427  *
428  * NB: when calling this in a signal handler, be sure to save and restore
429  * errno around it. (That's standard practice in most signal handlers, of
430  * course, but we used to omit it in handlers that only set a flag.)
431  *
432  * NB: this function is called from critical sections and signal handlers so
433  * throwing an error is not a good idea.
434  */
435 void
437 {
438 #ifndef WIN32
439  pid_t owner_pid;
440 #else
441  HANDLE handle;
442 #endif
443 
444  /*
445  * The memory barrier has to be placed here to ensure that any flag
446  * variables possibly changed by this process have been flushed to main
447  * memory, before we check/set is_set.
448  */
450 
451  /* Quick exit if already set */
452  if (latch->is_set)
453  return;
454 
455  latch->is_set = true;
456 
457 #ifndef WIN32
458 
459  /*
460  * See if anyone's waiting for the latch. It can be the current process if
461  * we're in a signal handler. We use the self-pipe to wake up the
462  * poll()/epoll_wait() in that case. If it's another process, send a
463  * signal.
464  *
465  * Fetch owner_pid only once, in case the latch is concurrently getting
466  * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
467  * guaranteed to be true! In practice, the effective range of pid_t fits
468  * in a 32 bit integer, and so should be atomic. In the worst case, we
469  * might end up signaling the wrong process. Even then, you're very
470  * unlucky if a process with that bogus pid exists and belongs to
471  * Postgres; and PG database processes should handle excess SIGUSR1
472  * interrupts without a problem anyhow.
473  *
474  * Another sort of race condition that's possible here is for a new
475  * process to own the latch immediately after we look, so we don't signal
476  * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
477  * the standard coding convention of waiting at the bottom of their loops,
478  * not the top, so that they'll correctly process latch-setting events
479  * that happen before they enter the loop.
480  */
481  owner_pid = latch->owner_pid;
482  if (owner_pid == 0)
483  return;
484  else if (owner_pid == MyProcPid)
485  {
486  if (waiting)
488  }
489  else
490  kill(owner_pid, SIGUSR1);
491 #else
492 
493  /*
494  * See if anyone's waiting for the latch. It can be the current process if
495  * we're in a signal handler.
496  *
497  * Use a local variable here just in case somebody changes the event field
498  * concurrently (which really should not happen).
499  */
500  handle = latch->event;
501  if (handle)
502  {
503  SetEvent(handle);
504 
505  /*
506  * Note that we silently ignore any errors. We might be in a signal
507  * handler or other critical path where it's not safe to call elog().
508  */
509  }
510 #endif
511 
512 }
513 
514 /*
515  * Clear the latch. Calling WaitLatch after this will sleep, unless
516  * the latch is set again before the WaitLatch call.
517  */
518 void
520 {
521  /* Only the owner should reset the latch */
522  Assert(latch->owner_pid == MyProcPid);
523 
524  latch->is_set = false;
525 
526  /*
527  * Ensure that the write to is_set gets flushed to main memory before we
528  * examine any flag variables. Otherwise a concurrent SetLatch might
529  * falsely conclude that it needn't signal us, even though we have missed
530  * seeing some flag updates that SetLatch was supposed to inform us of.
531  */
533 }
534 
535 /*
536  * Create a WaitEventSet with space for nevents different events to wait for.
537  *
538  * These events can then be efficiently waited upon together, using
539  * WaitEventSetWait().
540  */
541 WaitEventSet *
543 {
544  WaitEventSet *set;
545  char *data;
546  Size sz = 0;
547 
548  /*
549  * Use MAXALIGN size/alignment to guarantee that later uses of memory are
550  * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
551  * platforms, but earlier allocations like WaitEventSet and WaitEvent
552  * might not sized to guarantee that when purely using sizeof().
553  */
554  sz += MAXALIGN(sizeof(WaitEventSet));
555  sz += MAXALIGN(sizeof(WaitEvent) * nevents);
556 
557 #if defined(WAIT_USE_EPOLL)
558  sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
559 #elif defined(WAIT_USE_POLL)
560  sz += MAXALIGN(sizeof(struct pollfd) * nevents);
561 #elif defined(WAIT_USE_WIN32)
562  /* need space for the pgwin32_signal_event */
563  sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
564 #endif
565 
566  data = (char *) MemoryContextAllocZero(context, sz);
567 
568  set = (WaitEventSet *) data;
569  data += MAXALIGN(sizeof(WaitEventSet));
570 
571  set->events = (WaitEvent *) data;
572  data += MAXALIGN(sizeof(WaitEvent) * nevents);
573 
574 #if defined(WAIT_USE_EPOLL)
575  set->epoll_ret_events = (struct epoll_event *) data;
576  data += MAXALIGN(sizeof(struct epoll_event) * nevents);
577 #elif defined(WAIT_USE_POLL)
578  set->pollfds = (struct pollfd *) data;
579  data += MAXALIGN(sizeof(struct pollfd) * nevents);
580 #elif defined(WAIT_USE_WIN32)
581  set->handles = (HANDLE) data;
582  data += MAXALIGN(sizeof(HANDLE) * nevents);
583 #endif
584 
585  set->latch = NULL;
586  set->nevents_space = nevents;
587  set->exit_on_postmaster_death = false;
588 
589 #if defined(WAIT_USE_EPOLL)
590 #ifdef EPOLL_CLOEXEC
591  set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
592  if (set->epoll_fd < 0)
593  elog(ERROR, "epoll_create1 failed: %m");
594 #else
595  /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
596  set->epoll_fd = epoll_create(nevents);
597  if (set->epoll_fd < 0)
598  elog(ERROR, "epoll_create failed: %m");
599  if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
600  elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
601 #endif /* EPOLL_CLOEXEC */
602 #elif defined(WAIT_USE_WIN32)
603 
604  /*
605  * To handle signals while waiting, we need to add a win32 specific event.
606  * We accounted for the additional event at the top of this routine. See
607  * port/win32/signal.c for more details.
608  *
609  * Note: pgwin32_signal_event should be first to ensure that it will be
610  * reported when multiple events are set. We want to guarantee that
611  * pending signals are serviced.
612  */
613  set->handles[0] = pgwin32_signal_event;
614  StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
615 #endif
616 
617  return set;
618 }
619 
620 /*
621  * Free a previously created WaitEventSet.
622  *
623  * Note: preferably, this shouldn't have to free any resources that could be
624  * inherited across an exec(). If it did, we'd likely leak those resources in
625  * many scenarios. For the epoll case, we ensure that by setting FD_CLOEXEC
626  * when the FD is created. For the Windows case, we assume that the handles
627  * involved are non-inheritable.
628  */
629 void
631 {
632 #if defined(WAIT_USE_EPOLL)
633  close(set->epoll_fd);
634 #elif defined(WAIT_USE_WIN32)
635  WaitEvent *cur_event;
636 
637  for (cur_event = set->events;
638  cur_event < (set->events + set->nevents);
639  cur_event++)
640  {
641  if (cur_event->events & WL_LATCH_SET)
642  {
643  /* uses the latch's HANDLE */
644  }
645  else if (cur_event->events & WL_POSTMASTER_DEATH)
646  {
647  /* uses PostmasterHandle */
648  }
649  else
650  {
651  /* Clean up the event object we created for the socket */
652  WSAEventSelect(cur_event->fd, NULL, 0);
653  WSACloseEvent(set->handles[cur_event->pos + 1]);
654  }
655  }
656 #endif
657 
658  pfree(set);
659 }
660 
661 /* ---
662  * Add an event to the set. Possible events are:
663  * - WL_LATCH_SET: Wait for the latch to be set
664  * - WL_POSTMASTER_DEATH: Wait for postmaster to die
665  * - WL_SOCKET_READABLE: Wait for socket to become readable,
666  * can be combined in one event with other WL_SOCKET_* events
667  * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
668  * can be combined with other WL_SOCKET_* events
669  * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
670  * can be combined with other WL_SOCKET_* events (on non-Windows
671  * platforms, this is the same as WL_SOCKET_WRITEABLE)
672  * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
673  *
674  * Returns the offset in WaitEventSet->events (starting from 0), which can be
675  * used to modify previously added wait events using ModifyWaitEvent().
676  *
677  * In the WL_LATCH_SET case the latch must be owned by the current process,
678  * i.e. it must be a process-local latch initialized with InitLatch, or a
679  * shared latch associated with the current process by calling OwnLatch.
680  *
681  * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
682  * conditions cause the socket to be reported as readable/writable/connected,
683  * so that the caller can deal with the condition.
684  *
685  * The user_data pointer specified here will be set for the events returned
686  * by WaitEventSetWait(), allowing to easily associate additional data with
687  * events.
688  */
689 int
691  void *user_data)
692 {
693  WaitEvent *event;
694 
695  /* not enough space */
696  Assert(set->nevents < set->nevents_space);
697 
698  if (events == WL_EXIT_ON_PM_DEATH)
699  {
700  events = WL_POSTMASTER_DEATH;
701  set->exit_on_postmaster_death = true;
702  }
703 
704  if (latch)
705  {
706  if (latch->owner_pid != MyProcPid)
707  elog(ERROR, "cannot wait on a latch owned by another process");
708  if (set->latch)
709  elog(ERROR, "cannot wait on more than one latch");
710  if ((events & WL_LATCH_SET) != WL_LATCH_SET)
711  elog(ERROR, "latch events only support being set");
712  }
713  else
714  {
715  if (events & WL_LATCH_SET)
716  elog(ERROR, "cannot wait on latch without a specified latch");
717  }
718 
719  /* waiting for socket readiness without a socket indicates a bug */
720  if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
721  elog(ERROR, "cannot wait on socket event without a socket");
722 
723  event = &set->events[set->nevents];
724  event->pos = set->nevents++;
725  event->fd = fd;
726  event->events = events;
727  event->user_data = user_data;
728 #ifdef WIN32
729  event->reset = false;
730 #endif
731 
732  if (events == WL_LATCH_SET)
733  {
734  set->latch = latch;
735  set->latch_pos = event->pos;
736 #ifndef WIN32
737  event->fd = selfpipe_readfd;
738 #endif
739  }
740  else if (events == WL_POSTMASTER_DEATH)
741  {
742 #ifndef WIN32
744 #endif
745  }
746 
747  /* perform wait primitive specific initialization, if needed */
748 #if defined(WAIT_USE_EPOLL)
749  WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
750 #elif defined(WAIT_USE_POLL)
751  WaitEventAdjustPoll(set, event);
752 #elif defined(WAIT_USE_WIN32)
753  WaitEventAdjustWin32(set, event);
754 #endif
755 
756  return event->pos;
757 }
758 
759 /*
760  * Change the event mask and, in the WL_LATCH_SET case, the latch associated
761  * with the WaitEvent.
762  *
763  * 'pos' is the id returned by AddWaitEventToSet.
764  */
765 void
767 {
768  WaitEvent *event;
769 
770  Assert(pos < set->nevents);
771 
772  event = &set->events[pos];
773 
774  /*
775  * If neither the event mask nor the associated latch changes, return
776  * early. That's an important optimization for some sockets, where
777  * ModifyWaitEvent is frequently used to switch from waiting for reads to
778  * waiting on writes.
779  */
780  if (events == event->events &&
781  (!(event->events & WL_LATCH_SET) || set->latch == latch))
782  return;
783 
784  if (event->events & WL_LATCH_SET &&
785  events != event->events)
786  {
787  /* we could allow to disable latch events for a while */
788  elog(ERROR, "cannot modify latch event");
789  }
790 
791  if (event->events & WL_POSTMASTER_DEATH)
792  {
793  elog(ERROR, "cannot modify postmaster death event");
794  }
795 
796  /* FIXME: validate event mask */
797  event->events = events;
798 
799  if (events == WL_LATCH_SET)
800  {
801  set->latch = latch;
802  }
803 
804 #if defined(WAIT_USE_EPOLL)
805  WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
806 #elif defined(WAIT_USE_POLL)
807  WaitEventAdjustPoll(set, event);
808 #elif defined(WAIT_USE_WIN32)
809  WaitEventAdjustWin32(set, event);
810 #endif
811 }
812 
#if defined(WAIT_USE_EPOLL)
/*
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
	struct epoll_event epoll_ev;
	int			rc;

	/* pointer to our event, returned by epoll_wait */
	epoll_ev.data.ptr = event;
	/* always wait for errors */
	epoll_ev.events = EPOLLERR | EPOLLHUP;

	/* prepare pollfd entry once */
	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		epoll_ev.events |= EPOLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		epoll_ev.events |= EPOLLIN;
	}
	else
	{
		Assert(event->fd != PGINVALID_SOCKET);
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));

		if (event->events & WL_SOCKET_READABLE)
			epoll_ev.events |= EPOLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			epoll_ev.events |= EPOLLOUT;
	}

	/*
	 * Even though unused, we also pass epoll_ev as the data argument if
	 * EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
	 * requiring that, and actually it makes the code simpler...
	 */
	rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);

	if (rc < 0)
		ereport(ERROR,
				(errcode_for_socket_access(),
		/* translator: %s is a syscall name, such as "poll()" */
				 errmsg("%s failed: %m",
						"epoll_ctl()")));
}
#endif
864 
#if defined(WAIT_USE_POLL)
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
	struct pollfd *pollfd = &set->pollfds[event->pos];

	pollfd->revents = 0;
	pollfd->fd = event->fd;

	/* prepare pollfd entry once */
	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		pollfd->events = POLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		pollfd->events = POLLIN;
	}
	else
	{
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
		pollfd->events = 0;
		if (event->events & WL_SOCKET_READABLE)
			pollfd->events |= POLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			pollfd->events |= POLLOUT;
	}

	Assert(event->fd != PGINVALID_SOCKET);
}
#endif
897 
#if defined(WAIT_USE_WIN32)
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
	/* Slot 0 holds pgwin32_signal_event, so our handles are offset by one. */
	HANDLE	   *handle = &set->handles[event->pos + 1];

	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		/* latch wake-ups are signaled via the latch's own event handle */
		*handle = set->latch->event;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		*handle = PostmasterHandle;
	}
	else
	{
		int			flags = FD_CLOSE;	/* always check for errors/EOF */

		if (event->events & WL_SOCKET_READABLE)
			flags |= FD_READ;
		if (event->events & WL_SOCKET_WRITEABLE)
			flags |= FD_WRITE;
		if (event->events & WL_SOCKET_CONNECTED)
			flags |= FD_CONNECT;

		/* Lazily create the per-socket event object on first use */
		if (*handle == WSA_INVALID_EVENT)
		{
			*handle = WSACreateEvent();
			if (*handle == WSA_INVALID_EVENT)
				elog(ERROR, "failed to create event for socket: error code %u",
					 WSAGetLastError());
		}
		/* (Re)associate the socket with the event for the requested flags */
		if (WSAEventSelect(event->fd, *handle, flags) != 0)
			elog(ERROR, "failed to set up event for socket: error code %u",
				 WSAGetLastError());

		Assert(event->fd != PGINVALID_SOCKET);
	}
}
#endif
939 
940 /*
941  * Wait for events added to the set to happen, or until the timeout is
942  * reached. At most nevents occurred events are returned.
943  *
944  * If timeout = -1, block until an event occurs; if 0, check sockets for
945  * readiness, but don't block; if > 0, block for at most timeout milliseconds.
946  *
947  * Returns the number of events occurred, or 0 if the timeout was reached.
948  *
949  * Returned events will have the fd, pos, user_data fields set to the
950  * values associated with the registered event.
951  */
952 int
953 WaitEventSetWait(WaitEventSet *set, long timeout,
954  WaitEvent *occurred_events, int nevents,
955  uint32 wait_event_info)
956 {
957  int returned_events = 0;
959  instr_time cur_time;
960  long cur_timeout = -1;
961 
962  Assert(nevents > 0);
963 
964  /*
965  * Initialize timeout if requested. We must record the current time so
966  * that we can determine the remaining timeout if interrupted.
967  */
968  if (timeout >= 0)
969  {
970  INSTR_TIME_SET_CURRENT(start_time);
971  Assert(timeout >= 0 && timeout <= INT_MAX);
972  cur_timeout = timeout;
973  }
974 
975  pgstat_report_wait_start(wait_event_info);
976 
977 #ifndef WIN32
978  waiting = true;
979 #else
980  /* Ensure that signals are serviced even if latch is already set */
982 #endif
983  while (returned_events == 0)
984  {
985  int rc;
986 
987  /*
988  * Check if the latch is set already. If so, leave the loop
989  * immediately, avoid blocking again. We don't attempt to report any
990  * other events that might also be satisfied.
991  *
992  * If someone sets the latch between this and the
993  * WaitEventSetWaitBlock() below, the setter will write a byte to the
994  * pipe (or signal us and the signal handler will do that), and the
995  * readiness routine will return immediately.
996  *
997  * On unix, If there's a pending byte in the self pipe, we'll notice
998  * whenever blocking. Only clearing the pipe in that case avoids
999  * having to drain it every time WaitLatchOrSocket() is used. Should
1000  * the pipe-buffer fill up we're still ok, because the pipe is in
1001  * nonblocking mode. It's unlikely for that to happen, because the
1002  * self pipe isn't filled unless we're blocking (waiting = true), or
1003  * from inside a signal handler in latch_sigusr1_handler().
1004  *
1005  * On windows, we'll also notice if there's a pending event for the
1006  * latch when blocking, but there's no danger of anything filling up,
1007  * as "Setting an event that is already set has no effect.".
1008  *
1009  * Note: we assume that the kernel calls involved in latch management
1010  * will provide adequate synchronization on machines with weak memory
1011  * ordering, so that we cannot miss seeing is_set if a notification
1012  * has already been queued.
1013  */
1014  if (set->latch && set->latch->is_set)
1015  {
1016  occurred_events->fd = PGINVALID_SOCKET;
1017  occurred_events->pos = set->latch_pos;
1018  occurred_events->user_data =
1019  set->events[set->latch_pos].user_data;
1020  occurred_events->events = WL_LATCH_SET;
1021  occurred_events++;
1022  returned_events++;
1023 
1024  break;
1025  }
1026 
1027  /*
1028  * Wait for events using the readiness primitive chosen at the top of
1029  * this file. If -1 is returned, a timeout has occurred, if 0 we have
1030  * to retry, everything >= 1 is the number of returned events.
1031  */
1032  rc = WaitEventSetWaitBlock(set, cur_timeout,
1033  occurred_events, nevents);
1034 
1035  if (rc == -1)
1036  break; /* timeout occurred */
1037  else
1038  returned_events = rc;
1039 
1040  /* If we're not done, update cur_timeout for next iteration */
1041  if (returned_events == 0 && timeout >= 0)
1042  {
1043  INSTR_TIME_SET_CURRENT(cur_time);
1044  INSTR_TIME_SUBTRACT(cur_time, start_time);
1045  cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
1046  if (cur_timeout <= 0)
1047  break;
1048  }
1049  }
1050 #ifndef WIN32
1051  waiting = false;
1052 #endif
1053 
1055 
1056  return returned_events;
1057 }
1058 
1059 
1060 #if defined(WAIT_USE_EPOLL)
1061 
1062 /*
1063  * Wait using linux's epoll_wait(2).
1064  *
1065  * This is the preferable wait method, as several readiness notifications are
1066  * delivered, without having to iterate through all of set->events. The return
1067  * epoll_event struct contain a pointer to our events, making association
1068  * easy.
1069  */
1070 static inline int
1071 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1072  WaitEvent *occurred_events, int nevents)
1073 {
1074  int returned_events = 0;
1075  int rc;
1076  WaitEvent *cur_event;
1077  struct epoll_event *cur_epoll_event;
1078 
1079  /* Sleep */
1080  rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1081  nevents, cur_timeout);
1082 
1083  /* Check return code */
1084  if (rc < 0)
1085  {
1086  /* EINTR is okay, otherwise complain */
1087  if (errno != EINTR)
1088  {
1089  waiting = false;
1090  ereport(ERROR,
1092  /* translator: %s is a syscall name, such as "poll()" */
1093  errmsg("%s failed: %m",
1094  "epoll_wait()")));
1095  }
1096  return 0;
1097  }
1098  else if (rc == 0)
1099  {
1100  /* timeout exceeded */
1101  return -1;
1102  }
1103 
1104  /*
1105  * At least one event occurred, iterate over the returned epoll events
1106  * until they're either all processed, or we've returned all the events
1107  * the caller desired.
1108  */
1109  for (cur_epoll_event = set->epoll_ret_events;
1110  cur_epoll_event < (set->epoll_ret_events + rc) &&
1111  returned_events < nevents;
1112  cur_epoll_event++)
1113  {
1114  /* epoll's data pointer is set to the associated WaitEvent */
1115  cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1116 
1117  occurred_events->pos = cur_event->pos;
1118  occurred_events->user_data = cur_event->user_data;
1119  occurred_events->events = 0;
1120 
1121  if (cur_event->events == WL_LATCH_SET &&
1122  cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1123  {
1124  /* There's data in the self-pipe, clear it. */
1125  drainSelfPipe();
1126 
1127  if (set->latch->is_set)
1128  {
1129  occurred_events->fd = PGINVALID_SOCKET;
1130  occurred_events->events = WL_LATCH_SET;
1131  occurred_events++;
1132  returned_events++;
1133  }
1134  }
1135  else if (cur_event->events == WL_POSTMASTER_DEATH &&
1136  cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1137  {
1138  /*
1139  * We expect an EPOLLHUP when the remote end is closed, but
1140  * because we don't expect the pipe to become readable or to have
1141  * any errors either, treat those cases as postmaster death, too.
1142  *
1143  * Be paranoid about a spurious event signalling the postmaster as
1144  * being dead. There have been reports about that happening with
1145  * older primitives (select(2) to be specific), and a spurious
1146  * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1147  * cost much.
1148  */
1150  {
1151  if (set->exit_on_postmaster_death)
1152  proc_exit(1);
1153  occurred_events->fd = PGINVALID_SOCKET;
1154  occurred_events->events = WL_POSTMASTER_DEATH;
1155  occurred_events++;
1156  returned_events++;
1157  }
1158  }
1159  else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1160  {
1161  Assert(cur_event->fd != PGINVALID_SOCKET);
1162 
1163  if ((cur_event->events & WL_SOCKET_READABLE) &&
1164  (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1165  {
1166  /* data available in socket, or EOF */
1167  occurred_events->events |= WL_SOCKET_READABLE;
1168  }
1169 
1170  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1171  (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1172  {
1173  /* writable, or EOF */
1174  occurred_events->events |= WL_SOCKET_WRITEABLE;
1175  }
1176 
1177  if (occurred_events->events != 0)
1178  {
1179  occurred_events->fd = cur_event->fd;
1180  occurred_events++;
1181  returned_events++;
1182  }
1183  }
1184  }
1185 
1186  return returned_events;
1187 }
1188 
1189 #elif defined(WAIT_USE_POLL)
1190 
1191 /*
1192  * Wait using poll(2).
1193  *
1194  * This allows to receive readiness notifications for several events at once,
1195  * but requires iterating through all of set->pollfds.
1196  */
1197 static inline int
1198 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1199  WaitEvent *occurred_events, int nevents)
1200 {
1201  int returned_events = 0;
1202  int rc;
1203  WaitEvent *cur_event;
1204  struct pollfd *cur_pollfd;
1205 
1206  /* Sleep */
1207  rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1208 
1209  /* Check return code */
1210  if (rc < 0)
1211  {
1212  /* EINTR is okay, otherwise complain */
1213  if (errno != EINTR)
1214  {
1215  waiting = false;
1216  ereport(ERROR,
1218  /* translator: %s is a syscall name, such as "poll()" */
1219  errmsg("%s failed: %m",
1220  "poll()")));
1221  }
1222  return 0;
1223  }
1224  else if (rc == 0)
1225  {
1226  /* timeout exceeded */
1227  return -1;
1228  }
1229 
1230  for (cur_event = set->events, cur_pollfd = set->pollfds;
1231  cur_event < (set->events + set->nevents) &&
1232  returned_events < nevents;
1233  cur_event++, cur_pollfd++)
1234  {
1235  /* no activity on this FD, skip */
1236  if (cur_pollfd->revents == 0)
1237  continue;
1238 
1239  occurred_events->pos = cur_event->pos;
1240  occurred_events->user_data = cur_event->user_data;
1241  occurred_events->events = 0;
1242 
1243  if (cur_event->events == WL_LATCH_SET &&
1244  (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1245  {
1246  /* There's data in the self-pipe, clear it. */
1247  drainSelfPipe();
1248 
1249  if (set->latch->is_set)
1250  {
1251  occurred_events->fd = PGINVALID_SOCKET;
1252  occurred_events->events = WL_LATCH_SET;
1253  occurred_events++;
1254  returned_events++;
1255  }
1256  }
1257  else if (cur_event->events == WL_POSTMASTER_DEATH &&
1258  (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1259  {
1260  /*
1261  * We expect an POLLHUP when the remote end is closed, but because
1262  * we don't expect the pipe to become readable or to have any
1263  * errors either, treat those cases as postmaster death, too.
1264  *
1265  * Be paranoid about a spurious event signalling the postmaster as
1266  * being dead. There have been reports about that happening with
1267  * older primitives (select(2) to be specific), and a spurious
1268  * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1269  * cost much.
1270  */
1272  {
1273  if (set->exit_on_postmaster_death)
1274  proc_exit(1);
1275  occurred_events->fd = PGINVALID_SOCKET;
1276  occurred_events->events = WL_POSTMASTER_DEATH;
1277  occurred_events++;
1278  returned_events++;
1279  }
1280  }
1281  else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1282  {
1283  int errflags = POLLHUP | POLLERR | POLLNVAL;
1284 
1285  Assert(cur_event->fd >= PGINVALID_SOCKET);
1286 
1287  if ((cur_event->events & WL_SOCKET_READABLE) &&
1288  (cur_pollfd->revents & (POLLIN | errflags)))
1289  {
1290  /* data available in socket, or EOF */
1291  occurred_events->events |= WL_SOCKET_READABLE;
1292  }
1293 
1294  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1295  (cur_pollfd->revents & (POLLOUT | errflags)))
1296  {
1297  /* writeable, or EOF */
1298  occurred_events->events |= WL_SOCKET_WRITEABLE;
1299  }
1300 
1301  if (occurred_events->events != 0)
1302  {
1303  occurred_events->fd = cur_event->fd;
1304  occurred_events++;
1305  returned_events++;
1306  }
1307  }
1308  }
1309  return returned_events;
1310 }
1311 
1312 #elif defined(WAIT_USE_WIN32)
1313 
1314 /*
1315  * Wait using Windows' WaitForMultipleObjects().
1316  *
1317  * Unfortunately this will only ever return a single readiness notification at
1318  * a time. Note that while the official documentation for
1319  * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
1320  * with a single bWaitAll = FALSE call,
1321  * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
1322  * that only one event is "consumed".
1323  */
1324 static inline int
1325 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1326  WaitEvent *occurred_events, int nevents)
1327 {
1328  int returned_events = 0;
1329  DWORD rc;
1330  WaitEvent *cur_event;
1331 
1332  /* Reset any wait events that need it */
1333  for (cur_event = set->events;
1334  cur_event < (set->events + set->nevents);
1335  cur_event++)
1336  {
1337  if (cur_event->reset)
1338  {
1339  WaitEventAdjustWin32(set, cur_event);
1340  cur_event->reset = false;
1341  }
1342 
1343  /*
1344  * Windows does not guarantee to log an FD_WRITE network event
1345  * indicating that more data can be sent unless the previous send()
1346  * failed with WSAEWOULDBLOCK. While our caller might well have made
1347  * such a call, we cannot assume that here. Therefore, if waiting for
1348  * write-ready, force the issue by doing a dummy send(). If the dummy
1349  * send() succeeds, assume that the socket is in fact write-ready, and
1350  * return immediately. Also, if it fails with something other than
1351  * WSAEWOULDBLOCK, return a write-ready indication to let our caller
1352  * deal with the error condition.
1353  */
1354  if (cur_event->events & WL_SOCKET_WRITEABLE)
1355  {
1356  char c;
1357  WSABUF buf;
1358  DWORD sent;
1359  int r;
1360 
1361  buf.buf = &c;
1362  buf.len = 0;
1363 
1364  r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
1365  if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
1366  {
1367  occurred_events->pos = cur_event->pos;
1368  occurred_events->user_data = cur_event->user_data;
1369  occurred_events->events = WL_SOCKET_WRITEABLE;
1370  occurred_events->fd = cur_event->fd;
1371  return 1;
1372  }
1373  }
1374  }
1375 
1376  /*
1377  * Sleep.
1378  *
1379  * Need to wait for ->nevents + 1, because signal handle is in [0].
1380  */
1381  rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
1382  cur_timeout);
1383 
1384  /* Check return code */
1385  if (rc == WAIT_FAILED)
1386  elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
1387  GetLastError());
1388  else if (rc == WAIT_TIMEOUT)
1389  {
1390  /* timeout exceeded */
1391  return -1;
1392  }
1393 
1394  if (rc == WAIT_OBJECT_0)
1395  {
1396  /* Service newly-arrived signals */
1398  return 0; /* retry */
1399  }
1400 
1401  /*
1402  * With an offset of one, due to the always present pgwin32_signal_event,
1403  * the handle offset directly corresponds to a wait event.
1404  */
1405  cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
1406 
1407  occurred_events->pos = cur_event->pos;
1408  occurred_events->user_data = cur_event->user_data;
1409  occurred_events->events = 0;
1410 
1411  if (cur_event->events == WL_LATCH_SET)
1412  {
1413  if (!ResetEvent(set->latch->event))
1414  elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
1415 
1416  if (set->latch->is_set)
1417  {
1418  occurred_events->fd = PGINVALID_SOCKET;
1419  occurred_events->events = WL_LATCH_SET;
1420  occurred_events++;
1421  returned_events++;
1422  }
1423  }
1424  else if (cur_event->events == WL_POSTMASTER_DEATH)
1425  {
1426  /*
1427  * Postmaster apparently died. Since the consequences of falsely
1428  * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
1429  * the trouble to positively verify this with PostmasterIsAlive(),
1430  * even though there is no known reason to think that the event could
1431  * be falsely set on Windows.
1432  */
1434  {
1435  if (set->exit_on_postmaster_death)
1436  proc_exit(1);
1437  occurred_events->fd = PGINVALID_SOCKET;
1438  occurred_events->events = WL_POSTMASTER_DEATH;
1439  occurred_events++;
1440  returned_events++;
1441  }
1442  }
1443  else if (cur_event->events & WL_SOCKET_MASK)
1444  {
1445  WSANETWORKEVENTS resEvents;
1446  HANDLE handle = set->handles[cur_event->pos + 1];
1447 
1448  Assert(cur_event->fd);
1449 
1450  occurred_events->fd = cur_event->fd;
1451 
1452  ZeroMemory(&resEvents, sizeof(resEvents));
1453  if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
1454  elog(ERROR, "failed to enumerate network events: error code %u",
1455  WSAGetLastError());
1456  if ((cur_event->events & WL_SOCKET_READABLE) &&
1457  (resEvents.lNetworkEvents & FD_READ))
1458  {
1459  /* data available in socket */
1460  occurred_events->events |= WL_SOCKET_READABLE;
1461 
1462  /*------
1463  * WaitForMultipleObjects doesn't guarantee that a read event will
1464  * be returned if the latch is set at the same time. Even if it
1465  * did, the caller might drop that event expecting it to reoccur
1466  * on next call. So, we must force the event to be reset if this
1467  * WaitEventSet is used again in order to avoid an indefinite
1468  * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
1469  * for the behavior of socket events.
1470  *------
1471  */
1472  cur_event->reset = true;
1473  }
1474  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1475  (resEvents.lNetworkEvents & FD_WRITE))
1476  {
1477  /* writeable */
1478  occurred_events->events |= WL_SOCKET_WRITEABLE;
1479  }
1480  if ((cur_event->events & WL_SOCKET_CONNECTED) &&
1481  (resEvents.lNetworkEvents & FD_CONNECT))
1482  {
1483  /* connected */
1484  occurred_events->events |= WL_SOCKET_CONNECTED;
1485  }
1486  if (resEvents.lNetworkEvents & FD_CLOSE)
1487  {
1488  /* EOF/error, so signal all caller-requested socket flags */
1489  occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
1490  }
1491 
1492  if (occurred_events->events != 0)
1493  {
1494  occurred_events++;
1495  returned_events++;
1496  }
1497  }
1498 
1499  return returned_events;
1500 }
1501 #endif
1502 
1503 /*
1504  * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
1505  *
1506  * Wake up WaitLatch, if we're waiting. (We might not be, since SIGUSR1 is
1507  * overloaded for multiple purposes; or we might not have reached WaitLatch
1508  * yet, in which case we don't need to fill the pipe either.)
1509  *
1510  * NB: when calling this in a signal handler, be sure to save and restore
1511  * errno around it.
1512  */
1513 #ifndef WIN32
1514 void
1516 {
1517  if (waiting)
1518  sendSelfPipeByte();
1519 }
1520 #endif /* !WIN32 */
1521 
1522 /* Send one byte to the self-pipe, to wake up WaitLatch */
1523 #ifndef WIN32
1524 static void
1526 {
1527  int rc;
1528  char dummy = 0;
1529 
1530 retry:
1531  rc = write(selfpipe_writefd, &dummy, 1);
1532  if (rc < 0)
1533  {
1534  /* If interrupted by signal, just retry */
1535  if (errno == EINTR)
1536  goto retry;
1537 
1538  /*
1539  * If the pipe is full, we don't need to retry, the data that's there
1540  * already is enough to wake up WaitLatch.
1541  */
1542  if (errno == EAGAIN || errno == EWOULDBLOCK)
1543  return;
1544 
1545  /*
1546  * Oops, the write() failed for some other reason. We might be in a
1547  * signal handler, so it's not safe to elog(). We have no choice but
1548  * silently ignore the error.
1549  */
1550  return;
1551  }
1552 }
1553 #endif /* !WIN32 */
1554 
1555 /*
1556  * Read all available data from the self-pipe
1557  *
1558  * Note: this is only called when waiting = true. If it fails and doesn't
1559  * return, it must reset that flag first (though ideally, this will never
1560  * happen).
1561  */
1562 #ifndef WIN32
1563 static void
1565 {
1566  /*
1567  * There shouldn't normally be more than one byte in the pipe, or maybe a
1568  * few bytes if multiple processes run SetLatch at the same instant.
1569  */
1570  char buf[16];
1571  int rc;
1572 
1573  for (;;)
1574  {
1575  rc = read(selfpipe_readfd, buf, sizeof(buf));
1576  if (rc < 0)
1577  {
1578  if (errno == EAGAIN || errno == EWOULDBLOCK)
1579  break; /* the pipe is empty */
1580  else if (errno == EINTR)
1581  continue; /* retry */
1582  else
1583  {
1584  waiting = false;
1585  elog(ERROR, "read() on self-pipe failed: %m");
1586  }
1587  }
1588  else if (rc == 0)
1589  {
1590  waiting = false;
1591  elog(ERROR, "unexpected EOF on self-pipe");
1592  }
1593  else if (rc < sizeof(buf))
1594  {
1595  /* we successfully drained the pipe; no need to read() again */
1596  break;
1597  }
1598  /* else buffer wasn't big enough, so read again */
1599  }
1600 }
1601 #endif /* !WIN32 */
int latch_pos
Definition: latch.c:94
void InitSharedLatch(Latch *latch)
Definition: latch.c:260
#define WL_SOCKET_WRITEABLE
Definition: latch.h:126
pgsocket fd
Definition: latch.h:145
int MyProcPid
Definition: globals.c:40
int pos
Definition: latch.h:143
void FreeWaitEventSet(WaitEventSet *set)
Definition: latch.c:630
static int selfpipe_writefd
Definition: latch.c:127
#define WL_TIMEOUT
Definition: latch.h:127
int AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, void *user_data)
Definition: latch.c:690
#define EAGAIN
Definition: win32_port.h:321
#define SIGUSR1
Definition: win32_port.h:166
#define write(a, b, c)
Definition: win32.h:14
bool is_shared
Definition: latch.h:113
#define INSTR_TIME_GET_MILLISEC(t)
Definition: instr_time.h:202
struct timeval instr_time
Definition: instr_time.h:150
void proc_exit(int code)
Definition: ipc.c:104
void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
Definition: latch.c:766
static void drainSelfPipe(void)
Definition: latch.c:1564
#define kill(pid, sig)
Definition: win32_port.h:426
#define WL_SOCKET_READABLE
Definition: latch.h:125
void DisownLatch(Latch *latch)
Definition: latch.c:316
#define WL_SOCKET_MASK
Definition: latch.h:137
void InitLatch(Latch *latch)
Definition: latch.c:228
void SetLatch(Latch *latch)
Definition: latch.c:436
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void ResetLatch(Latch *latch)
Definition: latch.c:519
static time_t start_time
Definition: pg_ctl.c:99
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:344
WaitEventSet * CreateWaitEventSet(MemoryContext context, int nevents)
Definition: latch.c:542
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:849
HANDLE pgwin32_signal_event
Definition: signal.c:27
void pfree(void *pointer)
Definition: mcxt.c:1056
void pgwin32_dispatch_queued_signals(void)
Definition: signal.c:108
#define ERROR
Definition: elog.h:43
void OwnLatch(Latch *latch)
Definition: latch.c:296
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:170
#define FATAL
Definition: elog.h:52
uint32 events
Definition: latch.h:144
static int selfpipe_readfd
Definition: latch.c:126
Definition: latch.h:110
bool exit_on_postmaster_death
Definition: latch.c:101
static int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, WaitEvent *occurred_events, int nevents)
char * c
static char * buf
Definition: pg_test_fsync.c:67
bool IsUnderPostmaster
Definition: globals.c:109
int WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, long timeout, uint32 wait_event_info)
Definition: latch.c:369
unsigned int uint32
Definition: c.h:359
int pgsocket
Definition: port.h:31
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1342
MemoryContext CurrentMemoryContext
Definition: mcxt.c:38
#define ereport(elevel, rest)
Definition: elog.h:141
int errcode_for_socket_access(void)
Definition: elog.c:702
int nevents
Definition: latch.c:78
int postmaster_alive_fds[2]
Definition: postmaster.c:557
static void sendSelfPipeByte(void)
Definition: latch.c:1525
#define WL_POSTMASTER_DEATH
Definition: latch.h:128
#define PGINVALID_SOCKET
Definition: port.h:33
void InitializeLatchSupport(void)
Definition: latch.c:155
bool PostmasterIsAliveInternal(void)
Definition: pmsignal.c:309
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:839
#define pg_memory_barrier()
Definition: atomics.h:145
#define Assert(condition)
Definition: c.h:739
WaitEvent * events
Definition: latch.c:85
size_t Size
Definition: c.h:467
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1318
#define MAXALIGN(LEN)
Definition: c.h:692
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:156
void * user_data
Definition: latch.h:146
int nevents_space
Definition: latch.c:79
int errmsg(const char *fmt,...)
Definition: elog.c:822
int owner_pid
Definition: latch.h:114
sig_atomic_t is_set
Definition: latch.h:112
#define elog(elevel,...)
Definition: elog.h:228
#define EWOULDBLOCK
Definition: win32_port.h:329
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:323
void latch_sigusr1_handler(void)
Definition: latch.c:1515
#define WL_SOCKET_CONNECTED
Definition: latch.h:134
Latch * latch
Definition: latch.c:93
#define WL_LATCH_SET
Definition: latch.h:124
static volatile sig_atomic_t waiting
Definition: latch.c:123
static int selfpipe_owner_pid
Definition: latch.c:130
#define POSTMASTER_FD_WATCH
Definition: postmaster.h:42
#define read(a, b, c)
Definition: win32.h:13
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:129
int WaitEventSetWait(WaitEventSet *set, long timeout, WaitEvent *occurred_events, int nevents, uint32 wait_event_info)
Definition: latch.c:953