/* latch.c — listing extracted from the PostgreSQL (git master) Doxygen pages */
1 /*-------------------------------------------------------------------------
2  *
3  * latch.c
4  * Routines for inter-process latches
5  *
6  * The Unix implementation uses the so-called self-pipe trick to overcome
7  * the race condition involved with select() and setting a global flag
8  * in the signal handler. When a latch is set and the current process
9  * is waiting for it, the signal handler wakes up the select() in
10  * WaitLatch by writing a byte to a pipe. A signal by itself doesn't
11  * interrupt select() on all platforms, and even on platforms where it
12  * does, a signal that arrives just before the select() call does not
13  * prevent the select() from entering sleep. An incoming byte on a pipe
14  * however reliably interrupts the sleep, and causes select() to return
15  * immediately even if the signal arrives before select() begins.
16  *
17  * (Actually, we prefer epoll_wait() over poll() over select() where
18  * available, but the same comments apply.)
19  *
20  * When SetLatch is called from the same process that owns the latch,
21  * SetLatch writes the byte directly to the pipe. If it's owned by another
22  * process, SIGUSR1 is sent and the signal handler in the waiting process
23  * writes the byte to the pipe on behalf of the signaling process.
24  *
25  * The Windows implementation uses Windows events that are inherited by
26  * all postmaster child processes.
27  *
28  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
29  * Portions Copyright (c) 1994, Regents of the University of California
30  *
31  * IDENTIFICATION
32  * src/backend/storage/ipc/latch.c
33  *
34  *-------------------------------------------------------------------------
35  */
36 #include "postgres.h"
37 
38 #include <fcntl.h>
39 #include <limits.h>
40 #include <signal.h>
41 #include <unistd.h>
42 #include <sys/time.h>
43 #ifdef HAVE_SYS_EPOLL_H
44 #include <sys/epoll.h>
45 #endif
46 #ifdef HAVE_POLL_H
47 #include <poll.h>
48 #endif
49 #ifdef HAVE_SYS_POLL_H
50 #include <sys/poll.h>
51 #endif
52 #ifdef HAVE_SYS_SELECT_H
53 #include <sys/select.h>
54 #endif
55 
56 #include "miscadmin.h"
57 #include "pgstat.h"
58 #include "port/atomics.h"
59 #include "portability/instr_time.h"
60 #include "postmaster/postmaster.h"
61 #include "storage/latch.h"
62 #include "storage/pmsignal.h"
63 #include "storage/shmem.h"
64 
65 /*
66  * Select the fd readiness primitive to use. Normally the "most modern"
67  * primitive supported by the OS will be used, but for testing it can be
68  * useful to manually specify the used primitive. If desired, just add a
69  * define somewhere before this block.
70  */
71 #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
72  defined(WAIT_USE_SELECT) || defined(WAIT_USE_WIN32)
73 /* don't overwrite manual choice */
74 #elif defined(HAVE_SYS_EPOLL_H)
75 #define WAIT_USE_EPOLL
76 #elif defined(HAVE_POLL)
77 #define WAIT_USE_POLL
78 #elif HAVE_SYS_SELECT_H
79 #define WAIT_USE_SELECT
80 #elif WIN32
81 #define WAIT_USE_WIN32
82 #else
83 #error "no wait set implementation available"
84 #endif
85 
86 /* typedef in latch.h */
88 {
89  int nevents; /* number of registered events */
90  int nevents_space; /* maximum number of events in this set */
91 
92  /*
93  * Array, of nevents_space length, storing the definition of events this
94  * set is waiting for.
95  */
97 
98  /*
99  * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
100  * said latch, and latch_pos the offset in the ->events array. This is
101  * useful because we check the state of the latch before performing doing
102  * syscalls related to waiting.
103  */
106 
107 #if defined(WAIT_USE_EPOLL)
108  int epoll_fd;
109  /* epoll_wait returns events in a user provided arrays, allocate once */
110  struct epoll_event *epoll_ret_events;
111 #elif defined(WAIT_USE_POLL)
112  /* poll expects events to be waited on every poll() call, prepare once */
113  struct pollfd *pollfds;
114 #elif defined(WAIT_USE_WIN32)
115 
116  /*
117  * Array of windows events. The first element always contains
118  * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
119  * event->pos + 1).
120  */
121  HANDLE *handles;
122 #endif
123 };
124 
125 #ifndef WIN32
126 /* Are we currently in WaitLatch? The signal handler would like to know. */
127 static volatile sig_atomic_t waiting = false;
128 
129 /* Read and write ends of the self-pipe */
130 static int selfpipe_readfd = -1;
131 static int selfpipe_writefd = -1;
132 
133 /* Private function prototypes */
134 static void sendSelfPipeByte(void);
135 static void drainSelfPipe(void);
136 #endif /* WIN32 */
137 
138 #if defined(WAIT_USE_EPOLL)
139 static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
140 #elif defined(WAIT_USE_POLL)
141 static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
142 #elif defined(WAIT_USE_WIN32)
143 static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
144 #endif
145 
146 static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
147  WaitEvent *occurred_events, int nevents);
148 
149 /*
150  * Initialize the process-local latch infrastructure.
151  *
152  * This must be called once during startup of any process that can wait on
153  * latches, before it issues any InitLatch() or OwnLatch() calls.
154  */
155 void
157 {
158 #ifndef WIN32
159  int pipefd[2];
160 
161  Assert(selfpipe_readfd == -1);
162 
163  /*
164  * Set up the self-pipe that allows a signal handler to wake up the
165  * select() in WaitLatch. Make the write-end non-blocking, so that
166  * SetLatch won't block if the event has already been set many times
167  * filling the kernel buffer. Make the read-end non-blocking too, so that
168  * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
169  */
170  if (pipe(pipefd) < 0)
171  elog(FATAL, "pipe() failed: %m");
172  if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0)
173  elog(FATAL, "fcntl() failed on read-end of self-pipe: %m");
174  if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) < 0)
175  elog(FATAL, "fcntl() failed on write-end of self-pipe: %m");
176 
177  selfpipe_readfd = pipefd[0];
178  selfpipe_writefd = pipefd[1];
179 #else
180  /* currently, nothing to do here for Windows */
181 #endif
182 }
183 
184 /*
185  * Initialize a backend-local latch.
186  */
187 void
188 InitLatch(volatile Latch *latch)
189 {
190  latch->is_set = false;
191  latch->owner_pid = MyProcPid;
192  latch->is_shared = false;
193 
194 #ifndef WIN32
195  /* Assert InitializeLatchSupport has been called in this process */
196  Assert(selfpipe_readfd >= 0);
197 #else
198  latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
199  if (latch->event == NULL)
200  elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
201 #endif /* WIN32 */
202 }
203 
204 /*
205  * Initialize a shared latch that can be set from other processes. The latch
206  * is initially owned by no-one; use OwnLatch to associate it with the
207  * current process.
208  *
209  * InitSharedLatch needs to be called in postmaster before forking child
210  * processes, usually right after allocating the shared memory block
211  * containing the latch with ShmemInitStruct. (The Unix implementation
212  * doesn't actually require that, but the Windows one does.) Because of
213  * this restriction, we have no concurrency issues to worry about here.
214  */
215 void
216 InitSharedLatch(volatile Latch *latch)
217 {
218 #ifdef WIN32
219  SECURITY_ATTRIBUTES sa;
220 
221  /*
222  * Set up security attributes to specify that the events are inherited.
223  */
224  ZeroMemory(&sa, sizeof(sa));
225  sa.nLength = sizeof(sa);
226  sa.bInheritHandle = TRUE;
227 
228  latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
229  if (latch->event == NULL)
230  elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
231 #endif
232 
233  latch->is_set = false;
234  latch->owner_pid = 0;
235  latch->is_shared = true;
236 }
237 
238 /*
239  * Associate a shared latch with the current process, allowing it to
240  * wait on the latch.
241  *
242  * Although there is a sanity check for latch-already-owned, we don't do
243  * any sort of locking here, meaning that we could fail to detect the error
244  * if two processes try to own the same latch at about the same time. If
245  * there is any risk of that, caller must provide an interlock to prevent it.
246  *
247  * In any process that calls OwnLatch(), make sure that
248  * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
249  * as shared latches use SIGUSR1 for inter-process communication.
250  */
251 void
252 OwnLatch(volatile Latch *latch)
253 {
254  /* Sanity checks */
255  Assert(latch->is_shared);
256 
257 #ifndef WIN32
258  /* Assert InitializeLatchSupport has been called in this process */
259  Assert(selfpipe_readfd >= 0);
260 #endif
261 
262  if (latch->owner_pid != 0)
263  elog(ERROR, "latch already owned");
264 
265  latch->owner_pid = MyProcPid;
266 }
267 
268 /*
269  * Disown a shared latch currently owned by the current process.
270  */
271 void
272 DisownLatch(volatile Latch *latch)
273 {
274  Assert(latch->is_shared);
275  Assert(latch->owner_pid == MyProcPid);
276 
277  latch->owner_pid = 0;
278 }
279 
280 /*
281  * Wait for a given latch to be set, or for postmaster death, or until timeout
282  * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
283  * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
284  * function returns immediately.
285  *
286  * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
287  * is given. Although it is declared as "long", we don't actually support
288  * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
289  * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
290  *
291  * The latch must be owned by the current process, ie. it must be a
292  * backend-local latch initialized with InitLatch, or a shared latch
293  * associated with the current process by calling OwnLatch.
294  *
295  * Returns bit mask indicating which condition(s) caused the wake-up. Note
296  * that if multiple wake-up conditions are true, there is no guarantee that
297  * we return all of them in one call, but we will return at least one.
298  */
299 int
300 WaitLatch(volatile Latch *latch, int wakeEvents, long timeout,
301  uint32 wait_event_info)
302 {
303  return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
304  wait_event_info);
305 }
306 
307 /*
308  * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
309  * conditions.
310  *
311  * When waiting on a socket, EOF and error conditions are reported by
312  * returning the socket as readable/writable or both, depending on
313  * WL_SOCKET_READABLE/WL_SOCKET_WRITEABLE being specified.
314  *
315  * NB: These days this is just a wrapper around the WaitEventSet API. When
316  * using a latch very frequently, consider creating a longer living
317  * WaitEventSet instead; that's more efficient.
318  */
319 int
320 WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
321  long timeout, uint32 wait_event_info)
322 {
323  int ret = 0;
324  int rc;
325  WaitEvent event;
327 
328  if (wakeEvents & WL_TIMEOUT)
329  Assert(timeout >= 0);
330  else
331  timeout = -1;
332 
333  if (wakeEvents & WL_LATCH_SET)
334  AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
335  (Latch *) latch, NULL);
336 
337  if (wakeEvents & WL_POSTMASTER_DEATH)
338  AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
339  NULL, NULL);
340 
341  if (wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
342  {
343  int ev;
344 
345  ev = wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
346  AddWaitEventToSet(set, ev, sock, NULL, NULL);
347  }
348 
349  rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
350 
351  if (rc == 0)
352  ret |= WL_TIMEOUT;
353  else
354  {
355  ret |= event.events & (WL_LATCH_SET |
356  WL_POSTMASTER_DEATH |
359  }
360 
361  FreeWaitEventSet(set);
362 
363  return ret;
364 }
365 
366 /*
367  * Sets a latch and wakes up anyone waiting on it.
368  *
369  * This is cheap if the latch is already set, otherwise not so much.
370  *
371  * NB: when calling this in a signal handler, be sure to save and restore
372  * errno around it. (That's standard practice in most signal handlers, of
373  * course, but we used to omit it in handlers that only set a flag.)
374  *
375  * NB: this function is called from critical sections and signal handlers so
376  * throwing an error is not a good idea.
377  */
378 void
379 SetLatch(volatile Latch *latch)
380 {
381 #ifndef WIN32
382  pid_t owner_pid;
383 #else
384  HANDLE handle;
385 #endif
386 
387  /*
388  * The memory barrier has to be placed here to ensure that any flag
389  * variables possibly changed by this process have been flushed to main
390  * memory, before we check/set is_set.
391  */
393 
394  /* Quick exit if already set */
395  if (latch->is_set)
396  return;
397 
398  latch->is_set = true;
399 
400 #ifndef WIN32
401 
402  /*
403  * See if anyone's waiting for the latch. It can be the current process if
404  * we're in a signal handler. We use the self-pipe to wake up the select()
405  * in that case. If it's another process, send a signal.
406  *
407  * Fetch owner_pid only once, in case the latch is concurrently getting
408  * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
409  * guaranteed to be true! In practice, the effective range of pid_t fits
410  * in a 32 bit integer, and so should be atomic. In the worst case, we
411  * might end up signaling the wrong process. Even then, you're very
412  * unlucky if a process with that bogus pid exists and belongs to
413  * Postgres; and PG database processes should handle excess SIGUSR1
414  * interrupts without a problem anyhow.
415  *
416  * Another sort of race condition that's possible here is for a new
417  * process to own the latch immediately after we look, so we don't signal
418  * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
419  * the standard coding convention of waiting at the bottom of their loops,
420  * not the top, so that they'll correctly process latch-setting events
421  * that happen before they enter the loop.
422  */
423  owner_pid = latch->owner_pid;
424  if (owner_pid == 0)
425  return;
426  else if (owner_pid == MyProcPid)
427  {
428  if (waiting)
430  }
431  else
432  kill(owner_pid, SIGUSR1);
433 #else
434 
435  /*
436  * See if anyone's waiting for the latch. It can be the current process if
437  * we're in a signal handler.
438  *
439  * Use a local variable here just in case somebody changes the event field
440  * concurrently (which really should not happen).
441  */
442  handle = latch->event;
443  if (handle)
444  {
445  SetEvent(handle);
446 
447  /*
448  * Note that we silently ignore any errors. We might be in a signal
449  * handler or other critical path where it's not safe to call elog().
450  */
451  }
452 #endif
453 
454 }
455 
456 /*
457  * Clear the latch. Calling WaitLatch after this will sleep, unless
458  * the latch is set again before the WaitLatch call.
459  */
460 void
461 ResetLatch(volatile Latch *latch)
462 {
463  /* Only the owner should reset the latch */
464  Assert(latch->owner_pid == MyProcPid);
465 
466  latch->is_set = false;
467 
468  /*
469  * Ensure that the write to is_set gets flushed to main memory before we
470  * examine any flag variables. Otherwise a concurrent SetLatch might
471  * falsely conclude that it needn't signal us, even though we have missed
472  * seeing some flag updates that SetLatch was supposed to inform us of.
473  */
475 }
476 
477 /*
478  * Create a WaitEventSet with space for nevents different events to wait for.
479  *
480  * These events can then be efficiently waited upon together, using
481  * WaitEventSetWait().
482  */
483 WaitEventSet *
484 CreateWaitEventSet(MemoryContext context, int nevents)
485 {
486  WaitEventSet *set;
487  char *data;
488  Size sz = 0;
489 
490  /*
491  * Use MAXALIGN size/alignment to guarantee that later uses of memory are
492  * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
493  * platforms, but earlier allocations like WaitEventSet and WaitEvent
494  * might not sized to guarantee that when purely using sizeof().
495  */
496  sz += MAXALIGN(sizeof(WaitEventSet));
497  sz += MAXALIGN(sizeof(WaitEvent) * nevents);
498 
499 #if defined(WAIT_USE_EPOLL)
500  sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
501 #elif defined(WAIT_USE_POLL)
502  sz += MAXALIGN(sizeof(struct pollfd) * nevents);
503 #elif defined(WAIT_USE_WIN32)
504  /* need space for the pgwin32_signal_event */
505  sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
506 #endif
507 
508  data = (char *) MemoryContextAllocZero(context, sz);
509 
510  set = (WaitEventSet *) data;
511  data += MAXALIGN(sizeof(WaitEventSet));
512 
513  set->events = (WaitEvent *) data;
514  data += MAXALIGN(sizeof(WaitEvent) * nevents);
515 
516 #if defined(WAIT_USE_EPOLL)
517  set->epoll_ret_events = (struct epoll_event *) data;
518  data += MAXALIGN(sizeof(struct epoll_event) * nevents);
519 #elif defined(WAIT_USE_POLL)
520  set->pollfds = (struct pollfd *) data;
521  data += MAXALIGN(sizeof(struct pollfd) * nevents);
522 #elif defined(WAIT_USE_WIN32)
523  set->handles = (HANDLE) data;
524  data += MAXALIGN(sizeof(HANDLE) * nevents);
525 #endif
526 
527  set->latch = NULL;
528  set->nevents_space = nevents;
529 
530 #if defined(WAIT_USE_EPOLL)
531  set->epoll_fd = epoll_create(nevents);
532  if (set->epoll_fd < 0)
533  elog(ERROR, "epoll_create failed: %m");
534 #elif defined(WAIT_USE_WIN32)
535 
536  /*
537  * To handle signals while waiting, we need to add a win32 specific event.
538  * We accounted for the additional event at the top of this routine. See
539  * port/win32/signal.c for more details.
540  *
541  * Note: pgwin32_signal_event should be first to ensure that it will be
542  * reported when multiple events are set. We want to guarantee that
543  * pending signals are serviced.
544  */
545  set->handles[0] = pgwin32_signal_event;
546  StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
547 #endif
548 
549  return set;
550 }
551 
552 /*
553  * Free a previously created WaitEventSet.
554  */
555 void
557 {
558 #if defined(WAIT_USE_EPOLL)
559  close(set->epoll_fd);
560 #elif defined(WAIT_USE_WIN32)
561  WaitEvent *cur_event;
562 
563  for (cur_event = set->events;
564  cur_event < (set->events + set->nevents);
565  cur_event++)
566  {
567  if (cur_event->events & WL_LATCH_SET)
568  {
569  /* uses the latch's HANDLE */
570  }
571  else if (cur_event->events & WL_POSTMASTER_DEATH)
572  {
573  /* uses PostmasterHandle */
574  }
575  else
576  {
577  /* Clean up the event object we created for the socket */
578  WSAEventSelect(cur_event->fd, NULL, 0);
579  WSACloseEvent(set->handles[cur_event->pos + 1]);
580  }
581  }
582 #endif
583 
584  pfree(set);
585 }
586 
587 /* ---
588  * Add an event to the set. Possible events are:
589  * - WL_LATCH_SET: Wait for the latch to be set
590  * - WL_POSTMASTER_DEATH: Wait for postmaster to die
591  * - WL_SOCKET_READABLE: Wait for socket to become readable
592  * can be combined in one event with WL_SOCKET_WRITEABLE
593  * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable
594  * can be combined with WL_SOCKET_READABLE
595  *
596  * Returns the offset in WaitEventSet->events (starting from 0), which can be
597  * used to modify previously added wait events using ModifyWaitEvent().
598  *
599  * In the WL_LATCH_SET case the latch must be owned by the current process,
600  * i.e. it must be a backend-local latch initialized with InitLatch, or a
601  * shared latch associated with the current process by calling OwnLatch.
602  *
603  * In the WL_SOCKET_READABLE/WRITEABLE case, EOF and error conditions are
604  * reported by returning the socket as readable/writable or both, depending on
605  * WL_SOCKET_READABLE/WRITEABLE being specified.
606  *
607  * The user_data pointer specified here will be set for the events returned
608  * by WaitEventSetWait(), allowing to easily associate additional data with
609  * events.
610  */
611 int
613  void *user_data)
614 {
615  WaitEvent *event;
616 
617  /* not enough space */
618  Assert(set->nevents < set->nevents_space);
619 
620  if (latch)
621  {
622  if (latch->owner_pid != MyProcPid)
623  elog(ERROR, "cannot wait on a latch owned by another process");
624  if (set->latch)
625  elog(ERROR, "cannot wait on more than one latch");
626  if ((events & WL_LATCH_SET) != WL_LATCH_SET)
627  elog(ERROR, "latch events only support being set");
628  }
629  else
630  {
631  if (events & WL_LATCH_SET)
632  elog(ERROR, "cannot wait on latch without a specified latch");
633  }
634 
635  /* waiting for socket readiness without a socket indicates a bug */
636  if (fd == PGINVALID_SOCKET &&
638  elog(ERROR, "cannot wait on socket event without a socket");
639 
640  event = &set->events[set->nevents];
641  event->pos = set->nevents++;
642  event->fd = fd;
643  event->events = events;
644  event->user_data = user_data;
645 #ifdef WIN32
646  event->reset = false;
647 #endif
648 
649  if (events == WL_LATCH_SET)
650  {
651  set->latch = latch;
652  set->latch_pos = event->pos;
653 #ifndef WIN32
654  event->fd = selfpipe_readfd;
655 #endif
656  }
657  else if (events == WL_POSTMASTER_DEATH)
658  {
659 #ifndef WIN32
661 #endif
662  }
663 
664  /* perform wait primitive specific initialization, if needed */
665 #if defined(WAIT_USE_EPOLL)
666  WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
667 #elif defined(WAIT_USE_POLL)
668  WaitEventAdjustPoll(set, event);
669 #elif defined(WAIT_USE_SELECT)
670  /* nothing to do */
671 #elif defined(WAIT_USE_WIN32)
672  WaitEventAdjustWin32(set, event);
673 #endif
674 
675  return event->pos;
676 }
677 
678 /*
679  * Change the event mask and, in the WL_LATCH_SET case, the latch associated
680  * with the WaitEvent.
681  *
682  * 'pos' is the id returned by AddWaitEventToSet.
683  */
684 void
685 ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
686 {
687  WaitEvent *event;
688 
689  Assert(pos < set->nevents);
690 
691  event = &set->events[pos];
692 
693  /*
694  * If neither the event mask nor the associated latch changes, return
695  * early. That's an important optimization for some sockets, where
696  * ModifyWaitEvent is frequently used to switch from waiting for reads to
697  * waiting on writes.
698  */
699  if (events == event->events &&
700  (!(event->events & WL_LATCH_SET) || set->latch == latch))
701  return;
702 
703  if (event->events & WL_LATCH_SET &&
704  events != event->events)
705  {
706  /* we could allow to disable latch events for a while */
707  elog(ERROR, "cannot modify latch event");
708  }
709 
710  if (event->events & WL_POSTMASTER_DEATH)
711  {
712  elog(ERROR, "cannot modify postmaster death event");
713  }
714 
715  /* FIXME: validate event mask */
716  event->events = events;
717 
718  if (events == WL_LATCH_SET)
719  {
720  set->latch = latch;
721  }
722 
723 #if defined(WAIT_USE_EPOLL)
724  WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
725 #elif defined(WAIT_USE_POLL)
726  WaitEventAdjustPoll(set, event);
727 #elif defined(WAIT_USE_SELECT)
728  /* nothing to do */
729 #elif defined(WAIT_USE_WIN32)
730  WaitEventAdjustWin32(set, event);
731 #endif
732 }
733 
#if defined(WAIT_USE_EPOLL)
/*
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
	struct epoll_event epoll_ev;
	int			rc;

	/* pointer to our event, returned by epoll_wait */
	epoll_ev.data.ptr = event;
	/* always wait for errors */
	epoll_ev.events = EPOLLERR | EPOLLHUP;

	/* prepare pollfd entry once */
	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		epoll_ev.events |= EPOLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		epoll_ev.events |= EPOLLIN;
	}
	else
	{
		Assert(event->fd != PGINVALID_SOCKET);

		if (event->events & WL_SOCKET_READABLE)
			epoll_ev.events |= EPOLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			epoll_ev.events |= EPOLLOUT;
	}

	/*
	 * Even though unused, we also pass epoll_ev as the data argument if
	 * EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
	 * requiring that, and actually it makes the code simpler...
	 */
	rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);

	if (rc < 0)
		ereport(ERROR,
				(errcode_for_socket_access(),
				 errmsg("epoll_ctl() failed: %m")));
}
#endif
783 
#if defined(WAIT_USE_POLL)
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
	struct pollfd *pollfd = &set->pollfds[event->pos];

	pollfd->revents = 0;
	pollfd->fd = event->fd;

	/* prepare pollfd entry once */
	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		pollfd->events = POLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		pollfd->events = POLLIN;
	}
	else
	{
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
		pollfd->events = 0;
		if (event->events & WL_SOCKET_READABLE)
			pollfd->events |= POLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			pollfd->events |= POLLOUT;
	}

	Assert(event->fd != PGINVALID_SOCKET);
}
#endif
816 
#if defined(WAIT_USE_WIN32)
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
	/* handles[0] is pgwin32_signal_event, so entries are offset by one */
	HANDLE	   *handle = &set->handles[event->pos + 1];

	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		*handle = set->latch->event;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		*handle = PostmasterHandle;
	}
	else
	{
		int			flags = FD_CLOSE;	/* always check for errors/EOF */

		if (event->events & WL_SOCKET_READABLE)
			flags |= FD_READ;
		if (event->events & WL_SOCKET_WRITEABLE)
			flags |= FD_WRITE;

		if (*handle == WSA_INVALID_EVENT)
		{
			*handle = WSACreateEvent();
			if (*handle == WSA_INVALID_EVENT)
				elog(ERROR, "failed to create event for socket: error code %u",
					 WSAGetLastError());
		}
		if (WSAEventSelect(event->fd, *handle, flags) != 0)
			elog(ERROR, "failed to set up event for socket: error code %u",
				 WSAGetLastError());

		Assert(event->fd != PGINVALID_SOCKET);
	}
}
#endif
856 
857 /*
858  * Wait for events added to the set to happen, or until the timeout is
859  * reached. At most nevents occurred events are returned.
860  *
861  * If timeout = -1, block until an event occurs; if 0, check sockets for
862  * readiness, but don't block; if > 0, block for at most timeout milliseconds.
863  *
864  * Returns the number of events occurred, or 0 if the timeout was reached.
865  *
866  * Returned events will have the fd, pos, user_data fields set to the
867  * values associated with the registered event.
868  */
869 int
870 WaitEventSetWait(WaitEventSet *set, long timeout,
871  WaitEvent *occurred_events, int nevents,
872  uint32 wait_event_info)
873 {
874  int returned_events = 0;
876  instr_time cur_time;
877  long cur_timeout = -1;
878 
879  Assert(nevents > 0);
880 
881  /*
882  * Initialize timeout if requested. We must record the current time so
883  * that we can determine the remaining timeout if interrupted.
884  */
885  if (timeout >= 0)
886  {
887  INSTR_TIME_SET_CURRENT(start_time);
888  Assert(timeout >= 0 && timeout <= INT_MAX);
889  cur_timeout = timeout;
890  }
891 
892  pgstat_report_wait_start(wait_event_info);
893 
894 #ifndef WIN32
895  waiting = true;
896 #else
897  /* Ensure that signals are serviced even if latch is already set */
899 #endif
900  while (returned_events == 0)
901  {
902  int rc;
903 
904  /*
905  * Check if the latch is set already. If so, leave the loop
906  * immediately, avoid blocking again. We don't attempt to report any
907  * other events that might also be satisfied.
908  *
909  * If someone sets the latch between this and the
910  * WaitEventSetWaitBlock() below, the setter will write a byte to the
911  * pipe (or signal us and the signal handler will do that), and the
912  * readiness routine will return immediately.
913  *
914  * On unix, If there's a pending byte in the self pipe, we'll notice
915  * whenever blocking. Only clearing the pipe in that case avoids
916  * having to drain it every time WaitLatchOrSocket() is used. Should
917  * the pipe-buffer fill up we're still ok, because the pipe is in
918  * nonblocking mode. It's unlikely for that to happen, because the
919  * self pipe isn't filled unless we're blocking (waiting = true), or
920  * from inside a signal handler in latch_sigusr1_handler().
921  *
922  * On windows, we'll also notice if there's a pending event for the
923  * latch when blocking, but there's no danger of anything filling up,
924  * as "Setting an event that is already set has no effect.".
925  *
926  * Note: we assume that the kernel calls involved in latch management
927  * will provide adequate synchronization on machines with weak memory
928  * ordering, so that we cannot miss seeing is_set if a notification
929  * has already been queued.
930  */
931  if (set->latch && set->latch->is_set)
932  {
933  occurred_events->fd = PGINVALID_SOCKET;
934  occurred_events->pos = set->latch_pos;
935  occurred_events->user_data =
936  set->events[set->latch_pos].user_data;
937  occurred_events->events = WL_LATCH_SET;
938  occurred_events++;
939  returned_events++;
940 
941  break;
942  }
943 
944  /*
945  * Wait for events using the readiness primitive chosen at the top of
946  * this file. If -1 is returned, a timeout has occurred, if 0 we have
947  * to retry, everything >= 1 is the number of returned events.
948  */
949  rc = WaitEventSetWaitBlock(set, cur_timeout,
950  occurred_events, nevents);
951 
952  if (rc == -1)
953  break; /* timeout occurred */
954  else
955  returned_events = rc;
956 
957  /* If we're not done, update cur_timeout for next iteration */
958  if (returned_events == 0 && timeout >= 0)
959  {
960  INSTR_TIME_SET_CURRENT(cur_time);
961  INSTR_TIME_SUBTRACT(cur_time, start_time);
962  cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
963  if (cur_timeout <= 0)
964  break;
965  }
966  }
967 #ifndef WIN32
968  waiting = false;
969 #endif
970 
972 
973  return returned_events;
974 }
975 
976 
977 #if defined(WAIT_USE_EPOLL)
978 
979 /*
980  * Wait using linux's epoll_wait(2).
981  *
982  * This is the preferrable wait method, as several readiness notifications are
983  * delivered, without having to iterate through all of set->events. The return
984  * epoll_event struct contain a pointer to our events, making association
985  * easy.
986  */
987 static inline int
988 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
989  WaitEvent *occurred_events, int nevents)
990 {
991  int returned_events = 0;
992  int rc;
993  WaitEvent *cur_event;
994  struct epoll_event *cur_epoll_event;
995 
996  /* Sleep */
997  rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
998  nevents, cur_timeout);
999 
1000  /* Check return code */
1001  if (rc < 0)
1002  {
1003  /* EINTR is okay, otherwise complain */
1004  if (errno != EINTR)
1005  {
1006  waiting = false;
1007  ereport(ERROR,
1009  errmsg("epoll_wait() failed: %m")));
1010  }
1011  return 0;
1012  }
1013  else if (rc == 0)
1014  {
1015  /* timeout exceeded */
1016  return -1;
1017  }
1018 
1019  /*
1020  * At least one event occurred, iterate over the returned epoll events
1021  * until they're either all processed, or we've returned all the events
1022  * the caller desired.
1023  */
1024  for (cur_epoll_event = set->epoll_ret_events;
1025  cur_epoll_event < (set->epoll_ret_events + rc) &&
1026  returned_events < nevents;
1027  cur_epoll_event++)
1028  {
1029  /* epoll's data pointer is set to the associated WaitEvent */
1030  cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1031 
1032  occurred_events->pos = cur_event->pos;
1033  occurred_events->user_data = cur_event->user_data;
1034  occurred_events->events = 0;
1035 
1036  if (cur_event->events == WL_LATCH_SET &&
1037  cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1038  {
1039  /* There's data in the self-pipe, clear it. */
1040  drainSelfPipe();
1041 
1042  if (set->latch->is_set)
1043  {
1044  occurred_events->fd = PGINVALID_SOCKET;
1045  occurred_events->events = WL_LATCH_SET;
1046  occurred_events++;
1047  returned_events++;
1048  }
1049  }
1050  else if (cur_event->events == WL_POSTMASTER_DEATH &&
1051  cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1052  {
1053  /*
1054  * We expect an EPOLLHUP when the remote end is closed, but
1055  * because we don't expect the pipe to become readable or to have
1056  * any errors either, treat those cases as postmaster death, too.
1057  *
1058  * As explained in the WAIT_USE_SELECT implementation, select(2)
1059  * may spuriously return. Be paranoid about that here too, a
1060  * spurious WL_POSTMASTER_DEATH would be painful.
1061  */
1062  if (!PostmasterIsAlive())
1063  {
1064  occurred_events->fd = PGINVALID_SOCKET;
1065  occurred_events->events = WL_POSTMASTER_DEATH;
1066  occurred_events++;
1067  returned_events++;
1068  }
1069  }
1070  else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1071  {
1072  Assert(cur_event->fd != PGINVALID_SOCKET);
1073 
1074  if ((cur_event->events & WL_SOCKET_READABLE) &&
1075  (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1076  {
1077  /* data available in socket, or EOF */
1078  occurred_events->events |= WL_SOCKET_READABLE;
1079  }
1080 
1081  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1082  (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1083  {
1084  /* writable, or EOF */
1085  occurred_events->events |= WL_SOCKET_WRITEABLE;
1086  }
1087 
1088  if (occurred_events->events != 0)
1089  {
1090  occurred_events->fd = cur_event->fd;
1091  occurred_events++;
1092  returned_events++;
1093  }
1094  }
1095  }
1096 
1097  return returned_events;
1098 }
1099 
1100 #elif defined(WAIT_USE_POLL)
1101 
1102 /*
1103  * Wait using poll(2).
1104  *
1105  * This allows to receive readiness notifications for several events at once,
1106  * but requires iterating through all of set->pollfds.
1107  */
1108 static inline int
1109 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1110  WaitEvent *occurred_events, int nevents)
1111 {
1112  int returned_events = 0;
1113  int rc;
1114  WaitEvent *cur_event;
1115  struct pollfd *cur_pollfd;
1116 
1117  /* Sleep */
1118  rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1119 
1120  /* Check return code */
1121  if (rc < 0)
1122  {
1123  /* EINTR is okay, otherwise complain */
1124  if (errno != EINTR)
1125  {
1126  waiting = false;
1127  ereport(ERROR,
1129  errmsg("poll() failed: %m")));
1130  }
1131  return 0;
1132  }
1133  else if (rc == 0)
1134  {
1135  /* timeout exceeded */
1136  return -1;
1137  }
1138 
1139  for (cur_event = set->events, cur_pollfd = set->pollfds;
1140  cur_event < (set->events + set->nevents) &&
1141  returned_events < nevents;
1142  cur_event++, cur_pollfd++)
1143  {
1144  /* no activity on this FD, skip */
1145  if (cur_pollfd->revents == 0)
1146  continue;
1147 
1148  occurred_events->pos = cur_event->pos;
1149  occurred_events->user_data = cur_event->user_data;
1150  occurred_events->events = 0;
1151 
1152  if (cur_event->events == WL_LATCH_SET &&
1153  (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1154  {
1155  /* There's data in the self-pipe, clear it. */
1156  drainSelfPipe();
1157 
1158  if (set->latch->is_set)
1159  {
1160  occurred_events->fd = PGINVALID_SOCKET;
1161  occurred_events->events = WL_LATCH_SET;
1162  occurred_events++;
1163  returned_events++;
1164  }
1165  }
1166  else if (cur_event->events == WL_POSTMASTER_DEATH &&
1167  (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1168  {
1169  /*
1170  * We expect an POLLHUP when the remote end is closed, but because
1171  * we don't expect the pipe to become readable or to have any
1172  * errors either, treat those cases as postmaster death, too.
1173  *
1174  * As explained in the WAIT_USE_SELECT implementation, select(2)
1175  * may spuriously return. Be paranoid about that here too, a
1176  * spurious WL_POSTMASTER_DEATH would be painful.
1177  */
1178  if (!PostmasterIsAlive())
1179  {
1180  occurred_events->fd = PGINVALID_SOCKET;
1181  occurred_events->events = WL_POSTMASTER_DEATH;
1182  occurred_events++;
1183  returned_events++;
1184  }
1185  }
1186  else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1187  {
1188  int errflags = POLLHUP | POLLERR | POLLNVAL;
1189 
1190  Assert(cur_event->fd >= PGINVALID_SOCKET);
1191 
1192  if ((cur_event->events & WL_SOCKET_READABLE) &&
1193  (cur_pollfd->revents & (POLLIN | errflags)))
1194  {
1195  /* data available in socket, or EOF */
1196  occurred_events->events |= WL_SOCKET_READABLE;
1197  }
1198 
1199  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1200  (cur_pollfd->revents & (POLLOUT | errflags)))
1201  {
1202  /* writeable, or EOF */
1203  occurred_events->events |= WL_SOCKET_WRITEABLE;
1204  }
1205 
1206  if (occurred_events->events != 0)
1207  {
1208  occurred_events->fd = cur_event->fd;
1209  occurred_events++;
1210  returned_events++;
1211  }
1212  }
1213  }
1214  return returned_events;
1215 }
1216 
1217 #elif defined(WAIT_USE_SELECT)
1218 
1219 /*
1220  * Wait using select(2).
1221  *
1222  * XXX: On at least older linux kernels select(), in violation of POSIX,
1223  * doesn't reliably return a socket as writable if closed - but we rely on
1224  * that. So far all the known cases of this problem are on platforms that also
1225  * provide a poll() implementation without that bug. If we find one where
1226  * that's not the case, we'll need to add a workaround.
1227  */
1228 static inline int
1229 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1230  WaitEvent *occurred_events, int nevents)
1231 {
1232  int returned_events = 0;
1233  int rc;
1234  WaitEvent *cur_event;
1235  fd_set input_mask;
1236  fd_set output_mask;
1237  int hifd;
1238  struct timeval tv;
1239  struct timeval *tvp = NULL;
1240 
1241  FD_ZERO(&input_mask);
1242  FD_ZERO(&output_mask);
1243 
1244  /*
1245  * Prepare input/output masks. We do so every loop iteration as there's no
1246  * entirely portable way to copy fd_sets.
1247  */
1248  for (cur_event = set->events;
1249  cur_event < (set->events + set->nevents);
1250  cur_event++)
1251  {
1252  if (cur_event->events == WL_LATCH_SET)
1253  FD_SET(cur_event->fd, &input_mask);
1254  else if (cur_event->events == WL_POSTMASTER_DEATH)
1255  FD_SET(cur_event->fd, &input_mask);
1256  else
1257  {
1259  if (cur_event->events == WL_SOCKET_READABLE)
1260  FD_SET(cur_event->fd, &input_mask);
1261  else if (cur_event->events == WL_SOCKET_WRITEABLE)
1262  FD_SET(cur_event->fd, &output_mask);
1263  }
1264 
1265  if (cur_event->fd > hifd)
1266  hifd = cur_event->fd;
1267  }
1268 
1269  /* Sleep */
1270  if (cur_timeout >= 0)
1271  {
1272  tv.tv_sec = cur_timeout / 1000L;
1273  tv.tv_usec = (cur_timeout % 1000L) * 1000L;
1274  tvp = &tv;
1275  }
1276  rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
1277 
1278  /* Check return code */
1279  if (rc < 0)
1280  {
1281  /* EINTR is okay, otherwise complain */
1282  if (errno != EINTR)
1283  {
1284  waiting = false;
1285  ereport(ERROR,
1287  errmsg("select() failed: %m")));
1288  }
1289  return 0; /* retry */
1290  }
1291  else if (rc == 0)
1292  {
1293  /* timeout exceeded */
1294  return -1;
1295  }
1296 
1297  /*
1298  * To associate events with select's masks, we have to check the status of
1299  * the file descriptors associated with an event; by looping through all
1300  * events.
1301  */
1302  for (cur_event = set->events;
1303  cur_event < (set->events + set->nevents)
1304  && returned_events < nevents;
1305  cur_event++)
1306  {
1307  occurred_events->pos = cur_event->pos;
1308  occurred_events->user_data = cur_event->user_data;
1309  occurred_events->events = 0;
1310 
1311  if (cur_event->events == WL_LATCH_SET &&
1312  FD_ISSET(cur_event->fd, &input_mask))
1313  {
1314  /* There's data in the self-pipe, clear it. */
1315  drainSelfPipe();
1316 
1317  if (set->latch->is_set)
1318  {
1319  occurred_events->fd = PGINVALID_SOCKET;
1320  occurred_events->events = WL_LATCH_SET;
1321  occurred_events++;
1322  returned_events++;
1323  }
1324  }
1325  else if (cur_event->events == WL_POSTMASTER_DEATH &&
1326  FD_ISSET(cur_event->fd, &input_mask))
1327  {
1328  /*
1329  * According to the select(2) man page on Linux, select(2) may
1330  * spuriously return and report a file descriptor as readable,
1331  * when it's not; and presumably so can poll(2). It's not clear
1332  * that the relevant cases would ever apply to the postmaster
1333  * pipe, but since the consequences of falsely returning
1334  * WL_POSTMASTER_DEATH could be pretty unpleasant, we take the
1335  * trouble to positively verify EOF with PostmasterIsAlive().
1336  */
1337  if (!PostmasterIsAlive())
1338  {
1339  occurred_events->fd = PGINVALID_SOCKET;
1340  occurred_events->events = WL_POSTMASTER_DEATH;
1341  occurred_events++;
1342  returned_events++;
1343  }
1344  }
1345  else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1346  {
1347  Assert(cur_event->fd != PGINVALID_SOCKET);
1348 
1349  if ((cur_event->events & WL_SOCKET_READABLE) &&
1350  FD_ISSET(cur_event->fd, &input_mask))
1351  {
1352  /* data available in socket, or EOF */
1353  occurred_events->events |= WL_SOCKET_READABLE;
1354  }
1355 
1356  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1357  FD_ISSET(cur_event->fd, &output_mask))
1358  {
1359  /* socket is writeable, or EOF */
1360  occurred_events->events |= WL_SOCKET_WRITEABLE;
1361  }
1362 
1363  if (occurred_events->events != 0)
1364  {
1365  occurred_events->fd = cur_event->fd;
1366  occurred_events++;
1367  returned_events++;
1368  }
1369  }
1370  }
1371  return returned_events;
1372 }
1373 
1374 #elif defined(WAIT_USE_WIN32)
1375 
1376 /*
1377  * Wait using Windows' WaitForMultipleObjects().
1378  *
1379  * Unfortunately this will only ever return a single readiness notification at
1380  * a time. Note that while the official documentation for
1381  * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
1382  * with a single bWaitAll = FALSE call,
1383  * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
1384  * that only one event is "consumed".
1385  */
1386 static inline int
1387 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1388  WaitEvent *occurred_events, int nevents)
1389 {
1390  int returned_events = 0;
1391  DWORD rc;
1392  WaitEvent *cur_event;
1393 
1394  /* Reset any wait events that need it */
1395  for (cur_event = set->events;
1396  cur_event < (set->events + set->nevents);
1397  cur_event++)
1398  {
1399  if (cur_event->reset)
1400  {
1401  WaitEventAdjustWin32(set, cur_event);
1402  cur_event->reset = false;
1403  }
1404 
1405  /*
1406  * Windows does not guarantee to log an FD_WRITE network event
1407  * indicating that more data can be sent unless the previous send()
1408  * failed with WSAEWOULDBLOCK. While our caller might well have made
1409  * such a call, we cannot assume that here. Therefore, if waiting for
1410  * write-ready, force the issue by doing a dummy send(). If the dummy
1411  * send() succeeds, assume that the socket is in fact write-ready, and
1412  * return immediately. Also, if it fails with something other than
1413  * WSAEWOULDBLOCK, return a write-ready indication to let our caller
1414  * deal with the error condition.
1415  */
1416  if (cur_event->events & WL_SOCKET_WRITEABLE)
1417  {
1418  char c;
1419  WSABUF buf;
1420  DWORD sent;
1421  int r;
1422 
1423  buf.buf = &c;
1424  buf.len = 0;
1425 
1426  r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
1427  if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
1428  {
1429  occurred_events->pos = cur_event->pos;
1430  occurred_events->user_data = cur_event->user_data;
1431  occurred_events->events = WL_SOCKET_WRITEABLE;
1432  occurred_events->fd = cur_event->fd;
1433  return 1;
1434  }
1435  }
1436  }
1437 
1438  /*
1439  * Sleep.
1440  *
1441  * Need to wait for ->nevents + 1, because signal handle is in [0].
1442  */
1443  rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
1444  cur_timeout);
1445 
1446  /* Check return code */
1447  if (rc == WAIT_FAILED)
1448  elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
1449  GetLastError());
1450  else if (rc == WAIT_TIMEOUT)
1451  {
1452  /* timeout exceeded */
1453  return -1;
1454  }
1455 
1456  if (rc == WAIT_OBJECT_0)
1457  {
1458  /* Service newly-arrived signals */
1460  return 0; /* retry */
1461  }
1462 
1463  /*
1464  * With an offset of one, due to the always present pgwin32_signal_event,
1465  * the handle offset directly corresponds to a wait event.
1466  */
1467  cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
1468 
1469  occurred_events->pos = cur_event->pos;
1470  occurred_events->user_data = cur_event->user_data;
1471  occurred_events->events = 0;
1472 
1473  if (cur_event->events == WL_LATCH_SET)
1474  {
1475  if (!ResetEvent(set->latch->event))
1476  elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
1477 
1478  if (set->latch->is_set)
1479  {
1480  occurred_events->fd = PGINVALID_SOCKET;
1481  occurred_events->events = WL_LATCH_SET;
1482  occurred_events++;
1483  returned_events++;
1484  }
1485  }
1486  else if (cur_event->events == WL_POSTMASTER_DEATH)
1487  {
1488  /*
1489  * Postmaster apparently died. Since the consequences of falsely
1490  * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
1491  * the trouble to positively verify this with PostmasterIsAlive(),
1492  * even though there is no known reason to think that the event could
1493  * be falsely set on Windows.
1494  */
1495  if (!PostmasterIsAlive())
1496  {
1497  occurred_events->fd = PGINVALID_SOCKET;
1498  occurred_events->events = WL_POSTMASTER_DEATH;
1499  occurred_events++;
1500  returned_events++;
1501  }
1502  }
1503  else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1504  {
1505  WSANETWORKEVENTS resEvents;
1506  HANDLE handle = set->handles[cur_event->pos + 1];
1507 
1508  Assert(cur_event->fd);
1509 
1510  occurred_events->fd = cur_event->fd;
1511 
1512  ZeroMemory(&resEvents, sizeof(resEvents));
1513  if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
1514  elog(ERROR, "failed to enumerate network events: error code %u",
1515  WSAGetLastError());
1516  if ((cur_event->events & WL_SOCKET_READABLE) &&
1517  (resEvents.lNetworkEvents & FD_READ))
1518  {
1519  /* data available in socket */
1520  occurred_events->events |= WL_SOCKET_READABLE;
1521 
1522  /*------
1523  * WaitForMultipleObjects doesn't guarantee that a read event will
1524  * be returned if the latch is set at the same time. Even if it
1525  * did, the caller might drop that event expecting it to reoccur
1526  * on next call. So, we must force the event to be reset if this
1527  * WaitEventSet is used again in order to avoid an indefinite
1528  * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
1529  * for the behavior of socket events.
1530  *------
1531  */
1532  cur_event->reset = true;
1533  }
1534  if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1535  (resEvents.lNetworkEvents & FD_WRITE))
1536  {
1537  /* writeable */
1538  occurred_events->events |= WL_SOCKET_WRITEABLE;
1539  }
1540  if (resEvents.lNetworkEvents & FD_CLOSE)
1541  {
1542  /* EOF */
1543  if (cur_event->events & WL_SOCKET_READABLE)
1544  occurred_events->events |= WL_SOCKET_READABLE;
1545  if (cur_event->events & WL_SOCKET_WRITEABLE)
1546  occurred_events->events |= WL_SOCKET_WRITEABLE;
1547  }
1548 
1549  if (occurred_events->events != 0)
1550  {
1551  occurred_events++;
1552  returned_events++;
1553  }
1554  }
1555 
1556  return returned_events;
1557 }
1558 #endif
1559 
1560 /*
1561  * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
1562  *
1563  * Wake up WaitLatch, if we're waiting. (We might not be, since SIGUSR1 is
1564  * overloaded for multiple purposes; or we might not have reached WaitLatch
1565  * yet, in which case we don't need to fill the pipe either.)
1566  *
1567  * NB: when calling this in a signal handler, be sure to save and restore
1568  * errno around it.
1569  */
1570 #ifndef WIN32
1571 void
1573 {
1574  if (waiting)
1575  sendSelfPipeByte();
1576 }
1577 #endif /* !WIN32 */
1578 
1579 /* Send one byte to the self-pipe, to wake up WaitLatch */
1580 #ifndef WIN32
1581 static void
1583 {
1584  int rc;
1585  char dummy = 0;
1586 
1587 retry:
1588  rc = write(selfpipe_writefd, &dummy, 1);
1589  if (rc < 0)
1590  {
1591  /* If interrupted by signal, just retry */
1592  if (errno == EINTR)
1593  goto retry;
1594 
1595  /*
1596  * If the pipe is full, we don't need to retry, the data that's there
1597  * already is enough to wake up WaitLatch.
1598  */
1599  if (errno == EAGAIN || errno == EWOULDBLOCK)
1600  return;
1601 
1602  /*
1603  * Oops, the write() failed for some other reason. We might be in a
1604  * signal handler, so it's not safe to elog(). We have no choice but
1605  * silently ignore the error.
1606  */
1607  return;
1608  }
1609 }
1610 #endif /* !WIN32 */
1611 
1612 /*
1613  * Read all available data from the self-pipe
1614  *
1615  * Note: this is only called when waiting = true. If it fails and doesn't
1616  * return, it must reset that flag first (though ideally, this will never
1617  * happen).
1618  */
1619 #ifndef WIN32
1620 static void
1622 {
1623  /*
1624  * There shouldn't normally be more than one byte in the pipe, or maybe a
1625  * few bytes if multiple processes run SetLatch at the same instant.
1626  */
1627  char buf[16];
1628  int rc;
1629 
1630  for (;;)
1631  {
1632  rc = read(selfpipe_readfd, buf, sizeof(buf));
1633  if (rc < 0)
1634  {
1635  if (errno == EAGAIN || errno == EWOULDBLOCK)
1636  break; /* the pipe is empty */
1637  else if (errno == EINTR)
1638  continue; /* retry */
1639  else
1640  {
1641  waiting = false;
1642  elog(ERROR, "read() on self-pipe failed: %m");
1643  }
1644  }
1645  else if (rc == 0)
1646  {
1647  waiting = false;
1648  elog(ERROR, "unexpected EOF on self-pipe");
1649  }
1650  else if (rc < sizeof(buf))
1651  {
1652  /* we successfully drained the pipe; no need to read() again */
1653  break;
1654  }
1655  /* else buffer wasn't big enough, so read again */
1656  }
1657 }
1658 #endif /* !WIN32 */
#define EWOULDBLOCK
Definition: win32.h:301
int latch_pos
Definition: latch.c:105
#define SIGUSR1
Definition: win32.h:211
#define WL_SOCKET_WRITEABLE
Definition: latch.h:126
pgsocket fd
Definition: latch.h:134
int MyProcPid
Definition: globals.c:38
int pos
Definition: latch.h:132
void FreeWaitEventSet(WaitEventSet *set)
Definition: latch.c:556
static int selfpipe_writefd
Definition: latch.c:131
#define WL_TIMEOUT
Definition: latch.h:127
int AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, void *user_data)
Definition: latch.c:612
#define write(a, b, c)
Definition: win32.h:19
bool is_shared
Definition: latch.h:113
#define INSTR_TIME_GET_MILLISEC(t)
Definition: instr_time.h:199
struct timeval instr_time
Definition: instr_time.h:147
void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
Definition: latch.c:685
static void drainSelfPipe(void)
Definition: latch.c:1621
#define select(n, r, w, e, timeout)
Definition: win32.h:384
#define WL_SOCKET_READABLE
Definition: latch.h:125
void ResetLatch(volatile Latch *latch)
Definition: latch.c:461
static int fd(const char *x, int i)
Definition: preproc-init.c:105
static time_t start_time
Definition: pg_ctl.c:91
#define EAGAIN
Definition: win32.h:293
WaitEventSet * CreateWaitEventSet(MemoryContext context, int nevents)
Definition: latch.c:484
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:757
HANDLE pgwin32_signal_event
Definition: signal.c:27
int WaitLatch(volatile Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:300
void pfree(void *pointer)
Definition: mcxt.c:950
void pgwin32_dispatch_queued_signals(void)
Definition: signal.c:107
#define ERROR
Definition: elog.h:43
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:167
#define FALSE
Definition: c.h:221
#define FATAL
Definition: elog.h:52
uint32 events
Definition: latch.h:133
static int selfpipe_readfd
Definition: latch.c:130
Definition: latch.h:110
static int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, WaitEvent *occurred_events, int nevents)
char * c
static char * buf
Definition: pg_test_fsync.c:65
bool PostmasterIsAlive(void)
Definition: pmsignal.c:272
unsigned int uint32
Definition: c.h:268
int pgsocket
Definition: port.h:22
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1231
void OwnLatch(volatile Latch *latch)
Definition: latch.c:252
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
#define ereport(elevel, rest)
Definition: elog.h:122
int errcode_for_socket_access(void)
Definition: elog.c:669
int nevents
Definition: latch.c:89
int postmaster_alive_fds[2]
Definition: postmaster.c:556
static void sendSelfPipeByte(void)
Definition: latch.c:1582
#define WL_POSTMASTER_DEATH
Definition: latch.h:128
#define PGINVALID_SOCKET
Definition: port.h:24
#define EINTR
Definition: win32.h:295
void InitializeLatchSupport(void)
Definition: latch.c:156
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:742
#define pg_memory_barrier()
Definition: atomics.h:147
void SetLatch(volatile Latch *latch)
Definition: latch.c:379
#define NULL
Definition: c.h:229
#define Assert(condition)
Definition: c.h:675
int WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, long timeout, uint32 wait_event_info)
Definition: latch.c:320
WaitEvent * events
Definition: latch.c:96
size_t Size
Definition: c.h:356
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1207
#define MAXALIGN(LEN)
Definition: c.h:588
void InitLatch(volatile Latch *latch)
Definition: latch.c:188
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:153
void * user_data
Definition: latch.h:135
int nevents_space
Definition: latch.c:90
int errmsg(const char *fmt,...)
Definition: elog.c:797
int owner_pid
Definition: latch.h:114
void DisownLatch(volatile Latch *latch)
Definition: latch.c:272
sig_atomic_t is_set
Definition: latch.h:112
#define TRUE
Definition: c.h:217
#define elog
Definition: elog.h:219
#define close(a)
Definition: win32.h:17
void latch_sigusr1_handler(void)
Definition: latch.c:1572
void InitSharedLatch(volatile Latch *latch)
Definition: latch.c:216
Latch * latch
Definition: latch.c:104
#define WL_LATCH_SET
Definition: latch.h:124
static volatile sig_atomic_t waiting
Definition: latch.c:127
#define POSTMASTER_FD_WATCH
Definition: postmaster.h:42
#define read(a, b, c)
Definition: win32.h:18
int WaitEventSetWait(WaitEventSet *set, long timeout, WaitEvent *occurred_events, int nevents, uint32 wait_event_info)
Definition: latch.c:870