/*-------------------------------------------------------------------------
 *
 * latch.c
 *    Routines for inter-process latches
 *
 * The Unix implementation uses the so-called self-pipe trick to overcome
 * the race condition involved with select() and setting a global flag
 * in the signal handler. When a latch is set and the current process
 * is waiting for it, the signal handler wakes up the select() in
 * WaitLatch by writing a byte to a pipe. A signal by itself doesn't
 * interrupt select() on all platforms, and even on platforms where it
 * does, a signal that arrives just before the select() call does not
 * prevent the select() from entering sleep. An incoming byte on a pipe
 * however reliably interrupts the sleep, and causes select() to return
 * immediately even if the signal arrives before select() begins.
 *
 * (Actually, we prefer epoll_wait() over poll() over select() where
 * available, but the same comments apply.)
 *
 * When SetLatch is called from the same process that owns the latch,
 * SetLatch writes the byte directly to the pipe. If it's owned by another
 * process, SIGUSR1 is sent and the signal handler in the waiting process
 * writes the byte to the pipe on behalf of the signaling process.
 *
 * The Windows implementation uses Windows events that are inherited by
 * all postmaster child processes.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/storage/ipc/latch.c
 *
 *-------------------------------------------------------------------------
 */
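/*
 * Minimal self-contained sketch of the self-pipe trick described above
 * (illustrative only, not part of this file; the names are made up).
 * The handler's write() is async-signal-safe and makes the read end of
 * the pipe readable, so a select() sleeping on that fd wakes up even if
 * the signal was delivered just before select() was entered:
 *
 *     static int wakeup_pipe[2];      // created with pipe() at startup
 *
 *     static void
 *     wakeup_handler(SIGNAL_ARGS)
 *     {
 *         char        b = 0;
 *
 *         (void) write(wakeup_pipe[1], &b, 1);
 *     }
 *
 * The waiting side includes wakeup_pipe[0] in its select() read set and
 * drains the pipe after waking, which is exactly what the WaitEventSet
 * code below does with selfpipe_readfd.
 */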
#include "postgres.h"

#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>
#ifdef HAVE_SYS_EPOLL_H
#include <sys/epoll.h>
#endif
#ifdef HAVE_POLL_H
#include <poll.h>
#endif
#ifdef HAVE_SYS_POLL_H
#include <sys/poll.h>
#endif
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif

#include "miscadmin.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "portability/instr_time.h"
#include "postmaster/postmaster.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/shmem.h"
/*
 * Select the fd readiness primitive to use. Normally the "most modern"
 * primitive supported by the OS will be used, but for testing it can be
 * useful to manually specify the used primitive. If desired, just add a
 * define somewhere before this block.
 */
#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
    defined(WAIT_USE_SELECT) || defined(WAIT_USE_WIN32)
/* don't overwrite manual choice */
#elif defined(HAVE_SYS_EPOLL_H)
#define WAIT_USE_EPOLL
#elif defined(HAVE_POLL)
#define WAIT_USE_POLL
#elif defined(HAVE_SYS_SELECT_H)
#define WAIT_USE_SELECT
#elif defined(WIN32)
#define WAIT_USE_WIN32
#else
#error "no wait set implementation available"
#endif
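/*
 * For example, to exercise the poll() code path on a platform where
 * epoll would normally be chosen, one could add (before the block above,
 * or via the compiler command line):
 *
 *     #define WAIT_USE_POLL
 */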
/* typedef in latch.h */
struct WaitEventSet
{
    int         nevents;        /* number of registered events */
    int         nevents_space;  /* maximum number of events in this set */

    /*
     * Array, of nevents_space length, storing the definition of events this
     * set is waiting for.
     */
    WaitEvent  *events;

    /*
     * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
     * said latch, and latch_pos the offset in the ->events array. This is
     * useful because we check the state of the latch before performing
     * syscalls related to waiting.
     */
    Latch      *latch;
    int         latch_pos;

#if defined(WAIT_USE_EPOLL)
    int         epoll_fd;
    /* epoll_wait returns events in a user provided array, allocate once */
    struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_POLL)
    /* poll expects events to be waited on every poll() call, prepare once */
    struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

    /*
     * Array of windows events. The first element always contains
     * pgwin32_signal_event, so the remaining elements are offset by one
     * (i.e. event->pos + 1).
     */
    HANDLE     *handles;
#endif
};
#ifndef WIN32
/* Are we currently in WaitLatch? The signal handler would like to know. */
static volatile sig_atomic_t waiting = false;

/* Read and write ends of the self-pipe */
static int  selfpipe_readfd = -1;
static int  selfpipe_writefd = -1;

/* Private function prototypes */
static void sendSelfPipeByte(void);
static void drainSelfPipe(void);
#endif   /* WIN32 */

#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
#endif

static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                      WaitEvent *occurred_events, int nevents);
/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 */
void
InitializeLatchSupport(void)
{
#ifndef WIN32
    int         pipefd[2];

    Assert(selfpipe_readfd == -1);

    /*
     * Set up the self-pipe that allows a signal handler to wake up the
     * select() in WaitLatch. Make the write-end non-blocking, so that
     * SetLatch won't block if the event has already been set many times
     * filling the kernel buffer. Make the read-end non-blocking too, so that
     * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
     */
    if (pipe(pipefd) < 0)
        elog(FATAL, "pipe() failed: %m");
    if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0)
        elog(FATAL, "fcntl() failed on read-end of self-pipe: %m");
    if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) < 0)
        elog(FATAL, "fcntl() failed on write-end of self-pipe: %m");

    selfpipe_readfd = pipefd[0];
    selfpipe_writefd = pipefd[1];
#else
    /* currently, nothing to do here for Windows */
#endif
}
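/*
 * Sketch of the required startup ordering (illustrative; the latch
 * variable is a placeholder):
 *
 *     InitializeLatchSupport();       // once per process, before any latch
 *     InitLatch(&my_local_latch);     // or OwnLatch() on a shared latch
 */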

/*
 * Initialize a backend-local latch.
 */
void
InitLatch(volatile Latch *latch)
{
    latch->is_set = false;
    latch->owner_pid = MyProcPid;
    latch->is_shared = false;

#ifndef WIN32
    /* Assert InitializeLatchSupport has been called in this process */
    Assert(selfpipe_readfd >= 0);
#else
    latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
    if (latch->event == NULL)
        elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif   /* WIN32 */
}

/*
 * Initialize a shared latch that can be set from other processes. The latch
 * is initially owned by no-one; use OwnLatch to associate it with the
 * current process.
 *
 * InitSharedLatch needs to be called in postmaster before forking child
 * processes, usually right after allocating the shared memory block
 * containing the latch with ShmemInitStruct. (The Unix implementation
 * doesn't actually require that, but the Windows one does.) Because of
 * this restriction, we have no concurrency issues to worry about here.
 */
void
InitSharedLatch(volatile Latch *latch)
{
#ifdef WIN32
    SECURITY_ATTRIBUTES sa;

    /*
     * Set up security attributes to specify that the events are inherited.
     */
    ZeroMemory(&sa, sizeof(sa));
    sa.nLength = sizeof(sa);
    sa.bInheritHandle = TRUE;

    latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
    if (latch->event == NULL)
        elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif

    latch->is_set = false;
    latch->owner_pid = 0;
    latch->is_shared = true;
}

/*
 * Associate a shared latch with the current process, allowing it to
 * wait on the latch.
 *
 * Although there is a sanity check for latch-already-owned, we don't do
 * any sort of locking here, meaning that we could fail to detect the error
 * if two processes try to own the same latch at about the same time. If
 * there is any risk of that, caller must provide an interlock to prevent it.
 *
 * In any process that calls OwnLatch(), make sure that
 * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
 * as shared latches use SIGUSR1 for inter-process communication.
 */
void
OwnLatch(volatile Latch *latch)
{
    /* Sanity checks */
    Assert(latch->is_shared);

#ifndef WIN32
    /* Assert InitializeLatchSupport has been called in this process */
    Assert(selfpipe_readfd >= 0);
#endif

    if (latch->owner_pid != 0)
        elog(ERROR, "latch already owned");

    latch->owner_pid = MyProcPid;
}
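/*
 * Hedged sketch of the handler wiring the comment above asks for (the
 * handler name is illustrative; real backends route this through their
 * SIGUSR1 handler, e.g. procsignal_sigusr1_handler):
 *
 *     static void
 *     my_sigusr1_handler(SIGNAL_ARGS)
 *     {
 *         int         save_errno = errno;
 *
 *         latch_sigusr1_handler();    // forwards the wakeup to the pipe
 *         errno = save_errno;
 *     }
 */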

/*
 * Disown a shared latch currently owned by the current process.
 */
void
DisownLatch(volatile Latch *latch)
{
    Assert(latch->is_shared);
    Assert(latch->owner_pid == MyProcPid);

    latch->owner_pid = 0;
}

/*
 * Wait for a given latch to be set, or for postmaster death, or until timeout
 * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
 * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
 * function returns immediately.
 *
 * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
 * is given. Although it is declared as "long", we don't actually support
 * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
 * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
 *
 * The latch must be owned by the current process, i.e. it must be a
 * backend-local latch initialized with InitLatch, or a shared latch
 * associated with the current process by calling OwnLatch.
 *
 * Returns bit mask indicating which condition(s) caused the wake-up. Note
 * that if multiple wake-up conditions are true, there is no guarantee that
 * we return all of them in one call, but we will return at least one.
 */
int
WaitLatch(volatile Latch *latch, int wakeEvents, long timeout,
          uint32 wait_event_info)
{
    return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
                             wait_event_info);
}
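/*
 * The canonical wait loop, adapted from the API notes in storage/latch.h
 * (sketch; the work test and the wait_event_info value are placeholders):
 *
 *     for (;;)
 *     {
 *         ResetLatch(MyLatch);
 *
 *         if (there is work to do)
 *             DoStuff();
 *
 *         (void) WaitLatch(MyLatch, WL_LATCH_SET, -1, 0);
 *     }
 *
 * Waiting at the bottom of the loop (not the top) is what makes a
 * SetLatch() arriving between ResetLatch() and WaitLatch() safe: the
 * latch is then already set, so the next WaitLatch() returns at once.
 */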

/*
 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
 * conditions.
 *
 * When waiting on a socket, EOF and error conditions are reported by
 * returning the socket as readable/writable or both, depending on
 * WL_SOCKET_READABLE/WL_SOCKET_WRITEABLE being specified.
 *
 * NB: These days this is just a wrapper around the WaitEventSet API. When
 * using a latch very frequently, consider creating a longer living
 * WaitEventSet instead; that's more efficient.
 */
int
WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
                  long timeout, uint32 wait_event_info)
{
    int         ret = 0;
    int         rc;
    WaitEvent   event;
    WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);

    if (wakeEvents & WL_TIMEOUT)
        Assert(timeout >= 0);
    else
        timeout = -1;

    if (wakeEvents & WL_LATCH_SET)
        AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
                          (Latch *) latch, NULL);

    if (wakeEvents & WL_POSTMASTER_DEATH)
        AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
                          NULL, NULL);

    if (wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
    {
        int         ev;

        ev = wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
        AddWaitEventToSet(set, ev, sock, NULL, NULL);
    }

    rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);

    if (rc == 0)
        ret |= WL_TIMEOUT;
    else
    {
        ret |= event.events & (WL_LATCH_SET |
                               WL_POSTMASTER_DEATH |
                               WL_SOCKET_READABLE |
                               WL_SOCKET_WRITEABLE);
    }

    FreeWaitEventSet(set);

    return ret;
}
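/*
 * Sketch of the longer-lived alternative the NB above recommends
 * (illustrative only; client_sock and handle_input are made up, and
 * error handling is elided):
 *
 *     WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 2);
 *     WaitEvent   occurred;
 *
 *     AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
 *                       MyLatch, NULL);
 *     AddWaitEventToSet(set, WL_SOCKET_READABLE, client_sock,
 *                       NULL, NULL);
 *
 *     for (;;)
 *     {
 *         ResetLatch(MyLatch);
 *         if (WaitEventSetWait(set, -1, &occurred, 1, 0) == 1 &&
 *             (occurred.events & WL_SOCKET_READABLE))
 *             handle_input(client_sock);
 *     }
 *
 * Registering once and waiting repeatedly avoids the per-call setup and
 * teardown (including an epoll_create()/close() pair on Linux) that this
 * wrapper pays on every invocation.
 */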

/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it. (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(volatile Latch *latch)
{
#ifndef WIN32
    pid_t       owner_pid;
#else
    HANDLE      handle;
#endif

    /*
     * The memory barrier has to be placed here to ensure that any flag
     * variables possibly changed by this process have been flushed to main
     * memory, before we check/set is_set.
     */
    pg_memory_barrier();

    /* Quick exit if already set */
    if (latch->is_set)
        return;

    latch->is_set = true;

#ifndef WIN32

    /*
     * See if anyone's waiting for the latch. It can be the current process if
     * we're in a signal handler. We use the self-pipe to wake up the select()
     * in that case. If it's another process, send a signal.
     *
     * Fetch owner_pid only once, in case the latch is concurrently getting
     * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
     * guaranteed to be true! In practice, the effective range of pid_t fits
     * in a 32 bit integer, and so should be atomic. In the worst case, we
     * might end up signaling the wrong process. Even then, you're very
     * unlucky if a process with that bogus pid exists and belongs to
     * Postgres; and PG database processes should handle excess SIGUSR1
     * interrupts without a problem anyhow.
     *
     * Another sort of race condition that's possible here is for a new
     * process to own the latch immediately after we look, so we don't signal
     * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
     * the standard coding convention of waiting at the bottom of their loops,
     * not the top, so that they'll correctly process latch-setting events
     * that happen before they enter the loop.
     */
    owner_pid = latch->owner_pid;
    if (owner_pid == 0)
        return;
    else if (owner_pid == MyProcPid)
    {
        if (waiting)
            sendSelfPipeByte();
    }
    else
        kill(owner_pid, SIGUSR1);
#else

    /*
     * See if anyone's waiting for the latch. It can be the current process if
     * we're in a signal handler.
     *
     * Use a local variable here just in case somebody changes the event field
     * concurrently (which really should not happen).
     */
    handle = latch->event;
    if (handle)
    {
        SetEvent(handle);

        /*
         * Note that we silently ignore any errors. We might be in a signal
         * handler or other critical path where it's not safe to call elog().
         */
    }
#endif

}
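/*
 * Canonical signal-handler use of SetLatch(), per the NB above (sketch;
 * the handler and flag names are illustrative):
 *
 *     static void
 *     handle_shutdown_request(SIGNAL_ARGS)
 *     {
 *         int         save_errno = errno;
 *
 *         shutdown_requested = true;  // flag examined by the main loop
 *         SetLatch(MyLatch);          // wake the loop up promptly
 *
 *         errno = save_errno;
 *     }
 */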

/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 */
void
ResetLatch(volatile Latch *latch)
{
    /* Only the owner should reset the latch */
    Assert(latch->owner_pid == MyProcPid);

    latch->is_set = false;

    /*
     * Ensure that the write to is_set gets flushed to main memory before we
     * examine any flag variables. Otherwise a concurrent SetLatch might
     * falsely conclude that it needn't signal us, even though we have missed
     * seeing some flag updates that SetLatch was supposed to inform us of.
     */
    pg_memory_barrier();
}

/*
 * Create a WaitEventSet with space for nevents different events to wait for.
 *
 * These events can then be efficiently waited upon together, using
 * WaitEventSetWait().
 */
WaitEventSet *
CreateWaitEventSet(MemoryContext context, int nevents)
{
    WaitEventSet *set;
    char       *data;
    Size        sz = 0;

    /*
     * Use MAXALIGN size/alignment to guarantee that later uses of memory are
     * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
     * platforms, but earlier allocations like WaitEventSet and WaitEvent
     * might not be sized to guarantee that when purely using sizeof().
     */
    sz += MAXALIGN(sizeof(WaitEventSet));
    sz += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
    sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_POLL)
    sz += MAXALIGN(sizeof(struct pollfd) * nevents);
#elif defined(WAIT_USE_WIN32)
    /* need space for the pgwin32_signal_event */
    sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
#endif

    data = (char *) MemoryContextAllocZero(context, sz);

    set = (WaitEventSet *) data;
    data += MAXALIGN(sizeof(WaitEventSet));

    set->events = (WaitEvent *) data;
    data += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
    set->epoll_ret_events = (struct epoll_event *) data;
    data += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_POLL)
    set->pollfds = (struct pollfd *) data;
    data += MAXALIGN(sizeof(struct pollfd) * nevents);
#elif defined(WAIT_USE_WIN32)
    set->handles = (HANDLE *) data;
    data += MAXALIGN(sizeof(HANDLE) * nevents);
#endif

    set->latch = NULL;
    set->nevents_space = nevents;

#if defined(WAIT_USE_EPOLL)
    set->epoll_fd = epoll_create(nevents);
    if (set->epoll_fd < 0)
        elog(ERROR, "epoll_create failed: %m");
#elif defined(WAIT_USE_WIN32)

    /*
     * To handle signals while waiting, we need to add a win32 specific event.
     * We accounted for the additional event at the top of this routine. See
     * port/win32/signal.c for more details.
     *
     * Note: pgwin32_signal_event should be first to ensure that it will be
     * reported when multiple events are set. We want to guarantee that
     * pending signals are serviced.
     */
    set->handles[0] = pgwin32_signal_event;
    StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
#endif

    return set;
}

/*
 * Free a previously created WaitEventSet.
 */
void
FreeWaitEventSet(WaitEventSet *set)
{
#if defined(WAIT_USE_EPOLL)
    close(set->epoll_fd);
#elif defined(WAIT_USE_WIN32)
    WaitEvent  *cur_event;

    for (cur_event = set->events;
         cur_event < (set->events + set->nevents);
         cur_event++)
    {
        if (cur_event->events & WL_LATCH_SET)
        {
            /* uses the latch's HANDLE */
        }
        else if (cur_event->events & WL_POSTMASTER_DEATH)
        {
            /* uses PostmasterHandle */
        }
        else
        {
            /* Clean up the event object we created for the socket */
            WSAEventSelect(cur_event->fd, NULL, 0);
            WSACloseEvent(set->handles[cur_event->pos + 1]);
        }
    }
#endif

    pfree(set);
}

/* ---
 * Add an event to the set. Possible events are:
 * - WL_LATCH_SET: Wait for the latch to be set
 * - WL_POSTMASTER_DEATH: Wait for postmaster to die
 * - WL_SOCKET_READABLE: Wait for socket to become readable
 *   can be combined in one event with WL_SOCKET_WRITEABLE
 * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable
 *   can be combined with WL_SOCKET_READABLE
 *
 * Returns the offset in WaitEventSet->events (starting from 0), which can be
 * used to modify previously added wait events using ModifyWaitEvent().
 *
 * In the WL_LATCH_SET case the latch must be owned by the current process,
 * i.e. it must be a backend-local latch initialized with InitLatch, or a
 * shared latch associated with the current process by calling OwnLatch.
 *
 * In the WL_SOCKET_READABLE/WRITEABLE case, EOF and error conditions are
 * reported by returning the socket as readable/writable or both, depending on
 * WL_SOCKET_READABLE/WRITEABLE being specified.
 *
 * The user_data pointer specified here will be set for the events returned
 * by WaitEventSetWait(), allowing the caller to easily associate additional
 * data with events.
 */
int
AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
                  void *user_data)
{
    WaitEvent  *event;

    /* not enough space */
    Assert(set->nevents < set->nevents_space);

    if (latch)
    {
        if (latch->owner_pid != MyProcPid)
            elog(ERROR, "cannot wait on a latch owned by another process");
        if (set->latch)
            elog(ERROR, "cannot wait on more than one latch");
        if ((events & WL_LATCH_SET) != WL_LATCH_SET)
            elog(ERROR, "latch events only support being set");
    }
    else
    {
        if (events & WL_LATCH_SET)
            elog(ERROR, "cannot wait on latch without a specified latch");
    }

    /* waiting for socket readiness without a socket indicates a bug */
    if (fd == PGINVALID_SOCKET &&
        (events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)))
        elog(ERROR, "cannot wait on socket event without a socket");

    event = &set->events[set->nevents];
    event->pos = set->nevents++;
    event->fd = fd;
    event->events = events;
    event->user_data = user_data;
#ifdef WIN32
    event->reset = false;
#endif

    if (events == WL_LATCH_SET)
    {
        set->latch = latch;
        set->latch_pos = event->pos;
#ifndef WIN32
        event->fd = selfpipe_readfd;
#endif
    }
    else if (events == WL_POSTMASTER_DEATH)
    {
#ifndef WIN32
        event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
#endif
    }

    /* perform wait primitive specific initialization, if needed */
#if defined(WAIT_USE_EPOLL)
    WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
#elif defined(WAIT_USE_POLL)
    WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_SELECT)
    /* nothing to do */
#elif defined(WAIT_USE_WIN32)
    WaitEventAdjustWin32(set, event);
#endif

    return event->pos;
}

/*
 * Change the event mask and, in the WL_LATCH_SET case, the latch associated
 * with the WaitEvent.
 *
 * 'pos' is the id returned by AddWaitEventToSet.
 */
void
ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
{
    WaitEvent  *event;

    Assert(pos < set->nevents);

    event = &set->events[pos];

    /*
     * If neither the event mask nor the associated latch changes, return
     * early. That's an important optimization for some sockets, where
     * ModifyWaitEvent is frequently used to switch from waiting for reads to
     * waiting on writes.
     */
    if (events == event->events &&
        (!(event->events & WL_LATCH_SET) || set->latch == latch))
        return;

    if (event->events & WL_LATCH_SET &&
        events != event->events)
    {
        /* we could allow to disable latch events for a while */
        elog(ERROR, "cannot modify latch event");
    }

    if (event->events & WL_POSTMASTER_DEATH)
    {
        elog(ERROR, "cannot modify postmaster death event");
    }

    /* FIXME: validate event mask */
    event->events = events;

    if (events == WL_LATCH_SET)
    {
        set->latch = latch;
    }

#if defined(WAIT_USE_EPOLL)
    WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
#elif defined(WAIT_USE_POLL)
    WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_SELECT)
    /* nothing to do */
#elif defined(WAIT_USE_WIN32)
    WaitEventAdjustWin32(set, event);
#endif
}
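/*
 * Illustrative use of the position returned by AddWaitEventToSet()
 * together with ModifyWaitEvent(), as in the read-to-write switch the
 * comment above mentions (sock and sock_pos are placeholders):
 *
 *     int         sock_pos;
 *
 *     sock_pos = AddWaitEventToSet(set, WL_SOCKET_READABLE, sock,
 *                                  NULL, NULL);
 *     ...
 *     // output buffer full: now wait for the socket to drain instead
 *     ModifyWaitEvent(set, sock_pos, WL_SOCKET_WRITEABLE, NULL);
 */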

#if defined(WAIT_USE_EPOLL)
/*
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
    struct epoll_event epoll_ev;
    int         rc;

    /* pointer to our event, returned by epoll_wait */
    epoll_ev.data.ptr = event;
    /* always wait for errors */
    epoll_ev.events = EPOLLERR | EPOLLHUP;

    /* prepare pollfd entry once */
    if (event->events == WL_LATCH_SET)
    {
        Assert(set->latch != NULL);
        epoll_ev.events |= EPOLLIN;
    }
    else if (event->events == WL_POSTMASTER_DEATH)
    {
        epoll_ev.events |= EPOLLIN;
    }
    else
    {
        Assert(event->fd != PGINVALID_SOCKET);
        Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));

        if (event->events & WL_SOCKET_READABLE)
            epoll_ev.events |= EPOLLIN;
        if (event->events & WL_SOCKET_WRITEABLE)
            epoll_ev.events |= EPOLLOUT;
    }

    /*
     * Even though unused, we also pass epoll_ev as the data argument if
     * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
     * requiring that, and actually it makes the code simpler...
     */
    rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);

    if (rc < 0)
        ereport(ERROR,
                (errcode_for_socket_access(),
                 errmsg("epoll_ctl() failed: %m")));
}
#endif

#if defined(WAIT_USE_POLL)
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
    struct pollfd *pollfd = &set->pollfds[event->pos];

    pollfd->revents = 0;
    pollfd->fd = event->fd;

    /* prepare pollfd entry once */
    if (event->events == WL_LATCH_SET)
    {
        Assert(set->latch != NULL);
        pollfd->events = POLLIN;
    }
    else if (event->events == WL_POSTMASTER_DEATH)
    {
        pollfd->events = POLLIN;
    }
    else
    {
        Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
        pollfd->events = 0;
        if (event->events & WL_SOCKET_READABLE)
            pollfd->events |= POLLIN;
        if (event->events & WL_SOCKET_WRITEABLE)
            pollfd->events |= POLLOUT;
    }

    Assert(event->fd != PGINVALID_SOCKET);
}
#endif

#if defined(WAIT_USE_WIN32)
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
    HANDLE     *handle = &set->handles[event->pos + 1];

    if (event->events == WL_LATCH_SET)
    {
        Assert(set->latch != NULL);
        *handle = set->latch->event;
    }
    else if (event->events == WL_POSTMASTER_DEATH)
    {
        *handle = PostmasterHandle;
    }
    else
    {
        int         flags = FD_CLOSE;   /* always check for errors/EOF */

        if (event->events & WL_SOCKET_READABLE)
            flags |= FD_READ;
        if (event->events & WL_SOCKET_WRITEABLE)
            flags |= FD_WRITE;

        if (*handle == WSA_INVALID_EVENT)
        {
            *handle = WSACreateEvent();
            if (*handle == WSA_INVALID_EVENT)
                elog(ERROR, "failed to create event for socket: error code %u",
                     WSAGetLastError());
        }
        if (WSAEventSelect(event->fd, *handle, flags) != 0)
            elog(ERROR, "failed to set up event for socket: error code %u",
                 WSAGetLastError());

        Assert(event->fd != PGINVALID_SOCKET);
    }
}
#endif

/*
 * Wait for events added to the set to happen, or until the timeout is
 * reached. At most nevents occurred events are returned.
 *
 * If timeout = -1, block until an event occurs; if 0, check sockets for
 * readiness, but don't block; if > 0, block for at most timeout milliseconds.
 *
 * Returns the number of events occurred, or 0 if the timeout was reached.
 *
 * Returned events will have the fd, pos, user_data fields set to the
 * values associated with the registered event.
 */
int
WaitEventSetWait(WaitEventSet *set, long timeout,
                 WaitEvent *occurred_events, int nevents,
                 uint32 wait_event_info)
{
    int         returned_events = 0;
    instr_time  start_time;
    instr_time  cur_time;
    long        cur_timeout = -1;

    Assert(nevents > 0);

    /*
     * Initialize timeout if requested. We must record the current time so
     * that we can determine the remaining timeout if interrupted.
     */
    if (timeout >= 0)
    {
        INSTR_TIME_SET_CURRENT(start_time);
        Assert(timeout >= 0 && timeout <= INT_MAX);
        cur_timeout = timeout;
    }

    pgstat_report_wait_start(wait_event_info);

#ifndef WIN32
    waiting = true;
#else
    /* Ensure that signals are serviced even if latch is already set */
    pgwin32_dispatch_queued_signals();
#endif
    while (returned_events == 0)
    {
        int         rc;

        /*
         * Check if the latch is set already. If so, leave the loop
         * immediately, avoid blocking again. We don't attempt to report any
         * other events that might also be satisfied.
         *
         * If someone sets the latch between this and the
         * WaitEventSetWaitBlock() below, the setter will write a byte to the
         * pipe (or signal us and the signal handler will do that), and the
         * readiness routine will return immediately.
         *
         * On unix, if there's a pending byte in the self pipe, we'll notice
         * whenever blocking. Only clearing the pipe in that case avoids
         * having to drain it every time WaitLatchOrSocket() is used. Should
         * the pipe-buffer fill up we're still ok, because the pipe is in
         * nonblocking mode. It's unlikely for that to happen, because the
         * self pipe isn't filled unless we're blocking (waiting = true), or
         * from inside a signal handler in latch_sigusr1_handler().
         *
         * On windows, we'll also notice if there's a pending event for the
         * latch when blocking, but there's no danger of anything filling up,
         * as "Setting an event that is already set has no effect.".
         *
         * Note: we assume that the kernel calls involved in latch management
         * will provide adequate synchronization on machines with weak memory
         * ordering, so that we cannot miss seeing is_set if a notification
         * has already been queued.
         */
        if (set->latch && set->latch->is_set)
        {
            occurred_events->fd = PGINVALID_SOCKET;
            occurred_events->pos = set->latch_pos;
            occurred_events->user_data =
                set->events[set->latch_pos].user_data;
            occurred_events->events = WL_LATCH_SET;
            occurred_events++;
            returned_events++;

            break;
        }

        /*
         * Wait for events using the readiness primitive chosen at the top of
         * this file. If -1 is returned, a timeout has occurred, if 0 we have
         * to retry, everything >= 1 is the number of returned events.
         */
        rc = WaitEventSetWaitBlock(set, cur_timeout,
                                   occurred_events, nevents);

        if (rc == -1)
            break;              /* timeout occurred */
        else
            returned_events = rc;

        /* If we're not done, update cur_timeout for next iteration */
        if (returned_events == 0 && timeout >= 0)
        {
            INSTR_TIME_SET_CURRENT(cur_time);
            INSTR_TIME_SUBTRACT(cur_time, start_time);
            cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
            if (cur_timeout <= 0)
                break;
        }
    }
#ifndef WIN32
    waiting = false;
#endif

    pgstat_report_wait_end();

    return returned_events;
}
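/*
 * Sketch of consuming several ready events in one call (illustrative;
 * the handler functions are placeholders):
 *
 *     WaitEvent   occurred[16];
 *     int         n,
 *                 i;
 *
 *     n = WaitEventSetWait(set, -1, occurred, lengthof(occurred), 0);
 *     for (i = 0; i < n; i++)
 *     {
 *         if (occurred[i].events & WL_LATCH_SET)
 *             handle_latch_set();
 *         if (occurred[i].events & WL_SOCKET_READABLE)
 *             handle_readable(occurred[i].fd, occurred[i].user_data);
 *     }
 *
 * Note that the Windows implementation can only ever report one event
 * per call (see WaitEventSetWaitBlock below), so callers must not rely
 * on batching.
 */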

#if defined(WAIT_USE_EPOLL)

/*
 * Wait using linux's epoll_wait(2).
 *
 * This is the preferable wait method, as several readiness notifications are
 * delivered, without having to iterate through all of set->events. The
 * returned epoll_event structs contain a pointer to our events, making
 * association easy.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                      WaitEvent *occurred_events, int nevents)
{
    int         returned_events = 0;
    int         rc;
    WaitEvent  *cur_event;
    struct epoll_event *cur_epoll_event;

    /* Sleep */
    rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
                    nevents, cur_timeout);

    /* Check return code */
    if (rc < 0)
    {
        /* EINTR is okay, otherwise complain */
        if (errno != EINTR)
        {
            waiting = false;
            ereport(ERROR,
                    (errcode_for_socket_access(),
                     errmsg("epoll_wait() failed: %m")));
        }
        return 0;
    }
    else if (rc == 0)
    {
        /* timeout exceeded */
        return -1;
    }

    /*
     * At least one event occurred, iterate over the returned epoll events
     * until they're either all processed, or we've returned all the events
     * the caller desired.
     */
    for (cur_epoll_event = set->epoll_ret_events;
         cur_epoll_event < (set->epoll_ret_events + rc) &&
         returned_events < nevents;
         cur_epoll_event++)
    {
        /* epoll's data pointer is set to the associated WaitEvent */
        cur_event = (WaitEvent *) cur_epoll_event->data.ptr;

        occurred_events->pos = cur_event->pos;
        occurred_events->user_data = cur_event->user_data;
        occurred_events->events = 0;

        if (cur_event->events == WL_LATCH_SET &&
            cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
        {
            /* There's data in the self-pipe, clear it. */
            drainSelfPipe();

            if (set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events == WL_POSTMASTER_DEATH &&
                 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
        {
            /*
             * We expect an EPOLLHUP when the remote end is closed, but
             * because we don't expect the pipe to become readable or to have
             * any errors either, treat those cases as postmaster death, too.
             *
             * As explained in the WAIT_USE_SELECT implementation, select(2)
             * may spuriously return. Be paranoid about that here too, a
             * spurious WL_POSTMASTER_DEATH would be painful.
             */
            if (!PostmasterIsAlive())
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_POSTMASTER_DEATH;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
        {
            Assert(cur_event->fd != PGINVALID_SOCKET);

            if ((cur_event->events & WL_SOCKET_READABLE) &&
                (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
            {
                /* data available in socket, or EOF */
                occurred_events->events |= WL_SOCKET_READABLE;
            }

            if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
                (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
            {
                /* writable, or EOF */
                occurred_events->events |= WL_SOCKET_WRITEABLE;
            }

            if (occurred_events->events != 0)
            {
                occurred_events->fd = cur_event->fd;
                occurred_events++;
                returned_events++;
            }
        }
    }

    return returned_events;
}
#elif defined(WAIT_USE_POLL)

/*
 * Wait using poll(2).
 *
 * This allows receiving readiness notifications for several events at once,
 * but requires iterating through all of set->pollfds.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                      WaitEvent *occurred_events, int nevents)
{
    int         returned_events = 0;
    int         rc;
    WaitEvent  *cur_event;
    struct pollfd *cur_pollfd;

    /* Sleep */
    rc = poll(set->pollfds, set->nevents, (int) cur_timeout);

    /* Check return code */
    if (rc < 0)
    {
        /* EINTR is okay, otherwise complain */
        if (errno != EINTR)
        {
            waiting = false;
            ereport(ERROR,
                    (errcode_for_socket_access(),
                     errmsg("poll() failed: %m")));
        }
        return 0;
    }
    else if (rc == 0)
    {
        /* timeout exceeded */
        return -1;
    }

    for (cur_event = set->events, cur_pollfd = set->pollfds;
         cur_event < (set->events + set->nevents) &&
         returned_events < nevents;
         cur_event++, cur_pollfd++)
    {
        /* no activity on this FD, skip */
        if (cur_pollfd->revents == 0)
            continue;

        occurred_events->pos = cur_event->pos;
        occurred_events->user_data = cur_event->user_data;
        occurred_events->events = 0;

        if (cur_event->events == WL_LATCH_SET &&
            (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
        {
            /* There's data in the self-pipe, clear it. */
            drainSelfPipe();

            if (set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events == WL_POSTMASTER_DEATH &&
                 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
        {
            /*
             * We expect a POLLHUP when the remote end is closed, but because
             * we don't expect the pipe to become readable or to have any
             * errors either, treat those cases as postmaster death, too.
             *
             * As explained in the WAIT_USE_SELECT implementation, select(2)
             * may spuriously return. Be paranoid about that here too, a
             * spurious WL_POSTMASTER_DEATH would be painful.
             */
            if (!PostmasterIsAlive())
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_POSTMASTER_DEATH;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
        {
            int         errflags = POLLHUP | POLLERR | POLLNVAL;

            Assert(cur_event->fd >= PGINVALID_SOCKET);

            if ((cur_event->events & WL_SOCKET_READABLE) &&
                (cur_pollfd->revents & (POLLIN | errflags)))
            {
                /* data available in socket, or EOF */
                occurred_events->events |= WL_SOCKET_READABLE;
            }

            if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
                (cur_pollfd->revents & (POLLOUT | errflags)))
            {
                /* writeable, or EOF */
                occurred_events->events |= WL_SOCKET_WRITEABLE;
            }

            if (occurred_events->events != 0)
            {
                occurred_events->fd = cur_event->fd;
                occurred_events++;
                returned_events++;
            }
        }
    }
    return returned_events;
}
#elif defined(WAIT_USE_SELECT)

/*
 * Wait using select(2).
 *
 * XXX: On at least older linux kernels select(), in violation of POSIX,
 * doesn't reliably return a socket as writable if closed - but we rely on
 * that. So far all the known cases of this problem are on platforms that also
 * provide a poll() implementation without that bug. If we find one where
 * that's not the case, we'll need to add a workaround.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                      WaitEvent *occurred_events, int nevents)
{
    int         returned_events = 0;
    int         rc;
    WaitEvent  *cur_event;
    fd_set      input_mask;
    fd_set      output_mask;
    int         hifd = PGINVALID_SOCKET;
    struct timeval tv;
    struct timeval *tvp = NULL;

    FD_ZERO(&input_mask);
    FD_ZERO(&output_mask);

    /*
     * Prepare input/output masks. We do so every loop iteration as there's no
     * entirely portable way to copy fd_sets.
     */
    for (cur_event = set->events;
         cur_event < (set->events + set->nevents);
         cur_event++)
    {
        if (cur_event->events == WL_LATCH_SET)
            FD_SET(cur_event->fd, &input_mask);
        else if (cur_event->events == WL_POSTMASTER_DEATH)
            FD_SET(cur_event->fd, &input_mask);
        else
        {
            Assert(cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
            if (cur_event->events & WL_SOCKET_READABLE)
                FD_SET(cur_event->fd, &input_mask);
            if (cur_event->events & WL_SOCKET_WRITEABLE)
                FD_SET(cur_event->fd, &output_mask);
        }

        if (cur_event->fd > hifd)
            hifd = cur_event->fd;
    }

    /* Sleep */
    if (cur_timeout >= 0)
    {
        tv.tv_sec = cur_timeout / 1000L;
        tv.tv_usec = (cur_timeout % 1000L) * 1000L;
        tvp = &tv;
    }
    rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);

    /* Check return code */
    if (rc < 0)
    {
        /* EINTR is okay, otherwise complain */
        if (errno != EINTR)
        {
            waiting = false;
            ereport(ERROR,
                    (errcode_for_socket_access(),
                     errmsg("select() failed: %m")));
        }
        return 0;               /* retry */
    }
    else if (rc == 0)
    {
        /* timeout exceeded */
        return -1;
    }

    /*
     * To associate events with select's masks, we have to check the status of
     * the file descriptors associated with an event; by looping through all
     * events.
     */
    for (cur_event = set->events;
         cur_event < (set->events + set->nevents)
         && returned_events < nevents;
         cur_event++)
    {
        occurred_events->pos = cur_event->pos;
        occurred_events->user_data = cur_event->user_data;
        occurred_events->events = 0;

        if (cur_event->events == WL_LATCH_SET &&
            FD_ISSET(cur_event->fd, &input_mask))
        {
            /* There's data in the self-pipe, clear it. */
            drainSelfPipe();

            if (set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events == WL_POSTMASTER_DEATH &&
                 FD_ISSET(cur_event->fd, &input_mask))
        {
            /*
             * According to the select(2) man page on Linux, select(2) may
             * spuriously return and report a file descriptor as readable,
             * when it's not; and presumably so can poll(2). It's not clear
             * that the relevant cases would ever apply to the postmaster
             * pipe, but since the consequences of falsely returning
             * WL_POSTMASTER_DEATH could be pretty unpleasant, we take the
             * trouble to positively verify EOF with PostmasterIsAlive().
             */
            if (!PostmasterIsAlive())
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_POSTMASTER_DEATH;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
        {
            Assert(cur_event->fd != PGINVALID_SOCKET);

            if ((cur_event->events & WL_SOCKET_READABLE) &&
                FD_ISSET(cur_event->fd, &input_mask))
            {
                /* data available in socket, or EOF */
                occurred_events->events |= WL_SOCKET_READABLE;
            }

            if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
                FD_ISSET(cur_event->fd, &output_mask))
            {
                /* socket is writeable, or EOF */
                occurred_events->events |= WL_SOCKET_WRITEABLE;
            }

            if (occurred_events->events != 0)
            {
                occurred_events->fd = cur_event->fd;
                occurred_events++;
                returned_events++;
            }
        }
    }
    return returned_events;
}
#elif defined(WAIT_USE_WIN32)

/*
 * Wait using Windows' WaitForMultipleObjects().
 *
 * Unfortunately this will only ever return a single readiness notification at
 * a time. Note that while the official documentation for
 * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
 * with a single bWaitAll = FALSE call,
 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
 * that only one event is "consumed".
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                      WaitEvent *occurred_events, int nevents)
{
    int         returned_events = 0;
    DWORD       rc;
    WaitEvent  *cur_event;

    /* Reset any wait events that need it */
    for (cur_event = set->events;
         cur_event < (set->events + set->nevents);
         cur_event++)
    {
        if (cur_event->reset)
        {
            WaitEventAdjustWin32(set, cur_event);
            cur_event->reset = false;
        }
    }

    /*
     * Sleep.
     *
     * Need to wait for ->nevents + 1, because signal handle is in [0].
     */
    rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
                                cur_timeout);

    /* Check return code */
    if (rc == WAIT_FAILED)
        elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
             GetLastError());
    else if (rc == WAIT_TIMEOUT)
    {
        /* timeout exceeded */
        return -1;
    }

    if (rc == WAIT_OBJECT_0)
    {
        /* Service newly-arrived signals */
        pgwin32_dispatch_queued_signals();
        return 0;               /* retry */
    }

    /*
     * With an offset of one, due to the always present pgwin32_signal_event,
     * the handle offset directly corresponds to a wait event.
     */
    cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];

    occurred_events->pos = cur_event->pos;
    occurred_events->user_data = cur_event->user_data;
    occurred_events->events = 0;

    if (cur_event->events == WL_LATCH_SET)
    {
        if (!ResetEvent(set->latch->event))
            elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());

        if (set->latch->is_set)
        {
            occurred_events->fd = PGINVALID_SOCKET;
            occurred_events->events = WL_LATCH_SET;
            occurred_events++;
            returned_events++;
        }
    }
    else if (cur_event->events == WL_POSTMASTER_DEATH)
    {
        /*
         * Postmaster apparently died. Since the consequences of falsely
         * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
         * the trouble to positively verify this with PostmasterIsAlive(),
         * even though there is no known reason to think that the event could
         * be falsely set on Windows.
         */
        if (!PostmasterIsAlive())
        {
            occurred_events->fd = PGINVALID_SOCKET;
            occurred_events->events = WL_POSTMASTER_DEATH;
            occurred_events++;
            returned_events++;
        }
    }
    else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
    {
        WSANETWORKEVENTS resEvents;
        HANDLE      handle = set->handles[cur_event->pos + 1];

        Assert(cur_event->fd);

        occurred_events->fd = cur_event->fd;

        ZeroMemory(&resEvents, sizeof(resEvents));
        if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
            elog(ERROR, "failed to enumerate network events: error code %u",
                 WSAGetLastError());
        if ((cur_event->events & WL_SOCKET_READABLE) &&
            (resEvents.lNetworkEvents & FD_READ))
        {
            /* data available in socket */
            occurred_events->events |= WL_SOCKET_READABLE;

            /*------
             * WaitForMultipleObjects doesn't guarantee that a read event will
             * be returned if the latch is set at the same time. Even if it
             * did, the caller might drop that event expecting it to reoccur
             * on next call. So, we must force the event to be reset if this
             * WaitEventSet is used again in order to avoid an indefinite
             * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
             * for the behavior of socket events.
             *------
             */
            cur_event->reset = true;
        }
        if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
            (resEvents.lNetworkEvents & FD_WRITE))
        {
            /* writeable */
            occurred_events->events |= WL_SOCKET_WRITEABLE;
        }
        if (resEvents.lNetworkEvents & FD_CLOSE)
        {
            /* EOF */
            if (cur_event->events & WL_SOCKET_READABLE)
                occurred_events->events |= WL_SOCKET_READABLE;
            if (cur_event->events & WL_SOCKET_WRITEABLE)
                occurred_events->events |= WL_SOCKET_WRITEABLE;
        }

        if (occurred_events->events != 0)
        {
            occurred_events++;
            returned_events++;
        }
    }

    return returned_events;
}
#endif

/*
 * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
 *
 * Wake up WaitLatch, if we're waiting. (We might not be, since SIGUSR1 is
 * overloaded for multiple purposes; or we might not have reached WaitLatch
 * yet, in which case we don't need to fill the pipe either.)
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it.
 */
#ifndef WIN32
void
latch_sigusr1_handler(void)
{
    if (waiting)
        sendSelfPipeByte();
}
#endif   /* !WIN32 */

/* Send one byte to the self-pipe, to wake up WaitLatch */
#ifndef WIN32
static void
sendSelfPipeByte(void)
{
    int         rc;
    char        dummy = 0;

retry:
    rc = write(selfpipe_writefd, &dummy, 1);
    if (rc < 0)
    {
        /* If interrupted by signal, just retry */
        if (errno == EINTR)
            goto retry;

        /*
         * If the pipe is full, we don't need to retry, the data that's there
         * already is enough to wake up WaitLatch.
         */
        if (errno == EAGAIN || errno == EWOULDBLOCK)
            return;

        /*
         * Oops, the write() failed for some other reason. We might be in a
         * signal handler, so it's not safe to elog(). We have no choice but
         * to silently ignore the error.
         */
        return;
    }
}
#endif   /* !WIN32 */

/*
 * Read all available data from the self-pipe
 *
 * Note: this is only called when waiting = true. If it throws an error
 * (and hence doesn't return), it must reset that flag first (though
 * ideally, this will never happen).
 */
#ifndef WIN32
static void
drainSelfPipe(void)
{
    /*
     * There shouldn't normally be more than one byte in the pipe, or maybe a
     * few bytes if multiple processes run SetLatch at the same instant.
     */
    char        buf[16];
    int         rc;

    for (;;)
    {
        rc = read(selfpipe_readfd, buf, sizeof(buf));
        if (rc < 0)
        {
            if (errno == EAGAIN || errno == EWOULDBLOCK)
                break;          /* the pipe is empty */
            else if (errno == EINTR)
                continue;       /* retry */
            else
            {
                waiting = false;
                elog(ERROR, "read() on self-pipe failed: %m");
            }
        }
        else if (rc == 0)
        {
            waiting = false;
            elog(ERROR, "unexpected EOF on self-pipe");
        }
        else if (rc < sizeof(buf))
        {
            /* we successfully drained the pipe; no need to read() again */
            break;
        }
        /* else buffer wasn't big enough, so read again */
    }
}
#endif   /* !WIN32 */