PostgreSQL Source Code  git master
socket.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * socket.c
4  * Microsoft Windows Win32 Socket Functions
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/backend/port/win32/socket.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 
/*
 * Indicate if pgwin32_recv() and pgwin32_send() should operate
 * in non-blocking mode.
 *
 * Since the socket emulation layer always sets the actual socket to
 * non-blocking mode in order to be able to deliver signals, we must
 * specify this in a separate flag if we actually need non-blocking
 * operation.
 *
 * This flag changes the behaviour *globally* for all socket operations,
 * so it should only be set for very short periods of time.
 */
int			pgwin32_noblock = 0;
29 
30 /* Undef the macros defined in win32.h, so we can access system functions */
31 #undef socket
32 #undef bind
33 #undef listen
34 #undef accept
35 #undef connect
36 #undef select
37 #undef recv
38 #undef send
39 
40 /*
41  * Blocking socket functions implemented so they listen on both
42  * the socket and the signal event, required for signal handling.
43  */
44 
/*
 * Convert the last socket error code into errno
 *
 * Note: where there is a direct correspondence between a WSAxxx error code
 * and a Berkeley error symbol, this mapping is actually a no-op, because
 * in win32_port.h we redefine the network-related Berkeley error symbols to
 * have the values of their WSAxxx counterparts.  The point of the switch is
 * mostly to translate near-miss error codes into something that's sensible
 * in the Berkeley universe.
 *
 * Codes that are not recognized are reported at NOTICE level and mapped to
 * EINVAL.
 */
static void
TranslateSocketError(void)
{
	/*
	 * Fetch the code once, so that the value we switch on is the same value
	 * we report in the "unrecognized" message below.
	 */
	int			wsaerr = WSAGetLastError();

	switch (wsaerr)
	{
		case WSAEINVAL:
		case WSANOTINITIALISED:
		case WSAEINVALIDPROVIDER:
		case WSAEINVALIDPROCTABLE:
		case WSAEDESTADDRREQ:
			errno = EINVAL;
			break;
		case WSAEINPROGRESS:
			errno = EINPROGRESS;
			break;
		case WSAEFAULT:
			errno = EFAULT;
			break;
		case WSAEISCONN:
			errno = EISCONN;
			break;
		case WSAEMSGSIZE:
			errno = EMSGSIZE;
			break;
		case WSAEAFNOSUPPORT:
			errno = EAFNOSUPPORT;
			break;
		case WSAEMFILE:
			errno = EMFILE;
			break;
		case WSAENOBUFS:
			errno = ENOBUFS;
			break;
		case WSAEPROTONOSUPPORT:
		case WSAEPROTOTYPE:
		case WSAESOCKTNOSUPPORT:
			errno = EPROTONOSUPPORT;
			break;
		case WSAECONNABORTED:
			errno = ECONNABORTED;
			break;
		case WSAECONNREFUSED:
			errno = ECONNREFUSED;
			break;
		case WSAECONNRESET:
			errno = ECONNRESET;
			break;
		case WSAEINTR:
			errno = EINTR;
			break;
		case WSAENOTSOCK:
			errno = ENOTSOCK;
			break;
		case WSAEOPNOTSUPP:
			errno = EOPNOTSUPP;
			break;
		case WSAEWOULDBLOCK:
			errno = EWOULDBLOCK;
			break;
		case WSAEACCES:
			errno = EACCES;
			break;
		case WSAEADDRINUSE:
			errno = EADDRINUSE;
			break;
		case WSAEADDRNOTAVAIL:
			errno = EADDRNOTAVAIL;
			break;
		case WSAEHOSTDOWN:
			errno = EHOSTDOWN;
			break;
		case WSAEHOSTUNREACH:
		case WSAHOST_NOT_FOUND:
			errno = EHOSTUNREACH;
			break;
		case WSAENETDOWN:
			errno = ENETDOWN;
			break;
		case WSAENETUNREACH:
			errno = ENETUNREACH;
			break;
		case WSAENETRESET:
			errno = ENETRESET;
			break;
		case WSAENOTCONN:
		case WSAESHUTDOWN:
		case WSAEDISCON:
			errno = ENOTCONN;
			break;
		case WSAETIMEDOUT:
			errno = ETIMEDOUT;
			break;
		default:
			ereport(NOTICE,
					(errmsg_internal("unrecognized win32 socket error code: %d",
									 wsaerr)));
			errno = EINVAL;
			break;
	}
}
155 
/*
 * Deliver any queued, unblocked signals.
 *
 * Returns 1 (with errno set to EINTR) if the signal queue was non-empty and
 * has been dispatched, 0 if there was nothing to do.
 */
static int
pgwin32_poll_signals(void)
{
	if (UNBLOCKED_SIGNAL_QUEUE())
	{
		pgwin32_dispatch_queued_signals();
		errno = EINTR;
		return 1;
	}
	return 0;
}
167 
168 static int
169 isDataGram(SOCKET s)
170 {
171  int type;
172  int typelen = sizeof(type);
173 
174  if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &type, &typelen))
175  return 1;
176 
177  return (type == SOCK_DGRAM) ? 1 : 0;
178 }
179 
180 int
181 pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
182 {
183  static HANDLE waitevent = INVALID_HANDLE_VALUE;
184  static SOCKET current_socket = INVALID_SOCKET;
185  static int isUDP = 0;
186  HANDLE events[2];
187  int r;
188 
189  /* Create an event object just once and use it on all future calls */
190  if (waitevent == INVALID_HANDLE_VALUE)
191  {
192  waitevent = CreateEvent(NULL, TRUE, FALSE, NULL);
193 
194  if (waitevent == INVALID_HANDLE_VALUE)
195  ereport(ERROR,
196  (errmsg_internal("could not create socket waiting event: error code %lu", GetLastError())));
197  }
198  else if (!ResetEvent(waitevent))
199  ereport(ERROR,
200  (errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError())));
201 
202  /*
203  * Track whether socket is UDP or not. (NB: most likely, this is both
204  * useless and wrong; there is no reason to think that the behavior of
205  * WSAEventSelect is different for TCP and UDP.)
206  */
207  if (current_socket != s)
208  isUDP = isDataGram(s);
209  current_socket = s;
210 
211  /*
212  * Attach event to socket. NOTE: we must detach it again before
213  * returning, since other bits of code may try to attach other events to
214  * the socket.
215  */
216  if (WSAEventSelect(s, waitevent, what) != 0)
217  {
219  return 0;
220  }
221 
222  events[0] = pgwin32_signal_event;
223  events[1] = waitevent;
224 
225  /*
226  * Just a workaround of unknown locking problem with writing in UDP socket
227  * under high load: Client's pgsql backend sleeps infinitely in
228  * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select().
229  * So, we will wait with small timeout(0.1 sec) and if socket is still
230  * blocked, try WSASend (see comments in pgwin32_select) and wait again.
231  */
232  if ((what & FD_WRITE) && isUDP)
233  {
234  for (;;)
235  {
236  r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE);
237 
238  if (r == WAIT_TIMEOUT)
239  {
240  char c;
241  WSABUF buf;
242  DWORD sent;
243 
244  buf.buf = &c;
245  buf.len = 0;
246 
247  r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL);
248  if (r == 0) /* Completed - means things are fine! */
249  {
250  WSAEventSelect(s, NULL, 0);
251  return 1;
252  }
253  else if (WSAGetLastError() != WSAEWOULDBLOCK)
254  {
256  WSAEventSelect(s, NULL, 0);
257  return 0;
258  }
259  }
260  else
261  break;
262  }
263  }
264  else
265  r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE);
266 
267  WSAEventSelect(s, NULL, 0);
268 
269  if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION)
270  {
272  errno = EINTR;
273  return 0;
274  }
275  if (r == WAIT_OBJECT_0 + 1)
276  return 1;
277  if (r == WAIT_TIMEOUT)
278  {
279  errno = EWOULDBLOCK;
280  return 0;
281  }
282  ereport(ERROR,
283  (errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError())));
284  return 0;
285 }
286 
287 /*
288  * Create a socket, setting it to overlapped and non-blocking
289  */
290 SOCKET
291 pgwin32_socket(int af, int type, int protocol)
292 {
293  SOCKET s;
294  unsigned long on = 1;
295 
296  s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED);
297  if (s == INVALID_SOCKET)
298  {
300  return INVALID_SOCKET;
301  }
302 
303  if (ioctlsocket(s, FIONBIO, &on))
304  {
306  return INVALID_SOCKET;
307  }
308  errno = 0;
309 
310  return s;
311 }
312 
313 int
314 pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
315 {
316  int res;
317 
318  res = bind(s, addr, addrlen);
319  if (res < 0)
321  return res;
322 }
323 
324 int
325 pgwin32_listen(SOCKET s, int backlog)
326 {
327  int res;
328 
329  res = listen(s, backlog);
330  if (res < 0)
332  return res;
333 }
334 
335 SOCKET
336 pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
337 {
338  SOCKET rs;
339 
340  /*
341  * Poll for signals, but don't return with EINTR, since we don't handle
342  * that in pqcomm.c
343  */
345 
346  rs = WSAAccept(s, addr, addrlen, NULL, 0);
347  if (rs == INVALID_SOCKET)
348  {
350  return INVALID_SOCKET;
351  }
352  return rs;
353 }
354 
355 
356 /* No signal delivery during connect. */
357 int
358 pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
359 {
360  int r;
361 
362  r = WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL);
363  if (r == 0)
364  return 0;
365 
366  if (WSAGetLastError() != WSAEWOULDBLOCK)
367  {
369  return -1;
370  }
371 
372  while (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) == 0)
373  {
374  /* Loop endlessly as long as we are just delivering signals */
375  }
376 
377  return 0;
378 }
379 
380 int
381 pgwin32_recv(SOCKET s, char *buf, int len, int f)
382 {
383  WSABUF wbuf;
384  int r;
385  DWORD b;
386  DWORD flags = f;
387  int n;
388 
389  if (pgwin32_poll_signals())
390  return -1;
391 
392  wbuf.len = len;
393  wbuf.buf = buf;
394 
395  r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
396  if (r != SOCKET_ERROR)
397  return b; /* success */
398 
399  if (WSAGetLastError() != WSAEWOULDBLOCK)
400  {
402  return -1;
403  }
404 
405  if (pgwin32_noblock)
406  {
407  /*
408  * No data received, and we are in "emulated non-blocking mode", so
409  * return indicating that we'd block if we were to continue.
410  */
411  errno = EWOULDBLOCK;
412  return -1;
413  }
414 
415  /* We're in blocking mode, so wait for data */
416 
417  for (n = 0; n < 5; n++)
418  {
419  if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT,
420  INFINITE) == 0)
421  return -1; /* errno already set */
422 
423  r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
424  if (r != SOCKET_ERROR)
425  return b; /* success */
426  if (WSAGetLastError() != WSAEWOULDBLOCK)
427  {
429  return -1;
430  }
431 
432  /*
433  * There seem to be cases on win2k (at least) where WSARecv can return
434  * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the
435  * socket is readable. In this case, just sleep for a moment and try
436  * again. We try up to 5 times - if it fails more than that it's not
437  * likely to ever come back.
438  */
439  pg_usleep(10000);
440  }
441  ereport(NOTICE,
442  (errmsg_internal("could not read from ready socket (after retries)")));
443  errno = EWOULDBLOCK;
444  return -1;
445 }
446 
447 /*
448  * The second argument to send() is defined by SUS to be a "const void *"
449  * and so we use the same signature here to keep compilers happy when
450  * handling callers.
451  *
452  * But the buf member of a WSABUF struct is defined as "char *", so we cast
453  * the second argument to that here when assigning it, also to keep compilers
454  * happy.
455  */
456 
457 int
458 pgwin32_send(SOCKET s, const void *buf, int len, int flags)
459 {
460  WSABUF wbuf;
461  int r;
462  DWORD b;
463 
464  if (pgwin32_poll_signals())
465  return -1;
466 
467  wbuf.len = len;
468  wbuf.buf = (char *) buf;
469 
470  /*
471  * Readiness of socket to send data to UDP socket may be not true: socket
472  * can become busy again! So loop until send or error occurs.
473  */
474  for (;;)
475  {
476  r = WSASend(s, &wbuf, 1, &b, flags, NULL, NULL);
477  if (r != SOCKET_ERROR && b > 0)
478  /* Write succeeded right away */
479  return b;
480 
481  if (r == SOCKET_ERROR &&
482  WSAGetLastError() != WSAEWOULDBLOCK)
483  {
485  return -1;
486  }
487 
488  if (pgwin32_noblock)
489  {
490  /*
491  * No data sent, and we are in "emulated non-blocking mode", so
492  * return indicating that we'd block if we were to continue.
493  */
494  errno = EWOULDBLOCK;
495  return -1;
496  }
497 
498  /* No error, zero bytes */
499 
500  if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
501  return -1;
502  }
503 
504  return -1;
505 }
506 
507 
508 /*
509  * Wait for activity on one or more sockets.
510  * While waiting, allow signals to run
511  *
512  * NOTE! Currently does not implement exceptfds check,
513  * since it is not used in postgresql!
514  */
515 int
516 pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
517 {
518  WSAEVENT events[FD_SETSIZE * 2]; /* worst case is readfds totally
519  * different from writefds, so
520  * 2*FD_SETSIZE sockets */
521  SOCKET sockets[FD_SETSIZE * 2];
522  int numevents = 0;
523  int i;
524  int r;
525  DWORD timeoutval = WSA_INFINITE;
526  FD_SET outreadfds;
527  FD_SET outwritefds;
528  int nummatches = 0;
529 
530  Assert(exceptfds == NULL);
531 
532  if (pgwin32_poll_signals())
533  return -1;
534 
535  FD_ZERO(&outreadfds);
536  FD_ZERO(&outwritefds);
537 
538  /*
539  * Windows does not guarantee to log an FD_WRITE network event indicating
540  * that more data can be sent unless the previous send() failed with
541  * WSAEWOULDBLOCK. While our caller might well have made such a call, we
542  * cannot assume that here. Therefore, if waiting for write-ready, force
543  * the issue by doing a dummy send(). If the dummy send() succeeds,
544  * assume that the socket is in fact write-ready, and return immediately.
545  * Also, if it fails with something other than WSAEWOULDBLOCK, return a
546  * write-ready indication to let our caller deal with the error condition.
547  */
548  if (writefds != NULL)
549  {
550  for (i = 0; i < writefds->fd_count; i++)
551  {
552  char c;
553  WSABUF buf;
554  DWORD sent;
555 
556  buf.buf = &c;
557  buf.len = 0;
558 
559  r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL);
560  if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
561  FD_SET(writefds->fd_array[i], &outwritefds);
562  }
563 
564  /* If we found any write-ready sockets, just return them immediately */
565  if (outwritefds.fd_count > 0)
566  {
567  memcpy(writefds, &outwritefds, sizeof(fd_set));
568  if (readfds)
569  FD_ZERO(readfds);
570  return outwritefds.fd_count;
571  }
572  }
573 
574 
575  /* Now set up for an actual select */
576 
577  if (timeout != NULL)
578  {
579  /* timeoutval is in milliseconds */
580  timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
581  }
582 
583  if (readfds != NULL)
584  {
585  for (i = 0; i < readfds->fd_count; i++)
586  {
587  events[numevents] = WSACreateEvent();
588  sockets[numevents] = readfds->fd_array[i];
589  numevents++;
590  }
591  }
592  if (writefds != NULL)
593  {
594  for (i = 0; i < writefds->fd_count; i++)
595  {
596  if (!readfds ||
597  !FD_ISSET(writefds->fd_array[i], readfds))
598  {
599  /* If the socket is not in the read list */
600  events[numevents] = WSACreateEvent();
601  sockets[numevents] = writefds->fd_array[i];
602  numevents++;
603  }
604  }
605  }
606 
607  for (i = 0; i < numevents; i++)
608  {
609  int flags = 0;
610 
611  if (readfds && FD_ISSET(sockets[i], readfds))
612  flags |= FD_READ | FD_ACCEPT | FD_CLOSE;
613 
614  if (writefds && FD_ISSET(sockets[i], writefds))
615  flags |= FD_WRITE | FD_CLOSE;
616 
617  if (WSAEventSelect(sockets[i], events[i], flags) != 0)
618  {
620  /* release already-assigned event objects */
621  while (--i >= 0)
622  WSAEventSelect(sockets[i], NULL, 0);
623  for (i = 0; i < numevents; i++)
624  WSACloseEvent(events[i]);
625  return -1;
626  }
627  }
628 
629  events[numevents] = pgwin32_signal_event;
630  r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE);
631  if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents))
632  {
633  /*
634  * We scan all events, even those not signaled, in case more than one
635  * event has been tagged but Wait.. can only return one.
636  */
637  WSANETWORKEVENTS resEvents;
638 
639  for (i = 0; i < numevents; i++)
640  {
641  ZeroMemory(&resEvents, sizeof(resEvents));
642  if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0)
643  elog(ERROR, "failed to enumerate network events: error code %d",
644  WSAGetLastError());
645  /* Read activity? */
646  if (readfds && FD_ISSET(sockets[i], readfds))
647  {
648  if ((resEvents.lNetworkEvents & FD_READ) ||
649  (resEvents.lNetworkEvents & FD_ACCEPT) ||
650  (resEvents.lNetworkEvents & FD_CLOSE))
651  {
652  FD_SET(sockets[i], &outreadfds);
653 
654  nummatches++;
655  }
656  }
657  /* Write activity? */
658  if (writefds && FD_ISSET(sockets[i], writefds))
659  {
660  if ((resEvents.lNetworkEvents & FD_WRITE) ||
661  (resEvents.lNetworkEvents & FD_CLOSE))
662  {
663  FD_SET(sockets[i], &outwritefds);
664 
665  nummatches++;
666  }
667  }
668  }
669  }
670 
671  /* Clean up all the event objects */
672  for (i = 0; i < numevents; i++)
673  {
674  WSAEventSelect(sockets[i], NULL, 0);
675  WSACloseEvent(events[i]);
676  }
677 
678  if (r == WSA_WAIT_TIMEOUT)
679  {
680  if (readfds)
681  FD_ZERO(readfds);
682  if (writefds)
683  FD_ZERO(writefds);
684  return 0;
685  }
686 
687  /* Signal-like events. */
688  if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION)
689  {
691  errno = EINTR;
692  if (readfds)
693  FD_ZERO(readfds);
694  if (writefds)
695  FD_ZERO(writefds);
696  return -1;
697  }
698 
699  /* Overwrite socket sets with our resulting values */
700  if (readfds)
701  memcpy(readfds, &outreadfds, sizeof(fd_set));
702  if (writefds)
703  memcpy(writefds, &outwritefds, sizeof(fd_set));
704  return nummatches;
705 }
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
#define ERROR
Definition: elog.h:39
#define NOTICE
Definition: elog.h:35
#define ereport(elevel,...)
Definition: elog.h:149
int b
Definition: isn.c:70
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
const void size_t len
static char * buf
Definition: pg_test_fsync.c:73
char * c
void pg_usleep(long microsec)
Definition: signal.c:53
void pgwin32_dispatch_queued_signals(void)
Definition: signal.c:120
HANDLE pgwin32_signal_event
Definition: signal.c:27
int pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
Definition: socket.c:516
int pgwin32_recv(SOCKET s, char *buf, int len, int f)
Definition: socket.c:381
int pgwin32_send(SOCKET s, const void *buf, int len, int flags)
Definition: socket.c:458
int pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
Definition: socket.c:358
int pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
Definition: socket.c:181
static int pgwin32_poll_signals(void)
Definition: socket.c:157
SOCKET pgwin32_socket(int af, int type, int protocol)
Definition: socket.c:291
static int isDataGram(SOCKET s)
Definition: socket.c:169
static void TranslateSocketError(void)
Definition: socket.c:56
SOCKET pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
Definition: socket.c:336
int pgwin32_noblock
Definition: socket.c:28
int pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
Definition: socket.c:314
int pgwin32_listen(SOCKET s, int backlog)
Definition: socket.c:325
const char * type
#define UNBLOCKED_SIGNAL_QUEUE()
Definition: win32_port.h:476
#define EISCONN
Definition: win32_port.h:388
#define bind(s, addr, addrlen)
Definition: win32_port.h:491
#define ENETUNREACH
Definition: win32_port.h:412
#define ECONNABORTED
Definition: win32_port.h:382
#define EINTR
Definition: win32_port.h:374
#define EWOULDBLOCK
Definition: win32_port.h:380
#define EOPNOTSUPP
Definition: win32_port.h:398
#define EAFNOSUPPORT
Definition: win32_port.h:378
#define EHOSTUNREACH
Definition: win32_port.h:406
#define EADDRNOTAVAIL
Definition: win32_port.h:402
#define ETIMEDOUT
Definition: win32_port.h:416
#define EADDRINUSE
Definition: win32_port.h:400
#define EINPROGRESS
Definition: win32_port.h:386
#define ENETRESET
Definition: win32_port.h:410
#define ENOBUFS
Definition: win32_port.h:390
#define EHOSTDOWN
Definition: win32_port.h:404
#define ENETDOWN
Definition: win32_port.h:408
#define ECONNREFUSED
Definition: win32_port.h:394
#define EPROTONOSUPPORT
Definition: win32_port.h:392
#define ECONNRESET
Definition: win32_port.h:384
#define ENOTSOCK
Definition: win32_port.h:396
#define EMSGSIZE
Definition: win32_port.h:376
#define listen(s, backlog)
Definition: win32_port.h:492
#define ENOTCONN
Definition: win32_port.h:414