PostgreSQL Source Code  git master
socket.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * socket.c
4  * Microsoft Windows Win32 Socket Functions
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/backend/port/win32/socket.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 
/*
 * Indicate if pgwin32_recv() and pgwin32_send() should operate
 * in non-blocking mode.
 *
 * Since the socket emulation layer always sets the actual socket to
 * non-blocking mode in order to be able to deliver signals, we must
 * specify this in a separate flag if we actually need non-blocking
 * operation.
 *
 * This flag changes the behaviour *globally* for all socket operations,
 * so it should only be set for very short periods of time.
 */
int			pgwin32_noblock = 0;
29 
30 /* Undef the macros defined in win32.h, so we can access system functions */
31 #undef socket
32 #undef bind
33 #undef listen
34 #undef accept
35 #undef connect
36 #undef select
37 #undef recv
38 #undef send
39 
40 /*
41  * Blocking socket functions implemented so they listen on both
42  * the socket and the signal event, required for signal handling.
43  */
44 
/*
 * Convert the last socket error code into errno
 *
 * Note: where there is a direct correspondence between a WSAxxx error code
 * and a Berkeley error symbol, this mapping is actually a no-op, because
 * in win32_port.h we redefine the network-related Berkeley error symbols to
 * have the values of their WSAxxx counterparts.  The point of the switch is
 * mostly to translate near-miss error codes into something that's sensible
 * in the Berkeley universe.
 *
 * Codes with no mapping are reported at NOTICE level and turned into EINVAL.
 */
static void
TranslateSocketError(void)
{
	/* Fetch the code once so the switch and the NOTICE agree on its value */
	int			wsaerr = WSAGetLastError();

	switch (wsaerr)
	{
		case WSAEINVAL:
		case WSANOTINITIALISED:
		case WSAEINVALIDPROVIDER:
		case WSAEINVALIDPROCTABLE:
		case WSAEDESTADDRREQ:
			errno = EINVAL;
			break;
		case WSAEINPROGRESS:
			errno = EINPROGRESS;
			break;
		case WSAEFAULT:
			errno = EFAULT;
			break;
		case WSAEISCONN:
			errno = EISCONN;
			break;
		case WSAEMSGSIZE:
			errno = EMSGSIZE;
			break;
		case WSAEAFNOSUPPORT:
			errno = EAFNOSUPPORT;
			break;
		case WSAEMFILE:
			errno = EMFILE;
			break;
		case WSAENOBUFS:
			errno = ENOBUFS;
			break;
		case WSAEPROTONOSUPPORT:
		case WSAEPROTOTYPE:
		case WSAESOCKTNOSUPPORT:
			errno = EPROTONOSUPPORT;
			break;
		case WSAECONNABORTED:
			errno = ECONNABORTED;
			break;
		case WSAECONNREFUSED:
			errno = ECONNREFUSED;
			break;
		case WSAECONNRESET:
			errno = ECONNRESET;
			break;
		case WSAEINTR:
			errno = EINTR;
			break;
		case WSAENOTSOCK:
			errno = ENOTSOCK;
			break;
		case WSAEOPNOTSUPP:
			errno = EOPNOTSUPP;
			break;
		case WSAEWOULDBLOCK:
			errno = EWOULDBLOCK;
			break;
		case WSAEACCES:
			errno = EACCES;
			break;
		case WSAEADDRINUSE:
			errno = EADDRINUSE;
			break;
		case WSAEADDRNOTAVAIL:
			errno = EADDRNOTAVAIL;
			break;
		case WSAEHOSTDOWN:
			errno = EHOSTDOWN;
			break;
		case WSAEHOSTUNREACH:
		case WSAHOST_NOT_FOUND:
			errno = EHOSTUNREACH;
			break;
		case WSAENETDOWN:
			errno = ENETDOWN;
			break;
		case WSAENETUNREACH:
			errno = ENETUNREACH;
			break;
		case WSAENETRESET:
			errno = ENETRESET;
			break;
		case WSAENOTCONN:
		case WSAESHUTDOWN:
		case WSAEDISCON:
			errno = ENOTCONN;
			break;
		case WSAETIMEDOUT:
			errno = ETIMEDOUT;
			break;
		default:
			ereport(NOTICE,
					(errmsg_internal("unrecognized win32 socket error code: %d",
									 wsaerr)));
			errno = EINVAL;
			break;
	}
}
155 
/*
 * Deliver any signals that are queued and not blocked.
 *
 * Returns 1 with errno set to EINTR if signals were dispatched, so that
 * callers can fail their operation the way an interrupted system call
 * would; returns 0 if there was nothing to deliver.
 */
static int
pgwin32_poll_signals(void)
{
	if (UNBLOCKED_SIGNAL_QUEUE())
	{
		pgwin32_dispatch_queued_signals();
		errno = EINTR;
		return 1;
	}
	return 0;
}
167 
168 static int
169 isDataGram(SOCKET s)
170 {
171  int type;
172  int typelen = sizeof(type);
173 
174  if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &type, &typelen))
175  return 1;
176 
177  return (type == SOCK_DGRAM) ? 1 : 0;
178 }
179 
180 int
181 pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
182 {
183  static HANDLE waitevent = INVALID_HANDLE_VALUE;
184  static SOCKET current_socket = INVALID_SOCKET;
185  static int isUDP = 0;
186  HANDLE events[2];
187  int r;
188 
189  /* Create an event object just once and use it on all future calls */
190  if (waitevent == INVALID_HANDLE_VALUE)
191  {
192  waitevent = CreateEvent(NULL, TRUE, FALSE, NULL);
193 
194  if (waitevent == INVALID_HANDLE_VALUE)
195  ereport(ERROR,
196  (errmsg_internal("could not create socket waiting event: error code %lu", GetLastError())));
197  }
198  else if (!ResetEvent(waitevent))
199  ereport(ERROR,
200  (errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError())));
201 
202  /*
203  * Track whether socket is UDP or not. (NB: most likely, this is both
204  * useless and wrong; there is no reason to think that the behavior of
205  * WSAEventSelect is different for TCP and UDP.)
206  */
207  if (current_socket != s)
208  isUDP = isDataGram(s);
209  current_socket = s;
210 
211  /*
212  * Attach event to socket. NOTE: we must detach it again before
213  * returning, since other bits of code may try to attach other events to
214  * the socket.
215  */
216  if (WSAEventSelect(s, waitevent, what) != 0)
217  {
219  return 0;
220  }
221 
222  events[0] = pgwin32_signal_event;
223  events[1] = waitevent;
224 
225  /*
226  * Just a workaround of unknown locking problem with writing in UDP socket
227  * under high load: Client's pgsql backend sleeps infinitely in
228  * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select().
229  * So, we will wait with small timeout(0.1 sec) and if socket is still
230  * blocked, try WSASend (see comments in pgwin32_select) and wait again.
231  */
232  if ((what & FD_WRITE) && isUDP)
233  {
234  for (;;)
235  {
236  r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE);
237 
238  if (r == WAIT_TIMEOUT)
239  {
240  char c;
241  WSABUF buf;
242  DWORD sent;
243 
244  buf.buf = &c;
245  buf.len = 0;
246 
247  r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL);
248  if (r == 0) /* Completed - means things are fine! */
249  {
250  WSAEventSelect(s, NULL, 0);
251  return 1;
252  }
253  else if (WSAGetLastError() != WSAEWOULDBLOCK)
254  {
256  WSAEventSelect(s, NULL, 0);
257  return 0;
258  }
259  }
260  else
261  break;
262  }
263  }
264  else
265  r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE);
266 
267  WSAEventSelect(s, NULL, 0);
268 
269  if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION)
270  {
272  errno = EINTR;
273  return 0;
274  }
275  if (r == WAIT_OBJECT_0 + 1)
276  return 1;
277  if (r == WAIT_TIMEOUT)
278  {
279  errno = EWOULDBLOCK;
280  return 0;
281  }
282  ereport(ERROR,
283  (errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError())));
284  return 0;
285 }
286 
287 /*
288  * Create a socket, setting it to overlapped and non-blocking
289  */
290 SOCKET
291 pgwin32_socket(int af, int type, int protocol)
292 {
293  SOCKET s;
294  unsigned long on = 1;
295 
296  s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED);
297  if (s == INVALID_SOCKET)
298  {
300  return INVALID_SOCKET;
301  }
302 
303  if (ioctlsocket(s, FIONBIO, &on))
304  {
306  closesocket(s);
307  return INVALID_SOCKET;
308  }
309  errno = 0;
310 
311  return s;
312 }
313 
314 int
315 pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
316 {
317  int res;
318 
319  res = bind(s, addr, addrlen);
320  if (res < 0)
322  return res;
323 }
324 
325 int
326 pgwin32_listen(SOCKET s, int backlog)
327 {
328  int res;
329 
330  res = listen(s, backlog);
331  if (res < 0)
333  return res;
334 }
335 
336 SOCKET
337 pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
338 {
339  SOCKET rs;
340 
341  /*
342  * Poll for signals, but don't return with EINTR, since we don't handle
343  * that in pqcomm.c
344  */
346 
347  rs = WSAAccept(s, addr, addrlen, NULL, 0);
348  if (rs == INVALID_SOCKET)
349  {
351  return INVALID_SOCKET;
352  }
353  return rs;
354 }
355 
356 
357 /* No signal delivery during connect. */
358 int
359 pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
360 {
361  int r;
362 
363  r = WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL);
364  if (r == 0)
365  return 0;
366 
367  if (WSAGetLastError() != WSAEWOULDBLOCK)
368  {
370  return -1;
371  }
372 
373  while (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) == 0)
374  {
375  /* Loop endlessly as long as we are just delivering signals */
376  }
377 
378  return 0;
379 }
380 
381 int
382 pgwin32_recv(SOCKET s, char *buf, int len, int f)
383 {
384  WSABUF wbuf;
385  int r;
386  DWORD b;
387  DWORD flags = f;
388  int n;
389 
390  if (pgwin32_poll_signals())
391  return -1;
392 
393  wbuf.len = len;
394  wbuf.buf = buf;
395 
396  r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
397  if (r != SOCKET_ERROR)
398  return b; /* success */
399 
400  if (WSAGetLastError() != WSAEWOULDBLOCK)
401  {
403  return -1;
404  }
405 
406  if (pgwin32_noblock)
407  {
408  /*
409  * No data received, and we are in "emulated non-blocking mode", so
410  * return indicating that we'd block if we were to continue.
411  */
412  errno = EWOULDBLOCK;
413  return -1;
414  }
415 
416  /* We're in blocking mode, so wait for data */
417 
418  for (n = 0; n < 5; n++)
419  {
420  if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT,
421  INFINITE) == 0)
422  return -1; /* errno already set */
423 
424  r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
425  if (r != SOCKET_ERROR)
426  return b; /* success */
427  if (WSAGetLastError() != WSAEWOULDBLOCK)
428  {
430  return -1;
431  }
432 
433  /*
434  * There seem to be cases on win2k (at least) where WSARecv can return
435  * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the
436  * socket is readable. In this case, just sleep for a moment and try
437  * again. We try up to 5 times - if it fails more than that it's not
438  * likely to ever come back.
439  */
440  pg_usleep(10000);
441  }
442  ereport(NOTICE,
443  (errmsg_internal("could not read from ready socket (after retries)")));
444  errno = EWOULDBLOCK;
445  return -1;
446 }
447 
448 /*
449  * The second argument to send() is defined by SUS to be a "const void *"
450  * and so we use the same signature here to keep compilers happy when
451  * handling callers.
452  *
453  * But the buf member of a WSABUF struct is defined as "char *", so we cast
454  * the second argument to that here when assigning it, also to keep compilers
455  * happy.
456  */
457 
458 int
459 pgwin32_send(SOCKET s, const void *buf, int len, int flags)
460 {
461  WSABUF wbuf;
462  int r;
463  DWORD b;
464 
465  if (pgwin32_poll_signals())
466  return -1;
467 
468  wbuf.len = len;
469  wbuf.buf = (char *) buf;
470 
471  /*
472  * Readiness of socket to send data to UDP socket may be not true: socket
473  * can become busy again! So loop until send or error occurs.
474  */
475  for (;;)
476  {
477  r = WSASend(s, &wbuf, 1, &b, flags, NULL, NULL);
478  if (r != SOCKET_ERROR && b > 0)
479  /* Write succeeded right away */
480  return b;
481 
482  if (r == SOCKET_ERROR &&
483  WSAGetLastError() != WSAEWOULDBLOCK)
484  {
486  return -1;
487  }
488 
489  if (pgwin32_noblock)
490  {
491  /*
492  * No data sent, and we are in "emulated non-blocking mode", so
493  * return indicating that we'd block if we were to continue.
494  */
495  errno = EWOULDBLOCK;
496  return -1;
497  }
498 
499  /* No error, zero bytes */
500 
501  if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
502  return -1;
503  }
504 
505  return -1;
506 }
507 
508 
509 /*
510  * Wait for activity on one or more sockets.
511  * While waiting, allow signals to run
512  *
513  * NOTE! Currently does not implement exceptfds check,
514  * since it is not used in postgresql!
515  */
516 int
517 pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
518 {
519  WSAEVENT events[FD_SETSIZE * 2]; /* worst case is readfds totally
520  * different from writefds, so
521  * 2*FD_SETSIZE sockets */
522  SOCKET sockets[FD_SETSIZE * 2];
523  int numevents = 0;
524  int i;
525  int r;
526  DWORD timeoutval = WSA_INFINITE;
527  FD_SET outreadfds;
528  FD_SET outwritefds;
529  int nummatches = 0;
530 
531  Assert(exceptfds == NULL);
532 
533  if (pgwin32_poll_signals())
534  return -1;
535 
536  FD_ZERO(&outreadfds);
537  FD_ZERO(&outwritefds);
538 
539  /*
540  * Windows does not guarantee to log an FD_WRITE network event indicating
541  * that more data can be sent unless the previous send() failed with
542  * WSAEWOULDBLOCK. While our caller might well have made such a call, we
543  * cannot assume that here. Therefore, if waiting for write-ready, force
544  * the issue by doing a dummy send(). If the dummy send() succeeds,
545  * assume that the socket is in fact write-ready, and return immediately.
546  * Also, if it fails with something other than WSAEWOULDBLOCK, return a
547  * write-ready indication to let our caller deal with the error condition.
548  */
549  if (writefds != NULL)
550  {
551  for (i = 0; i < writefds->fd_count; i++)
552  {
553  char c;
554  WSABUF buf;
555  DWORD sent;
556 
557  buf.buf = &c;
558  buf.len = 0;
559 
560  r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL);
561  if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
562  FD_SET(writefds->fd_array[i], &outwritefds);
563  }
564 
565  /* If we found any write-ready sockets, just return them immediately */
566  if (outwritefds.fd_count > 0)
567  {
568  memcpy(writefds, &outwritefds, sizeof(fd_set));
569  if (readfds)
570  FD_ZERO(readfds);
571  return outwritefds.fd_count;
572  }
573  }
574 
575 
576  /* Now set up for an actual select */
577 
578  if (timeout != NULL)
579  {
580  /* timeoutval is in milliseconds */
581  timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
582  }
583 
584  if (readfds != NULL)
585  {
586  for (i = 0; i < readfds->fd_count; i++)
587  {
588  events[numevents] = WSACreateEvent();
589  sockets[numevents] = readfds->fd_array[i];
590  numevents++;
591  }
592  }
593  if (writefds != NULL)
594  {
595  for (i = 0; i < writefds->fd_count; i++)
596  {
597  if (!readfds ||
598  !FD_ISSET(writefds->fd_array[i], readfds))
599  {
600  /* If the socket is not in the read list */
601  events[numevents] = WSACreateEvent();
602  sockets[numevents] = writefds->fd_array[i];
603  numevents++;
604  }
605  }
606  }
607 
608  for (i = 0; i < numevents; i++)
609  {
610  int flags = 0;
611 
612  if (readfds && FD_ISSET(sockets[i], readfds))
613  flags |= FD_READ | FD_ACCEPT | FD_CLOSE;
614 
615  if (writefds && FD_ISSET(sockets[i], writefds))
616  flags |= FD_WRITE | FD_CLOSE;
617 
618  if (WSAEventSelect(sockets[i], events[i], flags) != 0)
619  {
621  /* release already-assigned event objects */
622  while (--i >= 0)
623  WSAEventSelect(sockets[i], NULL, 0);
624  for (i = 0; i < numevents; i++)
625  WSACloseEvent(events[i]);
626  return -1;
627  }
628  }
629 
630  events[numevents] = pgwin32_signal_event;
631  r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE);
632  if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents))
633  {
634  /*
635  * We scan all events, even those not signaled, in case more than one
636  * event has been tagged but Wait.. can only return one.
637  */
638  WSANETWORKEVENTS resEvents;
639 
640  for (i = 0; i < numevents; i++)
641  {
642  ZeroMemory(&resEvents, sizeof(resEvents));
643  if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0)
644  elog(ERROR, "failed to enumerate network events: error code %d",
645  WSAGetLastError());
646  /* Read activity? */
647  if (readfds && FD_ISSET(sockets[i], readfds))
648  {
649  if ((resEvents.lNetworkEvents & FD_READ) ||
650  (resEvents.lNetworkEvents & FD_ACCEPT) ||
651  (resEvents.lNetworkEvents & FD_CLOSE))
652  {
653  FD_SET(sockets[i], &outreadfds);
654 
655  nummatches++;
656  }
657  }
658  /* Write activity? */
659  if (writefds && FD_ISSET(sockets[i], writefds))
660  {
661  if ((resEvents.lNetworkEvents & FD_WRITE) ||
662  (resEvents.lNetworkEvents & FD_CLOSE))
663  {
664  FD_SET(sockets[i], &outwritefds);
665 
666  nummatches++;
667  }
668  }
669  }
670  }
671 
672  /* Clean up all the event objects */
673  for (i = 0; i < numevents; i++)
674  {
675  WSAEventSelect(sockets[i], NULL, 0);
676  WSACloseEvent(events[i]);
677  }
678 
679  if (r == WSA_WAIT_TIMEOUT)
680  {
681  if (readfds)
682  FD_ZERO(readfds);
683  if (writefds)
684  FD_ZERO(writefds);
685  return 0;
686  }
687 
688  /* Signal-like events. */
689  if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION)
690  {
692  errno = EINTR;
693  if (readfds)
694  FD_ZERO(readfds);
695  if (writefds)
696  FD_ZERO(writefds);
697  return -1;
698  }
699 
700  /* Overwrite socket sets with our resulting values */
701  if (readfds)
702  memcpy(readfds, &outreadfds, sizeof(fd_set));
703  if (writefds)
704  memcpy(writefds, &outwritefds, sizeof(fd_set));
705  return nummatches;
706 }
#define Assert(condition)
Definition: c.h:861
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define NOTICE
Definition: elog.h:35
#define ereport(elevel,...)
Definition: elog.h:149
int b
Definition: isn.c:70
int i
Definition: isn.c:73
const void size_t len
static char * buf
Definition: pg_test_fsync.c:73
#define closesocket
Definition: port.h:349
char * c
void pg_usleep(long microsec)
Definition: signal.c:53
void pgwin32_dispatch_queued_signals(void)
Definition: signal.c:120
HANDLE pgwin32_signal_event
Definition: signal.c:27
int pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
Definition: socket.c:517
int pgwin32_recv(SOCKET s, char *buf, int len, int f)
Definition: socket.c:382
int pgwin32_send(SOCKET s, const void *buf, int len, int flags)
Definition: socket.c:459
int pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
Definition: socket.c:359
int pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
Definition: socket.c:181
static int pgwin32_poll_signals(void)
Definition: socket.c:157
SOCKET pgwin32_socket(int af, int type, int protocol)
Definition: socket.c:291
static int isDataGram(SOCKET s)
Definition: socket.c:169
static void TranslateSocketError(void)
Definition: socket.c:56
SOCKET pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
Definition: socket.c:337
int pgwin32_noblock
Definition: socket.c:28
int pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
Definition: socket.c:315
int pgwin32_listen(SOCKET s, int backlog)
Definition: socket.c:326
const char * type
#define UNBLOCKED_SIGNAL_QUEUE()
Definition: win32_port.h:494
#define EISCONN
Definition: win32_port.h:388
#define bind(s, addr, addrlen)
Definition: win32_port.h:509
#define ENETUNREACH
Definition: win32_port.h:412
#define ECONNABORTED
Definition: win32_port.h:382
#define EINTR
Definition: win32_port.h:374
#define EWOULDBLOCK
Definition: win32_port.h:380
#define EOPNOTSUPP
Definition: win32_port.h:398
#define EAFNOSUPPORT
Definition: win32_port.h:378
#define EHOSTUNREACH
Definition: win32_port.h:406
#define EADDRNOTAVAIL
Definition: win32_port.h:402
#define ETIMEDOUT
Definition: win32_port.h:416
#define EADDRINUSE
Definition: win32_port.h:400
#define EINPROGRESS
Definition: win32_port.h:386
#define ENETRESET
Definition: win32_port.h:410
#define ENOBUFS
Definition: win32_port.h:390
#define EHOSTDOWN
Definition: win32_port.h:404
#define ENETDOWN
Definition: win32_port.h:408
#define ECONNREFUSED
Definition: win32_port.h:394
#define EPROTONOSUPPORT
Definition: win32_port.h:392
#define ECONNRESET
Definition: win32_port.h:384
#define ENOTSOCK
Definition: win32_port.h:396
#define EMSGSIZE
Definition: win32_port.h:376
#define listen(s, backlog)
Definition: win32_port.h:510
#define ENOTCONN
Definition: win32_port.h:414