PostgreSQL Source Code  git master
socket.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * socket.c
4  * Microsoft Windows Win32 Socket Functions
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/backend/port/win32/socket.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 
14 #include "postgres.h"
15 
/*
 * Indicate if pgwin32_recv() and pgwin32_send() should operate
 * in non-blocking mode.
 *
 * Since the socket emulation layer always sets the actual socket to
 * non-blocking mode in order to be able to deliver signals, we must
 * specify this in a separate flag if we actually need non-blocking
 * operation.
 *
 * This flag changes the behaviour *globally* for all socket operations,
 * so it should only be set for very short periods of time.
 */
int			pgwin32_noblock = 0;

30 /* Undef the macros defined in win32.h, so we can access system functions */
31 #undef socket
32 #undef bind
33 #undef listen
34 #undef accept
35 #undef connect
36 #undef select
37 #undef recv
38 #undef send
39 
40 /*
41  * Blocking socket functions implemented so they listen on both
42  * the socket and the signal event, required for signal handling.
43  */
44 
/*
 * Convert the last socket error code into errno
 *
 * Note: where there is a direct correspondence between a WSAxxx error code
 * and a Berkeley error symbol, this mapping is actually a no-op, because
 * in win32.h we redefine the network-related Berkeley error symbols to have
 * the values of their WSAxxx counterparts.  The point of the switch is
 * mostly to translate near-miss error codes into something that's sensible
 * in the Berkeley universe.
 *
 * Unrecognized codes are reported at NOTICE level and mapped to EINVAL.
 */
static void
TranslateSocketError(void)
{
	/*
	 * Fetch the code once, so that the value we switch on is exactly the
	 * value we report in the fallback message below.
	 */
	int			wsaerr = WSAGetLastError();

	switch (wsaerr)
	{
		case WSAEINVAL:
		case WSANOTINITIALISED:
		case WSAEINVALIDPROVIDER:
		case WSAEINVALIDPROCTABLE:
		case WSAEDESTADDRREQ:
			errno = EINVAL;
			break;
		case WSAEINPROGRESS:
			errno = EINPROGRESS;
			break;
		case WSAEFAULT:
			errno = EFAULT;
			break;
		case WSAEISCONN:
			errno = EISCONN;
			break;
		case WSAEMSGSIZE:
			errno = EMSGSIZE;
			break;
		case WSAEAFNOSUPPORT:
			errno = EAFNOSUPPORT;
			break;
		case WSAEMFILE:
			errno = EMFILE;
			break;
		case WSAENOBUFS:
			errno = ENOBUFS;
			break;
		case WSAEPROTONOSUPPORT:
		case WSAEPROTOTYPE:
		case WSAESOCKTNOSUPPORT:
			errno = EPROTONOSUPPORT;
			break;
		case WSAECONNABORTED:
			errno = ECONNABORTED;
			break;
		case WSAECONNREFUSED:
			errno = ECONNREFUSED;
			break;
		case WSAECONNRESET:
			errno = ECONNRESET;
			break;
		case WSAEINTR:
			errno = EINTR;
			break;
		case WSAENOTSOCK:
			errno = ENOTSOCK;
			break;
		case WSAEOPNOTSUPP:
			errno = EOPNOTSUPP;
			break;
		case WSAEWOULDBLOCK:
			errno = EWOULDBLOCK;
			break;
		case WSAEACCES:
			errno = EACCES;
			break;
		case WSAEADDRINUSE:
			errno = EADDRINUSE;
			break;
		case WSAEADDRNOTAVAIL:
			errno = EADDRNOTAVAIL;
			break;
		case WSAEHOSTDOWN:
			errno = EHOSTDOWN;
			break;
		case WSAEHOSTUNREACH:
		case WSAHOST_NOT_FOUND:
			errno = EHOSTUNREACH;
			break;
		case WSAENETDOWN:
			errno = ENETDOWN;
			break;
		case WSAENETUNREACH:
			errno = ENETUNREACH;
			break;
		case WSAENETRESET:
			errno = ENETRESET;
			break;
		case WSAENOTCONN:
		case WSAESHUTDOWN:
		case WSAEDISCON:
			errno = ENOTCONN;
			break;
		default:
			ereport(NOTICE,
					(errmsg_internal("unrecognized win32 socket error code: %d", wsaerr)));
			errno = EINVAL;
			break;
	}
}
150 
/*
 * Dispatch queued signals, if any are pending.
 *
 * Returns 1 with errno set to EINTR when UNBLOCKED_SIGNAL_QUEUE() was
 * nonzero (the queued signals have been dispatched by then), else 0 with
 * errno untouched.  Callers that must not fail with EINTR can simply ignore
 * the result.
 */
static int
pgwin32_poll_signals(void)
{
	if (UNBLOCKED_SIGNAL_QUEUE())
	{
		pgwin32_dispatch_queued_signals();
		errno = EINTR;
		return 1;
	}
	return 0;
}
162 
163 static int
164 isDataGram(SOCKET s)
165 {
166  int type;
167  int typelen = sizeof(type);
168 
169  if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &type, &typelen))
170  return 1;
171 
172  return (type == SOCK_DGRAM) ? 1 : 0;
173 }
174 
175 int
176 pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
177 {
178  static HANDLE waitevent = INVALID_HANDLE_VALUE;
179  static SOCKET current_socket = INVALID_SOCKET;
180  static int isUDP = 0;
181  HANDLE events[2];
182  int r;
183 
184  /* Create an event object just once and use it on all future calls */
185  if (waitevent == INVALID_HANDLE_VALUE)
186  {
187  waitevent = CreateEvent(NULL, TRUE, FALSE, NULL);
188 
189  if (waitevent == INVALID_HANDLE_VALUE)
190  ereport(ERROR,
191  (errmsg_internal("could not create socket waiting event: error code %lu", GetLastError())));
192  }
193  else if (!ResetEvent(waitevent))
194  ereport(ERROR,
195  (errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError())));
196 
197  /*
198  * Track whether socket is UDP or not. (NB: most likely, this is both
199  * useless and wrong; there is no reason to think that the behavior of
200  * WSAEventSelect is different for TCP and UDP.)
201  */
202  if (current_socket != s)
203  isUDP = isDataGram(s);
204  current_socket = s;
205 
206  /*
207  * Attach event to socket. NOTE: we must detach it again before
208  * returning, since other bits of code may try to attach other events to
209  * the socket.
210  */
211  if (WSAEventSelect(s, waitevent, what) != 0)
212  {
214  return 0;
215  }
216 
217  events[0] = pgwin32_signal_event;
218  events[1] = waitevent;
219 
220  /*
221  * Just a workaround of unknown locking problem with writing in UDP socket
222  * under high load: Client's pgsql backend sleeps infinitely in
223  * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select().
224  * So, we will wait with small timeout(0.1 sec) and if socket is still
225  * blocked, try WSASend (see comments in pgwin32_select) and wait again.
226  */
227  if ((what & FD_WRITE) && isUDP)
228  {
229  for (;;)
230  {
231  r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE);
232 
233  if (r == WAIT_TIMEOUT)
234  {
235  char c;
236  WSABUF buf;
237  DWORD sent;
238 
239  buf.buf = &c;
240  buf.len = 0;
241 
242  r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL);
243  if (r == 0) /* Completed - means things are fine! */
244  {
245  WSAEventSelect(s, NULL, 0);
246  return 1;
247  }
248  else if (WSAGetLastError() != WSAEWOULDBLOCK)
249  {
251  WSAEventSelect(s, NULL, 0);
252  return 0;
253  }
254  }
255  else
256  break;
257  }
258  }
259  else
260  r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE);
261 
262  WSAEventSelect(s, NULL, 0);
263 
264  if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION)
265  {
267  errno = EINTR;
268  return 0;
269  }
270  if (r == WAIT_OBJECT_0 + 1)
271  return 1;
272  if (r == WAIT_TIMEOUT)
273  {
274  errno = EWOULDBLOCK;
275  return 0;
276  }
277  ereport(ERROR,
278  (errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError())));
279  return 0;
280 }
281 
282 /*
283  * Create a socket, setting it to overlapped and non-blocking
284  */
285 SOCKET
286 pgwin32_socket(int af, int type, int protocol)
287 {
288  SOCKET s;
289  unsigned long on = 1;
290 
291  s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED);
292  if (s == INVALID_SOCKET)
293  {
295  return INVALID_SOCKET;
296  }
297 
298  if (ioctlsocket(s, FIONBIO, &on))
299  {
301  return INVALID_SOCKET;
302  }
303  errno = 0;
304 
305  return s;
306 }
307 
308 int
309 pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
310 {
311  int res;
312 
313  res = bind(s, addr, addrlen);
314  if (res < 0)
316  return res;
317 }
318 
319 int
320 pgwin32_listen(SOCKET s, int backlog)
321 {
322  int res;
323 
324  res = listen(s, backlog);
325  if (res < 0)
327  return res;
328 }
329 
330 SOCKET
331 pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
332 {
333  SOCKET rs;
334 
335  /*
336  * Poll for signals, but don't return with EINTR, since we don't handle
337  * that in pqcomm.c
338  */
340 
341  rs = WSAAccept(s, addr, addrlen, NULL, 0);
342  if (rs == INVALID_SOCKET)
343  {
345  return INVALID_SOCKET;
346  }
347  return rs;
348 }
349 
350 
351 /* No signal delivery during connect. */
352 int
353 pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
354 {
355  int r;
356 
357  r = WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL);
358  if (r == 0)
359  return 0;
360 
361  if (WSAGetLastError() != WSAEWOULDBLOCK)
362  {
364  return -1;
365  }
366 
367  while (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) == 0)
368  {
369  /* Loop endlessly as long as we are just delivering signals */
370  }
371 
372  return 0;
373 }
374 
375 int
376 pgwin32_recv(SOCKET s, char *buf, int len, int f)
377 {
378  WSABUF wbuf;
379  int r;
380  DWORD b;
381  DWORD flags = f;
382  int n;
383 
384  if (pgwin32_poll_signals())
385  return -1;
386 
387  wbuf.len = len;
388  wbuf.buf = buf;
389 
390  r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
391  if (r != SOCKET_ERROR)
392  return b; /* success */
393 
394  if (WSAGetLastError() != WSAEWOULDBLOCK)
395  {
397  return -1;
398  }
399 
400  if (pgwin32_noblock)
401  {
402  /*
403  * No data received, and we are in "emulated non-blocking mode", so
404  * return indicating that we'd block if we were to continue.
405  */
406  errno = EWOULDBLOCK;
407  return -1;
408  }
409 
410  /* We're in blocking mode, so wait for data */
411 
412  for (n = 0; n < 5; n++)
413  {
414  if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT,
415  INFINITE) == 0)
416  return -1; /* errno already set */
417 
418  r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
419  if (r != SOCKET_ERROR)
420  return b; /* success */
421  if (WSAGetLastError() != WSAEWOULDBLOCK)
422  {
424  return -1;
425  }
426 
427  /*
428  * There seem to be cases on win2k (at least) where WSARecv can return
429  * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the
430  * socket is readable. In this case, just sleep for a moment and try
431  * again. We try up to 5 times - if it fails more than that it's not
432  * likely to ever come back.
433  */
434  pg_usleep(10000);
435  }
436  ereport(NOTICE,
437  (errmsg_internal("could not read from ready socket (after retries)")));
438  errno = EWOULDBLOCK;
439  return -1;
440 }
441 
442 /*
443  * The second argument to send() is defined by SUS to be a "const void *"
444  * and so we use the same signature here to keep compilers happy when
445  * handling callers.
446  *
447  * But the buf member of a WSABUF struct is defined as "char *", so we cast
448  * the second argument to that here when assigning it, also to keep compilers
449  * happy.
450  */
451 
452 int
453 pgwin32_send(SOCKET s, const void *buf, int len, int flags)
454 {
455  WSABUF wbuf;
456  int r;
457  DWORD b;
458 
459  if (pgwin32_poll_signals())
460  return -1;
461 
462  wbuf.len = len;
463  wbuf.buf = (char *) buf;
464 
465  /*
466  * Readiness of socket to send data to UDP socket may be not true: socket
467  * can become busy again! So loop until send or error occurs.
468  */
469  for (;;)
470  {
471  r = WSASend(s, &wbuf, 1, &b, flags, NULL, NULL);
472  if (r != SOCKET_ERROR && b > 0)
473  /* Write succeeded right away */
474  return b;
475 
476  if (r == SOCKET_ERROR &&
477  WSAGetLastError() != WSAEWOULDBLOCK)
478  {
480  return -1;
481  }
482 
483  if (pgwin32_noblock)
484  {
485  /*
486  * No data sent, and we are in "emulated non-blocking mode", so
487  * return indicating that we'd block if we were to continue.
488  */
489  errno = EWOULDBLOCK;
490  return -1;
491  }
492 
493  /* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
494 
495  if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
496  return -1;
497  }
498 
499  return -1;
500 }
501 
502 
503 /*
504  * Wait for activity on one or more sockets.
505  * While waiting, allow signals to run
506  *
507  * NOTE! Currently does not implement exceptfds check,
508  * since it is not used in postgresql!
509  */
510 int
511 pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
512 {
513  WSAEVENT events[FD_SETSIZE * 2]; /* worst case is readfds totally
514  * different from writefds, so
515  * 2*FD_SETSIZE sockets */
516  SOCKET sockets[FD_SETSIZE * 2];
517  int numevents = 0;
518  int i;
519  int r;
520  DWORD timeoutval = WSA_INFINITE;
521  FD_SET outreadfds;
522  FD_SET outwritefds;
523  int nummatches = 0;
524 
525  Assert(exceptfds == NULL);
526 
527  if (pgwin32_poll_signals())
528  return -1;
529 
530  FD_ZERO(&outreadfds);
531  FD_ZERO(&outwritefds);
532 
533  /*
534  * Windows does not guarantee to log an FD_WRITE network event indicating
535  * that more data can be sent unless the previous send() failed with
536  * WSAEWOULDBLOCK. While our caller might well have made such a call, we
537  * cannot assume that here. Therefore, if waiting for write-ready, force
538  * the issue by doing a dummy send(). If the dummy send() succeeds,
539  * assume that the socket is in fact write-ready, and return immediately.
540  * Also, if it fails with something other than WSAEWOULDBLOCK, return a
541  * write-ready indication to let our caller deal with the error condition.
542  */
543  if (writefds != NULL)
544  {
545  for (i = 0; i < writefds->fd_count; i++)
546  {
547  char c;
548  WSABUF buf;
549  DWORD sent;
550 
551  buf.buf = &c;
552  buf.len = 0;
553 
554  r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL);
555  if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
556  FD_SET(writefds->fd_array[i], &outwritefds);
557  }
558 
559  /* If we found any write-ready sockets, just return them immediately */
560  if (outwritefds.fd_count > 0)
561  {
562  memcpy(writefds, &outwritefds, sizeof(fd_set));
563  if (readfds)
564  FD_ZERO(readfds);
565  return outwritefds.fd_count;
566  }
567  }
568 
569 
570  /* Now set up for an actual select */
571 
572  if (timeout != NULL)
573  {
574  /* timeoutval is in milliseconds */
575  timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
576  }
577 
578  if (readfds != NULL)
579  {
580  for (i = 0; i < readfds->fd_count; i++)
581  {
582  events[numevents] = WSACreateEvent();
583  sockets[numevents] = readfds->fd_array[i];
584  numevents++;
585  }
586  }
587  if (writefds != NULL)
588  {
589  for (i = 0; i < writefds->fd_count; i++)
590  {
591  if (!readfds ||
592  !FD_ISSET(writefds->fd_array[i], readfds))
593  {
594  /* If the socket is not in the read list */
595  events[numevents] = WSACreateEvent();
596  sockets[numevents] = writefds->fd_array[i];
597  numevents++;
598  }
599  }
600  }
601 
602  for (i = 0; i < numevents; i++)
603  {
604  int flags = 0;
605 
606  if (readfds && FD_ISSET(sockets[i], readfds))
607  flags |= FD_READ | FD_ACCEPT | FD_CLOSE;
608 
609  if (writefds && FD_ISSET(sockets[i], writefds))
610  flags |= FD_WRITE | FD_CLOSE;
611 
612  if (WSAEventSelect(sockets[i], events[i], flags) != 0)
613  {
615  /* release already-assigned event objects */
616  while (--i >= 0)
617  WSAEventSelect(sockets[i], NULL, 0);
618  for (i = 0; i < numevents; i++)
619  WSACloseEvent(events[i]);
620  return -1;
621  }
622  }
623 
624  events[numevents] = pgwin32_signal_event;
625  r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE);
626  if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents))
627  {
628  /*
629  * We scan all events, even those not signaled, in case more than one
630  * event has been tagged but Wait.. can only return one.
631  */
632  WSANETWORKEVENTS resEvents;
633 
634  for (i = 0; i < numevents; i++)
635  {
636  ZeroMemory(&resEvents, sizeof(resEvents));
637  if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0)
638  elog(ERROR, "failed to enumerate network events: error code %u",
639  WSAGetLastError());
640  /* Read activity? */
641  if (readfds && FD_ISSET(sockets[i], readfds))
642  {
643  if ((resEvents.lNetworkEvents & FD_READ) ||
644  (resEvents.lNetworkEvents & FD_ACCEPT) ||
645  (resEvents.lNetworkEvents & FD_CLOSE))
646  {
647  FD_SET(sockets[i], &outreadfds);
648 
649  nummatches++;
650  }
651  }
652  /* Write activity? */
653  if (writefds && FD_ISSET(sockets[i], writefds))
654  {
655  if ((resEvents.lNetworkEvents & FD_WRITE) ||
656  (resEvents.lNetworkEvents & FD_CLOSE))
657  {
658  FD_SET(sockets[i], &outwritefds);
659 
660  nummatches++;
661  }
662  }
663  }
664  }
665 
666  /* Clean up all the event objects */
667  for (i = 0; i < numevents; i++)
668  {
669  WSAEventSelect(sockets[i], NULL, 0);
670  WSACloseEvent(events[i]);
671  }
672 
673  if (r == WSA_WAIT_TIMEOUT)
674  {
675  if (readfds)
676  FD_ZERO(readfds);
677  if (writefds)
678  FD_ZERO(writefds);
679  return 0;
680  }
681 
682  /* Signal-like events. */
683  if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION)
684  {
686  errno = EINTR;
687  if (readfds)
688  FD_ZERO(readfds);
689  if (writefds)
690  FD_ZERO(writefds);
691  return -1;
692  }
693 
694  /* Overwrite socket sets with our resulting values */
695  if (readfds)
696  memcpy(readfds, &outreadfds, sizeof(fd_set));
697  if (writefds)
698  memcpy(writefds, &outwritefds, sizeof(fd_set));
699  return nummatches;
700 }
#define ENETDOWN
Definition: win32_port.h:377
#define ECONNABORTED
Definition: win32_port.h:351
#define EHOSTDOWN
Definition: win32_port.h:373
#define ENETRESET
Definition: win32_port.h:379
int pgwin32_recv(SOCKET s, char *buf, int len, int f)
Definition: socket.c:376
#define ENOTSOCK
Definition: win32_port.h:365
int pgwin32_send(SOCKET s, const void *buf, int len, int flags)
Definition: socket.c:453
int pgwin32_noblock
Definition: socket.c:28
SOCKET pgwin32_socket(int af, int type, int protocol)
Definition: socket.c:286
#define bind(s, addr, addrlen)
Definition: win32_port.h:460
#define EHOSTUNREACH
Definition: win32_port.h:375
#define EADDRNOTAVAIL
Definition: win32_port.h:371
#define UNBLOCKED_SIGNAL_QUEUE()
Definition: win32_port.h:445
void pg_usleep(long microsec)
Definition: signal.c:53
HANDLE pgwin32_signal_event
Definition: signal.c:27
void pgwin32_dispatch_queued_signals(void)
Definition: signal.c:108
#define ERROR
Definition: elog.h:43
static int pgwin32_poll_signals(void)
Definition: socket.c:152
SOCKET pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
Definition: socket.c:331
#define ECONNREFUSED
Definition: win32_port.h:363
#define ENOTCONN
Definition: win32_port.h:383
char * c
static char * buf
Definition: pg_test_fsync.c:68
#define EISCONN
Definition: win32_port.h:357
int pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
Definition: socket.c:176
static void TranslateSocketError(void)
Definition: socket.c:56
#define listen(s, backlog)
Definition: win32_port.h:461
int pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
Definition: socket.c:353
static int isDataGram(SOCKET s)
Definition: socket.c:164
#define EAFNOSUPPORT
Definition: win32_port.h:347
#define ereport(elevel,...)
Definition: elog.h:155
#define NOTICE
Definition: elog.h:37
int errmsg_internal(const char *fmt,...)
Definition: elog.c:989
int pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
Definition: socket.c:309
#define EPROTONOSUPPORT
Definition: win32_port.h:361
#define Assert(condition)
Definition: c.h:800
#define ENETUNREACH
Definition: win32_port.h:381
int pgwin32_listen(SOCKET s, int backlog)
Definition: socket.c:320
#define EADDRINUSE
Definition: win32_port.h:369
#define ECONNRESET
Definition: win32_port.h:353
#define EOPNOTSUPP
Definition: win32_port.h:367
#define ENOBUFS
Definition: win32_port.h:359
#define elog(elevel,...)
Definition: elog.h:228
#define EINPROGRESS
Definition: win32_port.h:355
int i
int pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
Definition: socket.c:511
#define EWOULDBLOCK
Definition: win32_port.h:349
#define EINTR
Definition: win32_port.h:343
#define EMSGSIZE
Definition: win32_port.h:345