PostgreSQL Source Code git master
socket.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * socket.c
4 * Microsoft Windows Win32 Socket Functions
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/backend/port/win32/socket.c
10 *
11 *-------------------------------------------------------------------------
12 */
13
14#include "postgres.h"
15
/*
 * Indicate if pgwin32_recv() and pgwin32_send() should operate
 * in non-blocking mode.
 *
 * Since the socket emulation layer always sets the actual socket to
 * non-blocking mode in order to be able to deliver signals, we must
 * specify this in a separate flag if we actually need non-blocking
 * operation.
 *
 * This flag changes the behaviour *globally* for all socket operations,
 * so it should only be set for very short periods of time.
 */
int			pgwin32_noblock = 0;
30/* Undef the macros defined in win32.h, so we can access system functions */
31#undef socket
32#undef bind
33#undef listen
34#undef accept
35#undef connect
36#undef select
37#undef recv
38#undef send
39
40/*
41 * Blocking socket functions implemented so they listen on both
42 * the socket and the signal event, required for signal handling.
43 */
44
/*
 * Convert the last socket error code into errno
 *
 * Note: where there is a direct correspondence between a WSAxxx error code
 * and a Berkeley error symbol, this mapping is actually a no-op, because
 * in win32_port.h we redefine the network-related Berkeley error symbols to
 * have the values of their WSAxxx counterparts.  The point of the switch is
 * mostly to translate near-miss error codes into something that's sensible
 * in the Berkeley universe.
 *
 * Codes with no mapping are reported at NOTICE level and become EINVAL.
 */
static void
TranslateSocketError(void)
{
	/*
	 * Fetch the code once, so that the value we switch on is also the value
	 * we report for an unrecognized code.
	 */
	int			wsaerr = WSAGetLastError();

	switch (wsaerr)
	{
		case WSAEINVAL:
		case WSANOTINITIALISED:
		case WSAEINVALIDPROVIDER:
		case WSAEINVALIDPROCTABLE:
		case WSAEDESTADDRREQ:
			errno = EINVAL;
			break;
		case WSAEINPROGRESS:
			errno = EINPROGRESS;
			break;
		case WSAEFAULT:
			errno = EFAULT;
			break;
		case WSAEISCONN:
			errno = EISCONN;
			break;
		case WSAEMSGSIZE:
			errno = EMSGSIZE;
			break;
		case WSAEAFNOSUPPORT:
			errno = EAFNOSUPPORT;
			break;
		case WSAEMFILE:
			errno = EMFILE;
			break;
		case WSAENOBUFS:
			errno = ENOBUFS;
			break;
		case WSAEPROTONOSUPPORT:
		case WSAEPROTOTYPE:
		case WSAESOCKTNOSUPPORT:
			errno = EPROTONOSUPPORT;
			break;
		case WSAECONNABORTED:
			errno = ECONNABORTED;
			break;
		case WSAECONNREFUSED:
			errno = ECONNREFUSED;
			break;
		case WSAECONNRESET:
			errno = ECONNRESET;
			break;
		case WSAEINTR:
			errno = EINTR;
			break;
		case WSAENOTSOCK:
			errno = ENOTSOCK;
			break;
		case WSAEOPNOTSUPP:
			errno = EOPNOTSUPP;
			break;
		case WSAEWOULDBLOCK:
			errno = EWOULDBLOCK;
			break;
		case WSAEACCES:
			errno = EACCES;
			break;
		case WSAEADDRINUSE:
			errno = EADDRINUSE;
			break;
		case WSAEADDRNOTAVAIL:
			errno = EADDRNOTAVAIL;
			break;
		case WSAEHOSTDOWN:
			errno = EHOSTDOWN;
			break;
		case WSAEHOSTUNREACH:
		case WSAHOST_NOT_FOUND:
			errno = EHOSTUNREACH;
			break;
		case WSAENETDOWN:
			errno = ENETDOWN;
			break;
		case WSAENETUNREACH:
			errno = ENETUNREACH;
			break;
		case WSAENETRESET:
			errno = ENETRESET;
			break;
		case WSAENOTCONN:
		case WSAESHUTDOWN:
		case WSAEDISCON:
			errno = ENOTCONN;
			break;
		case WSAETIMEDOUT:
			errno = ETIMEDOUT;
			break;
		default:
			ereport(NOTICE,
					(errmsg_internal("unrecognized win32 socket error code: %d",
									 wsaerr)));
			errno = EINVAL;
			break;
	}
}
155
/*
 * Dispatch queued signals, if UNBLOCKED_SIGNAL_QUEUE() says any are waiting.
 *
 * Returns 1 with errno set to EINTR when signals were dispatched, so that
 * callers can abandon the socket operation; returns 0 otherwise.
 */
static int
pgwin32_poll_signals(void)
{
	if (UNBLOCKED_SIGNAL_QUEUE())
	{
		pgwin32_dispatch_queued_signals();
		errno = EINTR;
		return 1;
	}
	return 0;
}
167
168static int
169isDataGram(SOCKET s)
170{
171 int type;
172 int typelen = sizeof(type);
173
174 if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &type, &typelen))
175 return 1;
176
177 return (type == SOCK_DGRAM) ? 1 : 0;
178}
179
180int
181pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
182{
183 static HANDLE waitevent = INVALID_HANDLE_VALUE;
184 static SOCKET current_socket = INVALID_SOCKET;
185 static int isUDP = 0;
186 HANDLE events[2];
187 int r;
188
189 /* Create an event object just once and use it on all future calls */
190 if (waitevent == INVALID_HANDLE_VALUE)
191 {
192 waitevent = CreateEvent(NULL, TRUE, FALSE, NULL);
193
194 if (waitevent == INVALID_HANDLE_VALUE)
196 (errmsg_internal("could not create socket waiting event: error code %lu", GetLastError())));
197 }
198 else if (!ResetEvent(waitevent))
200 (errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError())));
201
202 /*
203 * Track whether socket is UDP or not. (NB: most likely, this is both
204 * useless and wrong; there is no reason to think that the behavior of
205 * WSAEventSelect is different for TCP and UDP.)
206 */
207 if (current_socket != s)
208 isUDP = isDataGram(s);
209 current_socket = s;
210
211 /*
212 * Attach event to socket. NOTE: we must detach it again before
213 * returning, since other bits of code may try to attach other events to
214 * the socket.
215 */
216 if (WSAEventSelect(s, waitevent, what) != 0)
217 {
219 return 0;
220 }
221
222 events[0] = pgwin32_signal_event;
223 events[1] = waitevent;
224
225 /*
226 * Just a workaround of unknown locking problem with writing in UDP socket
227 * under high load: Client's pgsql backend sleeps infinitely in
228 * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select().
229 * So, we will wait with small timeout(0.1 sec) and if socket is still
230 * blocked, try WSASend (see comments in pgwin32_select) and wait again.
231 */
232 if ((what & FD_WRITE) && isUDP)
233 {
234 for (;;)
235 {
236 r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE);
237
238 if (r == WAIT_TIMEOUT)
239 {
240 char c;
241 WSABUF buf;
242 DWORD sent;
243
244 buf.buf = &c;
245 buf.len = 0;
246
247 r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL);
248 if (r == 0) /* Completed - means things are fine! */
249 {
250 WSAEventSelect(s, NULL, 0);
251 return 1;
252 }
253 else if (WSAGetLastError() != WSAEWOULDBLOCK)
254 {
256 WSAEventSelect(s, NULL, 0);
257 return 0;
258 }
259 }
260 else
261 break;
262 }
263 }
264 else
265 r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE);
266
267 WSAEventSelect(s, NULL, 0);
268
269 if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION)
270 {
272 errno = EINTR;
273 return 0;
274 }
275 if (r == WAIT_OBJECT_0 + 1)
276 return 1;
277 if (r == WAIT_TIMEOUT)
278 {
279 errno = EWOULDBLOCK;
280 return 0;
281 }
283 (errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError())));
284 return 0;
285}
286
287/*
288 * Create a socket, setting it to overlapped and non-blocking
289 */
290SOCKET
291pgwin32_socket(int af, int type, int protocol)
292{
293 SOCKET s;
294 unsigned long on = 1;
295
296 s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED);
297 if (s == INVALID_SOCKET)
298 {
300 return INVALID_SOCKET;
301 }
302
303 if (ioctlsocket(s, FIONBIO, &on))
304 {
306 closesocket(s);
307 return INVALID_SOCKET;
308 }
309 errno = 0;
310
311 return s;
312}
313
314int
315pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
316{
317 int res;
318
319 res = bind(s, addr, addrlen);
320 if (res < 0)
322 return res;
323}
324
325int
326pgwin32_listen(SOCKET s, int backlog)
327{
328 int res;
329
330 res = listen(s, backlog);
331 if (res < 0)
333 return res;
334}
335
336SOCKET
337pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
338{
339 SOCKET rs;
340
341 /*
342 * Poll for signals, but don't return with EINTR, since we don't handle
343 * that in pqcomm.c
344 */
346
347 rs = WSAAccept(s, addr, addrlen, NULL, 0);
348 if (rs == INVALID_SOCKET)
349 {
351 return INVALID_SOCKET;
352 }
353 return rs;
354}
355
356
357/* No signal delivery during connect. */
358int
359pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
360{
361 int r;
362
363 r = WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL);
364 if (r == 0)
365 return 0;
366
367 if (WSAGetLastError() != WSAEWOULDBLOCK)
368 {
370 return -1;
371 }
372
373 while (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) == 0)
374 {
375 /* Loop endlessly as long as we are just delivering signals */
376 }
377
378 return 0;
379}
380
381int
382pgwin32_recv(SOCKET s, char *buf, int len, int f)
383{
384 WSABUF wbuf;
385 int r;
386 DWORD b;
387 DWORD flags = f;
388 int n;
389
391 return -1;
392
393 wbuf.len = len;
394 wbuf.buf = buf;
395
396 r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
397 if (r != SOCKET_ERROR)
398 return b; /* success */
399
400 if (WSAGetLastError() != WSAEWOULDBLOCK)
401 {
403 return -1;
404 }
405
406 if (pgwin32_noblock)
407 {
408 /*
409 * No data received, and we are in "emulated non-blocking mode", so
410 * return indicating that we'd block if we were to continue.
411 */
412 errno = EWOULDBLOCK;
413 return -1;
414 }
415
416 /* We're in blocking mode, so wait for data */
417
418 for (n = 0; n < 5; n++)
419 {
420 if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT,
421 INFINITE) == 0)
422 return -1; /* errno already set */
423
424 r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
425 if (r != SOCKET_ERROR)
426 return b; /* success */
427 if (WSAGetLastError() != WSAEWOULDBLOCK)
428 {
430 return -1;
431 }
432
433 /*
434 * There seem to be cases on win2k (at least) where WSARecv can return
435 * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the
436 * socket is readable. In this case, just sleep for a moment and try
437 * again. We try up to 5 times - if it fails more than that it's not
438 * likely to ever come back.
439 */
440 pg_usleep(10000);
441 }
443 (errmsg_internal("could not read from ready socket (after retries)")));
444 errno = EWOULDBLOCK;
445 return -1;
446}
447
448/*
449 * The second argument to send() is defined by SUS to be a "const void *"
450 * and so we use the same signature here to keep compilers happy when
451 * handling callers.
452 *
453 * But the buf member of a WSABUF struct is defined as "char *", so we cast
454 * the second argument to that here when assigning it, also to keep compilers
455 * happy.
456 */
457
458int
459pgwin32_send(SOCKET s, const void *buf, int len, int flags)
460{
461 WSABUF wbuf;
462 int r;
463 DWORD b;
464
466 return -1;
467
468 wbuf.len = len;
469 wbuf.buf = (char *) buf;
470
471 /*
472 * Readiness of socket to send data to UDP socket may be not true: socket
473 * can become busy again! So loop until send or error occurs.
474 */
475 for (;;)
476 {
477 r = WSASend(s, &wbuf, 1, &b, flags, NULL, NULL);
478 if (r != SOCKET_ERROR && b > 0)
479 /* Write succeeded right away */
480 return b;
481
482 if (r == SOCKET_ERROR &&
483 WSAGetLastError() != WSAEWOULDBLOCK)
484 {
486 return -1;
487 }
488
489 if (pgwin32_noblock)
490 {
491 /*
492 * No data sent, and we are in "emulated non-blocking mode", so
493 * return indicating that we'd block if we were to continue.
494 */
495 errno = EWOULDBLOCK;
496 return -1;
497 }
498
499 /* No error, zero bytes */
500
501 if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
502 return -1;
503 }
504
505 return -1;
506}
507
508
509/*
510 * Wait for activity on one or more sockets.
511 * While waiting, allow signals to run
512 *
513 * NOTE! Currently does not implement exceptfds check,
514 * since it is not used in postgresql!
515 */
516int
517pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
518{
519 WSAEVENT events[FD_SETSIZE * 2]; /* worst case is readfds totally
520 * different from writefds, so
521 * 2*FD_SETSIZE sockets */
522 SOCKET sockets[FD_SETSIZE * 2];
523 int numevents = 0;
524 int i;
525 int r;
526 DWORD timeoutval = WSA_INFINITE;
527 FD_SET outreadfds;
528 FD_SET outwritefds;
529 int nummatches = 0;
530
531 Assert(exceptfds == NULL);
532
534 return -1;
535
536 FD_ZERO(&outreadfds);
537 FD_ZERO(&outwritefds);
538
539 /*
540 * Windows does not guarantee to log an FD_WRITE network event indicating
541 * that more data can be sent unless the previous send() failed with
542 * WSAEWOULDBLOCK. While our caller might well have made such a call, we
543 * cannot assume that here. Therefore, if waiting for write-ready, force
544 * the issue by doing a dummy send(). If the dummy send() succeeds,
545 * assume that the socket is in fact write-ready, and return immediately.
546 * Also, if it fails with something other than WSAEWOULDBLOCK, return a
547 * write-ready indication to let our caller deal with the error condition.
548 */
549 if (writefds != NULL)
550 {
551 for (i = 0; i < writefds->fd_count; i++)
552 {
553 char c;
554 WSABUF buf;
555 DWORD sent;
556
557 buf.buf = &c;
558 buf.len = 0;
559
560 r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL);
561 if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
562 FD_SET(writefds->fd_array[i], &outwritefds);
563 }
564
565 /* If we found any write-ready sockets, just return them immediately */
566 if (outwritefds.fd_count > 0)
567 {
568 memcpy(writefds, &outwritefds, sizeof(fd_set));
569 if (readfds)
570 FD_ZERO(readfds);
571 return outwritefds.fd_count;
572 }
573 }
574
575
576 /* Now set up for an actual select */
577
578 if (timeout != NULL)
579 {
580 /* timeoutval is in milliseconds */
581 timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
582 }
583
584 if (readfds != NULL)
585 {
586 for (i = 0; i < readfds->fd_count; i++)
587 {
588 events[numevents] = WSACreateEvent();
589 sockets[numevents] = readfds->fd_array[i];
590 numevents++;
591 }
592 }
593 if (writefds != NULL)
594 {
595 for (i = 0; i < writefds->fd_count; i++)
596 {
597 if (!readfds ||
598 !FD_ISSET(writefds->fd_array[i], readfds))
599 {
600 /* If the socket is not in the read list */
601 events[numevents] = WSACreateEvent();
602 sockets[numevents] = writefds->fd_array[i];
603 numevents++;
604 }
605 }
606 }
607
608 for (i = 0; i < numevents; i++)
609 {
610 int flags = 0;
611
612 if (readfds && FD_ISSET(sockets[i], readfds))
613 flags |= FD_READ | FD_ACCEPT | FD_CLOSE;
614
615 if (writefds && FD_ISSET(sockets[i], writefds))
616 flags |= FD_WRITE | FD_CLOSE;
617
618 if (WSAEventSelect(sockets[i], events[i], flags) != 0)
619 {
621 /* release already-assigned event objects */
622 while (--i >= 0)
623 WSAEventSelect(sockets[i], NULL, 0);
624 for (i = 0; i < numevents; i++)
625 WSACloseEvent(events[i]);
626 return -1;
627 }
628 }
629
630 events[numevents] = pgwin32_signal_event;
631 r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE);
632 if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents))
633 {
634 /*
635 * We scan all events, even those not signaled, in case more than one
636 * event has been tagged but Wait.. can only return one.
637 */
638 WSANETWORKEVENTS resEvents;
639
640 for (i = 0; i < numevents; i++)
641 {
642 ZeroMemory(&resEvents, sizeof(resEvents));
643 if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0)
644 elog(ERROR, "failed to enumerate network events: error code %d",
645 WSAGetLastError());
646 /* Read activity? */
647 if (readfds && FD_ISSET(sockets[i], readfds))
648 {
649 if ((resEvents.lNetworkEvents & FD_READ) ||
650 (resEvents.lNetworkEvents & FD_ACCEPT) ||
651 (resEvents.lNetworkEvents & FD_CLOSE))
652 {
653 FD_SET(sockets[i], &outreadfds);
654
655 nummatches++;
656 }
657 }
658 /* Write activity? */
659 if (writefds && FD_ISSET(sockets[i], writefds))
660 {
661 if ((resEvents.lNetworkEvents & FD_WRITE) ||
662 (resEvents.lNetworkEvents & FD_CLOSE))
663 {
664 FD_SET(sockets[i], &outwritefds);
665
666 nummatches++;
667 }
668 }
669 }
670 }
671
672 /* Clean up all the event objects */
673 for (i = 0; i < numevents; i++)
674 {
675 WSAEventSelect(sockets[i], NULL, 0);
676 WSACloseEvent(events[i]);
677 }
678
679 if (r == WSA_WAIT_TIMEOUT)
680 {
681 if (readfds)
682 FD_ZERO(readfds);
683 if (writefds)
684 FD_ZERO(writefds);
685 return 0;
686 }
687
688 /* Signal-like events. */
689 if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION)
690 {
692 errno = EINTR;
693 if (readfds)
694 FD_ZERO(readfds);
695 if (writefds)
696 FD_ZERO(writefds);
697 return -1;
698 }
699
700 /* Overwrite socket sets with our resulting values */
701 if (readfds)
702 memcpy(readfds, &outreadfds, sizeof(fd_set));
703 if (writefds)
704 memcpy(writefds, &outwritefds, sizeof(fd_set));
705 return nummatches;
706}
#define Assert(condition)
Definition: c.h:815
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define NOTICE
Definition: elog.h:35
#define ereport(elevel,...)
Definition: elog.h:149
int b
Definition: isn.c:69
int i
Definition: isn.c:72
const void size_t len
static char * buf
Definition: pg_test_fsync.c:72
#define closesocket
Definition: port.h:376
char * c
void pg_usleep(long microsec)
Definition: signal.c:53
void pgwin32_dispatch_queued_signals(void)
Definition: signal.c:120
HANDLE pgwin32_signal_event
Definition: signal.c:27
int pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
Definition: socket.c:517
int pgwin32_recv(SOCKET s, char *buf, int len, int f)
Definition: socket.c:382
int pgwin32_send(SOCKET s, const void *buf, int len, int flags)
Definition: socket.c:459
int pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
Definition: socket.c:359
int pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
Definition: socket.c:181
static int pgwin32_poll_signals(void)
Definition: socket.c:157
SOCKET pgwin32_socket(int af, int type, int protocol)
Definition: socket.c:291
static int isDataGram(SOCKET s)
Definition: socket.c:169
static void TranslateSocketError(void)
Definition: socket.c:56
SOCKET pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
Definition: socket.c:337
int pgwin32_noblock
Definition: socket.c:28
int pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
Definition: socket.c:315
int pgwin32_listen(SOCKET s, int backlog)
Definition: socket.c:326
const char * type
#define UNBLOCKED_SIGNAL_QUEUE()
Definition: win32_port.h:484
#define EISCONN
Definition: win32_port.h:378
#define bind(s, addr, addrlen)
Definition: win32_port.h:499
#define ENETUNREACH
Definition: win32_port.h:402
#define ECONNABORTED
Definition: win32_port.h:372
#define EINTR
Definition: win32_port.h:364
#define EWOULDBLOCK
Definition: win32_port.h:370
#define EOPNOTSUPP
Definition: win32_port.h:388
#define EAFNOSUPPORT
Definition: win32_port.h:368
#define EHOSTUNREACH
Definition: win32_port.h:396
#define EADDRNOTAVAIL
Definition: win32_port.h:392
#define ETIMEDOUT
Definition: win32_port.h:406
#define EADDRINUSE
Definition: win32_port.h:390
#define EINPROGRESS
Definition: win32_port.h:376
#define ENETRESET
Definition: win32_port.h:400
#define ENOBUFS
Definition: win32_port.h:380
#define EHOSTDOWN
Definition: win32_port.h:394
#define ENETDOWN
Definition: win32_port.h:398
#define ECONNREFUSED
Definition: win32_port.h:384
#define EPROTONOSUPPORT
Definition: win32_port.h:382
#define ECONNRESET
Definition: win32_port.h:374
#define ENOTSOCK
Definition: win32_port.h:386
#define EMSGSIZE
Definition: win32_port.h:366
#define listen(s, backlog)
Definition: win32_port.h:500
#define ENOTCONN
Definition: win32_port.h:404