PostgreSQL Source Code  git master
parallel.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * parallel.c
4  *
5  * Parallel support for pg_dump and pg_restore
6  *
7  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/bin/pg_dump/parallel.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 /*
17  * Parallel operation works like this:
18  *
19  * The original, master process calls ParallelBackupStart(), which forks off
20  * the desired number of worker processes, which each enter WaitForCommands().
21  *
22  * The master process dispatches an individual work item to one of the worker
23  * processes in DispatchJobForTocEntry(). We send a command string such as
24  * "DUMP 1234" or "RESTORE 1234", where 1234 is the TocEntry ID.
25  * The worker process receives and decodes the command and passes it to the
26  * routine pointed to by AH->WorkerJobDumpPtr or AH->WorkerJobRestorePtr,
27  * which are routines of the current archive format. That routine performs
28  * the required action (dump or restore) and returns an integer status code.
29  * This is passed back to the master where we pass it to the
30  * ParallelCompletionPtr callback function that was passed to
31  * DispatchJobForTocEntry(). The callback function does state updating
32  * for the master control logic in pg_backup_archiver.c.
33  *
34  * In principle additional archive-format-specific information might be needed
35  * in commands or worker status responses, but so far that hasn't proved
36  * necessary, since workers have full copies of the ArchiveHandle/TocEntry
37  * data structures. Remember that we have forked off the workers only after
38  * we have read in the catalog. That's why our worker processes can also
39  * access the catalog information. (In the Windows case, the workers are
40  * threads in the same process. To avoid problems, they work with cloned
41  * copies of the Archive data structure; see RunWorker().)
42  *
43  * In the master process, the workerStatus field for each worker has one of
44  * the following values:
45  * WRKR_IDLE: it's waiting for a command
46  * WRKR_WORKING: it's working on a command
47  * WRKR_TERMINATED: process ended
48  * The pstate->te[] entry for each worker is valid when it's in WRKR_WORKING
49  * state, and must be NULL in other states.
50  */
51 
52 #include "postgres_fe.h"
53 
54 #ifndef WIN32
55 #include <sys/wait.h>
56 #include <signal.h>
57 #include <unistd.h>
58 #include <fcntl.h>
59 #endif
60 #ifdef HAVE_SYS_SELECT_H
61 #include <sys/select.h>
62 #endif
63 
64 #include "fe_utils/string_utils.h"
65 #include "parallel.h"
66 #include "pg_backup_utils.h"
67 #include "port/pg_bswap.h"
68 
69 /* Mnemonic macros for indexing the fd array returned by pipe(2) */
70 #define PIPE_READ 0
71 #define PIPE_WRITE 1
72 
73 #define NO_SLOT (-1) /* Failure result for GetIdleWorker() */
74 
75 /* Worker process statuses */
76 typedef enum
77 {
82 
83 /*
84  * Private per-parallel-worker state (typedef for this is in parallel.h).
85  *
86  * Much of this is valid only in the master process (or, on Windows, should
87  * be touched only by the master thread). But the AH field should be touched
88  * only by workers. The pipe descriptors are valid everywhere.
89  */
91 {
92  T_WorkerStatus workerStatus; /* see enum above */
93 
94  /* These fields are valid if workerStatus == WRKR_WORKING: */
95  ParallelCompletionPtr callback; /* function to call on completion */
96  void *callback_data; /* passthrough data for it */
97 
98  ArchiveHandle *AH; /* Archive data worker is using */
99 
100  int pipeRead; /* master's end of the pipes */
102  int pipeRevRead; /* child's end of the pipes */
104 
105  /* Child process/thread identity info: */
106 #ifdef WIN32
107  uintptr_t hThread;
108  unsigned int threadId;
109 #else
110  pid_t pid;
111 #endif
112 };
113 
114 #ifdef WIN32
115 
116 /*
117  * Structure to hold info passed by _beginthreadex() to the function it calls
118  * via its single allowed argument.
119  */
120 typedef struct
121 {
122  ArchiveHandle *AH; /* master database connection */
123  ParallelSlot *slot; /* this worker's parallel slot */
124 } WorkerInfo;
125 
126 /* Windows implementation of pipe access */
127 static int pgpipe(int handles[2]);
128 static int piperead(int s, char *buf, int len);
129 #define pipewrite(a,b,c) send(a,b,c,0)
130 
131 #else /* !WIN32 */
132 
133 /* Non-Windows implementation of pipe access */
134 #define pgpipe(a) pipe(a)
135 #define piperead(a,b,c) read(a,b,c)
136 #define pipewrite(a,b,c) write(a,b,c)
137 
138 #endif /* WIN32 */
139 
140 /*
141  * State info for archive_close_connection() shutdown callback.
142  */
143 typedef struct ShutdownInformation
144 {
148 
150 
151 /*
152  * State info for signal handling.
153  * We assume signal_info initializes to zeroes.
154  *
155  * On Unix, myAH is the master DB connection in the master process, and the
156  * worker's own connection in worker processes. On Windows, we have only one
157  * instance of signal_info, so myAH is the master connection and the worker
158  * connections must be dug out of pstate->parallelSlot[].
159  */
160 typedef struct DumpSignalInformation
161 {
162  ArchiveHandle *myAH; /* database connection to issue cancel for */
163  ParallelState *pstate; /* parallel state, if any */
164  bool handler_set; /* signal handler set up in this process? */
165 #ifndef WIN32
166  bool am_worker; /* am I a worker process? */
167 #endif
169 
171 
172 #ifdef WIN32
173 static CRITICAL_SECTION signal_info_lock;
174 #endif
175 
/*
 * Write a simple string to stderr --- must be safe in a signal handler.
 *
 * The argument is evaluated exactly once (it is copied into a local first).
 * We ignore the write() result since there's not much we could do about it;
 * it is stored in a variable and then cast to void because certain compilers
 * make ignoring it harder than it ought to be.
 */
#define write_stderr(str) \
	do { \
		const char *str_ = (str); \
		int			rc_; \
		rc_ = write(fileno(stderr), str_, strlen(str_)); \
		(void) rc_; \
	} while (0)
188 
189 
190 #ifdef WIN32
191 /* file-scope variables */
192 static DWORD tls_index;
193 
194 /* globally visible variables (needed by exit_nicely) */
195 bool parallel_init_done = false;
196 DWORD mainThreadId;
197 #endif /* WIN32 */
198 
199 /* Local function prototypes */
200 static ParallelSlot *GetMyPSlot(ParallelState *pstate);
201 static void archive_close_connection(int code, void *arg);
202 static void ShutdownWorkersHard(ParallelState *pstate);
203 static void WaitForTerminatingWorkers(ParallelState *pstate);
204 static void setup_cancel_handler(void);
205 static void set_cancel_pstate(ParallelState *pstate);
207 static void RunWorker(ArchiveHandle *AH, ParallelSlot *slot);
208 static int GetIdleWorker(ParallelState *pstate);
209 static bool HasEveryWorkerTerminated(ParallelState *pstate);
210 static void lockTableForWorker(ArchiveHandle *AH, TocEntry *te);
211 static void WaitForCommands(ArchiveHandle *AH, int pipefd[2]);
212 static bool ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate,
213  bool do_wait);
214 static char *getMessageFromMaster(int pipefd[2]);
215 static void sendMessageToMaster(int pipefd[2], const char *str);
216 static int select_loop(int maxFd, fd_set *workerset);
217 static char *getMessageFromWorker(ParallelState *pstate,
218  bool do_wait, int *worker);
219 static void sendMessageToWorker(ParallelState *pstate,
220  int worker, const char *str);
221 static char *readMessageFromPipe(int fd);
222 
/*
 * True if the string msg begins with the string prefix.
 *
 * Note that prefix is evaluated twice, so it must be free of side effects.
 */
#define messageStartsWith(msg, prefix) \
	(strncmp((msg), (prefix), strlen(prefix)) == 0)
225 
226 
/*
 * Shutdown callback to clean up socket access (Windows only).
 *
 * WSACleanup() is called only when this runs on the thread whose id is
 * stored in mainThreadId; on any other thread the function does nothing.
 * Neither argument is used.
 */
#ifdef WIN32
static void
shutdown_parallel_dump_utils(int code, void *unused)
{
	/* Call the cleanup function only from the main thread */
	if (mainThreadId == GetCurrentThreadId())
		WSACleanup();
}
#endif
239 
240 /*
241  * Initialize parallel dump support --- should be called early in process
242  * startup. (Currently, this is called whether or not we intend parallel
243  * activity.)
244  */
245 void
247 {
248 #ifdef WIN32
249  if (!parallel_init_done)
250  {
251  WSADATA wsaData;
252  int err;
253 
254  /* Prepare for threaded operation */
255  tls_index = TlsAlloc();
256  mainThreadId = GetCurrentThreadId();
257 
258  /* Initialize socket access */
259  err = WSAStartup(MAKEWORD(2, 2), &wsaData);
260  if (err != 0)
261  {
262  pg_log_error("WSAStartup failed: %d", err);
263  exit_nicely(1);
264  }
265  /* ... and arrange to shut it down at exit */
266  on_exit_nicely(shutdown_parallel_dump_utils, NULL);
267  parallel_init_done = true;
268  }
269 #endif
270 }
271 
272 /*
273  * Find the ParallelSlot for the current worker process or thread.
274  *
275  * Returns NULL if no matching slot is found (this implies we're the master).
276  */
277 static ParallelSlot *
279 {
280  int i;
281 
282  for (i = 0; i < pstate->numWorkers; i++)
283  {
284 #ifdef WIN32
285  if (pstate->parallelSlot[i].threadId == GetCurrentThreadId())
286 #else
287  if (pstate->parallelSlot[i].pid == getpid())
288 #endif
289  return &(pstate->parallelSlot[i]);
290  }
291 
292  return NULL;
293 }
294 
/*
 * A thread-local version of getLocalPQExpBuffer().
 *
 * Non-reentrant but reduces memory leakage: we'll consume one buffer per
 * thread, which is much better than one per fmtId/fmtQualifiedId call.
 *
 * Returns an empty PQExpBuffer: either a newly created one, or the buffer
 * remembered from an earlier call after its contents have been reset.
 */
#ifdef WIN32
static PQExpBuffer
getThreadLocalPQExpBuffer(void)
{
	/*
	 * The Tls code goes awry if we use a static var, so we provide for both
	 * static and auto, and omit any use of the static var when using Tls. We
	 * rely on TlsGetValue() to return 0 if the value is not yet set.
	 */
	static PQExpBuffer s_id_return = NULL;
	PQExpBuffer id_return;

	/* Look up the remembered buffer: TLS once initialized, else the static */
	if (parallel_init_done)
		id_return = (PQExpBuffer) TlsGetValue(tls_index);
	else
		id_return = s_id_return;

	if (id_return)				/* already have a buffer? */
	{
		/* same buffer, just wipe contents */
		resetPQExpBuffer(id_return);
	}
	else
	{
		/* first time through: make a new buffer and remember it */
		id_return = createPQExpBuffer();
		if (parallel_init_done)
			TlsSetValue(tls_index, id_return);
		else
			s_id_return = id_return;
	}

	return id_return;
}
#endif							/* WIN32 */
336 
337 /*
338  * pg_dump and pg_restore call this to register the cleanup handler
339  * as soon as they've created the ArchiveHandle.
340  */
341 void
343 {
344  shutdown_info.AHX = AHX;
345  on_exit_nicely(archive_close_connection, &shutdown_info);
346 }
347 
348 /*
349  * on_exit_nicely handler for shutting down database connections and
350  * worker processes cleanly.
351  */
352 static void
354 {
356 
357  if (si->pstate)
358  {
359  /* In parallel mode, must figure out who we are */
360  ParallelSlot *slot = GetMyPSlot(si->pstate);
361 
362  if (!slot)
363  {
364  /*
365  * We're the master. Forcibly shut down workers, then close our
366  * own database connection, if any.
367  */
369 
370  if (si->AHX)
371  DisconnectDatabase(si->AHX);
372  }
373  else
374  {
375  /*
376  * We're a worker. Shut down our own DB connection if any. On
377  * Windows, we also have to close our communication sockets, to
378  * emulate what will happen on Unix when the worker process exits.
379  * (Without this, if this is a premature exit, the master would
380  * fail to detect it because there would be no EOF condition on
381  * the other end of the pipe.)
382  */
383  if (slot->AH)
384  DisconnectDatabase(&(slot->AH->public));
385 
386 #ifdef WIN32
387  closesocket(slot->pipeRevRead);
388  closesocket(slot->pipeRevWrite);
389 #endif
390  }
391  }
392  else
393  {
394  /* Non-parallel operation: just kill the master DB connection */
395  if (si->AHX)
396  DisconnectDatabase(si->AHX);
397  }
398 }
399 
400 /*
401  * Forcibly shut down any remaining workers, waiting for them to finish.
402  *
403  * Note that we don't expect to come here during normal exit (the workers
404  * should be long gone, and the ParallelState too). We're only here in a
405  * fatal() situation, so intervening to cancel active commands is
406  * appropriate.
407  */
408 static void
410 {
411  int i;
412 
413  /*
414  * Close our write end of the sockets so that any workers waiting for
415  * commands know they can exit.
416  */
417  for (i = 0; i < pstate->numWorkers; i++)
418  closesocket(pstate->parallelSlot[i].pipeWrite);
419 
420  /*
421  * Force early termination of any commands currently in progress.
422  */
423 #ifndef WIN32
424  /* On non-Windows, send SIGTERM to each worker process. */
425  for (i = 0; i < pstate->numWorkers; i++)
426  {
427  pid_t pid = pstate->parallelSlot[i].pid;
428 
429  if (pid != 0)
430  kill(pid, SIGTERM);
431  }
432 #else
433 
434  /*
435  * On Windows, send query cancels directly to the workers' backends. Use
436  * a critical section to ensure worker threads don't change state.
437  */
438  EnterCriticalSection(&signal_info_lock);
439  for (i = 0; i < pstate->numWorkers; i++)
440  {
441  ArchiveHandle *AH = pstate->parallelSlot[i].AH;
442  char errbuf[1];
443 
444  if (AH != NULL && AH->connCancel != NULL)
445  (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
446  }
447  LeaveCriticalSection(&signal_info_lock);
448 #endif
449 
450  /* Now wait for them to terminate. */
452 }
453 
454 /*
455  * Wait for all workers to terminate.
456  */
457 static void
459 {
460  while (!HasEveryWorkerTerminated(pstate))
461  {
462  ParallelSlot *slot = NULL;
463  int j;
464 
465 #ifndef WIN32
466  /* On non-Windows, use wait() to wait for next worker to end */
467  int status;
468  pid_t pid = wait(&status);
469 
470  /* Find dead worker's slot, and clear the PID field */
471  for (j = 0; j < pstate->numWorkers; j++)
472  {
473  slot = &(pstate->parallelSlot[j]);
474  if (slot->pid == pid)
475  {
476  slot->pid = 0;
477  break;
478  }
479  }
480 #else /* WIN32 */
481  /* On Windows, we must use WaitForMultipleObjects() */
482  HANDLE *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers);
483  int nrun = 0;
484  DWORD ret;
485  uintptr_t hThread;
486 
487  for (j = 0; j < pstate->numWorkers; j++)
488  {
489  if (pstate->parallelSlot[j].workerStatus != WRKR_TERMINATED)
490  {
491  lpHandles[nrun] = (HANDLE) pstate->parallelSlot[j].hThread;
492  nrun++;
493  }
494  }
495  ret = WaitForMultipleObjects(nrun, lpHandles, false, INFINITE);
496  Assert(ret != WAIT_FAILED);
497  hThread = (uintptr_t) lpHandles[ret - WAIT_OBJECT_0];
498  free(lpHandles);
499 
500  /* Find dead worker's slot, and clear the hThread field */
501  for (j = 0; j < pstate->numWorkers; j++)
502  {
503  slot = &(pstate->parallelSlot[j]);
504  if (slot->hThread == hThread)
505  {
506  /* For cleanliness, close handles for dead threads */
507  CloseHandle((HANDLE) slot->hThread);
508  slot->hThread = (uintptr_t) INVALID_HANDLE_VALUE;
509  break;
510  }
511  }
512 #endif /* WIN32 */
513 
514  /* On all platforms, update workerStatus and te[] as well */
515  Assert(j < pstate->numWorkers);
517  pstate->te[j] = NULL;
518  }
519 }
520 
521 
522 /*
523  * Code for responding to cancel interrupts (SIGINT, control-C, etc)
524  *
525  * This doesn't quite belong in this module, but it needs access to the
526  * ParallelState data, so there's not really a better place either.
527  *
528  * When we get a cancel interrupt, we could just die, but in pg_restore that
529  * could leave a SQL command (e.g., CREATE INDEX on a large table) running
530  * for a long time. Instead, we try to send a cancel request and then die.
531  * pg_dump probably doesn't really need this, but we might as well use it
532  * there too. Note that sending the cancel directly from the signal handler
533  * is safe because PQcancel() is written to make it so.
534  *
535  * In parallel operation on Unix, each process is responsible for canceling
536  * its own connection (this must be so because nobody else has access to it).
537  * Furthermore, the master process should attempt to forward its signal to
538  * each child. In simple manual use of pg_dump/pg_restore, forwarding isn't
539  * needed because typing control-C at the console would deliver SIGINT to
540  * every member of the terminal process group --- but in other scenarios it
541  * might be that only the master gets signaled.
542  *
543  * On Windows, the cancel handler runs in a separate thread, because that's
544  * how SetConsoleCtrlHandler works. We make it stop worker threads, send
545  * cancels on all active connections, and then return FALSE, which will allow
546  * the process to die. For safety's sake, we use a critical section to
547  * protect the PGcancel structures against being changed while the signal
548  * thread runs.
549  */
550 
551 #ifndef WIN32
552 
553 /*
554  * Signal handler (Unix only)
555  */
556 static void
558 {
559  int i;
560  char errbuf[1];
561 
562  /*
563  * Some platforms allow delivery of new signals to interrupt an active
564  * signal handler. That could muck up our attempt to send PQcancel, so
565  * disable the signals that setup_cancel_handler enabled.
566  */
567  pqsignal(SIGINT, SIG_IGN);
568  pqsignal(SIGTERM, SIG_IGN);
570 
571  /*
572  * If we're in the master, forward signal to all workers. (It seems best
573  * to do this before PQcancel; killing the master transaction will result
574  * in invalid-snapshot errors from active workers, which maybe we can
575  * quiet by killing workers first.) Ignore any errors.
576  */
577  if (signal_info.pstate != NULL)
578  {
579  for (i = 0; i < signal_info.pstate->numWorkers; i++)
580  {
581  pid_t pid = signal_info.pstate->parallelSlot[i].pid;
582 
583  if (pid != 0)
584  kill(pid, SIGTERM);
585  }
586  }
587 
588  /*
589  * Send QueryCancel if we have a connection to send to. Ignore errors,
590  * there's not much we can do about them anyway.
591  */
592  if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
593  (void) PQcancel(signal_info.myAH->connCancel, errbuf, sizeof(errbuf));
594 
595  /*
596  * Report we're quitting, using nothing more complicated than write(2).
597  * When in parallel operation, only the master process should do this.
598  */
599  if (!signal_info.am_worker)
600  {
601  if (progname)
602  {
604  write_stderr(": ");
605  }
606  write_stderr("terminated by user\n");
607  }
608 
609  /* And die. */
610  exit(1);
611 }
612 
613 /*
614  * Enable cancel interrupt handler, if not already done.
615  */
616 static void
618 {
619  /*
620  * When forking, signal_info.handler_set will propagate into the new
621  * process, but that's fine because the signal handler state does too.
622  */
623  if (!signal_info.handler_set)
624  {
625  signal_info.handler_set = true;
626 
627  pqsignal(SIGINT, sigTermHandler);
628  pqsignal(SIGTERM, sigTermHandler);
630  }
631 }
632 
633 #else /* WIN32 */
634 
635 /*
636  * Console interrupt handler --- runs in a newly-started thread.
637  *
638  * After stopping other threads and sending cancel requests on all open
639  * connections, we return FALSE which will allow the default ExitProcess()
640  * action to be taken.
641  */
642 static BOOL WINAPI
643 consoleHandler(DWORD dwCtrlType)
644 {
645  int i;
646  char errbuf[1];
647 
648  if (dwCtrlType == CTRL_C_EVENT ||
649  dwCtrlType == CTRL_BREAK_EVENT)
650  {
651  /* Critical section prevents changing data we look at here */
652  EnterCriticalSection(&signal_info_lock);
653 
654  /*
655  * If in parallel mode, stop worker threads and send QueryCancel to
656  * their connected backends. The main point of stopping the worker
657  * threads is to keep them from reporting the query cancels as errors,
658  * which would clutter the user's screen. We needn't stop the master
659  * thread since it won't be doing much anyway. Do this before
660  * canceling the main transaction, else we might get invalid-snapshot
661  * errors reported before we can stop the workers. Ignore errors,
662  * there's not much we can do about them anyway.
663  */
664  if (signal_info.pstate != NULL)
665  {
666  for (i = 0; i < signal_info.pstate->numWorkers; i++)
667  {
668  ParallelSlot *slot = &(signal_info.pstate->parallelSlot[i]);
669  ArchiveHandle *AH = slot->AH;
670  HANDLE hThread = (HANDLE) slot->hThread;
671 
672  /*
673  * Using TerminateThread here may leave some resources leaked,
674  * but it doesn't matter since we're about to end the whole
675  * process.
676  */
677  if (hThread != INVALID_HANDLE_VALUE)
678  TerminateThread(hThread, 0);
679 
680  if (AH != NULL && AH->connCancel != NULL)
681  (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
682  }
683  }
684 
685  /*
686  * Send QueryCancel to master connection, if enabled. Ignore errors,
687  * there's not much we can do about them anyway.
688  */
689  if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
690  (void) PQcancel(signal_info.myAH->connCancel,
691  errbuf, sizeof(errbuf));
692 
693  LeaveCriticalSection(&signal_info_lock);
694 
695  /*
696  * Report we're quitting, using nothing more complicated than
697  * write(2). (We might be able to get away with using pg_log_*()
698  * here, but since we terminated other threads uncleanly above, it
699  * seems better to assume as little as possible.)
700  */
701  if (progname)
702  {
704  write_stderr(": ");
705  }
706  write_stderr("terminated by user\n");
707  }
708 
709  /* Always return FALSE to allow signal handling to continue */
710  return FALSE;
711 }
712 
713 /*
714  * Enable cancel interrupt handler, if not already done.
715  */
716 static void
718 {
719  if (!signal_info.handler_set)
720  {
721  signal_info.handler_set = true;
722 
723  InitializeCriticalSection(&signal_info_lock);
724 
725  SetConsoleCtrlHandler(consoleHandler, TRUE);
726  }
727 }
728 
729 #endif /* WIN32 */
730 
731 
732 /*
733  * set_archive_cancel_info
734  *
735  * Fill AH->connCancel with cancellation info for the specified database
736  * connection; or clear it if conn is NULL.
737  */
738 void
740 {
741  PGcancel *oldConnCancel;
742 
743  /*
744  * Activate the interrupt handler if we didn't yet in this process. On
745  * Windows, this also initializes signal_info_lock; therefore it's
746  * important that this happen at least once before we fork off any
747  * threads.
748  */
750 
751  /*
752  * On Unix, we assume that storing a pointer value is atomic with respect
753  * to any possible signal interrupt. On Windows, use a critical section.
754  */
755 
756 #ifdef WIN32
757  EnterCriticalSection(&signal_info_lock);
758 #endif
759 
760  /* Free the old one if we have one */
761  oldConnCancel = AH->connCancel;
762  /* be sure interrupt handler doesn't use pointer while freeing */
763  AH->connCancel = NULL;
764 
765  if (oldConnCancel != NULL)
766  PQfreeCancel(oldConnCancel);
767 
768  /* Set the new one if specified */
769  if (conn)
770  AH->connCancel = PQgetCancel(conn);
771 
772  /*
773  * On Unix, there's only ever one active ArchiveHandle per process, so we
774  * can just set signal_info.myAH unconditionally. On Windows, do that
775  * only in the main thread; worker threads have to make sure their
776  * ArchiveHandle appears in the pstate data, which is dealt with in
777  * RunWorker().
778  */
779 #ifndef WIN32
780  signal_info.myAH = AH;
781 #else
782  if (mainThreadId == GetCurrentThreadId())
783  signal_info.myAH = AH;
784 #endif
785 
786 #ifdef WIN32
787  LeaveCriticalSection(&signal_info_lock);
788 #endif
789 }
790 
791 /*
792  * set_cancel_pstate
793  *
794  * Set signal_info.pstate to point to the specified ParallelState, if any.
795  * We need this mainly to have an interlock against Windows signal thread.
796  */
797 static void
799 {
800 #ifdef WIN32
801  EnterCriticalSection(&signal_info_lock);
802 #endif
803 
804  signal_info.pstate = pstate;
805 
806 #ifdef WIN32
807  LeaveCriticalSection(&signal_info_lock);
808 #endif
809 }
810 
811 /*
812  * set_cancel_slot_archive
813  *
814  * Set ParallelSlot's AH field to point to the specified archive, if any.
815  * We need this mainly to have an interlock against Windows signal thread.
816  */
817 static void
819 {
820 #ifdef WIN32
821  EnterCriticalSection(&signal_info_lock);
822 #endif
823 
824  slot->AH = AH;
825 
826 #ifdef WIN32
827  LeaveCriticalSection(&signal_info_lock);
828 #endif
829 }
830 
831 
832 /*
833  * This function is called by both Unix and Windows variants to set up
834  * and run a worker process. Caller should exit the process (or thread)
835  * upon return.
836  */
837 static void
839 {
840  int pipefd[2];
841 
842  /* fetch child ends of pipes */
843  pipefd[PIPE_READ] = slot->pipeRevRead;
844  pipefd[PIPE_WRITE] = slot->pipeRevWrite;
845 
846  /*
847  * Clone the archive so that we have our own state to work with, and in
848  * particular our own database connection.
849  *
850  * We clone on Unix as well as Windows, even though technically we don't
851  * need to because fork() gives us a copy in our own address space
852  * already. But CloneArchive resets the state information and also clones
853  * the database connection which both seem kinda helpful.
854  */
855  AH = CloneArchive(AH);
856 
857  /* Remember cloned archive where signal handler can find it */
858  set_cancel_slot_archive(slot, AH);
859 
860  /*
861  * Call the setup worker function that's defined in the ArchiveHandle.
862  */
863  (AH->SetupWorkerPtr) ((Archive *) AH);
864 
865  /*
866  * Execute commands until done.
867  */
868  WaitForCommands(AH, pipefd);
869 
870  /*
871  * Disconnect from database and clean up.
872  */
873  set_cancel_slot_archive(slot, NULL);
874  DisconnectDatabase(&(AH->public));
875  DeCloneArchive(AH);
876 }
877 
/*
 * Thread base function for Windows
 *
 * wi carries the archive handle and slot for this worker.  Its fields are
 * copied out and the structure is freed here before the worker runs.
 */
#ifdef WIN32
static unsigned __stdcall
init_spawned_worker_win32(WorkerInfo *wi)
{
	ArchiveHandle *AH = wi->AH;
	ParallelSlot *slot = wi->slot;

	/* Don't need WorkerInfo anymore */
	free(wi);

	/* Run the worker ... */
	RunWorker(AH, slot);

	/* Exit the thread */
	_endthreadex(0);
	return 0;
}
#endif							/* WIN32 */
899 
900 /*
901  * This function starts a parallel dump or restore by spawning off the worker
902  * processes. For Windows, it creates a number of threads; on Unix the
903  * workers are created with fork().
904  */
907 {
908  ParallelState *pstate;
909  int i;
910 
911  Assert(AH->public.numWorkers > 0);
912 
913  pstate = (ParallelState *) pg_malloc(sizeof(ParallelState));
914 
915  pstate->numWorkers = AH->public.numWorkers;
916  pstate->te = NULL;
917  pstate->parallelSlot = NULL;
918 
919  if (AH->public.numWorkers == 1)
920  return pstate;
921 
922  pstate->te = (TocEntry **)
923  pg_malloc0(pstate->numWorkers * sizeof(TocEntry *));
924  pstate->parallelSlot = (ParallelSlot *)
925  pg_malloc0(pstate->numWorkers * sizeof(ParallelSlot));
926 
927 #ifdef WIN32
928  /* Make fmtId() and fmtQualifiedId() use thread-local storage */
929  getLocalPQExpBuffer = getThreadLocalPQExpBuffer;
930 #endif
931 
932  /*
933  * Set the pstate in shutdown_info, to tell the exit handler that it must
934  * clean up workers as well as the main database connection. But we don't
935  * set this in signal_info yet, because we don't want child processes to
936  * inherit non-NULL signal_info.pstate.
937  */
938  shutdown_info.pstate = pstate;
939 
940  /*
941  * Temporarily disable query cancellation on the master connection. This
942  * ensures that child processes won't inherit valid AH->connCancel
943  * settings and thus won't try to issue cancels against the master's
944  * connection. No harm is done if we fail while it's disabled, because
945  * the master connection is idle at this point anyway.
946  */
947  set_archive_cancel_info(AH, NULL);
948 
949  /* Ensure stdio state is quiesced before forking */
950  fflush(NULL);
951 
952  /* Create desired number of workers */
953  for (i = 0; i < pstate->numWorkers; i++)
954  {
955 #ifdef WIN32
956  WorkerInfo *wi;
957  uintptr_t handle;
958 #else
959  pid_t pid;
960 #endif
961  ParallelSlot *slot = &(pstate->parallelSlot[i]);
962  int pipeMW[2],
963  pipeWM[2];
964 
965  /* Create communication pipes for this worker */
966  if (pgpipe(pipeMW) < 0 || pgpipe(pipeWM) < 0)
967  fatal("could not create communication channels: %m");
968 
969  pstate->te[i] = NULL; /* just for safety */
970 
971  slot->workerStatus = WRKR_IDLE;
972  slot->AH = NULL;
973  slot->callback = NULL;
974  slot->callback_data = NULL;
975 
976  /* master's ends of the pipes */
977  slot->pipeRead = pipeWM[PIPE_READ];
978  slot->pipeWrite = pipeMW[PIPE_WRITE];
979  /* child's ends of the pipes */
980  slot->pipeRevRead = pipeMW[PIPE_READ];
981  slot->pipeRevWrite = pipeWM[PIPE_WRITE];
982 
983 #ifdef WIN32
984  /* Create transient structure to pass args to worker function */
985  wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo));
986 
987  wi->AH = AH;
988  wi->slot = slot;
989 
990  handle = _beginthreadex(NULL, 0, (void *) &init_spawned_worker_win32,
991  wi, 0, &(slot->threadId));
992  slot->hThread = handle;
993 #else /* !WIN32 */
994  pid = fork();
995  if (pid == 0)
996  {
997  /* we are the worker */
998  int j;
999 
1000  /* this is needed for GetMyPSlot() */
1001  slot->pid = getpid();
1002 
1003  /* instruct signal handler that we're in a worker now */
1004  signal_info.am_worker = true;
1005 
1006  /* close read end of Worker -> Master */
1007  closesocket(pipeWM[PIPE_READ]);
1008  /* close write end of Master -> Worker */
1009  closesocket(pipeMW[PIPE_WRITE]);
1010 
1011  /*
1012  * Close all inherited fds for communication of the master with
1013  * previously-forked workers.
1014  */
1015  for (j = 0; j < i; j++)
1016  {
1017  closesocket(pstate->parallelSlot[j].pipeRead);
1018  closesocket(pstate->parallelSlot[j].pipeWrite);
1019  }
1020 
1021  /* Run the worker ... */
1022  RunWorker(AH, slot);
1023 
1024  /* We can just exit(0) when done */
1025  exit(0);
1026  }
1027  else if (pid < 0)
1028  {
1029  /* fork failed */
1030  fatal("could not create worker process: %m");
1031  }
1032 
1033  /* In Master after successful fork */
1034  slot->pid = pid;
1035 
1036  /* close read end of Master -> Worker */
1037  closesocket(pipeMW[PIPE_READ]);
1038  /* close write end of Worker -> Master */
1039  closesocket(pipeWM[PIPE_WRITE]);
1040 #endif /* WIN32 */
1041  }
1042 
1043  /*
1044  * Having forked off the workers, disable SIGPIPE so that master isn't
1045  * killed if it tries to send a command to a dead worker. We don't want
1046  * the workers to inherit this setting, though.
1047  */
1048 #ifndef WIN32
1050 #endif
1051 
1052  /*
1053  * Re-establish query cancellation on the master connection.
1054  */
1056 
1057  /*
1058  * Tell the cancel signal handler to forward signals to worker processes,
1059  * too. (As with query cancel, we did not need this earlier because the
1060  * workers have not yet been given anything to do; if we die before this
1061  * point, any already-started workers will see EOF and quit promptly.)
1062  */
1063  set_cancel_pstate(pstate);
1064 
1065  return pstate;
1066 }
1067 
1068 /*
1069  * Close down a parallel dump or restore.
1070  */
1071 void
1073 {
1074  int i;
1075 
1076  /* No work if non-parallel */
1077  if (pstate->numWorkers == 1)
1078  return;
1079 
1080  /* There should not be any unfinished jobs */
1081  Assert(IsEveryWorkerIdle(pstate));
1082 
1083  /* Close the sockets so that the workers know they can exit */
1084  for (i = 0; i < pstate->numWorkers; i++)
1085  {
1086  closesocket(pstate->parallelSlot[i].pipeRead);
1087  closesocket(pstate->parallelSlot[i].pipeWrite);
1088  }
1089 
1090  /* Wait for them to exit */
1091  WaitForTerminatingWorkers(pstate);
1092 
1093  /*
1094  * Unlink pstate from shutdown_info, so the exit handler will not try to
1095  * use it; and likewise unlink from signal_info.
1096  */
1097  shutdown_info.pstate = NULL;
1098  set_cancel_pstate(NULL);
1099 
1100  /* Release state (mere neatnik-ism, since we're about to terminate) */
1101  free(pstate->te);
1102  free(pstate->parallelSlot);
1103  free(pstate);
1104 }
1105 
/*
 * These next four functions handle construction and parsing of the command
 * strings and response strings for parallel workers.
 *
 * Currently, these can be the same regardless of which archive format we are
 * processing.  In future, we might want to let format modules override these
 * functions to add format-specific data to a command or response.
 */
1114 
1115 /*
1116  * buildWorkerCommand: format a command string to send to a worker.
1117  *
1118  * The string is built in the caller-supplied buffer of size buflen.
1119  */
1120 static void
1122  char *buf, int buflen)
1123 {
1124  if (act == ACT_DUMP)
1125  snprintf(buf, buflen, "DUMP %d", te->dumpId);
1126  else if (act == ACT_RESTORE)
1127  snprintf(buf, buflen, "RESTORE %d", te->dumpId);
1128  else
1129  Assert(false);
1130 }
1131 
1132 /*
1133  * parseWorkerCommand: interpret a command string in a worker.
1134  */
1135 static void
1137  const char *msg)
1138 {
1139  DumpId dumpId;
1140  int nBytes;
1141 
1142  if (messageStartsWith(msg, "DUMP "))
1143  {
1144  *act = ACT_DUMP;
1145  sscanf(msg, "DUMP %d%n", &dumpId, &nBytes);
1146  Assert(nBytes == strlen(msg));
1147  *te = getTocEntryByDumpId(AH, dumpId);
1148  Assert(*te != NULL);
1149  }
1150  else if (messageStartsWith(msg, "RESTORE "))
1151  {
1152  *act = ACT_RESTORE;
1153  sscanf(msg, "RESTORE %d%n", &dumpId, &nBytes);
1154  Assert(nBytes == strlen(msg));
1155  *te = getTocEntryByDumpId(AH, dumpId);
1156  Assert(*te != NULL);
1157  }
1158  else
1159  fatal("unrecognized command received from master: \"%s\"",
1160  msg);
1161 }
1162 
1163 /*
1164  * buildWorkerResponse: format a response string to send to the master.
1165  *
1166  * The string is built in the caller-supplied buffer of size buflen.
1167  */
1168 static void
1170  char *buf, int buflen)
1171 {
1172  snprintf(buf, buflen, "OK %d %d %d",
1173  te->dumpId,
1174  status,
1175  status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0);
1176 }
1177 
1178 /*
1179  * parseWorkerResponse: parse the status message returned by a worker.
1180  *
1181  * Returns the integer status code, and may update fields of AH and/or te.
1182  */
1183 static int
1185  const char *msg)
1186 {
1187  DumpId dumpId;
1188  int nBytes,
1189  n_errors;
1190  int status = 0;
1191 
1192  if (messageStartsWith(msg, "OK "))
1193  {
1194  sscanf(msg, "OK %d %d %d%n", &dumpId, &status, &n_errors, &nBytes);
1195 
1196  Assert(dumpId == te->dumpId);
1197  Assert(nBytes == strlen(msg));
1198 
1199  AH->public.n_errors += n_errors;
1200  }
1201  else
1202  fatal("invalid message received from worker: \"%s\"",
1203  msg);
1204 
1205  return status;
1206 }
1207 
1208 /*
1209  * Dispatch a job to some free worker.
1210  *
1211  * te is the TocEntry to be processed, act is the action to be taken on it.
1212  * callback is the function to call on completion of the job.
1213  *
1214  * If no worker is currently available, this will block, and previously
1215  * registered callback functions may be called.
1216  */
1217 void
1219  ParallelState *pstate,
1220  TocEntry *te,
1221  T_Action act,
1223  void *callback_data)
1224 {
1225  int worker;
1226  char buf[256];
1227 
1228  /* Get a worker, waiting if none are idle */
1229  while ((worker = GetIdleWorker(pstate)) == NO_SLOT)
1230  WaitForWorkers(AH, pstate, WFW_ONE_IDLE);
1231 
1232  /* Construct and send command string */
1233  buildWorkerCommand(AH, te, act, buf, sizeof(buf));
1234 
1235  sendMessageToWorker(pstate, worker, buf);
1236 
1237  /* Remember worker is busy, and which TocEntry it's working on */
1238  pstate->parallelSlot[worker].workerStatus = WRKR_WORKING;
1239  pstate->parallelSlot[worker].callback = callback;
1240  pstate->parallelSlot[worker].callback_data = callback_data;
1241  pstate->te[worker] = te;
1242 }
1243 
1244 /*
1245  * Find an idle worker and return its slot number.
1246  * Return NO_SLOT if none are idle.
1247  */
1248 static int
1250 {
1251  int i;
1252 
1253  for (i = 0; i < pstate->numWorkers; i++)
1254  {
1255  if (pstate->parallelSlot[i].workerStatus == WRKR_IDLE)
1256  return i;
1257  }
1258  return NO_SLOT;
1259 }
1260 
1261 /*
1262  * Return true iff every worker is in the WRKR_TERMINATED state.
1263  */
1264 static bool
1266 {
1267  int i;
1268 
1269  for (i = 0; i < pstate->numWorkers; i++)
1270  {
1271  if (pstate->parallelSlot[i].workerStatus != WRKR_TERMINATED)
1272  return false;
1273  }
1274  return true;
1275 }
1276 
1277 /*
1278  * Return true iff every worker is in the WRKR_IDLE state.
1279  */
1280 bool
1282 {
1283  int i;
1284 
1285  for (i = 0; i < pstate->numWorkers; i++)
1286  {
1287  if (pstate->parallelSlot[i].workerStatus != WRKR_IDLE)
1288  return false;
1289  }
1290  return true;
1291 }
1292 
1293 /*
1294  * Acquire lock on a table to be dumped by a worker process.
1295  *
1296  * The master process is already holding an ACCESS SHARE lock. Ordinarily
1297  * it's no problem for a worker to get one too, but if anything else besides
1298  * pg_dump is running, there's a possible deadlock:
1299  *
1300  * 1) Master dumps the schema and locks all tables in ACCESS SHARE mode.
1301  * 2) Another process requests an ACCESS EXCLUSIVE lock (which is not granted
1302  * because the master holds a conflicting ACCESS SHARE lock).
1303  * 3) A worker process also requests an ACCESS SHARE lock to read the table.
1304  * The worker is enqueued behind the ACCESS EXCLUSIVE lock request.
1305  * 4) Now we have a deadlock, since the master is effectively waiting for
1306  * the worker. The server cannot detect that, however.
1307  *
1308  * To prevent an infinite wait, prior to touching a table in a worker, request
1309  * a lock in ACCESS SHARE mode but with NOWAIT. If we don't get the lock,
1310  * then we know that somebody else has requested an ACCESS EXCLUSIVE lock and
1311  * so we have a deadlock. We must fail the backup in that case.
1312  */
1313 static void
1315 {
1316  const char *qualId;
1317  PQExpBuffer query;
1318  PGresult *res;
1319 
1320  /* Nothing to do for BLOBS */
1321  if (strcmp(te->desc, "BLOBS") == 0)
1322  return;
1323 
1324  query = createPQExpBuffer();
1325 
1326  qualId = fmtQualifiedId(te->namespace, te->tag);
1327 
1328  appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE NOWAIT",
1329  qualId);
1330 
1331  res = PQexec(AH->connection, query->data);
1332 
1333  if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
1334  fatal("could not obtain lock on relation \"%s\"\n"
1335  "This usually means that someone requested an ACCESS EXCLUSIVE lock "
1336  "on the table after the pg_dump parent process had gotten the "
1337  "initial ACCESS SHARE lock on the table.", qualId);
1338 
1339  PQclear(res);
1340  destroyPQExpBuffer(query);
1341 }
1342 
1343 /*
1344  * WaitForCommands: main routine for a worker process.
1345  *
1346  * Read and execute commands from the master until we see EOF on the pipe.
1347  */
1348 static void
1350 {
1351  char *command;
1352  TocEntry *te;
1353  T_Action act;
1354  int status = 0;
1355  char buf[256];
1356 
1357  for (;;)
1358  {
1359  if (!(command = getMessageFromMaster(pipefd)))
1360  {
1361  /* EOF, so done */
1362  return;
1363  }
1364 
1365  /* Decode the command */
1366  parseWorkerCommand(AH, &te, &act, command);
1367 
1368  if (act == ACT_DUMP)
1369  {
1370  /* Acquire lock on this table within the worker's session */
1371  lockTableForWorker(AH, te);
1372 
1373  /* Perform the dump command */
1374  status = (AH->WorkerJobDumpPtr) (AH, te);
1375  }
1376  else if (act == ACT_RESTORE)
1377  {
1378  /* Perform the restore command */
1379  status = (AH->WorkerJobRestorePtr) (AH, te);
1380  }
1381  else
1382  Assert(false);
1383 
1384  /* Return status to master */
1385  buildWorkerResponse(AH, te, act, status, buf, sizeof(buf));
1386 
1387  sendMessageToMaster(pipefd, buf);
1388 
1389  /* command was pg_malloc'd and we are responsible for free()ing it. */
1390  free(command);
1391  }
1392 }
1393 
1394 /*
1395  * Check for status messages from workers.
1396  *
1397  * If do_wait is true, wait to get a status message; otherwise, just return
1398  * immediately if there is none available.
1399  *
1400  * When we get a status message, we pass the status code to the callback
1401  * function that was specified to DispatchJobForTocEntry, then reset the
1402  * worker status to IDLE.
1403  *
1404  * Returns true if we collected a status message, else false.
1405  *
1406  * XXX is it worth checking for more than one status message per call?
1407  * It seems somewhat unlikely that multiple workers would finish at exactly
1408  * the same time.
1409  */
1410 static bool
1412 {
1413  int worker;
1414  char *msg;
1415 
1416  /* Try to collect a status message */
1417  msg = getMessageFromWorker(pstate, do_wait, &worker);
1418 
1419  if (!msg)
1420  {
1421  /* If do_wait is true, we must have detected EOF on some socket */
1422  if (do_wait)
1423  fatal("a worker process died unexpectedly");
1424  return false;
1425  }
1426 
1427  /* Process it and update our idea of the worker's status */
1428  if (messageStartsWith(msg, "OK "))
1429  {
1430  ParallelSlot *slot = &pstate->parallelSlot[worker];
1431  TocEntry *te = pstate->te[worker];
1432  int status;
1433 
1434  status = parseWorkerResponse(AH, te, msg);
1435  slot->callback(AH, te, status, slot->callback_data);
1436  slot->workerStatus = WRKR_IDLE;
1437  pstate->te[worker] = NULL;
1438  }
1439  else
1440  fatal("invalid message received from worker: \"%s\"",
1441  msg);
1442 
1443  /* Free the string returned from getMessageFromWorker */
1444  free(msg);
1445 
1446  return true;
1447 }
1448 
1449 /*
1450  * Check for status results from workers, waiting if necessary.
1451  *
1452  * Available wait modes are:
1453  * WFW_NO_WAIT: reap any available status, but don't block
1454  * WFW_GOT_STATUS: wait for at least one more worker to finish
1455  * WFW_ONE_IDLE: wait for at least one worker to be idle
1456  * WFW_ALL_IDLE: wait for all workers to be idle
1457  *
1458  * Any received results are passed to the callback specified to
1459  * DispatchJobForTocEntry.
1460  *
1461  * This function is executed in the master process.
1462  */
1463 void
1465 {
1466  bool do_wait = false;
1467 
1468  /*
1469  * In GOT_STATUS mode, always block waiting for a message, since we can't
1470  * return till we get something. In other modes, we don't block the first
1471  * time through the loop.
1472  */
1473  if (mode == WFW_GOT_STATUS)
1474  {
1475  /* Assert that caller knows what it's doing */
1476  Assert(!IsEveryWorkerIdle(pstate));
1477  do_wait = true;
1478  }
1479 
1480  for (;;)
1481  {
1482  /*
1483  * Check for status messages, even if we don't need to block. We do
1484  * not try very hard to reap all available messages, though, since
1485  * there's unlikely to be more than one.
1486  */
1487  if (ListenToWorkers(AH, pstate, do_wait))
1488  {
1489  /*
1490  * If we got a message, we are done by definition for GOT_STATUS
1491  * mode, and we can also be certain that there's at least one idle
1492  * worker. So we're done in all but ALL_IDLE mode.
1493  */
1494  if (mode != WFW_ALL_IDLE)
1495  return;
1496  }
1497 
1498  /* Check whether we must wait for new status messages */
1499  switch (mode)
1500  {
1501  case WFW_NO_WAIT:
1502  return; /* never wait */
1503  case WFW_GOT_STATUS:
1504  Assert(false); /* can't get here, because we waited */
1505  break;
1506  case WFW_ONE_IDLE:
1507  if (GetIdleWorker(pstate) != NO_SLOT)
1508  return;
1509  break;
1510  case WFW_ALL_IDLE:
1511  if (IsEveryWorkerIdle(pstate))
1512  return;
1513  break;
1514  }
1515 
1516  /* Loop back, and this time wait for something to happen */
1517  do_wait = true;
1518  }
1519 }
1520 
1521 /*
1522  * Read one command message from the master, blocking if necessary
1523  * until one is available, and return it as a malloc'd string.
1524  * On EOF, return NULL.
1525  *
1526  * This function is executed in worker processes.
1527  */
1528 static char *
1529 getMessageFromMaster(int pipefd[2])
1530 {
1531  return readMessageFromPipe(pipefd[PIPE_READ]);
1532 }
1533 
1534 /*
1535  * Send a status message to the master.
1536  *
1537  * This function is executed in worker processes.
1538  */
1539 static void
1540 sendMessageToMaster(int pipefd[2], const char *str)
1541 {
1542  int len = strlen(str) + 1;
1543 
1544  if (pipewrite(pipefd[PIPE_WRITE], str, len) != len)
1545  fatal("could not write to the communication channel: %m");
1546 }
1547 
/*
 * Wait until some descriptor in "workerset" becomes readable.
 * Returns -1 on error, else the number of readable descriptors.
 *
 * maxFd is the highest descriptor present in the set.  select() is retried
 * for as long as it fails with an interrupted-call error; because select()
 * overwrites its argument, the caller's set is restored from a saved copy
 * before every attempt.  On return, *workerset holds select()'s result.
 */
static int
select_loop(int maxFd, fd_set *workerset)
{
	int			i;
	fd_set		saveSet = *workerset;

	for (;;)
	{
		*workerset = saveSet;
		i = select(maxFd + 1, workerset, NULL, NULL, NULL);

#ifndef WIN32
		if (i < 0 && errno == EINTR)
			continue;
#else
		if (i == SOCKET_ERROR && WSAGetLastError() == WSAEINTR)
			continue;
#endif
		break;
	}

	return i;
}
1575 
1576 
1577 /*
1578  * Check for messages from worker processes.
1579  *
1580  * If a message is available, return it as a malloc'd string, and put the
1581  * index of the sending worker in *worker.
1582  *
1583  * If nothing is available, wait if "do_wait" is true, else return NULL.
1584  *
1585  * If we detect EOF on any socket, we'll return NULL. It's not great that
1586  * that's hard to distinguish from the no-data-available case, but for now
1587  * our one caller is okay with that.
1588  *
1589  * This function is executed in the master process.
1590  */
1591 static char *
1592 getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
1593 {
1594  int i;
1595  fd_set workerset;
1596  int maxFd = -1;
1597  struct timeval nowait = {0, 0};
1598 
1599  /* construct bitmap of socket descriptors for select() */
1600  FD_ZERO(&workerset);
1601  for (i = 0; i < pstate->numWorkers; i++)
1602  {
1603  if (pstate->parallelSlot[i].workerStatus == WRKR_TERMINATED)
1604  continue;
1605  FD_SET(pstate->parallelSlot[i].pipeRead, &workerset);
1606  if (pstate->parallelSlot[i].pipeRead > maxFd)
1607  maxFd = pstate->parallelSlot[i].pipeRead;
1608  }
1609 
1610  if (do_wait)
1611  {
1612  i = select_loop(maxFd, &workerset);
1613  Assert(i != 0);
1614  }
1615  else
1616  {
1617  if ((i = select(maxFd + 1, &workerset, NULL, NULL, &nowait)) == 0)
1618  return NULL;
1619  }
1620 
1621  if (i < 0)
1622  fatal("select() failed: %m");
1623 
1624  for (i = 0; i < pstate->numWorkers; i++)
1625  {
1626  char *msg;
1627 
1628  if (!FD_ISSET(pstate->parallelSlot[i].pipeRead, &workerset))
1629  continue;
1630 
1631  /*
1632  * Read the message if any. If the socket is ready because of EOF,
1633  * we'll return NULL instead (and the socket will stay ready, so the
1634  * condition will persist).
1635  *
1636  * Note: because this is a blocking read, we'll wait if only part of
1637  * the message is available. Waiting a long time would be bad, but
1638  * since worker status messages are short and are always sent in one
1639  * operation, it shouldn't be a problem in practice.
1640  */
1641  msg = readMessageFromPipe(pstate->parallelSlot[i].pipeRead);
1642  *worker = i;
1643  return msg;
1644  }
1645  Assert(false);
1646  return NULL;
1647 }
1648 
1649 /*
1650  * Send a command message to the specified worker process.
1651  *
1652  * This function is executed in the master process.
1653  */
1654 static void
1655 sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
1656 {
1657  int len = strlen(str) + 1;
1658 
1659  if (pipewrite(pstate->parallelSlot[worker].pipeWrite, str, len) != len)
1660  {
1661  fatal("could not write to the communication channel: %m");
1662  }
1663 }
1664 
/*
 * Read one message from the specified pipe (fd), blocking if necessary
 * until one is available, and return it as a malloc'd string.
 * On EOF, return NULL.
 *
 * A read error is treated the same as EOF; in either case any partially
 * collected message is discarded.
 *
 * A "message" on the channel is just a null-terminated string.
 */
static char *
readMessageFromPipe(int fd)
{
	char	   *msg;
	int			msgsize,
				bufsize;
	int			ret;

	/*
	 * In theory, if we let piperead() read multiple bytes, it might give us
	 * back fragments of multiple messages.  (That can't actually occur, since
	 * neither master nor workers send more than one message without waiting
	 * for a reply, but we don't wish to assume that here.)  For simplicity,
	 * read a byte at a time until we get the terminating '\0'.  This method
	 * is a bit inefficient, but since this is only used for relatively short
	 * command and status strings, it shouldn't matter.
	 */
	bufsize = 64;				/* could be any number */
	msg = (char *) pg_malloc(bufsize);
	msgsize = 0;
	for (;;)
	{
		Assert(msgsize < bufsize);
		ret = piperead(fd, msg + msgsize, 1);
		if (ret <= 0)
			break;				/* error or connection closure */

		Assert(ret == 1);

		if (msg[msgsize] == '\0')
			return msg;			/* collected whole message */

		msgsize++;
		if (msgsize == bufsize) /* enlarge buffer if needed */
		{
			bufsize += 16;		/* could be any number */
			msg = (char *) pg_realloc(msg, bufsize);
		}
	}

	/* Other end has closed the connection */
	pg_free(msg);
	return NULL;
}
1716 
1717 #ifdef WIN32
1718 
1719 /*
1720  * This is a replacement version of pipe(2) for Windows which allows the pipe
1721  * handles to be used in select().
1722  *
1723  * Reads and writes on the pipe must go through piperead()/pipewrite().
1724  *
1725  * For consistency with Unix we declare the returned handles as "int".
1726  * This is okay even on WIN64 because system handles are not more than
1727  * 32 bits wide, but we do have to do some casting.
1728  */
1729 static int
1730 pgpipe(int handles[2])
1731 {
1732  pgsocket s,
1733  tmp_sock;
1734  struct sockaddr_in serv_addr;
1735  int len = sizeof(serv_addr);
1736 
1737  /* We have to use the Unix socket invalid file descriptor value here. */
1738  handles[0] = handles[1] = -1;
1739 
1740  /*
1741  * setup listen socket
1742  */
1743  if ((s = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
1744  {
1745  pg_log_error("pgpipe: could not create socket: error code %d",
1746  WSAGetLastError());
1747  return -1;
1748  }
1749 
1750  memset((void *) &serv_addr, 0, sizeof(serv_addr));
1751  serv_addr.sin_family = AF_INET;
1752  serv_addr.sin_port = pg_hton16(0);
1753  serv_addr.sin_addr.s_addr = pg_hton32(INADDR_LOOPBACK);
1754  if (bind(s, (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
1755  {
1756  pg_log_error("pgpipe: could not bind: error code %d",
1757  WSAGetLastError());
1758  closesocket(s);
1759  return -1;
1760  }
1761  if (listen(s, 1) == SOCKET_ERROR)
1762  {
1763  pg_log_error("pgpipe: could not listen: error code %d",
1764  WSAGetLastError());
1765  closesocket(s);
1766  return -1;
1767  }
1768  if (getsockname(s, (SOCKADDR *) &serv_addr, &len) == SOCKET_ERROR)
1769  {
1770  pg_log_error("pgpipe: getsockname() failed: error code %d",
1771  WSAGetLastError());
1772  closesocket(s);
1773  return -1;
1774  }
1775 
1776  /*
1777  * setup pipe handles
1778  */
1779  if ((tmp_sock = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
1780  {
1781  pg_log_error("pgpipe: could not create second socket: error code %d",
1782  WSAGetLastError());
1783  closesocket(s);
1784  return -1;
1785  }
1786  handles[1] = (int) tmp_sock;
1787 
1788  if (connect(handles[1], (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
1789  {
1790  pg_log_error("pgpipe: could not connect socket: error code %d",
1791  WSAGetLastError());
1792  closesocket(handles[1]);
1793  handles[1] = -1;
1794  closesocket(s);
1795  return -1;
1796  }
1797  if ((tmp_sock = accept(s, (SOCKADDR *) &serv_addr, &len)) == PGINVALID_SOCKET)
1798  {
1799  pg_log_error("pgpipe: could not accept connection: error code %d",
1800  WSAGetLastError());
1801  closesocket(handles[1]);
1802  handles[1] = -1;
1803  closesocket(s);
1804  return -1;
1805  }
1806  handles[0] = (int) tmp_sock;
1807 
1808  closesocket(s);
1809  return 0;
1810 }
1811 
1812 /*
1813  * Windows implementation of reading from a pipe.
1814  */
1815 static int
1816 piperead(int s, char *buf, int len)
1817 {
1818  int ret = recv(s, buf, len, 0);
1819 
1820  if (ret < 0 && WSAGetLastError() == WSAECONNRESET)
1821  {
1822  /* EOF on the pipe! */
1823  ret = 0;
1824  }
1825  return ret;
1826 }
1827 
1828 #endif /* WIN32 */
void on_exit_nicely(on_exit_nicely_callback function, void *arg)
static PgChecksumMode mode
Definition: pg_checksums.c:61
int pipeRead
Definition: parallel.c:100
int DumpId
Definition: pg_backup.h:234
static void buildWorkerCommand(ArchiveHandle *AH, TocEntry *te, T_Action act, char *buf, int buflen)
Definition: parallel.c:1121
#define SIGQUIT
Definition: win32_port.h:155
PQExpBufferData * PQExpBuffer
Definition: pqexpbuffer.h:51
void DispatchJobForTocEntry(ArchiveHandle *AH, ParallelState *pstate, TocEntry *te, T_Action act, ParallelCompletionPtr callback, void *callback_data)
Definition: parallel.c:1218
#define accept(s, addr, addrlen)
Definition: win32_port.h:434
static void set_cancel_pstate(ParallelState *pstate)
Definition: parallel.c:798
struct DumpSignalInformation DumpSignalInformation
ArchiveHandle * AH
Definition: parallel.c:98
bool IsEveryWorkerIdle(ParallelState *pstate)
Definition: parallel.c:1281
void * pg_malloc(size_t size)
Definition: fe_memutils.c:47
void * callback_data
Definition: parallel.c:96
static int select_loop(int maxFd, fd_set *workerset)
Definition: parallel.c:1553
#define pg_log_error(...)
Definition: logging.h:79
struct WorkerInfoData * WorkerInfo
Definition: autovacuum.c:233
#define closesocket
Definition: port.h:312
static void buildWorkerResponse(ArchiveHandle *AH, TocEntry *te, T_Action act, int status, char *buf, int buflen)
Definition: parallel.c:1169
void PQfreeCancel(PGcancel *cancel)
Definition: fe-connect.c:4269
#define pg_hton16(x)
Definition: pg_bswap.h:120
static int GetIdleWorker(ParallelState *pstate)
Definition: parallel.c:1249
struct ShutdownInformation ShutdownInformation
ArchiveHandle * myAH
Definition: parallel.c:162
#define write_stderr(str)
Definition: parallel.c:181
#define PIPE_READ
Definition: parallel.c:70
static bool HasEveryWorkerTerminated(ParallelState *pstate)
Definition: parallel.c:1265
#define kill(pid, sig)
Definition: win32_port.h:426
static void setup_cancel_handler(void)
Definition: parallel.c:617
#define connect(s, name, namelen)
Definition: win32_port.h:435
#define SIGPIPE
Definition: win32_port.h:159
int n_errors
Definition: pg_backup.h:206
PGcancel *volatile connCancel
static void ShutdownWorkersHard(ParallelState *pstate)
Definition: parallel.c:409
void on_exit_close_archive(Archive *AHX)
Definition: parallel.c:342
const char * progname
Definition: pg_standby.c:36
#define bind(s, addr, addrlen)
Definition: win32_port.h:432
SetupWorkerPtrType SetupWorkerPtr
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define recv(s, buf, len, flags)
Definition: win32_port.h:437
ExecStatusType PQresultStatus(const PGresult *res)
Definition: fe-exec.c:2692
#define WORKER_IGNORED_ERRORS
static char * getMessageFromMaster(int pipefd[2])
Definition: parallel.c:1529
void DeCloneArchive(ArchiveHandle *AH)
void destroyPQExpBuffer(PQExpBuffer str)
Definition: pqexpbuffer.c:116
ParallelState * pstate
Definition: parallel.c:163
#define pg_hton32(x)
Definition: pg_bswap.h:121
void * pg_malloc0(size_t size)
Definition: fe_memutils.c:53
static ParallelSlot * GetMyPSlot(ParallelState *pstate)
Definition: parallel.c:278
PGconn * conn
Definition: streamutil.c:54
pid_t pid
Definition: parallel.c:110
#define NO_SLOT
Definition: parallel.c:73
void appendPQExpBuffer(PQExpBuffer str, const char *fmt,...)
Definition: pqexpbuffer.c:267
static void RunWorker(ArchiveHandle *AH, ParallelSlot *slot)
Definition: parallel.c:838
WFW_WaitOption
Definition: parallel.h:28
void(* ParallelCompletionPtr)(ArchiveHandle *AH, TocEntry *te, int status, void *callback_data)
Definition: parallel.h:22
static char * buf
Definition: pg_test_fsync.c:67
TocEntry * getTocEntryByDumpId(ArchiveHandle *AH, DumpId id)
int pipeRevRead
Definition: parallel.c:102
static char * readMessageFromPipe(int fd)
Definition: parallel.c:1673
PGcancel * PQgetCancel(PGconn *conn)
Definition: fe-connect.c:4246
ArchiveHandle * CloneArchive(ArchiveHandle *AH)
#define pgpipe(a)
Definition: parallel.c:134
#define select(n, r, w, e, timeout)
Definition: win32_port.h:436
static char * getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
Definition: parallel.c:1592
int pgsocket
Definition: port.h:31
void * pg_realloc(void *ptr, size_t size)
Definition: fe_memutils.c:65
void DisconnectDatabase(Archive *AHX)
Definition: pg_backup_db.c:337
ParallelState * ParallelBackupStart(ArchiveHandle *AH)
Definition: parallel.c:906
#define SIG_IGN
Definition: win32_port.h:151
ParallelSlot * parallelSlot
Definition: parallel.h:45
#define listen(s, backlog)
Definition: win32_port.h:433
void ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate)
Definition: parallel.c:1072
#define exit_nicely(code)
Definition: pg_dumpall.c:95
static void sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
Definition: parallel.c:1655
ParallelState * pstate
Definition: parallel.c:145
#define socket(af, type, protocol)
Definition: win32_port.h:431
#define PGINVALID_SOCKET
Definition: port.h:33
PQExpBuffer createPQExpBuffer(void)
Definition: pqexpbuffer.c:74
static bool ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate, bool do_wait)
Definition: parallel.c:1411
static void lockTableForWorker(ArchiveHandle *AH, TocEntry *te)
Definition: parallel.c:1314
void WaitForWorkers(ArchiveHandle *AH, ParallelState *pstate, WFW_WaitOption mode)
Definition: parallel.c:1464
void PQclear(PGresult *res)
Definition: fe-exec.c:694
void init_parallel_dump_utils(void)
Definition: parallel.c:246
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
T_WorkerStatus
Definition: parallel.c:76
static ShutdownInformation shutdown_info
Definition: parallel.c:149
#define SIGNAL_ARGS
Definition: c.h:1288
static void parseWorkerCommand(ArchiveHandle *AH, TocEntry **te, T_Action *act, const char *msg)
Definition: parallel.c:1136
#define Assert(condition)
Definition: c.h:739
TocEntry ** te
Definition: parallel.h:44
ParallelCompletionPtr callback
Definition: parallel.c:95
int pipeRevWrite
Definition: parallel.c:103
PQExpBuffer(* getLocalPQExpBuffer)(void)
Definition: string_utils.c:27
const char * fmtQualifiedId(const char *schema, const char *id)
Definition: string_utils.c:145
#define fatal(...)
int numWorkers
Definition: pg_backup.h:193
void pg_free(void *ptr)
Definition: fe_memutils.c:105
static volatile DumpSignalInformation signal_info
Definition: parallel.c:170
T_WorkerStatus workerStatus
Definition: parallel.c:92
#define piperead(a, b, c)
Definition: parallel.c:135
int PQcancel(PGcancel *cancel, char *errbuf, int errbufsize)
Definition: fe-connect.c:4401
static bool do_wait
Definition: pg_ctl.c:79
void set_archive_cancel_info(ArchiveHandle *AH, PGconn *conn)
Definition: parallel.c:739
int pipeWrite
Definition: parallel.c:101
int i
WorkerJobRestorePtrType WorkerJobRestorePtr
static void set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH)
Definition: parallel.c:818
PGresult * PQexec(PGconn *conn, const char *query)
Definition: fe-exec.c:1939
void * arg
#define pipewrite(a, b, c)
Definition: parallel.c:136
static void WaitForTerminatingWorkers(ParallelState *pstate)
Definition: parallel.c:458
static void sigTermHandler(SIGNAL_ARGS)
Definition: parallel.c:557
#define EINTR
Definition: win32_port.h:323
static int parseWorkerResponse(ArchiveHandle *AH, TocEntry *te, const char *msg)
Definition: parallel.c:1184
static void archive_close_connection(int code, void *arg)
Definition: parallel.c:353
void resetPQExpBuffer(PQExpBuffer str)
Definition: pqexpbuffer.c:148
#define PIPE_WRITE
Definition: parallel.c:71
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:226
#define snprintf
Definition: port.h:192
int numWorkers
Definition: parallel.h:42
#define messageStartsWith(msg, prefix)
Definition: parallel.c:223
WorkerJobDumpPtrType WorkerJobDumpPtr
static void sendMessageToMaster(int pipefd[2], const char *str)
Definition: parallel.c:1540
static void WaitForCommands(ArchiveHandle *AH, int pipefd[2])
Definition: parallel.c:1349