PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
parallel.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * parallel.c
4  *
5  * Parallel support for pg_dump and pg_restore
6  *
7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/bin/pg_dump/parallel.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 /*
17  * Parallel operation works like this:
18  *
19  * The original, master process calls ParallelBackupStart(), which forks off
20  * the desired number of worker processes, which each enter WaitForCommands().
21  *
22  * The master process dispatches an individual work item to one of the worker
23  * processes in DispatchJobForTocEntry(). We send a command string such as
24  * "DUMP 1234" or "RESTORE 1234", where 1234 is the TocEntry ID.
25  * The worker process receives and decodes the command and passes it to the
26  * routine pointed to by AH->WorkerJobDumpPtr or AH->WorkerJobRestorePtr,
27  * which are routines of the current archive format. That routine performs
28  * the required action (dump or restore) and returns an integer status code.
29  * This is passed back to the master where we pass it to the
30  * ParallelCompletionPtr callback function that was passed to
31  * DispatchJobForTocEntry(). The callback function does state updating
32  * for the master control logic in pg_backup_archiver.c.
33  *
34  * In principle additional archive-format-specific information might be needed
35  * in commands or worker status responses, but so far that hasn't proved
36  * necessary, since workers have full copies of the ArchiveHandle/TocEntry
37  * data structures. Remember that we have forked off the workers only after
38  * we have read in the catalog. That's why our worker processes can also
39  * access the catalog information. (In the Windows case, the workers are
40  * threads in the same process. To avoid problems, they work with cloned
41  * copies of the Archive data structure; see RunWorker().)
42  *
43  * In the master process, the workerStatus field for each worker has one of
44  * the following values:
45  * WRKR_IDLE: it's waiting for a command
46  * WRKR_WORKING: it's working on a command
47  * WRKR_TERMINATED: process ended
48  * The pstate->te[] entry for each worker is valid when it's in WRKR_WORKING
49  * state, and must be NULL in other states.
50  */
51 
52 #include "postgres_fe.h"
53 
54 #ifndef WIN32
55 #include <sys/wait.h>
56 #include <signal.h>
57 #include <unistd.h>
58 #include <fcntl.h>
59 #endif
60 #ifdef HAVE_SYS_SELECT_H
61 #include <sys/select.h>
62 #endif
63 
64 #include "parallel.h"
65 #include "pg_backup_utils.h"
66 #include "fe_utils/string_utils.h"
67 
68 /* Mnemonic macros for indexing the fd array returned by pipe(2) */
69 #define PIPE_READ 0
70 #define PIPE_WRITE 1
71 
72 #define NO_SLOT (-1) /* Failure result for GetIdleWorker() */
73 
74 /* Worker process statuses */
75 typedef enum
76 {
81 
82 /*
83  * Private per-parallel-worker state (typedef for this is in parallel.h).
84  *
85  * Much of this is valid only in the master process (or, on Windows, should
86  * be touched only by the master thread). But the AH field should be touched
87  * only by workers. The pipe descriptors are valid everywhere.
88  */
90 {
91  T_WorkerStatus workerStatus; /* see enum above */
92 
93  /* These fields are valid if workerStatus == WRKR_WORKING: */
94  ParallelCompletionPtr callback; /* function to call on completion */
95  void *callback_data; /* passthrough data for it */
96 
97  ArchiveHandle *AH; /* Archive data worker is using */
98 
99  int pipeRead; /* master's end of the pipes */
101  int pipeRevRead; /* child's end of the pipes */
103 
104  /* Child process/thread identity info: */
105 #ifdef WIN32
106  uintptr_t hThread;
107  unsigned int threadId;
108 #else
109  pid_t pid;
110 #endif
111 };
112 
113 #ifdef WIN32
114 
115 /*
116  * Structure to hold info passed by _beginthreadex() to the function it calls
117  * via its single allowed argument.
118  */
119 typedef struct
120 {
121  ArchiveHandle *AH; /* master's archive handle, passed on to RunWorker() */
122  ParallelSlot *slot; /* parallel slot of the worker thread being started */
123 } WorkerInfo;
124 
125 /* Windows implementation of pipe access */
126 static int pgpipe(int handles[2]);
127 static int piperead(int s, char *buf, int len);
128 #define pipewrite(a,b,c) send(a,b,c,0)
129 
130 #else /* !WIN32 */
131 
132 /* Non-Windows implementation of pipe access */
133 #define pgpipe(a) pipe(a)
134 #define piperead(a,b,c) read(a,b,c)
135 #define pipewrite(a,b,c) write(a,b,c)
136 
137 #endif /* WIN32 */
138 
139 /*
140  * State info for archive_close_connection() shutdown callback.
141  */
142 typedef struct ShutdownInformation
143 {
147 
149 
150 /*
151  * State info for signal handling.
152  * We assume signal_info initializes to zeroes.
153  *
154  * On Unix, myAH is the master DB connection in the master process, and the
155  * worker's own connection in worker processes. On Windows, we have only one
156  * instance of signal_info, so myAH is the master connection and the worker
157  * connections must be dug out of pstate->parallelSlot[].
158  */
159 typedef struct DumpSignalInformation
160 {
161  ArchiveHandle *myAH; /* database connection to issue cancel for */
162  ParallelState *pstate; /* parallel state, if any */
163  bool handler_set; /* signal handler set up in this process? */
164 #ifndef WIN32
165  bool am_worker; /* am I a worker process? */
166 #endif
168 
170 
171 #ifdef WIN32
172 static CRITICAL_SECTION signal_info_lock;
173 #endif
174 
175 /*
176  * Write a simple string to stderr --- must be safe in a signal handler.
177  * We ignore the write() result since there's not much we could do about it.
178  * Certain compilers make that harder than it ought to be.
179  */
180 #define write_stderr(str) \
181  do { \
182  const char *str_ = (str); /* expand the macro argument exactly once */ \
183  int rc_; \
184  rc_ = write(fileno(stderr), str_, strlen(str_)); \
185  (void) rc_; /* result deliberately discarded; the cast marks it as used */ \
186  } while (0)
187 
188 
189 #ifdef WIN32
190 /* file-scope variables */
191 static DWORD tls_index;
192 
193 /* globally visible variables (needed by exit_nicely) */
194 bool parallel_init_done = false;
195 DWORD mainThreadId;
196 #endif /* WIN32 */
197 
198 static const char *modulename = gettext_noop("parallel archiver");
199 
200 /* Local function prototypes */
201 static ParallelSlot *GetMyPSlot(ParallelState *pstate);
202 static void archive_close_connection(int code, void *arg);
203 static void ShutdownWorkersHard(ParallelState *pstate);
204 static void WaitForTerminatingWorkers(ParallelState *pstate);
205 static void setup_cancel_handler(void);
206 static void set_cancel_pstate(ParallelState *pstate);
207 static void set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH);
208 static void RunWorker(ArchiveHandle *AH, ParallelSlot *slot);
209 static int GetIdleWorker(ParallelState *pstate);
210 static bool HasEveryWorkerTerminated(ParallelState *pstate);
211 static void lockTableForWorker(ArchiveHandle *AH, TocEntry *te);
212 static void WaitForCommands(ArchiveHandle *AH, int pipefd[2]);
213 static bool ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate,
214  bool do_wait);
215 static char *getMessageFromMaster(int pipefd[2]);
216 static void sendMessageToMaster(int pipefd[2], const char *str);
217 static int select_loop(int maxFd, fd_set *workerset);
218 static char *getMessageFromWorker(ParallelState *pstate,
219  bool do_wait, int *worker);
220 static void sendMessageToWorker(ParallelState *pstate,
221  int worker, const char *str);
222 static char *readMessageFromPipe(int fd);
223 
224 #define messageStartsWith(msg, prefix) /* true if msg begins with prefix; note prefix is expanded twice */ \
225  (strncmp(msg, prefix, strlen(prefix)) == 0)
226 #define messageEquals(msg, pattern) /* true if msg and pattern are identical strings */ \
227  (strcmp(msg, pattern) == 0)
228 
229 
230 /*
231  * Shutdown callback to clean up socket access
232  */
233 #ifdef WIN32
234 static void
235 shutdown_parallel_dump_utils(int code, void *unused)
236 {
237  /* Shut down socket access, but only in the thread recorded in mainThreadId; neither argument is used */
238  if (mainThreadId == GetCurrentThreadId())
239  WSACleanup();
240 }
241 #endif
242 
243 /*
244  * Initialize parallel dump support --- should be called early in process
245  * startup. (Currently, this is called whether or not we intend parallel
246  * activity.)
247  */
248 void
250 {
251 #ifdef WIN32
252  if (!parallel_init_done)
253  {
254  WSADATA wsaData;
255  int err;
256 
257  /* Prepare for threaded operation */
258  tls_index = TlsAlloc();
259  mainThreadId = GetCurrentThreadId();
260 
261  /* Initialize socket access */
262  err = WSAStartup(MAKEWORD(2, 2), &wsaData);
263  if (err != 0)
264  {
265  fprintf(stderr, _("%s: WSAStartup failed: %d\n"), progname, err);
266  exit_nicely(1);
267  }
268  /* ... and arrange to shut it down at exit */
269  on_exit_nicely(shutdown_parallel_dump_utils, NULL);
270  parallel_init_done = true;
271  }
272 #endif
273 }
274 
275 /*
276  * Find the ParallelSlot for the current worker process or thread.
277  *
278  * Returns NULL if no matching slot is found (this implies we're the master).
279  */
280 static ParallelSlot *
282 {
283  int i;
284 
285  for (i = 0; i < pstate->numWorkers; i++)
286  {
287 #ifdef WIN32
288  if (pstate->parallelSlot[i].threadId == GetCurrentThreadId())
289 #else
290  if (pstate->parallelSlot[i].pid == getpid())
291 #endif
292  return &(pstate->parallelSlot[i]);
293  }
294 
295  return NULL;
296 }
297 
298 /*
299  * A thread-local version of getLocalPQExpBuffer().
300  *
301  * Non-reentrant but reduces memory leakage: we'll consume one buffer per
302  * thread, which is much better than one per fmtId/fmtQualifiedId call.
303  */
304 #ifdef WIN32
305 static PQExpBuffer
306 getThreadLocalPQExpBuffer(void)
307 {
308  /*
309  * The Tls code goes awry if we use a static var, so the buffer pointer is
310  * kept in s_id_return only while parallel_init_done is false, and in the
311  * tls_index slot otherwise. We rely on TlsGetValue() returning 0 if unset.
312  */
313  static PQExpBuffer s_id_return = NULL;
314  PQExpBuffer id_return;
315 
316  if (parallel_init_done)
317  id_return = (PQExpBuffer) TlsGetValue(tls_index);
318  else
319  id_return = s_id_return;
320 
321  if (id_return) /* buffer already created by an earlier call? */
322  {
323  /* reuse the same buffer, just wipe its contents */
324  resetPQExpBuffer(id_return);
325  }
326  else
327  {
328  /* first time through: create a buffer and remember where we put it */
329  id_return = createPQExpBuffer();
330  if (parallel_init_done)
331  TlsSetValue(tls_index, id_return);
332  else
333  s_id_return = id_return;
334  }
335 
336  return id_return;
337 }
338 #endif /* WIN32 */
339 
340 /*
341  * pg_dump and pg_restore call this to register the cleanup handler
342  * as soon as they've created the ArchiveHandle.
343  */
344 void
346 {
347  shutdown_info.AHX = AHX;
348  on_exit_nicely(archive_close_connection, &shutdown_info);
349 }
350 
351 /*
352  * on_exit_nicely handler for shutting down database connections and
353  * worker processes cleanly.
354  */
355 static void
357 {
359 
360  if (si->pstate)
361  {
362  /* In parallel mode, must figure out who we are */
363  ParallelSlot *slot = GetMyPSlot(si->pstate);
364 
365  if (!slot)
366  {
367  /*
368  * We're the master. Forcibly shut down workers, then close our
369  * own database connection, if any.
370  */
372 
373  if (si->AHX)
374  DisconnectDatabase(si->AHX);
375  }
376  else
377  {
378  /*
379  * We're a worker. Shut down our own DB connection if any. On
380  * Windows, we also have to close our communication sockets, to
381  * emulate what will happen on Unix when the worker process exits.
382  * (Without this, if this is a premature exit, the master would
383  * fail to detect it because there would be no EOF condition on
384  * the other end of the pipe.)
385  */
386  if (slot->AH)
387  DisconnectDatabase(&(slot->AH->public));
388 
389 #ifdef WIN32
390  closesocket(slot->pipeRevRead);
391  closesocket(slot->pipeRevWrite);
392 #endif
393  }
394  }
395  else
396  {
397  /* Non-parallel operation: just kill the master DB connection */
398  if (si->AHX)
399  DisconnectDatabase(si->AHX);
400  }
401 }
402 
403 /*
404  * Forcibly shut down any remaining workers, waiting for them to finish.
405  *
406  * Note that we don't expect to come here during normal exit (the workers
407  * should be long gone, and the ParallelState too). We're only here in an
408  * exit_horribly() situation, so intervening to cancel active commands is
409  * appropriate.
410  */
411 static void
413 {
414  int i;
415 
416  /*
417  * Close our write end of the sockets so that any workers waiting for
418  * commands know they can exit.
419  */
420  for (i = 0; i < pstate->numWorkers; i++)
421  closesocket(pstate->parallelSlot[i].pipeWrite);
422 
423  /*
424  * Force early termination of any commands currently in progress.
425  */
426 #ifndef WIN32
427  /* On non-Windows, send SIGTERM to each worker process. */
428  for (i = 0; i < pstate->numWorkers; i++)
429  {
430  pid_t pid = pstate->parallelSlot[i].pid;
431 
432  if (pid != 0)
433  kill(pid, SIGTERM);
434  }
435 #else
436 
437  /*
438  * On Windows, send query cancels directly to the workers' backends. Use
439  * a critical section to ensure worker threads don't change state.
440  */
441  EnterCriticalSection(&signal_info_lock);
442  for (i = 0; i < pstate->numWorkers; i++)
443  {
444  ArchiveHandle *AH = pstate->parallelSlot[i].AH;
445  char errbuf[1];
446 
447  if (AH != NULL && AH->connCancel != NULL)
448  (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
449  }
450  LeaveCriticalSection(&signal_info_lock);
451 #endif
452 
453  /* Now wait for them to terminate. */
455 }
456 
457 /*
458  * Wait for all workers to terminate.
459  */
460 static void
462 {
463  while (!HasEveryWorkerTerminated(pstate))
464  {
465  ParallelSlot *slot = NULL;
466  int j;
467 
468 #ifndef WIN32
469  /* On non-Windows, use wait() to wait for next worker to end */
470  int status;
471  pid_t pid = wait(&status);
472 
473  /* Find dead worker's slot, and clear the PID field */
474  for (j = 0; j < pstate->numWorkers; j++)
475  {
476  slot = &(pstate->parallelSlot[j]);
477  if (slot->pid == pid)
478  {
479  slot->pid = 0;
480  break;
481  }
482  }
483 #else /* WIN32 */
484  /* On Windows, we must use WaitForMultipleObjects() */
485  HANDLE *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers);
486  int nrun = 0;
487  DWORD ret;
488  uintptr_t hThread;
489 
490  for (j = 0; j < pstate->numWorkers; j++)
491  {
492  if (pstate->parallelSlot[j].workerStatus != WRKR_TERMINATED)
493  {
494  lpHandles[nrun] = (HANDLE) pstate->parallelSlot[j].hThread;
495  nrun++;
496  }
497  }
498  ret = WaitForMultipleObjects(nrun, lpHandles, false, INFINITE);
499  Assert(ret != WAIT_FAILED);
500  hThread = (uintptr_t) lpHandles[ret - WAIT_OBJECT_0];
501  free(lpHandles);
502 
503  /* Find dead worker's slot, and clear the hThread field */
504  for (j = 0; j < pstate->numWorkers; j++)
505  {
506  slot = &(pstate->parallelSlot[j]);
507  if (slot->hThread == hThread)
508  {
509  /* For cleanliness, close handles for dead threads */
510  CloseHandle((HANDLE) slot->hThread);
511  slot->hThread = (uintptr_t) INVALID_HANDLE_VALUE;
512  break;
513  }
514  }
515 #endif /* WIN32 */
516 
517  /* On all platforms, update workerStatus and te[] as well */
518  Assert(j < pstate->numWorkers);
520  pstate->te[j] = NULL;
521  }
522 }
523 
524 
525 /*
526  * Code for responding to cancel interrupts (SIGINT, control-C, etc)
527  *
528  * This doesn't quite belong in this module, but it needs access to the
529  * ParallelState data, so there's not really a better place either.
530  *
531  * When we get a cancel interrupt, we could just die, but in pg_restore that
532  * could leave a SQL command (e.g., CREATE INDEX on a large table) running
533  * for a long time. Instead, we try to send a cancel request and then die.
534  * pg_dump probably doesn't really need this, but we might as well use it
535  * there too. Note that sending the cancel directly from the signal handler
536  * is safe because PQcancel() is written to make it so.
537  *
538  * In parallel operation on Unix, each process is responsible for canceling
539  * its own connection (this must be so because nobody else has access to it).
540  * Furthermore, the master process should attempt to forward its signal to
541  * each child. In simple manual use of pg_dump/pg_restore, forwarding isn't
542  * needed because typing control-C at the console would deliver SIGINT to
543  * every member of the terminal process group --- but in other scenarios it
544  * might be that only the master gets signaled.
545  *
546  * On Windows, the cancel handler runs in a separate thread, because that's
547  * how SetConsoleCtrlHandler works. We make it stop worker threads, send
548  * cancels on all active connections, and then return FALSE, which will allow
549  * the process to die. For safety's sake, we use a critical section to
550  * protect the PGcancel structures against being changed while the signal
551  * thread runs.
552  */
553 
554 #ifndef WIN32
555 
556 /*
557  * Signal handler (Unix only)
558  */
559 static void
561 {
562  int i;
563  char errbuf[1];
564 
565  /*
566  * Some platforms allow delivery of new signals to interrupt an active
567  * signal handler. That could muck up our attempt to send PQcancel, so
568  * disable the signals that setup_cancel_handler enabled.
569  */
570  pqsignal(SIGINT, SIG_IGN);
571  pqsignal(SIGTERM, SIG_IGN);
573 
574  /*
575  * If we're in the master, forward signal to all workers. (It seems best
576  * to do this before PQcancel; killing the master transaction will result
577  * in invalid-snapshot errors from active workers, which maybe we can
578  * quiet by killing workers first.) Ignore any errors.
579  */
580  if (signal_info.pstate != NULL)
581  {
582  for (i = 0; i < signal_info.pstate->numWorkers; i++)
583  {
584  pid_t pid = signal_info.pstate->parallelSlot[i].pid;
585 
586  if (pid != 0)
587  kill(pid, SIGTERM);
588  }
589  }
590 
591  /*
592  * Send QueryCancel if we have a connection to send to. Ignore errors,
593  * there's not much we can do about them anyway.
594  */
595  if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
596  (void) PQcancel(signal_info.myAH->connCancel, errbuf, sizeof(errbuf));
597 
598  /*
599  * Report we're quitting, using nothing more complicated than write(2).
600  * When in parallel operation, only the master process should do this.
601  */
602  if (!signal_info.am_worker)
603  {
604  if (progname)
605  {
607  write_stderr(": ");
608  }
609  write_stderr("terminated by user\n");
610  }
611 
612  /* And die. */
613  exit(1);
614 }
615 
616 /*
617  * Enable cancel interrupt handler, if not already done.
618  */
619 static void
621 {
622  /*
623  * When forking, signal_info.handler_set will propagate into the new
624  * process, but that's fine because the signal handler state does too.
625  */
626  if (!signal_info.handler_set)
627  {
628  signal_info.handler_set = true;
629 
630  pqsignal(SIGINT, sigTermHandler);
631  pqsignal(SIGTERM, sigTermHandler);
633  }
634 }
635 
636 #else /* WIN32 */
637 
638 /*
639  * Console interrupt handler --- runs in a newly-started thread.
640  *
641  * After stopping other threads and sending cancel requests on all open
642  * connections, we return FALSE which will allow the default ExitProcess()
643  * action to be taken.
644  */
645 static BOOL WINAPI
646 consoleHandler(DWORD dwCtrlType)
647 {
648  int i;
649  char errbuf[1];
650 
651  if (dwCtrlType == CTRL_C_EVENT ||
652  dwCtrlType == CTRL_BREAK_EVENT)
653  {
654  /* Critical section prevents changing data we look at here */
655  EnterCriticalSection(&signal_info_lock);
656 
657  /*
658  * If in parallel mode, stop worker threads and send QueryCancel to
659  * their connected backends. The main point of stopping the worker
660  * threads is to keep them from reporting the query cancels as errors,
661  * which would clutter the user's screen. We needn't stop the master
662  * thread since it won't be doing much anyway. Do this before
663  * canceling the main transaction, else we might get invalid-snapshot
664  * errors reported before we can stop the workers. Ignore errors,
665  * there's not much we can do about them anyway.
666  */
667  if (signal_info.pstate != NULL)
668  {
669  for (i = 0; i < signal_info.pstate->numWorkers; i++)
670  {
671  ParallelSlot *slot = &(signal_info.pstate->parallelSlot[i]);
672  ArchiveHandle *AH = slot->AH;
673  HANDLE hThread = (HANDLE) slot->hThread;
674 
675  /*
676  * Using TerminateThread here may leave some resources leaked,
677  * but it doesn't matter since we're about to end the whole
678  * process.
679  */
680  if (hThread != INVALID_HANDLE_VALUE)
681  TerminateThread(hThread, 0);
682 
683  if (AH != NULL && AH->connCancel != NULL)
684  (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
685  }
686  }
687 
688  /*
689  * Send QueryCancel to master connection, if enabled. Ignore errors,
690  * there's not much we can do about them anyway.
691  */
692  if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
693  (void) PQcancel(signal_info.myAH->connCancel,
694  errbuf, sizeof(errbuf));
695 
696  LeaveCriticalSection(&signal_info_lock);
697 
698  /*
699  * Report we're quitting, using nothing more complicated than
700  * write(2). (We might be able to get away with using write_msg()
701  * here, but since we terminated other threads uncleanly above, it
702  * seems better to assume as little as possible.)
703  */
704  if (progname)
705  {
707  write_stderr(": ");
708  }
709  write_stderr("terminated by user\n");
710  }
711 
712  /* Always return FALSE to allow signal handling to continue */
713  return FALSE;
714 }
715 
716 /*
717  * Enable cancel interrupt handler, if not already done.
718  */
719 static void
721 {
722  if (!signal_info.handler_set)
723  {
724  signal_info.handler_set = true;
725 
726  InitializeCriticalSection(&signal_info_lock);
727 
728  SetConsoleCtrlHandler(consoleHandler, TRUE);
729  }
730 }
731 
732 #endif /* WIN32 */
733 
734 
735 /*
736  * set_archive_cancel_info
737  *
738  * Fill AH->connCancel with cancellation info for the specified database
739  * connection; or clear it if conn is NULL.
740  */
741 void
743 {
744  PGcancel *oldConnCancel;
745 
746  /*
747  * Activate the interrupt handler if we didn't yet in this process. On
748  * Windows, this also initializes signal_info_lock; therefore it's
749  * important that this happen at least once before we fork off any
750  * threads.
751  */
753 
754  /*
755  * On Unix, we assume that storing a pointer value is atomic with respect
756  * to any possible signal interrupt. On Windows, use a critical section.
757  */
758 
759 #ifdef WIN32
760  EnterCriticalSection(&signal_info_lock);
761 #endif
762 
763  /* Free the old one if we have one */
764  oldConnCancel = AH->connCancel;
765  /* be sure interrupt handler doesn't use pointer while freeing */
766  AH->connCancel = NULL;
767 
768  if (oldConnCancel != NULL)
769  PQfreeCancel(oldConnCancel);
770 
771  /* Set the new one if specified */
772  if (conn)
773  AH->connCancel = PQgetCancel(conn);
774 
775  /*
776  * On Unix, there's only ever one active ArchiveHandle per process, so we
777  * can just set signal_info.myAH unconditionally. On Windows, do that
778  * only in the main thread; worker threads have to make sure their
779  * ArchiveHandle appears in the pstate data, which is dealt with in
780  * RunWorker().
781  */
782 #ifndef WIN32
783  signal_info.myAH = AH;
784 #else
785  if (mainThreadId == GetCurrentThreadId())
786  signal_info.myAH = AH;
787 #endif
788 
789 #ifdef WIN32
790  LeaveCriticalSection(&signal_info_lock);
791 #endif
792 }
793 
794 /*
795  * set_cancel_pstate
796  *
797  * Set signal_info.pstate to point to the specified ParallelState, if any.
798  * We need this mainly to have an interlock against Windows signal thread.
799  */
800 static void
802 {
803 #ifdef WIN32
804  EnterCriticalSection(&signal_info_lock);
805 #endif
806 
807  signal_info.pstate = pstate;
808 
809 #ifdef WIN32
810  LeaveCriticalSection(&signal_info_lock);
811 #endif
812 }
813 
814 /*
815  * set_cancel_slot_archive
816  *
817  * Set ParallelSlot's AH field to point to the specified archive, if any.
818  * We need this mainly to have an interlock against Windows signal thread.
819  */
820 static void
822 {
823 #ifdef WIN32
824  EnterCriticalSection(&signal_info_lock);
825 #endif
826 
827  slot->AH = AH;
828 
829 #ifdef WIN32
830  LeaveCriticalSection(&signal_info_lock);
831 #endif
832 }
833 
834 
835 /*
836  * This function is called by both Unix and Windows variants to set up
837  * and run a worker process. Caller should exit the process (or thread)
838  * upon return.
839  */
840 static void
842 {
843  int pipefd[2];
844 
845  /* fetch child ends of pipes */
846  pipefd[PIPE_READ] = slot->pipeRevRead;
847  pipefd[PIPE_WRITE] = slot->pipeRevWrite;
848 
849  /*
850  * Clone the archive so that we have our own state to work with, and in
851  * particular our own database connection.
852  *
853  * We clone on Unix as well as Windows, even though technically we don't
854  * need to because fork() gives us a copy in our own address space
855  * already. But CloneArchive resets the state information and also clones
856  * the database connection which both seem kinda helpful.
857  */
858  AH = CloneArchive(AH);
859 
860  /* Remember cloned archive where signal handler can find it */
861  set_cancel_slot_archive(slot, AH);
862 
863  /*
864  * Call the setup worker function that's defined in the ArchiveHandle.
865  */
866  (AH->SetupWorkerPtr) ((Archive *) AH);
867 
868  /*
869  * Execute commands until done.
870  */
871  WaitForCommands(AH, pipefd);
872 
873  /*
874  * Disconnect from database and clean up.
875  */
877  DisconnectDatabase(&(AH->public));
878  DeCloneArchive(AH);
879 }
880 
881 /*
882  * Thread base function for Windows
883  */
884 #ifdef WIN32
885 static unsigned __stdcall
886 init_spawned_worker_win32(WorkerInfo *wi)
887 {
888  ArchiveHandle *AH = wi->AH;
889  ParallelSlot *slot = wi->slot;
890 
891  /* Both fields are copied out above, so release the WorkerInfo the spawner allocated */
892  free(wi);
893 
894  /* Run the worker; we are expected to exit the thread once it returns */
895  RunWorker(AH, slot);
896 
897  /* Exit the thread with status 0 */
898  _endthreadex(0);
899  return 0; /* presumably never reached; gives the non-void function a return value */
900 }
901 #endif /* WIN32 */
902 
903 /*
904  * This function starts a parallel dump or restore by spawning off the worker
905  * processes. For Windows, it creates a number of threads; on Unix the
906  * workers are created with fork().
907  */
910 {
911  ParallelState *pstate;
912  int i;
913 
914  Assert(AH->public.numWorkers > 0);
915 
916  pstate = (ParallelState *) pg_malloc(sizeof(ParallelState));
917 
918  pstate->numWorkers = AH->public.numWorkers;
919  pstate->te = NULL;
920  pstate->parallelSlot = NULL;
921 
922  if (AH->public.numWorkers == 1)
923  return pstate;
924 
925  pstate->te = (TocEntry **)
926  pg_malloc0(pstate->numWorkers * sizeof(TocEntry *));
927  pstate->parallelSlot = (ParallelSlot *)
928  pg_malloc0(pstate->numWorkers * sizeof(ParallelSlot));
929 
930 #ifdef WIN32
931  /* Make fmtId() and fmtQualifiedId() use thread-local storage */
932  getLocalPQExpBuffer = getThreadLocalPQExpBuffer;
933 #endif
934 
935  /*
936  * Set the pstate in shutdown_info, to tell the exit handler that it must
937  * clean up workers as well as the main database connection. But we don't
938  * set this in signal_info yet, because we don't want child processes to
939  * inherit non-NULL signal_info.pstate.
940  */
941  shutdown_info.pstate = pstate;
942 
943  /*
944  * Temporarily disable query cancellation on the master connection. This
945  * ensures that child processes won't inherit valid AH->connCancel
946  * settings and thus won't try to issue cancels against the master's
947  * connection. No harm is done if we fail while it's disabled, because
948  * the master connection is idle at this point anyway.
949  */
951 
952  /* Ensure stdio state is quiesced before forking */
953  fflush(NULL);
954 
955  /* Create desired number of workers */
956  for (i = 0; i < pstate->numWorkers; i++)
957  {
958 #ifdef WIN32
959  WorkerInfo *wi;
960  uintptr_t handle;
961 #else
962  pid_t pid;
963 #endif
964  ParallelSlot *slot = &(pstate->parallelSlot[i]);
965  int pipeMW[2],
966  pipeWM[2];
967 
968  /* Create communication pipes for this worker */
969  if (pgpipe(pipeMW) < 0 || pgpipe(pipeWM) < 0)
971  "could not create communication channels: %s\n",
972  strerror(errno));
973 
974  pstate->te[i] = NULL; /* just for safety */
975 
976  slot->workerStatus = WRKR_IDLE;
977  slot->AH = NULL;
978  slot->callback = NULL;
979  slot->callback_data = NULL;
980 
981  /* master's ends of the pipes */
982  slot->pipeRead = pipeWM[PIPE_READ];
983  slot->pipeWrite = pipeMW[PIPE_WRITE];
984  /* child's ends of the pipes */
985  slot->pipeRevRead = pipeMW[PIPE_READ];
986  slot->pipeRevWrite = pipeWM[PIPE_WRITE];
987 
988 #ifdef WIN32
989  /* Create transient structure to pass args to worker function */
990  wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo));
991 
992  wi->AH = AH;
993  wi->slot = slot;
994 
995  handle = _beginthreadex(NULL, 0, (void *) &init_spawned_worker_win32,
996  wi, 0, &(slot->threadId));
997  slot->hThread = handle;
998 #else /* !WIN32 */
999  pid = fork();
1000  if (pid == 0)
1001  {
1002  /* we are the worker */
1003  int j;
1004 
1005  /* this is needed for GetMyPSlot() */
1006  slot->pid = getpid();
1007 
1008  /* instruct signal handler that we're in a worker now */
1009  signal_info.am_worker = true;
1010 
1011  /* close read end of Worker -> Master */
1012  closesocket(pipeWM[PIPE_READ]);
1013  /* close write end of Master -> Worker */
1014  closesocket(pipeMW[PIPE_WRITE]);
1015 
1016  /*
1017  * Close all inherited fds for communication of the master with
1018  * previously-forked workers.
1019  */
1020  for (j = 0; j < i; j++)
1021  {
1022  closesocket(pstate->parallelSlot[j].pipeRead);
1023  closesocket(pstate->parallelSlot[j].pipeWrite);
1024  }
1025 
1026  /* Run the worker ... */
1027  RunWorker(AH, slot);
1028 
1029  /* We can just exit(0) when done */
1030  exit(0);
1031  }
1032  else if (pid < 0)
1033  {
1034  /* fork failed */
1036  "could not create worker process: %s\n",
1037  strerror(errno));
1038  }
1039 
1040  /* In Master after successful fork */
1041  slot->pid = pid;
1042 
1043  /* close read end of Master -> Worker */
1044  closesocket(pipeMW[PIPE_READ]);
1045  /* close write end of Worker -> Master */
1046  closesocket(pipeWM[PIPE_WRITE]);
1047 #endif /* WIN32 */
1048  }
1049 
1050  /*
1051  * Having forked off the workers, disable SIGPIPE so that master isn't
1052  * killed if it tries to send a command to a dead worker. We don't want
1053  * the workers to inherit this setting, though.
1054  */
1055 #ifndef WIN32
1057 #endif
1058 
1059  /*
1060  * Re-establish query cancellation on the master connection.
1061  */
1063 
1064  /*
1065  * Tell the cancel signal handler to forward signals to worker processes,
1066  * too. (As with query cancel, we did not need this earlier because the
1067  * workers have not yet been given anything to do; if we die before this
1068  * point, any already-started workers will see EOF and quit promptly.)
1069  */
1070  set_cancel_pstate(pstate);
1071 
1072  return pstate;
1073 }
1074 
1075 /*
1076  * Close down a parallel dump or restore.
1077  */
1078 void
1080 {
1081  int i;
1082 
1083  /* No work if non-parallel */
1084  if (pstate->numWorkers == 1)
1085  return;
1086 
1087  /* There should not be any unfinished jobs */
1088  Assert(IsEveryWorkerIdle(pstate));
1089 
1090  /* Close the sockets so that the workers know they can exit */
1091  for (i = 0; i < pstate->numWorkers; i++)
1092  {
1093  closesocket(pstate->parallelSlot[i].pipeRead);
1094  closesocket(pstate->parallelSlot[i].pipeWrite);
1095  }
1096 
1097  /* Wait for them to exit */
1098  WaitForTerminatingWorkers(pstate);
1099 
1100  /*
1101  * Unlink pstate from shutdown_info, so the exit handler will not try to
1102  * use it; and likewise unlink from signal_info.
1103  */
1104  shutdown_info.pstate = NULL;
1106 
1107  /* Release state (mere neatnik-ism, since we're about to terminate) */
1108  free(pstate->te);
1109  free(pstate->parallelSlot);
1110  free(pstate);
1111 }
1112 
1113 /*
1114  * These next four functions handle construction and parsing of the command
1115  * strings and response strings for parallel workers.
1116  *
1117  * Currently, these can be the same regardless of which archive format we are
1118  * processing. In future, we might want to let format modules override these
1119  * functions to add format-specific data to a command or response.
1120  */
1121 
1122 /*
1123  * buildWorkerCommand: format a command string to send to a worker.
1124  *
1125  * The string is built in the caller-supplied buffer of size buflen.
1126  */
1127 static void
1129  char *buf, int buflen)
1130 {
1131  if (act == ACT_DUMP)
1132  snprintf(buf, buflen, "DUMP %d", te->dumpId);
1133  else if (act == ACT_RESTORE)
1134  snprintf(buf, buflen, "RESTORE %d", te->dumpId);
1135  else
1136  Assert(false);
1137 }
1138 
1139 /*
1140  * parseWorkerCommand: interpret a command string in a worker.
1141  */
1142 static void
1144  const char *msg)
1145 {
1146  DumpId dumpId;
1147  int nBytes;
1148 
1149  if (messageStartsWith(msg, "DUMP "))
1150  {
1151  *act = ACT_DUMP;
1152  sscanf(msg, "DUMP %d%n", &dumpId, &nBytes);
1153  Assert(nBytes == strlen(msg));
1154  *te = getTocEntryByDumpId(AH, dumpId);
1155  Assert(*te != NULL);
1156  }
1157  else if (messageStartsWith(msg, "RESTORE "))
1158  {
1159  *act = ACT_RESTORE;
1160  sscanf(msg, "RESTORE %d%n", &dumpId, &nBytes);
1161  Assert(nBytes == strlen(msg));
1162  *te = getTocEntryByDumpId(AH, dumpId);
1163  Assert(*te != NULL);
1164  }
1165  else
1167  "unrecognized command received from master: \"%s\"\n",
1168  msg);
1169 }
1170 
1171 /*
1172  * buildWorkerResponse: format a response string to send to the master.
1173  *
1174  * The string is built in the caller-supplied buffer of size buflen.
1175  */
1176 static void
1178  char *buf, int buflen)
1179 {
1180  snprintf(buf, buflen, "OK %d %d %d",
1181  te->dumpId,
1182  status,
1183  status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0);
1184 }
1185 
1186 /*
1187  * parseWorkerResponse: parse the status message returned by a worker.
1188  *
1189  * Returns the integer status code, and may update fields of AH and/or te.
1190  */
1191 static int
1193  const char *msg)
1194 {
1195  DumpId dumpId;
1196  int nBytes,
1197  n_errors;
1198  int status = 0;
1199 
1200  if (messageStartsWith(msg, "OK "))
1201  {
1202  sscanf(msg, "OK %d %d %d%n", &dumpId, &status, &n_errors, &nBytes);
1203 
1204  Assert(dumpId == te->dumpId);
1205  Assert(nBytes == strlen(msg));
1206 
1207  AH->public.n_errors += n_errors;
1208  }
1209  else
1211  "invalid message received from worker: \"%s\"\n",
1212  msg);
1213 
1214  return status;
1215 }
1216 
1217 /*
1218  * Dispatch a job to some free worker.
1219  *
1220  * te is the TocEntry to be processed, act is the action to be taken on it.
1221  * callback is the function to call on completion of the job.
1222  *
1223  * If no worker is currently available, this will block, and previously
1224  * registered callback functions may be called.
1225  */
1226 void
1228  ParallelState *pstate,
1229  TocEntry *te,
1230  T_Action act,
1232  void *callback_data)
1233 {
1234  int worker;
1235  char buf[256];
1236 
1237  /* Get a worker, waiting if none are idle */
1238  while ((worker = GetIdleWorker(pstate)) == NO_SLOT)
1239  WaitForWorkers(AH, pstate, WFW_ONE_IDLE);
1240 
1241  /* Construct and send command string */
1242  buildWorkerCommand(AH, te, act, buf, sizeof(buf));
1243 
1244  sendMessageToWorker(pstate, worker, buf);
1245 
1246  /* Remember worker is busy, and which TocEntry it's working on */
1247  pstate->parallelSlot[worker].workerStatus = WRKR_WORKING;
1248  pstate->parallelSlot[worker].callback = callback;
1249  pstate->parallelSlot[worker].callback_data = callback_data;
1250  pstate->te[worker] = te;
1251 }
1252 
1253 /*
1254  * Find an idle worker and return its slot number.
1255  * Return NO_SLOT if none are idle.
1256  */
1257 static int
1259 {
1260  int i;
1261 
1262  for (i = 0; i < pstate->numWorkers; i++)
1263  {
1264  if (pstate->parallelSlot[i].workerStatus == WRKR_IDLE)
1265  return i;
1266  }
1267  return NO_SLOT;
1268 }
1269 
1270 /*
1271  * Return true iff every worker is in the WRKR_TERMINATED state.
1272  */
1273 static bool
1275 {
1276  int i;
1277 
1278  for (i = 0; i < pstate->numWorkers; i++)
1279  {
1280  if (pstate->parallelSlot[i].workerStatus != WRKR_TERMINATED)
1281  return false;
1282  }
1283  return true;
1284 }
1285 
1286 /*
1287  * Return true iff every worker is in the WRKR_IDLE state.
1288  */
1289 bool
1291 {
1292  int i;
1293 
1294  for (i = 0; i < pstate->numWorkers; i++)
1295  {
1296  if (pstate->parallelSlot[i].workerStatus != WRKR_IDLE)
1297  return false;
1298  }
1299  return true;
1300 }
1301 
1302 /*
1303  * Acquire lock on a table to be dumped by a worker process.
1304  *
1305  * The master process is already holding an ACCESS SHARE lock. Ordinarily
1306  * it's no problem for a worker to get one too, but if anything else besides
1307  * pg_dump is running, there's a possible deadlock:
1308  *
1309  * 1) Master dumps the schema and locks all tables in ACCESS SHARE mode.
1310  * 2) Another process requests an ACCESS EXCLUSIVE lock (which is not granted
1311  * because the master holds a conflicting ACCESS SHARE lock).
1312  * 3) A worker process also requests an ACCESS SHARE lock to read the table.
1313  * The worker is enqueued behind the ACCESS EXCLUSIVE lock request.
1314  * 4) Now we have a deadlock, since the master is effectively waiting for
1315  * the worker. The server cannot detect that, however.
1316  *
1317  * To prevent an infinite wait, prior to touching a table in a worker, request
1318  * a lock in ACCESS SHARE mode but with NOWAIT. If we don't get the lock,
1319  * then we know that somebody else has requested an ACCESS EXCLUSIVE lock and
1320  * so we have a deadlock. We must fail the backup in that case.
1321  */
1322 static void
1324 {
1325  const char *qualId;
1326  PQExpBuffer query;
1327  PGresult *res;
1328 
1329  /* Nothing to do for BLOBS */
1330  if (strcmp(te->desc, "BLOBS") == 0)
1331  return;
1332 
1333  query = createPQExpBuffer();
1334 
1335  qualId = fmtQualifiedId(AH->public.remoteVersion, te->namespace, te->tag);
1336 
1337  appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE NOWAIT",
1338  qualId);
1339 
1340  res = PQexec(AH->connection, query->data);
1341 
1342  if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
1344  "could not obtain lock on relation \"%s\"\n"
1345  "This usually means that someone requested an ACCESS EXCLUSIVE lock "
1346  "on the table after the pg_dump parent process had gotten the "
1347  "initial ACCESS SHARE lock on the table.\n", qualId);
1348 
1349  PQclear(res);
1350  destroyPQExpBuffer(query);
1351 }
1352 
1353 /*
1354  * WaitForCommands: main routine for a worker process.
1355  *
1356  * Read and execute commands from the master until we see EOF on the pipe.
1357  */
1358 static void
1359 WaitForCommands(ArchiveHandle *AH, int pipefd[2])
1360 {
1361  char *command;
1362  TocEntry *te;
1363  T_Action act;
1364  int status = 0;
1365  char buf[256];
1366 
1367  for (;;)
1368  {
1369  if (!(command = getMessageFromMaster(pipefd)))
1370  {
1371  /* EOF, so done */
1372  return;
1373  }
1374 
1375  /* Decode the command */
1376  parseWorkerCommand(AH, &te, &act, command);
1377 
1378  if (act == ACT_DUMP)
1379  {
1380  /* Acquire lock on this table within the worker's session */
1381  lockTableForWorker(AH, te);
1382 
1383  /* Perform the dump command */
1384  status = (AH->WorkerJobDumpPtr) (AH, te);
1385  }
1386  else if (act == ACT_RESTORE)
1387  {
1388  /* Perform the restore command */
1389  status = (AH->WorkerJobRestorePtr) (AH, te);
1390  }
1391  else
1392  Assert(false);
1393 
1394  /* Return status to master */
1395  buildWorkerResponse(AH, te, act, status, buf, sizeof(buf));
1396 
1397  sendMessageToMaster(pipefd, buf);
1398 
1399  /* command was pg_malloc'd and we are responsible for free()ing it. */
1400  free(command);
1401  }
1402 }
1403 
1404 /*
1405  * Check for status messages from workers.
1406  *
1407  * If do_wait is true, wait to get a status message; otherwise, just return
1408  * immediately if there is none available.
1409  *
1410  * When we get a status message, we pass the status code to the callback
1411  * function that was specified to DispatchJobForTocEntry, then reset the
1412  * worker status to IDLE.
1413  *
1414  * Returns true if we collected a status message, else false.
1415  *
1416  * XXX is it worth checking for more than one status message per call?
1417  * It seems somewhat unlikely that multiple workers would finish at exactly
1418  * the same time.
1419  */
1420 static bool
1422 {
1423  int worker;
1424  char *msg;
1425 
1426  /* Try to collect a status message */
1427  msg = getMessageFromWorker(pstate, do_wait, &worker);
1428 
1429  if (!msg)
1430  {
1431  /* If do_wait is true, we must have detected EOF on some socket */
1432  if (do_wait)
1433  exit_horribly(modulename, "a worker process died unexpectedly\n");
1434  return false;
1435  }
1436 
1437  /* Process it and update our idea of the worker's status */
1438  if (messageStartsWith(msg, "OK "))
1439  {
1440  ParallelSlot *slot = &pstate->parallelSlot[worker];
1441  TocEntry *te = pstate->te[worker];
1442  int status;
1443 
1444  status = parseWorkerResponse(AH, te, msg);
1445  slot->callback(AH, te, status, slot->callback_data);
1446  slot->workerStatus = WRKR_IDLE;
1447  pstate->te[worker] = NULL;
1448  }
1449  else
1451  "invalid message received from worker: \"%s\"\n",
1452  msg);
1453 
1454  /* Free the string returned from getMessageFromWorker */
1455  free(msg);
1456 
1457  return true;
1458 }
1459 
1460 /*
1461  * Check for status results from workers, waiting if necessary.
1462  *
1463  * Available wait modes are:
1464  * WFW_NO_WAIT: reap any available status, but don't block
1465  * WFW_GOT_STATUS: wait for at least one more worker to finish
1466  * WFW_ONE_IDLE: wait for at least one worker to be idle
1467  * WFW_ALL_IDLE: wait for all workers to be idle
1468  *
1469  * Any received results are passed to the callback specified to
1470  * DispatchJobForTocEntry.
1471  *
1472  * This function is executed in the master process.
1473  */
1474 void
1476 {
1477  bool do_wait = false;
1478 
1479  /*
1480  * In GOT_STATUS mode, always block waiting for a message, since we can't
1481  * return till we get something. In other modes, we don't block the first
1482  * time through the loop.
1483  */
1484  if (mode == WFW_GOT_STATUS)
1485  {
1486  /* Assert that caller knows what it's doing */
1487  Assert(!IsEveryWorkerIdle(pstate));
1488  do_wait = true;
1489  }
1490 
1491  for (;;)
1492  {
1493  /*
1494  * Check for status messages, even if we don't need to block. We do
1495  * not try very hard to reap all available messages, though, since
1496  * there's unlikely to be more than one.
1497  */
1498  if (ListenToWorkers(AH, pstate, do_wait))
1499  {
1500  /*
1501  * If we got a message, we are done by definition for GOT_STATUS
1502  * mode, and we can also be certain that there's at least one idle
1503  * worker. So we're done in all but ALL_IDLE mode.
1504  */
1505  if (mode != WFW_ALL_IDLE)
1506  return;
1507  }
1508 
1509  /* Check whether we must wait for new status messages */
1510  switch (mode)
1511  {
1512  case WFW_NO_WAIT:
1513  return; /* never wait */
1514  case WFW_GOT_STATUS:
1515  Assert(false); /* can't get here, because we waited */
1516  break;
1517  case WFW_ONE_IDLE:
1518  if (GetIdleWorker(pstate) != NO_SLOT)
1519  return;
1520  break;
1521  case WFW_ALL_IDLE:
1522  if (IsEveryWorkerIdle(pstate))
1523  return;
1524  break;
1525  }
1526 
1527  /* Loop back, and this time wait for something to happen */
1528  do_wait = true;
1529  }
1530 }
1531 
1532 /*
1533  * Read one command message from the master, blocking if necessary
1534  * until one is available, and return it as a malloc'd string.
1535  * On EOF, return NULL.
1536  *
1537  * This function is executed in worker processes.
1538  */
1539 static char *
1540 getMessageFromMaster(int pipefd[2])
1541 {
1542  return readMessageFromPipe(pipefd[PIPE_READ]);
1543 }
1544 
1545 /*
1546  * Send a status message to the master.
1547  *
1548  * This function is executed in worker processes.
1549  */
1550 static void
1551 sendMessageToMaster(int pipefd[2], const char *str)
1552 {
1553  int len = strlen(str) + 1;
1554 
1555  if (pipewrite(pipefd[PIPE_WRITE], str, len) != len)
1557  "could not write to the communication channel: %s\n",
1558  strerror(errno));
1559 }
1560 
/*
 * Block until at least one descriptor in "workerset" is readable.
 *
 * maxFd is the highest descriptor in the set.  The call is retried whenever
 * select() is interrupted, restoring the caller's set each time because
 * select() overwrites it.
 *
 * Returns -1 on error, else the number of readable descriptors.
 */
static int
select_loop(int maxFd, fd_set *workerset)
{
	const fd_set requested = *workerset;
	int			nready;
	int			interrupted;

	do
	{
		/* start every attempt from the set the caller asked about */
		*workerset = requested;
		nready = select(maxFd + 1, workerset, NULL, NULL, NULL);

#ifndef WIN32
		interrupted = (nready < 0 && errno == EINTR);
#else
		interrupted = (nready == SOCKET_ERROR && WSAGetLastError() == WSAEINTR);
#endif
	} while (interrupted);

	return nready;
}
1588 
1589 
1590 /*
1591  * Check for messages from worker processes.
1592  *
1593  * If a message is available, return it as a malloc'd string, and put the
1594  * index of the sending worker in *worker.
1595  *
1596  * If nothing is available, wait if "do_wait" is true, else return NULL.
1597  *
1598  * If we detect EOF on any socket, we'll return NULL. It's not great that
1599  * that's hard to distinguish from the no-data-available case, but for now
1600  * our one caller is okay with that.
1601  *
1602  * This function is executed in the master process.
1603  */
1604 static char *
1605 getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
1606 {
1607  int i;
1608  fd_set workerset;
1609  int maxFd = -1;
1610  struct timeval nowait = {0, 0};
1611 
1612  /* construct bitmap of socket descriptors for select() */
1613  FD_ZERO(&workerset);
1614  for (i = 0; i < pstate->numWorkers; i++)
1615  {
1616  if (pstate->parallelSlot[i].workerStatus == WRKR_TERMINATED)
1617  continue;
1618  FD_SET(pstate->parallelSlot[i].pipeRead, &workerset);
1619  if (pstate->parallelSlot[i].pipeRead > maxFd)
1620  maxFd = pstate->parallelSlot[i].pipeRead;
1621  }
1622 
1623  if (do_wait)
1624  {
1625  i = select_loop(maxFd, &workerset);
1626  Assert(i != 0);
1627  }
1628  else
1629  {
1630  if ((i = select(maxFd + 1, &workerset, NULL, NULL, &nowait)) == 0)
1631  return NULL;
1632  }
1633 
1634  if (i < 0)
1635  exit_horribly(modulename, "select() failed: %s\n", strerror(errno));
1636 
1637  for (i = 0; i < pstate->numWorkers; i++)
1638  {
1639  char *msg;
1640 
1641  if (!FD_ISSET(pstate->parallelSlot[i].pipeRead, &workerset))
1642  continue;
1643 
1644  /*
1645  * Read the message if any. If the socket is ready because of EOF,
1646  * we'll return NULL instead (and the socket will stay ready, so the
1647  * condition will persist).
1648  *
1649  * Note: because this is a blocking read, we'll wait if only part of
1650  * the message is available. Waiting a long time would be bad, but
1651  * since worker status messages are short and are always sent in one
1652  * operation, it shouldn't be a problem in practice.
1653  */
1654  msg = readMessageFromPipe(pstate->parallelSlot[i].pipeRead);
1655  *worker = i;
1656  return msg;
1657  }
1658  Assert(false);
1659  return NULL;
1660 }
1661 
1662 /*
1663  * Send a command message to the specified worker process.
1664  *
1665  * This function is executed in the master process.
1666  */
1667 static void
1668 sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
1669 {
1670  int len = strlen(str) + 1;
1671 
1672  if (pipewrite(pstate->parallelSlot[worker].pipeWrite, str, len) != len)
1673  {
1675  "could not write to the communication channel: %s\n",
1676  strerror(errno));
1677  }
1678 }
1679 
1680 /*
1681  * Read one message from the specified pipe (fd), blocking if necessary
1682  * until one is available, and return it as a malloc'd string.
1683  * On EOF, return NULL.
1684  *
1685  * A "message" on the channel is just a null-terminated string.
1686  */
1687 static char *
1689 {
1690  char *msg;
1691  int msgsize,
1692  bufsize;
1693  int ret;
1694 
1695  /*
1696  * In theory, if we let piperead() read multiple bytes, it might give us
1697  * back fragments of multiple messages. (That can't actually occur, since
1698  * neither master nor workers send more than one message without waiting
1699  * for a reply, but we don't wish to assume that here.) For simplicity,
1700  * read a byte at a time until we get the terminating '\0'. This method
1701  * is a bit inefficient, but since this is only used for relatively short
1702  * command and status strings, it shouldn't matter.
1703  */
1704  bufsize = 64; /* could be any number */
1705  msg = (char *) pg_malloc(bufsize);
1706  msgsize = 0;
1707  for (;;)
1708  {
1709  Assert(msgsize < bufsize);
1710  ret = piperead(fd, msg + msgsize, 1);
1711  if (ret <= 0)
1712  break; /* error or connection closure */
1713 
1714  Assert(ret == 1);
1715 
1716  if (msg[msgsize] == '\0')
1717  return msg; /* collected whole message */
1718 
1719  msgsize++;
1720  if (msgsize == bufsize) /* enlarge buffer if needed */
1721  {
1722  bufsize += 16; /* could be any number */
1723  msg = (char *) pg_realloc(msg, bufsize);
1724  }
1725  }
1726 
1727  /* Other end has closed the connection */
1728  pg_free(msg);
1729  return NULL;
1730 }
1731 
1732 #ifdef WIN32
1733 
1734 /*
1735  * This is a replacement version of pipe(2) for Windows which allows the pipe
1736  * handles to be used in select().
1737  *
1738  * Reads and writes on the pipe must go through piperead()/pipewrite().
1739  *
1740  * For consistency with Unix we declare the returned handles as "int".
1741  * This is okay even on WIN64 because system handles are not more than
1742  * 32 bits wide, but we do have to do some casting.
1743  */
1744 static int
1745 pgpipe(int handles[2])
1746 {
1747  pgsocket s,
1748  tmp_sock;
1749  struct sockaddr_in serv_addr;
1750  int len = sizeof(serv_addr);
1751 
1752  /* We have to use the Unix socket invalid file descriptor value here. */
1753  handles[0] = handles[1] = -1;
1754 
1755  /*
1756  * setup listen socket
1757  */
1758  if ((s = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
1759  {
1760  write_msg(modulename, "pgpipe: could not create socket: error code %d\n",
1761  WSAGetLastError());
1762  return -1;
1763  }
1764 
1765  memset((void *) &serv_addr, 0, sizeof(serv_addr));
1766  serv_addr.sin_family = AF_INET;
1767  serv_addr.sin_port = htons(0);
1768  serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
1769  if (bind(s, (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
1770  {
1771  write_msg(modulename, "pgpipe: could not bind: error code %d\n",
1772  WSAGetLastError());
1773  closesocket(s);
1774  return -1;
1775  }
1776  if (listen(s, 1) == SOCKET_ERROR)
1777  {
1778  write_msg(modulename, "pgpipe: could not listen: error code %d\n",
1779  WSAGetLastError());
1780  closesocket(s);
1781  return -1;
1782  }
1783  if (getsockname(s, (SOCKADDR *) &serv_addr, &len) == SOCKET_ERROR)
1784  {
1785  write_msg(modulename, "pgpipe: getsockname() failed: error code %d\n",
1786  WSAGetLastError());
1787  closesocket(s);
1788  return -1;
1789  }
1790 
1791  /*
1792  * setup pipe handles
1793  */
1794  if ((tmp_sock = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
1795  {
1796  write_msg(modulename, "pgpipe: could not create second socket: error code %d\n",
1797  WSAGetLastError());
1798  closesocket(s);
1799  return -1;
1800  }
1801  handles[1] = (int) tmp_sock;
1802 
1803  if (connect(handles[1], (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
1804  {
1805  write_msg(modulename, "pgpipe: could not connect socket: error code %d\n",
1806  WSAGetLastError());
1807  closesocket(handles[1]);
1808  handles[1] = -1;
1809  closesocket(s);
1810  return -1;
1811  }
1812  if ((tmp_sock = accept(s, (SOCKADDR *) &serv_addr, &len)) == PGINVALID_SOCKET)
1813  {
1814  write_msg(modulename, "pgpipe: could not accept connection: error code %d\n",
1815  WSAGetLastError());
1816  closesocket(handles[1]);
1817  handles[1] = -1;
1818  closesocket(s);
1819  return -1;
1820  }
1821  handles[0] = (int) tmp_sock;
1822 
1823  closesocket(s);
1824  return 0;
1825 }
1826 
1827 /*
1828  * Windows implementation of reading from a pipe.
1829  */
1830 static int
1831 piperead(int s, char *buf, int len)
1832 {
1833  int ret = recv(s, buf, len, 0);
1834 
1835  if (ret < 0 && WSAGetLastError() == WSAECONNRESET)
1836  {
1837  /* EOF on the pipe! */
1838  ret = 0;
1839  }
1840  return ret;
1841 }
1842 
1843 #endif /* WIN32 */
void on_exit_nicely(on_exit_nicely_callback function, void *arg)
#define connect(s, name, namelen)
Definition: win32.h:383
#define accept(s, addr, addrlen)
Definition: win32.h:382
int pipeRead
Definition: parallel.c:99
int DumpId
Definition: pg_backup.h:229
static void buildWorkerCommand(ArchiveHandle *AH, TocEntry *te, T_Action act, char *buf, int buflen)
Definition: parallel.c:1128
PQExpBufferData * PQExpBuffer
Definition: pqexpbuffer.h:51
void DispatchJobForTocEntry(ArchiveHandle *AH, ParallelState *pstate, TocEntry *te, T_Action act, ParallelCompletionPtr callback, void *callback_data)
Definition: parallel.c:1227
static void set_cancel_pstate(ParallelState *pstate)
Definition: parallel.c:801
struct DumpSignalInformation DumpSignalInformation
ArchiveHandle * AH
Definition: parallel.c:97
bool IsEveryWorkerIdle(ParallelState *pstate)
Definition: parallel.c:1290
void * pg_malloc(size_t size)
Definition: fe_memutils.c:47
void * callback_data
Definition: parallel.c:95
static int select_loop(int maxFd, fd_set *workerset)
Definition: parallel.c:1566
struct WorkerInfoData * WorkerInfo
Definition: autovacuum.c:231
#define closesocket
Definition: port.h:328
static void buildWorkerResponse(ArchiveHandle *AH, TocEntry *te, T_Action act, int status, char *buf, int buflen)
Definition: parallel.c:1177
void PQfreeCancel(PGcancel *cancel)
Definition: fe-connect.c:3712
WorkerJobDumpPtr WorkerJobDumpPtr
static int GetIdleWorker(ParallelState *pstate)
Definition: parallel.c:1258
void(* ParallelCompletionPtr)(ArchiveHandle *AH, TocEntry *te, int status, void *callback_data)
Definition: parallel.h:22
struct ShutdownInformation ShutdownInformation
ArchiveHandle * myAH
Definition: parallel.c:161
#define write_stderr(str)
Definition: parallel.c:180
#define PIPE_READ
Definition: parallel.c:69
#define gettext_noop(x)
Definition: c.h:139
#define socket(af, type, protocol)
Definition: win32.h:379
static bool HasEveryWorkerTerminated(ParallelState *pstate)
Definition: parallel.c:1274
#define select(n, r, w, e, timeout)
Definition: win32.h:384
int snprintf(char *str, size_t count, const char *fmt,...) pg_attribute_printf(3
static void setup_cancel_handler(void)
Definition: parallel.c:620
#define recv(s, buf, len, flags)
Definition: win32.h:385
SetupWorkerPtr SetupWorkerPtr
int n_errors
Definition: pg_backup.h:201
PGcancel *volatile connCancel
static void ShutdownWorkersHard(ParallelState *pstate)
Definition: parallel.c:412
void on_exit_close_archive(Archive *AHX)
Definition: parallel.c:345
const char * progname
Definition: pg_standby.c:37
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define SIGQUIT
Definition: win32.h:197
ExecStatusType PQresultStatus(const PGresult *res)
Definition: fe-exec.c:2596
#define WORKER_IGNORED_ERRORS
static char * getMessageFromMaster(int pipefd[2])
Definition: parallel.c:1540
void DeCloneArchive(ArchiveHandle *AH)
void destroyPQExpBuffer(PQExpBuffer str)
Definition: pqexpbuffer.c:113
static const char * modulename
Definition: parallel.c:198
#define SIG_IGN
Definition: win32.h:193
#define bind(s, addr, addrlen)
Definition: win32.h:380
ParallelState * pstate
Definition: parallel.c:162
#define FALSE
Definition: c.h:221
void * pg_malloc0(size_t size)
Definition: fe_memutils.c:53
static ParallelSlot * GetMyPSlot(ParallelState *pstate)
Definition: parallel.c:281
PGconn * conn
Definition: streamutil.c:42
pid_t pid
Definition: parallel.c:109
#define NO_SLOT
Definition: parallel.c:72
void appendPQExpBuffer(PQExpBuffer str, const char *fmt,...)
Definition: pqexpbuffer.c:262
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:48
static void RunWorker(ArchiveHandle *AH, ParallelSlot *slot)
Definition: parallel.c:841
WFW_WaitOption
Definition: parallel.h:28
static char * buf
Definition: pg_test_fsync.c:65
TocEntry * getTocEntryByDumpId(ArchiveHandle *AH, DumpId id)
int pipeRevRead
Definition: parallel.c:101
static char * readMessageFromPipe(int fd)
Definition: parallel.c:1688
PGcancel * PQgetCancel(PGconn *conn)
Definition: fe-connect.c:3689
ArchiveHandle * CloneArchive(ArchiveHandle *AH)
#define pgpipe(a)
Definition: parallel.c:133
WorkerJobRestorePtr WorkerJobRestorePtr
static char * getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
Definition: parallel.c:1605
int pgsocket
Definition: port.h:22
void * pg_realloc(void *ptr, size_t size)
Definition: fe_memutils.c:65
void DisconnectDatabase(Archive *AHX)
Definition: pg_backup_db.c:345
ParallelState * ParallelBackupStart(ArchiveHandle *AH)
Definition: parallel.c:909
ParallelSlot * parallelSlot
Definition: parallel.h:45
void ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate)
Definition: parallel.c:1079
#define exit_nicely(code)
Definition: pg_dumpall.c:89
static void sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
Definition: parallel.c:1668
ParallelState * pstate
Definition: parallel.c:144
#define listen(s, backlog)
Definition: win32.h:381
#define PGINVALID_SOCKET
Definition: port.h:24
PQExpBuffer createPQExpBuffer(void)
Definition: pqexpbuffer.c:71
#define EINTR
Definition: win32.h:295
static bool ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate, bool do_wait)
Definition: parallel.c:1421
static void lockTableForWorker(ArchiveHandle *AH, TocEntry *te)
Definition: parallel.c:1323
void WaitForWorkers(ArchiveHandle *AH, ParallelState *pstate, WFW_WaitOption mode)
Definition: parallel.c:1475
#define SIGPIPE
Definition: win32.h:201
void PQclear(PGresult *res)
Definition: fe-exec.c:650
void init_parallel_dump_utils(void)
Definition: parallel.c:249
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:168
#define free(a)
Definition: header.h:65
T_WorkerStatus
Definition: parallel.c:75
static ShutdownInformation shutdown_info
Definition: parallel.c:148
void write_msg(const char *modulename, const char *fmt,...)
#define SIGNAL_ARGS
Definition: c.h:1079
#define NULL
Definition: c.h:229
static void parseWorkerCommand(ArchiveHandle *AH, TocEntry **te, T_Action *act, const char *msg)
Definition: parallel.c:1143
#define Assert(condition)
Definition: c.h:675
TocEntry ** te
Definition: parallel.h:44
ParallelCompletionPtr callback
Definition: parallel.c:94
int pipeRevWrite
Definition: parallel.c:102
int numWorkers
Definition: pg_backup.h:190
void pg_free(void *ptr)
Definition: fe_memutils.c:105
static volatile DumpSignalInformation signal_info
Definition: parallel.c:169
T_WorkerStatus workerStatus
Definition: parallel.c:91
#define piperead(a, b, c)
Definition: parallel.c:134
void exit_horribly(const char *modulename, const char *fmt,...)
int PQcancel(PGcancel *cancel, char *errbuf, int errbufsize)
Definition: fe-connect.c:3844
static bool do_wait
Definition: pg_ctl.c:71
void set_archive_cancel_info(ArchiveHandle *AH, PGconn *conn)
Definition: parallel.c:742
int pipeWrite
Definition: parallel.c:100
int i
const char * strerror(int errnum)
Definition: strerror.c:19
static void set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH)
Definition: parallel.c:821
PGresult * PQexec(PGconn *conn, const char *query)
Definition: fe-exec.c:1846
void * arg
#define pipewrite(a, b, c)
Definition: parallel.c:135
static void WaitForTerminatingWorkers(ParallelState *pstate)
Definition: parallel.c:461
static void sigTermHandler(SIGNAL_ARGS)
Definition: parallel.c:560
#define TRUE
Definition: c.h:217
static int parseWorkerResponse(ArchiveHandle *AH, TocEntry *te, const char *msg)
Definition: parallel.c:1192
static void archive_close_connection(int code, void *arg)
Definition: parallel.c:356
void resetPQExpBuffer(PQExpBuffer str)
Definition: pqexpbuffer.c:145
#define PIPE_WRITE
Definition: parallel.c:70
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:224
typedef BOOL(WINAPI *MINIDUMPWRITEDUMP)(HANDLE hProcess
#define _(x)
Definition: elog.c:84
PQExpBuffer(* getLocalPQExpBuffer)(void)
Definition: string_utils.c:29
const char * fmtQualifiedId(int remoteVersion, const char *schema, const char *id)
Definition: string_utils.c:150
int numWorkers
Definition: parallel.h:42
#define messageStartsWith(msg, prefix)
Definition: parallel.c:224
int remoteVersion
Definition: pg_backup.h:184
static void sendMessageToMaster(int pipefd[2], const char *str)
Definition: parallel.c:1551
static void WaitForCommands(ArchiveHandle *AH, int pipefd[2])
Definition: parallel.c:1359