PostgreSQL Source Code  git master
tablesync.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  * tablesync.c
3  * PostgreSQL logical replication: initial table data synchronization
4  *
5  * Copyright (c) 2012-2021, PostgreSQL Global Development Group
6  *
7  * IDENTIFICATION
8  * src/backend/replication/logical/tablesync.c
9  *
10  * NOTES
11  * This file contains code for initial table data synchronization for
12  * logical replication.
13  *
14  * The initial data synchronization is done separately for each table,
15  * in a separate apply worker that only fetches the initial snapshot data
16  * from the publisher and then synchronizes the position in the stream with
17  * the main apply worker.
18  *
19  * There are several reasons for doing the synchronization this way:
20  * - It allows us to parallelize the initial data synchronization
21  * which lowers the time needed for it to happen.
22  * - The initial synchronization does not have to hold the xid and LSN
23  * for the time it takes to copy data of all tables, causing less
24  * bloat and lower disk consumption compared to doing the
25  * synchronization in a single process for the whole database.
26  * - It allows us to synchronize any tables added after the initial
27  * synchronization has finished.
28  *
29  * The stream position synchronization works in multiple steps:
30  * - Apply worker requests a tablesync worker to start, setting the new
31  * table state to INIT.
32  * - Tablesync worker starts; changes table state from INIT to DATASYNC while
33  * copying.
34  * - Tablesync worker does initial table copy; there is a FINISHEDCOPY (sync
35  * worker specific) state to indicate when the copy phase has completed, so
36  * if the worker crashes with this (non-memory) state then the copy will not
37  * be re-attempted.
38  * - Tablesync worker then sets table state to SYNCWAIT; waits for state change.
39  * - Apply worker periodically checks for tables in SYNCWAIT state. When
40  * any appear, it sets the table state to CATCHUP and starts loop-waiting
41  * until either the table state is set to SYNCDONE or the sync worker
42  * exits.
43  * - After the sync worker has seen the state change to CATCHUP, it will
44  * read the stream and apply changes (acting like an apply worker) until
45  * it catches up to the specified stream position. Then it sets the
46  * state to SYNCDONE. There might be zero changes applied between
47  * CATCHUP and SYNCDONE, because the sync worker might be ahead of the
48  * apply worker.
49  * - Once the state is set to SYNCDONE, the apply worker will continue
50  * tracking the table until it reaches the SYNCDONE stream position, at
51  * which point it sets state to READY and stops tracking. Again, there
52  * might be zero changes in between.
53  *
54  * So the state progression is always: INIT -> DATASYNC -> FINISHEDCOPY
55  * -> SYNCWAIT -> CATCHUP -> SYNCDONE -> READY.
56  *
57  * The catalog pg_subscription_rel is used to keep information about
58  * subscribed tables and their state. The catalog holds all states
59  * except SYNCWAIT and CATCHUP which are only in shared memory.
60  *
61  * Example flows look like this:
62  * - Apply is in front:
63  * sync:8
64  * -> set in catalog FINISHEDCOPY
65  * -> set in memory SYNCWAIT
66  * apply:10
67  * -> set in memory CATCHUP
68  * -> enter wait-loop
69  * sync:10
70  * -> set in catalog SYNCDONE
71  * -> exit
72  * apply:10
73  * -> exit wait-loop
74  * -> continue rep
75  * apply:11
76  * -> set in catalog READY
77  *
78  * - Sync is in front:
79  * sync:10
80  * -> set in catalog FINISHEDCOPY
81  * -> set in memory SYNCWAIT
82  * apply:8
83  * -> set in memory CATCHUP
84  * -> continue per-table filtering
85  * sync:10
86  * -> set in catalog SYNCDONE
87  * -> exit
88  * apply:10
89  * -> set in catalog READY
90  * -> stop per-table filtering
91  * -> continue rep
92  *-------------------------------------------------------------------------
93  */
94 
95 #include "postgres.h"
96 
97 #include "access/table.h"
98 #include "access/xact.h"
100 #include "catalog/pg_type.h"
101 #include "commands/copy.h"
102 #include "miscadmin.h"
103 #include "parser/parse_relation.h"
104 #include "pgstat.h"
107 #include "replication/walreceiver.h"
109 #include "replication/slot.h"
110 #include "replication/origin.h"
111 #include "storage/ipc.h"
112 #include "storage/lmgr.h"
113 #include "utils/builtins.h"
114 #include "utils/lsyscache.h"
115 #include "utils/memutils.h"
116 #include "utils/snapmgr.h"
117 
118 static bool table_states_valid = false;
119 
121 
122 /*
123  * Exit routine for synchronization worker.
 *
 * From the visible code: when a transaction is open, statistics are
 * reported via pgstat_report_stat(false); a LOG message announcing that the
 * worker has finished is emitted; and the process ends through proc_exit(0),
 * so control never returns to the caller.
 *
 * NOTE(review): the embedded listing numbers skip 126, 135, 140, 142,
 * 145-147 and 150 in this function, so the statements that the inline
 * comments describe (commit, flush, errmsg arguments, apply worker signal)
 * are not present in this extract. Restore them from the pristine file
 * before building.
124  */
125 static void
127 finish_sync_worker(void)
128 {
129  /*
130  * Commit any outstanding transaction. This is the usual case, unless
131  * there was nothing to do for the table.
132  */
133  if (IsTransactionState())
134  {
 /* NOTE(review): the commit call itself (listing line 135) is missing here. */
136  pgstat_report_stat(false);
137  }
138 
139  /* And flush all writes. */
141 
143  ereport(LOG,
144  (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished",
 /* NOTE(review): the two %s arguments and the closing parentheses are missing. */
148 
149  /* Find the main apply worker and signal it. */
151 
152  /* Stop gracefully */
153  proc_exit(0);
154 }
155 
156 /*
157  * Wait until the relation sync state is set in the catalog to the expected
158  * one; return true when it happens.
159  *
160  * Returns false if the table sync worker or the table itself have
161  * disappeared, or the table state has been reset.
162  *
163  * Currently, this is used in the apply worker when transitioning from
164  * CATCHUP state to SYNCDONE.
 *
 * relid: OID of the relation whose state is polled.
 * expected_state: state value that ends the wait with a true result.
 *
 * The loop has three exits: state equals SUBREL_STATE_UNKNOWN (false),
 * state equals expected_state (true), or no sync worker is found for the
 * relation (false). Otherwise it sleeps on MyLatch and retries.
 *
 * NOTE(review): listing lines 176, 178-179, 190, 197-198 and 200 are
 * missing from this extract: the call that assigns "state", the call that
 * assigns "worker", the remaining WaitLatch arguments and whatever follows
 * the wait. As shown, "state" and "worker" are read without a visible
 * assignment.
165  */
166 static bool
167 wait_for_relation_state_change(Oid relid, char expected_state)
168 {
169  char state;
170 
171  for (;;)
172  {
173  LogicalRepWorker *worker;
174  XLogRecPtr statelsn;
175 
177 
 /* Tail of the state lookup; statelsn receives the LSN but is not used below. */
180  relid, &statelsn);
181 
 /* No state recorded for the relation: give up and report failure. */
182  if (state == SUBREL_STATE_UNKNOWN)
183  break;
184 
185  if (state == expected_state)
186  return true;
187 
188  /* Check if the sync worker is still running and bail if not. */
189  LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
191  false);
192  LWLockRelease(LogicalRepWorkerLock);
193  if (!worker)
194  break;
195 
 /* Sleep until the latch is set; the return value is deliberately ignored. */
196  (void) WaitLatch(MyLatch,
199 
201  }
202 
203  return false;
204 }
205 
206 /*
207  * Wait until the apply worker changes the state of our synchronization
208  * worker to the expected one.
209  *
210  * Used when transitioning from SYNCWAIT state to CATCHUP.
211  *
212  * Returns false if the apply worker has disappeared.
 *
 * expected_state: value that MyLogicalRepWorker->relstate must reach for
 * the function to return true.
 *
 * NOTE(review): listing lines 223, 237, 240, 250-251 and 254 are missing
 * from this extract: the lookup that assigns "worker", the statement
 * guarded by "if (worker && worker->proc)", the remaining WaitLatch
 * arguments and the statement guarded by "if (rc & WL_LATCH_SET)". As shown,
 * those two "if" statements would bind to the following line, which is not
 * the intended control flow.
213  */
214 static bool
215 wait_for_worker_state_change(char expected_state)
216 {
217  int rc;
218 
219  for (;;)
220  {
221  LogicalRepWorker *worker;
222 
224 
225  /*
226  * Done if already in correct state. (We assume this fetch is atomic
227  * enough to not give a misleading answer if we do it with no lock.)
228  */
229  if (MyLogicalRepWorker->relstate == expected_state)
230  return true;
231 
232  /*
233  * Bail out if the apply worker has died, else signal it we're
234  * waiting.
235  */
236  LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
 /* InvalidOid as the relation argument: presumably selects the apply worker; confirm against the missing call. */
238  InvalidOid, false);
239  if (worker && worker->proc)
241  LWLockRelease(LogicalRepWorkerLock);
242  if (!worker)
243  break;
244 
245  /*
246  * Wait. We expect to get a latch signal back from the apply worker,
247  * but use a timeout in case it dies without sending one.
248  */
249  rc = WaitLatch(MyLatch,
252 
253  if (rc & WL_LATCH_SET)
255  }
256 
257  return false;
258 }
259 
260 /*
261  * Callback from syscache invalidation.
 *
 * Clears the file-level flag table_states_valid. The apply-side processing
 * routine in this file tests that flag and rebuilds its cached list of table
 * states when it is false.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 264) is missing from this extract; only the return type and
 * the body are shown.
262  */
263 void
265 {
266  table_states_valid = false;
267 }
268 
269 /*
270  * Handle table synchronization cooperation from the synchronization
271  * worker.
272  *
273  * If the sync worker is in CATCHUP state and reached (or passed) the
274  * predetermined synchronization point in the WAL stream, mark the table as
275  * SYNCDONE and finish.
 *
 * In the visible code the shared-memory state (relstate, relstate_lsn) is
 * set to SYNCDONE / current_lsn first, then the slot name is computed into a
 * zero-initialized NAMEDATALEN buffer, and finally finish_sync_worker() is
 * called, which does not return.
 *
 * NOTE(review): the signature (listing line 278) and lines 280, 291, 298,
 * 300-303, 309, 318-319, 328 and 333 are missing from this extract. That
 * includes the statement guarded by "if (!IsTransactionState())", the
 * catalog update, the end-of-streaming call that would use "tli", the slot
 * drop, and the body of the final "else". current_lsn is presumably the
 * parameter declared on the missing signature line.
276  */
277 static void
279 {
281 
282  if (MyLogicalRepWorker->relstate == SUBREL_STATE_CATCHUP &&
283  current_lsn >= MyLogicalRepWorker->relstate_lsn)
284  {
285  TimeLineID tli;
286  char syncslotname[NAMEDATALEN] = {0};
287 
 /* Publish the new state in shared memory before touching the catalog. */
288  MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCDONE;
289  MyLogicalRepWorker->relstate_lsn = current_lsn;
290 
292 
293  /*
294  * UpdateSubscriptionRelState must be called within a transaction.
295  * That transaction will be ended within the finish_sync_worker().
296  */
297  if (!IsTransactionState())
299 
304 
305  /*
306  * End streaming so that LogRepWorkerWalRcvConn can be used to drop
307  * the slot.
308  */
310 
311  /*
312  * Cleanup the tablesync slot.
313  *
314  * This has to be done after updating the state because otherwise if
315  * there is an error while doing the database operations we won't be
316  * able to rollback dropped slot.
317  */
320  syncslotname,
321  sizeof(syncslotname));
322 
323  /*
324  * It is important to give an error if we are unable to drop the slot,
325  * otherwise, it won't be dropped till the corresponding subscription
326  * is dropped. So passing missing_ok = false.
327  */
329 
330  finish_sync_worker();
331  }
332  else
334 }
335 
336 /*
337  * Handle table synchronization cooperation from the apply worker.
338  *
339  * Walk over all subscription tables that are individually tracked by the
340  * apply process (currently, all that have state other than
341  * SUBREL_STATE_READY) and manage synchronization for them.
342  *
343  * If there are tables that need synchronizing and are not being synchronized
344  * yet, start sync workers for them (if there are free slots for sync
345  * workers). To prevent starting the sync worker for the same relation at a
346  * high frequency after a failure, we store its last start time with each sync
347  * state info. We start the sync worker for the same relation after waiting
348  * at least wal_retrieve_retry_interval.
349  *
350  * For tables that are being synchronized already, check if sync workers
351  * either need action from the apply worker or have finished. This is the
352  * SYNCWAIT to CATCHUP transition.
353  *
354  * If the synchronization position is reached (SYNCDONE), then the table can
355  * be marked as READY and is no longer tracked.
 *
 * State kept across calls (function-level statics): table_states, a list of
 * SubscriptionRelState copies, and last_start_times, a hash table keyed by
 * relation OID that records when a sync worker was last started.
 *
 * NOTE(review): the embedded listing numbers are not contiguous here. The
 * signature (358) and, among others, lines 370, 384, 388, 391, 433, 450,
 * 467, 476, 490, 527, 531, 545, 556, 565-570 and 581 are missing, so
 * "rstates", "oldctx", "rstate" (in the main loop), "syncworker", "now"
 * and the initializer of "nsyncworkers" have no visible assignment or
 * declaration in this extract. current_lsn is presumably the parameter from
 * the missing signature line.
356  */
357 static void
359 {
 /* Hash entry: relation OID mapped to the time its sync worker was last launched. */
360  struct tablesync_start_time_mapping
361  {
362  Oid relid;
363  TimestampTz last_start_time;
364  };
365  static List *table_states = NIL;
366  static HTAB *last_start_times = NULL;
367  ListCell *lc;
 /* Set once a transaction has been opened here, so it is closed at the end. */
368  bool started_tx = false;
369 
371 
372  /* We need up-to-date sync state info for subscription tables here. */
373  if (!table_states_valid)
374  {
375  MemoryContext oldctx;
376  List *rstates;
 /* NOTE(review): this "lc" shadows the function-level "lc" declared above. */
377  ListCell *lc;
378  SubscriptionRelState *rstate;
379 
380  /* Clean the old list. */
381  list_free_deep(table_states);
382  table_states = NIL;
383 
385  started_tx = true;
386 
387  /* Fetch all non-ready tables. */
389 
390  /* Allocate the tracking info in a permanent memory context. */
392  foreach(lc, rstates)
393  {
 /* Copy each fetched entry so it remains valid after the fetched list goes away. */
394  rstate = palloc(sizeof(SubscriptionRelState));
395  memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState));
396  table_states = lappend(table_states, rstate);
397  }
398  MemoryContextSwitchTo(oldctx);
399 
400  table_states_valid = true;
401  }
402 
403  /*
404  * Prepare a hash table for tracking last start times of workers, to avoid
405  * immediate restarts. We don't need it if there are no tables that need
406  * syncing.
407  */
408  if (table_states && !last_start_times)
409  {
410  HASHCTL ctl;
411 
412  ctl.keysize = sizeof(Oid);
413  ctl.entrysize = sizeof(struct tablesync_start_time_mapping);
414  last_start_times = hash_create("Logical replication table sync worker start times",
415  256, &ctl, HASH_ELEM | HASH_BLOBS);
416  }
417 
418  /*
419  * Clean up the hash table when we're done with all tables (just to
420  * release the bit of memory).
421  */
422  else if (!table_states && last_start_times)
423  {
424  hash_destroy(last_start_times);
425  last_start_times = NULL;
426  }
427 
428  /*
429  * Process all tables that are being synchronized.
430  */
431  foreach(lc, table_states)
432  {
434 
435  if (rstate->state == SUBREL_STATE_SYNCDONE)
436  {
437  /*
438  * Apply has caught up to the position where the table sync has
439  * finished. Mark the table as ready so that the apply will just
440  * continue to replicate it normally.
441  */
442  if (current_lsn >= rstate->lsn)
443  {
444  char originname[NAMEDATALEN];
445 
446  rstate->state = SUBREL_STATE_READY;
447  rstate->lsn = current_lsn;
448  if (!started_tx)
449  {
451  started_tx = true;
452  }
453 
454  /*
455  * Remove the tablesync origin tracking if exists.
456  *
457  * The normal case origin drop is done here instead of in the
458  * process_syncing_tables_for_sync function because we don't
459  * allow to drop the origin till the process owning the origin
460  * is alive.
461  *
462  * There is a chance that the user is concurrently performing
463  * refresh for the subscription where we remove the table
464  * state and its origin and by this time the origin might be
465  * already removed. So passing missing_ok = true.
466  */
468  rstate->relid,
469  originname,
470  sizeof(originname));
471  replorigin_drop_by_name(originname, true, false);
472 
473  /*
474  * Update the state to READY only after the origin cleanup.
475  */
477  rstate->relid, rstate->state,
478  rstate->lsn);
479  }
480  }
481  else
482  {
483  LogicalRepWorker *syncworker;
484 
485  /*
486  * Look for a sync worker for this relation.
487  */
488  LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
489 
491  rstate->relid, false);
492 
493  if (syncworker)
494  {
495  /* Found one, update our copy of its state */
496  SpinLockAcquire(&syncworker->relmutex);
497  rstate->state = syncworker->relstate;
498  rstate->lsn = syncworker->relstate_lsn;
499  if (rstate->state == SUBREL_STATE_SYNCWAIT)
500  {
501  /*
502  * Sync worker is waiting for apply. Tell sync worker it
503  * can catchup now.
504  */
505  syncworker->relstate = SUBREL_STATE_CATCHUP;
 /* The catch-up target is the later of the worker's own LSN and ours. */
506  syncworker->relstate_lsn =
507  Max(syncworker->relstate_lsn, current_lsn);
508  }
509  SpinLockRelease(&syncworker->relmutex);
510 
 /* rstate->state still holds the pre-update SYNCWAIT value copied above. */
511  /* If we told worker to catch up, wait for it. */
512  if (rstate->state == SUBREL_STATE_SYNCWAIT)
513  {
514  /* Signal the sync worker, as it may be waiting for us. */
515  if (syncworker->proc)
516  logicalrep_worker_wakeup_ptr(syncworker);
517 
518  /* Now safe to release the LWLock */
519  LWLockRelease(LogicalRepWorkerLock);
520 
521  /*
522  * Enter busy loop and wait for synchronization worker to
523  * reach expected state (or die trying).
524  */
525  if (!started_tx)
526  {
528  started_tx = true;
529  }
530 
532  SUBREL_STATE_SYNCDONE);
533  }
534  else
535  LWLockRelease(LogicalRepWorkerLock);
536  }
537  else
538  {
539  /*
540  * If there is no sync worker for this table yet, count
541  * running sync workers for this subscription, while we have
542  * the lock.
543  */
544  int nsyncworkers =
546 
547  /* Now safe to release the LWLock */
548  LWLockRelease(LogicalRepWorkerLock);
549 
550  /*
551  * If there are free sync worker slot(s), start a new sync
552  * worker for the table.
553  */
554  if (nsyncworkers < max_sync_workers_per_subscription)
555  {
557  struct tablesync_start_time_mapping *hentry;
558  bool found;
559 
 /* HASH_ENTER creates the entry if absent; "found" tells the two cases apart. */
560  hentry = hash_search(last_start_times, &rstate->relid,
561  HASH_ENTER, &found);
562 
 /* Launch on first sight of the relation, or once the retry interval has elapsed. */
563  if (!found ||
564  TimestampDifferenceExceeds(hentry->last_start_time, now,
566  {
571  rstate->relid);
572  hentry->last_start_time = now;
573  }
574  }
575  }
576  }
577  }
578 
579  if (started_tx)
580  {
582  pgstat_report_stat(false);
583  }
584 }
585 
586 /*
587  * Process possible state change(s) of tables that are being synchronized.
 *
 * Dispatches on the worker type: a tablesync worker runs
 * process_syncing_tables_for_sync(current_lsn); any other caller takes the
 * "else" branch.
 *
 * NOTE(review): the signature (listing line 590) and the statement in the
 * "else" branch (listing line 595) are missing from this extract.
588  */
589 void
591 {
592  if (am_tablesync_worker())
593  process_syncing_tables_for_sync(current_lsn);
594  else
596 }
597 
598 /*
599  * Create list of columns for COPY based on logical relation mapping.
 *
 * Builds a new List with one string node per remote attribute, in the order
 * rel->remoterel.attnames[0 .. natts-1]. The strings are wrapped with
 * makeString() and not copied. Returns NIL when natts is zero.
 *
 * NOTE(review): the line carrying the function name and the "rel"
 * parameter declaration (listing line 602) is missing from this extract.
600  */
601 static List *
603 {
604  List *attnamelist = NIL;
605  int i;
606 
607  for (i = 0; i < rel->remoterel.natts; i++)
608  {
609  attnamelist = lappend(attnamelist,
610  makeString(rel->remoterel.attnames[i]));
611  }
612 
613 
614  return attnamelist;
615 }
616 
617 /*
618  * Data source callback for the COPY FROM, which reads from the remote
619  * connection and passes the data back to our local COPY.
 *
 * outbuf: destination buffer.
 * minread: the function keeps reading until at least this many bytes have
 * been produced, unless the remote side reports a negative length.
 * maxread: upper bound on the number of bytes written to outbuf.
 *
 * Returns the number of bytes placed in outbuf. A negative length from
 * walrcv_receive() ends the read early and returns the bytes gathered so far.
 *
 * Data received beyond maxread stays in copybuf (declared outside this
 * extract) and is consumed first on the next call.
 *
 * NOTE(review): listing lines 641, 650, 681-682 and 685 are missing from
 * this extract: the declaration of "fd", a statement after the receive call,
 * the wait-event flags and the statement after the wait.
620  */
621 static int
622 copy_read_data(void *outbuf, int minread, int maxread)
623 {
624  int bytesread = 0;
625  int avail;
626 
627  /* If there are some leftover data from previous read, use it. */
628  avail = copybuf->len - copybuf->cursor;
629  if (avail)
630  {
631  if (avail > maxread)
632  avail = maxread;
633  memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
 /*
 * NOTE(review): outbuf is not advanced here, unlike after the memcpy
 * inside the loop below. If the loop then runs (leftover shorter than
 * minread), the newly received bytes would overwrite the ones just
 * copied. Confirm whether any caller passes minread above 1.
 */
634  copybuf->cursor += avail;
635  maxread -= avail;
636  bytesread += avail;
637  }
638 
639  while (maxread > 0 && bytesread < minread)
640  {
642  int len;
643  char *buf = NULL;
644 
645  for (;;)
646  {
647  /* Try read the data. */
648  len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd);
649 
651 
 /* Zero: nothing available right now, leave the inner loop and wait. */
652  if (len == 0)
653  break;
 /* Negative: treated as end of data; hand back what has been read. */
654  else if (len < 0)
655  return bytesread;
656  else
657  {
658  /* Process the data */
 /* copybuf now points at the receive buffer itself; no copy is made. */
659  copybuf->data = buf;
660  copybuf->len = len;
661  copybuf->cursor = 0;
662 
663  avail = copybuf->len - copybuf->cursor;
664  if (avail > maxread)
665  avail = maxread;
666  memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
667  outbuf = (void *) ((char *) outbuf + avail);
668  copybuf->cursor += avail;
669  maxread -= avail;
670  bytesread += avail;
671  }
672 
673  if (maxread <= 0 || bytesread >= minread)
674  return bytesread;
675  }
676 
677  /*
678  * Wait for more data or latch.
679  */
 /* 1000L is the timeout argument; presumably milliseconds, confirm against the latch API. */
680  (void) WaitLatchOrSocket(MyLatch,
683  fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA);
684 
686  }
687 
688  return bytesread;
689 }
690 
691 
692 /*
693  * Get information about remote relation in similar fashion the RELATION
694  * message provides during replication.
 *
 * nspname, relname: schema and table name on the publisher. The pointers
 * are stored in lrel as-is, not copied.
 * lrel: output. Filled with remoteid, replident, relkind, attnames,
 * atttyps, attkeys and natts.
 *
 * Two queries are built: the first looks the relation up in pg_class by
 * schema and name, the second lists its non-dropped user columns ordered by
 * attnum. Raises ERROR if either query does not return tuples, if the
 * relation is not found, or if the column count reaches
 * MaxTupleAttributeNumber.
 *
 * NOTE(review): listing lines 721, 729, 742, 759, 762, 776 and 794 are
 * missing from this extract: both assignments to "res", the creation and
 * release of "slot", and the condition that selects the optional
 * attgenerated filter.
695  */
696 static void
697 fetch_remote_table_info(char *nspname, char *relname,
698  LogicalRepRelation *lrel)
699 {
700  WalRcvExecResult *res;
701  StringInfoData cmd;
702  TupleTableSlot *slot;
 /* Expected result column types for the two queries below. */
703  Oid tableRow[] = {OIDOID, CHAROID, CHAROID};
704  Oid attrRow[] = {TEXTOID, OIDOID, BOOLOID};
705  bool isnull;
706  int natt;
707 
708  lrel->nspname = nspname;
709  lrel->relname = relname;
710 
711  /* First fetch Oid and replica identity. */
712  initStringInfo(&cmd);
 /* Names are embedded as quoted SQL literals via quote_literal_cstr(). */
713  appendStringInfo(&cmd, "SELECT c.oid, c.relreplident, c.relkind"
714  " FROM pg_catalog.pg_class c"
715  " INNER JOIN pg_catalog.pg_namespace n"
716  " ON (c.relnamespace = n.oid)"
717  " WHERE n.nspname = %s"
718  " AND c.relname = %s",
719  quote_literal_cstr(nspname),
720  quote_literal_cstr(relname));
722  lengthof(tableRow), tableRow);
723 
724  if (res->status != WALRCV_OK_TUPLES)
725  ereport(ERROR,
726  (errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s",
727  nspname, relname, res->err)));
728 
730  if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot))
731  ereport(ERROR,
732  (errmsg("table \"%s.%s\" not found on publisher",
733  nspname, relname)));
734 
735  lrel->remoteid = DatumGetObjectId(slot_getattr(slot, 1, &isnull));
736  Assert(!isnull);
737  lrel->replident = DatumGetChar(slot_getattr(slot, 2, &isnull));
738  Assert(!isnull);
739  lrel->relkind = DatumGetChar(slot_getattr(slot, 3, &isnull));
740  Assert(!isnull);
741 
743  walrcv_clear_result(res);
744 
745  /* Now fetch columns. */
 /* Reuse the same buffer for the second query. */
746  resetStringInfo(&cmd);
747  appendStringInfo(&cmd,
748  "SELECT a.attname,"
749  " a.atttypid,"
750  " a.attnum = ANY(i.indkey)"
751  " FROM pg_catalog.pg_attribute a"
752  " LEFT JOIN pg_catalog.pg_index i"
753  " ON (i.indexrelid = pg_get_replica_identity_index(%u))"
754  " WHERE a.attnum > 0::pg_catalog.int2"
755  " AND NOT a.attisdropped %s"
756  " AND a.attrelid = %u"
757  " ORDER BY a.attnum",
758  lrel->remoteid,
760  "AND a.attgenerated = ''" : ""),
761  lrel->remoteid);
763  lengthof(attrRow), attrRow);
764 
765  if (res->status != WALRCV_OK_TUPLES)
766  ereport(ERROR,
767  (errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s",
768  nspname, relname, res->err)));
769 
770  /* We don't know the number of rows coming, so allocate enough space. */
771  lrel->attnames = palloc0(MaxTupleAttributeNumber * sizeof(char *));
772  lrel->atttyps = palloc0(MaxTupleAttributeNumber * sizeof(Oid));
773  lrel->attkeys = NULL;
774 
775  natt = 0;
777  while (tuplestore_gettupleslot(res->tuplestore, true, false, slot))
778  {
779  lrel->attnames[natt] =
780  TextDatumGetCString(slot_getattr(slot, 1, &isnull));
781  Assert(!isnull);
782  lrel->atttyps[natt] = DatumGetObjectId(slot_getattr(slot, 2, &isnull));
783  Assert(!isnull);
 /*
 * attkeys records the zero-based position (natt), not attnum.
 * NOTE(review): isnull is not checked for this column. With the LEFT
 * JOIN it can be NULL when there is no matching index row; presumably a
 * NULL then reads as false. Confirm.
 */
784  if (DatumGetBool(slot_getattr(slot, 3, &isnull)))
785  lrel->attkeys = bms_add_member(lrel->attkeys, natt);
786 
787  /* Should never happen. */
788  if (++natt >= MaxTupleAttributeNumber)
789  elog(ERROR, "too many columns in remote table \"%s.%s\"",
790  nspname, relname);
791 
792  ExecClearTuple(slot);
793  }
795 
796  lrel->natts = natt;
797 
798  walrcv_clear_result(res);
799  pfree(cmd.data);
800 }
801 
802 /*
803  * Copy existing data of a table from publisher.
804  *
805  * Caller is responsible for locking the local relation.
 *
 * Visible steps: fetch the publisher-side description of the relation, open
 * the relation map entry for it (asserting that it maps to "rel"), send a
 * COPY ... TO STDOUT command over LogRepWorkerWalRcvConn, then run a local
 * COPY FROM whose data source is copy_read_data(). Raises ERROR if the
 * publisher does not answer the COPY command with WALRCV_OK_COPY_OUT.
 *
 * NOTE(review): the signature (listing line 808) and lines 819, 823, 833,
 * 843, 848 and 861 are missing from this extract: the start of the
 * remote-info call, the relmap update, the arguments that supply the quoted
 * relation and column names to the COPY command, and the start of a call
 * that takes pstate.
806  */
807 static void
809 {
810  LogicalRepRelMapEntry *relmapentry;
811  LogicalRepRelation lrel;
812  WalRcvExecResult *res;
813  StringInfoData cmd;
814  CopyFromState cstate;
815  List *attnamelist;
816  ParseState *pstate;
817 
818  /* Get the publisher relation info. */
820  RelationGetRelationName(rel), &lrel);
821 
822  /* Put the relation into relmap. */
824 
825  /* Map the publisher relation to local one. */
 /* NoLock: the caller already holds the lock, per the header comment. */
826  relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock);
827  Assert(rel == relmapentry->localrel);
828 
829  /* Start copy on the publisher. */
830  initStringInfo(&cmd);
831  if (lrel.relkind == RELKIND_RELATION)
832  appendStringInfo(&cmd, "COPY %s TO STDOUT",
834  else
835  {
836  /*
837  * For non-tables, we need to do COPY (SELECT ...), but we can't just
838  * do SELECT * because we need to not copy generated columns.
839  */
840  appendStringInfoString(&cmd, "COPY (SELECT ");
841  for (int i = 0; i < lrel.natts; i++)
842  {
 /* Comma-separate every column except the last one. */
844  if (i < lrel.natts - 1)
845  appendStringInfoString(&cmd, ", ");
846  }
847  appendStringInfo(&cmd, " FROM %s) TO STDOUT",
849  }
850  res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, 0, NULL);
851  pfree(cmd.data);
852  if (res->status != WALRCV_OK_COPY_OUT)
853  ereport(ERROR,
854  (errmsg("could not start initial contents copy for table \"%s.%s\": %s",
855  lrel.nspname, lrel.relname, res->err)));
856  walrcv_clear_result(res);
857 
 /* Fresh buffer that copy_read_data() uses to carry leftover bytes between calls. */
858  copybuf = makeStringInfo();
859 
860  pstate = make_parsestate(NULL);
862  NULL, false, false);
863 
864  attnamelist = make_copy_attnamelist(relmapentry);
865  cstate = BeginCopyFrom(pstate, rel, NULL, NULL, false, copy_read_data, attnamelist, NIL);
866 
867  /* Do the copy */
 /* The row count returned by CopyFrom() is intentionally discarded. */
868  (void) CopyFrom(cstate);
869 
870  logicalrep_rel_close(relmapentry, NoLock);
871 }
872 
873 /*
874  * Determine the tablesync slot name.
875  *
876  * The name must not exceed NAMEDATALEN - 1 because of remote node constraints
877  * on slot name length. We append system_identifier to avoid slot_name
878  * collision with subscriptions in other clusters. With the current scheme
879  * pg_%u_sync_%u_UINT64_FORMAT (3 + 10 + 6 + 10 + 20 + '\0'), the maximum
880  * length of slot_name will be 50.
881  *
882  * The returned slot name is stored in the supplied buffer (syncslotname) with
883  * the given size.
884  *
885  * Note: We don't use the subscription slot name as part of tablesync slot name
886  * because we are responsible for cleaning up these slots and it could become
887  * impossible to recalculate what name to cleanup if the subscription slot name
888  * had changed.
 *
 * syncslotname: caller-supplied output buffer.
 * szslot: size of that buffer in bytes; snprintf() writes at most this many
 * bytes, so a shorter buffer yields a truncated name.
 *
 * NOTE(review): the line carrying the function name and the first
 * parameters (listing line 891) is missing from this extract. suboid and
 * relid are presumably the OID parameters declared there, since both are
 * formatted with %u.
889  */
890 void
892  char *syncslotname, int szslot)
893 {
894  snprintf(syncslotname, szslot, "pg_%u_sync_%u_" UINT64_FORMAT, suboid,
895  relid, GetSystemIdentifier());
896 }
897 
898 /*
899  * Form the origin name for tablesync.
900  *
901  * Return the name in the supplied buffer.
 *
 * The name has the form "pg_<suboid>_<relid>", both values printed with %u.
 * originname: caller-supplied output buffer.
 * szorgname: size of that buffer in bytes; snprintf() writes at most this
 * many bytes.
 *
 * NOTE(review): the line carrying the function name and the first
 * parameters (listing line 904) is missing from this extract. suboid and
 * relid are presumably declared there.
902  */
903 void
905  char *originname, int szorgname)
906 {
907  snprintf(originname, szorgname, "pg_%u_%u", suboid, relid);
908 }
909 
910 /*
911  * Start syncing the table in the sync worker.
912  *
913  * If nothing needs to be done to sync the table, we exit the worker without
914  * any further action.
915  *
916  * The returned slot name is palloc'ed in current memory context.
 *
 * Flow in the visible code:
 * - Load the relation state into MyLogicalRepWorker. For SYNCDONE, READY or
 * UNKNOWN, call finish_sync_worker(), which does not return.
 * - Build the slot name and connect to the publisher, using the slot name
 * as the application name.
 * - FINISHEDCOPY: attach to the existing replication origin, read its
 * progress into *origin_startpos and jump to copy_table_done, skipping the
 * copy.
 * - Otherwise: set DATASYNC, open a REPEATABLE READ transaction on the
 * publisher, create the slot, create and advance the origin, copy the
 * table, and COMMIT on the publisher.
 * - Finally set SYNCWAIT in shared memory and wait for CATCHUP.
 *
 * origin_startpos is written through as a pointer, so it is presumably an
 * output parameter of type XLogRecPtr pointer declared on the missing
 * signature line.
 *
 * NOTE(review): the embedded listing numbers are not contiguous here. The
 * signature (919) and, among others, lines 931-933, 935, 937, 940, 956-957,
 * 966, 977-978, 995, 1003, 1014, 1019, 1021-1022, 1025-1030, 1033, 1041,
 * 1048, 1070, 1100, 1106, 1108, 1120, 1126-1127, 1129, 1131, 1142 and 1145
 * are missing. "relstate", "relstate_lsn", "rel", "res" and
 * LogRepWorkerWalRcvConn therefore have no visible assignment in places
 * where they are read.
917  */
918 char *
920 {
921  char *slotname;
922  char *err;
923  char relstate;
924  XLogRecPtr relstate_lsn;
925  Relation rel;
926  WalRcvExecResult *res;
927  char originname[NAMEDATALEN];
928  RepOriginId originid;
929 
930  /* Check the state of the table synchronization. */
934  &relstate_lsn);
936 
 /* Mirror the fetched state into this worker's shared-memory entry. */
938  MyLogicalRepWorker->relstate = relstate;
939  MyLogicalRepWorker->relstate_lsn = relstate_lsn;
941 
942  /*
943  * If synchronization is already done or no longer necessary, exit now
944  * that we've updated shared memory state.
945  */
946  switch (relstate)
947  {
948  case SUBREL_STATE_SYNCDONE:
949  case SUBREL_STATE_READY:
950  case SUBREL_STATE_UNKNOWN:
951  finish_sync_worker(); /* doesn't return */
952  }
953 
954  /* Calculate the name of the tablesync slot. */
 /* This buffer is what the function eventually returns to the caller. */
955  slotname = (char *) palloc(NAMEDATALEN);
958  slotname,
959  NAMEDATALEN);
960 
961  /*
962  * Here we use the slot name instead of the subscription name as the
963  * application_name, so that it is different from the main apply worker,
964  * so that synchronous replication can distinguish them.
965  */
967  walrcv_connect(MySubscription->conninfo, true, slotname, &err);
968  if (LogRepWorkerWalRcvConn == NULL)
969  ereport(ERROR,
970  (errmsg("could not connect to the publisher: %s", err)));
971 
 /* Only these three states can reach this point, given the switch above. */
972  Assert(MyLogicalRepWorker->relstate == SUBREL_STATE_INIT ||
973  MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC ||
974  MyLogicalRepWorker->relstate == SUBREL_STATE_FINISHEDCOPY);
975 
976  /* Assign the origin tracking record name. */
979  originname,
980  sizeof(originname));
981 
982  if (MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC)
983  {
984  /*
985  * We have previously errored out before finishing the copy so the
986  * replication slot might exist. We want to remove the slot if it
987  * already exists and proceed.
988  *
989  * XXX We could also instead try to drop the slot, last time we failed
990  * but for that, we might need to clean up the copy state as it might
991  * be in the middle of fetching the rows. Also, if there is a network
992  * breakdown then it wouldn't have succeeded so trying it next time
993  * seems like a better bet.
994  */
996  }
997  else if (MyLogicalRepWorker->relstate == SUBREL_STATE_FINISHEDCOPY)
998  {
999  /*
1000  * The COPY phase was previously done, but tablesync then crashed
1001  * before it was able to finish normally.
1002  */
1004 
1005  /*
1006  * The origin tracking name must already exist. It was created first
1007  * time this tablesync was launched.
1008  */
 /* missing_ok = false: a missing origin raises an error here. */
1009  originid = replorigin_by_name(originname, false);
1010  replorigin_session_setup(originid);
1011  replorigin_session_origin = originid;
 /* Resume from the progress recorded for the origin. */
1012  *origin_startpos = replorigin_session_get_progress(false);
1013 
1015 
1016  goto copy_table_done;
1017  }
1018 
1020  MyLogicalRepWorker->relstate = SUBREL_STATE_DATASYNC;
1023 
1024  /* Update the state and make it visible to others. */
1031  pgstat_report_stat(false);
1032 
1034 
1035  /*
1036  * Use a standard write lock here. It might be better to disallow access
1037  * to the table while it's being synchronized. But we don't want to block
1038  * the main apply process from working and it has to open the relation in
1039  * RowExclusiveLock when remapping remote relation id to local one.
1040  */
1042 
1043  /*
1044  * Start a transaction in the remote node in REPEATABLE READ mode. This
1045  * ensures that both the replication slot we create (see below) and the
1046  * COPY are consistent with each other.
1047  */
1049  "BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ",
1050  0, NULL);
1051  if (res->status != WALRCV_OK_COMMAND)
1052  ereport(ERROR,
1053  (errmsg("table copy could not start transaction on publisher: %s",
1054  res->err)));
1055  walrcv_clear_result(res);
1056 
1057  /*
1058  * Create a new permanent logical decoding slot. This slot will be used
1059  * for the catchup phase after COPY is done, so tell it to use the
1060  * snapshot to make the final data consistent.
1061  *
1062  * Prevent cancel/die interrupts while creating slot here because it is
1063  * possible that before the server finishes this command, a concurrent
1064  * drop subscription happens which would complete without removing this
1065  * slot leading to a dangling slot on the server.
1066  */
1067  HOLD_INTERRUPTS();
1068  walrcv_create_slot(LogRepWorkerWalRcvConn, slotname, false /* permanent */ ,
1069  CRS_USE_SNAPSHOT, origin_startpos);
 /* NOTE(review): the statement that re-enables interrupts (listing line 1070) is missing here. */
1071 
1072  /*
1073  * Setup replication origin tracking. The purpose of doing this before the
1074  * copy is to avoid doing the copy again due to any error in setting up
1075  * origin tracking.
1076  */
 /* missing_ok = true: an absent origin is the expected case here. */
1077  originid = replorigin_by_name(originname, true);
1078  if (!OidIsValid(originid))
1079  {
1080  /*
1081  * Origin tracking does not exist, so create it now.
1082  *
1083  * Then advance to the LSN got from walrcv_create_slot. This is WAL
1084  * logged for the purpose of recovery. Locks are to prevent the
1085  * replication origin from vanishing while advancing.
1086  */
1087  originid = replorigin_create(originname);
1088 
1089  LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock);
1090  replorigin_advance(originid, *origin_startpos, InvalidXLogRecPtr,
1091  true /* go backward */ , true /* WAL log */ );
1092  UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock);
1093 
1094  replorigin_session_setup(originid);
1095  replorigin_session_origin = originid;
1096  }
1097  else
1098  {
 /* An origin that already exists at this point is reported as an error. */
1099  ereport(ERROR,
1101  errmsg("replication origin \"%s\" already exists",
1102  originname)));
1103  }
1104 
1105  /* Now do the initial data copy */
1107  copy_table(rel);
1109 
 /* Close the publisher-side transaction opened with BEGIN above. */
1110  res = walrcv_exec(LogRepWorkerWalRcvConn, "COMMIT", 0, NULL);
1111  if (res->status != WALRCV_OK_COMMAND)
1112  ereport(ERROR,
1113  (errmsg("table copy could not finish transaction on publisher: %s",
1114  res->err)));
1115  walrcv_clear_result(res);
1116 
 /* NoLock: the relation is closed without releasing its lock here. */
1117  table_close(rel, NoLock);
1118 
1119  /* Make the copy visible. */
1121 
1122  /*
1123  * Update the persisted state to indicate the COPY phase is done; make it
1124  * visible to others.
1125  */
1128  SUBREL_STATE_FINISHEDCOPY,
1130 
1132 
1133 copy_table_done:
1134 
1135  elog(DEBUG1,
1136  "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%X",
1137  originname, LSN_FORMAT_ARGS(*origin_startpos));
1138 
1139  /*
1140  * We are done with the initial data synchronization, update the state.
1141  */
1143  MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCWAIT;
1144  MyLogicalRepWorker->relstate_lsn = *origin_startpos;
1146 
1147  /*
1148  * Finally, wait until the main apply worker tells us to catch up and then
1149  * return to let LogicalRepApplyLoop do it.
1150  */
 /*
 * NOTE(review): the boolean result is discarded, so the case where the
 * wait reports that the apply worker has disappeared is not handled here.
 * Confirm that the caller copes with that.
 */
1151  wait_for_worker_state_change(SUBREL_STATE_CATCHUP);
1152  return slotname;
1153 }
Subscription * MySubscription
Definition: worker.c:161
Value * makeString(char *str)
Definition: value.c:53
#define NIL
Definition: pg_list.h:65
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:862
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
#define walrcv_endstreaming(conn, next_tli)
Definition: walreceiver.h:417
#define DEBUG1
Definition: elog.h:25
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:167
WalReceiverConn * LogRepWorkerWalRcvConn
Definition: worker.c:159
char * quote_literal_cstr(const char *rawstr)
Definition: quote.c:102
uint32 TimeLineID
Definition: xlogdefs.h:59
#define HASH_ELEM
Definition: hsearch.h:95
#define WL_TIMEOUT
Definition: latch.h:128
const char * quote_identifier(const char *ident)
Definition: ruleutils.c:11324
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition: tuptable.h:425
#define MaxTupleAttributeNumber
Definition: htup_details.h:33
void ReplicationSlotDropAtPubNode(WalReceiverConn *wrconn, char *slotname, bool missing_ok)
void process_syncing_tables(XLogRecPtr current_lsn)
Definition: tablesync.c:590
TupleTableSlot * MakeSingleTupleTableSlot(TupleDesc tupdesc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1238
void UnlockRelationOid(Oid relid, LOCKMODE lockmode)
Definition: lmgr.c:200
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1580
void logicalrep_worker_wakeup(Oid subid, Oid relid)
Definition: launcher.c:533
int64 TimestampTz
Definition: timestamp.h:39
static void process_syncing_tables_for_apply(XLogRecPtr current_lsn)
Definition: tablesync.c:358
void replorigin_drop_by_name(char *name, bool missing_ok, bool nowait)
Definition: origin.c:414
uint64 CopyFrom(CopyFromState cstate)
Definition: copyfrom.c:525
#define DatumGetObjectId(X)
Definition: postgres.h:544
void CommitTransactionCommand(void)
Definition: xact.c:2939
StringInfo makeStringInfo(void)
Definition: stringinfo.c:41
#define walrcv_receive(conn, buffer, wait_fd)
Definition: walreceiver.h:419
#define walrcv_server_version(conn)
Definition: walreceiver.h:411
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
#define AccessShareLock
Definition: lockdefs.h:36
uint16 RepOriginId
Definition: xlogdefs.h:65
Size entrysize
Definition: hsearch.h:76
static void copy_table(Relation rel)
Definition: tablesync.c:808
void proc_exit(int code)
Definition: ipc.c:104
XLogRecPtr replorigin_session_get_progress(bool flush)
Definition: origin.c:1203
int errcode(int sqlerrcode)
Definition: elog.c:698
static bool wait_for_relation_state_change(Oid relid, char expected_state)
Definition: tablesync.c:167
void logicalrep_rel_close(LogicalRepRelMapEntry *rel, LOCKMODE lockmode)
Definition: relation.c:447
#define WL_SOCKET_READABLE
Definition: latch.h:126
void PopActiveSnapshot(void)
Definition: snapmgr.c:759
void replorigin_advance(RepOriginId node, XLogRecPtr remote_commit, XLogRecPtr local_commit, bool go_backward, bool wal_log)
Definition: origin.c:872
void pgstat_report_stat(bool disconnect)
Definition: pgstat.c:843
#define lengthof(array)
Definition: c.h:734
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:954
void replorigin_session_setup(RepOriginId node)
Definition: origin.c:1068
#define LOG
Definition: elog.h:26
NameData relname
Definition: pg_class.h:38
unsigned int Oid
Definition: postgres_ext.h:31
static bool table_states_valid
Definition: tablesync.c:118
char * LogicalRepSyncTableStart(XLogRecPtr *origin_startpos)
Definition: tablesync.c:919
#define walrcv_create_slot(conn, slotname, temporary, snapshot_action, lsn)
Definition: walreceiver.h:423
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1711
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:250
#define OidIsValid(objectId)
Definition: c.h:710
void list_free_deep(List *list)
Definition: list.c:1405
Bitmapset * attkeys
Definition: logicalproto.h:99
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2875
static int fd(const char *x, int i)
Definition: preproc-init.c:105
RepOriginId replorigin_by_name(char *roname, bool missing_ok)
Definition: origin.c:209
void ResetLatch(Latch *latch)
Definition: latch.c:660
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:452
LogicalRepRelId remoteid
Definition: logicalproto.h:91
XLogRecPtr relstate_lsn
ParseNamespaceItem * addRangeTableEntryForRelation(ParseState *pstate, Relation rel, int lockmode, Alias *alias, bool inh, bool inFromCl)
ParseState * make_parsestate(ParseState *parentParseState)
Definition: parse_node.c:44
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1816
#define RESUME_INTERRUPTS()
Definition: miscadmin.h:133
#define NAMEDATALEN
void logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker)
Definition: launcher.c:553
void logicalrep_relmap_update(LogicalRepRelation *remoterel)
Definition: relation.c:171
#define SpinLockAcquire(lock)
Definition: spin.h:62
Definition: dynahash.c:219
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
static void walrcv_clear_result(WalRcvExecResult *walres)
Definition: walreceiver.h:433
void pfree(void *pointer)
Definition: mcxt.c:1169
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:91
TupleDesc tupledesc
Definition: walreceiver.h:218
LogicalRepWorker * MyLogicalRepWorker
Definition: launcher.c:57
#define ERROR
Definition: elog.h:46
LogicalRepRelation remoterel
int max_sync_workers_per_subscription
Definition: launcher.c:55
static bool am_tablesync_worker(void)
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3316
#define NoLock
Definition: lockdefs.h:34
static char * buf
Definition: pg_test_fsync.c:68
void PushActiveSnapshot(Snapshot snap)
Definition: snapmgr.c:680
void ExecDropSingleTupleTableSlot(TupleTableSlot *slot)
Definition: execTuples.c:1254
#define RowExclusiveLock
Definition: lockdefs.h:38
#define DatumGetBool(X)
Definition: postgres.h:437
#define RelationGetRelationName(relation)
Definition: rel.h:503
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:349
void resetStringInfo(StringInfo str)
Definition: stringinfo.c:75
int WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, long timeout, uint32 wait_event_info)
Definition: latch.c:500
unsigned int uint32
Definition: c.h:441
int pgsocket
Definition: port.h:31
RepOriginId replorigin_create(char *roname)
Definition: origin.c:240
LogicalRepRelMapEntry * logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode)
Definition: relation.c:272
void InvalidateCatalogSnapshot(void)
Definition: snapmgr.c:456
List * lappend(List *list, void *datum)
Definition: list.c:336
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
char * quote_qualified_identifier(const char *qualifier, const char *ident)
Definition: ruleutils.c:11408
int wal_retrieve_retry_interval
Definition: xlog.c:111
#define SpinLockRelease(lock)
Definition: spin.h:64
#define HASH_BLOBS
Definition: hsearch.h:97
#define TextDatumGetCString(d)
Definition: builtins.h:83
static void pg_attribute_noreturn()
Definition: tablesync.c:126
void * palloc0(Size size)
Definition: mcxt.c:1093
char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn)
uintptr_t Datum
Definition: postgres.h:411
void CommandCounterIncrement(void)
Definition: xact.c:1021
#define PGINVALID_SOCKET
Definition: port.h:33
#define DatumGetChar(X)
Definition: postgres.h:453
void UpdateSubscriptionRelState(Oid subid, Oid relid, char state, XLogRecPtr sublsn)
Size keysize
Definition: hsearch.h:75
StringInfo copybuf
Definition: tablesync.c:120
static void fetch_remote_table_info(char *nspname, char *relname, LogicalRepRelation *lrel)
Definition: tablesync.c:697
int logicalrep_sync_worker_count(Oid subid)
Definition: launcher.c:659
#define InvalidOid
Definition: postgres_ext.h:36
static Datum slot_getattr(TupleTableSlot *slot, int attnum, bool *isnull)
Definition: tuptable.h:381
#define ereport(elevel,...)
Definition: elog.h:157
XLogRecPtr GetXLogWriteRecPtr(void)
Definition: xlog.c:11738
#define Max(x, y)
Definition: c.h:980
CopyFromState BeginCopyFrom(ParseState *pstate, Relation rel, Node *whereClause, const char *filename, bool is_program, copy_data_source_cb data_source_cb, List *attnamelist, List *options)
Definition: copyfrom.c:1186
bool tuplestore_gettupleslot(Tuplestorestate *state, bool forward, bool copy, TupleTableSlot *slot)
Definition: tuplestore.c:1078
Tuplestorestate * tuplestore
Definition: walreceiver.h:217
void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, Oid relid)
Definition: launcher.c:266
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:804
#define lfirst(lc)
Definition: pg_list.h:169
WalRcvExecStatus status
Definition: walreceiver.h:214
RepOriginId replorigin_session_origin
Definition: origin.c:154
Definition: regguts.h:317
LogicalRepWorker * logicalrep_worker_find(Oid subid, Oid relid, bool only_running)
Definition: launcher.c:215
void StartTransactionCommand(void)
Definition: xact.c:2838
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1203
bool IsTransactionState(void)
Definition: xact.c:371
void ReplicationOriginNameForTablesync(Oid suboid, Oid relid, char *originname, int szorgname)
Definition: tablesync.c:904
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition: bitmapset.c:736
static void process_syncing_tables_for_sync(XLogRecPtr current_lsn)
Definition: tablesync.c:278
uint64 GetSystemIdentifier(void)
Definition: xlog.c:4956
void * palloc(Size size)
Definition: mcxt.c:1062
int errmsg(const char *fmt,...)
Definition: elog.c:909
#define HOLD_INTERRUPTS()
Definition: miscadmin.h:131
#define elog(elevel,...)
Definition: elog.h:232
int i
void * arg
struct Latch * MyLatch
Definition: globals.c:57
static int copy_read_data(void *outbuf, int minread, int maxread)
Definition: tablesync.c:622
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:120
void LockRelationOid(Oid relid, LOCKMODE lockmode)
Definition: lmgr.c:109
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:39
static List * make_copy_attnamelist(LogicalRepRelMapEntry *rel)
Definition: tablesync.c:602
#define ERRCODE_DUPLICATE_OBJECT
Definition: streamutil.c:32
Definition: pg_list.h:50
List * GetSubscriptionNotReadyRelations(Oid subid)
char * get_rel_name(Oid relid)
Definition: lsyscache.c:1899
#define snprintf
Definition: port.h:216
#define WL_LATCH_SET
Definition: latch.h:125
static bool wait_for_worker_state_change(char expected_state)
Definition: tablesync.c:215
#define UINT64_FORMAT
Definition: c.h:484
const TupleTableSlotOps TTSOpsMinimalTuple
Definition: execTuples.c:85
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1544
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:130
#define walrcv_exec(conn, exec, nRetTypes, retTypes)
Definition: walreceiver.h:427
void invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue)
Definition: tablesync.c:264
void ReplicationSlotNameForTablesync(Oid suboid, Oid relid, char *syncslotname, int szslot)
Definition: tablesync.c:891
MemoryContext CacheMemoryContext
Definition: mcxt.c:51
#define RelationGetNamespace(relation)
Definition: rel.h:510
#define walrcv_connect(conninfo, logical, appname, err)
Definition: walreceiver.h:401