PostgreSQL Source Code  git master
worker.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  * worker.c
3  * PostgreSQL logical replication worker (apply)
4  *
5  * Copyright (c) 2016-2024, PostgreSQL Global Development Group
6  *
7  * IDENTIFICATION
8  * src/backend/replication/logical/worker.c
9  *
10  * NOTES
11  * This file contains the worker which applies logical changes as they come
12  * from remote logical replication stream.
13  *
14  * The main worker (apply) is started by logical replication worker
15  * launcher for every enabled subscription in a database. It uses
16  * walsender protocol to communicate with publisher.
17  *
18  * This module includes server facing code and shares libpqwalreceiver
19  * module with walreceiver for providing the libpq specific functionality.
20  *
21  *
22  * STREAMED TRANSACTIONS
23  * ---------------------
24  * Streamed transactions (large transactions exceeding a memory limit on the
25  * upstream) are applied using one of two approaches:
26  *
27  * 1) Write to temporary files and apply when the final commit arrives
28  *
29  * This approach is used when the user has set the subscription's streaming
30  * option as on.
31  *
32  * Unlike the regular (non-streamed) case, handling streamed transactions has
33  * to handle aborts of both the toplevel transaction and subtransactions. This
34  * is achieved by tracking offsets for subtransactions, which is then used
35  * to truncate the file with serialized changes.
36  *
37  * The files are placed in tmp file directory by default, and the filenames
38  * include both the XID of the toplevel transaction and OID of the
39  * subscription. This is necessary so that different workers processing a
40  * remote transaction with the same XID don't interfere with each other.
41  *
42  * We use BufFiles instead of using normal temporary files because (a) the
43  * BufFile infrastructure supports temporary files that exceed the OS file size
44  * limit, (b) it provides automatic cleanup on error and (c) it allows these
45  * files to survive across local transactions, so that they can be opened at
46  * stream start and closed at stream stop. We decided to use FileSet
47  * infrastructure as without that it deletes the files on the closure of the
48  * file and if we decide to keep stream files open across the start/stop stream
49  * then it will consume a lot of memory (more than 8K for each BufFile and
50  * there could be multiple such BufFiles as the subscriber could receive
51  * multiple start/stop streams for different transactions before getting the
52  * commit). Moreover, if we don't use FileSet then we also need to invent
53  * a new way to pass filenames to BufFile APIs so that we are allowed to open
54  * the file we desired across multiple stream-open calls for the same
55  * transaction.
56  *
57  * 2) Parallel apply workers.
58  *
59  * This approach is used when the user has set the subscription's streaming
60  * option as parallel. See logical/applyparallelworker.c for information about
61  * this approach.
62  *
63  * TWO_PHASE TRANSACTIONS
64  * ----------------------
65  * Two phase transactions are replayed at prepare and then committed or
66  * rolled back at commit prepared and rollback prepared respectively. It is
67  * possible to have a prepared transaction that arrives at the apply worker
68  * when the tablesync is busy doing the initial copy. In this case, the apply
69  * worker skips all the prepared operations [e.g. inserts] while the tablesync
70  * is still busy (see the condition of should_apply_changes_for_rel). The
71  * tablesync worker might not get such a prepared transaction because say it
72  * was prior to the initial consistent point but might have got some later
73  * commits. Now, the tablesync worker will exit without doing anything for the
74  * prepared transaction skipped by the apply worker as the sync location for it
75  * will be already ahead of the apply worker's current location. This would lead
76  * to an "empty prepare", because later when the apply worker does the commit
77  * prepare, there is nothing in it (the inserts were skipped earlier).
78  *
79  * To avoid this, and similar prepare confusions the subscription's two_phase
80  * commit is enabled only after the initial sync is over. The two_phase option
81  * has been implemented as a tri-state with values DISABLED, PENDING, and
82  * ENABLED.
83  *
84  * Even if the user specifies they want a subscription with two_phase = on,
85  * internally it will start with a tri-state of PENDING which only becomes
86  * ENABLED after all tablesync initializations are completed - i.e. when all
87  * tablesync workers have reached their READY state. In other words, the value
88  * PENDING is only a temporary state for subscription start-up.
89  *
90  * Until the two_phase is properly available (ENABLED) the subscription will
91  * behave as if two_phase = off. When the apply worker detects that all
92  * tablesyncs have become READY (while the tri-state was PENDING) it will
93  * restart the apply worker process. This happens in
94  * process_syncing_tables_for_apply.
95  *
96  * When the (re-started) apply worker finds that all tablesyncs are READY for a
97  * two_phase tri-state of PENDING it starts streaming messages with the
98  * two_phase option which in turn enables the decoding of two-phase commits at
99  * the publisher. Then, it updates the tri-state value from PENDING to ENABLED.
100  * Now, it is possible that during the time we have not enabled two_phase, the
101  * publisher (replication server) would have skipped some prepares but we
102  * ensure that such prepares are sent along with commit prepare, see
103  * ReorderBufferFinishPrepared.
104  *
105  * If the subscription has no tables then a two_phase tri-state PENDING is
106  * left unchanged. This lets the user still do an ALTER SUBSCRIPTION REFRESH
107  * PUBLICATION which might otherwise be disallowed (see below).
108  *
109  * If ever a user needs to be aware of the tri-state value, they can fetch it
110  * from the pg_subscription catalog (see column subtwophasestate).
111  *
112  * We don't allow toggling the two_phase option of a subscription because it can
113  * lead to an inconsistent replica. Consider, initially, it was on and we have
114  * received some prepare then we turn it off, now at commit time the server
115  * will send the entire transaction data along with the commit. With some more
116  * analysis, we can allow changing this option from off to on but not sure if
117  * that alone would be useful.
118  *
119  * Finally, to avoid problems mentioned in previous paragraphs from any
120  * subsequent (not READY) tablesyncs (need to toggle two_phase option from 'on'
121  * to 'off' and then again back to 'on') there is a restriction for
122  * ALTER SUBSCRIPTION REFRESH PUBLICATION. This command is not permitted when
123  * the two_phase tri-state is ENABLED, except when copy_data = false.
124  *
125  * We can get prepare of the same GID more than once for the genuine cases
126  * where we have defined multiple subscriptions for publications on the same
127  * server and prepared transaction has operations on tables subscribed to those
128  * subscriptions. For such cases, if we use the GID sent by publisher one of
129  * the prepares will be successful and others will fail, in which case the
130  * server will send them again. Now, this can lead to a deadlock if user has
131  * set synchronous_standby_names for all the subscriptions on subscriber. To
132  * avoid such deadlocks, we generate a unique GID (consisting of the
133  * subscription oid and the xid of the prepared transaction) for each prepare
134  * transaction on the subscriber.
135  *
136  * FAILOVER
137  * ----------------------
138  * The logical slot on the primary can be synced to the standby by specifying
139  * failover = true when creating the subscription. Enabling failover allows us
140  * to smoothly transition to the promoted standby, ensuring that we can
141  * subscribe to the new primary without losing any data.
142  *-------------------------------------------------------------------------
143  */
144 
145 #include "postgres.h"
146 
147 #include <sys/stat.h>
148 #include <unistd.h>
149 
150 #include "access/table.h"
151 #include "access/tableam.h"
152 #include "access/twophase.h"
153 #include "access/xact.h"
154 #include "catalog/indexing.h"
155 #include "catalog/pg_inherits.h"
156 #include "catalog/pg_subscription.h"
158 #include "commands/tablecmds.h"
159 #include "commands/trigger.h"
160 #include "executor/executor.h"
161 #include "executor/execPartition.h"
162 #include "libpq/pqformat.h"
163 #include "miscadmin.h"
164 #include "optimizer/optimizer.h"
165 #include "parser/parse_relation.h"
166 #include "pgstat.h"
167 #include "postmaster/bgworker.h"
168 #include "postmaster/interrupt.h"
169 #include "postmaster/walwriter.h"
174 #include "replication/origin.h"
175 #include "replication/walreceiver.h"
177 #include "rewrite/rewriteHandler.h"
178 #include "storage/buffile.h"
179 #include "storage/ipc.h"
180 #include "storage/lmgr.h"
181 #include "tcop/tcopprot.h"
182 #include "utils/acl.h"
183 #include "utils/dynahash.h"
184 #include "utils/guc.h"
185 #include "utils/inval.h"
186 #include "utils/lsyscache.h"
187 #include "utils/memutils.h"
188 #include "utils/pg_lsn.h"
189 #include "utils/rel.h"
190 #include "utils/rls.h"
191 #include "utils/snapmgr.h"
192 #include "utils/syscache.h"
193 #include "utils/usercontext.h"
194 
195 #define NAPTIME_PER_CYCLE 1000 /* max sleep time between cycles (1s) */
196 
197 typedef struct FlushPosition
198 {
203 
205 
206 typedef struct ApplyExecutionData
207 {
208  EState *estate; /* executor state, used to track resources */
209 
210  LogicalRepRelMapEntry *targetRel; /* replication target rel */
211  ResultRelInfo *targetRelInfo; /* ResultRelInfo for same */
212 
213  /* These fields are used when the target relation is partitioned: */
214  ModifyTableState *mtstate; /* dummy ModifyTable state */
215  PartitionTupleRouting *proute; /* partition routing info */
217 
218 /* Struct for saving and restoring apply errcontext information */
219 typedef struct ApplyErrorCallbackArg
220 {
221  LogicalRepMsgType command; /* 0 if invalid */
223 
224  /* Remote node information */
225  int remote_attnum; /* -1 if invalid */
228  char *origin_name;
230 
231 /*
232  * The action to be taken for the changes in the transaction.
233  *
234  * TRANS_LEADER_APPLY:
235  * This action means that we are in the leader apply worker or table sync
236  * worker. The changes of the transaction are either directly applied or
237  * are read from temporary files (for streaming transactions) and then
238  * applied by the worker.
239  *
240  * TRANS_LEADER_SERIALIZE:
241  * This action means that we are in the leader apply worker or table sync
242  * worker. Changes are written to temporary files and then applied when the
243  * final commit arrives.
244  *
245  * TRANS_LEADER_SEND_TO_PARALLEL:
246  * This action means that we are in the leader apply worker and need to send
247  * the changes to the parallel apply worker.
248  *
249  * TRANS_LEADER_PARTIAL_SERIALIZE:
250  * This action means that we are in the leader apply worker and have sent some
251  * changes directly to the parallel apply worker and the remaining changes are
252  * serialized to a file, due to timeout while sending data. The parallel apply
253  * worker will apply these serialized changes when the final commit arrives.
254  *
255  * We can't use TRANS_LEADER_SERIALIZE for this case because, in addition to
256  * serializing changes, the leader worker also needs to serialize the
257  * STREAM_XXX message to a file, and wait for the parallel apply worker to
258  * finish the transaction when processing the transaction finish command. So
259  * this new action was introduced to keep the code and logic clear.
260  *
261  * TRANS_PARALLEL_APPLY:
262  * This action means that we are in the parallel apply worker and changes of
263  * the transaction are applied directly by the worker.
264  */
265 typedef enum
266 {
267  /* The action for non-streaming transactions. */
269 
270  /* Actions for streaming transactions. */
276 
277 /* errcontext tracker */
279 {
280  .command = 0,
281  .rel = NULL,
282  .remote_attnum = -1,
283  .remote_xid = InvalidTransactionId,
284  .finish_lsn = InvalidXLogRecPtr,
285  .origin_name = NULL,
286 };
287 
289 
292 
293 /* per stream context for streaming transactions */
295 
297 
299 static bool MySubscriptionValid = false;
300 
302 
305 
306 /* fields valid only when processing streamed transaction */
307 static bool in_streamed_transaction = false;
308 
310 
311 /*
312  * The number of changes applied by parallel apply worker during one streaming
313  * block.
314  */
316 
317 /* Are we initializing an apply worker? */
319 
320 /*
321  * We enable skipping all data modification changes (INSERT, UPDATE, etc.) for
322  * the subscription if the remote transaction's finish LSN matches the subskiplsn.
323  * Once we start skipping changes, we don't stop it until we skip all changes of
324  * the transaction even if pg_subscription is updated and MySubscription->skiplsn
325  * gets changed or reset during that. Also, in streaming transaction cases (streaming = on),
326  * we don't skip receiving and spooling the changes since we decide whether or not
327  * to skip applying the changes when starting to apply changes. The subskiplsn is
328  * cleared after successfully skipping the transaction or applying non-empty
329  * transaction. The latter prevents the mistakenly specified subskiplsn from
330  * being left. Note that we cannot skip the streaming transactions when using
331  * parallel apply workers because we cannot get the finish LSN before applying
332  * the changes. So, we don't start parallel apply worker when finish LSN is set
333  * by the user.
334  */
336 #define is_skipping_changes() (unlikely(!XLogRecPtrIsInvalid(skip_xact_finish_lsn)))
337 
338 /* BufFile handle of the current streaming file */
339 static BufFile *stream_fd = NULL;
340 
341 typedef struct SubXactInfo
342 {
343  TransactionId xid; /* XID of the subxact */
344  int fileno; /* file number in the buffile */
345  off_t offset; /* offset in the file */
347 
348 /* Sub-transaction data for the current streaming transaction */
349 typedef struct ApplySubXactData
350 {
351  uint32 nsubxacts; /* number of sub-transactions */
352  uint32 nsubxacts_max; /* current capacity of subxacts */
353  TransactionId subxact_last; /* xid of the last sub-transaction */
354  SubXactInfo *subxacts; /* sub-xact offset in changes file */
356 
358 
359 static inline void subxact_filename(char *path, Oid subid, TransactionId xid);
360 static inline void changes_filename(char *path, Oid subid, TransactionId xid);
361 
362 /*
363  * Information about subtransactions of a given toplevel transaction.
364  */
365 static void subxact_info_write(Oid subid, TransactionId xid);
366 static void subxact_info_read(Oid subid, TransactionId xid);
367 static void subxact_info_add(TransactionId xid);
368 static inline void cleanup_subxact_info(void);
369 
370 /*
371  * Serialize and deserialize changes for a toplevel transaction.
372  */
373 static void stream_open_file(Oid subid, TransactionId xid,
374  bool first_segment);
375 static void stream_write_change(char action, StringInfo s);
377 static void stream_close_file(void);
378 
379 static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply);
380 
381 static void apply_handle_commit_internal(LogicalRepCommitData *commit_data);
383  ResultRelInfo *relinfo,
384  TupleTableSlot *remoteslot);
386  ResultRelInfo *relinfo,
387  TupleTableSlot *remoteslot,
388  LogicalRepTupleData *newtup,
389  Oid localindexoid);
391  ResultRelInfo *relinfo,
392  TupleTableSlot *remoteslot,
393  Oid localindexoid);
394 static bool FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel,
395  LogicalRepRelation *remoterel,
396  Oid localidxoid,
397  TupleTableSlot *remoteslot,
398  TupleTableSlot **localslot);
400  TupleTableSlot *remoteslot,
401  LogicalRepTupleData *newtup,
402  CmdType operation);
403 
404 /* Compute GID for two_phase transactions */
405 static void TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid, int szgid);
406 
407 /* Functions for skipping changes */
408 static void maybe_start_skipping_changes(XLogRecPtr finish_lsn);
409 static void stop_skipping_changes(void);
410 static void clear_subscription_skip_lsn(XLogRecPtr finish_lsn);
411 
412 /* Functions for apply error callback */
413 static inline void set_apply_error_context_xact(TransactionId xid, XLogRecPtr lsn);
414 static inline void reset_apply_error_context_info(void);
415 
417  ParallelApplyWorkerInfo **winfo);
418 
419 /*
420  * Form the origin name for the subscription.
421  *
422  * This is a common function for tablesync and other workers. Tablesync workers
423  * must pass a valid relid. Other callers must pass relid = InvalidOid.
424  *
425  * Return the name in the supplied buffer.
426  */
427 void
429  char *originname, Size szoriginname)
430 {
431  if (OidIsValid(relid))
432  {
433  /* Replication origin name for tablesync workers. */
434  snprintf(originname, szoriginname, "pg_%u_%u", suboid, relid);
435  }
436  else
437  {
438  /* Replication origin name for non-tablesync workers. */
439  snprintf(originname, szoriginname, "pg_%u", suboid);
440  }
441 }
442 
443 /*
444  * Should this worker apply changes for given relation.
445  *
446  * This is mainly needed for initial relation data sync as that runs in
447  * separate worker process running in parallel and we need some way to skip
448  * changes coming to the leader apply worker during the sync of a table.
449  *
450  * Note we need to do smaller or equals comparison for SYNCDONE state because
451  * it might hold position of end of initial slot consistent point WAL
452  * record + 1 (ie start of next record) and next record can be COMMIT of
453  * transaction we are now processing (which is what we set remote_final_lsn
454  * to in apply_handle_begin).
455  *
456  * Note that for streaming transactions that are being applied in the parallel
457  * apply worker, we disallow applying changes if the target table in the
458  * subscription is not in the READY state, because we cannot decide whether to
459  * apply the change as we won't know remote_final_lsn by that time.
460  *
461  * We already checked this in pa_can_start() before assigning the
462  * streaming transaction to the parallel worker, but it also needs to be
463  * checked here because if the user executes ALTER SUBSCRIPTION ... REFRESH
464  * PUBLICATION in parallel, the new table can be added to pg_subscription_rel
465  * while applying this transaction.
466  */
467 static bool
469 {
470  switch (MyLogicalRepWorker->type)
471  {
473  return MyLogicalRepWorker->relid == rel->localreloid;
474 
476  /* We don't synchronize rel's that are in unknown state. */
477  if (rel->state != SUBREL_STATE_READY &&
478  rel->state != SUBREL_STATE_UNKNOWN)
479  ereport(ERROR,
480  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
481  errmsg("logical replication parallel apply worker for subscription \"%s\" will stop",
483  errdetail("Cannot handle streamed replication transactions using parallel apply workers until all tables have been synchronized.")));
484 
485  return rel->state == SUBREL_STATE_READY;
486 
487  case WORKERTYPE_APPLY:
488  return (rel->state == SUBREL_STATE_READY ||
489  (rel->state == SUBREL_STATE_SYNCDONE &&
490  rel->statelsn <= remote_final_lsn));
491 
492  case WORKERTYPE_UNKNOWN:
493  /* Should never happen. */
494  elog(ERROR, "Unknown worker type");
495  }
496 
497  return false; /* dummy for compiler */
498 }
499 
500 /*
501  * Begin one step (one INSERT, UPDATE, etc) of a replication transaction.
502  *
503  * Start a transaction, if this is the first step (else we keep using the
504  * existing transaction).
505  * Also provide a global snapshot and ensure we run in ApplyMessageContext.
506  */
507 static void
509 {
511 
512  if (!IsTransactionState())
513  {
516  }
517 
519 
521 }
522 
523 /*
524  * Finish up one step of a replication transaction.
525  * Callers of begin_replication_step() must also call this.
526  *
527  * We don't close out the transaction here, but we should increment
528  * the command counter to make the effects of this step visible.
529  */
530 static void
532 {
534 
536 }
537 
538 /*
539  * Handle streamed transactions for both the leader apply worker and the
540  * parallel apply workers.
541  *
542  * In the streaming case (receiving a block of the streamed transaction), for
543  * serialize mode, simply redirect it to a file for the proper toplevel
544  * transaction, and for parallel mode, the leader apply worker will send the
545  * changes to parallel apply workers and the parallel apply worker will define
546  * savepoints if needed. (LOGICAL_REP_MSG_RELATION or LOGICAL_REP_MSG_TYPE
547  * messages will be applied by both leader apply worker and parallel apply
548  * workers).
549  *
550  * Returns true for streamed transactions (when the change is either serialized
551  * to file or sent to parallel apply worker), false otherwise (regular mode or
552  * needs to be processed by parallel apply worker).
553  *
554  * Exception: If the message being processed is LOGICAL_REP_MSG_RELATION
555  * or LOGICAL_REP_MSG_TYPE, return false even if the message needs to be sent
556  * to a parallel apply worker.
557  */
558 static bool
560 {
561  TransactionId current_xid;
563  TransApplyAction apply_action;
564  StringInfoData original_msg;
565 
566  apply_action = get_transaction_apply_action(stream_xid, &winfo);
567 
568  /* not in streaming mode */
569  if (apply_action == TRANS_LEADER_APPLY)
570  return false;
571 
573 
574  /*
575  * The parallel apply worker needs the xid in this message to decide
576  * whether to define a savepoint, so save the original message that has
577  * not moved the cursor after the xid. We will serialize this message to a
578  * file in PARTIAL_SERIALIZE mode.
579  */
580  original_msg = *s;
581 
582  /*
583  * We should have received XID of the subxact as the first part of the
584  * message, so extract it.
585  */
586  current_xid = pq_getmsgint(s, 4);
587 
588  if (!TransactionIdIsValid(current_xid))
589  ereport(ERROR,
590  (errcode(ERRCODE_PROTOCOL_VIOLATION),
591  errmsg_internal("invalid transaction ID in streamed replication transaction")));
592 
593  switch (apply_action)
594  {
596  Assert(stream_fd);
597 
598  /* Add the new subxact to the array (unless already there). */
599  subxact_info_add(current_xid);
600 
601  /* Write the change to the current file */
603  return true;
604 
606  Assert(winfo);
607 
608  /*
609  * XXX The publisher side doesn't always send relation/type update
610  * messages after the streaming transaction, so also update the
611  * relation/type in leader apply worker. See function
612  * cleanup_rel_sync_cache.
613  */
614  if (pa_send_data(winfo, s->len, s->data))
615  return (action != LOGICAL_REP_MSG_RELATION &&
617 
618  /*
619  * Switch to serialize mode when we are not able to send the
620  * change to parallel apply worker.
621  */
622  pa_switch_to_partial_serialize(winfo, false);
623 
624  /* fall through */
626  stream_write_change(action, &original_msg);
627 
628  /* Same reason as TRANS_LEADER_SEND_TO_PARALLEL case. */
629  return (action != LOGICAL_REP_MSG_RELATION &&
631 
634 
635  /* Define a savepoint for a subxact if needed. */
636  pa_start_subtrans(current_xid, stream_xid);
637  return false;
638 
639  default:
640  elog(ERROR, "unexpected apply action: %d", (int) apply_action);
641  return false; /* silence compiler warning */
642  }
643 }
644 
645 /*
646  * Executor state preparation for evaluation of constraint expressions,
647  * indexes and triggers for the specified relation.
648  *
649  * Note that the caller must open and close any indexes to be updated.
650  */
651 static ApplyExecutionData *
653 {
654  ApplyExecutionData *edata;
655  EState *estate;
656  RangeTblEntry *rte;
657  List *perminfos = NIL;
658  ResultRelInfo *resultRelInfo;
659 
660  edata = (ApplyExecutionData *) palloc0(sizeof(ApplyExecutionData));
661  edata->targetRel = rel;
662 
663  edata->estate = estate = CreateExecutorState();
664 
665  rte = makeNode(RangeTblEntry);
666  rte->rtekind = RTE_RELATION;
667  rte->relid = RelationGetRelid(rel->localrel);
668  rte->relkind = rel->localrel->rd_rel->relkind;
669  rte->rellockmode = AccessShareLock;
670 
671  addRTEPermissionInfo(&perminfos, rte);
672 
673  ExecInitRangeTable(estate, list_make1(rte), perminfos);
674 
675  edata->targetRelInfo = resultRelInfo = makeNode(ResultRelInfo);
676 
677  /*
678  * Use Relation opened by logicalrep_rel_open() instead of opening it
679  * again.
680  */
681  InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0);
682 
683  /*
684  * We put the ResultRelInfo in the es_opened_result_relations list, even
685  * though we don't populate the es_result_relations array. That's a bit
686  * bogus, but it's enough to make ExecGetTriggerResultRel() find them.
687  *
688  * ExecOpenIndices() is not called here either, each execution path doing
689  * an apply operation being responsible for that.
690  */
692  lappend(estate->es_opened_result_relations, resultRelInfo);
693 
694  estate->es_output_cid = GetCurrentCommandId(true);
695 
696  /* Prepare to catch AFTER triggers. */
698 
699  /* other fields of edata remain NULL for now */
700 
701  return edata;
702 }
703 
704 /*
705  * Finish any operations related to the executor state created by
706  * create_edata_for_relation().
707  */
708 static void
710 {
711  EState *estate = edata->estate;
712 
713  /* Handle any queued AFTER triggers. */
714  AfterTriggerEndQuery(estate);
715 
716  /* Shut down tuple routing, if any was done. */
717  if (edata->proute)
718  ExecCleanupTupleRouting(edata->mtstate, edata->proute);
719 
720  /*
721  * Cleanup. It might seem that we should call ExecCloseResultRelations()
722  * here, but we intentionally don't. It would close the rel we added to
723  * es_opened_result_relations above, which is wrong because we took no
724  * corresponding refcount. We rely on ExecCleanupTupleRouting() to close
725  * any other relations opened during execution.
726  */
727  ExecResetTupleTable(estate->es_tupleTable, false);
728  FreeExecutorState(estate);
729  pfree(edata);
730 }
731 
732 /*
733  * Executes default values for columns for which we can't map to remote
734  * relation columns.
735  *
736  * This allows us to support tables which have more columns on the downstream
737  * than on the upstream.
738  */
739 static void
741  TupleTableSlot *slot)
742 {
743  TupleDesc desc = RelationGetDescr(rel->localrel);
744  int num_phys_attrs = desc->natts;
745  int i;
746  int attnum,
747  num_defaults = 0;
748  int *defmap;
749  ExprState **defexprs;
750  ExprContext *econtext;
751 
752  econtext = GetPerTupleExprContext(estate);
753 
754  /* We got all the data via replication, no need to evaluate anything. */
755  if (num_phys_attrs == rel->remoterel.natts)
756  return;
757 
758  defmap = (int *) palloc(num_phys_attrs * sizeof(int));
759  defexprs = (ExprState **) palloc(num_phys_attrs * sizeof(ExprState *));
760 
761  Assert(rel->attrmap->maplen == num_phys_attrs);
762  for (attnum = 0; attnum < num_phys_attrs; attnum++)
763  {
764  Expr *defexpr;
765 
766  if (TupleDescAttr(desc, attnum)->attisdropped || TupleDescAttr(desc, attnum)->attgenerated)
767  continue;
768 
769  if (rel->attrmap->attnums[attnum] >= 0)
770  continue;
771 
772  defexpr = (Expr *) build_column_default(rel->localrel, attnum + 1);
773 
774  if (defexpr != NULL)
775  {
776  /* Run the expression through planner */
777  defexpr = expression_planner(defexpr);
778 
779  /* Initialize executable expression in copycontext */
780  defexprs[num_defaults] = ExecInitExpr(defexpr, NULL);
781  defmap[num_defaults] = attnum;
782  num_defaults++;
783  }
784  }
785 
786  for (i = 0; i < num_defaults; i++)
787  slot->tts_values[defmap[i]] =
788  ExecEvalExpr(defexprs[i], econtext, &slot->tts_isnull[defmap[i]]);
789 }
790 
791 /*
792  * Store tuple data into slot.
793  *
794  * Incoming data can be either text or binary format.
795  */
796 static void
798  LogicalRepTupleData *tupleData)
799 {
800  int natts = slot->tts_tupleDescriptor->natts;
801  int i;
802 
803  ExecClearTuple(slot);
804 
805  /* Call the "in" function for each non-dropped, non-null attribute */
806  Assert(natts == rel->attrmap->maplen);
807  for (i = 0; i < natts; i++)
808  {
810  int remoteattnum = rel->attrmap->attnums[i];
811 
812  if (!att->attisdropped && remoteattnum >= 0)
813  {
814  StringInfo colvalue = &tupleData->colvalues[remoteattnum];
815 
816  Assert(remoteattnum < tupleData->ncols);
817 
818  /* Set attnum for error callback */
820 
821  if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT)
822  {
823  Oid typinput;
824  Oid typioparam;
825 
826  getTypeInputInfo(att->atttypid, &typinput, &typioparam);
827  slot->tts_values[i] =
828  OidInputFunctionCall(typinput, colvalue->data,
829  typioparam, att->atttypmod);
830  slot->tts_isnull[i] = false;
831  }
832  else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY)
833  {
834  Oid typreceive;
835  Oid typioparam;
836 
837  /*
838  * In some code paths we may be asked to re-parse the same
839  * tuple data. Reset the StringInfo's cursor so that works.
840  */
841  colvalue->cursor = 0;
842 
843  getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam);
844  slot->tts_values[i] =
845  OidReceiveFunctionCall(typreceive, colvalue,
846  typioparam, att->atttypmod);
847 
848  /* Trouble if it didn't eat the whole buffer */
849  if (colvalue->cursor != colvalue->len)
850  ereport(ERROR,
851  (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
852  errmsg("incorrect binary data format in logical replication column %d",
853  remoteattnum + 1)));
854  slot->tts_isnull[i] = false;
855  }
856  else
857  {
858  /*
859  * NULL value from remote. (We don't expect to see
860  * LOGICALREP_COLUMN_UNCHANGED here, but if we do, treat it as
861  * NULL.)
862  */
863  slot->tts_values[i] = (Datum) 0;
864  slot->tts_isnull[i] = true;
865  }
866 
867  /* Reset attnum for error callback */
869  }
870  else
871  {
872  /*
873  * We assign NULL to dropped attributes and missing values
874  * (missing values should be later filled using
875  * slot_fill_defaults).
876  */
877  slot->tts_values[i] = (Datum) 0;
878  slot->tts_isnull[i] = true;
879  }
880  }
881 
882  ExecStoreVirtualTuple(slot);
883 }
884 
885 /*
886  * Replace updated columns with data from the LogicalRepTupleData struct.
887  * This is somewhat similar to heap_modify_tuple but also calls the type
888  * input functions on the user data.
889  *
890  * "slot" is filled with a copy of the tuple in "srcslot", replacing
891  * columns provided in "tupleData" and leaving others as-is.
892  *
893  * Caution: unreplaced pass-by-ref columns in "slot" will point into the
894  * storage for "srcslot". This is OK for current usage, but someday we may
895  * need to materialize "slot" at the end to make it independent of "srcslot".
     *
     * For every local attribute that maps to a remote column
     * (rel->attrmap->attnums[i] >= 0) and whose column status is not
     * LOGICALREP_COLUMN_UNCHANGED, the new value is built according to the
     * column status: text data goes through the type's input function, binary
     * data through the type's receive function, and any other status is stored
     * as NULL. Binary data that the receive function does not consume
     * completely is reported as ERRCODE_INVALID_BINARY_REPRESENTATION.
     *
     * NOTE(review): the embedded listing numbers skip 898-899, 921, 934 and
     * 979, so the function name, the leading parameters, the declaration of
     * "att" and the error-callback attnum bookkeeping are not visible in this
     * copy.
896  */
897 static void
900  LogicalRepTupleData *tupleData)
901 {
902  int natts = slot->tts_tupleDescriptor->natts;
903  int i;
904 
905  /* We'll fill "slot" with a virtual tuple, so we must start with ... */
906  ExecClearTuple(slot);
907 
908  /*
909  * Copy all the column data from srcslot, so that we'll have valid values
910  * for unreplaced columns.
911  */
912  Assert(natts == srcslot->tts_tupleDescriptor->natts);
913  slot_getallattrs(srcslot);
914  memcpy(slot->tts_values, srcslot->tts_values, natts * sizeof(Datum));
915  memcpy(slot->tts_isnull, srcslot->tts_isnull, natts * sizeof(bool));
916 
917  /* Call the "in" function for each replaced attribute */
918  Assert(natts == rel->attrmap->maplen);
919  for (i = 0; i < natts; i++)
920  {
922  int remoteattnum = rel->attrmap->attnums[i];
923 
     /* No remote counterpart for this column: keep the value copied above. */
924  if (remoteattnum < 0)
925  continue;
926 
927  Assert(remoteattnum < tupleData->ncols);
928 
929  if (tupleData->colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED)
930  {
931  StringInfo colvalue = &tupleData->colvalues[remoteattnum];
932 
933  /* Set attnum for error callback */
935 
936  if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT)
937  {
938  Oid typinput;
939  Oid typioparam;
940 
     /* Text format: convert with the type's input function. */
941  getTypeInputInfo(att->atttypid, &typinput, &typioparam);
942  slot->tts_values[i] =
943  OidInputFunctionCall(typinput, colvalue->data,
944  typioparam, att->atttypmod);
945  slot->tts_isnull[i] = false;
946  }
947  else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY)
948  {
949  Oid typreceive;
950  Oid typioparam;
951 
952  /*
953  * In some code paths we may be asked to re-parse the same
954  * tuple data. Reset the StringInfo's cursor so that works.
955  */
956  colvalue->cursor = 0;
957 
     /* Binary format: convert with the type's receive function. */
958  getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam);
959  slot->tts_values[i] =
960  OidReceiveFunctionCall(typreceive, colvalue,
961  typioparam, att->atttypmod);
962 
963  /* Trouble if it didn't eat the whole buffer */
964  if (colvalue->cursor != colvalue->len)
965  ereport(ERROR,
966  (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
967  errmsg("incorrect binary data format in logical replication column %d",
968  remoteattnum + 1)));
969  slot->tts_isnull[i] = false;
970  }
971  else
972  {
973  /* must be LOGICALREP_COLUMN_NULL */
974  slot->tts_values[i] = (Datum) 0;
975  slot->tts_isnull[i] = true;
976  }
977 
978  /* Reset attnum for error callback */
980  }
981  }
982 
983  /* And finally, declare that "slot" contains a valid virtual tuple */
984  ExecStoreVirtualTuple(slot);
985 }
986 
987 /*
988  * Handle BEGIN message.
     *
     * Reads the BEGIN payload from "s", registers the transaction's xid and
     * final LSN for error-context reporting, remembers the final LSN in
     * remote_final_lsn and sets in_remote_transaction.
     *
     * NOTE(review): the embedded listing numbers skip 991, 996, 1003 and 1007,
     * so the function signature and three statements are not visible in this
     * copy.
989  */
990 static void
992 {
993  LogicalRepBeginData begin_data;
994 
995  /* There must not be an active streaming transaction. */
997 
998  logicalrep_read_begin(s, &begin_data);
999  set_apply_error_context_xact(begin_data.xid, begin_data.final_lsn);
1000 
      /* The COMMIT handler compares the commit LSN it receives against this. */
1001  remote_final_lsn = begin_data.final_lsn;
1002 
1004 
1005  in_remote_transaction = true;
1006 
1008 }
1009 
1010 /*
1011  * Handle COMMIT message.
      *
      * Reads the COMMIT payload from "s" and raises a protocol violation if its
      * commit LSN differs from remote_final_lsn. Otherwise the commit data is
      * handed to apply_handle_commit_internal() and process_syncing_tables() is
      * called with the commit's end LSN.
      *
      * NOTE(review): the embedded listing numbers skip 1016, 1027 and
      * 1034-1035, so the function signature, the end of the ereport() call and
      * the final statements are not visible in this copy.
1012  *
1013  * TODO, support tracking of multiple origins
1014  */
1015 static void
1017 {
1018  LogicalRepCommitData commit_data;
1019 
1020  logicalrep_read_commit(s, &commit_data);
1021 
      /* The commit LSN must match the one recorded in remote_final_lsn. */
1022  if (commit_data.commit_lsn != remote_final_lsn)
1023  ereport(ERROR,
1024  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1025  errmsg_internal("incorrect commit LSN %X/%X in commit message (expected %X/%X)",
1026  LSN_FORMAT_ARGS(commit_data.commit_lsn),
1028 
1029  apply_handle_commit_internal(&commit_data);
1030 
1031  /* Process any tables that are being synchronized in parallel. */
1032  process_syncing_tables(commit_data.end_lsn);
1033 
1036 }
1037 
1038 /*
1039  * Handle BEGIN PREPARE message.
      *
      * A tablesync worker that receives this message reports a protocol
      * violation. Otherwise the payload is read from "s", the xid and prepare
      * LSN are registered for error-context reporting, remote_final_lsn is set
      * to the prepare LSN and in_remote_transaction is set.
      *
      * NOTE(review): the embedded listing numbers skip 1042, 1053, 1060 and
      * 1064, so the function signature and three statements are not visible in
      * this copy.
1040  */
1041 static void
1043 {
1044  LogicalRepPreparedTxnData begin_data;
1045 
1046  /* Tablesync should never receive prepare. */
1047  if (am_tablesync_worker())
1048  ereport(ERROR,
1049  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1050  errmsg_internal("tablesync worker received a BEGIN PREPARE message")));
1051 
1052  /* There must not be an active streaming transaction. */
1054 
1055  logicalrep_read_begin_prepare(s, &begin_data);
1056  set_apply_error_context_xact(begin_data.xid, begin_data.prepare_lsn);
1057 
      /* The PREPARE handler compares the prepare LSN it receives against this. */
1058  remote_final_lsn = begin_data.prepare_lsn;
1059 
1061 
1062  in_remote_transaction = true;
1063 
1065 }
1066 
1067 /*
1068  * Common function to prepare the GID.
      *
      * Builds a subscriber-side GID from the subscription OID and the remote
      * xid found in "prepare_data", makes sure a transaction block is open, and
      * sets replorigin_session_origin_lsn to the prepare's end LSN.
      *
      * NOTE(review): the embedded listing numbers skip 1071, 1090, 1099 and
      * 1101, so the function signature, the statement that opens the
      * transaction block and whatever follows the origin LSN assignment
      * (presumably the actual prepare call -- confirm against the full file)
      * are not visible in this copy.
1069  */
1070 static void
1072 {
1073  char gid[GIDSIZE];
1074 
1075  /*
1076  * Compute unique GID for two_phase transactions. We don't use GID of
1077  * prepared transaction sent by server as that can lead to deadlock when
1078  * we have multiple subscriptions from same node point to publications on
1079  * the same node. See comments atop worker.c
1080  */
1081  TwoPhaseTransactionGid(MySubscription->oid, prepare_data->xid,
1082  gid, sizeof(gid));
1083 
1084  /*
1085  * BeginTransactionBlock is necessary to balance the EndTransactionBlock
1086  * called within the PrepareTransactionBlock below.
1087  */
1088  if (!IsTransactionBlock())
1089  {
1091  CommitTransactionCommand(); /* Completes the preceding Begin command. */
1092  }
1093 
1094  /*
1095  * Update origin state so we can restart streaming from correct position
1096  * in case of crash.
1097  */
1098  replorigin_session_origin_lsn = prepare_data->end_lsn;
1100 
1102 }
1103 
1104 /*
1105  * Handle PREPARE message.
      *
      * Reads the PREPARE payload from "s" and raises a protocol violation if
      * its prepare LSN differs from remote_final_lsn. The transaction is then
      * prepared through apply_handle_prepare_internal(), statistics are
      * reported, in_remote_transaction is cleared and process_syncing_tables()
      * is called with the prepare's end LSN.
      *
      * NOTE(review): the embedded listing numbers skip 1108, 1119, 1131,
      * 1135-1136, 1139, 1152-1153 and 1155-1156, so the function signature,
      * the end of the ereport() call and several statements are not visible in
      * this copy.
1106  */
1107 static void
1109 {
1110  LogicalRepPreparedTxnData prepare_data;
1111 
1112  logicalrep_read_prepare(s, &prepare_data);
1113 
      /* The prepare LSN must match the one recorded in remote_final_lsn. */
1114  if (prepare_data.prepare_lsn != remote_final_lsn)
1115  ereport(ERROR,
1116  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1117  errmsg_internal("incorrect prepare LSN %X/%X in prepare message (expected %X/%X)",
1118  LSN_FORMAT_ARGS(prepare_data.prepare_lsn),
1120 
1121  /*
1122  * Unlike commit, here, we always prepare the transaction even though no
1123  * change has happened in this transaction or all changes are skipped. It
1124  * is done this way because at commit prepared time, we won't know whether
1125  * we have skipped preparing a transaction because of those reasons.
1126  *
1127  * XXX, We can optimize such that at commit prepared time, we first check
1128  * whether we have prepared the transaction or not but that doesn't seem
1129  * worthwhile because such cases shouldn't be common.
1130  */
1132 
1133  apply_handle_prepare_internal(&prepare_data);
1134 
1137  pgstat_report_stat(false);
1138 
1140 
1141  in_remote_transaction = false;
1142 
1143  /* Process any tables that are being synchronized in parallel. */
1144  process_syncing_tables(prepare_data.end_lsn);
1145 
1146  /*
1147  * Since we have already prepared the transaction, in a case where the
1148  * server crashes before clearing the subskiplsn, it will be left but the
1149  * transaction won't be resent. But that's okay because it's a rare case
1150  * and the subskiplsn will be cleared when finishing the next transaction.
1151  */
1154 
1157 }
1158 
1159 /*
1160  * Handle a COMMIT PREPARED of a previously PREPARED transaction.
1161  *
1162  * Note that we don't need to wait here if the transaction was prepared in a
1163  * parallel apply worker. In that case, we have already waited for the prepare
1164  * to finish in apply_handle_stream_prepare() which will ensure all the
1165  * operations in that transaction have happened in the subscriber, so no
1166  * concurrent transaction can cause deadlock or transaction dependency issues.
      *
      * The visible steps are: read the payload from "s", register xid and
      * commit LSN for error-context reporting, set
      * replorigin_session_origin_lsn to the end LSN, call
      * FinishPreparedTransaction() with "true" as its second argument, report
      * statistics, clear in_remote_transaction, call process_syncing_tables()
      * and clear_subscription_skip_lsn() with the end LSN.
      *
      * NOTE(review): the embedded listing numbers skip 1169, 1178, 1182, 1189,
      * 1192-1193, 1196 and 1204-1205, so the function signature, the start of
      * the GID computation and several statements are not visible in this copy.
1167  */
1168 static void
1170 {
1171  LogicalRepCommitPreparedTxnData prepare_data;
1172  char gid[GIDSIZE];
1173 
1174  logicalrep_read_commit_prepared(s, &prepare_data);
1175  set_apply_error_context_xact(prepare_data.xid, prepare_data.commit_lsn);
1176 
1177  /* Compute GID for two_phase transactions. */
1179  gid, sizeof(gid));
1180 
1181  /* There is no transaction when COMMIT PREPARED is called */
1183 
1184  /*
1185  * Update origin state so we can restart streaming from correct position
1186  * in case of crash.
1187  */
1188  replorigin_session_origin_lsn = prepare_data.end_lsn;
1190 
      /* Second argument is true here; the ROLLBACK PREPARED handler passes false. */
1191  FinishPreparedTransaction(gid, true);
1194  pgstat_report_stat(false);
1195 
1197  in_remote_transaction = false;
1198 
1199  /* Process any tables that are being synchronized in parallel. */
1200  process_syncing_tables(prepare_data.end_lsn);
1201 
1202  clear_subscription_skip_lsn(prepare_data.end_lsn);
1203 
1206 }
1207 
1208 /*
1209  * Handle a ROLLBACK PREPARED of a previously PREPARED TRANSACTION.
1210  *
1211  * Note that we don't need to wait here if the transaction was prepared in a
1212  * parallel apply worker. In that case, we have already waited for the prepare
1213  * to finish in apply_handle_stream_prepare() which will ensure all the
1214  * operations in that transaction have happened in the subscriber, so no
1215  * concurrent transaction can cause deadlock or transaction dependency issues.
      *
      * The prepared transaction is finished (FinishPreparedTransaction() with
      * "false" as its second argument) only when LookupGXact() finds a matching
      * entry for the computed GID, prepare end LSN and prepare time. Statistics
      * are reported and in_remote_transaction is cleared in either case.
      *
      * NOTE(review): the embedded listing numbers skip 1218, 1243-1244, 1247,
      * 1249-1250, 1252, 1257, 1261 and 1263-1264, so the function signature and
      * several statements (including those the surviving comments refer to)
      * are not visible in this copy.
1216  */
1217 static void
1219 {
1220  LogicalRepRollbackPreparedTxnData rollback_data;
1221  char gid[GIDSIZE];
1222 
1223  logicalrep_read_rollback_prepared(s, &rollback_data);
1224  set_apply_error_context_xact(rollback_data.xid, rollback_data.rollback_end_lsn);
1225 
1226  /* Compute GID for two_phase transactions. */
1227  TwoPhaseTransactionGid(MySubscription->oid, rollback_data.xid,
1228  gid, sizeof(gid));
1229 
1230  /*
1231  * It is possible that we haven't received prepare because it occurred
1232  * before walsender reached a consistent point or the two_phase was still
1233  * not enabled by that time, so in such cases, we need to skip rollback
1234  * prepared.
1235  */
1236  if (LookupGXact(gid, rollback_data.prepare_end_lsn,
1237  rollback_data.prepare_time))
1238  {
1239  /*
1240  * Update origin state so we can restart streaming from correct
1241  * position in case of crash.
1242  */
1245 
1246  /* There is no transaction when ABORT/ROLLBACK PREPARED is called */
1248  FinishPreparedTransaction(gid, false);
1251 
1253  }
1254 
1255  pgstat_report_stat(false);
1256 
1258  in_remote_transaction = false;
1259 
1260  /* Process any tables that are being synchronized in parallel. */
1262 
1265 }
1266 
1267 /*
1268  * Handle STREAM PREPARE.
      *
      * A tablesync worker that receives this message reports a protocol
      * violation. Otherwise the payload is read from "s" and the work is
      * dispatched on the action returned by get_transaction_apply_action()
      * for the transaction's xid. After the switch, statistics are reported and
      * process_syncing_tables() is called with the prepare's end LSN.
      *
      * NOTE(review): the embedded listing numbering has many gaps in this
      * function (for example 1271, 1280, 1304, 1322, 1339 and 1343), so the
      * function signature, the condition of the first ereport(), two case
      * labels and a number of calls are not visible in this copy.
1269  */
1270 static void
1272 {
1273  LogicalRepPreparedTxnData prepare_data;
1274  ParallelApplyWorkerInfo *winfo;
1275  TransApplyAction apply_action;
1276 
1277  /* Save the message before it is consumed. */
1278  StringInfoData original_msg = *s;
1279 
1281  ereport(ERROR,
1282  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1283  errmsg_internal("STREAM PREPARE message without STREAM STOP")));
1284 
1285  /* Tablesync should never receive prepare. */
1286  if (am_tablesync_worker())
1287  ereport(ERROR,
1288  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1289  errmsg_internal("tablesync worker received a STREAM PREPARE message")));
1290 
1291  logicalrep_read_stream_prepare(s, &prepare_data);
1292  set_apply_error_context_xact(prepare_data.xid, prepare_data.prepare_lsn);
1293 
1294  apply_action = get_transaction_apply_action(prepare_data.xid, &winfo);
1295 
1296  switch (apply_action)
1297  {
1298  case TRANS_LEADER_APPLY:
1299 
1300  /*
1301  * The transaction has been serialized to file, so replay all the
1302  * spooled operations.
1303  */
1305  prepare_data.xid, prepare_data.prepare_lsn);
1306 
1307  /* Mark the transaction as prepared. */
1308  apply_handle_prepare_internal(&prepare_data);
1309 
1311 
1313 
1314  in_remote_transaction = false;
1315 
1316  /* Unlink the files with serialized changes and subxact info. */
1318 
1319  elog(DEBUG1, "finished processing the STREAM PREPARE command");
1320  break;
1321 
      /* NOTE(review): a case label (listing line 1322) is missing here. */
1323  Assert(winfo);
1324 
      /* On a successful send the parallel apply worker takes over from here. */
1325  if (pa_send_data(winfo, s->len, s->data))
1326  {
1327  /* Finish processing the streaming transaction. */
1328  pa_xact_finish(winfo, prepare_data.end_lsn);
1329  break;
1330  }
1331 
1332  /*
1333  * Switch to serialize mode when we are not able to send the
1334  * change to parallel apply worker.
1335  */
1336  pa_switch_to_partial_serialize(winfo, true);
1337 
1338  /* fall through */
      /* NOTE(review): a case label (listing line 1339) is missing here. */
1340  Assert(winfo);
1341 
      /* Spool the saved copy of the message, since "s" has been consumed. */
1342  stream_open_and_write_change(prepare_data.xid,
1344  &original_msg);
1345 
1347 
1348  /* Finish processing the streaming transaction. */
1349  pa_xact_finish(winfo, prepare_data.end_lsn);
1350  break;
1351 
1352  case TRANS_PARALLEL_APPLY:
1353 
1354  /*
1355  * If the parallel apply worker is applying spooled messages then
1356  * close the file before preparing.
1357  */
1358  if (stream_fd)
1360 
1362 
1363  /* Mark the transaction as prepared. */
1364  apply_handle_prepare_internal(&prepare_data);
1365 
1367 
1369 
1371 
1374 
1376 
1377  elog(DEBUG1, "finished processing the STREAM PREPARE command");
1378  break;
1379 
1380  default:
1381  elog(ERROR, "unexpected apply action: %d", (int) apply_action);
1382  break;
1383  }
1384 
1385  pgstat_report_stat(false);
1386 
1387  /* Process any tables that are being synchronized in parallel. */
1388  process_syncing_tables(prepare_data.end_lsn);
1389 
1390  /*
1391  * Similar to prepare case, the subskiplsn could be left in a case of
1392  * server crash but it's okay. See the comments in apply_handle_prepare().
1393  */
1396 
1398 
1400 }
1401 
1402 /*
1403  * Handle ORIGIN message.
      *
      * The visible code only validates ordering: when the (partly missing)
      * condition holds, a protocol violation is reported. Nothing else is done
      * with the message in the lines shown.
      *
      * NOTE(review): the embedded listing numbers skip 1408 and 1415-1416, so
      * the function signature and the rest of the condition are not visible in
      * this copy.
1404  *
1405  * TODO, support tracking of multiple origins
1406  */
1407 static void
1409 {
1410  /*
1411  * ORIGIN message can only come inside streaming transaction or inside
1412  * remote transaction and before any actual writes.
1413  */
1414  if (!in_streamed_transaction &&
1417  ereport(ERROR,
1418  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1419  errmsg_internal("ORIGIN message sent out of order")));
1420 }
1421 
1422 /*
1423  * Initialize fileset (if not already done).
1424  *
1425  * Create a new file when first_segment is true, otherwise open the existing
1426  * file.
      *
      * xid is the transaction whose spool file is opened; it is passed together
      * with MyLogicalRepWorker->subid and first_segment to stream_open_file().
      *
      * NOTE(review): the embedded listing numbers skip 1431, 1441, 1445,
      * 1447-1448, 1458 and 1460, so the fileset check and creation, the
      * statement guarded by "if (!first_segment)" and the first and last
      * statements of the function are not visible in this copy.
1427  */
1428 void
1429 stream_start_internal(TransactionId xid, bool first_segment)
1430 {
1432 
1433  /*
1434  * Initialize the worker's stream_fileset if we haven't yet. This will be
1435  * used for the entire duration of the worker so create it in a permanent
1436  * context. We create this on the very first streaming message from any
1437  * transaction and then use it for this and other streaming transactions.
1438  * Now, we could create a fileset at the start of the worker as well but
1439  * then we won't be sure that it will ever be used.
1440  */
1442  {
1443  MemoryContext oldctx;
1444 
1446 
1449 
      /* Return to the memory context that was current before the switch. */
1450  MemoryContextSwitchTo(oldctx);
1451  }
1452 
1453  /* Open the spool file for this transaction. */
1454  stream_open_file(MyLogicalRepWorker->subid, xid, first_segment);
1455 
1456  /* If this is not the first segment, open existing subxact file. */
1457  if (!first_segment)
1459 
1461 }
1462 
1463 /*
1464  * Handle STREAM START message.
      *
      * Sets in_streamed_transaction, stores the top-level xid read from the
      * message in stream_xid, and dispatches on the action returned by
      * get_transaction_apply_action() for that xid.
      *
      * NOTE(review): the embedded listing numbering has many gaps in this
      * function (for example 1467, 1476, 1490, 1499, 1505, 1518 and 1554), so
      * the function signature, the conditions of the ereport() calls, three
      * case labels and a number of calls are not visible in this copy.
1465  */
1466 static void
1468 {
1469  bool first_segment;
1470  ParallelApplyWorkerInfo *winfo;
1471  TransApplyAction apply_action;
1472 
1473  /* Save the message before it is consumed. */
1474  StringInfoData original_msg = *s;
1475 
1477  ereport(ERROR,
1478  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1479  errmsg_internal("duplicate STREAM START message")));
1480 
1481  /* There must not be an active streaming transaction. */
1483 
1484  /* notify handle methods we're processing a remote transaction */
1485  in_streamed_transaction = true;
1486 
1487  /* extract XID of the top-level transaction */
1488  stream_xid = logicalrep_read_stream_start(s, &first_segment);
1489 
1491  ereport(ERROR,
1492  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1493  errmsg_internal("invalid transaction ID in streamed replication transaction")));
1494 
1496 
1497  /* Try to allocate a worker for the streaming transaction. */
1498  if (first_segment)
1500 
1501  apply_action = get_transaction_apply_action(stream_xid, &winfo);
1502 
1503  switch (apply_action)
1504  {
      /* NOTE(review): a case label (listing line 1505) is missing here. */
1506 
1507  /*
1508  * Function stream_start_internal starts a transaction. This
1509  * transaction will be committed on the stream stop unless it is a
1510  * tablesync worker in which case it will be committed after
1511  * processing all the messages. We need this transaction for
1512  * handling the BufFile, used for serializing the streaming data
1513  * and subxact info.
1514  */
1515  stream_start_internal(stream_xid, first_segment);
1516  break;
1517 
      /* NOTE(review): a case label (listing line 1518) is missing here. */
1519  Assert(winfo);
1520 
1521  /*
1522  * Once we start serializing the changes, the parallel apply
1523  * worker will wait for the leader to release the stream lock
1524  * until the end of the transaction. So, we don't need to release
1525  * the lock or increment the stream count in that case.
1526  */
1527  if (pa_send_data(winfo, s->len, s->data))
1528  {
1529  /*
1530  * Unlock the shared object lock so that the parallel apply
1531  * worker can continue to receive changes.
1532  */
1533  if (!first_segment)
1535 
1536  /*
1537  * Increment the number of streaming blocks waiting to be
1538  * processed by parallel apply worker.
1539  */
1541 
1542  /* Cache the parallel apply worker for this transaction. */
1544  break;
1545  }
1546 
1547  /*
1548  * Switch to serialize mode when we are not able to send the
1549  * change to parallel apply worker.
1550  */
1551  pa_switch_to_partial_serialize(winfo, !first_segment);
1552 
1553  /* fall through */
      /* NOTE(review): a case label (listing line 1554) is missing here. */
1555  Assert(winfo);
1556 
1557  /*
1558  * Open the spool file unless it was already opened when switching
1559  * to serialize mode. The transaction started in
1560  * stream_start_internal will be committed on the stream stop.
1561  */
1562  if (apply_action != TRANS_LEADER_SEND_TO_PARALLEL)
1563  stream_start_internal(stream_xid, first_segment);
1564 
1566 
1567  /* Cache the parallel apply worker for this transaction. */
1569  break;
1570 
1571  case TRANS_PARALLEL_APPLY:
1572  if (first_segment)
1573  {
1574  /* Hold the lock until the end of the transaction. */
1577 
1578  /*
1579  * Signal the leader apply worker, as it may be waiting for
1580  * us.
1581  */
1583  }
1584 
1586  break;
1587 
1588  default:
1589  elog(ERROR, "unexpected apply action: %d", (int) apply_action);
1590  break;
1591  }
1592 
1594 }
1595 
1596 /*
1597  * Update the information about subxacts and close the file.
1598  *
1599  * This function should be called when the stream_start_internal function has
1600  * been called.
      *
      * NOTE(review): the embedded listing numbers skip 1603, 1609-1610, 1613,
      * 1616 and 1619, so the function signature and every statement of the
      * body are missing from this copy; only the comments describing the
      * steps remain. Consult the full file before relying on this block.
1601  */
1602 void
1604 {
1605  /*
1606  * Serialize information about subxacts for the toplevel transaction, then
1607  * close the stream messages spool file.
1608  */
1611 
1612  /* We must be in a valid transaction state */
1614 
1615  /* Commit the per-stream transaction */
1617 
1618  /* Reset per-stream context */
1620 }
1621 
1622 /*
1623  * Handle STREAM STOP message.
      *
      * Dispatches on the action returned by get_transaction_apply_action() for
      * stream_xid and afterwards clears in_streamed_transaction.
      *
      * NOTE(review): the embedded listing numbering has many gaps in this
      * function (for example 1626, 1631, 1640-1641, 1644, 1653, 1657,
      * 1668-1671, 1701 and 1716-1717), so the function signature, the condition
      * of the ereport(), several case labels and most of the calls are not
      * visible in this copy.
1624  */
1625 static void
1627 {
1628  ParallelApplyWorkerInfo *winfo;
1629  TransApplyAction apply_action;
1630 
1632  ereport(ERROR,
1633  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1634  errmsg_internal("STREAM STOP message without STREAM START")));
1635 
1636  apply_action = get_transaction_apply_action(stream_xid, &winfo);
1637 
1638  switch (apply_action)
1639  {
1642  break;
1643 
1645  Assert(winfo);
1646 
1647  /*
1648  * Lock before sending the STREAM_STOP message so that the leader
1649  * can hold the lock first and the parallel apply worker will wait
1650  * for leader to release the lock. See Locking Considerations atop
1651  * applyparallelworker.c.
1652  */
1654 
1655  if (pa_send_data(winfo, s->len, s->data))
1656  {
1658  break;
1659  }
1660 
1661  /*
1662  * Switch to serialize mode when we are not able to send the
1663  * change to parallel apply worker.
1664  */
1665  pa_switch_to_partial_serialize(winfo, true);
1666 
1667  /* fall through */
1672  break;
1673 
1674  case TRANS_PARALLEL_APPLY:
1675  elog(DEBUG1, "applied %u changes in the streaming chunk",
1677 
1678  /*
1679  * By the time parallel apply worker is processing the changes in
1680  * the current streaming block, the leader apply worker may have
1681  * sent multiple streaming blocks. This can lead to parallel apply
1682  * worker start waiting even when there are more chunks of streams
1683  * in the queue. So, try to lock only if there is no message left
1684  * in the queue. See Locking Considerations atop
1685  * applyparallelworker.c.
1686  *
1687  * Note that here we have a race condition where we can start
1688  * waiting even when there are pending streaming chunks. This can
1689  * happen if the leader sends another streaming block and acquires
1690  * the stream lock again after the parallel apply worker checks
1691  * that there is no pending streaming block and before it actually
1692  * starts waiting on a lock. We can handle this case by not
1693  * allowing the leader to increment the stream block count during
1694  * the time parallel apply worker acquires the lock but it is not
1695  * clear whether that is worth the complexity.
1696  *
1697  * Now, if this missed chunk contains rollback to savepoint, then
1698  * there is a risk of deadlock which probably shouldn't happen
1699  * after restart.
1700  */
1702  break;
1703 
1704  default:
1705  elog(ERROR, "unexpected apply action: %d", (int) apply_action);
1706  break;
1707  }
1708 
      /* The streamed chunk is over, whichever path handled it. */
1709  in_streamed_transaction = false;
1711 
1712  /*
1713  * The parallel apply worker could be in a transaction in which case we
1714  * need to report the state as STATE_IDLEINTRANSACTION.
1715  */
1718  else
1720 
1722 }
1723 
1724 /*
1725  * Helper function to handle STREAM ABORT message when the transaction was
1726  * serialized to file.
      *
      * When xid equals subxid the whole transaction is aborted. Otherwise
      * subxact_data.subxacts[] is searched from the tail for subxid; if found,
      * the changes file is truncated at that entry's recorded offset and
      * subxact_data.nsubxacts is cut back to the entry's index, which drops
      * that entry and every later one.
      *
      * NOTE(review): the embedded listing numbers skip 1729, 1736, 1761-1762,
      * 1781-1783, 1788-1789, 1793, 1801 and 1803-1804, so the function
      * signature, the top-level abort statement, the file open and truncate
      * calls and the cleanup statements are not visible in this copy.
1727  */
1728 static void
1730 {
1731  /*
1732  * If the two XIDs are the same, it's in fact abort of toplevel xact, so
1733  * just delete the files with serialized info.
1734  */
1735  if (xid == subxid)
1737  else
1738  {
1739  /*
1740  * OK, so it's a subxact. We need to read the subxact file for the
1741  * toplevel transaction, determine the offset tracked for the subxact,
1742  * and truncate the file with changes. We also remove the subxacts
1743  * with higher offsets (or rather higher XIDs).
1744  *
1745  * We intentionally scan the array from the tail, because we're likely
1746  * aborting a change for the most recent subtransactions.
1747  *
1748  * We can't use the binary search here as subxact XIDs won't
1749  * necessarily arrive in sorted order, consider the case where we have
1750  * released the savepoint for multiple subtransactions and then
1751  * performed rollback to savepoint for one of the earlier
1752  * sub-transaction.
1753  */
1754  int64 i;
1755  int64 subidx;
1756  BufFile *fd;
1757  bool found = false;
1758  char path[MAXPGPATH];
1759 
1760  subidx = -1;
1763 
      /* "i" runs from nsubxacts down to 1; the entry examined is i - 1. */
1764  for (i = subxact_data.nsubxacts; i > 0; i--)
1765  {
1766  if (subxact_data.subxacts[i - 1].xid == subxid)
1767  {
1768  subidx = (i - 1);
1769  found = true;
1770  break;
1771  }
1772  }
1773 
1774  /*
1775  * If it's an empty sub-transaction then we will not find the subxid
1776  * here so just cleanup the subxact info and return.
1777  */
1778  if (!found)
1779  {
1780  /* Cleanup the subxact info */
1784  return;
1785  }
1786 
1787  /* open the changes file */
1790  O_RDWR, false);
1791 
1792  /* OK, truncate the file at the right offset */
1794  subxact_data.subxacts[subidx].offset);
1795  BufFileClose(fd);
1796 
1797  /* discard the subxacts added later */
1798  subxact_data.nsubxacts = subidx;
1799 
1800  /* write the updated subxact list */
1802 
1805  }
1806 }
1807 
1808 /*
1809  * Handle STREAM ABORT message.
      *
      * Reads xid and subxid from the message; the abort concerns the top-level
      * transaction when the two are equal (toplevel_xact). The work is then
      * dispatched on the action returned by get_transaction_apply_action()
      * for xid.
      *
      * NOTE(review): the embedded listing numbering has many gaps in this
      * function (for example 1812, 1824, 1831, 1854, 1874-1876, 1902, 1914,
      * 1922, 1927-1928, 1939 and 1952), so the function signature, the
      * condition of the ereport(), two case labels and several calls are not
      * visible in this copy.
1810  */
1811 static void
1813 {
1814  TransactionId xid;
1815  TransactionId subxid;
1816  LogicalRepStreamAbortData abort_data;
1817  ParallelApplyWorkerInfo *winfo;
1818  TransApplyAction apply_action;
1819 
1820  /* Save the message before it is consumed. */
1821  StringInfoData original_msg = *s;
1822  bool toplevel_xact;
1823 
1825  ereport(ERROR,
1826  (errcode(ERRCODE_PROTOCOL_VIOLATION),
1827  errmsg_internal("STREAM ABORT message without STREAM STOP")));
1828 
1829  /* We receive abort information only when we can apply in parallel. */
1830  logicalrep_read_stream_abort(s, &abort_data,
1832 
1833  xid = abort_data.xid;
1834  subxid = abort_data.subxid;
1835  toplevel_xact = (xid == subxid);
1836 
1837  set_apply_error_context_xact(subxid, abort_data.abort_lsn);
1838 
1839  apply_action = get_transaction_apply_action(xid, &winfo);
1840 
1841  switch (apply_action)
1842  {
1843  case TRANS_LEADER_APPLY:
1844 
1845  /*
1846  * We are in the leader apply worker and the transaction has been
1847  * serialized to file.
1848  */
1849  stream_abort_internal(xid, subxid);
1850 
1851  elog(DEBUG1, "finished processing the STREAM ABORT command");
1852  break;
1853 
      /* NOTE(review): a case label (listing line 1854) is missing here. */
1855  Assert(winfo);
1856 
1857  /*
1858  * For the case of aborting the subtransaction, we increment the
1859  * number of streaming blocks and take the lock again before
1860  * sending the STREAM_ABORT to ensure that the parallel apply
1861  * worker will wait on the lock for the next set of changes after
1862  * processing the STREAM_ABORT message if it is not already
1863  * waiting for STREAM_STOP message.
1864  *
1865  * It is important to perform this locking before sending the
1866  * STREAM_ABORT message so that the leader can hold the lock first
1867  * and the parallel apply worker will wait for the leader to
1868  * release the lock. This is the same as what we do in
1869  * apply_handle_stream_stop. See Locking Considerations atop
1870  * applyparallelworker.c.
1871  */
1872  if (!toplevel_xact)
1873  {
1877  }
1878 
1879  if (pa_send_data(winfo, s->len, s->data))
1880  {
1881  /*
1882  * Unlike STREAM_COMMIT and STREAM_PREPARE, we don't need to
1883  * wait here for the parallel apply worker to finish as that
1884  * is not required to maintain the commit order and won't have
1885  * the risk of failures due to transaction dependencies and
1886  * deadlocks. However, it is possible that before the parallel
1887  * worker finishes and we clear the worker info, the xid
1888  * wraparound happens on the upstream and a new transaction
1889  * with the same xid can appear and that can lead to duplicate
1890  * entries in ParallelApplyTxnHash. Yet another problem could
1891  * be that we may have serialized the changes in partial
1892  * serialize mode and the file containing xact changes may
1893  * already exist, and after xid wraparound trying to create
1894  * the file for the same xid can lead to an error. To avoid
1895  * these problems, we decide to wait for the aborts to finish.
1896  *
1897  * Note, it is okay to not update the flush location position
1898  * for aborts as in worst case that means such a transaction
1899  * won't be sent again after restart.
1900  */
1901  if (toplevel_xact)
1903 
1904  break;
1905  }
1906 
1907  /*
1908  * Switch to serialize mode when we are not able to send the
1909  * change to parallel apply worker.
1910  */
1911  pa_switch_to_partial_serialize(winfo, true);
1912 
1913  /* fall through */
      /* NOTE(review): a case label (listing line 1914) is missing here. */
1915  Assert(winfo);
1916 
1917  /*
1918  * Parallel apply worker might have applied some changes, so write
1919  * the STREAM_ABORT message so that it can rollback the
1920  * subtransaction if needed.
1921  */
1923  &original_msg);
1924 
1925  if (toplevel_xact)
1926  {
1929  }
1930  break;
1931 
1932  case TRANS_PARALLEL_APPLY:
1933 
1934  /*
1935  * If the parallel apply worker is applying spooled messages then
1936  * close the file before aborting.
1937  */
1938  if (toplevel_xact && stream_fd)
1940 
1941  pa_stream_abort(&abort_data);
1942 
1943  /*
1944  * We need to wait after processing rollback to savepoint for the
1945  * next set of changes.
1946  *
1947  * We have a race condition here due to which we can start waiting
1948  * here when there are more chunks of streams in the queue. See
1949  * apply_handle_stream_stop.
1950  */
1951  if (!toplevel_xact)
1953 
1954  elog(DEBUG1, "finished processing the STREAM ABORT command");
1955  break;
1956 
1957  default:
1958  elog(ERROR, "unexpected apply action: %d", (int) apply_action);
1959  break;
1960  }
1961 
1963 }
1964 
1965 /*
1966  * Ensure that the passed location is fileset's end.
      *
      * Opens the file named by "path" in stream_fileset read-only, seeks to its
      * end, and compares the resulting position with (fileno, offset). A
      * mismatch is reported with elog(ERROR); nothing is returned.
      *
      * NOTE(review): the embedded listing numbers skip 1977, 1979, 1981 and
      * 1990, so the statements that fill "path" (presumably derived from xid --
      * confirm against the full file) and those surrounding the file access
      * are not visible in this copy.
1967  */
1968 static void
1969 ensure_last_message(FileSet *stream_fileset, TransactionId xid, int fileno,
1970  off_t offset)
1971 {
1972  char path[MAXPGPATH];
1973  BufFile *fd;
1974  int last_fileno;
1975  off_t last_offset;
1976 
1978 
1980 
1982 
1983  fd = BufFileOpenFileSet(stream_fileset, path, O_RDONLY, false);
1984 
      /* Find where the file currently ends. */
1985  BufFileSeek(fd, 0, 0, SEEK_END);
1986  BufFileTell(fd, &last_fileno, &last_offset);
1987 
1988  BufFileClose(fd);
1989 
1991 
      /* Any difference means data exists beyond the caller's position. */
1992  if (last_fileno != fileno || last_offset != offset)
1993  elog(ERROR, "unexpected message left in streaming transaction's changes file \"%s\"",
1994  path);
1995 }
1996 
1997 /*
1998  * Common spoolfile processing.
      *
      * Replays a spooled transaction: the changes file is opened read-only as
      * stream_fd, remote_final_lsn is set to "lsn", in_remote_transaction is
      * set, and each record is passed to apply_dispatch(). As read by the loop
      * below, a record is an int length followed by that many bytes of message
      * data. The loop ends at end of file, or as soon as stream_fd has been
      * cleared while dispatching a message, in which case
      * ensure_last_message() verifies that nothing follows that message.
      *
      * NOTE(review): the embedded listing numbering has gaps in this function
      * (for example 2001, 2013, 2016, 2023, 2026, 2034, 2062 and 2119), so the
      * function name and leading parameters, the declaration of "s2", the
      * computation of "path" and several other statements are not visible in
      * this copy.
1999  */
2000 void
2002  XLogRecPtr lsn)
2003 {
2004  int nchanges;
2005  char path[MAXPGPATH];
2006  char *buffer = NULL;
2007  MemoryContext oldcxt;
2008  ResourceOwner oldowner;
2009  int fileno;
2010  off_t offset;
2011 
2012  if (!am_parallel_apply_worker())
2014 
2015  /* Make sure we have an open transaction */
2017 
2018  /*
2019  * Allocate file handle and memory required to process all the messages in
2020  * TopTransactionContext to avoid them getting reset after each message is
2021  * processed.
2022  */
2024 
2025  /* Open the spool file for the committed/prepared transaction */
2027  elog(DEBUG1, "replaying changes from file \"%s\"", path);
2028 
2029  /*
2030  * Make sure the file is owned by the toplevel transaction so that the
2031  * file will not be accidentally closed when aborting a subtransaction.
2032  */
2033  oldowner = CurrentResourceOwner;
2035 
2036  stream_fd = BufFileOpenFileSet(stream_fileset, path, O_RDONLY, false);
2037 
2038  CurrentResourceOwner = oldowner;
2039 
      /* Initial size only; the buffer is resized per record in the loop. */
2040  buffer = palloc(BLCKSZ);
2041 
2042  MemoryContextSwitchTo(oldcxt);
2043 
2044  remote_final_lsn = lsn;
2045 
2046  /*
2047  * Make sure the handle apply_dispatch methods are aware we're in a remote
2048  * transaction.
2049  */
2050  in_remote_transaction = true;
2052 
2054 
2055  /*
2056  * Read the entries one by one and pass them through the same logic as in
2057  * apply_dispatch.
2058  */
2059  nchanges = 0;
2060  while (true)
2061  {
2063  size_t nbytes;
2064  int len;
2065 
2067 
2068  /* read length of the on-disk record */
2069  nbytes = BufFileReadMaybeEOF(stream_fd, &len, sizeof(len), true);
2070 
2071  /* have we reached end of the file? */
2072  if (nbytes == 0)
2073  break;
2074 
2075  /* do we have a correct length? */
2076  if (len <= 0)
2077  elog(ERROR, "incorrect length %d in streaming transaction's changes file \"%s\"",
2078  len, path);
2079 
2080  /* make sure we have sufficiently large buffer */
2081  buffer = repalloc(buffer, len);
2082 
2083  /* and finally read the data into the buffer */
2084  BufFileReadExact(stream_fd, buffer, len);
2085 
      /* Record the position just past this record for the check below. */
2086  BufFileTell(stream_fd, &fileno, &offset);
2087 
2088  /* init a stringinfo using the buffer and call apply_dispatch */
2089  initReadOnlyStringInfo(&s2, buffer, len);
2090 
2091  /* Ensure we are reading the data into our memory context. */
2093 
2094  apply_dispatch(&s2);
2095 
2097 
2098  MemoryContextSwitchTo(oldcxt);
2099 
2100  nchanges++;
2101 
2102  /*
2103  * It is possible the file has been closed because we have processed
2104  * the transaction end message like stream_commit in which case that
2105  * must be the last message.
2106  */
2107  if (!stream_fd)
2108  {
2109  ensure_last_message(stream_fileset, xid, fileno, offset);
2110  break;
2111  }
2112 
      /* Progress report every 1000 changes. */
2113  if (nchanges % 1000 == 0)
2114  elog(DEBUG1, "replayed %d changes from file \"%s\"",
2115  nchanges, path);
2116  }
2117 
2118  if (stream_fd)
2120 
2121  elog(DEBUG1, "replayed %d (all) changes from file \"%s\"",
2122  nchanges, path);
2123 
2124  return;
2125 }
2126 
2127 /*
2128  * Handle STREAM COMMIT message.
      *
      * Reads the xid and commit data from "s" and dispatches on the action
      * returned by get_transaction_apply_action() for that xid. After the
      * switch, process_syncing_tables() is called with the commit's end LSN.
      *
      * NOTE(review): the embedded listing numbering has many gaps in this
      * function (for example 2131, 2141, 2159, 2165, 2170, 2187, 2190 and
      * 2206), so the function signature, the condition of the ereport(), two
      * case labels and several calls are not visible in this copy.
2129  */
2130 static void
2132 {
2133  TransactionId xid;
2134  LogicalRepCommitData commit_data;
2135  ParallelApplyWorkerInfo *winfo;
2136  TransApplyAction apply_action;
2137 
2138  /* Save the message before it is consumed. */
2139  StringInfoData original_msg = *s;
2140 
2142  ereport(ERROR,
2143  (errcode(ERRCODE_PROTOCOL_VIOLATION),
2144  errmsg_internal("STREAM COMMIT message without STREAM STOP")));
2145 
2146  xid = logicalrep_read_stream_commit(s, &commit_data);
2147  set_apply_error_context_xact(xid, commit_data.commit_lsn);
2148 
2149  apply_action = get_transaction_apply_action(xid, &winfo);
2150 
2151  switch (apply_action)
2152  {
2153  case TRANS_LEADER_APPLY:
2154 
2155  /*
2156  * The transaction has been serialized to file, so replay all the
2157  * spooled operations.
2158  */
2160  commit_data.commit_lsn);
2161 
2162  apply_handle_commit_internal(&commit_data);
2163 
2164  /* Unlink the files with serialized changes and subxact info. */
2166 
2167  elog(DEBUG1, "finished processing the STREAM COMMIT command");
2168  break;
2169 
      /* NOTE(review): a case label (listing line 2170) is missing here. */
2171  Assert(winfo);
2172 
      /* On a successful send the parallel apply worker takes over from here. */
2173  if (pa_send_data(winfo, s->len, s->data))
2174  {
2175  /* Finish processing the streaming transaction. */
2176  pa_xact_finish(winfo, commit_data.end_lsn);
2177  break;
2178  }
2179 
2180  /*
2181  * Switch to serialize mode when we are not able to send the
2182  * change to parallel apply worker.
2183  */
2184  pa_switch_to_partial_serialize(winfo, true);
2185 
2186  /* fall through */
      /* NOTE(review): a case label (listing line 2187) is missing here. */
2188  Assert(winfo);
2189 
2191  &original_msg);
2192 
2194 
2195  /* Finish processing the streaming transaction. */
2196  pa_xact_finish(winfo, commit_data.end_lsn);
2197  break;
2198 
2199  case TRANS_PARALLEL_APPLY:
2200 
2201  /*
2202  * If the parallel apply worker is applying spooled messages then
2203  * close the file before committing.
2204  */
2205  if (stream_fd)
2207 
2208  apply_handle_commit_internal(&commit_data);
2209 
2211 
2212  /*
2213  * It is important to set the transaction state as finished before
2214  * releasing the lock. See pa_wait_for_xact_finish.
2215  */
2218 
2220 
2221  elog(DEBUG1, "finished processing the STREAM COMMIT command");
2222  break;
2223 
2224  default:
2225  elog(ERROR, "unexpected apply action: %d", (int) apply_action);
2226  break;
2227  }
2228 
2229  /* Process any tables that are being synchronized in parallel. */
2230  process_syncing_tables(commit_data.end_lsn);
2231 
2233 
2235 }
2236 
2237 /*
2238  * Helper function for apply_handle_commit and apply_handle_stream_commit.
2239  */
2240 static void
2242 {
2243  if (is_skipping_changes())
2244  {
2246 
2247  /*
2248  * Start a new transaction to clear the subskiplsn, if not started
2249  * yet.
2250  */
2251  if (!IsTransactionState())
2253  }
2254 
2255  if (IsTransactionState())
2256  {
2257  /*
2258  * The transaction is either non-empty or skipped, so we clear the
2259  * subskiplsn.
2260  */
2262 
2263  /*
2264  * Update origin state so we can restart streaming from correct
2265  * position in case of crash.
2266  */
2267  replorigin_session_origin_lsn = commit_data->end_lsn;
2269 
2271 
2272  if (IsTransactionBlock())
2273  {
2274  EndTransactionBlock(false);
2276  }
2277 
2278  pgstat_report_stat(false);
2279 
2281  }
2282  else
2283  {
2284  /* Process any invalidation messages that might have accumulated. */
2287  }
2288 
2289  in_remote_transaction = false;
2290 }
2291 
2292 /*
2293  * Handle RELATION message.
2294  *
2295  * Note we don't do validation against local schema here. The validation
2296  * against local schema is postponed until first change for given relation
2297  * comes as we only care about it when applying changes for it anyway and we
2298  * do less locking this way.
2299  */
2300 static void
2302 {
2303  LogicalRepRelation *rel;
2304 
2306  return;
2307 
2308  rel = logicalrep_read_rel(s);
2310 
2311  /* Also reset all entries in the partition map that refer to remoterel. */
2313 }
2314 
2315 /*
2316  * Handle TYPE message.
2317  *
2318  * This implementation pays no attention to TYPE messages; we expect the user
2319  * to have set things up so that the incoming data is acceptable to the input
2320  * functions for the locally subscribed tables. Hence, we just read and
2321  * discard the message.
2322  */
2323 static void
2325 {
2326  LogicalRepTyp typ;
2327 
2329  return;
2330 
2331  logicalrep_read_typ(s, &typ);
2332 }
2333 
2334 /*
2335  * Check that we (the subscription owner) have sufficient privileges on the
2336  * target relation to perform the given operation.
2337  */
2338 static void
2340 {
2341  Oid relid;
2342  AclResult aclresult;
2343 
2344  relid = RelationGetRelid(rel);
2345  aclresult = pg_class_aclcheck(relid, GetUserId(), mode);
2346  if (aclresult != ACLCHECK_OK)
2347  aclcheck_error(aclresult,
2348  get_relkind_objtype(rel->rd_rel->relkind),
2349  get_rel_name(relid));
2350 
2351  /*
2352  * We lack the infrastructure to honor RLS policies. It might be possible
2353  * to add such infrastructure here, but tablesync workers lack it, too, so
2354  * we don't bother. RLS does not ordinarily apply to TRUNCATE commands,
2355  * but it seems dangerous to replicate a TRUNCATE and then refuse to
2356  * replicate subsequent INSERTs, so we forbid all commands the same.
2357  */
2358  if (check_enable_rls(relid, InvalidOid, false) == RLS_ENABLED)
2359  ereport(ERROR,
2360  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2361  errmsg("user \"%s\" cannot replicate into relation with row-level security enabled: \"%s\"",
2362  GetUserNameFromId(GetUserId(), true),
2363  RelationGetRelationName(rel))));
2364 }
2365 
2366 /*
2367  * Handle INSERT message.
2368  */
2369 
2370 static void
2372 {
2373  LogicalRepRelMapEntry *rel;
2374  LogicalRepTupleData newtup;
2375  LogicalRepRelId relid;
2376  UserContext ucxt;
2377  ApplyExecutionData *edata;
2378  EState *estate;
2379  TupleTableSlot *remoteslot;
2380  MemoryContext oldctx;
2381  bool run_as_owner;
2382 
2383  /*
2384  * Quick return if we are skipping data modification changes or handling
2385  * streamed transactions.
2386  */
2387  if (is_skipping_changes() ||
2389  return;
2390 
2392 
2393  relid = logicalrep_read_insert(s, &newtup);
2394  rel = logicalrep_rel_open(relid, RowExclusiveLock);
2395  if (!should_apply_changes_for_rel(rel))
2396  {
2397  /*
2398  * The relation can't become interesting in the middle of the
2399  * transaction so it's safe to unlock it.
2400  */
2403  return;
2404  }
2405 
2406  /*
2407  * Make sure that any user-supplied code runs as the table owner, unless
2408  * the user has opted out of that behavior.
2409  */
2410  run_as_owner = MySubscription->runasowner;
2411  if (!run_as_owner)
2412  SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt);
2413 
2414  /* Set relation for error callback */
2416 
2417  /* Initialize the executor state. */
2418  edata = create_edata_for_relation(rel);
2419  estate = edata->estate;
2420  remoteslot = ExecInitExtraTupleSlot(estate,
2421  RelationGetDescr(rel->localrel),
2422  &TTSOpsVirtual);
2423 
2424  /* Process and store remote tuple in the slot */
2426  slot_store_data(remoteslot, rel, &newtup);
2427  slot_fill_defaults(rel, estate, remoteslot);
2428  MemoryContextSwitchTo(oldctx);
2429 
2430  /* For a partitioned table, insert the tuple into a partition. */
2431  if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
2433  remoteslot, NULL, CMD_INSERT);
2434  else
2436  remoteslot);
2437 
2438  finish_edata(edata);
2439 
2440  /* Reset relation for error callback */
2442 
2443  if (!run_as_owner)
2444  RestoreUserContext(&ucxt);
2445 
2447 
2449 }
2450 
2451 /*
2452  * Workhorse for apply_handle_insert()
2453  * relinfo is for the relation we're actually inserting into
2454  * (could be a child partition of edata->targetRelInfo)
2455  */
2456 static void
2458  ResultRelInfo *relinfo,
2459  TupleTableSlot *remoteslot)
2460 {
2461  EState *estate = edata->estate;
2462 
2463  /* We must open indexes here. */
2464  ExecOpenIndices(relinfo, false);
2465 
2466  /* Do the insert. */
2468  ExecSimpleRelationInsert(relinfo, estate, remoteslot);
2469 
2470  /* Cleanup. */
2471  ExecCloseIndices(relinfo);
2472 }
2473 
2474 /*
2475  * Check if the logical replication relation is updatable and throw
2476  * appropriate error if it isn't.
2477  */
2478 static void
2480 {
2481  /*
2482  * For partitioned tables, we only need to care if the target partition is
2483  * updatable (i.e., has a PK or RI defined for it).
2484  */
2485  if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
2486  return;
2487 
2488  /* Updatable, no error. */
2489  if (rel->updatable)
2490  return;
2491 
2492  /*
2493  * We are in error mode, so it's fine that this is somewhat slow. It's better
2494  * to give the user the correct error.
2495  */
2497  {
2498  ereport(ERROR,
2499  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2500  errmsg("publisher did not send replica identity column "
2501  "expected by the logical replication target relation \"%s.%s\"",
2502  rel->remoterel.nspname, rel->remoterel.relname)));
2503  }
2504 
2505  ereport(ERROR,
2506  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2507  errmsg("logical replication target relation \"%s.%s\" has "
2508  "neither REPLICA IDENTITY index nor PRIMARY "
2509  "KEY and published relation does not have "
2510  "REPLICA IDENTITY FULL",
2511  rel->remoterel.nspname, rel->remoterel.relname)));
2512 }
2513 
2514 /*
2515  * Handle UPDATE message.
2516  *
2517  * TODO: FDW support
2518  */
2519 static void
2521 {
2522  LogicalRepRelMapEntry *rel;
2523  LogicalRepRelId relid;
2524  UserContext ucxt;
2525  ApplyExecutionData *edata;
2526  EState *estate;
2527  LogicalRepTupleData oldtup;
2528  LogicalRepTupleData newtup;
2529  bool has_oldtup;
2530  TupleTableSlot *remoteslot;
2531  RTEPermissionInfo *target_perminfo;
2532  MemoryContext oldctx;
2533  bool run_as_owner;
2534 
2535  /*
2536  * Quick return if we are skipping data modification changes or handling
2537  * streamed transactions.
2538  */
2539  if (is_skipping_changes() ||
2541  return;
2542 
2544 
2545  relid = logicalrep_read_update(s, &has_oldtup, &oldtup,
2546  &newtup);
2547  rel = logicalrep_rel_open(relid, RowExclusiveLock);
2548  if (!should_apply_changes_for_rel(rel))
2549  {
2550  /*
2551  * The relation can't become interesting in the middle of the
2552  * transaction so it's safe to unlock it.
2553  */
2556  return;
2557  }
2558 
2559  /* Set relation for error callback */
2561 
2562  /* Check if we can do the update. */
2564 
2565  /*
2566  * Make sure that any user-supplied code runs as the table owner, unless
2567  * the user has opted out of that behavior.
2568  */
2569  run_as_owner = MySubscription->runasowner;
2570  if (!run_as_owner)
2571  SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt);
2572 
2573  /* Initialize the executor state. */
2574  edata = create_edata_for_relation(rel);
2575  estate = edata->estate;
2576  remoteslot = ExecInitExtraTupleSlot(estate,
2577  RelationGetDescr(rel->localrel),
2578  &TTSOpsVirtual);
2579 
2580  /*
2581  * Populate updatedCols so that per-column triggers can fire, and so
2582  * executor can correctly pass down indexUnchanged hint. This could
2583  * include more columns than were actually changed on the publisher
2584  * because the logical replication protocol doesn't contain that
2585  * information. But it would for example exclude columns that only exist
2586  * on the subscriber, since we are not touching those.
2587  */
2588  target_perminfo = list_nth(estate->es_rteperminfos, 0);
2589  for (int i = 0; i < remoteslot->tts_tupleDescriptor->natts; i++)
2590  {
2592  int remoteattnum = rel->attrmap->attnums[i];
2593 
2594  if (!att->attisdropped && remoteattnum >= 0)
2595  {
2596  Assert(remoteattnum < newtup.ncols);
2597  if (newtup.colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED)
2598  target_perminfo->updatedCols =
2599  bms_add_member(target_perminfo->updatedCols,
2601  }
2602  }
2603 
2604  /* Build the search tuple. */
2606  slot_store_data(remoteslot, rel,
2607  has_oldtup ? &oldtup : &newtup);
2608  MemoryContextSwitchTo(oldctx);
2609 
2610  /* For a partitioned table, apply update to correct partition. */
2611  if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
2613  remoteslot, &newtup, CMD_UPDATE);
2614  else
2616  remoteslot, &newtup, rel->localindexoid);
2617 
2618  finish_edata(edata);
2619 
2620  /* Reset relation for error callback */
2622 
2623  if (!run_as_owner)
2624  RestoreUserContext(&ucxt);
2625 
2627 
2629 }
2630 
2631 /*
2632  * Workhorse for apply_handle_update()
2633  * relinfo is for the relation we're actually updating in
2634  * (could be a child partition of edata->targetRelInfo)
2635  */
2636 static void
2638  ResultRelInfo *relinfo,
2639  TupleTableSlot *remoteslot,
2640  LogicalRepTupleData *newtup,
2641  Oid localindexoid)
2642 {
2643  EState *estate = edata->estate;
2644  LogicalRepRelMapEntry *relmapentry = edata->targetRel;
2645  Relation localrel = relinfo->ri_RelationDesc;
2646  EPQState epqstate;
2647  TupleTableSlot *localslot;
2648  bool found;
2649  MemoryContext oldctx;
2650 
2651  EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL);
2652  ExecOpenIndices(relinfo, false);
2653 
2654  found = FindReplTupleInLocalRel(edata, localrel,
2655  &relmapentry->remoterel,
2656  localindexoid,
2657  remoteslot, &localslot);
2658  ExecClearTuple(remoteslot);
2659 
2660  /*
2661  * Tuple found.
2662  *
2663  * Note this will fail if there are other conflicting unique indexes.
2664  */
2665  if (found)
2666  {
2667  /* Process and store remote tuple in the slot */
2669  slot_modify_data(remoteslot, localslot, relmapentry, newtup);
2670  MemoryContextSwitchTo(oldctx);
2671 
2672  EvalPlanQualSetSlot(&epqstate, remoteslot);
2673 
2674  /* Do the actual update. */
2676  ExecSimpleRelationUpdate(relinfo, estate, &epqstate, localslot,
2677  remoteslot);
2678  }
2679  else
2680  {
2681  /*
2682  * The tuple to be updated could not be found. Do nothing except for
2683  * emitting a log message.
2684  *
2685  * XXX should this be promoted to ereport(LOG) perhaps?
2686  */
2687  elog(DEBUG1,
2688  "logical replication did not find row to be updated "
2689  "in replication target relation \"%s\"",
2690  RelationGetRelationName(localrel));
2691  }
2692 
2693  /* Cleanup. */
2694  ExecCloseIndices(relinfo);
2695  EvalPlanQualEnd(&epqstate);
2696 }
2697 
2698 /*
2699  * Handle DELETE message.
2700  *
2701  * TODO: FDW support
2702  */
2703 static void
2705 {
2706  LogicalRepRelMapEntry *rel;
2707  LogicalRepTupleData oldtup;
2708  LogicalRepRelId relid;
2709  UserContext ucxt;
2710  ApplyExecutionData *edata;
2711  EState *estate;
2712  TupleTableSlot *remoteslot;
2713  MemoryContext oldctx;
2714  bool run_as_owner;
2715 
2716  /*
2717  * Quick return if we are skipping data modification changes or handling
2718  * streamed transactions.
2719  */
2720  if (is_skipping_changes() ||
2722  return;
2723 
2725 
2726  relid = logicalrep_read_delete(s, &oldtup);
2727  rel = logicalrep_rel_open(relid, RowExclusiveLock);
2728  if (!should_apply_changes_for_rel(rel))
2729  {
2730  /*
2731  * The relation can't become interesting in the middle of the
2732  * transaction so it's safe to unlock it.
2733  */
2736  return;
2737  }
2738 
2739  /* Set relation for error callback */
2741 
2742  /* Check if we can do the delete. */
2744 
2745  /*
2746  * Make sure that any user-supplied code runs as the table owner, unless
2747  * the user has opted out of that behavior.
2748  */
2749  run_as_owner = MySubscription->runasowner;
2750  if (!run_as_owner)
2751  SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt);
2752 
2753  /* Initialize the executor state. */
2754  edata = create_edata_for_relation(rel);
2755  estate = edata->estate;
2756  remoteslot = ExecInitExtraTupleSlot(estate,
2757  RelationGetDescr(rel->localrel),
2758  &TTSOpsVirtual);
2759 
2760  /* Build the search tuple. */
2762  slot_store_data(remoteslot, rel, &oldtup);
2763  MemoryContextSwitchTo(oldctx);
2764 
2765  /* For a partitioned table, apply delete to correct partition. */
2766  if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
2768  remoteslot, NULL, CMD_DELETE);
2769  else
2771  remoteslot, rel->localindexoid);
2772 
2773  finish_edata(edata);
2774 
2775  /* Reset relation for error callback */
2777 
2778  if (!run_as_owner)
2779  RestoreUserContext(&ucxt);
2780 
2782 
2784 }
2785 
2786 /*
2787  * Workhorse for apply_handle_delete()
2788  * relinfo is for the relation we're actually deleting from
2789  * (could be a child partition of edata->targetRelInfo)
2790  */
2791 static void
2793  ResultRelInfo *relinfo,
2794  TupleTableSlot *remoteslot,
2795  Oid localindexoid)
2796 {
2797  EState *estate = edata->estate;
2798  Relation localrel = relinfo->ri_RelationDesc;
2799  LogicalRepRelation *remoterel = &edata->targetRel->remoterel;
2800  EPQState epqstate;
2801  TupleTableSlot *localslot;
2802  bool found;
2803 
2804  EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL);
2805  ExecOpenIndices(relinfo, false);
2806 
2807  found = FindReplTupleInLocalRel(edata, localrel, remoterel, localindexoid,
2808  remoteslot, &localslot);
2809 
2810  /* If found delete it. */
2811  if (found)
2812  {
2813  EvalPlanQualSetSlot(&epqstate, localslot);
2814 
2815  /* Do the actual delete. */
2817  ExecSimpleRelationDelete(relinfo, estate, &epqstate, localslot);
2818  }
2819  else
2820  {
2821  /*
2822  * The tuple to be deleted could not be found. Do nothing except for
2823  * emitting a log message.
2824  *
2825  * XXX should this be promoted to ereport(LOG) perhaps?
2826  */
2827  elog(DEBUG1,
2828  "logical replication did not find row to be deleted "
2829  "in replication target relation \"%s\"",
2830  RelationGetRelationName(localrel));
2831  }
2832 
2833  /* Cleanup. */
2834  ExecCloseIndices(relinfo);
2835  EvalPlanQualEnd(&epqstate);
2836 }
2837 
2838 /*
2839  * Try to find a tuple received from the publication side (in 'remoteslot') in
2840  * the corresponding local relation using either the replica identity index,
2841  * the primary key, another usable index or, if needed, a sequential scan.
2842  *
2843  * Local tuple, if found, is returned in '*localslot'.
2844  */
2845 static bool
2847  LogicalRepRelation *remoterel,
2848  Oid localidxoid,
2849  TupleTableSlot *remoteslot,
2850  TupleTableSlot **localslot)
2851 {
2852  EState *estate = edata->estate;
2853  bool found;
2854 
2855  /*
2856  * Regardless of the top-level operation, we're performing a read here, so
2857  * check for SELECT privileges.
2858  */
2859  TargetPrivilegesCheck(localrel, ACL_SELECT);
2860 
2861  *localslot = table_slot_create(localrel, &estate->es_tupleTable);
2862 
2863  Assert(OidIsValid(localidxoid) ||
2864  (remoterel->replident == REPLICA_IDENTITY_FULL));
2865 
2866  if (OidIsValid(localidxoid))
2867  {
2868 #ifdef USE_ASSERT_CHECKING
2869  Relation idxrel = index_open(localidxoid, AccessShareLock);
2870 
2871  /* Index must be PK, RI, or usable for REPLICA IDENTITY FULL tables */
2872  Assert(GetRelationIdentityOrPK(idxrel) == localidxoid ||
2874  edata->targetRel->attrmap));
2875  index_close(idxrel, AccessShareLock);
2876 #endif
2877 
2878  found = RelationFindReplTupleByIndex(localrel, localidxoid,
2880  remoteslot, *localslot);
2881  }
2882  else
2883  found = RelationFindReplTupleSeq(localrel, LockTupleExclusive,
2884  remoteslot, *localslot);
2885 
2886  return found;
2887 }
2888 
2889 /*
2890  * This handles insert, update, delete on a partitioned table.
2891  */
2892 static void
2894  TupleTableSlot *remoteslot,
2895  LogicalRepTupleData *newtup,
2896  CmdType operation)
2897 {
2898  EState *estate = edata->estate;
2899  LogicalRepRelMapEntry *relmapentry = edata->targetRel;
2900  ResultRelInfo *relinfo = edata->targetRelInfo;
2901  Relation parentrel = relinfo->ri_RelationDesc;
2902  ModifyTableState *mtstate;
2903  PartitionTupleRouting *proute;
2904  ResultRelInfo *partrelinfo;
2905  Relation partrel;
2906  TupleTableSlot *remoteslot_part;
2907  TupleConversionMap *map;
2908  MemoryContext oldctx;
2909  LogicalRepRelMapEntry *part_entry = NULL;
2910  AttrMap *attrmap = NULL;
2911 
2912  /* ModifyTableState is needed for ExecFindPartition(). */
2913  edata->mtstate = mtstate = makeNode(ModifyTableState);
2914  mtstate->ps.plan = NULL;
2915  mtstate->ps.state = estate;
2916  mtstate->operation = operation;
2917  mtstate->resultRelInfo = relinfo;
2918 
2919  /* ... as is PartitionTupleRouting. */
2920  edata->proute = proute = ExecSetupPartitionTupleRouting(estate, parentrel);
2921 
2922  /*
2923  * Find the partition to which the "search tuple" belongs.
2924  */
2925  Assert(remoteslot != NULL);
2927  partrelinfo = ExecFindPartition(mtstate, relinfo, proute,
2928  remoteslot, estate);
2929  Assert(partrelinfo != NULL);
2930  partrel = partrelinfo->ri_RelationDesc;
2931 
2932  /*
2933  * Check for supported relkind. We need this since partitions might be of
2934  * unsupported relkinds; and the set of partitions can change, so checking
2935  * at CREATE/ALTER SUBSCRIPTION would be insufficient.
2936  */
2937  CheckSubscriptionRelkind(partrel->rd_rel->relkind,
2939  RelationGetRelationName(partrel));
2940 
2941  /*
2942  * To perform any of the operations below, the tuple must match the
2943  * partition's rowtype. Convert if needed or just copy, using a dedicated
2944  * slot to store the tuple in any case.
2945  */
2946  remoteslot_part = partrelinfo->ri_PartitionTupleSlot;
2947  if (remoteslot_part == NULL)
2948  remoteslot_part = table_slot_create(partrel, &estate->es_tupleTable);
2949  map = ExecGetRootToChildMap(partrelinfo, estate);
2950  if (map != NULL)
2951  {
2952  attrmap = map->attrMap;
2953  remoteslot_part = execute_attr_map_slot(attrmap, remoteslot,
2954  remoteslot_part);
2955  }
2956  else
2957  {
2958  remoteslot_part = ExecCopySlot(remoteslot_part, remoteslot);
2959  slot_getallattrs(remoteslot_part);
2960  }
2961  MemoryContextSwitchTo(oldctx);
2962 
2963  /* Check if we can do the update or delete on the leaf partition. */
2964  if (operation == CMD_UPDATE || operation == CMD_DELETE)
2965  {
2966  part_entry = logicalrep_partition_open(relmapentry, partrel,
2967  attrmap);
2968  check_relation_updatable(part_entry);
2969  }
2970 
2971  switch (operation)
2972  {
2973  case CMD_INSERT:
2974  apply_handle_insert_internal(edata, partrelinfo,
2975  remoteslot_part);
2976  break;
2977 
2978  case CMD_DELETE:
2979  apply_handle_delete_internal(edata, partrelinfo,
2980  remoteslot_part,
2981  part_entry->localindexoid);
2982  break;
2983 
2984  case CMD_UPDATE:
2985 
2986  /*
2987  * For UPDATE, depending on whether or not the updated tuple
2988  * satisfies the partition's constraint, perform a simple UPDATE
2989  * of the partition or move the updated tuple into a different
2990  * suitable partition.
2991  */
2992  {
2993  TupleTableSlot *localslot;
2994  ResultRelInfo *partrelinfo_new;
2995  Relation partrel_new;
2996  bool found;
2997 
2998  /* Get the matching local tuple from the partition. */
2999  found = FindReplTupleInLocalRel(edata, partrel,
3000  &part_entry->remoterel,
3001  part_entry->localindexoid,
3002  remoteslot_part, &localslot);
3003  if (!found)
3004  {
3005  /*
3006  * The tuple to be updated could not be found. Do nothing
3007  * except for emitting a log message.
3008  *
3009  * XXX should this be promoted to ereport(LOG) perhaps?
3010  */
3011  elog(DEBUG1,
3012  "logical replication did not find row to be updated "
3013  "in replication target relation's partition \"%s\"",
3014  RelationGetRelationName(partrel));
3015  return;
3016  }
3017 
3018  /*
3019  * Apply the update to the local tuple, putting the result in
3020  * remoteslot_part.
3021  */
3023  slot_modify_data(remoteslot_part, localslot, part_entry,
3024  newtup);
3025  MemoryContextSwitchTo(oldctx);
3026 
3027  /*
3028  * Does the updated tuple still satisfy the current
3029  * partition's constraint?
3030  */
3031  if (!partrel->rd_rel->relispartition ||
3032  ExecPartitionCheck(partrelinfo, remoteslot_part, estate,
3033  false))
3034  {
3035  /*
3036  * Yes, so simply UPDATE the partition. We don't call
3037  * apply_handle_update_internal() here, which would
3038  * normally do the following work, to avoid repeating some
3039  * work already done above to find the local tuple in the
3040  * partition.
3041  */
3042  EPQState epqstate;
3043 
3044  EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL);
3045  ExecOpenIndices(partrelinfo, false);
3046 
3047  EvalPlanQualSetSlot(&epqstate, remoteslot_part);
3049  ACL_UPDATE);
3050  ExecSimpleRelationUpdate(partrelinfo, estate, &epqstate,
3051  localslot, remoteslot_part);
3052  ExecCloseIndices(partrelinfo);
3053  EvalPlanQualEnd(&epqstate);
3054  }
3055  else
3056  {
3057  /* Move the tuple into the new partition. */
3058 
3059  /*
3060  * New partition will be found using tuple routing, which
3061  * can only occur via the parent table. We might need to
3062  * convert the tuple to the parent's rowtype. Note that
3063  * this is the tuple found in the partition, not the
3064  * original search tuple received by this function.
3065  */
3066  if (map)
3067  {
3068  TupleConversionMap *PartitionToRootMap =
3070  RelationGetDescr(parentrel));
3071 
3072  remoteslot =
3073  execute_attr_map_slot(PartitionToRootMap->attrMap,
3074  remoteslot_part, remoteslot);
3075  }
3076  else
3077  {
3078  remoteslot = ExecCopySlot(remoteslot, remoteslot_part);
3079  slot_getallattrs(remoteslot);
3080  }
3081 
3082  /* Find the new partition. */
3084  partrelinfo_new = ExecFindPartition(mtstate, relinfo,
3085  proute, remoteslot,
3086  estate);
3087  MemoryContextSwitchTo(oldctx);
3088  Assert(partrelinfo_new != partrelinfo);
3089  partrel_new = partrelinfo_new->ri_RelationDesc;
3090 
3091  /* Check that new partition also has supported relkind. */
3092  CheckSubscriptionRelkind(partrel_new->rd_rel->relkind,
3094  RelationGetRelationName(partrel_new));
3095 
3096  /* DELETE old tuple found in the old partition. */
3097  apply_handle_delete_internal(edata, partrelinfo,
3098  localslot,
3099  part_entry->localindexoid);
3100 
3101  /* INSERT new tuple into the new partition. */
3102 
3103  /*
3104  * Convert the replacement tuple to match the destination
3105  * partition rowtype.
3106  */
3108  remoteslot_part = partrelinfo_new->ri_PartitionTupleSlot;
3109  if (remoteslot_part == NULL)
3110  remoteslot_part = table_slot_create(partrel_new,
3111  &estate->es_tupleTable);
3112  map = ExecGetRootToChildMap(partrelinfo_new, estate);
3113  if (map != NULL)
3114  {
3115  remoteslot_part = execute_attr_map_slot(map->attrMap,
3116  remoteslot,
3117  remoteslot_part);
3118  }
3119  else
3120  {
3121  remoteslot_part = ExecCopySlot(remoteslot_part,
3122  remoteslot);
3123  slot_getallattrs(remoteslot);
3124  }
3125  MemoryContextSwitchTo(oldctx);
3126  apply_handle_insert_internal(edata, partrelinfo_new,
3127  remoteslot_part);
3128  }
3129  }
3130  break;
3131 
3132  default:
3133  elog(ERROR, "unrecognized CmdType: %d", (int) operation);
3134  break;
3135  }
3136 }
3137 
3138 /*
3139  * Handle TRUNCATE message.
3140  *
3141  * TODO: FDW support
3142  */
3143 static void
3145 {
3146  bool cascade = false;
3147  bool restart_seqs = false;
3148  List *remote_relids = NIL;
3149  List *remote_rels = NIL;
3150  List *rels = NIL;
3151  List *part_rels = NIL;
3152  List *relids = NIL;
3153  List *relids_logged = NIL;
3154  ListCell *lc;
3155  LOCKMODE lockmode = AccessExclusiveLock;
3156 
3157  /*
3158  * Quick return if we are skipping data modification changes or handling
3159  * streamed transactions.
3160  */
3161  if (is_skipping_changes() ||
3163  return;
3164 
3166 
3167  remote_relids = logicalrep_read_truncate(s, &cascade, &restart_seqs);
3168 
3169  foreach(lc, remote_relids)
3170  {
3171  LogicalRepRelId relid = lfirst_oid(lc);
3172  LogicalRepRelMapEntry *rel;
3173 
3174  rel = logicalrep_rel_open(relid, lockmode);
3175  if (!should_apply_changes_for_rel(rel))
3176  {
3177  /*
3178  * The relation can't become interesting in the middle of the
3179  * transaction so it's safe to unlock it.
3180  */
3181  logicalrep_rel_close(rel, lockmode);
3182  continue;
3183  }
3184 
3185  remote_rels = lappend(remote_rels, rel);
3187  rels = lappend(rels, rel->localrel);
3188  relids = lappend_oid(relids, rel->localreloid);
3190  relids_logged = lappend_oid(relids_logged, rel->localreloid);
3191 
3192  /*
3193  * Truncate partitions if we got a message to truncate a partitioned
3194  * table.
3195  */
3196  if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
3197  {
3198  ListCell *child;
3199  List *children = find_all_inheritors(rel->localreloid,
3200  lockmode,
3201  NULL);
3202 
3203  foreach(child, children)
3204  {
3205  Oid childrelid = lfirst_oid(child);
3206  Relation childrel;
3207 
3208  if (list_member_oid(relids, childrelid))
3209  continue;
3210 
3211  /* find_all_inheritors already got lock */
3212  childrel = table_open(childrelid, NoLock);
3213 
3214  /*
3215  * Ignore temp tables of other backends. See similar code in
3216  * ExecuteTruncate().
3217  */
3218  if (RELATION_IS_OTHER_TEMP(childrel))
3219  {
3220  table_close(childrel, lockmode);
3221  continue;
3222  }
3223 
3225  rels = lappend(rels, childrel);
3226  part_rels = lappend(part_rels, childrel);
3227  relids = lappend_oid(relids, childrelid);
3228  /* Log this relation only if needed for logical decoding */
3229  if (RelationIsLogicallyLogged(childrel))
3230  relids_logged = lappend_oid(relids_logged, childrelid);
3231  }
3232  }
3233  }
3234 
3235  /*
3236  * Even if CASCADE was used on the upstream primary, we explicitly default
3237  * to replaying changes without further cascading. This might later be
3238  * made changeable with a user-specified option.
3239  *
3240  * MySubscription->runasowner tells us whether we want to execute
3241  * replication actions as the subscription owner; the last argument to
3242  * TruncateGuts tells it whether we want to switch to the table owner.
3243  * Those are exactly opposite conditions.
3244  */
3245  ExecuteTruncateGuts(rels,
3246  relids,
3247  relids_logged,
3248  DROP_RESTRICT,
3249  restart_seqs,
3251  foreach(lc, remote_rels)
3252  {
3253  LogicalRepRelMapEntry *rel = lfirst(lc);
3254 
3256  }
3257  foreach(lc, part_rels)
3258  {
3259  Relation rel = lfirst(lc);
3260 
3261  table_close(rel, NoLock);
3262  }
3263 
3265 }
3266 
3267 
3268 /*
3269  * Logical replication protocol message dispatcher.
3270  */
3271 void
3273 {
3275  LogicalRepMsgType saved_command;
3276 
3277  /*
3278  * Set the current command being applied. Since this function can be
3279  * called recursively when applying spooled changes, save the current
3280  * command.
3281  */
3282  saved_command = apply_error_callback_arg.command;
3284 
3285  switch (action)
3286  {
3287  case LOGICAL_REP_MSG_BEGIN:
3288  apply_handle_begin(s);
3289  break;
3290 
3293  break;
3294 
3297  break;
3298 
3301  break;
3302 
3305  break;
3306 
3309  break;
3310 
3313  break;
3314 
3315  case LOGICAL_REP_MSG_TYPE:
3316  apply_handle_type(s);
3317  break;
3318 
3321  break;
3322 
3324 
3325  /*
3326  * Logical replication does not use generic logical messages yet.
3327  * However, such messages could be used by other applications that
3328  * use this output plugin.
3329  */
3330  break;
3331 
3334  break;
3335 
3338  break;
3339 
3342  break;
3343 
3346  break;
3347 
3350  break;
3351 
3354  break;
3355 
3358  break;
3359 
3362  break;
3363 
3366  break;
3367 
3368  default:
3369  ereport(ERROR,
3370  (errcode(ERRCODE_PROTOCOL_VIOLATION),
3371  errmsg("invalid logical replication message type \"??? (%d)\"", action)));
3372  }
3373 
3374  /* Reset the current command */
3375  apply_error_callback_arg.command = saved_command;
3376 }
3377 
3378 /*
3379  * Figure out which write/flush positions to report to the walsender process.
3380  *
3381  * We can't simply report back the last LSN the walsender sent us because the
3382  * local transaction might not yet be flushed to disk locally. Instead we
3383  * build a list that associates local with remote LSNs for every commit. When
3384  * reporting back the flush position to the sender we iterate that list and
3385  * check which entries on it are already locally flushed. Those we can report
3386  * as having been flushed.
3387  *
3388  * *have_pending_txes is set to true if there are outstanding transactions
3389  * that still need to be flushed.
3390  */
3391 static void
3393  bool *have_pending_txes)
3394 {
3395  dlist_mutable_iter iter;
3396  XLogRecPtr local_flush = GetFlushRecPtr(NULL);
3397 
3399  *flush = InvalidXLogRecPtr;
3400 
3402  {
3403  FlushPosition *pos =
3404  dlist_container(FlushPosition, node, iter.cur);
3405 
3406  *write = pos->remote_end;
3407 
3408  if (pos->local_end <= local_flush)
3409  {
3410  *flush = pos->remote_end;
3411  dlist_delete(iter.cur);
3412  pfree(pos);
3413  }
3414  else
3415  {
3416  /*
3417  * Don't want to uselessly iterate over the rest of the list which
3418  * could potentially be long. Instead get the last element and
3419  * grab the write position from there.
3420  */
3421  pos = dlist_tail_element(FlushPosition, node,
3422  &lsn_mapping);
3423  *write = pos->remote_end;
3424  *have_pending_txes = true;
3425  return;
3426  }
3427  }
3428 
3429  *have_pending_txes = !dlist_is_empty(&lsn_mapping);
3430 }
3431 
3432 /*
3433  * Store current remote/local lsn pair in the tracking list.
3434  */
3435 void
3437 {
3438  FlushPosition *flushpos;
3439 
3440  /*
3441  * Skip for parallel apply workers, because the lsn_mapping is maintained
3442  * by the leader apply worker.
3443  */
3445  return;
3446 
3447  /* Need to do this in permanent context */
3449 
3450  /* Track commit lsn */
3451  flushpos = (FlushPosition *) palloc(sizeof(FlushPosition));
3452  flushpos->local_end = local_lsn;
3453  flushpos->remote_end = remote_lsn;
3454 
3455  dlist_push_tail(&lsn_mapping, &flushpos->node);
3457 }
3458 
3459 
3460 /* Update statistics of the worker. */
3461 static void
3462 UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply)
3463 {
3464  MyLogicalRepWorker->last_lsn = last_lsn;
3465  MyLogicalRepWorker->last_send_time = send_time;
3467  if (reply)
3468  {
3469  MyLogicalRepWorker->reply_lsn = last_lsn;
3470  MyLogicalRepWorker->reply_time = send_time;
3471  }
3472 }
3473 
3474 /*
3475  * Apply main loop.
3476  */
3477 static void
3479 {
3480  TimestampTz last_recv_timestamp = GetCurrentTimestamp();
3481  bool ping_sent = false;
3482  TimeLineID tli;
3483  ErrorContextCallback errcallback;
3484 
3485  /*
3486  * Init the ApplyMessageContext which we clean up after each replication
3487  * protocol message.
3488  */
3490  "ApplyMessageContext",
3492 
3493  /*
3494  * This memory context is used for per-stream data when the streaming mode
3495  * is enabled. This context is reset on each stream stop.
3496  */
3498  "LogicalStreamingContext",
3500 
3501  /* mark as idle, before starting to loop */
3503 
3504  /*
3505  * Push apply error context callback. Fields will be filled while applying
3506  * a change.
3507  */
3508  errcallback.callback = apply_error_callback;
3509  errcallback.previous = error_context_stack;
3510  error_context_stack = &errcallback;
3512 
3513  /* This outer loop iterates once per wait. */
3514  for (;;)
3515  {
3517  int rc;
3518  int len;
3519  char *buf = NULL;
3520  bool endofstream = false;
3521  long wait_time;
3522 
3524 
3526 
3528 
3529  if (len != 0)
3530  {
3531  /* Loop to process all available data (without blocking). */
3532  for (;;)
3533  {
3535 
3536  if (len == 0)
3537  {
3538  break;
3539  }
3540  else if (len < 0)
3541  {
3542  ereport(LOG,
3543  (errmsg("data stream from publisher has ended")));
3544  endofstream = true;
3545  break;
3546  }
3547  else
3548  {
3549  int c;
3550  StringInfoData s;
3551 
3552  if (ConfigReloadPending)
3553  {
3554  ConfigReloadPending = false;
3556  }
3557 
3558  /* Reset timeout. */
3559  last_recv_timestamp = GetCurrentTimestamp();
3560  ping_sent = false;
3561 
3562  /* Ensure we are reading the data into our memory context. */
3564 
3566 
3567  c = pq_getmsgbyte(&s);
3568 
3569  if (c == 'w')
3570  {
3571  XLogRecPtr start_lsn;
3572  XLogRecPtr end_lsn;
3573  TimestampTz send_time;
3574 
3575  start_lsn = pq_getmsgint64(&s);
3576  end_lsn = pq_getmsgint64(&s);
3577  send_time = pq_getmsgint64(&s);
3578 
3579  if (last_received < start_lsn)
3580  last_received = start_lsn;
3581 
3582  if (last_received < end_lsn)
3583  last_received = end_lsn;
3584 
3585  UpdateWorkerStats(last_received, send_time, false);
3586 
3587  apply_dispatch(&s);
3588  }
3589  else if (c == 'k')
3590  {
3591  XLogRecPtr end_lsn;
3593  bool reply_requested;
3594 
3595  end_lsn = pq_getmsgint64(&s);
3596  timestamp = pq_getmsgint64(&s);
3597  reply_requested = pq_getmsgbyte(&s);
3598 
3599  if (last_received < end_lsn)
3600  last_received = end_lsn;
3601 
3602  send_feedback(last_received, reply_requested, false);
3603  UpdateWorkerStats(last_received, timestamp, true);
3604  }
3605  /* other message types are purposefully ignored */
3606 
3608  }
3609 
3611  }
3612  }
3613 
3614  /* confirm all writes so far */
3615  send_feedback(last_received, false, false);
3616 
3618  {
3619  /*
3620  * If we didn't get any transactions for a while there might be
3621  * unconsumed invalidation messages in the queue, consume them
3622  * now.
3623  */
3626 
3627  /* Process any table synchronization changes. */
3628  process_syncing_tables(last_received);
3629  }
3630 
3631  /* Cleanup the memory. */
3634 
3635  /* Check if we need to exit the streaming loop. */
3636  if (endofstream)
3637  break;
3638 
3639  /*
3640  * Wait for more data or latch. If we have unflushed transactions,
3641  * wake up after WalWriterDelay to see if they've been flushed yet (in
3642  * which case we should send a feedback message). Otherwise, there's
3643  * no particular urgency about waking up unless we get data or a
3644  * signal.
3645  */
3646  if (!dlist_is_empty(&lsn_mapping))
3647  wait_time = WalWriterDelay;
3648  else
3649  wait_time = NAPTIME_PER_CYCLE;
3650 
3654  fd, wait_time,
3655  WAIT_EVENT_LOGICAL_APPLY_MAIN);
3656 
3657  if (rc & WL_LATCH_SET)
3658  {
3661  }
3662 
3663  if (ConfigReloadPending)
3664  {
3665  ConfigReloadPending = false;
3667  }
3668 
3669  if (rc & WL_TIMEOUT)
3670  {
3671  /*
3672  * We didn't receive anything new. If we haven't heard anything
3673  * from the server for more than wal_receiver_timeout / 2, ping
3674  * the server. Also, if it's been longer than
3675  * wal_receiver_status_interval since the last update we sent,
3676  * send a status update to the primary anyway, to report any
3677  * progress in applying WAL.
3678  */
3679  bool requestReply = false;
3680 
3681  /*
3682  * Check if time since last receive from primary has reached the
3683  * configured limit.
3684  */
3685  if (wal_receiver_timeout > 0)
3686  {
3688  TimestampTz timeout;
3689 
3690  timeout =
3691  TimestampTzPlusMilliseconds(last_recv_timestamp,
3693 
3694  if (now >= timeout)
3695  ereport(ERROR,
3696  (errcode(ERRCODE_CONNECTION_FAILURE),
3697  errmsg("terminating logical replication worker due to timeout")));
3698 
3699  /* Check to see if it's time for a ping. */
3700  if (!ping_sent)
3701  {
3702  timeout = TimestampTzPlusMilliseconds(last_recv_timestamp,
3703  (wal_receiver_timeout / 2));
3704  if (now >= timeout)
3705  {
3706  requestReply = true;
3707  ping_sent = true;
3708  }
3709  }
3710  }
3711 
3712  send_feedback(last_received, requestReply, requestReply);
3713 
3714  /*
3715  * Force reporting to ensure long idle periods don't lead to
3716  * arbitrarily delayed stats. Stats can only be reported outside
3717  * of (implicit or explicit) transactions. That shouldn't lead to
3718  * stats being delayed for long, because transactions are either
3719  * sent as a whole on commit or streamed. Streamed transactions
3720  * are spilled to disk and applied on commit.
3721  */
3722  if (!IsTransactionState())
3723  pgstat_report_stat(true);
3724  }
3725  }
3726 
3727  /* Pop the error context stack */
3728  error_context_stack = errcallback.previous;
3730 
3731  /* All done */
3733 }
3734 
3735 /*
3736  * Send a Standby Status Update message to server.
3737  *
3738  * 'recvpos' is the latest LSN we've received data to; 'force' is set if we
3739  * need to send a response to avoid timeouts.
3740  */
3741 static void
3742 send_feedback(XLogRecPtr recvpos, bool force, bool requestReply)
3743 {
3744  static StringInfo reply_message = NULL;
3745  static TimestampTz send_time = 0;
3746 
3747  static XLogRecPtr last_recvpos = InvalidXLogRecPtr;
3748  static XLogRecPtr last_writepos = InvalidXLogRecPtr;
3749  static XLogRecPtr last_flushpos = InvalidXLogRecPtr;
3750 
3751  XLogRecPtr writepos;
3752  XLogRecPtr flushpos;
3753  TimestampTz now;
3754  bool have_pending_txes;
3755 
3756  /*
3757  * If the user doesn't want status to be reported to the publisher, be
3758  * sure to exit before doing anything at all.
3759  */
3760  if (!force && wal_receiver_status_interval <= 0)
3761  return;
3762 
3763  /* It's legal to not pass a recvpos */
3764  if (recvpos < last_recvpos)
3765  recvpos = last_recvpos;
3766 
3767  get_flush_position(&writepos, &flushpos, &have_pending_txes);
3768 
3769  /*
3770  * No outstanding transactions to flush, we can report the latest received
3771  * position. This is important for synchronous replication.
3772  */
3773  if (!have_pending_txes)
3774  flushpos = writepos = recvpos;
3775 
3776  if (writepos < last_writepos)
3777  writepos = last_writepos;
3778 
3779  if (flushpos < last_flushpos)
3780  flushpos = last_flushpos;
3781 
3783 
3784  /* if we've already reported everything we're good */
3785  if (!force &&
3786  writepos == last_writepos &&
3787  flushpos == last_flushpos &&
3788  !TimestampDifferenceExceeds(send_time, now,
3790  return;
3791  send_time = now;
3792 
3793  if (!reply_message)
3794  {
3796 
3798  MemoryContextSwitchTo(oldctx);
3799  }
3800  else
3802 
3803  pq_sendbyte(reply_message, 'r');
3804  pq_sendint64(reply_message, recvpos); /* write */
3805  pq_sendint64(reply_message, flushpos); /* flush */
3806  pq_sendint64(reply_message, writepos); /* apply */
3807  pq_sendint64(reply_message, now); /* sendTime */
3808  pq_sendbyte(reply_message, requestReply); /* replyRequested */
3809 
3810  elog(DEBUG2, "sending feedback (force %d) to recv %X/%X, write %X/%X, flush %X/%X",
3811  force,
3812  LSN_FORMAT_ARGS(recvpos),
3813  LSN_FORMAT_ARGS(writepos),
3814  LSN_FORMAT_ARGS(flushpos));
3815 
3818 
3819  if (recvpos > last_recvpos)
3820  last_recvpos = recvpos;
3821  if (writepos > last_writepos)
3822  last_writepos = writepos;
3823  if (flushpos > last_flushpos)
3824  last_flushpos = flushpos;
3825 }
3826 
3827 /*
3828  * Exit routine for apply workers due to subscription parameter changes.
3829  */
3830 static void
3832 {
3834  {
3835  /*
3836  * Don't stop the parallel apply worker as the leader will detect the
3837  * subscription parameter change and restart logical replication later
3838  * anyway. This also prevents the leader from reporting errors when
3839  * trying to communicate with a stopped parallel apply worker, which
3840  * would accidentally disable subscriptions if disable_on_error was
3841  * set.
3842  */
3843  return;
3844  }
3845 
3846  /*
3847  * Reset the last-start time for this apply worker so that the launcher
3848  * will restart it without waiting for wal_retrieve_retry_interval if the
3849  * subscription is still active, and so that we won't leak that hash table
3850  * entry if it isn't.
3851  */
3852  if (am_leader_apply_worker())
3854 
3855  proc_exit(0);
3856 }
3857 
3858 /*
3859  * Reread subscription info if needed. Most changes will cause the worker to exit.
3860  */
3861 void
3863 {
3864  MemoryContext oldctx;
3866  bool started_tx = false;
3867 
3868  /* When cache state is valid there is nothing to do here. */
3869  if (MySubscriptionValid)
3870  return;
3871 
3872  /* This function might be called inside or outside of transaction. */
3873  if (!IsTransactionState())
3874  {
3876  started_tx = true;
3877  }
3878 
3879  /* Ensure allocations in permanent context. */
3881 
3883 
3884  /*
3885  * Exit if the subscription was removed. This normally should not happen
3886  * as the worker gets killed during DROP SUBSCRIPTION.
3887  */
3888  if (!newsub)
3889  {
3890  ereport(LOG,
3891  (errmsg("logical replication worker for subscription \"%s\" will stop because the subscription was removed",
3892  MySubscription->name)));
3893 
3894  /* Ensure we remove no-longer-useful entry for worker's start time */
3895  if (am_leader_apply_worker())
3897 
3898  proc_exit(0);
3899  }
3900 
3901  /* Exit if the subscription was disabled. */
3902  if (!newsub->enabled)
3903  {
3904  ereport(LOG,
3905  (errmsg("logical replication worker for subscription \"%s\" will stop because the subscription was disabled",
3906  MySubscription->name)));
3907 
3909  }
3910 
3911  /* !slotname should never happen when enabled is true. */
3912  Assert(newsub->slotname);
3913 
3914  /* two-phase should not be altered */
3915  Assert(newsub->twophasestate == MySubscription->twophasestate);
3916 
3917  /*
3918  * Exit if any parameter that affects the remote connection was changed.
3919  * The launcher will start a new worker but note that the parallel apply
3920  * worker won't restart if the streaming option's value is changed from
3921  * 'parallel' to any other value or the server decides not to stream the
3922  * in-progress transaction.
3923  */
3924  if (strcmp(newsub->conninfo, MySubscription->conninfo) != 0 ||
3925  strcmp(newsub->name, MySubscription->name) != 0 ||
3926  strcmp(newsub->slotname, MySubscription->slotname) != 0 ||
3927  newsub->binary != MySubscription->binary ||
3928  newsub->stream != MySubscription->stream ||
3929  newsub->passwordrequired != MySubscription->passwordrequired ||
3930  strcmp(newsub->origin, MySubscription->origin) != 0 ||
3931  newsub->owner != MySubscription->owner ||
3932  !equal(newsub->publications, MySubscription->publications))
3933  {
3935  ereport(LOG,
3936  (errmsg("logical replication parallel apply worker for subscription \"%s\" will stop because of a parameter change",
3937  MySubscription->name)));
3938  else
3939  ereport(LOG,
3940  (errmsg("logical replication worker for subscription \"%s\" will restart because of a parameter change",
3941  MySubscription->name)));
3942 
3944  }
3945 
3946  /*
3947  * Exit if the subscription owner's superuser privileges have been
3948  * revoked.
3949  */
3950  if (!newsub->ownersuperuser && MySubscription->ownersuperuser)
3951  {
3953  ereport(LOG,
3954  errmsg("logical replication parallel apply worker for subscription \"%s\" will stop because the subscription owner's superuser privileges have been revoked",
3955  MySubscription->name));
3956  else
3957  ereport(LOG,
3958  errmsg("logical replication worker for subscription \"%s\" will restart because the subscription owner's superuser privileges have been revoked",
3959  MySubscription->name));
3960 
3962  }
3963 
3964  /* Check for other changes that should never happen too. */
3965  if (newsub->dbid != MySubscription->dbid)
3966  {
3967  elog(ERROR, "subscription %u changed unexpectedly",
3969  }
3970 
3971  /* Clean old subscription info and switch to new one. */
3974 
3975  MemoryContextSwitchTo(oldctx);
3976 
3977  /* Change synchronous commit according to the user's wishes */
3978  SetConfigOption("synchronous_commit", MySubscription->synccommit,
3980 
3981  if (started_tx)
3983 
3984  MySubscriptionValid = true;
3985 }
3986 
3987 /*
3988  * Callback from subscription syscache invalidation.
3989  */
3990 static void
3991 subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue)
3992 {
3993  MySubscriptionValid = false;
3994 }
3995 
3996 /*
3997  * subxact_info_write
3998  * Store information about subxacts for a toplevel transaction.
3999  *
4000  * For each subxact we store offset of its first change in the main file.
4001  * The file is always over-written as a whole.
4002  *
4003  * XXX We should only store subxacts that were not aborted yet.
4004  */
4005 static void
4007 {
4008  char path[MAXPGPATH];
4009  Size len;
4010  BufFile *fd;
4011 
4013 
4014  /* construct the subxact filename */
4015  subxact_filename(path, subid, xid);
4016 
4017  /* Delete the subxacts file, if exists. */
4018  if (subxact_data.nsubxacts == 0)
4019  {
4022 
4023  return;
4024  }
4025 
4026  /*
4027  * Create the subxact file if it is not already created, otherwise open the
4028  * existing file.
4029  */
4031  true);
4032  if (fd == NULL)
4034 
4035  len = sizeof(SubXactInfo) * subxact_data.nsubxacts;
4036 
4037  /* Write the subxact count and subxact info */
4040 
4041  BufFileClose(fd);
4042 
4043  /* free the memory allocated for subxact info */
4045 }
4046 
4047 /*
4048  * subxact_info_read
4049  * Restore information about subxacts of a streamed transaction.
4050  *
4051  * Read information about subxacts into the structure subxact_data that can be
4052  * used later.
4053  */
4054 static void
4056 {
4057  char path[MAXPGPATH];
4058  Size len;
4059  BufFile *fd;
4060  MemoryContext oldctx;
4061 
4065 
4066  /*
4067  * If the subxact file doesn't exist that means we don't have any subxact
4068  * info.
4069  */
4070  subxact_filename(path, subid, xid);
4072  true);
4073  if (fd == NULL)
4074  return;
4075 
4076  /* read number of subxact items */
4078 
4079  len = sizeof(SubXactInfo) * subxact_data.nsubxacts;
4080 
4081  /* we keep the maximum as a power of 2 */
4083 
4084  /*
4085  * Allocate subxact information in the logical streaming context. We need
4086  * this information during the complete stream so that we can add the sub
4087  * transaction info to this. On stream stop we will flush this information
4088  * to the subxact file and reset the logical streaming context.
4089  */
4092  sizeof(SubXactInfo));
4093  MemoryContextSwitchTo(oldctx);
4094 
4095  if (len > 0)
4097 
4098  BufFileClose(fd);
4099 }
4100 
4101 /*
4102  * subxact_info_add
4103  * Add information about a subxact (offset in the main file).
4104  */
4105 static void
4107 {
4108  SubXactInfo *subxacts = subxact_data.subxacts;
4109  int64 i;
4110 
4111  /* We must have a valid top level stream xid and a stream fd. */
4113  Assert(stream_fd != NULL);
4114 
4115  /*
4116  * If the XID matches the toplevel transaction, we don't want to add it.
4117  */
4118  if (stream_xid == xid)
4119  return;
4120 
4121  /*
4122  * In most cases we're checking the same subxact as we've already seen in
4123  * the last call, so make sure to ignore it (this change comes later).
4124  */
4125  if (subxact_data.subxact_last == xid)
4126  return;
4127 
4128  /* OK, remember we're processing this XID. */
4129  subxact_data.subxact_last = xid;
4130 
4131  /*
4132  * Check if the transaction is already present in the array of subxact. We
4133  * intentionally scan the array from the tail, because we're likely adding
4134  * a change for the most recent subtransactions.
4135  *
4136  * XXX Can we rely on the subxact XIDs arriving in sorted order? That
4137  * would allow us to use binary search here.
4138  */
4139  for (i = subxact_data.nsubxacts; i > 0; i--)
4140  {
4141  /* found, so we're done */
4142  if (subxacts[i - 1].xid == xid)
4143  return;
4144  }
4145 
4146  /* This is a new subxact, so we need to add it to the array. */
4147  if (subxact_data.nsubxacts == 0)
4148  {
4149  MemoryContext oldctx;
4150 
4152 
4153  /*
4154  * Allocate this memory for subxacts in per-stream context, see
4155  * subxact_info_read.
4156  */
4158  subxacts = palloc(subxact_data.nsubxacts_max * sizeof(SubXactInfo));
4159  MemoryContextSwitchTo(oldctx);
4160  }
4162  {
4164  subxacts = repalloc(subxacts,
4166  }
4167 
4168  subxacts[subxact_data.nsubxacts].xid = xid;
4169 
4170  /*
4171  * Get the current offset of the stream file and store it as offset of
4172  * this subxact.
4173  */
4175  &subxacts[subxact_data.nsubxacts].fileno,
4176  &subxacts[subxact_data.nsubxacts].offset);
4177 
4179  subxact_data.subxacts = subxacts;
4180 }
4181 
4182 /* format filename for file containing the info about subxacts */
4183 static inline void
4184 subxact_filename(char *path, Oid subid, TransactionId xid)
4185 {
4186  snprintf(path, MAXPGPATH, "%u-%u.subxacts", subid, xid);
4187 }
4188 
4189 /* format filename for file containing serialized changes */
4190 static inline void
4191 changes_filename(char *path, Oid subid, TransactionId xid)
4192 {
4193  snprintf(path, MAXPGPATH, "%u-%u.changes", subid, xid);
4194 }
4195 
4196 /*
4197  * stream_cleanup_files
4198  * Cleanup files for a subscription / toplevel transaction.
4199  *
4200  * Remove files with serialized changes and subxact info for a particular
4201  * toplevel transaction. Each subscription has a separate set of files
4202  * for any toplevel transaction.
4203  */
4204 void
4206 {
4207  char path[MAXPGPATH];
4208 
4209  /* Delete the changes file. */
4210  changes_filename(path, subid, xid);
4212 
4213  /* Delete the subxact file, if it exists. */
4214  subxact_filename(path, subid, xid);
4216 }
4217 
4218 /*
4219  * stream_open_file
4220  * Open a file that we'll use to serialize changes for a toplevel
4221  * transaction.
4222  *
4223  * Open a file for streamed changes from a toplevel transaction identified
4224  * by stream_xid (global variable). If it's the first chunk of streamed
4225  * changes for this transaction, create the buffile, otherwise open the
4226  * previously created file.
4227  */
4228 static void
4229 stream_open_file(Oid subid, TransactionId xid, bool first_segment)
4230 {
4231  char path[MAXPGPATH];
4232  MemoryContext oldcxt;
4233 
4234  Assert(OidIsValid(subid));
4236  Assert(stream_fd == NULL);
4237 
4238 
4239  changes_filename(path, subid, xid);
4240  elog(DEBUG1, "opening file \"%s\" for streamed changes", path);
4241 
4242  /*
4243  * Create/open the buffiles under the logical streaming context so that we
4244  * have those files until stream stop.
4245  */
4247 
4248  /*
4249  * If this is the first streamed segment, create the changes file.
4250  * Otherwise, just open the file for writing, in append mode.
4251  */
4252  if (first_segment)
4254  path);
4255  else
4256  {
4257  /*
4258  * Open the file and seek to the end of the file because we always
4259  * append the changes file.
4260  */
4262  path, O_RDWR, false);
4263  BufFileSeek(stream_fd, 0, 0, SEEK_END);
4264  }
4265 
4266  MemoryContextSwitchTo(oldcxt);
4267 }
4268 
4269 /*
4270  * stream_close_file
4271  * Close the currently open file with streamed changes.
4272  */
4273 static void
4275 {
4276  Assert(stream_fd != NULL);
4277 
4279 
4280  stream_fd = NULL;
4281 }
4282 
4283 /*
4284  * stream_write_change
4285  * Serialize a change to a file for the current toplevel transaction.
4286  *
4287  * The change is serialized in a simple format, with length (not including
4288  * the length), action code (identifying the message type) and message
4289  * contents (without the subxact TransactionId value).
4290  */
4291 static void
4293 {
4294  int len;
4295 
4296  Assert(stream_fd != NULL);
4297 
4298  /* total on-disk size, including the action type character */
4299  len = (s->len - s->cursor) + sizeof(char);
4300 
4301  /* first write the size */
4302  BufFileWrite(stream_fd, &len, sizeof(len));
4303 
4304  /* then the action */
4305  BufFileWrite(stream_fd, &action, sizeof(action));
4306 
4307  /* and finally the remaining part of the buffer (after the XID) */
4308  len = (s->len - s->cursor);
4309 
4310  BufFileWrite(stream_fd, &s->data[s->cursor], len);
4311 }
4312 
4313 /*
4314  * stream_open_and_write_change
4315  * Serialize a message to a file for the given transaction.
4316  *
4317  * This function is similar to stream_write_change except that it will open the
4318  * target file if it is not already open before writing the message and close the file at
4319  * the end.
4320  */
4321 static void
4323 {
4325 
4326  if (!stream_fd)
4327  stream_start_internal(xid, false);
4328 
4330  stream_stop_internal(xid);
4331 }
4332 
4333 /*
4334  * Sets streaming options including replication slot name and origin start
4335  * position. Workers need these options for logical replication.
4336  */
4337 void
4339  char *slotname,
4340  XLogRecPtr *origin_startpos)
4341 {
4342  int server_version;
4343 
4344  options->logical = true;
4345  options->startpoint = *origin_startpos;
4346  options->slotname = slotname;
4347 
4349  options->proto.logical.proto_version =
4354 
4355  options->proto.logical.publication_names = MySubscription->publications;
4356  options->proto.logical.binary = MySubscription->binary;
4357 
4358  /*
4359  * Assign the appropriate option value for streaming option according to
4360  * the 'streaming' mode and the publisher's ability to support that mode.
4361  */
4362  if (server_version >= 160000 &&
4364  {
4365  options->proto.logical.streaming_str = "parallel";
4367  }
4368  else if (server_version >= 140000 &&
4370  {
4371  options->proto.logical.streaming_str = "on";
4373  }
4374  else
4375  {
4376  options->proto.logical.streaming_str = NULL;
4378  }
4379 
4380  options->proto.logical.twophase = false;
4381  options->proto.logical.origin = pstrdup(MySubscription->origin);
4382 }
4383 
4384 /*
4385  * Cleanup the memory for subxacts and reset the related variables.
4386  */
4387 static inline void
4389 {
4390  if (subxact_data.subxacts)
4392 
4393  subxact_data.subxacts = NULL;
4395  subxact_data.nsubxacts = 0;
4397 }
4398 
4399 /*
4400  * Form the prepared transaction GID for two_phase transactions.
4401  *
4402  * Return the GID in the supplied buffer.
4403  */
4404 static void
4405 TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid, int szgid)
4406 {
4407  Assert(subid != InvalidRepOriginId);
4408 
4409  if (!TransactionIdIsValid(xid))
4410  ereport(ERROR,
4411  (errcode(ERRCODE_PROTOCOL_VIOLATION),
4412  errmsg_internal("invalid two-phase transaction ID")));
4413 
4414  snprintf(gid, szgid, "pg_gid_%u_%u", subid, xid);
4415 }
4416 
4417 /*
4418  * Common function to run the apply loop with error handling. Disable the
4419  * subscription, if necessary.
4420  *
4421  * Note that we don't handle FATAL errors, which are probably caused by
4422  * system resource errors and are not repeatable.
4423  */
4424 void
4425 start_apply(XLogRecPtr origin_startpos)
4426 {
4427  PG_TRY();
4428  {
4429  LogicalRepApplyLoop(origin_startpos);
4430  }
4431  PG_CATCH();
4432  {
4435  else
4436  {
4437  /*
4438  * Report the worker failed while applying changes. Abort the
4439  * current transaction so that the stats message is sent in an
4440  * idle state.
4441  */
4444 
4445  PG_RE_THROW();
4446  }
4447  }
4448  PG_END_TRY();
4449 }
4450 
4451 /*
4452  * Runs the leader apply worker.
4453  *
4454  * It sets up replication origin, streaming options and then starts streaming.
4455  */
4456 static void
4458 {
4459  char originname[NAMEDATALEN];
4460  XLogRecPtr origin_startpos = InvalidXLogRecPtr;
4461  char *slotname = NULL;
4463  RepOriginId originid;
4464  TimeLineID startpointTLI;
4465  char *err;
4466  bool must_use_password;
4467 
4468  slotname = MySubscription->slotname;
4469 
4470  /*
4471  * This shouldn't happen if the subscription is enabled, but guard against
4472  * DDL bugs or manual catalog changes. (libpqwalreceiver will crash if
4473  * slot is NULL.)
4474  */
4475  if (!slotname)
4476  ereport(ERROR,
4477  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4478  errmsg("subscription has no replication slot set")));
4479 
4480  /* Setup replication origin tracking. */
4482  originname, sizeof(originname));
4484  originid = replorigin_by_name(originname, true);
4485  if (!OidIsValid(originid))
4486  originid = replorigin_create(originname);
4487  replorigin_session_setup(originid, 0);
4488  replorigin_session_origin = originid;
4489  origin_startpos = replorigin_session_get_progress(false);
4491 
4492  /* Is the use of a password mandatory? */
4493  must_use_password = MySubscription->passwordrequired &&
4495 
4497  true, must_use_password,
4498  MySubscription->name, &err);
4499 
4500  if (LogRepWorkerWalRcvConn == NULL)
4501  ereport(ERROR,
4502  (errcode(ERRCODE_CONNECTION_FAILURE),
4503  errmsg("could not connect to the publisher: %s", err)));
4504 
4505  /*
4506  * We don't really use the output identify_system for anything but it does
4507  * some initializations on the upstream so let's still call it.
4508  */
4509  (void) walrcv_identify_system(LogRepWorkerWalRcvConn, &startpointTLI);
4510 
4511  set_apply_error_context_origin(originname);
4512 
4513  set_stream_options(&options, slotname, &origin_startpos);
4514 
4515  /*
4516  * Even when the two_phase mode is requested by the user, it remains as
4517  * the tri-state PENDING until all tablesyncs have reached READY state.
4518  * Only then, can it become ENABLED.
4519  *
4520  * Note: If the subscription has no tables then leave the state as
4521  * PENDING, which allows ALTER SUBSCRIPTION ... REFRESH PUBLICATION to
4522  * work.
4523  */
4526  {
4527  /* Start streaming with two_phase enabled */
4528  options.proto.logical.twophase = true;
4530 
4535  }
4536  else
4537  {
4539  }
4540 
4541  ereport(DEBUG1,
4542  (errmsg_internal("logical replication apply worker for subscription \"%s\" two_phase is %s",
4547  "?")));
4548 
4549  /* Run the main loop. */
4550  start_apply(origin_startpos);
4551 }
4552 
4553 /*
4554  * Common initialization for leader apply worker, parallel apply worker and
4555  * tablesync worker.
4556  *
4557  * Initialize the database connection, in-memory subscription and necessary
4558  * config options.
4559  */
4560 void
4562 {
4563  MemoryContext oldctx;
4564 
4565  /* Run as replica session replication role. */
4566  SetConfigOption("session_replication_role", "replica",
4568 
4569  /* Connect to our database. */
4572  0);
4573 
4574  /*
4575  * Set always-secure search path, so malicious users can't redirect user
4576  * code (e.g. pg_index.indexprs).
4577  */
4578  SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
4579 
4580  /* Load the subscription into persistent memory context. */
4582  "ApplyContext",
4586 
4588  if (!MySubscription)
4589  {
4590  ereport(LOG,
4591  (errmsg("logical replication worker for subscription %u will not start because the subscription was removed during startup",
4593 
4594  /* Ensure we remove no-longer-useful entry for worker's start time */
4595  if (am_leader_apply_worker())
4597 
4598  proc_exit(0);
4599  }
4600 
4601  MySubscriptionValid = true;
4602  MemoryContextSwitchTo(oldctx);
4603 
4604  if (!MySubscription->enabled)
4605  {
4606  ereport(LOG,
4607  (errmsg("logical replication worker for subscription \"%s\" will not start because the subscription was disabled during startup",
4608  MySubscription->name)));
4609 
4611  }
4612 
4613  /* Setup synchronous commit according to the user's wishes */
4614  SetConfigOption("synchronous_commit", MySubscription->synccommit,
4616 
4617  /*
4618  * Keep us informed about subscription or role changes. Note that the
4619  * role's superuser privilege can be revoked.
4620  */
4621  CacheRegisterSyscacheCallback(SUBSCRIPTIONOID,
4623  (Datum) 0);
4624 
4627  (Datum) 0);
4628 
4629  if (am_tablesync_worker())
4630  ereport(LOG,
4631  (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has started",
4634  else
4635  ereport(LOG,
4636  (errmsg("logical replication apply worker for subscription \"%s\" has started",
4637  MySubscription->name)));
4638 
4640 }
4641 
4642 /* Common function to setup the leader apply or tablesync worker. */
4643 void
4644 SetupApplyOrSyncWorker(int worker_slot)
4645 {
4646  /* Attach to slot */
4647  logicalrep_worker_attach(worker_slot);
4648 
4650 
4651  /* Setup signal handling */
4653  pqsignal(SIGTERM, die);
4655 
4656  /*
4657  * We don't currently need any ResourceOwner in a walreceiver process, but
4658  * if we did, we could call CreateAuxProcessResourceOwner here.
4659  */
4660 
4661  /* Initialise stats to a sanish value */
4664 
4665  /* Load the libpq-specific functions */
4666  load_file("libpqwalreceiver", false);
4667 
4669 
4670  /* Connect to the origin and start the replication. */
4671  elog(DEBUG1, "connecting to publisher using connection string \"%s\"",
4673 
4674  /*
4675  * Setup callback for syscache so that we know when something changes in
4676  * the subscription relation state.
4677  */
4678  CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP,
4680  (Datum) 0);
4681 }
4682 
4683 /* Logical Replication Apply worker entry point */
4684 void
4686 {
4687  int worker_slot = DatumGetInt32(main_arg);
4688 
4689  InitializingApplyWorker = true;
4690 
4691  SetupApplyOrSyncWorker(worker_slot);
4692 
4693  InitializingApplyWorker = false;
4694 
4695  run_apply_worker();
4696 
4697  proc_exit(0);
4698 }
4699 
4700 /*
4701  * After error recovery, disable the subscription in a new transaction
4702  * and exit cleanly.
4703  */
4704 void
4706 {
4707  /*
4708  * Emit the error message, and recover from the error state to an idle
4709  * state
4710  */
4711  HOLD_INTERRUPTS();
4712 
4713  EmitErrorReport();
4715  FlushErrorState();
4716 
4718 
4719  /* Report the worker failed during either table synchronization or apply */
4721  !am_tablesync_worker());
4722 
4723  /* Disable the subscription */
4727 
4728  /* Ensure we remove no-longer-useful entry for worker's start time */
4729  if (am_leader_apply_worker())
4731 
4732  /* Notify the subscription has been disabled and exit */
4733  ereport(LOG,
4734  errmsg("subscription \"%s\" has been disabled because of an error",
4735  MySubscription->name));
4736 
4737  proc_exit(0);
4738 }
4739 
4740 /*
4741  * Is the current process a logical replication worker?
4742  */
4743 bool
4745 {
4746  return MyLogicalRepWorker != NULL;
4747 }
4748 
4749 /*
4750  * Is the current process a logical replication parallel apply worker?
4751  */
4752 bool
4754 {
4756 }
4757 
4758 /*
4759  * Start skipping changes of the transaction if the given LSN matches the
4760  * LSN specified by subscription's skiplsn.
4761  */
4762 static void
4764 {
4768 
4769  /*
4770  * Quick return if it's not requested to skip this transaction. This
4771  * function is called for every remote transaction and we assume that
4772  * skipping the transaction is not used often.
4773  */
4775  MySubscription->skiplsn != finish_lsn))
4776  return;
4777 
4778  /* Start skipping all changes of this transaction */
4779  skip_xact_finish_lsn = finish_lsn;
4780 
4781  ereport(LOG,
4782  errmsg("logical replication starts skipping transaction at LSN %X/%X",
4784 }
4785 
4786 /*
4787  * Stop skipping changes by resetting skip_xact_finish_lsn if enabled.
4788  */
4789 static void
4791 {
4792  if (!is_skipping_changes())
4793  return;
4794 
4795  ereport(LOG,
4796  (errmsg("logical replication completed skipping transaction at LSN %X/%X",
4798 
4799  /* Stop skipping changes */
4801 }
4802 
4803 /*
4804  * Clear subskiplsn of pg_subscription catalog.
4805  *
4806  * finish_lsn is the finish LSN of the transaction; it is compared against
4807  * the subskiplsn being cleared. If the two do not match, we raise a warning
4808  * while clearing the subskiplsn, so that the user notices cases such as
4809  * having specified the wrong subskiplsn by mistake.
4810  */
4811 static void
4813 {
4814  Relation rel;
4815  Form_pg_subscription subform;
4816  HeapTuple tup;
4817  XLogRecPtr myskiplsn = MySubscription->skiplsn;
4818  bool started_tx = false;
4819 
4821  return;
4822 
4823  if (!IsTransactionState())
4824  {
4826  started_tx = true;
4827  }
4828 
4829  /*
4830  * Protect subskiplsn of pg_subscription from being concurrently updated
4831  * while clearing it.
4832  */
4833  LockSharedObject(SubscriptionRelationId, MySubscription->oid, 0,
4834  AccessShareLock);
4835 
4836  rel = table_open(SubscriptionRelationId, RowExclusiveLock);
4837 
4838  /* Fetch the existing tuple. */
4839  tup = SearchSysCacheCopy1(SUBSCRIPTIONOID,
4841 
4842  if (!HeapTupleIsValid(tup))
4843  elog(ERROR, "subscription \"%s\" does not exist", MySubscription->name);
4844 
4845  subform = (Form_pg_subscription) GETSTRUCT(tup);
4846 
4847  /*
4848  * Clear the subskiplsn. If the user has already changed subskiplsn before
4849  * clearing it we don't update the catalog and the replication origin
4850  * state won't get advanced. So in the worst case, if the server crashes
4851  * before sending an acknowledgment of the flush position the transaction
4852  * will be sent again and the user needs to set subskiplsn again. We can
4853  * reduce the possibility by logging a replication origin WAL record to
4854  * advance the origin LSN instead but there is no way to advance the
4855  * origin timestamp and it doesn't seem to be worth doing anything about
4856  * it since it's a very rare case.
4857  */
4858  if (subform->subskiplsn == myskiplsn)
4859  {
4860  bool nulls[Natts_pg_subscription];
4861  bool replaces[Natts_pg_subscription];
4862  Datum values[Natts_pg_subscription];
4863 
4864  memset(values, 0, sizeof(values));
4865  memset(nulls, false, sizeof(nulls));
4866  memset(replaces, false, sizeof(replaces));
4867 
4868  /* reset subskiplsn */
4869  values[Anum_pg_subscription_subskiplsn - 1] = LSNGetDatum(InvalidXLogRecPtr);
4870  replaces[Anum_pg_subscription_subskiplsn - 1] = true;
4871 
4872  tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls,
4873  replaces);
4874  CatalogTupleUpdate(rel, &tup->t_self, tup);
4875 
4876  if (myskiplsn != finish_lsn)
4877  ereport(WARNING,
4878  errmsg("skip-LSN of subscription \"%s\" cleared", MySubscription->name),
4879  errdetail("Remote transaction's finish WAL location (LSN) %X/%X did not match skip-LSN %X/%X.",
4880  LSN_FORMAT_ARGS(finish_lsn),
4881  LSN_FORMAT_ARGS(myskiplsn)));
4882  }
4883 
4884  heap_freetuple(tup);
4885  table_close(rel, NoLock);
4886 
4887  if (started_tx)
4889 }
4890 
4891 /* Error callback to give more context info about the change being applied */
4892 void
4894 {
4896 
4898  return;
4899 
4900  Assert(errarg->origin_name);
4901 
4902  if (errarg->rel == NULL)
4903  {
4904  if (!TransactionIdIsValid(errarg->remote_xid))
4905  errcontext("processing remote data for replication origin \"%s\" during message type \"%s\"",
4906  errarg->origin_name,
4907  logicalrep_message_type(errarg->command));
4908  else if (XLogRecPtrIsInvalid(errarg->finish_lsn))
4909  errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u",
4910  errarg->origin_name,
4912  errarg->remote_xid);
4913  else
4914  errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%X",
4915  errarg->origin_name,
4917  errarg->remote_xid,
4918  LSN_FORMAT_ARGS(errarg->finish_lsn));
4919  }
4920  else
4921  {
4922  if (errarg->remote_attnum < 0)
4923  {
4924  if (XLogRecPtrIsInvalid(errarg->finish_lsn))
4925  errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u",
4926  errarg->origin_name,
4928  errarg->rel->remoterel.nspname,
4929  errarg->rel->remoterel.relname,
4930  errarg->remote_xid);
4931  else
4932  errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%X",
4933  errarg->origin_name,
4935  errarg->rel->remoterel.nspname,
4936  errarg->rel->remoterel.relname,
4937  errarg->remote_xid,
4938  LSN_FORMAT_ARGS(errarg->finish_lsn));
4939  }
4940  else
4941  {
4942  if (XLogRecPtrIsInvalid(errarg->finish_lsn))
4943  errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u",
4944  errarg->origin_name,
4946  errarg->rel->remoterel.nspname,
4947  errarg->rel->remoterel.relname,
4948  errarg->rel->remoterel.attnames[errarg->remote_attnum],
4949  errarg->remote_xid);
4950  else
4951  errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%X",
4952  errarg->origin_name,
4954  errarg->rel->remoterel.nspname,
4955  errarg->rel->remoterel.relname,
4956  errarg->rel->remoterel.attnames[errarg->remote_attnum],
4957  errarg->remote_xid,
4958  LSN_FORMAT_ARGS(errarg->finish_lsn));
4959  }
4960  }
4961 }
4962 
4963 /* Set transaction information of apply error callback */
4964 static inline void
4966 {
4969 }
4970 
4971 /* Reset all information of apply error callback */
4972 static inline void
4974 {
4979 }
4980 
4981 /*
4982  * Request wakeup of the workers for the given subscription OID
4983  * at commit of the current transaction.
4984  *
4985  * This is used to ensure that the workers process assorted changes
4986  * as soon as possible.
4987  */
4988 void
4990 {
4991  MemoryContext oldcxt;
4992 
4996  MemoryContextSwitchTo(oldcxt);
4997 }
4998 
4999 /*
5000  * Wake up the workers of any subscriptions that were changed in this xact.
5001  */
5002 void
5004 {
5005  if (isCommit && on_commit_wakeup_workers_subids != NIL)
5006  {
5007  ListCell *lc;
5008 
5009  LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
5010  foreach(lc, on_commit_wakeup_workers_subids)
5011  {
5012  Oid subid = lfirst_oid(lc);
5013  List *workers;
5014  ListCell *lc2;
5015 
5016  workers = logicalrep_workers_find(subid, true);
5017  foreach(lc2, workers)
5018  {
5019  LogicalRepWorker *worker = (LogicalRepWorker *) lfirst(lc2);
5020 
5022  }
5023  }
5024  LWLockRelease(LogicalRepWorkerLock);
5025  }
5026 
5027  /* The List storage will be reclaimed automatically in xact cleanup. */
5029 }
5030 
5031 /*
5032  * Allocate the origin name in a long-lived context for the error context message.
5033  */
5034 void
5036 {
5038  originname);
5039 }
5040 
5041 /*
5042  * Return the action to be taken for the given transaction. See
5043  * TransApplyAction for information on each of the actions.
5044  *
5045  * *winfo is assigned to the destination parallel worker info when the leader
5046  * apply worker has to pass all the transaction's changes to the parallel
5047  * apply worker.
5048  */
5049 static TransApplyAction
5051 {
5052  *winfo = NULL;
5053 
5055  {
5056  return TRANS_PARALLEL_APPLY;
5057  }
5058 
5059  /*
5060  * If we are processing this transaction using a parallel apply worker
5061  * then either we send the changes to the parallel worker or if the worker
5062  * is busy then serialize the changes to the file which will later be
5063  * processed by the parallel worker.
5064  */
5065  *winfo = pa_find_worker(xid);
5066 
5067  if (*winfo && (*winfo)->serialize_changes)
5068  {
5070  }
5071  else if (*winfo)
5072  {
5074  }
5075 
5076  /*
5077  * If there is no parallel worker involved to process this transaction
5078  * then we either directly apply the change or serialize it to a file
5079  * which will later be applied when the transaction finish message is
5080  * processed.
5081  */
5082  else if (in_streamed_transaction)
5083  {
5084  return TRANS_LEADER_SERIALIZE;
5085  }
5086  else
5087  {
5088  return TRANS_LEADER_APPLY;
5089  }
5090 }
AclResult
Definition: acl.h:182
@ ACLCHECK_OK
Definition: acl.h:183
void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)
Definition: aclchk.c:2700
AclResult pg_class_aclcheck(Oid table_oid, Oid roleid, AclMode mode)
Definition: aclchk.c:4091
void pa_set_xact_state(ParallelApplyWorkerShared *wshared, ParallelTransState xact_state)
void pa_unlock_stream(TransactionId xid, LOCKMODE lockmode)
ParallelApplyWorkerInfo * pa_find_worker(TransactionId xid)
void pa_stream_abort(LogicalRepStreamAbortData *abort_data)
void pa_lock_stream(TransactionId xid, LOCKMODE lockmode)
void pa_set_fileset_state(ParallelApplyWorkerShared *wshared, PartialFileSetState fileset_state)
void pa_reset_subtrans(void)
void pa_lock_transaction(TransactionId xid, LOCKMODE lockmode)
ParallelApplyWorkerShared * MyParallelShared
void pa_start_subtrans(TransactionId current_xid, TransactionId top_xid)
void pa_switch_to_partial_serialize(ParallelApplyWorkerInfo *winfo, bool stream_locked)
void pa_xact_finish(ParallelApplyWorkerInfo *winfo, XLogRecPtr remote_lsn)
bool pa_send_data(ParallelApplyWorkerInfo *winfo, Size nbytes, const void *data)
void pa_allocate_worker(TransactionId xid)
void pa_set_stream_apply_worker(ParallelApplyWorkerInfo *winfo)
void pa_unlock_transaction(TransactionId xid, LOCKMODE lockmode)
void pa_decr_and_wait_stream_block(void)
static uint32 pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
Definition: atomics.h:419
static void check_relation_updatable(LogicalRepRelMapEntry *rel)
Definition: worker.c:2479
static void subxact_filename(char *path, Oid subid, TransactionId xid)
Definition: worker.c:4184
static void begin_replication_step(void)
Definition: worker.c:508
static void end_replication_step(void)
Definition: worker.c:531
static void cleanup_subxact_info(void)
Definition: worker.c:4388
void set_stream_options(WalRcvStreamOptions *options, char *slotname, XLogRecPtr *origin_startpos)
Definition: worker.c:4338
static void apply_handle_stream_prepare(StringInfo s)
Definition: worker.c:1271
static void apply_handle_insert_internal(ApplyExecutionData *edata, ResultRelInfo *relinfo, TupleTableSlot *remoteslot)
Definition: worker.c:2457
static void subxact_info_add(TransactionId xid)
Definition: worker.c:4106
static ApplyExecutionData * create_edata_for_relation(LogicalRepRelMapEntry *rel)
Definition: worker.c:652
void stream_cleanup_files(Oid subid, TransactionId xid)
Definition: worker.c:4205
MemoryContext ApplyMessageContext
Definition: worker.c:290
static bool should_apply_changes_for_rel(LogicalRepRelMapEntry *rel)
Definition: worker.c:468
static void apply_handle_type(StringInfo s)
Definition: worker.c:2324
static void apply_handle_truncate(StringInfo s)
Definition: worker.c:3144
static void UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply)
Definition: worker.c:3462
static void TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid, int szgid)
Definition: worker.c:4405
static void subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue)
Definition: worker.c:3991
static TransApplyAction get_transaction_apply_action(TransactionId xid, ParallelApplyWorkerInfo **winfo)
Definition: worker.c:5050
TransApplyAction
Definition: worker.c:266
@ TRANS_LEADER_SERIALIZE
Definition: worker.c:271
@ TRANS_PARALLEL_APPLY
Definition: worker.c:274
@ TRANS_LEADER_SEND_TO_PARALLEL
Definition: worker.c:272
@ TRANS_LEADER_APPLY
Definition: worker.c:268
@ TRANS_LEADER_PARTIAL_SERIALIZE
Definition: worker.c:273
static bool handle_streamed_transaction(LogicalRepMsgType action, StringInfo s)
Definition: worker.c:559
static void stream_open_and_write_change(TransactionId xid, char action, StringInfo s)
Definition: worker.c:4322
struct ApplyExecutionData ApplyExecutionData
static void changes_filename(char *path, Oid subid, TransactionId xid)
Definition: worker.c:4191
bool InitializingApplyWorker
Definition: worker.c:318
static void apply_worker_exit(void)
Definition: worker.c:3831
static BufFile * stream_fd
Definition: worker.c:339
static void apply_handle_update(StringInfo s)
Definition: worker.c:2520
void stream_stop_internal(TransactionId xid)
Definition: worker.c:1603
static void apply_handle_stream_commit(StringInfo s)
Definition: worker.c:2131
void start_apply(XLogRecPtr origin_startpos)
Definition: worker.c:4425
static void stop_skipping_changes(void)
Definition: worker.c:4790
struct ApplySubXactData ApplySubXactData
#define NAPTIME_PER_CYCLE
Definition: worker.c:195
static bool FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel, LogicalRepRelation *remoterel, Oid localidxoid, TupleTableSlot *remoteslot, TupleTableSlot **localslot)
Definition: worker.c:2846
static void get_flush_position(XLogRecPtr *write, XLogRecPtr *flush, bool *have_pending_txes)
Definition: worker.c:3392
static uint32 parallel_stream_nchanges
Definition: worker.c:315
static void apply_handle_commit_prepared(StringInfo s)
Definition: worker.c:1169
static void LogicalRepApplyLoop(XLogRecPtr last_received)
Definition: worker.c:3478
void LogicalRepWorkersWakeupAtCommit(Oid subid)
Definition: worker.c:4989
bool IsLogicalWorker(void)
Definition: worker.c:4744
static ApplySubXactData subxact_data
Definition: worker.c:357
static void apply_handle_tuple_routing(ApplyExecutionData *edata, TupleTableSlot *remoteslot, LogicalRepTupleData *newtup, CmdType operation)
Definition: worker.c:2893
ApplyErrorCallbackArg apply_error_callback_arg
Definition: worker.c:278
bool in_remote_transaction
Definition: worker.c:303
static XLogRecPtr skip_xact_finish_lsn
Definition: worker.c:335
static void stream_open_file(Oid subid, TransactionId xid, bool first_segment)
Definition: worker.c:4229
static void apply_handle_delete(StringInfo s)
Definition: worker.c:2704
void apply_dispatch(StringInfo s)
Definition: worker.c:3272
#define is_skipping_changes()
Definition: worker.c:336
static void stream_write_change(char action, StringInfo s)
Definition: worker.c:4292
static void clear_subscription_skip_lsn(XLogRecPtr finish_lsn)
Definition: worker.c:4812
static void apply_handle_update_internal(ApplyExecutionData *edata, ResultRelInfo *relinfo, TupleTableSlot *remoteslot, LogicalRepTupleData *newtup, Oid localindexoid)
Definition: worker.c:2637
static void ensure_last_message(FileSet *stream_fileset, TransactionId xid, int fileno, off_t offset)
Definition: worker.c:1969
static void apply_handle_begin(StringInfo s)
Definition: worker.c:991
void DisableSubscriptionAndExit(void)
Definition: worker.c:4705
static dlist_head lsn_mapping
Definition: worker.c:204
bool IsLogicalParallelApplyWorker(void)
Definition: worker.c:4753
void AtEOXact_LogicalRepWorkers(bool isCommit)
Definition: worker.c:5003
static void slot_store_data(TupleTableSlot *slot, LogicalRepRelMapEntry *rel, LogicalRepTupleData *tupleData)
Definition: worker.c:797
void ReplicationOriginNameForLogicalRep(Oid suboid, Oid relid, char *originname, Size szoriginname)
Definition: worker.c:428
static void finish_edata(ApplyExecutionData *edata)
Definition: worker.c:709
static void slot_modify_data(TupleTableSlot *slot, TupleTableSlot *srcslot, LogicalRepRelMapEntry *rel, LogicalRepTupleData *tupleData)
Definition: worker.c:898
static void set_apply_error_context_xact(TransactionId xid, XLogRecPtr lsn)
Definition: worker.c:4965
ErrorContextCallback * apply_error_context_stack
Definition: worker.c:288
static void stream_abort_internal(TransactionId xid, TransactionId subxid)
Definition: worker.c:1729
static void apply_handle_commit(StringInfo s)
Definition: worker.c:1016
void stream_start_internal(TransactionId xid, bool first_segment)
Definition: worker.c:1429
static List * on_commit_wakeup_workers_subids
Definition: worker.c:301
static void apply_handle_stream_abort(StringInfo s)
Definition: worker.c:1812
static void apply_handle_relation(StringInfo s)
Definition: worker.c:2301
void set_apply_error_context_origin(char *originname)
Definition: worker.c:5035
struct ApplyErrorCallbackArg ApplyErrorCallbackArg
MemoryContext ApplyContext
Definition: worker.c:291
static void subxact_info_write(Oid subid, TransactionId xid)
Definition: worker.c:4006
static void TargetPrivilegesCheck(Relation rel, AclMode mode)
Definition: worker.c:2339
static void apply_handle_prepare(StringInfo s)
Definition: worker.c:1108
static void apply_handle_rollback_prepared(StringInfo s)
Definition: worker.c:1218
static void run_apply_worker()
Definition: worker.c:4457
void SetupApplyOrSyncWorker(int worker_slot)
Definition: worker.c:4644
static void apply_handle_stream_stop(StringInfo s)
Definition: worker.c:1626
static void apply_handle_origin(StringInfo s)
Definition: worker.c:1408
static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply)
Definition: worker.c:3742
WalReceiverConn * LogRepWorkerWalRcvConn
Definition: worker.c:296
static XLogRecPtr remote_final_lsn
Definition: worker.c:304
static bool MySubscriptionValid
Definition: worker.c:299
void apply_error_callback(void *arg)
Definition: worker.c:4893
void store_flush_position(XLogRecPtr remote_lsn, XLogRecPtr local_lsn)
Definition: worker.c:3436
static MemoryContext LogicalStreamingContext
Definition: worker.c:294
void maybe_reread_subscription(void)
Definition: worker.c:3862
static void apply_handle_commit_internal(LogicalRepCommitData *commit_data)
Definition: worker.c:2241
void InitializeLogRepWorker(void)
Definition: worker.c:4561
static bool in_streamed_transaction
Definition: worker.c:307
struct SubXactInfo SubXactInfo
static void apply_handle_begin_prepare(StringInfo s)
Definition: worker.c:1042
struct FlushPosition FlushPosition
void ApplyWorkerMain(Datum main_arg)
Definition: worker.c:4685
void apply_spooled_messages(FileSet *stream_fileset, TransactionId xid, XLogRecPtr lsn)
Definition: worker.c:2001
static void apply_handle_stream_start(StringInfo s)
Definition: worker.c:1467
static void maybe_start_skipping_changes(XLogRecPtr finish_lsn)
Definition: worker.c:4763
Subscription * MySubscription
Definition: worker.c:298
static void apply_handle_prepare_internal(LogicalRepPreparedTxnData *prepare_data)
Definition: worker.c:1071
static void stream_close_file(void)
Definition: worker.c:4274
static TransactionId stream_xid
Definition: worker.c:309
static void apply_handle_insert(StringInfo s)
Definition: worker.c:2371
static void slot_fill_defaults(LogicalRepRelMapEntry *rel, EState *estate, TupleTableSlot *slot)
Definition: worker.c:740
static void subxact_info_read(Oid subid, TransactionId xid)
Definition: worker.c:4055
static void apply_handle_delete_internal(ApplyExecutionData *edata, ResultRelInfo *relinfo, TupleTableSlot *remoteslot, Oid localindexoid)
Definition: worker.c:2792
static void reset_apply_error_context_info(void)
Definition: worker.c:4973
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1790
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1654
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1618
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_IDLE
@ STATE_IDLEINTRANSACTION
@ STATE_RUNNING
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition: bitmapset.c:815
static Datum values[MAXATTR]
Definition: bootstrap.c:152
void BufFileReadExact(BufFile *file, void *ptr, size_t size)
Definition: buffile.c:654
BufFile * BufFileOpenFileSet(FileSet *fileset, const char *name, int mode, bool missing_ok)
Definition: buffile.c:291
void BufFileTell(BufFile *file, int *fileno, off_t *offset)
Definition: buffile.c:833
void BufFileWrite(BufFile *file, const void *ptr, size_t size)
Definition: buffile.c:676
size_t BufFileReadMaybeEOF(BufFile *file, void *ptr, size_t size, bool eofOK)
Definition: buffile.c:664
void BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset)
Definition: buffile.c:933
int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
Definition: buffile.c:740
BufFile * BufFileCreateFileSet(FileSet *fileset, const char *name)
Definition: buffile.c:267
void BufFileClose(BufFile *file)
Definition: buffile.c:412
void BufFileDeleteFileSet(FileSet *fileset, const char *name, bool missing_ok)
Definition: buffile.c:364
unsigned int uint32
Definition: c.h:506
#define likely(x)
Definition: c.h:310
#define Assert(condition)
Definition: c.h:858
uint32 TransactionId
Definition: c.h:652
#define OidIsValid(objectId)
Definition: c.h:775
size_t Size
Definition: c.h:605
int64 TimestampTz
Definition: timestamp.h:39
void load_file(const char *filename, bool restricted)
Definition: dfmgr.c:144
int my_log2(long num)
Definition: dynahash.c:1751
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
void EmitErrorReport(void)
Definition: elog.c:1670
int errdetail(const char *fmt,...)
Definition: elog.c:1203
ErrorContextCallback * error_context_stack
Definition: elog.c:94
void FlushErrorState(void)
Definition: elog.c:1834
int errcode(int sqlerrcode)
Definition: elog.c:857
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define PG_RE_THROW()
Definition: elog.h:411
#define errcontext
Definition: elog.h:196
#define PG_TRY(...)
Definition: elog.h:370
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PG_END_TRY(...)
Definition: elog.h:395
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define PG_CATCH(...)
Definition: elog.h:380
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
bool equal(const void *a, const void *b)
Definition: equalfuncs.c:223
void err(int eval, const char *fmt,...)
Definition: err.c:43
ExprState * ExecInitExpr(Expr *node, PlanState *parent)
Definition: execExpr.c:134
void ExecCloseIndices(ResultRelInfo *resultRelInfo)
Definition: execIndexing.c:231
void ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative)
Definition: execIndexing.c:156
bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate, bool emitError)
Definition: execMain.c:1792
void EvalPlanQualInit(EPQState *epqstate, EState *parentestate, Plan *subplan, List *auxrowmarks, int epqParam, List *resultRelations)
Definition: execMain.c:2539