PostgreSQL Source Code  git master
multixact.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * multixact.c
4  * PostgreSQL multi-transaction-log manager
5  *
6  * The pg_multixact manager is a pg_xact-like manager that stores an array of
7  * MultiXactMember for each MultiXactId. It is a fundamental part of the
8  * shared-row-lock implementation. Each MultiXactMember is comprised of a
9  * TransactionId and a set of flag bits. The name is a bit historical:
10  * originally, a MultiXactId consisted of more than one TransactionId (except
11  * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12  * legitimate to have MultiXactIds that only include a single Xid.
13  *
14  * The meaning of the flag bits is opaque to this module, but they are mostly
15  * used in heapam.c to identify lock modes that each of the member transactions
16  * is holding on any given tuple. This module just contains support to store
17  * and retrieve the arrays.
18  *
19  * We use two SLRU areas, one for storing the offsets at which the data
20  * starts for each MultiXactId in the other one. This trick allows us to
21  * store variable length arrays of TransactionIds. (We could alternatively
22  * use one area containing counts and TransactionIds, with valid MultiXactId
23  * values pointing at slots containing counts; but that way seems less robust
24  * since it would get completely confused if someone inquired about a bogus
25  * MultiXactId that pointed to an intermediate slot containing an XID.)
26  *
27  * XLOG interactions: this module generates a record whenever a new OFFSETs or
28  * MEMBERs page is initialized to zeroes, as well as an
29  * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30  * This module ignores the WAL rule "write xlog before data," because it
31  * suffices that actions recording a MultiXactId in a heap xmax do follow that
32  * rule. The only way for the MXID to be referenced from any data page is for
33  * heap_lock_tuple() or heap_update() to have put it there, and each generates
34  * an XLOG record that must follow ours. The normal LSN interlock between the
35  * data page and that XLOG record will ensure that our XLOG record reaches
36  * disk first. If the SLRU members/offsets data reaches disk sooner than the
37  * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38  * the flip side, to ensure that all referenced entries _do_ reach disk, this
39  * module's XLOG records completely rebuild the data entered since the last
40  * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41  * before each checkpoint is considered complete.
42  *
43  * Like clog.c, and unlike subtrans.c, we have to preserve state across
44  * crashes and ensure that MXID and offset numbering increases monotonically
45  * across a crash. We do this in the same way as it's done for transaction
46  * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47  * could need to worry about, and we just make sure that at the end of
48  * replay, the next-MXID and next-offset counters are at least as large as
49  * anything we saw during replay.
50  *
51  * We are able to remove segments no longer necessary by carefully tracking
52  * each table's used values: during vacuum, any multixact older than a certain
53  * value is removed; the cutoff value is stored in pg_class. The minimum value
54  * across all tables in each database is stored in pg_database, and the global
55  * minimum across all databases is part of pg_control and is kept in shared
56  * memory. Whenever that minimum is advanced, the SLRUs are truncated.
57  *
58  * When new multixactid values are to be created, care is taken that the
59  * counter does not fall within the wraparound horizon considering the global
60  * minimum value.
61  *
62  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
63  * Portions Copyright (c) 1994, Regents of the University of California
64  *
65  * src/backend/access/transam/multixact.c
66  *
67  *-------------------------------------------------------------------------
68  */
69 #include "postgres.h"
70 
71 #include "access/multixact.h"
72 #include "access/slru.h"
73 #include "access/transam.h"
74 #include "access/twophase.h"
75 #include "access/twophase_rmgr.h"
76 #include "access/xact.h"
77 #include "access/xlog.h"
78 #include "access/xloginsert.h"
79 #include "access/xlogutils.h"
80 #include "commands/dbcommands.h"
81 #include "funcapi.h"
82 #include "lib/ilist.h"
83 #include "miscadmin.h"
84 #include "pg_trace.h"
85 #include "pgstat.h"
86 #include "postmaster/autovacuum.h"
87 #include "storage/pmsignal.h"
88 #include "storage/proc.h"
89 #include "storage/procarray.h"
90 #include "utils/fmgrprotos.h"
91 #include "utils/guc_hooks.h"
92 #include "utils/injection_point.h"
93 #include "utils/memutils.h"
94 
95 
96 /*
97  * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
98  * used everywhere else in Postgres.
99  *
100  * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
101  * MultiXact page numbering also wraps around at
102  * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
103  * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
104  * take no explicit notice of that fact in this module, except when comparing
105  * segment and page numbers in TruncateMultiXact (see
106  * MultiXactOffsetPagePrecedes).
107  */
108 
109 /* We need four bytes per offset */
110 #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
111 
112 static inline int64
114 {
115  return multi / MULTIXACT_OFFSETS_PER_PAGE;
116 }
117 
118 static inline int
120 {
121  return multi % MULTIXACT_OFFSETS_PER_PAGE;
122 }
123 
124 static inline int64
126 {
128 }
129 
130 /*
131  * The situation for members is a bit more complex: we store one byte of
132  * additional flag bits for each TransactionId. To do this without getting
133  * into alignment issues, we store four bytes of flags, and then the
134  * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
135  * groups are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
136  * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
137  * performance) trumps space efficiency here.
138  *
139  * Note that the "offset" macros work with byte offset, not array indexes, so
140  * arithmetic must be done using "char *" pointers.
141  */
142 /* We need eight bits per xact, so one xact fits in a byte */
143 #define MXACT_MEMBER_BITS_PER_XACT 8
144 #define MXACT_MEMBER_FLAGS_PER_BYTE 1
145 #define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
146 
147 /* how many full bytes of flags are there in a group? */
148 #define MULTIXACT_FLAGBYTES_PER_GROUP 4
149 #define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
150  (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
151 /* size in bytes of a complete group */
152 #define MULTIXACT_MEMBERGROUP_SIZE \
153  (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
154 #define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
155 #define MULTIXACT_MEMBERS_PER_PAGE \
156  (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
157 
158 /*
159  * Because the number of items per page is not a divisor of the last item
160  * number (member 0xFFFFFFFF), the last segment does not use the maximum number
161  * of pages, and moreover the last used page therein does not use the same
162  * number of items as previous pages. (Another way to say it is that the
163  * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
164  * has some empty space after that item.)
165  *
166  * This constant is the number of members in the last page of the last segment.
167  */
168 #define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
169  ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
170 
171 /* page in which a member is to be found */
172 static inline int64
174 {
175  return offset / MULTIXACT_MEMBERS_PER_PAGE;
176 }
177 
178 static inline int64
180 {
182 }
183 
184 /* Location (byte offset within page) of flag word for a given member */
185 static inline int
187 {
189  int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
190  int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
191 
192  return byteoff;
193 }
194 
195 static inline int
197 {
198  int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
199  int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
200 
201  return bshift;
202 }
203 
204 /* Location (byte offset within page) of TransactionId of given member */
205 static inline int
207 {
208  int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
209 
210  return MXOffsetToFlagsOffset(offset) +
212  member_in_group * sizeof(TransactionId);
213 }
214 
215 /* Multixact members wraparound thresholds. */
216 #define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
217 #define MULTIXACT_MEMBER_DANGER_THRESHOLD \
218  (MaxMultiXactOffset - MaxMultiXactOffset / 4)
219 
220 static inline MultiXactId
222 {
223  return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
224 }
225 
226 /*
227  * Links to shared-memory data structures for MultiXact control
228  */
231 
232 #define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
233 #define MultiXactMemberCtl (&MultiXactMemberCtlData)
234 
235 /*
236  * MultiXact state shared across all backends. All this state is protected
237  * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
238  * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
239  * concurrency's sake, we avoid holding more than one of these locks at a
240  * time.)
241  */
242 typedef struct MultiXactStateData
243 {
244  /* next-to-be-assigned MultiXactId */
246 
247  /* next-to-be-assigned offset */
249 
250  /* Have we completed multixact startup? */
252 
253  /*
254  * Oldest multixact that is still potentially referenced by a relation.
255  * Anything older than this should not be consulted. These values are
256  * updated by vacuum.
257  */
260 
261  /*
262  * Oldest multixact offset that is potentially referenced by a multixact
263  * referenced by a relation. We don't always know this value, so there's
264  * a flag here to indicate whether or not we currently do.
265  */
268 
269  /* support for anti-wraparound measures */
274 
275  /* support for members anti-wraparound measures */
276  MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
277 
278  /*
279  * This is used to sleep until a multixact offset is written when we want
280  * to create the next one.
281  */
283 
284  /*
285  * Per-backend data starts here. We have two arrays stored in the area
286  * immediately following the MultiXactStateData struct. Each is indexed by
287  * ProcNumber.
288  *
289  * In both arrays, there's a slot for all normal backends
290  * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared
291  * transactions.
292  *
293  * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
294  * transaction(s) could possibly be a member of, or InvalidMultiXactId
295  * when the backend has no live transaction that could possibly be a
296  * member of a MultiXact. Each backend sets its entry to the current
297  * nextMXact counter just before first acquiring a shared lock in a given
298  * transaction, and clears it at transaction end. (This works because only
299  * during or after acquiring a shared lock could an XID possibly become a
300  * member of a MultiXact, and that MultiXact would have to be created
301  * during or after the lock acquisition.)
302  *
303  * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
304  * current transaction(s) think is potentially live, or InvalidMultiXactId
305  * when not in a transaction or not in a transaction that's paid any
306  * attention to MultiXacts yet. This is computed when first needed in a
307  * given transaction, and cleared at transaction end. We can compute it
308  * as the minimum of the valid OldestMemberMXactId[] entries at the time
309  * we compute it (using nextMXact if none are valid). Each backend is
310  * required not to attempt to access any SLRU data for MultiXactIds older
311  * than its own OldestVisibleMXactId[] setting; this is necessary because
312  * the relevant SLRU data can be concurrently truncated away.
313  *
314  * The oldest valid value among all of the OldestMemberMXactId[] and
315  * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
316  * possible value still having any live member transaction -- OldestMxact.
317  * Any value older than that is typically removed from tuple headers, or
318  * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
319  * remove an individual MultiXact xmax whose value is >= its OldestMxact
320  * cutoff, though typically only when no individual member XID is still
321  * running. See FreezeMultiXactId for full details.
322  *
323  * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
324  * or the oldest extant Multi remaining in the table is used as the new
325  * pg_class.relminmxid value (whichever is earlier). The minimum of all
326  * relminmxid values in each database is stored in pg_database.datminmxid.
327  * In turn, the minimum of all of those values is stored in pg_control.
328  * This is used as the truncation point for pg_multixact when unneeded
329  * segments get removed by vac_truncate_clog() during vacuuming.
330  */
333 
334 /*
335  * Size of OldestMemberMXactId and OldestVisibleMXactId arrays.
336  */
337 #define MaxOldestSlot (MaxBackends + max_prepared_xacts)
338 
339 /* Pointers to the state data in shared memory */
343 
344 
345 /*
346  * Definitions for the backend-local MultiXactId cache.
347  *
348  * We use this cache to store known MultiXacts, so we don't need to go to
349  * SLRU areas every time.
350  *
351  * The cache lasts for the duration of a single transaction, the rationale
352  * for this being that most entries will contain our own TransactionId and
353  * so they will be uninteresting by the time our next transaction starts.
354  * (XXX not clear that this is correct --- other members of the MultiXact
355  * could hang around longer than we did. However, it's not clear what a
356  * better policy for flushing old cache entries would be.) FIXME actually
357  * this is plain wrong now that multixacts may contain update Xids.
358  *
359  * We allocate the cache entries in a memory context that is deleted at
360  * transaction end, so we don't need to do retail freeing of entries.
361  */
362 typedef struct mXactCacheEnt
363 {
365  int nmembers;
369 
370 #define MAX_CACHE_ENTRIES 256
373 
/*
 * Debug tracing helpers. debug_elogN takes N arguments in total and hands
 * them unchanged to elog(). Unless MULTIXACT_DEBUG is defined at compile
 * time, each of these macros expands to nothing, so the call disappears and
 * its arguments are never evaluated.
 */
374 #ifdef MULTIXACT_DEBUG
375 #define debug_elog2(a,b) elog(a,b)
376 #define debug_elog3(a,b,c) elog(a,b,c)
377 #define debug_elog4(a,b,c,d) elog(a,b,c,d)
378 #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
379 #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
380 #else
381 #define debug_elog2(a,b)
382 #define debug_elog3(a,b,c)
383 #define debug_elog4(a,b,c,d)
384 #define debug_elog5(a,b,c,d,e)
385 #define debug_elog6(a,b,c,d,e,f)
386 #endif
387 
388 /* internal MultiXactId management */
389 static void MultiXactIdSetOldestVisible(void);
390 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
391  int nmembers, MultiXactMember *members);
392 static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
393 
394 /* MultiXact cache management */
395 static int mxactMemberComparator(const void *arg1, const void *arg2);
396 static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
397 static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
398 static void mXactCachePut(MultiXactId multi, int nmembers,
399  MultiXactMember *members);
400 
401 static char *mxstatus_to_string(MultiXactStatus status);
402 
403 /* management of SLRU infrastructure */
404 static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog);
405 static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog);
406 static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
407 static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
408 static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
409  MultiXactOffset offset2);
410 static void ExtendMultiXactOffset(MultiXactId multi);
411 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
412 static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
413  MultiXactOffset start, uint32 distance);
414 static bool SetOffsetVacuumLimit(bool is_startup);
415 static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
416 static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
417 static void WriteMTruncateXlogRec(Oid oldestMultiDB,
418  MultiXactId startTruncOff,
419  MultiXactId endTruncOff,
420  MultiXactOffset startTruncMemb,
421  MultiXactOffset endTruncMemb);
422 
423 
424 /*
425  * MultiXactIdCreate
426  * Construct a MultiXactId representing two TransactionIds.
427  *
428  * The two XIDs must be different, or be requesting different statuses.
429  *
430  * NB - we don't worry about our local MultiXactId cache here, because that
431  * is handled by the lower-level routines.
432  */
435  TransactionId xid2, MultiXactStatus status2)
436 {
437  MultiXactId newMulti;
438  MultiXactMember members[2];
439 
442 
443  Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
444 
445  /* MultiXactIdSetOldestMember() must have been called already. */
447 
448  /*
449  * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
450  * are still running. In typical usage, xid2 will be our own XID and the
451  * caller just did a check on xid1, so it'd be wasted effort.
452  */
453 
454  members[0].xid = xid1;
455  members[0].status = status1;
456  members[1].xid = xid2;
457  members[1].status = status2;
458 
459  newMulti = MultiXactIdCreateFromMembers(2, members);
460 
461  debug_elog3(DEBUG2, "Create: %s",
462  mxid_to_string(newMulti, 2, members));
463 
464  return newMulti;
465 }
466 
467 /*
468  * MultiXactIdExpand
469  * Add a TransactionId to a pre-existing MultiXactId.
470  *
471  * If the TransactionId is already a member of the passed MultiXactId with the
472  * same status, just return it as-is.
473  *
474  * Note that we do NOT actually modify the membership of a pre-existing
475  * MultiXactId; instead we create a new one. This is necessary to avoid
476  * a race condition against code trying to wait for one MultiXactId to finish;
477  * see notes in heapam.c.
478  *
479  * NB - we don't worry about our local MultiXactId cache here, because that
480  * is handled by the lower-level routines.
481  *
482  * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
483  * one upgraded by pg_upgrade from a cluster older than this feature) are not
484  * passed in.
485  */
488 {
489  MultiXactId newMulti;
490  MultiXactMember *members;
491  MultiXactMember *newMembers;
492  int nmembers;
493  int i;
494  int j;
495 
496  Assert(MultiXactIdIsValid(multi));
498 
499  /* MultiXactIdSetOldestMember() must have been called already. */
501 
502  debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
503  multi, xid, mxstatus_to_string(status));
504 
505  /*
506  * Note: we don't allow for old multis here. The reason is that the only
507  * caller of this function does a check that the multixact is no longer
508  * running.
509  */
510  nmembers = GetMultiXactIdMembers(multi, &members, false, false);
511 
512  if (nmembers < 0)
513  {
514  MultiXactMember member;
515 
516  /*
517  * The MultiXactId is obsolete. This can only happen if all the
518  * MultiXactId members stop running between the caller checking and
519  * passing it to us. It would be better to return that fact to the
520  * caller, but it would complicate the API and it's unlikely to happen
521  * too often, so just deal with it by creating a singleton MultiXact.
522  */
523  member.xid = xid;
524  member.status = status;
525  newMulti = MultiXactIdCreateFromMembers(1, &member);
526 
527  debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
528  multi, newMulti);
529  return newMulti;
530  }
531 
532  /*
533  * If the TransactionId is already a member of the MultiXactId with the
534  * same status, just return the existing MultiXactId.
535  */
536  for (i = 0; i < nmembers; i++)
537  {
538  if (TransactionIdEquals(members[i].xid, xid) &&
539  (members[i].status == status))
540  {
541  debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
542  xid, multi);
543  pfree(members);
544  return multi;
545  }
546  }
547 
548  /*
549  * Determine which of the members of the MultiXactId are still of
550  * interest. This is any running transaction, and also any transaction
551  * that grabbed something stronger than just a lock and was committed. (An
552  * update that aborted is of no interest here; and having more than one
553  * update Xid in a multixact would cause errors elsewhere.)
554  *
555  * Removing dead members is not just an optimization: freezing of tuples
556  * whose Xmax are multis depends on this behavior.
557  *
558  * Note we have the same race condition here as above: j could be 0 at the
559  * end of the loop.
560  */
561  newMembers = (MultiXactMember *)
562  palloc(sizeof(MultiXactMember) * (nmembers + 1));
563 
564  for (i = 0, j = 0; i < nmembers; i++)
565  {
566  if (TransactionIdIsInProgress(members[i].xid) ||
567  (ISUPDATE_from_mxstatus(members[i].status) &&
568  TransactionIdDidCommit(members[i].xid)))
569  {
570  newMembers[j].xid = members[i].xid;
571  newMembers[j++].status = members[i].status;
572  }
573  }
574 
575  newMembers[j].xid = xid;
576  newMembers[j++].status = status;
577  newMulti = MultiXactIdCreateFromMembers(j, newMembers);
578 
579  pfree(members);
580  pfree(newMembers);
581 
582  debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
583 
584  return newMulti;
585 }
586 
587 /*
588  * MultiXactIdIsRunning
589  * Returns whether a MultiXactId is "running".
590  *
591  * We return true if at least one member of the given MultiXactId is still
592  * running. Note that a "false" result is certain not to change,
593  * because it is not legal to add members to an existing MultiXactId.
594  *
595  * Caller is expected to have verified that the multixact does not come from
596  * a pg_upgraded share-locked tuple.
597  */
598 bool
599 MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
600 {
601  MultiXactMember *members;
602  int nmembers;
603  int i;
604 
605  debug_elog3(DEBUG2, "IsRunning %u?", multi);
606 
607  /*
608  * "false" here means we assume our callers have checked that the given
609  * multi cannot possibly come from a pg_upgraded database.
610  */
611  nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
612 
613  if (nmembers <= 0)
614  {
615  debug_elog2(DEBUG2, "IsRunning: no members");
616  return false;
617  }
618 
619  /*
620  * Checking for myself is cheap compared to looking in shared memory;
621  * return true if any live subtransaction of the current top-level
622  * transaction is a member.
623  *
624  * This is not needed for correctness, it's just a fast path.
625  */
626  for (i = 0; i < nmembers; i++)
627  {
628  if (TransactionIdIsCurrentTransactionId(members[i].xid))
629  {
630  debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
631  pfree(members);
632  return true;
633  }
634  }
635 
636  /*
637  * This could be made faster by having another entry point in procarray.c,
638  * walking the PGPROC array only once for all the members. But in most
639  * cases nmembers should be small enough that it doesn't much matter.
640  */
641  for (i = 0; i < nmembers; i++)
642  {
643  if (TransactionIdIsInProgress(members[i].xid))
644  {
645  debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
646  i, members[i].xid);
647  pfree(members);
648  return true;
649  }
650  }
651 
652  pfree(members);
653 
654  debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
655 
656  return false;
657 }
658 
659 /*
660  * MultiXactIdSetOldestMember
661  * Save the oldest MultiXactId this transaction could be a member of.
662  *
663  * We set the OldestMemberMXactId for a given transaction the first time it's
664  * going to do some operation that might require a MultiXactId (tuple lock,
665  * update or delete). We need to do this even if we end up using a
666  * TransactionId instead of a MultiXactId, because there is a chance that
667  * another transaction would add our XID to a MultiXactId.
668  *
669  * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
670  * be called just before doing any such possibly-MultiXactId-able operation.
671  */
672 void
674 {
676  {
677  MultiXactId nextMXact;
678 
679  /*
680  * You might think we don't need to acquire a lock here, since
681  * fetching and storing of TransactionIds is probably atomic, but in
682  * fact we do: suppose we pick up nextMXact and then lose the CPU for
683  * a long time. Someone else could advance nextMXact, and then
684  * another someone else could compute an OldestVisibleMXactId that
685  * would be after the value we are going to store when we get control
686  * back. Which would be wrong.
687  *
688  * Note that a shared lock is sufficient, because it's enough to stop
689  * someone from advancing nextMXact; and nobody else could be trying
690  * to write to our OldestMember entry, only reading (and we assume
691  * storing it is atomic.)
692  */
693  LWLockAcquire(MultiXactGenLock, LW_SHARED);
694 
695  /*
696  * We have to beware of the possibility that nextMXact is in the
697  * wrapped-around state. We don't fix the counter itself here, but we
698  * must be sure to store a valid value in our array entry.
699  */
700  nextMXact = MultiXactState->nextMXact;
701  if (nextMXact < FirstMultiXactId)
702  nextMXact = FirstMultiXactId;
703 
704  OldestMemberMXactId[MyProcNumber] = nextMXact;
705 
706  LWLockRelease(MultiXactGenLock);
707 
708  debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
709  MyProcNumber, nextMXact);
710  }
711 }
712 
713 /*
714  * MultiXactIdSetOldestVisible
715  * Save the oldest MultiXactId this transaction considers possibly live.
716  *
717  * We set the OldestVisibleMXactId for a given transaction the first time
718  * it's going to inspect any MultiXactId. Once we have set this, we are
719  * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
720  * won't be truncated away.
721  *
722  * The value to set is the oldest of nextMXact and all the valid per-backend
723  * OldestMemberMXactId[] entries. Because of the locking we do, we can be
724  * certain that no subsequent call to MultiXactIdSetOldestMember can set
725  * an OldestMemberMXactId[] entry older than what we compute here. Therefore
726  * there is no live transaction, now or later, that can be a member of any
727  * MultiXactId older than the OldestVisibleMXactId we compute here.
728  */
729 static void
731 {
733  {
734  MultiXactId oldestMXact;
735  int i;
736 
737  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
738 
739  /*
740  * We have to beware of the possibility that nextMXact is in the
741  * wrapped-around state. We don't fix the counter itself here, but we
742  * must be sure to store a valid value in our array entry.
743  */
744  oldestMXact = MultiXactState->nextMXact;
745  if (oldestMXact < FirstMultiXactId)
746  oldestMXact = FirstMultiXactId;
747 
748  for (i = 0; i < MaxOldestSlot; i++)
749  {
750  MultiXactId thisoldest = OldestMemberMXactId[i];
751 
752  if (MultiXactIdIsValid(thisoldest) &&
753  MultiXactIdPrecedes(thisoldest, oldestMXact))
754  oldestMXact = thisoldest;
755  }
756 
757  OldestVisibleMXactId[MyProcNumber] = oldestMXact;
758 
759  LWLockRelease(MultiXactGenLock);
760 
761  debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
762  MyProcNumber, oldestMXact);
763  }
764 }
765 
766 /*
767  * ReadNextMultiXactId
768  * Return the next MultiXactId to be assigned, but don't allocate it
769  */
772 {
773  MultiXactId mxid;
774 
775  /* XXX we could presumably do this without a lock. */
776  LWLockAcquire(MultiXactGenLock, LW_SHARED);
777  mxid = MultiXactState->nextMXact;
778  LWLockRelease(MultiXactGenLock);
779 
780  if (mxid < FirstMultiXactId)
781  mxid = FirstMultiXactId;
782 
783  return mxid;
784 }
785 
786 /*
787  * ReadMultiXactIdRange
788  * Get the range of IDs that may still be referenced by a relation.
789  */
790 void
792 {
793  LWLockAcquire(MultiXactGenLock, LW_SHARED);
796  LWLockRelease(MultiXactGenLock);
797 
798  if (*oldest < FirstMultiXactId)
799  *oldest = FirstMultiXactId;
800  if (*next < FirstMultiXactId)
802 }
803 
804 
805 /*
806  * MultiXactIdCreateFromMembers
807  * Make a new MultiXactId from the specified set of members
808  *
809  * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
810  * given TransactionIds as members. Returns the newly created MultiXactId.
811  *
812  * NB: the passed members[] array will be sorted in-place.
813  */
816 {
817  MultiXactId multi;
818  MultiXactOffset offset;
819  xl_multixact_create xlrec;
820 
821  debug_elog3(DEBUG2, "Create: %s",
822  mxid_to_string(InvalidMultiXactId, nmembers, members));
823 
824  /*
825  * See if the same set of members already exists in our cache; if so, just
826  * re-use that MultiXactId. (Note: it might seem that looking in our
827  * cache is insufficient, and we ought to search disk to see if a
828  * duplicate definition already exists. But since we only ever create
829  * MultiXacts containing our own XID, in most cases any such MultiXacts
830  * were in fact created by us, and so will be in our cache. There are
831  * corner cases where someone else added us to a MultiXact without our
832  * knowledge, but it's not worth checking for.)
833  */
834  multi = mXactCacheGetBySet(nmembers, members);
835  if (MultiXactIdIsValid(multi))
836  {
837  debug_elog2(DEBUG2, "Create: in cache!");
838  return multi;
839  }
840 
841  /* Verify that there is at most one update Xid among the given members. */
842  {
843  int i;
844  bool has_update = false;
845 
846  for (i = 0; i < nmembers; i++)
847  {
848  if (ISUPDATE_from_mxstatus(members[i].status))
849  {
850  if (has_update)
851  elog(ERROR, "new multixact has more than one updating member: %s",
852  mxid_to_string(InvalidMultiXactId, nmembers, members));
853  has_update = true;
854  }
855  }
856  }
857 
858  /* Load the injection point before entering the critical section */
859  INJECTION_POINT_LOAD("multixact-create-from-members");
860 
861  /*
862  * Assign the MXID and offsets range to use, and make sure there is space
863  * in the OFFSETs and MEMBERs files. NB: this routine does
864  * START_CRIT_SECTION().
865  *
866  * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
867  * that we've called MultiXactIdSetOldestMember here. This is because
868  * this routine is used in some places to create new MultiXactIds of which
869  * the current backend is not a member, notably during freezing of multis
870  * in vacuum. During vacuum, in particular, it would be unacceptable to
871  * keep OldestMulti set, in case it runs for long.
872  */
873  multi = GetNewMultiXactId(nmembers, &offset);
874 
875  INJECTION_POINT_CACHED("multixact-create-from-members");
876 
877  /* Make an XLOG entry describing the new MXID. */
878  xlrec.mid = multi;
879  xlrec.moff = offset;
880  xlrec.nmembers = nmembers;
881 
882  /*
883  * XXX Note: there's a lot of padding space in MultiXactMember. We could
884  * find a more compact representation of this Xlog record -- perhaps all
885  * the status flags in one XLogRecData, then all the xids in another one?
886  * Not clear that it's worth the trouble though.
887  */
888  XLogBeginInsert();
889  XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate);
890  XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember));
891 
892  (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
893 
894  /* Now enter the information into the OFFSETs and MEMBERs logs */
895  RecordNewMultiXact(multi, offset, nmembers, members);
896 
897  /* Done with critical section */
899 
900  /* Store the new MultiXactId in the local cache, too */
901  mXactCachePut(multi, nmembers, members);
902 
903  debug_elog2(DEBUG2, "Create: all done");
904 
905  return multi;
906 }
907 
908 /*
909  * RecordNewMultiXact
910  * Write info about a new multixact into the offsets and members files
911  *
912  * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
913  * use it.
914  */
915 static void
917  int nmembers, MultiXactMember *members)
918 {
919  int64 pageno;
920  int64 prev_pageno;
921  int entryno;
922  int slotno;
923  MultiXactOffset *offptr;
924  int i;
925  LWLock *lock;
926  LWLock *prevlock = NULL;
927 
928  pageno = MultiXactIdToOffsetPage(multi);
929  entryno = MultiXactIdToOffsetEntry(multi);
930 
933 
934  /*
935  * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
936  * to complain about if there's any I/O error. This is kinda bogus, but
937  * since the errors will always give the full pathname, it should be clear
938  * enough that a MultiXactId is really involved. Perhaps someday we'll
939  * take the trouble to generalize the slru.c error reporting code.
940  */
941  slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
942  offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
943  offptr += entryno;
944 
945  *offptr = offset;
946 
947  MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
948 
949  /* Release MultiXactOffset SLRU lock. */
950  LWLockRelease(lock);
951 
952  /*
953  * If anybody was waiting to know the offset of this multixact ID we just
954  * wrote, they can read it now, so wake them up.
955  */
957 
958  prev_pageno = -1;
959 
960  for (i = 0; i < nmembers; i++, offset++)
961  {
962  TransactionId *memberptr;
963  uint32 *flagsptr;
964  uint32 flagsval;
965  int bshift;
966  int flagsoff;
967  int memberoff;
968 
969  Assert(members[i].status <= MultiXactStatusUpdate);
970 
971  pageno = MXOffsetToMemberPage(offset);
972  memberoff = MXOffsetToMemberOffset(offset);
973  flagsoff = MXOffsetToFlagsOffset(offset);
974  bshift = MXOffsetToFlagsBitShift(offset);
975 
976  if (pageno != prev_pageno)
977  {
978  /*
979  * The MultiXactMember SLRU page has changed, so check whether the
980  * new page falls into a different SLRU bank; if it does, release the
981  * old bank's lock and acquire the lock on the new bank.
982  */
984  if (lock != prevlock)
985  {
986  if (prevlock != NULL)
987  LWLockRelease(prevlock);
988 
990  prevlock = lock;
991  }
992  slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
993  prev_pageno = pageno;
994  }
995 
996  memberptr = (TransactionId *)
997  (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
998 
999  *memberptr = members[i].xid;
1000 
1001  flagsptr = (uint32 *)
1002  (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1003 
1004  flagsval = *flagsptr;
1005  flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
1006  flagsval |= (members[i].status << bshift);
1007  *flagsptr = flagsval;
1008 
1009  MultiXactMemberCtl->shared->page_dirty[slotno] = true;
1010  }
1011 
1012  if (prevlock != NULL)
1013  LWLockRelease(prevlock);
1014 }
1015 
1016 /*
1017  * GetNewMultiXactId
1018  * Get the next MultiXactId.
1019  *
1020  * Also, reserve the needed amount of space in the "members" area. The
1021  * starting offset of the reserved space is returned in *offset.
1022  *
1023  * This may generate XLOG records for expansion of the offsets and/or members
1024  * files. Unfortunately, we have to do that while holding MultiXactGenLock
1025  * to avoid race conditions --- the XLOG record for zeroing a page must appear
1026  * before any backend can possibly try to store data in that page!
1027  *
1028  * We start a critical section before advancing the shared counters. The
1029  * caller must end the critical section after writing SLRU data.
1030  */
1031 static MultiXactId
1032 GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
1033 {
1034  MultiXactId result;
1035  MultiXactOffset nextOffset;
1036 
1037  debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
1038 
1039  /* safety check, we should never get this far in a HS standby */
1040  if (RecoveryInProgress())
1041  elog(ERROR, "cannot assign MultiXactIds during recovery");
1042 
1043  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1044 
1045  /* Handle wraparound of the nextMXact counter */
1048 
1049  /* Assign the MXID */
1050  result = MultiXactState->nextMXact;
1051 
1052  /*----------
1053  * Check to see if it's safe to assign another MultiXactId. This protects
1054  * against catastrophic data loss due to multixact wraparound. The basic
1055  * rules are:
1056  *
1057  * If we're past multiVacLimit or the safe threshold for member storage
1058  * space, or we don't know what the safe threshold for member storage is,
1059  * start trying to force autovacuum cycles.
1060  * If we're past multiWarnLimit, start issuing warnings.
1061  * If we're past multiStopLimit, refuse to create new MultiXactIds.
1062  *
1063  * Note these are pretty much the same protections in GetNewTransactionId.
1064  *----------
1065  */
1067  {
1068  /*
1069  * For safety's sake, we release MultiXactGenLock while sending
1070  * signals, warnings, etc. This is not so much because we care about
1071  * preserving concurrency in this situation, as to avoid any
1072  * possibility of deadlock while doing get_database_name(). First,
1073  * copy all the shared values we'll need in this path.
1074  */
1075  MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
1076  MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
1077  MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
1078  Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
1079 
1080  LWLockRelease(MultiXactGenLock);
1081 
1082  if (IsUnderPostmaster &&
1083  !MultiXactIdPrecedes(result, multiStopLimit))
1084  {
1085  char *oldest_datname = get_database_name(oldest_datoid);
1086 
1087  /*
1088  * Immediately kick autovacuum into action as we're already in
1089  * ERROR territory.
1090  */
1092 
1093  /* complain even if that DB has disappeared */
1094  if (oldest_datname)
1095  ereport(ERROR,
1096  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1097  errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1098  oldest_datname),
1099  errhint("Execute a database-wide VACUUM in that database.\n"
1100  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1101  else
1102  ereport(ERROR,
1103  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1104  errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1105  oldest_datoid),
1106  errhint("Execute a database-wide VACUUM in that database.\n"
1107  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1108  }
1109 
1110  /*
1111  * To avoid swamping the postmaster with signals, we issue the autovac
1112  * request only once per 64K multis generated. This still gives
1113  * plenty of chances before we get into real trouble.
1114  */
1115  if (IsUnderPostmaster && (result % 65536) == 0)
1117 
1118  if (!MultiXactIdPrecedes(result, multiWarnLimit))
1119  {
1120  char *oldest_datname = get_database_name(oldest_datoid);
1121 
1122  /* complain even if that DB has disappeared */
1123  if (oldest_datname)
1124  ereport(WARNING,
1125  (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1126  "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1127  multiWrapLimit - result,
1128  oldest_datname,
1129  multiWrapLimit - result),
1130  errhint("Execute a database-wide VACUUM in that database.\n"
1131  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1132  else
1133  ereport(WARNING,
1134  (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1135  "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1136  multiWrapLimit - result,
1137  oldest_datoid,
1138  multiWrapLimit - result),
1139  errhint("Execute a database-wide VACUUM in that database.\n"
1140  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1141  }
1142 
1143  /* Re-acquire lock and start over */
1144  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1145  result = MultiXactState->nextMXact;
1146  if (result < FirstMultiXactId)
1147  result = FirstMultiXactId;
1148  }
1149 
1150  /* Make sure there is room for the MXID in the file. */
1151  ExtendMultiXactOffset(result);
1152 
1153  /*
1154  * Reserve the members space, similarly to above. Also, be careful not to
1155  * return zero as the starting offset for any multixact. See
1156  * GetMultiXactIdMembers() for motivation.
1157  */
1158  nextOffset = MultiXactState->nextOffset;
1159  if (nextOffset == 0)
1160  {
1161  *offset = 1;
1162  nmembers++; /* allocate member slot 0 too */
1163  }
1164  else
1165  *offset = nextOffset;
1166 
1167  /*----------
1168  * Protect against overrun of the members space as well, with the
1169  * following rules:
1170  *
1171  * If we're past offsetStopLimit, refuse to generate more multis.
1172  * If we're close to offsetStopLimit, emit a warning.
1173  *
1174  * Arbitrarily, we start emitting warnings when we're 20 segments or less
1175  * from offsetStopLimit.
1176  *
1177  * Note we haven't updated the shared state yet, so if we fail at this
1178  * point, the multixact ID we grabbed can still be used by the next guy.
1179  *
1180  * Note that there is no point in forcing autovacuum runs here: the
1181  * multixact freeze settings would have to be reduced for that to have any
1182  * effect.
1183  *----------
1184  */
1185 #define OFFSET_WARN_SEGMENTS 20
1188  nmembers))
1189  {
1190  /* see comment in the corresponding offsets wraparound case */
1192 
1193  ereport(ERROR,
1194  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1195  errmsg("multixact \"members\" limit exceeded"),
1196  errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1197  "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1198  MultiXactState->offsetStopLimit - nextOffset - 1,
1199  nmembers,
1200  MultiXactState->offsetStopLimit - nextOffset - 1),
1201  errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
1203  }
1204 
1205  /*
1206  * Check whether we should kick autovacuum into action, to prevent members
1207  * wraparound. NB we use a much larger window to trigger autovacuum than
1208  * just the warning limit. The warning is just a measure of last resort -
1209  * this is in line with GetNewTransactionId's behaviour.
1210  */
1214  {
1215  /*
1216  * To avoid swamping the postmaster with signals, we issue the autovac
1217  * request only when crossing a segment boundary. With default
1218  * compilation settings that's roughly after 50k members. This still
1219  * gives plenty of chances before we get into real trouble.
1220  */
1221  if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1222  (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1224  }
1225 
1228  nextOffset,
1230  ereport(WARNING,
1231  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1232  errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1233  "database with OID %u must be vacuumed before %d more multixact members are used",
1234  MultiXactState->offsetStopLimit - nextOffset + nmembers,
1236  MultiXactState->offsetStopLimit - nextOffset + nmembers),
1237  errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
1238 
1239  ExtendMultiXactMember(nextOffset, nmembers);
1240 
1241  /*
1242  * Critical section from here until caller has written the data into the
1243  * just-reserved SLRU space; we don't want to error out with a partly
1244  * written MultiXact structure. (In particular, failing to write our
1245  * start offset after advancing nextMXact would effectively corrupt the
1246  * previous MultiXact.)
1247  */
1249 
1250  /*
1251  * Advance counters. As in GetNewTransactionId(), this must not happen
1252  * until after file extension has succeeded!
1253  *
1254  * We don't care about MultiXactId wraparound here; it will be handled by
1255  * the next iteration. But note that nextMXact may be InvalidMultiXactId
1256  * or the first value on a segment-beginning page after this routine
1257  * exits, so anyone else looking at the variable must be prepared to deal
1258  * with either case. Similarly, nextOffset may be zero, but we won't use
1259  * that as the actual start offset of the next multixact.
1260  */
1262 
1263  MultiXactState->nextOffset += nmembers;
1264 
1265  LWLockRelease(MultiXactGenLock);
1266 
1267  debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1268  return result;
1269 }
1270 
1271 /*
1272  * GetMultiXactIdMembers
1273  * Return the set of MultiXactMembers that make up a MultiXactId
1274  *
1275  * Return value is the number of members found, or -1 if there are none
1276  * (in which case *members is set to NULL). Otherwise *members is set to a
1277  * newly palloc'ed array, which the caller is responsible for freeing.
1278  *
1279  * from_pgupgrade must be passed as true if and only if the multixact
1280  * corresponds to a value from a tuple that was locked in a 9.2-or-older
1281  * installation and later pg_upgrade'd (that is, the infomask is
1282  * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1283  * can still be running, so we return -1 just like for an empty multixact
1284  * without any further checking. It would be wrong to try to resolve such a
1285  * multixact: either the multixact is within the current valid multixact
1286  * range, in which case the returned result would be bogus, or outside that
1287  * range, in which case an error would be raised.
1288  *
1289  * In all other cases, the passed multixact must be within the known valid
1290  * range, that is, greater than or equal to oldestMultiXactId, and less than
1291  * nextMXact. Otherwise, an error is raised.
1292  *
1293  * isLockOnly must be set to true if caller is certain that the given multi
1294  * is used only to lock tuples; can be false without loss of correctness,
1295  * but passing a true means we can return quickly without checking for
1296  * old updates.
1297  */
1298 int
1300  bool from_pgupgrade, bool isLockOnly)
1301 {
1302  int64 pageno;
1303  int64 prev_pageno;
1304  int entryno;
1305  int slotno;
1306  MultiXactOffset *offptr;
1307  MultiXactOffset offset;
1308  int length;
1309  int truelength;
1310  MultiXactId oldestMXact;
1311  MultiXactId nextMXact;
1312  MultiXactId tmpMXact;
1313  MultiXactOffset nextOffset;
1314  MultiXactMember *ptr;
1315  LWLock *lock;
1316  bool slept = false;
1317 
1318  debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1319 
1320  if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1321  {
1322  *members = NULL;
1323  return -1;
1324  }
1325 
1326  /* See if the MultiXactId is in the local cache */
1327  length = mXactCacheGetById(multi, members);
1328  if (length >= 0)
1329  {
1330  debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1331  mxid_to_string(multi, length, *members));
1332  return length;
1333  }
1334 
1335  /* Set our OldestVisibleMXactId[] entry if we didn't already */
1337 
1338  /*
1339  * If we know the multi is used only for locking and not for updates, then
1340  * we can skip checking if the value is older than our oldest visible
1341  * multi. It cannot possibly still be running.
1342  */
1343  if (isLockOnly &&
1345  {
1346  debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1347  *members = NULL;
1348  return -1;
1349  }
1350 
1351  /*
1352  * We check known limits on MultiXact before resorting to the SLRU area.
1353  *
1354  * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1355  * useful; it has already been removed, or will be removed shortly, by
1356  * truncation. If one is passed, an error is raised.
1357  *
1358  * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1359  * implies undetected ID wraparound has occurred. This raises a hard
1360  * error.
1361  *
1362  * Shared lock is enough here since we aren't modifying any global state.
1363  * Acquire it just long enough to grab the current counter values. We may
1364  * need both nextMXact and nextOffset; see below.
1365  */
1366  LWLockAcquire(MultiXactGenLock, LW_SHARED);
1367 
1368  oldestMXact = MultiXactState->oldestMultiXactId;
1369  nextMXact = MultiXactState->nextMXact;
1370  nextOffset = MultiXactState->nextOffset;
1371 
1372  LWLockRelease(MultiXactGenLock);
1373 
1374  if (MultiXactIdPrecedes(multi, oldestMXact))
1375  ereport(ERROR,
1376  (errcode(ERRCODE_INTERNAL_ERROR),
1377  errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1378  multi)));
1379 
1380  if (!MultiXactIdPrecedes(multi, nextMXact))
1381  ereport(ERROR,
1382  (errcode(ERRCODE_INTERNAL_ERROR),
1383  errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1384  multi)));
1385 
1386  /*
1387  * Find out the offset at which we need to start reading MultiXactMembers
1388  * and the number of members in the multixact. We determine the latter as
1389  * the difference between this multixact's starting offset and the next
1390  * one's. However, there are some corner cases to worry about:
1391  *
1392  * 1. This multixact may be the latest one created, in which case there is
1393  * no next one to look at. In this case the nextOffset value we just
1394  * saved is the correct endpoint.
1395  *
1396  * 2. The next multixact may still be in process of being filled in: that
1397  * is, another process may have done GetNewMultiXactId but not yet written
1398  * the offset entry for that ID. In that scenario, it is guaranteed that
1399  * the offset entry for that multixact exists (because GetNewMultiXactId
1400  * won't release MultiXactGenLock until it does) but contains zero
1401  * (because we are careful to pre-zero offset pages). Because
1402  * GetNewMultiXactId will never return zero as the starting offset for a
1403  * multixact, when we read zero as the next multixact's offset, we know we
1404  * have this case. We handle this by sleeping on the condition variable
1405  * we have just for this; the process in charge will signal the CV as soon
1406  * as it has finished writing the multixact offset.
1407  *
1408  * 3. Because GetNewMultiXactId increments offset zero to offset one to
1409  * handle case #2, there is an ambiguity near the point of offset
1410  * wraparound. If we see next multixact's offset is one, is that our
1411  * multixact's actual endpoint, or did it end at zero with a subsequent
1412  * increment? We handle this using the knowledge that if the zero'th
1413  * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1414  * transaction ID so it can't be a multixact member. Therefore, if we
1415  * read a zero from the members array, just ignore it.
1416  *
1417  * This is all pretty messy, but the mess occurs only in infrequent corner
1418  * cases, so it seems better than holding the MultiXactGenLock for a long
1419  * time on every multixact creation.
1420  */
1421 retry:
1422  pageno = MultiXactIdToOffsetPage(multi);
1423  entryno = MultiXactIdToOffsetEntry(multi);
1424 
1425  /* Acquire the bank lock for the page we need. */
1426  lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1427  LWLockAcquire(lock, LW_EXCLUSIVE);
1428 
1429  slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1430  offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1431  offptr += entryno;
1432  offset = *offptr;
1433 
1434  Assert(offset != 0);
1435 
1436  /*
1437  * Use the same increment rule as GetNewMultiXactId(), that is, don't
1438  * handle wraparound explicitly until needed.
1439  */
1440  tmpMXact = multi + 1;
1441 
1442  if (nextMXact == tmpMXact)
1443  {
1444  /* Corner case 1: there is no next multixact */
1445  length = nextOffset - offset;
1446  }
1447  else
1448  {
1449  MultiXactOffset nextMXOffset;
1450 
1451  /* handle wraparound if needed */
1452  if (tmpMXact < FirstMultiXactId)
1453  tmpMXact = FirstMultiXactId;
1454 
1455  prev_pageno = pageno;
1456 
1457  pageno = MultiXactIdToOffsetPage(tmpMXact);
1458  entryno = MultiXactIdToOffsetEntry(tmpMXact);
1459 
1460  if (pageno != prev_pageno)
1461  {
1462  LWLock *newlock;
1463 
1464  /*
1465  * Since we're going to access a different SLRU page, if this page
1466  * falls under a different bank, release the old bank's lock and
1467  * acquire the lock of the new bank.
1468  */
1469  newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1470  if (newlock != lock)
1471  {
1472  LWLockRelease(lock);
1473  LWLockAcquire(newlock, LW_EXCLUSIVE);
1474  lock = newlock;
1475  }
1476  slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1477  }
1478 
1479  offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1480  offptr += entryno;
1481  nextMXOffset = *offptr;
1482 
1483  if (nextMXOffset == 0)
1484  {
1485  /* Corner case 2: next multixact is still being filled in */
1486  LWLockRelease(lock);
1488 
1489  INJECTION_POINT("multixact-get-members-cv-sleep");
1490 
1492  WAIT_EVENT_MULTIXACT_CREATION);
1493  slept = true;
1494  goto retry;
1495  }
1496 
1497  length = nextMXOffset - offset;
1498  }
1499 
1500  LWLockRelease(lock);
1501  lock = NULL;
1502 
1503  /*
1504  * If we slept above, clean up state; it's no longer needed.
1505  */
1506  if (slept)
1508 
1509  ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1510 
1511  truelength = 0;
1512  prev_pageno = -1;
1513  for (int i = 0; i < length; i++, offset++)
1514  {
1515  TransactionId *xactptr;
1516  uint32 *flagsptr;
1517  int flagsoff;
1518  int bshift;
1519  int memberoff;
1520 
1521  pageno = MXOffsetToMemberPage(offset);
1522  memberoff = MXOffsetToMemberOffset(offset);
1523 
1524  if (pageno != prev_pageno)
1525  {
1526  LWLock *newlock;
1527 
1528  /*
1529  * Since we're going to access a different SLRU page, if this page
1530  * falls under a different bank, release the old bank's lock and
1531  * acquire the lock of the new bank.
1532  */
1533  newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1534  if (newlock != lock)
1535  {
1536  if (lock)
1537  LWLockRelease(lock);
1538  LWLockAcquire(newlock, LW_EXCLUSIVE);
1539  lock = newlock;
1540  }
1541 
1542  slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1543  prev_pageno = pageno;
1544  }
1545 
1546  xactptr = (TransactionId *)
1547  (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1548 
1549  if (!TransactionIdIsValid(*xactptr))
1550  {
1551  /* Corner case 3: we must be looking at unused slot zero */
1552  Assert(offset == 0);
1553  continue;
1554  }
1555 
1556  flagsoff = MXOffsetToFlagsOffset(offset);
1557  bshift = MXOffsetToFlagsBitShift(offset);
1558  flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1559 
1560  ptr[truelength].xid = *xactptr;
1561  ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1562  truelength++;
1563  }
1564 
1565  LWLockRelease(lock);
1566 
1567  /* A multixid with zero members should not happen */
1568  Assert(truelength > 0);
1569 
1570  /*
1571  * Copy the result into the local cache.
1572  */
1573  mXactCachePut(multi, truelength, ptr);
1574 
1575  debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1576  mxid_to_string(multi, truelength, ptr));
1577  *members = ptr;
1578  return truelength;
1579 }
1580 
1581 /*
1582  * mxactMemberComparator
1583  * qsort comparison function for MultiXactMember
1584  *
1585  * We can't use wraparound comparison for XIDs because that does not respect
1586  * the triangle inequality! Any old sort order will do.
1587  */
1588 static int
1589 mxactMemberComparator(const void *arg1, const void *arg2)
1590 {
1591  MultiXactMember member1 = *(const MultiXactMember *) arg1;
1592  MultiXactMember member2 = *(const MultiXactMember *) arg2;
1593 
1594  if (member1.xid > member2.xid)
1595  return 1;
1596  if (member1.xid < member2.xid)
1597  return -1;
1598  if (member1.status > member2.status)
1599  return 1;
1600  if (member1.status < member2.status)
1601  return -1;
1602  return 0;
1603 }
1604 
1605 /*
1606  * mXactCacheGetBySet
1607  * returns a MultiXactId from the cache based on the set of
1608  * TransactionIds that compose it, or InvalidMultiXactId if
1609  * none matches.
1610  *
1611  * This is helpful, for example, if two transactions want to lock a huge
1612  * table. By using the cache, the second will use the same MultiXactId
1613  * for the majority of tuples, thus keeping MultiXactId usage low (saving
1614  * both I/O and wraparound issues).
1615  *
1616  * NB: the passed members array will be sorted in-place.
1617  */
1618 static MultiXactId
1619 mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1620 {
1621  dlist_iter iter;
1622 
1623  debug_elog3(DEBUG2, "CacheGet: looking for %s",
1624  mxid_to_string(InvalidMultiXactId, nmembers, members));
1625 
1626  /* sort the array so comparison is easy */
1627  qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1628 
1629  dclist_foreach(iter, &MXactCache)
1630  {
1632  iter.cur);
1633 
1634  if (entry->nmembers != nmembers)
1635  continue;
1636 
1637  /*
1638  * We assume the cache entries are sorted, and that the unused bits in
1639  * "status" are zeroed.
1640  */
1641  if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1642  {
1643  debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1645  return entry->multi;
1646  }
1647  }
1648 
1649  debug_elog2(DEBUG2, "CacheGet: not found :-(");
1650  return InvalidMultiXactId;
1651 }
1652 
1653 /*
1654  * mXactCacheGetById
1655  * returns the composing MultiXactMember set from the cache for a
1656  * given MultiXactId, if present.
1657  *
1658  * If successful, *members is set to the address of a palloc'd copy of the
1659  * MultiXactMember set. Return value is number of members, or -1 on failure.
1660  */
1661 static int
1663 {
1664  dlist_iter iter;
1665 
1666  debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1667 
1668  dclist_foreach(iter, &MXactCache)
1669  {
1671  iter.cur);
1672 
1673  if (entry->multi == multi)
1674  {
1675  MultiXactMember *ptr;
1676  Size size;
1677 
1678  size = sizeof(MultiXactMember) * entry->nmembers;
1679  ptr = (MultiXactMember *) palloc(size);
1680 
1681  memcpy(ptr, entry->members, size);
1682 
1683  debug_elog3(DEBUG2, "CacheGet: found %s",
1684  mxid_to_string(multi,
1685  entry->nmembers,
1686  entry->members));
1687 
1688  /*
1689  * Note we modify the list while not using a modifiable iterator.
1690  * This is acceptable only because we exit the iteration
1691  * immediately afterwards.
1692  */
1694 
1695  *members = ptr;
1696  return entry->nmembers;
1697  }
1698  }
1699 
1700  debug_elog2(DEBUG2, "CacheGet: not found");
1701  return -1;
1702 }
1703 
1704 /*
1705  * mXactCachePut
1706  * Add a new MultiXactId and its composing set into the local cache.
1707  */
1708 static void
1709 mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1710 {
1711  mXactCacheEnt *entry;
1712 
1713  debug_elog3(DEBUG2, "CachePut: storing %s",
1714  mxid_to_string(multi, nmembers, members));
1715 
1716  if (MXactContext == NULL)
1717  {
1718  /* The cache only lives as long as the current transaction */
1719  debug_elog2(DEBUG2, "CachePut: initializing memory context");
1721  "MultiXact cache context",
1723  }
1724 
1725  entry = (mXactCacheEnt *)
1727  offsetof(mXactCacheEnt, members) +
1728  nmembers * sizeof(MultiXactMember));
1729 
1730  entry->multi = multi;
1731  entry->nmembers = nmembers;
1732  memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1733 
1734  /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1735  qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1736 
1737  dclist_push_head(&MXactCache, &entry->node);
1739  {
1740  dlist_node *node;
1741 
1742  node = dclist_tail_node(&MXactCache);
1744 
1745  entry = dclist_container(mXactCacheEnt, node, node);
1746  debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1747  entry->multi);
1748 
1749  pfree(entry);
1750  }
1751 }
1752 
1753 static char *
1755 {
1756  switch (status)
1757  {
1759  return "keysh";
1761  return "sh";
1763  return "fornokeyupd";
1765  return "forupd";
1767  return "nokeyupd";
1768  case MultiXactStatusUpdate:
1769  return "upd";
1770  default:
1771  elog(ERROR, "unrecognized multixact status %d", status);
1772  return "";
1773  }
1774 }
1775 
1776 char *
1777 mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1778 {
1779  static char *str = NULL;
1781  int i;
1782 
1783  if (str != NULL)
1784  pfree(str);
1785 
1786  initStringInfo(&buf);
1787 
1788  appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1789  mxstatus_to_string(members[0].status));
1790 
1791  for (i = 1; i < nmembers; i++)
1792  appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1793  mxstatus_to_string(members[i].status));
1794 
1795  appendStringInfoChar(&buf, ']');
1797  pfree(buf.data);
1798  return str;
1799 }
1800 
1801 /*
1802  * AtEOXact_MultiXact
1803  * Handle transaction end for MultiXact
1804  *
1805  * This is called at top transaction commit or abort (we don't care which).
1806  */
1807 void
1809 {
1810  /*
1811  * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1812  * which should only be valid while within a transaction.
1813  *
1814  * We assume that storing a MultiXactId is atomic and so we need not take
1815  * MultiXactGenLock to do this.
1816  */
1819 
1820  /*
1821  * Discard the local MultiXactId cache. Since MXactContext was created as
1822  * a child of TopTransactionContext, we needn't delete it explicitly.
1823  */
1824  MXactContext = NULL;
1826 }
1827 
1828 /*
1829  * AtPrepare_MultiXact
1830  * Save multixact state at 2PC transaction prepare
1831  *
1832  * In this phase, we only store our OldestMemberMXactId value in the two-phase
1833  * state file.
1834  */
1835 void
1837 {
1838  MultiXactId myOldestMember = OldestMemberMXactId[MyProcNumber];
1839 
1840  if (MultiXactIdIsValid(myOldestMember))
1842  &myOldestMember, sizeof(MultiXactId));
1843 }
1844 
1845 /*
1846  * PostPrepare_MultiXact
1847  * Clean up after successful PREPARE TRANSACTION
1848  */
1849 void
1851 {
1852  MultiXactId myOldestMember;
1853 
1854  /*
1855  * Transfer our OldestMemberMXactId value to the slot reserved for the
1856  * prepared transaction.
1857  */
1858  myOldestMember = OldestMemberMXactId[MyProcNumber];
1859  if (MultiXactIdIsValid(myOldestMember))
1860  {
1861  ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
1862 
1863  /*
1864  * Even though storing MultiXactId is atomic, acquire lock to make
1865  * sure others see both changes, not just the reset of the slot of the
1866  * current backend. Using a volatile pointer might suffice, but this
1867  * isn't a hot spot.
1868  */
1869  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1870 
1871  OldestMemberMXactId[dummyProcNumber] = myOldestMember;
1873 
1874  LWLockRelease(MultiXactGenLock);
1875  }
1876 
1877  /*
1878  * We don't need to transfer OldestVisibleMXactId value, because the
1879  * transaction is not going to be looking at any more multixacts once it's
1880  * prepared.
1881  *
1882  * We assume that storing a MultiXactId is atomic and so we need not take
1883  * MultiXactGenLock to do this.
1884  */
1886 
1887  /*
1888  * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1889  */
1890  MXactContext = NULL;
1892 }
1893 
1894 /*
1895  * multixact_twophase_recover
1896  * Recover the state of a prepared transaction at startup
1897  */
1898 void
1900  void *recdata, uint32 len)
1901 {
1902  ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
1903  MultiXactId oldestMember;
1904 
1905  /*
1906  * Get the oldest member XID from the state file record, and set it in the
1907  * OldestMemberMXactId slot reserved for this prepared transaction.
1908  */
1909  Assert(len == sizeof(MultiXactId));
1910  oldestMember = *((MultiXactId *) recdata);
1911 
1912  OldestMemberMXactId[dummyProcNumber] = oldestMember;
1913 }
1914 
1915 /*
1916  * multixact_twophase_postcommit
1917  * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1918  */
1919 void
1921  void *recdata, uint32 len)
1922 {
1923  ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true);
1924 
1925  Assert(len == sizeof(MultiXactId));
1926 
1927  OldestMemberMXactId[dummyProcNumber] = InvalidMultiXactId;
1928 }
1929 
1930 /*
1931  * multixact_twophase_postabort
1932  * This is actually just the same as the COMMIT case.
1933  */
1934 void
1936  void *recdata, uint32 len)
1937 {
1938  multixact_twophase_postcommit(xid, info, recdata, len);
1939 }
1940 
1941 /*
1942  * Initialization of shared memory for MultiXact. We use two SLRU areas,
1943  * thus double memory. Also, reserve space for the shared MultiXactState
1944  * struct and the per-backend MultiXactId arrays (two of those, too).
1945  */
1946 Size
1948 {
1949  Size size;
1950 
1951  /* We need 2*MaxOldestSlot perBackendXactIds[] entries */
1952 #define SHARED_MULTIXACT_STATE_SIZE \
1953  add_size(offsetof(MultiXactStateData, perBackendXactIds), \
1954  mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1955 
1959 
1960  return size;
1961 }
1962 
1963 void
1965 {
1966  bool found;
1967 
1968  debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1969 
1972 
1974  "multixact_offset", multixact_offset_buffers, 0,
1975  "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
1978  false);
1981  "multixact_member", multixact_member_buffers, 0,
1982  "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
1985  false);
1986  /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
1987 
1988  /* Initialize our shared state struct */
1989  MultiXactState = ShmemInitStruct("Shared MultiXact State",
1991  &found);
1992  if (!IsUnderPostmaster)
1993  {
1994  Assert(!found);
1995 
1996  /* Make sure we zero out the per-backend state */
1999  }
2000  else
2001  Assert(found);
2002 
2003  /*
2004  * Set up array pointers.
2005  */
2008 }
2009 
2010 /*
2011  * GUC check_hook for multixact_offset_buffers
2012  */
2013 bool
2015 {
2016  return check_slru_buffers("multixact_offset_buffers", newval);
2017 }
2018 
2019 /*
2020  * GUC check_hook for multixact_member_buffers
2021  */
2022 bool
2024 {
2025  return check_slru_buffers("multixact_member_buffers", newval);
2026 }
2027 
2028 /*
2029  * This func must be called ONCE on system install. It creates the initial
2030  * MultiXact segments. (The MultiXacts directories are assumed to have been
2031  * created by initdb, and MultiXactShmemInit must have been called already.)
2032  */
2033 void
2035 {
2036  int slotno;
2037  LWLock *lock;
2038 
2040  LWLockAcquire(lock, LW_EXCLUSIVE);
2041 
2042  /* Create and zero the first page of the offsets log */
2043  slotno = ZeroMultiXactOffsetPage(0, false);
2044 
2045  /* Make sure it's written out */
2047  Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
2048 
2049  LWLockRelease(lock);
2050 
2052  LWLockAcquire(lock, LW_EXCLUSIVE);
2053 
2054  /* Create and zero the first page of the members log */
2055  slotno = ZeroMultiXactMemberPage(0, false);
2056 
2057  /* Make sure it's written out */
2059  Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
2060 
2061  LWLockRelease(lock);
2062 }
2063 
2064 /*
2065  * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
2066  * If writeXlog is true, also emit an XLOG record saying we did this.
2067  *
2068  * The page is not actually written, just set up in shared memory.
2069  * The slot number of the new page is returned.
2070  *
2071  * Control lock must be held at entry, and will be held at exit.
2072  */
2073 static int
2074 ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
2075 {
2076  int slotno;
2077 
2078  slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2079 
2080  if (writeXlog)
2082 
2083  return slotno;
2084 }
2085 
2086 /*
2087  * Ditto, for MultiXactMember
2088  */
2089 static int
2090 ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
2091 {
2092  int slotno;
2093 
2094  slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
2095 
2096  if (writeXlog)
2098 
2099  return slotno;
2100 }
2101 
2102 /*
2103  * MaybeExtendOffsetSlru
2104  * Extend the offsets SLRU area, if necessary
2105  *
2106  * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
2107  * contain files that are shorter than necessary; this would occur if the old
2108  * installation had used multixacts beyond the first page (files cannot be
2109  * copied, because the on-disk representation is different). pg_upgrade would
2110  * update pg_control to set the next offset value to be at that position, so
2111  * that tuples marked as locked by such MultiXacts would be seen as visible
2112  * without having to consult multixact. However, trying to create and use a
2113  * new MultiXactId would result in an error because the page on which the new
2114  * value would reside does not exist. This routine is in charge of creating
2115  * such pages.
2116  */
2117 static void
2119 {
2120  int64 pageno;
2121  LWLock *lock;
2122 
2124  lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2125 
2126  LWLockAcquire(lock, LW_EXCLUSIVE);
2127 
2129  {
2130  int slotno;
2131 
2132  /*
2133  * Fortunately for us, SimpleLruWritePage is already prepared to deal
2134  * with creating a new segment file even if the page we're writing is
2135  * not the first in it, so this is enough.
2136  */
2137  slotno = ZeroMultiXactOffsetPage(pageno, false);
2139  }
2140 
2141  LWLockRelease(lock);
2142 }
2143 
2144 /*
2145  * This must be called ONCE during postmaster or standalone-backend startup.
2146  *
2147  * StartupXLOG has already established nextMXact/nextOffset by calling
2148  * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
2149  * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
2150  * replayed WAL.
2151  */
2152 void
2154 {
2157  int64 pageno;
2158 
2159  /*
2160  * Initialize offset's idea of the latest page number.
2161  */
2162  pageno = MultiXactIdToOffsetPage(multi);
2163  pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2164  pageno);
2165 
2166  /*
2167  * Initialize member's idea of the latest page number.
2168  */
2169  pageno = MXOffsetToMemberPage(offset);
2170  pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2171  pageno);
2172 }
2173 
2174 /*
2175  * This must be called ONCE at the end of startup/recovery.
2176  */
2177 void
2179 {
2180  MultiXactId nextMXact;
2181  MultiXactOffset offset;
2182  MultiXactId oldestMXact;
2183  Oid oldestMXactDB;
2184  int64 pageno;
2185  int entryno;
2186  int flagsoff;
2187 
2188  LWLockAcquire(MultiXactGenLock, LW_SHARED);
2189  nextMXact = MultiXactState->nextMXact;
2190  offset = MultiXactState->nextOffset;
2191  oldestMXact = MultiXactState->oldestMultiXactId;
2192  oldestMXactDB = MultiXactState->oldestMultiXactDB;
2193  LWLockRelease(MultiXactGenLock);
2194 
2195  /* Clean up offsets state */
2196 
2197  /*
2198  * (Re-)Initialize our idea of the latest page number for offsets.
2199  */
2200  pageno = MultiXactIdToOffsetPage(nextMXact);
2201  pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2202  pageno);
2203 
2204  /*
2205  * Zero out the remainder of the current offsets page. See notes in
2206  * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2207  * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2208  * rule "write xlog before data," nextMXact successors may carry obsolete,
2209  * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2210  * operates normally.
2211  */
2212  entryno = MultiXactIdToOffsetEntry(nextMXact);
2213  if (entryno != 0)
2214  {
2215  int slotno;
2216  MultiXactOffset *offptr;
2218 
2219  LWLockAcquire(lock, LW_EXCLUSIVE);
2220  slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2221  offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2222  offptr += entryno;
2223 
2224  MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2225 
2226  MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2227  LWLockRelease(lock);
2228  }
2229 
2230  /*
2231  * And the same for members.
2232  *
2233  * (Re-)Initialize our idea of the latest page number for members.
2234  */
2235  pageno = MXOffsetToMemberPage(offset);
2236  pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2237  pageno);
2238 
2239  /*
2240  * Zero out the remainder of the current members page. See notes in
2241  * TrimCLOG() for motivation.
2242  */
2243  flagsoff = MXOffsetToFlagsOffset(offset);
2244  if (flagsoff != 0)
2245  {
2246  int slotno;
2247  TransactionId *xidptr;
2248  int memberoff;
2250 
2251  LWLockAcquire(lock, LW_EXCLUSIVE);
2252  memberoff = MXOffsetToMemberOffset(offset);
2253  slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2254  xidptr = (TransactionId *)
2255  (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2256 
2257  MemSet(xidptr, 0, BLCKSZ - memberoff);
2258 
2259  /*
2260  * Note: we don't need to zero out the flag bits in the remaining
2261  * members of the current group, because they are always reset before
2262  * writing.
2263  */
2264 
2265  MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2266  LWLockRelease(lock);
2267  }
2268 
2269  /* signal that we're officially up */
2270  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2272  LWLockRelease(MultiXactGenLock);
2273 
2274  /* Now compute how far away the next members wraparound is. */
2275  SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2276 }
2277 
2278 /*
2279  * Get the MultiXact data to save in a checkpoint record
2280  */
2281 void
2282 MultiXactGetCheckptMulti(bool is_shutdown,
2283  MultiXactId *nextMulti,
2284  MultiXactOffset *nextMultiOffset,
2285  MultiXactId *oldestMulti,
2286  Oid *oldestMultiDB)
2287 {
2288  LWLockAcquire(MultiXactGenLock, LW_SHARED);
2289  *nextMulti = MultiXactState->nextMXact;
2290  *nextMultiOffset = MultiXactState->nextOffset;
2291  *oldestMulti = MultiXactState->oldestMultiXactId;
2292  *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2293  LWLockRelease(MultiXactGenLock);
2294 
2296  "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2297  *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2298 }
2299 
2300 /*
2301  * Perform a checkpoint --- either during shutdown, or on-the-fly
2302  */
2303 void
2305 {
2306  TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2307 
2308  /*
2309  * Write dirty MultiXact pages to disk. This may result in sync requests
2310  * queued for later handling by ProcessSyncRequests(), as part of the
2311  * checkpoint.
2312  */
2315 
2316  TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2317 }
2318 
2319 /*
2320  * Set the next-to-be-assigned MultiXactId and offset
2321  *
2322  * This is used when we can determine the correct next ID/offset exactly
2323  * from a checkpoint record. Although this is only called during bootstrap
2324  * and XLog replay, we take the lock in case any hot-standby backends are
2325  * examining the values.
2326  */
2327 void
2329  MultiXactOffset nextMultiOffset)
2330 {
2331  debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2332  nextMulti, nextMultiOffset);
2333  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2334  MultiXactState->nextMXact = nextMulti;
2335  MultiXactState->nextOffset = nextMultiOffset;
2336  LWLockRelease(MultiXactGenLock);
2337 
2338  /*
2339  * During a binary upgrade, make sure that the offsets SLRU is large
2340  * enough to contain the next value that would be created.
2341  *
2342  * We need to do this pretty early during the first startup in binary
2343  * upgrade mode: before StartupMultiXact() in fact, because this routine
2344  * is called even before that by StartupXLOG(). And we can't do it
2345  * earlier than at this point, because during that first call of this
2346  * routine we determine the MultiXactState->nextMXact value that
2347  * MaybeExtendOffsetSlru needs.
2348  */
2349  if (IsBinaryUpgrade)
2351 }
2352 
2353 /*
2354  * Determine the last safe MultiXactId to allocate given the currently oldest
2355  * datminmxid (ie, the oldest MultiXactId that might exist in any database
2356  * of our cluster), and the OID of the (or a) database with that value.
2357  *
2358  * is_startup is true when we are just starting the cluster, false when we
2359  * are updating state in a running cluster. This only affects log messages.
2360  */
2361 void
2362 SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2363  bool is_startup)
2364 {
2365  MultiXactId multiVacLimit;
2366  MultiXactId multiWarnLimit;
2367  MultiXactId multiStopLimit;
2368  MultiXactId multiWrapLimit;
2369  MultiXactId curMulti;
2370  bool needs_offset_vacuum;
2371 
2372  Assert(MultiXactIdIsValid(oldest_datminmxid));
2373 
2374  /*
2375  * We pretend that a wrap will happen halfway through the multixact ID
2376  * space, but that's not really true, because multixacts wrap differently
2377  * from transaction IDs. Note that, separately from any concern about
2378  * multixact IDs wrapping, we must ensure that multixact members do not
2379  * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
2380  */
2381  multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2382  if (multiWrapLimit < FirstMultiXactId)
2383  multiWrapLimit += FirstMultiXactId;
2384 
2385  /*
2386  * We'll refuse to continue assigning MultiXactIds once we get within 3M
2387  * multi of data loss. See SetTransactionIdLimit.
2388  */
2389  multiStopLimit = multiWrapLimit - 3000000;
2390  if (multiStopLimit < FirstMultiXactId)
2391  multiStopLimit -= FirstMultiXactId;
2392 
2393  /*
2394  * We'll start complaining loudly when we get within 40M multis of data
2395  * loss. This is kind of arbitrary, but if you let your gas gauge get
2396  * down to 2% of full, would you be looking for the next gas station? We
2397  * need to be fairly liberal about this number because there are lots of
2398  * scenarios where most transactions are done by automatic clients that
2399  * won't pay attention to warnings. (No, we're not gonna make this
2400  * configurable. If you know enough to configure it, you know enough to
2401  * not get in this kind of trouble in the first place.)
2402  */
2403  multiWarnLimit = multiWrapLimit - 40000000;
2404  if (multiWarnLimit < FirstMultiXactId)
2405  multiWarnLimit -= FirstMultiXactId;
2406 
2407  /*
2408  * We'll start trying to force autovacuums when oldest_datminmxid gets to
2409  * be more than autovacuum_multixact_freeze_max_age mxids old.
2410  *
2411  * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2412  * so that we don't have to worry about dealing with on-the-fly changes in
2413  * its value. See SetTransactionIdLimit.
2414  */
2415  multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2416  if (multiVacLimit < FirstMultiXactId)
2417  multiVacLimit += FirstMultiXactId;
2418 
2419  /* Grab lock for just long enough to set the new limit values */
2420  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2421  MultiXactState->oldestMultiXactId = oldest_datminmxid;
2422  MultiXactState->oldestMultiXactDB = oldest_datoid;
2423  MultiXactState->multiVacLimit = multiVacLimit;
2424  MultiXactState->multiWarnLimit = multiWarnLimit;
2425  MultiXactState->multiStopLimit = multiStopLimit;
2426  MultiXactState->multiWrapLimit = multiWrapLimit;
2427  curMulti = MultiXactState->nextMXact;
2428  LWLockRelease(MultiXactGenLock);
2429 
2430  /* Log the info */
2431  ereport(DEBUG1,
2432  (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2433  multiWrapLimit, oldest_datoid)));
2434 
2435  /*
2436  * Computing the actual limits is only possible once the data directory is
2437  * in a consistent state. There's no need to compute the limits while
2438  * still replaying WAL - no decisions about new multis are made even
2439  * though multixact creations might be replayed. So we'll only do further
2440  * checks after TrimMultiXact() has been called.
2441  */
2443  return;
2444 
2445  Assert(!InRecovery);
2446 
2447  /* Set limits for offset vacuum. */
2448  needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2449 
2450  /*
2451  * If past the autovacuum force point, immediately signal an autovac
2452  * request. The reason for this is that autovac only processes one
2453  * database per invocation. Once it's finished cleaning up the oldest
2454  * database, it'll call here, and we'll signal the postmaster to start
2455  * another iteration immediately if there are still any old databases.
2456  */
2457  if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2458  needs_offset_vacuum) && IsUnderPostmaster)
2460 
2461  /* Give an immediate warning if past the wrap warn point */
2462  if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2463  {
2464  char *oldest_datname;
2465 
2466  /*
2467  * We can be called when not inside a transaction, for example during
2468  * StartupXLOG(). In such a case we cannot do database access, so we
2469  * must just report the oldest DB's OID.
2470  *
2471  * Note: it's also possible that get_database_name fails and returns
2472  * NULL, for example because the database just got dropped. We'll
2473  * still warn, even though the warning might now be unnecessary.
2474  */
2475  if (IsTransactionState())
2476  oldest_datname = get_database_name(oldest_datoid);
2477  else
2478  oldest_datname = NULL;
2479 
2480  if (oldest_datname)
2481  ereport(WARNING,
2482  (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2483  "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2484  multiWrapLimit - curMulti,
2485  oldest_datname,
2486  multiWrapLimit - curMulti),
2487  errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2488  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2489  else
2490  ereport(WARNING,
2491  (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2492  "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2493  multiWrapLimit - curMulti,
2494  oldest_datoid,
2495  multiWrapLimit - curMulti),
2496  errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2497  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2498  }
2499 }
2500 
2501 /*
2502  * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2503  * and similarly nextOffset is at least minMultiOffset.
2504  *
2505  * This is used when we can determine minimum safe values from an XLog
2506  * record (either an on-line checkpoint or an mxact creation log entry).
2507  * Although this is only called during XLog replay, we take the lock in case
2508  * any hot-standby backends are examining the values.
2509  */
2510 void
2512  MultiXactOffset minMultiOffset)
2513 {
2514  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2516  {
2517  debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2518  MultiXactState->nextMXact = minMulti;
2519  }
2520  if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
2521  {
2522  debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2523  minMultiOffset);
2524  MultiXactState->nextOffset = minMultiOffset;
2525  }
2526  LWLockRelease(MultiXactGenLock);
2527 }
2528 
2529 /*
2530  * Update our oldestMultiXactId value, but only if it's more recent than what
2531  * we had.
2532  *
2533  * This may only be called during WAL replay.
2534  */
2535 void
2536 MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2537 {
2538  Assert(InRecovery);
2539 
2541  SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2542 }
2543 
2544 /*
2545  * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2546  *
2547  * NB: this is called while holding MultiXactGenLock. We want it to be very
2548  * fast most of the time; even when it's not so fast, no actual I/O need
2549  * happen unless we're forced to write out a dirty log or xlog page to make
2550  * room in shared memory.
2551  */
2552 static void
2554 {
2555  int64 pageno;
2556  LWLock *lock;
2557 
2558  /*
2559  * No work except at first MultiXactId of a page. But beware: just after
2560  * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2561  */
2562  if (MultiXactIdToOffsetEntry(multi) != 0 &&
2563  multi != FirstMultiXactId)
2564  return;
2565 
2566  pageno = MultiXactIdToOffsetPage(multi);
2567  lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2568 
2569  LWLockAcquire(lock, LW_EXCLUSIVE);
2570 
2571  /* Zero the page and make an XLOG entry about it */
2572  ZeroMultiXactOffsetPage(pageno, true);
2573 
2574  LWLockRelease(lock);
2575 }
2576 
2577 /*
2578  * Make sure that MultiXactMember has room for the members of a newly-
2579  * allocated MultiXactId.
2580  *
2581  * Like the above routine, this is called while holding MultiXactGenLock;
2582  * same comments apply.
2583  */
2584 static void
2586 {
2587  /*
2588  * It's possible that the members span more than one page of the members
2589  * file, so we loop to ensure we consider each page. The coding is not
2590  * optimal if the members span several pages, but that seems unusual
2591  * enough to not worry much about.
2592  */
2593  while (nmembers > 0)
2594  {
2595  int flagsoff;
2596  int flagsbit;
2598 
2599  /*
2600  * Only zero when at first entry of a page.
2601  */
2602  flagsoff = MXOffsetToFlagsOffset(offset);
2603  flagsbit = MXOffsetToFlagsBitShift(offset);
2604  if (flagsoff == 0 && flagsbit == 0)
2605  {
2606  int64 pageno;
2607  LWLock *lock;
2608 
2609  pageno = MXOffsetToMemberPage(offset);
2610  lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2611 
2612  LWLockAcquire(lock, LW_EXCLUSIVE);
2613 
2614  /* Zero the page and make an XLOG entry about it */
2615  ZeroMultiXactMemberPage(pageno, true);
2616 
2617  LWLockRelease(lock);
2618  }
2619 
2620  /*
2621  * Compute the number of items till end of current page. Careful: if
2622  * addition of unsigned ints wraps around, we're at the last page of
2623  * the last segment; since that page holds a different number of items
2624  * than other pages, we need to do it differently.
2625  */
2626  if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2627  {
2628  /*
2629  * This is the last page of the last segment; we can compute the
2630  * number of items left to allocate in it without modulo
2631  * arithmetic.
2632  */
2633  difference = MaxMultiXactOffset - offset + 1;
2634  }
2635  else
2637 
2638  /*
2639  * Advance to next page, taking care to properly handle the wraparound
2640  * case. OK if nmembers goes negative.
2641  */
2642  nmembers -= difference;
2643  offset += difference;
2644  }
2645 }
2646 
2647 /*
2648  * GetOldestMultiXactId
2649  *
2650  * Return the oldest MultiXactId that's still possibly still seen as live by
2651  * any running transaction. Older ones might still exist on disk, but they no
2652  * longer have any running member transaction.
2653  *
2654  * It's not safe to truncate MultiXact SLRU segments on the value returned by
2655  * this function; however, it can be set as the new relminmxid for any table
2656  * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2657  * to truncate SLRUs when no table can possibly still have a referencing MXID.
2658  */
2661 {
2662  MultiXactId oldestMXact;
2663  MultiXactId nextMXact;
2664  int i;
2665 
2666  /*
2667  * This is the oldest valid value among all the OldestMemberMXactId[] and
2668  * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2669  */
2670  LWLockAcquire(MultiXactGenLock, LW_SHARED);
2671 
2672  /*
2673  * We have to beware of the possibility that nextMXact is in the
2674  * wrapped-around state. We don't fix the counter itself here, but we
2675  * must be sure to use a valid value in our calculation.
2676  */
2677  nextMXact = MultiXactState->nextMXact;
2678  if (nextMXact < FirstMultiXactId)
2679  nextMXact = FirstMultiXactId;
2680 
2681  oldestMXact = nextMXact;
2682  for (i = 0; i < MaxOldestSlot; i++)
2683  {
2684  MultiXactId thisoldest;
2685 
2686  thisoldest = OldestMemberMXactId[i];
2687  if (MultiXactIdIsValid(thisoldest) &&
2688  MultiXactIdPrecedes(thisoldest, oldestMXact))
2689  oldestMXact = thisoldest;
2690  thisoldest = OldestVisibleMXactId[i];
2691  if (MultiXactIdIsValid(thisoldest) &&
2692  MultiXactIdPrecedes(thisoldest, oldestMXact))
2693  oldestMXact = thisoldest;
2694  }
2695 
2696  LWLockRelease(MultiXactGenLock);
2697 
2698  return oldestMXact;
2699 }
2700 
2701 /*
2702  * Determine how aggressively we need to vacuum in order to prevent member
2703  * wraparound.
2704  *
2705  * To do so determine what's the oldest member offset and install the limit
2706  * info in MultiXactState, where it can be used to prevent overrun of old data
2707  * in the members SLRU area.
2708  *
2709  * The return value is true if emergency autovacuum is required and false
2710  * otherwise.
2711  */
2712 static bool
2713 SetOffsetVacuumLimit(bool is_startup)
2714 {
2715  MultiXactId oldestMultiXactId;
2716  MultiXactId nextMXact;
2717  MultiXactOffset oldestOffset = 0; /* placate compiler */
2718  MultiXactOffset prevOldestOffset;
2719  MultiXactOffset nextOffset;
2720  bool oldestOffsetKnown = false;
2721  bool prevOldestOffsetKnown;
2722  MultiXactOffset offsetStopLimit = 0;
2723  MultiXactOffset prevOffsetStopLimit;
2724 
2725  /*
2726  * NB: Have to prevent concurrent truncation, we might otherwise try to
2727  * lookup an oldestMulti that's concurrently getting truncated away.
2728  */
2729  LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2730 
2731  /* Read relevant fields from shared memory. */
2732  LWLockAcquire(MultiXactGenLock, LW_SHARED);
2733  oldestMultiXactId = MultiXactState->oldestMultiXactId;
2734  nextMXact = MultiXactState->nextMXact;
2735  nextOffset = MultiXactState->nextOffset;
2736  prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2737  prevOldestOffset = MultiXactState->oldestOffset;
2738  prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2740  LWLockRelease(MultiXactGenLock);
2741 
2742  /*
2743  * Determine the offset of the oldest multixact. Normally, we can read
2744  * the offset from the multixact itself, but there's an important special
2745  * case: if there are no multixacts in existence at all, oldestMXact
2746  * obviously can't point to one. It will instead point to the multixact
2747  * ID that will be assigned the next time one is needed.
2748  */
2749  if (oldestMultiXactId == nextMXact)
2750  {
2751  /*
2752  * When the next multixact gets created, it will be stored at the next
2753  * offset.
2754  */
2755  oldestOffset = nextOffset;
2756  oldestOffsetKnown = true;
2757  }
2758  else
2759  {
2760  /*
2761  * Figure out where the oldest existing multixact's offsets are
2762  * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
2763  * the supposedly-earliest multixact might not really exist. We are
2764  * careful not to fail in that case.
2765  */
2766  oldestOffsetKnown =
2767  find_multixact_start(oldestMultiXactId, &oldestOffset);
2768 
2769  if (oldestOffsetKnown)
2770  ereport(DEBUG1,
2771  (errmsg_internal("oldest MultiXactId member is at offset %u",
2772  oldestOffset)));
2773  else
2774  ereport(LOG,
2775  (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2776  oldestMultiXactId)));
2777  }
2778 
2779  LWLockRelease(MultiXactTruncationLock);
2780 
2781  /*
2782  * If we can, compute limits (and install them MultiXactState) to prevent
2783  * overrun of old data in the members SLRU area. We can only do so if the
2784  * oldest offset is known though.
2785  */
2786  if (oldestOffsetKnown)
2787  {
2788  /* move back to start of the corresponding segment */
2789  offsetStopLimit = oldestOffset - (oldestOffset %
2791 
2792  /* always leave one segment before the wraparound point */
2793  offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
2794 
2795  if (!prevOldestOffsetKnown && !is_startup)
2796  ereport(LOG,
2797  (errmsg("MultiXact member wraparound protections are now enabled")));
2798 
2799  ereport(DEBUG1,
2800  (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
2801  offsetStopLimit, oldestMultiXactId)));
2802  }
2803  else if (prevOldestOffsetKnown)
2804  {
2805  /*
2806  * If we failed to get the oldest offset this time, but we have a
2807  * value from a previous pass through this function, use the old
2808  * values rather than automatically forcing an emergency autovacuum
2809  * cycle again.
2810  */
2811  oldestOffset = prevOldestOffset;
2812  oldestOffsetKnown = true;
2813  offsetStopLimit = prevOffsetStopLimit;
2814  }
2815 
2816  /* Install the computed values */
2817  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2818  MultiXactState->oldestOffset = oldestOffset;
2819  MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2820  MultiXactState->offsetStopLimit = offsetStopLimit;
2821  LWLockRelease(MultiXactGenLock);
2822 
2823  /*
2824  * Do we need an emergency autovacuum? If we're not sure, assume yes.
2825  */
2826  return !oldestOffsetKnown ||
2827  (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2828 }
2829 
2830 /*
2831  * Return whether adding "distance" to "start" would move past "boundary".
2832  *
2833  * We use this to determine whether the addition is "wrapping around" the
2834  * boundary point, hence the name. The reason we don't want to use the regular
2835  * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2836  * the 2^32-1 space here, allowing for more multixacts than would fit
2837  * otherwise.
2838  */
2839 static bool
2841  uint32 distance)
2842 {
2843  MultiXactOffset finish;
2844 
2845  /*
2846  * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2847  * if the addition wraps around the UINT_MAX boundary, skip that value.
2848  */
2849  finish = start + distance;
2850  if (finish < start)
2851  finish++;
2852 
2853  /*-----------------------------------------------------------------------
2854  * When the boundary is numerically greater than the starting point, any
2855  * value numerically between the two is not wrapped:
2856  *
2857  * <----S----B---->
2858  * [---) = F wrapped past B (and UINT_MAX)
2859  * [---) = F not wrapped
2860  * [----] = F wrapped past B
2861  *
2862  * When the boundary is numerically less than the starting point (i.e. the
2863  * UINT_MAX wraparound occurs somewhere in between) then all values in
2864  * between are wrapped:
2865  *
2866  * <----B----S---->
2867  * [---) = F not wrapped past B (but wrapped past UINT_MAX)
2868  * [---) = F wrapped past B (and UINT_MAX)
2869  * [----] = F not wrapped
2870  *-----------------------------------------------------------------------
2871  */
2872  if (start < boundary)
2873  return finish >= boundary || finish < start;
2874  else
2875  return finish >= boundary && finish < start;
2876 }
2877 
2878 /*
2879  * Find the starting offset of the given MultiXactId.
2880  *
2881  * Returns false if the file containing the multi does not exist on disk.
2882  * Otherwise, returns true and sets *result to the starting member offset.
2883  *
2884  * This function does not prevent concurrent truncation, so if that's
2885  * required, the caller has to protect against that.
2886  */
2887 static bool
2889 {
2890  MultiXactOffset offset;
2891  int64 pageno;
2892  int entryno;
2893  int slotno;
2894  MultiXactOffset *offptr;
2895 
2897 
2898  pageno = MultiXactIdToOffsetPage(multi);
2899  entryno = MultiXactIdToOffsetEntry(multi);
2900 
2901  /*
2902  * Write out dirty data, so PhysicalPageExists can work correctly.
2903  */
2906 
2908  return false;
2909 
2910  /* lock is acquired by SimpleLruReadPage_ReadOnly */
2911  slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2912  offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2913  offptr += entryno;
2914  offset = *offptr;
2916 
2917  *result = offset;
2918  return true;
2919 }
2920 
2921 /*
2922  * Determine how many multixacts, and how many multixact members, currently
2923  * exist. Return false if unable to determine.
2924  */
2925 static bool
2927 {
2928  MultiXactOffset nextOffset;
2929  MultiXactOffset oldestOffset;
2930  MultiXactId oldestMultiXactId;
2931  MultiXactId nextMultiXactId;
2932  bool oldestOffsetKnown;
2933 
2934  LWLockAcquire(MultiXactGenLock, LW_SHARED);
2935  nextOffset = MultiXactState->nextOffset;
2936  oldestMultiXactId = MultiXactState->oldestMultiXactId;
2937  nextMultiXactId = MultiXactState->nextMXact;
2938  oldestOffset = MultiXactState->oldestOffset;
2939  oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2940  LWLockRelease(MultiXactGenLock);
2941 
2942  if (!oldestOffsetKnown)
2943  return false;
2944 
2945  *members = nextOffset - oldestOffset;
2946  *multixacts = nextMultiXactId - oldestMultiXactId;
2947  return true;
2948 }
2949 
2950 /*
2951  * Multixact members can be removed once the multixacts that refer to them
2952  * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2953  * vacuum_multixact_freeze_table_age work together to make sure we never have
2954  * too many multixacts; we hope that, at least under normal circumstances,
2955  * this will also be sufficient to keep us from using too many offsets.
2956  * However, if the average multixact has many members, we might exhaust the
2957  * members space while still using few enough members that these limits fail
2958  * to trigger relminmxid advancement by VACUUM. At that point, we'd have no
2959  * choice but to start failing multixact-creating operations with an error.
2960  *
2961  * To prevent that, if more than a threshold portion of the members space is
2962  * used, we effectively reduce autovacuum_multixact_freeze_max_age and
2963  * to a value just less than the number of multixacts in use. We hope that
2964  * this will quickly trigger autovacuuming on the table or tables with the
2965  * oldest relminmxid, thus allowing datminmxid values to advance and removing
2966  * some members.
2967  *
2968  * As the fraction of the member space currently in use grows, we become
2969  * more aggressive in clamping this value. That not only causes autovacuum
2970  * to ramp up, but also makes any manual vacuums the user issues more
2971  * aggressive. This happens because vacuum_get_cutoffs() will clamp the
2972  * freeze table and the minimum freeze age cutoffs based on the effective
2973  * autovacuum_multixact_freeze_max_age this function returns. In the worst
2974  * case, we'll claim the freeze_max_age to zero, and every vacuum of any
2975  * table will freeze every multixact.
2976  */
2977 int
2979 {
2980  MultiXactOffset members;
2981  uint32 multixacts;
2982  uint32 victim_multixacts;
2983  double fraction;
2984  int result;
2985 
2986  /* If we can't determine member space utilization, assume the worst. */
2987  if (!ReadMultiXactCounts(&multixacts, &members))
2988  return 0;
2989 
2990  /* If member space utilization is low, no special action is required. */
2991  if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2993 
2994  /*
2995  * Compute a target for relminmxid advancement. The number of multixacts
2996  * we try to eliminate from the system is based on how far we are past
2997  * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2998  */
2999  fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
3001  victim_multixacts = multixacts * fraction;
3002 
3003  /* fraction could be > 1.0, but lowest possible freeze age is zero */
3004  if (victim_multixacts > multixacts)
3005  return 0;
3006  result = multixacts - victim_multixacts;
3007 
3008  /*
3009  * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
3010  * autovacuum less aggressive than it would otherwise be.
3011  */
3012  return Min(result, autovacuum_multixact_freeze_max_age);
3013 }
3014 
3015 typedef struct mxtruncinfo
3016 {
3019 
3020 /*
3021  * SlruScanDirectory callback
3022  * This callback determines the earliest existing page number.
3023  */
3024 static bool
3026 {
3027  mxtruncinfo *trunc = (mxtruncinfo *) data;
3028 
3029  if (trunc->earliestExistingPage == -1 ||
3030  ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
3031  {
3032  trunc->earliestExistingPage = segpage;
3033  }
3034 
3035  return false; /* keep going */
3036 }
3037 
3038 
3039 /*
3040  * Delete members segments [oldest, newOldest)
3041  *
3042  * The members SLRU can, in contrast to the offsets one, be filled to almost
3043  * the full range at once. This means SimpleLruTruncate() can't trivially be
3044  * used - instead the to-be-deleted range is computed using the offsets
3045  * SLRU. C.f. TruncateMultiXact().
3046  */
3047 static void
3049 {
3050  const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
3051  int64 startsegment = MXOffsetToMemberSegment(oldestOffset);
3052  int64 endsegment = MXOffsetToMemberSegment(newOldestOffset);
3053  int64 segment = startsegment;
3054 
3055  /*
3056  * Delete all the segments but the last one. The last segment can still
3057  * contain, possibly partially, valid data.
3058  */
3059  while (segment != endsegment)
3060  {
3061  elog(DEBUG2, "truncating multixact members segment %llx",
3062  (unsigned long long) segment);
3064 
3065  /* move to next segment, handling wraparound correctly */
3066  if (segment == maxsegment)
3067  segment = 0;
3068  else
3069  segment += 1;
3070  }
3071 }
3072 
3073 /*
3074  * Delete offsets segments [oldest, newOldest)
3075  */
3076 static void
3078 {
3079  /*
3080  * We step back one multixact to avoid passing a cutoff page that hasn't
3081  * been created yet in the rare case that oldestMulti would be the first
3082  * item on a page and oldestMulti == nextMulti. In that case, if we
3083  * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
3084  * detection.
3085  */
3087  MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
3088 }
3089 
3090 /*
3091  * Remove all MultiXactOffset and MultiXactMember segments before the oldest
3092  * ones still of interest.
3093  *
3094  * This is only called on a primary as part of vacuum (via
3095  * vac_truncate_clog()). During recovery truncation is done by replaying
3096  * truncation WAL records logged here.
3097  *
3098  * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
3099  * is one of the databases preventing newOldestMulti from increasing.
3100  */
3101 void
3102 TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
3103 {
3104  MultiXactId oldestMulti;
3105  MultiXactId nextMulti;
3106  MultiXactOffset newOldestOffset;
3107  MultiXactOffset oldestOffset;
3108  MultiXactOffset nextOffset;
3109  mxtruncinfo trunc;
3110  MultiXactId earliest;
3111 
3114 
3115  /*
3116  * We can only allow one truncation to happen at once. Otherwise parts of
3117  * members might vanish while we're doing lookups or similar. There's no
3118  * need to have an interlock with creating new multis or such, since those
3119  * are constrained by the limits (which only grow, never shrink).
3120  */
3121  LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3122 
3123  LWLockAcquire(MultiXactGenLock, LW_SHARED);
3124  nextMulti = MultiXactState->nextMXact;
3125  nextOffset = MultiXactState->nextOffset;
3126  oldestMulti = MultiXactState->oldestMultiXactId;
3127  LWLockRelease(MultiXactGenLock);
3128  Assert(MultiXactIdIsValid(oldestMulti));
3129 
3130  /*
3131  * Make sure to only attempt truncation if there's values to truncate
3132  * away. In normal processing values shouldn't go backwards, but there's
3133  * some corner cases (due to bugs) where that's possible.
3134  */
3135  if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
3136  {
3137  LWLockRelease(MultiXactTruncationLock);
3138  return;
3139  }
3140 
3141  /*
3142  * Note we can't just plow ahead with the truncation; it's possible that
3143  * there are no segments to truncate, which is a problem because we are
3144  * going to attempt to read the offsets page to determine where to
3145  * truncate the members SLRU. So we first scan the directory to determine
3146  * the earliest offsets page number that we can read without error.
3147  *
3148  * When nextMXact is less than one segment away from multiWrapLimit,
3149  * SlruScanDirCbFindEarliest can find some early segment other than the
3150  * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
3151  * returns false, because not all pairs of entries have the same answer.)
3152  * That can also arise when an earlier truncation attempt failed unlink()
3153  * or returned early from this function. The only consequence is
3154  * returning early, which wastes space that we could have liberated.
3155  *
3156  * NB: It's also possible that the page that oldestMulti is on has already
3157  * been truncated away, and we crashed before updating oldestMulti.
3158  */
3159  trunc.earliestExistingPage = -1;
3162  if (earliest < FirstMultiXactId)
3163  earliest = FirstMultiXactId;
3164 
3165  /* If there's nothing to remove, we can bail out early. */
3166  if (MultiXactIdPrecedes(oldestMulti, earliest))
3167  {
3168  LWLockRelease(MultiXactTruncationLock);
3169  return;
3170  }
3171 
3172  /*
3173  * First, compute the safe truncation point for MultiXactMember. This is
3174  * the starting offset of the oldest multixact.
3175  *
3176  * Hopefully, find_multixact_start will always work here, because we've
3177  * already checked that it doesn't precede the earliest MultiXact on disk.
3178  * But if it fails, don't truncate anything, and log a message.
3179  */
3180  if (oldestMulti == nextMulti)
3181  {
3182  /* there are NO MultiXacts */
3183  oldestOffset = nextOffset;
3184  }
3185  else if (!find_multixact_start(oldestMulti, &oldestOffset))
3186  {
3187  ereport(LOG,
3188  (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3189  oldestMulti, earliest)));
3190  LWLockRelease(MultiXactTruncationLock);
3191  return;
3192  }
3193 
3194  /*
3195  * Secondly compute up to where to truncate. Lookup the corresponding
3196  * member offset for newOldestMulti for that.
3197  */
3198  if (newOldestMulti == nextMulti)
3199  {
3200  /* there are NO MultiXacts */
3201  newOldestOffset = nextOffset;
3202  }
3203  else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3204  {
3205  ereport(LOG,
3206  (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3207  newOldestMulti)));
3208  LWLockRelease(MultiXactTruncationLock);
3209  return;
3210  }
3211 
3212  elog(DEBUG1, "performing multixact truncation: "
3213  "offsets [%u, %u), offsets segments [%llx, %llx), "
3214  "members [%u, %u), members segments [%llx, %llx)",
3215  oldestMulti, newOldestMulti,
3216  (unsigned long long) MultiXactIdToOffsetSegment(oldestMulti),
3217  (unsigned long long) MultiXactIdToOffsetSegment(newOldestMulti),
3218  oldestOffset, newOldestOffset,
3219  (unsigned long long) MXOffsetToMemberSegment(oldestOffset),
3220  (unsigned long long) MXOffsetToMemberSegment(newOldestOffset));
3221 
3222  /*
3223  * Do truncation, and the WAL logging of the truncation, in a critical
3224  * section. That way offsets/members cannot get out of sync anymore, i.e.
3225  * once consistent the newOldestMulti will always exist in members, even
3226  * if we crashed in the wrong moment.
3227  */
3229 
3230  /*
3231  * Prevent checkpoints from being scheduled concurrently. This is critical
3232  * because otherwise a truncation record might not be replayed after a
3233  * crash/basebackup, even though the state of the data directory would
3234  * require it.
3235  */
3238 
3239  /* WAL log truncation */
3240  WriteMTruncateXlogRec(newOldestMultiDB,
3241  oldestMulti, newOldestMulti,
3242  oldestOffset, newOldestOffset);
3243 
3244  /*
3245  * Update in-memory limits before performing the truncation, while inside
3246  * the critical section: Have to do it before truncation, to prevent
3247  * concurrent lookups of those values. Has to be inside the critical
3248  * section as otherwise a future call to this function would error out,
3249  * while looking up the oldest member in offsets, if our caller crashes
3250  * before updating the limits.
3251  */
3252  LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3253  MultiXactState->oldestMultiXactId = newOldestMulti;
3254  MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3255  LWLockRelease(MultiXactGenLock);
3256 
3257  /* First truncate members */
3258  PerformMembersTruncation(oldestOffset, newOldestOffset);
3259 
3260  /* Then offsets */
3261  PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3262 
3264 
3265  END_CRIT_SECTION();
3266  LWLockRelease(MultiXactTruncationLock);
3267 }
3268 
3269 /*
3270  * Decide whether a MultiXactOffset page number is "older" for truncation
3271  * purposes. Analogous to CLOGPagePrecedes().
3272  *
3273  * Offsetting the values is optional, because MultiXactIdPrecedes() has
3274  * translational symmetry.
3275  */
3276 static bool
3278 {
3279  MultiXactId multi1;
3280  MultiXactId multi2;
3281 
3282  multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3283  multi1 += FirstMultiXactId + 1;
3284  multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3285  multi2 += FirstMultiXactId + 1;
3286 
3287  return (MultiXactIdPrecedes(multi1, multi2) &&
3288  MultiXactIdPrecedes(multi1,
3289  multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
3290 }
3291 
3292 /*
3293  * Decide whether a MultiXactMember page number is "older" for truncation
3294  * purposes. There is no "invalid offset number" so use the numbers verbatim.
3295  */
3296 static bool
3298 {
3299  MultiXactOffset offset1;
3300  MultiXactOffset offset2;
3301 
3302  offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3303  offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3304 
3305  return (MultiXactOffsetPrecedes(offset1, offset2) &&
3306  MultiXactOffsetPrecedes(offset1,
3307  offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
3308 }
3309 
3310 /*
3311  * Decide which of two MultiXactIds is earlier.
3312  *
3313  * XXX do we need to do something special for InvalidMultiXactId?
3314  * (Doesn't look like it.)
3315  */
3316 bool
3318 {
3319  int32 diff = (int32) (multi1 - multi2);
3320 
3321  return (diff < 0);
3322 }
3323 
3324 /*
3325  * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3326  *
3327  * XXX do we need to do something special for InvalidMultiXactId?
3328  * (Doesn't look like it.)
3329  */
3330 bool
3332 {
3333  int32 diff = (int32) (multi1 - multi2);
3334 
3335  return (diff <= 0);
3336 }
3337 
3338 
3339 /*
3340  * Decide which of two offsets is earlier.
3341  */
3342 static bool
3344 {
3345  int32 diff = (int32) (offset1 - offset2);
3346 
3347  return (diff < 0);
3348 }
3349 
3350 /*
3351  * Write an xlog record reflecting the zeroing of either a MEMBERs or
3352  * OFFSETs page (info shows which)
3353  */
3354 static void
3356 {
3357  XLogBeginInsert();
3358  XLogRegisterData((char *) (&pageno), sizeof(pageno));
3359  (void) XLogInsert(RM_MULTIXACT_ID, info);
3360 }
3361 
3362 /*
3363  * Write a TRUNCATE xlog record
3364  *
3365  * We must flush the xlog record to disk before returning --- see notes in
3366  * TruncateCLOG().
3367  */
3368 static void
3370  MultiXactId startTruncOff, MultiXactId endTruncOff,
3371  MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3372 {
3373  XLogRecPtr recptr;
3374  xl_multixact_truncate xlrec;
3375 
3376  xlrec.oldestMultiDB = oldestMultiDB;
3377 
3378  xlrec.startTruncOff = startTruncOff;
3379  xlrec.endTruncOff = endTruncOff;
3380 
3381  xlrec.startTruncMemb = startTruncMemb;
3382  xlrec.endTruncMemb = endTruncMemb;
3383 
3384  XLogBeginInsert();
3385  XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate);
3386  recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3387  XLogFlush(recptr);
3388 }
3389 
3390 /*
3391  * MULTIXACT resource manager's routines
3392  */
3393 void
3395 {
3396  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3397 
3398  /* Backup blocks are not used in multixact records */
3399  Assert(!XLogRecHasAnyBlockRefs(record));
3400 
3401  if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3402  {
3403  int64 pageno;
3404  int slotno;
3405  LWLock *lock;
3406 
3407  memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3408 
3409  lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
3410  LWLockAcquire(lock, LW_EXCLUSIVE);
3411 
3412  slotno = ZeroMultiXactOffsetPage(pageno, false);
3414  Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3415 
3416  LWLockRelease(lock);
3417  }
3418  else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3419  {
3420  int64 pageno;
3421  int slotno;
3422  LWLock *lock;
3423 
3424  memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3425 
3426  lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
3427  LWLockAcquire(lock, LW_EXCLUSIVE);
3428 
3429  slotno = ZeroMultiXactMemberPage(pageno, false);
3431  Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
3432 
3433  LWLockRelease(lock);
3434  }
3435  else if (info == XLOG_MULTIXACT_CREATE_ID)
3436  {
3437  xl_multixact_create *xlrec =
3438  (xl_multixact_create *) XLogRecGetData(record);
3439  TransactionId max_xid;
3440  int i;
3441 
3442  /* Store the data back into the SLRU files */
3443  RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3444  xlrec->members);
3445 
3446  /* Make sure nextMXact/nextOffset are beyond what this record has */
3447  MultiXactAdvanceNextMXact(xlrec->mid + 1,
3448  xlrec->moff + xlrec->nmembers);
3449 
3450  /*
3451  * Make sure nextXid is beyond any XID mentioned in the record. This
3452  * should be unnecessary, since any XID found here ought to have other
3453  * evidence in the XLOG, but let's be safe.
3454  */
3455  max_xid = XLogRecGetXid(record);
3456  for (i = 0; i < xlrec->nmembers; i++)
3457  {
3458  if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3459  max_xid = xlrec->members[i].xid;
3460  }
3461 
3463  }
3464  else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3465  {
3466  xl_multixact_truncate xlrec;
3467  int64 pageno;
3468 
3469  memcpy(&xlrec, XLogRecGetData(record),
3471 
3472  elog(DEBUG1, "replaying multixact truncation: "
3473  "offsets [%u, %u), offsets segments [%llx, %llx), "
3474  "members [%u, %u), members segments [%llx, %llx)",
3475  xlrec.startTruncOff, xlrec.endTruncOff,
3476  (unsigned long long) MultiXactIdToOffsetSegment(xlrec.startTruncOff),
3477  (unsigned long long) MultiXactIdToOffsetSegment(xlrec.endTruncOff),
3478  xlrec.startTruncMemb, xlrec.endTruncMemb,
3479  (unsigned long long) MXOffsetToMemberSegment(xlrec.startTruncMemb),
3480  (unsigned long long) MXOffsetToMemberSegment(xlrec.endTruncMemb));
3481 
3482  /* should not be required, but more than cheap enough */
3483  LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3484 
3485  /*
3486  * Advance the horizon values, so they're current at the end of
3487  * recovery.
3488  */
3489  SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
3490 
3492 
3493  /*
3494  * During XLOG replay, latest_page_number isn't necessarily set up
3495  * yet; insert a suitable value to bypass the sanity test in
3496  * SimpleLruTruncate.
3497  */
3498  pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3499  pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
3500  pageno);
3502 
3503  LWLockRelease(MultiXactTruncationLock);
3504  }
3505  else
3506  elog(PANIC, "multixact_redo: unknown op code %u", info);
3507 }
3508 
3509 Datum
3511 {
3512  typedef struct
3513  {
3514  MultiXactMember *members;
3515  int nmembers;
3516  int iter;
3517  } mxact;
3519  mxact *multi;
3520  FuncCallContext *funccxt;
3521 
3522  if (mxid < FirstMultiXactId)
3523  ereport(ERROR,
3524  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3525  errmsg("invalid MultiXactId: %u", mxid)));
3526 
3527  if (SRF_IS_FIRSTCALL())
3528  {
3529  MemoryContext oldcxt;
3530  TupleDesc tupdesc;
3531 
3532  funccxt = SRF_FIRSTCALL_INIT();
3533  oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
3534 
3535  multi = palloc(sizeof(mxact));
3536  /* no need to allow for old values here */
3537  multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false,
3538  false);
3539  multi->iter = 0;
3540 
3541  if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
3542  elog(ERROR, "return type must be a row type");
3543  funccxt->tuple_desc = tupdesc;
3544  funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
3545  funccxt->user_fctx = multi;
3546 
3547  MemoryContextSwitchTo(oldcxt);
3548  }
3549 
3550  funccxt = SRF_PERCALL_SETUP();
3551  multi = (mxact *) funccxt->user_fctx;
3552 
3553  while (multi->iter < multi->nmembers)
3554  {
3555  HeapTuple tuple;
3556  char *values[2];
3557 
3558  values[0] = psprintf("%u", multi->members[multi->iter].xid);
3559  values[1] = mxstatus_to_string(multi->members[multi->iter].status);
3560 
3561  tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
3562 
3563  multi->iter++;
3564  pfree(values[0]);
3565  SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
3566  }
3567 
3568  SRF_RETURN_DONE(funccxt);
3569 }
3570 
3571 /*
3572  * Entrypoint for sync.c to sync offsets files.
3573  */
3574 int
3575 multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
3576 {
3577  return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
3578 }
3579 
3580 /*
3581  * Entrypoint for sync.c to sync members files.
3582  */
3583 int
3584 multixactmemberssyncfiletag(const FileTag *ftag, char *path)
3585 {
3586  return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
3587 }
static void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:485
int autovacuum_multixact_freeze_max_age
Definition: autovacuum.c:128
static int32 next
Definition: blutils.c:219
static Datum values[MAXATTR]
Definition: bootstrap.c:151
#define Min(x, y)
Definition: c.h:958
uint8_t uint8
Definition: c.h:483
#define Assert(condition)
Definition: c.h:812
int64_t int64
Definition: c.h:482
uint32 MultiXactOffset
Definition: c.h:618
TransactionId MultiXactId
Definition: c.h:616
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:417
int32_t int32
Definition: c.h:481
uint16_t uint16
Definition: c.h:484
uint32_t uint32
Definition: c.h:485
#define MemSet(start, val, len)
Definition: c.h:974
uint32 TransactionId
Definition: c.h:606
size_t Size
Definition: c.h:559
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
char * get_database_name(Oid dbid)
Definition: dbcommands.c:3187
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1180
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errdetail_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1295
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
HeapTuple BuildTupleFromCStrings(AttInMetadata *attinmeta, char **values)
Definition: execTuples.c:2222
AttInMetadata * TupleDescGetAttInMetadata(TupleDesc tupdesc)
Definition: execTuples.c:2173
#define PG_GETARG_TRANSACTIONID(n)
Definition: fmgr.h:279
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
TypeFuncClass get_call_result_type(FunctionCallInfo fcinfo, Oid *resultTypeId, TupleDesc *resultTupleDesc)
Definition: funcapi.c:276
#define SRF_IS_FIRSTCALL()
Definition: funcapi.h:304
#define SRF_PERCALL_SETUP()
Definition: funcapi.h:308
@ TYPEFUNC_COMPOSITE
Definition: funcapi.h:149
#define SRF_RETURN_NEXT(_funcctx, _result)
Definition: funcapi.h:310
#define SRF_FIRSTCALL_INIT()
Definition: funcapi.h:306
static Datum HeapTupleGetDatum(const HeapTupleData *tuple)
Definition: funcapi.h:230
#define SRF_RETURN_DONE(_funcctx)
Definition: funcapi.h:328
Datum difference(PG_FUNCTION_ARGS)
int multixact_offset_buffers
Definition: globals.c:162
bool IsBinaryUpgrade
Definition: globals.c:120
ProcNumber MyProcNumber
Definition: globals.c:89
bool IsUnderPostmaster
Definition: globals.c:119
int multixact_member_buffers
Definition: globals.c:161
#define newval
GucSource
Definition: guc.h:108
return str start
const char * str
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
static dlist_node * dclist_tail_node(dclist_head *head)
Definition: ilist.h:920
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static void dclist_move_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:808
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
#define DCLIST_STATIC_INIT(name)
Definition: ilist.h:282
static void dclist_push_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:693
static void dclist_init(dclist_head *head)
Definition: ilist.h:671
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
#define INJECTION_POINT(name)
#define INJECTION_POINT_CACHED(name)
#define INJECTION_POINT_LOAD(name)
int j
Definition: isn.c:73
int i
Definition: isn.c:72
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
@ LWTRANCHE_MULTIXACTOFFSET_SLRU
Definition: lwlock.h:212
@ LWTRANCHE_MULTIXACTMEMBER_SLRU
Definition: lwlock.h:211
@ LWTRANCHE_MULTIXACTMEMBER_BUFFER
Definition: lwlock.h:183
@ LWTRANCHE_MULTIXACTOFFSET_BUFFER
Definition: lwlock.h:182
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
MemoryContext TopTransactionContext
Definition: mcxt.c:154
void pfree(void *pointer)
Definition: mcxt.c:1521
MemoryContext TopMemoryContext
Definition: mcxt.c:149
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1683
void * palloc(Size size)
Definition: mcxt.c:1317
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
Definition: multixact.c:3369
static MultiXactId PreviousMultiXactId(MultiXactId multi)
Definition: multixact.c:221
static SlruCtlData MultiXactOffsetCtlData
Definition: multixact.c:229
void MultiXactShmemInit(void)
Definition: multixact.c:1964
#define MULTIXACT_MEMBER_SAFE_THRESHOLD
Definition: multixact.c:216
static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2)
Definition: multixact.c:3297
static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
Definition: multixact.c:1032
static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
Definition: multixact.c:1662
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition: multixact.c:487
static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
Definition: multixact.c:2090
static int64 MXOffsetToMemberPage(MultiXactOffset offset)
Definition: multixact.c:173
#define MXACT_MEMBER_BITS_PER_XACT
Definition: multixact.c:143
static int64 MultiXactIdToOffsetSegment(MultiXactId multi)
Definition: multixact.c:125
static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
Definition: multixact.c:2585
void ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
Definition: multixact.c:791
static void PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
Definition: multixact.c:3077
#define MXACT_MEMBER_XACT_BITMASK
Definition: multixact.c:145
#define MULTIXACT_FLAGBYTES_PER_GROUP
Definition: multixact.c:148
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3317
void multixact_redo(XLogReaderState *record)
Definition: multixact.c:3394
#define MULTIXACT_OFFSETS_PER_PAGE
Definition: multixact.c:110
#define debug_elog5(a, b, c, d, e)
Definition: multixact.c:384
static void MultiXactIdSetOldestVisible(void)
Definition: multixact.c:730
int multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
Definition: multixact.c:3575
void multixact_twophase_postcommit(TransactionId xid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1920
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result)
Definition: multixact.c:2888
void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset)
Definition: multixact.c:2328
void multixact_twophase_recover(TransactionId xid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1899
#define MultiXactMemberCtl
Definition: multixact.c:233
static bool SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: multixact.c:3025
void AtPrepare_MultiXact(void)
Definition: multixact.c:1836
static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance)
Definition: multixact.c:2840
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3331
void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
Definition: multixact.c:2536
static int MultiXactIdToOffsetEntry(MultiXactId multi)
Definition: multixact.c:119
static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
Definition: multixact.c:1709
static void MaybeExtendOffsetSlru(void)
Definition: multixact.c:2118
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition: multixact.c:599
void MultiXactIdSetOldestMember(void)
Definition: multixact.c:673
static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
Definition: multixact.c:3048
static MemoryContext MXactContext
Definition: multixact.c:372
#define SHARED_MULTIXACT_STATE_SIZE
static MultiXactId * OldestVisibleMXactId
Definition: multixact.c:342
struct mxtruncinfo mxtruncinfo
static int mxactMemberComparator(const void *arg1, const void *arg2)
Definition: multixact.c:1589
struct MultiXactStateData MultiXactStateData
static void ExtendMultiXactOffset(MultiXactId multi)
Definition: multixact.c:2553
void PostPrepare_MultiXact(TransactionId xid)
Definition: multixact.c:1850
Size MultiXactShmemSize(void)
Definition: multixact.c:1947
#define MULTIXACT_MEMBERGROUPS_PER_PAGE
Definition: multixact.c:154
#define MultiXactOffsetCtl
Definition: multixact.c:232
void multixact_twophase_postabort(TransactionId xid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1935
static int MXOffsetToMemberOffset(MultiXactOffset offset)
Definition: multixact.c:206
void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, MultiXactOffset *nextMultiOffset, MultiXactId *oldestMulti, Oid *oldestMultiDB)
Definition: multixact.c:2282
static void WriteMZeroPageXlogRec(int64 pageno, uint8 info)
Definition: multixact.c:3355
void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, bool is_startup)
Definition: multixact.c:2362
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, MultiXactMember *members)
Definition: multixact.c:916
int multixactmemberssyncfiletag(const FileTag *ftag, char *path)
Definition: multixact.c:3584
#define MAX_CACHE_ENTRIES
Definition: multixact.c:370
static int64 MultiXactIdToOffsetPage(MultiXactId multi)
Definition: multixact.c:113
MultiXactId GetOldestMultiXactId(void)
Definition: multixact.c:2660
void CheckPointMultiXact(void)
Definition: multixact.c:2304
#define MaxOldestSlot
Definition: multixact.c:337
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition: multixact.c:815
static bool ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members)
Definition: multixact.c:2926
struct mXactCacheEnt mXactCacheEnt
static int64 MXOffsetToMemberSegment(MultiXactOffset offset)
Definition: multixact.c:179
static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members)
Definition: multixact.c:1619
static dclist_head MXactCache
Definition: multixact.c:371
void TrimMultiXact(void)
Definition: multixact.c:2178
char * mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
Definition: multixact.c:1777
#define debug_elog3(a, b, c)
Definition: multixact.c:382
#define MULTIXACT_MEMBERGROUP_SIZE
Definition: multixact.c:152
#define debug_elog4(a, b, c, d)
Definition: multixact.c:383
static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
Definition: multixact.c:3277
static bool SetOffsetVacuumLimit(bool is_startup)
Definition: multixact.c:2713
static int MXOffsetToFlagsOffset(MultiXactOffset offset)
Definition: multixact.c:186
int MultiXactMemberFreezeThreshold(void)
Definition: multixact.c:2978
void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset)
Definition: multixact.c:2511
static MultiXactId * OldestMemberMXactId
Definition: multixact.c:341
#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE
Definition: multixact.c:168
static MultiXactStateData * MultiXactState
Definition: multixact.c:340
static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
Definition: multixact.c:2074
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP
Definition: multixact.c:149
static char * mxstatus_to_string(MultiXactStatus status)
Definition: multixact.c:1754
#define OFFSET_WARN_SEGMENTS
Datum pg_get_multixact_members(PG_FUNCTION_ARGS)
Definition: multixact.c:3510
MultiXactId ReadNextMultiXactId(void)
Definition: multixact.c:771
void BootStrapMultiXact(void)
Definition: multixact.c:2034
#define debug_elog6(a, b, c, d, e, f)
Definition: multixact.c:385
#define MULTIXACT_MEMBERS_PER_PAGE
Definition: multixact.c:155
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition: multixact.c:434
void TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
Definition: multixact.c:3102
#define MULTIXACT_MEMBER_DANGER_THRESHOLD
Definition: multixact.c:217
static int MXOffsetToFlagsBitShift(MultiXactOffset offset)
Definition: multixact.c:196
bool check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
Definition: multixact.c:2014
static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
Definition: multixact.c:3343
bool check_multixact_member_buffers(int *newval, void **extra, GucSource source)
Definition: multixact.c:2023
void AtEOXact_MultiXact(void)
Definition: multixact.c:1808
static SlruCtlData MultiXactMemberCtlData
Definition: multixact.c:230
#define debug_elog2(a, b)
Definition: multixact.c:381
void StartupMultiXact(void)
Definition: multixact.c:2153
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition: multixact.c:1299
#define MultiXactIdIsValid(multi)
Definition: multixact.h:28
#define XLOG_MULTIXACT_ZERO_MEM_PAGE
Definition: multixact.h:69
#define XLOG_MULTIXACT_ZERO_OFF_PAGE
Definition: multixact.h:68
#define FirstMultiXactId
Definition: multixact.h:25
MultiXactStatus
Definition: multixact.h:38
@ MultiXactStatusForShare
Definition: multixact.h:40
@ MultiXactStatusForNoKeyUpdate
Definition: multixact.h:41
@ MultiXactStatusNoKeyUpdate
Definition: multixact.h:44
@ MultiXactStatusUpdate
Definition: multixact.h:46
@ MultiXactStatusForUpdate
Definition: multixact.h:42
@ MultiXactStatusForKeyShare
Definition: multixact.h:39
#define ISUPDATE_from_mxstatus(status)
Definition: multixact.h:52
#define InvalidMultiXactId
Definition: multixact.h:24
#define XLOG_MULTIXACT_TRUNCATE_ID
Definition: multixact.h:71
#define SizeOfMultiXactCreate
Definition: multixact.h:81
#define SizeOfMultiXactTruncate
Definition: multixact.h:96
#define XLOG_MULTIXACT_CREATE_ID
Definition: multixact.h:70
#define MaxMultiXactOffset
Definition: multixact.h:30
#define MaxMultiXactId
Definition: multixact.h:26
struct MultiXactMember MultiXactMember
const void size_t len
const void * data
while(p+4<=pend)
static char * filename
Definition: pg_dumpall.c:119
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:72
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:165
@ PMSIGNAL_START_AUTOVAC_LAUNCHER
Definition: pmsignal.h:38
#define qsort(a, b, c, d)
Definition: port.h:447
uintptr_t Datum
Definition: postgres.h:64
unsigned int Oid
Definition: postgres_ext.h:31
#define DELAY_CHKPT_START
Definition: proc.h:119
bool TransactionIdIsInProgress(TransactionId xid)
Definition: procarray.c:1402
int ProcNumber
Definition: procnumber.h:24
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
MemoryContextSwitchTo(old_ctx)
tree ctl
Definition: radixtree.h:1855
Size add_size(Size s1, Size s2)
Definition: shmem.c:488
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:382
static pg_noinline void Size size
Definition: slab.c:607
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, const char *subdir, int buffer_tranche_id, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names)
Definition: slru.c:252
int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:605
void SimpleLruWritePage(SlruCtl ctl, int slotno)
Definition: slru.c:732
void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
Definition: slru.c:1322
bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
Definition: slru.c:746
void SlruDeleteSegment(SlruCtl ctl, int64 segno)
Definition: slru.c:1526
bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
Definition: slru.c:1791
int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid)
Definition: slru.c:502
int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
Definition: slru.c:1831
int SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:375
void SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
Definition: slru.c:1408
Size SimpleLruShmemSize(int nslots, int nlsns)
Definition: slru.c:199
bool check_slru_buffers(const char *name, int *newval)
Definition: slru.c:355
static LWLock * SimpleLruGetBankLock(SlruCtl ctl, int64 pageno)
Definition: slru.h:178
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition: slru.h:202
#define SLRU_PAGES_PER_SEGMENT
Definition: slru.h:39
PGPROC * MyProc
Definition: proc.c:66
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:94
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:191
void initStringInfo(StringInfo str)
Definition: stringinfo.c:56
Definition: sync.h:51
void * user_fctx
Definition: funcapi.h:82
AttInMetadata * attinmeta
Definition: funcapi.h:91
MemoryContext multi_call_memory_ctx
Definition: funcapi.h:101
TupleDesc tuple_desc
Definition: funcapi.h:112
Definition: lwlock.h:42
TransactionId xid
Definition: multixact.h:58
MultiXactStatus status
Definition: multixact.h:59
MultiXactId multiWrapLimit
Definition: multixact.c:273
MultiXactId multiStopLimit
Definition: multixact.c:272
MultiXactId multiWarnLimit
Definition: multixact.c:271
MultiXactId multiVacLimit
Definition: multixact.c:270
MultiXactOffset offsetStopLimit
Definition: multixact.c:276
MultiXactOffset nextOffset
Definition: multixact.c:248
MultiXactId nextMXact
Definition: multixact.c:245
MultiXactId oldestMultiXactId
Definition: multixact.c:258
MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.c:331
MultiXactOffset oldestOffset
Definition: multixact.c:266
ConditionVariable nextoff_cv
Definition: multixact.c:282
int delayChkptFlags
Definition: proc.h:240
dlist_node * cur
Definition: ilist.h:179
MultiXactId multi
Definition: multixact.c:364
dlist_node node
Definition: multixact.c:366
MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.c:367
int64 earliestExistingPage
Definition: multixact.c:3017
MultiXactId mid
Definition: multixact.h:75
MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.h:78
MultiXactOffset moff
Definition: multixact.h:76
MultiXactId endTruncOff
Definition: multixact.h:89
MultiXactOffset startTruncMemb
Definition: multixact.h:92
MultiXactOffset endTruncMemb
Definition: multixact.h:93
MultiXactId startTruncOff
Definition: multixact.h:88
@ SYNC_HANDLER_MULTIXACT_MEMBER
Definition: sync.h:41
@ SYNC_HANDLER_MULTIXACT_OFFSET
Definition: sync.h:40
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:126
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define TransactionIdIsValid(xid)
Definition: transam.h:41
void RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, const void *data, uint32 len)
Definition: twophase.c:1280
ProcNumber TwoPhaseGetDummyProcNumber(TransactionId xid, bool lock_held)
Definition: twophase.c:903
#define TWOPHASE_RM_MULTIXACT_ID
Definition: twophase_rmgr.h:27
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
bool IsTransactionState(void)
Definition: xact.c:386
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition: xact.c:940
bool RecoveryInProgress(void)
Definition: xlog.c:6334
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2802
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterData(const char *data, uint32 len)
Definition: xloginsert.c:364
void XLogBeginInsert(void)
Definition: xloginsert.c:149
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:412
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:417
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
bool InRecovery
Definition: xlogutils.c:50