PostgreSQL Source Code git master
multixact.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * multixact.c
4 * PostgreSQL multi-transaction-log manager
5 *
6 * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 * TransactionId and a set of flag bits. The name is a bit historical:
10 * originally, a MultiXactId consisted of more than one TransactionId (except
11 * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 * legitimate to have MultiXactIds that only include a single Xid.
13 *
14 * The meaning of the flag bits is opaque to this module, but they are mostly
15 * used in heapam.c to identify lock modes that each of the member transactions
16 * is holding on any given tuple. This module just contains support to store
17 * and retrieve the arrays.
18 *
19 * We use two SLRU areas, one for storing the offsets at which the data
20 * starts for each MultiXactId in the other one. This trick allows us to
21 * store variable length arrays of TransactionIds. (We could alternatively
22 * use one area containing counts and TransactionIds, with valid MultiXactId
23 * values pointing at slots containing counts; but that way seems less robust
24 * since it would get completely confused if someone inquired about a bogus
25 * MultiXactId that pointed to an intermediate slot containing an XID.)
26 *
27 * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 * MEMBERs page is initialized to zeroes, as well as an
29 * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 * This module ignores the WAL rule "write xlog before data," because it
31 * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 * rule. The only way for the MXID to be referenced from any data page is for
33 * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 * an XLOG record that must follow ours. The normal LSN interlock between the
35 * data page and that XLOG record will ensure that our XLOG record reaches
36 * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 * module's XLOG records completely rebuild the data entered since the last
40 * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 * before each checkpoint is considered complete.
42 *
43 * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 * crashes and ensure that MXID and offset numbering increases monotonically
45 * across a crash. We do this in the same way as it's done for transaction
46 * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 * could need to worry about, and we just make sure that at the end of
48 * replay, the next-MXID and next-offset counters are at least as large as
49 * anything we saw during replay.
50 *
51 * We are able to remove segments no longer necessary by carefully tracking
52 * each table's used values: during vacuum, any multixact older than a certain
53 * value is removed; the cutoff value is stored in pg_class. The minimum value
54 * across all tables in each database is stored in pg_database, and the global
55 * minimum across all databases is part of pg_control and is kept in shared
56 * memory. Whenever that minimum is advanced, the SLRUs are truncated.
57 *
58 * When new multixactid values are to be created, care is taken that the
59 * counter does not fall within the wraparound horizon considering the global
60 * minimum value.
61 *
62 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
63 * Portions Copyright (c) 1994, Regents of the University of California
64 *
65 * src/backend/access/transam/multixact.c
66 *
67 *-------------------------------------------------------------------------
68 */
69#include "postgres.h"
70
71#include "access/multixact.h"
72#include "access/slru.h"
73#include "access/twophase.h"
75#include "access/xlog.h"
76#include "access/xloginsert.h"
77#include "access/xlogutils.h"
78#include "miscadmin.h"
79#include "pg_trace.h"
80#include "pgstat.h"
82#include "storage/pmsignal.h"
83#include "storage/proc.h"
84#include "storage/procarray.h"
85#include "utils/guc_hooks.h"
87#include "utils/lsyscache.h"
88#include "utils/memutils.h"
89
90
91/*
92 * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
93 * used everywhere else in Postgres.
94 *
95 * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
96 * MultiXact page numbering also wraps around at
97 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
98 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
99 * take no explicit notice of that fact in this module, except when comparing
100 * segment and page numbers in TruncateMultiXact (see
101 * MultiXactOffsetPagePrecedes).
102 */
103
104/* We need four bytes per offset */
105#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
106
107static inline int64
109{
110 return multi / MULTIXACT_OFFSETS_PER_PAGE;
111}
112
113static inline int
115{
116 return multi % MULTIXACT_OFFSETS_PER_PAGE;
117}
118
119static inline int64
121{
123}
124
125/*
126 * The situation for members is a bit more complex: we store one byte of
127 * additional flag bits for each TransactionId. To do this without getting
128 * into alignment issues, we store four bytes of flags, and then the
129 * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group"; groups
130 * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
131 * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
132 * performance) trumps space efficiency here.
133 *
134 * Note that the "offset" macros work with byte offset, not array indexes, so
135 * arithmetic must be done using "char *" pointers.
136 */
137/* We need eight bits per xact, so one xact fits in a byte */
138#define MXACT_MEMBER_BITS_PER_XACT 8
139#define MXACT_MEMBER_FLAGS_PER_BYTE 1
140#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
141
142/* how many full bytes of flags are there in a group? */
143#define MULTIXACT_FLAGBYTES_PER_GROUP 4
144#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
145 (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
146/* size in bytes of a complete group */
147#define MULTIXACT_MEMBERGROUP_SIZE \
148 (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
149#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
150#define MULTIXACT_MEMBERS_PER_PAGE \
151 (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
152
153/*
154 * Because the number of items per page is not a divisor of the last item
155 * number (member 0xFFFFFFFF), the last segment does not use the maximum number
156 * of pages, and moreover the last used page therein does not use the same
157 * number of items as previous pages. (Another way to say it is that the
158 * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
159 * has some empty space after that item.)
160 *
161 * This constant is the number of members in the last page of the last segment.
162 */
163#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
164 ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
165
166/* page in which a member is to be found */
167static inline int64
169{
170 return offset / MULTIXACT_MEMBERS_PER_PAGE;
171}
172
173static inline int64
175{
177}
178
179/* Location (byte offset within page) of flag word for a given member */
180static inline int
182{
184 int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
185 int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
186
187 return byteoff;
188}
189
190static inline int
192{
193 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
194 int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
195
196 return bshift;
197}
198
199/* Location (byte offset within page) of TransactionId of given member */
200static inline int
202{
203 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
204
205 return MXOffsetToFlagsOffset(offset) +
207 member_in_group * sizeof(TransactionId);
208}
209
210/* Multixact members wraparound thresholds. */
211#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
212#define MULTIXACT_MEMBER_DANGER_THRESHOLD \
213 (MaxMultiXactOffset - MaxMultiXactOffset / 4)
214
215static inline MultiXactId
217{
218 return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
219}
220
221/*
222 * Links to shared-memory data structures for MultiXact control
223 */
226
227#define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
228#define MultiXactMemberCtl (&MultiXactMemberCtlData)
229
230/*
231 * MultiXact state shared across all backends. All this state is protected
232 * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
233 * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
234 * concurrency's sake, we avoid holding more than one of these locks at a
235 * time.)
236 */
237typedef struct MultiXactStateData
238{
239 /* next-to-be-assigned MultiXactId */
241
242 /* next-to-be-assigned offset */
244
245 /* Have we completed multixact startup? */
247
248 /*
249 * Oldest multixact that is still potentially referenced by a relation.
250 * Anything older than this should not be consulted. These values are
251 * updated by vacuum.
252 */
255
256 /*
257 * Oldest multixact offset that is potentially referenced by a multixact
258 * referenced by a relation. We don't always know this value, so there's
259 * a flag here to indicate whether or not we currently do.
260 */
263
264 /* support for anti-wraparound measures */
269
270 /* support for members anti-wraparound measures */
271 MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
272
273 /*
274 * Per-backend data starts here. We have two arrays stored in the area
275 * immediately following the MultiXactStateData struct. Each is indexed by
276 * ProcNumber.
277 *
278 * In both arrays, there's a slot for all normal backends
279 * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared
280 * transactions.
281 *
282 * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
283 * transaction(s) could possibly be a member of, or InvalidMultiXactId
284 * when the backend has no live transaction that could possibly be a
285 * member of a MultiXact. Each backend sets its entry to the current
286 * nextMXact counter just before first acquiring a shared lock in a given
287 * transaction, and clears it at transaction end. (This works because only
288 * during or after acquiring a shared lock could an XID possibly become a
289 * member of a MultiXact, and that MultiXact would have to be created
290 * during or after the lock acquisition.)
291 *
292 * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
293 * current transaction(s) think is potentially live, or InvalidMultiXactId
294 * when not in a transaction or not in a transaction that's paid any
295 * attention to MultiXacts yet. This is computed when first needed in a
296 * given transaction, and cleared at transaction end. We can compute it
297 * as the minimum of the valid OldestMemberMXactId[] entries at the time
298 * we compute it (using nextMXact if none are valid). Each backend is
299 * required not to attempt to access any SLRU data for MultiXactIds older
300 * than its own OldestVisibleMXactId[] setting; this is necessary because
301 * the relevant SLRU data can be concurrently truncated away.
302 *
303 * The oldest valid value among all of the OldestMemberMXactId[] and
304 * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
305 * possible value still having any live member transaction -- OldestMxact.
306 * Any value older than that is typically removed from tuple headers, or
307 * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
308 * remove an individual MultiXact xmax whose value is >= its OldestMxact
309 * cutoff, though typically only when no individual member XID is still
310 * running. See FreezeMultiXactId for full details.
311 *
312 * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
313 * or the oldest extant Multi remaining in the table is used as the new
314 * pg_class.relminmxid value (whichever is earlier). The minimum of all
315 * relminmxid values in each database is stored in pg_database.datminmxid.
316 * In turn, the minimum of all of those values is stored in pg_control.
317 * This is used as the truncation point for pg_multixact when unneeded
318 * segments get removed by vac_truncate_clog() during vacuuming.
319 */
322
323/*
324 * Size of OldestMemberMXactId and OldestVisibleMXactId arrays.
325 */
326#define MaxOldestSlot (MaxBackends + max_prepared_xacts)
327
328/* Pointers to the state data in shared memory */
332
333
334/*
335 * Definitions for the backend-local MultiXactId cache.
336 *
337 * We use this cache to store known MultiXacts, so we don't need to go to
338 * SLRU areas every time.
339 *
340 * The cache lasts for the duration of a single transaction, the rationale
341 * for this being that most entries will contain our own TransactionId and
342 * so they will be uninteresting by the time our next transaction starts.
343 * (XXX not clear that this is correct --- other members of the MultiXact
344 * could hang around longer than we did. However, it's not clear what a
345 * better policy for flushing old cache entries would be.) FIXME actually
346 * this is plain wrong now that multixacts may contain update Xids.
347 *
348 * We allocate the cache entries in a memory context that is deleted at
349 * transaction end, so we don't need to do retail freeing of entries.
350 */
351typedef struct mXactCacheEnt
352{
358
359#define MAX_CACHE_ENTRIES 256
362
/*
 * Debug logging helpers.
 *
 * debug_elogN takes exactly N arguments.  When MULTIXACT_DEBUG is defined
 * they are forwarded unchanged to elog(); otherwise every macro expands to
 * nothing, so the call (including its arguments) disappears at compile time.
 */
363#ifdef MULTIXACT_DEBUG
364#define debug_elog2(a,b) elog(a,b)
365#define debug_elog3(a,b,c) elog(a,b,c)
366#define debug_elog4(a,b,c,d) elog(a,b,c,d)
367#define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
368#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
369#else
370#define debug_elog2(a,b)
371#define debug_elog3(a,b,c)
372#define debug_elog4(a,b,c,d)
373#define debug_elog5(a,b,c,d,e)
374#define debug_elog6(a,b,c,d,e,f)
375#endif
376
377/* internal MultiXactId management */
378static void MultiXactIdSetOldestVisible(void);
379static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
380 int nmembers, MultiXactMember *members);
381static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
382
383/* MultiXact cache management */
384static int mxactMemberComparator(const void *arg1, const void *arg2);
385static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
386static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
387static void mXactCachePut(MultiXactId multi, int nmembers,
388 MultiXactMember *members);
389
390/* management of SLRU infrastructure */
391static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
392static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
393static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
394 MultiXactOffset offset2);
395static void ExtendMultiXactOffset(MultiXactId multi);
396static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
397static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
398 MultiXactOffset start, uint32 distance);
399static bool SetOffsetVacuumLimit(bool is_startup);
400static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
401static void WriteMTruncateXlogRec(Oid oldestMultiDB,
402 MultiXactId startTruncOff,
403 MultiXactId endTruncOff,
404 MultiXactOffset startTruncMemb,
405 MultiXactOffset endTruncMemb);
406
407
408/*
409 * MultiXactIdCreate
410 * Construct a MultiXactId representing two TransactionIds.
411 *
412 * The two XIDs must be different, or be requesting different statuses.
413 *
414 * NB - we don't worry about our local MultiXactId cache here, because that
415 * is handled by the lower-level routines.
416 */
419 TransactionId xid2, MultiXactStatus status2)
420{
421 MultiXactId newMulti;
422 MultiXactMember members[2];
423
426
427 Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
428
429 /* MultiXactIdSetOldestMember() must have been called already. */
431
432 /*
433 * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
434 * are still running. In typical usage, xid2 will be our own XID and the
435 * caller just did a check on xid1, so it'd be wasted effort.
436 */
437
438 members[0].xid = xid1;
439 members[0].status = status1;
440 members[1].xid = xid2;
441 members[1].status = status2;
442
443 newMulti = MultiXactIdCreateFromMembers(2, members);
444
445 debug_elog3(DEBUG2, "Create: %s",
446 mxid_to_string(newMulti, 2, members));
447
448 return newMulti;
449}
450
451/*
452 * MultiXactIdExpand
453 * Add a TransactionId to a pre-existing MultiXactId.
454 *
455 * If the TransactionId is already a member of the passed MultiXactId with the
456 * same status, just return it as-is.
457 *
458 * Note that we do NOT actually modify the membership of a pre-existing
459 * MultiXactId; instead we create a new one. This is necessary to avoid
460 * a race condition against code trying to wait for one MultiXactId to finish;
461 * see notes in heapam.c.
462 *
463 * NB - we don't worry about our local MultiXactId cache here, because that
464 * is handled by the lower-level routines.
465 *
466 * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
467 * one upgraded by pg_upgrade from a cluster older than this feature) are not
468 * passed in.
469 */
472{
473 MultiXactId newMulti;
474 MultiXactMember *members;
475 MultiXactMember *newMembers;
476 int nmembers;
477 int i;
478 int j;
479
482
483 /* MultiXactIdSetOldestMember() must have been called already. */
485
486 debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
487 multi, xid, mxstatus_to_string(status));
488
489 /*
490 * Note: we don't allow for old multis here. The reason is that the only
491 * caller of this function does a check that the multixact is no longer
492 * running.
493 */
494 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
495
496 if (nmembers < 0)
497 {
498 MultiXactMember member;
499
500 /*
501 * The MultiXactId is obsolete. This can only happen if all the
502 * MultiXactId members stop running between the caller checking and
503 * passing it to us. It would be better to return that fact to the
504 * caller, but it would complicate the API and it's unlikely to happen
505 * too often, so just deal with it by creating a singleton MultiXact.
506 */
507 member.xid = xid;
508 member.status = status;
509 newMulti = MultiXactIdCreateFromMembers(1, &member);
510
511 debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
512 multi, newMulti);
513 return newMulti;
514 }
515
516 /*
517 * If the TransactionId is already a member of the MultiXactId with the
518 * same status, just return the existing MultiXactId.
519 */
520 for (i = 0; i < nmembers; i++)
521 {
522 if (TransactionIdEquals(members[i].xid, xid) &&
523 (members[i].status == status))
524 {
525 debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
526 xid, multi);
527 pfree(members);
528 return multi;
529 }
530 }
531
532 /*
533 * Determine which of the members of the MultiXactId are still of
534 * interest. This is any running transaction, and also any transaction
535 * that grabbed something stronger than just a lock and was committed. (An
536 * update that aborted is of no interest here; and having more than one
537 * update Xid in a multixact would cause errors elsewhere.)
538 *
539 * Removing dead members is not just an optimization: freezing of tuples
540 * whose Xmax are multis depends on this behavior.
541 *
542 * Note we have the same race condition here as above: j could be 0 at the
543 * end of the loop.
544 */
545 newMembers = (MultiXactMember *)
546 palloc(sizeof(MultiXactMember) * (nmembers + 1));
547
548 for (i = 0, j = 0; i < nmembers; i++)
549 {
550 if (TransactionIdIsInProgress(members[i].xid) ||
551 (ISUPDATE_from_mxstatus(members[i].status) &&
552 TransactionIdDidCommit(members[i].xid)))
553 {
554 newMembers[j].xid = members[i].xid;
555 newMembers[j++].status = members[i].status;
556 }
557 }
558
559 newMembers[j].xid = xid;
560 newMembers[j++].status = status;
561 newMulti = MultiXactIdCreateFromMembers(j, newMembers);
562
563 pfree(members);
564 pfree(newMembers);
565
566 debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
567
568 return newMulti;
569}
570
571/*
572 * MultiXactIdIsRunning
573 * Returns whether a MultiXactId is "running".
574 *
575 * We return true if at least one member of the given MultiXactId is still
576 * running. Note that a "false" result is certain not to change,
577 * because it is not legal to add members to an existing MultiXactId.
578 *
579 * Caller is expected to have verified that the multixact does not come from
580 * a pg_upgraded share-locked tuple.
581 */
582bool
583MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
584{
585 MultiXactMember *members;
586 int nmembers;
587 int i;
588
589 debug_elog3(DEBUG2, "IsRunning %u?", multi);
590
591 /*
592 * "false" here means we assume our callers have checked that the given
593 * multi cannot possibly come from a pg_upgraded database.
594 */
595 nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
596
597 if (nmembers <= 0)
598 {
599 debug_elog2(DEBUG2, "IsRunning: no members");
600 return false;
601 }
602
603 /*
604 * Checking for myself is cheap compared to looking in shared memory;
605 * return true if any live subtransaction of the current top-level
606 * transaction is a member.
607 *
608 * This is not needed for correctness, it's just a fast path.
609 */
610 for (i = 0; i < nmembers; i++)
611 {
612 if (TransactionIdIsCurrentTransactionId(members[i].xid))
613 {
614 debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
615 pfree(members);
616 return true;
617 }
618 }
619
620 /*
621 * This could be made faster by having another entry point in procarray.c,
622 * walking the PGPROC array only once for all the members. But in most
623 * cases nmembers should be small enough that it doesn't much matter.
624 */
625 for (i = 0; i < nmembers; i++)
626 {
627 if (TransactionIdIsInProgress(members[i].xid))
628 {
629 debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
630 i, members[i].xid);
631 pfree(members);
632 return true;
633 }
634 }
635
636 pfree(members);
637
638 debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
639
640 return false;
641}
642
643/*
644 * MultiXactIdSetOldestMember
645 * Save the oldest MultiXactId this transaction could be a member of.
646 *
647 * We set the OldestMemberMXactId for a given transaction the first time it's
648 * going to do some operation that might require a MultiXactId (tuple lock,
649 * update or delete). We need to do this even if we end up using a
650 * TransactionId instead of a MultiXactId, because there is a chance that
651 * another transaction would add our XID to a MultiXactId.
652 *
653 * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
654 * be called just before doing any such possibly-MultiXactId-able operation.
655 */
656void
658{
660 {
661 MultiXactId nextMXact;
662
663 /*
664 * You might think we don't need to acquire a lock here, since
665 * fetching and storing of TransactionIds is probably atomic, but in
666 * fact we do: suppose we pick up nextMXact and then lose the CPU for
667 * a long time. Someone else could advance nextMXact, and then
668 * another someone else could compute an OldestVisibleMXactId that
669 * would be after the value we are going to store when we get control
670 * back. Which would be wrong.
671 *
672 * Note that a shared lock is sufficient, because it's enough to stop
673 * someone from advancing nextMXact; and nobody else could be trying
674 * to write to our OldestMember entry, only reading (and we assume
675 * storing it is atomic.)
676 */
677 LWLockAcquire(MultiXactGenLock, LW_SHARED);
678
679 /*
680 * We have to beware of the possibility that nextMXact is in the
681 * wrapped-around state. We don't fix the counter itself here, but we
682 * must be sure to store a valid value in our array entry.
683 */
684 nextMXact = MultiXactState->nextMXact;
685 if (nextMXact < FirstMultiXactId)
686 nextMXact = FirstMultiXactId;
687
689
690 LWLockRelease(MultiXactGenLock);
691
692 debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
693 MyProcNumber, nextMXact);
694 }
695}
696
697/*
698 * MultiXactIdSetOldestVisible
699 * Save the oldest MultiXactId this transaction considers possibly live.
700 *
701 * We set the OldestVisibleMXactId for a given transaction the first time
702 * it's going to inspect any MultiXactId. Once we have set this, we are
703 * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
704 * won't be truncated away.
705 *
706 * The value to set is the oldest of nextMXact and all the valid per-backend
707 * OldestMemberMXactId[] entries. Because of the locking we do, we can be
708 * certain that no subsequent call to MultiXactIdSetOldestMember can set
709 * an OldestMemberMXactId[] entry older than what we compute here. Therefore
710 * there is no live transaction, now or later, that can be a member of any
711 * MultiXactId older than the OldestVisibleMXactId we compute here.
712 */
713static void
715{
717 {
718 MultiXactId oldestMXact;
719 int i;
720
721 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
722
723 /*
724 * We have to beware of the possibility that nextMXact is in the
725 * wrapped-around state. We don't fix the counter itself here, but we
726 * must be sure to store a valid value in our array entry.
727 */
728 oldestMXact = MultiXactState->nextMXact;
729 if (oldestMXact < FirstMultiXactId)
730 oldestMXact = FirstMultiXactId;
731
732 for (i = 0; i < MaxOldestSlot; i++)
733 {
734 MultiXactId thisoldest = OldestMemberMXactId[i];
735
736 if (MultiXactIdIsValid(thisoldest) &&
737 MultiXactIdPrecedes(thisoldest, oldestMXact))
738 oldestMXact = thisoldest;
739 }
740
741 OldestVisibleMXactId[MyProcNumber] = oldestMXact;
742
743 LWLockRelease(MultiXactGenLock);
744
745 debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
746 MyProcNumber, oldestMXact);
747 }
748}
749
750/*
751 * ReadNextMultiXactId
752 * Return the next MultiXactId to be assigned, but don't allocate it
753 */
756{
757 MultiXactId mxid;
758
759 /* XXX we could presumably do this without a lock. */
760 LWLockAcquire(MultiXactGenLock, LW_SHARED);
762 LWLockRelease(MultiXactGenLock);
763
764 if (mxid < FirstMultiXactId)
765 mxid = FirstMultiXactId;
766
767 return mxid;
768}
769
770/*
771 * ReadMultiXactIdRange
772 * Get the range of IDs that may still be referenced by a relation.
773 */
774void
776{
777 LWLockAcquire(MultiXactGenLock, LW_SHARED);
780 LWLockRelease(MultiXactGenLock);
781
782 if (*oldest < FirstMultiXactId)
783 *oldest = FirstMultiXactId;
784 if (*next < FirstMultiXactId)
786}
787
788
789/*
790 * MultiXactIdCreateFromMembers
791 * Make a new MultiXactId from the specified set of members
792 *
793 * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
794 * given TransactionIds as members. Returns the newly created MultiXactId.
795 *
796 * NB: the passed members[] array will be sorted in-place.
797 */
800{
801 MultiXactId multi;
802 MultiXactOffset offset;
804
805 debug_elog3(DEBUG2, "Create: %s",
806 mxid_to_string(InvalidMultiXactId, nmembers, members));
807
808 /*
809 * See if the same set of members already exists in our cache; if so, just
810 * re-use that MultiXactId. (Note: it might seem that looking in our
811 * cache is insufficient, and we ought to search disk to see if a
812 * duplicate definition already exists. But since we only ever create
813 * MultiXacts containing our own XID, in most cases any such MultiXacts
814 * were in fact created by us, and so will be in our cache. There are
815 * corner cases where someone else added us to a MultiXact without our
816 * knowledge, but it's not worth checking for.)
817 */
818 multi = mXactCacheGetBySet(nmembers, members);
819 if (MultiXactIdIsValid(multi))
820 {
821 debug_elog2(DEBUG2, "Create: in cache!");
822 return multi;
823 }
824
825 /* Verify that there is at most one update Xid among the given members. */
826 {
827 int i;
828 bool has_update = false;
829
830 for (i = 0; i < nmembers; i++)
831 {
832 if (ISUPDATE_from_mxstatus(members[i].status))
833 {
834 if (has_update)
835 elog(ERROR, "new multixact has more than one updating member: %s",
836 mxid_to_string(InvalidMultiXactId, nmembers, members));
837 has_update = true;
838 }
839 }
840 }
841
842 /* Load the injection point before entering the critical section */
843 INJECTION_POINT_LOAD("multixact-create-from-members");
844
845 /*
846 * Assign the MXID and offsets range to use, and make sure there is space
847 * in the OFFSETs and MEMBERs files. NB: this routine does
848 * START_CRIT_SECTION().
849 *
850 * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
851 * that we've called MultiXactIdSetOldestMember here. This is because
852 * this routine is used in some places to create new MultiXactIds of which
853 * the current backend is not a member, notably during freezing of multis
854 * in vacuum. During vacuum, in particular, it would be unacceptable to
855 * keep OldestMulti set, in case it runs for long.
856 */
857 multi = GetNewMultiXactId(nmembers, &offset);
858
859 INJECTION_POINT_CACHED("multixact-create-from-members", NULL);
860
861 /* Make an XLOG entry describing the new MXID. */
862 xlrec.mid = multi;
863 xlrec.moff = offset;
864 xlrec.nmembers = nmembers;
865
866 /*
867 * XXX Note: there's a lot of padding space in MultiXactMember. We could
868 * find a more compact representation of this Xlog record -- perhaps all
869 * the status flags in one XLogRecData, then all the xids in another one?
870 * Not clear that it's worth the trouble though.
871 */
874 XLogRegisterData(members, nmembers * sizeof(MultiXactMember));
875
876 (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
877
878 /* Now enter the information into the OFFSETs and MEMBERs logs */
879 RecordNewMultiXact(multi, offset, nmembers, members);
880
881 /* Done with critical section */
883
884 /* Store the new MultiXactId in the local cache, too */
885 mXactCachePut(multi, nmembers, members);
886
887 debug_elog2(DEBUG2, "Create: all done");
888
889 return multi;
890}
891
892/*
893 * RecordNewMultiXact
894 * Write info about a new multixact into the offsets and members files
895 *
896 * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
897 * use it.
898 */
899static void
901 int nmembers, MultiXactMember *members)
902{
903 int64 pageno;
904 int64 prev_pageno;
905 int entryno;
906 int slotno;
907 MultiXactOffset *offptr;
909 int64 next_pageno;
910 int next_entryno;
911 MultiXactOffset *next_offptr;
912 LWLock *lock;
913 LWLock *prevlock = NULL;
914
915 /* position of this multixid in the offsets SLRU area */
916 pageno = MultiXactIdToOffsetPage(multi);
917 entryno = MultiXactIdToOffsetEntry(multi);
918
919 /* position of the next multixid */
920 next = multi + 1;
923 next_pageno = MultiXactIdToOffsetPage(next);
924 next_entryno = MultiXactIdToOffsetEntry(next);
925
926 /*
927 * Set the starting offset of this multixid's members.
928 *
929 * In the common case, it was already set by the previous
930 * RecordNewMultiXact call, as this was the next multixid of the previous
931 * multixid. But if multiple backends are generating multixids
932 * concurrently, we might race ahead and get called before the previous
933 * multixid.
934 */
937
938 /*
939 * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
940 * to complain about if there's any I/O error. This is kinda bogus, but
941 * since the errors will always give the full pathname, it should be clear
942 * enough that a MultiXactId is really involved. Perhaps someday we'll
943 * take the trouble to generalize the slru.c error reporting code.
944 */
945 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
946 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
947 offptr += entryno;
948
949 if (*offptr != offset)
950 {
951 /* should already be set to the correct value, or not at all */
952 Assert(*offptr == 0);
953 *offptr = offset;
954 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
955 }
956
957 /*
958 * Set the next multixid's offset to the end of this multixid's members.
959 */
960 if (next_pageno == pageno)
961 {
962 next_offptr = offptr + 1;
963 }
964 else
965 {
966 /* must be the first entry on the page */
967 Assert(next_entryno == 0 || next == FirstMultiXactId);
968
969 /* Swap the lock for a lock on the next page */
970 LWLockRelease(lock);
971 lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
973
974 slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
975 next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
976 next_offptr += next_entryno;
977 }
978
979 if (*next_offptr != offset + nmembers)
980 {
981 /* should already be set to the correct value, or not at all */
982 Assert(*next_offptr == 0);
983 *next_offptr = offset + nmembers;
984 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
985 }
986
987 /* Release MultiXactOffset SLRU lock. */
988 LWLockRelease(lock);
989
990 prev_pageno = -1;
991
992 for (int i = 0; i < nmembers; i++, offset++)
993 {
994 TransactionId *memberptr;
995 uint32 *flagsptr;
996 uint32 flagsval;
997 int bshift;
998 int flagsoff;
999 int memberoff;
1000
1001 Assert(members[i].status <= MultiXactStatusUpdate);
1002
1003 pageno = MXOffsetToMemberPage(offset);
1004 memberoff = MXOffsetToMemberOffset(offset);
1005 flagsoff = MXOffsetToFlagsOffset(offset);
1006 bshift = MXOffsetToFlagsBitShift(offset);
1007
1008 if (pageno != prev_pageno)
1009 {
1010 /*
1011 * The MultiXactMember SLRU page has changed, so check whether the new
1012 * page falls into a different SLRU bank; if so, release the old bank's
1013 * lock and acquire the lock on the new bank.
1014 */
1016 if (lock != prevlock)
1017 {
1018 if (prevlock != NULL)
1019 LWLockRelease(prevlock);
1020
1022 prevlock = lock;
1023 }
1024 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1025 prev_pageno = pageno;
1026 }
1027
1028 memberptr = (TransactionId *)
1029 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1030
1031 *memberptr = members[i].xid;
1032
1033 flagsptr = (uint32 *)
1034 (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1035
1036 flagsval = *flagsptr;
1037 flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
1038 flagsval |= (members[i].status << bshift);
1039 *flagsptr = flagsval;
1040
1041 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
1042 }
1043
1044 if (prevlock != NULL)
1045 LWLockRelease(prevlock);
1046}
1047
1048/*
1049 * GetNewMultiXactId
1050 * Get the next MultiXactId.
1051 *
1052 * Also, reserve the needed amount of space in the "members" area. The
1053 * starting offset of the reserved space is returned in *offset.
1054 *
1055 * This may generate XLOG records for expansion of the offsets and/or members
1056 * files. Unfortunately, we have to do that while holding MultiXactGenLock
1057 * to avoid race conditions --- the XLOG record for zeroing a page must appear
1058 * before any backend can possibly try to store data in that page!
1059 *
1060 * We start a critical section before advancing the shared counters. The
1061 * caller must end the critical section after writing SLRU data.
1062 */
1063static MultiXactId
1065{
1066 MultiXactId result;
1067 MultiXactOffset nextOffset;
1068
1069 debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
1070
1071 /* safety check, we should never get this far in a HS standby */
1072 if (RecoveryInProgress())
1073 elog(ERROR, "cannot assign MultiXactIds during recovery");
1074
1075 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1076
1077 /* Handle wraparound of the nextMXact counter */
1080
1081 /* Assign the MXID */
1082 result = MultiXactState->nextMXact;
1083
1084 /*----------
1085 * Check to see if it's safe to assign another MultiXactId. This protects
1086 * against catastrophic data loss due to multixact wraparound. The basic
1087 * rules are:
1088 *
1089 * If we're past multiVacLimit or the safe threshold for member storage
1090 * space, or we don't know what the safe threshold for member storage is,
1091 * start trying to force autovacuum cycles.
1092 * If we're past multiWarnLimit, start issuing warnings.
1093 * If we're past multiStopLimit, refuse to create new MultiXactIds.
1094 *
1095 * Note these are pretty much the same protections as in GetNewTransactionId.
1096 *----------
1097 */
1099 {
1100 /*
1101 * For safety's sake, we release MultiXactGenLock while sending
1102 * signals, warnings, etc. This is not so much because we care about
1103 * preserving concurrency in this situation, as to avoid any
1104 * possibility of deadlock while doing get_database_name(). First,
1105 * copy all the shared values we'll need in this path.
1106 */
1107 MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
1108 MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
1109 MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
1110 Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
1111
1112 LWLockRelease(MultiXactGenLock);
1113
1114 if (IsUnderPostmaster &&
1115 !MultiXactIdPrecedes(result, multiStopLimit))
1116 {
1117 char *oldest_datname = get_database_name(oldest_datoid);
1118
1119 /*
1120 * Immediately kick autovacuum into action as we're already in
1121 * ERROR territory.
1122 */
1124
1125 /* complain even if that DB has disappeared */
1126 if (oldest_datname)
1127 ereport(ERROR,
1128 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1129 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1130 oldest_datname),
1131 errhint("Execute a database-wide VACUUM in that database.\n"
1132 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1133 else
1134 ereport(ERROR,
1135 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1136 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1137 oldest_datoid),
1138 errhint("Execute a database-wide VACUUM in that database.\n"
1139 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1140 }
1141
1142 /*
1143 * To avoid swamping the postmaster with signals, we issue the autovac
1144 * request only once per 64K multis generated. This still gives
1145 * plenty of chances before we get into real trouble.
1146 */
1147 if (IsUnderPostmaster && (result % 65536) == 0)
1149
1150 if (!MultiXactIdPrecedes(result, multiWarnLimit))
1151 {
1152 char *oldest_datname = get_database_name(oldest_datoid);
1153
1154 /* complain even if that DB has disappeared */
1155 if (oldest_datname)
1157 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1158 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1159 multiWrapLimit - result,
1160 oldest_datname,
1161 multiWrapLimit - result),
1162 errhint("Execute a database-wide VACUUM in that database.\n"
1163 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1164 else
1166 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1167 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1168 multiWrapLimit - result,
1169 oldest_datoid,
1170 multiWrapLimit - result),
1171 errhint("Execute a database-wide VACUUM in that database.\n"
1172 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1173 }
1174
1175 /* Re-acquire lock and start over */
1176 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1177 result = MultiXactState->nextMXact;
1178 if (result < FirstMultiXactId)
1179 result = FirstMultiXactId;
1180 }
1181
1182 /*
1183 * Make sure there is room for the next MXID in the file. Assigning this
1184 * MXID sets the next MXID's offset already.
1185 */
1186 ExtendMultiXactOffset(result + 1);
1187
1188 /*
1189 * Reserve the members space, similarly to above. Also, be careful not to
1190 * return zero as the starting offset for any multixact. See
1191 * GetMultiXactIdMembers() for motivation.
1192 */
1193 nextOffset = MultiXactState->nextOffset;
1194 if (nextOffset == 0)
1195 {
1196 *offset = 1;
1197 nmembers++; /* allocate member slot 0 too */
1198 }
1199 else
1200 *offset = nextOffset;
1201
1202 /*----------
1203 * Protect against overrun of the members space as well, with the
1204 * following rules:
1205 *
1206 * If we're past offsetStopLimit, refuse to generate more multis.
1207 * If we're close to offsetStopLimit, emit a warning.
1208 *
1209 * Arbitrarily, we start emitting warnings when we're 20 segments or less
1210 * from offsetStopLimit.
1211 *
1212 * Note we haven't updated the shared state yet, so if we fail at this
1213 * point, the multixact ID we grabbed can still be used by the next guy.
1214 *
1215 * Note that there is no point in forcing autovacuum runs here: the
1216 * multixact freeze settings would have to be reduced for that to have any
1217 * effect.
1218 *----------
1219 */
1220#define OFFSET_WARN_SEGMENTS 20
1223 nmembers))
1224 {
1225 /* see comment in the corresponding offsets wraparound case */
1227
1228 ereport(ERROR,
1229 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1230 errmsg("multixact \"members\" limit exceeded"),
1231 errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1232 "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1233 MultiXactState->offsetStopLimit - nextOffset - 1,
1234 nmembers,
1235 MultiXactState->offsetStopLimit - nextOffset - 1),
1236 errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
1238 }
1239
1240 /*
1241 * Check whether we should kick autovacuum into action, to prevent members
1242 * wraparound. NB we use a much larger window to trigger autovacuum than
1243 * just the warning limit. The warning is just a measure of last resort -
1244 * this is in line with GetNewTransactionId's behaviour.
1245 */
1249 {
1250 /*
1251 * To avoid swamping the postmaster with signals, we issue the autovac
1252 * request only when crossing a segment boundary. With default
1253 * compilation settings that's roughly after 50k members. This still
1254 * gives plenty of chances before we get into real trouble.
1255 */
1256 if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1257 (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1259 }
1260
1263 nextOffset,
1266 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1267 errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1268 "database with OID %u must be vacuumed before %d more multixact members are used",
1269 MultiXactState->offsetStopLimit - nextOffset + nmembers,
1271 MultiXactState->offsetStopLimit - nextOffset + nmembers),
1272 errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
1273
1274 ExtendMultiXactMember(nextOffset, nmembers);
1275
1276 /*
1277 * Critical section from here until caller has written the data into the
1278 * just-reserved SLRU space; we don't want to error out with a partly
1279 * written MultiXact structure. (In particular, failing to write our
1280 * start offset after advancing nextMXact would effectively corrupt the
1281 * previous MultiXact.)
1282 */
1284
1285 /*
1286 * Advance counters. As in GetNewTransactionId(), this must not happen
1287 * until after file extension has succeeded!
1288 *
1289 * We don't care about MultiXactId wraparound here; it will be handled by
1290 * the next iteration. But note that nextMXact may be InvalidMultiXactId
1291 * or the first value on a segment-beginning page after this routine
1292 * exits, so anyone else looking at the variable must be prepared to deal
1293 * with either case. Similarly, nextOffset may be zero, but we won't use
1294 * that as the actual start offset of the next multixact.
1295 */
1297
1298 MultiXactState->nextOffset += nmembers;
1299
1300 LWLockRelease(MultiXactGenLock);
1301
1302 debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1303 return result;
1304}
1305
1306/*
1307 * GetMultiXactIdMembers
1308 * Return the set of MultiXactMembers that make up a MultiXactId
1309 *
1310 * Return value is the number of members found, or -1 if there are none,
1311 * and *members is set to a newly palloc'ed array of members. It's the
1312 * caller's responsibility to free it when done with it.
1313 *
1314 * from_pgupgrade must be passed as true if and only if the multixact
1315 * corresponds to a value from a tuple that was locked in a 9.2-or-older
1316 * installation and later pg_upgrade'd (that is, the infomask is
1317 * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1318 * can still be running, so we return -1 just like for an empty multixact
1319 * without any further checking. It would be wrong to try to resolve such a
1320 * multixact: either the multixact is within the current valid multixact
1321 * range, in which case the returned result would be bogus, or outside that
1322 * range, in which case an error would be raised.
1323 *
1324 * In all other cases, the passed multixact must be within the known valid
1325 * range, that is, greater than or equal to oldestMultiXactId, and less than
1326 * nextMXact. Otherwise, an error is raised.
1327 *
1328 * isLockOnly must be set to true if caller is certain that the given multi
1329 * is used only to lock tuples; can be false without loss of correctness,
1330 * but passing true means we can return quickly without checking for
1331 * old updates.
1332 */
1333int
1335 bool from_pgupgrade, bool isLockOnly)
1336{
1337 int64 pageno;
1338 int64 prev_pageno;
1339 int entryno;
1340 int slotno;
1341 MultiXactOffset *offptr;
1342 MultiXactOffset offset;
1343 int length;
1344 int truelength;
1345 MultiXactId oldestMXact;
1346 MultiXactId nextMXact;
1347 MultiXactMember *ptr;
1348 LWLock *lock;
1349
1350 debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1351
1352 if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1353 {
1354 *members = NULL;
1355 return -1;
1356 }
1357
1358 /* See if the MultiXactId is in the local cache */
1359 length = mXactCacheGetById(multi, members);
1360 if (length >= 0)
1361 {
1362 debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1363 mxid_to_string(multi, length, *members));
1364 return length;
1365 }
1366
1367 /* Set our OldestVisibleMXactId[] entry if we didn't already */
1369
1370 /*
1371 * If we know the multi is used only for locking and not for updates, then
1372 * we can skip checking if the value is older than our oldest visible
1373 * multi. It cannot possibly still be running.
1374 */
1375 if (isLockOnly &&
1377 {
1378 debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1379 *members = NULL;
1380 return -1;
1381 }
1382
1383 /*
1384 * We check known limits on MultiXact before resorting to the SLRU area.
1385 *
1386 * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1387 * useful; it has already been removed, or will be removed shortly, by
1388 * truncation. If one is passed, an error is raised.
1389 *
1390 * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1391 * implies undetected ID wraparound has occurred. This raises a hard
1392 * error.
1393 *
1394 * Shared lock is enough here since we aren't modifying any global state.
1395 * Acquire it just long enough to grab the current counter values.
1396 */
1397 LWLockAcquire(MultiXactGenLock, LW_SHARED);
1398
1399 oldestMXact = MultiXactState->oldestMultiXactId;
1400 nextMXact = MultiXactState->nextMXact;
1401
1402 LWLockRelease(MultiXactGenLock);
1403
1404 if (MultiXactIdPrecedes(multi, oldestMXact))
1405 ereport(ERROR,
1406 (errcode(ERRCODE_INTERNAL_ERROR),
1407 errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1408 multi)));
1409
1410 if (!MultiXactIdPrecedes(multi, nextMXact))
1411 ereport(ERROR,
1412 (errcode(ERRCODE_INTERNAL_ERROR),
1413 errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1414 multi)));
1415
1416 /*
1417 * Find out the offset at which we need to start reading MultiXactMembers
1418 * and the number of members in the multixact. We determine the latter as
1419 * the difference between this multixact's starting offset and the next
1420 * one's. However, there is one corner case to worry about:
1421 *
1422 * Because GetNewMultiXactId skips over offset zero, reserving zero to
1423 * mean "unset", there is an ambiguity near the point of offset
1424 * wraparound. If we see next multixact's offset is one, is that our
1425 * multixact's actual endpoint, or did it end at zero with a subsequent
1426 * increment? We handle this using the knowledge that if the zero'th
1427 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1428 * transaction ID so it can't be a multixact member. Therefore, if we
1429 * read a zero from the members array, just ignore it.
1430 */
1431 pageno = MultiXactIdToOffsetPage(multi);
1432 entryno = MultiXactIdToOffsetEntry(multi);
1433
1434 /* Acquire the bank lock for the page we need. */
1437
1438 /* read this multi's offset */
1439 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1440 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1441 offptr += entryno;
1442 offset = *offptr;
1443
1444 Assert(offset != 0);
1445
1446 /* read next multi's offset */
1447 {
1448 MultiXactId tmpMXact;
1449 MultiXactOffset nextMXOffset;
1450
1451 /* handle wraparound if needed */
1452 tmpMXact = multi + 1;
1453 if (tmpMXact < FirstMultiXactId)
1454 tmpMXact = FirstMultiXactId;
1455
1456 prev_pageno = pageno;
1457
1458 pageno = MultiXactIdToOffsetPage(tmpMXact);
1459 entryno = MultiXactIdToOffsetEntry(tmpMXact);
1460
1461 if (pageno != prev_pageno)
1462 {
1463 LWLock *newlock;
1464
1465 /*
1466 * Since we're going to access a different SLRU page, if this page
1467 * falls under a different bank, release the old bank's lock and
1468 * acquire the lock of the new bank.
1469 */
1470 newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1471 if (newlock != lock)
1472 {
1473 LWLockRelease(lock);
1474 LWLockAcquire(newlock, LW_EXCLUSIVE);
1475 lock = newlock;
1476 }
1477 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1478 }
1479
1480 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1481 offptr += entryno;
1482 nextMXOffset = *offptr;
1483
1484 if (nextMXOffset == 0)
1485 ereport(ERROR,
1487 errmsg("MultiXact %u has invalid next offset",
1488 multi)));
1489
1490 length = nextMXOffset - offset;
1491 }
1492
1493 LWLockRelease(lock);
1494 lock = NULL;
1495
1496 /* read the members */
1497 ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1498
1499 truelength = 0;
1500 prev_pageno = -1;
1501 for (int i = 0; i < length; i++, offset++)
1502 {
1503 TransactionId *xactptr;
1504 uint32 *flagsptr;
1505 int flagsoff;
1506 int bshift;
1507 int memberoff;
1508
1509 pageno = MXOffsetToMemberPage(offset);
1510 memberoff = MXOffsetToMemberOffset(offset);
1511
1512 if (pageno != prev_pageno)
1513 {
1514 LWLock *newlock;
1515
1516 /*
1517 * Since we're going to access a different SLRU page, if this page
1518 * falls under a different bank, release the old bank's lock and
1519 * acquire the lock of the new bank.
1520 */
1521 newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1522 if (newlock != lock)
1523 {
1524 if (lock)
1525 LWLockRelease(lock);
1526 LWLockAcquire(newlock, LW_EXCLUSIVE);
1527 lock = newlock;
1528 }
1529
1530 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1531 prev_pageno = pageno;
1532 }
1533
1534 xactptr = (TransactionId *)
1535 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1536
1537 if (!TransactionIdIsValid(*xactptr))
1538 {
1539 /* Corner case: we must be looking at unused slot zero */
1540 Assert(offset == 0);
1541 continue;
1542 }
1543
1544 flagsoff = MXOffsetToFlagsOffset(offset);
1545 bshift = MXOffsetToFlagsBitShift(offset);
1546 flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1547
1548 ptr[truelength].xid = *xactptr;
1549 ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1550 truelength++;
1551 }
1552
1553 LWLockRelease(lock);
1554
1555 /* A multixid with zero members should not happen */
1556 Assert(truelength > 0);
1557
1558 /*
1559 * Copy the result into the local cache.
1560 */
1561 mXactCachePut(multi, truelength, ptr);
1562
1563 debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1564 mxid_to_string(multi, truelength, ptr));
1565 *members = ptr;
1566 return truelength;
1567}
1568
1569/*
1570 * mxactMemberComparator
1571 * qsort comparison function for MultiXactMember
1572 *
1573 * We can't use wraparound comparison for XIDs because that does not respect
1574 * the triangle inequality! Any old sort order will do.
1575 */
1576static int
1577mxactMemberComparator(const void *arg1, const void *arg2)
1578{
1579 MultiXactMember member1 = *(const MultiXactMember *) arg1;
1580 MultiXactMember member2 = *(const MultiXactMember *) arg2;
1581
1582 if (member1.xid > member2.xid)
1583 return 1;
1584 if (member1.xid < member2.xid)
1585 return -1;
1586 if (member1.status > member2.status)
1587 return 1;
1588 if (member1.status < member2.status)
1589 return -1;
1590 return 0;
1591}
1592
1593/*
1594 * mXactCacheGetBySet
1595 * returns a MultiXactId from the cache based on the set of
1596 * TransactionIds that compose it, or InvalidMultiXactId if
1597 * none matches.
1598 *
1599 * This is helpful, for example, if two transactions want to lock a huge
1600 * table. By using the cache, the second will use the same MultiXactId
1601 * for the majority of tuples, thus keeping MultiXactId usage low (saving
1602 * both I/O and wraparound issues).
1603 *
1604 * NB: the passed members array will be sorted in-place.
1605 */
1606static MultiXactId
1608{
1609 dlist_iter iter;
1610
1611 debug_elog3(DEBUG2, "CacheGet: looking for %s",
1612 mxid_to_string(InvalidMultiXactId, nmembers, members));
1613
1614 /* sort the array so comparison is easy */
1615 qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1616
1618 {
1620 iter.cur);
1621
1622 if (entry->nmembers != nmembers)
1623 continue;
1624
1625 /*
1626 * We assume the cache entries are sorted, and that the unused bits in
1627 * "status" are zeroed.
1628 */
1629 if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1630 {
1631 debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1633 return entry->multi;
1634 }
1635 }
1636
1637 debug_elog2(DEBUG2, "CacheGet: not found :-(");
1638 return InvalidMultiXactId;
1639}
1640
1641/*
1642 * mXactCacheGetById
1643 * returns the composing MultiXactMember set from the cache for a
1644 * given MultiXactId, if present.
1645 *
1646 * If successful, *members is set to the address of a palloc'd copy of the
1647 * MultiXactMember set. Return value is number of members, or -1 on failure.
1648 */
1649static int
1651{
1652 dlist_iter iter;
1653
1654 debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1655
1657 {
1659 iter.cur);
1660
1661 if (entry->multi == multi)
1662 {
1663 MultiXactMember *ptr;
1664 Size size;
1665
1666 size = sizeof(MultiXactMember) * entry->nmembers;
1667 ptr = (MultiXactMember *) palloc(size);
1668
1669 memcpy(ptr, entry->members, size);
1670
1671 debug_elog3(DEBUG2, "CacheGet: found %s",
1672 mxid_to_string(multi,
1673 entry->nmembers,
1674 entry->members));
1675
1676 /*
1677 * Note we modify the list while not using a modifiable iterator.
1678 * This is acceptable only because we exit the iteration
1679 * immediately afterwards.
1680 */
1682
1683 *members = ptr;
1684 return entry->nmembers;
1685 }
1686 }
1687
1688 debug_elog2(DEBUG2, "CacheGet: not found");
1689 return -1;
1690}
1691
1692/*
1693 * mXactCachePut
1694 * Add a new MultiXactId and its composing set into the local cache.
1695 */
1696static void
1697mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1698{
1699 mXactCacheEnt *entry;
1700
1701 debug_elog3(DEBUG2, "CachePut: storing %s",
1702 mxid_to_string(multi, nmembers, members));
1703
1704 if (MXactContext == NULL)
1705 {
1706 /* The cache only lives as long as the current transaction */
1707 debug_elog2(DEBUG2, "CachePut: initializing memory context");
1709 "MultiXact cache context",
1711 }
1712
1713 entry = (mXactCacheEnt *)
1715 offsetof(mXactCacheEnt, members) +
1716 nmembers * sizeof(MultiXactMember));
1717
1718 entry->multi = multi;
1719 entry->nmembers = nmembers;
1720 memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1721
1722 /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1723 qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1724
1725 dclist_push_head(&MXactCache, &entry->node);
1727 {
1728 dlist_node *node;
1729
1732
1733 entry = dclist_container(mXactCacheEnt, node, node);
1734 debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1735 entry->multi);
1736
1737 pfree(entry);
1738 }
1739}
1740
/*
 * Return a short, constant text label for a multixact member status value.
 *
 * The labels produced are "keysh", "sh", "fornokeyupd", "forupd",
 * "nokeyupd" and "upd".  A status not handled by the switch is reported
 * with elog(ERROR).
 *
 * NOTE(review): which status value maps to which label is decided by the
 * case labels of the switch; confirm the pairing against the status enum.
 * Callers in this file pass members[i].status and print the result with %s.
 */
1741char *
1743{
1744 switch (status)
1745 {
1747 return "keysh";
1749 return "sh";
1751 return "fornokeyupd";
1753 return "forupd";
1755 return "nokeyupd";
1757 return "upd";
1758 default:
1759 elog(ERROR, "unrecognized multixact status %d", status);
/* presumably never reached after elog(ERROR); likely here to keep compilers quiet -- TODO confirm */
1760 return "";
1761 }
1762}
1763
/*
 * mxid_to_string
 *		Format a MultiXactId and its member array as text.
 *
 * The text begins with the multixact ID and the member count, followed by
 * each member's XID and its status label (via mxstatus_to_string).  Within
 * this file the result is only passed to the debug_elog macros.
 *
 * members[0] is read unconditionally, so the caller must supply at least
 * one member.
 *
 * The result pointer is kept in a function-local static, and whatever the
 * previous call left there is pfree'd on entry.  A returned string is thus
 * only valid until the next call; callers must not free it themselves.
 * NOTE(review): the statement that fills in "str" is not shown here;
 * presumably it copies buf.data into longer-lived memory -- confirm.
 */
1764char *
1765mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1766{
1767 static char *str = NULL;
1769 int i;
1770
/* release the string handed out by the previous call, if any */
1771 if (str != NULL)
1772 pfree(str);
1773
1775
/* first member is written without a leading ", " separator */
1776 appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1777 mxstatus_to_string(members[0].status));
1778
/* remaining members, each preceded by ", " */
1779 for (i = 1; i < nmembers; i++)
1780 appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1781 mxstatus_to_string(members[i].status));
1782
/* the working buffer is no longer needed once the result has been saved */
1785 pfree(buf.data);
1786 return str;
1787}
1788
1789/*
1790 * AtEOXact_MultiXact
1791 * Handle transaction end for MultiXact
1792 *
1793 * This is called at top transaction commit or abort (we don't care which).
1794 */
1795void
1797{
1798 /*
1799 * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1800 * which should only be valid while within a transaction.
1801 *
1802 * We assume that storing a MultiXactId is atomic and so we need not take
1803 * MultiXactGenLock to do this.
1804 */
1807
1808 /*
1809 * Discard the local MultiXactId cache. Since MXactContext was created as
1810 * a child of TopTransactionContext, we needn't delete it explicitly.
1811 */
1812 MXactContext = NULL;
1814}
1815
1816/*
1817 * AtPrepare_MultiXact
1818 * Save multixact state at 2PC transaction prepare
1819 *
1820 * In this phase, we only store our OldestMemberMXactId value in the two-phase
1821 * state file.
1822 */
1823void
1825{
1827
1828 if (MultiXactIdIsValid(myOldestMember))
1830 &myOldestMember, sizeof(MultiXactId));
1831}
1832
1833/*
1834 * PostPrepare_MultiXact
1835 * Clean up after successful PREPARE TRANSACTION
1836 */
1837void
1839{
1840 MultiXactId myOldestMember;
1841
1842 /*
1843 * Transfer our OldestMemberMXactId value to the slot reserved for the
1844 * prepared transaction.
1845 */
1846 myOldestMember = OldestMemberMXactId[MyProcNumber];
1847 if (MultiXactIdIsValid(myOldestMember))
1848 {
1849 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1850
1851 /*
1852 * Even though storing MultiXactId is atomic, acquire lock to make
1853 * sure others see both changes, not just the reset of the slot of the
1854 * current backend. Using a volatile pointer might suffice, but this
1855 * isn't a hot spot.
1856 */
1857 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1858
1859 OldestMemberMXactId[dummyProcNumber] = myOldestMember;
1861
1862 LWLockRelease(MultiXactGenLock);
1863 }
1864
1865 /*
1866 * We don't need to transfer OldestVisibleMXactId value, because the
1867 * transaction is not going to be looking at any more multixacts once it's
1868 * prepared.
1869 *
1870 * We assume that storing a MultiXactId is atomic and so we need not take
1871 * MultiXactGenLock to do this.
1872 */
1874
1875 /*
1876 * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1877 */
1878 MXactContext = NULL;
1880}
1881
1882/*
1883 * multixact_twophase_recover
1884 * Recover the state of a prepared transaction at startup
1885 */
1886void
1888 void *recdata, uint32 len)
1889{
1890 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1891 MultiXactId oldestMember;
1892
1893 /*
1894 * Get the oldest member MultiXactId from the state file record, and set it
1895 * in the OldestMemberMXactId slot reserved for this prepared transaction.
1896 */
1897 Assert(len == sizeof(MultiXactId));
1898 oldestMember = *((MultiXactId *) recdata);
1899
1900 OldestMemberMXactId[dummyProcNumber] = oldestMember;
1901}
1902
1903/*
1904 * multixact_twophase_postcommit
1905 * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1906 */
1907void
1909 void *recdata, uint32 len)
1910{
1911 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true);
1912
1913 Assert(len == sizeof(MultiXactId));
1914
1915 OldestMemberMXactId[dummyProcNumber] = InvalidMultiXactId;
1916}
1917
1918/*
1919 * multixact_twophase_postabort
1920 * This is actually just the same as the COMMIT case.
1921 */
1922void
1924 void *recdata, uint32 len)
1925{
1926 multixact_twophase_postcommit(fxid, info, recdata, len);
1927}
1928
1929/*
1930 * Initialization of shared memory for MultiXact. We use two SLRU areas,
1931 * thus double memory. Also, reserve space for the shared MultiXactState
1932 * struct and the per-backend MultiXactId arrays (two of those, too).
1933 */
1934Size
1936{
1937 Size size;
1938
1939 /* We need 2*MaxOldestSlot perBackendXactIds[] entries */
1940#define SHARED_MULTIXACT_STATE_SIZE \
1941 add_size(offsetof(MultiXactStateData, perBackendXactIds), \
1942 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1943
1947
1948 return size;
1949}
1950
/*
 * Set up this module's shared-memory state.
 *
 * The visible arguments name two areas, "multixact_offset" (stored under
 * pg_multixact/offsets, sized by multixact_offset_buffers) and
 * "multixact_member" (stored under pg_multixact/members, sized by
 * multixact_member_buffers), followed by the "Shared MultiXact State"
 * struct obtained from ShmemInitStruct.
 *
 * NOTE(review): the calls that receive the two argument lists are
 * presumably the SLRU initialization calls for MultiXactOffsetCtl and
 * MultiXactMemberCtl -- confirm against the full source.
 */
1951void
1953{
1954 bool found;
1955
1956 debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1957
1960
1962 "multixact_offset", multixact_offset_buffers, 0,
1963 "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
1964 LWTRANCHE_MULTIXACTOFFSET_SLRU,
1966 false);
1969 "multixact_member", multixact_member_buffers, 0,
1970 "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
1971 LWTRANCHE_MULTIXACTMEMBER_SLRU,
1973 false);
1974 /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
1975
1976 /* Initialize our shared state struct */
1977 MultiXactState = ShmemInitStruct("Shared MultiXact State",
1979 &found);
/*
 * When not running under the postmaster the struct must have been newly
 * created by the call above; otherwise it must already exist.
 */
1980 if (!IsUnderPostmaster)
1981 {
1982 Assert(!found);
1983
1984 /* Make sure we zero out the per-backend state */
1986 }
1987 else
1988 Assert(found);
1989
1990 /*
1991 * Set up array pointers.
1992 */
1995}
1996
1997/*
1998 * GUC check_hook for multixact_offset_buffers
1999 */
2000bool
2002{
2003 return check_slru_buffers("multixact_offset_buffers", newval);
2004}
2005
2006/*
2007 * GUC check_hook for multixact_member_buffers
2008 */
2009bool
2011{
2012 return check_slru_buffers("multixact_member_buffers", newval);
2013}
2014
2015/*
2016 * This func must be called ONCE on system install. It creates the initial
2017 * MultiXact segments. (The MultiXacts directories are assumed to have been
2018 * created by initdb, and MultiXactShmemInit must have been called already.)
2019 */
2020void
2022{
2023 /* Zero the initial pages and flush them to disk */
2026}
2027
2028/*
2029 * MaybeExtendOffsetSlru
2030 * Extend the offsets SLRU area, if necessary
2031 *
2032 * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
2033 * contain files that are shorter than necessary; this would occur if the old
2034 * installation had used multixacts beyond the first page (files cannot be
2035 * copied, because the on-disk representation is different). pg_upgrade would
2036 * update pg_control to set the next offset value to be at that position, so
2037 * that tuples marked as locked by such MultiXacts would be seen as visible
2038 * without having to consult multixact. However, trying to create and use a
2039 * new MultiXactId would result in an error because the page on which the new
2040 * value would reside does not exist. This routine is in charge of creating
2041 * such pages.
2042 */
2043static void
2045{
2046 int64 pageno;
2047 LWLock *lock;
2048
2051
2053
2055 {
2056 int slotno;
2057
2058 /*
2059 * Fortunately for us, SimpleLruWritePage is already prepared to deal
2060 * with creating a new segment file even if the page we're writing is
2061 * not the first in it, so this is enough.
2062 */
2063 slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2065 }
2066
2067 LWLockRelease(lock);
2068}
2069
2070/*
2071 * This must be called ONCE during postmaster or standalone-backend startup.
2072 *
2073 * StartupXLOG has already established nextMXact/nextOffset by calling
2074 * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
2075 * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
2076 * replayed WAL.
2077 */
2078void
2080{
2083 int64 pageno;
2084
2085 /*
2086 * Initialize offset's idea of the latest page number.
2087 */
2088 pageno = MultiXactIdToOffsetPage(multi);
2089 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2090 pageno);
2091
2092 /*
2093 * Initialize member's idea of the latest page number.
2094 */
2095 pageno = MXOffsetToMemberPage(offset);
2096 pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2097 pageno);
2098}
2099
2100/*
2101 * This must be called ONCE at the end of startup/recovery.
2102 */
2103void
2105{
2106 MultiXactId nextMXact;
2107 MultiXactOffset offset;
2108 MultiXactId oldestMXact;
2109 Oid oldestMXactDB;
2110 int64 pageno;
2111 int entryno;
2112 int flagsoff;
2113
2114 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2115 nextMXact = MultiXactState->nextMXact;
2116 offset = MultiXactState->nextOffset;
2117 oldestMXact = MultiXactState->oldestMultiXactId;
2118 oldestMXactDB = MultiXactState->oldestMultiXactDB;
2119 LWLockRelease(MultiXactGenLock);
2120
2121 /* Clean up offsets state */
2122
2123 /*
2124 * (Re-)Initialize our idea of the latest page number for offsets.
2125 */
2126 pageno = MultiXactIdToOffsetPage(nextMXact);
2127 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2128 pageno);
2129
2130 /*
2131 * Set the offset of nextMXact on the offsets page. This is normally done
2132 * in RecordNewMultiXact() of the previous multixact, but let's be sure
2133 * the next page exists, if the nextMXact was reset with pg_resetwal for
2134 * example.
2135 *
2136 * Zero out the remainder of the page. See notes in TrimCLOG() for
2137 * background. Unlike CLOG, some WAL record covers every pg_multixact
2138 * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2139 * xlog before data," nextMXact successors may carry obsolete, nonzero
2140 * offset values.
2141 */
2142 entryno = MultiXactIdToOffsetEntry(nextMXact);
2143 {
2144 int slotno;
2145 MultiXactOffset *offptr;
2147
2149 if (entryno == 0)
2150 slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2151 else
2152 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2153 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2154 offptr += entryno;
2155
2156 *offptr = offset;
2157 if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
2158 MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
2159
2160 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2161 LWLockRelease(lock);
2162 }
2163
2164 /*
2165 * And the same for members.
2166 *
2167 * (Re-)Initialize our idea of the latest page number for members.
2168 */
2169 pageno = MXOffsetToMemberPage(offset);
2170 pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2171 pageno);
2172
2173 /*
2174 * Zero out the remainder of the current members page. See notes in
2175 * TrimCLOG() for motivation.
2176 */
2177 flagsoff = MXOffsetToFlagsOffset(offset);
2178 if (flagsoff != 0)
2179 {
2180 int slotno;
2181 TransactionId *xidptr;
2182 int memberoff;
2184
2186 memberoff = MXOffsetToMemberOffset(offset);
2187 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2188 xidptr = (TransactionId *)
2189 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2190
2191 MemSet(xidptr, 0, BLCKSZ - memberoff);
2192
2193 /*
2194 * Note: we don't need to zero out the flag bits in the remaining
2195 * members of the current group, because they are always reset before
2196 * writing.
2197 */
2198
2199 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2200 LWLockRelease(lock);
2201 }
2202
2203 /* signal that we're officially up */
2204 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2206 LWLockRelease(MultiXactGenLock);
2207
2208 /* Now compute how far away the next members wraparound is. */
2209 SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2210}
2211
2212/*
2213 * Get the MultiXact data to save in a checkpoint record
2214 */
2215void
2217 MultiXactId *nextMulti,
2218 MultiXactOffset *nextMultiOffset,
2219 MultiXactId *oldestMulti,
2220 Oid *oldestMultiDB)
2221{
2222 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2223 *nextMulti = MultiXactState->nextMXact;
2224 *nextMultiOffset = MultiXactState->nextOffset;
2225 *oldestMulti = MultiXactState->oldestMultiXactId;
2226 *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2227 LWLockRelease(MultiXactGenLock);
2228
2230 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2231 *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2232}
2233
2234/*
2235 * Perform a checkpoint --- either during shutdown, or on-the-fly
2236 */
2237void
2239{
2240 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2241
2242 /*
2243 * Write dirty MultiXact pages to disk. This may result in sync requests
2244 * queued for later handling by ProcessSyncRequests(), as part of the
2245 * checkpoint.
2246 */
2249
2250 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2251}
2252
2253/*
2254 * Set the next-to-be-assigned MultiXactId and offset
2255 *
2256 * This is used when we can determine the correct next ID/offset exactly
2257 * from a checkpoint record. Although this is only called during bootstrap
2258 * and XLog replay, we take the lock in case any hot-standby backends are
2259 * examining the values.
2260 */
2261void
2263 MultiXactOffset nextMultiOffset)
2264{
2265 debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2266 nextMulti, nextMultiOffset);
2267 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2268 MultiXactState->nextMXact = nextMulti;
2269 MultiXactState->nextOffset = nextMultiOffset;
2270 LWLockRelease(MultiXactGenLock);
2271
2272 /*
2273 * During a binary upgrade, make sure that the offsets SLRU is large
2274 * enough to contain the next value that would be created.
2275 *
2276 * We need to do this pretty early during the first startup in binary
2277 * upgrade mode: before StartupMultiXact() in fact, because this routine
2278 * is called even before that by StartupXLOG(). And we can't do it
2279 * earlier than at this point, because during that first call of this
2280 * routine we determine the MultiXactState->nextMXact value that
2281 * MaybeExtendOffsetSlru needs.
2282 */
2283 if (IsBinaryUpgrade)
2285}
2286
2287/*
2288 * Determine the last safe MultiXactId to allocate given the currently oldest
2289 * datminmxid (ie, the oldest MultiXactId that might exist in any database
2290 * of our cluster), and the OID of the (or a) database with that value.
2291 *
2292 * is_startup is true when we are just starting the cluster, false when we
2293 * are updating state in a running cluster. This only affects log messages.
2294 */
2295void
2296SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2297 bool is_startup)
2298{
2299 MultiXactId multiVacLimit;
2300 MultiXactId multiWarnLimit;
2301 MultiXactId multiStopLimit;
2302 MultiXactId multiWrapLimit;
2303 MultiXactId curMulti;
2304 bool needs_offset_vacuum;
2305
2306 Assert(MultiXactIdIsValid(oldest_datminmxid));
2307
2308 /*
2309 * We pretend that a wrap will happen halfway through the multixact ID
2310 * space, but that's not really true, because multixacts wrap differently
2311 * from transaction IDs. Note that, separately from any concern about
2312 * multixact IDs wrapping, we must ensure that multixact members do not
2313 * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
2314 */
2315 multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2316 if (multiWrapLimit < FirstMultiXactId)
2317 multiWrapLimit += FirstMultiXactId;
2318
2319 /*
2320 * We'll refuse to continue assigning MultiXactIds once we get within 3M
2321 * multi of data loss. See SetTransactionIdLimit.
2322 */
2323 multiStopLimit = multiWrapLimit - 3000000;
2324 if (multiStopLimit < FirstMultiXactId)
2325 multiStopLimit -= FirstMultiXactId;
2326
2327 /*
2328 * We'll start complaining loudly when we get within 40M multis of data
2329 * loss. This is kind of arbitrary, but if you let your gas gauge get
2330 * down to 2% of full, would you be looking for the next gas station? We
2331 * need to be fairly liberal about this number because there are lots of
2332 * scenarios where most transactions are done by automatic clients that
2333 * won't pay attention to warnings. (No, we're not gonna make this
2334 * configurable. If you know enough to configure it, you know enough to
2335 * not get in this kind of trouble in the first place.)
2336 */
2337 multiWarnLimit = multiWrapLimit - 40000000;
2338 if (multiWarnLimit < FirstMultiXactId)
2339 multiWarnLimit -= FirstMultiXactId;
2340
2341 /*
2342 * We'll start trying to force autovacuums when oldest_datminmxid gets to
2343 * be more than autovacuum_multixact_freeze_max_age mxids old.
2344 *
2345 * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2346 * so that we don't have to worry about dealing with on-the-fly changes in
2347 * its value. See SetTransactionIdLimit.
2348 */
2349 multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2350 if (multiVacLimit < FirstMultiXactId)
2351 multiVacLimit += FirstMultiXactId;
2352
2353 /* Grab lock for just long enough to set the new limit values */
2354 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2355 MultiXactState->oldestMultiXactId = oldest_datminmxid;
2356 MultiXactState->oldestMultiXactDB = oldest_datoid;
2357 MultiXactState->multiVacLimit = multiVacLimit;
2358 MultiXactState->multiWarnLimit = multiWarnLimit;
2359 MultiXactState->multiStopLimit = multiStopLimit;
2360 MultiXactState->multiWrapLimit = multiWrapLimit;
2361 curMulti = MultiXactState->nextMXact;
2362 LWLockRelease(MultiXactGenLock);
2363
2364 /* Log the info */
2366 (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2367 multiWrapLimit, oldest_datoid)));
2368
2369 /*
2370 * Computing the actual limits is only possible once the data directory is
2371 * in a consistent state. There's no need to compute the limits while
2372 * still replaying WAL - no decisions about new multis are made even
2373 * though multixact creations might be replayed. So we'll only do further
2374 * checks after TrimMultiXact() has been called.
2375 */
2377 return;
2378
2380
2381 /* Set limits for offset vacuum. */
2382 needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2383
2384 /*
2385 * If past the autovacuum force point, immediately signal an autovac
2386 * request. The reason for this is that autovac only processes one
2387 * database per invocation. Once it's finished cleaning up the oldest
2388 * database, it'll call here, and we'll signal the postmaster to start
2389 * another iteration immediately if there are still any old databases.
2390 */
2391 if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2392 needs_offset_vacuum) && IsUnderPostmaster)
2394
2395 /* Give an immediate warning if past the wrap warn point */
2396 if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2397 {
2398 char *oldest_datname;
2399
2400 /*
2401 * We can be called when not inside a transaction, for example during
2402 * StartupXLOG(). In such a case we cannot do database access, so we
2403 * must just report the oldest DB's OID.
2404 *
2405 * Note: it's also possible that get_database_name fails and returns
2406 * NULL, for example because the database just got dropped. We'll
2407 * still warn, even though the warning might now be unnecessary.
2408 */
2409 if (IsTransactionState())
2410 oldest_datname = get_database_name(oldest_datoid);
2411 else
2412 oldest_datname = NULL;
2413
2414 if (oldest_datname)
2416 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2417 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2418 multiWrapLimit - curMulti,
2419 oldest_datname,
2420 multiWrapLimit - curMulti),
2421 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2422 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2423 else
2425 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2426 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2427 multiWrapLimit - curMulti,
2428 oldest_datoid,
2429 multiWrapLimit - curMulti),
2430 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2431 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2432 }
2433}
2434
2435/*
2436 * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2437 * and similarly nextOffset is at least minMultiOffset.
2438 *
2439 * This is used when we can determine minimum safe values from an XLog
2440 * record (either an on-line checkpoint or an mxact creation log entry).
2441 * Although this is only called during XLog replay, we take the lock in case
2442 * any hot-standby backends are examining the values.
2443 */
2444void
2446 MultiXactOffset minMultiOffset)
2447{
2448 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2450 {
2451 debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2452 MultiXactState->nextMXact = minMulti;
2453 }
2455 {
2456 debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2457 minMultiOffset);
2458 MultiXactState->nextOffset = minMultiOffset;
2459 }
2460 LWLockRelease(MultiXactGenLock);
2461}
2462
2463/*
2464 * Update our oldestMultiXactId value, but only if it's more recent than what
2465 * we had.
2466 *
2467 * This may only be called during WAL replay.
2468 */
2469void
2470MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2471{
2473
2475 SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2476}
2477
2478/*
2479 * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2480 *
2481 * NB: this is called while holding MultiXactGenLock. We want it to be very
2482 * fast most of the time; even when it's not so fast, no actual I/O need
2483 * happen unless we're forced to write out a dirty log or xlog page to make
2484 * room in shared memory.
2485 */
2486static void
2488{
2489 int64 pageno;
2490 LWLock *lock;
2491
2492 /*
2493 * No work except at first MultiXactId of a page. But beware: just after
2494 * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2495 */
2496 if (MultiXactIdToOffsetEntry(multi) != 0 &&
2497 multi != FirstMultiXactId)
2498 return;
2499
2500 pageno = MultiXactIdToOffsetPage(multi);
2502
2504
2505 /* Zero the page and make a WAL entry about it */
2508 pageno);
2509
2510 LWLockRelease(lock);
2511}
2512
2513/*
2514 * Make sure that MultiXactMember has room for the members of a newly-
2515 * allocated MultiXactId.
2516 *
2517 * Like the above routine, this is called while holding MultiXactGenLock;
2518 * same comments apply.
2519 */
2520static void
2522{
2523 /*
2524 * It's possible that the members span more than one page of the members
2525 * file, so we loop to ensure we consider each page. The coding is not
2526 * optimal if the members span several pages, but that seems unusual
2527 * enough to not worry much about.
2528 */
2529 while (nmembers > 0)
2530 {
2531 int flagsoff;
2532 int flagsbit;
2534
2535 /*
2536 * Only zero when at first entry of a page.
2537 */
2538 flagsoff = MXOffsetToFlagsOffset(offset);
2539 flagsbit = MXOffsetToFlagsBitShift(offset);
2540 if (flagsoff == 0 && flagsbit == 0)
2541 {
2542 int64 pageno;
2543 LWLock *lock;
2544
2545 pageno = MXOffsetToMemberPage(offset);
2547
2549
2550 /* Zero the page and make a WAL entry about it */
2552 XLogSimpleInsertInt64(RM_MULTIXACT_ID,
2554
2555 LWLockRelease(lock);
2556 }
2557
2558 /*
2559 * Compute the number of items till end of current page. Careful: if
2560 * addition of unsigned ints wraps around, we're at the last page of
2561 * the last segment; since that page holds a different number of items
2562 * than other pages, we need to do it differently.
2563 */
2564 if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2565 {
2566 /*
2567 * This is the last page of the last segment; we can compute the
2568 * number of items left to allocate in it without modulo
2569 * arithmetic.
2570 */
2571 difference = MaxMultiXactOffset - offset + 1;
2572 }
2573 else
2575
2576 /*
2577 * Advance to next page, taking care to properly handle the wraparound
2578 * case. OK if nmembers goes negative.
2579 */
2580 nmembers -= difference;
2581 offset += difference;
2582 }
2583}
2584
2585/*
2586 * GetOldestMultiXactId
2587 *
2588 * Return the oldest MultiXactId that's possibly still seen as live by
2589 * any running transaction. Older ones might still exist on disk, but they no
2590 * longer have any running member transaction.
2591 *
2592 * It's not safe to truncate MultiXact SLRU segments on the value returned by
2593 * this function; however, it can be set as the new relminmxid for any table
2594 * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2595 * to truncate SLRUs when no table can possibly still have a referencing MXID.
2596 */
2599{
2600 MultiXactId oldestMXact;
2601 MultiXactId nextMXact;
2602 int i;
2603
2604 /*
2605 * This is the oldest valid value among all the OldestMemberMXactId[] and
2606 * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2607 */
2608 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2609
2610 /*
2611 * We have to beware of the possibility that nextMXact is in the
2612 * wrapped-around state. We don't fix the counter itself here, but we
2613 * must be sure to use a valid value in our calculation.
2614 */
2615 nextMXact = MultiXactState->nextMXact;
2616 if (nextMXact < FirstMultiXactId)
2617 nextMXact = FirstMultiXactId;
2618
2619 oldestMXact = nextMXact;
2620 for (i = 0; i < MaxOldestSlot; i++)
2621 {
2622 MultiXactId thisoldest;
2623
2624 thisoldest = OldestMemberMXactId[i];
2625 if (MultiXactIdIsValid(thisoldest) &&
2626 MultiXactIdPrecedes(thisoldest, oldestMXact))
2627 oldestMXact = thisoldest;
2628 thisoldest = OldestVisibleMXactId[i];
2629 if (MultiXactIdIsValid(thisoldest) &&
2630 MultiXactIdPrecedes(thisoldest, oldestMXact))
2631 oldestMXact = thisoldest;
2632 }
2633
2634 LWLockRelease(MultiXactGenLock);
2635
2636 return oldestMXact;
2637}
2638
2639/*
2640 * Determine how aggressively we need to vacuum in order to prevent member
2641 * wraparound.
2642 *
2643 * To do so determine what's the oldest member offset and install the limit
2644 * info in MultiXactState, where it can be used to prevent overrun of old data
2645 * in the members SLRU area.
2646 *
2647 * The return value is true if emergency autovacuum is required and false
2648 * otherwise.
2649 */
2650static bool
2651SetOffsetVacuumLimit(bool is_startup)
2652{
2653 MultiXactId oldestMultiXactId;
2654 MultiXactId nextMXact;
2655 MultiXactOffset oldestOffset = 0; /* placate compiler */
2656 MultiXactOffset prevOldestOffset;
2657 MultiXactOffset nextOffset;
2658 bool oldestOffsetKnown = false;
2659 bool prevOldestOffsetKnown;
2660 MultiXactOffset offsetStopLimit = 0;
2661 MultiXactOffset prevOffsetStopLimit;
2662
2663 /*
2664 * NB: Have to prevent concurrent truncation, we might otherwise try to
2665 * lookup an oldestMulti that's concurrently getting truncated away.
2666 */
2667 LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2668
2669 /* Read relevant fields from shared memory. */
2670 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2671 oldestMultiXactId = MultiXactState->oldestMultiXactId;
2672 nextMXact = MultiXactState->nextMXact;
2673 nextOffset = MultiXactState->nextOffset;
2674 prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2675 prevOldestOffset = MultiXactState->oldestOffset;
2676 prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2678 LWLockRelease(MultiXactGenLock);
2679
2680 /*
2681 * Determine the offset of the oldest multixact. Normally, we can read
2682 * the offset from the multixact itself, but there's an important special
2683 * case: if there are no multixacts in existence at all, oldestMXact
2684 * obviously can't point to one. It will instead point to the multixact
2685 * ID that will be assigned the next time one is needed.
2686 */
2687 if (oldestMultiXactId == nextMXact)
2688 {
2689 /*
2690 * When the next multixact gets created, it will be stored at the next
2691 * offset.
2692 */
2693 oldestOffset = nextOffset;
2694 oldestOffsetKnown = true;
2695 }
2696 else
2697 {
2698 /*
2699 * Figure out where the oldest existing multixact's offsets are
2700 * stored. Due to bugs in early releases of PostgreSQL 9.3.X and 9.4.X,
2701 * the supposedly-earliest multixact might not really exist. We are
2702 * careful not to fail in that case.
2703 */
2704 oldestOffsetKnown =
2705 find_multixact_start(oldestMultiXactId, &oldestOffset);
2706
2707 if (oldestOffsetKnown)
2709 (errmsg_internal("oldest MultiXactId member is at offset %u",
2710 oldestOffset)));
2711 else
2712 ereport(LOG,
2713 (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2714 oldestMultiXactId)));
2715 }
2716
2717 LWLockRelease(MultiXactTruncationLock);
2718
2719 /*
2720 * If we can, compute limits (and install them in MultiXactState) to prevent
2721 * overrun of old data in the members SLRU area. We can only do so if the
2722 * oldest offset is known though.
2723 */
2724 if (oldestOffsetKnown)
2725 {
2726 /* move back to start of the corresponding segment */
2727 offsetStopLimit = oldestOffset - (oldestOffset %
2729
2730 /* always leave one segment before the wraparound point */
2732
2733 if (!prevOldestOffsetKnown && !is_startup)
2734 ereport(LOG,
2735 (errmsg("MultiXact member wraparound protections are now enabled")));
2736
2738 (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
2739 offsetStopLimit, oldestMultiXactId)));
2740 }
2741 else if (prevOldestOffsetKnown)
2742 {
2743 /*
2744 * If we failed to get the oldest offset this time, but we have a
2745 * value from a previous pass through this function, use the old
2746 * values rather than automatically forcing an emergency autovacuum
2747 * cycle again.
2748 */
2749 oldestOffset = prevOldestOffset;
2750 oldestOffsetKnown = true;
2751 offsetStopLimit = prevOffsetStopLimit;
2752 }
2753
2754 /* Install the computed values */
2755 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2756 MultiXactState->oldestOffset = oldestOffset;
2757 MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2758 MultiXactState->offsetStopLimit = offsetStopLimit;
2759 LWLockRelease(MultiXactGenLock);
2760
2761 /*
2762 * Do we need an emergency autovacuum? If we're not sure, assume yes.
2763 */
2764 return !oldestOffsetKnown ||
2765 (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2766}
2767
2768/*
2769 * Return whether adding "distance" to "start" would move past "boundary".
2770 *
2771 * We use this to determine whether the addition is "wrapping around" the
2772 * boundary point, hence the name. The reason we don't want to use the regular
2773 * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2774 * the 2^32-1 space here, allowing for more multixacts than would fit
2775 * otherwise.
2776 */
2777static bool
2779 uint32 distance)
2780{
2781 MultiXactOffset finish;
2782
2783 /*
2784 * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2785 * if the addition wraps around the UINT_MAX boundary, skip that value.
2786 */
2787 finish = start + distance;
2788 if (finish < start)
2789 finish++;
2790
2791 /*-----------------------------------------------------------------------
2792 * When the boundary is numerically greater than the starting point, any
2793 * value numerically between the two is not wrapped:
2794 *
2795 * <----S----B---->
2796 * [---) = F wrapped past B (and UINT_MAX)
2797 * [---) = F not wrapped
2798 * [----] = F wrapped past B
2799 *
2800 * When the boundary is numerically less than the starting point (i.e. the
2801 * UINT_MAX wraparound occurs somewhere in between) then all values in
2802 * between are wrapped:
2803 *
2804 * <----B----S---->
2805 * [---) = F not wrapped past B (but wrapped past UINT_MAX)
2806 * [---) = F wrapped past B (and UINT_MAX)
2807 * [----] = F not wrapped
2808 *-----------------------------------------------------------------------
2809 */
2810 if (start < boundary)
2811 return finish >= boundary || finish < start;
2812 else
2813 return finish >= boundary && finish < start;
2814}
2815
2816/*
2817 * Find the starting offset of the given MultiXactId.
2818 *
2819 * Returns false if the file containing the multi does not exist on disk.
2820 * Otherwise, returns true and sets *result to the starting member offset.
2821 *
2822 * This function does not prevent concurrent truncation, so if that's
2823 * required, the caller has to protect against that.
2824 */
2825static bool
2827{
2828 MultiXactOffset offset;
2829 int64 pageno;
2830 int entryno;
2831 int slotno;
2832 MultiXactOffset *offptr;
2833
2835
2836 pageno = MultiXactIdToOffsetPage(multi);
2837 entryno = MultiXactIdToOffsetEntry(multi);
2838
2839 /*
2840 * Write out dirty data, so PhysicalPageExists can work correctly.
2841 */
2844
2846 return false;
2847
2848 /* lock is acquired by SimpleLruReadPage_ReadOnly */
2849 slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2850 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2851 offptr += entryno;
2852 offset = *offptr;
2854
2855 *result = offset;
2856 return true;
2857}
2858
2859/*
2860 * GetMultiXactInfo
2861 *
2862 * Returns the following information about the current MultiXact state:
2863 * multixacts: Number of MultiXacts (nextMultiXactId - oldestMultiXactId)
2864 * members: Number of member entries (nextOffset - oldestOffset)
2865 * oldestMultiXactId: Oldest MultiXact ID still in use
2866 * oldestOffset: Oldest offset still in use
2867 *
2868 * Returns false if unable to determine, the oldest offset being unknown.
2869 */
2870bool
2872 MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
2873{
2874 MultiXactOffset nextOffset;
2875 MultiXactId nextMultiXactId;
2876 bool oldestOffsetKnown;
2877
2878 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2879 nextOffset = MultiXactState->nextOffset;
2880 *oldestMultiXactId = MultiXactState->oldestMultiXactId;
2881 nextMultiXactId = MultiXactState->nextMXact;
2882 *oldestOffset = MultiXactState->oldestOffset;
2883 oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2884 LWLockRelease(MultiXactGenLock);
2885
2886 if (!oldestOffsetKnown)
2887 {
2888 *members = 0;
2889 *multixacts = 0;
2890 *oldestMultiXactId = InvalidMultiXactId;
2891 *oldestOffset = 0;
2892 return false;
2893 }
2894
2895 *members = nextOffset - *oldestOffset;
2896 *multixacts = nextMultiXactId - *oldestMultiXactId;
2897 return true;
2898}
2899
2900/*
2901 * Multixact members can be removed once the multixacts that refer to them
2902 * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2903 * vacuum_multixact_freeze_table_age work together to make sure we never have
2904 * too many multixacts; we hope that, at least under normal circumstances,
2905 * this will also be sufficient to keep us from using too many offsets.
2906 * However, if the average multixact has many members, we might exhaust the
2907 * members space while still using few enough members that these limits fail
2908 * to trigger relminmxid advancement by VACUUM. At that point, we'd have no
2909 * choice but to start failing multixact-creating operations with an error.
2910 *
2911 * To prevent that, if more than a threshold portion of the members space is
2912 * used, we effectively reduce autovacuum_multixact_freeze_max_age
2913 * to a value just less than the number of multixacts in use. We hope that
2914 * this will quickly trigger autovacuuming on the table or tables with the
2915 * oldest relminmxid, thus allowing datminmxid values to advance and removing
2916 * some members.
2917 *
2918 * As the fraction of the member space currently in use grows, we become
2919 * more aggressive in clamping this value. That not only causes autovacuum
2920 * to ramp up, but also makes any manual vacuums the user issues more
2921 * aggressive. This happens because vacuum_get_cutoffs() will clamp the
2922 * freeze table and the minimum freeze age cutoffs based on the effective
2923 * autovacuum_multixact_freeze_max_age this function returns. In the worst
2924 * case, we'll clamp the freeze_max_age to zero, and every vacuum of any
2925 * table will freeze every multixact.
2926 */
/*
 * Returns an int freeze-age threshold: 0 when member-space usage cannot be
 * determined or when the computed victim count exceeds the number of
 * multixacts, otherwise a value derived from (multixacts - victim_multixacts).
 *
 * NOTE(review): this listing is a line-numbered extract with gaps; the
 * leading numbers are upstream line numbers and each gap is flagged inline.
 */
2927int
/* NOTE(review): line 2928 (the declarator) is missing here; the cross-reference index lists it as MultiXactMemberFreezeThreshold(void). */
2929{
2930 MultiXactOffset members;
2931 uint32 multixacts;
2932 uint32 victim_multixacts;
2933 double fraction;
2934 int result;
2935 MultiXactId oldestMultiXactId;
2936 MultiXactOffset oldestOffset;
2937
2938 /* If we can't determine member space utilization, assume the worst. */
2939 if (!GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset))
2940 return 0;
2941
2942 /* If member space utilization is low, no special action is required. */
2943 if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
/* NOTE(review): line 2944, the body of this if, is missing; presumably it returns autovacuum_multixact_freeze_max_age unchanged -- confirm against upstream. */
2945
2946 /*
2947 * Compute a target for relminmxid advancement. The number of multixacts
2948 * we try to eliminate from the system is based on how far we are past
2949 * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2950 */
2951 fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
/* NOTE(review): line 2952, the divisor and end of this statement, is missing; presumably the width of the SAFE..DANGER threshold band -- confirm. */
/* The double product below is converted to uint32 on assignment. */
2953 victim_multixacts = multixacts * fraction;
2954
2955 /* fraction could be > 1.0, but lowest possible freeze age is zero */
2956 if (victim_multixacts > multixacts)
2957 return 0;
2958 result = multixacts - victim_multixacts;
2959
2960 /*
2961 * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
2962 * autovacuum less aggressive than it would otherwise be.
2963 */
/* NOTE(review): line 2964, the final return, is missing; per the comment above it presumably returns the smaller of result and autovacuum_multixact_freeze_max_age -- confirm. */
2965}
2966
/*
 * Scan state handed to SlruScanDirCbFindEarliest() through the opaque
 * callback argument.
 */
typedef struct mxtruncinfo
{
    /* earliest page seen so far; -1 means no segment has been seen yet */
    int64       earliestExistingPage;
} mxtruncinfo;
2971
2972/*
2973 * SlruScanDirectory callback
2974 * This callback determines the earliest existing page number.
2975 */
2976static bool
2978{
2979 mxtruncinfo *trunc = (mxtruncinfo *) data;
2980
2981 if (trunc->earliestExistingPage == -1 ||
2982 ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
2983 {
2984 trunc->earliestExistingPage = segpage;
2985 }
2986
2987 return false; /* keep going */
2988}
2989
2990
2991/*
2992 * Delete members segments [oldest, newOldest)
2993 *
2994 * The members SLRU can, in contrast to the offsets one, be filled to almost
2995 * the full range at once. This means SimpleLruTruncate() can't trivially be
2996 * used - instead the to-be-deleted range is computed using the offsets
2997 * SLRU. C.f. TruncateMultiXact().
 *
 * The loop walks segment numbers from the segment containing oldestOffset
 * up to, but not including, the segment containing newOldestOffset, and
 * restarts at segment 0 after reaching maxsegment.
 *
 * NOTE(review): this listing is a line-numbered extract with gaps; each gap
 * is flagged inline.
2998 */
2999static void
/* NOTE(review): line 3000 (the declarator) is missing; the cross-reference index lists it as PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset). */
3001{
/* NOTE(review): line 3002 is missing; it must declare maxsegment, which the wraparound check below reads -- presumably the last valid member segment number; confirm. */
3003 int64 startsegment = MXOffsetToMemberSegment(oldestOffset);
3004 int64 endsegment = MXOffsetToMemberSegment(newOldestOffset);
3005 int64 segment = startsegment;
3006
3007 /*
3008 * Delete all the segments but the last one. The last segment can still
3009 * contain, possibly partially, valid data.
3010 */
3011 while (segment != endsegment)
3012 {
3013 elog(DEBUG2, "truncating multixact members segment %" PRIx64,
3014 segment);
/* NOTE(review): line 3015 is missing; presumably the call that actually deletes this segment (the index lists SlruDeleteSegment) -- confirm. */
3016
3017 /* move to next segment, handling wraparound correctly */
3018 if (segment == maxsegment)
3019 segment = 0;
3020 else
3021 segment += 1;
3022 }
3023}
3024
3025/*
3026 * Delete offsets segments [oldest, newOldest)
 *
 * NOTE(review): this listing is a line-numbered extract with gaps; the only
 * statement of this function is among the missing lines (flagged inline).
3027 */
3028static void
/* NOTE(review): line 3029 (the declarator) is missing; the cross-reference index lists it as PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti). */
3030{
3031 /*
3032 * We step back one multixact to avoid passing a cutoff page that hasn't
3033 * been created yet in the rare case that oldestMulti would be the first
3034 * item on a page and oldestMulti == nextMulti. In that case, if we
3035 * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
3036 * detection.
3037 */
/* NOTE(review): lines 3038-3039 are missing; per the comment above, presumably a SimpleLruTruncate() call on the offsets SLRU with the page of the multixact preceding newOldestMulti -- confirm. */
3040}
3041
3042/*
3043 * Remove all MultiXactOffset and MultiXactMember segments before the oldest
3044 * ones still of interest.
3045 *
3046 * This is only called on a primary as part of vacuum (via
3047 * vac_truncate_clog()). During recovery truncation is done by replaying
3048 * truncation WAL records logged here.
3049 *
3050 * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
3051 * is one of the databases preventing newOldestMulti from increasing.
 *
 * Outline of the visible code: take MultiXactTruncationLock, snapshot
 * nextMXact/nextOffset/oldestMultiXactId under MultiXactGenLock, bail out
 * (releasing the truncation lock) whenever there is nothing to do or a
 * needed offset cannot be looked up, then WAL-log the truncation, publish
 * the new oldest values, and truncate members before offsets.
 *
 * NOTE(review): this listing is a line-numbered extract with gaps; each gap
 * is flagged inline.
3052 */
3053void
3054TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
3055{
3056 MultiXactId oldestMulti;
3057 MultiXactId nextMulti;
3058 MultiXactOffset newOldestOffset;
3059 MultiXactOffset oldestOffset;
3060 MultiXactOffset nextOffset;
3061 mxtruncinfo trunc;
3062 MultiXactId earliest;
3063
/* NOTE(review): lines 3064-3065 are missing from this extract; contents unknown from here. */
3066
3067 /*
3068 * We can only allow one truncation to happen at once. Otherwise parts of
3069 * members might vanish while we're doing lookups or similar. There's no
3070 * need to have an interlock with creating new multis or such, since those
3071 * are constrained by the limits (which only grow, never shrink).
3072 */
3073 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3074
3075 LWLockAcquire(MultiXactGenLock, LW_SHARED);
3076 nextMulti = MultiXactState->nextMXact;
3077 nextOffset = MultiXactState->nextOffset;
3078 oldestMulti = MultiXactState->oldestMultiXactId;
3079 LWLockRelease(MultiXactGenLock);
3080 Assert(MultiXactIdIsValid(oldestMulti));
3081
3082 /*
3083 * Make sure to only attempt truncation if there are values to truncate
3084 * away. In normal processing values shouldn't go backwards, but there
3085 * are some corner cases (due to bugs) where that's possible.
3086 */
3087 if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
3088 {
3089 LWLockRelease(MultiXactTruncationLock);
3090 return;
3091 }
3092
3093 /*
3094 * Note we can't just plow ahead with the truncation; it's possible that
3095 * there are no segments to truncate, which is a problem because we are
3096 * going to attempt to read the offsets page to determine where to
3097 * truncate the members SLRU. So we first scan the directory to determine
3098 * the earliest offsets page number that we can read without error.
3099 *
3100 * When nextMXact is less than one segment away from multiWrapLimit,
3101 * SlruScanDirCbFindEarliest can find some early segment other than the
3102 * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
3103 * returns false, because not all pairs of entries have the same answer.)
3104 * That can also arise when an earlier truncation attempt failed unlink()
3105 * or returned early from this function. The only consequence is
3106 * returning early, which wastes space that we could have liberated.
3107 *
3108 * NB: It's also possible that the page that oldestMulti is on has already
3109 * been truncated away, and we crashed before updating oldestMulti.
3110 */
3111 trunc.earliestExistingPage = -1;
/* NOTE(review): lines 3112-3113 are missing; presumably the directory scan using SlruScanDirCbFindEarliest and the conversion of trunc.earliestExistingPage into 'earliest', which is read below -- confirm. */
3114 if (earliest < FirstMultiXactId)
3115 earliest = FirstMultiXactId;
3116
3117 /* If there's nothing to remove, we can bail out early. */
3118 if (MultiXactIdPrecedes(oldestMulti, earliest))
3119 {
3120 LWLockRelease(MultiXactTruncationLock);
3121 return;
3122 }
3123
3124 /*
3125 * First, compute the safe truncation point for MultiXactMember. This is
3126 * the starting offset of the oldest multixact.
3127 *
3128 * Hopefully, find_multixact_start will always work here, because we've
3129 * already checked that it doesn't precede the earliest MultiXact on disk.
3130 * But if it fails, don't truncate anything, and log a message.
3131 */
3132 if (oldestMulti == nextMulti)
3133 {
3134 /* there are NO MultiXacts */
3135 oldestOffset = nextOffset;
3136 }
3137 else if (!find_multixact_start(oldestMulti, &oldestOffset))
3138 {
3139 ereport(LOG,
3140 (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3141 oldestMulti, earliest)));
3142 LWLockRelease(MultiXactTruncationLock);
3143 return;
3144 }
3145
3146 /*
3147 * Secondly compute up to where to truncate. Lookup the corresponding
3148 * member offset for newOldestMulti for that.
3149 */
3150 if (newOldestMulti == nextMulti)
3151 {
3152 /* there are NO MultiXacts */
3153 newOldestOffset = nextOffset;
3154 }
3155 else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3156 {
3157 ereport(LOG,
3158 (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3159 newOldestMulti)));
3160 LWLockRelease(MultiXactTruncationLock);
3161 return;
3162 }
3163
3164 elog(DEBUG1, "performing multixact truncation: "
3165 "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
3166 "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
3167 oldestMulti, newOldestMulti,
3168 MultiXactIdToOffsetSegment(oldestMulti),
3169 MultiXactIdToOffsetSegment(newOldestMulti),
3170 oldestOffset, newOldestOffset,
3171 MXOffsetToMemberSegment(oldestOffset),
3172 MXOffsetToMemberSegment(newOldestOffset));
3173
3174 /*
3175 * Do truncation, and the WAL logging of the truncation, in a critical
3176 * section. That way offsets/members cannot get out of sync anymore, i.e.
3177 * once consistent the newOldestMulti will always exist in members, even
3178 * if we crashed in the wrong moment.
3179 */
/* NOTE(review): line 3180 is missing; per the comment above, presumably the start of the critical section -- confirm. */
3181
3182 /*
3183 * Prevent checkpoints from being scheduled concurrently. This is critical
3184 * because otherwise a truncation record might not be replayed after a
3185 * crash/basebackup, even though the state of the data directory would
3186 * require it.
3187 */
/* NOTE(review): lines 3188-3189 are missing; presumably they set DELAY_CHKPT_START in MyProc->delayChkptFlags, since that flag is cleared near the end of this function -- confirm. */
3190
3191 /* WAL log truncation */
3192 WriteMTruncateXlogRec(newOldestMultiDB,
3193 oldestMulti, newOldestMulti,
3194 oldestOffset, newOldestOffset);
3195
3196 /*
3197 * Update in-memory limits before performing the truncation, while inside
3198 * the critical section: Have to do it before truncation, to prevent
3199 * concurrent lookups of those values. Has to be inside the critical
3200 * section as otherwise a future call to this function would error out,
3201 * while looking up the oldest member in offsets, if our caller crashes
3202 * before updating the limits.
3203 */
3204 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3205 MultiXactState->oldestMultiXactId = newOldestMulti;
3206 MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3207 LWLockRelease(MultiXactGenLock);
3208
3209 /* First truncate members */
3210 PerformMembersTruncation(oldestOffset, newOldestOffset);
3211
3212 /* Then offsets */
3213 PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3214
/* Checkpoints may start again from here on. */
3215 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
3216
/* NOTE(review): line 3217 is missing; presumably the end of the critical section -- confirm. */
3218 LWLockRelease(MultiXactTruncationLock);
3219}
3220
3221/*
3222 * Decide whether a MultiXactOffset page number is "older" for truncation
3223 * purposes. Analogous to CLOGPagePrecedes().
3224 *
3225 * Offsetting the values is optional, because MultiXactIdPrecedes() has
3226 * translational symmetry.
3227 */
3228static bool
3230{
3231 MultiXactId multi1;
3232 MultiXactId multi2;
3233
3234 multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3235 multi1 += FirstMultiXactId + 1;
3236 multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3237 multi2 += FirstMultiXactId + 1;
3238
3239 return (MultiXactIdPrecedes(multi1, multi2) &&
3240 MultiXactIdPrecedes(multi1,
3241 multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
3242}
3243
3244/*
3245 * Decide whether a MultiXactMember page number is "older" for truncation
3246 * purposes. There is no "invalid offset number" so use the numbers verbatim.
3247 */
3248static bool
3250{
3251 MultiXactOffset offset1;
3252 MultiXactOffset offset2;
3253
3254 offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3255 offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3256
3257 return (MultiXactOffsetPrecedes(offset1, offset2) &&
3259 offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
3260}
3261
/*
 * Decide which of two MultiXactIds is earlier.
 *
 * Returns true when multi1 is logically before multi2.  The comparison is
 * done on the signed 32-bit difference of the two IDs, so it stays correct
 * across counter wraparound; equal IDs yield false.
 *
 * XXX do we need to do something special for InvalidMultiXactId?
 * (Doesn't look like it.)
 */
bool
MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
{
    int32       diff = (int32) (multi1 - multi2);

    return (diff < 0);
}
3275
/*
 * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
 *
 * Same signed-difference comparison as MultiXactIdPrecedes(), except that
 * equal IDs yield true.
 *
 * XXX do we need to do something special for InvalidMultiXactId?
 * (Doesn't look like it.)
 */
bool
MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
{
    int32       diff = (int32) (multi1 - multi2);

    return (diff <= 0);
}
3289
3290
/*
 * Decide which of two offsets is earlier.
 *
 * Returns true when offset1 is logically before offset2, judged by the sign
 * of their 32-bit difference, so the answer stays correct across
 * wraparound of the offset counter.  Equal offsets yield false.
 */
static bool
MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
{
    int32       diff = (int32) (offset1 - offset2);

    return (diff < 0);
}
3301
3302/*
3303 * Write a TRUNCATE xlog record
3304 *
3305 * We must flush the xlog record to disk before returning --- see notes in
3306 * TruncateCLOG().
 *
 * The five arguments are copied field-for-field into the record struct;
 * the record is inserted as XLOG_MULTIXACT_TRUNCATE_ID for RM_MULTIXACT_ID
 * and then flushed with XLogFlush().
 *
 * NOTE(review): this listing is a line-numbered extract with gaps; each gap
 * is flagged inline.
3307 */
3308static void
/* NOTE(review): line 3309 (first declarator line) is missing; the cross-reference index lists it as WriteMTruncateXlogRec(Oid oldestMultiDB, followed by the parameters below. */
3310 MultiXactId startTruncOff, MultiXactId endTruncOff,
3311 MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3312{
3313 XLogRecPtr recptr;
/* NOTE(review): line 3314 is missing; it must declare xlrec, the record struct filled in below. */
3315
3316 xlrec.oldestMultiDB = oldestMultiDB;
3317
3318 xlrec.startTruncOff = startTruncOff;
3319 xlrec.endTruncOff = endTruncOff;
3320
3321 xlrec.startTruncMemb = startTruncMemb;
3322 xlrec.endTruncMemb = endTruncMemb;
3323
/* NOTE(review): lines 3324-3325 are missing; presumably they begin the WAL insertion and register xlrec as the record data -- confirm. */
3326 recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
/* Flush up to the new record before returning, as the header requires. */
3327 XLogFlush(recptr);
3328}
3329
3330/*
3331 * MULTIXACT resource manager's routines
 *
 * Redo entry point: dispatches on the record's info bits (with
 * XLR_INFO_MASK cleared) to one of four record types -- zero an offsets
 * page, zero a members page, create a MultiXactId, or truncate -- and
 * PANICs on anything else.
 *
 * NOTE(review): this listing is a line-numbered extract with gaps; each gap
 * is flagged inline.
3332 */
3333void
/* NOTE(review): line 3334 (the declarator) is missing; the cross-reference index lists it as multixact_redo(XLogReaderState *record). */
3335{
3336 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3337
3338 /* Backup blocks are not used in multixact records */
/* NOTE(review): line 3339 is missing; presumably an assertion backing the comment above -- confirm. */
3340
3341 if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3342 {
3343 int64 pageno;
3344
/* The record payload is the page number; memcpy copies it into a local. */
3345 memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
/* NOTE(review): line 3346 is missing; presumably it zeroes offsets page 'pageno' -- confirm. */
3347 }
3348 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3349 {
3350 int64 pageno;
3351
3352 memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
/* NOTE(review): line 3353 is missing; presumably it zeroes members page 'pageno' -- confirm. */
3354 }
3355 else if (info == XLOG_MULTIXACT_CREATE_ID)
3356 {
3357 xl_multixact_create *xlrec =
/* NOTE(review): line 3358 is missing; it holds the initializer for xlrec, presumably the record data cast to xl_multixact_create * -- confirm. */
3359 TransactionId max_xid;
3360 int i;
3361
3362 /* Store the data back into the SLRU files */
3363 RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3364 xlrec->members);
3365
3366 /* Make sure nextMXact/nextOffset are beyond what this record has */
3367 MultiXactAdvanceNextMXact(xlrec->mid + 1,
3368 xlrec->moff + xlrec->nmembers);
3369
3370 /*
3371 * Make sure nextXid is beyond any XID mentioned in the record. This
3372 * should be unnecessary, since any XID found here ought to have other
3373 * evidence in the XLOG, but let's be safe.
3374 */
/* Start from the record's own XID and keep the latest member XID seen. */
3375 max_xid = XLogRecGetXid(record);
3376 for (i = 0; i < xlrec->nmembers; i++)
3377 {
3378 if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3379 max_xid = xlrec->members[i].xid;
3380 }
3381
/* NOTE(review): line 3382 is missing; presumably it advances nextXid past max_xid, as the comment above describes -- confirm. */
3383 }
3384 else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3385 {
/* NOTE(review): line 3386 is missing; it must declare xlrec, used by value below. */
3387 int64 pageno;
3388
3389 memcpy(&xlrec, XLogRecGetData(record),
/* NOTE(review): line 3390 is missing; it holds the size argument closing this memcpy call. */
3391
3392 elog(DEBUG1, "replaying multixact truncation: "
3393 "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
3394 "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
3395 xlrec.startTruncOff, xlrec.endTruncOff,
/* NOTE(review): lines 3396-3397 are missing; presumably the two offsets-segment arguments for the PRIx64 placeholders. */
3398 xlrec.startTruncMemb, xlrec.endTruncMemb,
/* NOTE(review): lines 3399-3400 are missing; presumably the two members-segment arguments and the end of the elog call. */
3401
3402 /* should not be required, but more than cheap enough */
3403 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3404
3405 /*
3406 * Advance the horizon values, so they're current at the end of
3407 * recovery.
3408 */
3409 SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
3410
/* NOTE(review): line 3411 is missing; presumably the members truncation for [startTruncMemb, endTruncMemb) -- confirm. */
3412
3413 /*
3414 * During XLOG replay, latest_page_number isn't necessarily set up
3415 * yet; insert a suitable value to bypass the sanity test in
3416 * SimpleLruTruncate.
3417 */
3418 pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3419 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
3420 pageno);
/* NOTE(review): line 3421 is missing; presumably the offsets truncation for [startTruncOff, endTruncOff) -- confirm. */
3422
3423 LWLockRelease(MultiXactTruncationLock);
3424 }
3425 else
3426 elog(PANIC, "multixact_redo: unknown op code %u", info);
3427}
3428
3429/*
3430 * Entrypoint for sync.c to sync offsets files.
3431 */
3432int
3433multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
3434{
3435 return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
3436}
3437
3438/*
3439 * Entrypoint for sync.c to sync members files.
3440 */
3441int
3442multixactmemberssyncfiletag(const FileTag *ftag, char *path)
3443{
3444 return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
3445}
static void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:483
int autovacuum_multixact_freeze_max_age
Definition: autovacuum.c:130
static int32 next
Definition: blutils.c:224
#define Min(x, y)
Definition: c.h:1006
uint8_t uint8
Definition: c.h:539
int64_t int64
Definition: c.h:538
uint32 MultiXactOffset
Definition: c.h:672
TransactionId MultiXactId
Definition: c.h:670
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:475
int32_t int32
Definition: c.h:537
uint16_t uint16
Definition: c.h:540
uint32_t uint32
Definition: c.h:541
#define MemSet(start, val, len)
Definition: c.h:1022
uint32 TransactionId
Definition: c.h:660
size_t Size
Definition: c.h:613
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1193
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errdetail_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1308
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define LOG
Definition: elog.h:31
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
Datum difference(PG_FUNCTION_ARGS)
int multixact_offset_buffers
Definition: globals.c:163
bool IsBinaryUpgrade
Definition: globals.c:121
ProcNumber MyProcNumber
Definition: globals.c:90
bool IsUnderPostmaster
Definition: globals.c:120
int multixact_member_buffers
Definition: globals.c:162
#define newval
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
return str start
const char * str
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static void dclist_move_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:808
static dlist_node * dclist_tail_node(dclist_head *head)
Definition: ilist.h:920
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
#define DCLIST_STATIC_INIT(name)
Definition: ilist.h:282
static void dclist_push_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:693
static void dclist_init(dclist_head *head)
Definition: ilist.h:671
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
#define INJECTION_POINT_CACHED(name, arg)
#define INJECTION_POINT_LOAD(name)
int j
Definition: isn.c:78
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
char * get_database_name(Oid dbid)
Definition: lsyscache.c:1259
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1746
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
MemoryContext TopTransactionContext
Definition: mcxt.c:171
void pfree(void *pointer)
Definition: mcxt.c:1594
MemoryContext TopMemoryContext
Definition: mcxt.c:166
void * palloc(Size size)
Definition: mcxt.c:1365
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
Definition: multixact.c:3309
static MultiXactId PreviousMultiXactId(MultiXactId multi)
Definition: multixact.c:216
static SlruCtlData MultiXactOffsetCtlData
Definition: multixact.c:224
void MultiXactShmemInit(void)
Definition: multixact.c:1952
#define MULTIXACT_MEMBER_SAFE_THRESHOLD
Definition: multixact.c:211
static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2)
Definition: multixact.c:3249
static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
Definition: multixact.c:1064
static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
Definition: multixact.c:1650
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition: multixact.c:471
static int64 MXOffsetToMemberPage(MultiXactOffset offset)
Definition: multixact.c:168
#define MXACT_MEMBER_BITS_PER_XACT
Definition: multixact.c:138
static int64 MultiXactIdToOffsetSegment(MultiXactId multi)
Definition: multixact.c:120
static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
Definition: multixact.c:2521
void ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
Definition: multixact.c:775
static void PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
Definition: multixact.c:3029
#define MXACT_MEMBER_XACT_BITMASK
Definition: multixact.c:140
#define MULTIXACT_FLAGBYTES_PER_GROUP
Definition: multixact.c:143
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3269
char * mxstatus_to_string(MultiXactStatus status)
Definition: multixact.c:1742
void multixact_redo(XLogReaderState *record)
Definition: multixact.c:3334
#define MULTIXACT_OFFSETS_PER_PAGE
Definition: multixact.c:105
void multixact_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1908
#define debug_elog5(a, b, c, d, e)
Definition: multixact.c:373
static void MultiXactIdSetOldestVisible(void)
Definition: multixact.c:714
int multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
Definition: multixact.c:3433
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result)
Definition: multixact.c:2826
void PostPrepare_MultiXact(FullTransactionId fxid)
Definition: multixact.c:1838
void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset)
Definition: multixact.c:2262
#define MultiXactMemberCtl
Definition: multixact.c:228
static bool SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: multixact.c:2977
void AtPrepare_MultiXact(void)
Definition: multixact.c:1824
static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance)
Definition: multixact.c:2778
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3283
void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
Definition: multixact.c:2470
static int MultiXactIdToOffsetEntry(MultiXactId multi)
Definition: multixact.c:114
static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
Definition: multixact.c:1697
static void MaybeExtendOffsetSlru(void)
Definition: multixact.c:2044
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition: multixact.c:583
void MultiXactIdSetOldestMember(void)
Definition: multixact.c:657
static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
Definition: multixact.c:3000
static MemoryContext MXactContext
Definition: multixact.c:361
#define SHARED_MULTIXACT_STATE_SIZE
static MultiXactId * OldestVisibleMXactId
Definition: multixact.c:331
struct mxtruncinfo mxtruncinfo
static int mxactMemberComparator(const void *arg1, const void *arg2)
Definition: multixact.c:1577
struct MultiXactStateData MultiXactStateData
static void ExtendMultiXactOffset(MultiXactId multi)
Definition: multixact.c:2487
Size MultiXactShmemSize(void)
Definition: multixact.c:1935
#define MULTIXACT_MEMBERGROUPS_PER_PAGE
Definition: multixact.c:149
#define MultiXactOffsetCtl
Definition: multixact.c:227
static int MXOffsetToMemberOffset(MultiXactOffset offset)
Definition: multixact.c:201
void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, MultiXactOffset *nextMultiOffset, MultiXactId *oldestMulti, Oid *oldestMultiDB)
Definition: multixact.c:2216
void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, bool is_startup)
Definition: multixact.c:2296
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, MultiXactMember *members)
Definition: multixact.c:900
int multixactmemberssyncfiletag(const FileTag *ftag, char *path)
Definition: multixact.c:3442
#define MAX_CACHE_ENTRIES
Definition: multixact.c:359
static int64 MultiXactIdToOffsetPage(MultiXactId multi)
Definition: multixact.c:108
MultiXactId GetOldestMultiXactId(void)
Definition: multixact.c:2598
void CheckPointMultiXact(void)
Definition: multixact.c:2238
#define MaxOldestSlot
Definition: multixact.c:326
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition: multixact.c:799
struct mXactCacheEnt mXactCacheEnt
static int64 MXOffsetToMemberSegment(MultiXactOffset offset)
Definition: multixact.c:174
static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members)
Definition: multixact.c:1607
static dclist_head MXactCache
Definition: multixact.c:360
void TrimMultiXact(void)
Definition: multixact.c:2104
#define debug_elog3(a, b, c)
Definition: multixact.c:371
char * mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
Definition: multixact.c:1765
#define MULTIXACT_MEMBERGROUP_SIZE
Definition: multixact.c:147
#define debug_elog4(a, b, c, d)
Definition: multixact.c:372
void multixact_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1923
static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
Definition: multixact.c:3229
static bool SetOffsetVacuumLimit(bool is_startup)
Definition: multixact.c:2651
static int MXOffsetToFlagsOffset(MultiXactOffset offset)
Definition: multixact.c:181
int MultiXactMemberFreezeThreshold(void)
Definition: multixact.c:2928
void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset)
Definition: multixact.c:2445
static MultiXactId * OldestMemberMXactId
Definition: multixact.c:330
#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE
Definition: multixact.c:163
static MultiXactStateData * MultiXactState
Definition: multixact.c:329
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP
Definition: multixact.c:144
#define OFFSET_WARN_SEGMENTS
MultiXactId ReadNextMultiXactId(void)
Definition: multixact.c:755
void BootStrapMultiXact(void)
Definition: multixact.c:2021
#define debug_elog6(a, b, c, d, e, f)
Definition: multixact.c:374
void multixact_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1887
#define MULTIXACT_MEMBERS_PER_PAGE
Definition: multixact.c:150
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition: multixact.c:418
void TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
Definition: multixact.c:3054
#define MULTIXACT_MEMBER_DANGER_THRESHOLD
Definition: multixact.c:212
static int MXOffsetToFlagsBitShift(MultiXactOffset offset)
Definition: multixact.c:191
bool check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
Definition: multixact.c:2001
static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
Definition: multixact.c:3295
bool GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
Definition: multixact.c:2871
bool check_multixact_member_buffers(int *newval, void **extra, GucSource source)
Definition: multixact.c:2010
void AtEOXact_MultiXact(void)
Definition: multixact.c:1796
static SlruCtlData MultiXactMemberCtlData
Definition: multixact.c:225
#define debug_elog2(a, b)
Definition: multixact.c:370
void StartupMultiXact(void)
Definition: multixact.c:2079
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition: multixact.c:1334
#define MultiXactIdIsValid(multi)
Definition: multixact.h:29
#define XLOG_MULTIXACT_ZERO_MEM_PAGE
Definition: multixact.h:70
#define XLOG_MULTIXACT_ZERO_OFF_PAGE
Definition: multixact.h:69
#define FirstMultiXactId
Definition: multixact.h:26
MultiXactStatus
Definition: multixact.h:39
@ MultiXactStatusForShare
Definition: multixact.h:41
@ MultiXactStatusForNoKeyUpdate
Definition: multixact.h:42
@ MultiXactStatusNoKeyUpdate
Definition: multixact.h:45
@ MultiXactStatusUpdate
Definition: multixact.h:47
@ MultiXactStatusForUpdate
Definition: multixact.h:43
@ MultiXactStatusForKeyShare
Definition: multixact.h:40
#define ISUPDATE_from_mxstatus(status)
Definition: multixact.h:53
#define InvalidMultiXactId
Definition: multixact.h:25
#define XLOG_MULTIXACT_TRUNCATE_ID
Definition: multixact.h:72
#define SizeOfMultiXactCreate
Definition: multixact.h:82
#define SizeOfMultiXactTruncate
Definition: multixact.h:97
#define XLOG_MULTIXACT_CREATE_ID
Definition: multixact.h:71
#define MaxMultiXactOffset
Definition: multixact.h:31
#define MaxMultiXactId
Definition: multixact.h:27
struct MultiXactMember MultiXactMember
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
#define SLRU_PAGES_PER_SEGMENT
const void size_t len
const void * data
static char * filename
Definition: pg_dumpall.c:120
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:72
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:165
@ PMSIGNAL_START_AUTOVAC_LAUNCHER
Definition: pmsignal.h:39
#define qsort(a, b, c, d)
Definition: port.h:500
unsigned int Oid
Definition: postgres_ext.h:32
#define DELAY_CHKPT_START
Definition: proc.h:135
bool TransactionIdIsInProgress(TransactionId xid)
Definition: procarray.c:1402
int ProcNumber
Definition: procnumber.h:24
tree ctl
Definition: radixtree.h:1838
Size add_size(Size s1, Size s2)
Definition: shmem.c:495
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:389
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, const char *subdir, int buffer_tranche_id, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names)
Definition: slru.c:252
int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:630
void SimpleLruWritePage(SlruCtl ctl, int slotno)
Definition: slru.c:757
void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
Definition: slru.c:1347
bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
Definition: slru.c:771
void SlruDeleteSegment(SlruCtl ctl, int64 segno)
Definition: slru.c:1551
bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
Definition: slru.c:1816
int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid)
Definition: slru.c:527
int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
Definition: slru.c:1856
int SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:375
void SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno)
Definition: slru.c:444
void SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
Definition: slru.c:1433
Size SimpleLruShmemSize(int nslots, int nlsns)
Definition: slru.c:198
bool check_slru_buffers(const char *name, int *newval)
Definition: slru.c:355
static LWLock * SimpleLruGetBankLock(SlruCtl ctl, int64 pageno)
Definition: slru.h:160
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition: slru.h:185
PGPROC * MyProc
Definition: proc.c:67
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:145
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:242
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
Definition: sync.h:51
Definition: lwlock.h:42
TransactionId xid
Definition: multixact.h:59
MultiXactStatus status
Definition: multixact.h:60
MultiXactId multiWrapLimit
Definition: multixact.c:268
MultiXactId multiStopLimit
Definition: multixact.c:267
MultiXactId multiWarnLimit
Definition: multixact.c:266
MultiXactId multiVacLimit
Definition: multixact.c:265
MultiXactOffset offsetStopLimit
Definition: multixact.c:271
MultiXactOffset nextOffset
Definition: multixact.c:243
MultiXactId nextMXact
Definition: multixact.c:240
MultiXactId oldestMultiXactId
Definition: multixact.c:253
MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.c:320
MultiXactOffset oldestOffset
Definition: multixact.c:261
int delayChkptFlags
Definition: proc.h:257
dlist_node * cur
Definition: ilist.h:179
MultiXactId multi
Definition: multixact.c:353
dlist_node node
Definition: multixact.c:355
MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.c:356
int64 earliestExistingPage
Definition: multixact.c:2969
MultiXactId mid
Definition: multixact.h:76
MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.h:79
MultiXactOffset moff
Definition: multixact.h:77
MultiXactId endTruncOff
Definition: multixact.h:90
MultiXactOffset startTruncMemb
Definition: multixact.h:93
MultiXactOffset endTruncMemb
Definition: multixact.h:94
MultiXactId startTruncOff
Definition: multixact.h:89
@ SYNC_HANDLER_MULTIXACT_MEMBER
Definition: sync.h:41
@ SYNC_HANDLER_MULTIXACT_OFFSET
Definition: sync.h:40
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:126
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define TransactionIdIsValid(xid)
Definition: transam.h:41
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.h:263
ProcNumber TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held)
Definition: twophase.c:908
void RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, const void *data, uint32 len)
Definition: twophase.c:1271
#define TWOPHASE_RM_MULTIXACT_ID
Definition: twophase_rmgr.h:29
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
bool IsTransactionState(void)
Definition: xact.c:388
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition: xact.c:942
bool RecoveryInProgress(void)
Definition: xlog.c:6406
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2783
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value)
Definition: xloginsert.c:543
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:478
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:368
void XLogBeginInsert(void)
Definition: xloginsert.c:152
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:409
#define XLogRecGetData(decoder)
Definition: xlogreader.h:414
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:411
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:416
bool InRecovery
Definition: xlogutils.c:50